import codecs
import os
import subprocess

def to_utf8(s):
    """Encode text *s* as UTF-8 and return the encoded byte string.

    The ``'replace'`` error handler is used, so anything the codec cannot
    encode is substituted instead of raising an error.
    """
    encode = codecs.getencoder("utf-8")
    # The encoder returns (encoded_data, consumed_length); only the data matters.
    encoded, _consumed = encode(s, 'replace')
    return encoded

def from_utf8(s):
    """Decode the UTF-8 byte string *s* and return the resulting text.

    No error handler is passed, so the codec's default (strict) handling
    applies to malformed input.
    """
    decode = codecs.getdecoder("utf-8")
    # The decoder returns (decoded_text, consumed_length); only the text matters.
    decoded, _consumed = decode(s)
    return decoded

class Analysis:
    """A single analysis record built from a positional list of text fields.

    ``params`` is indexed as follows (only index 0 is mandatory; a missing
    or empty field falls back to the default shown):

        0  lemma
        1  rule id                (ignored)
        2  text searched for the marker ``Bastard``        ("")
        3  comma-separated stem features                   ("")
        4  comma-separated form features                   ("")
        5  normal form                                     (the lemma)
        6  start position                                  ("0")
        7  length                                          (len of normal form)
        8  language                                        ("unknown")
        9  comma-separated flags                           ("")
        10 paradigm, ``form{f1,f2|f3},form2{...}``         (no paradigm)

    Resulting attributes: ``lemma``, ``bastard`` (bool), ``features``
    (stem features followed by form features), ``form``, ``language``,
    ``flags``, ``starttoken`` (int), ``tokenlength`` (int) and ``paradigm``
    (``None`` or a list of ``(form, features)`` tuples).
    """

    def __init__(self, params):
        count = len(params)

        def field(index, fallback):
            # A field that is absent *or* empty yields the fallback.
            if count > index and params[index]:
                return params[index]
            return fallback

        lemma = params[0]
        bastard_text = field(2, "")
        stem_text = field(3, "")
        form_text = field(4, "")
        normal_form = field(5, lemma)
        start_text = field(6, "0")
        # The default length is only computed when the field is unusable.
        if count > 7 and params[7]:
            length_text = params[7]
        else:
            length_text = str(len(normal_form))
        language = field(8, "unknown")
        flags_text = field(9, "")

        self.lemma = lemma
        self.bastard = 'Bastard' in bastard_text

        stem_features = stem_text.split(u',') if stem_text else []
        form_features = form_text.split(u',') if form_text else []
        self.features = stem_features + form_features

        self.form = normal_form
        self.language = language
        self.flags = flags_text.split(u",") if flags_text else []
        self.starttoken = int(start_text)
        self.tokenlength = int(length_text)

        paradigm = None
        if count > 10:
            paradigm = []
            # Entries look like "form{a,b|c,d}" and are separated by "},";
            # trailing closing braces are stripped before splitting.
            for chunk in params[10].rstrip(u'}').split(u"},"):
                pieces = chunk.split(u'{')
                # Each '|'-separated alternative yields its own paradigm
                # entry, always prefixed with the stem features.
                for variant in pieces[1].split(u'|'):
                    entry_features = list(stem_features)
                    if variant:
                        entry_features += variant.split(u',')
                    paradigm.append((pieces[0], entry_features))
        self.paradigm = paradigm

    def dump(self, out):
        """Write the language, lemma and features to *out*, followed by one
        tab-prefixed line per paradigm entry (if there is a paradigm).

        Uses the Python 2 ``print >>stream`` statement form.
        """
        print >>out, self.language, self.lemma, u",".join(self.features)

        for entry in self.paradigm or ():
            print >>out, "\t", entry[0],  u",".join(entry[1])

class Lemmer:
    """Client for an external lemmer executable driven over stdin/stdout pipes.

    The executable is started once in the constructor; every ``parse`` call
    writes one word to its stdin and reads result lines from its stdout.
    """

    def __init__ (self, lemmerPath, langs=('ru',), generate = False, postprocessors = None):
        """Start the lemmer process.

        lemmerPath     -- path to the executable (made absolute).
        langs          -- iterable of language codes, passed comma-joined
                          after ``-m``; a falsy value omits the option.
        generate       -- when true, ``-p`` is added to the command line
                          (presumably requests paradigms; confirm against
                          the lemmer's own documentation).
        postprocessors -- callables ``f(word, analyses) -> analyses`` applied
                          in order by ``parse``; ``None`` means none.
        """
        # Build argv as a list rather than splitting a joined string, so an
        # executable path containing whitespace stays a single argument.
        args = [os.path.abspath(lemmerPath), "-c", "-e", "utf8"]

        if generate:
            args.append("-p")

        if langs:
            args.extend(["-m", ",".join(langs)])

        # NOTE(review): stderr is piped but never read in this class; a child
        # that writes a lot to stderr could block once the pipe fills up.
        self.lemmer = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        self.child_stdin = self.lemmer.stdin
        self.child_stdout = self.lemmer.stdout

        # A fresh list per instance: a shared mutable default would leak
        # postprocessors between unrelated Lemmer objects.
        self.postprocessors = [] if postprocessors is None else postprocessors

    def parse(self, word):
        """Send *word* to the lemmer and return a list of ``Analysis`` objects.

        A blank word returns ``[]`` without contacting the child process.
        Result lines are read until a blank line or end of stream.  Each line
        is split on single spaces; when the fifth field holds several
        ``|``-separated alternatives, one ``Analysis`` is created per
        alternative.  The list is finally passed through every registered
        postprocessor in order.
        """
        word = word.strip()
        if not word:
            return []

        # b"\n" keeps the concatenation byte-typed on both Python 2 and 3.
        self.child_stdin.write(to_utf8(word) + b"\n")
        self.child_stdin.flush()

        analyses = []
        while True:
            line = self.child_stdout.readline()
            if not line or not line.strip():
                break

            fields = from_utf8(line.strip()).split(u" ")

            if len(fields) > 4:
                for token in fields[4].split(u"|"):
                    fields[4] = token
                    analyses.append(Analysis(fields))
            else:
                # Short line: Analysis supplies defaults for missing fields.
                analyses.append(Analysis(fields))

        for postprocessor in self.postprocessors:
            analyses = postprocessor(word, analyses)
        return analyses
