#
# -*- Encoding: UTF-8 -*-
#
import sys
from ctypes import *

# This module calls functions from the disamblemmer_c_binding shared library through its batch interface.

class TaggedWord:
    """Record for a single analysis of a word.

    The constructor stores its arguments unchanged on the instance:

    * ``word``                -- the word form
    * ``lemma``               -- the lemma
    * ``lexical_features``    -- taken from ``lex_f``
    * ``inflection_features`` -- taken from ``infl_f``
    * ``weight``              -- the weight attached to this analysis
    """

    def __init__(self, word, lemma, lex_f, infl_f, weight):
        # Pair up related fields; nothing is validated or converted.
        self.word, self.lemma = word, lemma
        self.lexical_features, self.inflection_features = lex_f, infl_f
        self.weight = weight

class BatchTagger:
    """ctypes wrapper around the batch interface of libdisamblemmer_c_binding.

    The constructor loads the shared library and declares the prototypes of
    every ``DL_*`` function used; ``AnalyzePhrase`` runs one phrase through
    the library and converts the result into plain Python objects.
    """

    # (function name, restype, argtypes) for each entry point used below.
    # Without an explicit restype ctypes assumes a C int return value, which
    # is not wide enough for the pointer handles on 64-bit platforms.
    _PROTOTYPES = (
        ("DL_AnalyzePhrase", c_void_p, (POINTER(c_char_p), c_char_p)),
        ("DL_DestroyPhraseResults", None, (c_void_p,)),
        ("DL_GetWordCount", c_int, (c_void_p,)),
        ("DL_GetWordResultsAt", c_void_p, (c_void_p, c_int)),
        ("DL_GetWordText", c_char_p, (c_void_p,)),
        ("DL_GetAnalysisCount", c_int, (c_void_p,)),
        ("DL_GetAnalysisAt", c_void_p, (c_void_p, c_int)),
        ("DL_GetWordForm", c_char_p, (c_void_p,)),
        ("DL_GetLemma", c_char_p, (c_void_p,)),
        ("DL_GetLexicalFeatures", c_char_p, (c_void_p,)),
        ("DL_GetInflectionFeatures", c_char_p, (c_void_p,)),
        ("DL_GetWeight", c_float, (c_void_p,)),
    )

    def __init__(self, lib_path=None):
        """Load the shared library and declare its function prototypes.

        Args:
            lib_path: Optional path of the shared library to load.  When
                omitted, ``./libdisamblemmer_c_binding.<ext>`` is loaded,
                where ``<ext>`` is ``dylib`` on macOS, ``dll`` on Windows and
                ``so`` elsewhere.

        Raises:
            OSError: if the library cannot be loaded (raised by ``CDLL``).
            AttributeError: if the library lacks one of the ``DL_*`` symbols.
        """
        if lib_path is None:
            if sys.platform == 'darwin':
                soext = "dylib"
            elif sys.platform.lower().startswith('win'):
                soext = "dll"
            else:
                soext = 'so'
            lib_path = "./libdisamblemmer_c_binding." + soext
        self.lib = CDLL(lib_path)

        # Type checks
        for name, restype, argtypes in self._PROTOTYPES:
            func = getattr(self.lib, name)
            func.restype = restype
            func.argtypes = argtypes

    @staticmethod
    def _to_bytes(text):
        """Return *text* as UTF-8 bytes; byte strings pass through untouched."""
        if isinstance(text, bytes):
            return text
        return text.encode("utf-8")

    def _read_analyses(self, word_results):
        """Convert every analysis stored under *word_results* to a TaggedWord."""
        analyses = []
        for analysis_index in range(self.lib.DL_GetAnalysisCount(word_results)):
            analysis = c_void_p(
                self.lib.DL_GetAnalysisAt(word_results, analysis_index))
            analyses.append(TaggedWord(
                self.lib.DL_GetWordForm(analysis).decode("utf-8"),
                self.lib.DL_GetLemma(analysis).decode("utf-8"),
                self.lib.DL_GetLexicalFeatures(analysis).decode("utf-8"),
                self.lib.DL_GetInflectionFeatures(analysis).decode("utf-8"),
                self.lib.DL_GetWeight(analysis)))
        return analyses

    def AnalyzePhrase(self, phrase, language):
        """Analyze a tokenized phrase and return the analyses of each word.

        Args:
            phrase: Iterable of words, each either text or UTF-8 bytes.
            language: Value handed to ``DL_AnalyzePhrase`` as its second
                argument; text is encoded as UTF-8, bytes and ``None`` are
                passed through as they are.

        Returns:
            A list of ``(word, analyses)`` tuples in the order reported by
            the library, where ``word`` is the decoded ``DL_GetWordText``
            value and ``analyses`` is a list of ``TaggedWord``.  An empty
            list is returned when ``DL_AnalyzePhrase`` yields a NULL handle.
        """
        encoded_words = [self._to_bytes(p) for p in phrase]
        # The C side receives a NULL-terminated array of C strings.
        phrase_input = (c_char_p * (len(encoded_words) + 1))()
        phrase_input[:] = encoded_words + [None]

        # c_char_p parameters only accept bytes (or None) on Python 3.
        if language is not None:
            language = self._to_bytes(language)

        handle = self.lib.DL_AnalyzePhrase(phrase_input, language)
        if handle is None:
            # ctypes maps a NULL pointer to None: there is nothing to walk or
            # free, so do not hand NULL back to the library.
            return []
        phrase_results = c_void_p(handle)

        result = []
        try:
            for word_index in range(self.lib.DL_GetWordCount(phrase_results)):
                word_results = c_void_p(
                    self.lib.DL_GetWordResultsAt(phrase_results, word_index))
                word = self.lib.DL_GetWordText(word_results).decode("utf-8")
                result.append((word, self._read_analyses(word_results)))
        finally:
            # Release the native results even if decoding above failed.
            self.lib.DL_DestroyPhraseResults(phrase_results)

        return result

def main():
    """Smoke test: tag the words read from stdin, one output line per word.

    Reads UTF-8 text from standard input, extracts the words of each line
    with a regular expression, analyzes them with ``BatchTagger`` using the
    language given as the first command-line argument, and writes either
    ``word: lemma (lexical features)`` for the first analysis or
    ``word: <unknown>`` when the word has no analyses.
    """
    # Just testing, nothing serious
    import codecs
    import re

    if len(sys.argv) < 2:
        sys.exit("usage: %s LANGUAGE < input.txt" % sys.argv[0])

    # The tagger passes the language to a C string parameter, which needs
    # bytes on Python 3; on Python 2 argv entries already are byte strings.
    language = sys.argv[1]
    if not isinstance(language, bytes):
        language = language.encode("utf-8")

    # codecs readers/writers must wrap byte streams.  Python 3 exposes them
    # as ``.buffer``; Python 2 standard streams are byte-oriented already.
    src = codecs.getreader("utf-8")(getattr(sys.stdin, "buffer", sys.stdin), "ignore")
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout), "xmlcharrefreplace")
    word_re = re.compile(u"([a-zA-Z\u00C0-\u01FF\u0401-\u04FF]+(?:[-][a-zA-Z\u00C0-\u01FF\u0401-\u04FF]+)?)")

    tagger = BatchTagger()
    for line in src:
        # The single capture group spans the whole match, so findall yields
        # exactly the matched words.
        words = word_re.findall(line.strip())
        for word, analyses in tagger.AnalyzePhrase(words, language):
            if analyses:
                best = analyses[0]
                out.write(u"%s: %s (%s)\n" % (word, best.lemma, best.lexical_features))
            else:
                out.write(u"%s: <unknown>\n" % word)

# Run the stdin-driven smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    main()
