# 
# -*- Encoding: UTF-8 -*-
#
import sys
from ctypes import *

# This module calls functions from the disamblemmer_c_binding shared library.

class TaggedWord:
    """One analysis of a word: surface form, lemma, feature strings and weight.

    Attributes:
        word: the value passed as ``word``.
        lemma: the value passed as ``lemma``.
        lexical_features: the value passed as ``lex_f``.
        inflection_features: the value passed as ``infl_f``.
        weight: the value passed as ``weight``.
    """

    def __init__(self, word, lemma, lex_f, infl_f, weight):
        # Plain value holder: every argument is stored unchanged.
        self.word, self.lemma = word, lemma
        self.lexical_features, self.inflection_features = lex_f, infl_f
        self.weight = weight

class Session:
    """A native analysis session that reports results through a Python callback.

    Args:
        lib: the loaded shared library (a ctypes library object exposing the
            ``DL_*`` functions).
        cbfunc: callable invoked as ``cbfunc(text, results)`` where ``text``
            is the UTF-8 decoded string handed over by the library and
            ``results`` is a list of ``TaggedWord`` objects.
        language: language identifier passed to ``DL_CreateSession``. Text is
            encoded as UTF-8; bytes and ``None`` are passed through unchanged.

    The native session is destroyed when the object is finalized.
    """

    def __init__(self, lib, cbfunc, language):
        # Assigned before anything that can fail, so that __del__ always finds
        # both attributes even when session creation raises.
        self.session = None
        self.lib = lib

        # The closure deliberately captures `lib`, not `self`: referencing
        # `self` here would create the cycle
        # self -> result_callback -> closure -> self, which delays __del__
        # until a cyclic GC pass (and makes the object uncollectable on
        # Python 2, where cycles containing __del__ are never freed).
        def result_callback(text, rawresults):
            results = []
            for index in range(lib.DL_GetResultCount(rawresults)):
                # DL_GetResultAt has restype c_void_p, which ctypes returns as
                # a plain integer; wrap it again before passing it back.
                analysis = c_void_p(lib.DL_GetResultAt(rawresults, index))
                results.append(TaggedWord(
                    lib.DL_GetWordForm(analysis).decode("utf-8"),
                    lib.DL_GetLemma(analysis).decode("utf-8"),
                    lib.DL_GetLexicalFeatures(analysis).decode("utf-8"),
                    lib.DL_GetInflectionFeatures(analysis).decode("utf-8"),
                    lib.DL_GetWeight(analysis),
                ))
            cbfunc(text.decode("utf-8"), results)

        # Keep a reference to the ctypes callback object for the lifetime of
        # the session; ctypes callbacks must outlive their use from C code.
        self.result_callback = DisambTagger.result_callback_proto(result_callback)

        # c_char_p parameters accept bytes only on Python 3.
        if language is not None:
            language = self._to_utf8(language)
        self.session = lib.DL_CreateSession(self.result_callback, language)

    @staticmethod
    def _to_utf8(text):
        """Return ``text`` as UTF-8 bytes; bytes input is returned unchanged."""
        if isinstance(text, bytes):
            # Already encoded (this is also the plain ``str`` case on Python 2).
            return text
        return text.encode("utf-8")

    def __del__(self):
        # getattr guards against instances whose __init__ never ran or failed
        # early; clearing the handle first makes repeated calls harmless.
        lib = getattr(self, "lib", None)
        session = getattr(self, "session", None)
        if lib is None or session is None:
            return
        self.session = None
        lib.DL_DestroySession(session)

    def NextWord(self, text):
        """Pass the next word to the library (text is encoded as UTF-8)."""
        self.lib.DL_NextWord(self.session, c_char_p(self._to_utf8(text)))

    def Flush(self):
        """Call ``DL_Flush`` on this session."""
        self.lib.DL_Flush(self.session)

# Utility class to avoid callback hassle: 
# process input sentence by sentence
class BatchSession (Session):
    """Session variant that gathers callback results into a list.

    Instead of taking a user callback, it installs its own callable sink and
    exposes ``ParseBatch`` to feed a whole sequence of words at once.
    """

    def __init__(self, lib, language):
        class ResultStorage():
            """Callable sink that records each ``(text, result)`` pair."""

            def __init__(self):
                self.storage = []

            def __call__(self, text, result):
                self.storage.append((text, result))

        sink = ResultStorage()
        self.result_storage = sink
        # The sink plays the role of the user callback of the base class.
        Session.__init__(self, lib, sink, language)

    def ParseBatch(self, words):
        """Feed every item of ``words`` to the session, flush, return results.

        Returns the list of ``(text, result)`` pairs recorded by the sink
        since the start of this call; the list is replaced on each call.
        """
        # Start from a fresh list so earlier batches are not mixed in.
        self.result_storage.storage = []
        for token in words:
            self.NextWord(token)
        self.Flush()
        return self.result_storage.storage

class DisambTagger:
    """Loads the disamblemmer_c_binding shared library and creates sessions.

    Args:
        libpath: optional path of the shared library to load. When omitted,
            ``./libdisamblemmer_c_binding.<ext>`` is used, where ``<ext>``
            depends on ``sys.platform`` (``dylib``, ``dll`` or ``so``).

    Raises:
        OSError: if ctypes cannot load the library.
        AttributeError: if the library lacks one of the expected functions.
    """

    # Callback type handed to DL_CreateSession: returns nothing and receives
    # a char pointer and an opaque pointer.
    result_callback_proto = CFUNCTYPE(None, c_char_p, c_void_p)

    # (function name, restype, argtypes) for every library function used by
    # this module; applied to the loaded library in __init__.
    _PROTOTYPES = (
        ("DL_CreateSession", c_void_p, (result_callback_proto, c_char_p)),
        ("DL_DestroySession", None, (c_void_p,)),
        ("DL_NextWord", None, (c_void_p, c_char_p)),
        ("DL_Flush", None, (c_void_p,)),
        ("DL_GetResultCount", c_int, (c_void_p,)),
        ("DL_GetResultAt", c_void_p, (c_void_p, c_int)),
        ("DL_GetWordForm", c_char_p, (c_void_p,)),
        ("DL_GetLemma", c_char_p, (c_void_p,)),
        ("DL_GetLexicalFeatures", c_char_p, (c_void_p,)),
        ("DL_GetInflectionFeatures", c_char_p, (c_void_p,)),
        ("DL_GetWeight", c_float, (c_void_p,)),
    )

    def __init__(self, libpath=None):
        if libpath is None:
            libpath = self._default_library_path(sys.platform)
        self.lib = CDLL(libpath)

        # Declare the C signatures so ctypes converts and checks arguments
        # and return values instead of assuming int everywhere.
        for name, restype, argtypes in self._PROTOTYPES:
            func = getattr(self.lib, name)
            func.restype = restype
            func.argtypes = list(argtypes)

    @staticmethod
    def _default_library_path(platform):
        """Return the default library path for a ``sys.platform`` value."""
        if platform == 'darwin':
            soext = "dylib"
        elif platform.lower().startswith('win'):
            soext = "dll"
        else:
            soext = 'so'
        return "./libdisamblemmer_c_binding." + soext

    def CreateSession(self, cbfunc, language):
        """Return a ``Session`` that reports results through ``cbfunc``."""
        return Session(self.lib, cbfunc, language)

    def CreateBatchSession(self, language):
        """Return a ``BatchSession`` for ``language``."""
        return BatchSession(self.lib, language)

def main():
    """Smoke test: tag the words read from stdin, print ``word/tag`` lines.

    Each input line is split into words by a regular expression, the words
    are run through a batch session, and for every reported word one line is
    written to stdout: ``word/lemma+lexical_features`` using the first
    analysis, or ``word/NONE`` when there is no analysis.
    Input and output are UTF-8. Runs on both Python 2 and Python 3.
    """
    # Just testing, nothing serious
    import codecs
    import re

    # The codecs wrappers work on byte streams. On Python 3 the standard
    # streams are text and expose the bytes layer as `.buffer`; Python 2
    # streams have no `.buffer` and already carry bytes.
    stdin_bytes = getattr(sys.stdin, "buffer", sys.stdin)
    stdout_bytes = getattr(sys.stdout, "buffer", sys.stdout)
    src = codecs.getreader("utf-8")(stdin_bytes, "ignore")
    out = codecs.getwriter("utf-8")(stdout_bytes, "xmlcharrefreplace")
    word_re = re.compile(u"([a-zA-Z\u00C0-\u01FF\u0401-\u04FF]+(?:[-][a-zA-Z\u00C0-\u01FF\u0401-\u04FF]+)?)")

    # Bytes literal: the language is passed to a c_char_p parameter, which
    # only accepts bytes on Python 3 (on Python 2 this equals "ru").
    session = DisambTagger().CreateBatchSession(b"ru")
    for line in src:
        # split() with one capturing group alternates separators and matches,
        # so the matched words sit at the odd indices.
        tokens = word_re.split(line.strip())
        for word, results in session.ParseBatch(tokens[1::2]):
            if results:
                tag = u"%s+%s" % (results[0].lemma, results[0].lexical_features)
            else:
                tag = u"NONE"
            # Explicit write instead of the Python 2 only `print >> out`.
            out.write(word + u"/" + tag + u"\n")


if __name__ == "__main__":
    main()
