#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import logging
import argparse
import codecs

if 'BINARY_ROOT_PATH' in os.environ:
    sys.path.append(os.path.join(os.environ['BINARY_ROOT_PATH'], 'bindings', 'python', 'lemmer'))

# The binding can be found in arcadia/bindings/python/lemmer
from liblemmer_python_binding import AnalyzeWord


# (bit mask, mark) pairs, listed in the order the marks appear in the output.
_QUALITY_MARKS = (
    (0x00000001, "bastard"),       # not a dictionary word
    (0x00000002, "sob"),           # from the "fast dictionary"
    # dictionary word + standard prefix (auto-, moto-, kino-, photo-);
    # always accompanied by "bastard" or "qsob"
    (0x00000004, "prefixoid"),
    (0x00000008, "foundling"),     # unintelligible letter sequence that still fits the alphabet
    # extra flag: a "bad lemma" when a "good" alternative exists
    # (lemma "махать" for the form "маша")
    (0x00000010, "bad_request"),
    (0x00010000, "from_english"),  # translated from English
    (0x00020000, "to_english"),    # translated to English
    (0x00040000, "untranslit"),    # "translated" from translit
    (0x00100000, "overrode"),      # the lemma text was overwritten
    (0x01000000, "fix"),           # word from the fix-list
)


def get_qual(bastardness):
    """
    Render a lemma quality bit mask as a comma-separated list of marks.

    bastardness - int value from LemmaInfo.Bastardness from lemmer binding
        proper declarations could be found here: arcadia/kernel/lemmer/core/lemmer.h:TQuality

    Returns "dict" (a dictionary word) when no bit is set at all; otherwise
    the marks of every known bit that is set, joined with ",". A value that
    has only unknown bits set yields an empty string.
    """
    if bastardness == 0:
        return "dict"
    return ",".join(
        mark for mask, mark in _QUALITY_MARKS if (bastardness & mask) != 0
    )


class ILemmerInfoPrinter(object):
    """
    Base interface for printers of lemmer analysis results.

    Subclasses override print_info to render a single record; print_infos
    renders a whole sequence of records followed by a blank line.
    """

    def print_info(self, info, out):
        """
        Write a single analysis record to `out`.

        info - one item of the sequence passed to print_infos
        out - output stream the record is written to

        Raises NotImplementedError: subclasses must override this method.
        """
        # NotImplemented is a non-callable sentinel used by rich comparisons;
        # "raise NotImplemented()" failed with TypeError instead of signalling
        # an abstract method, so the proper exception class is raised here.
        raise NotImplementedError()

    def print_infos(self, infos, out):
        """
        Write every record from `infos` to `out`, then an empty line.

        infos - iterable of records, each is passed to print_info
        out - output stream
        """
        for info in infos:
            self.print_info(info, out)
        # An empty line terminates the group of records.
        print >>out


class HumanReadable(ILemmerInfoPrinter):
    """Printer that writes every record as a series of labelled lines."""

    def print_info(self, info, out):
        """
        Write one record to `out` as "Label: value" lines plus a blank line.

        info - record exposing Lemma, Bastardness, Weight, LexicalFeature,
            FormFeature, Language, Form, RuleId, First and Last
        out - output stream
        """
        print >>out, "Lemma:", info.Lemma
        print >>out, "Bastardness:", get_qual(info.Bastardness)
        print >>out, "Lemma weight:", info.Weight
        print >>out, "Lemma features:", ",".join(info.LexicalFeature)
        # Features inside one group are comma-joined, groups are separated by spaces.
        print >>out, "Inflection features:", " ".join(",".join(group) for group in info.FormFeature)
        print >>out, "Language:", info.Language
        print >>out, "Form:", info.Form
        # The label's trailing space plus the print separator gives two spaces here.
        print >>out, "Rule ID: ", info.RuleId
        print >>out, "Span (in tokens):", info.First, info.Last

        # A blank line separates consecutive records.
        print >>out


class ComputerReadable(ILemmerInfoPrinter):
    """Printer that writes every record as a single tab-separated line."""

    # Columns: lemma, rule id, quality marks, lemma features,
    # inflection features, form, first, last, language.
    _ROW_TEMPLATE = u"{lemma}\t{rule_id}\t{qual}\t{stemgr}\t{flexgr}\t{form}\t{first}\t{last}\t{lang}"

    def print_info(self, info, out):
        """
        Write one record to `out` as a tab-separated line.

        info - record exposing Lemma, RuleId, Bastardness, LexicalFeature,
            FormFeature, Form, First, Last and Language
        out - output stream
        """
        columns = dict(
            lemma=info.Lemma,
            rule_id=info.RuleId,
            qual=get_qual(info.Bastardness),
            stemgr=",".join(info.LexicalFeature),
            # Features inside one group are comma-joined, groups are "|"-separated.
            flexgr="|".join(",".join(group) for group in info.FormFeature),
            form=info.Form,
            first=info.First,
            last=info.Last,
            lang=info.Language,
        )
        print >>out, self._ROW_TEMPLATE.format(**columns)


class Simple(ILemmerInfoPrinter):
    """Printer that writes a reduced tab-separated line per record."""

    # Columns: lemma, lemma features, inflection features, weight,
    # quality marks, language.
    _ROW_TEMPLATE = u"{lemma}\t{stemgr}\t{flexgr}\t{weight}\t{qual}\t{lang}"

    def print_info(self, info, out):
        """
        Write one record to `out` as a tab-separated line.

        info - record exposing Lemma, Bastardness, LexicalFeature, Weight,
            FormFeature and Language
        out - output stream
        """
        columns = dict(
            lemma=info.Lemma,
            qual=get_qual(info.Bastardness),
            stemgr=",".join(info.LexicalFeature),
            weight=info.Weight,
            # Features inside one group are comma-joined, groups are "|"-separated.
            flexgr="|".join(",".join(group) for group in info.FormFeature),
            lang=info.Language,
        )
        print >>out, self._ROW_TEMPLATE.format(**columns)


# ----------------------------------------------------
# --- Main section -----------------------------------
# ----------------------------------------------------
if __name__ == "__main__":
    # Command line: analysis languages, output format and a test-mode switch.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-l", "--langs", dest="langs", default="ru", help="languages, comma separated")
    parser.add_argument(
        "-f", "--format",
        dest="format",
        default="human",
        choices=["human", "comp", "simple"],
        help="output format",
    )
    parser.add_argument(
        "-t", "--test-mode",
        dest="test",
        action="store_true",
        help="test mode, supress logging messages, use all possible languages",
    )
    args = parser.parse_args()

    # Test mode raises the threshold to WARNING, hiding the info message below.
    log_level = logging.WARNING if args.test else logging.DEBUG
    logging.basicConfig(
        level=log_level,
        stream=sys.stderr,
        format=u'[%(asctime)s] %(levelname)s\t%(message)s',
    )
    logging.info("Reading from stdin...")

    # One printer per value accepted by --format.
    printers = {
        "human": HumanReadable(),
        "comp": ComputerReadable(),
        "simple": Simple(),
    }
    printer = printers.get(args.format)

    inp = sys.stdin
    out = codecs.getwriter("utf-8")(sys.stdout)
    with inp, out:
        # readline() returns an empty string at end of input, which stops the loop.
        for raw_line in iter(inp.readline, ""):
            word = raw_line.strip()
            if not word:
                # Blank lines are skipped.
                continue

            # The analyzer must be given a unicode string.
            word = word.decode("utf-8")

            if args.test:
                # Test mode: no language restriction is passed to the analyzer.
                infos = AnalyzeWord(word, split=True)
            else:
                infos = AnalyzeWord(word, split=True, langs=args.langs.split(","))

            printer.print_infos(infos, out)

