# -*- coding: utf-8 -*-

import logging
import sys
import os.path

import sandbox.common.types.client as ctc

from sandbox.common.errors import SandboxException
from sandbox.projects import resource_types
import sandbox.projects.yane.common as yane
from sandbox.sandboxsdk.parameters import SandboxStringParameter, LastReleasedResource
from sandbox.sandboxsdk.process import run_process
from sandbox.sandboxsdk.svn import Arcadia

# Base names (without extension) of the gazetteer files handled by this task.
# The '.gzt', '.bin' and '.gzt.gz' suffixes are appended where the names are used.
VCONTS_GZT_NAME = 'vconts'
RELWORDS_GZT_NAME = 'relwords'


class YaneParseRelwords(yane.YaneTaskBase):
    """
        Parses related words and generates MR-table
    """

    # Task type identifier.
    type = 'YANE_PARSE_RELWORDS'

    # tools - 5Gb, resource - 1Gb
    # NOTE(review): the value is presumably expressed in megabytes (i.e. ~20 GB) - confirm.
    execution_space = 20 * 1024
    # NOTE(review): presumably restricts execution to hosts tagged LINUX_PRECISE - confirm.
    client_tags = ctc.Tag.LINUX_PRECISE

    # Input parameter 'config': a YANE_CONFIG resource. do_execute loads it via
    # self.get_config('config') and uses its 'relwords' section.
    # NOTE(review): LastReleasedResource presumably defaults to the latest released
    # resource of the given type - confirm.
    class Config(LastReleasedResource):
        name = 'config'
        description = 'Config'
        resource_type = resource_types.YANE_CONFIG
        group = yane.GROUP_IN

    # Input parameter 'trie': resource with the trie of objects' external IDs.
    # do_execute syncs it and the value returned by sync_resource is passed to
    # relwordsextractor with the '-I' option.
    class ObjectsTrie(LastReleasedResource):
        name = 'trie'
        description = 'Objects external IDs trie'
        resource_type = resource_types.OTHER_RESOURCE
        group = yane.GROUP_IN

    # Input parameter 'descriptions': MR table with description words. It is fed to
    # corpusparser and the parsed text is then processed as the 'descr_words' source.
    # NOTE(review): 'db.NNN' in the default looks like a placeholder that has to be
    # replaced with a real database version - confirm.
    class SourceDescriptions(SandboxStringParameter):
        name = 'descriptions'
        description = 'Description words table'
        default_value = 'home/dict/yane/db.NNN/cards/descriptions'
        group = yane.GROUP_IN
        required = True

    # Input parameter 'clarifications': MR table with clarification words. It is fed
    # to corpusparser and the parsed text is then processed as the 'clar_words' source.
    # NOTE(review): 'db.NNN' in the default looks like a placeholder that has to be
    # replaced with a real database version - confirm.
    class SourceClarifications(SandboxStringParameter):
        name = 'clarifications'
        description = 'Clarification words table'
        default_value = 'home/dict/yane/db.NNN/cards/clarifications'
        group = yane.GROUP_IN
        required = True

    # Output parameter 'target': MR folder for the results. Per-source output is
    # written to <target>/rel_words, <target>/descr_words and <target>/clar_words,
    # and the merge step writes to <target> itself.
    # NOTE(review): 'db.NNN' in the default looks like a placeholder that has to be
    # replaced with a real database version - confirm.
    class Target(SandboxStringParameter):
        name = 'target'
        description = 'Target MR folder'
        default_value = 'home/dict/yane/db.NNN/relwords'
        group = yane.GROUP_OUT
        required = True

    # Full parameter list of the task: the base parameters from yane, then the
    # parameters declared above, then the MR parameters from yane.
    input_parameters = \
        yane.get_base_params().params + \
        [Config, ObjectsTrie, SourceDescriptions, SourceClarifications, Target] + \
        yane.get_mr_params().params

    def __init__(self, task_id=0):
        """
            Initialises the base task and stores a 'kill_timeout' of
            8 * 60 * 60 (eight hours' worth of seconds) in the task context.
        """
        yane.YaneTaskBase.__init__(self, task_id)
        seconds_per_hour = 60 * 60
        self.ctx['kill_timeout'] = 8 * seconds_per_hour

    def on_enqueue(self):
        """
            Creates the three output resources (vconts gzt, relwords gazetteer
            binary, relwords gazetteer source), remembers their ids in the task
            context and then delegates to the base class on_enqueue.
        """
        # (context key for the resource id, resource description, file name);
        # the resources are created in exactly this order.
        outputs = (
            ('vconts_id', 'v_conts gzt file', VCONTS_GZT_NAME + '.gzt'),
            ('gazetteer_id', 'Relwords gazetteer binary', RELWORDS_GZT_NAME + '.bin'),
            ('gazetteer_src_id', 'Relwords gazetteer source', RELWORDS_GZT_NAME + '.gzt.gz'),
        )
        for ctx_key, description, file_name in outputs:
            created = self._create_resource(description,
                                            file_name,
                                            resource_types.OTHER_RESOURCE,
                                            arch='any')
            self.ctx[ctx_key] = created.id

        yane.YaneTaskBase.on_enqueue(self)

    def _get_svn_path(self, yane_config, config_svn_key):
        context_key = '%s_path' % config_svn_key
        if context_key in self.ctx:
            data_path = self.ctx[context_key]
            if os.path.exists(data_path):
                return data_path
        data_path = self.abs_path(os.path.basename(yane_config[config_svn_key]))
        Arcadia.export(self.get_svn_path(yane_config[config_svn_key]), path=self.abs_path())
        self.ctx[context_key] = data_path
        return data_path

    def _run_relwordsextractor(self, sources, target, merge, **kwargs):
        args = ['-s', self.ctx['mr_server'],
                '-i', ','.join(sources),
                '-o', target,
                ]
        if merge:
            args.append('-m')
        else:
            args.extend(['-I', kwargs['trie'], '-t', kwargs['src_type'], '-S', kwargs['synonyms']])
        self.run_tool('relwordsextractor', args, self.get_mr_env())

    def _get_corpus_parsed_text(self, table):
        parts = os.path.split(table)
        local = '%s.parsed/text' % parts[-1]
        return os.path.join(parts[0], local)

    def _run_corpusparser(self, src_table, config, stopwords):
        output_table = self._get_corpus_parsed_text(src_table)
        args = ['-s', self.ctx['mr_server'],
                '-i', src_table,
                '-o', output_table,
                '-c', config,
                '-b', stopwords,
                ]
        self.run_tool('corpusparser', args, self.get_mr_env())

    def _parse(self, yane_config, trie, synonyms, src_type):
        self._run_relwordsextractor(yane_config[src_type],
                                    os.path.join(self.ctx['target'], src_type),
                                    False,
                                    trie=trie,
                                    synonyms=synonyms,
                                    src_type=src_type)

    def _parse_corpus(self, yane_config, trie, synonyms, src_table, src_type):
        self._run_corpusparser(src_table, self._get_svn_path(yane_config, 'corpus_parser_config'), self._get_svn_path(yane_config, 'corpus_parser_stopwords'))
        sources = (self._get_corpus_parsed_text(src_table),)
        self._run_relwordsextractor(sources,
                                    os.path.join(self.ctx['target'], src_type),
                                    False,
                                    trie=trie,
                                    synonyms=synonyms,
                                    src_type=src_type)

    @yane.run_once
    def _parse_rel_words(self, yane_config, trie, synonyms):
        """
            Pipeline step: extracts related words from the tables listed under
            the 'rel_words' key of yane_config.
        """
        self._parse(yane_config, trie, synonyms, src_type='rel_words')

    @yane.run_once
    def _parse_descr_words(self, yane_config, trie, synonyms):
        """
            Pipeline step: parses the 'descriptions' table from the task
            context and extracts words from it as the 'descr_words' source.
        """
        descriptions_table = self.ctx['descriptions']
        self._parse_corpus(yane_config, trie, synonyms, descriptions_table, 'descr_words')

    @yane.run_once
    def _parse_clar_words(self, yane_config, trie, synonyms):
        """
            Pipeline step: parses the 'clarifications' table from the task
            context and extracts words from it as the 'clar_words' source.
        """
        clarifications_table = self.ctx['clarifications']
        self._parse_corpus(yane_config, trie, synonyms, clarifications_table, 'clar_words')

    @yane.run_once
    def _merge(self):
        """
            Pipeline step: runs relwordsextractor in merge mode over the three
            per-source outputs (<target>/rel_words, <target>/descr_words,
            <target>/clar_words), writing the result to <target>.
        """
        target = self.ctx['target']
        partial_outputs = [
            os.path.join(target, src_type)
            for src_type in ('rel_words', 'descr_words', 'clar_words')
        ]
        self._run_relwordsextractor(partial_outputs, target, True)

    @yane.run_once
    def _run_gzt_compiler(self, yane_config):
        """
            Pipeline step: calls compile_gzt with the 'gzt_header' entry of
            yane_config, <target>/relwords.gzt, the relwords base name and the
            vconts gzt file name, then marks both gazetteer resources ready.
        """
        header = yane_config['gzt_header']
        relwords_gzt = os.path.join(self.ctx['target'], RELWORDS_GZT_NAME + '.gzt')
        vconts_gzt = VCONTS_GZT_NAME + ".gzt"
        self.compile_gzt(header, relwords_gzt, RELWORDS_GZT_NAME, vconts_gzt)

        for ctx_key in ('gazetteer_id', 'gazetteer_src_id'):
            self.mark_resource_ready(self.ctx[ctx_key])

    @yane.run_once
    def _run_vconts(self, yane_config):
        """
            Pipeline step: exports the votingcontexts scripts from Arcadia, runs
            vconts.bash to produce the vconts gzt file in the task directory and
            marks the 'vconts_id' resource ready.

            yane_config must contain a "vconts" section with the "markup" and
            "mr_aggregated_queries" entries.

            Raises SandboxException if the "vconts" section is missing or if the
            script run fails; in the latter case the message consists of the
            original error text followed by the captured script log.
        """
        if "vconts" not in yane_config:
            raise SandboxException("no vconts section in yane config")

        scripts_dir = self.abs_path('votingcontexts')
        Arcadia.export(url=self.get_svn_path("arcadia/dict/tools/ner/features/votingcontexts"), path=scripts_dir)

        # The directory of the running interpreter is put in front of the synced
        # tools resource in the value passed to the script as '-B'.
        python_dir = os.path.dirname(sys.executable)
        bin_path = python_dir + ":" + self.sync_resource(self.ctx['tools'])
        markup = yane_config["vconts"]["markup"]
        mr_bin = self.get_tool("mapreduce-yt")
        mr_table = yane_config["vconts"]["mr_aggregated_queries"]
        mr_prefix = os.path.join(self.ctx['target'], markup)
        mr_server = self.ctx['mr_server']
        mr_user = self.ctx['mr_user']

        lang = "ru"

        # NOTE(review): the values are interpolated into the command line without
        # quoting, so they must not contain whitespace or shell metacharacters.
        cmd = (
            "bash -x " +
            os.path.join(scripts_dir, "vconts.bash") +
            " -B {} -M {} -s {} -u {} -i {} -l {} -p {} -q {} -o {}".format(
                bin_path, mr_bin, mr_server, mr_user, markup, lang, mr_prefix, mr_table,
                os.path.join(self.abs_path(), VCONTS_GZT_NAME + ".gzt")
            )
        )

        logging.info("vconts cmd: %s", cmd)

        log_filename = self.abs_path('vconts_log.txt')

        try:
            with open(log_filename, "w") as log:
                run_process(cmd, log_prefix="vconts", stdout=log, stderr=log, wait=True, work_dir=scripts_dir, environment=self.get_mr_env())
        except Exception as e:
            # str(e) instead of e.message: the latter is deprecated, is empty for
            # exceptions built from several arguments and does not exist on
            # Python 3, where it would hide the real failure behind AttributeError.
            with open(log_filename, "r") as log:
                m = str(e) + ": " + log.read()

            logging.error('vconts failed: %s', m)
            raise SandboxException(m)

        with open(log_filename, "r") as log:
            logging.debug("run vconts logs: %s", log.read())

        self.mark_resource_ready(self.ctx['vconts_id'])

    def do_execute(self):
        """
            Runs the whole pipeline: the three word-parsing steps, the merge,
            vconts generation and finally gazetteer compilation.
        """
        trie_path = self.sync_resource(self.ctx['trie'])
        relwords_config = self.get_config('config')['relwords']
        synonyms_path = self._get_svn_path(relwords_config, 'synonyms')

        parse_steps = (
            self._parse_rel_words,
            self._parse_descr_words,
            self._parse_clar_words,
        )
        for parse_step in parse_steps:
            parse_step(relwords_config, trie_path, synonyms_path)

        self._merge()
        # vconts runs before the gzt compiler, which is given the vconts gzt file name.
        self._run_vconts(relwords_config)
        self._run_gzt_compiler(relwords_config)


# Module-level alias for the task class.
# NOTE(review): presumably the name the Sandbox task loader looks up - confirm.
__Task__ = YaneParseRelwords
