# -*- coding: utf-8 -*-

import os
import os.path

import sandbox.common.types.client as ctc

from sandbox.projects import resource_types
import sandbox.projects.yane.common as yane
from sandbox.sandboxsdk.parameters import SandboxStringParameter, LastReleasedResource, SandboxBoolParameter
from sandbox.sandboxsdk.paths import make_folder, copy_path, remove_path, add_write_permissions_for_path
from sandbox.sandboxsdk.svn import Arcadia


class YanePrepareData(yane.YaneTaskBase):
    """
        Creates Yane data using OntoDB.

        Stages, in the order do_execute() runs them:
          1. prepare_config()    - resolve the config resource (selected or taken from svn);
          2. _run_parse_tasks()  - spawn the parse subtasks and wait for them;
          3. _calc_stats()       - build a temporary data set and run the
                                   corpusfeatures / querylogfeatures tools over it;
          4. assemble 'ner_data' and 'ner_data/wizdata', then mark the YANE_DATA
             resource registered in on_enqueue() as ready.

        Stages 2 and 3 record a marker in self.ctx ('parse_tasks', 'calc_stat') and
        return immediately when that marker is already present, so they are not
        repeated if do_execute() is entered again.
    """

    type = 'YANE_PREPARE_DATA'

    # NOTE(review): unit is presumably megabytes (60 GB) - confirm against the Sandbox SDK.
    execution_space = 60 * 1024
    client_tags = ctc.Tag.LINUX_PRECISE

    # Sub-folders of the 'target' MR folder written by the parse subtasks, each paired
    # with the value _create_data() forwards to download_proto_data() as the second
    # element of a source tuple (looks like a format name).
    _PARSED_OBJECT_SOURCES = (
        ('cards', 'json'),
        ('freebase', 'proto'),
        ('relwords', 'proto'),
        ('relobject', 'proto'),
    )
    # Sub-folders of the 'target' MR folder written by _run_corpusfeatures() and
    # _run_querylogfeatures() (see their '-o' arguments).
    _STAT_SOURCES = (
        ('corpus', 'proto'),
        ('querylog', 'proto'),
    )

    class BaseType(SandboxStringParameter):
        # Forwarded to the YANE_PARSE_ONTODB subtask; 'main' is the only choice offered.
        name = 'base_type'
        description = 'Base type'
        required = True
        group = yane.GROUP_IN
        default_value = 'main'
        choices = [('main', 'main')]

    class EnableBorsches(SandboxBoolParameter):
        # When set, prepare_config() picks 'ru.borsch.config' instead of 'ru.config'.
        name = 'with_borsches'
        description = 'Enable borsches'
        default_value = False

    class Config(LastReleasedResource):
        # Optional; when nothing is selected the config is checked out from svn.
        name = 'config'
        description = 'Config (leave blank to get from svn)'
        group = yane.GROUP_IN
        resource_type = resource_types.YANE_CONFIG
        required = False

    class SourceOntoDB(SandboxStringParameter):
        # Forwarded to the YANE_PARSE_ONTODB subtask.
        name = 'source_ontodb'
        description = 'Source Onto DB table'
        default_value = 'home/dict/ontodb/ver/main/production/all_cards_final'
        group = yane.GROUP_IN
        required = True

    class PredefinedIds(LastReleasedResource):
        # Optional; forwarded to the YANE_PARSE_ONTODB subtask when present in the context.
        name = 'predefined_ids'
        description = 'Predefined external ids'
        group = yane.GROUP_IN
        resource_type = resource_types.YANE_TSV
        required = False

    class TargetFolder(SandboxStringParameter):
        # Root MR folder; every subtask and tool writes into a sub-folder of it.
        name = 'target'
        description = 'Target MR folder'
        default_value = 'home/dict/yane/db.NNN'
        group = yane.GROUP_OUT
        required = True

    input_parameters = yane.get_base_params().params + \
        [BaseType, SourceOntoDB, PredefinedIds, TargetFolder, EnableBorsches, Config] + \
        yane.get_mr_params().params

    def __init__(self, task_id=0):
        """Initialise the base task and set 'kill_timeout' to 48 * 60 * 60."""
        yane.YaneTaskBase.__init__(self, task_id)
        # NOTE(review): presumably seconds, i.e. 48 hours - confirm against the Sandbox SDK.
        self.ctx['kill_timeout'] = 48 * 60 * 60

    def on_enqueue(self):
        """Register the output YANE_DATA resource and remember its id in ctx['data_id']."""
        resource = self._create_resource(self.descr,
                                         'ner_data',
                                         resource_types.YANE_DATA,
                                         arch='any')
        self.ctx['data_id'] = resource.id
        yane.YaneTaskBase.on_enqueue(self)

    def make_sub_ctx(self, copy_params=()):
        """
            Build the input parameters shared by all subtasks.

            :param copy_params: iterable of extra context keys; each key that is
                                present in self.ctx is copied into the result,
                                missing keys are silently skipped
            :return: a new dict of subtask input parameters
        """
        # The default is an immutable tuple rather than a list so that no state
        # can be shared between calls through the default argument.
        params = {
            'svn_url': self.ctx['svn_url'],
            'tools': self.ctx['tools'],
            'config': self.ctx['config'],
            'mr_server': self.ctx['mr_server'],
            'mr_runtime': self.ctx['mr_runtime'],
            'mr_user': self.ctx['mr_user'],
            'notify_via': '',
            'notify_if_finished': '',
            'notify_if_failed': self.author,
            'execution_space': self.execution_space
        }
        params.update((key, self.ctx[key]) for key in copy_params if key in self.ctx)
        return params

    def prepare_config(self):
        """
            Make sure ctx['config'] points at a config resource and load it.

            When no 'config' resource is selected, the config file is checked out
            from svn, published as a new YANE_CONFIG resource and stored in
            ctx['config'].

            :return: whatever self.get_config('config') returns
        """
        if not self.is_resource_selected('config'):
            # Use the config from svn: "ru.config", or "ru.borsch.config" with borsches enabled.
            Arcadia.checkout(self.get_svn_path('data/dict/ner/fcalcersdata'), path=self.abs_path(), depth='files')
            if self.ctx.get('with_borsches', False):
                config_file_name = 'ru.borsch.config'
            else:
                config_file_name = 'ru.config'

            res = self.create_resource(
                self.descr + '-config',
                resource_path=os.path.join(self.abs_path(), config_file_name),
                resource_type=resource_types.YANE_CONFIG,
                arch='any'
            )
            self.mark_resource_ready(res.id)
            self.ctx['config'] = res.id

        return self.get_config('config')

    def _spawn_parse_subtask(self, task_type, title, params):
        """
            Create one parse subtask with the description '<title> (task:<own id>)'.

            :param task_type: subtask type name, e.g. 'YANE_PARSE_ONTODB'
            :param title: human readable prefix of the subtask description
            :param params: input parameters dict for the subtask
            :return: the object returned by self.create_subtask()
        """
        return self.create_subtask(task_type,
                                   '{} (task:{})'.format(title, self.id),
                                   input_parameters=params,
                                   arch=self.arch)

    def _mr_target(self, *parts):
        """Join path components below the 'target' MR folder taken from the context."""
        return os.path.join(self.ctx['target'], *parts)

    def _run_parse_ontodb(self):
        """Spawn YANE_PARSE_ONTODB writing to '<target>/cards'; return the subtask."""
        params = self.make_sub_ctx(['base_type', 'source_ontodb', 'predefined_ids'])
        params['target'] = self._mr_target('cards')
        params['light_data'] = False
        return self._spawn_parse_subtask('YANE_PARSE_ONTODB', 'Parse OntoDB', params)

    def _run_parse_freebase(self, trie_id):
        """Spawn YANE_PARSE_FREEBASE writing to '<target>/freebase'; return the subtask."""
        params = self.make_sub_ctx()
        params['trie'] = trie_id
        params['target'] = self._mr_target('freebase')
        return self._spawn_parse_subtask('YANE_PARSE_FREEBASE', 'Parse Freebase', params)

    def _run_parse_wikilinks(self, trie_id):
        """
            Spawn YANE_PARSE_WIKILINKS reading '<target>/cards/related.objects' and
            writing to '<target>/relobject'; return the subtask.
        """
        params = self.make_sub_ctx()
        params['trie'] = trie_id
        params['source_onto'] = self._mr_target('cards', 'related.objects')
        params['target'] = self._mr_target('relobject')
        return self._spawn_parse_subtask('YANE_PARSE_WIKILINKS', 'Parse Wikilinks', params)

    def _run_parse_relwords(self, trie_id):
        """
            Spawn YANE_PARSE_RELWORDS reading the 'descriptions' and 'clarifications'
            tables under '<target>/cards' and writing to '<target>/relwords';
            return the subtask.
        """
        params = self.make_sub_ctx()
        params['trie'] = trie_id
        params['target'] = self._mr_target('relwords')
        params['descriptions'] = self._mr_target('cards', 'descriptions')
        params['clarifications'] = self._mr_target('cards', 'clarifications')
        return self._spawn_parse_subtask('YANE_PARSE_RELWORDS', 'Parse Relwords', params)

    def _run_parse_relurls(self, trie_id):
        """Spawn YANE_PARSE_RELURLS writing to '<target>/relurls/urls'; return the subtask."""
        params = self.make_sub_ctx()
        params['trie'] = trie_id
        params['target'] = self._mr_target('relurls', 'urls')
        return self._spawn_parse_subtask('YANE_PARSE_RELURLS', 'Parse Relurls', params)

    def _run_parse_tasks(self):
        """
            Spawn all parse subtasks once and wait for them.

            Does nothing when ctx['parse_tasks'] already exists. Otherwise stores
            the subtask ids in ctx['parse_tasks'] and the resource ids read from
            the subtasks' contexts in ctx['intermediate_res'], then calls
            wait_tasks() on the spawned subtasks.
        """
        if 'parse_tasks' in self.ctx:
            return

        parse_tasks = []
        resources = []

        # The OntoDB subtask goes first: its trie resource is an input of all the others.
        ontodb = self._run_parse_ontodb()
        parse_tasks.append(ontodb.id)
        trie_id = ontodb.ctx['trie_id']
        resources.extend([trie_id, ontodb.ctx['gazetteer_id'], ontodb.ctx['gazetteer_src_id']])

        freebase = self._run_parse_freebase(trie_id)
        parse_tasks.append(freebase.id)

        wikilinks = self._run_parse_wikilinks(trie_id)
        parse_tasks.append(wikilinks.id)

        relwords = self._run_parse_relwords(trie_id)
        parse_tasks.append(relwords.id)
        resources.extend([relwords.ctx['gazetteer_id'], relwords.ctx['gazetteer_src_id']])

        relurls = self._run_parse_relurls(trie_id)
        parse_tasks.append(relurls.id)
        resources.append(relurls.ctx['trie_id'])

        self.ctx['intermediate_res'] = resources
        self.ctx['parse_tasks'] = parse_tasks
        self.wait_tasks(parse_tasks, tuple(self.Status.Group.FINISH), True)

    def _create_data(self, dir_name, attrs, proto_only=False):
        """
            Fill a local folder with data files downloaded from the target MR folder.

            :param dir_name: folder to create via make_folder()
            :param attrs: list of dicts, one per output file, with keys
                          'name' (file name inside the folder), 'type',
                          'sources' (iterable of (sub-folder, format) pairs relative
                          to ctx['target']) and the optional 'query-freq'
                          (defaults to True), 'cache' and 'max_relobjects'
            :param proto_only: when False, every resource listed in
                               ctx['intermediate_res'] is additionally synced and
                               copied into the folder
            :return: the value returned by make_folder() for dir_name
        """
        add_write_permissions_for_path(self.abs_path())
        # NOTE(review): the second argument presumably asks for an existing folder
        # to be emptied - confirm against sandboxsdk.paths.make_folder.
        target = make_folder(dir_name, True)
        if not proto_only:
            assert 'intermediate_res' in self.ctx, \
                "'intermediate_res' is missing from the task context"
            for res_id in self.ctx['intermediate_res']:
                copy_path(self.sync_resource(res_id), target)

        # Start all downloads without waiting (wait=False) and wait for them together below.
        proc = []
        for attr in attrs:
            sources = [
                (os.path.join(self.ctx['target'], folder_and_format[0]), folder_and_format[1])
                for folder_and_format in attr['sources']
            ]
            proc.append(
                self.download_proto_data(
                    os.path.join(target, attr['name']),
                    attr['type'],
                    sources,
                    useQueryFreq=attr.get('query-freq', True),
                    cache=attr.get('cache'),
                    max_related_obj_count=attr.get('max_relobjects', None),
                    wait=False
                )
            )
        self.wait_processes(proc)
        return target

    def _run_corpusfeatures(self, ner_data, config, human_readable=False):
        """
            Launch the 'corpusfeatures' tool writing to '<target>/corpus'.

            :param ner_data: local data folder passed to the tool as '-d'
            :param config: config providing ['corpus']['source'] and 'lemmatization_lang'
            :param human_readable: when True the '-H' flag is appended
            :return: the value returned by self.run_tool()
        """
        params = [
            '-s', self.ctx['mr_server'],
            '-c', config['corpus']['source'],
            '-d', ner_data,
            '-l', ','.join(config['lemmatization_lang']),
            '-o', os.path.join(self.ctx['target'], 'corpus')
        ]
        if human_readable:
            params.append('-H')

        return self.run_tool('corpusfeatures', params, self.get_mr_env(), False)

    def _run_querylogfeatures(self, ner_data, config):
        """
            Launch the 'querylogfeatures' tool writing to '<target>/querylog'.

            :param ner_data: local data folder passed to the tool as '-d'
            :param config: config providing ['clicks']['source'] and 'lemmatization_lang'
            :return: the value returned by self.run_tool()
        """
        params = [
            '-s', self.ctx['mr_server'],
            '-i', config['clicks']['source'],
            '-d', ner_data,
            '-l', ','.join(config['lemmatization_lang']),
            '-o', os.path.join(self.ctx['target'], 'querylog'),
        ]
        return self.run_tool('querylogfeatures', params, self.get_mr_env(), False)

    def _calc_stats(self, config):
        """
            Run the corpus and query-log feature tools over a temporary data set.

            Does nothing when ctx['calc_stat'] is truthy. Otherwise checks the parse
            subtasks, builds 'tmp_data' from the parsed sources, runs both tools,
            removes the temporary folder and sets ctx['calc_stat'] to True.

            :param config: config as returned by prepare_config()
        """
        if self.ctx.get('calc_stat', False):
            return

        self.check_tasks('parse_tasks', 'Parsing data')
        attrs = [
            {
                'type': 'object',
                'sources': list(self._PARSED_OBJECT_SOURCES),
                'name': 'object.data.bin',
                'max_relobjects': config['data'].get('max_relobjects', None),
            },
            {
                'type': 'synonym',
                'sources': [('cards', 'json')],
                'name': 'synonym.data.bin',
            },
        ]
        ner_data = self._create_data('tmp_data', attrs)

        # Both tools are launched first and then awaited together.
        proc = [
            self._run_corpusfeatures(ner_data, config),
            self._run_querylogfeatures(ner_data, config)
        ]
        self.wait_processes(proc)
        remove_path(ner_data)

        self.ctx['calc_stat'] = True

    def do_execute(self):
        """
            Run the whole pipeline and publish the resulting data resource.

            Builds the 'ner_data' folder (object, synonym and metaobject files plus
            the intermediate resources and 'phrase.model.rus.bin'), then the
            'ner_data/wizdata' folder with an object file limited to zero related
            objects, and finally marks the resource ctx['data_id'] as ready.
        """
        config = self.prepare_config()
        self._run_parse_tasks()

        self._calc_stats(config)

        data_config = config['data']
        use_query_freq = data_config['freq_type'] == 'query'
        object_cache = data_config['object_cache']

        # The final data additionally includes the statistics produced by _calc_stats().
        object_data_sources = list(self._PARSED_OBJECT_SOURCES + self._STAT_SOURCES)
        synonym_data_sources = [('cards', 'json')] + list(self._STAT_SOURCES)

        target = self._create_data(
            'ner_data',
            [
                {
                    'type': 'object',
                    'sources': object_data_sources,
                    'name': 'object.data.bin',
                    'query-freq': use_query_freq,
                    'cache': object_cache,
                    'max_relobjects': data_config.get('max_relobjects', None),
                },
                {
                    'type': 'synonym',
                    'sources': synonym_data_sources,
                    'name': 'synonym.data.bin',
                    'query-freq': use_query_freq,
                    'cache': data_config['synonym_cache'],
                },
                {
                    'type': 'metaobject',
                    'sources': [('cards', 'json')],
                    'name': 'object.metadata.bin',
                },
            ]
        )

        # The LDA model file is exported from svn and shipped alongside the data.
        Arcadia.export(self.get_svn_path('arcadia_tests_data/wizard/reqtopicsclassifier/phrase.model.rus.bin'), path=self.abs_path())
        copy_path(os.path.join(self.abs_path(), "phrase.model.rus.bin"), target)

        # 'wizdata' holds only the object file, built with related objects limited to 0
        # and without the intermediate resources (proto_only=True).
        self._create_data(
            os.path.join('ner_data', 'wizdata'),
            [
                {
                    'type': 'object',
                    'sources': object_data_sources,
                    'name': 'object.data.bin',
                    'query-freq': use_query_freq,
                    'cache': object_cache,
                    'max_relobjects': 0,
                },
            ],
            proto_only=True
        )

        self.mark_resource_ready(self.ctx['data_id'])

# Module-level alias exposing the task class under the name `__Task__`.
# NOTE(review): presumably this is the name the Sandbox task loader looks up - confirm.
__Task__ = YanePrepareData
