# -*- coding: utf-8 -*-

import os

import sandbox.common.types.client as ctc
from sandbox.projects import resource_types
import sandbox.projects.yane.common as yane
from sandbox.sandboxsdk.parameters import SandboxStringParameter, SandboxBoolParameter, LastReleasedResource
from sandbox.sandboxsdk.svn import Arcadia

# File names of the artifacts produced by YaneParseOntoDB.
# File name of the 'External IDs trie' resource; also passed to compile_trie().
EXTERNAL_IDS_TRIE_NAME = 'object.ids.trie'
# Base name of the synonyms gazetteer: the task registers '<name>.bin' and
# '<name>.gzt.gz' resources and passes the base name to compile_gzt().
SYNONYMS_GZT_NAME = 'synonyms'
# File name of the 'Objects proto-data' resource (light-data mode only).
OBJECT_DATA_NAME = 'object.data.bin'
# File name of the 'Synonyms proto-data' resource (light-data mode only).
SYNONYM_DATA_NAME = 'synonym.data.bin'


class YaneParseOntoDB(yane.YaneTaskBase):
    """
        Parses Onto DB and creates initial data for it. Compiles external IDs trie and synonyms gazetteer.

        Two modes are selected by the ``light_data`` parameter:

        * regular mode: cardsparser writes into the ``target`` MR folder and the
          external IDs trie plus the synonyms gazetteer are built;
        * light mode: cardsparser writes task-prefixed tables under ``tmp/yane``,
          the gazetteer is built, object/synonym proto-data is downloaded and
          the prefixed tables are passed to ``mr_rm`` at the end.
    """

    type = 'YANE_PARSE_ONTODB'

    # Disk budget: Arcadia - 16Gb, tools - 5Gb, resource - 3Gb
    execution_space = 25 * 1024

    class Config(LastReleasedResource):
        # YANE config resource; read in the task via self.get_config('config').
        name = 'config'
        description = 'Config'
        resource_type = resource_types.YANE_CONFIG
        group = yane.GROUP_IN

    class SourceTable(SandboxStringParameter):
        # Input table handed to cardsparser through the '-i' option.
        name = 'source_ontodb'
        description = 'Source table'
        default_value = 'home/dict/ontodb/ver/main/production/all_cards_final'
        group = yane.GROUP_IN
        required = True

    class PredefinedIds(LastReleasedResource):
        # Optional resource; when set it is synced and passed to cardsparser as '-I'.
        name = 'predefined_ids'
        description = 'Predefined external ids'
        group = yane.GROUP_IN
        resource_type = resource_types.YANE_TSV
        required = False

    class LightData(SandboxBoolParameter):
        # Switches the task between regular and light mode.
        name = 'light_data'
        description = 'Create light data using temporary tables'
        default_value = False
        group = yane.GROUP_OUT
        required = True
        # The 'target' field is tied to the 'false' value of this flag.
        sub_fields = {'false': ['target']}

    class TargetFolder(SandboxStringParameter):
        # Output MR folder; only used when light_data is off.
        name = 'target'
        description = 'Target MR folder'
        default_value = 'home/dict/yane/db.NNN/cards'
        group = yane.GROUP_OUT
        required = True

    class QueryrecPath(SandboxStringParameter):
        # NOTE(review): this class never reads ctx['queryrec_path'];
        # _run_cardsparser takes the queryrec path from the YANE config instead.
        # Confirm whether the base class or another consumer relies on it.
        name = 'queryrec_path'
        description = 'Path to queryrecognizer files'
        group = yane.GROUP_IN
        required = True

    input_parameters = \
        yane.get_base_params().params + \
        [Config, SourceTable, PredefinedIds, LightData, TargetFolder, QueryrecPath] + \
        yane.get_mr_params().params

    # All MR-clusters are linux-only
    client_tags = ctc.Tag.LINUX_PRECISE

    def __init__(self, task_id=0):
        """Initialise the base task and set ctx['kill_timeout'] (presumably seconds, i.e. 8 hours)."""
        yane.YaneTaskBase.__init__(self, task_id)
        kill_timeout = 8 * 60 * 60
        self.ctx['kill_timeout'] = kill_timeout

    def on_enqueue(self):
        """
            Declare the output resources and remember their ids in the context.

            The gazetteer binary and source are always declared. Light mode adds
            the object/synonym proto-data resources, regular mode adds the
            external IDs trie. The base-class hook is called last.
        """
        def declare(ctx_key, description, file_name):
            # Create one OTHER_RESOURCE output and store its id under ctx_key.
            resource = self._create_resource(description,
                                             file_name,
                                             resource_types.OTHER_RESOURCE,
                                             arch='any')
            self.ctx[ctx_key] = resource.id

        declare('gazetteer_id', 'Synonyms gazetteer binary', SYNONYMS_GZT_NAME + '.bin')
        declare('gazetteer_src_id', 'Synonyms gazetteer source', SYNONYMS_GZT_NAME + '.gzt.gz')

        if not self.ctx.get('light_data', False):
            declare('trie_id', 'External IDs trie', EXTERNAL_IDS_TRIE_NAME)
        else:
            declare('object_data_id', 'Objects proto-data', OBJECT_DATA_NAME)
            declare('synonym_data_id', 'Synonyms proto-data', SYNONYM_DATA_NAME)

        yane.YaneTaskBase.on_enqueue(self)

    def _get_target(self):
        """Return the MR output folder: 'tmp/yane' in light mode, ctx['target'] otherwise."""
        if self.ctx.get('light_data', False):
            return os.path.join('tmp', 'yane')
        return self.ctx['target']

    def _get_prefix(self):
        """Return the table-name prefix: '<task id>_' in light mode, empty string otherwise."""
        if self.ctx.get('light_data', False):
            return "{}_".format(self.id)
        return ''

    def _get_table_path(self, table_name):
        """Return the path of ``table_name`` inside the target folder, with the mode prefix applied."""
        folder = self._get_target()
        prefixed_name = self._get_prefix() + table_name
        return os.path.join(folder, prefixed_name)

    @yane.run_once
    def _run_cardsparser(self):
        """
            Check out the inputs named in the 'ontodb' config section and run cardsparser.

            The directory of the 'config' entry is always checked out into ./cfg.
            'lda_topics' and 'queryrec_path' are optional and go into ./lda and
            ./queryrec respectively. The predefined ids resource and the table
            prefix are passed only when they are set.
        """
        def checkout_once(svn_path, local_dir):
            # An already existing directory is treated as "checked out before".
            if os.path.exists(local_dir):
                return
            os.mkdir(local_dir)
            Arcadia.checkout(self.get_svn_path(svn_path), path=local_dir)

        ontodb_cfg = self.get_config('config')['ontodb']

        cfg_svn_path, cfg_name = os.path.split(ontodb_cfg['config'])
        checkout_once(cfg_svn_path, 'cfg')

        args = [
            '-s', self.ctx['mr_server'],
            '-i', self.ctx['source_ontodb'],
            '-o', self._get_target(),
            '-c', os.path.join('cfg', cfg_name),
        ]

        if 'lda_topics' in ontodb_cfg:
            lda_svn_path, lda_name = os.path.split(ontodb_cfg['lda_topics'])
            checkout_once(lda_svn_path, 'lda')
            args += ['-l', os.path.join('lda', lda_name)]

        if 'queryrec_path' in ontodb_cfg:
            # Unlike cfg/lda, the whole checked-out directory is passed to the tool.
            checkout_once(ontodb_cfg['queryrec_path'], 'queryrec')
            args += ['-Q', 'queryrec']

        predefined_ids = self.ctx.get('predefined_ids')
        if predefined_ids:
            args += ['-I', self.sync_resource(predefined_ids)]

        table_prefix = self._get_prefix()
        if table_prefix:
            args += ['-p', table_prefix]

        self.run_tool('cardsparser', args, self.get_mr_env())

    @yane.run_once
    def _run_trie_compiler(self):
        """Compile the external IDs trie and mark its resource ready; does nothing in light mode."""
        if not self.ctx.get('light_data', False):
            ids_table = self._get_table_path('object.external.ids')
            self.compile_trie(ids_table, EXTERNAL_IDS_TRIE_NAME)
            self.mark_resource_ready(self.ctx['trie_id'])

    @yane.run_once
    def _run_gzt_compiler(self):
        """Compile the synonyms gazetteer and mark both gazetteer resources ready."""
        gzt_header = self.get_config('config')['ontodb']['gzt_header']
        synonym_table = self._get_table_path('synonym.gzt')
        self.compile_gzt(gzt_header, synonym_table, SYNONYMS_GZT_NAME)

        for ctx_key in ('gazetteer_id', 'gazetteer_src_id'):
            self.mark_resource_ready(self.ctx[ctx_key])

    @yane.run_once
    def _run_proto_download(self):
        """
            Download object and synonym proto-data and mark their resources ready.

            Only runs in light mode. Both downloads are started with wait=False
            and then awaited together.
        """
        if not self.ctx.get('light_data', False):
            return

        data_cfg = self.get_config('config')['data']
        max_related_obj_count = data_cfg.get('max_relobjects', None)

        downloads = [
            self.download_proto_data(OBJECT_DATA_NAME,
                                     'object',
                                     [(self._get_target(), 'json')],
                                     prefix=self._get_prefix(),
                                     max_related_obj_count=max_related_obj_count,
                                     wait=False),
            self.download_proto_data(SYNONYM_DATA_NAME,
                                     'synonym',
                                     [(self._get_target(), 'json')],
                                     prefix=self._get_prefix(),
                                     wait=False),
        ]
        self.wait_processes(downloads)

        for ctx_key in ('object_data_id', 'synonym_data_id'):
            self.mark_resource_ready(self.ctx[ctx_key])

    def do_execute(self):
        """
            Run all pipeline steps in order, then clean up in light mode.

            In light mode the task-prefixed path is passed to 'mr_rm'
            (presumably to drop the temporary tables - confirm '-p' semantics).
        """
        pipeline = (
            self._run_cardsparser,
            self._run_trie_compiler,
            self._run_gzt_compiler,
            self._run_proto_download,
        )
        for step in pipeline:
            step()

        if not self.ctx.get('light_data', False):
            return

        # An empty table name yields '<target>/<prefix>', i.e. the bare prefix path.
        cleanup_args = ['-s', self.ctx['mr_server'], '-v', '-p', self._get_table_path('')]
        self.run_tool('mr_rm', cleanup_args, self.get_mr_env())

# Module-level alias of the task class (presumably the name the Sandbox task
# loader looks up to discover the task - confirm against the loader).
__Task__ = YaneParseOntoDB
