# -*- coding: utf-8 -*-

import codecs
import json
import os
import os.path
import shutil
import logging
import subprocess as sp
from xml.dom.minidom import Node, parse

from sandbox import sdk2
from sandbox.projects import resource_types
from sandbox.projects.geosearch import resource_types as geo_types
from sandbox.projects.geosearch.tools.database_notifications import notify_by_telegram
from sandbox.projects.common.nanny import nanny
from sandbox.sandboxsdk import environments
import sandbox.common.types.client as ctc

from sandbox.projects.UpdateMapsWizardPpoData import inner_org_gzt_builder


def export_yt_json(client, yt_path, local_path):
    """Dump a YT table into a local file as newline-delimited JSON.

    Every row produced by ``client.read_table(yt_path)`` gets its
    ``source_proto`` field overwritten with an empty string and is then
    written as one JSON document per line.

    :param client: object exposing ``read_table(path)``; the rows it yields
        must support item assignment and be serialisable by ``json.dumps``.
    :param yt_path: path of the table to read.
    :param local_path: local file to create or overwrite.
    """
    with open(local_path, 'w') as f:
        for row in client.read_table(yt_path):
            # The field is blanked rather than removed, so the key is still
            # present in every exported row.
            row['source_proto'] = ''
            # write() behaves the same on Python 2 and 3, unlike the
            # Python-2-only ``print >>f`` statement.
            f.write(json.dumps(row) + '\n')


def get_name(textNode):
    """Turn a localized text DOM element into a name record.

    An element whose tag is ``searchText`` is reported as a ``synonym``;
    any other tag is reported as ``main``. The locale comes from the
    ``xml:lang`` attribute and the value from the element's first child node.

    :param textNode: DOM element with an ``xml:lang`` attribute and a first
        child carrying the text.
    :return: dict with ``type`` and a nested ``value`` of ``locale``/``value``.
    """
    if textNode.nodeName == 'searchText':
        kind = 'synonym'
    else:
        kind = 'main'

    localized = {
        'locale': textNode.attributes['xml:lang'].nodeValue,
        'value': textNode.firstChild.nodeValue,
    }
    return {'type': kind, 'value': localized}


def parse_simple_feature(node, obj):
    """Collect the names of a non-enum feature.

    Each element child of ``node`` is converted with ``get_name`` and
    appended to ``obj['names']``; non-element children (text, comments)
    are skipped.

    :param node: DOM element describing the feature.
    :param obj: feature dict being filled in; mutated in place.
    """
    element_children = (
        candidate for candidate in node.childNodes
        if candidate.nodeType == Node.ELEMENT_NODE
    )
    for element in element_children:
        obj['names'].append(get_name(element))


def parse_enum_feature(node, obj):
    """Collect the enum values of an enum feature.

    For every ``Value`` element child of ``node`` a record holding its ``id``
    attribute and the names of its element children (via ``get_name``) is
    appended to ``obj['enum_values']``. All other children are ignored.

    :param node: DOM element describing the feature.
    :param obj: feature dict being filled in; mutated in place.
    """
    for value_node in node.childNodes:
        is_value_element = (
            value_node.nodeType == Node.ELEMENT_NODE
            and value_node.nodeName == 'Value'
        )
        if not is_value_element:
            continue

        # The id is read before the names so a missing attribute fails first.
        value_id = value_node.attributes['id'].nodeValue
        names = []
        for name_node in value_node.childNodes:
            if name_node.nodeType == Node.ELEMENT_NODE:
                names.append(get_name(name_node))

        obj['enum_values'].append({'value_id': value_id, 'names': names})


def xml2json(input_file, output_file):
    """Convert a features XML file into newline-delimited JSON.

    Every element child of the document's root element becomes one JSON
    line with the keys ``permalink``, ``value_type``, ``fast_feature``,
    ``display_mode``, ``names`` and ``enum_values``. Features whose ``type``
    attribute is ``enum`` are filled by ``parse_enum_feature``; all others
    by ``parse_simple_feature``.

    :param input_file: path (or file object) accepted by ``minidom.parse``.
    :param output_file: path of the UTF-8 file to create or overwrite.
    :raises KeyError: if a feature element lacks one of the required
        attributes (``id``, ``type``, ``fastFeature``, ``displayMode``).
    """
    dom = parse(input_file)

    # documentElement is always the root element, whereas firstChild may be a
    # leading comment or DOCTYPE node, which would silently yield no output.
    root = dom.documentElement

    # The context manager guarantees the output is closed even when a
    # malformed feature makes the conversion raise half-way through.
    with codecs.open(output_file, 'w', 'utf-8') as f:
        for node in root.childNodes:
            if node.nodeType != Node.ELEMENT_NODE:
                continue

            obj = {
                'permalink': node.attributes['id'].nodeValue,
                'value_type': node.attributes['type'].nodeValue,
                'fast_feature': node.attributes['fastFeature'].nodeValue,
                'display_mode': node.attributes['displayMode'].nodeValue,
                'names': [],
                'enum_values': [],
            }
            if obj['value_type'] != 'enum':
                parse_simple_feature(node, obj)
            else:
                parse_enum_feature(node, obj)

            # ensure_ascii=False keeps localized names readable in the output;
            # write() replaces the Python-2-only ``print >>f`` statement.
            f.write(json.dumps(obj, ensure_ascii=False) + '\n')


def merge_rubric_files(main_file, added_file):
    """Merge two tab-separated rubric files and write the result to ``main_file``.

    Each input line is split at the first tab into a permalink and a value.
    Values that share a permalink are concatenated (without a separator) in
    reading order: first all lines of ``main_file``, then those of
    ``added_file``. ``main_file`` is then overwritten with one
    ``permalink<TAB>values`` line per permalink.

    :param main_file: path of the file to read and then overwrite.
    :param added_file: path of the file whose lines are merged in.
    """

    def extract(cur_file, values_dict):
        # Close the input explicitly instead of relying on garbage collection.
        with open(cur_file, 'r') as src:
            for line in src:
                # Strip only the line terminator: slicing off the last
                # character would eat real data when the final line of the
                # file has no trailing newline.
                permalink, _, value = line.rstrip('\n').partition('\t')
                values_dict.setdefault(permalink, []).append(value)

    values_dict = dict()
    extract(main_file, values_dict)
    extract(added_file, values_dict)
    with open(main_file, 'w') as f:
        for permalink, values in values_dict.items():
            # write() behaves the same on Python 2 and 3, unlike ``print >>f``.
            f.write(permalink + '\t' + ''.join(values) + '\n')


def return_to_cwd(fn):
    """Decorator that restores the working directory after ``fn`` finishes.

    The current directory is recorded before ``fn`` is called and restored
    afterwards, whether ``fn`` returns normally or raises. The wrapped
    function's return value is passed through to the caller.

    :param fn: callable that may change the process working directory.
    :return: wrapper with the same call signature as ``fn``.
    """
    # Local import: functools is not among this module's top-level imports.
    import functools

    @functools.wraps(fn)
    def decorated_fn(*args, **kwargs):
        cwd = os.getcwd()
        try:
            return fn(*args, **kwargs)
        finally:
            # Runs on the error path too, so a failing step cannot leave the
            # process stranded in a temporary directory.
            os.chdir(cwd)

    return decorated_fn


class UpdateMapsWizardPpoData(nanny.ReleaseToNannyTask2, sdk2.Task):
    """
    Fetches actual data from YT export of Altay and generates PPO data files.

    The task exports tables and files from YT into a local ``altay_data``
    directory, runs the gzt builder inside ``wizard_data`` and publishes two
    resources: the wizard data itself and the intermediate files collected
    in ``wizard_temp_data``.
    """

    class Requirements(sdk2.Requirements):
        environments = [
            environments.PipEnvironment('yandex-yt', use_wheel=True),
            environments.PipEnvironment('yandex-yt-yson-bindings-skynet', use_wheel=True),
            environments.PipEnvironment('yql', use_wheel=True)
        ]
        disk_space = 16 * 1024  # 16 GiB
        client_tags = ctc.Tag.LINUX_PRECISE
        cores = 1
        ram = 8192

        class Caches(sdk2.Requirements.Caches):
            pass

    class Parameters(sdk2.Task.Parameters):
        with sdk2.parameters.Group("Arcadia") as arcadia_group:
            arcadia_url = sdk2.parameters.ArcadiaUrl("Arcadia URL")
        with sdk2.parameters.Group("Resources") as resources_group:
            ppo_gzt_builder = sdk2.parameters.Resource(
                'PPO gzt builder',
                resource_type=geo_types.GZT_BUILDER,
                required=False
            )
        source = sdk2.parameters.String('Source (YT path to tables)',
                                        default='//home/altay/db/export/current-state/snapshot',
                                        required=True)
        fast_features = sdk2.parameters.String('YT path to features2_fast.xml.gz',
                                               default_value='//home/altay/db/export/current-state/features2_fast.xml.gz',
                                               required=True)
        source_feature_triggers = sdk2.parameters.String('Feature rubric triggers (YT path to tables)',
                                                         default='//home/geo-search/yurakura/sprav/feature_rubric_triggers',
                                                         required=True)
        tag_tree = sdk2.parameters.String('Tag tree (YT path to tag_tree.json)',
                                          default_value='//home/sprav/goods_and_prices/config/tag_tree',
                                          required=True)
        # An empty value disables the synonym-priority export and the
        # corresponding builder flag (see the truthiness checks below).
        rubric_synonym_priority = sdk2.parameters.String('YT path to table with special weights of rubric synonyms',
                                                         default_value='',
                                                         required=True)

    def on_failure(self, prev_status):
        """Send the 'failed' Telegram notification."""
        notify_by_telegram(self, 'failed')

    def on_break(self, prev_status, status):
        """Send the 'failed with exception' Telegram notification."""
        notify_by_telegram(self, 'failed with exception')

    def on_timeout(self, prev_status):
        """Send the 'timed out' Telegram notification."""
        notify_by_telegram(self, 'timed out')

    def on_execute(self):
        """Run the whole pipeline: fetch inputs, build the data, publish resources."""
        # YT client configuration shared by every _fetch_* step.
        self.config = {
            'proxy': {
                'url': "hahn.yt.yandex.net"
            },
            'token': sdk2.Vault.data('GEOMETA-SEARCH', 'yt-token')
        }
        self._make_dirs()
        self._fetch_yt_data()
        self._fetch_fast_features()
        self._fetch_tag_tree()
        self._build_wizard_data()
        self._create_resources()

    def _make_dirs(self):
        """Create the three working directories under the current directory."""
        self.home_dir = os.getcwd()

        self.altay_data_dir = os.path.join(self.home_dir, 'altay_data')
        self.wizard_data_dir = os.path.join(self.home_dir, 'wizard_data')
        self.wizard_temp_data_dir = os.path.join(self.home_dir, 'wizard_temp_data')

        os.makedirs(self.altay_data_dir)
        os.makedirs(self.wizard_data_dir)
        os.makedirs(self.wizard_temp_data_dir)

    def _fetch_yt_data(self):
        """Export the source tables from YT into ``altay_data`` as JSON/text files.

        Produces ``company.json`` (published companies that are the target of
        a ``located_at`` relation, selected with YQL into a temporary table),
        one JSON file for each of the ``rubric``, ``feature``,
        ``feature_enum_value`` and ``chain`` tables,
        ``feature_rubric_triggers.json``, optionally
        ``rubric_synonym_priority.json`` and ``company_chains.txt`` with the
        distinct ``chain_permalink`` values of ``company_to_chain``.

        :raises Exception: when either YQL request does not succeed.
        """
        # The pip environments are only available at execution time, hence
        # the function-level imports.
        import yt.wrapper as yt
        from yql.api.v1.client import YqlClient
        from yql.client.parameter_value_builder import YqlParameterValueBuilder as ValueBuilder

        client = yt.YtClient(config=self.config)

        logging.info('Exporting company')
        # The task id keeps the temporary table name unique per task.
        filtered_company_path = '//tmp/update_maps_wizard_ppo_data/company_{}'.format(self.id)
        with YqlClient(db='hahn', token=sdk2.Vault.data('GEOMETA-SEARCH', 'YQL_TOKEN')) as yql_client:
            request = yql_client.query('''
                   declare $filtered_company_path as string;
                   declare $company_path as string;

                   $parents = (
                       SELECT
                           parent_permalink
                       FROM (
                           SELECT
                               LIstMap(
                                   ListFilter(
                                       Yson::ConvertToList(relations),
                                       ($rel) -> { RETURN Yson::LookupString($rel, 'type') == 'located_at'; }
                                   ),
                                   ($rel) -> { RETURN Yson::LookupInt64($rel, 'permalink'); }
                               ) as parent_permalinks
                           FROM
                               $company_path
                       )
                       FLATTEN LIST BY
                           parent_permalinks as parent_permalink
                       GROUP BY
                           parent_permalink
                   );

                   INSERT INTO
                       $filtered_company_path
                   WITH TRUNCATE
                   SELECT
                       permalink, rubrics, names, address
                   FROM
                       $company_path as t
                   INNER JOIN
                       $parents as pt
                   ON
                       t.permalink == pt.parent_permalink
                   WHERE
                       t.publishing_status = 'publish'
                   ''', syntax_version=1)
            parameters = {
                '$filtered_company_path': ValueBuilder.make_string(yt.ypath.ypath_join(filtered_company_path)),
                '$company_path': ValueBuilder.make_string(yt.ypath.ypath_join(self.Parameters.source, 'company'))
            }
            request.run(parameters=ValueBuilder.build_json_map(parameters))
            request.get_results(wait=True)
            if not request.is_success:
                raise Exception(str(request.errors))
            export_yt_json(
                client,
                os.path.join(filtered_company_path),
                os.path.join(self.altay_data_dir, 'company.json')
            )

        for table_name in ['rubric', 'feature', 'feature_enum_value', 'chain']:
            # Lazy %-args: the message is formatted by the logging framework.
            logging.info('Exporting %s', table_name)
            export_yt_json(
                client,
                os.path.join(self.Parameters.source, table_name),
                os.path.join(self.altay_data_dir, table_name + '.json')
            )

        logging.info('Exporting source_feature_triggers')
        export_yt_json(
            client,
            os.path.join(self.Parameters.source_feature_triggers),
            os.path.join(self.altay_data_dir, 'feature_rubric_triggers.json')
        )

        if self.Parameters.rubric_synonym_priority:
            export_yt_json(
                client,
                os.path.join(self.Parameters.rubric_synonym_priority),
                os.path.join(self.altay_data_dir, 'rubric_synonym_priority.json')
            )

        logging.info('Exporting company_to_chain')
        with YqlClient(db='hahn', token=sdk2.Vault.data('GEOMETA-SEARCH', 'YQL_TOKEN')) as yql_client:
            request = yql_client.query('declare $input as string; select chain_permalink from $input GROUP BY chain_permalink;', syntax_version=1)
            request.run(parameters=ValueBuilder.build_json_map(
                {'$input': ValueBuilder.make_string(yt.ypath.ypath_join(self.Parameters.source, 'company_to_chain'))}
            ))
            request.get_results(wait=True)
            if not request.is_success:
                raise Exception(str(request.errors))
            request.table.fetch_full_data()
            with open(os.path.join(self.altay_data_dir, 'company_chains.txt'), 'w') as company_chains:
                for row in request.table.rows:
                    company_chains.write(str(row[0]) + '\n')

        logging.info('All tables exported!')

    @return_to_cwd
    def _fetch_fast_features(self):
        """Download ``features2_fast.xml.gz``, unpack it and convert it to JSON.

        Works inside ``altay_data`` and leaves ``features2_fast.json`` there.

        :raises subprocess.CalledProcessError: if ``gzip -d`` fails.
        """
        import yt.wrapper as yt
        client = yt.YtClient(config=self.config)
        os.chdir(self.altay_data_dir)
        local_path = 'features2_fast.xml.gz'
        # The payload is a gzip archive, so it must go through a binary handle.
        with open(local_path, 'wb') as output:
            yt_file = client.read_file(self.Parameters.fast_features)
            output.write(yt_file.read())
        sp.check_call(["gzip", "-d", "features2_fast.xml.gz"])
        xml2json('features2_fast.xml',
                 os.path.join(self.altay_data_dir, 'features2_fast.json'))

    # Add parents to keys without parents
    def _update_tag_tree(self, tag_tree_content):
        """Return the tag tree JSON with a ``parents`` key on every rule.

        A rule lacking ``parents`` gets a one-element list holding a shallow
        copy of the rule itself, taken before the key is added.

        :param tag_tree_content: JSON text of a list of rule objects.
        :return: re-serialised JSON text indented by two spaces.
        """
        json_tree = json.loads(tag_tree_content)
        for rule in json_tree:
            if 'parents' not in rule:
                parent = rule.copy()
                rule['parents'] = [parent]
        return json.dumps(json_tree, indent=2)

    def _fetch_tag_tree(self):
        """Download the tag tree from YT and store the patched copy in ``wizard_data``."""
        import yt.wrapper as yt
        client = yt.YtClient(config=self.config)
        with open(os.path.join(self.wizard_data_dir, 'tag_tree.json'), 'w') as output:
            yt_file = client.read_file(self.Parameters.tag_tree)
            output.write(self._update_tag_tree(yt_file.read()))

    @return_to_cwd
    def _build_wizard_data(self):
        """Run the gzt builders in ``wizard_data`` and set aside intermediate files.

        After the external gzt builder and the inner-org builder have run,
        the intermediate artefacts are moved to ``wizard_temp_data`` so that
        only the final data remains in ``wizard_data``.

        :raises subprocess.CalledProcessError: if the gzt builder fails.
        """
        os.chdir(self.wizard_data_dir)
        # NOTE(review): the parameter is declared required=False but is used
        # unconditionally here - confirm the task is never run without it.
        gzt_builder = str(sdk2.ResourceData(self.Parameters.ppo_gzt_builder).path)

        gzt_builder_cmd = [
            gzt_builder,
            '-l',
            '--chain-ids=' + os.path.join(self.altay_data_dir, 'company_chains.txt'),
            '--rubric', os.path.join(self.altay_data_dir, 'rubric.json'),
            '--feature', os.path.join(self.altay_data_dir, 'feature.json'),
            '--feature-enum-value', os.path.join(self.altay_data_dir, 'feature_enum_value.json'),
            '--chain', os.path.join(self.altay_data_dir, 'chain.json'),
            '--features2-fast', os.path.join(self.altay_data_dir, 'features2_fast.json'),
            '--feature-rubric-triggers', os.path.join(self.altay_data_dir, 'feature_rubric_triggers.json'),
            '--tag-tree', os.path.join(self.wizard_data_dir, 'tag_tree.json'),
            '--compile'
        ]

        if self.Parameters.rubric_synonym_priority:
            gzt_builder_cmd.extend(['--rubric-synonym-priority',
                                    os.path.join(self.altay_data_dir, 'rubric_synonym_priority.json')])

        sp.check_call(gzt_builder_cmd)

        inner_org_gzt_builder.build_gzt(
            company=os.path.join(self.altay_data_dir, 'company.json'),
            rubric=os.path.join(self.altay_data_dir, 'rubric.json'),
            rubric_desc=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'rubric_desc.json'),
            features_gzt='inner_org_features.gzt',
            fix_list=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fix_list.json')
        )
        # Append the inner-org features to features.gzt. Both handles are
        # closed (and the output flushed) before the file is moved below.
        with open('inner_org_features.gzt', 'r') as src, open('features.gzt', 'a') as dst:
            shutil.copyfileobj(src, dst)

        inner_org_gzt_builder.build_rubric_txt(
            rubric=os.path.join(self.altay_data_dir, 'rubric.json'),
            rubric_txt='rubric_inner_org.txt'
        )
        merge_rubric_files('rubric_features.txt', 'rubric_inner_org.txt')
        merge_rubric_files('rubric_filters.txt', 'rubric_inner_org.txt')

        temp_dir = self.wizard_temp_data_dir

        # Files generated in the current directory (wizard_data).
        for name in ('rubric_inner_org.txt', 'inner_org_features.gzt', 'features.gzt', 'rubrics.gzt',
                     'rubrics_new.gzt', 'chains.gzt', 'chains_pure.gzt', 'chains_pure_unsorted.gzt'):
            # rubrics_new.gzt is the only file that is allowed to be missing.
            if name == 'rubrics_new.gzt' and not os.path.exists(name):
                continue
            shutil.move(name, os.path.join(temp_dir, name))

        # Exported inputs, as (name in altay_data, name in wizard_temp_data);
        # features2_fast.json is renamed on the way.
        for src_name, dst_name in (
            ('rubric.json', 'rubric.json'),
            ('feature.json', 'feature.json'),
            ('chain.json', 'chain.json'),
            ('feature_enum_value.json', 'feature_enum_value.json'),
            ('features2_fast.json', 'features_fast.json'),
            ('feature_rubric_triggers.json', 'feature_rubric_triggers.json'),
            ('company_chains.txt', 'company_chains.txt'),
        ):
            shutil.move(os.path.join(self.altay_data_dir, src_name), os.path.join(temp_dir, dst_name))

    def _create_resources(self):
        """Publish ``wizard_data`` and ``wizard_temp_data`` as ready resources.

        The wizard data resource carries the task description and is marked
        with the source YT path; the temp files go into an ``OTHER_RESOURCE``.
        """
        resource = resource_types.MAPS_DATABASE_BUSINESS_WIZARD_DATA(
            self, self.Parameters.description, self.wizard_data_dir)
        resource.mark = self.Parameters.source
        resource_data = sdk2.ResourceData(resource)
        resource_data.ready()

        resource_temp = resource_types.OTHER_RESOURCE(
            self, 'UPDATE_MAPS_WIZARD_PPO_DATA temp files', self.wizard_temp_data_dir
        )
        resource_temp_data = sdk2.ResourceData(resource_temp)
        resource_temp_data.ready()
