# coding=utf-8

import datetime
import logging
import os
import re
import subprocess

import bm.yt_tools
import irt.broadmatching.common_options


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# YT path used as `base_dir` of the input tables in the namespace configs
# registered by the Cdict subclasses below.
CDICT_GENERATION_PATH = bm.yt_tools.get_cdict_generation_params()['cdict_generations_yt_path']
# YT path under which each generated cdict table is placed (see Cdict.target_yt_path).
CDICT_OUT_PATH = bm.yt_tools.get_cdict_generation_params()['cdict_out_yt_path']


def main_snorm_deleter(row):
    """Row preprocessor: list the words of `norm_freq` other than the row's `snorm`.

    `row['norm_freq']` is split on commas and every item is cut at its first
    ':' (an item without ':' raises ValueError).  Words equal to
    `row['snorm']` are dropped.  If anything is left, a single row
    ``{'snorm': ..., 'norm': 'w1 , w2 , ...'}`` is yielded; otherwise nothing
    is yielded.
    """
    words = [item[:item.index(':')] for item in row['norm_freq'].split(',')]
    main_snorm = row['snorm']
    others = [word for word in words if word != main_snorm]

    if not others:
        return

    yield {'snorm': main_snorm, 'norm': ' , '.join(others)}


class T0_row_remover:
    """Row filter that drops every row whose `field_key` value contains '~0'.

    Instances are callable generators: calling one with a row yields that
    same row unchanged, or yields nothing when the row is filtered out.
    """

    def __init__(self, field_key):
        # Name of the row field that is searched for the '~0' marker.
        self.field_key = field_key

    def __call__(self, row):
        checked_value = row[self.field_key]
        if '~0' in checked_value:
            return
        yield row


def preproc_tails(row):
    """Row preprocessor: normalise the separators inside `freq_word_categs`.

    Every ':' (with any surrounding whitespace) is replaced by a single space
    and every ',' (with any surrounding whitespace) by ' , '.  The row is
    modified in place and then yielded.
    """
    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    val = re.sub(r'\s*:\s*', ' ', row['freq_word_categs'])
    row['freq_word_categs'] = re.sub(r'\s*,\s*', ' , ', val)
    yield row


class CdictMapper:
    """Mapper that turns rows of one input table into cdict rows.

    `table_info` is a dict with the keys 'preproc_row_generator', 'namespace',
    'cdict_key' and 'cdict_value'.  Each input row is first passed through the
    preprocessing generator (an identity generator is used when none is
    given); for every row it produces, a row with the columns
    'cdict_namespace', 'cdict_key' and 'cdict_value' is emitted.
    """

    def __init__(self, table_info):
        custom_preproc = table_info['preproc_row_generator']
        self.namespace = table_info['namespace']
        self.cdict_field_key = table_info['cdict_key']
        self.cdict_field_val = table_info['cdict_value']
        # Fall back to the pass-through generator when nothing was configured.
        self.preproc_row_generator = custom_preproc if custom_preproc else self.default_preproc_row_generator

    @staticmethod
    def default_preproc_row_generator(row_in):
        """Identity preprocessing: yield the row as is."""
        yield row_in

    def __call__(self, in_row):
        for prepared in self.preproc_row_generator(in_row):
            out_row = {'cdict_namespace': self.namespace}
            out_row['cdict_key'] = prepared[self.cdict_field_key]
            # The value is always stored as a string.
            out_row['cdict_value'] = str(prepared[self.cdict_field_val])
            yield out_row


class Cdict(object):
    """Build one cdict: a merged YT table, its local text dump and a binary file.

    Input tables are described by "namespaces" configs registered through
    add_namespaces(); the subclasses in this file register them in __init__.
    The main steps are:

    * gen_cdict_yt()    - merge all inputs into one sorted YT table;
    * upload_from_yt()  - dump that table into a local file with the yt CLI;
    * generate_binary() - compile the dump with the external `cdict` tool and
      rotate the previous binaries.
    """

    def __init__(self, bmyt_cl, cdict_name):
        """Set up the YT/local paths and create the local directories.

        :param bmyt_cl: client object; its `yt_client` attribute and `do_yql`
            method are used by this class.
        :param cdict_name: name of the cdict; used as the table name under
            CDICT_OUT_PATH and as the prefix of the local file names.
        """
        self._all_namespaces = []
        self.cdict_name = cdict_name
        self.bmyt_cl = bmyt_cl
        self.yt_client = bmyt_cl.yt_client
        self.cdict_dir = CDICT_OUT_PATH
        # Number of numbered backups kept by add_binary_with_backup().
        self.backup_count = 6

        self.target_yt_path = self.cdict_dir + '/' + self.cdict_name
        self._external_sourses = []

        dirs = irt.broadmatching.common_options.get_options()['dirs']
        cdict_temp_dir = dirs['temp'] + '/cdict/'
        cdict_work_dir = dirs['work'] + '/cdict/'

        for d in (cdict_temp_dir, cdict_work_dir):
            if not os.path.exists(d):
                # The mode has to be an octal literal: decimal 755 is 0o1363,
                # which gives the directory unintended permission bits.
                os.makedirs(d, mode=0o755)

        self.target_file_path = cdict_temp_dir + self.cdict_name + '_file'
        self.target_binary_path = cdict_work_dir + self.cdict_name + '_binary'

    def check_external_sourses(self):
        """Check that every registered external source table exists in YT.

        :raises Exception: if a table is missing.
        """
        for table in self._external_sourses:
            if not self.yt_client.exists(table):
                # This used to be `assert Exception(...)`, which can never
                # fail because an exception instance is always truthy.
                raise Exception('external table not exist: ' + str(table))

    def check_all_namespaces(self):
        """Validate the registered namespaces configs and their input tables.

        Every config must have 'type' (one of 'multi_lang', 'single') and
        'base_dir'; a 'multi_lang' config must also have 'langs' and
        'lang_namespace_postfix'.  Every input table derived from the configs
        must exist in YT.

        :raises Exception: on the first violated requirement.
        """
        logger.info('check all_namespaces')
        valid_namespaces_type = {'multi_lang', 'single'}
        for namespaces in self._all_namespaces:
            if 'type' not in namespaces:
                raise Exception('type not found')

            if 'base_dir' not in namespaces:
                raise Exception('dir not found')

            if namespaces['type'] not in valid_namespaces_type:
                raise Exception('type not valid; valid value -> [' + ' '.join(valid_namespaces_type)+']')

            if namespaces['type'] == 'multi_lang':
                if 'langs' not in namespaces:
                    raise Exception('multi_lang namespaces not have langs')

                if 'lang_namespace_postfix' not in namespaces:
                    raise Exception('multi_lang namespaces not have lang_namespace_postfix')

        for table_info in self._get_input_tables_info():
            if not self.yt_client.exists(table_info['table_yt_path']):
                raise Exception('table not exist: ' + str(table_info))

        logger.info('/ check all_namespaces')

    def add_namespaces(self, namespaces):
        """Register one namespaces config (a dict, see check_all_namespaces)."""
        self._all_namespaces.append(namespaces)

    def _get_input_tables_info(self):
        """Expand the registered namespaces configs into per-table descriptions.

        Returns a list of dicts of the form
        ``{'table_yt_path': ..., 'cdict_key': ..., 'cdict_value': ...,
        'preproc_row_generator': ... or None, 'namespace': ...}``.

        For a 'single' config the table path is ``base_dir/in_table``.  For a
        'multi_lang' config one entry is produced per language: the per-language
        postfixes are appended to the directory, the table name and the
        namespace name (a missing postfix counts as an empty string).

        :raises Exception: if a namespace lacks 'in_table', 'field_key' or
            'field_val', or if the config type is unknown.
        """
        res = []

        for namespaces in self._all_namespaces:

            for namespace_name, param in namespaces['namespaces'].items():

                if 'in_table' not in param or 'field_key' not in param or 'field_val' not in param:
                    raise Exception('bad namespace:' + namespace_name)

                # Configs without 'langs' are processed once, with lang=None.
                langs = namespaces.get('langs', [None])
                for lang in langs:

                    namespace_name_local = namespace_name
                    if 'lang_namespace_postfix' in namespaces:
                        namespace_name_local += namespaces['lang_namespace_postfix'].get(lang, '')

                    if namespaces['type'] == 'single':
                        table_name = namespaces['base_dir'] + '/' + param['in_table']
                    elif namespaces['type'] == 'multi_lang':
                        dir_name = namespaces['base_dir'] + namespaces.get('lang_dir_path_postfix', dict()).get(lang, '')
                        table_name = dir_name + '/' + param['in_table'] + namespaces.get('lang_input_table_postfix', dict()).get(lang, '')
                    else:
                        raise Exception("It should never happen")

                    res.append({'table_yt_path': table_name,
                                'cdict_key': param['field_key'],
                                'cdict_value': param['field_val'],
                                'preproc_row_generator': param.get('preproc_row_generator', None),
                                'namespace': namespace_name_local,
                                })
        return res

    def gen_cdict_yt(self):
        """Build the merged cdict table in YT at `self.target_yt_path`.

        Does nothing when no namespaces are registered.  Otherwise the configs
        and external sources are validated, every input table is mapped with
        CdictMapper into a temporary table, external sources are appended with
        a YQL INSERT, the table is sorted by (cdict_namespace, cdict_key) and
        moved over the target path, after which the upload time is recorded
        via bm.yt_tools.set_upload_time().
        """
        if not self._all_namespaces:
            return

        self.check_all_namespaces()
        self.check_external_sourses()

        sc = bm.yt_tools.get_schema({
            'cdict_namespace': 'string',
            'cdict_key': 'string',
            'cdict_value': 'string',
        }, strict=True)

        logger.info('gen cdict %s', self.cdict_name)
        with self.yt_client.Transaction() as tx, \
                self.yt_client.TempTable() as tmp_cdict_table:

            logger.info('yt tmp table: %s', tmp_cdict_table)

            self.yt_client.alter_table(tmp_cdict_table, schema=sc)

            # Every map operation appends to the same temporary table.
            tmp_cdict_table_with_append = '<append=%true>' + tmp_cdict_table

            for table_info in self._get_input_tables_info():
                logger.info('add to %s table: %s', self.cdict_name, table_info)
                self.yt_client.run_map(CdictMapper(table_info),
                                       table_info['table_yt_path'],
                                       tmp_cdict_table_with_append)
                logger.info('/add to %s table: %s', self.cdict_name, table_info)

            for table in self._external_sourses:
                logger.info('add to %s external source %s', self.cdict_name, table)
                query = """
                INSERT INTO `{tmp_cdict_table}`
                SELECT  cdict_namespace, cdict_key, cdict_value
                FROM `{src_external_table}`
                """.format(tmp_cdict_table=tmp_cdict_table,
                           src_external_table=table)

                self.bmyt_cl.do_yql(query, transaction_id=tx.transaction_id)
                logger.info('/add to %s external source %s', self.cdict_name, table)

            self.yt_client.run_sort(tmp_cdict_table, sort_by=['cdict_namespace', 'cdict_key'])

            logger.info('gen tmp ok')
            logger.info('mv %s to %s', tmp_cdict_table, self.target_yt_path)

            self.yt_client.move(tmp_cdict_table, self.target_yt_path, force=True)
            bm.yt_tools.set_upload_time(self.target_yt_path, self.yt_client)

        logger.info('/ gen cdict %s', self.cdict_name)

    def upload_from_yt(self):
        """Dump the YT table `self.target_yt_path` into `self.target_file_path`.

        The data is first written to a '<target>_DOWNLOADING' file and renamed
        to the final name only after the command succeeded; a leftover
        temporary file is removed in any case.

        :raises subprocess.CalledProcessError: if the yt command fails.
        """
        logger.info('upload cdict %s from yt', self.cdict_name)

        temp_file_path = self.target_file_path + "_DOWNLOADING"

        try:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

            # Downloading through the Python API is slow, so the yt CLI is
            # used instead.
            env_cmd = 'YT_TOKEN_PATH=/opt/broadmatching/secrets/tokens/yt_plato YT_PROXY=hahn'
            yt_download_format = '<columns=[cdict_namespace;cdict_key;cdict_value]>schemaful_dsv'
            cli_read_config = '{read_parallel={enable=%true;max_thread_count=32;}}'
            cmd = '{env} yt read {yt_table} --format "{format}" --config "{read_config}" > {target_file}'.format(
                env=env_cmd,
                yt_table=self.target_yt_path,
                format=yt_download_format,
                read_config=cli_read_config,
                target_file=temp_file_path
            )

            logger.info('start cmd: %s', cmd)
            # The shell is needed for the env-var prefix and the redirection.
            subprocess.check_call(cmd, shell=True)
            logger.info('cmd end')

            os.rename(temp_file_path, self.target_file_path)
            logger.info('OK upload to: %s', self.target_file_path)
        finally:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        logger.info('/ upload cdict %s from yt', self.cdict_name)

    def binary_is_old(self, last_yt_generation):
        """Tell whether the local binary is missing or older than the YT table.

        :param last_yt_generation: datetime of the last YT generation; it is
            compared with ``datetime.fromtimestamp(mtime)``, so it is presumably
            a naive local-time datetime - verify against the caller.
        :return: True if the binary does not exist or its mtime is earlier
            than `last_yt_generation`.
        """
        if not os.path.exists(self.target_binary_path):
            return True

        ts = os.path.getmtime(self.target_binary_path)
        dt = datetime.datetime.fromtimestamp(ts)
        return last_yt_generation > dt

    def add_binary_with_backup(self, new_file, target_file_name):
        """Move `new_file` to `target_file_name`, rotating the previous files.

        The current file is `target_file_name` and its backups are
        `target_file_name.1`, `target_file_name.2`, ... (higher number = older).
        Backups numbered `self.backup_count` and above are removed, the
        remaining files are shifted by one and `new_file` takes the place of
        the current file.

        :raises os.error: if `new_file` does not exist.
        :raises Exception: if `new_file` is empty.
        """

        if not os.path.exists(new_file):
            raise os.error("ERROR: this should not happen; file not found "+str(new_file))

        if os.stat(new_file).st_size == 0:
            raise Exception("ERROR: this should not happen; file is empty " + str(new_file))

        def get_backup_file_name(x):
            # Index 0 is the current file itself, N > 0 is the N-th backup.
            if x == 0:
                return target_file_name
            return target_file_name + '.' + str(x)

        logger.info('start move; backups count = %s', self.backup_count)

        backups_count_now = 0
        # Find the current number of backups: walk the consecutive indices
        # until the first missing file.
        while os.path.exists(get_backup_file_name(backups_count_now)):
            backups_count_now += 1
        # Now the index of the last existing file (-1 when nothing exists).
        backups_count_now -= 1

        if backups_count_now > 0:
            logger.info('found last backups cdict file: %s', get_backup_file_name(backups_count_now))
        elif backups_count_now == 0:
            logger.info('not found backup; found prev cdict file = %s', target_file_name)
        else:
            logger.info('nothing found; looks like first generation; cdict file not found')

        # Remove the expired backups.
        for backup_num in range(self.backup_count, backups_count_now + 1):
            logger.info('remove %s', get_backup_file_name(backup_num))
            os.remove(get_backup_file_name(backup_num))

        # Shift the existing files, oldest first so that nothing is overwritten.
        for backup_num in range(min(self.backup_count, backups_count_now+1), 0, -1):
            logger.info("rename : %s -> %s", get_backup_file_name(backup_num-1), get_backup_file_name(backup_num))
            os.rename(get_backup_file_name(backup_num-1), get_backup_file_name(backup_num))

        # Put the freshly generated file in place.
        logger.info("rename : %s -> %s", new_file, target_file_name)
        os.rename(new_file, target_file_name)

    def generate_binary(self):
        """Compile `self.target_file_path` into the binary cdict file.

        Runs ``<server_dir>/cdict --mode=gen`` writing to a
        '<target_binary_path>_CREATING' file, then installs the result with
        add_binary_with_backup().  A leftover temporary file is removed in any
        case.

        :raises subprocess.CalledProcessError: if the command fails.
        """
        logger.info('genrate_binary for %s cdict', self.cdict_name)

        cdict_params = irt.broadmatching.common_options.get_options()['cdict_params']
        server_dir = cdict_params['server_dir'] + '/cdict'

        temp_file_path = self.target_binary_path + '_CREATING'
        cmd = server_dir + " --mode={} --data-file={} --output-file={}".format('gen',
                                                                               self.target_file_path,
                                                                               temp_file_path)

        try:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
            logger.info("run cmd %s", cmd)
            subprocess.check_call(cmd, shell=True)
            logger.info("/ run cmd ok")
            self.add_binary_with_backup(temp_file_path, self.target_binary_path)
        finally:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        logger.info('/ genrate_binary for %s cdict', self.cdict_name)


class Chronicle(Cdict):
    """The 'cdict_chronicle' cdict, built from per-language tables for 'ru' and 'tr'."""

    def __init__(self, bmyt_cl):
        super(Chronicle, self).__init__(bmyt_cl, 'cdict_chronicle')

        # Namespace name -> input table and the columns used as cdict key/value.
        per_lang_namespaces = {
            # counts
            'count': {
                'in_table': 'counts',
                'field_key': 'norm',
                'field_val': 'freq',
                'preproc_row_generator': T0_row_remover('norm'),
            },
            # counts_geo
            'countg': {
                'in_table': 'counts_geo',
                'field_key': 'norm',
                'field_val': 'regions',
                'preproc_row_generator': T0_row_remover('norm'),
            },
            # counts_query
            'countq': {
                'in_table': 'counts_query',
                'field_key': 'norm',
                'field_val': 'freq_query',
            },
            # counts_mob
            'countm': {
                'in_table': 'counts_mob',
                'field_key': 'norm',
                'field_val': 'freq',
                'preproc_row_generator': T0_row_remover('norm'),
            },
            # tails
            'tail': {
                'in_table': 'tails',
                'field_key': 'subph',
                'field_val': 'freq_word_categs',
                'preproc_row_generator': preproc_tails,
            },
        }

        # Tables are looked up in the '/ru' and '/tr' sub-directories of the
        # base directory; 'tr' namespaces get the '_tr' suffix.
        multi_lang_config = {
            'type': 'multi_lang',
            'base_dir': CDICT_GENERATION_PATH,

            'langs': ['ru', 'tr'],
            'lang_input_table_postfix': {'ru': '', 'tr': ''},
            'lang_namespace_postfix': {'ru': '', 'tr': '_tr'},
            'lang_dir_path_postfix': {'ru': '/ru', 'tr': '/tr'},

            'namespaces': per_lang_namespaces,
        }
        self.add_namespaces(multi_lang_config)


class Datoteka(Cdict):
    """The 'cdict_datoteka' cdict: per-language tables for 'ru'/'tr' plus banner counts."""

    def __init__(self, bmyt_cl):
        super(Datoteka, self).__init__(bmyt_cl, 'cdict_datoteka')

        # Namespace name -> input table and the columns used as cdict key/value.
        # The 'snorm' namespace (table 'snorms', key 'norm', value 'snorm')
        # used to be here and was removed as no longer needed.
        per_lang_namespaces = {
            'flags': {
                'in_table': 'flags',
                'field_key': 'phrase',
                'field_val': 'flags',
            },
            'syns': {
                'in_table': 'syns',
                'field_key': 'snorm',
                # The 'syns' table has 'norm_freq'; main_snorm_deleter turns
                # it into 'norm'.
                'field_val': 'norm',
                'preproc_row_generator': main_snorm_deleter,
            },
            'categs': {
                'in_table': 'categs',
                'field_key': 'snorm',
                'field_val': 'categs',
            },
            'regions': {
                'in_table': 'regions',
                'field_key': 'snorm',
                'field_val': 'regions',
                'preproc_row_generator': T0_row_remover('snorm'),
            },
            'harm': {
                'in_table': 'harm',
                'field_key': 'norm',
                'field_val': 'orig',
            },
        }

        # Tables are looked up in the '/ru' and '/tr' sub-directories of the
        # base directory; 'tr' namespaces get the '_tr' suffix.
        multi_lang_config = {
            'type': 'multi_lang',
            'base_dir': CDICT_GENERATION_PATH,

            'langs': ['ru', 'tr'],
            'lang_input_table_postfix': {'ru': '', 'tr': ''},
            'lang_namespace_postfix': {'ru': '', 'tr': '_tr'},
            'lang_dir_path_postfix': {'ru': '/ru', 'tr': '/tr'},

            'namespaces': per_lang_namespaces,
        }
        self.add_namespaces(multi_lang_config)

        # Tables addressed by a fixed path relative to the base directory.
        bnr_namespaces = {
            'bnr_count': {
                'in_table': 'bnr_counts',
                'field_key': 'phrase',
                'field_val': 'count',
            },
            'bnr_count_ru': {
                'in_table': 'ru/bnr_counts',
                'field_key': 'phrase',
                'field_val': 'count',
            },
            'bnr_count_tr': {
                'in_table': 'tr/bnr_counts',
                'field_key': 'phrase',
                'field_val': 'count',
            },
        }
        single_config = {
            'type': 'single',
            'base_dir': CDICT_GENERATION_PATH,

            'namespaces': bnr_namespaces,
        }
        self.add_namespaces(single_config)
