# encoding: utf-8
import datetime
import json
import logging
import os
import os.path
import time
import six
import six.moves.urllib.parse
import urlpy

import ads.libs.yql
import yt.yson as yson
import yt.wrapper as yt

from irt.bannerland.options import get_option as get_bl_opt, get_cypress_config
import irt.iron.options as iron_opts

from bm.yt_tools import join_schemas_from_tables, TableIndexCopyMapper, TableIndexCompatibleJoinReducer, FirstReducer
from irt.bannerland.hosts import get_curr_host
import irt.broadmatching.common_options

import bannerland.yt_local

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# strftime() format taken from the 'bannerland_pocket_name_format' option; it is used
# in this module to build the time suffix of uploaded and merged YT table names.
DEFAULT_DATE_FMT = get_bl_opt('bannerland_pocket_name_format')


def get_sec_level_domain(url):
    """Return the last two dot-separated labels of the URL's host name.

    An empty string is returned when the URL has no host name.
    """
    host = six.moves.urllib.parse.urlparse(url).hostname
    if not host:
        return ''
    labels = host.split('.')
    return '.'.join(labels[-2:])


# Upload of data from bare-metal hosts into YT

class TasksOffersUploader:
    """Uploads local task/offer export files of the current host into YT tables.

    Export files are expected to be named ``<task_id>-<timestamp>``. Every file
    holds tab-separated ``key=value`` lines; lines that carry a ``task_inf=``
    field go to the tasks table, all other lines go to the offers table.
    """

    def __init__(self, expire_hours=24, delete_after=True, yt_client=None):
        """
        :param expire_hours: maximum file age in hours (by mtime); ``None`` disables the age check
        :param delete_after: remove the local files once they have been written to YT
        :param yt_client: YT client to use; when omitted, a client for the 'hahn' proxy is created
        """
        self.delete_after = delete_after
        self.hostname = get_curr_host()
        self.expire_hours = expire_hours

        if yt_client is None:
            yt_client = yt.YtClient(proxy='hahn', config={'token_path': iron_opts.get('yt_token_path')})

        self.yt_client = yt_client

    def is_good_filename(self, file_path):
        # type: (str) -> bool
        """Return True when the base name looks like ``<task_id>-<digits>``."""
        id_time = os.path.basename(file_path).split('-')
        return len(id_time) == 2 and id_time[1].isdigit()

    def check_last_id_timestamp(self, file_path, last_times):
        """Return True when the file carries the latest known timestamp of its task.

        Malformed names and task ids missing from ``last_times`` yield False
        instead of raising, so that such files can simply be treated as bad.
        """
        parts = os.path.basename(file_path).split('-')
        if len(parts) != 2:
            return False
        task_id, timestamp = parts
        return last_times.get(task_id) == timestamp

    def check_expire(self, file_path):
        """Return True when the file is not older than ``expire_hours`` (or no limit is set)."""
        if self.expire_hours is None:
            return True
        file_ts = os.path.getmtime(file_path)
        return (time.time() - file_ts) <= self.expire_hours * 3600

    def get_good_files_from_dir_remove_bad(self, input_dir):
        # type: (str) -> List[str]
        """List ``input_dir``, delete every bad file and return the paths of the good ones.

        A file is good when its name is well-formed, it is the newest file of its
        task and it has not expired.
        """
        # find the files ourselves and filter them
        files = [os.path.join(input_dir, fname) for fname in os.listdir(input_dir)]

        last_times = self.gen_last_times_dict(files)

        new_files = []
        for file_path in files:
            # Short-circuit on purpose: the later checks must not run for malformed names.
            is_good = (
                self.is_good_filename(file_path)
                and self.check_last_id_timestamp(file_path, last_times)
                and self.check_expire(file_path)
            )
            if is_good:
                new_files.append(file_path)
            else:
                os.remove(file_path)
                logger.info('delete bad file: %s', file_path)

        logger.info('found files: %d', len(new_files))
        return new_files

    def upload_prod_by_task_type(self, task_type):
        # type: (str) -> None
        """Upload the export files of ``task_type`` ('perf' or 'dyn') to its production YT directories."""
        d = {
            'perf': 'perf_export_offers_dir',
            'dyn': 'dyn_export_offers_dir',
        }

        task_dir = irt.broadmatching.common_options.get_options()[d[task_type]]
        files = self.get_good_files_from_dir_remove_bad(task_dir)

        cypress_config = get_cypress_config(task_type)
        task_dir = cypress_config.get_path('tasks_from_instances')
        offers_dir = cypress_config.get_path('offers_from_instances')
        self.upload(files, task_dir, offers_dir)

    def upload(self, files, tasks_yt_dir, offers_yt_dir):
        # type: (List[str], str, str) -> None
        """Write task and offer lines of ``files`` into two new, equally named YT tables.

        Both tables are created and filled inside one transaction. When
        ``delete_after`` is set, the local files are removed only after that
        transaction has been left successfully, so a failed commit does not
        lose the data.
        """
        if not files:
            logger.info('files with good_filename is not found!')
            return

        # this script is limited to 500 nodes (if the limit is exceeded, do nothing)
        for yt_dir in [tasks_yt_dir, offers_yt_dir]:
            self.yt_client.create('map_node', yt_dir, ignore_existing=True, recursive=True)

        if len(self.yt_client.list(tasks_yt_dir)) > 500:
            logger.error('error node quota exceeded')
            return

        upload_time = datetime.datetime.now().strftime(DEFAULT_DATE_FMT)
        yt_table_name = self.hostname + '_' + upload_time
        yt_client = self.yt_client

        task_table_name = yt.ypath_join(tasks_yt_dir, yt_table_name)
        offer_table_name = yt.ypath_join(offers_yt_dir, yt_table_name)

        with yt_client.Transaction():
            task_schema = yson.YsonList([
                {'name': 'BusinessId',      'type': 'uint32', 'required': False},
                {'name': 'ShopId',          'type': 'uint32', 'required': False},
                {'name': 'host',            'type': 'string', 'required': True},
                {'name': 'export_offers_info', 'type': 'string', 'required': True},
                {'name': 'ppar',            'type': 'string', 'required': True},
                {'name': 'task_inf',        'type': 'string', 'required': True},
                {'name': 'task_id',         'type': 'string', 'required': True},
                {'name': 'timestamp',       'type': 'uint32', 'required': True},
                {'name': 'AllowedInProductGallery', 'type': 'boolean'},  # TODO (danila-eremin) BLRT-312 add 'required'
            ])
            task_schema.attributes['strict'] = False  # non-strict so that columns can be dropped painlessly

            offer_schema = yson.YsonList([
                {'name': 'product_class',   'type': 'string', 'required': True},
                {'name': 'OfferYabsId',     'type': 'uint64', 'required': False},
                {'name': 'product_inf',     'type': 'string', 'required': True},
                {'name': 'product_md5',     'type': 'string', 'required': True},
                {'name': 'task_id',         'type': 'string', 'required': True},
                {'name': 'timestamp',       'type': 'uint32', 'required': True},
            ])
            offer_schema.attributes['strict'] = False

            yt_client.create(
                'table', task_table_name, recursive=True,
                attributes={'schema': task_schema, 'compression_codec': 'lz4'},
            )

            yt_client.create(
                'table', offer_table_name, recursive=True,
                attributes={'schema': offer_schema, 'compression_codec': 'lz4'},
            )

            write_format = '<enable_string_to_all_conversion=%true>dsv'
            yt_client.write_table(task_table_name, self.task_iterator(files), format=write_format, raw=True)
            yt_client.write_table(offer_table_name, self.offer_iterator(files), format=write_format, raw=True)

            logger.info('write offers and tasks in %s, %s done', task_table_name, offer_table_name)

        # Outside the transaction block: reached only if leaving it did not raise.
        if self.delete_after:
            for file_path in files:
                os.remove(file_path)
                logger.info('delete done: %s', file_path)

    def is_task_line(self, line):
        """Return True when one of the tab-separated fields of ``line`` starts with ``task_inf=``."""
        return any(kv.startswith('task_inf=') for kv in line.split('\t'))

    def task_iterator(self, files):
        """Yield every task line of ``files`` prefixed with a ``host=<hostname>`` field."""
        add_str = 'host=' + self.hostname
        for file_path in files:
            with open(file_path) as f:
                for line in f:
                    if self.is_task_line(line):
                        yield add_str + '\t' + line

    def offer_iterator(self, files):
        """Yield every non-task line of ``files`` unchanged."""
        for file_path in files:
            with open(file_path) as f:
                for line in f:
                    if not self.is_task_line(line):
                        yield line

    def gen_last_times_dict(self, files):
        """Map each task id to the greatest timestamp found among well-formed file names.

        Malformed names are skipped so they can neither raise here nor shadow a
        valid file of the same task.
        NOTE: timestamps are compared as strings, which orders them correctly
        only while they all have the same number of digits.
        """
        last_times = dict()
        for file_path in files:
            if not self.is_good_filename(file_path):
                continue
            task_id, timestamp = os.path.basename(file_path).split('-')
            if (task_id not in last_times) or (last_times[task_id] < timestamp):
                last_times[task_id] = timestamp

        return last_times


# Tasks and offers must come from the same generation,
# so we join them right away and from then on work only with the joined tables

class TasksOffersJoiner:
    """Joins uploaded task tables with offer tables into "tasks and offers" tables.

    In the default mode the join is done by a YQL query; with ``mode='local'``
    it is done by YT map/merge/map-reduce operations against a local YT.
    """

    def __init__(self, yt_pool=None, mode=None, yt_client=None):
        """
        :param yt_pool: YT pool for operations and YQL queries (optional)
        :param mode: ``'local'`` switches to a local YT proxy and disables YQL
        :param yt_client: ready YT client; when omitted, one is built according to ``mode``
        """
        self.no_yql = False
        self.yt_pool = yt_pool

        self.task_dir = ''
        self.offers_dir = ''

        if mode == 'local':
            self.no_yql = True

        self.target_table_prefix = 'merged_tasks_and_offers_'
        self.yt_client = yt_client

        if self.yt_client is None:
            if mode == 'local':
                yt_config = {'proxy': {'url': bannerland.yt_local.get_proxy()}}
                self.yt_client = yt.YtClient(config=yt_config)
            else:

                yt_config = {
                    'token_path': iron_opts.get('yt_token_path'),
                    'spec_defaults': {},
                }
                if self.yt_pool:
                    yt_config['spec_defaults']['pool'] = self.yt_pool

                self.yt_client = yt.YtClient(proxy='hahn', config=yt_config)

    def get_table_prefix(self):
        """Return the name prefix of the produced joined tables."""
        return self.target_table_prefix

    def merge_prod_by_task_type(self, task_type):
        # type: (str) -> None
        """Join the production task/offer tables of ``task_type`` into its 'tao_current' directory."""
        cypress_config = get_cypress_config(task_type)
        tasks_dir = cypress_config.get_path('tasks_from_instances')
        offers_dir = cypress_config.get_path('offers_from_instances')
        tao_yt_dir = cypress_config.get_path('tao_current')
        self.merge_to_tao(tasks_dir, offers_dir, tao_yt_dir)

    @staticmethod
    def _pair_table_names(task_names, offer_names):
        # type: (List[str], List[str]) -> Tuple[List[str], List[str]]
        """Return the task and offer table names that may be processed together.

        When both listings have the same length they are returned as is. When
        they differ (tables were committed between the two listings), only the
        names present in both listings are kept, in their original order. A
        positional cut would be wrong here: names start with the host name, so
        a newer table may sort into the middle of the list.
        """
        if len(task_names) == len(offer_names):
            return list(task_names), list(offer_names)
        common = set(task_names) & set(offer_names)
        return (
            [name for name in task_names if name in common],
            [name for name in offer_names if name in common],
        )

    def merge_to_tao(self, tasks_dir, offers_dir, tasks_and_offers_dir):
        # type: (str, str, str) -> None
        """Join tables of ``tasks_dir`` with tables of ``offers_dir`` and remove the inputs.

        Tables are processed in packs of at most 100. For every pack a new table
        ``<tasks_and_offers_dir>/<prefix><time>`` is written and the source
        tables of the pack are removed, all inside one YT transaction.
        """

        yt_client = self.yt_client
        yt_client.create('map_node', tasks_and_offers_dir, ignore_existing=True)

        offer_names = yt_client.list(offers_dir, sort=True)
        task_names = yt_client.list(tasks_dir, sort=True)
        if len(offer_names) != len(task_names):
            # newer tables were added (a commit between the two yt.list calls); do not process them yet
            logger.info(
                'table listings differ (%d tasks, %d offers), processing only tables present in both',
                len(task_names), len(offer_names),
            )
        task_names, offer_names = self._pair_table_names(task_names, offer_names)

        offer_tables = [yt.ypath_join(offers_dir, table) for table in offer_names]
        task_tables = [yt.ypath_join(tasks_dir, table) for table in task_names]

        pack_size = 100  # at most 100 tables at a time
        for pos in range(0, len(offer_tables), pack_size):
            offer_tables_pack = offer_tables[pos:pos + pack_size]
            task_tables_pack = task_tables[pos:pos + pack_size]
            tao_table_time = datetime.datetime.now().strftime(DEFAULT_DATE_FMT)
            target_table_name = '{0}/{1}{2}'.format(tasks_and_offers_dir, self.target_table_prefix, tao_table_time)

            with yt_client.Transaction() as tx:

                if self.no_yql:
                    logger.info('start yt_local join in %s', target_table_name)
                    with yt_client.TempTable() as tmp_offers, \
                            yt_client.TempTable() as tmp_tasks:

                        def append_timing_column(row):
                            # Mapper: stamp every task row with the time of the target table.
                            row['tao_table_time'] = tao_table_time
                            yield row

                        yt_client.run_map(append_timing_column, task_tables_pack, tmp_tasks)
                        logger.info('merge tasks OK')
                        yt_client.run_merge(offer_tables_pack, tmp_offers)
                        logger.info('merge offer OK')

                        schema = join_schemas_from_tables(task_tables_pack + offer_tables_pack, yt_client=yt_client)
                        schema += [{'name': 'tao_table_time', 'type': 'string', 'required': False}]
                        # since this is a left join, drop the 'required' constraint
                        for column in schema:
                            column['required'] = False
                        yt_client.create('table', target_table_name, attributes={'schema': schema}, force=True, recursive=True)
                        yt_client.run_map_reduce(TableIndexCopyMapper(),
                                                 TableIndexCompatibleJoinReducer(type='left'),
                                                 [tmp_tasks, tmp_offers],
                                                 target_table_name,
                                                 reduce_by=['task_id', 'timestamp'],
                                                 format=yt.YsonFormat(control_attributes_mode='row_fields'))
                        logger.info('JoinReducer OK')
                else:
                    # The query runs inside the current YT transaction (yt.ExternalTx).
                    query = """
                    PRAGMA SimpleColumns;
                    PRAGMA yt.ExternalTx = '{tx}';
                    INSERT INTO `{target}` WITH TRUNCATE
                    SELECT
                        offers.*,
                        '{tao_table_time}' as tao_table_time,
                        tasks.* WITHOUT tasks._other, offers._other
                    FROM (
                        SELECT *
                        FROM CONCAT({tasks})
                        ) as tasks
                    LEFT JOIN (
                        SELECT *
                        FROM CONCAT ({offers})
                        ) as offers
                    USING (`task_id`, `timestamp`)
                    ORDER BY RandomNumber(TableRow())
                    """.format(tasks=','.join(['`' + table + '`' for table in task_tables_pack]),
                               offers=','.join(['`' + table + '`' for table in offer_tables_pack]),
                               target=target_table_name,
                               tx=tx.transaction_id,
                               tao_table_time=tao_table_time)

                    logger.info('start yql for %s', target_table_name)
                    ads.libs.yql.run_yql_query(query=query, pool=self.yt_pool)
                    logger.info('yql OK')

                for table in offer_tables_pack + task_tables_pack:
                    yt_client.remove(table)
                logger.info('remove OK')


def merge_tao(input_tables, output_table, yql_client=None, yt_pool=None):
    """Run a YQL query that keeps, per ``task_id``, only the rows with the maximum ``timestamp``.

    Rows are read from the concatenation of ``input_tables`` and written to
    ``output_table`` with truncation. ``yql_client`` and ``yt_pool`` are passed
    through to the YQL runner.
    """
    quoted_tables = []
    for table in input_tables:
        quoted_tables.append('`{}`'.format(table))

    query_template = """
        PRAGMA yt.DataSizePerJob = '100M';

        $task_last_timestamp = (
            SELECT
                `task_id`,
                max(`timestamp`) as `timestamp`
            FROM CONCAT({input_tables_str})
            GROUP BY `task_id`
        );

        INSERT INTO `{output_table}` WITH TRUNCATE
        SELECT *
        FROM CONCAT({input_tables_str}) as t_in
        LEFT SEMI JOIN $task_last_timestamp as t_ts
        USING (`task_id`, `timestamp`)
    """
    query_params = {
        'input_tables_str': ','.join(quoted_tables),
        'output_table': output_table,
    }
    ads.libs.yql.run_yql_query(
        query=query_template.format(**query_params),
        client=yql_client,
        pool=yt_pool,
    )


def validate_url(url):
    """Return True when ``url`` parses, has a scheme and a netloc, and can be punycode-encoded.

    Any ordinary parsing error makes the URL invalid. Only ``Exception`` is
    caught, so ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
    """
    try:
        parsed_url = six.moves.urllib.parse.urlparse(url)
    except Exception:
        return False
    if not parsed_url.scheme or not parsed_url.netloc:
        return False
    try:
        urlpy.parse(six.ensure_text(url)).punycode()
    except Exception:
        return False
    return True


def cache_avatars_for_offer_images(tasks_and_offers_tables, avatars_client, yt_client=yt, product_range=None, **kwargs):
    """Collect distinct offer image URLs and pass them to the avatars client.

    :param tasks_and_offers_tables: input tables with ``product_inf`` (JSON) and ``product_md5`` columns
    :param avatars_client: object whose ``get_avatars_for_table`` is called on the table of URLs
    :param yt_client: YT client (or the ``yt`` module) used for the temp table and the map-reduce
    :param product_range: optional ``(lo_md5, hi_md5)`` pair; only rows with
        ``lo_md5 <= product_md5 < hi_md5`` are processed
    :param kwargs: passed through to ``avatars_client.get_avatars_for_table``
    :return: whatever ``get_avatars_for_table`` returns
    """
    def get_images(row):
        # Mapper: yields {'url': ...} for every usable image URL of the row.
        # It is best-effort: malformed rows and URLs are skipped, never raised on.
        if product_range is not None:
            lo_md5, hi_md5 = product_range
            pt_md5 = row['product_md5']
            if pt_md5 < lo_md5 or pt_md5 >= hi_md5:
                return

        try:
            pt_inf = json.loads(row['product_inf'])
        except Exception:
            return
        # Valid JSON that is not an object (list, number, null...) carries no images.
        if not isinstance(pt_inf, dict):
            return
        image_urls = pt_inf.get('images')
        if not image_urls:
            return

        # fallback for old format: csv
        if isinstance(image_urls, six.string_types):
            image_urls = image_urls.split(',')
        elif not isinstance(image_urls, list):
            return

        banned_domains_for_downloading = set(get_bl_opt('banned_domains_for_avatars_downloading'))
        for url in image_urls:
            # Skip non-string entries (null, nested objects) instead of failing on .encode().
            if not isinstance(url, six.string_types):
                continue
            url = url.encode('utf8')
            if not validate_url(url):
                continue
            url = six.ensure_str(urlpy.parse(six.ensure_text(url)).punycode().unicode)
            if get_sec_level_domain(url) in banned_domains_for_downloading:
                continue
            yield {'url': url}

    with yt_client.TempTable() as tmp_tao_urls:
        # FirstReducer over 'url' is used to leave one row per distinct URL.
        yt_client.run_map_reduce(
            get_images,
            FirstReducer(),
            tasks_and_offers_tables,
            tmp_tao_urls,
            reduce_by=['url'],
        )
        # cache is a side-effect!
        stats = avatars_client.get_avatars_for_table(tmp_tao_urls, output_table=None, **kwargs)

    return stats
