# -*- coding: utf-8 -*-
import sys
from library.python.nyt import client as nyt_client
# nyt_client.initialize(sys.argv)
nyt_client.initialize(list(map(lambda it: it.encode(), sys.argv)))
import os
import hashlib
import yt.wrapper as yt_wrapper
from datacloud.features.dssm import fast_dssm45
from datacloud.dev_utils.logging.logger import get_basic_logger


# Module-level logger obtained from the project's logging helper.
logger = get_basic_logger(__name__)
# Major version of the running interpreter; the reducer below branches on it
# to pick between Python 2 byte-string and Python 3 text handling.
VERSION = sys.version_info.major


@yt_wrapper.with_context
class TitleUrlReducer(object):
    """Reducer that attaches an id to (title, url) records sharing a reduce key.

    Rows coming from input table 0 provide the id (``external_id`` or, when
    that column is absent, ``cid``) and optionally a reference ``timestamp``.
    Rows from every other input table are treated as title/url records: they
    are filtered, cleaned, hashed and emitted with the id stored under ``key``.

    Args:
        retro_days_to_take: size of the look-back window, in days, applied
            when the id row carries a reference timestamp.
        max_records_per_key: upper bound on the number of records emitted for
            a single reduce key.
    """

    def __init__(self, retro_days_to_take=175, max_records_per_key=1000000):
        # Characters at which a url is cut (the first occurrence of any wins).
        self.chars_to_find = '?%#&='
        # Maximum length kept for both url and title.
        self.str_len = 512
        self.retro_days_to_take = retro_days_to_take
        # Look-back window expressed in seconds (86400 seconds per day).
        self.past_time_border = 86400 * self.retro_days_to_take
        self.max_records_per_key = max_records_per_key

    def clean_url(self, url):
        """Truncate ``url`` to ``str_len`` and cut it at the first special char.

        Returns an empty string for ``None``. Otherwise returns the prefix of
        the (length-limited) url that precedes the earliest occurrence of any
        character from ``chars_to_find``; the url is returned as-is (after the
        length limit) when none of them occurs.
        """
        if url is None:
            return ''

        url = url[:self.str_len]
        positions = [url.find(char) for char in self.chars_to_find]
        found = [pos for pos in positions if pos != -1]
        if found:
            url = url[:min(found)]
        return url

    def _decode_utf8(self, value):
        """Return ``value`` decoded from UTF-8 on Python 2, unchanged otherwise.

        On Python 2 a value that fails to decode is replaced with the
        placeholder ``'undecodable'``.
        """
        if VERSION != 2:
            return value
        try:
            return value.decode('utf-8')
        except ValueError:
            # UnicodeDecodeError / UnicodeEncodeError are ValueError subclasses.
            return 'undecodable'

    def __call__(self, key, recs, context):
        """Process all rows of one reduce key and yield cleaned records.

        Args:
            key: reduce key of the group (not used directly).
            recs: iterable of rows for this key.
            context: YT context object; ``context.table_index`` tells which
                input table the current row came from.

        Yields:
            dict with ``title``, ``url``, ``key`` (the id taken from table 0),
            ``hash`` (md5 hex digest of truncated title + cleaned url) and
            ``timestamp``.
        """
        external_id, counter = None, 0
        filter_timestamp = None
        for rec in recs:
            if context.table_index == 0:
                # Id row: remember the id and, if present, the reference time.
                if 'external_id' in rec:
                    external_id = rec['external_id']
                    filter_timestamp = rec['timestamp']
                else:
                    external_id = rec['cid']
                continue

            if external_id is None:
                # No id row was seen for this key, nothing to attach records to.
                break

            title = rec.get('title')
            url = rec.get('url')
            timestamp = rec.get('timestamp')

            if filter_timestamp is not None:
                # A record without a timestamp cannot be placed inside the
                # window; skip it instead of failing on `int - None`.
                if timestamp is None:
                    continue
                if not (0 < filter_timestamp - timestamp < self.past_time_border):
                    continue

            if url is None or title is None:
                continue
            if len(url) <= 3 or len(title) <= 3:
                continue
            if counter >= self.max_records_per_key:
                continue

            url = self.clean_url(url)
            title = title[:self.str_len]

            # possible breaking change, check
            # Python 2 hashes the raw byte strings, Python 3 hashes the UTF-8
            # encoding of the text.
            payload = title + url
            if VERSION != 2:
                payload = payload.encode('utf-8')
            hash_value = hashlib.md5(payload).hexdigest()

            title = self._decode_utf8(title)
            url = self._decode_utf8(url)

            title = title.replace('\'', '')
            # have to replace this substrings because of awk preprocessing to run nn_applier in streaming mode
            for marker in ('hash=', 'title=', 'url='):
                title = title.replace(marker, ' ')

            counter += 1
            yield {
                'title': title,
                'url': url,
                'key': external_id,
                'hash': hash_value,
                'timestamp': timestamp
            }


def run_prepare_title_url(config, yt_client):
    """Build the title/url tables: reduce, sort by hash, then unique-hash reduce.

    Steps performed, in order:
      1. Run ``TitleUrlReducer`` over ``config.yuid2cid_table`` followed by the
         tables returned by ``config.get_grep_tables``; the output goes to
         ``config.yuid2title_url4_table_all``.
      2. Sort that output by ``hash``.
      3. Call ``fast_dssm45.fast_unique_hash_reduce`` from the sorted table
         into ``config.yuid2title_url4_table``.

    Args:
        config: step configuration object (tables, tag, retro flag, specs).
        yt_client: YT client used to run the operations.

    Raises:
        AssertionError: when no YT token is available either in the
            ``yt.wrapper`` config or in the ``YT_TOKEN`` environment variable.
    """
    logger.info('Is retro: {}'.format(config.is_retro))
    yt_token = yt_wrapper.config['token'] or os.environ.get('YT_TOKEN')
    if not yt_token:
        # Raised explicitly (not via `assert`) so the check is still enforced
        # when Python runs with -O; the exception type is kept for callers.
        raise AssertionError('[DSSM JOIN CIDS] No YT_TOKEN provided')

    # Materialize once: the tables are both iterated for logging and
    # concatenated with a list below.
    tables = list(config.get_grep_tables(yt_client, config))
    logger.info('Tables are')
    for table in tables:
        logger.info(table)

    reduce_key = ('external_id', 'yuid') if config.is_retro else 'yuid'

    yt_client.run_reduce(
        TitleUrlReducer(retro_days_to_take=config.days_to_take),
        [config.yuid2cid_table] + tables,
        config.yuid2title_url4_table_all,
        reduce_by=reduce_key,
        spec=dict(
            title='[{}] title url reducer'.format(config.tag),
            **config.cloud_nodes_spec
        )
    )
    # Sorted for later join_reduce call
    yt_client.run_sort(
        config.yuid2title_url4_table_all,
        sort_by=['hash'],
        spec=dict(
            title='[{}] sort after TitleUrlReducer'.format(config.tag),
            **config.cloud_nodes_spec
        )
    )
    fast_dssm45.fast_unique_hash_reduce(
        yt_token,
        yt_client.config['proxy']['url'],
        str(config.yuid2title_url4_table_all),
        str(config.yuid2title_url4_table)
    )
