# -*- coding: utf-8 -*-
import re
import yt.wrapper as yt_wrapper
from datacloud.dev_utils.yt import yt_utils


@yt_wrapper.aggregator
class HostNameUrlExtractor:
    """Aggregating mapper that turns URL log rows into per-host visit rows.

    The callable receives an iterable of input rows, extracts the host name
    from each row's ``url`` field and yields rows with the keys
    ``<external id key>``, ``host``, ``timestamp`` and ``counter``.

    In the regular (non-retro) mode, consecutive rows that share the same
    external id and host are collapsed into a single output row whose
    ``counter`` is the length of the run.  Only *adjacent* rows are merged,
    so the result depends on the order of the input.  In retro mode every
    row with a recognizable host is emitted on its own with ``counter`` 1.

    The ``timestamp`` of an output row is the timestamp of the last input
    row that contributed to it.
    """

    def __init__(self, is_retro=False):
        """
        :param is_retro: when true, rows are keyed by ``external_id`` instead
            of ``yuid`` and no collapsing of consecutive rows is performed.
        """
        # NOTE(review): only the label right before the last dot may contain
        # '-'; leading labels and the final label are matched with \w only.
        # E.g. 'my-site.example.com' matches as 'my-site.example'.  Left
        # unchanged so that produced hosts stay comparable with existing data.
        self.pat = re.compile(r'((https?):\/\/)?(?P<domain>(\w+\.)*[\w\-]+\.(\w+))(\/.*)?')
        self.is_retro = is_retro
        self.external_id_key = 'external_id' if self.is_retro else 'yuid'

    def get_domain(self, url):
        """Return the host part of ``url`` without a leading ``www.``.

        The pattern is anchored at the start of the string (``re.match``);
        an optional ``http://`` / ``https://`` scheme is skipped.  Returns an
        empty string when the url does not match the pattern.
        """
        matching = self.pat.match(url)
        if not matching:
            return ''
        host_name = matching.group('domain')
        if host_name.startswith('www.'):
            host_name = host_name[4:]
        return host_name

    def _make_row(self, external_id, host, timestamp, counter):
        """Build one output row for a finished run of identical visits."""
        return {
            self.external_id_key: external_id,
            'host': host,
            'timestamp': timestamp,
            'counter': counter
        }

    def __call__(self, recs):
        """Yield aggregated host rows for the input rows ``recs``.

        Rows without a ``url`` key (or with ``url`` equal to ``None``) and
        rows whose url yields no host are skipped.
        """
        prev_external_id, prev_host, prev_timestamp = None, None, None
        counter = 0
        for rec in recs:
            url = rec.get('url')
            if url is None:
                continue
            external_id, timestamp = rec[self.external_id_key], rec['timestamp']
            host = self.get_domain(url)
            if not host:
                continue
            host = host[:1024]  # cap the stored host length
            same_run = (
                not self.is_retro
                and prev_external_id == external_id
                and prev_host == host
            )
            if same_run:
                counter += 1
            else:
                if prev_external_id is not None:
                    # Flush the finished run with ITS OWN timestamp, not the
                    # timestamp of the row that starts the next run.
                    yield self._make_row(prev_external_id, prev_host, prev_timestamp, counter)
                prev_external_id, prev_host = external_id, host
                counter = 1
            # Remember the timestamp of the latest row belonging to the
            # current run.
            prev_timestamp = timestamp
        # Same "is not None" check as for mid-stream flushes, so a falsy id
        # (0, '') in the trailing run is not silently dropped.
        if prev_external_id is not None:
            yield self._make_row(prev_external_id, prev_host, prev_timestamp, counter)


def get_input_log_tables(path_config, yt_client, date=None):
    """Return the list of log table paths to be processed.

    Two log folders under ``path_config.external_logs_dir`` are considered:
    ``watch_log_tskv`` and ``spy_log``.

    * retro mode (``path_config.is_retro`` is true): every node listed by
      ``yt_client.list(..., absolute=True)`` in each folder is returned;
      ``date`` is not used.
    * otherwise: one path per folder is returned, built by joining the
      folder path with ``date``.
    """
    folder_names = ('watch_log_tskv', 'spy_log')

    if not path_config.is_retro:
        # Daily mode: exactly one table per log folder, named after the date.
        return [
            yt_wrapper.ypath_join(path_config.external_logs_dir, name, date)
            for name in folder_names
        ]

    # Retro mode: take everything that is present in each log folder.
    tables = []
    for name in folder_names:
        folder_path = yt_wrapper.ypath_join(path_config.external_logs_dir, name)
        tables.extend(yt_client.list(folder_path, absolute=True))
    return tables


def reduce_counter(key, recs):
    """Reducer that sums the ``counter`` field over all rows of one key.

    The first row of the group is used as the result row (its other fields,
    e.g. ``timestamp``, are kept as is) and is updated in place; the
    ``counter`` values of the remaining rows are added to it.

    :param key: reduce key of the group (unused).
    :param recs: iterable of rows, each having a numeric ``counter`` field.
    :return: generator yielding a single aggregated row, or nothing when
        ``recs`` is empty (previously ``None`` was yielded in that case).
    """
    rows = iter(recs)
    result_rec = next(rows, None)
    if result_rec is None:
        # Empty group: there is nothing to emit.
        return
    for rec in rows:
        result_rec['counter'] += rec['counter']
    yield result_rec


@yt_wrapper.with_context
class DateFilter:
    """Reducer keeping only rows that fall into a time window before an
    anchor timestamp.

    Rows coming from the input table with index 0 provide the anchor
    ``timestamp`` for the current key; rows from the other tables are
    emitted only if their ``timestamp`` is strictly earlier than the anchor
    and less than ``days_to_take`` days before it.
    """

    def __init__(self, days_to_take=180):
        """
        :param days_to_take: length of the accepted window, in days.
        """
        self.days_to_take = days_to_take
        # Window length as 86400 * days (presumably seconds, i.e. the unit
        # of the ``timestamp`` field -- confirm against the input schema).
        self.past_time_border = 86400 * self.days_to_take

    def __call__(self, key, recs, context):
        """Yield the rows of ``recs`` that lie inside the window.

        :param key: reduce key of the group (unused).
        :param recs: rows of the group; ``context.table_index`` tells which
            input table the current row comes from.
        :param context: job context exposing ``table_index``.
        """
        anchor_ts = None
        for row in recs:
            if context.table_index == 0:
                # Row from the first table: remember its timestamp as anchor.
                anchor_ts = row['timestamp']
                continue
            if not anchor_ts:
                # No (non-zero) anchor seen before the first row from
                # another table: stop processing this key.
                break
            row_ts = row['timestamp']
            if row_ts < anchor_ts and anchor_ts - row_ts < self.past_time_border:
                yield row

def daily_hostnames_extract(path_config, yt_client, date):
    """Build the table of extracted hosts for ``date`` (or for a retro run).

    Steps:

    1. Resolve the input log tables and make sure the data folders exist.
    2. Inside one transaction: in retro mode, first filter the logs by
       timestamp against ``path_config.all_users`` (``DateFilter``) into a
       table under ``//tmp``; then map the inputs with
       ``HostNameUrlExtractor`` into
       ``path_config.current_extracted_urls_table``.
    3. After the transaction block: sum the counters per (key, host) with
       ``reduce_counter``, merge chunks (non-retro only) and sort the table
       by ``path_config.EXTRACTED_URLS_KEY``.  Steps 3 rewrite the extracted
       table in place.
    """
    def titled_spec(title_format):
        # Operation spec: cluster settings from the config plus a title
        # prefixed with the config tag.
        return dict(
            title=title_format.format(path_config.tag),
            **path_config.cloud_nodes_spec
        )

    source_tables = get_input_log_tables(path_config, yt_client, date)
    yt_utils.create_folders(
        [path_config.data_dir, path_config.extracted_urls_dir], yt_client
    )

    with yt_client.Transaction():
        if path_config.is_retro:
            # Retro runs are pre-filtered by date into a temporary table,
            # which then replaces the raw logs as the mapper input.
            retro_filtered = yt_wrapper.ypath_join(
                '//tmp', 'tmp-filtered-logs-' + path_config.retro_tag
            )
            yt_client.run_reduce(
                DateFilter(path_config.days_to_take),
                [path_config.all_users] + source_tables,
                retro_filtered,
                reduce_by=[path_config.EXTRACTED_URLS_KEY],
                spec=titled_spec('[{}] RETRO Filter Extracted hosts by timestamp')
            )
            source_tables = [retro_filtered]

        yt_client.run_map(
            HostNameUrlExtractor(path_config.is_retro),
            source_tables,
            path_config.current_extracted_urls_table,
            spec=titled_spec('[{}] HostNameUrlExtractor')
        )

    extracted_table = path_config.current_extracted_urls_table

    # No mapper: only the reduce phase is needed to sum up the counters.
    yt_client.run_map_reduce(
        None,
        reduce_counter,
        extracted_table,
        extracted_table,
        reduce_by=[path_config.EXTRACTED_URLS_KEY, 'host'],
        spec=titled_spec('[{}] HostNameUrlExtractor Count daily urls')
    )

    if not path_config.is_retro:
        yt_client.run_merge(
            extracted_table,
            extracted_table,
            spec=dict(
                title='[{}] HostNameUrlExtractor Merge chunks'.format(path_config.tag),
                combine_chunks=True,
                **path_config.cloud_nodes_spec
            )
        )

    yt_client.run_sort(
        extracted_table,
        sort_by=path_config.EXTRACTED_URLS_KEY,
        spec=titled_spec('[{}] HostNameUrlExtractor Sort after')
    )
