# coding=utf-8

import re
import six
import six.moves.urllib.parse

import typing  # noqa
if typing.TYPE_CHECKING:
    from typing import List
    from bmyt import BMYT

import ads.quality.dssm.lib.python.url_norm
import yt.yson as yson
import yt.wrapper as yt

import irt.bannerland.options
import irt.broadmatching.common_options


# Module-level UrlNormalizer instance, created once at import time and
# used by normalize_url_strong().
url_normalizer = ads.quality.dssm.lib.python.url_norm.UrlNormalizer()

# NOTE: backslashes are doubled on purpose.  The literals keep the u'' prefix
# (valid on both py2 and py3, unlike ur''), and doubling avoids invalid string
# escapes such as '\S' while producing exactly the same regex text.

# Patterns applied by phrase_preprocess(): a "site:..." token, a run of two or
# more whitespace characters, and a whitespace-delimited run of non-word
# characters.
phrase_preprocess_site_re = re.compile(u'\\bsite:\\S*', re.UNICODE)
phrase_preprocess_space_re = re.compile(u'\\s{2,}', re.UNICODE)
phrase_preprocess_nonword_re = re.compile(u'(^|\\s)\\W+(\\s|$)', re.UNICODE)

# Patterns applied by prepare_url(): each matches one GET parameter (including
# its trailing '&', if any) located right after '?' or '&'.
prepare_url_utm_re = re.compile(u'(?<=[&?])[^&?=]*?utm\\_[^&?]*?(\\&|$)', re.UNICODE)
prepare_url_pm_re = re.compile(u'(?<=[&?])pm\\_[^&?]*?(\\&|$)', re.UNICODE)
prepare_url_yd_re = re.compile(u'(?<=[&?])yd\\_[^&?]*?(\\&|$)', re.UNICODE)
prepare_url_r2_re = re.compile(u'(?<=[&?])r2=[^&?]*?(\\&|$)', re.UNICODE)
prepare_url_r1_re = re.compile(u'(?<=[&?])r1=[^&?]*?(\\&|$)', re.UNICODE)
prepare_url_roistat_re = re.compile(u'(?<=[&?])roistat=[^&?]*?(\\&|$)', re.UNICODE)


# Optional "http://" / "https://" scheme followed by an optional "www." at the
# very beginning of the string; removed by canonical_url().
canonical_url_re = re.compile(u'^(https?://)?(www\\.)?', re.UNICODE)

# YT directory whose tables are read by get_clicked_banner_urls().
clicked_banners_log_dir = '//home/bannerland/logs/dyn_bs_log'

# Default row size limit (8 * yt.common.MB) used by enforce_dyntable_guarantees().
DYNTABLE_ENFORCE_MAX_ROW_SIZE = 8 * yt.common.MB


class FirstReducer(object):
    """Reducer that emits only the first row of every group.

    The "@table_index" control field is removed from the emitted row when
    present.  An empty group produces no output.
    """

    def __call__(self, key, rows):
        nothing = object()
        head = next(iter(rows), nothing)
        if head is nothing:
            # empty group: nothing to emit
            return
        # drop the control attribute (if any) before emitting the row
        head.pop("@table_index", None)
        yield head


def get_clicked_banner_urls(
        output_table,     # type: str
        table_count=180,  # type: int
        yt_client=yt      # type: yt.client.YtClient
):
    """
    Collects unique urls of clicked banners into output_table, sorted by url.

    Tables are taken from clicked_banners_log_dir: their names are sorted in
    reverse order and the first table_count of them are processed (presumably
    the names are dates, so these are the most recent days -- verify against
    the log layout).  A row counts as a click when its 'Url' is non-empty and
    its 'EventCost' is positive.

    Args:
        output_table:
            YT path to output table (single column 'url', unique values)
        table_count:
            Number of daily tables to process.
        yt_client:
            YT client
    Returns:
        None
    """
    def clicked_url_mapper(row):
        # rows without a url are never emitted
        if not row['Url']:
            return
        cost = row['EventCost']
        # the truthiness check keeps a missing (None) cost away from the comparison
        if cost and cost > 0:
            yield {
                'url': row['Url'],
            }

    all_names_desc = sorted(yt_client.list(clicked_banners_log_dir), reverse=True)
    daily_tables_paths = []
    for table_name in all_names_desc[:table_count]:
        full_path = yt.ypath_join(clicked_banners_log_dir, table_name)
        # only the two columns used by the mapper are read
        daily_tables_paths.append(yt.TablePath(full_path, columns=['Url', 'EventCost']))

    # FirstReducer keeps one row per url, both at the combine and the reduce stage
    yt_client.run_map_reduce(
        clicked_url_mapper,
        FirstReducer(),
        daily_tables_paths,
        output_table,
        reduce_by=['url'],
        reduce_combiner=FirstReducer(),
        format=yt.YsonFormat(control_attributes_mode='row_fields'),
    )
    yt_client.run_sort(output_table, sort_by=['url'])


def phrase_preprocess(string):
    """Cleans up a phrase and returns the result.

    In order, each of the following is replaced with a single space:
    "site:..." tokens, whitespace-delimited runs of non-word characters,
    runs of two or more whitespace characters.  The result is then stripped.
    """
    cleanup_steps = (
        phrase_preprocess_site_re,
        phrase_preprocess_nonword_re,
        phrase_preprocess_space_re,
    )
    for pattern in cleanup_steps:
        string = pattern.sub(' ', string)
    return string.strip()


def normalize_url_strong(url):
    """Returns url normalized by url_normalizer in "direct_draft,cut_https,strong" mode.

    The input is stripped first; if the normalized value does not start with
    'http', "http://" is prepended to it.
    """
    normalized = url_normalizer.normalize(url.strip(), "direct_draft,cut_https,strong")
    # a value shorter than 4 characters can never equal 'http' after slicing,
    # so a separate length check is not needed
    if normalized[:4] != 'http':
        normalized = "http://" + normalized
    return normalized


def prepare_url(url):
    """Returns url without tracking GET parameters and with sorted GET parameters.

    Steps:
      * strip whitespace and prepend "http://" unless the url starts with 'http';
      * remove parameters matched by the prepare_url_*_re patterns
        (utm_*, pm_*, yd_*, r1, r2, roistat), in that order;
      * strip leading/trailing '?' and '&';
      * if the url contains '&' and there is a '?' before the last '&',
        sort everything after that '?' as '&'-separated items.
    """
    url = url.strip()
    if url[:4] != 'http':
        url = "http://" + url

    removable_params = (
        prepare_url_utm_re,
        prepare_url_pm_re,
        prepare_url_yd_re,
        prepare_url_r1_re,
        prepare_url_r2_re,
        prepare_url_roistat_re,
    )
    for pattern in removable_params:
        url = pattern.sub('', url)
    url = url.strip('?&')

    # sort get-params
    last_amp = url.rfind('&')
    if last_amp == -1:
        return url
    # the last '?' located before the last '&' starts the parameter list
    query_start = url.rfind('?', 0, last_amp)
    if query_start == -1:
        return url
    prefix = url[:query_start + 1]
    ordered = sorted(url[query_start + 1:].split('&'))
    return prefix + "&".join(ordered)


def canonical_url(url):
    """Returns url stripped, lower-cased and without the leading scheme / "www." prefix."""
    lowered = url.strip().lower()
    return canonical_url_re.sub('', lowered)


def second_level_domain(url):
    """Returns the last two dot-separated components of the url's netloc.

    The netloc is used as returned by urlparse, i.e. a port or credentials
    (if present) are not removed.
    """
    netloc = six.moves.urllib.parse.urlparse(url).netloc
    # splitting from the right at most twice keeps the last two labels intact
    tail = netloc.rsplit('.', 2)[-2:]
    return '.'.join(tail)


def second_level_domains_equal(strong_url, url):
    """Tells whether both urls have the same second_level_domain().

    Returns False when parsing either url raises ValueError.
    """
    try:
        strong_domain = second_level_domain(strong_url)
        plain_domain = second_level_domain(url)
    except ValueError:
        return False
    return strong_domain == plain_domain


def is_bs_compatible(string):
    """Checks if string contains only characters allowed in Direct.

    Non-text input is expected to be utf-8 and is decoded with
    errors='replace'.  The allowed character set is read from the
    'bannerland_allowed_chars' option on first use and cached as an attribute
    of this function.

    Returns:
        True if every character of the string is in the allowed set.
    """
    if isinstance(string, six.text_type):
        text = string
    else:
        text = six.ensure_text(string, errors='replace')

    try:
        allowed = is_bs_compatible.allowed_set
    except AttributeError:
        # first call: load the option once and keep it for later calls
        allowed = set(irt.bannerland.options.get_option('bannerland_allowed_chars'))
        is_bs_compatible.allowed_set = allowed

    return set(text).issubset(allowed)


def filter_freq(norm, sub, exact, exact_high=10000, sub_high=20000, sub_quote=1000, sub_low=5):
    """Decides, by frequencies, whether a phrase is kept and in which form.

    Args:
        norm: phrase; words are counted by splitting on a single space
        sub: frequency compared against sub_low / sub_quote / sub_high
        exact: frequency compared against exact_high
        exact_high, sub_high, sub_quote, sub_low: thresholds

    Returns:
        norm itself, norm + ' ~0' (the "quoted" form), or None when the
        phrase is filtered out.
    """
    word_total = len(norm.split(' '))
    if word_total > 3:  # long phrases are always unquoted
        sub_quote, exact_high = sub_high, -1
    if word_total == 1:  # one-word phrases are always quoted
        sub_quote = sub_low

    quoted = norm + ' ~0'
    if sub < sub_high:
        if sub < sub_quote:
            # too rare phrases are dropped, the rest stay unquoted
            return None if sub < sub_low else norm
        return quoted
    # very frequent phrase: keep it (quoted) only while exact is below exact_high
    return quoted if exact < exact_high else None


class HitsJoinReducer(object):
    """Reducer joining hit counters to rows that share the same key.

    When the first record of a group comes from input table 0, its
    'freq_query' and 'freq' values are used as 'exact_hits' and 'hits' and
    the record itself is not emitted.  Otherwise both counters are 0 and the
    first record is emitted as well.  Every emitted record gets 'exact_hits',
    'hits' and '@table_index' = 0.

    An empty group produces no output (a bare next() here would surface as
    RuntimeError inside a generator on Python 3.7+).
    """

    def __call__(self, key, recs):
        recs = iter(recs)
        first = next(recs, None)
        if first is None:
            return
        exact_hits = 0
        hits = 0
        if first['@table_index'] == 0:
            # counters record: remember the values, do not emit it
            exact_hits = first['freq_query']
            hits = first['freq']
        else:
            # no counters for this key: emit the row with zero counters
            first['exact_hits'] = exact_hits
            first['hits'] = hits
            first['@table_index'] = 0
            yield first
        for rec in recs:
            rec['exact_hits'] = exact_hits
            rec['hits'] = hits
            rec['@table_index'] = 0
            yield rec


def join_hits_by_norm(table, yt_client=yt):
    """
    Adds 'hits' and 'exact_hits' fields to the rows of table, in place.

    The counters table path is taken from the 'yt_table_counts_full' entry of
    the 'QueryLogStat_params' options; rows are matched by the 'norm' field
    using HitsJoinReducer.  The result is written back to `table` with the
    sorted_by=[norm] attribute.
    NOTE(review): run_reduce presumably requires `table` to be already sorted
    by 'norm' -- confirm against callers.

    Args:
        table:
            YT path to the table to update
        yt_client:
            YT client; defaults to the `yt` module, as in
            get_clicked_banner_urls and enforce_dyntable_guarantees
            (`yt.client` is the client submodule, not a usable client).
    """
    hits_table = irt.broadmatching.common_options.get_options()['QueryLogStat_params']['yt_table_counts_full']

    yt_client.run_reduce(
        HitsJoinReducer(),
        # the counters table goes first: HitsJoinReducer expects it at table index 0
        [hits_table, table],
        '<sorted_by=[norm]>' + table,
        reduce_by=['norm'],
        format=yt.YsonFormat(control_attributes_mode="row_fields"),
    )


def strong_url_weaken(input_table, output_table, yt_client=yt):
    """
    Replaces too "wide" strong_url values with a weaker, per-url normalization.

    For every strong_url the number of distinct (strong_url, url) pairs is
    counted.  Rows whose strong_url has more than 10 such pairs get
    strong_url = prepare_url(canonical_url(url)); all other rows are copied
    unchanged.

    Args:
        input_table:
            YT path to input table; rows are expected to have 'strong_url' and 'url'
        output_table:
            YT path to output table
        yt_client:
            YT client; defaults to the `yt` module, as in
            get_clicked_banner_urls and enforce_dyntable_guarantees
            (`yt.client` is the client submodule, not a usable client).
    """
    with yt_client.TempTable() as temp_stat, \
            yt_client.TempTable() as temp_table, \
            yt_client.TempTable() as temp_table_uniq_urls:

        yt_client.run_sort(input_table, temp_table, sort_by=['strong_url', 'url'])

        def uniq_reducer(key, rows):
            # keep one row per (strong_url, url) pair; an empty group yields nothing
            for row in rows:
                yield row
                break

        yt_client.run_map_reduce(
            None,
            uniq_reducer,
            temp_table+'{strong_url,url}',
            temp_table_uniq_urls,
            reduce_combiner=uniq_reducer,
            reduce_by=['strong_url', 'url'],
            format=yt.YsonFormat(control_attributes_mode="row_fields"),
        )

        yt_client.run_sort(temp_table_uniq_urls, sort_by=['strong_url', 'url'])

        def count_reducer(key, rows):
            # Works both as combiner and as reducer: rows without 'count'
            # (raw pairs) weigh 1, combiner output carries partial sums.
            count = 0
            for row in rows:
                count += row.get('count', 1)
            yield {
                'strong_url': key['strong_url'],
                'count': count
            }

        yt_client.run_map_reduce(
            None,
            count_reducer,
            temp_table_uniq_urls,
            temp_stat,
            reduce_by=['strong_url'],
            reduce_combiner=count_reducer,
            format=yt.YsonFormat(control_attributes_mode="row_fields"),
        )

        yt_client.run_sort(temp_stat, sort_by=['strong_url'])

        def weaken_reducer(key, rows):
            rows = iter(rows)
            # a bare next() would turn an empty group into RuntimeError on py3.7+
            first = next(rows, None)
            if first is None:
                return
            weaken = 0
            if first['@table_index'] == 0:
                # statistics row (foreign table): decides the mode, is not emitted
                if first['count'] > 10:
                    weaken = 1
            else:
                # no statistics row for this key: emit the data row unchanged
                first['@table_index'] = 0
                yield first
            for row in rows:
                if weaken:
                    row['strong_url'] = prepare_url(canonical_url(row['url']))
                row['@table_index'] = 0
                yield row

        yt_client.run_join_reduce(
            weaken_reducer,
            ['<foreign=%true>'+temp_stat, temp_table],
            output_table,
            join_by=['strong_url'],
            format=yt.YsonFormat(control_attributes_mode="row_fields"),
        )


def add_norm_url_to_table(
        input_table,      # type: str
        output_table,     # type: str
        bmyt_client=None  # type: BMYT
):
    """
    Copies input_table to output_table adding a norm_url field, which is
    computed from the url field by the Perl prepare_url_key method.

    Args:
        input_table:
            YT path to input table
        output_table:
            YT path to output table
        bmyt_client:
            instance of BMYT to run operations with. Must incorporate bm_bannerland_lib

    Raises:
        ValueError: if bmyt_client is not given
    """

    if bmyt_client is None:
        raise ValueError('bmyt_client expected')

    # Perl snippet under the 'begin' key: creates the project and its perftask
    # (presumably executed once before rows are processed -- confirm in BMYT).
    perl_begin = """
            use BM::BannersMaker::BannerLandProject;
            $self->{proj} = BM::BannersMaker::BannerLandProject->new;
            $self->{task} = $self->{proj}->perftask({});
        """
    # Perl snippet under the 'mapper' key: fills norm_url and emits the row.
    perl_mapper = """
            $r->{norm_url} = $self->{task}->prepare_url_key($r->{url});
            yield($r);
        """

    addkey_mapper = dict(
        begin=perl_begin,
        mapper=perl_mapper,
        dst_options=[dict(unknown_as_string=True)],
    )
    bmyt_client.run_bm_map(addkey_mapper, input_table, output_table, rows_per_bm_job=500000)


def enforce_dyntable_guarantees(
        input_table,                                 # type: str
        output_table,                                # type: str
        key=None,                                    # type: typing.Optional[List[str]]
        max_row_size=DYNTABLE_ENFORCE_MAX_ROW_SIZE,  # type: int
        yt_client=yt                                 # type: yt.client.YtClient
):
    """
    Forces table to contain unique keys and rows smaller than limit.

    Rows whose YSON serialization is max_row_size bytes or longer are dropped.
    Among rows sharing the same key a single one is kept: the first one in
    (key, -size) sort order, i.e. the biggest row assuming the usual
    ascending sort.

    Args:
        input_table:
            YT path to input table
        output_table:
            YT path to output table
        key:
            table key (to enforce unique keys); defaults to ['url']
        max_row_size:
            maximum allowed row size in bytes (bigger rows will be dropped)
        yt_client:
            YT client
    """
    # None sentinel instead of a mutable list default shared between calls
    if key is None:
        key = ['url']

    def add_row_size(row):
        size = len(yson.dumps(row))
        if size < max_row_size:
            # negated size is a temporary sort column, removed in the reducer
            row['minus_row_size'] = -size
            yield row

    def row_size_reducer(_key, rows):
        # only the first row of every key group survives
        for row in rows:
            del row['minus_row_size']
            yield row
            break

    yt_client.run_map_reduce(
        add_row_size,
        row_size_reducer,
        input_table,
        output_table,
        sort_by=key + ['minus_row_size'],
        reduce_by=key
    )
