# coding=utf-8

import re
from collections import defaultdict
from statbox_bindings2.string_utils.misc import canonize_vhost
from qb2.api.v1 import extractors as qe, filters as qf, typing as qt
from nile.api.v1 import Record, clusters, with_hints, aggregators as na, statface as ns, cli

# Referer pages accepted by the report: the two generic entry points plus,
# for each image service prefix ('images', 'gorsel'), the root and 'search'
# pages of the default, touch, smart and pad variants.
PAGES = {'/', '/yandsearch'} | {
    '/{}/{}{}'.format(service, platform, leaf)
    for service in ('images', 'gorsel')
    for platform in ('', 'touch/', 'smart/', 'pad/')
    for leaf in ('', 'search')
}

# Type hints for the records emitted by ``process_paths``: the same four
# string fields that the qb2 stage in ``add_work`` extracts.
output_schema_0 = {
    'date': qt.String,
    'path': qt.String,
    'referer_page': qt.String,
    'referer_canonized_vhost': qt.String,
}

# Type hints for the records emitted by ``calc_paths_reducer``: one report row
# per path prefix, where ``path`` is the prefix split into its dot-separated
# components, ``hits`` counts records whose path equals the prefix exactly and
# ``total_hits`` counts records whose path starts with it.
output_schema_1 = {
    'hits': qt.UInt64,
    'page': qt.String,
    'path': qt.List[qt.String],
    'vhost': qt.String,
    'fielddate': qt.String,
    'total_hits': qt.UInt64,
}

# A well-formed click path: dot-separated groups of digits, e.g. ``8.228.471``.
# Used with ``.match()``, so the start is anchored implicitly and ``\Z`` pins
# the very end of the string.
RE_PATH = re.compile(r'\d+(\.\d+)*\Z')

# A supported yandex.<tld> host or any of its subdomains.
# Anchored explicitly at both ends:
#   * ``\A`` keeps the pattern correct even if the consumer applies it with
#     search semantics (otherwise ``notyandex.ru`` would be accepted);
#   * ``\Z`` (unlike ``$``) does not accept a trailing newline, which also
#     makes it consistent with RE_PATH.
RE_VHOST = re.compile(r'\A(.*\.)?yandex\.(ru|ua|by|kz|com\.tr|com)\Z')


@with_hints(output_schema_0)
def process_paths(recs):
    """Normalize click paths, collapsing malformed ones into one bucket.

    Records whose ``path`` matches ``RE_PATH`` are passed through untouched.
    Every other record is re-emitted with ``path`` replaced by
    ``'8.228._garbage_'``.

    NOTE(review): the upstream filter in ``add_work`` also lets ``8.584*``
    paths through, so malformed ``8.584.*`` paths end up in the ``8.228``
    garbage bucket as well -- confirm this is intended.
    """
    for record in recs:
        if RE_PATH.match(record.path):
            yield record
            continue
        yield record.transform(path='8.228._garbage_')


@with_hints(output_schema_1)
def calc_paths_reducer(groups):
    """Roll per-path hit counts up the dot-separated path hierarchy.

    For every group (keyed by ``date``, ``referer_page`` and
    ``referer_canonized_vhost``) one record is emitted per distinct path
    prefix seen in the group:

    * ``hits`` -- sum of ``rec.hits`` over records whose path equals the
      prefix exactly (0 when the prefix only occurs as an ancestor);
    * ``total_hits`` -- sum of ``rec.hits`` over records whose path is the
      prefix itself or lies below it.
    """
    for key, recs in groups:
        exact_hits = defaultdict(int)
        subtree_hits = defaultdict(int)

        for rec in recs:
            exact_hits[rec.path] += rec.hits
            # Grow the prefix one component at a time: 'a', 'a.b', 'a.b.c', ...
            prefix = None
            for part in rec.path.split('.'):
                prefix = part if prefix is None else prefix + '.' + part
                subtree_hits[prefix] += rec.hits

        for prefix, subtree_total in subtree_hits.items():
            yield Record(
                fielddate=key.date,
                vhost=key.referer_canonized_vhost,
                page=key.referer_page,
                path=prefix.split('.'),
                # .get() so that pure ancestors report 0 without being added.
                hits=exact_hits.get(prefix, 0),
                total_hits=subtree_total,
            )


def add_work(job, date, statface_report):
    """Attach the processing pipeline for one day to ``job``.

    Stages, in order:

    1. concatenate the images redir log and redir tech log tables for ``date``;
    2. parse them with qb2 as ``redir-log``, keeping only records with a
       ``8.228`` / ``8.584`` path (or a sub-path of either), a referer page
       from ``PAGES`` and a canonized referer vhost matching ``RE_VHOST``;
    3. normalize paths with ``process_paths``;
    4. count records per (date, referer page, vhost, path);
    5. roll the counts up the path hierarchy with ``calc_paths_reducer``;
    6. publish the result to ``statface_report``.

    :param job: nile job the operations are added to.
    :param date: value substituted into the daily log table paths.
    :param statface_report: report object passed to ``publish``.
    """
    redir_table = job.table('//logs/images-redir-log/1d/{}'.format(date))
    tech_table = job.table('//logs/images-redir-tech-log/1d/{}'.format(date))
    merged_logs = job.concat(redir_table, tech_table)

    # NOTE(review): the lambda calls .strip() unconditionally; presumably qb2
    # only invokes it when referer_vhost is defined -- confirm.
    canonized_vhost = qe.custom(
        'referer_canonized_vhost',
        lambda referer_vhost: canonize_vhost(referer_vhost.strip(':')),
    )
    canonized_vhost = canonized_vhost.allow_override()
    canonized_vhost = canonized_vhost.with_type(qt.Optional[qt.String])

    extracted_fields = ['date', 'path', 'referer_page', canonized_vhost]

    wanted_path = qf.or_(
        qf.startswith('path', '8.228.'),
        qf.equals('path', '8.228'),
        qf.equals('path', '8.584'),
        qf.startswith('path', '8.584.'),
    )
    record_filters = [
        qf.default_filtering('redir-log'),
        wanted_path,
        qf.one_of('referer_page', PAGES),
        qf.match('referer_canonized_vhost', RE_VHOST),
        qf.defined('date', 'path', 'referer_canonized_vhost', 'referer_page'),
    ]

    parsed = merged_logs.qb2(
        log='redir-log',
        fields=extracted_fields,
        filters=record_filters,
        mode='yamr_lines',
    )
    normalized = parsed.map(process_paths)

    hits_per_path = normalized.groupby(
        'date', 'referer_page', 'referer_canonized_vhost', 'path',
    ).aggregate(
        hits=na.count(),
    )

    report_rows = hits_per_path.groupby(
        'date', 'referer_page', 'referer_canonized_vhost',
    ).reduce(
        calc_paths_reducer,
    )

    report_rows.publish(statface_report, mode='local')


@cli.statinfra_job()
def click_on_serp(job, options, statface_client):
    """Build the job that fills the daily ``Image/Adhoc/ClicksOnSerp`` report.

    One pipeline (see ``add_work``) is attached to ``job`` for every entry in
    ``options.dates``; all of them publish to the same report object.

    :returns: the same ``job`` with all pipelines attached.
    """
    statface_report = (
        ns.StatfaceReport()
        .path('Image/Adhoc/ClicksOnSerp')
        .scale('daily')
        .client(statface_client)
    )

    for day in options.dates:
        add_work(job, day, statface_report)

    return job


# Start nile's CLI runner only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    cli.run()
