#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import codecs
import argparse
import itertools
import datetime
import json
import re
from collections import defaultdict, Counter
from mma_1559_live import proc_url


# Captures the lowercase-hexadecimal value of a ``stream_id=`` URL parameter.
# A plain raw string is used instead of the ``ur''`` prefix: ``ur`` is a
# SyntaxError on Python 3, while ``r''`` is accepted by Python 2 and 3 alike.
re_stream_id = re.compile(r'stream_id=([a-f0-9]+)')


def web_get_url_for_check(url):
    """Return the ``stream_id`` value found in *url*.

    Args:
        url: URL string that is searched for a ``stream_id=<hex>`` fragment.

    Returns:
        The captured lowercase-hexadecimal identifier, or ``None`` when the
        URL contains no ``stream_id`` parameter (instead of failing with an
        ``AttributeError`` on a missing match).
    """
    match = re_stream_id.search(url)
    if match is None:
        return None
    return match.group(1)


def component_check(comp, serp_type):
    """Tell whether a SERP component should be counted for *serp_type*.

    For ``'video'`` a component counts when its ``type`` is
    ``'SEARCH_RESULT'``. For ``'web'`` it counts when its ``type`` is
    ``'WIZARD'`` and its ``wizard-type`` is ``'WIZARD_TV_ONLINE'``.
    Any other *serp_type* yields ``None`` without inspecting *comp*.
    """
    if serp_type == 'video':
        return comp['type'] == 'SEARCH_RESULT'
    if serp_type != 'web':
        # Unknown SERP kinds fall through without a verdict.
        return None
    if comp['type'] != 'WIZARD':
        return False
    return comp['wizard-type'] == 'WIZARD_TV_ONLINE'


def get_timestamp(sd):
    """Convert the timestamp at the end of a status string to epoch seconds.

    Args:
        sd: String whose last whitespace-separated token looks like
            ``YYYY-MM-DDTHH:MM:SS`` optionally followed by ``.<fraction>``;
            everything after the first ``.`` in that token is dropped.

    Returns:
        Integer number of seconds since the epoch, with the parsed naive
        datetime interpreted in the local timezone.

    Raises:
        ValueError: if the token does not match ``%Y-%m-%dT%H:%M:%S``.
        IndexError: if *sd* is empty or contains only whitespace.
    """
    # Function-scope import keeps this block self-contained.
    import time

    stamp = sd.split()[-1].split('.')[0]
    parsed = datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
    # strftime('%s') is a platform-specific (glibc) extension; mktime gives
    # the same local-time conversion portably.
    return int(time.mktime(parsed.timetuple()))


def wrap_channels(good_eps, channels):
    """Build the list of channel dimension values for a set of programs.

    Args:
        good_eps: Iterable of dicts, each with a ``"channel_name"`` key.
        channels: Mapping from channel name to a metadata dict; the keys
            ``"is_special_project"`` and ``"channel_type"`` are consulted.
            Unknown channels (or ``None`` entries) count as empty metadata.

    Returns:
        The channel names in input order, followed by one aggregate category
        label and the literal ``"_total_"``. The category is chosen by
        priority: special projects first, then channels whose type starts
        with ``"yatv"``, otherwise the regular broadcast category.
    """
    chs = [x["channel_name"] for x in good_eps]
    infos = [channels.get(name) or {} for name in chs]
    if any(info.get("is_special_project") for info in infos):
        ch_type = u"Спецпроекты"
    # ``elif`` is essential: a separate ``if``/``else`` here would always
    # overwrite the special-project category chosen above.
    elif any(
        (info.get("channel_type") or "").startswith(u"yatv") for info in infos
    ):
        ch_type = u"Яндекс-каналы"
    else:
        ch_type = u"Эфирные"
    return chs + [ch_type, "_total_"]


def wrap_query_type(good_eps):
    """Return the distinct ``"query_type"`` values plus ``"_total_"``.

    The distinct values come from a set, so their relative order is not
    specified; ``"_total_"`` is always the last element.
    """
    distinct_types = set()
    for episode in good_eps:
        distinct_types.add(episode["query_type"])
    wrapped = list(distinct_types)
    wrapped.append("_total_")
    return wrapped


def main():
    """Compute top-N hit rates of expected URLs in SERPs and store them.

    Reads three JSON inputs (SERPs, expected programs, raw channel data),
    finds for every SERP the 1-based position of the first component whose
    URL is among the expected ones (``-1`` when none matches), aggregates
    the positions per (query, channel, query_type) combination, optionally
    publishes the aggregates to a Statface report, and writes per-query and
    aggregated records to the two output JSON files.

    Raises:
        Exception: when the timestamp of a SERP cannot be extracted; the
            message contains the offending SERP serialized as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--serps', default='serps.json')
    parser.add_argument('--ptts', default='ptts.json')
    parser.add_argument('--raw_data', default='raw_data.json')
    parser.add_argument('--date')
    parser.add_argument('--report', default="Video/Others/live_online_beta")
    # ``choices`` replaces a bare ``assert``, which is stripped under ``-O``.
    parser.add_argument(
        '--serp_type', default='video', choices=('video', 'web')
    )
    parser.add_argument('--output_queries', default='output_queries.json')
    parser.add_argument('--output_results', default='output_results.json')
    args = parser.parse_args()

    serps = _read_json(args.serps)
    ptts = _read_json(args.ptts)
    raw_data = _read_json(args.raw_data)
    channels = {x["title"]: x for x in raw_data}

    if args.date:
        report_date = datetime.datetime.strptime(args.date, '%Y-%m-%dT%H:%M')
    else:
        report_date = datetime.datetime.now().replace(second=0, microsecond=0)
    serp_type = args.serp_type

    # Index expected programs by query once, so each SERP only scans the
    # programs for its own query instead of the whole list.
    ptts_by_query = defaultdict(list)
    for ep in ptts:
        ptts_by_query[ep["query"]].append(ep)

    # (query, channel, query_type) -> Counter of first-good-URL positions.
    c = defaultdict(Counter)
    records_queries = []
    records_results = []
    for serp in serps:
        try:
            ts = get_timestamp(serp['status']['status-details'])
        except Exception:
            raise Exception(
                "failed when trying to get timestamp: {}".format(
                    json.dumps(serp)
                )
            )
        query = serp[
            'serpRequestExplained'
        ]['per-query-parameters']['query-text']
        good_eps = [
            x for x in ptts_by_query.get(query, ())
            if x['ts_range'][0] <= ts <= x['ts_range'][1]
        ]
        if not good_eps:
            records_queries.append({"query": query, "skipped": True})
            continue
        right_urls = set()
        for ep in good_eps:
            right_urls.update(ep["right_urls"])

        components = serp['serp-page']['parser-results']['components']
        good_url_position = -1  # -1 means "no expected URL on this SERP"
        all_urls = []
        for comp in components:
            if not component_check(comp, serp_type):
                continue
            url = comp['page-url']
            # NOTE(review): proc_url is still called for every counted
            # component, as before, although only video SERPs use its result.
            processed = proc_url(url.split('?')[0])
            if serp_type == 'video':
                url_for_check = processed
            else:
                url_for_check = web_get_url_for_check(url)
            all_urls.append(url)
            if good_url_position == -1 and url_for_check in right_urls:
                # Positions are 1-based among the counted components.
                good_url_position = len(all_urls)
        first_url = all_urls[0] if all_urls else ''

        records_queries.append(dict(
            query=query,
            good_url_position=good_url_position,
            ts=ts,
            right_programs=good_eps,
            right_urls=list(right_urls),
            first_url=first_url,
            all_urls=all_urls,
        ))
        if right_urls:
            for key in itertools.product(
                [query, '_total_'],
                wrap_channels(good_eps, channels),
                wrap_query_type(good_eps)
            ):
                c[key][good_url_position] += 1

    fielddate = report_date.strftime('%Y-%m-%d %H:%M:00')
    for (key_query, key_channel, key_query_type), positions in c.items():
        total = sum(positions.values())
        result = {
            'query': key_query,
            'channel': key_channel,
            'query_type': key_query_type,
            'fielddate': fielddate,
        }
        for threshold in (1, 3, 5, 10):
            # Position -1 (no hit) is excluded by the lower bound.
            hits = sum(
                n for pos, n in positions.items() if 0 <= pos <= threshold
            )
            result['top{}'.format(threshold)] = hits / total
        result['count'] = total
        records_results.append(result)

    if args.report and args.report.lower() != "none":
        # Imported lazily so the script can run without the reporting
        # library when publishing is disabled.
        from nile.api.v1 import statface as ns
        client = ns.StatfaceClient(
            proxy='upload.stat.yandex-team.ru',
            username=os.environ['STAT_LOGIN'],
            password=os.environ['STAT_TOKEN']
        )

        ns.StatfaceReport().path(
            args.report
        ).scale('minutely').replace_mask(
            'fielddate'
        ).client(
            client
        ).data(
            records_results
        ).publish()

    _write_json(records_queries, args.output_queries)
    _write_json(records_results, args.output_results)


def _read_json(path):
    """Load the JSON document stored at *path*, closing the file afterwards."""
    with open(path) as stream:
        return json.load(stream)


def _write_json(records, path):
    """Write *records* to *path* as indented, key-sorted UTF-8 JSON."""
    with codecs.open(path, 'w', 'utf8') as stream:
        json.dump(
            records, stream,
            indent=2, ensure_ascii=False, sort_keys=True
        )


# Run the computation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
