#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import random
import argparse
from collections import defaultdict, Counter
import math
import json
import re
import fnmatch

from nile.api.v1 import (
    statface as ns,
    clusters,
    Record
)
import getpass
import datetime
from decimal import Decimal, getcontext
from urlparse import urlparse
from pytils import get_host, bad_prefices
import csv
import StringIO
from searchable_players import count_searchable_players


def load_names(filename, dct, value, cats2n):
    """Register every page URL listed in a basket JSON file.

    The file must contain a JSON array of objects, each with a ``page_url``
    key and, optionally, a ``cat2n`` key.

    Args:
        filename: Path to the basket file. Anything that is not an existing
            regular file (``None`` included) makes the call a silent no-op.
        dct: Mapping from page URL to a set-like container (for example a
            ``defaultdict(set)``); ``value`` is added under each page URL.
        value: Label added to ``dct[page_url]``.
        cats2n: Mapping updated in place with ``page_url -> cat2n``; objects
            without a ``cat2n`` key are stored as 1.

    Returns:
        None. Both mappings are mutated in place.
    """
    if not os.path.isfile(str(filename)):
        return
    # Close the handle deterministically instead of relying on the GC.
    with open(filename) as f:
        objects = json.load(f)
    for obj in objects:
        page_url = obj['page_url']
        dct[page_url].add(value)
        cats2n[page_url] = obj.get('cat2n', 1)


def process_host(x):
    """Strip every leading occurrence of the known bad prefixes from ``x``.

    ``bad_prefices`` comes from ``pytils``; it is passed straight to
    ``str.startswith``, so it is presumably a tuple of prefix strings
    (verify against ``pytils``). Prefixes are removed repeatedly until the
    string no longer starts with any of them.

    Args:
        x: Host string to normalise.

    Returns:
        ``x`` without any leading bad prefixes.
    """
    stripped = x
    while True:
        if not stripped.startswith(bad_prefices):
            return stripped
        # One pass may peel off several prefixes, in tuple order.
        for prefix in bad_prefices:
            if stripped.startswith(prefix):
                stripped = stripped[len(prefix):]


def fnmatch_wrapper(pattern, dict_):
    """Return the first key of ``dict_`` that matches ``pattern``.

    A key matches when it equals ``pattern`` literally, or when either
    string matches the other as a shell-style wildcard (``fnmatch``), so
    wildcards may live on either side.

    Args:
        pattern: Host (or wildcard) to look up.
        dict_: Iterable of candidate keys, normally a dict.

    Returns:
        The first matching key in iteration order, or ``None`` if nothing
        matches.
    """
    def _matches(candidate):
        if candidate == pattern:
            return True
        if fnmatch.fnmatch(candidate, pattern):
            return True
        return fnmatch.fnmatch(pattern, candidate)

    return next((key for key in dict_ if _matches(key)), None)


def parse_rkn(rkn_file):
    """Extract banned hosts and banned URLs from an RKN dump file.

    The file is a JSON document whose top-level value is one string holding
    ';'-separated CSV text. Rows with fewer than three columns are ignored.
    For the remaining rows, column 1 is used as the host and column 2 as a
    ' | '-separated list of URLs (column meaning inferred from how the
    values are used here; confirm against the dump format).

    * Rows with a host and no URLs ban the whole host; wildcard hosts
      (containing ``*``) are dropped.
    * Rows with a host and URLs contribute every listed URL.

    Args:
        rkn_file: Path to the JSON file.

    Returns:
        Tuple ``(banned_hosts, banned_urls)`` of two sets of unicode strings.
    """
    # Close the handle deterministically instead of relying on the GC.
    with open(rkn_file) as f:
        obj = json.load(f)

    # The Python 2 csv module works on byte strings: encode the text before
    # parsing and decode each cell afterwards.
    reader = csv.reader(
        StringIO.StringIO(obj.encode('utf8')),
        delimiter=';'
    )

    banned_hosts = set()
    banned_urls = set()
    for row in reader:
        if len(row) < 3:
            continue
        host = row[1].decode('utf8')
        urls = row[2].decode('utf8')
        if not host:
            continue
        if urls:
            banned_urls |= set(urls.split(' | '))
        elif '*' not in host:
            banned_hosts.add(host)
    return banned_hosts, banned_urls


def _load_json(path):
    """Read and parse the JSON file at ``path``, closing it afterwards."""
    with open(path) as f:
        return json.load(f)


def _publish_statface_report(client, path, data, replace_mask):
    """Publish ``data`` to the daily Statface report located at ``path``."""
    report = ns.StatfaceReport().path(path).scale('daily')
    if replace_mask:
        report = report.replace_mask('fielddate')
    report.client(client).data(data).publish()
    print('Pushed to {}'.format(path))


def main():
    """Aggregate per-URL result statuses and publish them.

    Steps, in order:

    1. Parse CLI arguments; the literal values ``none``/``null`` for
       ``--report_sbr``, ``--report_sp`` and ``--yt_suffix`` disable the
       corresponding output.
    2. Load forced-status regexps, autoplayer statuses, host categories
       (from YT), the basket and host mirrors.
    3. Walk the tab-separated results file, keeping only URLs present in the
       basket, and count statuses per (basket, autoplayer status) and host
       category.
    4. Unless ``--basket_type`` is ``kpi``: write the annotated rows to
       ``--output`` and, when a YT suffix is set, to YT tables.
    5. Dump the aggregated records to ``--stat_reports`` and publish them to
       Statface.

    Reads the ``YT_TOKEN`` environment variable, plus ``STAT_LOGIN`` and
    ``STAT_PASSWORD`` when a Statface report is enabled.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results', '-r', required=True)
    parser.add_argument('--date', '-d')
    parser.add_argument('--output', default='output.tsv')
    parser.add_argument('--output_desktop', default='output_desktop.tsv')
    parser.add_argument('--basket', '-b', default=None)
    parser.add_argument('--basket_name', '-bn', default=None)
    parser.add_argument('--basket_type', '-bt', default='other')
    parser.add_argument('--urls_type', '-ut', default='desktop')
    parser.add_argument('--kpi_desktop', '-kd', default=None)
    parser.add_argument('--kpi_touch', '-kt', default=None)
    parser.add_argument('--rkn')
    parser.add_argument('--stat_reports')
    parser.add_argument('--add_hosts')
    parser.add_argument('--nostat', action='store_true')
    parser.add_argument('--replace_mask', action='store_true')
    parser.add_argument('--autoplayers_status')
    parser.add_argument('--report_sbr', default='Video/Others/SBR')
    parser.add_argument('--yt_suffix', default='sbr')
    parser.add_argument(
        '--report_sp', default='Video/Others/searchable_players'
    )
    parser.add_argument('--regexps', default=None)
    args = parser.parse_args()

    # Allow switching an output off from the command line.
    for el in ['report_sbr', 'report_sp', 'yt_suffix']:
        if getattr(args, el).lower() in {'none', 'null'}:
            setattr(args, el, None)

    cl = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])

    # --regexps is optional: without it no status is forced. (Previously the
    # name was left unbound in that case and the next line crashed.)
    regexps = _load_json(args.regexps) if args.regexps else []
    # Only rules carrying both a pattern and a status to force are usable.
    regexps = [
        r for r in regexps if 'force_status' in r and 'regexp' in r
    ]

    # platform -> normalised host -> autoplayer status
    autoplayers_status = defaultdict(dict)
    for obj in _load_json(args.autoplayers_status):
        autoplayers_status[
            obj['Platform'].lower()
        ][process_host((obj['Host'] or ''))] = obj['Status']

    # normalised host (possibly a wildcard) -> host category
    host_dict = {process_host(x.host): 'film/serial' for x in cl.read(
        '//home/videolog/hosts_tvt/fs_by_clicks'
    )}

    host_dict.update({process_host(x.host): 'porn' for x in cl.read(
        '//home/videolog/hosts_tvt/porn_by_clicks'
    )})

    basket_records = _load_json(args.basket)

    page_url_to_frame_url = {
        x['page_url']: x.get('frame_url', x['page_url'])
        for x in basket_records
    }

    # Propagate the category of a known host to all of its mirrors.
    with codecs.open(args.add_hosts, 'r', 'utf8') as f:
        for line in f:
            tabs = line.strip().split('\t')
            if len(tabs) != 2:
                continue
            main_host, mirrors = tabs
            mirrors = [process_host(x) for x in mirrors.split(',')]
            main_host = process_host(main_host)
            all_hosts = [main_host] + mirrors

            # First host of the group that already has a category.
            existing_key = None
            for host in all_hosts:
                existing_key = fnmatch_wrapper(host, host_dict)
                if existing_key:
                    break
            if not existing_key:
                continue

            category = host_dict[existing_key]
            for host in all_hosts:
                if host == existing_key:
                    continue
                host_dict[host] = category
                print('added {} as mirror for {}'.format(
                    host, existing_key
                ))

    # Fall back to today when --date is missing or malformed.
    try:
        datetime.datetime.strptime(args.date, '%Y-%m-%d')
    except (TypeError, ValueError):
        args.date = datetime.date.today().strftime('%Y-%m-%d')

    print('Current datetime is {}'.format(datetime.datetime.now()))
    print('Date is {}'.format(args.date))

    basket_names = defaultdict(set)
    cats2n = {}
    load_names(args.basket, basket_names, args.basket_name, cats2n)

    # (basket, autoplayer status) -> host category -> status -> count / share
    c = defaultdict(lambda: defaultdict(Counter))
    s = defaultdict(lambda: defaultdict(Counter))

    banned_hosts, banned_urls = parse_rkn(args.rkn)
    # '<status>_<urls_type>' -> cat2n bucket -> set of page URLs
    sr = defaultdict(lambda: defaultdict(set))

    headers = (
        'component_page_url',
        'query_device',
        'query_country',
        'normalizations__robot_coverage_video',
        'label'
    )

    output_table_data = []
    detailed_output = args.basket_type != 'kpi'

    with codecs.open(args.results, 'r', 'utf8') as f, \
            codecs.open(args.output, 'w', 'utf8') as f1:
        for line in f:
            tabs = line.strip().split('\t')
            page_url = tabs[0]
            if page_url not in basket_names:
                continue

            host = get_host(page_url)
            cano_url = tabs[3]
            cano_host = get_host(tabs[3])

            host_cat = fnmatch_wrapper(host, host_dict)
            host_list = [
                host_dict[host_cat] if host_cat else 'other',
                '_total_'
            ]
            frame_url = page_url_to_frame_url.get(page_url, '')

            banned = (
                cano_host in banned_hosts or cano_url in banned_urls
            )

            # The last matching rule wins.
            for r in regexps:
                if (
                    re.search(r['regexp'], page_url) or
                    re.search(r['regexp'], cano_url)
                ):
                    tabs[-1] = r['force_status']

            if banned:
                # The detailed output keeps the original status as a prefix.
                tabs[-1] += '/RKN_BANNED'

            # The device marker is looked up in the raw line.
            is_desktop = 'DESKTOP' in line
            platform = 'desktop' if is_desktop else 'touch'

            autoplayers = ['other', '_total_']
            if cano_host in autoplayers_status[platform]:
                autoplayers[0] = autoplayers_status[platform][cano_host]

            if detailed_output:
                f1.write(u'{}\t{}\t{}\t{}\n'.format(
                    '\t'.join(tabs), frame_url, host_list[0], autoplayers[0]
                ))
                record = dict(zip(headers, tabs))
                record['frame_url'] = frame_url
                record['host_cat'] = host_list[0]
                record['autoplayers_status'] = autoplayers[0]
                output_table_data.append(record)

                if tabs[-1].startswith('SR_'):
                    sr_id = '{}_{}'.format(tabs[-1], args.urls_type)
                    sr[sr_id][cats2n[page_url]].add(page_url)

            if banned:
                # The aggregates collapse every banned row into one status.
                tabs[-1] = 'RKN_BANNED'

            for cat in host_list:
                for basket_name in basket_names[page_url]:
                    for ap_status in autoplayers:
                        # Device-specific baskets only count their own rows.
                        if 'touch' in basket_name and is_desktop:
                            continue
                        elif 'desktop' in basket_name and not is_desktop:
                            continue
                        c[(basket_name, ap_status)][cat][tabs[-1]] += 1

    yt = cl.driver.client
    write_to_yt = bool(args.yt_suffix) and detailed_output
    output_dir = '//home/videolog/sbr/{}/output/{}'.format(
        args.yt_suffix, args.date
    )

    if write_to_yt:
        if not yt.exists(output_dir):
            yt.mkdir(output_dir, recursive=True)
        yt.write_table(
            table='{}/output'.format(output_dir),
            input_stream=output_table_data
        )

    for status in sr:
        result = [
            {
                # Drop the trailing '_<urls_type>' appended above.
                'status': '_'.join(status.split('_')[:-1]),
                'count': len(sr[status][bucket]),
                'bucket': bucket,
                'urls': list(sr[status][bucket])
            }
            for bucket in sr[status]
        ]
        if write_to_yt:
            table_name = '{}/{}'.format(output_dir, status)
            yt.write_table(
                table_name,
                result
            )
            yt.run_sort(
                source_table=table_name,
                destination_table=table_name,
                sort_by=['bucket']
            )

    for basket in c:
        for cat in c[basket]:
            # True division: the file enables ``from __future__ import
            # division``. The total is never zero because every counter
            # here holds at least one counted status.
            total = sum(c[basket][cat].values())
            for status in c[basket][cat]:
                print('{}\t{}\t{}\t{}'.format(
                    basket, cat, status, c[basket][cat][status]
                ))
                s[basket][cat][status] = round(
                    c[basket][cat][status] / total, 4
                )

    recs = []

    ps = []

    for b in c:
        basket_name, ap_status = b[0], b[1]
        for cat in c[b]:
            ps_value = count_searchable_players(c[b][cat])
            ps.append(dict(
                fielddate=args.date,
                host_cat=cat,
                basket=basket_name,
                autoplayers_status=ap_status,
                value=ps_value,
            ))
            for d in c[b][cat]:
                recs.append(dict(
                    fielddate=args.date,
                    host_cat=cat,
                    status=d,
                    basket=basket_name,
                    autoplayers_status=ap_status,
                    value=c[b][cat][d],
                    share=s[b][cat][d]
                ))
                print(
                    'Basket: {}, cat: {}, status: {}, value: {}'.format(
                        b, cat, d, c[b][cat][d]
                    )
                )

    # Close (and thereby flush) the report file explicitly.
    with open(args.stat_reports, 'w') as stat_file:
        json.dump(
            {'SBR': recs, "searchable_players": ps},
            stat_file, indent=2,
            sort_keys=True
        )

    if args.report_sbr or args.report_sp:
        client = ns.StatfaceClient(
            proxy='upload.stat.yandex-team.ru',
            username=os.environ['STAT_LOGIN'],
            password=os.environ['STAT_PASSWORD']
        )

    if args.report_sbr:
        _publish_statface_report(
            client, args.report_sbr, recs, args.replace_mask
        )

    if args.report_sp:
        _publish_statface_report(
            client, args.report_sp, ps, args.replace_mask
        )


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
