#!/usr/bin/env python
# -*- coding: utf-8 -*-
# from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import random
import argparse
from collections import defaultdict, Counter
import math
import json

from nile.api.v1 import (
    statface as ns,
    Record,
    clusters
)
import getpass
import datetime
import copy
from decimal import Decimal, getcontext
from pytils import get_host


def load_names(filename, dct, value):
    """Map every stripped line of a UTF-8 text file to ``value`` in ``dct``.

    Args:
        filename: path of the file to read; if it is not an existing
            regular file the function does nothing.
        dct: mapping updated in place (one key per line of the file).
        value: value stored under each key.

    Returns:
        None.
    """
    if os.path.isfile(filename):
        with codecs.open(filename, 'r', 'utf8') as source:
            for raw_line in source:
                key = raw_line.strip()
                dct[key] = value


def good_dump(obj, filename):
    """Write ``obj`` to ``filename`` as indented, non-ASCII-escaped JSON.

    The file is opened as UTF-8 and is closed (and therefore flushed)
    before the function returns, instead of relying on garbage collection
    of an anonymous file handle.

    Args:
        obj: JSON-serializable object.
        filename: destination path; an existing file is overwritten.
    """
    with codecs.open(filename, 'w', 'utf8') as sink:
        json.dump(obj, sink, indent=2, ensure_ascii=False)


def mean(l_):
    """Return the arithmetic mean of the sized collection ``l_``.

    Raises:
        ZeroDivisionError: if ``l_`` is empty.
    """
    total = sum(l_)
    count = len(l_)
    return total / count


def get_aggregated(recs, system):
    """Build the replicated record list used for the aggregated metric.

    Records of ``system`` are selected per query and concatenated in a
    fixed order, each query's records being repeated by its multiplier
    (4, 3, 2, then 1 for the remaining queries).

    Args:
        recs: iterable of dicts with ``'system'`` and ``'query'`` keys.
        system: system name to select.

    Returns:
        A new list of the selected records (same dict objects, repeated).
    """
    def select(query):
        return [
            rec for rec in recs
            if rec['system'] == system and rec['query'] == query
        ]

    # (multiplier, query) pairs, in output order.
    plan = (
        (4, u'порно'),
        (3, u'секс'),
        (2, u'porno'),
        (1, u'sex'),
        (1, u'анал'),
        (1, u'минет'),
        (1, u'домашнее порно'),
        (1, u'хентай'),
    )
    combined = []
    for multiplier, query in plan:
        combined.extend(multiplier * select(query))
    return combined


def count_url_scores(data, query):
    """Sum per-URL scores for one query, skipping ambiguous comparisons.

    A comparison row is dropped when either of its URLs was seen on both
    the yandex and the pornhub side for this query.

    Args:
        data: iterable of dicts with ``query``, ``yandex_url``,
            ``pornhub_url``, ``yandex_score`` and ``pornhub_score`` keys.
        query: query whose rows are considered.

    Returns:
        Tuple ``(url_to_system, url_scores)``: a dict mapping every URL seen
        for the query to one system name, and a ``Counter`` of summed float
        scores for the URLs of the rows that were kept.
    """
    rows = [row for row in data if row['query'] == query]

    systems_by_url = defaultdict(set)
    for row in rows:
        systems_by_url[row['yandex_url']].add('yandex')
        systems_by_url[row['pornhub_url']].add('pornhub')

    kept_rows = []
    for row in rows:
        yandex_unique = len(systems_by_url[row['yandex_url']]) == 1
        if yandex_unique and len(systems_by_url[row['pornhub_url']]) == 1:
            kept_rows.append(row)

    # Collapse each set to a single system name (arbitrary pick when a URL
    # was seen on both sides; such URLs never reach the score counter).
    owner_by_url = {}
    for url, systems in systems_by_url.items():
        owner_by_url[url] = systems.pop()

    totals = Counter()
    for row in kept_rows:
        totals[row['yandex_url']] += float(row['yandex_score'])
        totals[row['pornhub_url']] += float(row['pornhub_score'])
    return (owner_by_url, totals)


def count_top5(url_scores, system):
    """Return the share of the five highest-scored URLs owned by ``system``.

    Args:
        url_scores: ``(url_to_system, url_scores)`` pair as returned by
            ``count_url_scores``.
        system: system name to count.

    Returns:
        Number of matching URLs among ``most_common(5)`` divided by 5
        (the divisor is 5 even when fewer URLs exist).
    """
    owner_by_url, totals = url_scores
    hits = 0
    for url, _score in totals.most_common(5):
        if owner_by_url[url] == system:
            hits += 1
    return hits / 5


def count_weights(url_scores, system):
    """Return the fraction of the total URL score that belongs to ``system``.

    Args:
        url_scores: ``(url_to_system, url_scores)`` pair as returned by
            ``count_url_scores``.
        system: system name whose URLs are summed.

    Raises:
        ZeroDivisionError: if the scores sum to zero (including when there
            are no scores at all).
    """
    owner_by_url, totals = url_scores
    own_score = 0
    for url in totals:
        if owner_by_url[url] == system:
            own_score += totals[url]
    return own_score / sum(totals.values())


def process_serps(serps, url_to_position):
    """Collect the URL set of every SERP and record 1-based URL positions.

    Args:
        serps: iterable of dicts with ``'query'`` and ``'components'`` keys;
            each component is a dict with a ``'url'`` key.
        url_to_position: nested mapping updated in place as
            ``url_to_position[query][url] = position``; indexing it by a
            query must yield a mutable mapping.

    Returns:
        Dict mapping each query to the set of its component URLs.
    """
    urls_by_query = {}
    for serp in serps:
        query = serp['query']
        urls = set()
        urls_by_query[query] = urls
        for position, component in enumerate(serp['components'], 1):
            url = component['url']
            urls.add(url)
            url_to_position[query][url] = position
    return urls_by_query


def process_serps_pornhub(serps, url_to_position):
    """Collect URL sets and 1-based positions for pornhub SERPs.

    A SERP given as a bare list is treated as having an empty query. For a
    SERP whose query is falsy, each URL position is recorded under four
    fixed queries instead of under the SERP's own query.

    Args:
        serps: iterable of SERP dicts (``'query'``, ``'components'``) or
            bare component lists.
        url_to_position: nested mapping updated in place as
            ``url_to_position[query][url] = position``.

    Returns:
        Dict mapping each SERP query (``""`` for bare lists) to the set of
        its component URLs.
    """
    fallback_queries = [u'секс', u'sex', u'порно', u'porno']
    urls_by_query = {}
    for serp in serps:
        if isinstance(serp, list):
            serp = {'query': "", 'components': serp}
        query = serp['query']
        urls = set()
        urls_by_query[query] = urls
        target_queries = [query] if query else fallback_queries
        for position, component in enumerate(serp['components'], 1):
            url = component['url']
            urls.add(url)
            for target in target_queries:
                url_to_position[target][url] = position
    return urls_by_query


def safediv(x, y):
    """Return ``x / y``, or ``0`` when the division raises ZeroDivisionError."""
    try:
        quotient = x / y
    except ZeroDivisionError:
        quotient = 0
    return quotient


def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    The value is computed in a numerically stable way: ``math.exp`` is only
    ever called with a non-positive argument, so large-magnitude negative
    inputs yield ``0.0`` instead of raising ``OverflowError``.

    Args:
        x: real number.

    Returns:
        Float in the closed interval [0, 1].
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) can overflow; use the equivalent form
    # exp(x) / (1 + exp(x)), where exp(x) <= 1.
    z = math.exp(x)
    return z / (1 + z)


def weighted_average(lst, weights):
    """Return the weighted mean of ``lst`` using ``weights``.

    Values and weights are paired positionally (extra items of the longer
    sequence are ignored in the numerator); the divisor is the sum of all
    weights.

    Raises:
        ZeroDivisionError: if the weights sum to zero.
    """
    numerator = 0
    for value, weight in zip(lst, weights):
        numerator += value * weight
    return numerator / sum(weights)


# Per-query weights used by count_data() when it computes the aggregated
# weighted 'mean' across queries. Every query that reaches that computation
# must have an entry here, otherwise the lookup raises KeyError.
# NOTE(review): the numbers look like query frequencies — confirm the source.
query_weights = {
    u'порно': 236096,
    u'секс': 74506,
    u'анал': 14188,
    u'porno': 7142,
    u'хентай': 10107,
    u'sex': 5550,
    u'домашнее порно': 16687,
    u'минет': 9535,
}


def process_data(data, url_to_score):
    """Flatten raw comparison records and optionally rescore them.

    Each input record is deep-copied, its ``inputValues`` and
    ``outputValues`` sub-dicts are merged into the top level, and
    ``yandex_host`` is derived from ``yandex_url``. When ``url_to_score`` is
    non-empty, the original scores are preserved under ``*_score_before_bt``
    and replaced by a sigmoid of the score difference.

    Args:
        data: iterable of dicts, each with ``inputValues`` and
            ``outputValues`` dict entries.
        url_to_score: mapping keyed by ``(url, query)``; may be empty, in
            which case scores are left untouched.

    Returns:
        List of new, flattened dicts (the input is not modified).
    """
    flattened = []
    for raw in data:
        item = copy.deepcopy(raw)
        for nested_key in ('inputValues', 'outputValues'):
            item.update(item.pop(nested_key))
        item['yandex_host'] = get_host(item['yandex_url'])
        if url_to_score:
            yandex_strength = url_to_score[(item['yandex_url'], item['query'])]
            pornhub_strength = url_to_score[
                (item['pornhub_url'], item['query'])
            ]
            # Keep the original judgements before overwriting them.
            item['yandex_score_before_bt'] = item['yandex_score']
            item['pornhub_score_before_bt'] = item['pornhub_score']
            yandex_share = sigmoid(yandex_strength - pornhub_strength)
            item['yandex_score'] = yandex_share
            item['pornhub_score'] = 1 - yandex_share
        flattened.append(item)
    return flattened


def count_data(
    data_processed, args, url_to_score, type_='default', cluster=None
):
    """Compute per-query and aggregated yandex-vs-pornhub statistics.

    Side effects:
      * every record of ``data_processed`` is mutated in place: a
        ``winner_<type_>`` key is set to ``'pornhub'``, ``'yandex'`` or
        ``'equal'``;
      * for ``type_ == 'default'`` one row per record is appended to a YT
        table through ``cluster.write``.

    Args:
        data_processed: list of dicts; read here (directly or through
            ``count_url_scores``) for ``query``, ``yandex_url``,
            ``pornhub_url``, ``probability`` and the ``<system>_score`` /
            ``<system>_score_before_bt`` keys.
        args: object whose ``date`` attribute is copied into ``fielddate``.
        url_to_score: not used by this function.
        type_: ``'default'`` compares the ``*_score`` keys; any other value
            compares the ``*_score_before_bt`` keys.
        cluster: object providing ``write(path, records, append=True)``.
            NOTE(review): with the default ``None`` the call fails as soon
            as there is at least one record and ``type_ == 'default'``.

    Returns:
        Tuple ``(recs, output_p, output_y)``: the list of statistic dicts
        (one per query/system plus one ``'aggregated'`` per system), and
        deep copies of the records won by pornhub and by yandex.

    Raises:
        KeyError: if one of the four aggregated queries or one of the two
            systems has no data, or if a query is missing from
            ``query_weights``.
    """
    # results[query][system] -> list of float scores.
    results = defaultdict(lambda: defaultdict(list))

    output_p = []
    output_y = []

    winner = 'winner_{}'.format(type_)
    # Running number of records seen per query; used in the YT row value.
    query_c = Counter()
    # NOTE(review): '%s' is not among the strftime directives documented by
    # Python; it relies on the platform C library — confirm it is intended.
    tsnow = datetime.datetime.now().strftime('%s')
    recs_to_put = []
    for e in data_processed:
        score = 'score' if type_ == 'default' else 'score_before_bt'
        # Temporary marker: present in the deep copies below, removed from
        # the original record afterwards.
        e['type'] = type_
        query_c[e['query']] += 1
        for k in {x for x in e if x.endswith(score)}:
            system = k.split('_score')[0]
            value = float(e[k])
            results[e['query']][system].append(value)
        # NOTE(review): the scores are compared as stored, without the
        # float() conversion used above — verify they are never strings.
        if e['pornhub_{}'.format(score)] > e['yandex_{}'.format(score)]:
            # The copy is taken before the winner key is set.
            output_p.append(copy.deepcopy(e))
            e['winner_{}'.format(type_)] = 'pornhub'
        elif e['yandex_{}'.format(score)] > e['pornhub_{}'.format(score)]:
            output_y.append(copy.deepcopy(e))
            e[winner] = 'yandex'
        else:
            e[winner] = 'equal'
        e.pop('type')
        # Only the 'default' calculation produces YT rows.
        if type_ != 'default':
            continue
        # 'Minus' covers both a pornhub win and a tie.
        sbspm = 'Plus' if e[winner] == 'yandex' else 'Minus'
        recs_to_put.append(
            {
                "key": u"{}\t{}".format(e['query'], e['yandex_url']),
                "subkey": tsnow,
                "value": u"SbS{}@ru={}".format(sbspm, query_c[e['query']])
            }
        )

    if recs_to_put:
        cluster.write(
            '//home/videoquality/vuserdata2/external_data/external_factors/frequent_video_relev_toloka',
            [Record(**x) for x in recs_to_put],
            append=True
        )

    recs = []
    # top5[query][system] and weights[query][system] are reused by the
    # aggregated section below.
    top5 = defaultdict(dict)
    weights = defaultdict(dict)

    for query in results:
        url_scores = count_url_scores(
            data_processed, query
        )
        for system in results[query]:
            top5[query][system] = count_top5(url_scores, system)
            weights[query][system] = count_weights(url_scores, system)
            recs.append(
                {
                    'fielddate': args.date,
                    'system': system,
                    'query': query,
                    'calc_type': type_,
                    'mean': safediv(sum(results[query][system]),
                                    len(results[query][system])),
                    # Share of this query's records won by the system.
                    'victory_share': safediv(len(
                        [
                            x for x in data_processed
                            if x[winner] == system and
                            x['query'] == query
                        ]
                    ), len(
                        [x for x in data_processed if x['query'] == query]
                    )),
                    # Same share restricted to records with
                    # probability >= 0.7.
                    'victory_share_high_prob': safediv(len(
                        [
                            x for x in data_processed
                            if x[winner] == system and
                            x['query'] == query and
                            x['probability'] >= 0.7
                        ]
                    ), len(
                        [
                            x for x in data_processed
                            if x['query'] == query and
                            x['probability'] >= 0.7
                        ]
                    )),
                    # Same share restricted to records with
                    # probability < 0.7.
                    'victory_share_low_prob': safediv(len(
                        [
                            x for x in data_processed
                            if x[winner] == system and
                            x['query'] == query and
                            x['probability'] < 0.7
                        ]
                    ), len(
                        [
                            x for x in data_processed
                            if x['query'] == query and
                            x['probability'] < 0.7
                        ]
                    )),
                    'top5_virtual_serp': top5[query][system],
                    'weight_in_virtual_serp': weights[query][system]
                }
            )

    # Aggregated sample: records of four queries replicated 4/3/2/1 times so
    # that plain counting over the list acts as a weighted count.
    aggregated_data = 4 * [
        x for x in data_processed if x['query'] == u'порно'
    ] + 3 * [
        x for x in data_processed if x['query'] == u'секс'
    ] + 2 * [
        x for x in data_processed if x['query'] == u'porno'
    ] + [
        x for x in data_processed if x['query'] == u'sex'
    ]

    for system in ['yandex', 'pornhub']:
        # Same 4/3/2/1 weighting applied to the per-query top5 and weight
        # values; a missing query or system raises KeyError here.
        aggregated_top5 = mean(
            4 * [top5[u'порно'][system]] +
            3 * [top5[u'секс'][system]] +
            2 * [top5[u'porno'][system]] +
            [top5[u'sex'][system]]
        )
        aggregated_weights = mean(
            4 * [weights[u'порно'][system]] +
            3 * [weights[u'секс'][system]] +
            2 * [weights[u'porno'][system]] +
            [weights[u'sex'][system]]
        )
        recs.append(
            {
                'fielddate': args.date,
                'system': system,
                'calc_type': type_,
                'query': 'aggregated',
                # 'mean': safediv(sum(
                #     x['mean'] for x in get_aggregated(recs, system)
                # ), len(get_aggregated(recs, system))),
                # Unlike the other aggregated fields, the mean is weighted
                # by query_weights over every per-query record of the
                # system, not by the 4/3/2/1 scheme.
                'mean': weighted_average(
                    [x['mean'] for x in recs if x['system'] == system],
                    [
                        query_weights[x['query']] for x in recs
                        if x['system'] == system
                    ],
                ),
                'victory_share': safediv(len(
                    [
                        x for x in aggregated_data if x[winner] == system
                    ]
                ), len(aggregated_data)),
                'victory_share_high_prob': safediv(len(
                    [
                        x for x in aggregated_data if x[winner] == system and
                        x['probability'] >= 0.7
                    ]
                ), len(
                    [x for x in aggregated_data if x['probability'] >= 0.7]
                )),
                'victory_share_low_prob': safediv(len(
                    [
                        x for x in aggregated_data if x[winner] == system and
                        x['probability'] < 0.7
                    ]
                ), len(
                    [x for x in aggregated_data if x['probability'] < 0.7]
                )),
                'top5_virtual_serp': aggregated_top5,
                'weight_in_virtual_serp': aggregated_weights
            }
        )
    return (recs, output_p, output_y)


def _load_json(path):
    """Return the parsed JSON content of ``path``, closing the file."""
    with open(path) as source:
        return json.load(source)


def _dump_json(obj, path):
    """Write ``obj`` to ``path`` as indented, key-sorted JSON and close it."""
    with open(path, 'w') as sink:
        json.dump(obj, sink, indent=2, sort_keys=True)


def _is_report_enabled(report):
    """Tell whether ``report`` is a real report path (not empty/none/null)."""
    return bool(report) and report.lower() not in {'none', 'null'}


def _publish_to_stat(report, records, args):
    """Publish ``records`` to the daily Statface report at path ``report``."""
    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=args.stat_login,
        password=args.stat_password
    )
    ns.StatfaceReport().path(
        report
    ).scale('daily').client(
        client
    ).data(
        records
    ).publish()


def main():
    """Command-line entry point.

    Reads the comparison data (and optional per-URL scores), computes the
    statistics for every requested calculation type, writes the JSON output
    files, uploads the processed data and per-host win rates to YT, and
    publishes both reports to Statface unless they are disabled with
    ``none``/``null``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True)
    parser.add_argument('--input_scores', '-is', default=None)
    parser.add_argument('--date', '-d')
    parser.add_argument('--token')
    parser.add_argument('--report', default='Video/Others/yandex_vs_pornhub')
    parser.add_argument(
        '--report_winrates', default='Video/Others/yandex_vs_pornhub_winrates'
    )
    parser.add_argument('--output_stat', default='output_stat.json')
    parser.add_argument('--output_all_data', default='output_all_data.json')
    parser.add_argument('--output_y', '-y', default='output_y.json')
    parser.add_argument('--output_p', '-p', default='output_p.json')
    parser.add_argument('--types', '-t', default='default,no_bt')
    parser.add_argument('--stat_login', '-sl', default='robot_pecheny')
    parser.add_argument('--stat_password', '-sp')
    args = parser.parse_args()

    # A missing or malformed --date silently falls back to today.
    try:
        datetime.datetime.strptime(args.date, '%Y-%m-%d')
    except (TypeError, ValueError):
        args.date = datetime.date.today().strftime('%Y-%m-%d')

    print('Current datetime is {}'.format(datetime.datetime.now()))
    print('Date is {}'.format(args.date))

    url_to_score = {}
    if args.input_scores:
        for obj in _load_json(args.input_scores):
            url_to_score[(obj['url'], obj['query'])] = obj['score']
    data = _load_json(args.input)
    data_processed = process_data(data, url_to_score)

    recs = []
    output_p = []
    output_y = []
    # winrates[calc_type][yandex_host] -> Counter of winner labels.
    winrates = defaultdict(lambda: defaultdict(Counter))
    cluster = clusters.yt.Hahn(token=args.token)
    for type_ in args.types.split(','):
        r, p, y = count_data(
            data_processed, args, url_to_score, type_=type_, cluster=cluster
        )
        recs.extend(r)
        output_p.extend(p)
        output_y.extend(y)
        # count_data has just stored the winner of this type on each record.
        for x in data_processed:
            winrates[type_][x['yandex_host']][
                x['winner_{}'.format(type_)]
            ] += 1

    winrates_recs = []
    for type_, hosts in winrates.items():
        for host, counts in hosts.items():
            total = sum(counts.values())
            # Hosts with fewer than five comparisons are not reported.
            if total < 5:
                continue
            winrate = counts['yandex'] / total
            dct = dict(counts)
            dct['host'] = host
            dct['calc_type'] = type_
            dct['fielddate'] = args.date
            dct['winrate'] = round(winrate, 3)
            winrates_recs.append(dct)

    _dump_json(recs, args.output_stat)
    _dump_json(output_p, args.output_p)
    _dump_json(output_y, args.output_y)
    # NOTE(review): this writes output_y again, so the "all data" file is a
    # duplicate of --output_y; data_processed may have been intended. Left
    # unchanged because consumers of the file are not visible here.
    _dump_json(output_y, args.output_all_data)

    table_root = '//home/videolog/pornhub_sbs/{}/{}'.format(
        args.report.split('/')[-1],
        args.date
    )
    cluster.driver.client.write_table(
        '{}/data'.format(table_root),
        data_processed
    )
    cluster.driver.client.write_table(
        '{}/winrates'.format(table_root),
        winrates_recs
    )

    if _is_report_enabled(args.report):
        _publish_to_stat(args.report, recs, args)
        print('Pushed to stat')
    else:
        print("Report not specified, won't push to stat")

    if _is_report_enabled(args.report_winrates):
        _publish_to_stat(args.report_winrates, winrates_recs, args)
        print('Winrates pushed to stat')
    else:
        print("Winrates report not specified, won't push to stat")


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
