#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
import getpass
import json
import datetime
import urlparse
from collections import Counter
import random

# Fixed reference date and its 'YYYY-MM-DD' string representation.
DATE = datetime.date(year=2017, month=7, day=4)
DATE_F = DATE.isoformat()


def good_dump(obj, fn):
    """Write *obj* to the file *fn* as human-readable UTF-8 JSON.

    The output is indented by 4 spaces, keys are sorted and non-ASCII
    characters are written as-is instead of being ``\\u``-escaped.

    Args:
        obj: JSON-serializable object.
        fn: path of the file to create or overwrite.
    """
    # The context manager guarantees the data is flushed and the handle is
    # closed even if serialization fails half-way.
    with codecs.open(fn, 'w', 'utf8') as out:
        json.dump(obj, out, indent=4, ensure_ascii=False, sort_keys=True)


# Leading host labels that get_host() removes from a network location.
bad_prefices = ('www.', 'm.')


def get_host(url, strip_tld=True):
    """Return the network location of *url* without 'www.'/'m.' prefixes.

    Leading prefixes listed in ``bad_prefices`` are removed repeatedly until
    the host starts with none of them.  When *strip_tld* is true, everything
    from the last dot onwards is dropped as well, so a host that contains no
    dot becomes an empty string.
    """
    host = urlparse.urlparse(url).netloc

    # Keep making passes over the prefixes until a pass removes nothing.
    stripped = True
    while stripped:
        stripped = False
        for prefix in bad_prefices:
            if host.startswith(prefix):
                host = host[len(prefix):]
                stripped = True

    if not strip_tld:
        return host
    return host.rpartition('.')[0]


def safediv(x, y):
    """Return ``x / y``, or 0 when the division raises ZeroDivisionError."""
    try:
        quotient = x / y
    except ZeroDivisionError:
        quotient = 0
    return quotient


def _load_json(path):
    """Parse the JSON file at *path*, closing the handle afterwards."""
    with open(path) as src:
        return json.load(src)


def _parse_args():
    """Parse the command line and resolve the effective report date.

    ``args.date`` falls back to today's date when ``--date`` is not given;
    when ``--datefile`` is given, the stripped content of that file
    overrides any other value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', required=True)
    parser.add_argument('--detailed', default=None)
    parser.add_argument('--serps', '-s', required=True)
    parser.add_argument('--pairs', '-p', required=True)
    parser.add_argument('--output', '-o', default='output.json')
    parser.add_argument('--date', '-d')
    parser.add_argument('--datefile', '-df')
    parser.add_argument(
        '--table',
        default='//home/videoindex/full/dups/cdups/prevdata/cdups.hostany'
    )
    parser.add_argument('--basket', '-b', required=True)
    parser.add_argument('--serp_type', '-t', required=True)
    parser.add_argument('--stat_login', '-sl', default='robot_pecheny')
    parser.add_argument('--stat_password', '-sp', required=True)
    parser.add_argument('--local', '-l', action='store_true')
    parser.add_argument('--cache', '-c', action='store_true')
    parser.add_argument('--nopush', '-n', action='store_true')
    args = parser.parse_args()

    if not args.date:
        args.date = datetime.date.today().strftime('%Y-%m-%d')
    if args.datefile:
        with codecs.open(args.datefile, 'r', 'utf8') as f:
            args.date = f.read().strip()
    return args


def _collect_judgements(data, pairs):
    """Return ``(urls, results)`` for the judged tasks whose pair is wanted.

    ``urls`` is the set of all URLs of the kept tasks; ``results`` maps the
    sorted ``(url, url)`` tuple to the task's raw ``result`` value.
    """
    urls = set()
    results = {}
    for d in data:
        url1 = d['inputValues']['url1']
        url2 = d['inputValues']['url2']
        if (url1, url2) not in pairs:
            continue
        urls.add(url1)
        urls.add(url2)
        results[tuple(sorted([url1, url2]))] = d['outputValues']['result']
    return urls, results


def _upload_detailed(args, pairs):
    """Store the detailed rows of the wanted pairs in a sorted YT table."""
    detailed = [
        d for d in _load_json(args.detailed)
        if (d['inputValues']['url1'], d['inputValues']['url2']) in pairs
    ]
    if not detailed:
        return
    hahn = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])
    table_name = '//home/videolog/dups_metric/{}_{}_detailed'.format(
        args.basket, args.serp_type
    )
    hahn.driver.client.write_table(table_name, detailed)
    hahn.driver.client.run_sort(table_name, table_name, sort_by=['workerId'])
    hahn.driver.client.set_attribute(
        path=table_name,
        attribute='date',
        value=args.date
    )


def _lookup_dup_groups(cl, args, urls):
    """Return a ``{url: dg}`` mapping built by joining *urls* with args.table.

    The join key ``GroupingUrl`` is the URL with its scheme removed.
    """
    # Split on the FIRST '//' only: a URL whose path or query contains
    # another '//' (e.g. an embedded URL) must keep everything after the
    # scheme.  [-1] also keeps scheme-less input intact instead of raising.
    recs = [
        Record(GroupingUrl=url.split('//', 1)[-1], url=url) for url in urls
    ]
    joined_table = '$job_root/{}_{}/joined'.format(args.basket, args.serp_type)

    cl.write('$job_root/for_join', recs)

    # NOTE(review): job_root is randomized per run in main(), so it is
    # unclear how a previously built table could be found under it when
    # --cache skips the join - confirm the intended behaviour of --cache.
    if not args.cache:
        job = cl.job()
        fj = job.table('$job_root/for_join')
        # (the original code carried a commented-out alternative source:
        #  '//home/videoindex/full/dupdb/prevdata/dupsdb')
        job.table(
            args.table
        ).project("GroupingUrl", dg="urlBaseHash").join(
            fj, by="GroupingUrl", type='inner'
        ).put(
            joined_table
        )
        job.run()

    return {x.url: x.dg for x in cl.read(joined_table)}


def _compare_pairs(results, joined):
    """Build one comparison record per judged pair.

    Returns ``(final_results, errors)``: pairs for which both URLs have a
    truthy dg value, and pairs for which at least one does not (those are
    additionally marked with ``error=True``).
    """
    dup_states = {'EQUAL', 'EQUAL_ALGO', 'NEARLY_EQUAL', 'DIFFERENT_QUALITY'}
    final_results = []
    errors = []
    for pair in results:
        url1_dg = joined.get(pair[0], None)
        url2_dg = joined.get(pair[1], None)
        element = {
            'url1': pair[0],
            'url2': pair[1],
            'url1_dg': url1_dg,
            'url2_dg': url2_dg,
            'toloka': results[pair] in dup_states,
            'toloka_raw': results[pair],
            # Deliberately left as in the original: this is the falsy dg
            # value itself (not False) when either dg is missing.
            'dg': url1_dg and url2_dg and url1_dg == url2_dg
        }
        if url1_dg and url2_dg:
            final_results.append(element)
        else:
            element['error'] = True
            errors.append(element)
    return final_results, errors


def _measure_serps(serps, all_dups, serp_size=10):
    """Find duplicates inside every SERP.

    A URL counts as a duplicate of an earlier URL of the same SERP when the
    two are identical, or when they share a host (per ``get_host``) and the
    pair is in *all_dups*.  Only the first matching earlier URL is recorded.

    Args:
        serps: list of dicts with 'serp' (list of URLs) and 'query' keys.
        all_dups: set of sorted URL pairs judged to be duplicates.
        serp_size: divisor applied to each SERP's duplicate count (the
            original code hard-coded 10; presumably the SERP length -
            TODO confirm).

    Returns:
        ``(serps_out, counts, dups_measure)``: one Record per SERP URL, a
        Counter of duplicates-per-SERP, and the mean normalised duplicate
        count (0 when *serps* is empty).
    """
    counts = Counter()
    serps_out = []
    dups_metric_list = []
    for i, serp in enumerate(serps):
        seen = []
        serp_dups = []
        for u, url in enumerate(serp['serp']):
            dup_url = ""
            for url1 in seen:
                if (
                    url == url1 or
                    (
                        get_host(url) == get_host(url1) and
                        tuple(sorted([url, url1])) in all_dups
                    )
                ):
                    dup_url = url1
                    serp_dups.append(tuple(sorted([url, url1])))
                    break
            serps_out.append(
                Record(
                    serp_id=i + 1,
                    url_id=u + 1,
                    url=url,
                    query=serp['query'],
                    dup_url=dup_url
                )
            )
            seen.append(url)
        counts[len(serp_dups)] += 1
        dups_metric_list.append(len(serp_dups) / serp_size)

    # safediv keeps an empty SERP file from raising ZeroDivisionError.
    dups_measure = safediv(sum(dups_metric_list), len(dups_metric_list))
    return serps_out, counts, dups_measure


def _store_tables(args, final_results, serps_out):
    """Write the pair records and the SERP records to their YT tables."""
    # Same explicit token as the other cluster clients in this script.
    hahn = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])
    path = '//home/videolog/dups_metric/{}_{}'.format(
        args.basket, args.serp_type
    )
    tables = (
        (path, [Record(**x) for x in final_results]),
        (path + '_serps', serps_out),
    )
    for table_path, records in tables:
        hahn.write(records=records, path=table_path)
        hahn.driver.client.set_attribute(
            path=table_path,
            attribute='build_datetime',
            value=datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        )


def _push_metrics(args, metrics):
    """Publish the three metrics as one daily row of the Statface report."""
    recs = [
        Record(
            fielddate=args.date,
            basket=args.basket,
            serp_type=args.serp_type,
            recall=metrics['recall'],
            precision=metrics['precision'],
            dups_measure=metrics['dups_measure']
        )
    ]
    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=args.stat_login,
        password=args.stat_password
    )

    ns.StatfaceReport().path(
        'Video/Others/dups-metrics'
    ).scale(
        'daily'
    ).client(
        client
    ).data(
        recs
    ).publish()


def main():
    """Compare Toloka duplicate judgements with dg values and report metrics.

    Steps, in order:
      1. read the judged tasks (--input) and keep those listed in --pairs;
         exit early with an empty output file when there are no pairs;
      2. optionally upload detailed rows ('validate' baskets only);
      3. look up the dg value of every URL through a join with --table;
      4. compute recall / precision of "same dg" against the judgement;
      5. compute the in-SERP duplicate measure from --serps;
      6. store the tables (non-'kpi' baskets), push the metrics unless
         --nopush is given, and print / optionally dump a breakdown.

    Requires the YT_TOKEN environment variable.
    """
    args = _parse_args()

    cl = clusters.Banach(token=os.environ['YT_TOKEN']).env(
        templates=dict(
            job_root='tmp/videolog/{}'.format(
                random.SystemRandom().randint(1000000, 100000000)
            )
        )
    )

    data = _load_json(args.input)
    pairs = {tuple(x) for x in _load_json(args.pairs)['pairs']}
    if not pairs:
        print('Pairs file is empty, exiting')
        good_dump({}, args.output)
        sys.exit(0)

    urls, results = _collect_judgements(data, pairs)
    if args.detailed and 'validate' in args.basket:
        _upload_detailed(args, pairs)

    joined = _lookup_dup_groups(cl, args, urls)
    final_results, errors = _compare_pairs(results, joined)

    both = sum(1 for x in final_results if x['toloka'] and x['dg'])
    metrics = {}
    metrics['recall'] = safediv(
        both, sum(1 for x in final_results if x['toloka'])
    )
    metrics['precision'] = safediv(
        both, sum(1 for x in final_results if x['dg'])
    )

    all_dups = {
        tuple(
            sorted([x['url1'], x['url2']])
        ) for x in final_results if x['toloka']
    }

    # From here on final_results also contains the error records.
    final_results += errors
    good_dump(final_results, args.output)

    if args.serps:
        serps_out, c, metrics['dups_measure'] = _measure_serps(
            _load_json(args.serps), all_dups
        )
        print(c.most_common())
    else:
        serps_out = []
        metrics['dups_measure'] = 0

    print('dups measure: {}'.format(metrics['dups_measure']))

    if 'kpi' not in args.basket:
        _store_tables(args, final_results, serps_out)

    if not args.nopush:
        _push_metrics(args, metrics)

    bn = os.path.splitext(args.output)[0]
    print('recall: {}'.format(metrics['recall']))
    print('precision: {}'.format(metrics['precision']))

    # NOTE(review): unlike recall/precision above, these buckets are taken
    # from the list that already includes the error records - confirm that
    # this is intended.
    breakdown = [
        (
            'both dups', '_both_dups.json',
            [x for x in final_results if x['toloka'] and x['dg']]
        ),
        (
            'both ok', '_both_ok.json',
            [x for x in final_results if not x['toloka'] and not x['dg']]
        ),
        (
            'false negatives', '_false_negatives.json',
            [x for x in final_results if x['toloka'] and not x['dg']]
        ),
        (
            'false positives', '_false_positives.json',
            [x for x in final_results if x['dg'] and not x['toloka']]
        ),
    ]
    if args.local:
        for _, suffix, items in breakdown:
            good_dump(items, bn + suffix)
    for label, _, items in breakdown:
        print('{}: {}'.format(label, len(items)))


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
