#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json

from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
import getpass
import datetime
from pytils import date_range
import tldextract
import urlparse
import itertools

# Fixed reference date and its ISO ``YYYY-MM-DD`` string form; DATE_F is the
# value substituted for the ``$date`` path template in process_date.
DATE = datetime.date(year=2017, month=5, day=20)
DATE_F = DATE.isoformat()


def parse_date(str_):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    ``datetime.datetime.strptime`` raises ``ValueError`` when ``str_`` does
    not match the format; that error is not handled here.
    """
    parsed = datetime.datetime.strptime(str_, '%Y-%m-%d')
    return parsed.date()


def extract_host(url):
    """Return the registered domain that tldextract reports for ``url``.

    Falls back to an empty string when the lookup raises ``TypeError``;
    any other exception propagates to the caller.
    """
    try:
        extracted = tldextract.extract(url)
        return extracted.registered_domain
    except TypeError:
        return ""


def percent_decode(url):
    """Return ``url`` with its percent-escapes decoded.

    A value that is not a ``str`` instance is first encoded to UTF-8
    (unencodable characters are replaced) and then passed to
    ``urlparse.unquote``.
    """
    if isinstance(url, str):
        raw = url
    else:
        raw = url.encode('utf8', errors='replace')
    return urlparse.unquote(raw)


def parse_vc(recs):
    """Expand the JSON payload of each input record into one record per entry.

    Every input record must expose ``dec`` (a JSON string) and ``yandexuid``.
    The decoded payload is walked as a list of ``{url: {'p': [entry, ...]}}``
    mappings.  Only entries of a known length are used:

    * 10 elements: error at index 7, source URL at index 8;
    * 11 or 13 elements: error at index 8, source URL at index 9.

    For every accepted entry a ``Record`` is yielded with the fields ``url``,
    ``src_url`` (``blob:`` prefix stripped, percent-decoded), ``host``,
    ``src_host``, ``plen``, ``el``, ``err`` (``"no errors"`` when empty),
    ``yandexuid`` and ``el_00`` .. ``el_NN`` holding the entry's elements in
    reverse order (``el_00`` is the last element).

    Parsing is best-effort: anything that does not have the expected shape is
    skipped silently instead of failing the whole map operation.
    """
    # Byte and unicode strings on Python 2 (both items are ``str`` on Python 3).
    text_types = (str, type(u''))

    for rec in recs:
        try:
            dec = json.loads(rec.dec)
        except Exception:
            # Deliberate best-effort: the payload is not guaranteed to be
            # valid JSON.  ``Exception`` (unlike a bare ``except``) still lets
            # KeyboardInterrupt / SystemExit through.
            continue
        if not isinstance(dec, list):
            # Scalars are not iterable and other containers never carry
            # usable entries.
            continue

        for d in dec:
            if not isinstance(d, dict):
                continue
            for url in d:
                try:
                    p = d[url]['p']
                except (KeyError, TypeError):
                    continue
                if not isinstance(p, list):
                    continue

                # The page host depends only on ``url``, so resolve it once
                # per URL rather than once per entry.
                if 'yandex.ru/video' in url:
                    host = 'yandex.ru/video'
                else:
                    try:
                        host = extract_host(url)
                    except Exception:
                        continue

                for el in p:
                    if not isinstance(el, list):
                        # Unsized or non-indexable entries cannot be parsed.
                        continue
                    if len(el) == 10:
                        err = (el[7] or '')
                        src_url = (el[8] or '')
                    elif len(el) in {11, 13}:
                        err = (el[8] or '')
                        src_url = (el[9] or '')
                    else:
                        continue
                    if not isinstance(src_url, text_types):
                        # A non-string source URL cannot be decoded.
                        continue
                    if src_url.startswith('blob:'):
                        src_url = src_url[len('blob:'):]
                    src_url = percent_decode(src_url)

                    try:
                        src_host = extract_host(src_url)
                    except Exception:
                        continue
                    if not err:
                        err = "no errors"
                    result = dict(
                        url=url, src_url=src_url,
                        host=host,
                        src_host=src_host,
                        plen=len(el), el=el, err=err,
                        yandexuid=rec.yandexuid
                    )
                    for i, x in enumerate(reversed(el)):
                        result["el_{:02}".format(i)] = x
                    yield Record(**result)


def main():
    """Parse the command line and run ``process_date`` for each date.

    ``--from`` and ``--to`` are parsed as ``YYYY-MM-DD`` dates and handed to
    ``date_range``; the whole parsed namespace (including the Statface
    credentials) is forwarded to ``process_date`` together with each date.
    """
    option_specs = (
        (('--from', '-f'), dict(required=True)),
        (('--to', '-t'), dict(required=True)),
        (('--stat_login', '-sl'), dict(default='robot_pecheny')),
        (('--stat_password', '-sp'), dict(required=True)),
    )
    arg_parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        arg_parser.add_argument(*flags, **settings)
    args = arg_parser.parse_args()

    # ``from`` is a Python keyword, so the values are read via the namespace
    # dict instead of attribute access.
    options = vars(args)
    first_day = parse_date(options['from'])
    last_day = parse_date(options['to'])

    for day in date_range(first_day, last_day):
        process_date(day, args)


def totalize(records):
    """Fan each record out into its per-host and aggregate ("total") rows.

    For every input record the ``src_host`` field is emitted both as-is and
    as ``'_total_'``; the ``err`` field is emitted as-is and as ``'_total_'``,
    plus ``'_all_errors_'`` when it is a real error (anything other than
    ``"no errors"``).  All remaining fields are copied unchanged, so one input
    record yields 4 or 6 output records.
    """
    for rec in records:
        fields = vars(rec)
        host = fields.pop('src_host')
        error = fields.pop('err')

        if error == "no errors":
            error_keys = (error, "_total_")
        else:
            error_keys = (error, "_all_errors_", "_total_")

        for host_key in (host, '_total_'):
            for error_key in error_keys:
                fields['src_host'] = host_key
                fields['err'] = error_key
                yield Record(**fields)


def add_shares(groups):
    """Attach to every row its share of the group's ``'_total_'`` row.

    ``groups`` yields ``(key, records)`` pairs where ``key.src_host`` names
    the group.  The row whose ``err`` is ``'_total_'`` supplies the
    denominator and is emitted with ``share=1``.  Rows seen before the total
    are buffered and emitted after the group has been consumed; rows seen
    after it are emitted immediately.  Shares are rounded to 6 decimals.
    """
    def with_share(rec, host, share):
        # Output row: the fields of ``rec`` that are kept, plus ``share``.
        return Record(
            src_host=host,
            fielddate=rec.fielddate,
            err=rec.err,
            absolute=rec.absolute,
            share=share
        )

    for key, records in groups:
        host = key.src_host
        pending = []
        total = None

        for rec in records:
            if rec.err == '_total_':
                total = float(rec.absolute)
                yield with_share(rec, host, 1)
                continue
            if total is None:
                # Denominator not known yet; emit after the group is read.
                pending.append(rec)
                continue
            yield with_share(rec, host, round(rec.absolute / total, 6))

        for rec in pending:
            yield with_share(rec, host, round(rec.absolute / total, 6))


def process_date(date, args):
    """Build the video error report for one day and publish it to Statface.

    The job reads ``logs/bar-navig-log/1d/<date>``, expands each record with
    ``parse_vc``, keeps rows whose ``host`` is ``'yandex.ru/video'`` and whose
    ``src_host`` is non-empty, counts rows per ``(src_host, err)``, stores the
    counts in ``$job_root/results``, then adds total rows (``totalize``) and
    shares (``add_shares``) and stores the final table in
    ``$job_root/$date/report``.  That table is read back and published as the
    daily Statface report ``Video/Others/errors``.

    :param date: day to process; any object with a ``strftime`` method
        (``datetime.date`` or ``datetime.datetime``).
    :param args: parsed command-line namespace; ``stat_login`` and
        ``stat_password`` are used as the Statface credentials.
    """
    # Call strftime on the instance: the unbound
    # ``datetime.datetime.strftime`` rejects plain ``datetime.date`` objects
    # with a TypeError.
    date_f = date.strftime('%Y-%m-%d')

    # NOTE(review): the ``$date`` template is filled from the module-level
    # DATE_F, not from ``date_f``, so every processed day writes to the same
    # ``$job_root/$date/report`` path -- confirm this is intended.
    hahn = clusters.yt.Hahn(
        pool='search-research_{}'.format(getpass.getuser())
    ).env(
        templates=dict(
            job_root='home/videolog/errors',
            date=DATE_F
        )
    )

    job = hahn.job().env(
        packages=['tldextract', 'idna', 'requests_file'],
        package_paths=[os.getcwd()]
    )

    log = job.table('logs/bar-navig-log/1d/{}'.format(date_f))

    log.qb2(
        log='bar-navig-log',
        fields=[
            'date', 'yandexuid', 'geo_id', 'url', 'parsed_http_params',
            se.dictitem('decoded_vc', from_='parsed_http_params'),
            # ``dec`` is the first item of ``decoded_vc``, or '-' when empty.
            se.custom('dec', lambda x: x[0] if x else '-', 'decoded_vc'),
        ],
        filters=[
            sf.defined('decoded_vc', 'yandexuid'),
            # sf.region_belongs([225], field='geo_id')
        ]
    ).map(
        parse_vc
    ).filter(
        nf.and_(
            nf.equals('host', 'yandex.ru/video'),
            nf.custom(lambda x: bool(x), 'src_host')
        )
    ).groupby(
        'src_host', 'err'
    ).aggregate(
        absolute=na.count()
    ).project(
        ne.all(), fielddate=ne.const(date_f)
    ).sort(
        'absolute'
    ).put(
        '$job_root/results'
    ).map(
        totalize
    ).groupby(
        'src_host', 'err', 'fielddate'
    ).aggregate(
        absolute=na.sum('absolute')
    ).groupby(
        'src_host'
    ).sort('err').reduce(
        add_shares
    ).project(
        ne.all(), platform=ne.const('desktop')
    ).sort('src_host', 'err').put(
        '$job_root/$date/report'
    )

    job.run()

    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=args.stat_login,
        password=args.stat_password
    )

    report = ns.StatfaceReport().path(
        'Video/Others/errors'
    ).scale('daily')

    report = report.client(client)

    # The path has no ``{}`` placeholders, so it is passed as-is; the
    # ``$``-templates are the ones configured on ``hahn`` above.
    report = report.data(
        hahn.read('$job_root/$date/report')
    )

    report.publish()


if __name__ == "__main__":
    main()
