#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json

from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
import getpass
import datetime
from pytils import date_range
import urlparse
import itertools



def parse_date(str_):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    ``strptime`` raises ``ValueError`` when the text does not match the
    format.
    """
    moment = datetime.datetime.strptime(str_, '%Y-%m-%d')
    return moment.date()


# Leading host labels that are removed by extract_host().
bad_prefices = ('www.', 'm.')


def extract_host(url, strip_tld=False):
    """Return the network location of *url* without ``www.``/``m.`` prefixes.

    The prefixes are removed repeatedly, so ``www.m.example.com`` becomes
    ``example.com``.  With ``strip_tld`` set, the last dot-separated label is
    dropped too (a host containing no dot becomes an empty string).

    When the resulting host equals ``yandex.ru`` and the URL path starts
    with ``/video``, the pseudo-host ``yandex.ru/video`` is returned.  The
    comparison uses the host as it is after the optional TLD stripping.
    """
    pieces = urlparse.urlparse(url)
    host = pieces.netloc

    # Keep making passes over the prefixes until a pass removes nothing.
    changed = True
    while changed:
        changed = False
        for unwanted in bad_prefices:
            if host.startswith(unwanted):
                host = host[len(unwanted):]
                changed = True

    if strip_tld:
        # Everything before the last dot; empty when there is no dot at all.
        host = host.rpartition('.')[0]

    is_video = host == 'yandex.ru' and pieces.path.startswith('/video')
    return 'yandex.ru/video' if is_video else host


def percent_decode(url):
    """Decode ``%XX`` escapes in *url*.

    A value that is not a ``str`` instance is first encoded to UTF-8
    (unencodable characters are replaced) and then unquoted.
    """
    encoded = url if isinstance(url, str) else url.encode('utf8', errors='replace')
    return urlparse.unquote(encoded)


def parse_mm(records):
    """Map raw "video statistics" events to per-frame error records.

    Every input record is expected to expose ``raw_event_value`` holding a
    JSON object.  An event is skipped when:

    * the payload cannot be parsed as JSON or is not a JSON object;
    * ``page url`` or ``frame url`` is missing or empty;
    * the page is not on ``yandex.ru/video`` (as decided by extract_host);
    * the frame URL has no host;
    * ``errors`` is missing or empty.

    An ``errors`` value of ``"0"`` is reported as ``"no errors"``; for any
    other value only the part before the first ``;`` is kept.

    Yields ``Record`` objects with ``page_url``, ``host``, ``src_host`` and
    ``err`` fields.
    """
    for rec in records:
        try:
            event = json.loads(rec.raw_event_value)
        except (ValueError, AttributeError, TypeError):
            continue
        # Valid JSON is not necessarily an object: a list, string, number or
        # null has no ``.get`` and would abort the whole mapper.  Treat such
        # payloads like unparseable ones.
        if not isinstance(event, dict):
            continue
        page_url = event.get('page url')
        frame_url = event.get('frame url')
        if not page_url or not frame_url:
            continue
        host = extract_host(page_url)
        if host != 'yandex.ru/video':
            continue
        src_host = extract_host(frame_url)
        if not src_host:
            continue
        err = event.get('errors')
        if not err:
            continue
        if err == "0":
            err = "no errors"
        else:
            # Several errors are ';'-separated; only the first one is kept.
            err = err.split(';')[0]
        yield Record(
            page_url=page_url,
            host=host,
            src_host=src_host,
            err=err
        )


def main():
    """Parse the command line and process every day of the requested range.

    ``--from`` and ``--to`` are ``YYYY-MM-DD`` dates handed to
    ``date_range``; the parsed arguments (including the Statface login and
    password) are forwarded unchanged to ``process_date`` for each day.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--from', '-f', required=True)
    cli.add_argument('--to', '-t', required=True)
    cli.add_argument('--stat_login', '-sl', default='robot_pecheny')
    cli.add_argument('--stat_password', '-sp', required=True)
    args = cli.parse_args()

    # ``from`` is a reserved word, so that option can only be read through
    # getattr(); ``to`` is an ordinary attribute.
    first_day = parse_date(getattr(args, 'from'))
    last_day = parse_date(args.to)

    for day in date_range(first_day, last_day):
        process_date(day, args)


def totalize(records):
    """Fan every record out into its own row plus the matching total rows.

    For each input record the ``src_host`` is emitted both as-is and as
    ``_total_``.  The ``err`` value is emitted as-is and as ``_total_``;
    real errors (anything other than ``"no errors"``) additionally count
    towards ``_all_errors_``.  All remaining fields are copied unchanged.
    """
    for rec in records:
        # NOTE: vars() returns the record's own attribute dict, so the pops
        # and assignments below modify it in place.
        fields = vars(rec)
        host = fields.pop('src_host')
        error = fields.pop('err')

        if error == "no errors":
            error_keys = (error, "_total_")
        else:
            error_keys = (error, "_all_errors_", "_total_")

        # Hosts vary slowest, errors fastest.
        for host_key in (host, '_total_'):
            for error_key in error_keys:
                fields['src_host'] = host_key
                fields['err'] = error_key
                yield Record(**fields)


def _share_record(src_host, rec, share):
    """Build the output row for *rec* carrying the given *share*."""
    return Record(
        src_host=src_host,
        fielddate=rec.fielddate,
        err=rec.err,
        absolute=rec.absolute,
        share=share
    )


def add_shares(groups):
    """Attach to every row its share of the group's ``_total_`` count.

    *groups* yields ``(key, records)`` pairs where ``key.src_host`` names the
    group.  The ``_total_`` row gets ``share=1``; every other row gets
    ``absolute / total`` rounded to six decimal places.  Rows that arrive
    before the ``_total_`` row are held back and emitted after the rest of
    the group, once the total is known.
    """
    for key, records in groups:
        host = key.src_host
        denominator = None
        pending = []

        for rec in records:
            if rec.err == '_total_':
                denominator = float(rec.absolute)
                yield _share_record(host, rec, 1)
                continue
            if denominator is None:
                # Total not seen yet: the share cannot be computed for now.
                pending.append(rec)
                continue
            yield _share_record(
                host, rec, round(rec.absolute / denominator, 6)
            )

        for rec in pending:
            yield _share_record(
                host, rec, round(rec.absolute / denominator, 6)
            )


def process_date(date, args):
    """Build the mobile video error report for one day and publish it.

    The job reads the day's ``metrika-mobile-log`` table, keeps
    ``video statistics`` events, counts them per ``(src_host, err)`` via
    ``parse_mm``, stores the dated counts in ``$job_root/results``, expands
    them with ``totalize``, computes shares with ``add_shares`` and writes
    the result to ``$job_root/$date/report``.  After the job has run, that
    table is published to the daily Statface report ``Video/Others/errors``
    using ``args.stat_login`` / ``args.stat_password``.
    """
    date_f = datetime.datetime.strftime(date, '%Y-%m-%d')
    report_path = '$job_root/$date/report'

    templates = dict(
        job_root='home/videolog/errors_mobile',
        date=date_f
    )
    hahn = clusters.yt.Hahn(pool='search-research_pecheny').env(
        templates=templates
    )

    job = hahn.job().env()

    # Stage 1: the raw events of interest.
    source = job.table('logs/metrika-mobile-log/1d/{}'.format(date_f))
    events = source.qb2(
        log='metrika-mobile-log',
        fields=['device_id', 'event_name', 'raw_event_value'],
        filters=[
            sf.equals('event_name', 'video statistics'),
            sf.default_filtering('metrika-mobile-log')
        ]
    )

    # Stage 2: event counts per frame host and error, stamped with the date.
    counts = events.map(parse_mm).groupby('src_host', 'err').aggregate(
        absolute=na.count()
    )
    dated = counts.project(
        ne.all(), fielddate=ne.const(date_f)
    ).sort('absolute').put('$job_root/results')

    # Stage 3: add the total rows and sum the counts that fall into them.
    totals = dated.map(totalize).groupby(
        'src_host', 'err', 'fielddate'
    ).aggregate(
        absolute=na.sum('absolute')
    )

    # Stage 4: shares within each host, then the final report table.
    shares = totals.groupby('src_host').sort('err').reduce(add_shares)
    shares.project(
        ne.all(), platform=ne.const('mobile')
    ).sort('src_host', 'err').put(report_path)

    job.run()

    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=args.stat_login,
        password=args.stat_password
    )

    report = ns.StatfaceReport().path('Video/Others/errors').scale('daily')
    report = report.client(client)
    # NOTE: the path has no ``{}`` fields, so ``.format(date)`` leaves it
    # unchanged.
    report = report.data(hahn.read(report_path.format(date)))
    report.publish()


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
