#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import time
import requests
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
import datetime
import itertools
from collections import Counter
from pytils import (
    yt_get_date_from_table as get_date,
    date_range,
    get_dates_from_stat
)


# Statface report path the results are published to; main() may override it
# from the --report command-line option.
report = 'Video/Others/Strm/long_chunks_share'
# Root directory for this job's tables; process_date() writes
# <JOB_ROOT>/<date>/tmp and <JOB_ROOT>/<date>/report under it.
JOB_ROOT = '//home/videoquality/vh_analytics/mma_1780_slow_chunks'
# Table joined on its ASN column in QUERY; its ISP column becomes `provider`.
ASNAME_TABLE = (
    '//home/search-research/ensuetina/AS_MAP/proper_AS_names_corrected'
)
# Root of the per-day input log tables; a day's table is <PERF_ROOT>/<date>.
PERF_ROOT = '//logs/strm-perf-log/1d'
# Headers sent with every YQL API request in yql_run().
# NOTE: os.environ is read at import time, so importing this module raises
# KeyError when YQL_TOKEN is not set.
YQL_HEADERS = {
    'Content-Type': 'application/json',
    'Authorization': 'OAuth {}'.format(os.environ['YQL_TOKEN'])
}
# Headers passed to get_dates_from_stat() in main().
# NOTE: also read at import time; STAT_LOGIN and STAT_TOKEN must be set.
STAT_HEADERS = {
    'StatRobotUser': os.environ['STAT_LOGIN'],
    'StatRobotPassword': os.environ['STAT_TOKEN']
}
# YQL template with three str.format placeholders: {result} (output table,
# overwritten via `with truncate`), {log} (input log table) and {asnames}
# (AS-name lookup table, left-joined on ASN).
# The inner select keeps rows with res_type = "ts" whose remote_addr is in
# region 225, divides `duration` by 1000 and caps it at 20.0 (presumably a
# milliseconds -> seconds conversion; confirm against the log schema).
# The outer select adds `long` = 1 when the capped duration exceeds 5, else 0.
QUERY = '''use hahn;
pragma yt.InferSchema;
insert into [{result}] with truncate
select
    ripe.ISP as provider,
    src.asn as asn,
    src.ip as ip,
    src.geo as geo,
    src.duration as duration,
    if (src.duration > 5, 1, 0) as long,
    src.region as region,
    src.district as district
from (
    select remote_addr as ip, Geo::GetAsset(remote_addr) as asn,
        min_of(20.0, cast(duration as Double) / 1000) as duration, geo,
        Geo::RoundRegionByIp(remote_addr, "region").name as region,
        Geo::RoundRegionByIp(remote_addr, "district").name as district
    from [{log}]
    where res_type = "ts"
    and Geo::IsIpInRegion(remote_addr, 225)
) as src
left join [{asnames}] as ripe
    on ripe.ASN == src.asn
'''


def yql_run(query):
    """Submit ``query`` to the YQL HTTP API and block until it finishes.

    The operation is started with a POST request and then polled once per
    minute, for at most 60 polls, while its status is ``PENDING`` or
    ``RUNNING``.

    Args:
        query: Text of the SQL query to run.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        RuntimeError: If the operation ends in any status other than
            ``COMPLETED``, or is still pending/running after the last poll.
            Previously this case was only written to stderr, which let the
            caller continue with a result table that was never produced.
    """
    active_statuses = {'PENDING', 'RUNNING'}
    req = requests.post(
        'https://yql.yandex.net/api/v2/operations',
        json={
            'content': query,
            'action': 'RUN',
            'type': 'SQL',
            'title': 'MMA-1780 | YQL'
        },
        headers=YQL_HEADERS
    )
    # Fail with a clear HTTP error instead of a KeyError on the JSON body.
    req.raise_for_status()
    operation = req.json()
    id_ = operation['id']
    print('running query {}'.format(id_))
    status = operation['status']
    tries = 0
    while status in active_statuses and tries < 60:
        req = requests.get(
            'https://yql.yandex.net/api/v2/operations/{}'.format(id_),
            headers=YQL_HEADERS
        )
        req.raise_for_status()
        status = req.json()['status']
        print('operation status is {}'.format(status))
        if status not in active_statuses:
            # Finished: skip the sleep that would otherwise follow.
            break
        time.sleep(60)
        tries += 1
    if status != 'COMPLETED':
        message = 'operation {} failed: {}'.format(id_, req.content)
        sys.stderr.write(message + '\n')
        raise RuntimeError(message)


def totalize(recs):
    """Fan each record out into every real/'_total_' dimension combination.

    For each input record, yields eight records: one per element of the
    Cartesian product of (provider, '_total_') x (region, '_total_') x
    (district, '_total_'). Each output record carries the input record's
    ``long`` value unchanged.

    Args:
        recs: Iterable of records exposing ``provider``, ``region``,
            ``district`` and ``long`` attributes.

    Yields:
        Record with ``provider``, ``region``, ``district`` and ``long``.
    """
    total_marker = '_total_'
    for rec in recs:
        combinations = itertools.product(
            (rec.provider, total_marker),
            (rec.region, total_marker),
            (rec.district, total_marker),
        )
        for provider, region, district in combinations:
            yield Record(
                provider=provider,
                region=region,
                district=district,
                long=rec.long
            )


def get_long_share(groups):
    """Compute the share of "long" observations for every group.

    Each record's ``long`` field is passed through ``dict()`` and summed into
    a Counter (presumably a histogram of (flag, count) pairs as produced by
    the upstream aggregation; confirm against the caller). The share is the
    count stored under key ``1`` divided by the sum of all counts; the
    file-level ``from __future__ import division`` makes this a true division.

    Args:
        groups: Iterable of ``(key, records)`` pairs, where ``key`` provides
            ``to_dict()``.

    Yields:
        Record holding the key's fields plus ``long_share``.
    """
    for key, recs in groups:
        tally = Counter()
        for rec in recs:
            tally = tally + Counter(dict(rec.long))
        row = key.to_dict()
        long_count = tally[1]
        total_count = sum(tally.values())
        row['long_share'] = float(long_count / total_count)
        yield Record(**row)


def process_date(cluster, date, proxy='hahn'):
    """Build and publish the long-chunk share report for a single date.

    Steps:
      1. Run QUERY through yql_run() to fill <JOB_ROOT>/<date>/tmp from the
         day's log table under PERF_ROOT.
      2. Run a job on ``cluster`` that drops rows with an empty or '-'
         provider, builds a histogram of ``long`` per
         (provider, region, district), expands the rows with '_total_'
         combinations via totalize(), reduces them with get_long_share(),
         and writes the sorted result to <JOB_ROOT>/<date>/report.
      3. Publish the report table to the Statface report named by the
         module-level ``report``.

    Args:
        cluster: Cluster object providing ``job()``.
        date: Date used in the table paths and as the ``fielddate`` value.
        proxy: Proxy name passed to ``remote_publish``.
    """
    log_table = '{}/{}'.format(PERF_ROOT, date)
    tmp_table = '{}/{}/tmp'.format(JOB_ROOT, date)
    report_table = '{}/{}/report'.format(JOB_ROOT, date)

    yql_run(
        QUERY.format(
            asnames=ASNAME_TABLE,
            log=log_table,
            result=tmp_table
        )
    )

    job = cluster.job()
    dimensions = ('provider', 'region', 'district')

    # Keep only rows with a non-empty provider other than '-'.
    with_provider = job.table(tmp_table).filter(
        nf.custom(
            lambda provider: bool(provider) and provider != '-', 'provider'
        )
    )
    histograms = with_provider.groupby(*dimensions).aggregate(
        long=na.histogram('long')
    )
    expanded = histograms.map(totalize, intensity='ultra_cpu')
    shares = expanded.groupby(*dimensions).reduce(get_long_share)
    dated = shares.project(ne.all(), fielddate=ne.const(str(date)))
    dated.sort('fielddate', *dimensions).put(
        report_table,
        schema=dict(
            fielddate=str,
            provider=str,
            region=str,
            district=str,
            long_share=float,
        )
    )

    job.run()

    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=os.environ['STAT_LOGIN'],
        password=os.environ['STAT_TOKEN']
    )

    stat_report = ns.StatfaceReport().path(report).scale('daily')
    stat_report = stat_report.replace_mask('fielddate').client(stat_client)
    stat_report.remote_publish(
        proxy=proxy,
        table_path=report_table,
        async_mode=False,
        upload_config=False
    )


def main():
    """Parse command-line options and process every requested date.

    Options:
        --from / --to: Bounds handed to date_range(); they must be given
            together. Supplying only one of them is rejected with a usage
            error instead of being silently ignored.
        --report: Statface report path; overrides the module-level ``report``.
        --proxy: Proxy name forwarded to process_date().
        --pool: Pool passed to the YT cluster.

    When no range is given, the dates are discovered automatically: every
    table found under PERF_ROOT whose date is greater than the last element
    returned by get_dates_from_stat() (presumably the latest date already in
    the report; confirm against pytils) is processed.
    """
    global report
    parser = argparse.ArgumentParser()
    parser.add_argument('--from')
    parser.add_argument('--to')
    parser.add_argument('--report', default=report)
    parser.add_argument('--proxy', default='hahn')
    parser.add_argument('--pool', default='loadbase')
    args = parser.parse_args()

    report = args.report
    # 'from' is a Python keyword, so the attribute cannot be read as args.from.
    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')

    # A half-specified range used to fall through to auto-detection, which
    # processes different dates than the user asked for; reject it instead.
    if bool(from_) != bool(to_):
        parser.error('--from and --to must be specified together')

    cluster = clusters.YT(
        proxy=os.environ['YT_PROXY'],
        token=os.environ['YT_TOKEN'],
        pool=args.pool,
    )

    if from_ and to_:
        dates_to_process = list(date_range(from_, to_))
    else:
        last_date = get_dates_from_stat(
            headers=STAT_HEADERS,
            report=report,
            dimensions=['provider', 'region', 'district']
        )[-1]
        yt = cluster.driver.client

        # Keep only paths that yield a date newer than the last reported one.
        dates_to_process = [get_date(x) for x in yt.search(
            root=PERF_ROOT,
            path_filter=lambda x: (
                get_date(x) and get_date(x) > last_date
            )
        )]

    for date in dates_to_process:
        process_date(cluster, date, proxy=args.proxy)


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
