#-*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from copy import deepcopy
import datetime
import re
import json
import numpy as np
import sys
import os

# Separator between individual timing entries inside a record's `vars` field.
TIMINGS_SEPARATOR = '!'

# Names given, in order, to the leading ';'-terminated numeric fields of a
# single timing entry (one name per capture group of TIMINGS_REGEX).
TIMING_TYPES = [
    'start',
    'redirect',
    'dns',
    'cache',
    'connection',
    'request',
    'response',
    'summary',
    'response_end',
]

# One "(\d+);" capture group per timing type, anchored at the entry start.
TIMINGS_REGEX = re.compile(r'(\d+);' * len(TIMING_TYPES))

# Values of N for which a "response_time_N" metric is produced.
THUMBS_METRIC_COUNT = [1, 3, 5, 10, 15, 20]

# Percentiles reported for every metric (as fields p25, p50, ...).
METRIC_PERCENTILES = [25, 50, 80, 95, 99]

# Table path prefix of the daily redir-log; the date is appended to it.
REDIR_LOG_PREFIX = "statbox/redir-log"

def _classify_url(url):
    """Return the (platform, domain) pair derived from a non-empty page URL."""
    if "video/touch" in url:
        platform = "app" if "ui=webmobileapp.yandex" in url else "touch"
    elif "video/pad" in url:
        platform = "pad"
    elif "ui=tvapp" in url:
        platform = "tvapp"
    else:
        platform = "desktop"

    # The domain is whatever follows "yandex." up to the first '/';
    # URLs that match neither prefix fall back to "ru".
    domain = "ru"
    for prefix in ("https://yandex.", "https://www.yandex."):
        if url.startswith(prefix):
            domain = url[len(prefix):].split('/')[0]
            break
    return platform, domain


def _parse_timing(timing):
    """Return (position, values) for one timing entry, or None if malformed."""
    matcher = TIMINGS_REGEX.match(timing)
    if matcher is None:
        return None
    try:
        # The 14th ';'-separated field is used as the entry's position
        # (presumably the thumb's position on the page -- TODO confirm).
        pos = int(timing.split(';')[13])
    except (IndexError, ValueError):
        # Too few fields or a non-numeric position: skip this entry.
        return None
    return pos, [int(value) for value in matcher.groups()]


def get_event_stats(groups):
    """Reducer: collapse all records of one group into a single Record.

    `groups` is an iterable of (key, records) pairs. Every record is read
    through its "url", "path" and "vars" fields. For each group one Record
    is yielded with:

    * platform, domain, request_type, url -- taken from the last record
      of the group that has a non-empty URL ("desktop", "ru", "search" and
      None when no record has one);
    * timings -- JSON of {timing_type: {position: value}} accumulated over
      all timing entries of the group (later entries for the same position
      overwrite earlier ones).

    Records without a URL are ignored entirely: they contribute no timings
    and no longer reset the classification gathered from earlier records.
    """
    for key, recs in groups:
        timings = {timing_type: {} for timing_type in TIMING_TYPES}
        platform, domain, request_type, url = "desktop", "ru", "search", None

        for rec in recs:
            rec_url = rec["url"]
            if not rec_url:
                continue

            url = rec_url
            platform, domain = _classify_url(url)
            request_type = "related" if "related" in rec["path"] else "search"

            # A missing `vars` value is treated as "no timing entries".
            for timing in (rec["vars"] or "").split(TIMINGS_SEPARATOR):
                parsed = _parse_timing(timing)
                if parsed is None:
                    continue
                pos, values = parsed
                for timing_type, value in zip(TIMING_TYPES, values):
                    timings[timing_type][pos] = value

        # json.dumps turns the integer positions into string keys.
        yield Record(platform=platform,
                     domain=domain,
                     timings=json.dumps(timings),
                     url=url,
                     request_type=request_type)

def calc_metrics(groups):
    """Reducer: compute response-time percentiles for one group of records.

    `groups` is an iterable of (key, records) pairs; `key` provides
    "domain", "platform" and "request_type", and every record carries a
    JSON "timings" field holding a {"response_end": {position: value}}
    mapping.

    For every N in THUMBS_METRIC_COUNT, each record that has at least N
    positions contributes the maximum "response_end" value among its first
    N positions. One Record per N with at least one sample is yielded; it
    holds the sample count in `hits` and the percentiles from
    METRIC_PERCENTILES as fields p25, p50, ... (each divided by 1000.0 --
    presumably converting milliseconds to seconds, TODO confirm).
    """
    for key, recs in groups:
        response_times = {cnt: [] for cnt in THUMBS_METRIC_COUNT}

        for rec in recs:
            response_ends = json.loads(rec["timings"])["response_end"]
            # JSON object keys are always strings, so positions must be
            # compared as integers: a plain sort would put "10" before "2"
            # and pick the wrong entries as "the first N".
            values = [response_ends[pos]
                      for pos in sorted(response_ends, key=int)]
            if not values:
                continue
            for cnt in THUMBS_METRIC_COUNT:
                if len(values) >= cnt:
                    response_times[cnt].append(max(values[:cnt]))

        for cnt in THUMBS_METRIC_COUNT:
            data = response_times[cnt]
            if not data:
                continue
            results = {}
            for p in METRIC_PERCENTILES:
                results['p{}'.format(p)] = np.percentile(data, p) / 1000.0

            yield Record(domain=key["domain"],
                         platform=key["platform"],
                         request_type=key["request_type"],
                         metric="response_time_{}".format(cnt),
                         hits=len(data),
                         **results)

def push_to_stat_new(report_table, scale, report, stat_username, stat_token):
    """Publish the table `report_table` to the Statface report `report`.

    A Statface client is created from `stat_username` / `stat_token`, and
    the report (identified by its path and `scale`) is published remotely
    from `report_table` with async_mode and upload_config both disabled.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username=stat_username,
        token=stat_token,
    )

    stat_report = ns.StatfaceReport().path(report).scale(scale).client(stat_client)
    stat_report.remote_publish(
        proxy='arnold',
        table_path=report_table,
        async_mode=False,
        upload_config=False,
    )

def main():
    """Build the daily thumb-speed report table and publish it to Statface.

    Command-line options (all required): --date, --stat_username,
    --stat_token. The redir-log table for the given date is filtered to
    records whose path contains 'thumb-speed', reduced per reqid with
    get_event_stats, then per (domain, platform, request_type) with
    calc_metrics. Rows with hits > 5000 are written to a table under
    //tmp/mma-1755, which is then pushed to the 'Video.All/ThumbsSpeed'
    report with the 'daily' scale.
    """
    arg_parser = argparse.ArgumentParser()
    for option in ('--date', '--stat_username', '--stat_token'):
        arg_parser.add_argument(option, type=str, required=True)
    args = arg_parser.parse_args()

    spec_defaults = dict(
        pool_trees=["physical"],
        tentative_pool_trees=["cloud"],
        max_failed_job_count=200,
    )
    job_templates = dict(
        tmp_root='//home/videolog/tmp',
        title='ThumbLoadsStat',
    )
    cluster = clusters.yt.Arnold().env(
        parallel_operations_limit=10,
        yt_spec_defaults=spec_defaults,
        templates=job_templates,
    )

    # The same table is written by the job and then read by the publisher.
    output_table = '//tmp/mma-1755/prepared_to_stat_' + args.date

    job = cluster.job()
    thumb_events = job.table(REDIR_LOG_PREFIX + '/' + args.date).qb2(
        log='redir-log',
        fields=['path', 'vars', 'url', 'reqid'],
        filters=[sf.contains('path', 'thumb-speed')],
        mode='yamr_lines',
    )
    per_request = thumb_events.groupby('reqid').reduce(get_event_stats)
    metrics = per_request \
        .groupby('domain', 'platform', 'request_type') \
        .reduce(calc_metrics, memory_limit=16 * 1024)
    metrics \
        .project(ne.all(), fielddate=ne.const(args.date)) \
        .filter(sf.custom(lambda hits: hits > 5000, 'hits')) \
        .put(output_table)
    job.run()

    push_to_stat_new(output_table, 'daily', 'Video.All/ThumbsSpeed',
                     args.stat_username, args.stat_token)

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
