#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nile.api.v1 import (
    clusters,
    Record
)

from nile.api.v1 import filters as nf, aggregators as na
from qb2.api.v1 import filters as sf, extractors as se
from nile.files import StatboxDict as sd

from datetime import datetime as dt, timedelta

import json
import re
import os
import requests
import numpy as np
import argparse
from pytils import date_range

from statface_client import StatfaceClient


# In[2]:

# Root directory on the cluster where the job keeps its tables.
# A run can produce tens of gigabytes of temporary files,
# so the quota should be requested with a margin.
job_root = '//home/videolog/strm_video'
# job_root = '//home/morda/video'

# Limit on the number of parallel operations
# (passed to the cluster environment in the __main__ section).
parallel_operations_limit = 30

# Response speed percentiles computed by Reducer_as_stats and published
# by pub_video_as_stats as mbps_p<N> fields.
percentiles = [20, 50, 80, 95]

# Limit for a reduce job; passed as memory_limit= to every .reduce() call.
memory_limit = 30000

# Limit on the number of records sampled per group for the percentiles.
# Reducer_as_stats only collects a sample while len(mbps) < mbps_count_limit,
# so with 0 no samples are collected and mbps_percentiles stays empty.
mbps_count_limit = 0


# In[3]:

# Cluster handle shared by all functions below; assigned in the __main__
# section before any of them is called.
cluster = None


# In[4]:

def get_yesterday_date():
    """Return yesterday's date (local clock) formatted as ``YYYY-MM-DD``."""
    one_day = timedelta(days=1)
    return (dt.now() - one_day).strftime('%Y-%m-%d')


def check_table(table):
    """Tell whether ``table`` exists on the cluster and has at least one row.

    The check reads the ``row_count`` attribute through
    ``cluster.driver.client``. Any ordinary failure of that lookup (for
    example a missing table) is treated as "no usable table".

    :param table: path of the table to inspect.
    :return: ``True`` when ``row_count`` is greater than zero, ``False``
        when it is not or when the attribute cannot be read.
    """
    try:
        return cluster.driver.client.get_attribute(table, 'row_count') > 0
    except Exception:
        # Deliberately best-effort, but KeyboardInterrupt / SystemExit must
        # still propagate, hence ``Exception`` rather than a bare ``except``.
        return False


# In[5]:

# AS (autonomous system) statistics
def get_error_id(parsed_parameters):
    """Extract the error identifier from parsed request parameters.

    :param parsed_parameters: mapping of parameter name to a sequence of
        values (only ``.get`` and indexing are used); may be ``None``.
    :return: the first ``error_id`` value when present; otherwise
        ``'Buffer.Empty'`` when that is the first ``event_id`` value;
        otherwise ``None``. Malformed input also yields ``None``.
    """
    try:
        error_ids = parsed_parameters.get('error_id')
        if error_ids:
            return error_ids[0]

        # Checking for presence first avoids raising an exception for every
        # record that simply has no event_id.
        event_ids = parsed_parameters.get('event_id')
        if event_ids and event_ids[0] == 'Buffer.Empty':
            return event_ids[0]
    except (AttributeError, TypeError, IndexError, KeyError):
        # Unexpected shape (None, non-mapping, non-indexable values): no id.
        return None

    return None


def get_video_as_stats(date):
    """Build the per-autonomous-system table for ``date`` unless it exists.

    Reads ``//statbox/strm-access-log/<date>``, parses it with qb2, keeps
    the records accepted by ``video_as_stats_filter`` that have a non-empty
    ``autonomous_systems`` value, emits one record per autonomous system and
    writes the result to ``<job_root>/<date>/as_stats``.

    :param date: date string used in the source and destination paths.
    """
    as_stats_table = '{}/{}/as_stats'.format(job_root, date)

    # A non-empty table is already there: nothing to do.
    if check_table(as_stats_table):
        return

    job = cluster.job()
    access_log = job.table('//statbox/strm-access-log/%s' % date)

    extracted_fields = [
        ['date', 'timestamp', 'page',
            'parsed_parameters', 'autonomous_systems'],
        se.custom('error_id', lambda parsed_parameters: get_error_id(
            parsed_parameters)),
        'request_time',
        'bytes_sent',
        'status'
    ]
    record_filters = [
        nf.custom(video_as_stats_filter, 'request'),
        nf.custom(lambda x: x is not None and len(
            x), 'autonomous_systems')
    ]

    parsed = access_log.qb2(
        log='strm-access-log',
        fields=extracted_fields,
        filters=record_filters
    )
    parsed.map(unfold_autonomous_systems).put(as_stats_table)

    job.run()


def unfold_autonomous_systems(lines):
    """Mapper: emit one record per entry of ``autonomous_systems``.

    Each output record copies the input record and adds an
    ``autonomous_system`` field holding a single entry. Records with an
    empty ``autonomous_systems`` produce no output.

    :param lines: iterable of records exposing ``autonomous_systems``.
    """
    for source_record in lines:
        for as_number in source_record.autonomous_systems:
            yield Record(source_record, autonomous_system=as_number)

        # An extra per-record '_total_' row is intentionally not emitted:
        # yield Record(record,autonomous_system='_total_')


def add_network(lines):
    """Mapper: add a ``network`` field derived from the AS-name dictionary.

    The dictionary is loaded from ``as-name.yaml`` in the working directory
    and looked up by ``record.autonomous_system``. When an entry is found,
    the network is the text between the first ``'- '`` and
    ``'. Description'``; otherwise it is ``'-'``.

    :param lines: iterable of records exposing ``autonomous_system``.
    :raises IndexError: when a dictionary entry contains no ``'- '``.
    """
    import yaml

    # Close the dictionary file as soon as it is parsed instead of leaking
    # the handle for the lifetime of the mapper.
    with open('as-name.yaml') as dictionary_file:
        # NOTE(review): CLoader is not a "safe" loader; acceptable only as
        # long as as-name.yaml is a trusted file shipped with the job.
        asname_dict = yaml.load(dictionary_file, Loader=yaml.CLoader)

    for record in lines:
        name = asname_dict.get(record.autonomous_system)

        # if record.autonomous_system == '_total_':
        #    network = '_total_'

        if name:
            network = name.split('. Description')[0].split('- ')[1]
        else:
            network = '-'

        yield Record(record, network=network)


def get_video_networks(date):
    """Build the per-network table for ``date`` unless it already exists.

    Maps ``$job_root/<date>/as_stats`` through ``add_network`` (shipping
    ``as-name.yaml`` with the job) into ``$job_root/<date>/networks_stats``.

    :param date: date string used in the table paths.
    """
    as_stats_table = '$job_root/%s/as_stats' % date
    network_table = '$job_root/%s/networks_stats' % date

    # check_table queries cluster.driver.client directly instead of going
    # through job.table()/put(), so it is given the fully formatted path,
    # exactly as get_video_as_stats does. The '$job_root' template form is
    # presumably not expanded on that code path, which made this check
    # always fail and the table always be rebuilt.
    if check_table('{}/{}/networks_stats'.format(job_root, date)):
        return

    job = cluster.job()
    logs = job.table(as_stats_table)

    logs.map(add_network,
             files=[sd('as-name.yaml')]
             ).put(network_table)

    job.run()


def video_as_stats_filter(request):
    """Accept a request unless it is missing or carries an excluded marker.

    :param request: request string, or ``None``.
    :return: ``False`` for ``None`` and for requests containing
        ``for-regional-cache=1`` or ``monitoring=``; ``True`` otherwise.
    """
    if request is None:
        return False

    excluded_markers = ('for-regional-cache=1', 'monitoring=')
    return not any(marker in request for marker in excluded_markers)


class Reducer_as_stats(object):
    """Reducer producing one statistics record per group of access-log rows.

    Groups are keyed by ``autonomous_system`` (optionally with
    ``error_id``). For every group it counts hits and errors, sums the bytes
    sent and, while fewer than ``mbps_count_limit`` samples have been
    collected, samples ``bytes_sent / request_time`` scaled by
    ``1024 * 1024`` for the percentile computation.
    """

    def __init__(self, date, percentiles, total):
        """
        :param date: value written to the ``date`` field of every output.
        :param percentiles: percentile ranks passed to ``np.percentile``.
        :param total: when truthy, used as ``error_id`` of every output
            record; otherwise ``error_id`` is taken from the group key.
        """
        self.date = date
        self.percentiles = percentiles
        self.total = total

    def __call__(self, groups):
        """Yield one ``Record`` per ``(key, records)`` pair in ``groups``."""
        for key, records in groups:

            errors_count = 0
            hits = 0
            sum_bytes = 0
            # A plain list: np.append copies the whole array on every call,
            # which made the sampling loop quadratic.
            mbps = []

            for record in records:
                request_time = record['request_time']
                bytes_sent = record['bytes_sent']

                if request_time is None or bytes_sent is None:
                    continue

                parsed_parameters = record['parsed_parameters']
                if parsed_parameters and parsed_parameters.get('error_id'):
                    # Errors are counted separately and are not hits.
                    errors_count += 1
                    continue

                if request_time != 0 and len(mbps) < mbps_count_limit:
                    # Cast before dividing so integer inputs are not
                    # truncated by Python 2 integer division.
                    mbps.append(
                        float(bytes_sent) / (request_time * 1024 * 1024))

                sum_bytes += bytes_sent
                hits += 1

            mbps_percentiles = []

            # Percentiles are only reported for samples that stayed below
            # the collection limit.
            if len(mbps) > 0 and len(mbps) < mbps_count_limit - 1:
                mbps_percentiles = [
                    round(x, 2) for x in np.percentile(mbps, self.percentiles)]

            if hits == 0:
                errors_percent = 100 if errors_count else 0
            else:
                errors_percent = round(100 * float(errors_count) / hits, 2)

            if self.total:
                error_id = self.total
            else:
                error_id = key.get('error_id')

            yield Record(date=self.date,
                         autonomous_system=key.get('autonomous_system'),
                         mbps_percentiles=mbps_percentiles,
                         sum_mbytes=sum_bytes / (1024 * 1024),
                         hits=hits,
                         error_id=error_id,
                         errors_count=errors_count,
                         errors_percent=errors_percent,
                         # Bytes per second averaged over 86400 seconds.
                         mbps_all_avg=round(
                             (float(sum_bytes) / 86400) / (1024 * 1024), 2)
                         )


class Reducer_network_stats(object):
    """Reducer producing one statistics record per network group.

    Groups are keyed by ``network`` (optionally with ``error_id``). For
    every group it counts hits and errors and sums the bytes sent.
    """

    def __init__(self, date, total):
        """
        :param date: value written to the ``date`` field of every output.
        :param total: when truthy, used as ``error_id`` of every output
            record; otherwise ``error_id`` is taken from the group key.
        """
        self.date = date
        self.total = total

    def __call__(self, groups):
        """Yield one ``Record`` per ``(key, records)`` pair in ``groups``."""
        for key, records in groups:

            errors_count = 0
            hits = 0
            sum_bytes = 0

            for record in records:
                if record['request_time'] is None or record['bytes_sent'] is None:
                    continue

                parsed_parameters = record['parsed_parameters']
                if parsed_parameters and parsed_parameters.get('error_id'):
                    # Errors are counted separately and are not hits.
                    errors_count += 1
                    continue

                sum_bytes += record['bytes_sent']
                hits += 1

            if hits == 0:
                errors_percent = 100 if errors_count else 0
            else:
                errors_percent = round(100 * float(errors_count) / hits, 2)

            if self.total:
                error_id = self.total
            else:
                error_id = key.get('error_id')

            yield Record(date=self.date,
                         network=key.get('network'),
                         sum_mbytes=sum_bytes / (1024 * 1024),
                         hits=hits,
                         error_id=error_id,
                         errors_count=errors_count,
                         errors_percent=errors_percent,
                         # Bytes per second averaged over 86400 seconds.
                         mbps_all_avg=round(
                             (float(sum_bytes) / 86400) / (1024 * 1024), 2)
                         )


def run_video_as_stats(date):
    """Reduce the per-AS table for ``date`` into the two report tables.

    One report groups by ``autonomous_system`` and labels every row
    ``'_total_'``; the other groups by ``error_id`` and
    ``autonomous_system`` and is written with an ``_error`` suffix. Both
    reductions are scheduled on a single job.

    :param date: date string used in the table paths.
    """
    as_stats_table = '$job_root/%s/as_stats' % date
    as_stats_report = '$job_root/%s/as_stats_report' % date

    # (group-by keys, reducer "total" label, report table suffix)
    variants = (
        (('autonomous_system',), '_total_', ''),
        (('error_id', 'autonomous_system'), '', '_error'),
    )

    job = cluster.job()

    for group_keys, total_label, suffix in variants:
        grouped = job.table(as_stats_table).groupby(*group_keys)
        reduced = grouped.reduce(
            Reducer_as_stats(date, percentiles, total_label),
            memory_limit=memory_limit)
        reduced.put(as_stats_report + suffix)

    job.run()


def run_video_network_stats(date):
    """Reduce the per-network table for ``date`` into the two report tables.

    One report groups by ``network`` and labels every row ``'_total_'``;
    the other groups by ``error_id`` and ``network`` and is written with an
    ``_error`` suffix. Both reductions are scheduled on a single job.

    :param date: date string used in the table paths.
    """
    networks_stats_table = '$job_root/%s/networks_stats' % date
    networks_stats_report = '$job_root/%s/networks_stats_report' % date

    # (group-by keys, reducer "total" label, report table suffix)
    variants = (
        (('network',), '_total_', ''),
        (('error_id', 'network'), '', '_error'),
    )

    job = cluster.job()

    for group_keys, total_label, suffix in variants:
        grouped = job.table(networks_stats_table).groupby(*group_keys)
        reduced = grouped.reduce(
            Reducer_network_stats(date, total_label),
            memory_limit=memory_limit)
        reduced.put(networks_stats_report + suffix)

    job.run()


def pub_video_as_stats(date):
    """Upload the per-AS report tables for ``date`` to ``Morda/Strm/AS``.

    Rows of ``as_stats_report`` and ``as_stats_report_error`` are converted
    to plain dicts; every configured percentile becomes an ``mbps_p<N>``
    field (``0`` when the row has no percentiles) and a missing ``error_id``
    is published as ``'none'``.

    :param date: date string used in the table paths.
    """
    report_tables = [
        '$job_root/%s/as_stats_report' % date,
        '$job_root/%s/as_stats_report_error' % date,
    ]

    data = []

    for table in report_tables:
        for item in cluster.job().table(table).read():
            data_item = {
                'fielddate': item['date'],
                'as': item['autonomous_system'],
                'hits': item['hits'],
                'errors_count': item['errors_count'],
                'error_id': item.get('error_id') or 'none',
                'errors_percent': item['errors_percent'],
                'sum_mbytes': item['sum_mbytes'],
                'mbps_all_avg': item['mbps_all_avg']
            }

            # Percentile values are stored positionally, in the same order
            # as the module-level ``percentiles`` list.
            measured = item['mbps_percentiles']
            for position, p in enumerate(percentiles):
                data_item['mbps_p' + str(p)] = (
                    measured[position] if len(measured) else 0)

            data.append(data_item)

    # NOTE(review): credentials are hard-coded in the source; they should
    # come from the environment or a secret store.
    stat_client = StatfaceClient(
        'robot_ivan-karev', 'oos4Fah2Ai', host='upload.stat.yandex-team.ru:443')
    report = stat_client.get_report('Morda/Strm/AS')

    # Uploading the report config from as_stats.yaml is currently disabled:
    # with open('as_stats.yaml') as config_fd:
    #     config = config_fd.read()
    # report.upload_config(config)
    report.upload_data('d', data)


def pub_video_networks_stats(date):
    """Upload the per-network report tables for ``date`` to
    ``Morda/Strm/Networks``.

    Rows of ``networks_stats_report`` and ``networks_stats_report_error``
    are converted to plain dicts; a missing ``error_id`` is published as
    ``'none'``.

    :param date: date string used in the table paths.
    """
    report_tables = [
        '$job_root/%s/networks_stats_report' % date,
        '$job_root/%s/networks_stats_report_error' % date,
    ]

    data = []

    for table in report_tables:
        for item in cluster.job().table(table).read():
            data.append({
                'fielddate': item['date'],
                'network': item['network'],
                'hits': item['hits'],
                'errors_count': item['errors_count'],
                'error_id': item.get('error_id') or 'none',
                'errors_percent': item['errors_percent'],
                'sum_mbytes': item['sum_mbytes'],
                'mbps_all_avg': item['mbps_all_avg']
            })

    # NOTE(review): credentials are hard-coded in the source; they should
    # come from the environment or a secret store.
    stat_client = StatfaceClient(
        'robot_ivan-karev', 'oos4Fah2Ai', host='upload.stat.yandex-team.ru:443')
    report = stat_client.get_report('Morda/Strm/Networks')

    # Uploading the report config from networks_stats.yaml is currently
    # disabled:
    # with open('networks_stats.yaml') as config_fd:
    #     config = config_fd.read()
    # report.upload_config(config)
    report.upload_data('d', data)


# In[6]:

def clean_tables(date):
    """Remove every intermediate and report table created for ``date``.

    :param date: date string identifying the ``<job_root>/<date>`` directory.
    """
    table_suffixes = (
        'as_stats',
        'as_stats_report',
        'as_stats_report_error',
        'networks_stats',
        'networks_stats_report',
        'networks_stats_report_error',
    )

    for suffix in table_suffixes:
        cluster.driver.remove(job_root + '/%s/%s' % (date, suffix))


# In[7]:

def run(**kwargs):
    """Run the whole pipeline for one date and print the elapsed time.

    Builds, reduces and publishes the per-AS statistics, then does the same
    for the per-network statistics, and finally removes the tables created
    along the way.

    :param date: optional keyword argument, the date string to process;
        defaults to yesterday when missing or empty.
    """
    date = kwargs.get('date') or get_yesterday_date()

    start_ts = dt.now()

    get_video_as_stats(date)
    run_video_as_stats(date)
    pub_video_as_stats(date)

    get_video_networks(date)
    run_video_network_stats(date)
    pub_video_networks_stats(date)

    clean_tables(date)

    running_time = int((dt.now() - start_ts).total_seconds())
    # print() with a single argument behaves the same on Python 2 and 3;
    # divmod keeps minutes integral regardless of the division semantics.
    print('Done! Running time %02d:%02d' % divmod(running_time, 60))


def get_date(s):
    """Parse the first ``YYYY-MM-DD`` occurrence in ``s`` into a date.

    :param s: string to search, e.g. a table path.
    :return: ``datetime.date``, or ``None`` when ``s`` is not a string,
        contains no such pattern, or the match is not a valid date.
    """
    try:
        found = re.search(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', s)
        # ``found`` is None when nothing matched; the AttributeError from
        # ``.group`` is handled below.
        return dt.strptime(found.group(0), '%Y-%m-%d').date()
    except (ValueError, TypeError, AttributeError):
        return None


# In[ ]:
if __name__ == '__main__':
    # Script entry point. With both --from and --to the given range is
    # processed; otherwise every access-log date newer than the last date
    # already present in the Morda/Strm/Networks report is processed.
    #
    # No ``global cluster`` here: at module level the assignment below
    # already rebinds the module global, and the statement is a SyntaxError
    # on Python 3.6+ because ``cluster`` is assigned earlier in the module.
    parser = argparse.ArgumentParser()
    parser.add_argument('--from')
    parser.add_argument('--to')
    args = parser.parse_args()

    # 'from' is a keyword, so the attribute has to be read via getattr.
    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')

    cluster = clusters.yt.Hahn(token=os.environ['YT_TOKEN']).env(
        templates=dict(
            job_root=job_root
        ),
        parallel_operations_limit=parallel_operations_limit
    )
    if from_ and to_:
        dates_to_process = date_range(from_, to_)
    else:
        report = 'Morda/Strm/Networks'
        # NOTE(review): credentials are hard-coded in the source; they
        # should come from the environment or a secret store.
        headers = {
            'StatRobotUser': 'robot_ivan-karev',
            'StatRobotPassword': 'oos4Fah2Ai'
        }
        print('getting dates from report')
        # NOTE(review): verify=False disables TLS certificate verification
        # while the credentials above are sent in the request headers.
        req = requests.get(
            'https://upload.stat.yandex-team.ru/{}?network=_in_table_&error_id=_total_&_type=json'.format(
                report),
            headers=headers, verify=False
        )
        print('parsing response')
        # Newest first, so values[0] carries the last published date.
        values = sorted(
            req.json()['values'], key=lambda x: x['fielddate'], reverse=True
        )

        last_date = get_date(
            values[0]['fielddate'].split(' ')[0]
        )
        print('last date: {}'.format(last_date))

        print('getting available dates...')
        # Parse every table path once and keep only those with a date.
        parsed_dates = (
            get_date(s) for s in cluster.driver.client.search(
                root='//statbox/strm-access-log', node_type="table"
            )
        )
        available_dates = sorted(d for d in parsed_dates if d)

        dates_to_process = [
            x for x in available_dates if x > last_date
        ]

    print('dates to process: {}'.format(dates_to_process))

    for date in dates_to_process:
        print('running for {}'.format(date))
        # format() with an empty spec is str(); for date objects this is
        # the ISO 'YYYY-MM-DD' form used in the table paths.
        run(date=format(date))
