#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import re
from collections import Counter, defaultdict
from datetime import datetime, timedelta
from os import getenv, environ

import yt.wrapper as yt
from nile.api.v1 import aggregators, clusters
from nile.api.v1.statface.client import StatfaceProductionClient
from nile.api.v1.statface.report import StatfaceReport
from qb2.api.v1 import extractors, filters


# Matches the longest prefix of a name that ends in a non-digit and is
# immediately followed by one or more digits and a lowercase letter,
# e.g. 'mpfs01f' -> 'mpfs'. Names without such a digits+letter tail do not match.
re_prj = re.compile(r'^.+[^0-9](?=[0-9]+[a-z])')

# Service names, in the form returned by fetch_service(), that are kept by the
# `filters.one_of('service', ...)` filter in generate_retries_table().
# Entries are kept in alphabetical order.
interesting_services = {
    'abook.mail.yandex.net',
    'api.social.yandex.ru',
    'apidb',
    'avatars-int.mds.yandex.net',
    'avatars.mds.yandex.net',
    'bazinga.disk.yandex.net',
    'blackbox.yandex.net',
    'c.yandex-team.ru',
    'cards2.yandex.net',
    'clck.qloud.disk.yandex.net',
    'comments.qloud.disk.yandex.net',
    'counters.disk.yandex.net',
    'dapi-sharpei.disk.yandex.net',
    'dataapi.disk.yandex.net',
    'dataapidb',
    'disk-producer.search.yandex.net',
    'disk-ratelimiter.deploy.disk.yandex.net',
    'disk.search.yandex.net',
    'djfs-api.qloud.disk.yandex.net',
    'docviewer-corp.qloud.disk.yandex.net',
    'docviewer-worker.qloud.disk.yandex.net',
    'docviewer.qloud.disk.yandex.net',
    'docviewer.qloud.dst.yandex.net',
    'downloader.disk.yandex.ru',
    'ds-sharpei-qa.mail.yandex.net',
    'ds-sharpei-test.mail.yandex.net',
    'ds-sharpei.disk.yandex.net',
    'ds-sharpei.dsp.yandex.net',
    'ds-sharpei.dst.yandex.net',
    'hahn.yt.yandex-team.ru',
    'hahn.yt.yandex.net',
    'imageparser.ape.yandex.net',
    'intapi.disk.yandex.net',
    'lbk.search.yandex.net',
    'lenta-loader.qloud.disk.yandex.net',
    'logbroker.yandex.net',
    'meta.mail.yandex.net',
    'mpfs.disk.yandex.net',
    'mulcagate',
    'new-msearch-proxy.mail.yandex.net',
    'notes.qloud.disk.yandex.net',
    'notifier.qloud.disk.yandex.net',
    'oauth-internal.yandex.ru',
    'passport-internal.yandex.ru',
    'push.yandex-team.ru',
    'push.yandex.ru',
    'ratelimiter',
    'ratelimiter.disk.yandex.net',
    'rtx.yandex.net',
    'smartcache.disk.yandex.net',
    'smc-sharpei.disk.yandex.net',
    'smcdb',
    'staff-api.yandex-team.ru',
    'storage-int.mds.yandex.net',
    'storage.mail.yandex.net',
    'storage.mds.yandex.net',
    'storage.yandex.net',
    'streaming.disk.yandex.net',
    'tvm-api-test.yandex.net',
    'tvm-api.yandex.net',
    'uaas.search.yandex.net',
    'uploader',
}


def fetch_service(message):
    """Extract a normalized service name from an HTTP client log message.

    The last whitespace-separated token of ``message`` is treated as the
    request URL. The host is the third ``/``-separated part of that token
    (the part right after ``scheme://``) with any query string and port
    stripped.

    Normalization, applied in this order:
      * hosts ending in ``hahn.yt.yandex.net``, ``logbroker.yandex.net`` or
        ``storage.yandex.net`` collapse to that suffix;
      * ``lbk-*.search.yandex.net`` hosts collapse to ``lbk.search.yandex.net``;
      * a purely numeric first label is dropped from the host;
      * if the (remaining) first label matches ``re_prj``, only the matched
        prefix is returned and the rest of the host name is discarded.

    :param message: log message text; may be empty or None.
    :return: the normalized service name, or ``''`` when the message is
        empty/blank or its last token has no host part.
    """
    # Empty, blank or missing messages have no last token; treat them the
    # same way as messages without a URL instead of raising IndexError.
    tokens = message.split() if message else []
    if not tokens:
        return ''

    url_parts = tokens[-1].split('/')
    if len(url_parts) < 3:
        return ''

    # 'host:port?query' -> 'host'
    hostname = url_parts[2].split('?')[0].split(':')[0]

    if hostname.endswith('hahn.yt.yandex.net'):
        return 'hahn.yt.yandex.net'
    if hostname.endswith('logbroker.yandex.net'):
        return 'logbroker.yandex.net'
    if hostname.startswith('lbk-') and hostname.endswith('.search.yandex.net'):
        return 'lbk.search.yandex.net'
    if hostname.endswith('storage.yandex.net'):
        return 'storage.yandex.net'

    labels = hostname.split('.')
    if labels[0].isdigit():
        # Drop a leading all-digit label and continue with the rest.
        labels = labels[1:]
        hostname = '.'.join(labels)
    first_label = labels[0] if labels else ''

    host_match = re_prj.search(first_label)
    if host_match:
        return host_match.group(0)
    return hostname


def generate_retries_table(yql_token, pwd, yesterday):
    """Build the per-day retries table on Hahn, or reuse it, and return its path.

    The result lives at ``<pwd>/requests_retries/<yesterday>``. If that table
    already exists it is returned as is. Otherwise each source log for the
    day is filtered down to Apache HTTP client utility records whose message
    contains 'HTTP' and whose extracted service is in
    ``interesting_services``, then counted per ``(tskv_format, service)``.
    The per-log counts are concatenated and summed into ``hits`` and
    ``retries`` columns.

    :param yql_token: token passed to the Hahn cluster object.
    :param pwd: YT directory under which the result table is stored.
    :param yesterday: day string used in both source and result table paths.
    :return: path of the result table.
    """
    result_table = pwd + ('/requests_retries/%s' % yesterday)
    if yt.exists(result_table):
        # Already computed for this day.
        return result_table

    log_names = (
        'ydisk-counters-api-log',
        'ydisk-dataapi-log',
        'ydisk-dataapi-worker-log',
        'ydisk-docviewer-web-log',
        'ydisk-download-stat-log',
        'ydisk-event-loader-log',
        'ydisk-fotki-log',
        'ydisk-grelka-log',
        'ydisk-lenta-loader-log',
        'ydisk-lenta-worker-log',
        'ydisk-log-reader-log',
        'ydisk-migrator-log',
        'ydisk-notes-log',
        'ydisk-notifier-log',
        'ydisk-queller-log',
        'ydisk-smartcache-client-log',
        'ydisk-smartcache-worker-log',
        'ydisk-uaas-log',
        'ydisk-uploader-balancer-log',
        'ydisk-uploader-log',
        'ydisk-urlshortener-log',
        'ydisk-videostreaming-log',
        'ydisk-webdav-log',
        'ydisk-worker2-log',
    )
    source_tables = []
    for log_name in log_names:
        source_tables.append('//home/logfeller/logs/%s/1d/%s' % (log_name, yesterday))

    job = clusters.Hahn(yql_token=yql_token).env(parallel_operations_limit=4).job()

    per_log_counts = []
    for table_path in source_tables:
        stream = job.table(
            table_path,
            weak_schema={'class': str, 'message': str, 'tskv_format': str},
        )
        http_client_records = stream.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_field('class').hide(),
                extractors.log_fields('tskv_format', 'message'),
                extractors.custom('service', fetch_service, 'message').add_hints(type=str),
            ],
            filters=[
                filters.or_(
                    filters.equals('class', 'r.y.mi.io.http.apache.v4.ApacheHttpClient4Utils'),
                    filters.equals('class', 'r.y.mi.io.http.apache.v4.ApacheHttpClientUtils'),
                ),
                filters.contains('message', 'HTTP'),
                filters.one_of('service', interesting_services),
            ],
        )
        # n: all matching records; r: those whose message contains 'HTTP_RETRY'.
        counts = http_client_records.groupby('tskv_format', 'service').aggregate(
            n=aggregators.count(),
            r=aggregators.count(predicate=filters.contains('message', 'HTTP_RETRY')),
        )
        per_log_counts.append(counts)

    # Merge the per-log partial counts into one row per (tskv_format, service).
    merged = job.concat(*per_log_counts)
    totals = merged.groupby('tskv_format', 'service').aggregate(
        hits=aggregators.sum('n'),
        retries=aggregators.sum('r'),
    )
    totals.put(result_table)
    job.run()
    return result_table


def read_table(table_name):
    """Read the retries table and append per-service totals.

    Every record of ``table_name`` is returned unchanged. In addition, one
    synthetic row per service is appended with ``tskv_format`` set to 'ALL'
    and ``retries``/``hits`` summed over all records of that service. A
    missing or falsy ``retries`` value counts as 0 in the sum.

    :param table_name: path of the YT table to read.
    :return: list of row dicts (original records followed by 'ALL' rows).
    """
    rows = []
    per_service = defaultdict(Counter)
    for row in yt.read_table(table_name, format=yt.YsonFormat()):
        rows.append(row)
        service_totals = per_service[row['service']]
        service_totals['retries'] += row.get('retries') or 0
        service_totals['hits'] += row['hits']

    for service, service_totals in per_service.items():
        rows.append({
            'tskv_format': 'ALL',
            'service': service,
            'retries': service_totals['retries'],
            'hits': service_totals['hits'],
        })
    return rows


def publish_stat(fielddate, table, username, password):
    """Publish ``table`` to the daily 'Disk/Admin/JavaRetries' Statface report.

    Each row of ``table`` is sent with an extra ``fielddate`` key set to the
    given value.

    :param fielddate: value stored in the ``fielddate`` field of every row.
    :param table: iterable of row dicts to publish.
    :param username: Statface user name.
    :param password: Statface password.
    """
    client = StatfaceProductionClient(username=username, password=password)
    daily_report = StatfaceReport().path('Disk/Admin/JavaRetries').scale('daily')
    bound_report = daily_report.client(client)
    dated_rows = (dict(fielddate=fielddate, **row) for row in table)
    bound_report.data(dated_rows).publish()
    print("Report published")


def redacted_environ():
    """Return a copy of the process environment with secrets masked.

    The value of any variable whose name contains 'TOKEN', 'PASSWORD' or
    'SECRET' (case-insensitive) is replaced with '***'. All other variables
    are copied unchanged.

    :return: plain dict mapping variable names to (possibly masked) values.
    """
    secret_markers = ('TOKEN', 'PASSWORD', 'SECRET')
    masked = {}
    for name, value in environ.items():
        is_secret = any(marker in name.upper() for marker in secret_markers)
        masked[name] = '***' if is_secret else value
    return masked


if __name__ == '__main__':
    # Dump the environment for debugging with credentials masked: the raw
    # environment holds DISK_STAT_YQLTOKEN and STAT_PASSWORD, which must not
    # end up in job logs.
    print(redacted_environ())
    yql_token = getenv("DISK_STAT_YQLTOKEN")
    stat_username = getenv("STAT_USER")
    stat_password = getenv("STAT_PASSWORD")
    pwd = getenv("JOB_ROOT")
    if not pwd:
        # Without JOB_ROOT the result table path cannot be built; fail with a
        # clear message instead of a TypeError from `None + str` later on.
        raise SystemExit('JOB_ROOT environment variable is not set')
    yt.config['token'] = yql_token
    yt.config.set_proxy("hahn.yt.yandex.net")
    # DAY_FOR_COUNT overrides the default of "yesterday" in local time.
    yesterday = getenv("DAY_FOR_COUNT") or str((datetime.now() - timedelta(1)).date())
    table_name = generate_retries_table(yql_token, pwd, yesterday)
    table = read_table(table_name)
    publish_stat(yesterday, table, stat_username, stat_password)
