#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import json
import logging
import re
import sys
import urlparse
import time
from collections import Counter, defaultdict
from datetime import datetime, timedelta
from multiprocessing import Pool
from os import getenv
from os import path as os_path
import itertools
from urllib import unquote

import yt.wrapper as yt
from nile.api.v1.datetime import Datetime
from nile.api.v1 import clusters, aggregators, extractors as n_extractors
from qb2.api.v1 import extractors, filters

# Put the parent of the directory containing this file at the front of
# sys.path (presumably so the project-local imports below resolve -- confirm).
dir_path = '/'.join(os_path.abspath(__file__).split('/')[:-2])
sys.path = [dir_path] + sys.path

from common.html import html_table
from common.mailer import send_email
from common.startrek import ErrorTicket
from java_investigation import create_java_investigation_ticket
from mpfs_investigation import create_mpfs_investigation_ticket, mpfs_fast_idd
from mobile_investigation import create_mobile_investigation_ticket
from dtrace.dtrace.report import IDDTrace


# Finds "uid=<digits>" anywhere in a string.  Both surrounding character
# classes are optional, so they do not restrict what may precede or follow the
# match (for example "puid=1" still matches and yields "1").
re_fetch_uid = re.compile('[^a-z]?uid=(?P<uid>[0-9]+)[^0-9]?')
# Finds "etime=<digits>" anywhere in a string.
re_fetch_etime = re.compile('etime=(?P<etime>[0-9]+)')
# Presumably the prefix for links to YT paths in reports -- usage is not
# visible in this part of the file.
YT_LINK_PREFIX = u'https://yt.yandex-team.ru/hahn/navigation?path='
# Value passed as ``pool=`` to every clusters.Hahn(...) call in this file.
# POOL_NAME = 'disk-analytics'
POOL_NAME = None


def parse_uid(cookies):
    """Extract the value of the ``uid`` cookie from a raw cookie string.

    Only an occurrence preceded by a space (``' uid='``) is recognised; the
    value runs up to the next ``;``.  Returns ``''`` when there is no such
    cookie.
    """
    marker = ' uid='
    if marker not in cookies:
        return ''
    after_marker = cookies.split(marker)[1]
    value, _, _ = after_marker.partition(';')
    return value


# Kept for as long as the code has not been migrated to Python 3.
def str_to_timestamp(date_string, time_format="%Y-%m-%d"):
    """Convert a formatted date string into a Unix timestamp.

    ``date_string`` is parsed with ``time_format`` and the resulting naive
    datetime is interpreted by ``time.mktime``, i.e. in the local time zone.
    The result is a float.
    """
    parsed = datetime.strptime(date_string, time_format)
    return time.mktime(parsed.timetuple())


def fetch_uid(uri):
    """Return the digits of the first ``uid=<digits>`` found in ``uri``.

    ``'-'`` is returned when the module-level ``re_fetch_uid`` pattern does
    not match.
    """
    found = re_fetch_uid.search(uri)
    return found.group('uid') if found else '-'


def valid_ycrid(ycrid):
    """Tell whether ``ycrid`` holds a usable value.

    Falsy values and the placeholders ``"-"`` and ``"null"`` are rejected;
    anything else is accepted.
    """
    if ycrid and ycrid != "-" and ycrid != "null":
        return True
    return False


def generate_errors_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/all_errors`` table of 5xx responses for the given uids.

    If the table already exists it is reused and no job is run.  Otherwise
    every access log listed in ``sources`` (table named ``yesterday``) is
    reduced to rows whose ``status`` starts with ``5`` (``507`` excluded),
    whose ``request`` is not one of the service endpoints listed in the
    filter, and whose ``uid`` or ``fetch_uid`` is in ``uids``.  The per-log
    results are concatenated, sorted by ``uid``, ``fetch_uid``,
    ``iso_eventtime`` and written to the result table.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside each ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/all_errors'
    if yt_client().exists(result_table):
        return result_table

    sources = [
        '//home/logfeller/logs/ydisk-nginx-access-log/1d/%s' % yesterday,
        '//home/logfeller/logs/disk-front-access-log/1d/%s' % yesterday,
        '//home/logfeller/logs/ydisk-nginx-access-public-log/1d/%s' % yesterday,
        '//home/logfeller/logs/ydisk-nginx-access-docfront-log/1d/%s' % yesterday,
        '//home/logfeller/logs/ydisk-nginx-access-front-auth-log/1d/%s' % yesterday,
        '//home/logfeller/logs/ydisk-mpfs-access-log/1d/%s' % yesterday,
        '//home/logfeller/logs/ydisk-java-access-log/1d/%s' % yesterday,
    ]
    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).env(parallel_operations_limit=4).job()
    records = []
    for table in sources:
        # Fields common to every source; ``fetch_uid`` is the uid found in the
        # request string by fetch_uid().
        fields = [
            extractors.log_fields('status', 'tskv_format', 'iso_eventtime', 'request_time'),
            extractors.log_field('ycrid', '').add_hints(type=str),
            extractors.custom('fetch_uid', fetch_uid, 'request').add_hints(type=str),
        ]
        schema_fields = dict(
            status=str,
            tskv_format=str,
            ycrid=str,
            iso_eventtime=str,
            request_time=str,
        )
        # nginx and front logs: ``uid`` is parsed out of the cookies column;
        # the other logs provide a ``uid`` column directly.
        if 'nginx' in table or 'disk-front' in table:
            fields += [
                extractors.log_field('cookies').hide(),
                extractors.custom('uid', parse_uid, 'cookies').add_hints(type=str),
            ]
            schema_fields['cookies'] = str
        else:
            fields += [
                extractors.log_field('uid'),
            ]
            schema_fields['uid'] = str
        # The mpfs access log keeps the request in ``uri``; rename it so every
        # source exposes ``request``.
        if 'ydisk-mpfs-access-log' in table:
            fields.append(extractors.log_field('uri').rename('request'))
            schema_fields['uri'] = str
        else:
            fields.append(extractors.log_fields('request'))
            schema_fields['request'] = str

        # The java access log keeps the request id in ``rid``; rename it so
        # every source exposes ``request_id``.
        if 'ydisk-java-access-log' in table:
            fields.append(extractors.log_field('rid').rename('request_id'))
            schema_fields['rid'] = str
        else:
            fields.append(extractors.log_field('request_id'))
            schema_fields['request_id'] = str

        src_log = job.table(table, weak_schema=schema_fields)
        records.append(
            src_log.qb2(
                log='generic-yson-log',
                fields=fields,
                filters=[
                    # Drop service endpoints and status 507.
                    filters.not_(filters.or_(
                        filters.equals('request', '/ping'),
                        filters.equals('request', '/ping_pattern'),
                        filters.equals('request', '/timetail'),
                        filters.equals('request', '/exec_pattern'),
                        filters.startswith('request', '/z/'),
                        filters.startswith('request', '/unistat'),
                        filters.startswith('request', '/m/yasm/'),
                        filters.equals('status', '507'),
                    )),
                    filters.startswith('status', '5'),
                    # A row belongs to a user if either uid source matches.
                    filters.or_(
                        filters.one_of('uid', uids),
                        filters.one_of('fetch_uid', uids),
                    ),
                ],
            )
        )

    job.concat(*records).sort('uid', 'fetch_uid', 'iso_eventtime').put(result_table)
    job.run()
    return result_table


def generate_all_nginx_table(yql_token, pwd, yesterday, tables):
    """Build the ``<pwd>/all_nginx`` table of log rows matching known ycrids.

    If the table already exists it is reused and no job is run.  Otherwise the
    valid ``ycrid`` values (see ``valid_ycrid``) are collected from every
    record of every table in ``tables``, and each log in ``sources`` is
    filtered to rows whose ``ycrid`` is in that set.  The results are
    concatenated, sorted by ``ycrid`` and written to the result table.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside each ``1d`` log directory.
    :param tables: iterable of record collections; each record is indexed by
        ``'ycrid'``.
    :return: path of the result table.
    """
    result_table = pwd + '/all_nginx'
    if yt_client().exists(result_table):
        return result_table

    # NOTE(review): other functions in this file read ``ydisk-mpfs-access-log``;
    # confirm that ``mpfs-access-log`` is the intended path here.
    sources = [
        '//home/logfeller/logs/ydisk-nginx-access-log/1d/%s' % yesterday,
        '//home/logfeller/logs/mpfs-access-log/1d/%s' % yesterday,
    ]

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).env(parallel_operations_limit=len(sources)).job()

    # Unique, valid ycrids across all supplied tables.
    ycrids = {r['ycrid'] for table in tables for r in table if valid_ycrid(r['ycrid'])}

    records = []
    for table in sources:
        src_log = job.table(table)
        records.append(
            src_log.qb2(
                log='generic-yson-log',
                fields=[
                    extractors.log_field('ycrid', '').add_hints(type=str),
                    extractors.all(),
                ],
                filters=[
                    filters.one_of('ycrid', ycrids),
                ],
            )
        )

    job.concat(*records).sort('ycrid').put(result_table)
    job.run()
    return result_table


def urldecode(text):
    """Percent-decode ``text`` and decode the result as UTF-8.

    Any failure along the way yields the input unchanged.
    """
    try:
        decoded = unquote(text).decode('utf8')
    except Exception:
        return text
    return decoded


def parse_parameters(args):
    """Parse a URL query string into a dict mapping names to lists of values."""
    parsed = urlparse.parse_qs(args)
    return parsed


def generate_front_errorPages_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/front_errorPages`` table of front ``errorPage`` events.

    If the table already exists it is reused and no job is run.  Otherwise the
    ``disk-front-access-log`` table for ``yesterday`` is filtered to rows whose
    ``event`` parameter equals ``errorPage`` and whose ``puid`` is in ``uids``.
    The output keeps ``reason``, ``iso_eventtime``, ``referer``, ``vhost`` and
    ``args``, exposes ``puid`` as ``uid`` and passes ``error`` through
    ``urldecode``; it is sorted by ``uid``, ``iso_eventtime``, ``reason``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/front_errorPages'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table('//home/logfeller/logs/disk-front-access-log/1d/%s' % yesterday)
    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields('iso_eventtime', 'referer', 'vhost', 'args', 'puid'),
                # ``args`` parsed as a query string; presumably the source the
                # ``parameter`` extractors below read from -- confirm against
                # the qb2 documentation.
                extractors.custom('parsed_parameters', parse_parameters, 'args').hide(),
                extractors.parameter('event'),
                extractors.parameter('reason'),
                extractors.parameter('error'),
            ],
            filters=[
                filters.equals('event', 'errorPage'),
                filters.one_of('puid', uids),
            ],
        )
        .project(
            'reason', 'iso_eventtime', 'referer', 'vhost', 'args',
            uid='puid',
            error=extractors.custom('error', urldecode, 'error').add_hints(type=str),
        )
        .sort('uid', 'iso_eventtime', 'reason')
        .put(result_table)
    )
    job.run()
    return result_table


def parse_uid_front(uri, http_headers):
    """Determine the uid of a front request.

    The uid found in ``uri`` by ``fetch_uid`` wins.  When there is none,
    ``http_headers`` is parsed as JSON and its ``X-Uid`` entry is used;
    ``'-'`` is returned if the JSON cannot be parsed or has no such entry.
    """
    uid = fetch_uid(uri)
    if uid == '-':
        try:
            parsed_headers = json.loads(http_headers)
        except Exception:
            # Unparseable headers: keep the '-' placeholder.
            pass
        else:
            uid = parsed_headers.get('X-Uid', '-')
    return uid


def generate_front_bad_requests_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/front_bad_requests`` table of failed front-node requests.

    If the table already exists it is reused and no job is run.  Otherwise the
    ``ydisk-front-node-log`` table for ``yesterday`` is filtered to rows whose
    ``uid`` (see ``parse_uid_front``) is in ``uids`` and whose ``http_status``
    does not start with ``1``, ``2`` or ``3``; rows whose status starts with
    ``0`` are also dropped when ``http_url`` points at one of the two uaas
    hosts listed in the filter.  The result is sorted by ``uid``,
    ``iso_eventtime``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/front_bad_requests'
    if yt_client().exists(result_table):
        return result_table

    weak_schema = dict(
        iso_eventtime=str,
        http_url=str,
        http_status=str,
        http_headers=str,
    )
    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table('//home/logfeller/logs/ydisk-front-node-log/1d/%s' % yesterday, weak_schema=weak_schema)
    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields(*weak_schema.keys()),
                extractors.custom('uid', parse_uid_front, 'http_url', 'http_headers').add_hints(type=str),
                # Every remaining column is passed through as is.
                extractors.all(exclude=weak_schema.keys()),
            ],
            filters=[
                filters.not_(
                    filters.or_(
                        filters.startswith('http_status', '1'),
                        filters.startswith('http_status', '2'),
                        filters.startswith('http_status', '3'),
                        # Status "0..." is ignored only for the uaas hosts.
                        filters.and_(
                            filters.startswith('http_status', '0'),
                            filters.or_(
                                filters.startswith('http_url', 'http://uaas.search.yandex.net'),
                                filters.startswith('http_url', 'http://uaas.localhost:12701'),
                            ),
                        ),
                    ),
                ),
                filters.one_of('uid', uids),
            ],
        )
        .sort('uid', 'iso_eventtime')
        .put(result_table)
    )
    job.run()
    return result_table


def read_table(table_name):
    """Read every record of the YT table ``table_name`` into a list.

    Records are fetched through ``yt_client()`` in YSON format.
    """
    return list(yt_client().read_table(table_name, format=yt.YsonFormat()))


def parse_uid_operation(message):
    """Extract the part before ``:`` that follows ``'created operation '``.

    Returns ``'-'`` when the message does not contain the marker.
    """
    marker = 'created operation '
    if marker not in message:
        return '-'
    operation_id = message.split(marker)[1]
    uid, _, _ = operation_id.partition(':')
    return uid


def parse_operation_name(message):
    """Extract the text between ``'.handle_operation '`` and ``' (processed: '``.

    Returns ``'-'`` when either marker is missing.
    """
    prefix = '.handle_operation '
    suffix = ' (processed: '
    if prefix in message:
        tail = message.split(prefix)[1]
        if suffix in tail:
            return tail.split(suffix)[0]
    return '-'


def parse_operation_lifetime(message):
    """Extract the number that follows ``'lifetime: '`` in ``message``.

    The first whitespace-delimited token after the marker is converted to
    ``float``.  ``None`` is returned when the marker is absent, when nothing
    follows it, or when the token is not a valid number, so that one
    malformed log line cannot raise inside the extraction.
    """
    marker = 'lifetime: '
    if marker not in message:
        return None
    tokens = message.split(marker)[1].split()
    try:
        return float(tokens[0])
    except (IndexError, ValueError):
        # Empty tail or non-numeric token.
        return None


def generate_long_operations_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/long_operations`` table: top operations by lifetime per uid.

    If the table already exists it is reused and no job is run.  Otherwise two
    streams are read from the ``ydisk-mpfs-default-log`` table for
    ``yesterday``: "created operation" messages, which give the ``uid`` for a
    ``ycrid``, and completed ``handle_operation`` messages, which give the
    operation name and lifetime.  They are inner-joined by ``ycrid``; for each
    ``uid`` the 10 rows with the largest ``lifetime`` are kept, sorted by
    ``uid``, ``lifetime`` and written to the result table.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/long_operations'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table('//home/logfeller/logs/ydisk-mpfs-default-log/1d/%s' % yesterday, weak_schema=dict(
        message=str,
        ycrid=str,
        name=str,
    ))

    # (uid, ycrid) pairs taken from operation-creation messages of the
    # requested users.
    start_operation = (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.custom('uid', parse_uid_operation, 'message').add_hints(type=str),
                extractors.log_field('ycrid'),
                extractors.log_field('message').hide(),
                extractors.log_field('name').hide(),
            ],
            filters=[
                filters.equals('name', 'mpfs.core.operations.manager'),
                filters.contains('message', 'created operation '),
                filters.one_of('uid', uids),
            ]
        )
    )

    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_field('ycrid'),
                extractors.log_field('message').hide(),
                extractors.custom('operation', parse_operation_name, 'message').add_hints(type=str),
                extractors.custom('lifetime', parse_operation_lifetime, 'message').add_hints(type=float),
            ],
            filters=[
                # Completed handle_operation tasks, except those whose message
                # contains the "(office," variant.
                filters.contains('message', 'name: mpfs.core.job_handlers.operation.handle_operation'),
                filters.not_(filters.contains('message', 'mpfs.core.job_handlers.operation.handle_operation (office,')),
                filters.contains('message', 'task_status: OK, oper_state: 6, oper_title: COMPLETED'),
            ],
        )
        .join(start_operation, by='ycrid', type='inner')
        .groupby('uid')
        .top(10, by='lifetime')
        .sort('uid', 'lifetime')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_long_requests_table(yql_token, pwd, yesterday, uids, source):
    """Build the ``<pwd>/<source>_long_requests`` table of slow requests per user.

    If the table already exists it is reused and no job is run.  Otherwise the
    ``ydisk-<source>-access-log`` table for ``yesterday`` is filtered to rows
    that belong to one of ``uids`` (by ``uid`` or by the uid found in the
    request), took at least 2 seconds and are not one of the excluded
    endpoints.  For every ``(uid, fetch_uid)`` pair the 10 rows with the
    largest ``request_time`` are kept, sorted by ``uid``, ``fetch_uid``,
    ``request_time`` and written to the result table.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :param source: log family; ``'mpfs'`` selects the mpfs column names
        (``uri``, ``request_id``), any other value the ``request``/``rid`` ones.
    :return: path of the result table.
    """
    result_table = pwd + "/%s_long_requests" % source
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    weak_schema = dict(
        uid=str,
        ycrid=str,
        tskv_format=str,
        iso_eventtime=str,
        method=str,
        status=str,
        request_time=float,
    )
    fields = [
        extractors.log_fields('uid', 'ycrid', 'tskv_format', 'iso_eventtime', 'method', 'status'),
        extractors.float_log_field('request_time'),
        extractors.custom('fetch_uid', fetch_uid, 'request').add_hints(type=str),
    ]
    # Normalise column names so both log families expose ``request`` and
    # ``request_id``.
    if source == 'mpfs':
        fields.append(extractors.log_field('uri').rename('request'))
        weak_schema['uri'] = str
        fields.append(extractors.log_field('request_id'))
        weak_schema['request_id'] = str
    else:
        fields.append(extractors.log_fields('request'))
        weak_schema['request'] = str
        fields.append(extractors.log_field('rid').rename('request_id'))
        weak_schema['rid'] = str
    src_log = job.table(
        '//home/logfeller/logs/ydisk-%s-access-log/1d/%s' % (source, yesterday),
        weak_schema=weak_schema,
    )
    (
        src_log.qb2(
            log='generic-yson-log',
            fields=fields,
            filters=[
                filters.not_(filters.or_(
                    filters.equals('request', '/ping'),
                    filters.equals('request', '/ping_pattern'),
                    filters.equals('request', '/timetail'),
                    filters.equals('request', '/exec_pattern'),
                    filters.startswith('request', '/z/'),
                    filters.startswith('request', '/unistat'),
                    filters.startswith('request', '/m/yasm/'),
                    # Skip uploader uploads
                    filters.and_(
                        filters.equals('tskv_format', 'ydisk-java-access-log-uploader'),
                        filters.equals('method', 'PUT'),
                        filters.startswith('request', '/upload-target/'),
                    ),
                    # Skip support admin
                    filters.and_(
                        filters.equals('tskv_format', 'ydisk-mpfs-access-log'),
                        filters.startswith('request', '/support/'),
                    ),
                    # Skip dv preview CHEMODAN-63458
                    filters.and_(
                        filters.equals('tskv_format', 'ydisk-java-access-log-web'),
                        filters.startswith('request', '/preview'),
                    ),
                    # Skip REST slow handles CHEMODAN-68786.  The two request
                    # patterns are alternatives: a request cannot start with
                    # both prefixes, so and-ing them would never match.
                    filters.and_(
                        filters.equals('tskv_format', 'ydisk-nginx-access-log-api'),
                        filters.or_(
                            filters.and_(
                                filters.startswith('request', '/v1/disk/clients/'),
                                filters.endswith('request', '/installer?build=beta'),
                            ),
                            filters.startswith('request', '/billing/process_receipt'),
                        ),
                    ),
                )),
                filters.or_(
                    filters.one_of('uid', uids),
                    filters.one_of('fetch_uid', uids),
                ),
                filters.compare('request_time', '>=', value=2.0),
            ]
        )
        .groupby('uid', 'fetch_uid')
        .top(10, by='request_time')
        .sort('uid', 'fetch_uid', 'request_time')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_mobile_crashes_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/mobile_crashes`` table of mobile crash events per user.

    If the table already exists it is reused and no job is run.  Otherwise
    crash events (``APIKey`` 18895, ``EventType`` ``EVENT_CRASH``) from the
    ``disk-metrika-mobile-log`` table for ``yesterday`` are inner-joined by
    ``DeviceID`` with the ``uid``/``DeviceID`` pairs of the requested ``uids``
    taken from ``//home/disk-stat/vertical_device_ids``.  The result is sorted
    by ``uid``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/mobile_crashes'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    # uid -> DeviceID pairs for the requested users.
    uid_device_id = job.table('//home/disk-stat/vertical_device_ids').qb2(
        log='generic-yson-log',
        fields=[extractors.log_fields('uid', 'DeviceID')],
        filters=[filters.one_of('uid', uids)],
    )

    src_log = job.table('//home/logfeller/logs/disk-metrika-mobile-log/1d/%s' % yesterday)
    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                # Used only by the filters below; hidden from the output.
                extractors.log_field('APIKey').hide(),
                extractors.log_field('EventType').hide(),
                extractors.log_fields(
                    'DeviceID',
                    'EventDateTime',
                    'EventName',
                    'EventID',
                    'AppPlatform',
                    'AppVersionName',
                    'AppBuildNumber',
                ),
            ],
            filters=[
                filters.equals('APIKey', 18895),
                filters.equals('EventType', 'EVENT_CRASH'),
            ]
        )
        # Note: ('DeviceID') is a plain string, not a one-element tuple.
        .join(uid_device_id, by=('DeviceID'), type='inner', assume_small_right=True)
        .sort('uid')
        .put(result_table)
    )
    job.run()
    return result_table


def extract_json_field(str_json):
    """Parse ``str_json`` as JSON; return ``None`` if parsing fails for any reason."""
    try:
        parsed = json.loads(str_json)
    except Exception:
        parsed = None
    return parsed

def generate_mobile_autoupload_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/mobile_autoupload`` table of slow Android autouploads.

    If the table already exists it is reused and no job is run.  Otherwise
    ``autoupload`` events of the requested ``uids`` are taken from the
    ``disk-metrika-mobile-log`` table for ``yesterday`` (see the filters for
    the exact conditions) and grouped by ``uid`` and ``identifier``.  Within a
    group the earliest ``EventDateTime`` becomes ``queueTime`` and the latest
    ``uploadTime``; their difference in seconds is stored as
    ``time_to_process_seconds``.  Only groups above 120 seconds are kept,
    sorted by ``uid``, ``time_to_process_seconds``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory; also parsed
        by ``str_to_timestamp`` for the ``StartDate`` filter.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/mobile_autoupload'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()

    src_log = job.table('//home/logfeller/logs/disk-metrika-mobile-log/1d/%s' % yesterday)
    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields(
                    'APIKey',
                    'AppBuildNumber',
                    'AppPlatform',
                    'StartDate',
                    'EventDateTime',
                    'EventName',
                    'EventValue',
                    'EventNumber',
                    'ReportEnvironment_Keys',
                    'ReportEnvironment_Values',
                ),
                extractors.log_field('AccountID').rename('uid'),
                # EventValue parsed as JSON (None when unparseable); the
                # dictitem extractors below pick individual keys from it.
                extractors.custom('EventValueJson', extract_json_field, 'EventValue').add_hints(type=dict).hide(),
                extractors.dictitem('event', 'EventValueJson').add_hints(type=str),
                extractors.dictitem('enqueued_on_appropriate_network', 'EventValueJson').add_hints(type=str),
                extractors.dictitem('identifier', 'EventValueJson').add_hints(type=str),
                extractors.dictitem('after_login', 'EventValueJson').add_hints(type=bool),
            ],
            filters=[
                filters.equals('APIKey', 18895),
                filters.equals('AppPlatform', 'android'),
                filters.equals('EventName', 'autoupload'),
                filters.equals('enqueued_on_appropriate_network', 'true'),
                filters.equals('after_login', True),
                filters.one_of('event', ('upload_finished', 'enqueued')),
                # str_to_timestamp() interprets the date in the local time zone.
                filters.equals('StartDate', int(str_to_timestamp(yesterday))),
                filters.one_of('uid', uids),
            ]
        )
        .groupby('uid', 'identifier')
        .aggregate(
            AppPlatform=aggregators.max('AppPlatform'),
            AppBuildNumber=aggregators.max('AppBuildNumber'),
            queueTime=aggregators.min('EventDateTime'),
            uploadTime=aggregators.max('EventDateTime'),
            EventValue=aggregators.max('EventValue'),
            ReportEnvironment_Keys=aggregators.max('ReportEnvironment_Keys'),
            ReportEnvironment_Values=aggregators.max('ReportEnvironment_Values'),
        )
        .project(
            n_extractors.all(),
            # Seconds between the earliest and the latest event of the group.
            time_to_process_seconds=n_extractors.custom(
                lambda qT, uT: int((Datetime.from_iso(uT) - Datetime.from_iso(qT)).total_seconds()),
                'queueTime', 'uploadTime',
            ).add_hints(type=int),
        )
        .filter(
            filters.compare('time_to_process_seconds', '>', value=2 * 60),
        )
        .sort('uid', 'time_to_process_seconds')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_metrika_table(yql_token, pwd, yesterday, uids):
    """Build the ``<pwd>/metrika_errors`` table of watch-log hits mentioning errors.

    If the table already exists it is reused and no job is run.  Otherwise the
    ``bs-watch-log`` table for ``yesterday`` is filtered to rows of the three
    listed counters whose ``passportuid`` (exposed as ``uid``) is in ``uids``
    and whose ``params`` contain the substring ``error``.  The result is
    sorted by ``uid``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param uids: collection of uids handed to ``filters.one_of``.
    :return: path of the result table.
    """
    result_table = pwd + '/metrika_errors'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table('//home/logfeller/logs/bs-watch-log/1d/%s' % yesterday)

    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_field('passportuid').rename('uid'),
                extractors.log_fields(
                    'counterid',
                    'iso_eventtime',
                    'params',
                    'url',
                    'referer',
                    'useragent',
                    'browserinfo',
                    'antivirusyes',
                    'clientip',
                    'domainzone',
                ),
            ],
            filters=[
                filters.one_of('counterid', {
                    '9618901',  # disk.yandex.ru
                    '15011071',  # yadi.sk
                    '15068284',  # docviewer.yandex.ru
                }),
                filters.one_of('uid', uids),
                # Plain substring match on the raw params value.
                filters.contains('params', 'error'),
            ]
        )
        .sort('uid')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_joined_mpfs_table(yql_token, pwd, yesterday, errors_table_name):
    """Build the ``<pwd>/mpfs_joined`` table: mpfs error-log rows for failed requests.

    If the table already exists it is reused and no job is run.  Otherwise the
    rows of ``errors_table_name`` whose ``tskv_format`` is
    ``ydisk-mpfs-access-log`` are inner-joined, by ``ycrid`` and
    ``request_id``, with the ``ydisk-mpfs-error-log`` table for ``yesterday``.
    The access-log ``iso_eventtime`` is carried as ``a.iso_eventtime`` so it
    does not clash with the error-log column of the same name.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param errors_table_name: path of the table with access-log errors.
    :return: path of the result table.
    """
    result_table = pwd + '/mpfs_joined'
    if yt_client().exists(result_table):
        return result_table

    # Raised table-writer limits for row and key weight.
    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).env(
        yt_spec_defaults=dict(
            job_io=dict(
                table_writer=dict(
                    max_row_weight=128 * 1024 * 1024,
                    max_key_weight=256 * 1024,
                )
            ),
        )
    ).job()
    access_log_errors = job.table(errors_table_name).qb2(
        log='generic-yson-log',
        fields=[
            extractors.log_fields('fetch_uid', 'request', 'request_id', 'request_time', 'uid', 'ycrid'),
            extractors.log_field('tskv_format').hide(),
            extractors.log_field('iso_eventtime').rename('a.iso_eventtime'),
        ],
        filters=[filters.equals('tskv_format', 'ydisk-mpfs-access-log')],
    )

    job.table('//home/logfeller/logs/ydisk-mpfs-error-log/1d/%s' % yesterday).qb2(
        log='generic-yson-log',
        fields=[
            extractors.log_fields(
                'iso_eventtime',
                'message',
                'pid',
                'request_id',
                'ycrid',
            ),
        ],
        intensity='large_data',
    ).join(access_log_errors, by=('ycrid', 'request_id'), type='inner', assume_small_right=True).put(result_table)
    job.run()
    return result_table


def generate_filtered_mpfs_errors_table(yql_token, pwd, yesterday, errors_table):
    """Write the mpfs access-log rows of ``errors_table`` to ``<pwd>/mpfs_filtered_errors``.

    Nothing is written when the table already exists.  ``yql_token`` and
    ``yesterday`` are accepted but not used.  Returns the result table path.
    """
    result_table = pwd + '/mpfs_filtered_errors'
    if not yt_client().exists(result_table):
        mpfs_rows = [
            row for row in errors_table
            if row['tskv_format'] == 'ydisk-mpfs-access-log'
        ]
        yt_client().write_table(result_table, mpfs_rows)
    return result_table


def parse_request_time(message):
    """Return the last whitespace-delimited token of ``message`` as a float.

    ``0.0`` is returned when the token is not a number, and also when the
    message is empty or contains only whitespace (there is no token to read).
    """
    try:
        return float(message.rsplit()[-1])
    except (IndexError, ValueError):
        # IndexError: no tokens at all; ValueError: last token is not numeric.
        return 0.0


def generate_requests_mpfs_table(yql_token, pwd, yesterday, timings_table_name):
    """Build the ``<pwd>/mpfs_requests`` table: mpfs requests-log rows for slow requests.

    If the table already exists it is reused and no job is run.  Otherwise the
    rows of ``timings_table_name`` whose ``tskv_format`` is
    ``ydisk-mpfs-access-log`` are inner-joined, by ``ycrid`` and
    ``request_id``, with the ``ydisk-mpfs-requests-log`` table for
    ``yesterday``.  The access-log ``iso_eventtime`` and ``request_time`` are
    carried with an ``a.`` prefix; ``request_time`` of the requests log is
    parsed from the end of ``message`` by ``parse_request_time``.  The result
    is sorted by ``ycrid``, ``request_id``, ``iso_eventtime``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param timings_table_name: path of the table with slow access-log rows.
    :return: path of the result table.
    """
    result_table = pwd + '/mpfs_requests'
    if yt_client().exists(result_table):
        return result_table

    # Raised table-writer limits for row and key weight.
    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).env(
        yt_spec_defaults=dict(
            job_io=dict(
                table_writer=dict(
                    max_row_weight=128 * 1024 * 1024,
                    max_key_weight=256 * 1024,
                )
            ),
        )
    ).job()
    access_log_timings = job.table(timings_table_name).qb2(
        log='generic-yson-log',
        fields=[
            extractors.log_fields('fetch_uid', 'request', 'uid', 'ycrid', 'request_id'),
            extractors.log_field('tskv_format').hide(),
            extractors.log_field('iso_eventtime').rename('a.iso_eventtime'),
            extractors.log_field('request_time').rename('a.request_time'),
        ],
        filters=[filters.equals('tskv_format', 'ydisk-mpfs-access-log')],
    )

    src_log = job.table('//home/logfeller/logs/ydisk-mpfs-requests-log/1d/%s' % yesterday)

    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields(
                    'ycrid', 'request_id', 'iso_eventtime', 'message', 'pid'
                ),
                extractors.custom('request_time', parse_request_time, 'message').add_hints(type=float),
            ],
            intensity='large_data',
        )
        .join(access_log_timings, by=('ycrid', 'request_id'), type='inner', assume_small_right=True)
        .sort('ycrid', 'request_id', 'iso_eventtime')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_max_requests_mpfs_table(yql_token, pwd, yesterday, mpfs_requests_table_name):
    """Build the ``<pwd>/mpfs_max_requests`` table: slowest row per request.

    If the table already exists it is reused and no job is run.  Otherwise,
    for every ``(ycrid, request_id)`` pair of ``mpfs_requests_table_name``,
    the single row with the largest ``request_time`` is kept.  The result is
    sorted by ``ycrid``, ``request_id``.  ``yesterday`` is accepted but not
    used.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param mpfs_requests_table_name: path of the source table.
    :return: path of the result table.
    """
    result_table = pwd + '/mpfs_max_requests'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table(mpfs_requests_table_name)

    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields('ycrid', 'request_id', 'request_time'),
                # All other columns are passed through unchanged.
                extractors.all(exclude=('ycrid', 'request_id', 'request_time')),
            ]
        )
        .groupby('ycrid', 'request_id')
        .top(1, by='request_time')
        .sort('ycrid', 'request_id')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_envoy_mpfs_table(yql_token, pwd, yesterday, mpfs_long_timings_table):
    """Build the ``<pwd>/mpfs_envoy_requests`` table of envoy rows for slow requests.

    If the table already exists it is reused and no job is run.  Otherwise the
    valid ``ycrid`` values (see ``valid_ycrid``) of ``mpfs_long_timings_table``
    are collected and the ``ydisk-envoy-log`` table for ``yesterday`` is
    filtered to rows with one of those ycrids.  The ``appname`` column is not
    included in the output.  The result is sorted by ``ycrid``,
    ``iso_eventtime``.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param mpfs_long_timings_table: iterable of records supporting ``.get``.
    :return: path of the result table.
    """
    result_table = pwd + '/mpfs_envoy_requests'
    if yt_client().exists(result_table):
        return result_table

    # Records without a ``ycrid`` key are skipped (``.get`` yields None).
    ycrids = set(r.get('ycrid') for r in mpfs_long_timings_table if valid_ycrid(r.get('ycrid')))

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table('//home/logfeller/logs/ydisk-envoy-log/1d/%s' % yesterday)

    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields('ycrid', 'iso_eventtime'),
                extractors.all(exclude=('ycrid', 'iso_eventtime', 'appname')),
            ],
            filters=[
                # The appname restriction is currently disabled.
                # filters.one_of('appname', ['disk_mpfs', 'disk_api']),
                filters.one_of('ycrid', ycrids),
            ],
        )
        .sort('ycrid', 'iso_eventtime')
        .put(result_table)
    )
    job.run()
    return result_table


def generate_java_nginx_access_table(yql_token, pwd, yesterday, timings_table):
    """Build the ``<pwd>/java_nginx_access_from_mpfs`` table of nginx rows by ycrid.

    If the table already exists it is reused and no job is run.  Otherwise the
    valid ``ycrid`` values (see ``valid_ycrid``) of ``timings_table`` are
    collected and the ``ydisk-nginx-access-log`` table for ``yesterday`` is
    filtered to rows with one of those ycrids.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside the ``1d`` log directory.
    :param timings_table: iterable of records indexed by ``'ycrid'``.
    :return: path of the result table.
    """
    result_table = pwd + '/java_nginx_access_from_mpfs'
    if yt_client().exists(result_table):
        return result_table

    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).job()
    src_log = job.table('//home/logfeller/logs/ydisk-nginx-access-log/1d/%s' % yesterday)

    # A set drops duplicate ycrids, matching the sibling table builders.
    ycrids = {r['ycrid'] for r in timings_table if valid_ycrid(r['ycrid'])}
    (
        src_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_field('ycrid'),
                # One-element tuple: ('ycrid') without the comma is a string.
                extractors.all(exclude=('ycrid',)),
            ],
            filters=[
                filters.one_of('ycrid', ycrids),
            ]
        ).put(result_table)
    )

    job.run()
    return result_table


def generate_java_log_from_table(yql_token, pwd, yesterday, src_table, result_table_name):
    """Collect application-log rows for the ycrids found in ``src_table``.

    If ``pwd + result_table_name`` already exists it is reused and no job is
    run.  Otherwise every row of ``src_table`` whose ``tskv_format`` starts
    with ``ydisk-nginx-access-log-`` or ``ydisk-java-access-log-`` and whose
    ``ycrid`` is valid contributes its ycrid and the ``tskv_format`` suffix.
    For each suffix the log ``ydisk-<suffix>-log`` for ``yesterday`` is read,
    if such a table exists, and filtered by the collected ycrids.  The results
    are concatenated, sorted by ``ycrid`` and written to the result table.

    When no source log exists nothing is written, but the path is still
    returned.

    :param yql_token: token passed to ``clusters.Hahn``.
    :param pwd: YT path under which the result table is created.
    :param yesterday: table name inside each ``1d`` log directory.
    :param src_table: iterable of records indexed by ``'tskv_format'`` and
        ``'ycrid'``.
    :param result_table_name: suffix appended to ``pwd`` as is, with no
        separator added.
    :return: path of the result table.
    """
    result_table = pwd + result_table_name
    if yt_client().exists(result_table):
        return result_table

    ycrids = set()
    tskv_format_suffixes = set()
    for row in src_table:
        for java_nginx_log_prefix in ('ydisk-nginx-access-log-', 'ydisk-java-access-log-'):
            if row['tskv_format'].startswith(java_nginx_log_prefix) and valid_ycrid(row['ycrid']):
                tskv_format_suffixes.add(row['tskv_format'][len(java_nginx_log_prefix):])
                ycrids.add(row['ycrid'])

    # Keep only the application logs that actually exist for that day.
    sources = []
    for tskv_format_suffix in tskv_format_suffixes:
        table_path = '//home/logfeller/logs/ydisk-%s-log/1d/%s' % (tskv_format_suffix, yesterday)
        if yt_client().exists(table_path):
            sources.append(table_path)

    if not sources:
        return result_table

    # Raised table-writer limits for row and key weight.
    job = clusters.Hahn(yql_token=yql_token, pool=POOL_NAME).env(
        yt_spec_defaults=dict(
            job_io=dict(
                table_writer=dict(
                    max_row_weight=128 * 1024 * 1024,
                    max_key_weight=256 * 1024,
                )
            ),
        )
    ).job()

    records = []
    for table in sources:
        src_log = job.table(table)
        records.append(
            src_log.qb2(
                log='generic-yson-log',
                fields=[
                    extractors.log_field('ycrid', '').add_hints(type=str),
                    extractors.all(exclude=('timestamp', 'unixtime')),
                ],
                filters=[
                    filters.one_of('ycrid', ycrids),
                ],
            )
        )

    job.concat(*records).sort('ycrid').put(result_table)
    job.run()
    # Return the path on the success path too, like every other exit and
    # every sibling generate_* function.
    return result_table


def aggregated_errors_table(t):
    """Count error rows per ``(uid, fetch_uid, status, tskv_format)``.

    Returns a list of dicts with those four keys plus ``total``, the number of
    rows of ``t`` sharing the combination.
    """
    counts = defaultdict(lambda: defaultdict(lambda: defaultdict(Counter)))
    for row in t:
        per_format = counts[row['uid']][row['fetch_uid']][row['status']]
        per_format[row['tskv_format']] += 1

    flattened = []
    for uid, by_fetch_uid in counts.items():
        for fetch_uid, by_status in by_fetch_uid.items():
            for status, per_format in by_status.items():
                for tskv_format, total in per_format.items():
                    flattened.append(dict(
                        uid=uid,
                        fetch_uid=fetch_uid,
                        tskv_format=tskv_format,
                        status=status,
                        total=total,
                    ))
    return flattened


def fetch_etime(uri):
    """Return the ``etime=<digits>`` value found in ``uri`` as an int.

    ``None`` is returned when the module-level ``re_fetch_etime`` pattern does
    not match.
    """
    found = re_fetch_etime.search(uri)
    return int(found.group('etime')) if found else None


def calc_duration(unixtime, etime):
    """Return ``int(unixtime)`` minus ``etime``."""
    moment = int(unixtime)
    return moment - etime


def fetch_user_agent(headers):
    """Extract the User-Agent value from a serialized headers string.

    The value starts right after ``'User-Agent: '`` and runs up to the first
    ``'}"'``; a closing ``'}'`` is appended to the returned text.  ``''`` is
    returned when the headers do not contain ``'User-Agent: '``.  The presence
    check uses the same marker as the split, so a header written without the
    trailing space no longer raises ``IndexError``.
    """
    marker = 'User-Agent: '
    if marker not in headers:
        return ''
    user_agent = headers.split(marker)[1]
    return user_agent.split('}"')[0] + '}'


def fetch_mob_platform(user_agent):
    """Map a ``Yandex.Disk`` user agent to ``'ios'`` or ``'android'``.

    ``'iOS'`` is checked before ``'android'``; both checks are case-sensitive.
    ``''`` is returned for any other user agent.
    """
    if 'Yandex.Disk' in user_agent:
        for marker, platform in (('iOS', 'ios'), ('android', 'android')):
            if marker in user_agent:
                return platform
    return ''


def html_errors_table(t, users):
    """Render error rows as an HTML table.

    Up to 20 rows are listed one by one; larger inputs are first collapsed by
    ``aggregated_errors_table`` and shown as counts.  The ``User`` column is
    looked up in ``users`` by ``uid``, falling back to ``fetch_uid`` and then
    to an empty string.
    """
    def user_of(r):
        return users.get(r['uid'], users.get(r['fetch_uid'], ''))

    if len(t) > 20:
        rows = aggregated_errors_table(t)
        columns = [
            ('User', user_of),
            'status',
            ('TotalErrors', 'total'),
            ('App', 'tskv_format'),
        ]
    else:
        rows = t
        columns = [
            ('User', user_of),
            'status',
            'iso_eventtime',
            'request',
            'tskv_format',
            'ycrid',
        ]
    return html_table(rows, columns)


def html_operations_table(t, users):
    """Build the long-operations table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` for the ``User`` column
        ('' when the uid is absent).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], '')

    columns = [('User', user_of), 'operation', 'lifetime', 'ycrid']
    return html_table(t, columns)


def html_timings_table(t, users):
    """Build the long-requests (timings) table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` and then by its
        ``fetch_uid`` for the ``User`` column ('' when neither is present).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], users.get(r['fetch_uid'], ''))

    columns = [('User', user_of)]
    columns.extend([
        'request_time',
        'request',
        'status',
        'method',
        'iso_eventtime',
        'ycrid',
        'tskv_format',
    ])
    return html_table(t, columns)


def html_mob_crashes_table(t, users):
    """Build the mobile-crashes table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` for the ``User`` column
        ('' when the uid is absent).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], '')

    columns = [
        ('User', user_of),
        'DeviceID',
        'EventDateTime',
        'EventName',
        'EventID',
        'AppPlatform',
        'AppVersionName',
        'AppBuildNumber',
    ]
    return html_table(t, columns)


def html_mob_autoupload_table(t, users):
    """Build the slow mobile-autoupload table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` for the ``User`` column
        ('' when the uid is absent).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], '')

    columns = [
        ('User', user_of),
        'time_to_process_seconds',
        'AppPlatform',
        'uploadTime',
        'queueTime',
        'identifier',
        'AppBuildNumber',
        'EventValue',
    ]
    return html_table(t, columns)


def html_metrika_table(t, users):
    """Build the Metrika error-pages table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` for the ``User`` column
        ('' when the uid is absent).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], '')

    columns = [
        ('User', user_of),
        'iso_eventtime',
        'counterid',
        'params',
        'url',
        'referer',
        'useragent',
        'browserinfo',
        'antivirusyes',
        'clientip',
        'domainzone',
    ]
    return html_table(t, columns)


def html_front_errorPages_table(t, users):
    """Build the front error-pages table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` for the ``User`` column
        ('' when the uid is absent).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], '')

    columns = [
        ('User', user_of),
        'iso_eventtime',
        'reason',
        'error',
        'args',
        'referer',
        'vhost',
    ]
    return html_table(t, columns)


def html_front_bad_requests_table(t, users):
    """Build the front bad-requests table via ``html_table``.

    :param t: rows handed to ``html_table`` unchanged.
    :param users: dict looked up by the row's ``uid`` for the ``User`` column
        ('' when the uid is absent).
    :return: the result of ``html_table``.
    """
    def user_of(r):
        return users.get(r['uid'], '')

    columns = [('User', user_of), 'iso_eventtime', 'http_url', 'http_status']
    return html_table(t, columns)


def get_sandbox_id():
    """Return the task id stored in the ``GSID`` environment variable.

    ``GSID`` is split on whitespace; the first token that starts with ``SB:``
    and has at least three colon-separated fields wins, and its third field is
    returned.

    :return: the id as a string, or ``''`` when ``GSID`` is unset or holds no
        usable ``SB:`` token.
    """
    for token in getenv('GSID', '').split():
        if not token.startswith('SB:'):
            continue
        fields = token.split(':')
        # A truncated token such as 'SB:TYPE' has no id field. Skip it rather
        # than raise IndexError: the id only decorates a link in the report.
        if len(fields) > 2:
            return fields[2]
    return ''


def generate_email_body(errors, operations, timings_mpfs, timings_java,
                        mob_crashes, mob_autoupload, metrika,
                        front_errorPages, front_bad_requests,
                        users):
    """Assemble the HTML report body from the per-topic tables.

    Every table argument is an indexable pair: item 0 is the table path that
    is appended to ``YT_LINK_PREFIX`` in the section link, item 1 holds the
    rows. A section with falsy rows is replaced by a short "nothing found"
    line.

    :param users: dict forwarded to the ``html_*_table`` renderers.
    :return: unicode string with the sections joined by ``u"\\n<br>"``.
    """
    body = [
        u"Таблицы ниже для ознакомительной информации.",
        u"По ссылкам в YT полей может быть больше",
        u"Таблица сгенерирована <a href='https://sandbox.yandex-team.ru/task/%s'>SANDBOX</a>" % get_sandbox_id(),
    ]

    # (table pair, renderer, section template, placeholder when empty),
    # listed in the order the sections appear in the report.
    sections = (
        (errors, html_errors_table,
         u"<h1><a href='%s%s'>5xx</a></h1>\n%s",
         u"No 5xx"),
        (timings_mpfs, html_timings_table,
         u"<h1><a href='%s%s'>TOP10 Long MPFS Requests</a></h1>\n%s",
         u"No long mpfs requests"),
        (timings_java, html_timings_table,
         u"<h1><a href='%s%s'>TOP10 Long Java Requests</a></h1>\n%s",
         u"No long java requests"),
        (operations, html_operations_table,
         u"<h1><a href='%s%s'>TOP10 Long Operations</a></h1>\n%s",
         u"No long operations"),
        (mob_crashes, html_mob_crashes_table,
         u"<h1><a href='%s%s'>Mobile crashes</a></h1>\n%s",
         u"No mobile crashes"),
        (mob_autoupload, html_mob_autoupload_table,
         u"<h1><a href='%s%s'>Mobile slow autoupload</a></h1>\n%s",
         u"No mobile slow autoupload"),
        (metrika, html_metrika_table,
         u"<h1><a href='%s%s'>Metrika error pages</a></h1>\n%s",
         u"No errors from Metrika"),
        (front_errorPages, html_front_errorPages_table,
         u"<h1><a href='%s%s'>Front error pages</a></h1>\n%s",
         u"No errorPages from front"),
        (front_bad_requests, html_front_bad_requests_table,
         u"<h1><a href='%s%s'>Front bad requests</a></h1>\n%s",
         u"No bad requests from front"),
    )

    for table, render, template, placeholder in sections:
        if table[1]:
            body.append(template % (YT_LINK_PREFIX, table[0], render(table[1], users)))
        else:
            body.append(placeholder)

    return u"\n<br>".join(body)


def generate_and_read_table(generate_func, generate_func_args):
    """Run a table generator and read back the table it produced.

    :param generate_func: callable returning the name of the result table.
    :param generate_func_args: positional arguments for ``generate_func``.
    :return: ``(table_name, rows)`` where ``rows`` is what ``read_table``
        returns; ``(None, None)`` if generating or reading raised, in which
        case the traceback is logged.
    """
    try:
        table_name = generate_func(*generate_func_args)
        return table_name, read_table(table_name)
    except Exception:
        logging.exception("FAIL: %s", generate_func.__name__)
        return None, None


def generate_table(generate_func, generate_func_args):
    """Run a table generator and return the name of the table it produced.

    :param generate_func: callable returning the name of the result table.
    :param generate_func_args: positional arguments for ``generate_func``.
    :return: whatever ``generate_func`` returns, or ``None`` if it raised
        (the traceback is logged).
    """
    try:
        return generate_func(*generate_func_args)
    except Exception:
        logging.exception("FAIL: %s", generate_func.__name__)
    return None


def yt_client():
    """Create a new ``yt.YtClient`` for the ``hahn`` proxy.

    The token comes from the module-level ``yql_token``, which is assigned in
    the ``__main__`` section of this script.
    """
    client = yt.YtClient(token=yql_token, proxy="hahn")
    return client


def dtrace(date, idd_yt_path, yql_token):
    """Build an ``IDDTrace`` report.

    The ``IDDTrace`` object is constructed outside the guarded section, so a
    failing constructor propagates to the caller.

    :return: the value of ``IDDTrace.report()``, or ``None`` when ``report()``
        raised (the traceback is logged).
    """
    tracer = IDDTrace(idd_yt_path, date, yql_token)
    try:
        return tracer.report()
    except Exception:
        logging.exception("Can't dtrace IDD ticket")
        return None


if __name__ == '__main__':
    # Script entry point: generates the per-day YT tables in a process pool,
    # publishes the summary as a DISKIDD ticket description and finally runs
    # the MPFS fast IDD step. All settings come from environment variables.

    # yql_token is also read as a module global by yt_client().
    yql_token = getenv("DISK_STAT_YQLTOKEN")
    pwd = getenv("JOB_ROOT")
    # UIDS is split on ',' and every item on '='; reversing the pair makes the
    # right-hand part the key - the html_* helpers look it up by row['uid'].
    # NOTE(review): an unset UIDS makes getenv() return None and this line
    # fail with AttributeError - confirm the variable is always provided.
    users = dict(i.split('=')[::-1] for i in getenv("UIDS").split(','))
    # Report day: DAY_FOR_COUNT if set, otherwise yesterday's date as a string.
    yesterday = getenv("DAY_FOR_COUNT") or str((datetime.now() - timedelta(1)).date())
    # The calendar tokens below are only consumed by the commented-out
    # investigation-ticket calls further down.
    java_calendar_private_token = getenv("JAVA_CALENDAR_PRIVATE_TOKEN")
    mpfs_calendar_private_token = getenv("MPFS_CALENDAR_PRIVATE_TOKEN")
    mobile_calendar_private_token = {
        'android': getenv("ANDROID_CALENDAR_PRIVATE_TOKEN"),
        'ios': getenv("IOS_CALENDAR_PRIVATE_TOKEN"),
    }
    # Per-day working directory: <JOB_ROOT>/vertical/<date with '-' -> '/'>.
    pwd = pwd + '/vertical/%s' % yesterday.replace('-', '/')
    if not yt_client().exists(pwd):
        yt_client().create("map_node", pwd, recursive=True)

    # NOTE(review): the pool is never close()d or join()ed in this block;
    # results are collected only through the explicit .get() calls below.
    job_pool = Pool(processes=4)

    # Stage 1: independent per-topic tables, all submitted up front.
    mob_autoupload_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_mobile_autoupload_table, (yql_token, pwd, yesterday, set(users.keys())))
    )

    timings_mpfs_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_long_requests_table, (yql_token, pwd, yesterday, set(users.keys()), 'mpfs'))
    )
    errors_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_errors_table, (yql_token, pwd, yesterday, set(users.keys())))
    )
    operations_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_long_operations_table, (yql_token, pwd, yesterday, set(users.keys())))
    )
    timings_java_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_long_requests_table, (yql_token, pwd, yesterday, set(users.keys()), 'java'))
    )
    metrika_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_metrika_table, (yql_token, pwd, yesterday, set(users.keys())))
    )
    mob_crashes_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_mobile_crashes_table, (yql_token, pwd, yesterday, set(users.keys())))
    )
    front_errorPages_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_front_errorPages_table, (yql_token, pwd, yesterday, set(users.keys())))
    )
    front_bad_requests_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_front_bad_requests_table, (yql_token, pwd, yesterday, set(users.keys())))
    )

    # Block on the MPFS timings and 5xx tables first: the follow-up jobs
    # submitted right below take them as input.
    timings_table_name_mpfs, timings_table_mpfs = timings_mpfs_result.get()
    errors_table_name, errors_table = errors_result.get()
    # NOTE(review): neither of the two results below is .get()-ed by the
    # active code (only by commented-out code), so nothing in this block
    # waits for these jobs to finish - confirm that this is intended.
    filtered_mpfs_errors_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_filtered_mpfs_errors_table, (yql_token, pwd, yesterday, errors_table))
    )
    joined_mpfs_result = job_pool.apply_async(
        generate_table,
        (generate_joined_mpfs_table, (yql_token, pwd, yesterday, errors_table_name))
    )
    # Collect the remaining stage-1 tables; a failed job yields (None, None).
    operations_table_name, operations_table = operations_result.get()
    timings_table_name_java, timings_table_java = timings_java_result.get()
    metrika_table_name, metrika_table = metrika_result.get()
    mob_crashes_table_name, mob_crashes_table = mob_crashes_result.get()
    mob_autoupload_table_name, mob_autoupload_table = mob_autoupload_result.get()
    front_errorPages_table_name, front_errorPages_table = front_errorPages_result.get()
    front_bad_requests_table_name, front_bad_requests_table = front_bad_requests_result.get()

    # Argument order follows the generate_email_body signature, which differs
    # from the order in which the results were collected above.
    email_body = generate_email_body(
        (errors_table_name, errors_table),
        (operations_table_name, operations_table),
        (timings_table_name_mpfs, timings_table_mpfs),
        (timings_table_name_java, timings_table_java),
        (mob_crashes_table_name, mob_crashes_table),
        (mob_autoupload_table_name, mob_autoupload_table),
        (metrika_table_name, metrika_table),
        (front_errorPages_table_name, front_errorPages_table),
        (front_bad_requests_table_name, front_bad_requests_table),
        users,
    )
    # Publish the first version of the report as the ticket description.
    # is_ticket_new is only consumed by the commented-out e-mail below.
    ticket = ErrorTicket(yesterday, queue='DISKIDD')
    is_ticket_new = ticket.is_can_be_created()
    ticket_key = ticket.create(description=u"<# %s #>" % email_body)
    # if is_ticket_new:
    #     send_email(
    #         getenv("RECIPIENT"),
    #         "%s IDD report %s" % (yesterday, ticket_key),
    #         (u'https://st.yandex-team.ru/%s\n\n<br>' % ticket_key) + email_body,
    #     )

    # Stage 2: Java logs derived from the 5xx and the long-Java-request rows.
    java_log_from_errors_result = job_pool.apply_async(
        generate_table,
        (generate_java_log_from_table, (yql_token, pwd, yesterday, errors_table, '/java_log_from_errors'))
    )
    java_log_from_long_result = job_pool.apply_async(
        generate_table,
        (generate_java_log_from_table, (yql_token, pwd, yesterday, timings_table_java, '/java_log_from_long'))
    )

    # joined_mpfs_table_name = joined_mpfs_result.get()

    all_nginx_log_result = job_pool.apply_async(
        generate_table,
        (generate_all_nginx_table, (yql_token, pwd, yesterday, (timings_table_mpfs, errors_table, timings_table_java)))
    )

    # logging.info('start dtrace')
    # dtrace_report = dtrace(yesterday, pwd, yql_token)

    # logging.info('create java investigation ticket')
    # java_ycrids = set()
    # for row in itertools.chain(errors_table or dict(), timings_table_java or dict()):
    #     item = row.get('ycrid')
    #     if item:
    #         java_ycrids.add(item)

    # create_java_investigation_ticket(ticket_key, java_calendar_private_token, yesterday, {
    #     'yt_prefix': YT_LINK_PREFIX,
    #     'errors_url': errors_table_name,
    #     'long_url': timings_table_name_java,
    #     'long_from_mpfs_url': pwd + '/java_nginx_access_from_mpfs',
    #     'java_log_from_errors': pwd + '/java_log_from_errors',
    #     'java_log_from_long': pwd + '/java_log_from_long',
    #     'java_log_from_long_mpfs': pwd + '/java_log_from_long_mpfs',
    #     'nginx_all_logs': pwd + '/all_nginx',
    #     'dtrace_report': dtrace_report.as_html(ycrids=java_ycrids),
    # })

    # logging.info('create mpfs investigation ticket')
    # filtered_mpfs_errors_table_name, filtered_mpfs_errors_table = filtered_mpfs_errors_result.get()
    # mpfs_ycrids = set()
    # for row in itertools.chain(filtered_mpfs_errors_table or dict(), timings_table_mpfs or dict()):
    #     item = row.get('ycrid')
    #     if item:
    #         mpfs_ycrids.add(item)

    # create_mpfs_investigation_ticket(ticket_key, mpfs_calendar_private_token, yesterday, {
    #     'yt_prefix': YT_LINK_PREFIX,
    #     'errors_url': filtered_mpfs_errors_table_name,
    #     'timings_url': timings_table_name_mpfs,
    #     'joined_errors_url': pwd + '/mpfs_joined',
    #     'joined_timings_url': pwd + '/mpfs_requests',
    #     'max_request_url': pwd + '/mpfs_max_requests',
    #     'mpfs_envoy_requests': pwd + '/mpfs_envoy_requests',
    #     'nginx_all_logs': pwd + '/all_nginx',
    #     'dtrace_report': dtrace_report.as_html(ycrids=mpfs_ycrids),
    # })

    # for mob_platform in {'ios', 'android'}:
    #     if mob_platform in {r.get('AppPlatform') for r in mob_autoupload_table}:
    #         create_mobile_investigation_ticket(
    #             mob_platform,
    #             'Photo upload deltas',
    #             ticket_key,
    #             mobile_calendar_private_token.get(mob_platform),
    #             yesterday,
    #             {
    #                 'yt_prefix': YT_LINK_PREFIX,
    #                 'table_url': mob_autoupload_table_name,
    #             },
    #         )
    #
    #     if mob_platform in {r.get('AppPlatform') for r in mob_crashes_table}:
    #         create_mobile_investigation_ticket(
    #             mob_platform,
    #             'Mobile crashes',
    #             ticket_key,
    #             mobile_calendar_private_token.get(mob_platform),
    #             yesterday,
    #             {
    #                 'yt_prefix': YT_LINK_PREFIX,
    #                 'table_url': mob_crashes_table_name,
    #             },
    #         )

    # Stage 3: tables derived from the long MPFS requests.
    requests_mpfs_result = job_pool.apply_async(
        generate_table,
        (generate_requests_mpfs_table, (yql_token, pwd, yesterday, timings_table_name_mpfs))
    )
    java_nginx_access_result = job_pool.apply_async(
        generate_and_read_table,
        (generate_java_nginx_access_table, (yql_token, pwd, yesterday, timings_table_mpfs))
    )

    # The max-requests job takes the requests table name as input, so wait
    # for that table before submitting it.
    requests_mpfs_table_name = requests_mpfs_result.get()
    max_requests_mpfs_result = job_pool.apply_async(
        generate_table,
        (generate_max_requests_mpfs_table, (yql_token, pwd, yesterday, requests_mpfs_table_name))
    )
    envoy_mpfs_result = job_pool.apply_async(
        generate_table,
        (generate_envoy_mpfs_table, (yql_token, pwd, yesterday, timings_table_mpfs))
    )
    java_nginx_access_table_name, java_nginx_access_table = java_nginx_access_result.get()
    java_log_from_long_mpfs_result = job_pool.apply_async(
        generate_table,
        (generate_java_log_from_table, (yql_token, pwd, yesterday, java_nginx_access_table, '/java_log_from_long_mpfs'))
    )
    # Append links to the two derived tables and refresh the description.
    # NOTE(review): when either job failed its table name is None, so the
    # link is rendered with the literal text 'None' as the path.
    email_body += u"\n<h1><a href='%s%s'>MPFS filtered requests.log</a></h1>" % (
        YT_LINK_PREFIX, requests_mpfs_table_name)
    email_body += u"\n<h1><a href='%s%s'>Java filtered nginx/access-tskv.log</a></h1>" % (
        YT_LINK_PREFIX, java_nginx_access_table_name)
    ticket.set_description(u"<# %s #>" % email_body)
    # Wait for the remaining background jobs. The two names assigned here are
    # not used afterwards; the .get() calls only block until the jobs finish.
    max_requests_mpfs_table_name = max_requests_mpfs_result.get()
    envoy_mpfs_table_name = envoy_mpfs_result.get()
    java_log_from_errors_result.get()
    java_log_from_long_result.get()
    java_log_from_long_mpfs_result.get()
    all_nginx_log_result.get()

    logging.info('run mpfs fast idd')
    mpfs_fast_idd(ticket_key, yesterday)

    logging.info('well done!')
