#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import re
import sys
import urlparse
from os import getenv
from os import path as os_path
from urllib import unquote

from nile.api.v1 import aggregators, clusters
from nile.api.v1.datetime import Datetime, next_day
from qb2.api.v1 import extractors, filters

# Resolve the parent of the directory that contains this file and put it at
# the front of sys.path before the project imports below run.
# NOTE(review): presumably this is what lets the fallback
# `common.startrek` import resolve when the script runs standalone - confirm.
dir_path = os_path.abspath(os_path.join(__file__, '..', '..'))
sys.path.insert(0, dir_path)

try:
    from disk.admin.monitors.common.startrek import ErrorTicket  # noqa
except ImportError:
    from common.startrek import ErrorTicket  # noqa

from components import COMPONENTS  # noqa
from followers import FOLLOWERS  # noqa
from thresholds import THRESHOLDS  # noqa
from url_patterns import URI_PATTERNS  # noqa
from skipped_logs import SKIPPED_LOGS, SKIPPED_SUFFIXES  # noqa

# Matches hosts shaped like
#   <name>[-]<1-2 digits><letter>.[qloud.](disk|dsp|dst).yandex.(net|ru)[:port]
# Group 1 is the leading <name> part, without the numeric index, the letter,
# the domain or the port.
re_hostname = re.compile(r'^([a-z\-0-9]+?)\-?\d{1,2}[a-z]\.(?:qloud\.)?(?:disk|dsp|dst)\.yandex\.(?:net|ru)(?::\d+)?$')
# Matches "<digits>.<label>.disk.yandex.<rest>"; group 1 is everything after
# the leading numeric label.
re_numeric_node = re.compile(r'^[0-9]+\.([^.]+\.disk\.yandex\..+)$')
# Group 1 is the host up to and including ".yandex"; whatever follows the next
# dot is what tld_vhost() replaces with the "<TLD>" placeholder.
re_host_tld = re.compile(r'(.+\.yandex)\..+')


def api_request_normalizer(request):
    """Collapse an API request line into a generic, groupable path.

    Args:
        request: raw request string from the access log.

    Returns:
        ``None`` when *request* does not start with ``/``. Otherwise the
        URL-decoded path with variable parts replaced by placeholders; when
        the ``'ydisk-nginx-access-log-api'`` pattern from ``URI_PATTERNS``
        matches the result, only its first group followed by ``...`` is
        returned.
    """
    if not request.startswith('/'):
        return None

    # Applied strictly in this order; each rule works on the previous output.
    substitutions = (
        # numeric ids (optionally prefixed with "yaid-") between slashes
        (r'/(yaid-)?\d+/', '/<INT>/'),
        # 64-hex-digit resource ids, with optional "<digits>:" prefix and
        # ".<digits>" suffix
        (r'/(\d+:)?[0-9a-f]{64}(\.\d+)?/?', '/<RESOURCE_ID>/'),
        # drop the leading client segment in front of /v1 or /v2
        (r'^/\w+/v(1|2)', r'/v\1'),
    )

    normalized = unquote(urlparse.urlparse(request).path)
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    known_prefix = URI_PATTERNS['ydisk-nginx-access-log-api'].search(normalized)
    if not known_prefix:
        return normalized
    return '%s...' % known_prefix.group(1)


def get_handle(tskv_format, request, method):
    """Build the ``"<method>:<request>"`` handle used for grouping.

    For the ``'ydisk-nginx-access-log-api'`` format the request is passed
    through ``api_request_normalizer``; for any other format the first group
    of the matching ``URI_PATTERNS[tskv_format]`` pattern is used.

    Any exception raised while normalizing (unknown format, no match, ...)
    falls back to the raw request.

    NOTE: when the API normalizer returns ``None`` (request not starting with
    ``/``) the handle becomes ``"<method>:None"``.
    """
    is_api_log = tskv_format == 'ydisk-nginx-access-log-api'
    try:
        if is_api_log:
            handle_path = api_request_normalizer(request)
        else:
            match = URI_PATTERNS[tskv_format].search(request)
            handle_path = match.group(1)
        return "%s:%s" % (method, handle_path)
    except Exception:
        # Best effort: keep the record, grouped by its raw request.
        pass
    return "%s:%s" % (method, request)


def tld_vhost(vhost):
    """Replace what follows ``.yandex`` in *vhost* with ``.<TLD>``.

    Hosts that do not match ``re_host_tld`` are returned unchanged.
    """
    match = re_host_tld.search(vhost)
    return match.group(1) + '.<TLD>' if match else vhost


def groupped_vhost(vhost):
    """Reduce a concrete virtual host to the name used for grouping.

    Uses group 1 of ``re_hostname`` (tried first) or ``re_numeric_node`` when
    either matches; otherwise strips a trailing ``:<something>`` part if the
    host contains a colon. The result is then passed through ``tld_vhost``.
    """
    matched = re_hostname.search(vhost) or re_numeric_node.search(vhost)
    if matched:
        base = matched.group(1)
    else:
        # rsplit leaves the host untouched when it contains no colon.
        base = vhost.rsplit(':', 1)[0]
    return tld_vhost(base)


def count_errors_prc(status_5xx, total):
    """Return the share of 5xx responses as a percentage.

    Args:
        status_5xx: number of 5xx responses (falsy values count as none).
        total: total number of requests.

    Returns:
        A float in every branch, so the value always agrees with the
        ``float`` type hint the caller attaches to it: ``0.0`` when there are
        no 5xx responses, ``100.0`` when there are 5xx responses but *total*
        is falsy, otherwise ``status_5xx * 100.0 / total``.
    """
    if not status_5xx:
        return 0.0
    if not total:
        # Errors without any counted requests: treat as fully failing
        # instead of dividing by zero.
        return 100.0
    return status_5xx * 100.0 / total


def is_alert(record):
    """Decide whether an aggregated record should produce a ticket.

    Args:
        record: mapping-like row providing ``host``, ``handle``,
            ``tskv_format``, ``total`` and ``errors_prc``.

    Returns:
        ``True`` only when all of the following hold:

        * the log format is not in ``SKIPPED_LOGS`` and does not end with any
          of ``SKIPPED_SUFFIXES``;
        * at least 10 requests were counted;
        * ``THRESHOLDS`` has an entry for ``"<host>:<handle>"`` and the error
          percentage is not below it;
        * the host is not a ``.dsp.`` / ``.dst.`` / ``.qloud-c.`` host, not
          made of digits and dots only, and not ``localhost``.
    """
    vhost = record['host']
    log_format = record['tskv_format']
    if log_format in SKIPPED_LOGS:
        return False
    if any(log_format.endswith(suffix) for suffix in SKIPPED_SUFFIXES):
        return False
    if record['total'] < 10:
        return False
    try:
        threshold = THRESHOLDS["%s:%s" % (vhost, record['handle'])]
        if record['errors_prc'] < threshold:
            return False
    except Exception:
        # No usable threshold for this host/handle: stay silent. Catching
        # Exception (not a bare except) keeps this best-effort without
        # swallowing KeyboardInterrupt / SystemExit.
        return False
    if '.dsp.' in vhost:
        return False
    if '.dst.' in vhost:
        return False
    # Digits and dots only, e.g. a bare IPv4 address.
    if vhost.replace('.', '').isdigit():
        return False
    if '.qloud-c.' in vhost:
        return False
    if vhost == 'localhost':
        return False
    return True


def alert(record):
    """Create an error ticket for one alerting (host, handle) record.

    Args:
        record: mapping-like row providing ``host``, ``handle``,
            ``errors_prc``, ``status_5xx``, ``total`` and ``tskv_format``.

    The ticket title is built from the host and the path part of the handle.
    Followers and components are looked up by ``tskv_format`` (an empty list
    when the format has no entry); ``'monitoring-tasks'`` is always appended
    to the components.

    Raises:
        IndexError: if the handle contains no ``:`` separator.
    """
    log_format = record['tskv_format']
    # A handle looks like "<METHOD>:<path>". Split on the first colon only:
    # the path itself may contain ':' and must not be cut short in the title.
    handle_path = record['handle'].split(':', 1)[1]
    ticket = ErrorTicket(
        u'Пятисотит ручка %s%s' % (record['host'], handle_path),
        followers=FOLLOWERS.get(log_format, []),
    )
    ticket.create(
        description=u'* host: %(host)s\n'
                    u'* Ручка: %(handle)s\n'
                    u'* Процент 5xx: %(errors_prc)s\n'
                    u'* Количество 5xx: %(status_5xx)s\n'
                    u'* Всего запросов: %(total)s\n'
                    u'* tskv_format: %(tskv_format)s\n'
                    % record,
        components=COMPONENTS.get(log_format, []) + ['monitoring-tasks'],
    )


def generate_table(yqltoken, pwd):
    """Build yesterday's 5xx report table and return its rows.

    For each source access log the job groups requests by host, handle and
    log format, counts the total and the responses whose status starts with
    ``5``, derives the error percentage and keeps only groups with a nonzero
    percentage. The per-source results are concatenated, sorted by
    ``errors_prc`` and stored under ``<pwd>/nginx_errors/<yesterday>``.

    Args:
        yqltoken: token passed to the Hahn cluster as ``yql_token``.
        pwd: path prefix under which the result table is stored.

    Returns:
        Whatever ``read()`` of the stored table yields.
    """
    report_date = next_day(Datetime.now(), offset=-1)
    output_path = pwd + '/nginx_errors/%s' % report_date

    log_roots = [
        '//home/logfeller/logs/ydisk-nginx-access-log/1d',
        '//home/logfeller/logs/disk-front-access-log/1d',
        '//home/logfeller/logs/ydisk-nginx-access-front-auth-log/1d',
        '//home/logfeller/logs/ydisk-nginx-access-public-log/1d',
        '//home/logfeller/logs/ydisk-nginx-access-docfront-log/1d',
        '//home/logfeller/logs/mpfs-access-log/1d',
    ]
    # Allow as many parallel operations as there are source logs.
    cluster = clusters.Hahn(yql_token=yqltoken)
    job = cluster.env(parallel_operations_limit=len(log_roots)).job()

    per_source_streams = []
    for log_root in log_roots:
        daily_log = job.table(
            "%s/%s" % (log_root, report_date),
            weak_schema=dict(
                status=str,
                tskv_format=str,
                method=str,
                request=str,
                vhost=str,
            ),
        )
        parsed = daily_log.qb2(
            log='generic-yson-log',
            fields=[
                extractors.log_fields('status', 'tskv_format'),
                # Needed only as inputs for the derived fields below.
                extractors.log_fields('method', 'request', 'vhost').hide(),
                extractors.custom('host', groupped_vhost, 'vhost').add_hints(type=str),
                extractors.custom('handle', get_handle, 'tskv_format', 'request', 'method').add_hints(type=str),
            ],
            filters=[
                # Leave out the listed service requests and 507 responses.
                filters.not_(filters.or_(
                    filters.equals('request', '/ping'),
                    filters.equals('request', '/ping_pattern'),
                    filters.equals('request', '/timetail'),
                    filters.equals('handle', 'GET:/exec_pattern'),
                    filters.equals('status', '507'),
                ))
            ],
        )
        counted = parsed.groupby('host', 'handle', 'tskv_format').aggregate(
            status_5xx=aggregators.count(predicate=filters.startswith('status', '5')),
            total=aggregators.count()
        )
        with_ratio = counted.project(
            'host',
            'handle',
            extractors.custom('errors_prc', count_errors_prc, 'status_5xx', 'total').add_hints(type=float),
            'status_5xx',
            'total',
            'tskv_format',
        )
        per_source_streams.append(with_ratio.filter(filters.nonzero('errors_prc')))

    merged = job.concat(*per_source_streams)
    stored = merged.sort('errors_prc').put(output_path)
    job.run()
    return stored.read()


def alert_table(table):
    """Create a ticket for every record of *table* that qualifies as an alert.

    A failure to create one ticket is printed and does not stop the
    processing of the remaining records.
    """
    for row in table:
        if is_alert(row):
            try:
                alert(row)
            except Exception as err:
                print("Can't create ticket: alert %s, %s" % (row, err))


def main():
    """Build the report table and raise alerts from it.

    Reads the YQL token from ``DISK_STAT_YQLTOKEN`` and the output path
    prefix from ``JOB_ROOT``.
    """
    token = getenv("DISK_STAT_YQLTOKEN")
    job_root = getenv("JOB_ROOT")
    alert_table(generate_table(token, job_root))


# Run the monitor only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
