#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
The script processes access logs from ClickHouse and produces various monitoring signals
"""

from __future__ import print_function

import argparse
import copy
import json
import os
import requests
import sys
import time
import yaml
import logging

import direct_juggler.juggler as dj
import pandas as pd
from functools import wraps
from pandas.compat import StringIO
from numpy import dtype
from datetime import datetime as dt
from datetime import timedelta
from tabulate import tabulate

requests.packages.urllib3.disable_warnings() 

CH_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
CLICKHOUSE_USER = 'direct_reader'
SETTINGS = argparse.Namespace(
    until_datetime='now()',
    mon_slot=300,
    data_lag=120,
    juggler=False,
    verbose=False,
    debug=False,
    ya_env='development',
    clickhouse_conn_timeout=1,
    clickhouse_query_timeout=None,
    mon_conf_file='/etc/dt-slbmon/mon-default.yaml',
    clickhouse_table='directdb.nginx_access',
    clickhouse_url='https://ppchouse-cloud.direct.yandex.net:8443',
    solomon_url='http://solomon.yandex.net/api/v2/push?project={project}&cluster=ppcback&service=clh-nginx-access',
    apply=False,
    action='juggler',
    solomon_token_file='/etc/direct-tokens/solomon_robot-direct-slmn-p',
    clickhouse_token_file='/etc/direct-tokens/clickhouse_%s' % CLICKHOUSE_USER,
    clickhouse_user='%s' % CLICKHOUSE_USER,
    clickhouse_database='directdb',
)

SOLOMON_TOKEN = None

MON_CONF = {}

YA_ENV_TO_SOLOMON_PROJECT = {
    'development': 'direct-junk',
    'testing': 'direct-test',
    'prestable': 'direct-test',
    'production': 'direct',
}

# from https://a.yandex-team.ru/arc/trunk/arcadia/metrika/admin/python/mtutils/utils/__init__.py
# because retries via the requests HTTPAdapter don't work quite the way we want them to
def retry(exception_to_check,
          tries=5,
          delay=0.5,
          backoff=2,
          logger=None,
          debug=False,
          msg_format=None,
          exception_callback=None,
          limit_arguments_in_logger=True,
          ):
    """
    Retry calling the decorated function using an exponential backoff.

    Args:
        exception_to_check (Exception, tuple of Exception): the exception to check
        tries (int): number of times to try (not retry) before giving up
        delay (int): initial delay between retries in seconds
        backoff (int): multiplier, e.g. a value of 2 doubles the delay on each retry
        logger (logging.Logger): logger to use; if None, the root logger is used
        debug (bool): if True - logs all args and kwargs
        msg_format (str): log/debug message format
        exception_callback (func): function will be called with caught exception as first argument
        limit_arguments_in_logger (bool): restrict function arguments size when passing to logger
    """

    if logger is None:
        logger = logging.getLogger()

    if msg_format is None:
        if not debug:
            msg_format = 'Retrying in {mdelay} seconds...({mtries} tries left); {func_name}(*args_are_hidden, **kwargs_are_hidden)'
        else:
            msg_format = '{str_exc}, Retrying in {mdelay} seconds...({mtries} tries left); {func_name}(*{str_func_args}, **{str_func_kwargs})'

    def deco_retry(f):

        @wraps(f)
        def f_retry(*args, **kwargs):
            mtries, mdelay = tries, delay
            while mtries > 1:
                try:
                    return f(*args, **kwargs)
                except exception_to_check as e:
                    if exception_callback is not None:
                        exception_callback(e)

                    str_func_args = str(args) if not limit_arguments_in_logger else str(args)[:1000]
                    str_func_kwargs = str(kwargs) if not limit_arguments_in_logger else str(kwargs)[:1000]

                    msg = msg_format.format(
                        str_exc=str(e),
                        mdelay=mdelay,
                        func_name=f.__name__,
                        str_func_args=str_func_args,
                        str_func_kwargs=str_func_kwargs,
                        mtries=str(mtries - 1)
                    )
                    logger.warning(msg)

                    time.sleep(mdelay)
                    mtries -= 1
                    mdelay *= backoff
            return f(*args, **kwargs)

        return f_retry  # true decorator

    return deco_retry
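
# Illustrative usage of the decorator above (fetch_status is a hypothetical function):
#
#     @retry(exception_to_check=(requests.HTTPError, requests.exceptions.Timeout), tries=3)
#     def fetch_status(url):
#         resp = requests.get(url)
#         resp.raise_for_status()
#         return resp
#
# With the defaults (tries=5, delay=0.5, backoff=2) a persistently failing call sleeps
# 0.5, 1, 2 and 4 seconds between the five attempts, then lets the last exception
# propagate from the final, unguarded call.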


def hash_merge(*dict_args):
    result = {}
    for d in dict_args:
        for key, value in d.items():
            if isinstance(value, dict):
                result[key] = hash_merge(result.get(key, {}), value)
            else:
                result[key] = copy.deepcopy(value)
    return result
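
# Example of the deep-merge semantics (a quick sanity check, not used by the script):
#     hash_merge({'a': {'x': 1}}, {'a': {'y': 2}}, {'b': 3})
#     == {'a': {'x': 1, 'y': 2}, 'b': 3}
# nested dicts are merged recursively; for scalar values the later argument wins.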


def df_dumps(dataframe):
    return tabulate(dataframe, headers='keys')


def j_dumps(jdict, pretty=False):
    if pretty:
        return '\n' + json.dumps(jdict, sort_keys=True, ensure_ascii=False, indent=4, separators=(',', ': '))
    return json.dumps(jdict, sort_keys=True, ensure_ascii=False)
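
# For example (keys are sorted, non-ASCII is kept as-is):
#     j_dumps({'b': 1, 'a': 2})  ->  '{"a": 2, "b": 1}'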


def get_vhost_config(vhost, section):
    vhost_conf = MON_CONF.get(vhost, MON_CONF['_default'])
    return vhost_conf[section]
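
# A vhost without its own section in the config transparently falls back to the
# '_default' section, e.g. get_vhost_config('unknown.host', 'vhost_rps') (hypothetical
# vhost) returns MON_CONF['_default']['vhost_rps'].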


@retry(exception_to_check=(requests.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout))
def ch_select(query, url, schema=None):
    """
    :return: pandas DataFrame
    """
    global CLICKHOUSE_TOKEN
    auth = (SETTINGS.clickhouse_user, CLICKHOUSE_TOKEN)
    query += ' FORMAT CSVWithNames'
    # collapse newlines and whitespace runs into single spaces
    query = ' '.join(query.split())

    logging.info('Run query: ' + query)
    try:
        resp = requests.get(url, verify=False, params={'query': query}, auth=auth, timeout=(SETTINGS.clickhouse_conn_timeout, SETTINGS.clickhouse_query_timeout))
        resp.raise_for_status()
    except requests.HTTPError as e:
        logging.error('Clickhouse error: ' + resp.text)  # we want to see exactly what ClickHouse disliked
        raise

    dframe = pd.read_csv(StringIO(resp.text), dtype=schema)
    return dframe


@retry(exception_to_check=(requests.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout))
def push_to_solomon(url, sensors):
    global SOLOMON_TOKEN
    headers = {"Authorization": "OAuth " + SOLOMON_TOKEN}
    sensors = {'sensors': sensors}
    logging.debug('Solomon sensors are ready to push: ' + j_dumps(sensors))
    try:
        if SETTINGS.apply:
            resp = requests.post(url, json=sensors, headers=headers)
            resp.raise_for_status()
    except requests.HTTPError as e:
        logging.error('Solomon error: ' + resp.text)
        raise
    if SETTINGS.apply:
        logging.info("Solomon - %d sensors pushed to %s" % (len(sensors["sensors"]), url))
    else:
        logging.info("Solomon - use --apply to push %d sensors to %s" % (len(sensors["sensors"]), url))


def init_sensor(name, value, ts):
    labels = {'sensor': name}
    sensor = {
        'kind': 'DGAUGE',  # for push there is hardly any point in writing anything else
        'labels': labels,
        'value': value,
        'ts': ts
    }
    return sensor, labels
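
# For instance, init_sensor('rps', 2.4, ts='2018-11-25T17:37:00Z') yields
#     ({'kind': 'DGAUGE', 'labels': {'sensor': 'rps'}, 'value': 2.4,
#       'ts': '2018-11-25T17:37:00Z'}, {'sensor': 'rps'})
# where the second element is the same labels dict, returned so the caller can
# extend it in place (see fill_leveled_labels below).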


def date_in_slot(until_datetime, slot):
    """
    :param until_datetime: строка вида CH_DATE_FORMAT
    :param slot: временной интервал, секунды
    :return: строковое условие для выборки в clickhouse
    """
    opts = { 'until_datetime': until_datetime, 'slot': slot }
    # округляем до начала slot
    opts['until_datetime'] = "intDiv(toUInt32(toDateTime('{until_datetime}')), {slot}) * {slot}".format(**opts)
    b = 'toDateTime({until_datetime} - {slot})'.format(**opts)
    e = 'toDateTime({until_datetime})'.format(**opts)
    return 'log_time >= %s and log_time < %s' % (b, e)
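
# For example (hypothetical arguments), date_in_slot('2018-11-25T17:42:00', 300)
# produces a condition covering the last fully completed 5-minute slot:
#     log_time >= toDateTime(intDiv(toUInt32(toDateTime('2018-11-25T17:42:00')), 300) * 300 - 300)
#     and log_time < toDateTime(intDiv(toUInt32(toDateTime('2018-11-25T17:42:00')), 300) * 300)
# i.e. [17:35:00, 17:40:00) once ClickHouse evaluates the arithmetic.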


def get_common_query(opts):
    return "'{level}' as level, concat('{mon_slot}', 's') as slot".format(**opts)


def get_leveled_query(level='vhost'):
    opts = {}
    opts['query'] = ['lower(vhost) as vhost']
    opts['groupby'] = ['vhost']
    if level in ['dc', 'host']:
        opts['query'].append("lower(substringUTF8(hostname, 1, 3)) IN ('iva', 'myt', 'sas', 'vla', 'man') ? lower(substringUTF8(hostname, 1, 3)) : 'unknown' as dc")
        opts['groupby'].append('dc')
    if level == 'host':
        opts['query'].append('hostname as host')
        opts['groupby'].append('host')
    opts['query'] = ', '.join(opts['query'])
    opts['groupby'] = ', '.join(opts['groupby'])

    return opts
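
# For level='dc' this yields (wrapped here for readability):
#     opts['query']   == "lower(vhost) as vhost, lower(substringUTF8(hostname, 1, 3)) IN "
#                        "('iva', 'myt', 'sas', 'vla', 'man') ? lower(substringUTF8(hostname, 1, 3)) "
#                        ": 'unknown' as dc"
#     opts['groupby'] == 'vhost, dc'
# level='host' additionally appends 'hostname as host' / 'host'.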


def fill_leveled_labels(labels, data_row, common_fields, level='vhost'):
    lvl_fields = ['vhost']
    if level in ['dc', 'host']:
        lvl_fields.append('dc')
    if level == 'host':
        lvl_fields.append('host')
    else:
        # if host is missing from the labels, Solomon resolves it from PTR on its own,
        # but here "missing" must mean "not needed"
        labels['host'] = ''

    for col in common_fields + lvl_fields:
        labels[col] = str(data_row[col])


def date_ch2solomon(date):
    return date.replace(' ', 'T', 1) + 'Z'
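
# E.g. '2018-11-25 17:37:00' -> '2018-11-25T17:37:00Z': ClickHouse returns the
# timestamp with a space separator, while the Solomon push format presumably wants
# ISO 8601 with an explicit UTC marker (the queries in get_solomon_timings and
# get_solomon_rps format the date in UTC on purpose).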


def timings_to_solomon(level):
    logging.info('Get timings stat for solomon from clickhouse, level: ' + level)
    quantile_levels = ["0.5", "0.8", "0.9", "0.95", "0.98", "0.99", "0.999", "1.0"]
    stat = get_solomon_timings(level, quantile_levels)
    sensors = []
    for _, row in stat.iterrows():
        sensor, labels = init_sensor('request_time', row.request_time, ts=date_ch2solomon(row.date_utc))
        fill_leveled_labels(labels, row, ['level', 'slot', 'quantile'], level)
        sensors.append(sensor)

    push_to_solomon(SETTINGS.solomon_url, sensors) 


def rps_to_solomon(level):
    logging.info('Get rps stat for solomon from clickhouse, level: ' + level)
    stat = get_solomon_rps(level)
    sensors = []
    for _, row in stat.iterrows():
        sensor, labels = init_sensor('rps', row.rps, ts=date_ch2solomon(row.date_utc))
        fill_leveled_labels(labels, row, ['level', 'slot', 'class_of_response', 'status_code'], level)
        sensors.append(sensor)

    push_to_solomon(SETTINGS.solomon_url, sensors)


def send_stat_to_solomon():
    if SETTINGS.mon_slot != 60:
        logging.critical('Only one-minute slot (-p 60) allowed for solomon')
        return

    rps_to_solomon('dc')
    for level in ['vhost', 'dc']:
        timings_to_solomon(level)


def get_solomon_timings(level, quantile_levels):
    """
    Returns a DataFrame like
        qid  date_utc             level    slot    vhost                             dc    host                                                               quantile    request_time
----  -----  -------------------  -------  ------  --------------------------------  ----  ---------------------------------------------------------------  ----------  --------------
   0      1  2018-11-25 17:37:00  host     60s     api.direct.yandex.ru              iva   iva1-0043-a2b-msk-iva-ppc-dire-847-18274.gencfg-c.yandex.net          0.5             0.245
   1      2  2018-11-25 17:37:00  host     60s     api.direct.yandex.ru              iva   iva1-0043-a2b-msk-iva-ppc-dire-847-18274.gencfg-c.yandex.net          0.8             0.592
    ...
    """
    assert all(isinstance(x, str) for x in quantile_levels)

    opts = {
        'mon_slot': SETTINGS.mon_slot,
        'table': SETTINGS.clickhouse_table,
        'level': level,
        'quantile_levels': ', '.join([str(x) for x in quantile_levels]),
        "quantile_levels_str": ", ".join(["'%s'" %x for x in quantile_levels]),
    }
    opts['date_in_mon_slot'] = date_in_slot(SETTINGS.until_datetime, opts['mon_slot'])
    opts['date_to_start_of_mon_slot'] = 'intDiv(toUInt32(log_time), %(mon_slot)d) * %(mon_slot)d' % opts
    opts['common_query'] = get_common_query(opts)
    opts.update(get_leveled_query(level))

    query = """SELECT
        arrayJoin(arrayEnumerate(q)) as qid, date_utc,
        {common_query}, {groupby},
        q[qid] as quantile, rq[qid] as request_time
        FROM (SELECT
            toString(toDateTime({date_to_start_of_mon_slot}), 'UTC') as date_utc,
            {common_query},
            {query},
            array({quantile_levels_str}) as q,
            quantilesExact({quantile_levels})(request_time) as rq
            FROM {table} WHERE {date_in_mon_slot}
            AND match(vhost, '^[a-z][a-z0-9-]+[.].*')
            GROUP BY date_utc, level, slot, q, {groupby}
            ORDER BY date_utc, {groupby}
        )
    """.format(**opts)

    schema = {"qid": dtype(int), "date_utc": dtype(object), "level": dtype(object), "slot": dtype(object),
              "vhost": dtype(object), "dc": dtype(object), "host": dtype(object),
              "quantile": dtype(object), "request_time": dtype(float)}
    if level == "vhost":
        schema.pop("dc", None)
        schema.pop("host", None)
    elif level == "dc":
        schema.pop("host", None)
    stat = ch_select(query, url=SETTINGS.clickhouse_url, schema=schema)

    logging.debug("\nSolomon timings stat:\n" + df_dumps(stat))
    logging.debug("Solomon timings stat types: " + str(stat.dtypes.tolist()))
    logging.info("Solomon timings stat: %d rows" % (stat.shape[0]))
    if stat.shape[0] > 0:
        assert all(pd.Series(schema, index=stat.dtypes.index) == stat.dtypes)

    return stat


def get_solomon_rps(level):
    """
    Returns a DataFrame like
     date_utc             level    slot    vhost                             dc    host                                                             class_of_response      status_code         rps
---  -------------------  -------  ------  --------------------------------  ----  ---------------------------------------------------------------  -------------------  -------------  ----------
  0  2018-11-25 17:37:00  host     60s     api.direct.yandex.ru              iva   iva1-0043-a2b-msk-iva-ppc-dire-847-18274.gencfg-c.yandex.net     2xx                            200   2.41667
  1  2018-11-25 17:37:00  host     60s     api.direct.yandex.ru              iva   iva1-0043-a2b-msk-iva-ppc-dire-847-18274.gencfg-c.yandex.net     2xx                            200  32.2333
    ...
    """

    opts = {
        'mon_slot': SETTINGS.mon_slot,
        'level': level,
        'table': SETTINGS.clickhouse_table,
    }
    opts['date_in_mon_slot'] = date_in_slot(SETTINGS.until_datetime, opts['mon_slot'])
    opts['date_to_start_of_mon_slot'] = 'intDiv(toUInt32(log_time), %(mon_slot)d) * %(mon_slot)d' % opts
    opts['common_query'] = get_common_query(opts)
    opts.update(get_leveled_query(level))

    query = """SELECT
        toString(toDateTime({date_to_start_of_mon_slot}), 'UTC') as date_utc,
        {common_query},
        {query},
        replaceRegexpOne(toString(intDiv(status, 100) * 100), '00$', 'xx') as class_of_response,
        status as status_code,
        count() / {mon_slot} as rps
        FROM {table} WHERE {date_in_mon_slot}
        AND match(vhost, '^[a-z][a-z0-9-]+[.].*')
        GROUP BY date_utc, level, slot, {groupby}, class_of_response, status_code
        ORDER BY date_utc, level, slot, {groupby}, class_of_response, status_code
    """.format(**opts)

    stat = ch_select(query, url=SETTINGS.clickhouse_url)

    logging.debug('\nSolomon rps stat:\n' + df_dumps(stat))
    logging.debug('Solomon rps stat types: ' + str(stat.dtypes.tolist()))
    logging.info('Solomon rps stat: %d rows' % (stat.shape[0]))
    if stat.shape[0] > 0:
        stypes = stat.dtypes.tolist()
        assert [stypes[x] for x in [-2, -1]] == [dtype(int), dtype(float)]
        assert all([x == dtype(object) for x in stypes[:-2]])

    return stat


def get_rps_stat_datacenter():
    """
    Returns a DataFrame like
              vhost   dc         rps     max_rps  rps_to_max   total_rps      rps_to_total
    direct.yandex.ru  myt   18.856667   48.263333    0.390704  133.376667      0.141379
    direct.yandex.ru  sas   48.263333   48.263333    1.000000  133.376667      0.361857
    api.direct.y.ru   ...
    ...
    """

    opts = {
        'mon_slot': SETTINGS.mon_slot,
        'table': SETTINGS.clickhouse_table,
    }
    opts['date_in_mon_slot'] = date_in_slot(SETTINGS.until_datetime, opts['mon_slot'])
    opts['date_to_start_of_mon_slot'] = 'intDiv(toUInt32(log_time), %(mon_slot)d) * %(mon_slot)d' % opts

    query = """SELECT
        vhost,
        lower(substringUTF8(hostname, 1, 3)) as dc,
        count() / {mon_slot} as rps
        FROM {table} WHERE {date_in_mon_slot}
        AND length(vhost) != 0
        AND vhost != 'none'
        GROUP BY vhost, dc
        ORDER BY vhost, dc
    """.format(**opts)

    stat = ch_select(query, url=SETTINGS.clickhouse_url)

    stat['max_rps'] = stat.groupby(['vhost']).rps.transform('max')
    stat['rps_to_max'] = stat.rps / stat.max_rps
    stat['total_rps'] = stat.groupby(['vhost']).rps.transform('sum')
    stat['rps_to_total'] = stat.rps / stat.total_rps

    logging.debug('\nDatacenter stat:\n' + df_dumps(stat))
    logging.debug('Datacenter stat types:\n' + str(stat.dtypes.tolist()))
    if stat.shape[0] > 0:
        assert stat.dtypes.tolist() == [dtype(object), dtype(object), dtype(float), dtype(float), dtype(float), dtype(float), dtype(float)]

    return stat


def get_rps_stat_vhost():
    """
    Returns a DataFrame like
    row  vhost                 resp_class  rps         total_rps   rps_to_total
    0    api.direct.yandex.ru  200         673.133333  680.443333  0.989257
    1    api.direct.yandex.ru  400         4.700000    680.443333  0.006907
    2    api.direct.yandex.ru  500         2.610000    680.443333  0.003836
    """

    opts = {
        'mon_slot': SETTINGS.mon_slot,
        'table': SETTINGS.clickhouse_table,
    }
    opts['date_in_mon_slot'] = date_in_slot(SETTINGS.until_datetime, opts['mon_slot'])
    opts['date_to_start_of_mon_slot'] = 'intDiv(toUInt32(date), %(mon_slot)d) * %(mon_slot)d' % opts

    query = """SELECT
        vhost,
        intDiv(status, 100) * 100 as resp_class,
        count() / {mon_slot} as rps
        FROM {table} WHERE {date_in_mon_slot}
        AND length(vhost) != 0
        AND vhost != 'none'
        GROUP BY vhost, resp_class
        ORDER BY vhost, resp_class
    """.format(**opts)
    ##    intDiv({date_to_start_of_mon_slot} - toUInt32(toStartOfDay(date)), {mon_slot}) as mon_slot_num

    stat_t = ch_select(query, url=SETTINGS.clickhouse_url)

    # pad the table with the missing response classes
    dr = pd.DataFrame({'resp_class': pd.Series([100, 200, 300, 400, 500]),
                       'rps': pd.Series([0, 0, 0, 0, 0]), 'phony': 1})
    dv = pd.DataFrame({'vhost': stat_t.vhost.unique(), 'phony': 1})
    # Cartesian product vhost x resp_class
    dp = pd.merge(dv, dr, on='phony')
    stat = pd.merge(stat_t, dp, how='right', on=['vhost', 'resp_class'], suffixes=('', '_y')).drop(['phony', 'rps_y'], axis=1)
    # fill the NaNs left over from the merge
    ##stat.mon_slot_num = stat.mon_slot_num.ffill()
    stat.rps = stat.rps.fillna(0)
    stat = stat.sort_values(['vhost', 'resp_class']).reset_index(drop=True)

    stat['total_rps'] = stat.groupby(['vhost']).rps.transform('sum')
    stat['rps_to_total'] = stat.rps / stat.total_rps

    logging.debug('\nSLB vhost stat:\n' + df_dumps(stat))
    logging.debug('SLB vhost stat types:\n' + str(stat.dtypes.tolist()))
    if stat.shape[0] > 0:
        assert stat.dtypes.tolist() == [dtype(object), dtype(float), dtype(float), dtype(float), dtype(float)]

    return stat


def check_value_in_interval(value, ok_ge, ok_le, vhost, level, check, descr_prefix):
    """
    Проверяет, входит ли value в заданный отрезок
    :param ok_ge: нижняя граница (включительно), можно писать - float('inf')
    :param ok_le: верхняя граница (включительно), можно писать + float('inf')
    :return: словарь с описанием проверки
    """
    evt = {
        'vhost': vhost,
        'level': level,
        'check': check,
        'passed': False,
        'descr': '%s = %g' % (descr_prefix, value),
    }

    if ok_ge <= value <= ok_le:
        evt['passed'] = True
        evt['descr'] += ' in [%g, %g]' % (ok_ge, ok_le)
    elif value > ok_le:
        evt['descr'] += ' > %g' % (ok_le,)
    else:
        evt['descr'] += ' < %g' % (ok_ge,)

    return evt
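
# A quick illustration (hypothetical values):
#     check_value_in_interval(0.95, 0.9, 1.0, 'direct.yandex.ru', 1,
#                             'resps_200_to_total', 'responses of class 200 / total resps')
# returns
#     {'vhost': 'direct.yandex.ru', 'level': 1, 'check': 'resps_200_to_total',
#      'passed': True,
#      'descr': 'responses of class 200 / total resps = 0.95 in [0.9, 1]'}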


def check_resp_class_to_total(checks, vhost_stat, resp_class, ok_ge, ok_le, level):
    """
    Проверяет, не выходит ли за пороги отношение rps заданного класса ответа (resp_class) к общему rps балансера (vhost)
    :param checks: list с проверками, куда будет добавлен результат
    :param vhost_stat: статистика из get_rps_stat_vhost
    :param level: степень критичности (0 - max) проверки, используется при отправке в juggler
    :param resp_class: класс ответа (200, 300, ...)
    пороги и формат результата проверки в check_value_in_interval
    """
    res = vhost_stat[vhost_stat.resp_class == resp_class]
    # одна строка
    assert res.shape[0] == 1

    row = res.iloc[0]
    evt = check_value_in_interval(
        value=row.rps_to_total,
        ok_ge=ok_ge,
        ok_le=ok_le,
        vhost=row.vhost,
        level=level,
        check='resps_{}_to_total'.format(resp_class),
        descr_prefix='responses of class {} / total resps'.format(resp_class),
    )
    checks.append(evt)


def check_rps(checks, vhost_stat, level, ok_ge, ok_le):
    """
    Проверяет, не выходит ли за пороги общий rps балансера. Параметры аналогичны check_resp_class_to_total
    """
    # total_rps уже посчитан и есть в каждой строке, берем первую
    row = vhost_stat.iloc[0]

    evt = check_value_in_interval(row.total_rps, ok_ge, ok_le, row.vhost, level, 'total_resps', 'total responses')
    checks.append(evt)


def check_dc_consistency(checks, stat_dc):
    """
    Проверяет, не выходит ли за пороги отношение rps в данном ДЦ к максимальному rps среди ДЦ заданного vhost
    Это отношение всегда будет <= 1 в каждом дц, хотя бы в одном будет = 1
    :param stat_dc: - статистика из get_rps_stat_datacenter
    добавляет полученные проверки в checks
    """
    for _, row in stat_dc.iterrows():
        conf = get_vhost_config(row.vhost, 'dc_rps_to_max')
        evt = check_value_in_interval(value=row.rps_to_max,
                                      vhost=row.vhost,
                                      check='dc_rps_to_max_' + row.dc,
                                      descr_prefix='dc {} rps / max dc rps'.format(row.dc),
                                      **conf)
        checks.append(evt)


def send_to_juggler(checks):
    processed_vhosts = {}
    events = []
    cols = ['vhost', 'passed', 'level', 'check', 'descr']
    for _, row in checks.sort_values(cols)[cols].iterrows():
        if processed_vhosts.get(row.vhost):
            continue

        event = {
            'host': row.vhost,
            'service': 'slbmon',
            'status': 'OK',
        }
        if not row.passed:
            event['description'] = '%s; run dt-slbmon %s for more info' % (row.descr, ' '.join(settings_to_args()))
            event['status'] = 'WARN'
            if row.level < get_vhost_config(row.vhost, 'crit_if_level_lt'):
                event['status'] = 'CRIT'

        if SETTINGS.ya_env != 'production':
            event['service'] += '.' + SETTINGS.ya_env

        logging.info('Juggler event: ' + j_dumps(event))
        events.append(event)
        processed_vhosts[row.vhost] = True

    if events and SETTINGS.apply:
        jres = dj.queue_events(events)
        logging.info('Juggler response: ' + j_dumps(jres))


def run_checks():
    with open(SETTINGS.mon_conf_file, 'r') as f:
        conf = yaml.safe_load(f)
        conf_def = conf['_default']
        for k, v in conf.items():
            MON_CONF[k] = hash_merge(conf_def, v)
    logging.debug('slbmon config: ' + j_dumps(MON_CONF))

    # pull rps statistics for all balancers (vhost = virtual host, a.k.a. slb fqdn)
    checks = []
    stat = get_rps_stat_vhost()
    for vhost, vhost_stat in stat.groupby('vhost'):
        # the vhost_rps check - thresholds on the balancer's total rps
        conf = get_vhost_config(vhost, 'vhost_rps')
        check_rps(checks, vhost_stat, **conf)

        # rclass_to_total checks for this vhost:
        # rclass_to_total:
        #     ratio of rps with responses of a given class (resp_class, rclass) to the service's (vhost) total rps
        #     2xx: {ok_ge: 0.3, ok_le: 1.0, resp_class: 200}
        for conf in get_vhost_config(vhost, 'rclass_to_total').values():
            check_resp_class_to_total(checks, vhost_stat, **conf)

    # pull rps statistics broken down by datacenter
    stat_dc = get_rps_stat_datacenter()
    check_dc_consistency(checks, stat_dc)

    # turn the collected checks into a DataFrame like
    # vhost                  passed   level  check               descr
    # ---------------------  -------  -----  ------------------  ---------------------------------------------------------------
    # api.direct.yandex.ru   True         1  total_resps         total responses = 747.437 in [500, inf]
    # api.direct.yandex.ru   True         1  resps_200_to_total  responses of class 200 / total resps = 0.98469 in [0.9, 1]
    # api.direct.yandex.ru   True         2  dc_rps_to_max_iva   dc iva rps / max dc rps = 0.991598 in [0.25, 1]
    cols = ['vhost', 'passed', 'level', 'check', 'descr']
    checks = pd.DataFrame(checks, columns=cols)

    logging.info('\nChecks:\n' + df_dumps(checks.sort_values(['vhost', 'level', 'check', 'descr']).reset_index(drop=True)))
    logging.info('Checks types: ' + str(checks.dtypes.tolist()))
    assert checks.dtypes.tolist() == [dtype(object), dtype(bool), dtype(int), dtype(object), dtype(object)]

    send_to_juggler(checks)

    # report the failed checks (everything has already been sent to juggler, if requested)
    failed_checks = checks[checks.passed == False][['vhost', 'level', 'check', 'descr']].reset_index(drop=True)
    if failed_checks.empty:
        logging.info('All checks passed')
        # we always write to the log; it just goes to a NullHandler unless --verbose is given
        if not SETTINGS.verbose:
            print('All checks passed. Run with --verbose for verbosity, see --help')
    else:
        logging.info('\nFailed checks:\n' + df_dumps(failed_checks))
        if not SETTINGS.verbose:
            print('Failed checks:\n' + df_dumps(failed_checks))


def settings_to_args():
    args = []
    args.append('-d')
    args.append(SETTINGS.until_datetime)

    args.append('-p')
    args.append(str(SETTINGS.mon_slot))

    args.append('-l')
    args.append(str(SETTINGS.data_lag))

    args.append('-c')
    args.append(str(SETTINGS.mon_conf_file))
    return args
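
# With the default SETTINGS this yields something like
#     ['-d', '2018-11-25T17:38:00', '-p', '300', '-l', '120',
#      '-c', '/etc/dt-slbmon/mon-default.yaml']
# (the datetime is whatever until_datetime resolved to); it is embedded into juggler
# event descriptions so the failing selection can be reproduced by hand.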


ACTIONS = {
    'juggler': run_checks,
    'solomon': send_stat_to_solomon,
}


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    parser.add_argument('--apply', default=SETTINGS.apply, action='store_true', help='apply all changes according to action (without this flag no write actions are performed)')
    parser.add_argument('--action', default=SETTINGS.action, choices=ACTIONS.keys(), help='what to do')
    parser.add_argument('-d', '--until-datetime', default=SETTINGS.until_datetime,
                        help='End datetime of the selection ({}). Rounded down to mon_slot'
                        .format(CH_DATE_FORMAT.replace('%', '%%')))
    parser.add_argument('-p', '--mon-slot', type=int, default=SETTINGS.mon_slot,
                        help='Resolution (minimum period) of the selection, in seconds. Rounded to whole minutes')
    parser.add_argument('-c', '--mon-conf-file', default=SETTINGS.mon_conf_file,
                        help='Config with slb monitoring thresholds')
    parser.add_argument('-l', '--data-lag', type=int, default=SETTINGS.data_lag,
                        help='Possible data lag in ClickHouse, so that statistics are not computed over incomplete logs')
    parser.add_argument('-e', '--ya-env', default=SETTINGS.ya_env,
                        help='Environment (to avoid accidentally writing to production juggler aggregates)')
    parser.add_argument('-v', '--verbose', action='store_true', default=SETTINGS.verbose, help='Write a verbose log')
    parser.add_argument('--debug', action='store_true', default=SETTINGS.debug, help='Print all intermediate data')
    args = parser.parse_args()

    logfmt = '[%(asctime)s]\t%(levelname)s\t' + str(os.getpid()) + '\t%(threadName)s\t%(name)s\t%(message)s'
    if args.verbose or args.debug:
        args.verbose = True
        level = logging.DEBUG if args.debug else logging.INFO
        logging.basicConfig(stream=sys.stdout, level=level, format=logfmt)

    if args.until_datetime == 'now()':
        args.until_datetime = dt.now().replace(second=0, microsecond=0)
    else:
        args.until_datetime = dt.strptime(args.until_datetime, CH_DATE_FORMAT)

    args.until_datetime -= timedelta(seconds=args.data_lag)
    args.until_datetime = args.until_datetime.strftime(CH_DATE_FORMAT)

    vars(SETTINGS).update(vars(args))

    SETTINGS.clickhouse_query_timeout = SETTINGS.mon_slot
    SETTINGS.solomon_url = SETTINGS.solomon_url.format(project=YA_ENV_TO_SOLOMON_PROJECT[SETTINGS.ya_env])
    logging.info('running with settings: ' + str(SETTINGS))

    global SOLOMON_TOKEN
    with open(SETTINGS.solomon_token_file, 'r') as f:
        SOLOMON_TOKEN = f.read().rstrip()
    global CLICKHOUSE_TOKEN
    with open(SETTINGS.clickhouse_token_file, 'r') as f:
        CLICKHOUSE_TOKEN = f.read().rstrip()

    ACTIONS[args.action]()


if __name__ == '__main__':
    main()

