#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Скрипт обрабатывает trace-логи из clickhouse и выдает различные сигналы мониторинга

По большей части повторяет dt-slbmon
"""

from __future__ import print_function

import argparse
import copy
import json
import yaml
import os
import requests
import urllib
import sys
import time
import logging

import direct_juggler.juggler as dj
import pandas as pd
from functools import wraps
from pandas.compat import StringIO
from numpy import dtype
from datetime import datetime as dt
from datetime import timedelta
from tabulate import tabulate

requests.packages.urllib3.disable_warnings()

SCRIPT_NAME = "dt-tracemon"
JUGGLER_SERVICE_PREFIX = "trace."
CH_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
CLICKHOUSE_USER = 'direct_reader'
SETTINGS = argparse.Namespace(
    until_datetime="now()",
    verbose=False,
    debug=False,
    ya_env="development",
    conf_file=None,
    cron_prefix="",
    cron_suffix="",
    apply=False,
    action="juggler",
    graf_links=False,
    solomon_token_file='/etc/direct-tokens/solomon_robot-direct-slmn-p',
    clickhouse_token_file='/etc/direct-tokens/clickhouse_%s' % CLICKHOUSE_USER,
    clickhouse_user=CLICKHOUSE_USER,
    clickhouse_database='directdb',
)
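# SETTINGS above holds the defaults; main() overwrites it in place from the parsed
# CLI arguments (vars(SETTINGS).update(vars(args))), so module-level functions can
# read the effective configuration directly.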

YA_ENV_TO_SOLOMON_PROJECT = {
    "development": "direct-junk",
    "testing": "direct-test",
    "prestable": "direct-test",
    "production": "direct",
}

# parameters for building a Direct link to the profiling charts
URL = 'https://direct.yandex.ru/registered/main.pl?'
PARAMS = {
    'cmd': 'internalReports',
    'report_id': 'profile_logs_new',
    'ir_param_stat_time_agg': '1min',
    'ir_param_date_from': '{date}',
    'ir_param_time_from': '{time}',
    'ir_param_date_to': '{date2}',
    'ir_param_time_to': '{time2}',
    'ir_param_filter_cmd_type': '{cmdtype}',
    'ir_param_filter_cmd': '{cmd}',
    'ir_param_group_by_1': 'stat_time',
    'ir_param_group_by': 'func_param',
    'pivot_measure': 'func_ela',
    'show_chart': 'on',
    'chart_type': 'area',
    'chart_stacking': 'normal',
    'chart_series_limit': '10',
}

PARAMS_TIMINGS = {
    'ir_param_group_by_2': 'func',
    'pivot_fields': 'func',
    'pivot_measure': 'func_ela',
}

PARAMS_COUNT = {
    'ir_param_group_by_2': 'cmd',
    'pivot_fields': 'cmd',
    'pivot_measure': 'cnt',
}
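# PARAMS is the base query-string template for the report URL; PARAMS_COUNT and
# PARAMS_TIMINGS override the grouping/pivot keys to produce the request-count
# and timings chart variants respectively (see generate_direct_url below).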

SOLOMON_TOKEN = None
CLICKHOUSE_TOKEN = None

def generate_direct_url(cmd_type, list_cmd):
    urls = []
    for cmd in list_cmd:
        time1 = dt.strptime(SETTINGS.until_datetime, CH_DATE_FORMAT)
        time2 = time1 - timedelta(hours=3)
        # work on a copy so repeated calls don't leak filter values into the module-level template
        params = dict(PARAMS)
        params['ir_param_date_from'] = time2.strftime('%Y-%m-%d')
        params['ir_param_time_from'] = time2.strftime('%H:%M')
        params['ir_param_date_to'] = time1.strftime('%Y-%m-%d')
        params['ir_param_time_to'] = time1.strftime('%H:%M')
        params['ir_param_filter_cmd_type'] = cmd_type
        params['ir_param_filter_cmd'] = cmd.split('(')[0]
        for overrides in [PARAMS_COUNT, PARAMS_TIMINGS]:
            params.update(overrides)
            types = "count" if "cmd" in params["ir_param_group_by_2"] else "timings"
            value = "{0}/{1}/{3}\n\t{2}".format(cmd_type, cmd, URL + urllib.urlencode(params), types)
            urls.append(value)
    return urls

# from https://a.yandex-team.ru/arc/trunk/arcadia/metrika/admin/python/mtutils/utils/__init__.py
# because retries via the requests HTTPAdapter don't work quite the way we want
def retry(exception_to_check,
          tries=6,
          delay=0.3,
          backoff=2,
          logger=None,
          debug=False,
          msg_format=None,
          exception_callback=None,
          limit_arguments_in_logger=True,
          ):
    """
    Retry calling the decorated function using an exponential backoff.

    Args:
        exception_to_check (Exception, tuple of Exception): the exception to check
        tries (int): number of times to try (not retry) before giving up
        delay (int): initial delay between retries in seconds
        backoff (int): multiplier e.g. value of 2 will double the delay each retry
        logger (logging.Logger): logger to use. If None, the root logger is used
        debug (bool): if True - logs all args and kwargs
        msg_format (str): log/debug message format
        exception_callback (func): function will be called with caught exception as first argument
        limit_arguments_in_logger (bool): restrict function arguments size when passing to logger
    """

    if logger is None:
        logger = logging.getLogger()

    if msg_format is None:
        if not debug:
            msg_format = "Retrying in {mdelay} seconds...({mtries} tries left); {func_name}(*args_are_hidden, **kwargs_are_hidden)"
        else:
            msg_format = "{str_exc}, Retrying in {mdelay} seconds...({mtries} tries left); {func_name}(*{str_func_args}, **{str_func_kwargs}"

    def deco_retry(f):

        @wraps(f)
        def f_retry(*args, **kwargs):
            mtries, mdelay = tries, delay
            while mtries > 1:
                try:
                    return f(*args, **kwargs)
                except exception_to_check as e:
                    if exception_callback is not None:
                        exception_callback(e)

                    str_func_args = str(args) if not limit_arguments_in_logger else str(args)[:1000]
                    str_func_kwargs = str(kwargs) if not limit_arguments_in_logger else str(kwargs)[:1000]

                    msg = msg_format.format(
                        str_exc=str(e),
                        mdelay=mdelay,
                        func_name=f.__name__,
                        str_func_args=str_func_args,
                        str_func_kwargs=str_func_kwargs,
                        mtries=str(mtries - 1)
                    )
                    logger.warning(msg)

                    time.sleep(mdelay)
                    mtries -= 1
                    mdelay *= backoff
            return f(*args, **kwargs)

        return f_retry  # true decorator

    return deco_retry
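
# Example usage (hypothetical): retry a flaky call up to 3 times with exponential backoff
#   @retry(exception_to_check=requests.exceptions.ConnectionError, tries=3, delay=0.1)
#   def fetch(url):
#       return requests.get(url, timeout=5)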


def hash_merge(*dict_args):
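    """Recursively merge dicts left to right: nested dicts are merged key by key,
    other values are deep-copied and later arguments win on conflicts.
    E.g. hash_merge({"a": {"x": 1}}, {"a": {"y": 2}}) -> {"a": {"x": 1, "y": 2}}."""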
    result = {}
    for d in dict_args:
        for key, value in d.items():
            if isinstance(value, dict):
                result[key] = hash_merge(result.get(key, {}), value)
            else:
                result[key] = copy.deepcopy(value)
    return result


def df_dumps(dataframe):
    return tabulate(dataframe, headers="keys")


def j_dumps(jdict, pretty=False):
    if pretty:
        return "\n" + json.dumps(jdict, sort_keys=True, ensure_ascii=False, indent=4, separators=(",", ": "))
    return json.dumps(jdict, sort_keys=True, ensure_ascii=False)


@retry(exception_to_check=(requests.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout))
def ch_select(query, schema=None, **kwargs):
    """
    :return: pandas DataFrame
    """
    global CLICKHOUSE_TOKEN
    auth = (SETTINGS.clickhouse_user, CLICKHOUSE_TOKEN)
    query += " FORMAT CSVWithNames"
    query = " ".join(query.split())  # collapse newlines and repeated whitespace into single spaces

    url = kwargs.get("clickhouse_url", "https://ppchouse-cloud.direct.yandex.net:8443")
    logging.info("Run query: " + query)
    try:
        resp = requests.get(url, verify=False, params={"query": query}, auth=auth,
                            timeout=(kwargs.get("clickhouse_conn_timeout", 2),
                                     kwargs.get("clickhouse_query_timeout", 60)))
        resp.raise_for_status()
    except requests.HTTPError:
        logging.error("Clickhouse error: " + resp.text)  # we want to see what exactly ClickHouse disliked
        raise

    dframe = pd.read_csv(StringIO(resp.text), dtype=schema)
    return dframe
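
# Minimal ch_select usage sketch (hypothetical query; "FORMAT CSVWithNames" is appended automatically):
#   df = ch_select("SELECT count() AS cnt FROM directdb.trace")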


@retry(exception_to_check=(requests.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout))
def push_to_solomon(url, sensors):
    global SOLOMON_TOKEN
    headers = {"Authorization": "OAuth " + SOLOMON_TOKEN}
    sensors = {"sensors": sensors}
    logging.debug("Solomon sensors are ready to push: " + j_dumps(sensors))
    try:
        if SETTINGS.apply:
            resp = requests.post(url, json=sensors, headers=headers)
            resp.raise_for_status()
    except requests.HTTPError:
        logging.error("Solomon error: " + resp.text)
        raise
    if SETTINGS.apply:
        logging.info("Solomon - %d sensors pushed to %s" % (len(sensors["sensors"]), url))
    else:
        logging.info("Solomon - use --apply to push %d sensors to %s" % (len(sensors["sensors"]), url))


def init_sensor(name, value, ts):
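    """Build a single Solomon DGAUGE sensor; returns (sensor, labels) so the caller
    can add more labels to the same dict afterwards (see fill_labels)."""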
    labels = {"sensor": name}
    sensor = {
        "kind": "DGAUGE", # в случае push не думаю, что имеет смысл писать что-то еще
        "labels": labels,
        "value": value,
        "ts": ts
    }
    return sensor, labels


def date_ch2solomon(date):
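    # "2019-03-21 18:48:00" -> "2019-03-21T18:48:00Z": ClickHouse UTC timestamp to the ISO 8601 form Solomon expects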
    return date.replace(" ", "T", 1) + "Z"


def fill_labels(labels, data_row, common_fields):
    # if "host" is not among the labels, Solomon resolves it from PTR itself; here "absent" should mean "not wanted"
    labels["host"] = ""

    for col in common_fields:
        labels[col] = str(data_row[col])


def action_solomon():
    logging.info("run solomon action")
    solomon_url="http://solomon.yandex.net/api/v2/push?project={project}&cluster=ppcback&service=clh-trace".format(
        project=YA_ENV_TO_SOLOMON_PROJECT[SETTINGS.ya_env])
    quantile_levels = ["0.5", "0.8", "0.9", "0.95", "0.98", "0.99", "0.999", "1.0"]
    groupby = ["svc", "method"]
    stat = get_trace_timings("aggr_service as svc, aggr_method as method", SETTINGS.until_datetime, groupby, quantile_levels,
                             services=None, mon_slot=300, data_lag=3600, clickhouse_query_timeout=300)

    for svc, sstat in stat.groupby("svc"):
        logging.info("processing sensors for svc %s" % svc)
        sensors = []
        for _, row in sstat.iterrows():
            sensor, labels = init_sensor("ela", row.ela, ts=date_ch2solomon(row.date_utc))
            fill_labels(labels, row, groupby + ["quantile"])
            sensors.append(sensor)

        push_to_solomon(solomon_url, sensors)


def check_tracelog_service(stat, svc, service_conf):
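    """Check every method of one tracelog service against its configured thresholds.
    Returns a list of human-readable violations, including a NODATA(...) entry for
    configured methods that produced no rows (unless ignore_nodata is set)."""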
    logging.info("process tracelog service %s" % svc)

    bad_methods = []
    processed_methods = set()
    service_stat = stat[stat.svc == svc]
    logging.debug("\nservice stat:\n" + df_dumps(service_stat))

    for _, row in service_stat.iterrows():
        processed_methods.add(row["method"])
        error_msg = None
        logging.debug("process %s:%s, ignore_unknown: %s %s" % (svc, row["method"], service_conf["ignore_unknown_methods"], type(service_conf["ignore_unknown_methods"])))

        if row["method"] in service_conf["methods"]:
            logging.debug("known method (in service_conf)")
            error_msg = check_method(row, service_conf["methods"][row["method"]])
        elif not service_conf["ignore_unknown_methods"]:
            logging.debug("unknown method")
            error_msg = check_method(row, service_conf["unknown_method"])

        if error_msg:
            bad_methods.append(error_msg)

    nodata_methods = set(service_conf["methods"]) - set(processed_methods)
    if nodata_methods and not service_conf["ignore_nodata"]:
        bad_methods.append("NODATA(" + ", ".join(nodata_methods) + ")")

    logging.info("bad methods %s, nodata methods %s" % (bad_methods, nodata_methods))
    return bad_methods


def action_get_cron():
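    """Print a crontab line that re-runs this script with the current config."""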
    with open(SETTINGS.conf_file, "r") as f:
        conf = yaml.safe_load(f)
    params = hash_merge(vars(SETTINGS), conf)

    params["args"] = " ".join(settings_to_args(params, include_timestamp=False))
    params["script_name"] = SCRIPT_NAME
    params["cron_prefix"] = SETTINGS.cron_prefix.format(**conf)
    params["cron_suffix"] = SETTINGS.cron_suffix.format(**conf)
    print("{cron_params[expr]} {cron_prefix} {script_name} {args} {cron_suffix}".format(**params))


def action_juggler():
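    """Compare trace timings from ClickHouse against the alert config and queue OK/CRIT events to juggler."""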
    logging.info("run juggler action")
    alert_conf, quantile_levels, services = parse_alert_config(SETTINGS.conf_file)
    alert = alert_conf["alert_name"]

    events = {}
    logging.info("process alert %s" % alert)
    events[alert] = {"service": JUGGLER_SERVICE_PREFIX + alert, "status": "OK", "description": "OK"}

    logging.debug("alert config: " + j_dumps(alert_conf))
    logging.info("get trace timings for services %s with quantiles %s" % (services, quantile_levels))
    groupby = ["svc", "method"]
    stat = get_trace_timings("aggr_service as svc, aggr_method as method", SETTINGS.until_datetime, groupby,
                             quantile_levels, services, **alert_conf["fetch_params"])

    bad_services = []
    for svc, service_conf in alert_conf["tracelog_services"].items():
        bad_methods = check_tracelog_service(stat, svc, service_conf)
        if bad_methods:
            if SETTINGS.graf_links:
                bad_services.append("    %s\n" % ("\n    ".join(generate_direct_url(svc, bad_methods))))
            else:
                bad_services.append("  %s:\n    %s\n" % (svc, "\n    ".join(bad_methods)))

    logging.info("bad services for alert %s: %s" % (alert, bad_services))
    if bad_services:
        events[alert] = {
            "service": JUGGLER_SERVICE_PREFIX + alert,
            "status": "CRIT",
            "description": "run %s %s for more info" % (SCRIPT_NAME, " ".join(settings_to_args(vars(SETTINGS))))
        }

    events = sorted(events.values(), key=lambda x: x["status"])
    logging.info("%s events to juggler: %s" % ("send" if SETTINGS.apply else "dry-run mode, use --apply to send",
                                               j_dumps(events)))
    if events and SETTINGS.apply:
        jres = dj.queue_events(events)
        logging.info('juggler response: ' + j_dumps(jres))

    if not SETTINGS.verbose:
        print("dry-run mode, use --apply to send events:" if not SETTINGS.apply else "events sended to juggler:")
        print(j_dumps(events, pretty=False))
        if bad_services:
            print("slow methods:\n" + "\n".join(bad_services))


def check_method(row, method_conf):
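    """Return a violation string if the row's quantile is configured for this method
    and its ela exceeds the threshold; empty string otherwise."""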
    res = ""
    if row["quantile"] in method_conf and row["ela"] > method_conf[row["quantile"]]:
        res = "%s(%sq ela %.1f > %.1f)" % (row["method"], row["quantile"], row["ela"], method_conf[row["quantile"]])

    logging.debug("check if method %s with (%s %s) ela (%s %s) > (%s %s): %s" %
                  (row["method"], row["quantile"], type(row["quantile"]), row["ela"], type(row["ela"]),
                  method_conf.get(row["quantile"]), type(method_conf.get(row["quantile"])),
                  bool(res)))
    return res


def get_trace_timings(q, until_datetime, groupby, quantile_levels, services=None, **kwargs):
    """
    Возвращает DataFrame вида
            qid  date_utc             svc      method     quantile             ela
----  -----  -------------------  ------------ ------------- ----------  --------------
   0      1  2019-03-21 18:48:00  direct.api5  adextensions       0.5       0.00350474
   1      2  2019-03-21 18:48:00  direct.api5  adextensions       0.8       0.00350474
    """
    logging.info("Get timings stat from clickhouse")
    assert all(isinstance(x, str) for x in quantile_levels)

    opts = {
        "until_datetime": until_datetime,
        "until_days_back": kwargs.get("until_days_back", 0),
        "quantile_levels": ", ".join(quantile_levels),
        "quantile_levels_str": ", ".join(["'%s'" %x for x in quantile_levels]),
        "mon_slot": kwargs.get("mon_slot", 300),
        "data_lag": kwargs.get("data_lag", 0),
        "from_hour": kwargs.get("from_hour", 0),
        "until_hour": kwargs.get("until_hour", 23),
        "table": kwargs.get("clickhouse_table", "trace"),
        "database": kwargs.get("clickhouse_database", "directdb")
    }
    if opts["until_days_back"] > 0:
        opts["mon_slot"] = 3600 * 24

    opts["until_expr"] = "intDiv(toUInt32(toDateTime('{until_datetime}'))".format(**opts) + \
                         " - {data_lag} - {until_days_back} * 3600 * 24".format(**opts) + \
                         ", {mon_slot}) * {mon_slot}".format(**opts)

    opts["date_selector"] = "log_time >= toDateTime({until_expr} - {mon_slot}) AND ".format(**opts) + \
                            "log_time < toDateTime({until_expr})".format(**opts)
    opts["hour_selector"] = "toHour(log_time) >= {from_hour} AND toHour(log_time) <= {until_hour}".format(**opts)

    opts["date_to_start_of_mon_slot"] = "intDiv(toUInt32(aggr_log_time), %(mon_slot)d) * %(mon_slot)d" % opts
    opts["groupby"] = ", ".join(groupby)
    opts["query"] = q
    opts["filter"] = ""
    if services:
        opts["filter"] = "AND service IN (" + ", ".join(["'%s'" % x for x in services]) + ")"

    # the aggr_service, aggr_method, aggr_log_time aliases look clumsy; the prefer_column_name_to_alias setting
    # would help, but unfortunately it is only supported since ClickHouse 21.4
    # with prefer_column_name_to_alias one could write any(service) as service while WHERE still saw the original service column
    query = """SELECT
        arrayJoin(arrayEnumerate(q)) as qid, date_utc,
        {groupby},
        q[qid] as quantile, ela_q[qid] as ela
        FROM (SELECT
            toString(toDateTime({date_to_start_of_mon_slot}), 'UTC') as date_utc,
            {query},
            array({quantile_levels_str}) as q,
            quantilesExact({quantile_levels})(aggr_ela) as ela_q
            FROM (SELECT
                span_id,
                any(service) AS aggr_service,
                any(method) AS aggr_method,
                max(log_time) AS aggr_log_time,
                sum(ela) AS aggr_ela
                FROM {database}.{table} WHERE
                {date_selector} AND
                {hour_selector}
                {filter}
                GROUP BY span_id
            )
            GROUP BY date_utc, q, {groupby}
            ORDER BY date_utc, {groupby}
        )
    """.format(**opts)

    schema = {"qid": dtype(int), "date_utc": dtype(object), "svc": dtype(object), "method": dtype(object),
              "quantile": dtype(object), "ela": dtype(float)}
    stat = ch_select(query, schema=schema, **kwargs)

    logging.debug("\nTrace timings stat:\n" + df_dumps(stat))
    logging.debug("Trace timings stat types: " + str(stat.dtypes.tolist()))
    logging.info("Trace timings stat: %d rows" % (stat.shape[0]))
    if stat.shape[0] > 0:
        assert all(pd.Series(schema, index=stat.dtypes.index) == stat.dtypes)

    return stat


def parse_alert_config(conf_file):
    """
    Преобразует список правил вида:
    method_defaults:
        quantile: 0.99
        max_ela: 0.3
    rules:
      - quantile: 0.99
        max_ela: 0.5
        methods:
          - grid.constants
          - grid.constants.validation
      - quantile: 0.99
        max_ela: 10
        methods:
          - grid.constants
      - quantile: 0.80
        max_ela: 100
        methods:
          - grid.constants
          - campUnarc
          - confirmSaveCampXLS

    в словарь методов:
    method_defaults:
        "0.99": 0.3
    methods:
      grid.constants:
        "0.99": 0.5  # берем минимальное значение
        "0.80": 100
      campUnarc:
        "0.80": 100
      confirmSaveCampXLS:
        "0.80: 100
    """
    with open(conf_file, "r") as f:
        conf = yaml.safe_load(f)

    quantile_levels = set()
    services = set()

    for svc, service_conf in conf["tracelog_services"].items():
        services.add(svc)
        quantile_levels.add(str(service_conf["method_defaults"]["quantile"]))

        defaults = {str(service_conf["method_defaults"]["quantile"]):
                    float(service_conf["method_defaults"]["max_ela"])}
        service_conf["unknown_method"] = defaults

        service_conf["methods"] = {}
        if "rules_file" in service_conf:
            with open(os.path.join(os.path.dirname(conf_file), service_conf["rules_file"])) as f:
                service_conf["rules"] = yaml.safe_load(f)

        for rule in service_conf["rules"]:
            rule["quantile"] = str(rule.get("quantile", service_conf["method_defaults"]["quantile"]))
            rule["max_ela"] = float(rule.get("max_ela", service_conf["method_defaults"]["max_ela"]))
            quantile_levels.add(rule["quantile"])

            for method in rule["methods"]:
                if method in service_conf["methods"]:
                    if rule["quantile"] in service_conf["methods"][method]:
                        service_conf["methods"][method][rule["quantile"]] = \
                            min(rule["max_ela"], service_conf["methods"][method][rule["quantile"]]["max_ela"])
                    else:
                        service_conf["methods"][method][rule["quantile"]] = rule["max_ela"]
                else:
                    service_conf["methods"][method] = {rule["quantile"]: rule["max_ela"]}

    conf.pop("rules", None)
    return conf, list(quantile_levels), list(services)


def parse_until_datetime(until_datetime):
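    # replace the "now()" sentinel with the current minute so the value is a concrete, reproducible timestamp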
    if until_datetime == "now()":
        return dt.now().replace(second=0, microsecond=0).strftime(CH_DATE_FORMAT)
    return until_datetime


def settings_to_args(settings, include_timestamp=True):
    args = []
    if include_timestamp:
        args.append("-d")
        args.append(settings["until_datetime"])

    args.append("-c")
    args.append(str(settings["conf_file"]))
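    # always suggest -g so that a manual re-run of the command prints the chart links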
    args.append("-g")
    return args


ACTIONS = {
    "juggler": action_juggler,
    "solomon": action_solomon,
    "get_cron_line": action_get_cron,
}


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=__doc__)
    parser.add_argument("--apply", default=SETTINGS.apply, action="store_true", help="применять все изменения соответственно action (без этого никаких пишущих действий не производится)")
    parser.add_argument("--action", default=SETTINGS.action, choices=ACTIONS.keys(), help="что делаем")
    parser.add_argument("-d", "--until-datetime", default=SETTINGS.until_datetime,
                        help="Дата окончания выборки ({}). Округляется до mon_slot"
                        .format(CH_DATE_FORMAT.replace("%", "%%")))
    parser.add_argument("-c", "--conf-file", default=SETTINGS.conf_file,
                        help="Конфиг с порогами мониторингов tracemon")
    parser.add_argument("-e", "--ya-env", default=SETTINGS.ya_env,
                        help="Окружение (чтобы случайно не писать в продовые solomon-проекты)")
    parser.add_argument("-v", "--verbose", action="store_true", default=SETTINGS.verbose, help="Писать подробный лог")
    parser.add_argument("--cron-prefix", default=SETTINGS.cron_prefix)
    parser.add_argument("--cron-suffix", default=SETTINGS.cron_suffix)
    parser.add_argument("--debug", action="store_true", default=SETTINGS.debug, help="Вывод всех промежуточных данных")
    parser.add_argument("-g", "--graf-links", action="store_true", default=SETTINGS.graf_links, help="Показать в выводе ссылки на графики")
    args = parser.parse_args()

    logfmt = "[%(asctime)s]\t%(levelname)s\t" + str(os.getpid()) + "\t%(threadName)s\t%(name)s\t%(message)s"
    if args.verbose or args.debug:
        args.verbose = True
        level = logging.DEBUG if args.debug else logging.INFO
        logging.basicConfig(stream=sys.stdout, level=level, format=logfmt)

    args.until_datetime = parse_until_datetime(args.until_datetime)
    vars(SETTINGS).update(vars(args))

    logging.debug("running with settings: " + str(SETTINGS))

    global SOLOMON_TOKEN
    with open(SETTINGS.solomon_token_file, 'r') as f:
        SOLOMON_TOKEN = f.read().rstrip()
    global CLICKHOUSE_TOKEN
    with open(SETTINGS.clickhouse_token_file, 'r') as f:
        CLICKHOUSE_TOKEN = f.read().rstrip()
    ACTIONS[args.action]()


if __name__ == "__main__":
    main()

