#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Отчет о пересечении удаленных и запрашиваемых stid

https://st.yandex-team.ru/CHEMODAN-25961
https://stat.yandex-team.ru/Disk/DiskInternal/MulcaCleanIntersection

Как дополнить словарь deleted_stids_ignore_list:
0. Устанавливаем переменные окружения для YT (см disk-secrets)
> export YT_TOKEN=<TOKEN> YT_PROXY=<url>
1. Скачиваем текущий словарь.
>  yt download //home/mpfs-stat/storage/deleted_stids_ignore_list > deleted_stids_ignore_list
2. Исправляем словарь.
> vim deleted_stids_ignore_list
3. Заливаем исправленный словарь.
> cat deleted_stids_ignore_list | yt upload //home/mpfs-stat/storage/deleted_stids_ignore_list
"""
import datetime
import yt.wrapper as yt
from mpfs.core.mrstat.report import Job, parse_args

from mpfs.engine.process import setup_admin_script
setup_admin_script()

from mpfs.config import settings
from mpfs.core.filesystem.cleaner.worker import CheckingStid, DbChecker
from mpfs.core.services.juggler import FindDeletedStidJEvent, JStatus


# Base YT directory under which the per-date tables used by this report live.
YT_BASE_PATH = '//home/mpfs-stat/tmp/mpfs_mulca_clean_intersection/bad_stids'
# Path templates; the single '%s' placeholder is filled with the check date.
# - stids that have to be checked for the given date
YT_STIDS_PATH_TMPL = YT_BASE_PATH + '/%s'
# - cache of stids that were already checked for the given date
YT_STIDS_CACHE_PATH_TMPL = YT_BASE_PATH + '/%s_checked'
# - stids that triggered an alarm for the given date
YT_STIDS_ALARM_PATH_TMPL = YT_BASE_PATH + '/%s_alarm'

# Configure the global YT client from the mrstat settings section.
yt.config['token'] = settings.mrstat['yt_token']
yt.config['proxy']['url'] = settings.mrstat['yt_proxy']
# Sets the client's read retry count to 2.
yt.config['read_retries']['count'] = 2


def get_stids_to_check(check_date):
    """Return the set of ``subkey`` values stored in the table for *check_date*.

    The table path is built from ``YT_STIDS_PATH_TMPL``; every row is expected
    to carry a ``subkey`` field (presumably the stid -- confirm against the
    reducer that produces the table).
    """
    rows = yt.read_table(YT_STIDS_PATH_TMPL % check_date, format="json", raw=False)
    return set(row['subkey'] for row in rows)


def get_already_checked_stids(check_date):
    """Return the set of stids cached as already checked for *check_date*.

    An empty set is returned when the cache table does not exist yet.
    """
    cache_path = YT_STIDS_CACHE_PATH_TMPL % check_date
    if yt.exists(cache_path):
        rows = yt.read_table(cache_path, format="json", raw=False)
        return set(row['stid'] for row in rows)
    return set()


def save_checked_stids(check_date, stids):
    """Write *stids* to the "already checked" cache table for *check_date*.

    Each stid becomes one row of the form ``{'stid': <stid>}``.
    """
    cache_path = YT_STIDS_CACHE_PATH_TMPL % check_date
    rows = [{'stid': stid} for stid in stids]
    yt.write_table(cache_path, rows, format="json")


def save_alarm_stids(check_date, stids):
    """Write *stids* to the alarm table for *check_date*.

    Each stid becomes one row of the form ``{'stid': <stid>}``.
    """
    alarm_path = YT_STIDS_ALARM_PATH_TMPL % check_date
    rows = [{'stid': stid} for stid in stids]
    yt.write_table(alarm_path, rows, format="json")


def is_night_mode():
    """Return True when the current local hour is outside the 10:00-22:59 window."""
    current_hour = datetime.datetime.now().hour
    return current_hour < 10 or current_hour >= 23


def run_monitoring(check_date):
    """Check not-yet-verified stids for *check_date* and report to juggler.

    Stids that are not in the "already checked" cache are passed to
    ``DbChecker``. Those flagged as present in the DB are written to the alarm
    table; the remaining ones are merged into the cache. A juggler event is
    then sent: CRIT when alarm stids were found, except in night mode with at
    most 10 of them, in which case the status stays OK.

    Any exception is reported to juggler as well (CRIT, or OK in night mode)
    and then re-raised.
    """
    try:
        requested = get_stids_to_check(check_date)
        cached = get_already_checked_stids(check_date)
        pending = requested - cached

        found_in_db = set()
        if pending:
            candidates = [CheckingStid(stid) for stid in pending]
            # The checker marks each candidate in place (is_stid_in_db is read below).
            DbChecker().is_stids_in_db(candidates)
            found_in_db = set(c.stid for c in candidates if c.is_stid_in_db)
            if found_in_db:
                save_alarm_stids(check_date, found_in_db)

            clean = pending - found_in_db
            if clean:
                # Alarm stids are deliberately left out of the cache.
                save_checked_stids(check_date, clean | cached)

        # juggler event sending logic
        if not found_in_db:
            status, description = JStatus.OK, 'OK'
        else:
            description = "Find %i deleted stids in DB" % len(found_in_db)
            tolerated = is_night_mode() and len(found_in_db) <= 10
            status = JStatus.OK if tolerated else JStatus.CRIT
        FindDeletedStidJEvent(status, description=description).send()
    except Exception as e:
        description = "Got monitorig exception: %r" % e
        status = JStatus.OK if is_night_mode() else JStatus.CRIT
        FindDeletedStidJEvent(status, description=description).send()
        raise


def custom_src(path):
    """Build map-operation source specs for recent tables found under *path*.

    Lists the YT directory *path* and keeps the tables whose name compares
    (as a string) greater than the local timestamp taken 8 hours ago,
    formatted as ``%Y-%m-%dT%H:%M:%S``.
    NOTE(review): this assumes table names are timestamps in that same format
    and time zone -- confirm against the log directory layout.

    :param path: YT directory containing the log tables.
    :return: tuple of quoted ``'<path>/<table>{{columns}}'`` strings. The
        braces are emitted doubled, presumably so that they survive a later
        ``str.format`` call made by ``Job`` -- confirm.
    :raises ValueError: when no table under *path* is recent enough.
    """
    from_dt = datetime.datetime.now() - datetime.timedelta(hours=8)
    raw_from_dt = from_dt.strftime("%Y-%m-%dT%H:%M:%S")
    # List the directory that was actually requested: the table names are
    # joined with `path` below, so listing any other directory could produce
    # paths to tables that do not exist.
    table_names = [name for name in yt.list(path) if name > raw_from_dt]
    if not table_names:
        raise ValueError("No input tables %s, greater than %s" % (path, raw_from_dt))
    columns = '{{status,iso_eventtime,unixtime,request,source_uri,subkey}}'
    return tuple("'%s/%s%s'" % (path, name, columns) for name in table_names)


class ReportJob(Job):
    """Pipeline definition for the deleted/requested stid intersection report.

    ``operations`` is an ordered tuple of step descriptions (``map``,
    ``reduce``, ``cmd`` and ``publish``) consumed by the ``Job`` base class.
    Placeholders such as ``{yt_home_prefix}`` or ``{first_date}`` are left in
    the strings as-is here (presumably substituted by ``Job`` -- confirm).

    NOTE: ``custom_src(...)`` is called while the class body is executed, so
    the list of input tables is fixed at that moment.
    """
    operations = (
        # Drop log lines whose status is 200, 206 or "result" and feed the
        # rest to the combined_raw_map_mulca mapper of main.py.
        {
            'type': 'map',
            'cmd': r'grep -vP "\tstatus=(200|206|result)" | ./main.py combined_raw_map_mulca',
            # Disabled alternative source, kept for reference:
            #'src': ('//statbox/lenulca-access-log/{date}',),
            'src': custom_src('//logs/lenulca-access-log/30min'),
            'dst': ('{yt_home_prefix}/tmp/{report_name}/map_mulca',),
            'files': ('{mrstat_dir}/{report_name}/main.py',),
        },
        # Reduce the mapped log together with the deleted stids table; the
        # ignore list is attached to the operation as a YT file.
        {
            'type': 'reduce',
            'cmd': './main.py reduce_intersection',
            'src': ('{yt_home_prefix}/tmp/{report_name}/map_mulca',
                    '{yt_home_prefix}/storage/deleted_stids_v2'),
            'dst': ('{yt_home_prefix}/tmp/{report_name}/red',),
            'files': ('{mrstat_dir}/{report_name}/main.py',),
            'yt_files': ('{yt_home_prefix}/storage/deleted_stids_ignore_list',),
        },
        # Extract the intersecting stids: first make sure the destination
        # table exists ...
        {
            'type': 'cmd',
            'func': yt.create,
            'args': ['table', '{yt_home_prefix}/tmp/{report_name}/bad_stids/{first_date}',],
            'kwargs': {
                'recursive': True,
                'ignore_existing': True
            },
        },
        # ... then copy every row that does not contain "\tdummy\t" into it.
        # "|| exit 0" keeps the command successful when grep selects nothing
        # (grep exits non-zero in that case).
        {
            'type': 'map',
            'cmd': 'grep -v "\tdummy\t" || exit 0',
            'src': ('{yt_home_prefix}/tmp/{report_name}/red',),
            'dst': ('{yt_home_prefix}/tmp/{report_name}/bad_stids/{first_date}',),
        },
        # Run the monitoring check.
        {
            'type': 'cmd',
            'func': run_monitoring,
            'args': ['{first_date}'],
            'kwargs': {},
        },
        # Publish the report to stat.
        {
            'type': 'reduce',
            'cmd': './main.py reduce_final',
            'src': ('{yt_home_prefix}/tmp/{report_name}/red',),
            'dst': ('{yt_home_prefix}/results/{report_name}/{scale}/{first_date}',),
            'files': ('{mrstat_dir}/{report_name}/main.py',),
        },
        {
            'type': 'publish',
            'yt_path': '{yt_home_prefix}/results/{report_name}/{scale}/{first_date}',
            'report_path': 'Disk/DiskInternal/MulcaCleanIntersection',
            'report_title': 'Чистка мульки. Мониторинг пересечений.',
            'config_path': '{mrstat_dir}/{report_name}/publish.yaml',
        },
    )


if __name__ == '__main__':
    # Script entry point: parse the command-line arguments and run the job.
    dates, scale, publish_only = parse_args()
    job = ReportJob(scale=scale, dates=dates)
    job.run(publish_only=publish_only)
