#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" METADATA

<crontab>
    time: 27 2,8 * * *
    env: YT_DIRECT_CLUSTER=hahn
    <switchman>
        lockname: archive_yt_logs.py.hahn
        group: scripts-other
    </switchman>
    package: scripts-switchman
</crontab>
<juggler>
    host:   checks_auto.direct.yandex.ru
    name:       scripts.archive_yt_logs.working.hahn
    raw_events: scripts.archive_yt_logs.working.hahn.$log
    vars:       log=ppclog_api,ppclog_cmd,trace_log,price_log,messages_log,bsexport_log,bsexport_response_log
    vars:       log=common_data,campaign_balance,autobudget_prices_log
    ttl:        1d20h
    tag:        direct_yt
    tag: direct_group_internal_systems
</juggler>

<crontab>
    time: 3 3,9 * * *
    env: YT_DIRECT_CLUSTER=arnold
    <switchman>
        lockname: archive_yt_logs.py.arnold
        group: scripts-other
    </switchman>
    package: scripts-switchman
</crontab>
<juggler>
    host:   checks_auto.direct.yandex.ru
    name:       scripts.archive_yt_logs.working.arnold
    raw_events: scripts.archive_yt_logs.working.arnold.$log
    vars:       log=ppclog_api,ppclog_cmd,trace_log,price_log,messages_log,bsexport_log,bsexport_response_log
    vars:       log=common_data,campaign_balance,autobudget_prices_log
    ttl:        1d20h
    tag:        direct_yt
    tag: direct_group_internal_systems
</juggler>


<crontab>
    time: 3 3,9 * * *
    params: bsexport_log
    env: YT_DIRECT_CLUSTER=freud
    <switchman>
        lockname: archive_yt_logs.py.freud
        group: scripts-test
    </switchman>
    package: conf-test-scripts
</crontab>


"""

"""
архивирование по-дневных таблиц с логами в YT
"""

import argparse

import logging, os, sys, re
from datetime import datetime, timedelta

sys.path[0:0] = [os.path.realpath(os.path.join(os.path.dirname(__file__), ".."))]

import settings
from yandex.juggler import juggler_queue_event
from direct.ytutils import compress_yt_table
import direct.tools

import yt.wrapper as yt

# Default value of the --log option.
DEFAULT_LOG_LEVEL = 'warn'
# Default value of --compress-after-days: upper (most recent) bound of the
# date range of tables that get compressed, in days before today.
DEFAULT_COMPRESS_AFTER_DAYS = 3
# Default value of --skip-after-days: lower (oldest) bound of that date range,
# in days before today; older tables are not touched.
DEFAULT_SKIP_AFTER_DAYS = 365

# Size multipliers; GB is used for the per-log size settings in LOGS.
MB=1024 * 1024
GB=1024 * MB

# Per-log archiving settings, keyed by log name (the names accepted on the
# command line and used in the Juggler event name).
#   'dir'     - YT directory that holds the daily tables of the log;
#   'sort_by' - columns passed to compress_yt_table as sort_by;
#   'data_size_per_job', 'desired_chunk_size', 'optimize_for' - optional
#               settings passed to compress_yt_table (None when absent).
LOGS = {
    'ppclog_api': {
        'dir': 'logs/ppclog_api',
        # this unusual sort order gives better compression (2x compared to sorting by log_time)
        'sort_by': ['cmd', 'uid', 'application_id', 'log_time'],
    },
    'ppclog_cmd': {
        'dir': 'logs/ppclog_cmd',
        'sort_by': ['cmd', 'uid', 'log_time'],
    },
    'trace_log': {
        'dir': 'logs/trace_log',
        'sort_by': ['service', 'method', 'log_time'],
    },
    'price_log': {
        'dir': 'logs/price_log',
        'sort_by': ['cid', 'log_time'],
    },
    'messages_log': {
        'dir': 'logs/messages_log',
        'sort_by': ['service', 'method', 'host', 'span_id', 'log_time'],
    },
    'common_data': {
        'dir': 'logs/common_data',
        'sort_by': ['log_type', 'log_time'],
    },
    'campaign_balance': {
        'dir': 'logs/campaign_balance',
        'sort_by': ['log_time'],
    },
    # the only log that overrides the job/chunk size and optimize_for settings
    'bsexport_log': {
        'dir': 'logs/bsexport_log',
        'sort_by': ['cid', 'pid', 'log_time'],
        'data_size_per_job': 30 * GB,
        'desired_chunk_size': 30 * GB,
        'optimize_for': 'scan',
    },
    'bsexport_response_log': {
        'dir': 'logs/bsexport_response_log',
        'sort_by': ['cid', 'pid', 'log_time'],
    },
    'autobudget_prices_log': {
        'dir': 'logs/autobudget_prices_log',
        'sort_by': ['GroupExportID', 'PhraseID', 'ContextType', 'log_time'],
    },
}

def juggler_ok(log_name):
    """Queue an 'OK' Juggler event for the given log.

    The event name is ``scripts.archive_yt_logs.working.<cluster>.<log_name>``,
    where ``<cluster>`` comes from the ``YT_DIRECT_CLUSTER`` environment
    variable ('prod' when the variable is not set).
    """
    cluster = os.environ.get('YT_DIRECT_CLUSTER', 'prod')
    event_name = '.'.join(['scripts.archive_yt_logs.working', cluster, log_name])
    description = 'successfuly processed log: ' + log_name
    juggler_queue_event(event_name, 'OK', description)

def process_log(args, log_name):
    """Compress the daily tables of one log and queue an 'OK' Juggler event.

    The tables are listed from the YT directory configured for ``log_name``
    in ``LOGS``. A table is passed to ``compress_yt_table`` when its name
    starts with a ``YYYY-MM-DD`` date and compares, as a string, between the
    dates ``args.skip_after_days`` and ``args.compress_after_days`` days
    before today (both bounds inclusive).

    If the directory does not exist, a warning is logged and the function
    returns without queueing the Juggler event.

    Args:
        args: parsed command-line options; ``skip_after_days``,
            ``compress_after_days`` and ``force`` are read.
        log_name: key of ``LOGS``.

    Raises:
        KeyError: if ``log_name`` is not a key of ``LOGS``.
    """
    desc = LOGS[log_name]
    log_dir = desc['dir']

    if not yt.exists(log_dir):
        # logging.warn is a deprecated alias (removed in Python 3.13)
        logging.warning("no log dir - skip, %s", log_dir)
        return

    # Take "today" once so that both bounds are derived from the same moment.
    today = datetime.today()
    from_date = (today - timedelta(args.skip_after_days)).strftime("%Y-%m-%d")
    to_date = (today - timedelta(args.compress_after_days)).strftime("%Y-%m-%d")

    # Dates are zero-padded ISO strings, so string comparison orders them
    # chronologically.
    # NOTE(review): the second positional argument of yt.list is presumably
    # the maximum number of returned entries - confirm against yt.wrapper.
    for tbl in sorted(yt.list(log_dir, 100000)):
        if re.match(r'^\d{4}-\d{2}-\d{2}', tbl) and from_date <= tbl <= to_date:
            compress_yt_table(log_dir + '/' + tbl,
                              force=args.force,
                              data_size_per_job=desc.get('data_size_per_job'),
                              desired_chunk_size=desc.get('desired_chunk_size'),
                              optimize_for=desc.get('optimize_for'),
                              sort_by=desc.get('sort_by'),
                              )
    juggler_ok(log_name)

def argparser():
    """Build the command-line parser of the script.

    Options:
        --log: log level name (default ``DEFAULT_LOG_LEVEL``).
        --compress-after-days: int, default ``DEFAULT_COMPRESS_AFTER_DAYS``.
        --skip-after-days: int, default ``DEFAULT_SKIP_AFTER_DAYS``.
        --force: flag, force compression.
        log_names: optional trailing list of log names (keys of ``LOGS``).
            Because the argument uses ``nargs=argparse.REMAINDER``, everything
            after the first log name is treated as a log name, so options
            must be given before the log names.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    def known_log_name(value):
        # argparse does not check ``choices`` for nargs=REMAINDER arguments,
        # so each name is validated here; otherwise an unknown name would
        # only surface as a KeyError on LOGS[...] in the middle of a run.
        if value not in LOGS:
            raise argparse.ArgumentTypeError(
                'invalid choice: %r (choose from %s)' % (value, ', '.join(sorted(LOGS))))
        return value

    parser = argparse.ArgumentParser(description='Archive log tables in YT')
    parser.add_argument('--log', type=str, default=DEFAULT_LOG_LEVEL,
                        help='log level, default: '+DEFAULT_LOG_LEVEL)

    parser.add_argument('--compress-after-days', type=int, default=DEFAULT_COMPRESS_AFTER_DAYS,
                        help='after how many days compress tables, default - %s' % DEFAULT_COMPRESS_AFTER_DAYS)
    parser.add_argument('--skip-after-days', type=int, default=DEFAULT_SKIP_AFTER_DAYS,
                        help='after how many days do not compress tables, default - %s' % DEFAULT_SKIP_AFTER_DAYS)
    parser.add_argument('--force', action='store_true',
                        help='force compression')

    # ``choices`` is kept so that the usage/help output lists the valid names;
    # the actual validation is done by ``known_log_name``.
    parser.add_argument('log_names', type=known_log_name, nargs=argparse.REMAINDER,
                        choices=sorted(LOGS),
                        help='directories with logs to be processed')

    return parser


if __name__ == '__main__':
    # Script entry point: parse options, configure logging, then process the
    # requested logs one after another.
    args = argparser().parse_args()

    direct.tools.set_logging(loglevel=args.log.upper(), add_info=os.environ.get('YT_PROXY', '-'))
    # Progress messages use WARNING, the default --log level ('warn').
    # logging.warning replaces the deprecated logging.warn alias
    # (removed in Python 3.13).
    logging.warning("start")

    # No log names on the command line means "process every configured log".
    for log_name in args.log_names or sorted(LOGS.keys()):
        logging.warning("start process %s", log_name)
        process_log(args, log_name)

    logging.warning("finish")
