#!/usr/bin/env python
# -*- encoding: utf-8 -*-

""" METADATA

<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: logapi
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: logcmd
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: trace
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: messages
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: common_data
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: pricelog
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: bsexport_response
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=hahn
    time: */15 * * * *
    params: uaas_data
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<juggler>
    host:       checks_auto.direct.yandex.ru
    name:       scripts.yt_logbroker_parse_merge.working.hahn
    raw_events: scripts.yt_logbroker_parse_merge.working.hahn.$merger
    vars:       merger=logapi,logcmd,trace,messages,common_data,pricelog,bsexport_response,uaas_data
    ttl:        2h
    tag:        direct_yt
    tag: direct_group_internal_systems
    <notification>
        template: on_status_change
        status: OK
        status: CRIT
        method: telegram
        login: DISMonitoring
    </notification>
</juggler>

<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: logapi
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: logcmd
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: trace
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: messages
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: common_data
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: pricelog
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: bsexport_response
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<crontab>
    env: YT_DIRECT_CLUSTER=arnold
    time: */15 * * * *
    params: uaas_data
    <switchman>
        group: scripts-other
        <lockname_with_env/>
    </switchman>
    package: scripts-switchman
</crontab>
<juggler>
    host:       checks_auto.direct.yandex.ru
    name:       scripts.yt_logbroker_parse_merge.working.arnold
    raw_events: scripts.yt_logbroker_parse_merge.working.arnold.$merger
    vars:       merger=logapi,logcmd,trace,messages,common_data,pricelog,bsexport_response,uaas_data
    ttl:        2h
    tag:        direct_yt
    tag: direct_group_internal_systems
    <notification>
        template: on_status_change
        status: OK
        status: CRIT
        method: telegram
        login: DISMonitoring
    </notification>
</juggler>

"""

"""
Финальная стадия выгрузки логов в yt из logbroker (первая - заливка в logbroker).
Пример отладочного запуска:
export YT_PROXY=hahn.yt.yandex.net;
export YT_TOKEN_PATH=~/.yt/token;

./python/scripts/yt_logbroker_parse_merge.py --try-path='//home/direct/tmp/smagellan' --chunk-size=2 --time-gap=700 messages
"""

import argparse
import sys
from collections import Counter, OrderedDict, defaultdict
import os, os.path
import logging
import time
from datetime import datetime
from datetime import timedelta

sys.path[0:0] = [os.path.realpath(os.path.join(os.path.dirname(__file__), ".."),)]

import settings
from yandex.utils import chunks, dict_merge
from yandex.juggler import juggler_queue_event

from direct.ytutils import check_create_yt_tbl, run_simple_operation, yt_schema
import direct.tools

import yt.wrapper as yt

# Turn off the progress bar setting of the YT client for writes.
yt.config["write_progress_bar"]["enable"] = False

# Script-wide defaults. NOTE(review): presumably the defaults of the
# command-line options (--chunk-size, --time-gap, log level) parsed further
# down in the file - confirm against the argument parser.
DEFAULT_LOG_LEVEL = 'info'
DEFAULT_CHUNK_SIZE = 200
DEFAULT_TIME_GAP = 700

# Base operation spec passed (directly or merged with extra keys through
# dict_merge) to the YT map/sort/merge operations started in this file.
YT_OP_SPEC_BASE = {
    "max_failed_job_count": 2,
    "weight": 1.5
}


def juggler_ok(merger_nick):
    """
    Queue an OK juggler event for the given merger.

    The event name is
    scripts.yt_logbroker_parse_merge.working.<cluster>.<merger_nick>, where
    <cluster> is the value of the YT_DIRECT_CLUSTER environment variable
    ('prod' when the variable is not set).
    """
    cluster = os.environ.get('YT_DIRECT_CLUSTER', 'prod')
    event_name = '.'.join(('scripts.yt_logbroker_parse_merge.working', cluster, merger_nick))
    description = 'LogBroker chunks successfuly merged for ' + merger_nick
    juggler_queue_event(event_name, 'OK', description)


class SyslogParser(object):
    """
    Parser for syslog lines.

    All modules are imported inside the methods so that the code can be
    executed in YT through run_simple_operation.
    """
    def __init__(self):
        import re
        # syslog format https://tools.ietf.org/html/draft-ietf-syslog-protocol-23#page-8
        self.re = re.compile(r"""^\s* <\d+>\d+    # pri version
                               \s \d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d[+-]\d\d:\d\d    # timestamp
                               \s (?P<host>\S+)    # host
                               \s (?P<type>\S+)    # app-name (logtype)
                               \s \d+              # pid
                               \s \S+              # msgid
                               \s \S+              # structured data id
                               \s (?P<date>\d\d\d\d-\d\d-\d\d)  # date
                               \s (?P<time>\d\d:\d\d:\d\d)(?:\.\d+)? # time
                               \s (?P<json>\{.*\}) # json
                               """, re.X)

    def parse(self, line, key=None):
        """
        Parse one syslog line whose message is "<date> <time> <json object>".

        Returns None for an empty line. Otherwise returns the decoded json
        object extended with 'log_type' (syslog app-name), 'log_time'
        ("<date> <time>" taken from the message) and 'log_hostname'.
        Raises Exception when the line does not match the expected format.
        The key argument is accepted for interface compatibility with the
        other parsers and is not used.
        """
        import json
        if line == '':
            return None

        matched = self.re.match(line)
        if matched is None:
            raise Exception("string is not matched to regexp")

        record = json.loads(matched.group('json'))
        record['log_type'] = matched.group('type')
        record['log_time'] = matched.group('date') + ' ' + matched.group('time')
        record['log_hostname'] = matched.group('host')
        return record


class JsonParser(object):
    """
    Parser for json.log files: every line is a json document.

    log_type, when not None, is assigned to every parsed record; when None
    the type is taken from the record itself or from the chunk key.
    """
    def __init__(self, log_type):
        import re
        # key format: "prt://<...><file name>[.<digits>] YYYY-MM-DD HH:MM:SS";
        # group 1 is the file name, group 3 is the timestamp
        self.key_rx = re.compile(r'^prt://.+?([a-zA-Z0-9\-\._]+?)(\.\d+)? (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)$')
        self.log_type = log_type

    def parse(self, line, key=None):
        """
        Parse one json line.

        Returns None for an empty line, otherwise a dict with:
          'log_type'  - self.log_type, or (when it is None) the 'log_type'
                        field of the json object, or the file name from key;
          'log_time'  - timestamp extracted from key, None when key is
                        missing or does not match the expected format;
          'json_data' - the decoded json document.
        Raises Exception when the log type cannot be determined; json
        decoding errors propagate as raised by json.loads.
        """
        import json
        if line.endswith("\\n"):
            # sometimes the line ends with a literal \n (backslash + n);
            # these two characters break json parsing and have to be cut off
            line = line[:-2]
        if line == '':
            return None
        json_data = json.loads(line)

        # key is optional (defaults to None): searching a None key would raise
        # TypeError, so treat a missing key as "no match"
        match = self.key_rx.search(key) if key else None
        log_time = match.group(3) if match else None

        if self.log_type is not None:
            ltype = self.log_type
        elif isinstance(json_data, dict) and 'log_type' in json_data:
            ltype = json_data['log_type']
        elif match:
            ltype = match.group(1)
        else:
            raise Exception("can't determine log_type")

        return {
            'log_type': ltype,
            'log_time': log_time,
            'json_data': json_data
        }


class RawParser(object):
    """
    Parser that passes every line through unchanged.
    """
    def __init__(self, log_type):
        self.log_type = log_type

    def parse(self, line, key=None):
        """
        Wrap a non-empty line into {'log_type': ..., 'data': line}.

        Returns None for an empty line; key is not used.
        """
        if line != '':
            return {
                'log_type': self.log_type,
                'data': line
            }
        return None


class SyslogOrPrefixedJsonParser(object):
    """
    Try to parse the line as syslog first; when that fails, assume the
    format "YYYY-MM-DD HH:MI:SS JSON".
    """
    def __init__(self, log_type):
        import re
        self.log_type = log_type
        self.syslog_parser = SyslogParser()
        self.json_line_rx = re.compile(r'^(\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d) (.*)$')
        self.log_host_rx = re.compile(r'@([a-zA-Z0-9\-\.]+)/')

    def parse(self, line, key=None):
        """
        Parse one line and return the decoded json object extended with
        'log_type', 'log_time' and 'log_hostname'.

        For the prefixed-json format the host name is extracted from key
        (the part between '@' and '/'); it is None when key is empty or has
        no such part. Raises Exception("Unsupported line format") when
        neither format matches.
        """
        try:
            return self.syslog_parser.parse(line, key)
        except Exception:
            # deliberate: any syslog parsing failure means "try the second format"
            pass

        import json
        prefixed = self.json_line_rx.search(line)
        if prefixed is None:
            raise Exception("Unsupported line format")

        hostname = None
        if key:
            host_found = self.log_host_rx.search(key)
            if host_found:
                hostname = host_found.group(1)

        record = json.loads(prefixed.group(2))
        record['log_type'] = self.log_type
        record['log_time'] = prefixed.group(1)
        record['log_hostname'] = hostname
        return record


class LbMerger(object):
    """
    Основная работа
    """
    try_path = None

    def __init__(self, chunks_dir, parser, types, default_type=None, dsv_escaped_value=True):
        """
        chunks_dir        - YT directory with LogBroker chunk tables
        parser            - object with parse(line, key=...) used for every log line
        types             - result handlers; each one is looked up by its log_type
        default_type      - handler for log types missing from types (may be None)
        dsv_escaped_value - whether chunk values have to be dsv-unescaped before parsing
        """
        self.chunks_dir = chunks_dir
        self.parser = parser
        self.default_type = default_type
        self.dsv_escaped_value = dsv_escaped_value
        self.types = types
        self.types_map = dict((handler.log_type, handler) for handler in types)
        self.fmt = yt.YsonFormat('binary')


    def _find_chunks(self, d, time_gap):
        """
        Return full paths of the chunk tables in directory d that are ready
        for processing.

        Skipped: locked tables (unless try_path is set), the "errors" table,
        tables with a non-numeric name, and tables whose numeric name is
        greater than (current unix time - time_gap). Returns an empty list
        when the directory does not exist.
        """
        if not yt.exists(d):
            logging.warn("directory %s does not exists" % d)
            return []

        ready = []
        for node in yt.list(d, attributes=['locks']):
            name = str(node)
            if node.attributes["locks"] and self.try_path is None:
                logging.warn("skip locked table: %s" % name)
                continue
            if name == "errors":
                continue
            if not name.isdigit():
                logging.warn("skip strange table: %s" % name)
                continue
            if int(node) > time.time() - time_gap:
                logging.warn("skip too young table: %s" % name)
                continue
            ready.append(d + '/' + node)
        return ready


    def process(self, chunk_size, time_gap):
        """
        Process all ready chunk tables in batches of chunk_size tables, each
        batch in its own YT transaction.

        When try_path is set only the first batch is processed.
        """
        lb_tables = sorted(self._find_chunks(self.chunks_dir, time_gap))

        for batch in chunks(lb_tables, chunk_size):
            with yt.Transaction():
                self.process_chunks(batch)
            if self.try_path is not None:
                break


    @yt.aggregator
    @yt.raw_io
    def _mapper(self):
        """
        Mapper body: parse LogBroker chunks and emit records, stats and errors.

        Sometimes the mapper breaks on malformed unicode characters; raw_io is
        used to keep control over serialization and to skip broken lines.

        Rows are read from stdin with self.fmt; every row has a 'key' and a
        'value' that holds many newline-separated log lines. Output tables by
        index:
          0 - normalized records;
          1 - counters per (date, log_type), written once after all input;
          2 - errors ('line-type', 'line-parse', 'row-parse').
        """
        from collections import Counter
        import yt.wrapper as yt
        import traceback
        import sys
        import os
        import json
        from datetime import datetime

        # the maximum nesting depth in Yson is 64, keep a small margin
        MAX_DEPTH = 62

        # statistics: (date, type) -> number of rows
        stats = Counter()
        dsv = yt.DsvFormat()

        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        def _out(table_index, row):
            # output table i is written through file descriptor 1 + 3*i
            os.write(1+3*table_index, self.fmt.dumps_row(row))

        def depth(el):
            # nesting depth of dicts/lists; scalars and empty containers count as 1
            if not el:
                return 1
            if isinstance(el, dict):
                return 1 + max(depth(v) for v in el.values())
            if isinstance(el, list):
                return 1 + max(depth(v) for v in el)
            return 1

        for row in self.fmt.load_rows(sys.stdin):
            # one row is a chunk of many lines
            try:
                # the json is escaped by dsv rules, so unescape it
                lines_str = row['value']
                if self.dsv_escaped_value:
                    lines_str = lines_str.replace('\t', ' ')
                    lines_str = dsv.loads_row("v="+lines_str)['v']
                for line in lines_str.split('\n'):
                    try:
                        data = self.parser.parse(line, key=row['key'])
                        if data is None:
                            continue
                        normalizer = self.types_map.get(data['log_type'], self.default_type)
                        if normalizer:
                            for rec in normalizer.normalize(data):
                                if depth(rec) > MAX_DEPTH:
                                    # deeply nested structures sent by malicious clients
                                    # make the write to the output table fail
                                    raise Exception("Depth limit exceeded")
                                _out(0, rec)
                                stats[(rec['log_time'][0:10], rec['log_type'])] += 1
                        else:
                            _out(2, {'datetime': now, 'type': 'line-type', 'msg': 'Unsupported type: ' + data['log_type'], 'key': row['key'], 'value': line})
                    except Exception:
                        # format_exc() takes an optional depth limit, not the exception:
                        # it always formats the exception being handled
                        _out(2, {'datetime': now, 'type': 'line-parse', 'msg': traceback.format_exc(), 'value': line})
            except Exception:
                _out(2, {'datetime': now, 'type': 'row-parse', 'msg': traceback.format_exc(), 'value': json.dumps(row)[:100000]})

        for (date, stat_type), cnt in sorted(stats.items()):
            _out(1, {'date': date, 'type': stat_type, 'cnt': cnt})


    def process_chunks(self, lb_chunks):
        """
        Process one batch of LogBroker chunk tables.

        Steps: map the chunks into temporary result/stats/errors tables, sort
        the result by (log_type, log_time), move the data to the per-date log
        tables of the matching handlers, append the errors (if any) to the
        errors table, remove the source chunks (only when try_path is not
        set) and finally remove the temporary tables.
        """
        yt_result_tbl = yt.create_temp_table()
        yt_stats_tbl = yt.create_temp_table()
        yt_errors_tbl = yt.create_temp_table()
        logging.warn("start process lb chunks: " + ','.join(lb_chunks))

        logging.warn("start map")
        map_spec = dict_merge(YT_OP_SPEC_BASE, {
            "mapper": {"memory_limit": 2 * 1024 * 1024 * 1024},
        })
        run_simple_operation(
            yt.run_map,
            self._mapper,
            lb_chunks,
            [yt_result_tbl, yt_stats_tbl, yt_errors_tbl],
            format=self.fmt,
            table_writer={"max_row_weight": 128 * 1024 * 1024},
            spec=map_spec,
        )

        logging.warn("start sort")
        yt.run_sort(yt_result_tbl, sort_by=['log_type', 'log_time'], spec=YT_OP_SPEC_BASE)

        agg_stats = self.collect_agg_stats(yt_stats_tbl)
        logging.warn("aggregated stats: %s" % str(agg_stats))

        # group log types by (destination path, date): several types may share one handler
        handler_by_path = {}
        types_by_target = defaultdict(list)
        for day, log_type in agg_stats:
            handler = self.types_map.get(log_type, self.default_type)
            handler_by_path[handler.path] = handler
            types_by_target[(handler.path, day)].append(log_type)

        for (path, day), log_types in types_by_target.items():
            self.move_log_data(yt_result_tbl, day, log_types, handler_by_path[path])

        if yt.get_attribute(yt_errors_tbl, "row_count") > 0:
            yt.run_sort(yt_errors_tbl, sort_by=['datetime'], spec=YT_OP_SPEC_BASE)
            yt.run_merge(
                source_table=[self.errors_table(), yt_errors_tbl],
                destination_table=self.errors_table(),
                mode="sorted",
                spec=YT_OP_SPEC_BASE,
            )

        if self.try_path is None:
            for lb_table in lb_chunks:
                logging.warn("remove chunk: %s" % lb_table)
                yt.remove(lb_table)

        for tmp_tbl in (yt_result_tbl, yt_stats_tbl, yt_errors_tbl):
            logging.warn("remove table: %s" % tmp_tbl)
            yt.remove(tmp_tbl)


    def collect_agg_stats(self, yt_stats_tbl):
        """
        Merge the stats table in place (with combine_chunks) and return a
        Counter mapping (date, type) to the total row count.
        """
        logging.warn("merge stats table")
        merge_spec = dict_merge(YT_OP_SPEC_BASE, {'combine_chunks': True})
        yt.run_merge(yt_stats_tbl, yt_stats_tbl, spec=merge_spec)

        totals = Counter()
        for record in yt.read_table(yt_stats_tbl, raw=False):
            totals[(record['date'], record['type'])] += record['cnt']
        return totals


    def move_log_data(self, yt_result_tbl, date, types, result):
        """
        Append the rows of the given log types and date from yt_result_tbl to
        the log table of the result handler.

        An unsorted log table gets the rows through one unordered merge. For
        a sorted log table the rows are first collected into a temporary
        table, sorted by the log table's sorted_by columns and then merged in
        "sorted" mode. Afterwards the data_modification_time attribute of the
        log table is set to the current local time.
        """
        log_table = result.log_table(date, self.try_path)
        # key ranges from (log_type, date) up to (log_type, date + 'z')
        slices = [
            yt.TablePath(yt_result_tbl, lower_key=(log_type, date), upper_key=(log_type, date+'z'))
            for log_type in types
        ]

        logging.info("move data to %s" % log_table)
        attrs = yt.get(log_table + "/@")
        if attrs['sorted']:
            tmp_tbl = yt.create_temp_table()
            logging.info("unordered merge %s -> %s" % (str(slices), tmp_tbl))
            yt.run_merge(
                source_table=slices,
                destination_table=tmp_tbl,
                mode="unordered",
                spec=YT_OP_SPEC_BASE,
            )
            logging.info("sort %s" % (tmp_tbl))
            yt.run_sort(tmp_tbl, sort_by=attrs['sorted_by'], spec=YT_OP_SPEC_BASE)
            logging.info("sorted merge %s, %s -> %s" % (log_table, tmp_tbl, log_table))
            yt.run_merge(
                source_table=[log_table, tmp_tbl],
                destination_table=log_table,
                mode="sorted",
                spec=YT_OP_SPEC_BASE,
            )
            yt.remove(tmp_tbl)
        else:
            logging.info("unordered merge %s, %s -> %s" % (log_table, str(slices), log_table))
            yt.run_merge(
                source_table=[log_table] + slices,
                destination_table=log_table,
                mode="unordered",
                spec=YT_OP_SPEC_BASE,
            )

        modified_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        yt.set_attribute(log_table, 'data_modification_time', modified_at)


    def errors_table(self):
        """
        Return the path of the errors table, creating the table (compressed,
        sorted by 'datetime') when it does not exist yet.

        The table is <chunks_dir>/errors, prefixed with <try_path>/ when
        try_path is set.
        """
        parts = [self.chunks_dir, "errors"]
        if self.try_path is not None:
            parts.insert(0, self.try_path)
        table = "/".join(parts)

        if not yt.exists(table):
            logging.warn("create table %s" % table)
            check_create_yt_tbl(table, compressed=True, sort_by=('datetime', ))
        return table


def set_schema(src_table, schema):
    """
    Convenient way to set a schema manually on an old table:
    > import yt_logbroker_parse_merge
    > yt_logbroker_parse_merge.set_schema("//home/direct/logs/ppclog_cmd/2017-10-17", yt_logbroker_parse_merge.LogcmdResult.schema)

    Inside one transaction the data is merged ("sorted" mode) into a new
    compressed table <src_table>.tmp created with the given schema and the
    source table's sorted_by, then the new table replaces the source one.
    """
    attrs = yt.get(src_table + "/@")
    tmp_table = src_table + ".tmp"
    del_table = src_table + ".to_delete"
    with yt.Transaction():
        check_create_yt_tbl(
            tmp_table,
            sort_by=attrs.get("sorted_by"),
            compressed=True,
            schema=schema,
        )
        yt.run_merge(
            source_table=[src_table],
            destination_table=tmp_table,
            mode="sorted",
            spec=YT_OP_SPEC_BASE,
        )
        # swap: old table aside, new table into place, then drop the old one
        yt.move(src_table, del_table)
        yt.move(tmp_table, src_table)
        yt.remove(del_table)


class AbstractResult(object):
    """
    Base class for the processing rules of one log type: how to normalize
    the parsed data and which table to put it into.

    Subclasses override the class attributes and normalize().
    """
    MAX_INT64 = 2 ** 63 - 1
    MIN_INT64 = -(2 ** 63)

    # log type handled by the subclass
    log_type = None
    # directory of the per-date log tables
    path = None
    # passed to check_create_yt_tbl when a log table is created
    compressed = False
    schema = None
    optimize_for = 'lookup'
    # lifetime of a log table in days, counted from the table's date;
    # None - no expiration time is set
    ttl = None

    def normalize(self, row):
        """
        Generator of output records for one parsed row.

        The base implementation yields nothing.
        """
        for _ in []:
            yield None

    def log_table(self, date, try_path):
        """
        Return the path of the log table for the given date
        (<path>/<date>, prefixed with <try_path>/ when try_path is not None),
        creating the table when it does not exist.

        date is a 'YYYY-MM-DD' string; it is parsed only when ttl is set, to
        compute the expiration time of the new table.
        """
        log_table = self.path + '/' + date
        if try_path is not None:
            log_table = try_path + '/' + log_table
        if not yt.exists(log_table):
            logging.warn("create table %s" % log_table)
            expiration_time = None
            if self.ttl is not None:
                expires = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=self.ttl)
                expiration_time = expires.strftime("%Y-%m-%d %H:%M:%S")
            check_create_yt_tbl(log_table,
                                compressed=self.compressed,
                                schema=self.schema,
                                optimize_for=self.optimize_for,
                                expiration_time=expiration_time,
                                )
        return log_table

    def int64(self, s):
        """
        Convert s to an integer and check that it fits into a signed 64-bit
        value.

        Raises Exception("Incorrect int value: ...") when the value is out of
        range; errors of int() itself propagate unchanged.
        """
        i = int(s)
        if i > self.MAX_INT64 or i < self.MIN_INT64:
            # s may be a number (e.g. decoded from json), so format it
            # instead of concatenating it to the message
            raise Exception("Incorrect int value: %s" % (s,))
        return i


class LogapiResult(AbstractResult):
    """
    Processing rules for the logs of one particular type: how to normalize
    the data and where to put it.
    """
    log_type = 'PPCLOG.ppclog_api.log'
    path = "logs/ppclog_api"
    optimize_for = 'scan'
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},
        {"name": 'log_hostname', "type": 'string'},
        {"name": 'host',         "type": 'string'},

        {"name": 'reqid',  "type": 'int64'},
        {"name": 'runtime',  "type": 'double'},
        {"name": 'sleeptime',  "type": 'double'},
        {"name": 'cmd',  "type": 'string'},
        {"name": 'interface',  "type": 'string'},
        {"name": 'ip',  "type": 'string'},
        {"name": 'http_status',  "type": 'int64'},
        {"name": 'error_detail',  "type": 'string'},

        {"name": 'uid',  "type": 'int64'},
        {"name": 'cid',  "type": 'any'},
        {"name": 'cluid',  "type": 'any'},

        {"name": 'application_id',  "type": 'string'},
        {"name": 'api_version',  "type": 'int64'},
        {"name": 'error_object_count',  "type": 'int64'},
        {"name": 'warning_object_count',  "type": 'int64'},

        {"name": 'units',  "type": 'int64'},
        {"name": 'units_stats',  "type": 'string'},
        {"name": 'units_spending_user_client_id',  "type": 'int64'},

        {"name": 'param',  "type": 'any'},
        {"name": 'response',  "type": 'any'},
        {"name": 'response_ids',  "type": 'any'},
        ))

    def normalize(self, row):
        """
        Normalize one parsed record in place and yield it.

        Drops 'bid', 'logtime' and 'proc_id'; casts the present non-None
        scalar fields to str/int/float; turns 'cid' and 'cluid' into lists of
        ints (missing, empty or None becomes []); decodes 'param' and
        'response' from json when they are strings starting with '{'.
        """
        import json

        for name in ('bid', 'logtime', 'proc_id'):
            row.pop(name, None)

        for name in ('units_stats',):
            if row.get(name) is not None:
                row[name] = str(row[name])

        for name in ('api_version', 'http_status', 'reqid', 'uid', 'units', 'units_spending_user_client_id', 'error_object_count', 'warning_object_count'):
            if row.get(name) is not None:
                row[name] = int(row[name])

        for name in ('cid', 'cluid'):
            value = row.get(name)
            if value is None or value == '':
                row[name] = []
            elif isinstance(value, list):
                row[name] = [int(item) for item in value if item is not None]
            elif isinstance(value, (int, long)):
                row[name] = [value]
            else:
                # comma-separated list of ids
                row[name] = [int(item) for item in value.split(',')]

        for name in ('sleeptime', 'runtime'):
            if row.get(name) is not None:
                row[name] = float(row[name])

        for name in ('param', 'response'):
            value = row.get(name)
            if isinstance(value, basestring) and value.startswith('{'):
                row[name] = json.loads(value)

        yield row


class LogcmdResult(AbstractResult):
    """
    Processing rules for PPCLOG.ppclog_cmd.log records.
    """
    log_type = 'PPCLOG.ppclog_cmd.log'
    path = "logs/ppclog_cmd"
    optimize_for = 'scan'
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},
        {"name": 'log_hostname', "type": 'string'},
        {"name": 'host',         "type": 'string'},

        {"name": 'reqid',  "type": 'int64'},
        {"name": 'runtime',  "type": 'double'},
        {"name": 'cpu_user_time',  "type": 'double'},
        {"name": 'service',  "type": 'string'},
        {"name": 'cmd',  "type": 'string'},
        {"name": 'ip',  "type": 'string'},
        {"name": 'yandexuid',  "type": 'string'},
        {"name": 'http_status',  "type": 'int64'},

        {"name": 'uid',  "type": 'int64'},
        {"name": 'role',  "type": 'string'},
        {"name": 'cid',  "type": 'any'},
        {"name": 'cluid',  "type": 'any'},

        {"name": 'param',  "type": 'any'},
        ))

    def normalize(self, row):
        """
        Normalize one parsed record in place and yield it.

        Drops 'bid', 'pid', 'proc_id' and 'logtime'; turns 'cid' and 'cluid'
        into lists of ints (missing, empty or None becomes []); casts the
        present non-None scalar fields to int/float/str.
        """
        for name in ('bid', 'pid', 'proc_id', 'logtime'):
            row.pop(name, None)

        for name in ('cid', 'cluid'):
            value = row.get(name)
            if value is None or value == '':
                row[name] = []
            elif isinstance(value, list):
                row[name] = [int(item) for item in value if item is not None]
            elif isinstance(value, (int, long)):
                row[name] = [value]
            else:
                # comma-separated list of ids
                row[name] = [int(item) for item in value.split(',')]

        for name in ('uid', 'http_status', 'reqid'):
            if row.get(name) is not None:
                row[name] = int(row[name])

        for name in ('cpu_user_time', 'runtime'):
            if row.get(name) is not None:
                row[name] = float(row[name])

        for name in ('yandexuid',):
            if row.get(name) is not None:
                row[name] = str(row[name])

        yield row


class TraceResult(AbstractResult):
    """
    Processing rules for trace.log records (only format version 3 is
    supported).
    """
    log_type = 'trace.log'
    path = "logs/trace_log"
    optimize_for = 'scan'
    schema = yt_schema(False, (
        {"name": 'format_id',  "type": 'int64'},

        {"name": 'trace_id',  "type": 'int64'},
        {"name": 'parent_id', "type": 'int64'},
        {"name": 'span_id',   "type": 'int64'},

        {"name": 'chunk_index', "type": 'int64'},
        # for some reason, in the yson version in use boolean turns into string ;-(
        #{"name": 'chunk_last',  "type": 'boolean'},

        {"name": 'log_hostname', "type": 'string'},
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},

        {"name": 'service',  "type": 'string'},
        {"name": 'method',   "type": 'string'},
        {"name": 'tags',     "type": 'string'},

        {"name": 'span_time', "type": 'double'},
        {"name": 'times',     "type": 'any'},

        {"name": 'samplerate', "type": 'int64'},
        {"name": 'pid',        "type": 'int64'},

        {"name": 'profile',     "type": 'any'},
        {"name": 'annotations', "type": 'any'},
        {"name": 'marks',       "type": 'any'},
        {"name": 'services',    "type": 'any'},
        ))

    # positional layout of a v3 trace record
    trace_v3_fields = ("format_id", "log_time",
                       "log_hostname", "pid",
                       "service", "method", "tags",
                       "trace_id", "parent_id", "span_id",
                       "chunk_index", "chunk_last",
                       "span_time", "samplerate",
                       "data")

    def normalize(self, row):
        """
        Turn the positional trace array from row['json_data'] into named
        fields of row and yield the row.

        The last element ('data') is expanded into 'times', 'profile',
        'services', 'marks' and 'annotations'. Raises Exception for a format
        version other than 3.
        """
        trace = row.pop("json_data")
        for position, field in enumerate(self.trace_v3_fields):
            row[field] = trace[position]

        version = str(row['format_id'])
        if version != '3':
            # parse only v3
            raise Exception("unsupported trace version "+version)

        data = row.pop("data")

        row['times'] = dict((name, float(value)) for name, value in data['times'].items())

        row['profile'] = [
            {
                'func': func,
                'tags': '' if tags is None else str(tags),
                'all_ela': float(all_ela),
                'childs_ela': float(childs_ela),
                'calls': self.int64(calls),
                'obj_num': int(obj_num)
            }
            for func, tags, all_ela, childs_ela, calls, obj_num in data['profile']
        ]

        row['services'] = [
            {
                'service': str(service),
                'method': str(method),
                'span_id': self.int64(span_id),
                'rel_client_send': float(rel_client_send),
                'ela': float(ela),
            }
            for service, method, span_id, rel_client_send, ela in data['services']
        ]

        # 'marks' and 'annotations' are optional
        row['marks'] = [
            {'relative_time': float(reltime), 'message': str(message)}
            for reltime, message in data.get('marks', [])
        ]
        row['annotations'] = [
            {'key': str(key), 'value': str(value)}
            for key, value in data.get('annotations', [])
        ]

        for field in ('pid', 'trace_id', 'parent_id', 'span_id', 'chunk_index', 'samplerate'):
            row[field] = self.int64(row[field])
        row['span_time'] = float(row['span_time'])
        row['chunk_last'] = bool(row['chunk_last'])
        yield row


class PriceResult(AbstractResult):
    """
    Processing rules for price.log records: one input line is expanded into
    one output record per element of its 'data' list.
    """
    log_type = 'price.log'
    path = "logs/price_log"
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},
        {"name": 'log_hostname', "type": 'string'},

        {"name": 'service',  "type": 'string'},
        {"name": 'method',   "type": 'string'},
        {"name": 'reqid',  "type": 'int64'},

        {"name": 'cid',  "type": 'int64'},
        {"name": 'pid',  "type": 'int64'},
        {"name": 'id',  "type": 'int64'},

        {"name": 'type',   "type": 'string'},
        {"name": 'currency',   "type": 'string'},
        {"name": 'price',  "type": 'double'},
        {"name": 'price_ctx',  "type": 'double'},

        {"name": 'ip',   "type": 'string'},
        ))

    def normalize(self, line):
        """
        Yield one record per element of json_data['data'].

        Every record carries the common fields of the line ('log_type' from
        the parsed line, the rest from json_data) plus the element's own
        currency, type, ids and prices.
        """
        data = line.pop("json_data")

        shared = {'log_type': line['log_type']}
        for name in ('ip', 'log_time', 'log_hostname', 'service', 'method'):
            shared[name] = str(data[name])
        shared['reqid'] = int(data['reqid'])

        for item in data['data']:
            rec = dict(shared)
            for name in ('currency', 'type'):
                rec[name] = str(item[name])
            for name in ('cid', 'pid', 'id'):
                rec[name] = int(item[name])
            for name in ('price', 'price_ctx'):
                rec[name] = float(item[name])
            yield rec


class AutoBudgetPricesSet(AbstractResult):
    """Result type for ``autobudget_prices`` records (one row per record)."""
    log_type = 'autobudget_prices'
    path = "logs/autobudget_prices_log"
    optimize_for = 'scan'
    ttl = 45
    schema = yt_schema(False, (
        {"name": 'log_type',      "type": 'string'},
        {"name": 'log_time',      "type": 'string'},
        {"name": 'log_hostname',  "type": 'string'},

        {"name": 'service',       "type": 'string'},
        {"name": 'method',        "type": 'string'},
        {"name": 'reqid',         "type": 'int64'},

        {"name": 'PhraseID',      "type": 'uint64'},
        {"name": 'GroupExportID', "type": 'int64'},
        {"name": 'currency',      "type": 'int8'},
        {"name": 'ContextType',   "type": 'int8'},

        {"name": 'price',         "type": 'string'},
        {"name": 'context_price', "type": 'string'},
        ))

    def normalize(self, line):
        """Yield a single typed row built from ``line['json_data']``.

        Top-level fields are read from the record itself, the rest from its
        ``data`` sub-mapping.  ``price`` and ``context_price`` are optional
        and become ``None`` when absent.  ``json_data`` is popped from
        ``line``.
        """
        import yt.yson as yson

        record = line.pop("json_data")

        out = {}
        for name in ('log_type', 'log_time', 'log_hostname', 'service', 'method'):
            out[name] = str(record[name])
        out['reqid'] = int(record['reqid'])

        payload = record['data']
        for name in ('GroupExportID', 'currency', 'ContextType'):
            out[name] = int(payload[name])
        # PhraseID is declared uint64 in the schema, hence the explicit YSON type
        out['PhraseID'] = yson.YsonUint64(payload['PhraseID'])
        for name in ('price', 'context_price'):
            out[name] = str(payload[name]) if name in payload else None

        yield out


class BsExportResponse(AbstractResult):
    """Result type for ``bsexport_response.log`` records (one row per record)."""
    log_type = 'bsexport_response.log'
    path = "logs/bsexport_response_log"
    optimize_for = 'scan'
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_host',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},

        {"name": 'level',        "type": 'string'},
        {"name": 'reqid',        "type": 'int64'},
        {"name": 'uuid',         "type": 'string'},
        {"name": 'shard',        "type": 'int64'},
        {"name": 'par_type',     "type": 'string'},
        {"name": 'backend_host', "type": 'string'},

        {"name": 'iter_id',      "type": 'int64'},
        {"name": 'cid',          "type": 'int64'},
        {"name": 'pid',          "type": 'int64'},

        {"name": 'data',         "type": 'string'},
        ))

    def normalize(self, line):
        """Yield a single row for one parsed record.

        Missing numeric fields default to 0 and missing string fields to
        ``''``; the ``data`` payload is required and is re-serialized as
        compact JSON with sorted keys.  ``json_data`` is popped from ``line``.
        """
        import json
        payload = line.pop("json_data")

        row = {
            'log_type': line['log_type'],
            'log_host': payload.get("host"),
            'log_time': line['log_time'],
        }

        row.update(
            (key, int(payload.get(key, 0)))
            for key in ('reqid', 'shard', 'iter_id', 'cid', 'pid')
        )
        row.update(
            (key, str(payload.get(key, '')))
            for key in ('level', 'uuid', 'backend_host')
        )

        # the column is called par_type, the source key is par_norm_nick
        row['par_type'] = str(payload.get("par_norm_nick", ''))

        row['data'] = json.dumps(
            payload['data'],
            ensure_ascii=False,
            sort_keys=True,
            separators=(',', ':'),
        )

        yield row


class UaasData(AbstractResult):
    """Result type for ``uaas_data.log`` records."""
    log_type = 'uaas_data.log'
    path = "logs/uaas_data_log"
    optimize_for = 'scan'
    schema = yt_schema(False, (
        {"name": 'log_type',  "type": 'string'},
        {"name": 'log_host',  "type": 'string'},
        {"name": 'log_time',  "type": 'string'},

        {"name": 'method',    "type": 'string'},
        {"name": 'client_id', "type": 'int64'},
        {"name": 'yandexuid', "type": 'string'},
        {"name": 'exp_boxes', "type": 'string'},
        {"name": 'features',  "type": 'string'},
        ))

    def normalize(self, line):
        """Yield one row per element of the record's ``data`` payload.

        ``data`` may be a list/tuple of mappings or a single mapping, which
        is treated as a one-element sequence.  Missing per-row fields fall
        back to 0 / ``''``.  ``json_data`` is popped from ``line``.
        """
        import json
        record = line.pop("json_data")

        shared = {
            'log_type': line['log_type'],
            'log_host': record.get("log_hostname"),
            'log_time': line['log_time'],
        }
        shared['method'] = str(record.get('method', ''))

        payload = record['data']
        if not isinstance(payload, (list, tuple)):
            payload = (payload,)

        for item in payload:
            out = dict(shared)
            out['client_id'] = int(item.get('ClientID', 0))
            for key in ('yandexuid', 'exp_boxes', 'features'):
                out[key] = str(item.get(key, ''))
            yield out


class MessagesResult(AbstractResult):
    """Result type for raw ``messages.log`` lines parsed with a regexp."""
    log_type = 'messages.log'
    path = "logs/messages_log"
    optimize_for = 'scan'
    schema = yt_schema(False, (
        {"name": 'log_type',       "type": 'string'},
        {"name": 'log_time',       "type": 'string'},
        {"name": 'log_time_nanos', "type": 'int64'},
        {"name": 'host',           "type": 'string'},
        {"name": 'service',        "type": 'string'},
        {"name": 'method',         "type": 'string'},
        {"name": 'trace_id',       "type": 'int64'},
        {"name": 'parent_id',      "type": 'int64'},
        {"name": 'span_id',        "type": 'int64'},
        {"name": 'prefix',         "type": 'string'},
        {"name": 'log_level',      "type": 'string'},
        {"name": 'class_name',     "type": 'string'},
        {"name": 'message',        "type": 'string'},
        ))

    def __init__(self):
        """Compile the line-parsing regexp once per instance."""
        import re
        # regexp taken from direct-utils/dscribe/lib/DScribe/Parser/messages.pm
        line_pattern = r"""^(?P<datetime>\d\d\d\d-\d\d-\d\d:\d\d:\d\d:\d\d)(?:\.(?P<nanos>\d+))?
                      \s (?P<host>[^,]+) , (?P<service>[^\/]+) \/ (?P<method>[^,]*) , (?P<trace_id>\d+) : (?P<parent_id>\d+) : (?P<span_id>\d+)
                      (?:
                          (?P<_bulk_flag>\#bulk) \s
                          | \s (?:
                                   \[ (?P<prefix>[^\]]*?) \] \s
                                   (?:
                                        (?P<log_level>[_a-zA-Z0-9\.\-]+)
                                        \s+
                                        (?P<class_name>[_a-zA-Z0-9\.\-\$]+)
                                        \s - \s
                                   )?
                               )?
                      )
                      (?P<message>.*)"""
        self.rx = re.compile(line_pattern, re.VERBOSE)

    def _nanos(self, nanos):
        """Convert the fractional-seconds digit string to nanoseconds.

        ``None`` gives 0, strings of nine or more digits are truncated to
        the first nine, shorter ones are scaled up as if right-padded with
        zeros.
        """
        if nanos is None:
            return 0
        width = len(nanos)
        if width >= 9:
            return int(nanos[:9])
        return int(nanos) * (10 ** (9 - width))

    def normalize(self, line):
        """Parse one raw log line and yield its row(s).

        A regular line yields one row.  A line marked ``#bulk`` carries a
        JSON array in its message part and yields one row per array element
        (non-string elements are re-serialized with ``json.dumps``).
        ``data`` is popped from ``line``.

        Raises:
            AttributeError: if the line does not match the regexp
                (``match`` returns ``None``).
        """
        parsed = self.rx.match(line.pop("data")).groupdict()

        base = {
            'log_type': line['log_type'],
            # only the first ':' (between date and time) becomes a space
            'log_time': parsed['datetime'].replace(':', ' ', 1),
            'log_time_nanos': self._nanos(parsed.get('nanos')),
            'host': parsed['host'],
            'service': parsed['service'],
            'method': parsed['method'],
            'trace_id': self.int64(parsed['trace_id']),
            'parent_id': self.int64(parsed['parent_id']),
            'span_id': self.int64(parsed['span_id']),
            'prefix': None
        }

        base.update(
            (name, parsed[name])
            for name in ('log_level', 'class_name', 'prefix')
            if name in parsed
        )

        if parsed['_bulk_flag'] is not None:
            import json
            for item in json.loads(parsed['message']):
                text = item if isinstance(item, basestring) else json.dumps(item)
                yield dict(base, message=text)
        else:
            base['message'] = parsed['message']
            yield base


class CampaignBalanceResult(AbstractResult):
    """Result type for ``campaign_balance`` records (one row per record)."""
    log_type = 'campaign_balance'
    path = "logs/campaign_balance"
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},
        {"name": 'log_hostname', "type": 'string'},

        {"name": 'service',      "type": 'string'},
        {"name": 'method',       "type": 'string'},
        {"name": 'reqid',        "type": 'int64'},

        {"name": 'ClientID',     "type": 'int64'},
        {"name": 'cid',          "type": 'int64'},

        {"name": 'tid',          "type": 'string'},
        {"name": 'type',         "type": 'string'},
        {"name": 'currency',     "type": 'string'},

        {"name": 'sum',          "type": 'double'},
        {"name": 'sum_delta',    "type": 'double'},

        {"name": 'ip',           "type": 'string'},
        ))

    def normalize(self, line):
        """Yield a single typed row built from ``line['json_data']``.

        Request-level fields come from the record itself, balance fields
        from its ``data`` sub-mapping; every field is required.
        ``json_data`` is popped from ``line``.
        """
        record = line.pop("json_data")

        out = {}
        for name in ('log_type', 'log_time', 'log_hostname', 'service', 'method', 'ip'):
            out[name] = str(record[name])
        out['reqid'] = int(record['reqid'])

        # (converter, fields) pairs, applied in order to record['data']
        conversions = (
            (str, ('tid', 'type', 'currency')),
            (int, ('cid', 'ClientID')),
            (float, ('sum', 'sum_delta')),
        )
        for convert, names in conversions:
            for name in names:
                out[name] = convert(record['data'][name])

        yield out


class RecommendationResult(AbstractResult):
    """Result type for ``recommendations`` records (one row per record)."""
    log_type = 'recommendations'
    path = "logs/recommendations"
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string'},
        {"name": 'log_time',     "type": 'string'},
        {"name": 'log_hostname', "type": 'string'},

        {"name": 'service',      "type": 'string'},
        {"name": 'method',       "type": 'string'},
        {"name": 'reqid',        "type": 'int64'},

        {"name": 'type',         "type": 'int64'},
        {"name": 'ClientID',     "type": 'int64'},
        {"name": 'cid',          "type": 'int64'},
        {"name": 'pid',          "type": 'int64'},
        {"name": 'bid',          "type": 'int64'},
        {"name": 'user_key1',    "type": 'string'},
        {"name": 'user_key2',    "type": 'string'},
        {"name": 'user_key3',    "type": 'string'},
        {"name": 'timestamp',    "type": 'int64'},

        {"name": 'data',         "type": 'string'},
        {"name": 'status',       "type": 'string'},
    ))

    def normalize(self, line):
        """Yield a single typed row built from ``line['json_data']``.

        Request-level fields come from the record itself; the rest comes
        from the first element of its ``data`` list, with camelCase source
        keys mapped to the schema column names.  ``json_data`` is popped
        from ``line``.
        """
        record = line.pop("json_data")
        # NOTE(review): only the first element of data['data'] is used;
        # confirm that records never carry more than one entry.
        entry = record['data'][0]

        out = {}

        # NOTE(review): 'ip' is copied into the row although the schema
        # above declares no 'ip' column - confirm this is intended.
        for name in ('log_type', 'log_time', 'log_hostname', 'service', 'method', 'ip'):
            out[name] = str(record[name])
        out['reqid'] = int(record['reqid'])

        for name in ('type', 'timestamp'):
            out[name] = int(entry[name])

        # (output column, source key) pairs
        int_columns = (
            ('ClientID', 'clientId'),
            ('cid', 'campaignId'),
            ('pid', 'adGroupId'),
            ('bid', 'bannerId'),
        )
        for column, source_key in int_columns:
            out[column] = int(entry[source_key])

        str_columns = (
            ('user_key1', 'userKey1'),
            ('user_key2', 'userKey2'),
            ('user_key3', 'userKey3'),
            ('data', 'jsonData'),
            ('status', 'status'),
        )
        for column, source_key in str_columns:
            out[column] = str(entry[source_key])

        yield out


class CommonDataResult(AbstractResult):
    """Result type for ``common_data`` records; payload kept as a string."""
    log_type = 'common_data'
    path = "logs/common_data"
    schema = yt_schema(False, (
        {"name": 'log_type',     "type": 'string', "sort_order": "ascending"},
        {"name": 'log_time',     "type": 'string', "sort_order": "ascending"},
        {"name": 'log_hostname', "type": 'string'},

        {"name": 'service',      "type": 'string'},
        {"name": 'method',       "type": 'string'},
        {"name": 'reqid',        "type": 'int64'},
        {"name": 'uid',          "type": 'int64'},

        {"name": 'ip',           "type": 'string'},

        {"name": "data",         "type": 'string'}
        ))

    def normalize(self, line):
        """Yield one row per element of the record's ``data`` payload.

        ``ip``, ``reqid`` and ``uid`` are optional and are simply left out
        of the row when absent.  ``data`` may be a list/tuple or a single
        value; string elements are stored as is, anything else is
        serialized with ``json.dumps``.  ``json_data`` is popped from
        ``line``.
        """
        import json
        record = line.pop("json_data")

        shared = {}
        for name in ('log_type', 'log_time', 'log_hostname', 'service', 'method'):
            shared[name] = str(record[name])
        if 'ip' in record:
            shared['ip'] = str(record['ip'])
        for name in ('reqid', 'uid'):
            if name in record:
                shared[name] = int(record[name])

        payload = record['data']
        if not isinstance(payload, (list, tuple)):
            payload = (payload,)

        for item in payload:
            if isinstance(item, basestring):
                text = item
            else:
                text = json.dumps(item)
            yield dict(shared, data=str(text))


# Registry of available mergers, keyed by the name accepted on the command
# line (see argparser() and main()).  Insertion order is the order in which
# main() runs them when no names are given.  Each value is an LbMerger built
# from a source path, a parser for the raw records and the result types that
# normalize them.
MERGERS_SPEC = OrderedDict([
    ('logapi',
        LbMerger(
            'logbroker-export/direct-ppclog-api-log',
            parser=SyslogOrPrefixedJsonParser('PPCLOG.ppclog_api.log'),
            types=(LogapiResult(), ),
            dsv_escaped_value=False,
            ),
    ),
    ('logcmd',
        LbMerger(
            'logbroker-export/direct-ppclog-cmd-log',
            parser=SyslogOrPrefixedJsonParser('PPCLOG.ppclog_cmd.log'),
            types=(LogcmdResult(), ),
            dsv_escaped_value=False,
            ),
    ),
    ('trace',
        LbMerger(
            'logbroker-export/direct-trace-log',
            parser=JsonParser('trace.log'),
            types=(TraceResult(), ),
            dsv_escaped_value=False,
            ),
    ),
    ('messages',
        LbMerger(
            'logbroker-export/direct-messages-log',
            parser=RawParser('messages.log'),
            types=(MessagesResult(), ),
            dsv_escaped_value=False,
            ),
    ),
    # The only merger with several result types plus a default_type
    # (CommonDataResult); how a record is routed to a type is decided by
    # LbMerger, which is defined elsewhere in this file.
    ('common_data',
        LbMerger(
            'logbroker-export/direct-common-data-log',
            parser=JsonParser(None),
            types=(CampaignBalanceResult(), RecommendationResult(), AutoBudgetPricesSet(), ),
            default_type=CommonDataResult(),
            dsv_escaped_value=False,
            ),
    ),
    ('pricelog',
        LbMerger(
            'logbroker-export/direct-ppclog-price-log',
            parser=JsonParser('price.log'),
            types=(PriceResult(), ),
            dsv_escaped_value=False,
            ),
    ),
    ('bsexport_response', # bssoap responses in the old transport
        LbMerger(
            'logbroker-export/direct-bsexport-log',
            parser=JsonParser("bsexport_response.log"),
            types=(BsExportResponse(), ),
            dsv_escaped_value=False,
            ),
    ),
    ('uaas_data', # AB experiments
        LbMerger(
            'logbroker-export/direct-uaas-data-log',
            parser=JsonParser("uaas_data.log"),
            types=(UaasData(), ),
            dsv_escaped_value=False,
            ),
    ),
])


def argparser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser with the ``--log``, ``--try-path``,
        ``--chunk-size`` and ``--time-gap`` options and the trailing
        ``mergers`` positional list.
    """
    ap = argparse.ArgumentParser(description='Parse and aggregate LogBroker chunks in YT')

    # (name, add_argument keyword arguments), registered in this order
    argument_specs = (
        ('--log', dict(
            type=str, default=DEFAULT_LOG_LEVEL,
            help='log level')),
        ('--try-path', dict(
            type=str,
            help="merge to specified path, don't remove source tables, don't repeat iterations")),
        ('--chunk-size', dict(
            type=int, default=DEFAULT_CHUNK_SIZE,
            help="number of tables to merge at once")),
        ('--time-gap', dict(
            type=int, default=DEFAULT_TIME_GAP,
            help="don't touch table, younger than time-gap seconds")),
        # main() validates the merger names against MERGERS_SPEC itself
        ('mergers', dict(
            type=str, nargs=argparse.REMAINDER, choices=MERGERS_SPEC.keys(),
            help='merges to execute')),
    )
    for name, options in argument_specs:
        ap.add_argument(name, **options)

    return ap


def main():
    """Run the requested mergers one after another.

    Parses the command line, configures logging, optionally overrides the
    ``pool`` entry of ``YT_OP_SPEC_BASE`` from settings and then processes
    every requested merger (all of ``MERGERS_SPEC`` when none is given).
    An exception raised while processing one merger is printed and logged,
    and the loop continues with the next merger; ``juggler_ok`` is called
    only for mergers that finished without an exception.

    Raises:
        Exception: if a requested merger name is not in ``MERGERS_SPEC``
            (mergers listed before it have already been processed).
    """
    args = argparser().parse_args()

    direct.tools.set_logging(loglevel=args.log.upper(), add_info=os.environ.get('YT_PROXY', '-'))
    # logging.warn is a deprecated alias (removed in Python 3.13);
    # logging.warning is available in every supported Python version.
    logging.warning("start")
    if settings.YT_POOL_PROCESS_LB is not None:
        YT_OP_SPEC_BASE['pool'] = settings.YT_POOL_PROCESS_LB

    if not args.mergers:
        args.mergers = MERGERS_SPEC.keys()

    for merger in args.mergers:
        if merger not in MERGERS_SPEC:
            logging.error("incorrect merger: %s", merger)
            raise Exception("incorrect merger: %s" % merger)
        try:
            logging.warning("processing %s", merger)
            MERGERS_SPEC[merger].try_path = args.try_path
            MERGERS_SPEC[merger].process(chunk_size=args.chunk_size, time_gap=args.time_gap)
            juggler_ok(merger)
        except Exception as e:
            # deliberate best effort: report the failure and go on with the
            # remaining mergers; no juggler OK event is sent for this one
            import traceback
            traceback.print_exc(file=sys.stderr)
            logging.error(e, exc_info=True)

    logging.warning("finish")


# Run the mergers only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()


