#!/usr/bin/env python3
"""
1. Sanitize string (replace double quote escape ("") with '''
2. Yield a complete statement string
3. Add service fields (timestamp, log_format)
4. Emit a line of tskv
"""
import argparse
import logging
import re
import socket
import sys
import time
from datetime import datetime as dt
from os import environ

# One input line: a prefix of three ';'-terminated integers, then the
# whitespace-separated timestamp, a one-character severity, the component,
# a bracketed context and finally the free-form message.
# Raw strings keep the regex escapes (\d, \s, \[ ...) from being read as
# (invalid) string escapes; the pieces concatenate into a single pattern.
RE_INST = re.compile(
    r'^(?P<__pk_data>\d+;\d+;\d+;)'
    r'(?P<datetime>[^\s]+)\s+'
    r'(?P<severity>[^\s])\s+'
    r'(?P<component>[^\s]+)\s+'
    r'\[(?P<context>[^\]]+)\](\s+)?'
    r'(?P<message>.*)'
)

# Value emitted in the 'log_format' field; taken from $LOG_ENVIRONMENT when set.
LOG_TYPE = environ.get('LOG_ENVIRONMENT', 'dbaas_int_log')
# Layout of the timestamp captured from each input line (parsed in get_times).
MONGODB_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f%z'
# Layout of the human-readable 'log_time' field produced by get_times.
ADDITIONAL_DATE_FORMAT = '%Y-%m-%d %H:%M:%S.%f %Z'
# Name of the regex group holding the line prefix that print_tskv emits
# in front of the 'tskv' marker.
PC_KEY = '__pk_data'


def get_times(ts_candidate):
    """
    Convert mongodb timestamp to unix

    :param ts_candidate: timestamp text laid out as MONGODB_DATE_FORMAT,
        e.g. ``2019-01-01T00:00:00.123+0300``.
    :return: tuple ``(seconds, fraction, log_time)``: whole unix seconds
        (int), the sub-second part as a float in [0, 1) and the timestamp
        re-rendered with ADDITIONAL_DATE_FORMAT.
    :raises ValueError: when the text cannot be parsed.
    """
    try:
        dtime = dt.strptime(ts_candidate, MONGODB_DATE_FORMAT)
        tstamp = dtime.timestamp()
        # The fixed-offset zone name 'UTC+03:00' is reported as 'MSK'.
        dtime_additional = dtime.strftime(ADDITIONAL_DATE_FORMAT).replace('UTC+03:00', 'MSK')
    except (AttributeError, ValueError) as err:
        raise ValueError('bad timestamp %s: %s' % (ts_candidate, err)) from err
    # Take the fraction from the parsed microsecond field rather than from
    # `tstamp % 1`: an epoch-sized float only resolves ~1e-7 s, so the modulo
    # returns e.g. 0.1229999 for '.123' and later truncates to 122 ms.
    millis = dtime.microsecond / 1000000
    return int(tstamp), millis, dtime_additional


def prepare(string):
    """
    Stringify a value and make it safe to embed in a TSKV line, following
    https://wiki.yandex-team.ru/statbox/LogRequirements/

    A tab becomes a single space, '=' gets a backslash in front of it and a
    line feed becomes the two characters backslash + 'n'.
    """
    # One pass over the text instead of a chain of replace() calls.
    escapes = str.maketrans({
        '\t': ' ',
        '=': r'\=',
        '\n': r'\n',
    })
    return str(string).translate(escapes)


def statements():
    """
    Yields a single parsed statement

    Reads stdin line by line and yields the named groups of RE_INST as a
    dict for every line that matches.

    :raises EOFError: when stdin is exhausted; main() uses this to tell
        "no more input" apart from errors after which it restarts.
    """
    while True:
        row = sys.stdin.readline()
        if not row:
            raise EOFError()
        match = RE_INST.match(row)
        # either empty or multiline string. Both are corner cases,
        # we ignore or truncate (for multiline) them.
        if match is None:
            continue
        # `match` is a real match object here, so groupdict() cannot fail.
        yield match.groupdict()


def add_tracking_data(stm, cluster, hostname, origin):
    """
    Add timestamps, milliseconds and logtype.

    Updates ``stm`` in place and returns nothing.

    :param stm: parsed statement; its 'datetime' entry is converted with
        get_times().
    :param cluster: value stored under 'cluster'.
    :param hostname: value stored under 'hostname'.
    :param origin: value stored under 'origin'.
    :raises ValueError: propagated from get_times() for a bad timestamp.
    """
    assert isinstance(stm, dict), 'stm must be a dict!'
    tstamp, millis, dtime = get_times(stm['datetime'])
    # `millis` is a float fraction of a second and may sit a hair below the
    # real value (e.g. 0.1229999 for '.123'). Snap it to microseconds - the
    # resolution of the parsed timestamp - before cutting down to whole
    # milliseconds, so the noise cannot cost a millisecond.
    whole_ms = min(round(millis * 1000000) // 1000, 999)
    addendum = {
        'timestamp': tstamp,
        'ms': '%d' % whole_ms,  # Clickhouse does not like floats.
        'log_time': dtime,
        'log_format': LOG_TYPE,
        'origin': origin,
        'cluster': cluster,
        'hostname': hostname
    }
    stm.update(addendum)


def print_tskv(stm):
    """
    Write one statement dictionary to stdout as a TSKV line.

    The PC_KEY entry is removed from ``stm`` and printed verbatim in front
    of the 'tskv' marker; every remaining entry follows as a tab-separated
    ``key=value`` pair whose value went through prepare().
    """
    # Take the prefix out so it does not show up among the fields.
    prefix = stm.pop(PC_KEY)
    fields = '\t'.join(
        '%s=%s' % (key, prepare(value)) for key, value in stm.items()
    )
    print('%stskv\t%s' % (prefix, fields))


def _do_processing(args, hostname):
    """
    Enrich and print every statement that statements() yields.

    Exceptions from the generator and from the helpers are not handled
    here; they travel up to the caller.
    """
    for statement in statements():
        add_tracking_data(
            statement,
            cluster=args.cluster,
            hostname=hostname,
            origin=args.origin,
        )
        print_tskv(statement)


def main():
    """
    Parse the command line, set up the debug log and turn stdin into tskv
    lines until the input ends.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-c', '--cluster',
        type=str, help='Cluster name', default='')
    cli.add_argument(
        '-o', '--origin',
        type=str, help='Originator identifier', default='')
    cli.add_argument(
        '-l', '--log-file',
        type=str, help='Debug log file path',
        default='/var/log/statbox/mongodb_parser.log')
    options = cli.parse_args()

    logging.basicConfig(
        filename=options.log_file,
        format='%(asctime)s:\t%(message)s')

    fqdn = socket.getfqdn()

    # Processing is restarted after every error; only EOF ends the loop.
    while True:
        try:
            _do_processing(options, fqdn)
        except EOFError:
            return
        except ValueError as exc:
            logging.error('parser error: %s', exc)
        except Exception as exc:
            logging.exception('general error: %s', exc)
            # Pause before retrying after an unexpected failure.
            time.sleep(1)


# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

