#!/usr/bin/python -ut
# -*- coding: utf-8 -*-
import re, sys, argparse

REGEX = re.compile(r'^(?P<magic>\d+;\d+;\d+;)(?P<string>.+)$')


class TSKV(object):
    """A single parsed TSKV record.

    A record is a line that starts with the literal ``tskv`` followed by
    tab-separated ``key=value`` fields.  The line may be preceded by a
    "magic" prefix made of three semicolon-terminated integers (see
    ``REGEX``); the prefix is kept in ``id`` and re-emitted on output.

    Attributes:
        id: the magic prefix, or '' when the line had none.
        string: the record text without the prefix and trailing newline.
        data: dict mapping field names to their (string) values.

    Raises:
        ValueError: from the constructor, when the text after the optional
            prefix does not start with ``tskv``.
    """

    # Class-level defaults.  __init__ rebinds all three on every instance,
    # so the shared dict below is never mutated through an instance.
    data = {}
    id = ''
    string = ''

    def __init__(self, string):
        (self.id, self.string) = self.__split(string)
        self.data = self.__parse(self.string)

    def __split(self, string):
        """Return ``(magic_prefix, record_text)`` for a raw input line."""
        # Drop the line terminator up front.  The regex's ``$`` already
        # ignores one trailing newline when a prefix is present; without
        # this, prefix-less lines kept the newline inside their last value.
        if string.endswith('\n'):
            string = string[:-1]
        m = REGEX.match(string)
        if m is None:
            return ('', string)
        return m.groups()

    def __parse(self, string):
        """Turn the record text into a ``{key: value}`` dict.

        Raises ValueError when the text does not start with ``tskv``.
        """
        if not string.startswith('tskv'):
            raise ValueError("given string is not in TSKV format")
        data = {}
        # The first tab-separated chunk is the 'tskv' marker itself.
        for field in string.split('\t')[1:]:
            key, sep, value = field.partition('=')
            # A chunk without '=' is an empty field (probably a runaway
            # tab) -- skip it.
            if sep:
                data[key] = value
        return data

    def __unicode__(self):
        """Render the record as an output line, or '' if it has no fields."""
        if not self.data:
            return ''
        # items() instead of iteritems() so this works on Python 2 and 3.
        pairs = ['{k}={v}'.format(k=key, v=value)
                 for key, value in self.data.items()]
        return '{magic}tskv\t{data}\n'.format(magic=self.id,
                                              data='\t'.join(pairs))

    def __str__(self):
        """Same rendering as __unicode__, so str(record) is usable too."""
        return self.__unicode__()

    def __dict__(self):
        """Return the field dict (previously raised NameError)."""
        return self.data

    def set(self, **kwargs):
        """Add or overwrite fields given as keyword arguments."""
        self.data.update(**kwargs)

    def include_fields(self, *filter_keys):
        """Keep only the named fields, in the order they are named.

        Names that are absent from the record are ignored.  Fields whose
        value is empty are dropped as well.
        """
        self.data = {key: self.data[key]
                     for key in filter_keys if self.data.get(key)}

    def exclude_fields(self, *remove_keys):
        """Remove the named fields, keeping the others in their current order.

        Fields whose value is empty are dropped as well.
        """
        removed = set(remove_keys)
        self.data = {key: value for key, value in self.data.items()
                     if key not in removed and value}

if __name__ == '__main__':
    # Command-line driver: read TSKV records from stdin, keep or drop the
    # requested fields, stamp each record with a tskv_format field and write
    # the result to stdout.

    # parse cmdline options
    parser = argparse.ArgumentParser(description="""
            TSKV log stream filter
            """
            )

    # Exactly one of --exclude / --include must be given.
    filter_type = parser.add_mutually_exclusive_group(required=True)
    filter_type.add_argument('-e', '--exclude', nargs='+', metavar='<str>',
            help='exclude given fields')
    filter_type.add_argument('-i', '--include', nargs='+', metavar='<str>',
            help='include given fields')

    settings = vars(parser.parse_args())

    if settings.get('exclude'):
        action, fields = 'exclude_fields', settings['exclude']
    else:
        action, fields = 'include_fields', settings['include']

    # Resolve the filter method once instead of once per input line.
    apply_filter = getattr(TSKV, action)

    # The outer try also covers the blocking read in the for statement, so
    # Ctrl-C or a closed pipe ends the filter quietly wherever it happens.
    try:
        # process stdin line by line.
        for line in sys.stdin:
            try:
                record = TSKV(line)
            except ValueError:
                # Not a TSKV record -- skip it, this is a best-effort filter.
                continue
            apply_filter(record, *fields)
            record.set(tskv_format='mail-user-journal-depers-tskv-log') # https://st.yandex-team.ru/STATDATA-994
            # Call the renderer directly: on Python 2, unicode(record) would
            # ASCII-decode its byte-string result and fail on non-ASCII data,
            # which used to drop such records silently.
            sys.stdout.write(record.__unicode__())
    except (IOError, KeyboardInterrupt):
        sys.exit(0)
