#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import argparse


def HandleOption():
    """Build the command-line parser for this script.

    Recognised options (both optional):
      --server  stored as ``server``; defaults to 'hahn.yt.yandex.net'.
      --bs      stored as ``blockstat``; defaults to
                '/home/itajn/serploader/blockstat.dict'.

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for invoking ``parse_args`` on it.
    """
    # (flag, keyword arguments for add_argument), registered in this order.
    option_specs = (
        ("--server", dict(dest="server",
                          help="yt server",
                          default='hahn.yt.yandex.net',
                          required=False)),
        ("--bs", dict(dest="blockstat",
                      help="path to blockstat.dict",
                      default='/home/itajn/serploader/blockstat.dict',
                      required=False)),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

def findcounter(rec):
    """Mapper: extract the ``-message=`` value from matching records.

    ``rec['value']`` is split on tabs into fields. A record matches when one
    of its fields is exactly ``path=183.2155.1030``. For a matching record,
    the payload of its ``vars=`` field (the last one, if several are present)
    is split on commas, and the first item starting with ``-message=`` is
    emitted.

    Yields:
        At most one dict ``{'message': <text after '-message='>}``.
        Nothing is yielded when the record does not match, has no ``vars=``
        field, or its vars contain no ``-message=`` item.

    Note: because vars are split on ',', a message that itself contains a
    comma is emitted only up to that comma.
    """
    path_matched = False
    vars_payload = None
    for field in rec['value'].split('\t'):
        if field == 'path=183.2155.1030':
            path_matched = True
        if field.startswith('vars='):
            vars_payload = field[5:]

    # A matching path without any vars= field used to raise
    # UnboundLocalError; treat such a record as having nothing to emit.
    if not path_matched or vars_payload is None:
        return

    for item in vars_payload.split(','):
        if item.startswith('-message='):
            yield {'message': item[9:]}
            break

def glue(key, recs):
    """Reducer: count the records that share one ``message`` key.

    ``key`` is a mapping holding the ``message`` being reduced and ``recs``
    is the iterable of records grouped under it. Yields a single dict with
    that message and the number of records in the group.
    """
    # recs is consumed exactly once; only its length matters.
    total = sum(1 for _ in recs)
    yield {'message': key['message'], 'count': total}

def main():
    """Run the per-day extraction and the final aggregation on YT.

    For each hard-coded day: map ``findcounter`` over that day's redir-log
    table into a per-day ``<day>_redir`` table (created first if missing),
    then sort that table in place by ``message``. Finally reduce all per-day
    tables with ``glue`` by ``message`` into the ``allmessages`` table.
    """
    options = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': options.server}})

    log_prefix = '//home/logfeller/logs/redir-log/1d/'
    result_prefix = '//home/freshness/staff/itajn/FR-2386/'
    dates = [
        '2016-09-14',
        '2016-09-15',
        '2016-09-16',
        '2016-09-17',
        '2016-09-18',
        '2016-09-19',
    ]

    per_day_tables = []
    for date in dates:
        log_table = log_prefix + date
        day_table = result_prefix + date + '_redir'

        if not yt.exists(day_table):
            yt.create_table(path=day_table, recursive=True)

        yt.run_map(
            findcounter,
            source_table=log_table,
            destination_table=day_table,
            spec={'data_size_per_job': 16000000000},  # ~16GB
        )
        # Sorted in place so the table can feed the reduce below.
        yt.run_sort(
            source_table=day_table,
            destination_table=day_table,
            sort_by='message',
        )
        per_day_tables.append(day_table)

    yt.run_reduce(
        glue,
        source_table=per_day_tables,
        destination_table='//home/freshness/staff/itajn/FR-2386/allmessages',
        reduce_by='message',
    )

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
