#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import argparse
import datetime

def HandleOption():
    """Build the command-line parser for this script.

    Both options are optional and fall back to built-in defaults:

    * ``--server`` (stored as ``server``): mapreduce server address.
    * ``--bs`` (stored as ``blockstat``): path to ``blockstat.dict``.

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    option_table = (
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": 'hahn.yt.yandex.net:80',
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": '/home/itajn/serploader/blockstat.dict',
            "required": False,
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli


def processuser(key, recs):
    """Reduce all records of one device into a first/last activity summary.

    Args:
        key: mapping holding the reduce key; ``key['DeviceID']`` is read.
        recs: iterable of record mappings. ``StartTimestamp`` is read from
            every record and converted with ``int()``. ``SessionType``,
            ``EventType`` and ``EventValue`` are read only from records that
            set a new minimum or maximum timestamp.

    Yields:
        Exactly one dict with the keys ``user``, ``first_action``,
        ``first_fg``, ``last_action`` and ``last_fg``.

    Notes:
        * ``first_fg`` / ``last_fg`` are refreshed only when a record sets a
          new minimum / maximum timestamp AND is a foreground client event.
          Otherwise the previously stored value is kept, so the result
          depends on the order in which records arrive.
        * Ties keep the first record seen (comparisons are strict).
        * The running extremes start as ``None`` rather than magic numbers,
          so timestamps of any magnitude (e.g. epoch milliseconds) or
          non-positive timestamps are handled correctly.
    """
    # Placeholders reported only for an empty group; they are the values the
    # historical implementation produced in that case.
    empty_first_action = 241000000000
    empty_last_action = 0

    user = key['DeviceID']
    min_timestamp = None
    max_timestamp = None
    first_fg = ''
    last_fg = ''

    for r in recs:
        timestamp = int(r['StartTimestamp'])
        is_new_max = max_timestamp is None or timestamp > max_timestamp
        is_new_min = min_timestamp is None or timestamp < min_timestamp
        if not (is_new_max or is_new_min):
            continue

        # Event fields are inspected only for records that move an extreme,
        # so records lacking them elsewhere in the group do not matter.
        is_fg_client = (r['SessionType'] == 'SESSION_FOREGROUND'
                        and r['EventType'] == 'EVENT_CLIENT')
        if is_new_max:
            max_timestamp = timestamp
            if is_fg_client:
                last_fg = r['EventValue']
        if is_new_min:
            min_timestamp = timestamp
            if is_fg_client:
                first_fg = r['EventValue']

    if min_timestamp is None:
        min_timestamp = empty_first_action
    if max_timestamp is None:
        max_timestamp = empty_last_action

    yield {'user': user,
           'first_action': min_timestamp,
           'first_fg': first_fg,
           'last_action': max_timestamp,
           'last_fg': last_fg,
           }

def main():
    """Run the per-device first/last activity job on YT.

    Steps, in order:

    1. Parse the command line and point the YT client at ``--server``.
    2. Create the output table (recursively) if it does not exist yet.
    3. For every configured day whose log table exists, sort that table
       by ``DeviceID`` in place and queue it as reduce input.
    4. Run ``processuser`` as a reduce over the queued tables, grouped by
       ``DeviceID``, writing to the output table.
    """
    options = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': options.server}})

    result_table = '//home/freshness/staff/itajn/KA-149/newusers'
    if not yt.exists(result_table):
        yt.create_table(path=result_table, recursive=True)

    logs_root = '//home/freshness/keyboard_logs/android/'
    sorted_tables = []
    for day in ('2016-08-10', '2016-08-11'):
        day_table = logs_root + day
        if not yt.exists(day_table):
            # Days without a log table are simply left out of the reduce.
            continue
        # Source and destination are the same path: the log table itself is
        # re-sorted so that it can be reduced by DeviceID.
        yt.run_sort(source_table=day_table,
                    destination_table=day_table,
                    sort_by='DeviceID')
        sorted_tables.append(day_table)

    reduce_spec = {'data_size_per_job': 16000000000}  # ~16GB
    yt.run_reduce(processuser,
                  source_table=sorted_tables,
                  destination_table=result_table,
                  reduce_by='DeviceID',
                  spec=reduce_spec)

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
