#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import libra
import argparse
import datetime

def HandleOption():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser with two optional arguments:
        ``--server`` (stored as ``server``) and ``--bs`` (stored as
        ``blockstat``), each with a built-in default.
    """
    # (flag, destination attribute, help text, default value)
    option_specs = (
        ("--server", "server", "mapreduce server", 'hahn.yt.yandex.net:80'),
        ("--bs", "blockstat", "path to blockstat.dict", '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, target, description, fallback in option_specs:
        cli.add_argument(
            flag,
            dest=target,
            help=description,
            default=fallback,
            required=False,
        )
    return cli

def extract(key, recs):
    """Reducer: emit the URL of every YouTube channel/user web result in a session.

    The records of one reduce group are parsed into a session with
    ``libra.ParseSession``.  For every web request (desktop, mobile or touch)
    whose ``ServiceDomRegion`` is ``'ru'`` or ``None``, each main block whose
    main result is a ``TWebResult`` with a URL containing
    ``youtube.com/channel/`` or ``youtube.com/user/`` produces one output row.
    Clicks are not consulted: a matching result counts whether or not it was
    clicked.

    Args:
        key: reduce key of the group (not used).
        recs: records of the group, handed to ``libra.ParseSession`` as is.

    Yields:
        dict ``{'url': <result url>}``, one per matching result.
    """
    try:
        # The dictionary is looked up in the current working directory.
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a group that cannot be parsed contributes no rows.
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
        # still propagate.
        return

    web_request_types = (
        "TYandexWebRequest",
        "TMobileYandexWebRequest",
        "TTouchYandexWebRequest",
    )
    url_markers = ('youtube.com/channel/', 'youtube.com/user/')

    for request in session:
        if not any(request.IsA(type_name) for type_name in web_request_types):
            continue

        # Keep only the 'ru' service domain; a missing region is treated as 'ru'.
        region = request.ServiceDomRegion
        if region is not None and region != 'ru':
            continue

        for block in request.GetMainBlocks():
            res = block.GetMainResult()
            if not res.IsA("TWebResult"):
                continue
            url = res.Url
            if any(marker in url for marker in url_markers):
                yield {'url': url}

def glue(key, recs):
    """Reducer: collapse one 'url' group into a single row with its row count.

    Args:
        key: reduce key; its ``'url'`` entry is copied to the output row.
        recs: iterable of the rows in the group; only their number matters.

    Yields:
        Exactly one dict ``{'url': key['url'], 'total': <number of rows>}``.
    """
    row_count = sum(1 for _ in recs)
    summary = {}
    summary['url'] = key['url']
    summary['total'] = row_count
    yield summary


def main():
    """Run the per-day extraction, aggregate the results and print frequent URLs.

    Steps:
      1. For each day in the fixed date range, reduce that day's user-session
         table with ``extract`` into a per-day output table and sort it by
         ``url``.
      2. Reduce all per-day tables with ``glue`` into a single ``total`` table
         holding one row per URL with its row count.
      3. Print every URL whose total exceeds 20 as ``url<TAB>total``.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    output_root = '//home/freshness/staff/itajn/PS-1187/'
    total_table = output_root + 'total'

    # Inclusive date range to process; yields 'YYYY-MM-DD' strings.
    first_day = datetime.date(2017, 4, 3)
    last_day = datetime.date(2017, 4, 11)
    day_count = (last_day - first_day).days + 1
    dates = [
        (first_day + datetime.timedelta(days=offset)).isoformat()
        for offset in range(day_count)
    ]

    daily_tables = []
    for day in dates:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = output_root + day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        # NOTE(review): extract opens './blockstat.dict', so this presumably
        # only works when the basename of --bs is 'blockstat.dict' -- confirm
        # how local_files are named inside the job.
        yt.run_reduce(extract,
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      reduce_by='key',
                      spec={'data_size_per_job': 16000000000}  # ~16GB
                      )
        # The final reduce groups by 'url', so each input is sorted by it first.
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by='url')
        daily_tables.append(output)

    yt.run_reduce(glue,
                  source_table=daily_tables,
                  destination_table=total_table,
                  reduce_by=['url']
                  )

    for row in yt.read_table(total_table):
        if int(row['total']) > 20:
            # One formatted string: valid on Python 2 and 3, and emits a clean
            # tab-separated line without a stray space before the tab.
            print('%s\t%s' % (row['url'], row['total']))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
