#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import libra
import argparse

def HandleOption():
    """Build the command-line parser for this script.

    Recognised options:
        --server  server address, stored as ``server``
                  (default 'hahn.yt.yandex.net:80').
        --bs      path to blockstat.dict, stored as ``blockstat``
                  (default '/home/itajn/serploader/blockstat.dict').
        --file    output file for extra info, stored as ``file``; required.

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    # One (flag, add_argument keyword arguments) pair per option.
    option_specs = (
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": 'hahn.yt.yandex.net:80',
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": '/home/itajn/serploader/blockstat.dict',
            "required": False,
        }),
        ("--file", {
            "dest": "file",
            "help": "output file for extra info",
            "required": True,
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

# Ordered channel-detection rules: (channel id, substrings, exact queries).
# Rules are tried top to bottom and the first match wins, so a query that
# satisfies several rules is attributed to the earliest one only.
_CHANNEL_RULES = (
    ('1tv', ("первый канал", "первый телеканал", "1 канал"), ()),
    ('russia 1', ("россия 1", "телеканал россия", "канал россия"), ()),
    ('russia 24', ("россия 24",), ()),
    ('ntv', ("нтв ", " нтв"), ("нтв",)),
    ('tvc', ("твц", "тв центр"), ()),
    ('culture', ("культура",), ()),
    ('life', ("телеканал life", "канал life"), ()),
    ('otr', (" отр ", "общественное телевидение россии"), ("отр",)),
    ('secret', ("совершенно секретно", "сов секретно", "совсекретно"), ()),
)

# Substrings that set the 'watch' flag of an emitted row.
_WATCH_MARKERS = ("смотреть", "онлайн")

# Request classes (as understood by request.IsA) that are taken into account.
_WEB_REQUEST_TYPES = (
    "TYandexWebRequest",
    "TMobileYandexWebRequest",
    "TTouchYandexWebRequest",
)


def _normalize_query(raw_query):
    """Return raw_query lower-cased and stripped of surrounding whitespace.

    On Python 2 a byte-string ``.lower()`` only folds ASCII letters, so a
    query typed with capital Cyrillic letters would never match the
    lower-case literals in the rule table.  Byte strings are therefore
    lower-cased through unicode and encoded back to UTF-8.
    NOTE(review): assumes request.Query is UTF-8 encoded -- confirm against
    libra; input that is not valid UTF-8 falls back to the plain fold.
    """
    if isinstance(raw_query, bytes):
        try:
            folded = raw_query.decode('utf-8').lower().encode('utf-8')
        except UnicodeDecodeError:
            folded = raw_query.lower()
        return folded.strip()
    return raw_query.lower().strip()


def _detect_channel(query):
    """Return the id of the first rule matching query, or None."""
    for channel, substrings, exact_queries in _CHANNEL_RULES:
        if query in exact_queries or any(part in query for part in substrings):
            return channel
    return None


def extract(key, recs):
    """Reducer: emit one row per web request whose query names a TV channel.

    Args:
        key: reduce key of the session; not used by the reducer itself.
        recs: records of one session, handed to ``libra.ParseSession``
            together with './blockstat.dict' (the dictionary must be
            present in the job's working directory under that name).

    Yields:
        ``{'channel': <id>, 'watch': <bool>}`` for every matching request;
        rows for the 'culture' channel additionally carry the normalised
        query under 'query'.  ``watch`` is True when the query contains one
        of the ``_WATCH_MARKERS`` substrings.

    Sessions that libra fails to parse produce no rows.
    """
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Deliberate best effort: an unparsable session is skipped.  Only
        # Exception is caught so SystemExit/KeyboardInterrupt still propagate.
        return

    for request in session:
        if not any(request.IsA(kind) for kind in _WEB_REQUEST_TYPES):
            continue

        # Keep only requests from the 'ru' domain; a missing region is
        # treated as 'ru'.
        region = request.ServiceDomRegion
        if region is not None and region != 'ru':
            continue

        query = _normalize_query(request.Query)
        channel = _detect_channel(query)
        if channel is None:
            continue

        row = {
            'channel': channel,
            'watch': any(marker in query for marker in _WATCH_MARKERS),
        }
        if channel == 'culture':
            # The query text is kept for 'culture' rows only.
            row['query'] = query
        yield row


def main():
    """Count TV-channel queries per day and print a tab-separated report.

    For every hard-coded day the ``extract`` reducer is run over
    '//user_sessions/pub/search/daily/<day>/clean' into a temporary table.
    The resulting rows are tallied per channel and per 'watch' flag, and two
    lines per day (one for 'True', one for 'False') are printed to stdout
    after a header line.  Queries of 'culture' rows are additionally written
    to the file given by --file, one "<day>\\t<query>" line each.
    """
    options = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': options.server}})

    channels = ['1tv', 'russia 1', 'russia 24', 'ntv', 'tvc',
                'culture', 'life', 'otr', 'secret']
    watch_flags = ['True', 'False']

    # Header: two blank leading columns (day, watch flag), then the channels.
    print(' \t \t' + ''.join(name + '\t' for name in channels))

    dates = [
        # January
        '2016-01-04', '2016-01-05', '2016-01-06', '2016-01-07',
        '2016-01-08', '2016-01-09', '2016-01-10',
        # April
        '2016-04-20', '2016-04-21', '2016-04-22', '2016-04-23',
        '2016-04-24', '2016-04-25', '2016-04-26',
        # June
        '2016-06-15', '2016-06-16', '2016-06-17', '2016-06-18',
        '2016-06-19', '2016-06-20', '2016-06-21',
        # September
        '2016-09-01', '2016-09-02', '2016-09-03', '2016-09-04',
        '2016-09-05', '2016-09-06', '2016-09-07',
    ]

    with open(options.file, 'w') as extra_out:
        for day in dates:
            # Fresh zeroed tallies for each day.
            counters = dict(
                (name, dict.fromkeys(watch_flags, 0)) for name in channels
            )
            source_path = '//user_sessions/pub/search/daily/' + day + '/clean'

            with yt.TempTable(prefix="tmp_freshness") as tmptable:
                yt.run_reduce(
                    extract,
                    source_table=source_path,
                    destination_table=tmptable,
                    local_files=[options.blockstat],
                    reduce_by='key',
                    spec={'data_size_per_job': 16000000000},  # ~16GB
                )
                for row in yt.read_table(tmptable):
                    channel = row['channel']
                    if channel == 'culture':
                        extra_out.write(day + '\t' + row['query'] + '\n')
                    # Rows with a channel outside the known list are ignored.
                    if channel in counters:
                        counters[channel][str(row['watch'])] += 1

            for flag in watch_flags:
                cells = ''.join(
                    str(counters[name][flag]) + '\t' for name in channels
                )
                print(day + '\t' + flag + '\t' + cells)

# Script entry point: run the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
