#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import libra
import argparse
import datetime

def HandleOption():
    """Build the command-line parser for this script.

    Options:
        --server  mapreduce (YT proxy) server address, stored as ``server``.
        --bs      path to the blockstat dictionary, stored as ``blockstat``.

    Both options are optional and fall back to the defaults listed below.

    Returns:
        argparse.ArgumentParser: the configured parser; the caller is
        responsible for calling ``parse_args()`` on it.
    """
    # (flag, destination attribute, help text, default value)
    option_specs = (
        ("--server", "server", "mapreduce server", 'hahn.yt.yandex.net:80'),
        ("--bs", "blockstat", "path to blockstat.dict", '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, target, description, fallback in option_specs:
        cli.add_argument(flag, dest=target, help=description, default=fallback, required=False)
    return cli

def extract(key, recs):
    """Reducer: emit the "Channel One" queries found in one reduce group.

    The records are parsed into a session with ``libra.ParseSession``; for
    every Yandex web request (desktop, mobile or touch) whose service domain
    region is ``'ru'`` or unset, the lower-cased, stripped query is emitted
    if it contains one of the Channel One phrases.

    Args:
        key: reduce key of the group; not used by this function.
        recs: raw records of the group, passed to ``libra.ParseSession``.

    Yields:
        dict: ``{'query': <normalized query>, 'season': <month number>}``.
        Note that ``season`` holds the calendar month (1-12) of the request
        timestamp.
    """
    try:
        # The dictionary is opened by relative path, so it has to be present
        # in the job's working directory.
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped silently.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # are not swallowed the way a bare ``except:`` would.
        return

    web_request_types = ("TYandexWebRequest", "TMobileYandexWebRequest", "TTouchYandexWebRequest")
    channel_phrases = ("первый канал", "первый телеканал", "1 канал")

    for request in session:
        if not any(request.IsA(type_name) for type_name in web_request_types):
            continue

        # Keep only the 'ru' service domain; a missing region is treated as 'ru'.
        region = request.ServiceDomRegion
        if region is not None and region != 'ru':
            continue

        # NOTE(review): fromtimestamp() converts using the local timezone of
        # the machine running the job, so requests close to a month boundary
        # may land in the neighbouring month - confirm this is acceptable.
        month = datetime.datetime.fromtimestamp(int(request.Timestamp)).month

        # NOTE(review): if Query is a UTF-8 byte string (Python 2 ``str``),
        # lower() only affects ASCII letters, so capitalized Cyrillic variants
        # of the phrases below would not match - confirm the type of Query.
        query = request.Query.lower().strip()

        if any(phrase in query for phrase in channel_phrases):
            yield {'query': query,
                   'season': month
                   }

def glue(key, recs):
    """Reducer: count the records of one (query, season) group.

    The original accumulator was initialised with the literal ``01``, which
    is octal 1 in Python 2 (and a syntax error in Python 3), so every group
    was reported with one record more than it actually had. The count now
    starts at zero.

    Args:
        key: mapping that provides the group's ``'query'`` and ``'season'``.
        recs: iterable of the records in the group; only their number matters.

    Yields:
        dict: a single row ``{'query': ..., 'season': ..., 'total': <count>}``.
    """
    total = sum(1 for _ in recs)
    yield {'query': key['query'],
           'season': key['season'],
           'total': total
           }


def main():
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    junk = u'\\\'-_.,—"‘’“”•;:›><()#%@!^&*+=№?[]►▼✔/|`~'
    words = {'1':{}, '4':{}, '6':{}, '9':{}}

    dates=[ #jan
          '2016-01-04','2016-01-05','2016-01-06','2016-01-07','2016-01-08','2016-01-09','2016-01-10',
            #apr
          '2016-04-20','2016-04-21','2016-04-22','2016-04-23','2016-04-24','2016-04-25','2016-04-26',
            #june
          '2016-06-15','2016-06-16','2016-06-17','2016-06-18','2016-06-19','2016-06-20','2016-06-21',
            #sept
          '2016-09-01','2016-09-02','2016-09-03','2016-09-04','2016-09-05','2016-09-06','2016-09-07'
          ]

    for day in dates:
        usersessions= '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/FU-3364/'+day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        yt.run_reduce(extract,
                      source_table=usersessions,
                      destination_table=output,
                      local_files = [args.blockstat],
                      reduce_by = 'key'
                      )
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by=['query','season']
                    )
        yt.run_reduce(glue,
                      source_table=output,
                      destination_table=output,
                      reduce_by = ['query','season']
                      )
        result = yt.read_table(output)
        for r in result:
            line = r['query'].decode('utf-8').lower()
            w = r['total']
            s = str(r['season'])
            for j in junk:
                line = line.replace(j,'')
            tmp = line.split(' ')
            for i in range(len(tmp)):
                if tmp[i] == '':
                    continue
                elif tmp[i] in words[s]:
                    words[s][tmp[i]] += w
                else:
                    words[s][tmp[i]] = w
    for s in words:
        for a in words[s]:
            print a.encode('utf-8'), '\t', words[s][a], '\t', s

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
