#!/usr/bin/env python
# -*- coding: utf-8 -*-

from mapreducelib import MapReduce, Record, TemporaryTable
import sys
import libra
import random
import argparse
import datetime

def HandleOption():
    """Build the command-line parser for this script.

    Required options: --ts, --logs, --top, --low.  All other options have
    defaults.  ``--num`` is parsed as an integer so that an invalid value is
    rejected at argument-parsing time instead of after the cluster jobs ran.

    Returns:
        argparse.ArgumentParser: the configured parser (not yet applied to
        any argument list).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ts", dest="timestamp", required=True,
                        help="fast logs timestamp")
    parser.add_argument("--server", dest="server",
                        default='sakura.search.yandex.net:8013',
                        help="mapreduce server")
    parser.add_argument("--user", dest="user", default='freshness',
                        help="mapreduce user")
    parser.add_argument("--mr", dest="mr",
                        default='/Berkanavt/bin/mapreduce-dev',
                        help="mapreduce binary")
    parser.add_argument("--bs", dest="blockstat",
                        default='/home/itajn/serploader/blockstat.dict',
                        help="path to blockstat.dict")
    parser.add_argument("--logs", dest="logs", required=True,
                        help="path to user_sessions on cluster")
    parser.add_argument("--mode", dest="mode", default='hour',
                        help="mode: hour/day")
    # type=int makes argparse validate the value up front.
    parser.add_argument("--num", dest="num", type=int, default=10000,
                        help="number of queries")
    parser.add_argument("--top", dest="outtop", required=True,
                        help="file for best queries")
    parser.add_argument("--low", dest="outlow", required=True,
                        help="file for worst queries")
    return parser

def countsurplus(key, recs):
    """Reduce step: per request that showed the news wizard, count long clicks.

    The records are parsed into a session with libra.  For every Yandex web
    request (desktop, mobile or touch) whose ServiceDomRegion is 'ru' or
    None, the main blocks are walked in order:

    * clicks with DwellTime >= 15 on a wizard block named "news" are added
      to ``win``;
    * once a news block has been seen, the long clicks on the first
      following non-news block are added to ``loss`` and the walk stops.

    Blocks preceding the first news block are ignored.

    Args:
        key: reduce key of the session (presumably a user id; unused here).
        recs: records of the session, handed to libra.ParseSession.

    Yields:
        Record(query, str(win), str(loss)) for each request that contained
        a news wizard block; the query is lower-cased and cut to 4095
        characters.
    """
    min_dwell_time = 15
    max_query_len = 4095

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped.  Only
        # Exception is caught so SystemExit/KeyboardInterrupt still propagate.
        return

    for request in session:
        is_web_request = (request.IsA("TYandexWebRequest")
                          or request.IsA("TMobileYandexWebRequest")
                          or request.IsA("TTouchYandexWebRequest"))
        if not is_web_request:
            continue

        region = request.ServiceDomRegion
        if not (region is None or region == 'ru'):
            continue

        query = request.Query.lower()[:max_query_len]

        win = 0
        loss = 0
        hasnews = False
        for block in request.GetMainBlocks():
            long_clicks = sum(1 for click in block.GetClicks()
                              if int(click.DwellTime) >= min_dwell_time)
            res = block.GetMainResult()
            is_wizard = res.IsA("TBlenderWizardResult") or res.IsA("TWizardResult")
            if is_wizard and res.Name == "news":
                win += long_clicks
                hasnews = True
                continue
            if hasnews:
                # Only the first non-news block after the news wizard counts
                # against it.
                loss += long_clicks
                break

        if hasnews:
            yield Record(query, str(win), str(loss))

def aggsurplus(key, recs):
    """Reduce step: fold all records of one key into a single surplus record.

    Each input record carries an integer in ``subkey`` (wins) and an integer
    in ``value`` (losses), both as strings.

    Args:
        key: reduce key; passed through unchanged as the output key.
        recs: iterable of records with ``subkey`` and ``value`` attributes.

    Yields:
        Exactly one Record(key, str(wins - 2 * losses), str(n)), where ``n``
        is the number of input records.
    """
    wins = losses = seen = 0
    for rec in recs:
        seen += 1
        wins += int(rec.subkey)
        losses += int(rec.value)
    # A loss weighs twice as much as a win.
    surplus = wins - 2 * losses
    yield Record(key, str(surplus), str(seen))

def main():
    """Run both reduce jobs and write the best and worst queries to files.

    Steps:
      1. Parse the command line and configure MapReduce defaults.
      2. Build the source table name as ``--logs`` + timestamp.  The first 10
         characters of ``--ts`` are used; in 'day' mode they are interpreted
         as epoch seconds and formatted as YYYYMMDD (local time).
      3. Run ``countsurplus`` over the source table into a temporary table,
         then ``aggsurplus`` over that temporary table.
      4. Read the result and write up to ``--num`` lines of
         ``query<TAB>score`` to ``--top`` (highest scores first) and to
         ``--low`` (lowest scores first).
    """
    args = HandleOption().parse_args()
    # Convert before launching the cluster jobs so a bad --num fails fast.
    num = int(args.num)

    MapReduce.useDefaults(
        server=args.server,
        username=args.user,
        mrExec=args.mr,
        verbose=True,
    )

    timestamp = args.timestamp[:10]
    if args.mode == 'day':
        timestamp = datetime.datetime.fromtimestamp(int(timestamp)).strftime('%Y%m%d')
    usersessions = args.logs + timestamp

    with TemporaryTable(project="tmp") as tmptable:
        MapReduce.runReduce(
            countsurplus,
            srcTable=usersessions,
            dstTable=tmptable.name,
            files=[args.blockstat],
            sortMode=True,
        )
        # NOTE(review): source and destination are the same temporary table;
        # confirm that the MapReduce backend supports in-place reduce.
        MapReduce.runReduce(
            aggsurplus,
            srcTable=tmptable.name,
            dstTable=tmptable.name,
            files=[args.blockstat],
            sortMode=True,
        )

        # query -> aggregated score (subkey of the aggsurplus output).
        queries = {}
        for rec in MapReduce.getSample(tmptable.name, count=None):
            queries[rec.key] = int(rec.subkey)

        def write_ranked(path, descending):
            # Write at most `num` queries ordered by score; a non-positive
            # `num` yields an empty file.
            ranked = sorted(queries, key=queries.get, reverse=descending)
            with open(path, "w") as out:
                for query in ranked[:max(num, 0)]:
                    out.write(query + '\t' + str(queries[query]) + '\n')

        write_ranked(args.outtop, True)
        write_ranked(args.outlow, False)

# Run main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
