#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import libra
import argparse
import yt.wrapper as yt
import heapq

# Upper bound for a result's 'FreshAge' marker: extract() counts a result as
# fresh when int(FreshAge) <= fage. 259200 == 3 * 24 * 60 * 60, so this is
# presumably three days expressed in seconds -- TODO confirm the marker's units.
fage=259200

def HandleOption():
    """Build the command-line parser for this script.

    Both options are optional and fall back to their defaults:
      --server  stored as ``server``    (YT proxy address)
      --bs      stored as ``blockstat`` (path to blockstat.dict)

    Returns the configured argparse.ArgumentParser; the caller is
    responsible for calling parse_args() on it.
    """
    option_table = (
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": 'hahn.yt.yandex.net:80',
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": '/home/itajn/serploader/blockstat.dict',
            "required": False,
        }),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli

def extract(key,recs):
    """Reducer: turn one key's session records into per-request click rows.

    The records are parsed with libra.ParseSession using './blockstat.dict'.
    A row is emitted for every request that is a 'TYandexWebRequest', has
    ServiceDomRegion == 'ru' and is a 'TMiscRequestProperties'.

    Args:
        key: reduce key of the group (not used by the computation).
        recs: records of the group, passed straight to libra.ParseSession.

    Yields:
        dict with
          'query'  -- request.Query truncated to 1000 characters;
          'ip'     -- the raw 'UPPER.Fresh.IntentProbability' search prop when
                      its numeric value exceeds 0.25, otherwise 0.0;
          'clicks' -- tab-joined "total<TAB>fresh<TAB>google" where
                      total  = clicks over all main blocks,
                      fresh  = clicks on TWebResult blocks whose 'FreshAge'
                               marker is <= fage, or -1 if no such block exists,
                      google = 1 if any TMiscResultClick has Path '65.66',
                               else 0 (glue() reports this as 'clicks_google').
    """
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session libra cannot parse is skipped silently.
        return
    for request in session:
        if not request.IsA('TYandexWebRequest'):
            continue
        if request.ServiceDomRegion != 'ru':
            continue
        if not request.IsA('TMiscRequestProperties'):
            continue

        sprops = request.SearchPropsValues
        ip = 0.0
        if 'UPPER.Fresh.IntentProbability' in sprops:
            raw_ip = sprops['UPPER.Fresh.IntentProbability']
            try:
                # Compare the *converted* value. The previous code evaluated
                # float(raw > 0.25), which is always truthy for a string on
                # Python 2, so the threshold was never applied.
                if float(raw_ip) > 0.25:
                    ip = raw_ip
            except (TypeError, ValueError):
                # Unparsable probability: treat as "no fresh intent".
                pass

        # Slicing is a no-op for queries of 1000 characters or fewer.
        query = request.Query[:1000]

        tclicks = 0
        fclicks = -1  # -1 marks "no fresh result on this page"
        for block in request.GetMainBlocks():
            block_clicks = len(block.GetClicks())
            tclicks += block_clicks
            res = block.GetMainResult()
            if not res.IsA("TWebResult"):
                continue
            markers = res.Markers
            if ("FreshAge" in markers) and (int(markers['FreshAge']) <= fage):
                # First fresh block replaces the -1 sentinel, later ones add up.
                fclicks = max(fclicks, 0) + block_clicks

        gclicks = int(any(
            click.IsA('TMiscResultClick') and click.Path == '65.66'
            for click in request.GetMiscClicks()
        ))

        yield {'query':query,
               'ip': ip,
               'clicks':'\t'.join([str(tclicks),str(fclicks),str(gclicks)])
               }

def glue(key,recs):
    """Reducer: aggregate all rows of one query into a single totals row.

    Args:
        key: mapping holding the reduce key; only key['query'] is read.
        recs: iterable of rows, each with
              'clicks' -- "total<TAB>fresh<TAB>google" integers as text, where
                          fresh == -1 means "no fresh result" and is skipped;
              'ip'     -- a value accepted by float().

    Yields:
        Exactly one dict with the query, the number of rows ('count') and the
        sums (not averages) of 'ip', total, fresh and google clicks.
    """
    total_clicks = 0
    fresh_clicks = 0
    google_clicks = 0
    ip = 0
    count = 0
    for r in recs:
        count += 1
        fields = r['clicks'].split('\t')
        total, fresh, google = int(fields[0]), int(fields[1]), int(fields[2])
        total_clicks += total
        # `!=` instead of the `<>` operator, which Python 3 no longer accepts.
        if fresh != -1:
            fresh_clicks += fresh
        google_clicks += google
        ip += float(r['ip'])
    yield {'query': key['query'],
           'ip': ip,
           'count': count,
           'clicks_total': total_clicks,
           'clicks_fresh': fresh_clicks,
           'clicks_google': google_clicks
           }

def _push_bounded(heap, item, limit):
    """Push item onto a min-heap, dropping the smallest entry once it holds limit items."""
    if len(heap) < limit:
        heapq.heappush(heap, item)
    else:
        heapq.heappushpop(heap, item)


def _print_ranked(day, label, heap):
    """Write the heap's tuples to stdout in descending order, one line each."""
    for row in sorted(heap, reverse=True):
        # Same bytes as the Python 2 statement `print day,'\t',label,'\t',row`
        # (a space before each tab, none after), but valid on Python 3 too.
        sys.stdout.write(day + ' \t' + label + ' \t' + str(row) + '\n')


def main():
    """Run the extract/sort/glue pipeline for each day and print ranked queries.

    For every day in the hard-coded list:
      1. reduce the day's user sessions with extract() into the output table;
      2. sort that table by 'query' in place;
      3. reduce it with glue() into '<output>_glue';
      4. read the glued rows and keep, in two bounded heaps of `heapsize`
         tuples (rate, query, ip/count, clicks_fresh/count, clicks_total/count):
           worst -- the largest clicks_google/count rates;
           best  -- the smallest non-zero rates (stored negated);
      5. print both heaps, labelled with the day.

    Rows whose summed 'ip' is zero are skipped.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    days=["2016-07-25",'2016-08-11','2016-03-01']
    heapsize=1000
    # NOTE(review): the heaps are created once and shared by all days, yet they
    # are printed after every day under that day's label, so later days also
    # show entries from earlier ones. Kept as is -- confirm whether per-day
    # heaps (or a single print after the loop) was intended.
    best=[]
    worst=[]

    for day in days:
        usersessions='//user_sessions/pub/search/daily/'+day+'/clean'
        output = '//home/freshness/staff/itajn/FR-1805/'+day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        yt.run_reduce(extract,
                      source_table=usersessions,
                      destination_table=output,
                      local_files = [args.blockstat],
                      reduce_by = 'key')
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by='query')
        yt.run_reduce(glue,
                      source_table=output,
                      destination_table=output+'_glue',
                      reduce_by = 'query')
        for rec in yt.read_table(output+'_glue'):
            # glue() emits 'ip' as a numeric sum, so the old comparison with
            # the string '0.0' never matched and nothing was skipped.
            if rec['ip'] == 0:
                continue
            count = rec['count']
            # True division: the module does `from __future__ import division`.
            google_rate = rec['clicks_google']/count
            details = (rec['query'],
                       rec['ip']/count,
                       rec['clicks_fresh']/count,
                       rec['clicks_total']/count)
            _push_bounded(worst, (google_rate,) + details, heapsize)
            if rec['clicks_google'] != 0:
                # Negated so the min-heap evicts the highest rates first.
                _push_bounded(best, (-google_rate,) + details, heapsize)
        _print_ranked(day, 'best', best)
        _print_ranked(day, 'worst', worst)


# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == '__main__':
    main()
