# -*- coding: UTF-8 -*-
from mapreducelib import MapReduce, Record
import sys
import random

def grep(rec):
    """Map step: emit one counting record per qualifying search request.

    A record qualifies when its tab-separated ``rec.value`` line:

    * contains ``type=REQUEST``, ``service=www.yandex`` and ``dom-region=ru``;
    * has a ``query=`` field whose lower-cased text contains 'серия';
    * has a ``search-props=`` field carrying
      ``ApplyBlender.IntentWeight/FRESH=<w>`` with ``w / (1 + w) > 0.2``
      (a missing property counts as ``w = 0`` and is therefore rejected).

    Lines without a ``query=`` field, or with a weight that cannot be
    turned into a ratio, are skipped instead of aborting the whole job.

    Args:
        rec: input record; only its ``value`` attribute is read.

    Yields:
        ``Record(query, '1', ' ')`` with ``query`` lower-cased and cut to
        at most 4096 characters.
    """
    line = rec.value
    if ('type=REQUEST' not in line
            or 'service=www.yandex' not in line
            or 'dom-region=ru' not in line):
        return

    query_prefix = 'query='
    props_prefix = 'search-props='
    fresh_prefix = 'ApplyBlender.IntentWeight/FRESH='

    query = None
    weight = '0'
    for field in line.split('\t'):
        if field.startswith(query_prefix):
            query = field[len(query_prefix):].lower()
        elif field.startswith(props_prefix):
            # Drop the field name before splitting, otherwise the first
            # property still carries the "search-props=" prefix and can
            # never match the FRESH key.
            for prop in field[len(props_prefix):].split(','):
                if prop.startswith(fresh_prefix):
                    weight = prop[len(fresh_prefix):]
                    break

    # A request line without any query= field has nothing to count.
    if query is None or 'серия' not in query:
        return

    try:
        weight = float(weight)
        prob = weight / (1 + weight)
    except (ValueError, ZeroDivisionError):
        # Malformed weight text, or weight == -1: skip this line only.
        return
    if prob <= 0.2:
        return

    yield Record(query[:4096], '1', ' ')

def reduce(key, recs):
    """Reduce step: sum the integer subkeys of the incoming records per key.

    Records whose ``key`` is the empty string are ignored. The ``key``
    argument itself is not used; grouping is done on each record's own
    ``key`` attribute.

    Args:
        key: reduce key handed in by the framework (unused here).
        recs: iterable of records exposing ``key`` and ``subkey``.

    Yields:
        ``Record(query, str(total), ' ')`` for every distinct non-empty key.
    """
    totals = {}
    for rec in recs:
        name = rec.key
        if name == '':
            continue
        totals[name] = totals.get(name, 0) + int(rec.subkey)

    for name, total in totals.items():
        yield Record(name, str(total), ' ')

def main():
    """Run the map and reduce steps over a fixed list of daily tables.

    For every date, ``grep`` is run over ``user_sessions/<date>`` into
    ``itajn/FR-2071/<date>_map``, and ``reduce`` then turns that
    intermediate table into ``itajn/FR-2071/<date>``.
    """
    blockstat = '/home/itajn/serploader/blockstat.dict'

    # Connection defaults shared by every job below.
    # (Passing testMode = True here was left disabled in the original.)
    MapReduce.useDefaults(
        server   = 'sakura.search.yandex.net:8013',
        username = 'freshness',
        mrExec   = '/Berkanavt/bin/mapreduce-dev',
        verbose  = True,
    )

    dates = [
        '20160518',
        '20160517',
        '20160516',
        '20160515',
        '20160514',
        '20160513',
        '20160512',
    ]

    for day in dates:
        source_table = 'user_sessions/' + day
        result_table = 'itajn/FR-2071/' + day
        mapped_table = result_table + '_map'

        MapReduce.runMap(
            grep,
            srcTable = source_table,
            dstTables = [mapped_table],
            files = [blockstat],
            sortMode = True
        )
        MapReduce.runReduce(
            reduce,
            srcTable = mapped_table,
            dstTables = [result_table],
            sortMode = True
        )



if __name__ == '__main__':
    main()
