#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from mapreducelib import MapReduce, Record
import sys
import libra
import urllib, cgi
import json
def parse(rec):
    """Map step: re-key one raw log line by request id and event type.

    ``rec.value`` is treated as a tab-separated line. The text after the
    first field that starts with ``reqid=`` becomes the output key. The
    event type is found by substring search over the whole line, checked
    in this order: ``type=REQUEST`` -> ``'request'``, ``type=ACCESS`` ->
    ``'access'``, ``type=CLICK`` -> ``'click'``.

    Yields a single ``Record(reqid, event_type, line)`` carrying the
    unmodified line as its value.

    Lines that have no ``reqid=`` field or none of the three type markers
    yield nothing. (Previously such a line raised ``UnboundLocalError``
    because the key or subkey was never assigned.)
    """
    line = rec.value

    reqid = None
    for field in line.split('\t'):
        if field.startswith('reqid='):
            reqid = field[len('reqid='):]
            break  # only the first reqid field is used

    if 'type=REQUEST' in line:
        event_type = 'request'
    elif 'type=ACCESS' in line:
        event_type = 'access'
    elif 'type=CLICK' in line:
        event_type = 'click'
    else:
        event_type = None

    # Without both a key and a subkey the line cannot be grouped; skip it.
    if reqid is None or event_type is None:
        return

    yield Record(reqid, event_type, line)
def aggregate(key, recs):
    """Reduce step: summarise all log records sharing one request id.

    ``recs`` is iterated once; each record's ``subkey`` selects how its
    tab-separated ``value`` is interpreted:

    * ``'access'``  - only counted (the ``wizard`` counter).
    * ``'request'`` - every ``full-request=`` field is scanned for the
      ``ps_network`` / ``ps_age`` / ``ps_geo`` / ``ps_edu`` / ``ps_job``
      substrings, bumping the matching counter, plus ``total`` once per
      field that contains at least one of them. Every ``url=`` field is
      stored in ``serp`` under the text of the *following* field with its
      first four characters dropped (presumably a ``num=`` position
      field - confirm against the log format).
    * ``'click'``   - the text after ``url=`` of every such field is
      appended to ``clicks``.

    Records with any other subkey are ignored.

    Yields a single ``Record(key, '', <json>)`` where the JSON object has
    the keys ``wizard``, ``clicks``, ``filter`` and ``serp``.

    A ``url=`` field that is the last field of a request line has no
    following field and is skipped (previously this raised IndexError).
    """
    # (substring looked for in the full-request payload, counter to bump)
    filter_markers = (
        ('ps_network', 'network'),
        ('ps_age', 'age'),
        ('ps_geo', 'geo'),
        ('ps_edu', 'edu'),
        ('ps_job', 'job'),
    )

    wizard = 0
    clicks = []
    serp = {}
    filter_counts = {'network': 0, 'age': 0, 'total': 0, 'geo': 0, 'edu': 0, 'job': 0}

    for rec in recs:
        event_type = rec.subkey
        value = rec.value

        if event_type == 'access':
            wizard += 1
        elif event_type == 'request':
            fields = value.split('\t')
            for idx, field in enumerate(fields):
                if field.startswith('full-request='):
                    payload = field[len('full-request='):]
                    matched_any = False
                    for marker, counter in filter_markers:
                        if marker in payload:
                            filter_counts[counter] += 1
                            matched_any = True
                    if matched_any:
                        filter_counts['total'] += 1
                elif field.startswith('url='):
                    # The key comes from the next field; guard against a
                    # trailing url= with nothing after it.
                    if idx + 1 < len(fields):
                        serp[fields[idx + 1][4:]] = field[4:]
        elif event_type == 'click':
            for field in value.split('\t'):
                if field.startswith('url='):
                    clicks.append(field[4:])

    result = {'wizard': wizard, 'clicks': clicks, 'filter': filter_counts, 'serp': serp}
    yield Record(key, '', json.dumps(result))
def main():
    """Configure the MapReduce client and process each hard-coded day.

    For every date in the list, ``parse`` is run as a map over the
    ``people_<date>`` table into ``peopleclean_<date>_map``, and then
    ``aggregate`` is run as a reduce over that intermediate table into
    ``peopleclean_<date>``. The blockstat dictionary path is handed to the
    map call through its ``files`` argument.
    """
    blockstat_path = '/home/itajn/serploader/blockstat.dict'

    MapReduce.useDefaults(
        server='sakura.search.yandex.net:8013',
        username='freshness',
        mrExec='/Berkanavt/bin/mapreduce-dev',
        verbose=True,
        # testMode = True,
    )

    for day in ['20160330', '20160402', '20160404']:
        source_table = 'itajn/FR-1979/people_' + day
        final_table = 'itajn/FR-1979/peopleclean_' + day
        # Intermediate table written by the map and read by the reduce.
        mapped_table = final_table + '_map'

        MapReduce.runMap(
            parse,
            srcTable=source_table,
            dstTables=[mapped_table],
            files=[blockstat_path],
            sortMode=True,
        )
        MapReduce.runReduce(
            aggregate,
            srcTable=mapped_table,
            dstTables=[final_table],
            sortMode=True,
        )

# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
