#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import libra
import argparse
import yt.wrapper as yt
import json

# Experiment test ID: ``extract`` keeps only requests for which
# ``request.HasTestID(testid)`` is true.
testid = '31885'

def HandleOption():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting two optional flags:
        ``--server`` (stored as ``server``) and ``--bs`` (stored as
        ``blockstat``), each with a built-in default.
    """
    # (flag, destination attribute, help text, default value)
    option_specs = (
        ("--server", "server", "mapreduce server",
         'hahn.yt.yandex.net:80'),
        ("--bs", "blockstat", "path to blockstat.dict",
         '/home/itajn/serploader/blockstat.dict'),
    )
    cli = argparse.ArgumentParser()
    for flag, dest, help_text, default in option_specs:
        cli.add_argument(flag, dest=dest, help=help_text,
                         default=default, required=False)
    return cli

def extract(key, recs):
    """Reducer: emit one row per matching web request of a session.

    The records of one reduce group are parsed with ``libra.ParseSession``
    using the dictionary at ``./blockstat.dict``.  A request is considered
    only when it is a ``TYandexWebRequest``, its ``ServiceDomRegion`` is
    ``'ru'``, it carries the module-level ``testid`` and its query is
    exactly one of the two probe queries.

    Args:
        key: reduce key of the group (not used by the reducer).
        recs: records of the group, handed as-is to ``libra.ParseSession``.

    Yields:
        * ``{'sprops': <JSON dump of the search props>, 'ip': <value>}``
          when ``'FRESH'`` is found in the
          ``UPPER.ApplyBlender.RandomizedIntents`` search prop;
        * ``{'error': 'nolocrandom'}`` when that prop is absent or does not
          contain ``'FRESH'``;
        * ``{'error': 'nosearchprops'}`` when the request is not a
          ``TMiscRequestProperties``.

        Nothing is yielded when the session cannot be parsed.
    """
    # NOTE(review): the dictionary path is hard-coded; presumably the file
    # shipped with the job must be named ``blockstat.dict`` -- confirm
    # against the caller.
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that fails to parse is skipped.  Catching
        # ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate.
        return

    wanted_queries = ('утюг', 'гепард')
    randomized_key = 'UPPER.ApplyBlender.RandomizedIntents'
    probability_key = 'UPPER.Fresh.IntentProbability'

    for request in session:
        if not request.IsA('TYandexWebRequest'):
            continue
        if request.ServiceDomRegion != 'ru':
            continue
        if not request.HasTestID(testid):
            continue
        if request.Query not in wanted_queries:
            continue

        if not request.IsA('TMiscRequestProperties'):
            yield {'error': 'nosearchprops'}
            continue

        sprops = request.SearchPropsValues
        if randomized_key not in sprops or 'FRESH' not in sprops[randomized_key]:
            yield {'error': 'nolocrandom'}
            continue

        # NOTE(review): the stored value is passed through unchanged while
        # the fallback is a float -- confirm the mixed type is intended.
        if probability_key in sprops:
            ip = sprops[probability_key]
        else:
            ip = 0.0
        yield {'sprops': json.dumps(sprops), 'ip': ip}

def main():
    """Run the ``extract`` reducer over one day of search user sessions.

    Parses the command-line options, points the YT client at the chosen
    proxy, creates the destination table when it does not exist yet and
    starts a reduce operation keyed by ``key``.  The blockstat dictionary
    path from the options is passed to the operation via ``local_files``.
    """
    options = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': options.server}})

    day = '2016-09-27'
    src_table = '//user_sessions/pub/search/daily/' + day + '/clean'
    dst_table = '//home/freshness/staff/itajn/FR-2404/' + day

    table_missing = not yt.exists(dst_table)
    if table_missing:
        yt.create_table(path=dst_table, recursive=True)

    job_spec = {'data_size_per_job': 16000000000}  # ~16GB
    yt.run_reduce(
        extract,
        source_table=src_table,
        destination_table=dst_table,
        reduce_by='key',
        local_files=[options.blockstat],
        spec=job_spec
    )

# Start the job only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
