#!/usr/bin/env python2
# coding=UTF-8
from mapreducelib import Record, MapReduce as MR
from mymrutils import *
import random
import libra
import re
import json
from top100_queries import Queries

def main():
    """Build the table of most frequent extensions of the top-100 queries.

    For every daily ``user_sessions/<date>`` table produced by
    ``strdaterange((2015,9,4), (2015,10,5))`` the ``getData`` reducer appends
    its output to a temporary table, which is then re-reduced in place with
    ``Summarizer(useSubkey=True)``. Afterwards the temporary table is copied
    to the destination table, reduced with ``top`` and sorted.

    NOTE(review): ``mktmp``, ``strdaterange``, ``Summarizer`` and ``mrsort``
    are not defined in this file (presumably they come from ``mymrutils``);
    whether the date range includes its end date should be confirmed there.
    """
    MR.useDefaults(
        username='sitelinks',
        server='sakura.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict', 'top100_queries.py'],
        mrExec='/home/pesitnikova/mapreduce/mapreduce'
    )

    dst = 'pesitnikova/reformulations/top100queries_extentions_dsk_full_month'
    daily_tables = [
        'user_sessions/{}'.format(day)
        for day in strdaterange((2015, 9, 4), (2015, 10, 5))
    ]

    with mktmp() as tmp:
        for src in daily_tables:
            MR.runReduce(getData, srcTable=src, dstTable=tmp, appendMode=True)
            # Re-reduce the accumulated table after every day, so each new
            # day is appended to an already summarized table.
            MR.runReduce(Summarizer(useSubkey=True), srcTable=tmp, dstTable=tmp)

        # These steps run once, after all days are processed, and stay inside
        # the ``with`` block because the copy still reads ``tmp``.
        MR.copyTable(srcTable=tmp, dstTable=dst)
        MR.runReduce(top, srcTable=dst, dstTable=dst)
        mrsort(dst)

def top(key, recs):
    """Reducer: keep the 10 records of ``key`` with the largest values.

    Args:
        key: reduce key shared by every record in ``recs``.
        recs: iterable of records exposing ``subkey`` and ``value``;
            ``value`` must be accepted by ``int()``.

    Yields:
        ``Record(key, subkey, value)`` for at most 10 records, ordered by
        ``int(value)`` descending. Records with equal values keep their
        input order. ``value`` is passed through unchanged (not converted).

    Raises:
        ValueError: if some ``value`` cannot be converted with ``int()``.
    """
    # Function-scope import so the reducer does not depend on the module's
    # import block.
    import heapq

    pairs = ((rec.subkey, rec.value) for rec in recs)
    # nlargest is documented as equivalent to
    # sorted(pairs, key=..., reverse=True)[:10], but it consumes the input
    # lazily and keeps only 10 candidates instead of sorting everything.
    best = heapq.nlargest(10, pairs, key=lambda pair: int(pair[1]))
    for subkey, value in best:
        yield Record(key, subkey, value)
		

def getData(key, recs):
    """Reducer: emit every extension of a top-100 query found in a session.

    The session is parsed with ``libra.ParseSession(recs, 'blockstat.dict')``.
    Only requests that are ``TYandexWebRequest`` and whose
    ``ServiceDomRegion`` equals ``'ru'`` are considered. For each such
    request and each ``q`` in ``Queries``, a record is emitted when the
    request's query starts with ``q`` and continues with at least one more
    character (``.`` does not match a newline).

    Args:
        key: reduce key of the session (not used here).
        recs: records of one session, passed as-is to ``libra.ParseSession``.

    Yields:
        ``Record(q, query[:4000], '1')`` for each matching (request, q) pair.

    Raises:
        NameError, AttributeError, TypeError: re-raised unchanged. Any other
        ``Exception`` silently ends processing of this session; records
        already yielded are kept, the remaining requests are dropped.
    """
    # Compiled lazily: sessions without a qualifying request never pay for
    # escaping/compiling the patterns.
    prefix_patterns = None
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            if not req.IsA('TYandexWebRequest'):
                continue
            if req.ServiceDomRegion != 'ru':
                continue

            if prefix_patterns is None:
                # re.escape makes each query a literal prefix. Unescaped,
                # a query containing regex metacharacters would match the
                # wrong strings or raise re.error, which the handler below
                # would swallow together with the rest of the session.
                prefix_patterns = [
                    (q, re.compile(re.escape(q) + ".+")) for q in Queries
                ]

            query = req.Query
            for q, pattern in prefix_patterns:
                if pattern.match(query):
                    # NOTE(review): the 4000 cut-off presumably guards a
                    # subkey size limit -- confirm against the MR docs.
                    yield Record(q, query[:4000], '1')
    except (NameError, AttributeError, TypeError):
        # Let these through instead of hiding them in the best-effort
        # handler below (presumably because they signal programming errors).
        raise
    except Exception:
        # Deliberate best-effort: a session that fails for any other reason
        # is abandoned without failing the whole job.
        pass

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	main()
