#!/usr/bin/env python2
# coding=UTF-8
from mapreducelib import Record, MapReduce as MR
from mymrutils import *
import random
import libra
import re
import json

def main():
    """Run the whole MapReduce pipeline, leaving the result in ``DST``.

    Steps, in order:

    1. For every daily ``sample_by_yuid_1p/user_sessions/<day>`` table
       produced by ``strdaterange``, reduce it with ``getData`` (appending
       to a temporary table) and then reduce the temporary table in place
       with ``Summarizer(useSubkey=True)``.
    2. Append the ``parseSLDB`` map of
       ``sitelinks/production/sitelinks-json`` to the temporary table.
    3. Copy the temporary table to ``DST``, run ``checkDB`` and then
       ``top`` over ``DST`` in place, and finally sort it with ``mrsort``.
    """
    MR.useDefaults(
        username='sitelinks',
        server='sakura.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict'],
        mrExec='/home/pesitnikova/mapreduce/mapreduce',
    )

    DST = 'pesitnikova/tmp'
    with mktmp() as tmp:
        for day in strdaterange((2015, 9, 10), (2015, 9, 17)):
            src = 'sample_by_yuid_1p/user_sessions/{}'.format(day)
            MR.runReduce(getData, srcTable=src, dstTable=tmp, appendMode=True)
            # Re-summarize after every day so the temporary table holds
            # collapsed rows rather than every raw record.
            # NOTE(review): presumably Summarizer adds up the '1' values per
            # (key, subkey) -- confirm in mymrutils.
            MR.runReduce(Summarizer(useSubkey=True), srcTable=tmp, dstTable=tmp)

        # These steps are siblings of the loop above, still inside the
        # ``with`` block because they read the temporary table.
        MR.runMap(
            parseSLDB,
            srcTable='sitelinks/production/sitelinks-json',
            dstTable=tmp,
            appendMode=True,
        )
        MR.copyTable(srcTable=tmp, dstTable=DST)
        MR.runReduce(checkDB, srcTable=DST, dstTable=DST)
        MR.runReduce(top, srcTable=DST, dstTable=DST)
        mrsort(DST)

def top(key, recs):
    """Reducer: keep the ten records of one key with the largest values.

    Args:
        key: Reduce key shared by every record in ``recs``.
        recs: Iterable of records exposing ``subkey`` and ``value``;
            each ``value`` must be accepted by ``int()``.

    Yields:
        ``Record(key, subkey, value)`` for at most ten input records,
        ordered by ``int(value)`` descending. Records with equal counts
        keep their input order.

    Raises:
        ValueError: If a record's ``value`` cannot be parsed by ``int()``.
    """
    # Function-scope import so this block does not depend on the file header.
    import heapq

    pairs = ((rec.subkey, rec.value) for rec in recs)
    # heapq.nlargest is documented as equivalent to
    # sorted(pairs, key=..., reverse=True)[:10], but it does not need to
    # materialize and sort the whole group just to keep ten rows.
    best = heapq.nlargest(10, pairs, key=lambda pair: int(pair[1]))
    for subkey, value in best:
        yield Record(key, subkey, value)
		

def checkDB(key, recs):
    """Reducer: re-key one URL's rows by query and tag URLs seen in the DB.

    In ``main`` this runs over rows keyed by URL: rows with a non-empty
    subkey carry ``subkey=query, value=count`` (from ``getData``), while
    rows with an empty subkey are the markers emitted by ``parseSLDB``.

    Args:
        key: The URL shared by every record in ``recs``.
        recs: Iterable of records exposing ``subkey`` and ``value``.

    Yields:
        ``Record(query, url, count)`` for every distinct query, where
        ``url`` is ``key`` with ``'\\tinSLDB'`` appended when at least one
        empty-subkey marker was present. If a query occurs more than once,
        the last value seen is the one emitted.
    """
    counts_by_query = {}
    found_in_db = False
    for rec in recs:
        if len(rec.subkey) == 0:
            # Marker row: it only signals that the URL is in the DB.
            found_in_db = True
            continue
        counts_by_query[rec.subkey] = rec.value

    if found_in_db:
        tagged_url = key + '\tinSLDB'
    else:
        tagged_url = key

    for query, count in counts_by_query.iteritems():
        yield Record(query, tagged_url, count)

# Literal query texts to look for inside user queries. Order matters:
# getData attributes a request to the FIRST entry whose pattern matches.
# NOTE(review): because the search is unanchored, an entry that contains an
# earlier entry as a substring (e.g. "вконтакте" after "вк") can never be
# the first match -- confirm whether that ordering is intended.
_QUERIES = (
    "вк",
    "одноклассники",
    "яндекс",
    "авито",
    "одноклассники моя страница",
    "вконтакте",
    "дойки",
    "майл",
    "вконтакте моя страница",
    "в контакте",
    "ютуб",
    "сбербанк онлайн",
    "vk",
    "yandex",
    "контакт",
    "порно",
    "в контакте моя страница",
    "youtube",
    "гугл",
    "дом 2 официальный сайт",
    "vk.com",
    "рамблер",
    "mail.ru",
    "дром",
    "сбербанк",
    "дом 2",
    "мтс",
    "google",
    "переводчик онлайн",
    "одноклассники социальная сеть",
    "ржд",
    "дойки ком",
    "facebook",
    "авто ру",
    "маил",
    "mail",
    "алиэкспресс",
    "майл почта",
    "odnoklassniki.ru",
    "од",
    "контакт моя страница",
    "фейсбук",
    "мегафон",
    "однокласники",
    "спортбокс",
    "xnxx",
    "билайн личный кабинет",
    "гисметео",
    "чемпионат",
)

# (query, compiled pattern) pairs, built once at import time. The query is
# escaped so that the "." in entries such as "vk.com" or "mail.ru" matches a
# literal dot instead of any character. The trailing ".+" requires at least
# one more character after the query text, so a request that ends exactly
# with the query text does not match.
_QUERY_PATTERNS = tuple(
    (query, re.compile(re.escape(query) + ".+")) for query in _QUERIES
)


def getData(key, recs):
    """Reducer: emit clicked URLs for requests that extend a tracked query.

    The session is parsed with ``libra.ParseSession`` using
    ``blockstat.dict``. Only requests that are ``TTouchYandexWebRequest``
    and whose ``ServiceDomRegion`` is ``'ru'`` are considered. Each such
    request is attributed to the first entry of ``_QUERIES`` whose pattern
    is found in ``req.Query``; requests matching no entry are skipped.

    Args:
        key: Reduce key of the session (not used).
        recs: Session records, passed straight to ``libra.ParseSession``.

    Yields:
        ``Record(url, query, '1')`` for every click of a matching request
        whose URL is non-empty and does not start with
        ``"http://yandex.ru"``. The URL is truncated to 4000 characters and
        used as the key (``checkDB`` later groups rows by URL).

    Raises:
        NameError, AttributeError, TypeError: Re-raised, so programming
            errors are not hidden by the best-effort handler below.
    """
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            if not req.IsA('TTouchYandexWebRequest'):
                continue
            if req.ServiceDomRegion != 'ru':
                continue

            for query, pattern in _QUERY_PATTERNS:
                if not pattern.search(req.Query):
                    continue
                for click in req.GetClicks():
                    if click.Url and not click.Url.startswith("http://yandex.ru"):
                        yield Record(click.Url[:4000], query, '1')
                # Only the first matching query gets the request's clicks.
                break
    except (NameError, AttributeError, TypeError):
        raise
    except Exception:
        # Best-effort: any other failure ends this session's output
        # silently; records already yielded stay emitted.
        # NOTE(review): presumably meant to skip sessions libra cannot
        # parse -- confirm, and consider counting or logging the skips.
        pass

def parseSLDB(rec):
    """Mapper: emit a marker record for each URL listed in a ``ru`` row.

    Rows whose key does not end with ``'\\tru'`` produce nothing
    (NOTE(review): presumably the suffix selects the Russian database
    entries -- confirm against the table layout).

    Args:
        rec: Input record; ``rec.value`` is a JSON array of objects that
            each have a ``'url'`` entry.

    Yields:
        ``Record(url, '', '')`` with the URL encoded as UTF-8. The empty
        subkey is what ``checkDB`` uses to recognise these marker rows.
    """
    if not rec.key.endswith('\tru'):
        return

    sitelinks = json.loads(rec.value, encoding = 'UTF-8')
    for sitelink in sitelinks:
        encoded_url = sitelink['url'].encode('UTF-8')
        yield Record(encoded_url, '', '')


# Run the pipeline only when this file is executed as a script.
if "__main__" == __name__:
    main()
