#!/usr/bin/env python2
# coding=UTF-8
from mapreducelib import Record, MapReduce as MR
from mymrutils import *
import libra
import re
from whitelist_ru import Queries
import random

def main():
    """Run the full pipeline that fills the news-hosts BNA click table.

    Steps, in order:
      1. Configure the MapReduce client defaults.
      2. For every daily ``user_sessions/<day>`` table in the date range,
         reduce it with ``getData`` and append the output to the result table.
      3. Reduce the result table with ``Summarizer()``, map it with ``trans``
         (which folds the salted ``All<N>`` / ``BNA<N>`` keys together),
         reduce with ``Summarizer()`` again, and sort it.

    NOTE(review): ``strdaterange``, ``Summarizer`` and ``mrsort`` are not
    defined in this file; presumably they come from ``from mymrutils import *``.
    Whether the end date is inclusive depends on ``strdaterange`` -- confirm.
    NOTE(review): step 2 appends to the result table without clearing it
    first, so re-running the script may add to existing data -- confirm
    that this is intended.
    """
    MR.useDefaults(
        username='sitelinks',
        server='sakura.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict'],
        mrExec='/home/pesitnikova/mapreduce/mapreduce',
    )

    result_table = 'pesitnikova/news_hosts_with_bna_clicks_as_bna_touch_full_month'

    # Build the complete list of daily source tables before launching any job.
    first_day = (2015, 11, 20)
    last_day = (2015, 12, 21)
    session_tables = [
        'user_sessions/{}'.format(day)
        for day in strdaterange(first_day, last_day)
    ]

    # One reduce per day, all appended into the same destination table.
    for session_table in session_tables:
        MR.runReduce(
            getData,
            srcTable=session_table,
            dstTable=result_table,
            appendMode=True,
        )

    # Every remaining step rewrites the result table in place.
    MR.runReduce(Summarizer(), srcTable=result_table, dstTable=result_table)
    MR.runMap(trans, srcTable=result_table, dstTable=result_table)
    MR.runReduce(Summarizer(), srcTable=result_table, dstTable=result_table)
    mrsort(result_table)

def trans(rec):
    """Map step: fold salted counter keys back into their base name.

    A record whose key starts with 'All' is re-emitted under the key 'All',
    one whose key starts with 'BNA' under the key 'BNA'; any other record
    keeps its key. The subkey is always replaced by '' and the value is
    passed through unchanged.

    Note that the prefix test is case-sensitive and applies to every key,
    so any other key that happens to begin with 'All' or 'BNA' is folded too.
    """
    out_key = rec.key
    # 'All' is checked before 'BNA'; the first matching prefix wins.
    for prefix in ('All', 'BNA'):
        if out_key.startswith(prefix):
            out_key = prefix
            break
    yield Record(out_key, '', rec.value)


def makeRE():
    """Build the regex source that matches URLs on the whitelisted hosts.

    Returns a pattern *string* (not a compiled regex) of the form
    ``^https?://(www\\.)?(host1|host2|...)/.*`` where every entry of
    ``Queries`` is escaped with ``re.escape`` so it is matched literally.

    The pattern requires a '/' right after the host, so a URL that ends
    immediately after the host name does not match.
    """
    # Raw string literals keep the backslash in "\." without relying on
    # Python passing an unrecognised escape sequence through unchanged.
    escaped_hosts = '|'.join(re.escape(host) for host in Queries)
    return r'^https?://(www\.)?(' + escaped_hosts + r')/.*'

# Whitelist URL pattern, built once at import time. This is the pattern
# string returned by makeRE(); getData passes it to re.search together
# with re.IGNORECASE.
hostsRE = makeRE()
def getData(key, recs):
    """Reduce step: count touch web requests, BNA shows and clicked BNA hosts.

    The session records are parsed with ``libra.ParseSession``. For every
    request that is a ``TTouchYandexWebRequest`` with ``ServiceDomRegion``
    equal to 'ru' the function yields:

      * ``Record('All<N>', '', '1')`` for the request itself;
      * ``Record('BNA<N>', '', '1')`` if the request has at least one
        blockstat block with path '/snippet/bno/link';
      * ``Record(getHost(url), '', '1')`` for each 'pos' variable of the
        first such block whose position matches an organic web result that
        was clicked and whose URL matches ``hostsRE`` (case-insensitively).

    ``<N>`` is a random integer in [0, 1000), so the two counter keys are
    spread over many distinct keys; ``trans`` folds them back together.

    Args:
        key: reduce key of the session; not used.
        recs: records of one session, handed to ``libra.ParseSession``.

    Raises:
        NameError, AttributeError, TypeError: re-raised on purpose so that
            programming errors are not hidden. Any other exception ends
            processing of the current session silently (best effort).

    NOTE(review): ``getHost`` is not defined in this file; presumably it
    comes from ``from mymrutils import *``.
    """
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            if not req.IsA('TTouchYandexWebRequest'):
                continue
            if req.ServiceDomRegion != 'ru':
                continue

            yield Record('All' + str(random.randrange(1000)), '', '1')

            bno = [x for x in req.GetBSBlocks() if x.Path == '/snippet/bno/link']
            if not bno:
                continue

            clcks = set(clck.Url for clck in req.GetClicks())

            yield Record('BNA' + str(random.randrange(1000)), '', '1')

            # Organic web results are the same for every 'pos' variable of
            # the request, so build the list lazily and at most once.
            web_results = None
            for var in bno[0].GetVars():
                if var[0] != 'pos':
                    continue

                # The first character of the value is dropped before the
                # conversion (presumably values look like 'p3' -- confirm).
                try:
                    pos = int(var[1][1:])
                except ValueError:
                    # An unparsable position used to fall into the blanket
                    # handler below and drop the rest of the session; skip
                    # only this variable instead.
                    continue

                if web_results is None:
                    main_results = [blk.GetMainResult() for blk in req.GetMainBlocks()]
                    web_results = [res for res in main_results if res.IsA("TWebResult")]

                matched = [res for res in web_results if res.Position == pos]
                if not matched:
                    # No organic result at that position. Indexing the empty
                    # list used to raise IndexError, which the blanket
                    # handler swallowed, silently ending the whole session.
                    continue

                url = matched[0].Url
                if url in clcks and re.search(hostsRE, url, re.IGNORECASE):
                    yield Record(getHost(url), '', '1')

    except (NameError, AttributeError, TypeError):
        # Programming errors must stay visible.
        raise
    except Exception:
        # Deliberate best effort: a session that cannot be processed is
        # abandoned at the point of failure; records yielded so far stay.
        pass

# Script entry point: start the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
	main()
