#!/usr/bin/env python2
# coding=UTF-8
from mapreducelib import Record, MapReduce as MR
from mymrutils import *
import random
import libra
import re
from news_hosts import Queries

def main():
    """Run the whole pipeline: collect per-day counters, then build the report.

    For every daily ``user_sessions/<date>`` table the sessions are reduced
    with ``processUSS`` into a temporary table, which is re-aggregated with
    ``Summarizer()`` after each day. The accumulated counters are then
    mapped with ``trans``, aggregated per key/subkey, extended with the row
    produced by ``summ``, and finally rotated, given a sort key and sorted.
    """
    MR.useDefaults(
        username='sitelinks',
        server='sakura.search.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict', 'news_hosts.py'],
        mrExec='/home/pesitnikova/mapreduce/mapreduce',
    )

    report_table = 'pesitnikova/news_hosts_shows_test_full_week'
    first_day = (2015, 10, 7)
    last_day = (2015, 10, 14)

    with mktmp() as scratch:
        session_tables = [
            'user_sessions/{}'.format(day)
            for day in strdaterange(first_day, last_day)
        ]
        for session_table in session_tables:
            # Append this day's raw counters, then collapse the scratch
            # table in place before the next day is appended.
            MR.runReduce(processUSS, srcTable=session_table,
                         dstTable=scratch, appendMode=True)
            MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)

        # Fold the randomised counter keys into their final key/subkey form
        # and aggregate once more, this time per (key, subkey).
        MR.runMap(trans, srcTable=scratch, dstTable=report_table)
        MR.runReduce(Summarizer(useSubkey=True),
                     srcTable=report_table, dstTable=report_table)

        # Compute the total via the scratch table and append it to the report.
        MR.runReduce(summ, srcTable=report_table, dstTable=scratch)
        MR.copyTable(srcTable=scratch, dstTable=report_table, appendMode=True)

        # Swap key/subkey, attach the sort key and sort the final table.
        MR.runMap(rotate, srcTable=report_table, dstTable=report_table)
        MR.runMap(presort, srcTable=report_table, dstTable=report_table)
        mrsort(report_table)

def rotate(rec):
    """Map step: emit the record with its key and subkey swapped.

    The value is passed through unchanged.
    """
    swapped_key = rec.subkey
    swapped_subkey = rec.key
    yield Record(swapped_key, swapped_subkey, rec.value)

def trans(rec):
    """Map step: collapse counter keys into their final (key, subkey) form.

    Keys starting with ``All_`` become ``('All', 'all')`` and keys starting
    with ``BNA_`` become ``('BNA', 'bna')``. Any other key is treated as a
    host name and emitted as ``('host', <key>)``. The value is kept as is.
    """
    counter_prefixes = (
        ('All_', 'All', 'all'),
        ('BNA_', 'BNA', 'bna'),
    )
    for prefix, out_key, out_subkey in counter_prefixes:
        if rec.key.startswith(prefix):
            yield Record(out_key, out_subkey, rec.value)
            return

    # Not one of the counter prefixes: the key itself is the host name.
    yield Record('host', rec.key, rec.value)


def presort(rec):
    """Map step: put every record under key ``'0'`` with a count-based subkey.

    The subkey is ``10000000000 - count`` rendered as a string, and the value
    becomes the count and the original key separated by a tab.
    NOTE(review): presumably the inverted subkey makes the following sort
    list larger counts first - confirm against ``mrsort``.
    """
    count = int(rec.value)
    inverted = str(10000000000 - count)
    payload = '{}\t{}'.format(count, rec.key)
    yield Record('0', inverted, payload)

def summ(key, recs):
    """Reduce step: emit the total of all ``host`` counters as one record.

    Args:
        key: reduce key of the group (not used directly; each record's own
            ``key`` is inspected instead).
        recs: iterable of records in the group; ``value`` must be an
            integer rendered as a string.

    Yields:
        A single ``Record('news_hosts', 'News_Hosts', <total>)`` when the
        group contains at least one record whose key is ``'host'``.
        Groups without such records produce no output, so they no longer
        add zero-count ``News_Hosts`` rows to the result.
    """
    total = 0
    saw_host = False
    for rec in recs:
        if rec.key == 'host':
            saw_host = True
            total += int(rec.value)

    # Only a group that actually holds host counters has a total to report.
    if saw_host:
        yield Record('news_hosts', 'News_Hosts', str(total))

def makeRE(hosts=None):
    """Build the regex pattern string that recognises URLs on tracked hosts.

    Args:
        hosts: optional iterable of host names. When omitted, ``Queries``
            imported from ``news_hosts`` is used.

    Returns:
        A pattern string matching ``http://`` or ``https://`` URLs with an
        optional ``www.`` prefix whose host is one of ``hosts``, followed
        by ``/``.
    """
    if hosts is None:
        hosts = Queries

    # Join the escaped names, not the raw ones: an unescaped "." matches any
    # character, so "lenta.ru" would also accept "lentaXru".
    escaped_hosts = [host.replace('.', r'\.') for host in hosts]
    return r'^https?://(www\.)?(' + '|'.join(escaped_hosts) + r')/.*'

def processUSS(key, recs):
    """Reduce step: turn one group of session records into show counters.

    The records are parsed with ``libra.ParseSession`` using
    ``blockstat.dict``. For each request that is a ``TYandexWebRequest``
    with ``ServiceDomRegion == 'ru'`` this yields:

    * one ``All_<n>`` record;
    * one ``BNA_<n>`` record if the request has a block whose path is
      ``/snippet/bno/link`` (requests without one are skipped from here on);
    * for such requests, one record keyed by ``getHost(url)`` for every
      main-block ``TWebResult`` whose URL matches the ``makeRE()`` pattern.

    ``<n>`` is ``random.randrange(1000)``; ``trans`` later folds these keys
    back together by prefix. Every yielded record has an empty subkey and
    the value ``'1'``.

    ``NameError``, ``AttributeError`` and ``TypeError`` are re-raised; any
    other ``Exception`` ends processing of the group silently, keeping the
    records yielded so far.
    """
    host_pattern = makeRE()

    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            # Only Russian-domain web search requests are counted.
            if not request.IsA('TYandexWebRequest') or request.ServiceDomRegion != 'ru':
                continue

            yield Record('All_' + str(random.randrange(1000)), '', '1')

            bno_links = [
                bs_block for bs_block in request.GetBSBlocks()
                if bs_block.Path == '/snippet/bno/link'
            ]
            if not bno_links:
                continue

            yield Record('BNA_' + str(random.randrange(1000)), '', '1')

            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if result.IsA('TWebResult') and re.search(host_pattern, result.Url, re.IGNORECASE):
                    yield Record(getHost(result.Url), '', '1')

    except (NameError, AttributeError, TypeError):
        # Let these three exception types propagate instead of swallowing them.
        raise
    except Exception:
        # Best effort: any other failure just stops this group.
        pass

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
        main()
