#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Every URL spelling that counts as a hit on a host listed in 'bno_hosts':
# both schemes, with and without the 'www.' prefix, with and without a
# trailing slash.  getData tests result URLs against this set by exact match.
bno = set()
with open('bno_hosts') as hosts_file:  # 'with' closes the handle deterministically
    for host in hosts_file:
        host = host.strip()
        if not host:
            # A blank line would otherwise add bare prefixes such as
            # 'http://www.' to the set.
            continue
        for sch in ('http', 'https'):
            for www in ('www.', ''):
                for sfx in ('/', ''):
                    bno.add(sch + '://' + www + host + sfx)

def main():
    """Configure the MapReduce client and run the three-stage pipeline.

    Stages, all executed as reduce operations:
      1. getData over every source table, appended into a temporary table.
      2. summarizeBig over the temporary table, written back to that table.
      3. Summarizer() from the temporary table into the destination table.

    NOTE(review): mktmp and Summarizer are not defined in this file;
    presumably they come from the `mymrutils` star import - confirm.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict', 'bno_hosts']
    )
    source_tables = ['user_sessions/20141024']
    result_table = 'likhomanov/bno_stat'
    with mktmp() as scratch:
        # Stage 1: raw per-result counter records from every source table.
        for table in source_tables:
            MR.runReduce(getData, srcTable=table, dstTable=scratch, appendMode=True)
        # Stage 2: collapse the records of each key into one count, in place.
        MR.runReduce(summarizeBig, srcTable=scratch, dstTable=scratch)
        # Stage 3: final aggregation into the destination table.
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=result_table)

def getData(key, recs):
    """Reduce function: emit counter records for web results in one session.

    `recs` is parsed with libra.ParseSessionWithFat using 'blockstat.dict'.
    Only requests that are 'TYandexWebRequest' are considered, and only when
    ServiceDomRegion is 'tr' or one of 'ru', 'ua', 'by', 'kz' (the latter
    four are grouped under the label 'kubr').  `key` is not used.

    Every yielded record is Record(<prefix> + '\\t' + <counter>, '', '1'),
    where <prefix> is four random hex characters, a tab, and the region
    label.  Counters, per main block whose main result is a 'TWebResult':
      ALL    - always
      NEEDED - the result URL is in the module-level `bno` set
      SL     - NEEDED and the result has at least one sitelink
      1      - NEEDED and the result's Position is 0
      SL1    - both SL and 1

    NameError, AttributeError and TypeError propagate; any other exception
    silently ends processing of this session (records already yielded are
    kept).
    """
    try:
        for request in libra.ParseSessionWithFat(recs, 'blockstat.dict'):
            if not request.IsA('TYandexWebRequest'):
                continue

            region = request.ServiceDomRegion
            if region in ('ru', 'ua', 'by', 'kz'):
                region = 'kubr'
            elif region != 'tr':
                continue

            # One random 4-hex-char salt per request; summarizeBig strips it
            # again.  Presumably it spreads the few distinct counter keys
            # over many reduce keys - TODO confirm.
            salt = md5(str(random.random())).hexdigest()[:4]
            prefix = salt + '\t' + region

            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                yield Record(prefix + '\tALL', '', '1')

                if result.Url not in bno:
                    continue
                yield Record(prefix + '\tNEEDED', '', '1')

                sitelink_count = len(result.GetSiteLinks())
                on_top = (result.Position == 0)
                if sitelink_count:
                    yield Record(prefix + '\tSL', '', '1')
                if on_top:
                    yield Record(prefix + '\t1', '', '1')
                    if sitelink_count:
                        yield Record(prefix + '\tSL1', '', '1')
    except (NameError, AttributeError, TypeError):
        # These are re-raised instead of being swallowed below.
        raise
    except Exception:
        # Anything else just stops processing of this session.
        pass

def summarizeBig(key, recs):
    """Reduce function: replace all records of one key by a single count.

    The part of `key` up to and including the first tab is dropped, and one
    Record(<rest of key>, '', <number of records as str>) is yielded.

    Raises ValueError if `key` contains no tab.
    """
    # Split before touching `recs`, so a malformed key fails immediately.
    _prefix, stripped_key = key.split('\t', 1)
    count = sum(1 for _rec in recs)
    yield Record(stripped_key, '', str(count))

# Launch the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

