#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
import libra
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

def main():
    """Build the top and sample tables from three days of user sessions.

    Steps, in order:
      1. Reduce each of user_sessions/20140721..23 with getData, appending
         the output to a temporary table.
      2. Reduce that temporary table with count, writing to two outputs:
         the same temporary table and a second temporary table.
      3. Reduce each temporary table with Limiter(100) into its final
         destination table.

    mktmp and Limiter are not defined in this file; presumably they come
    from the star import of mymrutils -- confirm their exact semantics there.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict'],
    )
    top_table = 'likhomanov/sl_turk_top'
    sample_table = 'likhomanov/sl_turk_sample'

    with mktmp() as tmp_top:
        with mktmp() as tmp_sample:
            # Accumulate the extracted pairs of all three days in one table.
            for day in range(21, 24):
                session_table = 'user_sessions/201407{:02}'.format(day)
                MR.runReduce(
                    getData,
                    srcTable=session_table,
                    dstTable=tmp_top.name,
                    appendMode=True,
                )

            # count tags its records with tableIndex 0 / 1; presumably that
            # is the position in dstTables (0 -> tmp_top, 1 -> tmp_sample).
            MR.runReduce(
                count,
                srcTable=tmp_top.name,
                dstTables=[tmp_top.name, tmp_sample.name],
            )

            # Sample table is finalized first, then the top table.
            for tmp_table, final_table in ((tmp_sample, sample_table),
                                           (tmp_top, top_table)):
                MR.runReduce(Limiter(100), srcTable=tmp_table.name, dstTable=final_table)

def getData(key, recs):
    """Reducer: emit (url, query) pairs for 'tr' web results that have sitelinks.

    The records are parsed with libra.ParseSessionWithFat using
    'blockstat.dict'. Only TYandexWebRequest requests whose ServiceDomRegion
    is 'tr' are considered, and within them only main results that are
    TWebResult, have Position below 5 and a non-empty GetSiteLinks().

    For every such result a Record is yielded whose value is
    '<url>\\t<query>' and whose key is 'sl5\\t<md5 of value>'. Results at
    Position 0 additionally yield the same record under 'sl1\\t<md5>' first.
    The second Record argument is always the empty string (presumably the
    subkey -- confirm against mapreducelib).

    NameError, AttributeError and TypeError propagate; any other exception
    silently ends processing of this key, keeping whatever was already
    yielded.
    """
    try:
        for request in libra.ParseSessionWithFat(recs, 'blockstat.dict'):
            if not request.IsA('TYandexWebRequest') or request.ServiceDomRegion != 'tr':
                continue
            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if (not result.IsA('TWebResult')
                        or result.Position >= 5
                        or not len(result.GetSiteLinks())):
                    continue
                pair = '{}\t{}'.format(result.Url, request.Query)
                # md5 needs bytes; this relies on Python 2 str (see shebang).
                digest = md5(pair).hexdigest()
                # Position 0 is counted in both groups, 'sl1' first.
                if result.Position == 0:
                    key_templates = ('sl1\t{}', 'sl5\t{}')
                else:
                    key_templates = ('sl5\t{}',)
                for template in key_templates:
                    yield Record(template.format(digest), '', pair)
    except (NameError, AttributeError, TypeError):
        # Let these through instead of hiding them (they usually indicate
        # a bug in this code rather than bad input).
        raise
    except Exception:
        # Best effort: anything else just stops processing of this key.
        pass

def count(key, recs):
    """Reducer: collapse the records of one key into a count row and a sample row.

    The output key is the part of the first record's key before the first
    tab; the first record's value is used as the pair. Two Records are
    yielded:

      * tableIndex=0: subkey str(10000000 - n), value '<n>\\t<pair>',
        where n is the number of input records.
      * tableIndex=1: subkey str(random.random()), value '<pair>'.

    Raises ValueError if the first record's key contains no tab.
    """
    first = next(recs)
    # Two-way unpack on purpose: a key without a tab must fail loudly.
    group, _digest = first.key.split('\t', 1)
    pair = first.value

    # The first record is already consumed, so count it plus the remainder.
    total = 1 + sum(1 for _ in recs)

    # Presumably the inverted count makes ascending subkey order put the most
    # frequent pairs first.
    # NOTE(review): the subkey is an un-padded string, so that ordering would
    # only hold lexicographically while total < 9000001 -- confirm.
    yield Record(group, str(10000000 - total), '{}\t{}'.format(total, pair), tableIndex=0)
    # Random subkey: presumably shuffles pairs within a group for sampling.
    yield Record(group, str(random.random()), pair, tableIndex=1)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

