#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
import libra
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

def main():
    """Run the sitelink statistics pipeline over three daily session tables.

    Steps, in order:
      1. Reduce each 'user_sessions/201407DD' table (DD = 21, 22, 23) with
         getData, appending every result to a single temporary table.
      2. Reduce the temporary table in place with Summarizer().
      3. Map the temporary table with separate, writing to two destinations:
         the statistics table (listed first) and the temporary table itself
         (listed second).
      4. Reduce the temporary table with Limiter(1000) into the URL table.

    NOTE(review): mktmp, Summarizer and Limiter are not defined in this file
    (presumably they come from the star import of mymrutils). Summarizer
    presumably sums the per-key counts and Limiter(1000) presumably keeps at
    most 1000 records per key - confirm against their definitions. The
    position of a table in dstTables presumably corresponds to the tableIndex
    values 0 and 1 used by separate - confirm against mapreducelib.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict'],
    )

    urls_table = 'likhomanov/sl_turk_urls'
    stats_table = 'likhomanov/sl_turk_stats'
    session_tables = ['user_sessions/201407{:02}'.format(day) for day in range(21, 24)]

    with mktmp() as scratch:
        # Stage 1: collect raw per-request counters from every day into one table.
        for session_table in session_tables:
            MR.runReduce(getData, srcTable=session_table, dstTable=scratch.name, appendMode=True)

        # Stage 2: aggregate the counters; input and output are the same table.
        MR.runReduce(Summarizer(), srcTable=scratch.name, dstTable=scratch.name)

        # Stage 3: split plain counters from per-URL counters.
        MR.runMap(separate, srcTable=scratch.name, dstTables=[stats_table, scratch.name])

        # Stage 4: trim the per-URL records and store the final result.
        MR.runReduce(Limiter(1000), srcTable=scratch.name, dstTable=urls_table)

def getData(key, recs):
    """Emit sitelink counters for one session's records.

    The records are parsed with libra.ParseSessionWithFat using
    'blockstat.dict'. Only requests that are 'TYandexWebRequest' with
    ServiceDomRegion equal to 'tr' are considered. Within such a request,
    every main block whose main result is a 'TWebResult' at Position < 5 with
    a non-empty sitelink list produces:

      * 'sl1' and 'sl1\\t<url>' (value '1' each) when the Position is 0;
      * 'sl5\\t<url>' (value '1') in every case.

    After the blocks of a request are processed, if at least one result
    qualified, 'sl5_serp' (value '1') and 'sl5_snip' (value = number of
    qualifying results in that request) are emitted as well.

    Args:
        key: reduce key; not used by this function.
        recs: records of the session, passed straight to libra.

    Yields:
        Record objects with an empty subkey and a string count as the value.

    Raises:
        NameError, AttributeError, TypeError: re-raised unchanged. Any other
        Exception silently ends the generator; records yielded before the
        failure have already been emitted.
    """
    try:
        for request in libra.ParseSessionWithFat(recs, 'blockstat.dict'):
            if not request.IsA('TYandexWebRequest') or request.ServiceDomRegion != 'tr':
                continue

            top5_with_links = 0
            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if not result.IsA('TWebResult') or result.Position >= 5:
                    continue
                if len(result.GetSiteLinks()) == 0:
                    continue

                # The first position is counted separately from the top five.
                if result.Position == 0:
                    yield Record('sl1', '', '1')
                    yield Record('sl1\t{}'.format(result.Url), '', '1')

                top5_with_links += 1
                yield Record('sl5\t{}'.format(result.Url), '', '1')

            if top5_with_links > 0:
                yield Record('sl5_serp', '', '1')
                yield Record('sl5_snip', '', str(top5_with_links))
    except (NameError, AttributeError, TypeError):
        # These are re-raised rather than swallowed by the handler below.
        raise
    except Exception:
        # Best effort: any other failure just stops processing this session.
        pass

def separate(rec):
    """Route a record to one of two output tables based on its key.

    A key without a tab is passed through unchanged (empty subkey, same
    value) to table index 0. A key of the form '<label>\\t<url>' is split at
    the first tab and sent to table index 1 with key '<label>', subkey
    str(100000000 - count) and value '<count>\\t<url>', where count is
    int(rec.value).

    NOTE(review): the subkey presumably exists so that larger counts sort
    first. It is not zero-padded, so if subkeys are compared as strings the
    order would break for counts above 90000000 - confirm this cannot occur.

    Args:
        rec: input record; its key and value attributes are read.

    Yields:
        Exactly one Record per input record.

    Raises:
        ValueError: if the key contains a tab and the value is not an integer.
    """
    label, tab, url = rec.key.partition('\t')
    if not tab:
        yield Record(rec.key, '', rec.value, tableIndex=0)
        return

    count = int(rec.value)
    inverted_count = str(100000000 - count)
    payload = '{}\t{}'.format(count, url)
    yield Record(label, inverted_count, payload, tableIndex=1)

# Start the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

