#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5


def main():
    """Run the whole pipeline: collect snippet samples and store two selections.

    For every ``user_sessions/<date>`` table named by ``strdaterange`` the
    ``getData`` reducer appends its output to two temporary tables. The
    first one is aggregated with ``summarize`` and written to the "top"
    destination; the second one is re-keyed with ``prepare`` and written to
    the "rnd" destination.

    NOTE(review): ``mktmp``, ``strdaterange`` and ``Limiter`` come from
    ``mymrutils`` and are not visible here; ``Limiter(n)`` presumably keeps
    at most ``n`` records per key -- confirm against that module.
    """
    MR.useDefaults(
        username='tmp',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True,
    )
    top_table = 'likhomanov/spec_3102_top'
    random_table = 'likhomanov/spec_3102_rnd'

    with mktmp() as counted:
        with mktmp() as sampled:
            for day in strdaterange((2015, 7, 1), (2015, 7, 4)):
                sessions = 'user_sessions/{}'.format(day)
                MR.runReduce(
                    getData,
                    srcTable=sessions,
                    dstTables=[counted, sampled],
                    appendMode=True,
                )
                # Shrink the sample table after every day, before the next
                # day's records are appended to it.
                MR.runReduce(Limiter(1), srcTable=sampled, dstTable=sampled)

            # Aggregate / re-key the temporary tables in place.
            MR.runReduce(summarize, srcTable=counted, dstTable=counted)
            MR.runCombine(prepare, srcTable=sampled, dstTable=sampled)

            # Final limiting into the persistent destination tables.
            MR.runReduce(Limiter(300), srcTable=sampled, dstTable=random_table)
            MR.runReduce(Limiter(200), srcTable=counted, dstTable=top_table)

def getData(key, recs):
    """Reducer: emit records for the web results found in one session.

    The session records are parsed with ``libra.ParseSession`` using the
    ``blockstat.dict`` dictionary. Only ``TYandexWebRequest`` requests are
    considered, and only those whose ``ServiceDomRegion`` is ``ru``, ``ua``,
    ``by`` or ``kz`` (labelled ``kubr``) or ``tr`` (labelled ``tr``).

    For each main block whose main result is a ``TWebResult`` the reducer
    builds a key of three tab-separated fields (language label, snippet
    type, md5 hex digest of the query) and a value of the form
    ``url<TAB>query``, and yields:

    * to output table 0 -- one record per distinct snippet type within the
      request (only the first result of each type);
    * to output table 1 -- one record for every such result.

    The second ``Record`` argument is a random number rendered as a string.

    :param key: incoming reduce key; not used by this reducer.
    :param recs: records of the session, passed to ``libra.ParseSession``.

    ``NameError``, ``AttributeError``, ``TypeError`` and ``ValueError`` are
    re-raised; any other ``Exception`` silently stops processing of the
    current session.
    """
    kubr_regions = ('ru', 'ua', 'by', 'kz')
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            if not req.IsA('TYandexWebRequest'):
                continue

            region = req.ServiceDomRegion
            if region == 'tr':
                lang = 'tr'
            elif region in kubr_regions:
                lang = 'kubr'
            else:
                # Requests from other service domains are ignored.
                continue

            # Snippet types already sent to table 0 for this request.
            counted_types = set()
            for block in req.GetMainBlocks():
                res = block.GetMainResult()
                if not res.IsA('TWebResult'):
                    continue

                stype = res.SnippetType
                out_key = '{}\t{}\t{}'.format(lang, stype, md5(req.Query).hexdigest())
                payload = res.Url + '\t' + req.Query

                if stype not in counted_types:
                    counted_types.add(stype)
                    yield Record(out_key, str(random.random()), payload, tableIndex=0)
                yield Record(out_key, str(random.random()), payload, tableIndex=1)
    except (NameError, AttributeError, TypeError, ValueError):
        # These usually indicate a programming mistake, so let them surface.
        raise
    except Exception:
        # Any other failure: give up on this session without failing the job.
        pass

def summarize(key, recs):
    """Reducer: collapse all records of one key into a single count record.

    :param key: three tab-separated fields; the first two (language label
        and snippet type) form the output key, the third is discarded.
    :param recs: iterator over the records of this key; each ``value`` is
        ``url<TAB>query``. Only the first record's url and query are kept.

    Yields one ``Record`` whose key is ``lang<TAB>stype``, whose second
    argument is ``str(100000000 - n)`` and whose value is
    ``n<TAB>query<TAB>url``, where ``n`` is the number of records in the
    group. Yields nothing when the group is empty.

    :raises ValueError: if ``key`` does not have exactly three fields or
        the first value contains no tab.
    """
    lang, stype, _ = key.split('\t')

    # Ask for a default rather than letting StopIteration escape from the
    # generator body: on interpreters implementing PEP 479 an escaping
    # StopIteration is turned into RuntimeError, whereas here an empty
    # group simply produces no output.
    first = next(recs, None)
    if first is None:
        return

    url, query = first.value.split('\t', 1)
    # The first record is already consumed, so count it plus the remainder.
    n = 1 + sum(1 for _ in recs)

    # Larger counts give smaller "100000000 - n" strings in the second field.
    yield Record(lang + '\t' + stype, str(100000000 - n), '\t'.join([str(n), query, url]))

def prepare(recs):
    """Combiner: re-key each record by language and snippet type only.

    Every incoming record must have a key of exactly three tab-separated
    fields (the third is dropped) and a value of the form ``url<TAB>query``.
    The emitted record has key ``lang<TAB>stype``, a fresh random number
    (as a string) in the second field, and value ``query<TAB>url`` -- note
    that query and url are swapped relative to the input.

    :param recs: iterable of records exposing ``key`` and ``value``.
    :raises ValueError: if a key does not have exactly three fields or a
        value contains no tab.
    """
    for item in recs:
        lang, stype, _ = item.key.split('\t')
        url, query = item.value.split('\t', 1)
        short_key = '\t'.join((lang, stype))
        shuffle_field = str(random.random())
        payload = '\t'.join((query, url))
        yield Record(short_key, shuffle_field, payload)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

