#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import random
import libra

# Cap on the number of distinct record values getNeeded() keeps for one key.
NEEDED = 1000

def main():
    """Configure the MapReduce client and run the two-stage job.

    Stage one runs the getData reducer over every 'user_sessions/<date>'
    table for the dates produced by strdaterange((2015,1,19), (2015,1,26)),
    appending its output to a temporary table obtained from mktmp().
    Stage two runs the getNeeded reducer over that temporary table and
    writes the result to 'likhomanov/forum_2747_us'.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['blockstat.dict']
    )
    result_table = 'likhomanov/forum_2747_us'
    with mktmp() as scratch:
        days = strdaterange((2015,1,19), (2015,1,26))
        session_tables = ['user_sessions/{}'.format(day) for day in days]
        for session_table in session_tables:
            MR.runReduce(getData, srcTable=session_table, dstTable=scratch, appendMode=True)
            # The scratch table is re-sorted after every append.
            # NOTE(review): a single sort before the final reduce may be
            # enough -- confirm against mrsort/runReduce semantics.
            mrsort(scratch)
        MR.runReduce(getNeeded, srcTable=scratch, dstTable=result_table)

def getData(key, recs):
    """Reducer: emit one record per forum-snippet web result in a session.

    The records are parsed with libra.ParseSession using 'blockstat.dict'.
    Only requests that are 'TYandexWebRequest' with ServiceDomRegion 'ru'
    or 'tr' are considered. For each of their main blocks whose main result
    is a 'TWebResult' with 'forum' in its SnippetType, a Record is yielded
    with these three positional fields:

        1. '<region>\\t<snippet type>'
        2. str(random.random())
        3. '<query>\\t<url>'

    The ``key`` argument is not used.

    NameError, AttributeError and TypeError are re-raised; any other
    Exception ends the generator silently, keeping whatever was already
    yielded.
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if not request.IsA('TYandexWebRequest'):
                continue
            region = request.ServiceDomRegion
            if region not in ('ru', 'tr'):
                continue
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if result.IsA('TWebResult'):
                    snippet_type = result.SnippetType
                    if 'forum' in snippet_type:
                        group = region + '\t' + snippet_type
                        # Random second field; presumably it makes the later
                        # sort order (and so getNeeded's pick) random -- confirm.
                        shuffle_token = str(random.random())
                        payload = request.Query + '\t' + result.Url
                        yield Record(group, shuffle_token, payload)
    except (NameError, AttributeError, TypeError):
        # These usually mean a bug in this code, so let them propagate.
        raise
    except Exception:
        # Best effort: any other failure drops the rest of this session.
        pass

def getNeeded(key, recs):
    """Reducer: keep at most NEEDED distinct values for ``key``.

    Reads ``recs`` in order, collecting each record's ``value`` into a set,
    and stops reading as soon as the set holds NEEDED distinct values.
    Then yields one Record(key, '', value) per collected value, in the
    set's iteration order.
    """
    distinct_values = set()
    for record in recs:
        distinct_values.add(record.value)
        # Stop consuming input once the quota of distinct values is reached.
        if len(distinct_values) == NEEDED:
            break
    for value in distinct_values:
        yield Record(key, '', value)

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

