#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import random

# Cap on the number of distinct values getNeeded collects for a single key.
NEEDED = 10000

def main():
    """Run the whole job: combine every daily log table, then reduce.

    Each source table is passed through getData and appended to one
    temporary table; getNeeded is then run over that table and its output
    is written to the destination table.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    DST = 'likhomanov/addr_2796'
    with mktmp() as tmp:
        # Build the full list of source table names up front.
        days = strdaterange((2015,2,3), (2015,2,7))
        source_tables = ['reqans_log/{}'.format(day) for day in days]
        for table in source_tables:
            MR.runCombine(getData, srcTable=table, dstTable=tmp, appendMode=True)
            # NOTE(review): the temporary table is re-sorted after every
            # append; presumably only the final sort matters for the
            # reduce below -- confirm before hoisting it out of the loop.
            mrsort(tmp)
        MR.runReduce(getNeeded, srcTable=tmp, dstTable=DST)

def getData(recs):
    """Yield one Record per search result whose snippet type contains 'adres'.

    Every input record's value is parsed with parseReqans into a request
    and its results. Requests that fail iswww, or whose serpLang is not
    one of ru/ua/by/kz, are skipped. For each remaining result with a
    non-empty 'snippets_type' containing 'adres', a Record is produced
    from the snippet type, a random number rendered as a string, and the
    request text joined to the result URL by a tab.
    """
    accepted_langs = ('ru', 'ua', 'by', 'kz')
    for rec in recs:
        req, ress = parseReqans(rec.value)
        # Short-circuit keeps the original order: iswww first, then language.
        if not iswww(req) or serpLang(req) not in accepted_langs:
            continue
        for res in ress:
            stype = res.get('snippets_type')
            if stype and 'adres' in stype:
                # Random second field; presumably it shuffles records
                # within a key once the table is sorted -- confirm.
                shuffle_token = str(random.random())
                payload = req['req'] + '\t' + res['url']
                yield Record(stype, shuffle_token, payload)

def getNeeded(key, recs):
    """Yield up to NEEDED distinct values seen for ``key``.

    Reads ``value`` from each incoming record, stops reading as soon as
    the number of distinct values collected equals NEEDED, and then emits
    one Record per distinct value with an empty second field.
    """
    distinct_values = set()
    for record in recs:
        distinct_values.add(record.value)
        # Stop early once the cap is reached; remaining records are ignored.
        if len(distinct_values) == NEEDED:
            break
    for value in distinct_values:
        yield Record(key, '', value)

# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    main()

