#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from random import random

# Number of distinct sampled values at which getSample stops collecting
# and writes out its sample.
NEEDED = 10000

def main():
    """Run the sampling pipeline.

    Runs getData as a combine step over each 'reqans_log/<date>' table
    produced by strdaterange((2015, 1, 7), (2015, 1, 14)), appending every
    result to one temporary table, then runs getSample as a reduce step
    over that table into 'likhomanov/nav_sample'.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    sample_table = 'likhomanov/nav_sample'
    # NOTE(review): whether the end date is inclusive depends on
    # strdaterange, which is defined outside this file -- confirm.
    days = list(strdaterange((2015, 1, 7), (2015, 1, 14)))
    # mktmp presumably provides a scratch table that is cleaned up when the
    # `with` block exits -- verify against mymrutils.
    with mktmp() as scratch:
        for day in days:
            daily_log = 'reqans_log/{}'.format(day)
            MR.runCombine(
                getData,
                srcTable=daily_log,
                dstTable=scratch,
                appendMode=True,
            )
        MR.runReduce(getSample, srcTable=scratch, dstTable=sample_table)

def getData(recs):
    """Yield sampled (request, first snippet URL) pairs from reqans records.

    A record is used only when its parsed request has stype 'www', does not
    have is_yandex equal to '1', has serpLang(req) equal to 'ru', and carries
    reqrelev with is_nav equal to '1'. Of those, roughly 1% are kept via a
    random draw.

    For each kept request, the first result that contains 'snippets_type'
    and has a non-empty 'url' produces one
    Record('0', str(random()), '<req>\\t<url>'); requests with no such
    result produce nothing.
    """
    for entry in recs:
        req, results = parseReqans(entry.value)

        if req.get('stype') != 'www':
            continue
        if req.get('is_yandex', '0') == '1':
            continue
        if serpLang(req) != 'ru':
            continue
        if 'reqrelev' not in req:
            continue
        if req['reqrelev'].get('is_nav') != '1':
            continue
        # The random draw comes last so it happens only for navigational
        # requests that passed every other filter.
        if random() >= 0.01:
            continue

        for doc in results:
            link = doc.get('url') if 'snippets_type' in doc else None
            if not link:
                continue
            # The second Record field is a fresh random string; presumably
            # it randomises record order for the reducer -- TODO confirm.
            yield Record('0', str(random()), req['req'] + '\t' + link)
            break


def getSample(key, recs):
    """Emit up to NEEDED distinct record values seen for one key.

    Values are read from `recs` in the order given and de-duplicated.
    Reading stops as soon as NEEDED distinct values have been collected.
    If the input runs out first, the values collected so far are still
    emitted, so a small input yields a smaller sample rather than an
    empty output.

    Args:
        key: key of the group being processed (not used).
        recs: iterable of records exposing a `value` attribute.

    Yields:
        Record('0', '', value) for each distinct collected value, in
        arbitrary (set iteration) order.
    """
    reqs = set()
    for rec in recs:
        reqs.add(rec.value)
        if len(reqs) >= NEEDED:
            # Enough distinct values; the remaining input is left unread.
            break
    for req in reqs:
        yield Record('0', '', req)

# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

