#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random
from collections import defaultdict

# Set of addresses of interest: the first tab-separated column of every
# non-blank line of the local file 'top_addrs'.
# The line terminator is stripped before splitting so that a line without
# any tab still yields a clean key (otherwise the key would end in '\n' and
# could never match a result URL). The file is closed deterministically.
# NOTE(review): 'top_addrs' is also listed in files=[...] in main(),
# presumably so the same file is available wherever this module is
# imported by the MapReduce runtime -- confirm.
with open('top_addrs') as addrFile:
    addr = set(
        line.rstrip('\r\n').split('\t')[0]
        for line in addrFile
        if line.strip()
    )

# Maximum number of distinct queries emitted per key by getSample().
needed = 50

def main():
    """Configure the MapReduce client and run the sampling pipeline.

    For every source table name produced from strdaterange(), getData is
    run as a combine step appending into one temporary table; getSample is
    then run as a reduce step from that temporary table into the
    destination table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['top_addrs'],
    )
    resultTable = 'likhomanov/addr_reqs'
    firstDay = (2014, 10, 27)
    lastDay = (2014, 10, 30)
    # NOTE(review): mktmp() presumably provides a temporary table name that
    # is cleaned up on exit, and strdaterange() presumably yields one date
    # string per day -- confirm both (and whether lastDay is inclusive) in
    # mymrutils.
    with mktmp() as tmpTable:
        dailyLogs = [
            'reqans_log/{}'.format(day)
            for day in strdaterange(firstDay, lastDay)
        ]
        for logTable in dailyLogs:
            MR.runCombine(
                getData,
                srcTable=logTable,
                dstTable=tmpTable,
                appendMode=True,
            )
        MR.runReduce(getSample, srcTable=tmpTable, dstTable=resultTable)

def getData(recs):
    """Yield one Record per result whose URL is in the ``addr`` set.

    Each input record's value is parsed with parseReqans() into a request
    and its results. A request is skipped unless its 'stype' is 'www', its
    'is_yandex' flag is not '1', and serpLang() reports one of
    'ru', 'ua', 'by', 'kz'.

    For every result that has a 'snippets_type' field and whose 'url' is in
    ``addr``, a Record is emitted with the URL as the first field, a random
    number string as the second, and "<reg>\\t<req>" as the third.
    """
    allowedLangs = ('ru', 'ua', 'by', 'kz')
    for rec in recs:
        req, ress = parseReqans(rec.value)
        if req.get('stype') != 'www' or req.get('is_yandex', '0') == '1':
            continue
        if serpLang(req) not in allowedLangs:
            continue

        region = req['reg']
        text = req['req']
        query = region + '\t' + text

        for res in ress:
            if 'snippets_type' in res:
                url = res['url']
                if url in addr:
                    # NOTE(review): the random second field is presumably a
                    # subkey that shuffles the order in which queries reach
                    # getSample() -- confirm against mapreducelib.
                    yield Record(url, str(random.random()), query)

def getSample(key, recs):
    """Yield up to ``needed`` records with distinct values for ``key``.

    Records are taken in the order they arrive; a value that has already
    been emitted is skipped. Each output is Record(key, '', value). The
    generator stops as soon as ``needed`` distinct values have been
    emitted, leaving the rest of ``recs`` unread.
    """
    seen = set()
    for rec in recs:
        query = rec.value
        if query in seen:
            continue
        seen.add(query)
        yield Record(key, '', query)
        if len(seen) == needed:
            break

if __name__ == '__main__':
    main()

