#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from reqansparse import parseRARecord
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

def main():
    """Run the sampling pipeline and write the result to the destination table.

    Steps, all executed through the MapReduce client:
      1. map ``getList`` over each source log table into a temporary table;
      2. reduce the temporary table in place with ``randomize``;
      3. sort the temporary table with ``mrsort``;
      4. reduce with ``Limiter(200)`` into the destination table
         (presumably caps the number of records per key -- confirm in
         ``mymrutils``).
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )

    days = (16,)
    sources = ['reqans_log/201407{:02}'.format(day) for day in days]
    destination = 'likhomanov/list_sample'

    with mktmp() as scratch:
        # NOTE(review): every source is mapped into the same scratch table;
        # whether runMap appends or overwrites is not visible here -- confirm
        # before adding more days.
        for source in sources:
            MR.runMap(getList, srcTable=source, dstTable=scratch.name)
        MR.runReduce(randomize, srcTable=scratch.name, dstTable=scratch.name)
        mrsort(scratch)
        MR.runReduce(Limiter(200), srcTable=scratch.name, dstTable=destination)

def getList(rec):
    """Map step: emit one record per result whose snippet type is 'list_snip'.

    The input record's value is parsed with ``parseRARecord``.  The record is
    skipped entirely unless the request has ``stype == 'www'``, is not flagged
    ``is_yandex == '1'``, and ``serpLang`` of its ``serp_url`` is 'ru' or
    'com.tr'.

    Each emitted record has:
      * key   -- '<lang>\\t<md5 hex digest of the value>'
      * second field -- empty string
      * value -- '<request text>\\t<result url>'
    """
    ra = parseRARecord(rec.value)
    request = ra['request']

    if request.get('stype') != 'www' or request.get('is_yandex', '0') == '1':
        return

    lang = serpLang(request.get('serp_url'))
    if lang not in ('ru', 'com.tr'):
        return

    for result in ra['results']:
        if result.get('snippets_type') != 'list_snip':
            continue
        payload = '{}\t{}'.format(request['req'], result['url'])
        # The digest in the key makes identical (request, url) pairs share a key.
        digest = md5(payload).hexdigest()
        yield Record('{}\t{}'.format(lang, digest), '', payload)

def randomize(key, recs):
    """Reduce step: keep the first record of the group and give it a random tag.

    Only the first record from ``recs`` is read; the rest of the group is
    ignored.  The emitted record's key is the part of the original key before
    the first tab, its second field is ``str(random.random())``, and its value
    is carried over unchanged.
    """
    first = next(recs)
    lang = first.key.split('\t', 1)[0]
    shuffle_token = str(random.random())
    yield Record(lang, shuffle_token, first.value)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

