#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from reqansparse import parseRARecord
from itertools import islice
import re
from hashlib import md5
import random

# Matches the beginning of a Yandex search page URL: http or https, an
# optional "www." prefix, then "yandex.<suffix>/yandsearch". Group 2 captures
# <suffix>, i.e. everything between "yandex." and the next "/".
serpRE = re.compile(r'https?://(www\.)?yandex\.([^/]*)/yandsearch')
# Maximum number of records getNeeded passes through per reduce call.
needed = 100000
# getsample keeps a search result only when random.random() <= prob.
prob = 0.0001

def main():
    """Run the whole sampling pipeline against the MapReduce server.

    For each of the tables reqans_log/20140601 .. reqans_log/20140614 the
    getsample map is run into a temporary table, which is sorted and reduced
    with uniq; the reduce output is appended to a second temporary table.
    That second table is then sorted, reduced with rnduniq, sorted again and
    finally reduced with getNeeded into 'likhomanov/sitasample'.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)

    def new_temp_table():
        return TemporaryTable(project='likhomanov')

    def sort_in_place(table):
        # Source and destination are the same table.
        return MR.sortTable(srcTable=table.name, dstTable=table.name)

    with new_temp_table() as tmp, new_temp_table() as tmpsample:
        daily_logs = ['reqans_log/201406{:02}'.format(day) for day in range(1, 15)]
        for log_table in daily_logs:
            # Per-day pass: sample, sort, reduce with uniq, accumulate.
            MR.runMap(getsample, srcTable=log_table, dstTable=tmp.name)
            sort_in_place(tmp)
            MR.runReduce(uniq, srcTable=tmp.name, dstTable=tmpsample.name, appendMode=True)

        # Final pass over everything accumulated from the daily tables.
        sort_in_place(tmpsample)
        MR.runReduce(rnduniq, srcTable=tmpsample.name, dstTable=tmp.name)
        sort_in_place(tmp)
        MR.runReduce(getNeeded, srcTable=tmp.name, dstTable='likhomanov/sitasample')

def serpLang(url):
    """Return the domain suffix of a Yandex search page URL, or None.

    The suffix is group 2 of serpRE: the text between "yandex." and
    "/yandsearch" (for example 'ru'). None is returned when url is empty
    or missing, or when serpRE does not match at the start of url.
    """
    if url:
        found = serpRE.match(url)
        if found:
            return found.group(2)
    return None

def getsample(rec):
    """Map function: emit a random sample of result URLs from one log record.

    rec.value is parsed with parseRARecord. The record is ignored unless its
    request has stype 'www', does not have is_yandex equal to '1', and has a
    serp_url for which serpLang returns 'ru'. Each entry of the parsed
    'results' is then kept only when random.random() <= prob; kept entries
    that have a non-empty 'url' yield Record(md5 hex digest of url, '', url).
    """
    ra = parseRARecord(rec.value)
    request = ra['request']
    if request.get('stype') != 'www' or request.get('is_yandex', '0') == '1':
        return
    if serpLang(request.get('serp_url')) != 'ru':
        return
    for result in ra['results']:
        # The random draw happens for every result, before the url check.
        if random.random() <= prob:
            url = result.get('url')
            if url:
                yield Record(md5(url).hexdigest(), '', url)

def uniq(key, recs):
    """Reduce function: emit only the first record of the group.

    key is unused. recs is the iterable of records passed to this reduce
    call; its first element is yielded and the rest are left unread. An
    empty recs yields nothing.
    """
    # A loop is used instead of a bare next(recs): a StopIteration escaping
    # from next() inside a generator ends it silently on Python 2 but is
    # converted to RuntimeError by PEP 479 (Python 3.7+). The loop gives the
    # same "emit nothing" result for an empty group on every version.
    for rec in recs:
        yield rec
        return

def rnduniq(key, recs):
    """Reduce function: re-emit the first record's value with a random tag.

    key is unused. For the first element of recs a
    Record('0', str(random.random()), <that element's value>) is yielded;
    the remaining elements are left unread. An empty recs yields nothing.
    Presumably the constant '0' and the random second field let a later
    sort put all values into one randomly ordered group - confirm against
    the Record/sortTable semantics of mapreducelib.
    """
    # A loop is used instead of a bare next(recs): a StopIteration escaping
    # from next() inside a generator ends it silently on Python 2 but is
    # converted to RuntimeError by PEP 479 (Python 3.7+).
    for first in recs:
        yield Record('0', str(random.random()), first.value)
        return

def getNeeded(key, recs):
    """Reduce function: pass through at most `needed` records of the group.

    key is unused. Records are yielded unchanged, in the order recs
    produces them, and reading stops after `needed` of them.
    """
    head = islice(recs, needed)
    for kept in head:
        yield kept

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

