#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

def main():
    """Build the 'likhomanov/youtube' table from a week of reqans logs.

    Runs the ``getData`` map over the daily ``reqans_log/201407DD`` tables
    for days 15 through 21, appending every map's output to one temporary
    table, then sorts that table and runs a ``Limiter(100000)`` reduce over
    it into the destination table.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    DST = 'likhomanov/youtube'
    # range(15, 22) is end-exclusive: days 15..21 of July 2014.
    sources = ['reqans_log/201407{:02}'.format(day) for day in range(15, 22)]
    with mktmp() as tmp:
        for src in sources:
            # appendMode=True accumulates all days into the same temp table.
            MR.runMap(getData, srcTable=src, dstTable=tmp.name, appendMode=True)
        # Sort once, after every day has been appended: nothing reads the
        # table between maps, so re-sorting it after each append was
        # redundant work. NOTE(review): assumes mrsort sorts the table in
        # place and has no other side effects -- confirm in mymrutils.
        mrsort(tmp)
        # NOTE(review): Limiter(100000) presumably caps the number of records
        # kept per key -- confirm against mymrutils.
        MR.runReduce(Limiter(100000), srcTable=tmp.name, dstTable=DST)

def getData(rec):
    """Map function: emit a ~1% random sample of youtube.com result URLs.

    Parses ``rec.value`` with ``parseReqans`` into a request and its list of
    results. Requests whose ``serpLang`` value is 'ru', 'ua', 'by' or 'kz'
    are grouped under the key 'kubr', 'com.tr' under 'tr'; all other
    requests produce no output.

    For each result whose URL host is 'youtube.com', a record is yielded
    with probability 0.01 as ``Record(dom, str(random.random()), url)``.
    NOTE(review): the second field is presumably the subkey, so the random
    string would shuffle records within a key after sorting -- confirm
    against mapreducelib.
    """
    req, ress = parseReqans(rec.value)
    lang = serpLang(req)
    if lang in ('ru', 'ua', 'by', 'kz'):
        dom = 'kubr'
    elif lang == 'com.tr':
        dom = 'tr'
    else:
        # Generator with no yields: the request is simply dropped.
        return
    for res in ress:
        url = res.get('url')
        if not url:
            # Result without a URL: it cannot be a youtube.com hit, and
            # passing None on to getHost may not be safe.
            continue
        if getHost(url) != 'youtube.com':
            continue
        # Keep roughly one in a hundred matching results.
        if random.random() >= 0.01:
            continue
        yield Record(dom, str(random.random()), url)

# Run the job only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

