#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Set of whitespace-stripped lines from the local file 'form_hosts';
# getData() keeps only results whose host is a member of this set.
# The file is read inside a `with` block so the handle is closed
# deterministically instead of being left to the garbage collector.
with open('form_hosts') as _form_hosts_file:
    fhosts = set(line.strip() for line in _form_hosts_file)

def main():
    """Run the whole sampling pipeline and write 'likhomanov/form_sample'.

    Steps, in order:
      1. Configure the MR client (the 'form_hosts' file is listed in
         ``files``, presumably so remote jobs can load it -- confirm
         against mapreducelib).
      2. For each of the daily tables reqans_log/20140729..20140731, map
         ``getData`` into a temporary table (append mode) and sort it.
      3. Append 'likhomanov/form_urls' to the same temporary table via ``cat``.
      4. Reduce in place with ``getShows``, then ``randomize``, and finally
         reduce with ``Limiter(10000)`` into the destination table.

    ``mktmp``, ``mrsort``, ``cat`` and ``Limiter`` are not defined in this
    file; presumably they come from the ``mymrutils`` star import.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['form_hosts'],
    )
    sample_table = 'likhomanov/form_sample'
    daily_logs = ['reqans_log/201407{:02}'.format(day) for day in range(29, 32)]

    with mktmp() as scratch:
        for log_table in daily_logs:
            MR.runMap(getData, srcTable=log_table, dstTable=scratch.name, appendMode=True)
            # NOTE(review): the sort runs after every daily append but not
            # after the form_urls append below -- confirm this is intended.
            mrsort(scratch)

        MR.runMap(cat, srcTable='likhomanov/form_urls', dstTable=scratch.name, appendMode=True)

        # The first two reduces overwrite the temporary table in place.
        MR.runReduce(getShows, srcTable=scratch.name, dstTable=scratch.name)
        MR.runReduce(randomize, srcTable=scratch.name, dstTable=scratch.name)
        MR.runReduce(Limiter(10000), srcTable=scratch.name, dstTable=sample_table)

def getData(rec):
    """Map step: emit one record per search result hosted on a form host.

    ``rec.value`` is parsed with ``parseReqans`` into a request and its
    results. For every result whose host (per ``getHost``) is in ``fhosts``
    a record is yielded with:
      key    -- the result URL,
      subkey -- the literal '1',
      value  -- '<req>\\t<reg>', where <reg> falls back to 0 when the request
                has no 'reg' entry.
    """
    request, results = parseReqans(rec.value)
    # Return value is ignored; presumably checkwww validates or normalises
    # the request in place -- confirm against mymrutils.
    checkwww(request)
    for result in results:
        result_url = result.get('url')
        if getHost(result_url) in fhosts:
            payload = '{}\t{}'.format(request['req'], request.get('reg', 0))
            yield Record(result_url, '1', payload)

def getShows(url, recs):
    """Reduce step: emit the shows of a URL only if its group starts with a '0' subkey.

    The first record of the group is consumed as a marker. If its subkey is
    not '0' the whole group is dropped. Otherwise every remaining record is
    turned into '<value>\\t<url>' and yielded with the md5 hex digest of that
    string as key and an empty subkey, so identical strings share one key.

    NOTE(review): getData() tags log records with subkey '1'; the '0'
    records presumably come from 'likhomanov/form_urls' -- confirm.
    """
    marker = next(recs)
    if marker.subkey == '0':
        for show in recs:
            line = '{}\t{}'.format(show.value, url)
            digest = md5(line).hexdigest()
            yield Record(digest, '', line)

def randomize(key, recs):
    """Reduce step: keep one record per key and give it a random subkey.

    Only the first record of the group is read; its value is re-emitted
    under the constant key '0' with ``str(random.random())`` as subkey.
    The incoming ``key`` itself is not used.
    """
    payload = next(recs).value
    shuffle_key = str(random.random())
    yield Record('0', shuffle_key, payload)

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

