#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from hashlib import md5
import random
from collections import defaultdict

# Maximum number of distinct values getNeeded() emits for a single key.
NEEDED = 5000

def main():
    """Run the whole job: collect data from the logs, then build two tables.

    getData is run as a combine over every 'reqans_log/<date>' table for the
    dates returned by strdaterange((2015,2,25), (2015,2,27)), appending to two
    tables obtained from mktmp() (presumably temporary tables -- confirm in
    mymrutils). The first is reduced with summarize into
    'likhomanov/bno_2816', which is then passed to mrsort; the second is
    reduced with getNeeded into 'likhomanov/bno_2816_rnd'.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    countsTable = 'likhomanov/bno_2816'
    sampleTable = 'likhomanov/bno_2816_rnd'
    with mktmp() as countsTmp, mktmp() as sampleTmp:
        days = strdaterange((2015,2,25), (2015,2,27))
        logTables = ['reqans_log/{}'.format(day) for day in days]
        for logTable in logTables:
            # Table index 0 -> countsTmp, table index 1 -> sampleTmp.
            MR.runCombine(
                getData,
                srcTable=logTable,
                dstTables=[countsTmp, sampleTmp],
                appendMode=True,
            )
        MR.runReduce(summarize, srcTable=countsTmp, dstTable=countsTable)
        mrsort(countsTable)
        MR.runReduce(getNeeded, srcTable=sampleTmp, dstTable=sampleTable)

# getData flushes its in-memory counters once they hold more than this many
# distinct (lang, request/url) keys.
_MAX_BUFFERED_KEYS = 100000

def _emitCounts(counts):
    """Yield the two output records for every buffered counter.

    counts maps (lang, 'request<TAB>url') to the number of times that pair
    was seen. For each entry two records are produced:

    * tableIndex=0: first field 'lang<TAB>md5hex(request<TAB>url)', empty
      second field, third field 'count<TAB>request<TAB>url';
    * tableIndex=1: first field lang, second field a random float rendered
      as a string, third field 'request<TAB>url'.
    """
    for (lang, pair), count in counts.iteritems():
        yield Record(lang + '\t' + md5(pair).hexdigest(), '', str(count) + '\t' + pair, tableIndex=0)
        yield Record(lang, str(random.random()), pair, tableIndex=1)

def getData(recs):
    """Count (request, url) pairs whose result carries a 'bno' extralink.

    Each input record's value is parsed with parseReqans into a request and
    its results. A request is kept only if iswww(req) is true and serpLang(req)
    is 'ru' or 'com.tr' (the latter is relabelled 'tr'). Within a kept request
    a result is counted when it has a 'snippets_type' key and its 'extralinks'
    value is truthy and contains 'bno'.

    Counters are buffered in memory and written out through _emitCounts
    whenever more than _MAX_BUFFERED_KEYS distinct keys are held, and once
    more at the end of input, so the same key may be emitted several times
    with partial counts.
    """
    counts = defaultdict(int)
    for rec in recs:
        req, ress = parseReqans(rec.value)
        if not iswww(req):
            continue
        lang = serpLang(req)
        if lang == 'com.tr':
            lang = 'tr'
        elif lang != 'ru':
            continue
        for res in ress:
            if 'snippets_type' not in res:
                continue
            extra = res.get('extralinks')
            if not (extra and 'bno' in extra):
                continue
            counts[(lang, '{}\t{}'.format(req['req'], res['url']))] += 1
            if len(counts) > _MAX_BUFFERED_KEYS:
                # Bound memory use: write out what we have and start over.
                for out in _emitCounts(counts):
                    yield out
                counts = defaultdict(int)
    # Flush whatever is left after the last input record.
    for out in _emitCounts(counts):
        yield out

def getNeeded(key, recs):
    """Emit up to NEEDED distinct record values for one key.

    Values are collected in input order until NEEDED distinct ones have been
    seen (or the input runs out); duplicates are collapsed. One record per
    collected value is then yielded with `key` as its first field and an empty
    second field.
    """
    chosen = set()
    for value in (rec.value for rec in recs):
        chosen.add(value)
        if len(chosen) == NEEDED:
            # Enough distinct values; the rest of the input is ignored.
            break
    for value in chosen:
        yield Record(key, '', value)

def summarize(key, recs):
    """Collapse all records of one key into a single record with the total count.

    Every record value is 'count<TAB>payload'. The counts of all records are
    added up; the payload is taken from the first record. The single output
    record has the part of `key` before the first tab as its first field,
    str(1000000000000 - total) as its second field and 'total<TAB>payload' as
    its third.
    """
    language = key.split('\t', 1)[0]
    firstCount, payload = next(recs).value.split('\t', 1)
    total = int(firstCount) + sum(int(rec.value.split('\t', 1)[0]) for rec in recs)
    # NOTE(review): the second field presumably makes an ascending sort order
    # records by descending total -- confirm against mrsort.
    yield Record(language, str(1000000000000 - total), '{}\t{}'.format(total, payload))

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

