#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

def main():
    """Run the whole empty-snippet statistics pipeline.

    Steps, in order:
      1. ``getData`` is mapped over the daily ``reqans_log`` tables for
         2014-07-15 through 2014-07-21, appending into one temporary table.
      2. ``summarize`` reduces that temporary table into two destinations:
         the temporary table itself and the per-bucket statistics table.
      3. ``Limiter(1000)`` reduces the temporary table into the
         per-request table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    stats_table = 'likhomanov/empty_july'
    requests_table = 'likhomanov/empty_req_july'
    # range(15, 22) covers days 15..21 inclusive.
    daily_logs = ['reqans_log/201407{:02}'.format(day) for day in range(15, 22)]

    with mktmp() as tmp:
        scratch = tmp.name
        for log_table in daily_logs:
            MR.runMap(getData, srcTable=log_table, dstTable=scratch, appendMode=True)
        # summarize tags its records with tableIndex 0 or 1; presumably these
        # index into dstTables (0 -> scratch, 1 -> stats) -- confirm against
        # mapreducelib.
        MR.runReduce(summarize, srcTable=scratch, dstTables=[scratch, stats_table])
        # Limiter is not defined in this file (it arrives via the star
        # import); its exact semantics are not visible here.
        MR.runReduce(Limiter(1000), srcTable=scratch, dstTable=requests_table)

def getData(rec):
    """Map one reqans log record to empty-snippet bucket records.

    The record value is parsed into a request and its results. Requests
    whose SERP language is not one of ru/ua/by/kz (domain group ``kubr``)
    or ``com.tr`` (domain group ``tr``) are skipped.

    For the remaining requests the ratio
    (results counted as empty) / (results having a ``snippets_type`` key)
    is computed and rounded down to one of the bucket labels
    0.9 / 0.7 / 0.5 / 0.3 / 0.1. Requests with no ``snippets_type`` at all,
    or with a ratio below 0.1, produce nothing.

    Yields two records per accepted request:
      * key ``<dom>_<bucket>`` with empty subkey and value;
      * key ``<dom>_<bucket>\\t<md5 of request text>`` with the request
        text as value.
    """
    req, ress = parseReqans(rec.value)
    lang = serpLang(req)
    if lang == 'com.tr':
        dom = 'tr'
    elif lang in ('ru', 'ua', 'by', 'kz'):
        dom = 'kubr'
    else:
        return

    typed_count = 0
    empty_count = 0
    for res in ress:
        if 'snippets_type' in res:
            typed_count += 1
        # NOTE(review): a result without 'snippets_type' but with
        # snippets_length == '0' is still counted as empty, so the ratio
        # below can exceed 1 -- confirm this is intended.
        marked_empty = res.get('snippets_type') == 'empty'
        if marked_empty or res.get('snippets_length') == '0':
            empty_count += 1

    if not typed_count:
        return

    frac = float(empty_count) / typed_count
    # Thresholds are checked from highest to lowest; the first one reached
    # gives the bucket label.
    buckets = ((0.9, '0.9'), (0.7, '0.7'), (0.5, '0.5'), (0.3, '0.3'), (0.1, '0.1'))
    for threshold, label in buckets:
        if frac >= threshold:
            val = label
            break
    else:
        # Below the lowest threshold: nothing to report.
        return

    yield Record('{}_{}'.format(dom, val), '', '')
    yield Record('{}_{}\t{}'.format(dom, val, md5(req['req']).hexdigest()), '', req['req'])

def summarize(key, recs):
    """Reduce the records emitted by ``getData`` for a single key.

    Two key shapes are handled:

    * ``<bucket>\\t<hash>`` (contains a tab): emits one record to table
      index 0, keyed by the bucket alone, with subkey ``10000000 - n`` and
      value ``"<n>\\t<request>"``, where ``n`` is the number of records for
      the key and the request text is taken from the first record.
    * ``<bucket>`` (no tab): emits one record to table index 1 with the
      number of records for the key as its value.

    ``recs`` must be an iterator (``next`` is called on it directly).
    """
    if '\t' in key:
        bucket = key.split('\t', 1)[0]
        req = next(recs).value
        # The first record was already consumed above, hence the "1 +".
        n = 1 + sum(1 for _ in recs)
        # NOTE(review): the subkey presumably exists so that sorting by
        # subkey puts the most frequent requests first; as a plain string it
        # only compares correctly while 10000000 - n keeps 7 digits --
        # confirm.
        yield Record(bucket, str(10000000 - n), '{}\t{}'.format(n, req), tableIndex=0)
    else:
        n = sum(1 for _ in recs)
        yield Record(key, '', str(n), tableIndex=1)

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

