#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *


def main():
    """Build the sorted 'likhomanov/empty_top' table from daily reqans logs.

    For every date string produced by
    ``strdaterange((2015, 3, 30), (2015, 4, 6))`` the table
    ``reqans_log/<date>`` is run through ``getData`` as a combine step that
    appends to a temporary table, and the temporary table is then reduced
    in place with ``Summarizer()`` (presumably collapsing equal keys into a
    count -- confirm against mymrutils).  Afterwards the accumulated data is
    mapped through ``presort`` into the destination table, which is finally
    sorted with ``mrsort``.
    """
    MR.useDefaults(verbose=True,
                   server='cedar00.search.yandex.net:8013',
                   username='snippets')
    result_table = 'likhomanov/empty_top'
    with mktmp() as scratch:
        source_tables = ['reqans_log/{}'.format(day)
                         for day in strdaterange((2015, 3, 30), (2015, 4, 6))]
        for source in source_tables:
            # Append this day's raw records, then re-summarize the scratch
            # table right away so it is reduced after every day.
            MR.runCombine(getData, srcTable=source, dstTable=scratch, appendMode=True)
            MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)
        MR.runMap(presort, srcTable=scratch, dstTable=result_table)
        # Disabled step kept from the original script:
        #MR.runReduce(Limiter(10000), srcTable=result_table, dstTable=result_table)
        mrsort(result_table)

def getData(recs):
    """Yield one record per search result whose snippet is very short.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request and its results.  A request is skipped unless its ``stype`` is
    'www', its ``is_yandex`` field (default '0') is not '1', and
    ``serpLang`` returns one of 'ru', 'ua', 'by', 'kz', 'com.tr'
    ('com.tr' is reported as 'tr').

    For every result that has both ``snippets_length`` and ``url`` and
    whose ``snippets_length`` is at most 20, a
    ``Record('<lang>\\t<snippets_type or "-">\\t<url>', '', '1')`` is
    yielded, unless that tab-joined string is longer than 4096 characters.
    """
    accepted_langs = ('ru', 'ua', 'by', 'kz', 'com.tr')
    for entry in recs:
        request, results = parseReqans(entry.value)
        # Keep only 'www' requests that are not marked is_yandex == '1'.
        if request.get('stype') != 'www' or request.get('is_yandex', '0') == '1':
            continue
        serp_lang = serpLang(request)
        if serp_lang not in accepted_langs:
            continue
        lang = 'tr' if serp_lang == 'com.tr' else serp_lang
        for result in results:
            if not ('snippets_length' in result and 'url' in result):
                continue
            if int(result['snippets_length']) > 20:
                continue
            fields = (lang, result.get('snippets_type', '-'), result['url'])
            key = '\t'.join(fields)
            # Over-long keys (typically huge URLs) are dropped.
            if len(key) <= 4096:
                yield Record(key, '', '1')

def presort(rec):
    """Re-key a counted record so that sorting lists the largest counts first.

    ``rec.key`` is split at its first tab into the language and the
    remaining data; ``rec.value`` is parsed as the integer count ``n``.
    The yielded record is
    ``Record(lang, <inverted count>, '<n>\\t<data>')``.

    The inverted count ``1000000000 - n`` is zero-padded to 9 digits.
    Without the padding, counts above 900000000 produce a shorter string,
    and a string comparison would misplace them (e.g. '10' sorts before
    '2').  For counts from 0 to 900000000 the output is identical to the
    unpadded form.  Counts above 1000000000 give a negative value; those
    sort ahead of all smaller counts but are not ordered correctly among
    themselves.

    Raises ValueError if ``rec.key`` contains no tab or ``rec.value`` is
    not an integer.
    """
    lang, data = rec.key.split('\t', 1)
    n = int(rec.value)
    # Fixed-width key: presumably compared as a string by the later sort
    # (mrsort in main) -- TODO confirm against the MapReduce sort semantics.
    inverted = '%09d' % (1000000000 - n)
    yield Record(lang, inverted, str(n) + '\t' + data)

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

