#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *

def main():
    """Run the whole pipeline that fills the 'likhomanov/turk_short' table.

    Steps, in order:
      1. Set the MapReduce client defaults (user, server, verbosity).
      2. For each 'reqans_log/<d>' table, with <d> produced by
         ``strdaterange((2015, 1, 1), (2015, 2, 1))``, append the output of
         ``getShort`` to a temporary table and reduce that table in place
         with ``Summarizer(limit=5)``.
      3. Map the temporary table through ``presort`` (in place).
      4. Reduce it with ``Limiter(50000)`` into the destination table.

    NOTE(review): ``mktmp``, ``strdaterange``, ``Summarizer`` and ``Limiter``
    are not defined in this file (presumably they come from ``mymrutils``);
    whether the end date is inclusive and what the two limits mean exactly
    should be confirmed there.
    """
    destination = 'likhomanov/turk_short'
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)

    with mktmp() as scratch:
        # Build the full list of daily source tables before running any job.
        daily_tables = [
            'reqans_log/{}'.format(day)
            for day in strdaterange((2015, 1, 1), (2015, 2, 1))
        ]
        for daily_table in daily_tables:
            # Append this day's records, then reduce the scratch table in
            # place before the next day is added.
            MR.runCombine(getShort, srcTable=daily_table, dstTable=scratch, appendMode=True)
            MR.runReduce(Summarizer(limit=5), srcTable=scratch, dstTable=scratch)

        MR.runMap(presort, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Limiter(50000), srcTable=scratch, dstTable=destination)

def getShort(recs):
    """Yield a Record for every qualifying search result with a short snippet.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request and its list of results.  A request is considered only when:
      * its 'stype' is 'www',
      * its 'is_yandex' field (default '0') is not '1',
      * ``serpLang(request)`` equals 'com.tr'.

    Within such a request, a result is emitted when its 'snippets_type' is
    one of 'generic', 'empty' or 'trash_annotation' and its
    'snippets_length' (default 0) converts to an integer not greater than 20.

    Emitted records are ``Record(url, '', '1')`` -- presumably
    (key, subkey, value); confirm against ``mapreducelib.Record``.
    """
    accepted_types = ('generic', 'empty', 'trash_annotation')

    for record in recs:
        request, results = parseReqans(record.value)

        # Checks are evaluated left to right and stop at the first failure.
        skip_request = (
            request.get('stype') != 'www'
            or request.get('is_yandex', '0') == '1'
            or serpLang(request) != 'com.tr'
        )
        if skip_request:
            continue

        for result in results:
            is_short = (
                result.get('snippets_type') in accepted_types
                and int(result.get('snippets_length', 0)) <= 20
            )
            if is_short:
                yield Record(result['url'], '', '1')

def presort(rec):
    """Re-key a record so that all records share one key and carry a sort field.

    Yields a single ``Record`` built from:
      * the constant '0' as the first field,
      * ``str(10000000 - int(rec.value))`` as the second field,
      * ``'<value>\\t<key>'`` of the input record as the third field.

    NOTE(review): the second field presumably serves as a subkey so that an
    ascending sort puts larger values first.  It is an unpadded decimal
    string, so this would only hold for a lexicographic sort while values
    stay between 1 and 9000000 -- confirm against the sort used downstream.
    """
    count = rec.value
    inverted_count = str(10000000 - int(count))
    payload = '{}\t{}'.format(count, rec.key)
    yield Record('0', inverted_count, payload)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

