#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from collections import defaultdict


def main():
    """Run the whole job: gather counts per source table, merge, then sort.

    For every source table named ``reqans_log/<d>`` (``<d>`` comes from
    ``strdaterange``), ``getData`` is run as a combine step into a temporary
    table and the result is reduced with ``Summarizer`` into the destination
    table in append mode.  After all sources are processed the destination
    table is reduced once more in place, re-keyed with ``presort`` and passed
    to ``mrsort``.

    Progress is written to the local file ``log2`` (truncated on start), one
    line after each combine and each merge.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    dst = 'likhomanov/sabah'
    with open('log2', 'w') as log, mktmp() as tmp:

        def report(table, stage):
            # Flush after every line so progress can be followed while the
            # job is still running.
            print >>log, table, stage
            log.flush()

        # Alternative short range kept from the original source (presumably
        # for trial runs): strdaterange((2014, 11, 1), (2014, 11, 2))
        first, last = (2014, 11, 15), (2014, 12, 15)
        sources = ['reqans_log/{}'.format(d) for d in strdaterange(first, last)]

        for table in sources:
            MR.runCombine(getData, srcTable=table, dstTable=tmp)
            report(table, 'consumed')
            MR.runReduce(Summarizer(), srcTable=tmp, dstTable=dst, appendMode=True)
            report(table, 'merged')

        # Collapse the appended per-source partial results, then re-key and sort.
        MR.runReduce(Summarizer(), srcTable=dst, dstTable=dst)
        MR.runMap(presort, srcTable=dst, dstTable=dst)
        mrsort(dst)
        # Disabled in the original source:
        #MR.runReduce(Limiter(10000), srcTable=dst, dstTable=dst)

def getData(recs):
    """Yield ``Record(url, '', '1')`` for each matching result in ``recs``.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request and its results.  A request is skipped unless its ``'stype'`` is
    ``'www'``, and also skipped when its ``'is_yandex'`` field (default
    ``'0'``) equals ``'1'``.  Within a kept request, a result produces an
    output record only if it has a ``'snippets_type'`` key and
    ``getHost`` of its ``'url'`` is ``'sabah.com.tr'``.
    """
    for rec in recs:
        request, results = parseReqans(rec.value)
        # Short-circuit keeps the original order: 'stype' is checked first.
        if request.get('stype') != 'www' or request.get('is_yandex', '0') == '1':
            continue
        for result in results:
            # 'url' is only read for results that carry 'snippets_type'.
            if 'snippets_type' in result and getHost(result['url']) == 'sabah.com.tr':
                yield Record(result['url'], '', '1')

def presort(rec):
    """Re-key a record under the constant key ``'0'`` with an inverted count.

    ``rec.value`` is parsed as an integer count.  The emitted record has key
    ``'0'``, subkey ``str(100000000 - count)`` and value
    ``'<original key>\\t<count>'``.

    Presumably the inverted subkey makes a later ascending sort list the
    largest counts first -- confirm against ``mrsort``.
    NOTE(review): the subkey is not zero-padded; if it is compared as a
    string, counts of 0 or above 90000000 would change its length and could
    sort out of order -- verify.
    """
    count = int(rec.value)
    inverted = str(100000000 - count)
    payload = '{}\t{}'.format(rec.key, count)
    yield Record('0', inverted, payload)

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

