#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from hashlib import md5
from itertools import groupby


def _load_top_hosts(path):
    """Return the hosts listed in the file *path* as a frozenset.

    The host is the first whitespace-separated field of each line; lines
    with no fields (blank or whitespace-only) are skipped instead of
    raising IndexError.  The file is closed as soon as it has been read.
    """
    with open(path) as hosts_file:
        return frozenset(
            fields[0]
            for fields in (line.split() for line in hosts_file)
            if fields
        )


# Hosts of interest, read from the local file 'trlarge' at import time (the
# same file name is passed as files=['trlarge'] in main()).  A frozenset is
# used because getData() only tests membership, once per search result.
tophosts = _load_top_hosts('trlarge')

def main():
    """Run the whole job through the MapReduce client ``MR``.

    1. Map ``getData`` over every ``reqans_log/<date>`` table named by
       ``strdaterange((2014, 8, 4), (2014, 8, 11))``, appending all output
       to a single temporary table.
    2. Reduce that temporary table in place with ``summarize``.
    3. Reduce it with ``Limiter(100)`` into ``likhomanov/tr_top_host_reqs``.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['trlarge'],
    )
    DST = 'likhomanov/tr_top_host_reqs'
    with mktmp() as scratch:
        # Resolve the full list of daily source tables before launching jobs.
        daily_tables = [
            'reqans_log/{}'.format(day)
            for day in strdaterange((2014, 8, 4), (2014, 8, 11))
        ]
        for table in daily_tables:
            MR.runMap(getData, srcTable=table, dstTable=scratch,
                      appendMode=True)
        # Source and destination are the same table: summarised in place.
        MR.runReduce(summarize, srcTable=scratch, dstTable=scratch)
        # NOTE(review): Limiter is not defined in this file; presumably it
        # keeps at most 100 records per key -- confirm against its source.
        MR.runReduce(Limiter(100), srcTable=scratch, dstTable=DST)

def getData(rec):
    """Map one reqans log record to (host, request-hash, request) records.

    ``rec.value`` is parsed with ``parseReqans`` into a request and its
    results.  Nothing is emitted unless ``serpLang`` of the request is
    ``'com.tr'``.  For every result that has a ``'snippets_type'`` key and
    whose URL host is in ``tophosts``, yields
    ``Record(host, md5-hex of req['req'], req['req'])``.
    """
    request, results = parseReqans(rec.value)
    # NOTE(review): the return value is discarded; presumably checkwww
    # validates or adjusts the request as a side effect -- confirm.
    checkwww(request)
    if serpLang(request) == 'com.tr':
        for result in results:
            if 'snippets_type' in result:
                host = getHost(result.get('url'))
                if host in tophosts:
                    query = request['req']
                    yield Record(host, md5(query).hexdigest(), query)

def summarize(host, recs):
    """Collapse runs of records with equal subkey into one counted record.

    For each run of consecutive records in *recs* that share a ``subkey``,
    yields ``Record(host, str(100000000 - n), value)``, where ``n`` is the
    length of the run and ``value`` is taken from the run's first record.

    groupby only merges adjacent items, so *recs* is assumed to arrive
    ordered by subkey -- TODO confirm the reduce input ordering.
    """
    for _, same_subkey in groupby(recs, lambda rec: rec.subkey):
        # The first record supplies the value; the rest are only counted.
        first_value = next(same_subkey).value
        count = 1 + sum(1 for _ in same_subkey)
        # NOTE(review): the inverted count presumably makes ascending subkey
        # order put the most frequent requests first -- confirm.
        yield Record(host, str(100000000 - count), first_value)

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

