#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from collections import defaultdict


def main():
    """Build the 'likhomanov/tr_top_hosts_nov' table from the daily reqans logs.

    For every table name 'reqans_log/<d>' with <d> taken from
    strdaterange((2014, 11, 1), (2014, 12, 1)), the table is mapped with
    getData into a temporary table, which is then reduced with Summarizer
    and appended to the destination table.  After all source tables are
    processed, the destination is reduced once more with Summarizer,
    re-keyed with presort and finally reduced with Limiter(10000), each
    step writing back to the destination table.

    Progress is written to the local file 'log1' and flushed after every
    line.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    DST = 'likhomanov/tr_top_hosts_nov'
    first_day = (2014, 11, 1)
    # For a quick trial run on a single table use (2014, 11, 2) here.
    last_day = (2014, 12, 1)

    with open('log1', 'w') as log, mktmp() as tmp:

        def note(table, status):
            # One "<table> <status>" line per step, flushed immediately so
            # the file reflects progress while the job is still running.
            log.write('{} {}\n'.format(table, status))
            log.flush()

        sources = ['reqans_log/{}'.format(day) for day in strdaterange(first_day, last_day)]
        for src in sources:
            MR.runMap(getData, srcTable=src, dstTable=tmp)
            note(src, 'consumed')
            MR.runReduce(Summarizer(), srcTable=tmp, dstTable=DST, appendMode=True)
            note(src, 'merged')

        # Post-processing passes, all in place on the destination table.
        MR.runReduce(Summarizer(), srcTable=DST, dstTable=DST)
        MR.runMap(presort, srcTable=DST, dstTable=DST)
        MR.runReduce(Limiter(10000), srcTable=DST, dstTable=DST)

def getData(rec):
    """Map one reqans log record to per-host and per-path-prefix counters.

    The record value is parsed with parseReqans into a request and its
    results.  Requests whose serpLang is not 'com.tr' produce no output.
    Only results that contain a 'snippets_type' entry are counted.

    For every counted result the host of its URL is counted once.  If the
    URL also has an inner path, the path is split on runs of '/' and '?'
    (separators are kept) and each successively longer prefix
    "host + path-so-far" is counted as well, down to at most five levels
    below the first path component.

    For every distinct key K collected from this record two records are
    yielded:
      * Record('doc\\t' + K, '', <occurrences of K in this record>)
      * Record('serp\\t' + K, '', '1')

    :param rec: input record; only ``rec.value`` is read.
    """
    # Maximum number of path-prefix levels counted per URL.
    max_depth = 5

    req, ress = parseReqans(rec.value)
    # NOTE(review): checkwww comes from mymrutils; its return value is
    # ignored here, so it is presumably called for a side effect -- confirm.
    checkwww(req)
    if serpLang(req) != 'com.tr':
        return

    counts = defaultdict(int)
    for res in ress:
        if 'snippets_type' not in res:
            continue
        url = res.get('url')
        host = getHost(url)
        if not host:
            continue
        counts[host] += 1

        inner = getInnerPath(url)
        if not inner:
            continue
        # The capturing group keeps the separators in the result, so
        # joining the pieces back together reproduces the original path.
        parts = [p for p in re.split(r'([/?]+)', inner) if p]
        if not parts:
            continue

        # The first piece is appended without being counted on its own.
        prefix = host + parts[0]
        parts = parts[1:]
        depth = 0
        while parts and depth < max_depth:
            # Consume two pieces at a time so that each counted prefix
            # grows by one further chunk of the path.
            prefix += ''.join(parts[:2])
            counts[prefix] += 1
            parts = parts[2:]
            # The original never advanced this counter, so the depth
            # limit in the loop condition had no effect.
            depth += 1

    for key, count in counts.iteritems():
        yield Record('doc\t' + key, '', str(count))
        yield Record('serp\t' + key, '', '1')


def presort(rec):
    """Re-key a summarized record by class with an inverted-count subkey.

    The incoming key is expected to have the form '<class>\\t<name>' and the
    value to be a decimal integer.  The yielded record has:
      * key    -- the class (text before the first tab),
      * subkey -- str(10000000000 - count), which gets smaller as the count
                  gets larger,
      * value  -- '<name>\\t<original value>'.

    NOTE(review): presumably the inverted subkey makes the largest counts
    come first for the following Limiter step -- confirm against Limiter.

    :param rec: input record; ``rec.key`` and ``rec.value`` are read.
    """
    group, name = rec.key.split('\t', 1)
    count = int(rec.value)
    inverted = str(10000000000 - count)
    payload = '\t'.join((name, rec.value))
    yield Record(group, inverted, payload)

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

