#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from collections import defaultdict

def main():
    """Run the whole pipeline and write the result to 'likhomanov/top_2815'.

    For each daily 'reqans_log/<date>' table the counts produced by getData
    are appended to a temporary table, which is then reduced in place with
    Summarizer().  After all days are processed the temporary table is
    re-keyed by presort and reduced with Limiter(10000) into the destination.

    NOTE(review): Summarizer and Limiter come from outside this file;
    presumably they sum the counts per key and keep the first 10000 rows
    per key respectively -- confirm against their definitions.
    """
    MR.useDefaults(username='tmp', server='cedar00.search.yandex.net:8013', verbose=True)
    resultTable = 'likhomanov/top_2815'
    with mktmp() as scratch:
        # Materialise the list of daily log tables before running any job.
        dailyLogs = []
        for day in strdaterange((2015, 2, 12), (2015, 2, 19)):
            dailyLogs.append('reqans_log/{}'.format(day))

        for logTable in dailyLogs:
            # Append this day's partial counts, then collapse the table in place.
            MR.runCombine(getData, srcTable=logTable, dstTable=scratch, appendMode=True)
            MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)

        MR.runMap(presort, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Limiter(10000), srcTable=scratch, dstTable=resultTable)

def getData(recs):
    """Combine step: count search results per language, host and URL path prefix.

    Each input record's value is parsed with parseReqans into a request and
    its results.  Only requests accepted by iswww() whose serpLang() is 'ru'
    or 'com.tr' (reported as 'tr') are counted.  For every result carrying a
    'snippets_type' key the following counters, all prefixed with
    '<lang>\\t', are incremented:

      * 'ALL'            -- every such result;
      * 'IN'             -- results whose inner path is not None, '' or '/';
      * '<host>'         -- the result's host (inner-path results only);
      * '<host><prefix>' -- successive path prefixes, at most 5 levels deep.

    Yields Record(key, '', str(count)).  Counts are flushed whenever the
    in-memory dict exceeds 100000 keys, so the same key may be emitted more
    than once with partial counts; a later reduce step is expected to sum
    them.  Keys of 4096 characters or longer are dropped.

    NOTE: the original loop initialised a depth counter but never
    incremented it, so the 5-level cap was never applied; it is enforced here.
    """
    maxDepth = 5          # maximum number of path prefix levels counted per URL
    maxKeys = 100000      # flush threshold for the in-memory counter dict
    maxKeyLen = 4096      # keys this long or longer are not emitted
    # The capturing group makes re.split keep the separators in the result,
    # so prefixes can be rebuilt exactly as they appear in the URL.
    splitter = re.compile(r'([/?]+)')

    d = defaultdict(int)
    for rec in recs:
        req, ress = parseReqans(rec.value)
        if not iswww(req):
            continue
        lang = serpLang(req)
        if lang == 'com.tr':
            lang = 'tr'
        elif lang != 'ru':
            continue
        lang += '\t'
        for res in ress:
            if 'snippets_type' not in res:
                continue
            d[lang + 'ALL'] += 1
            url = res['url']
            host = getHost(url)
            inner = getInnerPath(url)
            if inner in (None, '', '/'):
                continue
            d[lang + 'IN'] += 1
            d[lang + host] += 1
            parts = [p for p in splitter.split(inner) if p]
            if parts:
                # The first piece (presumably the leading separator of the
                # inner path) is glued to the host without being counted.
                host += parts[0]
                parts = parts[1:]
                depth = 0
                while parts and depth < maxDepth:
                    # Extend the prefix by one segment plus its trailing separator.
                    host += ''.join(parts[:2])
                    d[lang + host] += 1
                    parts = parts[2:]
                    depth += 1
            if len(d) > maxKeys:
                # Emit partial counts and start over to bound memory use.
                for k, v in d.iteritems():
                    if len(k) < maxKeyLen:
                        yield Record(k, '', str(v))
                d = defaultdict(int)
    # Emit whatever is left after the last record.
    for k, v in d.iteritems():
        if len(k) < maxKeyLen:
            yield Record(k, '', str(v))

def presort(rec):
    """Map step: re-key a '<lang>\\t<arg>' count record for sorting by count.

    The emitted record has the language as key, the string form of
    (10000000000000 - count) as subkey and '<count>\\t<arg>' as value, so a
    larger count gets a numerically smaller subkey.
    """
    pieces = rec.key.split('\t', 1)
    lang, arg = pieces
    count = int(rec.value)
    invertedCount = str(10000000000000 - count)
    payload = '{}\t{}'.format(count, arg)
    yield Record(lang, invertedCount, payload)

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

