#!/usr/bin/env python2
# coding: utf-8

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from collections import defaultdict


def main():
    """Run the four-stage MapReduce job that builds the recipes top table.

    Stages, all chained through one temporary table obtained from ``mktmp()``:

    1. map ``getData`` over ``reqans_log/20141223``;
    2. reduce with ``Summarizer()`` (presumably sums the per-key '1' values
       emitted by ``getData`` -- defined elsewhere, confirm there);
    3. map ``presort`` to re-key records by entry type;
    4. reduce with ``Limiter(50000)`` into ``likhomanov/recipes_top``
       (presumably keeps at most 50000 records per key -- confirm).
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    with mktmp() as scratch:
        # Each intermediate stage reads from and writes back to the same
        # temporary table.
        MR.runMap(getData, srcTable='reqans_log/20141223', dstTable=scratch)
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)
        MR.runMap(presort, srcTable=scratch, dstTable=scratch)
        # Only the final stage writes to the persistent destination table.
        MR.runReduce(Limiter(50000), srcTable=scratch,
                     dstTable='likhomanov/recipes_top')

def getData(rec):
    """Map step: emit one count per result URL and host for recipe queries.

    The record value is parsed with ``parseReqans`` into a request and its
    list of results.  The request is kept only if its SERP language is one
    of ru/ua/by/kz and its query text contains the word "рецепт" in any
    letter case.  For every result that has a ``snippets_type`` field, a
    non-empty URL and an inner path other than the site root, two records
    are yielded, both with an empty subkey and the value '1':

    * key ``'url' + TAB + url``
    * key ``'host' + TAB + host``

    :param rec: input record; only ``rec.value`` is read.
    :return: generator of ``Record`` objects.
    """
    req, ress = parseReqans(rec.value)
    # NOTE(review): the result of checkwww is ignored; presumably it
    # validates the request and raises on failure -- confirm in mymrutils.
    checkwww(req)
    if serpLang(req) not in ('ru', 'ua', 'by', 'kz'):
        return

    # Lowercasing a UTF-8 byte string only affects ASCII letters, so
    # "Рецепт" / "РЕЦЕПТ" would never match.  Decode to unicode first so
    # that the case-insensitive comparison works for Cyrillic as well.
    query = req['req']
    if isinstance(query, bytes):
        query = query.decode('utf-8', 'replace')
    if u'рецепт' not in query.lower():
        return

    for res in ress:
        if 'snippets_type' not in res:
            continue
        url = res.get('url')
        if not url:
            # .get() may return None; the string concatenation below would
            # fail on it, so skip results that carry no URL.
            continue
        inner = getInnerPath(url)
        if not inner or inner == '/':
            # Skip results that point at the site root.
            continue
        host = getHost(url)
        yield Record('url\t' + url, '', '1')
        yield Record('host\t' + host, '', '1')

def presort(rec):
    """Map step: re-key a counted record by its entry type.

    The incoming key must consist of exactly two TAB-separated parts (entry
    type and name); the incoming value is parsed as an integer count.

    The emitted record has the entry type as key, ``1000000000 - count``
    rendered as a string for the subkey, and ``count + TAB + name`` as
    the value.  Presumably the inverted subkey makes larger counts sort
    first within each type -- confirm against the MapReduce sort order.

    :param rec: input record; ``rec.key`` and ``rec.value`` are read.
    :return: generator yielding a single ``Record``.
    """
    kind, name = rec.key.split('\t')
    count = int(rec.value)
    inverted = str(1000000000 - count)
    payload = '{}\t{}'.format(count, name)
    yield Record(kind, inverted, payload)

# Run the job only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

