#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random
from collections import defaultdict

def _load_recipe_hosts(path):
    """Read a host list (one host per line) into a set.

    Each line is stripped of surrounding whitespace and a leading 'www.'
    is removed. Lines that end up empty are skipped so that the empty
    string never becomes a member of the resulting set.

    path -- path of the text file to read.
    Returns a set of host strings.
    """
    hosts = set()
    # The context manager closes the file deterministically instead of
    # leaving the handle to the garbage collector.
    with open(path) as hosts_file:
        for line in hosts_file:
            host = line.strip()
            if host.startswith('www.'):
                host = host[4:]
            if host:
                hosts.add(host)
    return hosts


# Hosts loaded at import time; getRecipes() checks URL hosts against this set.
rhosts = _load_recipe_hosts('recipe_hosts_09')

def main():
    """Configure the MapReduce client and run the reduce step.

    The reduce reads 'likhomanov/rec_tmp' and writes its output to
    'likhomanov/recipe_stats'. Summarizer is not defined in the visible
    part of this file (presumably provided by the star import from
    mymrutils -- TODO confirm).
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['recipe_hosts_09']
    )
    stats_table = 'likhomanov/recipe_stats'
    with mktmp() as scratch:
        # Disabled first stage, kept for reference: it combined one week of
        # reqans logs through getRecipes into the scratch table.
        #for src in ['reqans_log/{}'.format(d) for d in strdaterange((2014,9, 15), (2014,9,22))]:
        #    MR.runCombine(getRecipes, srcTable=src, dstTable=scratch, appendMode=True)
        MR.runReduce(Summarizer(), srcTable='likhomanov/rec_tmp', dstTable=stats_table)


def getRecipes(recs):
    """Count snippet results per SERP language, overall and on recipe hosts.

    Every record's value is parsed with parseReqans into a request and its
    results. Only requests with stype 'www', is_yandex '1' and a SERP
    language of ru/ua/by/kz/com.tr are considered ('com.tr' is reported as
    'tr'). For each result that has a truthy 'snippets_type' and a URL:

      * '<lang> ALL' is incremented;
      * '<lang> rec' is incremented as well when the URL's host is in
        rhosts and getInnerPath(url) is truthy with length greater than 1.

    recs -- iterable of records exposing a .value attribute.
    Yields one Record(key, '', str(count)) per counter key.
    """
    counts = defaultdict(int)
    for record in recs:
        request, results = parseReqans(record.value)
        if request.get('stype') != 'www' or request.get('is_yandex', '0') != '1':
            continue
        serp_lang = serpLang(request)
        if serp_lang not in ('ru', 'ua', 'by', 'kz', 'com.tr'):
            continue
        # Turkish SERPs are identified by 'com.tr' but reported as 'tr'.
        prefix = 'tr' if serp_lang == 'com.tr' else serp_lang
        total_key = prefix + ' ALL'
        recipe_key = prefix + ' rec'
        for result in results:
            if not result.get('snippets_type'):
                continue
            url = result.get('url')
            if url:
                counts[total_key] += 1
                if getHost(url) in rhosts:
                    inner_path = getInnerPath(url)
                    # Require an inner path longer than one element/character.
                    if inner_path and len(inner_path) > 1:
                        counts[recipe_key] += 1
    for key, count in counts.iteritems():
        yield Record(key, '', str(count))

# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    main()

