#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random
from collections import defaultdict

# Set of host names read from the local 'rechosts' file, one host per line.
# getRecipes() tests getHost(url) for membership in this set.
rhosts = set()
# Use a context manager so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open('rechosts') as hosts_file:
    for l in hosts_file:
        l = l.strip()
        # A leading 'www.' is kept as-is (prefix stripping is disabled).
        # Skip blank lines so the empty string never counts as a host.
        if l:
            rhosts.add(l)

def main():
    """Configure the MapReduce client and run the recipe-statistics pipeline.

    Steps, all writing into one temporary table obtained from mktmp():
      1. run getRecipes as a combine job over every 'reqans_log/<date>'
         table produced by strdaterange((2014,10,22), (2014,10,23))
         (whether the end date is inclusive depends on strdaterange --
         TODO confirm);
      2. run cat1 as a map job over 'evilenka/hasrecipe/reduced', appending
         its output to the same temporary table;
      3. run summarize as a reduce job from the temporary table into
         'likhomanov/rec_stats'.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['rechosts'],
    )
    result_table = 'likhomanov/rec_stats'
    # Build the list of daily log tables up front, then process them in order.
    log_tables = [
        'reqans_log/{}'.format(day)
        for day in strdaterange((2014, 10, 22), (2014, 10, 23))
    ]
    with mktmp() as tmp:
        for log_table in log_tables:
            MR.runCombine(getRecipes, srcTable=log_table, dstTable=tmp,
                          appendMode=True)
        MR.runMap(cat1, srcTable='evilenka/hasrecipe/reduced', dstTable=tmp,
                  appendMode=True)
        MR.runReduce(summarize, srcTable=tmp, dstTable=result_table)


def getRecipes(recs):
    """Combine step: emit one record per qualifying search result URL.

    Each input record's ``value`` is parsed with parseReqans() into a request
    and its list of results.  A request is processed only when its 'stype'
    is 'www', its 'is_yandex' flag is not '1', and serpLang(req) is 'com.tr'.

    For every result of such a request that has a truthy 'snippets_type' and
    a non-empty 'url', a running total is incremented; if the URL's host
    (via getHost) is in the module-level ``rhosts`` set, a
    Record(url, 'z', '1') is yielded.

    After all input is consumed, a final Record('ALL', '', <total>) carries
    the running total as a string.
    """
    total = 0
    for rec in recs:
        req, ress = parseReqans(rec.value)
        # Same three filters as separate guards, evaluated in the same order.
        skip_request = (
            req.get('stype') != 'www'
            or req.get('is_yandex', '0') == '1'
            or serpLang(req) != 'com.tr'
        )
        if skip_request:
            continue
        for res in ress:
            # Only look at the URL of results that carry a snippet type.
            url = res.get('url') if res.get('snippets_type') else None
            if not url:
                continue
            total += 1
            if getHost(url) in rhosts:
                yield Record(url, 'z', '1')
    yield Record('ALL', '', str(total))

def cat1(rec):
    """Map step: re-emit the record's key with subkey 'a' and an empty value.

    summarize() treats a leading non-'z' record for a key as the signal that
    the key should be counted.
    """
    marker = Record(rec.key, 'a', '')
    yield marker

def summarize(key, recs):
    """Reduce step: collapse all records of one key into at most one count.

    * For the special key 'ALL', the integer values of all records are
      summed.
    * For any other key, the first record decides: if the group is empty or
      the first record has subkey 'z', nothing is emitted.  Otherwise every
      record after the first one is counted.
      NOTE(review): this assumes records within a key arrive ordered by
      subkey, so that an 'a' marker from cat1 precedes the 'z' hits from
      getRecipes, and that there is a single marker per key -- confirm
      against the MapReduce runtime and the source table.

    Args:
        key: the group key shared by all records in ``recs``.
        recs: iterable of records exposing ``subkey`` and ``value``.

    Yields:
        Record(key, '', str(n)) when the resulting count ``n`` is non-zero.
    """
    # Accept any iterable, not only iterators (next() needs an iterator).
    recs = iter(recs)
    n = 0
    if key == 'ALL':
        n = sum(int(r.value) for r in recs)
    else:
        # A bare next() would leak StopIteration out of this generator on an
        # empty group, which is a RuntimeError under PEP 479; use a default.
        first = next(recs, None)
        if first is None or first.subkey == 'z':
            return
        for _ in recs:
            n += 1
    if n:
        yield Record(key, '', str(n))

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

