#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from collections import defaultdict

# Hosts counted as news sources, one per line of news_hosts.lst (surrounding
# whitespace stripped). The file is read at import time from the current
# working directory; main() also passes it to the job via files=[...].
# Opened with `with` so the handle is closed deterministically instead of
# being left to the garbage collector.
with open('news_hosts.lst') as hosts_file:
    newshosts = [line.strip() for line in hosts_file]

def main():
    """Aggregate news-host statistics over the daily reqans logs.

    For every date string produced by strdaterange((2015, 3, 1), (2015, 3, 8))
    the table 'reqans_log/<date>' is combined with getData into a temporary
    table (append mode), which is then reduced with Summarizer() into the
    destination table 'likhomanov/news_2780'.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['news_hosts.lst'],
    )
    DST = 'likhomanov/news_2780'
    days = strdaterange((2015, 3, 1), (2015, 3, 8))
    # All source table names are materialized before the first job starts.
    daily_tables = ['reqans_log/{}'.format(day) for day in days]
    with mktmp() as tmp:
        for table in daily_tables:
            MR.runCombine(getData, srcTable=table, dstTable=tmp, appendMode=True)
            # NOTE(review): the temporary table is re-sorted after every
            # append; a single sort before the reduce may be sufficient --
            # confirm against mrsort/runReduce requirements before changing.
            mrsort(tmp)
        # Summarizer is not defined in this file; presumably it comes from
        # the wildcard import of mymrutils -- verify.
        MR.runReduce(Summarizer(), srcTable=tmp, dstTable=DST)


def getData(recs):
    """Count snippet results and news-host results for 'com.tr' requests.

    Used as the combine function in main(): consumes the records it is given
    and emits one partial counter per key.

    A request is considered only if iswww(req) is truthy and serpLang(req)
    equals 'com.tr'. Within it, only results containing the 'snippets_type'
    key are counted. A result is a "news" result when its host is listed in
    ``newshosts`` and getInnerPath(url) is not None, '' or '/'.

    Emitted keys:
        ALL       -- counted results.
        NEWS      -- news results.
        <host>    -- news results broken down by host.
        ALLSERP   -- requests with at least one counted result.
        NEWSSERP  -- requests with at least one news result.

    Args:
        recs: iterable of records; each record's ``value`` is passed to
            parseReqans, which yields a (request, results) pair.

    Yields:
        Record(key, '', str(count)) for every accumulated counter.
    """
    # Membership is tested once per result; build a hash set once per call
    # instead of scanning the module-level list linearly every time.
    news_hosts = frozenset(newshosts)
    counts = defaultdict(int)

    for rec in recs:
        req, results = parseReqans(rec.value)
        if not iswww(req) or serpLang(req) != 'com.tr':
            continue

        have_snip = have_news = False
        for res in results:
            if 'snippets_type' not in res:
                continue
            have_snip = True
            counts['ALL'] += 1

            # Hosts are matched case-insensitively by lowercasing the URL.
            url = res['url'].lower()
            host = getHost(url)
            # The host check comes first so getInnerPath is only evaluated
            # for URLs on news hosts.
            if host in news_hosts and getInnerPath(url) not in (None, '', '/'):
                counts[host] += 1
                counts['NEWS'] += 1
                have_news = True

        if have_snip:
            counts['ALLSERP'] += 1
            if have_news:
                counts['NEWSSERP'] += 1

    for key, count in counts.iteritems():
        yield Record(key, '', str(count))

# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    main()

