#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from collections import defaultdict

# Set of host names read from the local file 'prodoffhosts2746', one entry
# per line with surrounding whitespace stripped.  getData() tests
# getHost(url) against this set.  The file is opened in a `with` block so
# the handle is closed as soon as the set has been built.
with open('prodoffhosts2746') as _hosts_file:
    pohosts = set(line.strip() for line in _hosts_file)

def main():
    """Run the whole job chain: collect per-day data, then build the counts table.

    Steps, in order:
      1. For every 'reqans_log/<date>' table in the configured date range, run
         getData as a combine, appending to the URL table and to a temporary
         counters table, and re-sort the URL table after each day.
      2. Reduce the temporary counters with Summarizer() into DSTCNT.
      3. Reduce the URL table with join, then Summarizer(), reusing the
         temporary table for the intermediate results.
      4. Append the presort-ed rows to DSTCNT and reduce it in place with
         Limiter(20).

    Summarizer, Limiter, mrsort, mktmp and strdaterange come from mymrutils;
    their exact semantics are not visible here.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['prodoffhosts2746'],
    )
    # NOTE(review): DST is assigned but never used below -- confirm whether
    # it was meant to be an output table.
    DST = 'likhomanov/prodoff_new_2'
    DSTCNT = 'likhomanov/po_cnt_all_2'
    url_table = 'likhomanov/prodoffnew'
    day_tables = [
        'reqans_log/{}'.format(day)
        for day in strdaterange((2015, 3, 1), (2015, 3, 4))
    ]
    with mktmp() as tmpcnt:
        for day_table in day_tables:
            MR.runCombine(
                getData,
                srcTable=day_table,
                dstTables=[url_table, tmpcnt],
                appendMode=True,
            )
            mrsort(url_table)
        MR.runReduce(Summarizer(), srcTable=tmpcnt, dstTable=DSTCNT)
        MR.runReduce(join, srcTable=url_table, dstTable=tmpcnt)
        MR.runReduce(Summarizer(), srcTable=tmpcnt, dstTable=tmpcnt)
        MR.runMap(presort, srcTable=tmpcnt, dstTable=DSTCNT, appendMode=True)
        MR.runReduce(Limiter(20), srcTable=DSTCNT, dstTable=DSTCNT)

def getData(recs):
    """Combine step: emit matching result URLs and aggregate counters.

    Each input record's value is parsed with parseReqans into a request and
    its results.  Requests are skipped unless iswww(req) is true and
    serpLang(req) equals 'com.tr'.  Only results that contain a
    'snippets_type' key are considered.

    Output table 0: Record(url, '1', '') for every considered result whose
    host (getHost(url)) is in pohosts.

    Output table 1, emitted once after all input has been consumed, one
    record per counter (key = counter name, value = count as a string):
      'allsnip'  -- considered results
      'allhost'  -- considered results whose host is in pohosts
      'allserp'  -- requests with at least one considered result
      'hostserp' -- requests with at least one result on a pohosts host
    """
    counters = defaultdict(int)
    for rec in recs:
        req, ress = parseReqans(rec.value)
        if not iswww(req) or serpLang(req) != 'com.tr':
            continue
        snippet_seen = False
        host_seen = False
        for res in ress:
            if 'snippets_type' in res:
                counters['allsnip'] += 1
                snippet_seen = True
                url = res['url']
                if getHost(url) in pohosts:
                    host_seen = True
                    counters['allhost'] += 1
                    yield Record(url, '1', '', tableIndex=0)
        if snippet_seen:
            counters['allserp'] += 1
        if host_seen:
            counters['hostserp'] += 1
    # Counters are flushed only at the end so each one is emitted once per call.
    for name, total in counters.iteritems():
        yield Record(name, '', str(total), tableIndex=1)

def join(url, recs):
    """Reduce step over all records that share the key `url`.

    Counts the records whose subkey is '1' and notes whether any record
    with a different subkey is present.  Only when both kinds occur does it
    emit two records carrying that count as a string: one keyed by
    getHost(url) and one keyed by 'ALL'.  Otherwise nothing is emitted.
    """
    hits = 0
    other_seen = False
    for rec in recs:
        if rec.subkey != '1':
            other_seen = True
            continue
        hits += 1
    if not (other_seen and hits):
        return
    count = str(hits)
    yield Record(getHost(url), '', count)
    yield Record('ALL', '', count)

def presort(rec):
    """Map step: re-key per-host rows under 'host'; pass 'ALL' rows through.

    A record keyed 'ALL' is yielded unchanged.  Any other record is replaced
    by Record('host', str(100000000000 - n), '<n>\\t<original key>') where n
    is int(rec.value), so a larger n produces a numerically smaller subkey.
    """
    if rec.key != 'ALL':
        count = int(rec.value)
        inverted = str(100000000000 - count)
        payload = '{}\t{}'.format(count, rec.key)
        yield Record('host', inverted, payload)
    else:
        yield rec

# Run the job chain only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    main()

