#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random
from collections import defaultdict

# Set of tracked host URLs read from the local file '2672_tr' (one entry per
# line).  Entries without an explicit scheme are stored with an 'http://'
# prefix so they can be compared against the 'url' field of search results.
hosts = set()
with open('2672_tr') as hostsFile:
    for line in hostsFile:
        line = line.strip()
        if not line:
            # A blank line would otherwise add the bogus entry 'http://'.
            continue
        # Keep entries that already carry a scheme as they are; prefixing an
        # 'http://...' line again would yield 'http://http://...'.
        if line.startswith(('http://', 'https://')):
            hosts.add(line)
        else:
            hosts.add('http://' + line)

def main():
    """Run the whole job: combine the daily logs, then reduce them.

    Sets the MapReduce defaults, runs ``getData`` as a combine step over
    every ``reqans_log/<day>`` table produced from ``strdaterange`` and
    appends the output to a table obtained from ``mktmp()``, then runs
    ``Summarizer`` as a reduce step from that table into the result table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['2672_tr'],
    )
    resultTable = 'likhomanov/bno_2672_tr'
    with mktmp() as scratch:
        # Build the full list of source tables before starting any job.
        dayTables = []
        for day in strdaterange((2014,12,4), (2014,12,7)):
            dayTables.append('reqans_log/{}'.format(day))
        for table in dayTables:
            MR.runCombine(getData, srcTable=table, dstTable=scratch, appendMode=True)
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=resultTable)

def _countSerp(ress, counts):
    """Add the counters for one result page to ``counts``.

    Only the first ten entries of ``ress`` are inspected, and entries
    without a truthy 'url' are skipped.  Counters incremented:

    * 'serp'    -- at least one inspected result has a url;
    * 'top10'   -- at least one inspected result url is in ``hosts``;
    * 'top1'    -- the first result has a url and it is in ``hosts``;
    * 'bno'     -- the first result has a url and its 'extralinks' value
                   is truthy and contains 'bno';
    * 'bnogood' -- both the 'bno' and the 'top1' conditions hold.
    """
    sawUrl = False
    sawTracked = False
    for pos, res in enumerate(ress[:10]):
        url = res.get('url')
        if not url:
            continue
        sawUrl = True
        # Look the url up once; the result is needed in up to three places.
        tracked = url in hosts
        if tracked:
            sawTracked = True
        if pos != 0:
            continue
        # Everything below concerns the first result only.
        if tracked:
            counts['top1'] += 1
        extra = res.get('extralinks')
        if extra and 'bno' in extra:
            counts['bno'] += 1
            if tracked:
                counts['bnogood'] += 1
    if sawTracked:
        counts['top10'] += 1
    if sawUrl:
        counts['serp'] += 1


def getData(recs):
    """Count tracked-host statistics over a stream of log records.

    Each record's ``value`` is parsed with ``parseReqans`` into a request
    and its list of results.  A record is counted only when the request has
    stype 'www', is not flagged with is_yandex == '1', and ``serpLang``
    returns 'com.tr' for it.

    Yields one ``Record(name, '', str(count))`` per counter that was
    incremented at least once; see ``_countSerp`` for the counter names.
    """
    counts = defaultdict(int)
    for rec in recs:
        req, ress = parseReqans(rec.value)
        if req.get('stype') != 'www':
            continue
        if req.get('is_yandex', '0') == '1':
            continue
        if serpLang(req) != 'com.tr':
            continue
        _countSerp(ress, counts)
    # items() iterates the same pairs as iteritems() and also exists on
    # Python 3.
    for name, count in counts.items():
        yield Record(name, '', str(count))

# Script entry point: call main() when this module is run as __main__.
if __name__ == '__main__':
    main()

