#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random
from collections import defaultdict

# Exact-match whitelist of YouTube home-page URL spellings, consulted by
# getData(): every combination of http/https, optional "www." prefix,
# "youtube.com" / "youtube.com.tr" host and optional trailing slash
# (2 * 2 * 2 * 2 = 16 strings).
# Built with a set comprehension so the loop variables do not linger as
# module-level globals.
yt = {
    'http' + sch + '://' + www + 'youtube.com' + dom + tail
    for sch in ('', 's')
    for www in ('', 'www.')
    for dom in ('', '.tr')
    for tail in ('', '/')
}

def main():
    """Run the whole pipeline against the MapReduce server.

    Steps, in order:
      1. Configure the MapReduce client defaults.
      2. Run the ``getData`` combiner over each daily ``reqans_log/<day>``
         table produced by ``strdaterange((2014,11,10), (2014,11,14))``,
         appending the output to one scratch table.
      3. Reduce the scratch table in place with ``summarize``.
      4. Reduce it once more with ``Limiter(500)`` into the result table.

    NOTE(review): ``mktmp``, ``strdaterange`` and ``Limiter`` come from the
    star import of ``mymrutils``; presumably they provide a temporary table
    name, the list of day strings and a row cap -- confirm there.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    result_table = 'likhomanov/bno_youtube'
    with mktmp() as scratch:
        # Build the full list of source tables before launching any job.
        day_tables = []
        for day in strdaterange((2014, 11, 10), (2014, 11, 14)):
            day_tables.append('reqans_log/{}'.format(day))

        for table in day_tables:
            MR.runCombine(getData, srcTable=table, dstTable=scratch, appendMode=True)

        MR.runReduce(summarize, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Limiter(500), srcTable=scratch, dstTable=result_table)


def getData(recs):
    """Combiner: collect whitelisted YouTube URLs shown with 'bno' extralinks.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request and its results. A request is kept only when its ``stype`` is
    ``'www'``, its ``is_yandex`` flag is not ``'1'`` and ``serpLang`` reports
    ``'com.tr'``. Within a kept request, a result counts when it has a
    ``snippets_type`` key, its ``url`` is in the module-level ``yt`` set and
    its ``extralinks`` value is non-empty and contains ``'bno'``.

    Yields one ``Record`` per distinct query text seen in this call:
    key = md5 hex digest of the query, empty subkey,
    value = query followed by the tab-separated matching URLs.
    """
    urls_by_query = defaultdict(set)

    for record in recs:
        request, results = parseReqans(record.value)

        # Request-level filters, evaluated in the same short-circuit order.
        rejected = (
            request.get('stype') != 'www'
            or request.get('is_yandex', '0') == '1'
            or serpLang(request) != 'com.tr'
        )
        if rejected:
            continue

        for result in results:
            if 'snippets_type' in result:
                link = result.get('url')
                if link in yt:
                    extralinks = result.get('extralinks')
                    if extralinks and 'bno' in extralinks:
                        urls_by_query[request['req']].add(link)

    # Emit once per query; the set de-duplicates URLs within this call.
    for query, links in urls_by_query.iteritems():
        joined_links = '\t'.join(links)
        digest = md5(query).hexdigest()
        yield Record(digest, '', '{}\t{}'.format(query, joined_links))

def summarize(key, recs):
    """Reducer: merge all records of one key into a single summary record.

    Each record's ``value`` is expected to be tab-separated: the query text
    first, then zero or more URLs (the format emitted by ``getData``).

    Args:
        key: the group key (unused; every output record gets key ``'0'``).
        recs: iterable of records exposing a ``value`` string.

    Yields:
        At most one ``Record`` with key ``'0'``, subkey
        ``str(1000000000 - n)`` and value ``n``, the query and the distinct
        URLs, tab-separated, where ``n`` is the number of input records.
        Nothing is yielded for an empty group; previously that case crashed
        on the unbound ``req``.
    """
    urls = set()
    req = None
    n = 0
    for r in recs:
        parts = r.value.split('\t')
        # The query text of the last record wins, as before.
        req = parts[0]
        urls.update(parts[1:])
        n += 1

    if not n:
        # Empty group: there is no query to report.
        return

    # The subkey decreases as n grows -- presumably so that an ascending
    # subkey sort lists the most frequent queries first (TODO confirm).
    # URLs are sorted so that repeated runs produce identical rows.
    yield Record('0', str(1000000000 - n), '{}\t{}\t{}'.format(n, req, '\t'.join(sorted(urls))))

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

