#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Set of host names that getData keeps results for: one entry per line of
# the local file 'mp3_hosts' (the same file name is passed in files=[...] to
# MR.useDefaults in main).  The file is opened in a with-block so the handle
# is closed deterministically instead of being left to the garbage collector.
with open('mp3_hosts') as _hosts_file:
    hosts = set(line.strip() for line in _hosts_file)

def main():
    """Run the whole pipeline and write the three 'goodmp3_kubr' result tables.

    Steps, in order:
      1. copy 'gurrrik/htmls_with_good_mp3s_2' into a temporary url table;
      2. append the getData output of every 'reqans_log/<day>' table in the
         configured date range to that table;
      3. reduce it with join into per-url, per-host and per-request tables;
      4. aggregate / limit each of them into its 'likhomanov/goodmp3_kubr_*'
         destination table.

    mktmp, cat, strdaterange and Limiter are not defined in this file
    (presumably they come from the mymrutils star import), so their exact
    semantics cannot be stated here.
    """
    MR.useDefaults(username='snippets',
                   server='cedar00.search.yandex.net:8013',
                   verbose=True,
                   files=['mp3_hosts'])

    first_day = (2014, 10, 12)
    last_day = (2014, 10, 14)
    # Built eagerly, before any job starts.
    log_tables = ['reqans_log/{}'.format(day) for day in strdaterange(first_day, last_day)]

    with mktmp() as tmpurls, mktmp() as tmphosts, mktmp() as tmpreqs:
        # Seed the url table, then append the log hits next to the seed records.
        MR.runMap(cat, srcTable='gurrrik/htmls_with_good_mp3s_2', dstTable=tmpurls)
        for log_table in log_tables:
            MR.runMap(getData, srcTable=log_table, dstTables=[tmpurls, tmphosts], appendMode=True)

        # Disabled alternative kept from the original script:
        #MR.runReduce(Summarizer(), srcTable=tmphosts, dstTable='likhomanov/allmp3_tr_alldocs')

        # join writes urls to table 0, host counts to table 1, requests to table 2.
        MR.runReduce(join, srcTable=tmpurls, dstTables=[tmpurls, tmphosts, tmpreqs])

        # Urls.
        MR.runReduce(Limiter(5000), srcTable=tmpurls, dstTable='likhomanov/goodmp3_kubr_urls')

        # Hosts: sum the per-url counts first, then limit.
        MR.runReduce(summarize, srcTable=tmphosts, dstTable=tmphosts)
        MR.runReduce(Limiter(101), srcTable=tmphosts, dstTable='likhomanov/goodmp3_kubr_hosts')

        # Requests: count duplicates first, then limit.
        MR.runReduce(sum_reqs, srcTable=tmpreqs, dstTable=tmpreqs)
        MR.runReduce(Limiter(5000), srcTable=tmpreqs, dstTable='likhomanov/goodmp3_kubr_reqs')


def getData(rec):
    """Map one reqans log record to records for results on known hosts.

    rec.value is parsed with parseReqans into the request and the list of
    its results.  Requests whose serpLang is not one of 'ru', 'ua', 'by',
    'kz' produce nothing.  For every result that contains a 'snippets_type'
    field and whose host (getHost of the result's 'url') is in the
    module-level ``hosts`` set, one Record(url, host, req['req']) is emitted
    to output table 0.

    parseReqans, checkwww, serpLang and getHost are defined outside this
    file; their behaviour is not documented here.
    """
    req, ress = parseReqans(rec.value)
    checkwww(req)
    lang = serpLang(req)
    if lang not in ('ru', 'ua', 'by', 'kz'):
        return

    # Lazy filter: results without a 'snippets_type' field are skipped.
    snippet_results = (res for res in ress if 'snippets_type' in res)
    for res in snippet_results:
        url = res.get('url')
        host = getHost(url)
        if host in hosts:
            yield Record(url, host, req['req'], tableIndex=0)

def join(key, recs):
    """Reduce one url group: keep it only if it starts with an empty-subkey record.

    The first record of the group is consumed as a marker: unless its subkey
    is '', the whole group is dropped.  (Presumably the marker is the record
    copied from the good-mp3 table and it sorts before the log hits, whose
    subkey is the host -- this relies on the reducer input order.)

    Every remaining record is treated as one hit of the url ``key``:
      * table 2 gets Record(md5(value).hexdigest(), '', value) per hit;
      * if there was at least one hit, table 1 gets the hit count under the
        subkey of the last hit and under 'ALL', and table 0 gets the url
        under 'top' (subkey 1000000000 - count) and under 'rnd' (subkey
        str(random.random())).

    An empty group yields nothing.
    """
    # next() with a default: a bare next() on an empty iterator would raise
    # StopIteration inside this generator, which only ends it silently on
    # interpreters without PEP 479 semantics.
    marker = next(recs, None)
    if marker is None or marker.subkey != '':
        return

    hits = 0
    last_subkey = None
    for hit in recs:
        hits += 1
        last_subkey = hit.subkey
        yield Record(md5(hit.value).hexdigest(), '', hit.value, tableIndex=2)

    if hits:
        count = str(hits)
        yield Record(last_subkey, '', count, tableIndex=1)
        yield Record('ALL', '', count, tableIndex=1)
        yield Record('top', str(1000000000 - hits), key, tableIndex=0)
        yield Record('rnd', str(random.random()), key, tableIndex=0)

def summarize(key, recs):
    """Reduce: add up the integer values of one key and emit a single row.

    The emitted Record has key '0', subkey str(1000000000 - total) and value
    '<total>\\t<key>'.
    """
    total = sum(int(rec.value) for rec in recs)
    inverted = str(1000000000 - total)
    yield Record('0', inverted, '\t'.join((str(total), key)))

def sum_reqs(key, recs):
    """Reduce: count the records of one group and emit 'top' and 'rnd' rows.

    The value of the first record is taken as the request text; the other
    records are only counted.  Two Records are emitted:
      * 'top' with subkey str(1000000000 - count) and value '<count>\\t<req>';
      * 'rnd' with subkey str(random.random()) and value '<req>'.

    ``key`` itself is not used.  An empty group yields nothing.
    """
    # next() with a default: a bare next() on an empty iterator would raise
    # StopIteration inside this generator, which only ends it silently on
    # interpreters without PEP 479 semantics.
    first = next(recs, None)
    if first is None:
        return

    req = first.value
    n = 1 + sum(1 for _ in recs)
    yield Record('top', str(1000000000 - n), str(n) + '\t' + req)
    yield Record('rnd', str(random.random()), req)

# Start the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

