#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Host whitelist: one entry per line of the local 'musichosts' file, with
# surrounding whitespace stripped. getMusic() keeps only results whose host
# is in this set. The file is opened in a `with` block so the handle is
# closed deterministically instead of leaking until garbage collection.
with open('musichosts') as _hosts_file:
    mhosts = set(line.strip() for line in _hosts_file)

def main():
    """Run the whole pipeline: map the daily logs, merge, reduce, sort.

    Steps, in order:
      1. getMusic is mapped over reqans_log tables for 2014-07-22..24 and
         its output appended to a temporary table (sorted after each day).
      2. The 'likhomanov/music_urls' table is appended to the same
         temporary table through the `cat` mapper.
      3. getStats reduces the temporary table into two destination
         tables, which are then sorted.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True, files=['musichosts'])
    kubr_table = 'likhomanov/musicstats_kubr'
    tr_table = 'likhomanov/musicstats_tr'
    with mktmp() as scratch:
        for day in range(22, 25):
            log_table = 'reqans_log/201407{:02}'.format(day)
            MR.runMap(getMusic, srcTable=log_table, dstTable=scratch.name, appendMode=True)
            # NOTE(review): the sort runs once per day and is not repeated
            # after the music_urls append below -- confirm this is intended.
            mrsort(scratch)
        MR.runMap(cat, srcTable='likhomanov/music_urls', dstTable=scratch.name, appendMode=True)
        MR.runReduce(getStats, srcTable=scratch.name, dstTables=[kubr_table, tr_table])
        for result_table in (kubr_table, tr_table):
            mrsort(result_table)

def getMusic(rec):
    """Map step: emit one record per search result hosted on a listed host.

    The log record's value is parsed into a request and its results. Only
    requests whose serpLang() is 'ru', 'ua', 'by' or 'kz' (marker 'k') or
    'com.tr' (marker 't') are processed; any other language yields nothing.

    Yields:
        Record(url, marker, '') for every result whose host is in mhosts.
    """
    request, results = parseReqans(rec.value)
    checkwww(request)  # return value unused; presumably validates/raises -- TODO confirm
    serp_lang = serpLang(request)
    if serp_lang == 'com.tr':
        marker = 't'
    elif serp_lang in ('ru', 'ua', 'by', 'kz'):
        marker = 'k'
    else:
        return
    for result in results:
        link = result.get('url')
        if getHost(link) in mhosts:
            yield Record(link, marker, '')

def getStats(url, recs):
    """Reduce step: count 'k' and 't' records for one url.

    Records with subkey 'k' or 't' are counted separately. Any record with
    a different subkey marks the url as a music url (presumably these come
    from the music_urls table -- verify against main()). Urls without such
    a marker produce no output.

    Yields:
        For each non-zero count, Record(url, str(10000000 - count),
        str(count)) routed to tableIndex 0 for 'k' and 1 for 't'.
    """
    kubr_hits = 0
    tr_hits = 0
    is_music = False
    for rec in recs:
        tag = rec.subkey
        if tag == 'k':
            kubr_hits += 1
        elif tag == 't':
            tr_hits += 1
        else:
            is_music = True
    if not is_music:
        return
    # Position in the tuple doubles as the destination table index.
    for table_index, hits in enumerate((kubr_hits, tr_hits)):
        if hits:
            yield Record(url, str(10000000 - hits), str(hits), tableIndex=table_index)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

