#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5


# Hosts treated as music sites: getData() only emits per-host and MUSIC
# counters for web results whose host appears in this list.
musichosts = [
    'ololo.fm',
    'muzofon.com',
    'zaycev.net',
    'vmusice.net',
    'plus-music.org',
    'music.yandex.ru',
    'audiopoisk.com',
    'get-tune.net',
    'mp3-pesnja.com',
    'mp3ostrov.com',
    'kibergrad.com',
    'muz-color.ru',
    'vmuzike.net',
    'music.nur.kz',
    'myzuka.org',
    'petamusic.ru',
    'x-minus.org',
    'moskva.fm',
    'supamusic.net',
    'mp3crazy.ru',
    'pesni-tut.net',
    'guzei.com',
    'muzbaron.com',
    'iplayer.fm',
    'poiskm.com',
    'zvukoff.ru',
    'mp3poisk.net',
    'megalyrics.ru',
]

def main():
    """Run the whole music-host statistics pipeline.

    For every daily ``user_sessions/<date>`` table in the hard-coded date
    range, the getData reducer appends raw counters to a temporary table,
    which is then collapsed in place by the summarize reducer.  After all
    days are processed the temporary table goes through presort and
    ``Limiter(500)`` and the result is written to the destination table.

    NOTE(review): the exact dates produced (e.g. whether the end date is
    included) depend on ``strdaterange``, and the meaning of ``Limiter(500)``
    depends on ``mymrutils``; neither is visible in this file.
    """
    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True,
    )
    result_table = 'likhomanov/music_3223'
    with mktmp() as scratch:
        days = strdaterange((2015, 7, 20), (2015, 7, 27))
        session_tables = ['user_sessions/{}'.format(day) for day in days]
        for session_table in session_tables:
            # Append this day's raw counters, then immediately re-aggregate
            # the scratch table so it holds one record per key again.
            MR.runReduce(getData, srcTable=session_table, dstTable=scratch, appendMode=True)
            MR.runReduce(summarize, srcTable=scratch, dstTable=scratch)
        MR.runCombine(presort, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Limiter(500), srcTable=scratch, dstTable=result_table)

def getData(key, recs):
    """Reducer: emit music-host counters for the requests of one session.

    ``recs`` is parsed with ``libra.ParseSession`` using ``blockstat.dict``.
    Only desktop (``TYandexWebRequest``, platform ``desk``) and touch
    (``TTouchYandexWebRequest``, platform ``touch``) requests whose
    ``ServiceDomRegion`` is ru/ua/by/kz are considered.

    Every emitted record has an empty subkey.  Keys are tab-separated
    ``platform, host-or-aggregate, kind``:

    * ``<platf>\\t<host>\\t<md5(url)>`` with value ``1\\t<url>`` -- one per
      shown music-host result whose inner path is not empty or ``/``;
    * ``<platf>\\t<host>\\tSNIP`` / ``<platf>\\tMUSIC\\tSNIP`` with value ``1``
      -- one per such result;
    * ``<platf>\\tALL\\tSNIP`` with the number of web results on the page and
      ``<platf>\\tALL\\tSERP`` with ``1`` -- once per request that has at
      least one web result;
    * ``<platf>\\tMUSIC\\tSERP`` and ``<platf>\\t<host>\\tSERP`` with ``1`` --
      once per request (and per distinct host) showing a music result.

    NameError/AttributeError/TypeError/ValueError are re-raised; any other
    exception is swallowed, which ends the generator for this session while
    keeping the records already yielded.
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if request.IsA('TYandexWebRequest'):
                platform = 'desk'
            elif request.IsA('TTouchYandexWebRequest'):
                platform = 'touch'
            else:
                continue
            if request.ServiceDomRegion not in ('ru', 'ua', 'by', 'kz'):
                continue

            web_results = 0
            hosts_on_page = set()
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                web_results += 1
                host = getHost(result.Url)
                # Skip non-music hosts and links to a site's root page.
                if host not in musichosts or getInnerPath(result.Url) in ('', '/', None):
                    continue
                hosts_on_page.add(host)
                url_key = '\t'.join((platform, host, md5(result.Url).hexdigest()))
                yield Record(url_key, '', '1\t' + result.Url)
                yield Record('\t'.join((platform, host, 'SNIP')), '', '1')
                yield Record(platform + '\tMUSIC\tSNIP', '', '1')

            if not web_results:
                continue
            yield Record(platform + '\tALL\tSNIP', '', str(web_results))
            yield Record(platform + '\tALL\tSERP', '', '1')
            if hosts_on_page:
                yield Record(platform + '\tMUSIC\tSERP', '', '1')
                for host in hosts_on_page:
                    yield Record('\t'.join((platform, host, 'SERP')), '', '1')
    except (NameError, AttributeError, TypeError, ValueError):
        # These exception types are re-raised instead of being swallowed.
        raise
    except Exception:
        # Best-effort: any other failure silently ends processing of this
        # session.
        pass

def summarize(key, recs):
    """Reducer: sum the counters of all records sharing ``key``.

    Each record value is either ``<n>`` or ``<n>\\t<url>``.  The counts are
    added up and a single record is emitted with an empty subkey and value
    ``<total>`` or ``<total>\\t<url>``; the URL is taken from the first
    record of the group.

    Args:
        key: the reduce key, passed through unchanged to the output record.
        recs: iterator over the records of this key (each has ``.value``).

    Yields:
        One ``Record(key, '', value)``, or nothing if ``recs`` is empty.
    """
    # Use a default so an empty group ends the generator explicitly instead
    # of leaking StopIteration out of it (a RuntimeError under PEP 479).
    first = next(recs, None)
    if first is None:
        return
    # partition() splits on the first tab only, so a tab inside the URL no
    # longer breaks unpacking -- the same tolerance the loop below has.
    count_field, _, url = first.value.partition('\t')
    total = int(count_field)
    for rec in recs:
        total += int(rec.value.partition('\t')[0])
    yield Record(key, '', str(total) + ('\t' + url if url else ''))

def presort(recs):
    """Combiner: re-key per-URL records so they can be ranked within a host.

    Record keys are ``<platform>\\t<host>\\t<kind>``.  Aggregate records
    (kind ``SERP`` or ``SNIP``) are passed through untouched.  Every other
    record is a per-URL counter whose value is ``<n>\\t<url>``; it is
    re-emitted with key ``<platform>\\t<host>``, subkey ``1000000000 - n``
    and the original value.

    NOTE(review): the inverted count presumably makes higher counts sort
    first within a key; if subkeys are compared as strings this only holds
    while all results have the same number of digits -- confirm against the
    MapReduce sort semantics.

    Args:
        recs: iterable of records with ``.key`` and ``.value``.

    Yields:
        The pass-through or re-keyed records, in input order.

    Raises:
        ValueError: if a key does not have exactly three tab-separated
            parts, or a per-URL value has no tab or a non-integer count.
    """
    for rec in recs:
        platf, host, kind = rec.key.split('\t')
        if kind in ('SERP', 'SNIP'):
            yield rec
            continue
        # Split on the first tab only: the count is everything before it and
        # the URL part may itself contain tabs.
        count, _ = rec.value.split('\t', 1)
        yield Record(platf + '\t' + host, str(1000000000 - int(count)), rec.value)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

