#!/usr/bin/env python2
# coding: utf-8

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5

def main():
    """Run the whole pipeline: collect, aggregate and publish the statistics.

    For every ``user_sessions/<date>`` table produced from ``strdaterange``
    the sessions are reduced with ``getData`` (appending to two temporary
    tables) and both temporary tables are immediately re-aggregated in place,
    so they do not keep growing with raw records between source tables.
    Afterwards the leading key component is removed with ``cleankey`` and the
    final tables are written:

    * ``likhomanov/music_3068``      -- output of ``Summarizer()``;
    * ``likhomanov/music_reqs_3068`` -- output of ``presort_reqs`` passed
      through ``Limiter(1000)`` (presumably a per-key cap on the number of
      records -- confirm against ``mymrutils``).
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True
    )
    stats_dst = 'likhomanov/music_3068'
    reqs_dst = 'likhomanov/music_reqs_3068'

    with mktmp() as stats_tmp, mktmp() as reqs_tmp:
        session_tables = [
            'user_sessions/{}'.format(day)
            for day in strdaterange((2015, 5, 27), (2015, 5, 30))
        ]
        for session_table in session_tables:
            # Table index 0 -> counters, table index 1 -> per-query records.
            MR.runReduce(
                getData,
                srcTable=session_table,
                dstTables=[stats_tmp, reqs_tmp],
                appendMode=True
            )
            # Collapse what has been accumulated so far, in place.
            MR.runReduce(Summarizer(), srcTable=stats_tmp, dstTable=stats_tmp)
            MR.runReduce(summarize_reqs, srcTable=reqs_tmp, dstTable=reqs_tmp)

        # Drop the first key component from both tables before the final pass.
        for tmp_table in (stats_tmp, reqs_tmp):
            MR.runMap(cleankey, srcTable=tmp_table, dstTable=tmp_table)

        MR.runReduce(Summarizer(), srcTable=stats_tmp, dstTable=stats_dst)
        MR.runReduce(presort_reqs, srcTable=reqs_tmp, dstTable=reqs_tmp)
        MR.runReduce(Limiter(1000), srcTable=reqs_tmp, dstTable=reqs_dst)


def getMarkers(req):
    """Return the music-intent markers present in a search query.

    The match is case-insensitive. Byte-string queries are decoded as UTF-8
    before lowercasing, because lowercasing a UTF-8 byte string only affects
    ASCII letters and would leave capitalised Cyrillic queries unmatched.

    Args:
        req: the query, either a UTF-8 encoded byte string or a text string.

    Returns:
        A list with zero or more of ``'listen'``, ``'song'`` and
        ``'download'``, always in that order.
    """
    if isinstance(req, bytes):
        # Undecodable bytes are replaced rather than raised on, so a single
        # malformed query cannot abort processing of the caller's input.
        req = req.decode('utf-8', 'replace')
    req = req.lower()

    marker_phrases = (
        ('listen', (u'слушать',)),
        ('song', (u'текст песни', u'слова песни', u'перевод песни')),
        ('download', (u'скачать песню',)),
    )
    return [
        marker
        for marker, phrases in marker_phrases
        if any(phrase in req for phrase in phrases)
    ]

def getData(key, recs):
    """Reducer: turn one parsed session into counter and query records.

    Only requests that are ``TYandexWebRequest`` ('web') or
    ``TTouchYandexWebRequest`` ('touch') and whose ``ServiceDomRegion`` is one
    of ru/ua/by/kz are counted. Every emitted key starts with
    ``<random 0..255>\\t<web|touch>\\t``.

    Records written to table 0 (value ``'1'``):
      * ``...ALLSERP``                 -- once per accepted request;
      * ``...<marker>\\tSERP``          -- once per marker of the query;
      * ``...<marker>\\t<host>\\tshow``  -- once per qualifying web result;
      * ``...<marker>\\t<host>\\tserp``  -- once per distinct host on the page.

    Records written to table 1:
      * ``...<marker>\\t<md5 of query>`` with value ``'1\\t<query>'``.

    Args:
        key: reduce key; not used by the body.
        recs: records of the group, handed to ``libra.ParseSession``.

    NameError, AttributeError and TypeError propagate; any other exception
    silently stops processing of the current group (records yielded before
    the failure are kept).
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if request.IsA('TYandexWebRequest'):
                platform = 'web'
            elif request.IsA('TTouchYandexWebRequest'):
                platform = 'touch'
            else:
                continue

            # A random leading component is put in front of every key; it is
            # stripped again later by cleankey.
            prefix = '\t'.join((str(random.randrange(256)), platform, ''))

            if request.ServiceDomRegion not in ('ru', 'ua', 'by', 'kz'):
                continue
            yield Record(prefix + 'ALLSERP', '', '1', tableIndex=0)

            markers = getMarkers(request.Query)
            if not markers:
                continue

            for marker in markers:
                marker_prefix = prefix + marker + '\t'
                yield Record(marker_prefix + 'SERP', '', '1', tableIndex=0)
                query_hash = md5(request.Query).hexdigest()
                yield Record(marker_prefix + query_hash, '', '1\t' + request.Query, tableIndex=1)

            seen_hosts = set()
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                # Count only web results whose URL has a non-trivial inner path.
                if result.IsA('TWebResult') and getInnerPath(result.Url) not in ('', '/', None):
                    host = getHost(result.Url)
                    seen_hosts.add(host)
                    for marker in markers:
                        yield Record(prefix + '\t'.join((marker, host, 'show')), '', '1', tableIndex=0)

            for host in seen_hosts:
                for marker in markers:
                    yield Record(prefix + '\t'.join((marker, host, 'serp')), '', '1', tableIndex=0)
    except (NameError, AttributeError, TypeError):
        # These indicate programming errors, so let them surface.
        raise
    except Exception:
        # Best effort: anything else just ends this group.
        pass


def summarize_reqs(key, recs):
    """Reducer: merge all ``'<count>\\t<query>'`` values of one key.

    The counts of every record in the group are added up; the query text is
    taken from the first record only.

    Args:
        key: reduce key, copied unchanged to the output record.
        recs: iterator over the records of the group; each ``value`` must be
            ``'<int>\\t<query>'``.

    Yields:
        A single ``Record(key, '', '<total>\\t<query>')``.
    """
    first_count, query = next(recs).value.split('\t', 1)
    total = int(first_count)
    total += sum(int(rec.value.split('\t', 1)[0]) for rec in recs)
    yield Record(key, '', '\t'.join((str(total), query)))

def presort_reqs(key, recs):
    """Reducer: total the counts of one query and re-key it by request type and marker.

    The incoming key must consist of exactly three tab-separated parts
    ``<rtype>\\t<marker>\\t<anything>``; the third part is discarded.

    Args:
        key: reduce key as described above.
        recs: iterator over the records of the group; each ``value`` must be
            ``'<int>\\t<query>'``. The query text comes from the first record.

    Yields:
        A single ``Record('<rtype>\\t<marker>', str(10**12 - total),
        '<total>\\t<query>')``. The second field decreases as the total grows
        (presumably so that a string sort on it puts frequent queries first).
    """
    head = next(recs)
    count_text, query = head.value.split('\t', 1)
    total = int(count_text)
    for rec in recs:
        count_text = rec.value.split('\t', 1)[0]
        total += int(count_text)

    rtype, marker, _ = key.split('\t')
    out_key = '\t'.join((rtype, marker))
    inverted = str(1000000000000 - total)
    yield Record(out_key, inverted, '\t'.join((str(total), query)))

def cleankey(rec):
    """Mapper: strip the first tab-separated component from the record key.

    The output record keeps the rest of the key and the original value; its
    second field is set to the empty string. A key without any tab raises
    ValueError.
    """
    _prefix, remainder = rec.key.split('\t', 1)
    stripped = Record(remainder, '', rec.value)
    yield stripped

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

