#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5

def main():
    """Drive the whole blog-statistics MapReduce pipeline.

    For every daily ``user_sessions/<date>`` table in the range returned by
    ``strdaterange((2015, 5, 20), (2015, 5, 23))`` the sessions are reduced
    with ``getData`` into two outputs (both in append mode): a temporary
    table with per-query / per-url samples and the statistics table.  Both
    are re-aggregated after each day.

    Once all days are processed, the first key component is stripped from the
    statistics keys (``cleankey``) and the table is aggregated once more,
    while the sample table goes through ``presort`` and ``Limiter(3000)``
    into the final destination table.

    ``mktmp``, ``strdaterange``, ``Summarizer`` and ``Limiter`` come from
    ``mymrutils``; their exact semantics are not visible here (e.g. whether
    the end date is inclusive, or what exactly ``Limiter(3000)`` caps --
    presumably records per key; TODO confirm).
    """
    MR.useDefaults(username='snippets',
                   server='cedar00.search.yandex.net:8013',
                   files=['blockstat.dict'],
                   verbose=True)

    samples_table = 'likhomanov/blogs_3032'
    stats_table = 'likhomanov/blogs_3032_stat_2'

    with mktmp() as scratch:
        daily_tables = ['user_sessions/{}'.format(day)
                        for day in strdaterange((2015, 5, 20), (2015, 5, 23))]

        for daily in daily_tables:
            # Output 0 -> scratch samples, output 1 -> statistics counters.
            MR.runReduce(getData, srcTable=daily,
                         dstTables=[scratch, stats_table], appendMode=True)
            # Collapse after every day so the tables hold one record per key.
            MR.runReduce(summarize, srcTable=scratch, dstTable=scratch)
            MR.runReduce(Summarizer(), srcTable=stats_table, dstTable=stats_table)

        # Drop the leading key component of the counters, then merge them.
        MR.runMap(cleankey, srcTable=stats_table, dstTable=stats_table)
        MR.runReduce(Summarizer(), srcTable=stats_table, dstTable=stats_table)

        # Re-key the samples for ordering and write the limited result.
        MR.runMap(presort, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Limiter(3000), srcTable=scratch, dstTable=samples_table)

def checkHost(url):
    """Classify *url* as belonging to one of the tracked blog platforms.

    Returns ``'li'`` for the host ``liveinternet.ru`` (exact match only),
    ``'lj'`` for ``livejournal.com`` and its subdomains, ``'diary'`` for
    ``diary.ru`` and its subdomains, and ``None`` otherwise.

    URLs for which ``getInnerPath(url)`` is ``None``, ``''`` or ``'/'`` are
    rejected with ``None`` before the host is inspected.  The host is
    compared case-insensitively.  ``getInnerPath`` and ``getHost`` are
    helpers from ``mymrutils``.
    """
    if getInnerPath(url) in (None, '', '/'):
        return None

    host = getHost(url).lower()
    if host == 'liveinternet.ru':
        return 'li'

    # Platforms where per-user subdomains count as well.
    for domain, tag in (('livejournal.com', 'lj'), ('diary.ru', 'diary')):
        if host == domain or host.endswith('.' + domain):
            return tag
    return None

def getData(key, recs):
    """Reducer over one user session: emit blog samples and SERP counters.

    The session records are parsed with ``libra.ParseSession`` using
    ``'blockstat.dict'``.  Only ``TYandexWebRequest`` (tagged ``'web'``) and
    ``TTouchYandexWebRequest`` (tagged ``'touch'``) requests whose
    ``ServiceDomRegion`` is one of ru/ua/by/kz are considered.

    Output table 0 (samples): for every main block whose main result is a
    ``TWebResult`` on a host accepted by ``checkHost``, two records with the
    value ``'1\\t<text>'``:
      * key ``<type>\\tquery\\t<host>\\t<md5 of query>``, text = the query;
      * key ``<type>\\turl\\t<host>\\t<md5 of url>``, text = the result URL.

    Output table 1 (counters): for every accepted request a ``'serp'``
    record, plus -- when at least one blog result was seen -- a ``'blogs'``
    record and one record per distinct blog host.  Counter keys look like
    ``<random 0..255>\\t<type>\\t<name>`` with value ``'1'``; the random
    leading component presumably spreads hot keys over reducers (it is the
    part ``cleankey`` strips later).

    ``key`` is unused.  ``NameError``, ``AttributeError`` and ``TypeError``
    propagate; any other ``Exception`` silently stops processing of the
    session (records already yielded are kept).
    """
    regions = ('ru', 'ua', 'by', 'kz')
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            platform = None
            if request.IsA('TYandexWebRequest'):
                platform = 'web'
            elif request.IsA('TTouchYandexWebRequest'):
                platform = 'touch'
            if platform is None:
                continue
            if request.ServiceDomRegion not in regions:
                continue

            seen_hosts = {}
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                blog = checkHost(result.Url) if result.IsA('TWebResult') else None
                if not blog:
                    continue
                seen_hosts[blog] = True

                # Keys are built right before each yield so that a failure in
                # the second one cannot suppress the first record.
                query_key = platform + '\t' + 'query\t{}\t{}'.format(
                    blog, md5(request.Query).hexdigest())
                yield Record(query_key, '', '1\t' + request.Query, tableIndex=0)

                url_key = platform + '\t' + 'url\t{}\t{}'.format(
                    blog, md5(result.Url).hexdigest())
                yield Record(url_key, '', '1\t' + result.Url, tableIndex=0)

            shard = str(random.randrange(256)) + '\t' + platform + '\t'
            counters = ['serp']
            if seen_hosts:
                counters.append('blogs')
                counters.extend(seen_hosts)
            for counter in counters:
                yield Record(shard + counter, '', '1', tableIndex=1)
    except (NameError, AttributeError, TypeError):
        # Programming errors must not be hidden by the catch-all below.
        raise
    except Exception:
        # Best effort: anything else just ends this session quietly.
        pass

def summarize(key, recs):
    """Reducer: merge all records of one key into a single record.

    Every value has the form ``'<count>\\t<payload>'``.  The counts of all
    records are added up; the payload of the first record is kept.  Yields
    one ``Record(key, '', '<total>\\t<payload>')``.

    ``recs`` must be an iterator (it is advanced with ``next``).
    """
    first = next(recs)
    head, payload = first.value.split('\t', 1)
    total = int(head) + sum(int(other.value.split('\t', 1)[0]) for other in recs)
    yield Record(key, '', str(total) + '\t' + payload)

def cleankey(rec):
    """Mapper: drop the first tab-separated component of the record key.

    Yields ``Record(<rest of key>, '', rec.value)``.  The key must contain at
    least one tab, otherwise the unpacking raises ``ValueError``.
    """
    _shard, stat_key = rec.key.split('\t', 1)
    yield Record(stat_key, '', rec.value)

def presort(rec):
    """Mapper: re-key a sample record so that it can be ordered by count.

    Input: key of exactly four tab-separated parts (the last one is
    discarded) and value ``'<count>\\t<payload>'``.

    Output: key made of the first three key parts, second ``Record`` field
    set to ``str(10000000000 - count)`` and value ``'<count>\\t<payload>'``.
    Larger counts therefore get a smaller number in the second field --
    presumably so that an ascending sort on it lists the most frequent
    entries first (TODO confirm how the field is used by the next stage).
    """
    count_text, payload = rec.value.split('\t', 1)
    req_type, rec_type, blog, _digest = rec.key.split('\t')
    count = int(count_text)
    group_key = '\t'.join((req_type, rec_type, blog))
    yield Record(group_key, str(10000000000 - count), str(count) + '\t' + payload)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

