#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5


def main():
    """Run the whole pipeline: collect forum URLs/hosts and write the results.

    Steps, in order:
      1. For every day returned by strdaterange((2015, 2, 1), (2015, 2, 8)),
         reduce 'user_sessions/<day>' with getOld, appending URL records to
         one temporary table and host counters to another, then re-reduce
         both temporary tables in place (Summarizer for hosts, Limiter(1)
         for URLs).
      2. Copy both temporary tables to the destination tables.
      3. For every day returned by strdaterange((2015, 6, 20), (2015, 6, 28)),
         do the same with getNew, re-reducing the URL table with cmb.
      4. Overwrite the destination host table with prepareHosts output and
         sort it; overwrite the destination URL table with getDiff output.

    NOTE(review): Summarizer, Limiter, mktmp and strdaterange come from
    mymrutils; their exact semantics (e.g. whether the date range end is
    inclusive) are not visible here -- confirm against that module.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True
    )
    urls_table = 'likhomanov/forums_3218'
    hosts_table = 'likhomanov/forums_3218_hosts'

    def session_tables(first, last):
        # Materialize the list of per-day session table names up front.
        return ['user_sessions/{}'.format(day) for day in strdaterange(first, last)]

    with mktmp() as tmp_urls, mktmp() as tmp_hosts:
        # Old period: gather forum-snippet URLs and per-host counters.
        for sessions in session_tables((2015, 2, 1), (2015, 2, 8)):
            MR.runReduce(getOld, srcTable=sessions, dstTables=[tmp_urls, tmp_hosts], appendMode=True)
            MR.runReduce(Summarizer(), srcTable=tmp_hosts, dstTable=tmp_hosts)
            MR.runReduce(Limiter(1), srcTable=tmp_urls, dstTable=tmp_urls)

        # Snapshot of the old-period state; both tables are overwritten below.
        MR.copyTable(srcTable=tmp_urls, dstTable=urls_table)
        MR.copyTable(srcTable=tmp_hosts, dstTable=hosts_table)

        # New period: add new observations and fold them into the URL table.
        for sessions in session_tables((2015, 6, 20), (2015, 6, 28)):
            MR.runReduce(getNew, srcTable=sessions, dstTables=[tmp_urls, tmp_hosts], appendMode=True)
            MR.runReduce(Summarizer(), srcTable=tmp_hosts, dstTable=tmp_hosts)
            MR.runReduce(cmb, srcTable=tmp_urls, dstTable=tmp_urls)

        # Final outputs.
        MR.runCombine(prepareHosts, srcTable=tmp_hosts, dstTable=hosts_table)
        MR.sortTable(hosts_table)
        MR.runCombine(getDiff, srcTable=tmp_urls, dstTable=urls_table)

def getOld(key, recs):
    """Reducer: emit records for web results whose snippet type has 'forum'.

    The records are parsed with libra.ParseSession(recs, 'blockstat.dict').
    For every TYandexWebRequest, each main block's main result that is a
    TWebResult with 'forum' in its SnippetType produces two records:

      * table 1: key 'old\\t<host>' (host from getHost), empty subkey,
        value '1';
      * table 0: key md5 hex digest of the URL, subkey 'old', value the URL.

    `key` is not used by the body.

    Error handling: NameError, AttributeError, TypeError and ValueError are
    re-raised; any other Exception silently ends processing of this input
    (records already yielded stay emitted).
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if request.IsA('TYandexWebRequest'):
                for block in request.GetMainBlocks():
                    result = block.GetMainResult()
                    if result.IsA('TWebResult') and 'forum' in result.SnippetType:
                        url = result.Url
                        yield Record('old\t' + getHost(url), '', '1', tableIndex=1)
                        yield Record(md5(url).hexdigest(), 'old', url, tableIndex=0)
    except (NameError, AttributeError, TypeError, ValueError):
        # Let these error types propagate instead of being swallowed below.
        raise
    except Exception:
        # Best effort: any other failure just stops this input.
        pass

def getNew(key, recs):
    """Reducer: emit records for every web result, marking forum snippets.

    The records are parsed with libra.ParseSession(recs, 'blockstat.dict').
    For every TYandexWebRequest, each main block's main result that is a
    TWebResult produces:

      * if 'forum' is in its SnippetType: a table-1 record with key
        'new\\t<host>' (host from getHost), empty subkey, value '1',
        followed by a table-0 record with subkey 'newforum';
      * otherwise: only a table-0 record with subkey 'newother'.

    Table-0 records are keyed by the md5 hex digest of the URL and carry
    the URL as value. `key` is not used by the body.

    Error handling: NameError, AttributeError, TypeError and ValueError are
    re-raised; any other Exception silently ends processing of this input
    (records already yielded stay emitted).
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if request.IsA('TYandexWebRequest'):
                for block in request.GetMainBlocks():
                    result = block.GetMainResult()
                    if result.IsA('TWebResult'):
                        is_forum = 'forum' in result.SnippetType
                        url = result.Url
                        if is_forum:
                            # Host counter goes out before the URL record.
                            yield Record('new\t' + getHost(url), '', '1', tableIndex=1)
                        tag = 'newforum' if is_forum else 'newother'
                        yield Record(md5(url).hexdigest(), tag, url, tableIndex=0)
    except (NameError, AttributeError, TypeError, ValueError):
        # Let these error types propagate instead of being swallowed below.
        raise
    except Exception:
        # Best effort: any other failure just stops this input.
        pass

def cmb(key, recs):
    """Reducer: collapse all records of one key into at most one record.

    The set of subkeys seen among `recs` decides the outcome:

      * 'newforum' present                      -> nothing is emitted;
      * 'both' present, or 'old' and 'newother' -> Record(key, 'both', value);
      * 'old' present (and none of the above)   -> Record(key, 'old', value);
      * anything else                           -> nothing is emitted.

    The emitted value is the value of the last record iterated.
    """
    tags = set()
    last_value = None
    for record in recs:
        tags.add(record.subkey)
        last_value = record.value

    if 'newforum' in tags:
        return

    has_old = 'old' in tags
    if 'both' in tags or (has_old and 'newother' in tags):
        status = 'both'
    elif has_old:
        status = 'old'
    else:
        return
    yield Record(key, status, last_value)

def prepareHosts(recs):
    """Re-key host records from '<type>\\t<host>' to key=<host>, subkey=<type>.

    Each record key is split on tab and must yield exactly two fields
    (otherwise the unpacking raises ValueError). The value is passed through
    unchanged.
    """
    for record in recs:
        fields = record.key.split('\t')
        kind, hostname = fields
        yield Record(hostname, kind, record.value)

def getDiff(recs):
    """Keep only records whose subkey is 'both', re-keyed for output.

    Every kept record is emitted with key '0', a subkey of
    str(random.random()), and the original value. Other records are dropped.

    NOTE(review): the random subkey presumably serves to shuffle the rows
    when the table is ordered by subkey -- confirm with the consumer.
    """
    for record in recs:
        if record.subkey != 'both':
            continue
        random_subkey = str(random.random())
        yield Record('0', random_subkey, record.value)

# Run the pipeline only when this file is executed as a script (not on import).
if __name__ == '__main__':
    main()

