#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
import itertools
from collections import Counter
from hashlib import md5

def main():
    """Drive the whole job: collect per-day statistics, then merge them.

    For every daily ``user_sessions/<date>`` table, ``getData`` is run as a
    reduce and its output is appended to the destination table.  After each
    source whose name ends in ``0`` or ``5`` the destination is compacted in
    place with ``summarize``.  Once all days are processed, ``drop_pfx`` and
    ``serp_sum`` are run over the destination to produce the final rows.

    One line per processed source table is written to the local file
    ``log_3403``.
    """
    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True,
    )
    DST = 'likhomanov/readmore'
    with open('log_3403', 'w') as logf:
        # A wider run was previously used here:
        #   strdaterange((2015, 9, 1), (2015, 10, 1))
        sources = [
            'user_sessions/{}'.format(d)
            for d in strdaterange((2015, 9, 26), (2015, 10, 1))
        ]
        for src in sources:
            MR.runReduce(getData, srcTable=src, dstTable=DST, appendMode=True)
            print >>logf, src + ' read'
            # Compact the accumulated table after sources ending in 0 or 5.
            # NOTE(review): if the last source does not end in 0 or 5, its raw
            # records reach the final steps unsummarized - confirm the date
            # range always ends on such a day.
            if src.endswith(('0', '5')):
                MR.runReduce(summarize, srcTable=DST, dstTable=DST)
        # Final pass: strip the random shard prefix, then total the serp_ rows.
        MR.runCombine(drop_pfx, srcTable=DST, dstTable=DST)
        MR.runReduce(serp_sum, srcTable=DST, dstTable=DST)

def getData(key, recs):
    """Reduce callback: emit "read more" click statistics for each request.

    The records of one key are parsed with ``libra.ParseSession``.  Only
    requests that are ``TTouchYandexWebRequest`` with ``ServiceDomRegion``
    equal to ``'ru'`` are considered.  For each such request four records
    are yielded:

    * ``<rand>\\tserp_click`` - ``repr(Counter({more_clicks: 1}))``
    * ``<rand>\\tserp_frac``  - ``repr(Counter({share: 1}))``
    * ``<md5(query)>\\treq_click`` - ``repr(more_clicks)\\t1\\t<query>``
    * ``<md5(query)>\\treq_frac``  - ``repr(share)\\t1\\t<query>``

    where ``more_clicks`` is the number of request clicks whose
    ``ConvertedPath`` is ``'/web/item/more'`` and ``share`` is that number
    divided by (clicks on ``TWebResult`` main results + ``more_clicks``),
    or ``0.0`` when the denominator is zero.  ``<rand>`` is a random integer
    in ``[0, 1000)``.

    ``key`` is not used.

    NameError, AttributeError, TypeError and ValueError propagate; any other
    ``Exception`` ends processing of this key silently (records yielded
    before the failure are kept).
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            is_wanted = (request.IsA('TTouchYandexWebRequest')
                         and request.ServiceDomRegion == 'ru')
            if not is_wanted:
                continue

            main_results = (block.GetMainResult()
                            for block in request.GetMainBlocks())
            web_clicks = sum(len(result.GetClicks())
                             for result in main_results
                             if result.IsA('TWebResult'))
            more_clicks = sum(1 for click in request.GetClicks()
                              if click.ConvertedPath == '/web/item/more')
            total_clicks = web_clicks + more_clicks
            if total_clicks:
                share = float(more_clicks) / total_clicks
            else:
                share = 0.0

            # Random prefix spreads the serp_ records over up to 1000 keys
            # (presumably to keep any single reduce key small).
            shard = str(random.randrange(1000)) + '\t'
            yield Record(shard + 'serp_click', '', repr(Counter({more_clicks: 1})))
            yield Record(shard + 'serp_frac', '', repr(Counter({share: 1})))

            # Per-query records are keyed by the md5 of the query text; the
            # literal "1" marks them as raw (not yet summarized) records.
            query_id = md5(request.Query).hexdigest() + '\t'
            yield Record(query_id + 'req_click', '', repr(more_clicks) + '\t1\t' + request.Query)
            yield Record(query_id + 'req_frac', '', repr(share) + '\t1\t' + request.Query)
    except (NameError, AttributeError, TypeError, ValueError):
        # Likely programming errors: let them surface.
        raise
    except Exception:
        # NOTE(review): every other failure is dropped without a trace -
        # consider at least counting or logging these.
        pass

def summarize(key, recs):
    """Reduce callback: collapse all records of one key into at most one.

    ``key`` has the form ``<prefix>\\t<type>``.

    * For ``serp_*`` types every value is the ``repr`` of a ``Counter``;
      the counters are added up and a single record with the total is
      yielded.
    * For the other types every value is ``<v>\\t<k>\\t<query>``.  Records
      with ``k == 1`` are raw records from ``getData``; records with another
      ``k`` are earlier summaries (sum ``v`` over ``k`` observations).  Raw
      records are folded into the totals only when at least three of them
      are present for this key in this pass; otherwise they are discarded
      as noise.  A record ``<sum>\\t<count>\\t<query>`` is yielded when the
      resulting count is non-zero, using the query text of the last record
      seen.
    """
    kind = key.split('\t', 1)[1]

    if kind.startswith('serp_'):
        merged = Counter()
        for rec in recs:
            # NOTE(review): eval() on table contents - safe only while the
            # table is written exclusively by this job.
            merged += eval(rec.value)
        yield Record(key, '', repr(merged))
        return

    # Integer zeros on purpose: integer inputs must keep an integer repr.
    total_sum = 0
    total_cnt = 0
    raw_sum = 0
    raw_cnt = 0
    for rec in recs:
        value_repr, count_text, query = rec.value.split('\t', 2)
        # NOTE(review): eval() on table contents, see above.
        value = eval(value_repr)
        count = int(count_text)
        if count != 1:
            # Previously summarized record.
            total_sum += value
            total_cnt += count
        else:
            # Raw record from getData.
            raw_sum += value
            raw_cnt += 1

    if raw_cnt >= 3:    # non-noise query
        total_cnt += raw_cnt
        total_sum += raw_sum

    if not total_cnt:
        return
    yield Record(key, '', '{!r}\t{}\t{}'.format(total_sum, total_cnt, query))

def drop_pfx(recs):
    """Combine callback: strip the key prefix from every non-``req_`` record.

    Keys have the form ``<prefix>\\t<type>``.  Records whose type starts with
    ``req_`` keep their full key; all others are re-keyed by the type alone.
    Subkey is emitted empty and the value is passed through unchanged.
    """
    for rec in recs:
        kind = rec.key.split('\t', 1)[1]
        # req_ records keep the full key to avoid monsters for the last reduce
        new_key = rec.key if kind.startswith('req_') else kind
        yield Record(new_key, '', rec.value)

def serp_sum(key, recs):
    """Reduce callback for the final pass.

    * Keys without a tab: every value is the ``repr`` of a ``Counter``; the
      counters are summed and one record with the total is yielded under
      the same key.
    * Keys containing a tab (``<prefix>\\t<type>``): each record is passed
      through with the prefix removed from its key.
    """
    if '\t' not in key:
        total = Counter()
        for rec in recs:
            # NOTE(review): eval() on table contents - safe only while the
            # table is written exclusively by this job.
            total += eval(rec.value)
        yield Record(key, '', repr(total))
        return

    kind = key.split('\t', 1)[1]
    for rec in recs:
        yield Record(kind, '', rec.value)

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

