#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from collections import defaultdict
from hashlib import md5

def main():
    """Configure the MapReduce client and run getData over each daily table.

    Every run is given the same destination table name.
    NOTE(review): whether successive runs append to or replace that table
    depends on MR.runReduce, which is not visible here -- confirm.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True,
    )
    dst_table = 'likhomanov/bno_3101_3'
    # The temporary object is not referenced below; the context is kept as is.
    with mktmp():
        # Build the complete list of source tables before starting any job.
        src_tables = []
        for day in strdaterange((2015, 6, 12), (2015, 6, 13)):
            src_tables.append('user_sessions/{}'.format(day))
        for src_table in src_tables:
            MR.runReduce(getData, srcTable=src_table, dstTable=dst_table)


def getData(key, recs):
    """Reduce step: emit a sampled record per request with a BNO link block.

    The session in ``recs`` is parsed with libra.ParseSession.  A request is
    kept only if it is a 'TYandexWebRequest', its ServiceDomRegion is 'tr',
    it has at least one blockstat block with path '/snippet/bno/link', it
    passes a random ~1% sample, and one of its test ids equals 13919.

    For a kept request the position is read from the 'pos' var of the first
    BNO block and used to find the organic web result at that position.

    Args:
        key: incoming reduce key; not used.
        recs: records of one session, handed to libra.ParseSession.

    Yields:
        Record whose key is the md5 hex digest of "query<TAB>source" and
        whose value is "query<TAB>source<TAB>space-joined click paths".

    Raises:
        NameError, AttributeError, TypeError: re-raised so that programming
            errors stay visible.  Any other exception silently ends the
            processing of the current session.
    """
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            if not req.IsA('TYandexWebRequest'):
                continue
            if req.ServiceDomRegion != 'tr':
                continue
            bno = [x for x in req.GetBSBlocks() if x.Path == '/snippet/bno/link']
            if not bno:
                continue
            # Keep roughly 1% of the qualifying requests.
            if random.random() > 0.01:
                continue
            if not any(int(test.TestID) == 13919 for test in req.GetTestInfo()):
                continue
            # Reset for every request: otherwise a block without a 'pos' var
            # would reuse the position found for an earlier request in this
            # session (or raise UnboundLocalError on the first one).
            pos = None
            for v in bno[0].GetVars():
                if v[0] == 'pos':
                    # The first character of the value is dropped before
                    # converting the rest to int.
                    pos = int(v[1][1:])
                    break
            if pos is None:
                # No position recorded: the block cannot be matched to a result.
                continue
            # NOTE(review): if no result matches, [0] raises IndexError, which
            # the generic handler below swallows, ending this session early.
            res = [x for x in (x.GetMainResult() for x in req.GetMainBlocks())
                   if x.IsA("TOrganicResultProperties") and x.IsA("TWebResult") and x.Position == pos][0]
            clicks = [x.ConvertedPath for x in req.GetClicks()]
            out_key = md5('{}\t{}'.format(req.Query, res.Source)).hexdigest()
            yield Record(out_key, '', '{}\t{}\t{}'.format(req.Query, res.Source, ' '.join(clicks)))
    except (NameError, AttributeError, TypeError):
        raise
    except Exception:
        pass

def summarize(key, recs):
    """Reduce step: add up the two integer counters of all records of a key.

    Each record value must hold exactly four tab-separated fields:
    query, source and two integers.  The emitted query and source are the
    ones from the last record read.

    Assumes ``recs`` is non-empty; with no records the query and source
    names are never bound and building the output raises NameError.

    Yields:
        One Record with the same key and the value
        "query<TAB>source<TAB>sum of third field<TAB>sum of fourth field".
    """
    total_bno = 0
    total_clicks = 0
    for rec in recs:
        fields = rec.value.split('\t')
        # Unpacking enforces exactly four fields (ValueError otherwise).
        query, source, bno_part, clicks_part = fields
        total_bno = total_bno + int(bno_part)
        total_clicks = total_clicks + int(clicks_part)
    summary = '{}\t{}\t{}\t{}'.format(query, source, total_bno, total_clicks)
    yield Record(key, '', summary)

def premerge(rec):
    """Map step: re-key a record by the md5 hex digest of its first field.

    The value is split at the first tab only; a value without any tab
    fails the two-way unpacking with ValueError.  The value itself is
    passed through unchanged.
    """
    first_field, _rest = rec.value.split('\t', 1)
    digest = md5(first_field).hexdigest()
    yield Record(digest, '', rec.value)

def merge(key, recs):
    """Reduce step: aggregate the two counters per source into one line.

    Each record value must hold exactly four tab-separated fields:
    query, source and two integers.  The integers are summed separately
    for every distinct source.

    Assumes ``recs`` is non-empty; with no records the query name is never
    bound and building the output raises NameError.

    Yields:
        One Record with the same key whose value is the query (taken from
        the last record read) followed by one tab-separated
        "source: N shows, M clicks;" entry per source, ordered by source.
    """
    totals = {}
    for rec in recs:
        query, source, shows_part, clicks_part = rec.value.split('\t')
        shows, clicks = totals.get(source, (0, 0))
        totals[source] = (shows + int(shows_part), clicks + int(clicks_part))
    entries = ['{0}: {1[0]} shows, {1[1]} clicks;'.format(name, totals[name])
               for name in sorted(totals)]
    yield Record(key, '', '\t'.join([query] + entries))

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

