#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from collections import defaultdict
from hashlib import md5

def main():
    """Run the two-stage MapReduce job and write the result table.

    For every date string produced by ``strdaterange`` the ``getData``
    reducer is run over the matching ``user_sessions/<date>`` table, its
    output is appended to a temporary table, and that table is sorted.
    Once all dates are processed, the ``merge`` reducer turns the
    temporary table into the destination table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True,
    )
    destination = 'likhomanov/bno_3101'
    with mktmp() as scratch:
        session_tables = [
            'user_sessions/{}'.format(day)
            for day in strdaterange((2015, 6, 11), (2015, 6, 15))
        ]
        for table in session_tables:
            MR.runReduce(getData, srcTable=table, dstTable=scratch, appendMode=True)
            # The temporary table is re-sorted after every appended day.
            mrsort(scratch)
        MR.runReduce(merge, srcTable=scratch, dstTable=destination)


def getData(key, recs):
    """Reducer: emit 'bno' markers and per-source result stats per query.

    The records are parsed with ``libra.ParseSession``. Only requests that
    are ``TYandexWebRequest``, have ``ServiceDomRegion == 'tr'`` and carry
    test id 13919 are considered. For each such request the output key is
    the MD5 hex digest of the query text, and the following is yielded:

    * a ``'bno'`` record (empty value) when the request has at least one
      blockstat block with path ``/snippet/bno/link``;
    * a ``'stat'`` record whose value is the query followed by one
      tab-separated ``"<source> <result count> <click count>"`` entry per
      distinct source among the organic web results.

    The incoming ``key`` is not used for the output.

    NameError, AttributeError and TypeError are re-raised; any other
    exception silently stops processing of the remaining requests.
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            irrelevant = (
                not request.IsA('TYandexWebRequest')
                or request.ServiceDomRegion != 'tr'
                or not any(int(info.TestID) == 13919 for info in request.GetTestInfo())
            )
            if irrelevant:
                continue

            bno_links = [blk for blk in request.GetBSBlocks()
                         if blk.Path == '/snippet/bno/link']
            digest = md5(request.Query).hexdigest()
            if bno_links:
                yield Record(digest, 'bno', '')

            # Keep only main results that are both organic and web results.
            organic = []
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if result.IsA("TOrganicResultProperties") and result.IsA("TWebResult"):
                    organic.append(result)

            distinct_sources = set(item.Source for item in organic)
            fields = [request.Query]
            for source in distinct_sources:
                matching = [item for item in organic if item.Source == source]
                click_total = sum(len(item.GetClicks()) for item in matching)
                fields.append('{} {} {}'.format(source, len(matching), click_total))
            yield Record(digest, 'stat', '\t'.join(fields))
    except (NameError, AttributeError, TypeError):
        # These indicate programming mistakes, so they must stay visible.
        raise
    except Exception:
        # Best effort: anything else ends this call without failing the job.
        pass

def merge(key, recs):
    """Reducer: total shows and clicks per source for a key marked 'bno'.

    ``recs`` is an iterator of records with ``subkey`` and ``value``
    attributes. Only the first record's subkey is inspected: if it is not
    ``'bno'`` the key produces no output, so the marker has to come first
    (presumably guaranteed by the sort done before this reducer runs;
    confirm against ``mrsort``).

    Every non-'bno' record value is expected to be the query followed by
    tab-separated ``"<source> <shows> <clicks>"`` entries. The counts are
    summed per source and a single record is yielded whose value is the
    query followed by one ``"<source>: N shows, M clicks;"`` entry per
    source, in sorted source order.

    Nothing is yielded when ``recs`` is empty or when the key has only
    'bno' records (no query text is available in that case).

    Raises ValueError if an entry does not end in two integer fields.
    """
    # A default avoids leaking StopIteration out of the generator when the
    # iterator is empty.
    first = next(recs, None)
    if first is None or first.subkey != 'bno':
        return

    query = None
    totals = defaultdict(lambda: [0, 0])  # source -> [shows, clicks]
    for rec in recs:
        if rec.subkey == 'bno':
            continue
        parts = rec.value.split('\t')
        query = parts[0]
        for part in parts[1:]:
            # Split from the right so a source name containing spaces
            # still leaves the two numeric fields intact.
            src, shows, clicks = part.rsplit(' ', 2)
            counters = totals[src]
            counters[0] += int(shows)
            counters[1] += int(clicks)

    if query is None:
        # Only 'bno' markers were seen, so there is no query to report.
        return

    res = [query]
    for src in sorted(totals):
        res.append('{0}: {1[0]} shows, {1[1]} clicks;'.format(src, totals[src]))
    yield Record(key, '', '\t'.join(res))

# Start the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

