#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from collections import defaultdict
from hashlib import md5


def main():
    """Drive the whole job: per-day extraction, running aggregation, final sort.

    For every source table name built from ``strdaterange((2015, 6, 1),
    (2015, 9, 1))`` the ``getData`` reducer appends its output to the
    destination table and the ``summarize`` reducer then rewrites that table
    in place.  After all source tables are processed, ``presort`` is run as
    a combine over the destination table and the table is sorted.

    Progress is written to the local file ``log_3350``: one ``<table> read``
    and one ``<table> summed`` line per source table.
    """
    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True,
    )
    dst_table = 'likhomanov/bno_3350'
    with open('log_3350', 'w') as progress_log:
        # The full list of source tables is materialized before the first
        # MapReduce operation is launched.
        day_tables = [
            'user_sessions/{}'.format(day)
            for day in strdaterange((2015, 6, 1), (2015, 9, 1))
        ]
        for day_table in day_tables:
            # Append this table's raw records to the destination ...
            MR.runReduce(getData, srcTable=day_table, dstTable=dst_table,
                         appendMode=True)
            progress_log.write(day_table + ' read\n')
            # ... then collapse the destination table onto itself.
            MR.runReduce(summarize, srcTable=dst_table, dstTable=dst_table)
            progress_log.write(day_table + ' summed\n')
        MR.runCombine(presort, srcTable=dst_table, dstTable=dst_table)
        MR.sortTable(dst_table)

def getData(key, recs):
    """Reducer: emit one record per (result URL, sitelink URL) pair.

    The records are parsed with ``libra.ParseSession`` using
    ``blockstat.dict``.  A request contributes output only when all of the
    following hold:

    * it is a ``TYandexWebRequest`` (platform ``web``) or a
      ``TTouchYandexWebRequest`` (platform ``touch``);
    * its ``ServiceDomRegion`` is ``ru`` or ``tr``;
    * it has a blockstat block whose path is ``/snippet/bno/link`` and the
      first such block carries a ``pos`` variable;
    * one of its main results is both ``TOrganicResultProperties`` and
      ``TWebResult`` and has ``Position`` equal to that ``pos``.

    For every sitelink of that result a record is yielded with

    * key:    ``<platform>\\t<region>\\t<md5 hex of "<result url>\\t<sitelink url>">``
    * subkey: empty string
    * value:  ``1\\t<result url>\\t<sitelink url>``

    ``key`` is not used.

    Error handling: ``NameError``, ``AttributeError`` and ``TypeError`` are
    re-raised; any other exception silently ends processing of the current
    group of records.
    """
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            if req.IsA('TYandexWebRequest'):
                platf = 'web'
            elif req.IsA('TTouchYandexWebRequest'):
                platf = 'touch'
            else:
                continue
            if req.ServiceDomRegion not in ('ru', 'tr'):
                continue
            pfx = platf + '\t' + req.ServiceDomRegion

            bno = [x for x in req.GetBSBlocks() if x.Path == '/snippet/bno/link']
            if not bno:
                continue

            # Reset for every request: without this, a block that has no
            # 'pos' variable would either hit an unbound name or silently
            # reuse the position found for the previous request.
            pos = None
            for v in bno[0].GetVars():
                if v[0] == 'pos':
                    # The first character of the value is dropped before the
                    # integer parse (presumably a one-letter prefix such as
                    # 'p3' -- TODO confirm against the blockstat format).
                    pos = int(v[1][1:])
                    break
            if pos is None:
                continue

            # Take the first matching result; when there is none, skip only
            # this request.  Indexing an empty list here used to raise
            # IndexError, which the blanket handler below swallowed, dropping
            # every remaining request of the session as well.
            res = next(
                (r for r in (b.GetMainResult() for b in req.GetMainBlocks())
                 if r.IsA("TOrganicResultProperties")
                 and r.IsA("TWebResult")
                 and r.Position == pos),
                None)
            if res is None:
                continue

            for sl in res.GetSiteLinks():
                pair = res.Url + '\t' + sl.Url
                yield Record(pfx + '\t' + md5(pair).hexdigest(), '', '1\t' + pair)
    except (NameError, AttributeError, TypeError):
        # Let these three exception types propagate instead of being hidden.
        raise
    except Exception:
        # Best effort: anything else ends this group without failing the job.
        pass

def summarize(key, recs):
    """Reducer: collapse all records of one key into a single counted record.

    Each record value is split at its first tab into an integer count and a
    remainder.  The counts are added up and one record is yielded with the
    same key, an empty subkey, and the value ``<total>\\t<remainder>``, where
    the remainder is the one taken from the last record seen.
    """
    total = 0
    for record in recs:
        count_field, val = record.value.split('\t', 1)
        total += int(count_field)
    summed_value = '\t'.join((str(total), val))
    yield Record(key, '', summed_value)

def presort(recs):
    """Combiner: re-key records so they can be ordered by count within a prefix.

    For every input record:

    * the new key is the old key with its last tab-separated field removed
      (for keys produced by ``getData`` that field is the md5 hash, leaving
      ``<platform>\\t<region>``);
    * the new subkey is ``str(100000000000 - n)``, where ``n`` is the integer
      before the first tab of the value, so larger counts get numerically
      smaller subkeys (presumably so that the subsequent table sort lists
      the most frequent pairs first -- depends on how the table is sorted);
    * the value is passed through unchanged.
    """
    for rec in recs:
        k, _ = rec.key.rsplit('\t', 1)
        # Was ``rec.valur``: a typo that raised AttributeError on the first
        # record and made this step unusable.
        n = int(rec.value.split('\t', 1)[0])
        yield Record(k, str(100000000000 - n), rec.value)

# Run the job only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

