#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5


def main():
    """Run the whole pipeline: extract, aggregate, re-key and sort.

    For every daily ``user_sessions/<date>`` table in the hard-coded date
    range, ``getData`` results are appended to a scratch table and the
    scratch table is immediately collapsed in place by ``summarize``.
    After the last day the scratch table is re-keyed by ``presort`` and
    sorted into the destination table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True,
    )
    DST = 'likhomanov/us_2952_urls'
    # NOTE(review): mktmp() presumably yields a temporary table name that is
    # cleaned up on exit -- confirm in mymrutils.
    with mktmp() as scratch:
        session_tables = [
            'user_sessions/{}'.format(day)
            for day in strdaterange((2015, 5, 12), (2015, 5, 15))
        ]
        for session_table in session_tables:
            # Append the day's raw "1\t..." records to what is already there.
            MR.runReduce(getData, srcTable=session_table, dstTable=scratch, appendMode=True)
            # Collapse after each day; summarize() emits the same layout it
            # consumes, so it can be re-applied to its own output.
            MR.runReduce(summarize, srcTable=scratch, dstTable=scratch)
        MR.runMap(presort, srcTable=scratch, dstTable=scratch)
        MR.sortTable(srcTable=scratch, dstTable=DST)


def getData(key, recs):
    """Reducer: emit one counted record per qualifying web result.

    The session records are parsed with ``libra.ParseSession``. A result is
    emitted only when all of the following hold:

    * the request is a ``TYandexWebRequest`` with ``ServiceDomRegion == 'tr'``;
    * the main result of the block is a ``TWebResult``;
    * ``getHost(url)`` is ``'sahibinden.com'``;
    * ``getInnerPath(url)`` is not ``''``, ``'/'`` or ``None``;
    * the snippet type is neither ``'yaca'`` nor ``'sahibinden_template'``.

    Each emitted ``Record`` gets the md5 hex digest of
    ``"<snippet type>\\t<query>\\t<url>"`` as its first field, an empty
    second field, and ``"1\\t"`` followed by that same string as its third.

    ``key`` is not used. ``NameError``, ``AttributeError`` and ``TypeError``
    propagate; any other ``Exception`` silently ends processing of the
    current session (records already yielded are kept).
    """
    trivial_paths = ('', '/', None)
    excluded_snippets = ('yaca', 'sahibinden_template')
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            if not request.IsA('TYandexWebRequest') or request.ServiceDomRegion != 'tr':
                continue
            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                url = result.Url
                if getHost(url) != 'sahibinden.com' or getInnerPath(url) in trivial_paths:
                    continue
                snippet_type = result.SnippetType
                if snippet_type in excluded_snippets:
                    continue
                payload = '\t'.join([snippet_type, request.Query, url])
                digest = md5(payload).hexdigest()
                yield Record(digest, '', '1\t' + payload)
    except (NameError, AttributeError, TypeError):
        # These are treated as programming errors and must stay visible.
        raise
    except Exception:
        # Deliberate best-effort: any other failure drops the rest of the
        # session instead of failing the job.
        pass

def summarize(key, recs):
    """Reducer: add up the leading counters of all records in one group.

    Every record value has the form ``"<count>\\t<payload>"``. A single
    ``Record`` is emitted carrying the group ``key``, an empty second field
    and ``"<total>\\t<payload>"``, where the payload is taken from the first
    record of the group.
    """
    first = next(recs)
    head, payload = first.value.split('\t', 1)
    # The first record has been consumed above; sum() drains the remainder.
    total = int(head) + sum(int(rec.value.split('\t', 1)[0]) for rec in recs)
    yield Record(key, '', '\t'.join([str(total), payload]))

def presort(rec):
    """Mapper: re-key a summarized record by snippet type and inverted count.

    The input value is ``"<count>\\t<snippet type>\\t<rest>"``. The emitted
    ``Record`` has the snippet type as its first field,
    ``str(10000000000 - count)`` as its second, and ``"<count>\\t<rest>"``
    as its third.
    """
    count_text, snippet_type, rest = rec.value.split('\t', 2)
    count = int(count_text)
    # NOTE(review): the inverted count presumably makes an ascending sort on
    # the second field list the most frequent entries first -- confirm
    # against how MR.sortTable orders records.
    inverted = 10000000000 - count
    yield Record(snippet_type, str(inverted), '{}\t{}'.format(count, rest))

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
 
