#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5

def main():
    """Run the Limiter reduce over the rutop table and sort its output.

    Sets the MapReduce client defaults, runs ``Limiter(10000)`` as a reduce
    from ``likhomanov/rutop`` into ``likhomanov/rutop_100k``, and finally
    sorts the destination table.
    """
    source_table = 'likhomanov/rutop'
    limited_table = 'likhomanov/rutop_100k'

    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True
    )
    # NOTE(review): Limiter is not defined in this file; presumably it is
    # provided by the ``from mymrutils import *`` star-import -- confirm.
    MR.runReduce(Limiter(10000), srcTable=source_table, dstTable=limited_table)
    MR.sortTable(limited_table)

def getData(key, recs):
    """Yield document- and SERP-level counter records for one session.

    ``recs`` is handed to ``libra.ParseSession`` together with
    ``'blockstat.dict'``; ``key`` is accepted but not used.  Only requests
    that are ``TYandexWebRequest`` (platform ``web``) or
    ``TTouchYandexWebRequest`` (platform ``touch``) and whose
    ``ServiceDomRegion`` equals ``'ru'`` are processed.

    For every main-block result that is a ``TWebResult`` two records are
    yielded immediately (one keyed by the md5 of the URL, one by the md5 of
    its host), each with value ``'1\\t<url or host>'``.  If the request had at
    least one such result, the following are yielded afterwards:

    * ``<platform>\\tALL\\tdoc\\t0`` with the number of results,
    * ``<platform>\\tALL\\tserp\\t0`` with ``1``,
    * one ``serp`` record per distinct URL and per distinct host.

    All records use an empty subkey.

    ``NameError``, ``AttributeError``, ``TypeError`` and ``ValueError``
    propagate to the caller; any other ``Exception`` silently ends the
    generator (records yielded before the failure are kept).
    """
    # Request type name -> platform label, checked in this order.
    request_kinds = (
        ('TYandexWebRequest', 'web'),
        ('TTouchYandexWebRequest', 'touch'),
    )
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            platform = None
            for type_name, label in request_kinds:
                if request.IsA(type_name):
                    platform = label
                    break
            if platform is None:
                continue
            if request.ServiceDomRegion != 'ru':
                continue

            serp_urls = set()
            serp_hosts = set()
            doc_count = 0
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if not result.IsA('TWebResult'):
                    continue
                doc_count += 1
                host = getHost(result.Url)
                serp_urls.add(result.Url)
                serp_hosts.add(host)
                # Per-document counters: emitted for every occurrence.
                url_doc_key = platform + '\turl\tdoc\t' + md5(result.Url).hexdigest()
                yield Record(url_doc_key, '', '1\t' + result.Url)
                host_doc_key = platform + '\thost\tdoc\t' + md5(host).hexdigest()
                yield Record(host_doc_key, '', '1\t' + host)

            if not doc_count:
                continue

            yield Record(platform + '\tALL\tdoc\t0', '', str(doc_count) + '\t0')
            yield Record(platform + '\tALL\tserp\t0', '', '1\t0')
            # Per-SERP counters: each distinct URL / host counts once per request.
            for url in serp_urls:
                url_serp_key = platform + '\turl\tserp\t' + md5(url).hexdigest()
                yield Record(url_serp_key, '', '1\t' + url)
            for host_name in serp_hosts:
                host_serp_key = platform + '\thost\tserp\t' + md5(host_name).hexdigest()
                yield Record(host_serp_key, '', '1\t' + host_name)
    except (NameError, AttributeError, TypeError, ValueError):
        # These are let through rather than swallowed by the handler below.
        raise
    except Exception:
        # Best effort: any other failure just stops output for this session.
        pass

def summarize(key, recs):
    """Sum the counts of all records under ``key`` and emit one total.

    Each record value must look like ``'<count>\\t<payload>'``.  The counts
    are added up; if the total is at least 10, a single record is yielded
    with ``key``, an empty subkey and the value ``'<total>\\t<payload>'``,
    where the payload is taken from the last record iterated.  Totals below
    10 produce no output.

    Raises ``ValueError`` if a value has no tab or a non-integer count.
    """
    total = 0
    for record in recs:
        count_text, payload = record.value.split('\t', 1)
        total += int(count_text)
    if total < 10:
        return
    yield Record(key, '', str(total) + '\t' + payload)

def presort(rec):
    """Re-key a record so its subkey encodes the count in inverted form.

    The new key is ``rec.key`` with its last tab-separated field removed
    (the whole key if it contains no tab).  The subkey is the decimal string
    of ``10000000000000000 - count``, where ``count`` is the integer before
    the first tab of ``rec.value``; larger counts therefore give smaller
    numbers.  The value is passed through unchanged.

    NOTE(review): presumably the inverted subkey exists so that a later
    ascending sort lists the highest counts first -- confirm with the caller.
    """
    key_parts = rec.key.rsplit('\t', 1)
    count = int(rec.value.split('\t', 1)[0])
    inverted_count = 10000000000000000 - count
    yield Record(key_parts[0], str(inverted_count), rec.value)

# Run the job only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

