#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
#from logparse import parseReqans
import libra
from mymrutils import *
import re
from hashlib import md5
import random

# Set of URLs to look for, read from the local 'traddr' file: one entry per
# line, with surrounding whitespace stripped.  The file is opened in a
# ``with`` block so the handle is closed as soon as the set is built rather
# than whenever the garbage collector gets to it.
with open('traddr') as traddr_file:
    addrurls = set(line.strip() for line in traddr_file)

# Date stamps (they look like YYYYMMDD) used by main() to build the
# 'user_sessions/<date>' source table names.  One row per calendar month.
dates = [
    '20140104', '20140105', '20140111', '20140123', '20140129',
    '20140202', '20140205', '20140208', '20140217', '20140223',
    '20140308', '20140309', '20140313', '20140318', '20140326',
    '20140413', '20140417', '20140425', '20140426', '20140428',
    '20140502', '20140503', '20140507', '20140525', '20140527',
    '20140612', '20140615', '20140625', '20140629', '20140630',
    '20140705', '20140710', '20140714', '20140719', '20140720',
]

def main():
    """Run the whole MapReduce pipeline and write the result table.

    Steps, in order:
      1. Configure the MR client defaults (user, server, verbosity and the
         local files 'traddr' and 'blockstat.dict' passed via ``files``).
      2. For every entry of ``dates``: reduce 'user_sessions/<date>' with
         ``getData``, appending into a temporary table, then run
         ``Summarizer()`` over that temporary table in place.
         NOTE(review): Summarizer comes from mymrutils; presumably it sums
         the per-URL counts -- confirm.
      3. Map the temporary table in place with ``rekey``.
      4. Reduce it with ``Limiter(1000)`` into the destination table.
         NOTE(review): Limiter also comes from mymrutils; presumably it keeps
         the first 1000 records -- confirm.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        files=['traddr', 'blockstat.dict'],
    )
    output_table = 'likhomanov/tr_addr_show_top'

    with mktmp() as scratch:
        for day in dates:
            sessions_table = 'user_sessions/{}'.format(day)
            # Append this day's raw hits to the scratch table ...
            MR.runReduce(
                getData,
                srcTable=sessions_table,
                dstTable=scratch.name,
                appendMode=True,
            )
            # ... and immediately re-run Summarizer over the scratch table.
            MR.runReduce(
                Summarizer(),
                srcTable=scratch.name,
                dstTable=scratch.name,
            )

        MR.runMap(rekey, srcTable=scratch.name, dstTable=scratch.name)
        MR.runReduce(
            Limiter(1000),
            srcTable=scratch.name,
            dstTable=output_table,
        )

def getData(key, recs):
    """Reduce function: emit one record per shown URL listed in ``addrurls``.

    The records are parsed with ``libra.ParseSessionWithFat`` using
    'blockstat.dict'.  Only requests that are a 'TYandexWebRequest' with
    ``ServiceDomRegion == 'tr'`` are considered.  For each of their main
    blocks whose main result is a 'TWebResult', the result URL (with a
    leading 'http://' removed) is checked against ``addrurls``; on a match
    ``Record(url, '', '1')`` is yielded.

    Args:
        key: reduce key; not used by this function.
        recs: records handed to ``libra.ParseSessionWithFat``.

    Raises:
        NameError, AttributeError, TypeError: re-raised unchanged, so that
            programming mistakes are not hidden.  Any other ``Exception``
            silently ends the generator; records yielded before the failure
            are kept.
    """
    try:
        for request in libra.ParseSessionWithFat(recs, 'blockstat.dict'):
            # Skip anything that is not a 'tr' web request.
            if not request.IsA('TYandexWebRequest') or request.ServiceDomRegion != 'tr':
                continue
            for main_block in request.GetMainBlocks():
                result = main_block.GetMainResult()
                if result.IsA('TWebResult'):
                    address = result.Url
                    # Entries in addrurls are compared without the scheme.
                    if address.startswith('http://'):
                        address = address[7:]
                    if address in addrurls:
                        yield Record(address, '', '1')
    except (NameError, AttributeError, TypeError):
        # Programming errors must surface instead of being swallowed below.
        raise
    except Exception:
        # Best effort: any other failure just stops processing this input.
        pass

def rekey(rec):
    """Map function: move every record under the single key '0'.

    The input value is parsed as an integer count ``n``.  The emitted record
    has key '0', subkey ``str(1000000000 - n)`` and value
    '<n><TAB><original key>'.

    NOTE(review): the inverted subkey is presumably there so that larger
    counts come first when records are ordered by subkey -- confirm against
    the MR sort semantics.

    Args:
        rec: input record; its ``value`` and ``key`` attributes are read.
    """
    count = int(rec.value)
    inverted_count = str(1000000000 - count)
    payload = '{}\t{}'.format(count, rec.key)
    yield Record('0', inverted_count, payload)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

