#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re
from hashlib import md5

# Number of distinct-value records after which getNeeded stops emitting
# for a given reduce key.
NEEDED = 1000

# Hosts a result URL must belong to in order to pass getData's host filter.
rasphosts = (
    'rasp.yandex.by',
    'rasp.yandex.kz',
    'rasp.yandex.ru',
    'rasp.yandex.ua',
    'seyahat.yandex.com.tr',
)

def main():
    """Run the whole pipeline over the configured range of session tables.

    For every source table named 'user_sessions/<d>' (one per value produced
    by strdaterange for (2015, 7, 27)..(2015, 8, 3)):

    * getData is run as a reduce and appends to three temporary tables,
      passed in the order matching the tableIndex values it emits
      (0: statistics, 1: per-URL counters, 2: sample candidates);
    * the statistics table is re-reduced in place with Summarizer();
    * the per-URL table is re-reduced in place with summarize_url.

    After the loop the per-URL table is passed through presort into the
    destination URL table, which is then sorted; the statistics table is
    copied to its destination; and the sample candidates are reduced with
    getNeeded into the destination sample table.
    """
    MR.useDefaults(
        username='snippets',
        server='sakura00.search.yandex.net',
        files=['blockstat.dict'],
        verbose=True,
    )

    stat_table = 'likhomanov/rasp_3122_stat'
    url_table = 'likhomanov/rasp_3122_urls'
    sample_table = 'likhomanov/rasp_3122_sample'

    with mktmp() as stat_tmp, mktmp() as sample_tmp, mktmp() as url_tmp:
        days = strdaterange((2015, 7, 27), (2015, 8, 3))
        sources = ['user_sessions/{}'.format(day) for day in days]

        for source in sources:
            # The list order defines which table each tableIndex goes to.
            MR.runReduce(
                getData,
                srcTable=source,
                dstTables=[stat_tmp, url_tmp, sample_tmp],
                appendMode=True,
            )
            # Collapse what has been appended so far, in place, after
            # every source table.
            MR.runReduce(Summarizer(), srcTable=stat_tmp, dstTable=stat_tmp)
            MR.runReduce(summarize_url, srcTable=url_tmp, dstTable=url_tmp)

        MR.runCombine(presort, srcTable=url_tmp, dstTable=url_table)
        MR.sortTable(url_table)
        MR.copyTable(srcTable=stat_tmp, dstTable=stat_table)
        MR.runReduce(getNeeded, srcTable=sample_tmp, dstTable=sample_table)

def getData(key, recs):
    """Reduce callback: emit records for rasp-host results seen in a session.

    The records are parsed with libra.ParseSession using 'blockstat.dict'.
    Only requests that are 'TYandexWebRequest' (platform buckets 'desk' and
    'both') or 'TTouchYandexWebRequest' ('touch' and 'both') are used, and
    only for ServiceDomRegion 'ru' (language buckets 'kubr' and 'ru'),
    'tr' ('tr'), or 'ua'/'by'/'kz' ('kubr').

    For every main-block result that is a 'TWebResult', has Position <= 9
    and whose host (via getHost) is listed in rasphosts, three records are
    yielded per '<platform>\\t<language>' prefix:

    * tableIndex=1: key '<prefix>\\t<md5 hex of url>',
      value '<url>\\t1\\t<number of clicks>';
    * tableIndex=0: key '<prefix>\\t<position>', value '1';
    * tableIndex=2: key '<prefix>\\t<position>', second field a random
      number as a string (presumably the subkey -- confirm against
      mapreducelib), value '<url>'.

    NameError, AttributeError, TypeError and ValueError propagate; any other
    Exception is swallowed, which ends output for this key after whatever
    has already been yielded.
    """
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            # Platform buckets; other request types are ignored.
            if request.IsA('TYandexWebRequest'):
                platforms = ('desk', 'both')
            elif request.IsA('TTouchYandexWebRequest'):
                platforms = ('touch', 'both')
            else:
                continue

            # Language buckets derived from the service domain region.
            region = request.ServiceDomRegion
            if region == 'ru':
                languages = ('kubr', 'ru')
            elif region == 'tr':
                languages = ('tr',)
            elif region in ('ua', 'by', 'kz'):
                languages = ('kubr',)
            else:
                continue

            prefixes = []
            for platform in platforms:
                for language in languages:
                    prefixes.append(platform + '\t' + language)

            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if (not result.IsA('TWebResult')
                        or result.Position > 9
                        or getHost(result.Url) not in rasphosts):
                    continue

                url = result.Url
                url_digest = md5(url).hexdigest()
                # One show, plus however many clicks this result received.
                url_counters = '{}\t1\t{}'.format(url, len(result.GetClicks()))

                for prefix in prefixes:
                    yield Record(prefix + '\t' + url_digest, '', url_counters, tableIndex=1)
                    position_key = prefix + '\t' + str(result.Position)
                    yield Record(position_key, '', '1', tableIndex=0)
                    yield Record(position_key, str(random.random()), url, tableIndex=2)
    except (NameError, AttributeError, TypeError, ValueError):
        # These exception types are deliberately not suppressed.
        raise
    except Exception:
        # Best effort: any other failure silently ends this key's output.
        pass

def summarize_url(key, recs):
    """Reduce callback: add up the show and click counters for one key.

    Every record value must be '<url>\\t<shows>\\t<clicks>' (exactly three
    tab-separated fields, the last two parseable as integers). A single
    record is yielded with the same key, an empty second field and the value
    '<url>\\t<total shows>\\t<total clicks>'.
    """
    total_shows = 0
    total_clicks = 0
    for record in recs:
        url, shown, clicked = record.value.split('\t')
        total_shows += int(shown)
        total_clicks += int(clicked)
    # `url` is the one carried by the last record; it is never assigned when
    # recs is empty, so that case raises instead of emitting a record.
    yield Record(key, '', '{}\t{}\t{}'.format(url, total_shows, total_clicks))

def presort(recs):
    """Combine callback: re-key per-URL records by platform and language.

    Each input key must be three tab-separated fields; the third one is
    dropped and the first two form the new key. Each value must be
    '<url>\\t<shows>\\t<clicks>' and is passed through unchanged. The second
    Record field is set to str(1000000000 - shows), so a smaller value there
    means more shows.
    """
    for record in recs:
        platform, language, _ = record.key.split('\t')
        # Only `shows` is used; the unpack also enforces the 3-field layout.
        _url, shows, _clicks = record.value.split('\t')
        group_key = platform + '\t' + language
        inverted_shows = str(1000000000 - int(shows))
        yield Record(group_key, inverted_shows, record.value)

def getNeeded(key, recs):
    """Reduce callback: pass through records with not-yet-seen values.

    Records are yielded unchanged, in input order, skipping any whose value
    was already emitted for this key. Emission stops once NEEDED distinct
    values have been yielded.
    """
    seen_values = set()
    for record in recs:
        if record.value in seen_values:
            continue
        yield record
        seen_values.add(record.value)
        if len(seen_values) == NEEDED:
            break

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

