#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseBlockstat, getReq
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

def main():
    """Run the full job: map three daily blockstat logs, then reduce.

    The map step (getData) writes to three temporary tables, in this
    order: per-metric stats (tableIndex 0), request samples (tableIndex 1)
    and per-request records for ranking (tableIndex 2). The reduce steps
    then fill the three destination tables under 'likhomanov/'.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )

    stat_table = 'likhomanov/addr_sl_turk_stat'
    sample_table = 'likhomanov/addr_sl_turk_sample'
    top_table = 'likhomanov/addr_sl_turk_top'

    with mktmp() as stat_tmp, mktmp() as sample_tmp, mktmp() as req_tmp:
        # Days 21, 22 and 23 of July 2014.
        for day in range(21, 24):
            day_log = 'blockstat_log/201407{:02}'.format(day)
            # The position in dstTables must match the tableIndex values
            # that getData puts on its records.
            MR.runMap(
                getData,
                srcTable=day_log,
                dstTables=[stat_tmp.name, sample_tmp.name, req_tmp.name],
                appendMode=True,
            )

        MR.runReduce(Summarizer(), srcTable=stat_tmp.name, dstTable=stat_table)
        #MR.runMap(separate, srcTable=tmpstat.name, dstTables=[DST, tmpstat.name])

        # Collapse the per-request records into one counted record per
        # request (rewriting the temp table in place), then apply
        # Limiter(5000) to both the counted requests and the samples.
        MR.runReduce(countReq, srcTable=req_tmp.name, dstTable=req_tmp.name)
        MR.runReduce(Limiter(5000), srcTable=req_tmp.name, dstTable=top_table)
        MR.runReduce(Limiter(5000), srcTable=sample_tmp.name, dstTable=sample_table)

def getData(rec):
    """Map one blockstat log record to stat, sample and ranking records.

    Only records whose header host is 'yandex.com.tr' and for which
    getReq() returns a non-empty request are processed. Blocks at
    positions p0..p4 are inspected for two features, detected by
    substring match on the block's NAME: 'sitelink' and 'snippet/addr'.

    Four metrics are produced:
      sl1  - a sitelink block is present at p0
      asl1 - sl1 holds and an address-snippet block is also present at p0
      sl5  - a sitelink block is present at any of p0..p4
      asl5 - some position in p0..p4 has both features

    For every metric that holds, records are yielded to the output
    tables (indexes follow the dstTables order used by main()):
      tableIndex=0: '<metric>_serp' with value '1', and for sl5/asl5 also
                    '<metric>_snip' with the number of matching positions
      tableIndex=2: '<metric>\\t<md5 of request>' with the request as value
      tableIndex=1: '<metric>' with a random number in the second field
                    and the request as value

    Records that parseBlockstat() rejects with ValueError are skipped;
    any other parse error is re-raised with the raw record appended.
    """
    try:
        hdr, blocks = parseBlockstat(rec.value)
    except ValueError:
        return
    except Exception as e:
        # Attach the offending raw line so the failure can be diagnosed.
        raise Exception(str(e) + ' ' + rec.value)
    if hdr['host'] != 'yandex.com.tr':
        return
    req = getReq(hdr['uri'])
    if not req:
        return

    top_positions = ('p0', 'p1', 'p2', 'p3', 'p4')
    addr1 = sl1 = False
    addr5 = set()  # positions that carry an address snippet
    sl5 = set()    # positions that carry sitelinks
    for b in blocks:
        pos = b.get('pos')
        name = b['NAME']
        if pos not in top_positions:
            continue
        has_addr = 'snippet/addr' in name
        has_sl = 'sitelink' in name
        if has_addr:
            addr5.add(pos)
            if pos == 'p0':
                addr1 = True
        if has_sl:
            sl5.add(pos)
            if pos == 'p0':
                sl1 = True

    reqhash = md5(req).hexdigest()
    if sl1:
        yield Record('sl1_serp', '', '1', tableIndex=0)
        yield Record('sl1\t{}'.format(reqhash), '', req, tableIndex=2)
        yield Record('sl1', str(random.random()), req, tableIndex=1)
        if addr1:
            yield Record('asl1_serp', '', '1', tableIndex=0)
            # Keyed by the bare metric name like sl1/sl5/asl5. This key was
            # 'asl1_req\t<hash>', so after countReq strips the hash the
            # rows were labelled 'asl1_req' rather than 'asl1'.
            yield Record('asl1\t{}'.format(reqhash), '', req, tableIndex=2)
            yield Record('asl1', str(random.random()), req, tableIndex=1)

    nsl = len(sl5)
    nasl = len(sl5 & addr5)
    if nsl:
        yield Record('sl5_serp', '', '1', tableIndex=0)
        yield Record('sl5_snip', '', str(nsl), tableIndex=0)
        yield Record('sl5\t{}'.format(reqhash), '', req, tableIndex=2)
        yield Record('sl5', str(random.random()), req, tableIndex=1)
        if nasl:
            yield Record('asl5_serp', '', '1', tableIndex=0)
            yield Record('asl5_snip', '', str(nasl), tableIndex=0)
            yield Record('asl5\t{}'.format(reqhash), '', req, tableIndex=2)
            yield Record('asl5', str(random.random()), req, tableIndex=1)

def countReq(key, recs):
    """Reduce all records of one '<metric>\\t<hash>' key to one counted record.

    Yields a single Record keyed by the metric part of `key`. Its second
    field is str(100000000 - count) (presumably so that a lexicographic
    sort puts the most frequent requests first - confirm against the MR
    sort order), and its value is '<count>\\t<request>', where the request
    is the value of the first record in the group.

    `key` must contain a tab; otherwise the unpacking raises ValueError.
    """
    metric, _reqhash = key.split('\t', 1)
    # Read the request text before advancing the iterator any further.
    request = next(recs).value
    total = 1 + sum(1 for _rest in recs)
    yield Record(metric, str(100000000 - total), '{}\t{}'.format(total, request))

# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

