#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import re
from random import random
from collections import defaultdict

# Compiled URL patterns for the Yandex.Market page types being counted.
# Each key is the page-specific regex fragment (getdata also uses it as the
# record key); each value is that fragment compiled behind an anchored
# http(s) scheme and an optional "www." prefix.
# Every literal dot is escaped, including the one in "model-spec.xml", so a
# pattern cannot match an arbitrary character in that position.
masks = {s: re.compile(r'^https?://(www\.)?' + s) for s in [
    r'market\.yandex\.ru/shop',
    r'market\.yandex\.ru/model\.xml',
    r'market\.yandex\.ru/product/',
    r'market\.yandex\.ru/forums\?modelid=',
    r'market\.yandex\.ru/model-spec\.xml\?modelid=',
    r'market\.yandex\.ru/catalogmodels\.xml',
    r'market\.yandex\.ru/guru\.xml',
    r'market\.yandex\.ru/brands'
    ]}

def main():
    """Run the full job: combine the daily logs, then reduce the results.

    getdata is run as a combine step over one 'reqans_log/<date>' table per
    value produced by strdaterange((2014, 12, 1), (2014, 12, 4)), appending
    into two temporary tables. The first temporary table is then reduced with
    summarize into 'likhomanov/market_stat', and the second with Limiter(100)
    into 'likhomanov/market_req'.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    stat_table = 'likhomanov/market_stat'
    req_table = 'likhomanov/market_req'
    with mktmp() as tmp, mktmp() as tmpreq:
        day_tables = ['reqans_log/{}'.format(day)
                      for day in strdaterange((2014, 12, 1), (2014, 12, 4))]
        for day_table in day_tables:
            # appendMode=True: presumably accumulates every day's output in
            # the same pair of temporary tables -- confirm against mapreducelib.
            MR.runCombine(getdata, srcTable=day_table, dstTables=[tmp, tmpreq], appendMode=True)
        # Limiter comes from mymrutils; presumably caps the records kept per
        # key at 100 -- TODO confirm.
        MR.runReduce(Limiter(100), srcTable=tmpreq, dstTable=req_table)
        MR.runReduce(summarize, srcTable=tmp, dstTable=stat_table)

def getdata(recs):
    """Count Market URL shows and positions in a stream of reqans records.

    Only requests with stype 'www', with is_yandex not equal to '1' and with
    serpLang(req) == 'ru' are considered. For every result URL matching one
    of the module-level masks, the show counter of that mask is incremented
    and the 1-based result position is added to its position sum.

    Yields:
        * for roughly 1% of matches, a Record(mask, <random subkey>, query)
          with tableIndex=1 (presumably routed to the second destination
          table -- confirm against mapreducelib);
        * after the input is exhausted, one Record per mask whose value is
          "<shows><TAB><position sum>" (zeros for masks that never matched).
    """
    show_counts = defaultdict(int)
    position_sums = defaultdict(int)

    for rec in recs:
        req, ress = parseReqans(rec.value)
        # Checks are evaluated in the same order as separate guards would be;
        # serpLang is only called when the first two conditions hold.
        accepted = (
            req.get('stype') == 'www'
            and req.get('is_yandex', '0') != '1'
            and serpLang(req) == 'ru'
        )
        if not accepted:
            continue

        for rank, res in enumerate(ress, 1):
            url = res.get('url')
            if url:
                for mask, pattern in masks.items():
                    if not pattern.match(url):
                        continue
                    show_counts[mask] += 1
                    position_sums[mask] += rank
                    # Sample about one match in a hundred; the random subkey
                    # gives each sampled query a random sort key.
                    if random() < 0.01:
                        yield Record(mask, str(random()), req['req'], tableIndex=1)

    for mask in masks:
        totals = '{}\t{}'.format(show_counts[mask], position_sums[mask])
        yield Record(mask, '', totals)

def summarize(key, recs):
    """Add up the partial counters for one mask and emit the totals.

    Each incoming record value is expected to hold two tab-separated
    integers: a show count and a sum of positions (the format getdata emits).

    Yields a single Record keyed by `key` whose value is three tab-separated
    fields: total shows, total position sum, and the mean position
    (position sum / shows), or the string 'None' when there were no shows.
    """
    shows = pos = 0
    for r in recs:
        c, i = r.value.split('\t')
        shows += int(c)
        pos += int(i)
    # float() forces true division: under Python 2, int / int floors, which
    # would silently truncate the mean position to a whole number.
    mean_pos = float(pos) / shows if shows else None
    yield Record(key, '', '{}\t{}\t{}'.format(shows, pos, mean_pos))

# Run the job only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

