#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random
from collections import defaultdict


def main():
    """Run the avito statistics job end to end.

    Configures the MapReduce client, runs ``getData`` as a combine step over
    every daily ``reqans_log/<day>`` table produced by ``strdaterange`` for
    the hard-coded date bounds, then reduces the two temporary outputs:
    the counters go through ``Summarizer`` into the statistics table and the
    sampled requests go through ``getTop`` into the request table.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    stat_table = 'likhomanov/avito_stat'
    req_table = 'likhomanov/avito_req'
    with mktmp() as counts_tmp:
        with mktmp() as reqs_tmp:
            # Build the full list of daily source tables before launching
            # any combine operation.
            sources = []
            for day in strdaterange((2014,11,4), (2014,11,6)):
                sources.append('reqans_log/{}'.format(day))
            for source in sources:
                # Output index 0 -> counters, index 1 -> sampled requests;
                # appendMode accumulates results from all days.
                MR.runCombine(
                    getData,
                    srcTable=source,
                    dstTables=[counts_tmp, reqs_tmp],
                    appendMode=True,
                )
            MR.runReduce(Summarizer(), srcTable=counts_tmp, dstTable=stat_table)
            MR.runReduce(getTop, srcTable=reqs_tmp, dstTable=req_table)


def getData(recs):
    """Combine step: count snippet results and sample avito.ru queries.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request and its list of results. A request is skipped unless its
    ``stype`` is ``'www'``, its ``is_yandex`` flag is not ``'1'`` and
    ``serpLang`` returns one of ``ru``/``ua``/``by``/``kz``.

    For every result with a non-empty ``snippets_type``:

    * the ``'<lang> ALL'`` counter is incremented;
    * if the result host is ``avito.ru``, the
      ``'<lang> INNER|MAIN PO|OTHER'`` counter is incremented, where ``PO``
      means the snippet type is ``productoffer_snip`` and ``INNER`` means
      ``getInnerPath(url)`` has length greater than one
      (presumably a non-root page -- confirm against ``getInnerPath``);
    * for ``INNER`` results that are not ``PO``, the request text is emitted
      immediately to output table 1.

    Yields:
        ``Record`` objects with ``tableIndex=1`` carrying the request text
        (with a random number as the second field, presumably the subkey
        used to shuffle requests -- confirm against ``Record``), followed by
        ``Record`` objects with ``tableIndex=0`` carrying one counter each.
    """
    counts = defaultdict(int)
    for rec in recs:
        req, ress = parseReqans(rec.value)
        if req.get('stype') != 'www':
            continue
        if req.get('is_yandex', '0') == '1':
            continue
        lang = serpLang(req)
        if lang not in ('ru', 'ua', 'by', 'kz'):
            continue
        for res in ress:
            stype = res.get('snippets_type')
            if not stype:
                continue
            counts[lang + ' ALL'] += 1
            url = res['url']
            if getHost(url) != 'avito.ru':
                continue
            inner = len(getInnerPath(url)) > 1
            po = stype == 'productoffer_snip'
            if inner and not po:
                yield Record('0', str(random.random()), req['req'], tableIndex=1)
            bucket = ' '.join((lang, 'INNER' if inner else 'MAIN', 'PO' if po else 'OTHER'))
            counts[bucket] += 1
    # items() instead of iteritems(): the dict holds only a handful of keys,
    # so the copy is free on Python 2 and the code also runs on Python 3.
    for name, total in counts.items():
        yield Record(name, '', str(total), tableIndex=0)

def getTop(key, recs):
    """Reduce step: emit up to 1000 distinct request strings.

    Walks the incoming records in the order given and yields a
    ``Record('0', '', value)`` the first time each ``value`` is seen.
    Stops as soon as 1000 distinct values have been emitted. ``key`` is
    not used.
    """
    seen = set()
    for rec in recs:
        query = rec.value
        if query in seen:
            continue
        yield Record('0', '', query)
        seen.add(query)
        if len(seen) >= 1000:
            break

# Launch the job only when this file is executed directly as a script.
if __name__ == '__main__':
    main()

