#!/usr/bin/env python2
# coding: utf-8

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
import random


def main():
    """Run the extraction job over each daily reqans log table.

    Sets the MapReduce client defaults, then for every table name
    'reqans_log/<day>' built from strdaterange((2015, 2, 18), (2015, 2, 21)):
      1. runs getData as a combine step, appending its output to the
         result table;
      2. runs summarize as a reduce step that reads the result table and
         writes back to the same table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    result_table = 'likhomanov/turk_goods'
    days = strdaterange((2015, 2, 18), (2015, 2, 21))
    for log_table in map('reqans_log/{}'.format, days):
        # Append the new day's raw records to whatever is already stored ...
        MR.runCombine(getData, srcTable=log_table, dstTable=result_table, appendMode=True)
        # ... then collapse the table in place so it holds per-key summaries again.
        MR.runReduce(summarize, srcTable=result_table, dstTable=result_table)

def getData(recs):
    """Emit per-host records for qualifying results in reqans log records.

    Each input record's value is parsed with parseReqans into a request and
    its results. A request is kept only when all of these hold:
      * its 'stype' field equals 'www';
      * its 'is_yandex' field (default '0') is not '1';
      * serpLang(request) equals 'com.tr';
      * its lower-cased 'req' text contains one of the two substrings
        checked below (Turkish for "buy" / "price").

    For every result of a kept request that has a 'snippets_type' field and
    whose inner URL path is not None, '' or '/', two records keyed by the
    URL's host are yielded:
      * subkey 'n', value '1' (one hit);
      * subkey 'q' + a random number, value the lower-cased query text.
    """
    for record in recs:
        request, results = parseReqans(record.value)

        # Conditions are evaluated left to right and short-circuit, so
        # serpLang is only called for requests that passed the cheap checks.
        wanted = (
            request.get('stype') == 'www'
            and request.get('is_yandex', '0') != '1'
            and serpLang(request) == 'com.tr'
        )
        if not wanted:
            continue

        # NOTE(review): 'req' is read without a default, unlike the fields
        # above - presumably it is always present; confirm.
        # NOTE(review): if this value is a Python 2 byte string, lower() may
        # leave non-ASCII (e.g. Turkish) capitals untouched - confirm type.
        query = request['req'].lower()
        if not ('satın al' in query or 'fiyat' in query):
            continue

        for result in results:
            if 'snippets_type' not in result:
                continue
            url = result['url']
            inner_path = getInnerPath(url)
            if inner_path in (None, '', '/'):
                continue
            host = getHost(url)
            yield Record(host, 'n', '1')
            # Random subkey suffix - presumably to give each query record a
            # distinct, shuffled subkey; confirm with the job's author.
            query_subkey = 'q' + str(random.random())
            yield Record(host, query_subkey, query)

def summarize(key, recs):
    """Collapse all records of one key into a count and a set of queries.

    Records with subkey 'n' have their integer values summed. Values of all
    other records are collected into a set of distinct queries. Reading
    stops as soon as the set holds more than 50 entries (so at most 51 are
    kept).

    Yields one record (key, 'n', <total as string>) followed by one record
    per collected query with subkey 'q' + a fresh random number.
    """
    total = 0
    queries = set()
    for record in recs:
        is_counter = record.subkey == 'n'
        if is_counter:
            total += int(record.value)
            continue
        queries.add(record.value)
        # NOTE(review): this exits the whole loop, so any 'n' record that
        # would come after this point is not counted. That is only harmless
        # if 'n' records precede the 'q...' ones for each key - presumably
        # guaranteed by subkey ordering of the reduce input; confirm.
        if len(queries) > 50:
            break

    yield Record(key, 'n', str(total))
    for query in queries:
        fresh_subkey = 'q' + str(random.random())
        yield Record(key, fresh_subkey, query)

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

