#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from reqansparse import parseRARecord
from itertools import islice
import re
from hashlib import md5
import random

# Matches a Yandex search-results URL; group 2 captures whatever follows
# "yandex." up to the "/yandsearch" path (for example "com.tr").
serpRE = re.compile(r'https?://(www\.)?yandex\.([^/]*)/yandsearch')

# Matches an http(s) URL; group 2 is everything between the scheme (plus an
# optional "www.") and the first "/", group 3 is the remaining path, if any.
hostRE = re.compile(r'https?://(www\.)?([^/]*)(/.*)?$')

# Hosts whose result URLs are counted by this job.
hosts = [
    "ye-mek.net",
    "yemekloji.com",
    "lezzetbank.com",
    "afiyetle.com",
    "sihirlitarif.com",
    "nefisyemektarifleri.com",
    "tarifler.com.tr",
    "lezzetli.com",
    "ellerimesaglik.com",
    "kevserinmutfagi.com",
    "mutfaksirlari.com",
    "tarifyurdu.net",
    "diyetkolik.com",
    "yemektarifi.com",
    "seramitokilelezzetler.com",
    "lezzet.com.tr",
    "hurriyetaile.com",
    "dolutabak.com",
    "favoritatlar.com",
    "misssgibi.com",
]


def main():
    """Run the map/sort/reduce pipeline that counts recipe-host URL shows.

    Each daily ``reqans_log`` table is mapped through ``getrecipes`` into a
    temporary table, the temporary table is sorted in place, and ``count``
    reduces it into ``likhomanov/recipeshows``.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)

    # Days of June 2014 to process.
    days = (25,)
    # Zero-pad the day number: a bare width of 2 pads with a space, so a
    # single-digit day would yield 'reqans_log/201406 5' instead of
    # 'reqans_log/20140605'.
    sources = ['reqans_log/201406{:02d}'.format(day) for day in days]

    with TemporaryTable(project='likhomanov') as tmp:
        # NOTE(review): every source is mapped into the same temporary table;
        # with more than one day, confirm that runMap appends to dstTable
        # rather than overwriting it.
        for src in sources:
            MR.runMap(getrecipes, srcTable=src, dstTable=tmp.name)
        # Sort the mapped records in place before the reduce step.
        MR.sortTable(srcTable=tmp.name, dstTable=tmp.name)
        MR.runReduce(count, srcTable=tmp.name, dstTable='likhomanov/recipeshows')

def serpLang(url):
    """Return the Yandex domain suffix of a search-results URL.

    The suffix is group 2 of ``serpRE``, i.e. the text following ``yandex.``
    (for example ``'com.tr'``).  Returns ``None`` when ``url`` is empty or
    ``None``, or when it does not match ``serpRE``.
    """
    found = serpRE.match(url) if url else None
    return found.group(2) if found else None

def getrecipes(rec):
    """Map step: emit a record for every result URL on a tracked host.

    ``rec.value`` is parsed with ``parseRARecord``.  Nothing is emitted
    unless the request's ``serp_url`` resolves (via ``serpLang``) to the
    ``'com.tr'`` Yandex domain.  For each result whose URL matches
    ``hostRE`` and whose host (group 2) is listed in ``hosts``, a
    ``Record`` keyed by the full URL is yielded with empty subkey and value.
    """
    parsed = parseRARecord(rec.value)
    if serpLang(parsed['request'].get('serp_url')) != 'com.tr':
        return
    for result in parsed['results']:
        link = result.get('url')
        # Results without a URL, or with one hostRE cannot parse, are skipped.
        matched = hostRE.match(link) if link else None
        if matched and matched.group(2) in hosts:
            yield Record(link, '', '')

def count(key, recs):
    """Reduce step: emit one record holding how many records share ``key``.

    ``recs`` is consumed fully; the count is stored as a decimal string in
    the value field of the yielded ``Record``, with an empty subkey.
    """
    total = sum(1 for _ in recs)
    yield Record(key, '', str(total))

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


