#!/usr/bin/env python2

from mapreducelib import Record, TemporaryTable, MapReduce as MR
from reqansparse import parseRARecord
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Matches a Yandex search-results URL. Group 2 captures whatever follows
# "yandex." up to the next slash (e.g. "com.tr" for yandex.com.tr).
serpRE = re.compile(r'https?://(www\.)?yandex\.([^/]*)/yandsearch')
# Splits an http(s) URL into an optional "www." prefix (group 1), the rest of
# the host part up to the first slash (group 2) and an optional path (group 3).
hostRE = re.compile(r'https?://(www\.)?([^/]*)(/.*)?$')

def _load_nonblank_lines(path):
    """Return the set of stripped, non-empty lines of the file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as reading finishes instead of being left for the garbage collector.
    """
    with open(path) as src:
        stripped = (line.strip() for line in src)
        # The set is fully built before the ``with`` block closes the file.
        return set(entry for entry in stripped if entry)


# Hosts treated as food sites, one per line of 'turkishfoodsites'.
# getshows() compares them against getHost() output.
foodhosts = _load_nonblank_lines('turkishfoodsites')

# Query strings treated as food queries, one per line of 'turkishfoodqueries'.
foodreqs = _load_nonblank_lines('turkishfoodqueries')

def main():
    """Aggregate food-query statistics over the 201406 reqans logs.

    For each daily table 'reqans_log/20140601' .. 'reqans_log/20140630' the
    ``getshows`` map is run into a temporary table, which is then reduced with
    ``summarize`` and appended to 'likhomanov/turkfoodstat'.  Once all days
    are processed the destination table is sorted and reduced once more in
    place.

    NOTE(review): ``summarize`` is not defined in this file; presumably it
    comes from the ``mymrutils`` star import -- confirm.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
        # The word lists are shipped with the job because the module reads
        # them at import time.
        files=['turkishfoodsites', 'turkishfoodqueries'],
    )
    DST = 'likhomanov/turkfoodstat'
    daily_tables = ['reqans_log/201406{:02}'.format(day) for day in range(1, 31)]

    with TemporaryTable(project='likhomanov') as tmp:
        for src in daily_tables:
            MR.runMap(getshows, srcTable=src, dstTable=tmp.name)
            MR.runReduce(summarize, srcTable=tmp.name, dstTable=DST, appendMode=True)
        # Collapse the per-day partial results into the final totals.
        MR.sortTable(srcTable=DST, dstTable=DST)
        MR.runReduce(summarize, srcTable=DST, dstTable=DST)

def serpLang(url):
    """Return the domain suffix that ``serpRE`` captures from *url*.

    Returns None when *url* is empty/None or does not match ``serpRE`` at
    its start; otherwise returns group 2 of the match (e.g. 'com.tr').
    """
    found = serpRE.match(url) if url else None
    return found.group(2) if found else None

def getHost(url):
    """Return the host part of *url* as captured by ``hostRE``.

    Returns None when *url* is empty/None or does not match ``hostRE`` at
    its start; otherwise returns group 2 of the match, i.e. the host with a
    leading 'www.' removed.
    """
    found = hostRE.match(url) if url else None
    return found.group(2) if found else None

def getshows(rec):
    """Map step: emit counter records for one request/answer log record.

    Records are skipped unless the SERP URL is on the 'com.tr' domain and the
    request text is one of ``foodreqs``.  For a kept record the generator
    yields, in this order:

    * ('foodreq', '', '1') -- always;
    * ('top3', '', '1') and ('top3n', '', <count>) when at least one of the
      first three results is on a host from ``foodhosts``;
    * ('top10', '', '1') and ('top10n', '', <count>) likewise for the first
      ten results.
    """
    parsed = parseRARecord(rec.value)
    request = parsed['request']
    if serpLang(request.get('serp_url')) != 'com.tr':
        return
    if request['req'] not in foodreqs:
        return
    yield Record('foodreq', '', '1')

    # Zero-based positions of the results that point at a known food host.
    food_positions = [
        pos
        for pos, result in enumerate(parsed['results'])
        if getHost(result.get('url')) in foodhosts
    ]
    top3 = sum(1 for pos in food_positions if pos < 3)
    top10 = sum(1 for pos in food_positions if pos < 10)

    for label, count in (('top3', top3), ('top10', top10)):
        if count:
            yield Record(label, '', '1')
            yield Record(label + 'n', '', str(count))

# Launch the job only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()

