#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from collections import defaultdict
import re
from hashlib import md5

def main():
    """Run the whole TripAdvisor statistics pipeline.

    Steps, in order:
      1. Combine every daily ``reqans_log/<date>`` table with ``getData``,
         appending counters to one temporary table and per-request records
         to another (``getData`` emits them with tableIndex 0 and 1).
      2. Reduce the counters table with ``Summarizer()`` into the final
         statistics table.
      3. Reduce the requests table in place with ``presort``.
      4. Reduce the presorted requests with ``Limiter(1000)`` into the final
         requests table.

    NOTE(review): ``Summarizer``, ``Limiter``, ``mktmp`` and ``strdaterange``
    are not defined in this file (presumably they come from ``mymrutils``);
    their exact semantics should be confirmed there.
    """
    MR.useDefaults(username='snippets',
                   server='cedar00.search.yandex.net:8013',
                   verbose=True)

    stats_table = 'likhomanov/tripadvisor'
    requests_table = 'likhomanov/tripadvisor_reqs'

    with mktmp() as counters_tmp, mktmp() as requests_tmp:
        # Build the full list of daily source tables before running any job.
        log_tables = ['reqans_log/{}'.format(day)
                      for day in strdaterange((2015, 4, 9), (2015, 4, 11))]
        for log_table in log_tables:
            # Order of dstTables matters: index 0 = counters, 1 = requests.
            MR.runCombine(getData,
                          srcTable=log_table,
                          dstTables=[counters_tmp, requests_tmp],
                          appendMode=True)

        MR.runReduce(Summarizer(), srcTable=counters_tmp, dstTable=stats_table)
        MR.runReduce(presort, srcTable=requests_tmp, dstTable=requests_tmp)
        MR.runReduce(Limiter(1000), srcTable=requests_tmp, dstTable=requests_table)


# Matches the beginning of a TripAdvisor review-page URL on the .ru, .com and
# .com.tr domains (scheme optional, "www." required). Group 3 captures the
# review kind: Hotel, Attraction or Restaurant.
tripadv = re.compile(r'(https?://)?www\.tripadvisor\.(ru|com|com\.tr)/(Hotel|Attraction|Restaurant)_Review-')


def reviewType(url):
    """Return the TripAdvisor review kind encoded in ``url``.

    The result is 'Hotel', 'Attraction' or 'Restaurant' when ``url`` starts
    with a TripAdvisor review-page prefix, and None otherwise.
    """
    found = tripadv.match(url)
    return found.group(3) if found else None

def getData(recs):
    """Combine step: collect TripAdvisor statistics from reqans log records.

    Each input record's ``value`` is parsed with ``parseReqans`` into a
    request dict and a list of result dicts. Requests are skipped when they
    have ``is_yandex == '1'``, when ``stype`` is not 'www' or 'gateway', or
    when ``serpLang`` returns something other than ru/ua/by/kz (grouped as
    'kubr') or com.tr (reported as 'tr').

    Yields two kinds of ``Record``:
      * tableIndex=1 -- for every kept request that has at least one
        TripAdvisor result of a given kind: first argument
        ``<lang>\\t<kind>\\t<md5 hex of the query>``, third argument the
        query text itself;
      * tableIndex=0 -- after all input is consumed, one record per
        accumulated counter name with the count as a string.

    Counter names: ``<lang> ALL`` (results seen), ``<lang> <kind>``
    (TripAdvisor results of that kind), ``<lang> ALLSERP`` (requests with at
    least one usable result) and ``<lang> serp <kind>`` (requests with at
    least one result of that kind).
    """
    review_kinds = ('Hotel', 'Attraction', 'Restaurant')
    counters = defaultdict(int)

    for rec in recs:
        req, results = parseReqans(rec.value)
        if req.get('is_yandex', '0') == '1' or req.get('stype') not in ('www', 'gateway'):
            continue

        domain = serpLang(req)
        if domain == 'com.tr':
            lang = 'tr'
        elif domain in ('ru', 'ua', 'by', 'kz'):
            lang = 'kubr'
        else:
            continue

        has_results = False
        kinds_on_serp = set()
        for res in results:
            # Only results carrying both fields are counted.
            if not ('snippets_type' in res and 'url' in res):
                continue
            has_results = True
            counters[lang + ' ALL'] += 1
            kind = reviewType(res['url'])
            if kind is not None:
                kinds_on_serp.add(kind)
                counters[lang + ' ' + kind] += 1

        if not has_results:
            continue

        counters[lang + ' ALLSERP'] += 1
        query = req['req']
        query_hash = md5(query).hexdigest()
        # Fixed kind order keeps the emitted records in a stable sequence.
        for kind in review_kinds:
            if kind in kinds_on_serp:
                counters[lang + ' serp ' + kind] += 1
                yield Record(lang + '\t' + kind + '\t' + query_hash, '', query, tableIndex=1)

    for name, count in counters.iteritems():
        yield Record(name, '', str(count), tableIndex=0)


def presort(key, recs):
    """Reduce step: collapse all records of one query into a single record.

    ``key`` must consist of exactly three tab-separated fields; the first two
    (language and review kind) form the output record's first argument and
    the third is dropped. The output's third argument is
    ``<number of records>\\t<value of the first record>``.

    The second argument is ``str(100000000 - count)``.
    NOTE(review): presumably this makes an ascending sort on that field list
    the most frequent queries first -- confirm against the MapReduce
    library's ordering rules.
    """
    lang, kind, _unused_hash = key.split('\t')
    sample_query = next(recs).value
    # One record was already consumed above; count the remaining ones.
    total = 1 + sum(1 for _ in recs)
    yield Record('\t'.join((lang, kind)),
                 str(100000000 - total),
                 str(total) + '\t' + sample_query)

# Launch the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

