#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re

# Review URLs to look for in search results, stored lower-cased and with an
# explicit scheme.  getData reads this module-level set, so it is filled at
# import time.
urls = set()

# 'review_urls' is resolved relative to the current working directory.
with open('review_urls') as url_file:
    for line in url_file:
        line = line.strip().lower()
        if not line:
            # A blank line would otherwise be stored as the bare 'http://'.
            continue
        if not line.startswith(('http://', 'https://')):
            # Entries without a scheme are treated as plain http.  Entries
            # that already carry 'http://' must not get a second prefix.
            line = 'http://' + line
        urls.add(line)

def main():
    """Run the whole pipeline over the daily ``user_sessions`` tables.

    For every date produced by ``strdaterange`` the ``getData`` reducer is run
    over ``user_sessions/<date>``, appending counter records to a temporary
    table and sampled URLs to the sample table.  After all days are processed
    the keys are rewritten with ``cleankey`` and the result is reduced once
    more into the final destination table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict', 'review_urls'],
        verbose=True,
    )
    DST = 'likhomanov/reviews_3061'
    # Previously spelled 'likhoamnov/...', which did not match the user prefix
    # used for DST; the transposed letters looked like a typo.
    DSTS = 'likhomanov/_review_sample'
    with mktmp() as tmp:
        for d in strdaterange((2015, 5, 27), (2015, 5, 30)):
            src = 'user_sessions/{}'.format(d)
            # Table index 0 -> tmp (counters), table index 1 -> DSTS (samples).
            MR.runReduce(getData, srcTable=src, dstTables=[tmp, DSTS], appendMode=True)
            # NOTE(review): Summarizer comes from mymrutils; presumably it
            # aggregates values per key so tmp stays compact between days --
            # confirm against its definition.
            MR.runReduce(Summarizer(), srcTable=tmp, dstTable=tmp)
        # Strip the leading random field that getData puts in front of each
        # key, then aggregate again under the cleaned keys.
        MR.runMap(cleankey, srcTable=tmp, dstTable=tmp)
        MR.runReduce(Summarizer(), srcTable=tmp, dstTable=DST)

def getData(key, recs):
    """Reducer: emit per-request counters and a sample of matched URLs.

    Each request parsed from the session by ``libra.ParseSession`` is
    classified as 'web' or 'touch'; other request types are skipped, as are
    requests whose ``ServiceDomRegion`` is not one of ru/ua/by/kz.

    Yields to table 0:
      * ``<n>\\t<type>\\tALL`` with value '1' for every accepted request;
      * ``<n>\\t<type>\\treviews`` with value '1' when at least one main web
        result URL (lower-cased) is in the module-level ``urls`` set.
      ``<n>`` is a random integer in [0, 256); cleankey removes it later.
    Yields to table 1:
      * key '0' with the matched URL as value, for roughly 0.1% of matches.

    NameError, AttributeError and TypeError propagate.  Any other Exception
    is swallowed, which silently ends processing of the current session.

    ``key`` is unused.
    """
    request_kinds = (
        ('TYandexWebRequest', 'web'),
        ('TTouchYandexWebRequest', 'touch'),
    )
    allowed_regions = ('ru', 'ua', 'by', 'kz')
    try:
        for request in libra.ParseSession(recs, 'blockstat.dict'):
            kind = None
            for class_name, label in request_kinds:
                if request.IsA(class_name):
                    kind = label
                    break
            if kind is None:
                continue

            # Key prefix: "<random 0..255>\t<kind>\t".
            prefix = '\t'.join([str(random.randrange(256)), kind, ''])

            if request.ServiceDomRegion not in allowed_regions:
                continue

            has_review = False
            for block in request.GetMainBlocks():
                result = block.GetMainResult()
                if result.IsA('TWebResult') and result.Url.lower() in urls:
                    has_review = True
                    # Keep about one in a thousand matched URLs as a sample.
                    if random.random() < 0.001:
                        yield Record('0', '', result.Url, tableIndex=1)

            yield Record(prefix + 'ALL', '', '1', tableIndex=0)
            if has_review:
                yield Record(prefix + 'reviews', '', '1', tableIndex=0)
    except (NameError, AttributeError, TypeError):
        # Programming errors must stay visible.
        raise
    except Exception:
        # Best effort: anything else just drops the rest of this session.
        pass

def cleankey(rec):
    """Mapper: drop the first tab-separated field of the record key.

    Yields one Record whose key is everything after the first tab of
    ``rec.key``, with an empty subkey and the original value.  A key that
    contains no tab raises ValueError.
    """
    full_key = rec.key
    first_tab = full_key.index('\t')
    yield Record(full_key[first_tab + 1:], '', rec.value)

# Start the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

