#!/usr/bin/env python2

from mapreducelib import Record, MapReduce as MR
import libra
from mymrutils import *
import random
import re

# Hosts whose appearance among the main web results is counted by getData
# (exact match against the value returned by getHost for the result URL).
hosts = [
    'dojki.com',
    'prostoporno.net',
    'xyu.tv',
    'ogli.org',
    'ru-porn.tv',
    'mega-porno.ru',
    'gigporno.com',
    'pornoflv.net',
    'vuku.tv',
    'vtrahe.tv',
    'pornoboss.tv',
    '24xxx.net',
    'ru.pornhub.com',
    'pornotube.name',
]

def main():
    """Run the whole aggregation and write the result table.

    For every source table ``user_sessions/<day>`` (days come from
    ``strdaterange((2015, 5, 6), (2015, 5, 9))``) the ``getData`` reducer
    appends its output to a temporary table, which is then reduced in place
    with ``Summarizer()``.  Afterwards ``cleankey`` is mapped over the
    temporary table and a final ``Summarizer()`` pass writes
    ``likhomanov/porno_2997``.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        files=['blockstat.dict'],
        verbose=True
    )
    result_table = 'likhomanov/porno_2997'
    with mktmp() as scratch:
        days = strdaterange((2015, 5, 6), (2015, 5, 9))
        source_tables = ['user_sessions/{}'.format(day) for day in days]
        for source in source_tables:
            MR.runReduce(getData, srcTable=source, dstTable=scratch, appendMode=True)
            # Re-reduce after each source table; presumably Summarizer
            # collapses records with equal keys -- confirm in mymrutils.
            MR.runReduce(Summarizer(), srcTable=scratch, dstTable=scratch)
        # cleankey drops the first key field before the final reduce.
        MR.runMap(cleankey, srcTable=scratch, dstTable=scratch)
        MR.runReduce(Summarizer(), srcTable=scratch, dstTable=result_table)

# Compiled URL patterns keyed by a short site name.  mask() applies them with
# re.match, i.e. anchored at the beginning of the URL.
masks = {
    name: re.compile(pattern)
    for name, pattern in (
        ('dojki', r'https?://(www\.)?dojki\.com/[0-9]+/'),
        ('prostoporno', r'https?://prostoporno\.net/videos/.+'),
        ('xyu', r'https?://xyu\.tv/video/.+'),
        ('mega-porno', r'https?://(www\.)?mega-porno\.ru/online/.+'),
        ('gigporno', r'https?://gigporno\.com/online-video-rolik-.+'),
        ('pornoflv', r'https?://pornoflv\.net/video.+'),
        ('vuku', r'https?://vuku\.xxx/pornovideo/.+'),
        ('pornoboss', r'https?://pornoboss\.tv/[0-9]+-.+'),
        ('24xxx', r'https?://(www\.)?24xxx\.net/video/.+'),
        ('pornhub', r'https?://ru\.pornhub\.com/view_video\.php\?.+'),
        ('pornotube', r'https?://www\.pornotube\.name/porno/.+'),
    )
}

def mask(url):
    """Return the name of the entry in ``masks`` whose pattern matches ``url``.

    Patterns are tried with ``match`` (anchored at the start of ``url``).
    A 'vuku' match on a URL ending in 'index.html' is not accepted and the
    scan moves on to the remaining patterns.  Returns None when nothing is
    accepted.
    """
    for name, pattern in masks.iteritems():
        if pattern.match(url) is None:
            continue
        if name != 'vuku' or not url.endswith('index.html'):
            return name
    return None

def _request_type(req):
    """Return 'web' or 'touch' for the request kinds getData counts, else None."""
    if req.IsA('TYandexWebRequest'):
        return 'web'
    if req.IsA('TTouchYandexWebRequest') or req.IsA('TTouchYandexPortalRequest'):
        return 'touch'
    return None


def getData(key, recs):
    """Yield counter records for tracked hosts and URL masks in one session.

    ``recs`` is parsed with ``libra.ParseSession(recs, 'blockstat.dict')``.
    Requests that are neither desktop web nor touch requests, or whose
    ``ServiceDomRegion`` is not ``'ru'``, are skipped.  For each remaining
    request the records below are yielded.  Record keys are tab-separated and
    start with a random salt from ``range(256)`` followed by the request type
    (``'web'`` or ``'touch'``); the middle Record argument is always ``''``:

    * ``doc / host / <host>`` and ``doc / mask / <name>`` with value ``'1'``
      for every main-block ``TWebResult`` whose host is in ``hosts`` or whose
      lower-cased URL is recognised by ``mask``;
    * ``serp / host / <host>`` and ``serp / mask / <name>`` with value ``'1'``
      once per distinct host / mask seen in the request;
    * ``serp / ALL`` with value ``'1'`` and ``doc / ALL`` with the number of
      ``TWebResult`` main results of the request.

    ``key`` is used only in the diagnostic message for a skipped session.

    ``NameError``, ``AttributeError`` and ``TypeError`` propagate.  Any other
    ``Exception`` stops processing of this session (records yielded before
    the failure stay emitted) and is reported on stderr rather than being
    dropped without a trace.
    """
    try:
        for req in libra.ParseSession(recs, 'blockstat.dict'):
            rtype = _request_type(req)
            if rtype is None:
                continue
            # Filter by region before building the salted prefix so no work
            # is spent on requests that are discarded anyway.
            if req.ServiceDomRegion != 'ru':
                continue
            # The salt is the first key field; cleankey removes it again.
            # NOTE(review): presumably it spreads the few distinct keys over
            # more reduce partitions -- confirm.
            prefix = str(random.randrange(256)) + '\t' + rtype
            foundHosts = set()
            foundMasks = set()
            nurl = 0
            for block in req.GetMainBlocks():
                res = block.GetMainResult()
                if not res.IsA('TWebResult'):
                    continue
                nurl += 1
                url = res.Url.lower()
                host = getHost(url)
                if host in hosts:
                    yield Record(prefix + '\tdoc\thost\t' + host, '', '1')
                    foundHosts.add(host)
                m = mask(url)
                if m:
                    yield Record(prefix + '\tdoc\tmask\t' + m, '', '1')
                    foundMasks.add(m)
            # Per-request ("serp") counters: one record per distinct hit.
            for host in foundHosts:
                yield Record(prefix + '\tserp\thost\t' + host, '', '1')
            for m in foundMasks:
                yield Record(prefix + '\tserp\tmask\t' + m, '', '1')
            yield Record(prefix + '\tserp\tALL', '', '1')
            yield Record(prefix + '\tdoc\tALL', '', str(nurl))
    except (NameError, AttributeError, TypeError):
        # These indicate programming errors in this script; let them surface.
        raise
    except Exception as err:
        # Best effort: a session that fails for any other reason is skipped,
        # but the failure is made visible instead of being swallowed.
        import sys
        sys.stderr.write('getData: skipping session %r: %r\n' % (key, err))

def cleankey(rec):
    """Yield ``rec`` re-keyed without the first tab-separated field of its key.

    The removed field is presumably the random salt that getData puts in
    front of every key.  The value is passed through unchanged and the middle
    Record argument is ''.  A key without a tab raises ValueError from the
    unpacking.
    """
    pieces = rec.key.split('\t', 1)
    _salt, unsalted_key = pieces
    yield Record(unsalted_key, '', rec.value)

# Start the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

