#!/usr/bin/env python2
# coding: utf-8


from mapreducelib import Record, TemporaryTable, MapReduce as MR
from logparse import parseReqans
from mymrutils import *
from itertools import islice
import re
from hashlib import md5
import random

# Substrings looked for in the query text; a query containing any of them
# makes gettorrents() emit a 'word' record.
words = [
    'torrent',
    'торрент',
    'скачать',
]

# Hosts matched against the hosts of result URLs; a match on a URL with a
# non-root inner path makes gettorrents() emit a 'site' record.
sites = [
    'rutracker.org',
    'fast-torrent.ru',
    'serialu.net',
    'rutor.org',
    'torrentino.com',
    'tfile.ru',
    'katushka.net',
    'bigtorrent.org',
    'opensharing.org',
    'megashara.com',
    'nnm-club.me',
]


def main():
    """Run the whole pipeline: map daily logs, count queries, keep the top.

    Steps, in order:

    1. Configure the MapReduce client defaults (user, server, verbosity).
    2. Inside a temporary table obtained from ``mktmp()``, run ``gettorrents``
       as a map over every ``reqans_log/<date>`` table for the dates produced
       by ``strdaterange((2015, 3, 9), (2015, 3, 16))``, appending each day's
       output to the temporary table and calling ``mrsort`` on it afterwards.
    3. Reduce the temporary table in place with ``countReq``.
    4. Reduce it again with ``Limiter(200)`` into the destination table.

    ``mktmp``, ``strdaterange``, ``mrsort`` and ``Limiter`` are not defined in
    this file (they arrive through the star import); presumably ``Limiter(200)``
    keeps at most 200 records per key -- TODO confirm. Whether the end date of
    the range is inclusive is likewise decided by ``strdaterange``.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    destination = 'likhomanov/torrent_req_top'

    with mktmp() as scratch:
        # Build the full list of source tables up front, before any map runs.
        daily_tables = [
            'reqans_log/{}'.format(day)
            for day in strdaterange((2015, 3, 9), (2015, 3, 16))
        ]
        for table in daily_tables:
            MR.runMap(gettorrents, srcTable=table, dstTable=scratch, appendMode=True)
            # NOTE(review): the sort is repeated after every daily append; if
            # one sort after the loop is sufficient it could be hoisted --
            # confirm how appendMode interacts with table sortedness.
            mrsort(scratch)

        # Collapse each 'word <md5>' / 'site <md5>' group into a single record.
        MR.runReduce(countReq, srcTable=scratch, dstTable=scratch)
        # NOTE(review): no explicit sort happens between the two reduces;
        # verify that Limiter sees records in the intended subkey order.
        MR.runReduce(Limiter(200), srcTable=scratch, dstTable=destination)


def gettorrents(rec):
    """Map step: emit marker records for queries that look torrent-related.

    ``rec.value`` is parsed with ``parseReqans`` into a request and a sequence
    of results. Requests whose ``serpLang`` is not ``'ru'`` yield nothing.
    For the remaining ones, up to two records are produced, each built as
    ``Record(<tag> + <md5 hex of the query>, '', <query>)``:

    * tag ``'word '`` -- the query contains at least one substring listed in
      ``words``;
    * tag ``'site '`` -- at least one result has a ``'url'`` whose host is in
      ``sites`` and whose inner path is neither ``None``, ``''`` nor ``'/'``.

    The 'word' record, if any, is always yielded before the 'site' record.
    """
    request, results = parseReqans(rec.value)
    # The return value is ignored here. NOTE(review): presumably checkwww
    # validates the request and raises on failure -- confirm.
    checkwww(request)
    if serpLang(request) != 'ru':
        return

    query = request['req']
    digest = md5(query).hexdigest()

    # Sampling via `random.random() < 0.01` is currently disabled here.
    if any(word in query for word in words):
        yield Record('word ' + digest, '', query)

    # Emit at most one 'site' record: stop at the first qualifying result.
    # Sampling via `random.random() < 0.01` is currently disabled here too.
    for result in results:
        url = result.get('url')
        if not url:
            continue
        if getHost(url) not in sites:
            continue
        if getInnerPath(url) in (None, '', '/'):
            # Bare host / root page: does not count as a hit.
            continue
        yield Record('site ' + digest, '', query)
        return

def countReq(key, recs):
    """Reduce step: collapse one key's records into a single counted record.

    Args:
        key: the group key; only its first four characters are kept for the
            output key (``'word'`` or ``'site'`` for keys made by
            ``gettorrents``).
        recs: iterator over the group's records; must hold at least one.

    Yields:
        One ``Record`` whose key is ``key[:4]``, whose second field is
        ``str(1000000000 - count)`` and whose value is ``'<count>\\t<query>'``,
        where ``query`` is the value of the first record in the group.

    NOTE(review): the inverted count presumably serves as a sort key so that
    larger counts come first; as a string it keeps a fixed 9-digit width only
    while the count does not exceed 900000000.
    """
    # Read the value right away, before the iterator is advanced any further.
    query = next(recs).value
    # The first record has already been consumed, hence the leading 1.
    count = 1 + sum(1 for _ in recs)
    inverted = str(1000000000 - count)
    yield Record(key[:4], inverted, '{}\t{}'.format(count, query))

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

