#!/usr/bin/env python2
# coding: utf-8
from mapreducelib import Record, TemporaryTable, MapReduce as MR
from mymrutils import *
import re
import sys
from hashlib import md5

def main():
    """Run the whole pipeline and write the result to ``likhomanov/rca_215``.

    For every date string produced by
    ``strdaterange((2015, 6, 1), (2015, 7, 1))`` the matching ``sita_log/<date>``
    table is parsed with ``SitaParser`` and appended to a temporary table,
    which is then collapsed in place with ``summarize``. After all days are
    processed, the temporary table is re-keyed with ``presort`` and reduced
    with ``Limiter(10000)`` into the destination table.
    """
    MR.useDefaults(
        username='snippets',
        server='cedar00.search.yandex.net:8013',
        verbose=True,
    )
    result_table = 'likhomanov/rca_215'
    url_parser = SitaParser()
    with mktmp() as scratch:
        for day in strdaterange((2015, 6, 1), (2015, 7, 1)):
            day_table = 'sita_log/{}'.format(day)
            # Append the day's matches, then immediately re-aggregate so the
            # scratch table holds one summed record per key after each day.
            MR.runCombine(url_parser, srcTable=day_table, dstTable=scratch, appendMode=True)
            MR.runReduce(summarize, srcTable=scratch, dstTable=scratch)
        MR.runCombine(presort, srcTable=scratch, dstTable=scratch)
        # NOTE(review): Limiter comes from mymrutils; presumably it caps the
        # number of records kept per key at 10000 - confirm.
        MR.runReduce(Limiter(10000), srcTable=scratch, dstTable=result_table)


class SitaParser(object):
    """Callable that pulls yadi.sk and dl.dropboxusercontent.com URLs out of log records.

    An instance is passed to ``MR.runCombine`` in ``main``: it is called with
    an iterable of records and yields one output record for every record
    whose value is a log line carrying a matching ``inurl#`` field.
    """

    # The inurl value is the run of non-space characters between " inurl# "
    # and the following space.
    _INURL_PATTERN = r' inurl# ([^ ]*) '

    # (target label, URL pattern) pairs, tried in order with re.match, so each
    # pattern is anchored at the start of the URL. The dots in the host names
    # are escaped so they match a literal "." only; unescaped, "yadi.sk" would
    # also accept hosts such as "yadiXsk". Patterns are kept as plain strings
    # (the re module caches compiled patterns internally).
    _TARGET_PATTERNS = (
        ('yadisk', r'(https?://)?yadi\.sk/.+'),
        ('dropbox', r'(https?://)?dl\.dropboxusercontent\.com/.+'),
    )

    def extract_url(self, logline):
        """Return ``(target, url)`` for a relevant log line, otherwise ``None``.

        A line is relevant when it contains both ``': rca '`` and
        ``'REQUEST ['``, contains neither ``'param is invalid'`` nor
        ``'param is missing'``, and has an ``inurl#`` field whose value matches
        one of the known target patterns. ``target`` is ``'yadisk'`` or
        ``'dropbox'``; ``url`` is the raw ``inurl#`` value.

        Lines that pass the substring filters but have no ``inurl#`` field are
        echoed to stderr.
        """
        if ': rca ' not in logline or 'REQUEST [' not in logline:
            return None
        if 'param is invalid' in logline or 'param is missing' in logline:
            return None
        found = re.search(self._INURL_PATTERN, logline)
        if not found:
            # No inurl field: not the request type we are looking for.
            # Same output as the Python 2 statement `print >>sys.stderr, logline`,
            # but spelled so it also runs under print_function / Python 3.
            sys.stderr.write(logline + '\n')
            return None
        inurl = found.group(1)
        for tgt, pattern in self._TARGET_PATTERNS:
            if re.match(pattern, inurl):
                return (tgt, inurl)
        return None

    def __call__(self, recs):
        """Yield one ``Record`` per input record whose value holds a matching URL.

        The output key is the target label and the MD5 hex digest of the URL
        joined by a tab, the second field is left empty, and the value is the
        count ``1`` and the URL joined by a tab.
        """
        for rec in recs:
            res = self.extract_url(rec.value)
            if res is None:
                continue
            tgt, url = res
            yield Record(tgt + '\t' + md5(url).hexdigest(), '', '1\t' + url)

def summarize(key, recs):
    """Collapse all records of one key into a single record with a summed count.

    Each record value is expected to be a count and a URL separated by a tab.
    The counts of all records are added up; the URL is taken from the first
    record only. Yields exactly one ``Record`` with the given key, an empty
    second field and the total count and URL joined by a tab.

    ``recs`` must be an iterator, since its first element is taken with ``next``.
    """
    head = next(recs)
    first_count, url = head.value.split('\t', 1)
    # The remaining records contribute only their count field.
    total = int(first_count) + sum(
        int(rec.value.split('\t', 1)[0]) for rec in recs
    )
    yield Record(key, '', '\t'.join([str(total), url]))

def presort(recs):
    """Re-key records by target, with an inverted count as the second field.

    The input key is split at its first tab and only the leading part (the
    target label) is kept as the new key. The count at the start of the value
    is turned into ``1000000000 - count``, so larger counts give smaller
    numbers. The value is passed through unchanged.

    NOTE(review): presumably the second field is a subkey the framework sorts
    as a string, so that the following reduce sees the highest counts first -
    confirm. String order matches numeric order only while counts stay within
    1..900000000, where the inverted number always has nine digits.
    """
    for rec in recs:
        target, _digest = rec.key.split('\t', 1)
        count, _url = rec.value.split('\t', 1)
        inverted = 1000000000 - int(count)
        yield Record(target, str(inverted), rec.value)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

