#!/usr/bin/env python2
# coding: utf-8
from mapreducelib import Record, TemporaryTable, MapReduce as MR
from mymrutils import *
import re
import sys
from hashlib import md5
import random

def main():
    """Build the sampled RCA URL table from the daily Sita log tables.

    Every 'sita_log/<date>' table produced by strdaterange((2015, 6, 1),
    (2015, 7, 1)) is run through SitaParser and appended to one temporary
    table, which is then reduced with Limiter(10000) into the destination.
    """
    MR.useDefaults(username='snippets', server='cedar00.search.yandex.net:8013', verbose=True)
    destination = 'likhomanov/rca_216'
    sita_parser = SitaParser()
    with mktmp() as scratch:
        daily_tables = [
            'sita_log/{}'.format(day)
            for day in strdaterange((2015, 6, 1), (2015, 7, 1))
        ]
        for table in daily_tables:
            MR.runCombine(sita_parser, srcTable=table, dstTable=scratch, appendMode=True)
            # Defragment the scratch table whenever the source name ends in
            # '5' or '0' (presumably every fifth day, given date-suffixed
            # table names -- confirm against strdaterange's output format).
            needs_defrag = table.endswith('5') or table.endswith('0')
            if needs_defrag:
                MR.defragTable(scratch)
        MR.runReduce(Limiter(10000), srcTable=scratch, dstTable=destination)


class SitaParser(object):
    """Callable that extracts and samples RCA request URLs from Sita log records.

    Instances are passed to MR.runCombine: calling one with an iterable of
    records yields a Record for a random subset of the records whose value
    is an RCA request line carrying the expected API key and an 'inurl#'
    field.
    """

    # Marker of the API key whose requests we are interested in.
    RCA_KEY = 'key=rca.1.1.20140801T061641Z.7fa8bd3dc18062f5.b1bb3025967b1f2435a0d9ca96437b5e62f52be2'

    # Fraction of matching records that is kept by __call__.
    SAMPLE_RATE = 0.01

    # Compiled once instead of being looked up on every log line.  The URL is
    # the space-delimited token following ' inurl# '.
    INURL_RE = re.compile(r' inurl# ([^ ]*) ')

    def extract_url(self, logline):
        """Return the 'inurl#' value of an RCA request log line, or None.

        None is returned when the line is not an RCA request, reports an
        invalid/missing parameter, was not made with RCA_KEY, or has no
        'inurl#' field.  In the last case the line is also echoed to stderr.
        """
        if ': rca ' not in logline or 'REQUEST [' not in logline:
            return None
        if 'param is invalid' in logline or 'param is missing' in logline:
            return None
        if self.RCA_KEY not in logline:
            return None
        match = self.INURL_RE.search(logline)
        if match is None:
            # No inurl field: this is not the request type we are after.
            # sys.stderr.write is used instead of the Python-2-only
            # `print >>` statement so the code runs on Python 2 and 3 alike.
            sys.stderr.write('%s\n' % (logline,))
            return None
        return match.group(1)

    def __call__(self, recs):
        """Yield a Record for a random SAMPLE_RATE share of matching records.

        Each emitted record has the constant key '0', a random number
        (as a string) as its second field and the extracted URL as its
        third.  NOTE(review): the random second field presumably shuffles
        the records before the downstream limiter -- confirm.
        """
        for rec in recs:
            url = self.extract_url(rec.value)
            if url is None:
                continue
            if random.random() > self.SAMPLE_RATE:
                continue
            yield Record('0', str(random.random()), url)

# Run the job only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

