#!/usr/bin/env python2
# coding: utf-8
from mapreducelib import Record, TemporaryTable, MapReduce as MR
import re
from datetime import datetime, timedelta
import argparse
import sys
import random
import json

URLS_PER_DAY = 10000000 # Lower bound on the number of documents fetched in one day
# Keyword arguments forwarded to MR.useDefaults() in main().
MR_PARAMS = {
        'username' : 'snippets',
        'server': 'cedar00.search.yandex.net:8013'
        }

def parse_args():
    """Parse the command-line options of the script.

    Options:
        -u/--urls      number of URLs to extract (int, default 10000)
        -s/--start     start date in YYYYMMDD form (required)
        -n/--ndays     number of days to process (int, default 1)
        -d/--distinct  flag; when given, only distinct URLs are output

    Returns:
        argparse.Namespace with attributes ``urls``, ``start`` (a datetime),
        ``ndays`` and ``distinct``.

    An unparsable start date makes argparse report the error and exit.
    """
    def date_parser(s):
        """Convert a YYYYMMDD string to a datetime for argparse's ``type=``."""
        try:
            return datetime.strptime(s, '%Y%m%d')
        except ValueError:
            # The error has to be raised, not returned: a returned exception
            # object would be accepted by argparse as the parsed value and
            # end up in args.start.
            raise argparse.ArgumentTypeError('Invalid date: ' + s)
    parser = argparse.ArgumentParser(description='Get a sample of URLs from Sita logs.')
    parser.add_argument('-u', '--urls', help='number of urls to extract (default: 10000)',
            type=int, default=10000)
    parser.add_argument('-s', '--start', help='start date (YYYYMMDD)',
            type=date_parser, required=True)
    parser.add_argument('-n', '--ndays', help='number of days to process (default: 1)',
            type=int, default=1)
    parser.add_argument('-d', '--distinct', help='distinct urls (default: false, i.e. duplicates are allowed)',
            action='store_true')
    return parser.parse_args()


class SitaRCAUrlExtractor(object):
    """Extracts from Sita logs the URLs of documents successfully fetched for RCA.

    Every URL found is kept with the given probability.
    """

    def __init__(self, prob_keep):
        # Probability of keeping a URL that was found.
        self.prob_keep = prob_keep

    def extract_url(self, logline):
        """Return the document URL carried by ``logline``, or None if the
        line does not describe a successfully obtained RCA document."""
        if not (' RichContentAPI ' in logline):
            return None  # not an RCA line
        inurl_match = re.search(r' inurl# ([^ ]*) ', logline)
        if not inurl_match:
            return None  # no inurl field: wrong request type
        requested = inurl_match.group(1)

        if ' RedisTmp ' in logline and ' RESPONSE CACHE ' in logline:
            # Cache response: empty data means a miss, anything else a hit.
            cache_miss = re.search(r" RESPONSE CACHE .*\[.*data=''", logline)
            return None if cache_miss else requested

        if ' UrlRichContentExtraction ' not in logline:
            return None  # wrong record type

        if ' RESPONSE ONLINEZORA ' not in logline:
            return None  # not the result of a fetch
        if not re.search(r'[ \[]httpcode=200[ \]]', logline):
            return None  # the fetch failed
        url_match = re.search(r' url# ([^ ]*) ', logline)
        if not url_match:
            # A URCE action is expected to carry a url; report the odd line.
            print >>sys.stderr, 'No url!', logline
            return None
        fetched = url_match.group(1)
        # A url different from inurl means an image was fetched, not the document.
        return fetched if fetched == requested else None

    def __call__(self, recs):
        """Yield a Record('0', <random key>, url) for each kept URL found in
        the ``value`` of the given records."""
        for record in recs:
            found = self.extract_url(record.value)
            if not found:
                continue
            if not (random.random() < self.prob_keep):
                continue
            yield Record('0', str(random.random()), found)

def emitUrl(url):
    """Print one JSON object describing ``url`` on a line of its own."""
    payload = {'source': 'sita_rca_images', 'url': url}
    line = json.dumps(payload)
    # Single parenthesised argument: same output as the bare print statement.
    print(line)

def main():
    """Collect sampled URLs from the Sita logs of the requested days into a
    temporary table and print the requested number of them via emitUrl()."""
    args = parse_args()
    # Per-URL keep probability: ten times the requested count divided by the
    # assumed number of documents over the whole period.
    assumed_total = URLS_PER_DAY * args.ndays
    keep_prob = (10.0 * args.urls) / assumed_total
    extractor = SitaRCAUrlExtractor(keep_prob)
    MR.useDefaults(verbose=True, **MR_PARAMS)
    with TemporaryTable(project='likhomanov') as tmp:
        for offset in xrange(args.ndays):
            day = args.start + timedelta(days=offset)
            source = 'sita_log/' + day.strftime('%Y%m%d')
            MR.runCombine(extractor, srcTable=source, dstTable=tmp, appendMode=True)
            # Defragment the destination table after every fifth appended day.
            if (offset + 1) % 5 == 0:
                MR.defragTable(tmp)
        # NOTE(review): records carry a random string as their second field;
        # presumably sorting on it shuffles the table - confirm against the
        # MapReduce library.
        MR.sortTable(tmp)
        wanted = args.urls
        if not args.distinct:
            # Duplicates allowed: take the first `wanted` records as they come.
            for rec in MR.getSample(tmp, count=wanted):
                emitUrl(rec.value)
        else:
            # Read until `wanted` different URLs have been collected.
            unique_urls = set()
            for rec in MR.getSample(tmp):
                unique_urls.add(rec.value)
                if len(unique_urls) == wanted:
                    break
            for url in unique_urls:
                emitUrl(url)


# Run the sampler only when the file is executed as a script.
if __name__ == '__main__':
    main()

