#!/usr/bin/env python2
# coding: utf-8

import re
from datetime import datetime, timedelta
import argparse
import sys
import random
import json
from itertools import islice
import yt.wrapper as yt

URLS_PER_DAY = 100000 # Lower-bound estimate of the number of documents fetched in one day

def parse_args():
    """Parse the command-line options of the URL sampler.

    Options are read from sys.argv by argparse.

    Returns:
        argparse.Namespace with the attributes
        urls (int, default 10000), start (datetime, required),
        ndays (int, default 1) and distinct (bool, default False).

    On invalid input argparse prints a usage error and exits the process.
    """
    def date_parser(s):
        """Convert a YYYYMMDD string into a datetime."""
        try:
            return datetime.strptime(s, '%Y%m%d')
        except ValueError:
            # The error has to be raised, not returned: argparse treats any
            # returned object as the successfully converted value.
            raise argparse.ArgumentTypeError('Invalid date: ' + s)

    parser = argparse.ArgumentParser(description='Get a sample of URLs from Sita logs.')
    parser.add_argument('-u', '--urls', help='number of urls to extract (default: 10000)',
            type=int, default=10000)
    parser.add_argument('-s', '--start', help='start date (YYYYMMDD)',
            type=date_parser, required=True)
    parser.add_argument('-n', '--ndays', help='number of days to process (default: 1)',
            type=int, default=1)
    parser.add_argument('-d', '--distinct', help='distinct urls (default: false, i.e. duplicates are allowed)',
            action='store_true')
    return parser.parse_args()


class ZenUrlExtractor(object):
    """Extract the URLs shown to the user from Zen log records.

    Every extracted URL is kept with the configured probability.
    """

    def __init__(self, prob_keep):
        # Probability of keeping each URL that is found.
        self.prob_keep = prob_keep

    def extract_urls(self, logrec):
        """Yield each URL listed in a 'show' record.

        Records with any other event type yield nothing.  Because this is a
        generator, the ValueError for a malformed record (missing 'ev',
        'data', 'urls' or a per-item 'url') is raised during iteration.
        """
        if 'ev' not in logrec:
            # A record without an event type.
            raise ValueError('No ev')
        if logrec['ev'] == 'show':
            if 'data' not in logrec:
                # For ev == show the URLs are expected inside 'data'.
                raise ValueError('No data')
            payload = json.loads(logrec['data'])
            if 'urls' not in payload:
                raise ValueError('No urls')
            for entry in payload['urls']:
                if 'url' not in entry:
                    raise ValueError('Broken url')
                yield entry['url']

    def __call__(self, rec):
        """Yield {'rnd': <random float>, 'url': <url>} for each sampled URL.

        A ValueError raised while extracting is reported to stderr together
        with the record, and processing of that record stops; rows yielded
        before the error are not retracted.
        """
        try:
            for url in self.extract_urls(rec):
                keep = random.random() < self.prob_keep
                if keep:
                    # 'rnd' is an independent draw stored alongside the URL.
                    yield {'rnd': random.random(), 'url': url}
        except ValueError as err:
            print >>sys.stderr, err, rec

def emitUrl(url):
    """Write one sampled URL to stdout as a single-line JSON object."""
    record = {'source': 'zen', 'url': url}
    sys.stdout.write(json.dumps(record) + '\n')

def main():
    """Sample URLs from the daily Zen log tables and print them as JSON lines.

    Steps:
      1. Map every requested day's log table through ZenUrlExtractor,
         appending the kept rows to one temporary YT table.
      2. Sort that table by the random 'rnd' column.
      3. Read rows from the top and emit either the first args.urls rows
         or, with --distinct, up to args.urls unique URLs.

    The temporary table is removed on exit, including when a step fails.
    """
    args = parse_args()
    if args.ndays < 1:
        # Zero would divide by zero below; a negative value processes nothing.
        sys.exit('--ndays must be a positive integer')

    # Oversample: the factor 10.0 targets about ten times the requested
    # count when each day holds URLS_PER_DAY URLs.
    prob = (10.0 * args.urls) / (URLS_PER_DAY * args.ndays)
    parser = ZenUrlExtractor(prob)
    needed = args.urls

    tmp_name = yt.create_temp_table(path='//tmp', prefix='RCA_')
    try:
        print >>sys.stderr, 'Temp table name:', tmp_name
        # append=True so that each day's map output accumulates in the table.
        tmp = yt.TablePath(tmp_name, append=True)
        day = args.start
        for _ in xrange(args.ndays):
            yt.run_map(parser, '//statbox/zen-stats-log/' + day.strftime('%Y-%m-%d'), tmp)
            day += timedelta(days=1)
        yt.run_sort(tmp_name, sort_by='rnd')

        if args.distinct:
            seen = set()
            for rec in yt.read_table(tmp_name, format=yt.JsonFormat()):
                seen.add(json.loads(rec)['url'])
                if len(seen) >= needed:
                    break
            for url in seen:
                emitUrl(url)
        else:
            for rec in yt.read_table(yt.TablePath(tmp_name, end_index=needed), format=yt.JsonFormat()):
                emitUrl(json.loads(rec)['url'])
    finally:
        # Do not leave the scratch table behind when an operation fails.
        yt.remove(tmp_name)

# Run the sampler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

