#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy

from urlparse import urlparse
from collections import Counter

import statbox.qb
import statbox.qb.extractors2 as se
import statbox.qb.filters2 as sf
from statbox import mrlib
from statbox import mrtools


def get_qb():
    """Build the ``statbox.qb.QB`` object used by the map stages of this job.

    The QB reads the ``access-log`` log, keeps rows whose ``page`` matches one
    of the listed top-level path prefixes and whose ``ycrid`` is defined, and
    exposes two derived fields in addition to the raw ones:

    * ``source_host``   -- hostname parsed out of ``source_uri``;
    * ``source_groups`` -- list of group labels derived from ``ycrid``.
    """
    # Known ycrid prefixes mapped to the extra aggregate groups they belong to.
    # The table is only ever read, so it is built once per QB rather than once
    # per row.
    extra_groups_by_prefix = {
        'ios': ['g_mobile'],
        'andr': ['g_mobile'],
        'wp': ['g_mobile'],
        'win': ['g_desktop'],
        'mac': ['g_desktop'],
        'lnx': ['g_desktop'],
        'web': [],
        '-': ['g_mpfs'],
        'mpfs': ['g_mpfs'],
        'dav': [],
        'rest': [],
        'sdk': [],
        'sw': [],
        'xmpp': [],
    }

    def get_groups(ycrid):
        """Return the group labels a request with this ``ycrid`` counts towards.

        The list always starts with ``'*'``.  A known prefix contributes itself
        plus its aggregate groups; anything else contributes ``'other'``.
        """
        # A bare '-' is a prefix of its own; splitting it on '-' would give ''.
        prefix = '-' if ycrid == '-' else ycrid.partition('-')[0]

        if prefix not in extra_groups_by_prefix:
            return ['*', 'other']
        return ['*', prefix] + extra_groups_by_prefix[prefix]

    fields = [
        'date',
        'datetime',
        'page',
        'status',
        se.log_fields('request_time', 'method', 'source_uri', 'ycrid'),
        se.custom('source_host', lambda x: urlparse(x).hostname, args=('source_uri',),),
        se.custom('source_groups', get_groups, args=('ycrid',)),
    ]
    filters = [
        sf.search('page', r'^/(json|billing|service|support|desktop|mail)/'),
        sf.defined('ycrid'),
    ]
    return statbox.qb.QB(log='access-log', fields=fields, filters=filters)


@mrlib.combiner(mrtools.reduce_aggregator(group_by_subkeys=True, do_eval=False))
def combined_raw_map(lines):
    """Map raw log lines to one hit record per (row, source group).

    Each emitted record has key ``(date, page, status, group)``, the first 16
    characters of ``datetime`` as subkey and a value of ``1``.
    """
    # NOTE(review): the 16-character prefix presumably truncates the timestamp
    # to minute resolution ('YYYY-MM-DD HH:MM') -- confirm the datetime format.
    for row in get_qb().map_lines(lines):
        for source_group in row['source_groups']:
            record_key = (row['date'], row['page'], row['status'], source_group)
            time_bucket = row['datetime'][:16]
            yield mrlib.Record(record_key, time_bucket, 1)


@mrlib.combiner(mrtools.reduce_aggregator(group_by_subkeys=False, do_eval=False))
def combined_raw_map_request_time(lines):
    """Map raw log lines to one record per (row, source group, request time).

    Each emitted record has key ``(date, page, status, group, request_time)``
    with ``request_time`` converted to ``float``, an empty subkey and a value
    of ``1``.  Rows whose ``request_time`` is missing or not numeric are
    skipped.
    """
    qb = get_qb()
    for r in qb.map_lines(lines):
        # Convert once per row, not once per group.  float() raises TypeError
        # for None and ValueError for a non-numeric string; either way the row
        # has no usable timing and is dropped.  Only the conversion is guarded,
        # so errors raised while building records are not hidden.
        try:
            rt = float(r['request_time'])
        except (TypeError, ValueError):
            continue

        for group in r['source_groups']:
            yield mrlib.Record((r['date'], r['page'], r['status'], group, rt), '', 1)


@mrlib.combiner(mrtools.reduce_aggregator(group_by_subkeys=True, do_eval=True))
def reduce_sum(key, raw_records):
    """Sum the values of all records sharing ``key`` and re-key the result.

    ``key`` is the string form of a 5-tuple
    ``(date, page, status, host, request_time)``.  One record is emitted with
    key ``(date, page, status, host)``, the request time as subkey and the
    summed count as value.
    """
    # NOTE(review): eval() executes the key as Python code; this assumes keys
    # are only ever produced by this job's own map stage -- confirm.
    date, page, status, host, rt = eval(key)
    hits = sum(int(record.value) for record in raw_records)
    yield mrlib.DummyRecord((date, page, status, host), rt, hits)


def _percentile(value_num, percentiles, total_values=None):
    if not total_values:
        total_values = sum(value_num.itervalues())
    keys = value_num.keys()
    keys.sort()

    percentiles_pos = [int(p / 100.0 * total_values) for p in percentiles]
    cur_pos = 0
    percentiles = []
    for key in keys:
        next_pos = cur_pos + value_num[key]
        for percentile_pos in percentiles_pos[:]:
            if cur_pos <= percentile_pos <= next_pos:
                percentiles_pos.pop(0)
                percentiles.append(key)
            else:
                break
        cur_pos = next_pos
    return percentiles


def reduce_hits_request_time(key, raw_records):
    """Summarise the request-time distribution for one (date, page, status, host).

    :param key: string form of the 4-tuple ``(date, page, status, host)``.
    :param raw_records: records whose ``subkey`` is the string form of a
        request time and whose ``value`` is the number of hits with that time.
    :return: generator yielding at most one ``mrlib.SimpleTSKVRecord`` with
        the hit count plus min / max / average / q25..q95 / interquartile
        range of the request time.  Nothing is yielded for 10 hits or fewer.
    """
    # NOTE(review): eval() executes the key and subkeys as Python code; this
    # assumes they are only ever produced by this job's own earlier stages --
    # confirm before feeding it anything else.
    date, page, status, host = eval(key)

    total_hits = 0
    total_duration = 0
    value_num = {}
    for r in raw_records:
        rt = eval(r.subkey)
        hits = int(r.value)
        # Accumulate rather than overwrite so the histogram always adds up to
        # total_hits even if the same request time arrives in several records.
        value_num[rt] = value_num.get(rt, 0) + hits
        total_hits += hits
        # Each record stands for `hits` requests that took `rt` each, so the
        # duration has to be weighted by the hit count.
        total_duration += rt * hits

    result = {
        'fielddate': date,
        'page': page,
        'status': status,
        'host': host,
        'requests': total_hits
    }
    if total_hits > 10:
        result['max'] = max(value_num)
        result['min'] = min(value_num)
        # float() keeps this a true division even when every rt is an integer.
        result['average'] = float(total_duration) / total_hits
        (result['q25'], result['q50'], result['q75'],
         result['q90'], result['q95']) = _percentile(
            value_num, [25, 50, 75, 90, 95], total_values=total_hits)
        result['interquartile'] = result['q75'] - result['q25']
        yield mrlib.SimpleTSKVRecord(**result)


def reduce_hits(key, raw_records):
    """Summarise the per-subkey hit counts for one (date, page, status, host).

    ``key`` is the string form of the 4-tuple ``(date, page, status, host)``.
    Record values are summed per ``subkey``; when the overall total exceeds 10
    a single ``mrlib.SimpleTSKVRecord`` is yielded carrying the total plus
    max / min / average / q25..q95 / interquartile range of the per-subkey
    sums.  Otherwise nothing is yielded.
    """
    # NOTE(review): eval() executes the key as Python code; this assumes keys
    # are only ever produced by this job's own map stage -- confirm.
    date, page, status, host = eval(key)

    # NOTE(review): subkeys are presumably minute-resolution timestamps, which
    # would make these requests-per-minute figures -- confirm against the
    # producing map stage.
    hits_by_subkey = Counter()
    for record in raw_records:
        hits_by_subkey[record.subkey] += int(record.value)

    counts = hits_by_subkey.values()
    total = sum(counts)
    if not total > 10:
        return

    highest = numpy.max(counts)
    lowest = numpy.min(counts)
    mean = numpy.average(counts)
    q25, q50, q75, q90, q95 = numpy.percentile(counts, [25, 50, 75, 90, 95])
    yield mrlib.SimpleTSKVRecord(
        fielddate=date,
        page=page,
        status=status,
        host=host,
        requests=total,
        max=highest,
        min=lowest,
        average=mean,
        q25=q25,
        q50=q50,
        q75=q75,
        q90=q90,
        q95=q95,
        interquartile=q75 - q25
    )


# Script entry point: when the file is executed directly, hand control to
# mrtools.run().
# NOTE(review): presumably mrtools.run() selects and launches the map/reduce
# functions defined above -- confirm against the mrtools documentation.
if __name__ == '__main__':
    mrtools.run()
