#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import urllib
import statbox.qb
import statbox.qb.filters2 as sf
import statbox.qb.extractors2 as se
from statbox import mrlib
from statbox import mrtools
from collections import defaultdict


def combined_raw_map(lines):
    """Map step: turn raw log lines into records keyed by ``(date, stid)``.

    The lines are parsed by a ``statbox.qb.QB`` instance configured for the
    'default' log with the ``date`` field plus the ``stid``, ``can_clean``,
    ``status``, ``stid_size`` and ``stid_source`` log fields.

    For every row produced by ``QB.map_lines`` one ``mrlib.DummyRecord`` is
    yielded with:

    * key   -- ``(date, stid)``
    * an empty string as the second positional argument
      (presumably the subkey -- confirm against mrlib)
    * value -- ``(can_clean == 'True', status, str(stid_size), stid_source)``
    """
    log_field_names = ('stid', 'can_clean', 'status', 'stid_size', 'stid_source')
    requested_fields = ['date']
    for field_name in log_field_names:
        requested_fields.append(se.log_field(field_name))

    parser = statbox.qb.QB(log='default', fields=requested_fields)

    for row in parser.map_lines(lines):
        record_key = (row['date'], row['stid'])
        record_value = (
            # The flag is compared against the literal text 'True', so the
            # emitted value is always a real boolean.
            row['can_clean'] == 'True',
            row['status'],
            str(row['stid_size']),
            row['stid_source'],
        )
        yield mrlib.DummyRecord(record_key, '', record_value)


@mrlib.combiner(mrtools.reduce_aggregator(group_by_subkeys=False, do_eval=False, processor=mrtools.listed_add))
def reduce_stids(key, raw_records):
    """Reduce step: collapse all hits of one key into a single counter tuple.

    ``key`` is the textual form of a 2-tuple whose first element is the date;
    the second element is ignored here.  Each record's ``value`` is the
    textual form of ``(can_clean, status, size, stid_source)``.

    Two ``mrlib.Record`` objects are yielded with the same value tuple, one
    keyed by ``(date, <last non-empty stid_source, or "None">)`` and one keyed
    by ``(date, '_total_')``.  The value tuple is::

        (hits,                    # number of input records
         1,                       # constant: this key counts once
         hits_500,                # records whose status starts with '5'
         1 if hits_500 > 0 else 0,
         can_clean_true,          # 1 if any record was cleanable, else 0
         can_clean_false,         # 1 if any record was not cleanable, else 0
         stid_clean_true_size,    # last numeric size among cleanable records
         stid_clean_false_size)   # last numeric size among the other records
    """
    date, _ignored = eval(key)

    total_hits = 0
    server_error_hits = 0
    source_label = "None"
    # Both dicts are indexed by "is this record cleanable?".
    seen_flag = {True: 0, False: 0}
    last_size = {True: 0, False: 0}

    for record in raw_records:
        can_clean, status, size, stid_source = eval(record.value)

        total_hits += 1
        server_error_hits += 1 if status.startswith('5') else 0

        # Keep the most recent non-empty source.
        source_label = stid_source or source_label

        cleanable = can_clean in (True, 'True')
        seen_flag[cleanable] = 1
        if size.isdigit():
            # Sizes are overwritten, not summed: the last numeric one wins.
            last_size[cleanable] = int(size)

    payload = (
        total_hits,
        1,
        server_error_hits,
        1 if server_error_hits > 0 else 0,
        seen_flag[True],
        seen_flag[False],
        last_size[True],
        last_size[False],
    )
    for output_source in (source_label, '_total_'):
        yield mrlib.Record((date, output_source), '', payload)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0 for a zero denominator."""
    return float(numerator) / denominator if denominator else 0


def reduce_final(key, raw_records):
    """Final reduce step: sum the per-record counters and derive the ratios.

    ``key`` is the textual form of a ``(date, stid_source)`` tuple.  Each
    record's ``value`` is the textual form of an 8-element sequence that is
    summed position by position into::

        hits, stids, hits_500, stids_500,
        clean_true, clean_false, deleted_size, not_deleted_size

    Derived fields added on top of the sums:

    * ``k_recheck``       -- ``1 - stids / hits``
    * ``k_recheck_strg``  -- ``hits_500 / hits``
    * ``k_dedup``         -- ``clean_false / stids``
    * ``k_retry``         -- ``hits_500 / stids_500``
    * ``k_dedup_by_size`` -- ``not_deleted_size / (deleted_size + not_deleted_size)``

    Every ratio is reported as 0 when its denominator is 0 instead of raising
    ``ZeroDivisionError``; in particular both size sums are 0 whenever no
    input record carried a non-zero size.

    Yields a single ``mrlib.SimpleTSKVRecord`` with ``fielddate``,
    ``stid_source`` and all of the counters above as keyword fields.
    """
    # NOTE(review): eval() executes whatever expression the key/value text
    # contains.  That is only safe while these strings are produced by this
    # job's own earlier stages; consider ast.literal_eval if that ever changes.
    date, stid_source = eval(key)
    count = defaultdict(int)
    for r in raw_records:
        values = eval(r.value)
        count['hits'] += values[0]
        count['stids'] += values[1]
        count['hits_500'] += values[2]
        count['stids_500'] += values[3]
        count['clean_true'] += values[4]
        count['clean_false'] += values[5]
        count['deleted_size'] += values[6]
        count['not_deleted_size'] += values[7]

    hits = count['hits']
    stids = count['stids']
    total_size = count['deleted_size'] + count['not_deleted_size']

    # Guarded explicitly: with no hits "1 - 0" would misleadingly report 1.
    count['k_recheck'] = (1 - float(stids) / hits) if hits else 0
    count['k_recheck_strg'] = _safe_ratio(count['hits_500'], hits)
    count['k_dedup'] = _safe_ratio(count['clean_false'], stids)
    count['k_retry'] = _safe_ratio(count['hits_500'], count['stids_500'])
    count['k_dedup_by_size'] = _safe_ratio(count['not_deleted_size'], total_size)

    yield mrlib.SimpleTSKVRecord(
        fielddate=date,
        stid_source=stid_source,
        **count
    )


# Script entry point: only runs when the file is executed directly, not on import.
# NOTE(review): mrtools.run() presumably dispatches to the map/reduce functions
# defined in this module -- confirm against the mrtools documentation.
if __name__ == '__main__':
    mrtools.run()
