#!/usr/bin/env python

# -*- coding: UTF-8 -*-

import os
import re
import sys
import time
import urllib2
import json
import random
from collections import defaultdict, Counter


def http_get_json_cache(url, cache_file, cache_ttl=300, retries=1, sleep=1):
    """Fetch JSON from ``url``, using ``cache_file`` as a time-limited cache.

    The cache is considered fresh while its modification time is less than
    ``cache_ttl`` seconds old. A stale or missing cache triggers a fetch via
    ``http_get_json``; a non-empty result is written back to the cache and
    returned. If the fetch yields nothing, whatever the cache file still
    holds (even if stale) is returned instead.

    :param url: URL to download the JSON document from.
    :param cache_file: path of the cache file.
    :param cache_ttl: maximum cache age in seconds.
    :param retries: passed through to ``http_get_json``.
    :param sleep: passed through to ``http_get_json``.
    :return: decoded JSON data, or an empty dict when neither the fetch nor
        the cache produced anything usable.
    """
    try:
        cache_age = time.time() - os.path.getmtime(cache_file)
        cache_need_update = cache_age >= cache_ttl
    except OSError:
        # No readable cache file: it has to be (re)built.
        cache_need_update = True

    json_data = dict()
    if cache_need_update:
        json_data = http_get_json(url, retries, sleep)
        if json_data:
            cache_update(json.dumps(json_data), cache_file)
            # Return the fresh data directly: the cache write is best-effort
            # and may have failed, leaving an older file in place.
            return json_data

    cache_data = cache_read(cache_file)
    try:
        return json.loads(cache_data)
    except (TypeError, ValueError):
        # TypeError: cache_read signalled failure with a non-string value;
        # ValueError: the cache content is not valid JSON.
        return json_data


def http_get(url, retries, sleep):
    """Download ``url`` and return the response body.

    Makes up to ``retries`` attempts with a 10 second timeout each, pausing
    ``sleep`` seconds between consecutive attempts.

    :param url: URL to fetch.
    :param retries: maximum number of attempts.
    :param sleep: delay in seconds between attempts.
    :return: the response body when an attempt answers with HTTP 200;
        otherwise an empty dict (kept as the failure value for existing
        callers).
    """
    for attempt in xrange(retries):
        if attempt:
            # Back off before every attempt except the first one.
            time.sleep(sleep)
        try:
            http_req = urllib2.urlopen(url, timeout=10)
            try:
                if http_req.getcode() == 200:
                    return http_req.read()
            finally:
                http_req.close()
        except Exception:
            # Best effort: any network/protocol failure just means "try
            # again"; KeyboardInterrupt and SystemExit still propagate.
            pass
    return dict()


def http_get_json(url, retries, sleep):
    """Download ``url`` and decode the body as JSON.

    Makes up to ``retries`` attempts, pausing ``sleep`` seconds between
    them. Each attempt performs exactly one HTTP request, so the total
    number of requests is bounded by ``retries`` (not ``retries`` squared).

    :param url: URL to fetch.
    :param retries: maximum number of attempts.
    :param sleep: delay in seconds between attempts.
    :return: the decoded JSON value if it is non-empty, otherwise an empty
        dict.
    """
    for attempt in xrange(retries):
        if attempt:
            time.sleep(sleep)
        # A single HTTP attempt per iteration: this loop owns the retrying.
        bdata = http_get(url, 1, sleep)
        try:
            json_data = json.loads(bdata)
        except (TypeError, ValueError):
            # TypeError: http_get returned its non-string failure value;
            # ValueError: the body is not valid JSON.
            continue
        if json_data:
            return json_data
    return dict()


def cache_update(data, cache_file):
    """Write ``data`` to ``cache_file``, creating its directory if needed.

    The write is best-effort: I/O failures are ignored so that a broken
    cache never stops the caller.

    :param data: string to store.
    :param cache_file: destination path; missing parent directories are
        created (including nested ones). A bare file name is written to
        the current directory.
    :return: always an empty dict.
    """
    result = dict()
    cache_dir = os.path.dirname(cache_file)
    try:
        # An empty dirname means "current directory" - nothing to create.
        if cache_dir and not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        with open(cache_file, "w") as cache_fd:
            cache_fd.write(data)
    except (IOError, OSError):
        # Deliberately ignored: the cache is only an optimisation.
        pass
    return result


def cache_read(cache_file):
    """Return the text content of ``cache_file``.

    :param cache_file: path of the cache file to read.
    :return: the file content as a string, or an empty dict when the file
        cannot be opened or read (the non-string value is the failure
        marker existing callers rely on).
    """
    try:
        with open(cache_file, "r") as cache_fd:
            return cache_fd.read()
    except (IOError, OSError):
        return dict()


common_cache_dir = "/var/tmp/mworker-jobs-stat"

### get queller hosts
# The response is expected to be a list of host records carrying an 'fqdn'
# key (that is how it is consumed below).
conductor_quellermap_url = 'http://c.yandex-team.ru/api-cached/groups2hosts/disk_queller?format=json'
conductor_quellermap_cache_file = common_cache_dir + "/quellermap.cache"
conductor_quellermap_cache_ttl = 300
conductor_quellermap_retries = 2
conductor_quellermap_delay = 1

quellermap = http_get_json_cache(
    url=conductor_quellermap_url,
    cache_file=conductor_quellermap_cache_file,
    cache_ttl=conductor_quellermap_cache_ttl,
    retries=conductor_quellermap_retries,
    sleep=conductor_quellermap_delay
)

# http_get_json_cache falls back to an empty dict when nothing could be
# fetched or read, and an unexpected payload could be any JSON value; only a
# list can be shuffled and walked as a host list.
if not isinstance(quellermap, list):
    quellermap = []

### get task on queue mapping from queller
queller_taskmap_cache_file = common_cache_dir + "/taskmap.cache"
queller_taskmap_retries = 1
queller_taskmap_sleep = 1

taskmap = dict()
# Random order so that the requests are spread over the hosts; the first
# host that yields a non-empty map wins.
random.shuffle(quellermap)
for queller in quellermap:
    fqdn = queller.get('fqdn') if isinstance(queller, dict) else None
    if not fqdn:
        continue
    queller_taskmap_url = "http://{}:30811/z/celery-tasks.json".format(fqdn)
    taskmap = http_get_json_cache(url=queller_taskmap_url,
                                  cache_file=queller_taskmap_cache_file,
                                  retries=queller_taskmap_retries,
                                  sleep=queller_taskmap_sleep)
    if taskmap:
        break

# Task name (dots replaced by dashes) -> queue name.
task_queue = dict()
if isinstance(taskmap, dict):
    for match in taskmap.get('tasks') or []:
        try:
            task = match['id'].replace('.', '-')
            queue = match['queue']
        except (KeyError, TypeError, AttributeError):
            # Skip malformed entries instead of aborting the whole run.
            continue
        task_queue[task] = queue

# Error counters; the only bucket is 'parse' and it starts at zero.
results_errors = dict(parse=0)

# Errors collected from the logs: one zeroed counter per error kind.
results_log_errors = {}
for error in ('parse',):
    results_log_errors[error] = 0

### task stats
# counts by the "status" field: per task name and aggregated
results_count_task_status, results_count_aggr_task_status = (
    defaultdict(Counter), defaultdict(Counter))

# counts by the "task_status" field: per task name and aggregated
results_count_task_tstatus, results_count_aggr_task_tstatus = (
    defaultdict(Counter), defaultdict(Counter))

# occurrences of each lifetime / processed value, per task name
results_timings_task_lifetime, results_timings_task_processed = (
    defaultdict(Counter), defaultdict(Counter))

# running totals of lifetime / processed, per task name
results_time_task_lifetime, results_time_task_processed = (
    defaultdict(int), defaultdict(int))

### queue stats
# counts by "status" and by "task_status", per queue
results_count_queues_status, results_count_queues_tstatus = (
    defaultdict(Counter), defaultdict(Counter))

# occurrences of each lifetime / processed value, per queue
results_timings_queues_lifetime, results_timings_queues_processed = (
    defaultdict(Counter), defaultdict(Counter))

# running totals of lifetime / processed, per queue
results_time_queues_lifetime, results_time_queues_processed = (
    defaultdict(int), defaultdict(int))

### operation stats
# counts by "status" and by "task_status", per operation type
results_count_opertype_status, results_count_opertype_tstatus = (
    defaultdict(Counter), defaultdict(Counter))

# counts by operation title, per operation type (plain and aggregated)
results_count_opertype_title, results_count_aggr_opertype_title = (
    defaultdict(Counter), defaultdict(Counter))

# occurrences of each lifetime / processed value, per operation type
results_timings_opertype_lifetime, results_timings_opertype_processed = (
    defaultdict(Counter), defaultdict(Counter))

# Pattern for the task-completion message. Shape of a matching message
# (placeholders in angle brackets, derived from the pattern itself):
#   Task <id> <OK|FAIL|TEMP_FAIL> (try <n>), name: <task_name> (<oper_type>, <oper_subtype>)
#   (processed: <float> sec, lifetime: <float> sec), task_status: <word>,
#   oper_state: <word>, oper_title: <word>, oper_id: <token>
# (a single line in the log; wrapped here for readability).
# Named groups: status, task_name, oper_type, oper_subtype, processed,
# lifetime, task_status, oper_state, oper_title.
# The pattern is not anchored, so it can match anywhere inside the message.
# An earlier variant of this pattern stopped right after "lifetime: ... sec)"
# (anchored with "$") and had no task_status / oper_state / oper_title groups.

time_re = re.compile(
    r'Task [^ ]+ (?P<status>OK|FAIL|TEMP_FAIL) \(try [-\d]+\), '
    r'name: (?P<task_name>[^ ]+) \((?P<oper_type>[\w_-]+), '
    r'(?P<oper_subtype>[\w_-]+)\) \(processed: (?P<processed>\d+\.\d+) sec, '
    r'lifetime: (?P<lifetime>\d+\.\d+) sec\), task_status: (?P<task_status>[\w_-]+), '
    r'oper_state: (?P<oper_state>[\w_-]+), oper_title: (?P<oper_title>[\w_-]+), oper_id: [^ ,]+'
)


# Aggregate statistics from the log records arriving on stdin. Every record
# is one line of tab-separated "key=value" fields; only the 'message' field
# is inspected.
for line in sys.stdin:
    line = line.strip()
    if not line:
        continue

    parsed = {}
    for part in line.split("\t"):
        key, sep, value = part.partition('=')
        # Fields without '=' carry no key/value pair and are ignored.
        if sep:
            parsed[key] = value

    message = parsed.get('message')
    if message is None:
        # A record without a message cannot be analysed: count it as a
        # parse error instead of aborting the whole run with a KeyError.
        results_errors['parse'] += 1
        continue

    time_matches = time_re.search(message)
    if not time_matches:
        # Not a task-completion message.
        continue

    # Dots are replaced so that task names match the keys of task_queue.
    task_name = time_matches.group('task_name').replace('.', '-')
    status = time_matches.group('status').lower()
    tstatus = time_matches.group('task_status').lower()
    # Timings stay strings here: they are used verbatim as histogram keys.
    processed = time_matches.group('processed')
    lifetime = time_matches.group('lifetime')
    oper_type = time_matches.group('oper_type')
    oper_title = time_matches.group('oper_title').lower()

    results_timings_task_lifetime[task_name][lifetime] += 1
    results_timings_task_processed[task_name][processed] += 1

    results_time_task_lifetime[task_name] += float(lifetime)
    results_time_task_processed[task_name] += float(processed)

    # '-' in the oper_type position means there is no operation type.
    if oper_type != '-':
        results_count_opertype_status[oper_type][status] += 1
        results_count_opertype_tstatus[oper_type][tstatus] += 1
        results_count_opertype_title[oper_type][oper_title] += 1
        results_timings_opertype_lifetime[oper_type][lifetime] += 1
        results_timings_opertype_processed[oper_type][processed] += 1
        results_count_aggr_opertype_title[oper_type][oper_title] += 1

    results_count_task_status[task_name][status] += 1
    results_count_task_tstatus[task_name][tstatus] += 1
    results_count_aggr_task_status['total'][status] += 1
    results_count_aggr_task_tstatus['total'][tstatus] += 1

    # per queue (only for tasks whose queue is known)
    if task_name in task_queue:
        queue_name = task_queue[task_name]
        results_timings_queues_lifetime[queue_name][lifetime] += 1
        results_timings_queues_processed[queue_name][processed] += 1

        results_time_queues_lifetime[queue_name] += float(lifetime)
        results_time_queues_processed[queue_name] += float(processed)
        results_count_queues_status[queue_name][status] += 1
        results_count_queues_tstatus[queue_name][tstatus] += 1


def print_timings(name, result):
    """Print one "@<name>_<task> <value>@<count> ..." line per non-empty entry.

    ``result`` maps a task name to a mapping of timing value -> count.
    Entries are printed in sorted key order, as are the value/count pairs
    inside each line; entries with no timings are skipped.
    """
    for task_name in sorted(result):
        timings = result[task_name]
        if not timings:
            continue
        packed_timings = ["%s@%s" % pair for pair in sorted(timings.items())]
        print("@%s_%s %s" % (name, task_name, " ".join(packed_timings)))


def print_summary_time(name, result):
    """Print "<name>_<key> <total>" per entry, total with two decimals.

    ``result`` maps a key to a number; entries are printed in sorted key
    order.
    """
    for task_name in sorted(result):
        total = result[task_name]
        print("%s_%s %.2f" % (name, task_name, total))


def print_counters(name, result):
    """Print "<name>_<key> <value>" for every entry of ``result``.

    Entries are printed in sorted key order.
    """
    for key in sorted(result):
        print("%s_%s %s" % (name, key, result[key]))


def print_status_counters(name, result):
    """Print "<name>_<outer>_<inner> <count>" for a two-level mapping.

    ``result`` maps an outer key to a mapping of inner key -> count. Both
    levels are walked in sorted key order.
    """
    for task_name in sorted(result):
        per_status = result[task_name]
        for status_k in sorted(per_status):
            print("%s_%s_%s %s" % (name, task_name, status_k, per_status[status_k]))


# Emit everything that was collected, one metric per line.
print_counters('mpfs_error', results_errors)

# TASK
print_timings('mpfs_timings_lifetime_task', results_timings_task_lifetime)
print_summary_time('mpfs_count_lifetime_time_task', results_time_task_lifetime)
print_timings('mpfs_timings_processed_task', results_timings_task_processed)
print_summary_time('mpfs_count_processed_time_task', results_time_task_processed)
print_status_counters('mpfs_count_status_task', results_count_task_status)
print_status_counters('mpfs_count_aggr_task_status', results_count_aggr_task_status)
print_status_counters('mpfs_count_tstatus_task', results_count_task_tstatus)
print_status_counters('mpfs_count_aggr_task_tstatus', results_count_aggr_task_tstatus)

# QUEUES
print_timings('mpfs_timings_lifetime_queue', results_timings_queues_lifetime)
print_summary_time('mpfs_count_lifetime_time_queue', results_time_queues_lifetime)
print_timings('mpfs_timings_processed_queue', results_timings_queues_processed)
print_summary_time('mpfs_count_processed_time_queue', results_time_queues_processed)
print_status_counters('mpfs_count_status_queue', results_count_queues_status)
# The per-queue "task_status" counters are collected alongside the "status"
# ones; report them too, named after the task/opertype tstatus metrics.
print_status_counters('mpfs_count_tstatus_queue', results_count_queues_tstatus)

# OPERATIONS
print_status_counters('mpfs_count_status_opertype', results_count_opertype_status)
print_status_counters('mpfs_count_tstatus_opertype', results_count_opertype_tstatus)
print_status_counters('mpfs_count_title_opertype', results_count_opertype_title)
print_status_counters('mpfs_count_aggr_title_opertype', results_count_aggr_opertype_title)

print_timings('mpfs_timings_processed_opertype', results_timings_opertype_processed)
print_timings('mpfs_timings_lifetime_opertype', results_timings_opertype_lifetime)
