# -*- coding: utf-8 -*-
import subprocess
import re
import datetime
import os
import json
import gzip


# JSON file that carries the not-yet-finished tasks from one run of this
# script to the next (read at start-up, rewritten at the end of the run).
CACHE_FILE_PATH = '/tmp/started_tasks_cache.json'
# Current tskv log scanned for task start/finish records; its rotated copies
# are looked up by the "<path>-*" glob.
DEFAULT_LOG_PATH = '/var/log/mpfs/default-tskv.log'


def execute_shell(cmd):
    """Run *cmd* through the shell and return its standard output, stripped.

    Only stdout is captured: stderr stays attached to the parent process and
    the exit status of the command is not inspected.
    """
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    stdout_data = process.communicate()[0]
    return stdout_data.strip()


def parse_tskv_line(line):
    """Parse one tskv record into a dict of its ``key=value`` fields.

    A tskv record is a tab-separated line whose first field is the literal
    ``tskv`` marker, followed by ``key=value`` fields.  Only the first ``=``
    of a field separates key from value, so values may contain ``=``.

    The trailing line terminator is removed before splitting; otherwise it
    would end up inside the value of the last field when the line comes
    straight from iterating over a file.  Fields without ``=`` (including
    empty ones) are skipped instead of aborting the whole parse.

    :param line: raw log line, with or without its trailing newline.
    :return: dict mapping field names to values, or ``None`` when the line
        does not start with the ``tskv`` marker.
    """
    line_parts = line.rstrip('\r\n').split('\t')
    if line_parts[0] != 'tskv':
        return None

    record = {}
    for field in line_parts[1:]:
        key, separator, value = field.partition('=')
        if not separator:
            # Malformed field: there is no key/value pair to record.
            continue
        record[key] = value
    return record


def get_active_pids():
    """Return the PIDs, as strings, reported by the ``ps`` pipeline below."""
    # Collect the PIDs of all processes that belong to user nginx, mention
    # queue2 and are shown as children ("\_ " marker) in the ps forest view.
    ps_pipeline = "ps auxf | grep '^nginx' | grep queue2 | grep '\\_ ' | awk '{print $2}'"
    ps_output = execute_shell(ps_pipeline)
    # Splitting empty output yields [''], so blank entries are dropped.
    return [pid for pid in ps_output.split('\n') if pid]


def load_started_tasks_from_cache(path=None):
    """Load the task list saved by the previous run.

    :param path: cache file to read; defaults to ``CACHE_FILE_PATH``.
    :return: the list stored under the ``'tasks'`` key, or an empty list when
        the cache file does not exist.
    :raises ValueError: if the file does not contain valid JSON.
    :raises KeyError: if the JSON document has no ``'tasks'`` key.
    """
    if path is None:
        path = CACHE_FILE_PATH
    if not os.path.exists(path):
        return []
    # The context manager closes the handle even if parsing fails.
    with open(path) as cache_file:
        return json.loads(cache_file.read())['tasks']


def save_started_tasks_from_cache(tasks, path=None):
    """Persist *tasks* so that the next run can pick them up.

    The tasks are written as a JSON object of the form ``{"tasks": [...]}``,
    replacing any previous content of the file.

    :param tasks: JSON-serialisable list of task dicts.
    :param path: cache file to write; defaults to ``CACHE_FILE_PATH``.
    """
    if path is None:
        path = CACHE_FILE_PATH
    # Closing explicitly guarantees the data is flushed to disk before the
    # function returns, instead of whenever the file object is collected.
    with open(path, 'w') as cache_file:
        cache_file.write(json.dumps({'tasks': tasks}))


def filter_tasks_by_active_pids(tasks):
    """Keep only the tasks whose ``'pid'`` is among the currently active PIDs.

    The active PIDs are queried once per call, before *tasks* is iterated.
    """
    running_pids = get_active_pids()
    alive_tasks = []
    for task in tasks:
        if task['pid'] in running_pids:
            alive_tasks.append(task)
    return alive_tasks


def get_hang_tasks(last_started_tasks, started_more_than_minutes_ago=30):
    """Return the tasks that look hung: started long ago and silent since.

    A task is reported when both conditions hold:

    * its ``'timestamp'`` is more than *started_more_than_minutes_ago*
      minutes in the past (compared against local ``datetime.now()``);
    * the requests log contains no line mentioning its ``'request_id'``
      within the window selected by ``timetail -n 600``.

    :param last_started_tasks: iterable of task dicts with ``'timestamp'``
        and ``'request_id'`` keys.
    :param started_more_than_minutes_ago: age threshold in minutes.
    :return: list of the task dicts considered hung, in input order.
    :raises ValueError: if a timestamp does not match the expected format.
    """
    # ``pipes.quote`` is the Python 2 spelling of ``shlex.quote``.
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    # One reference instant for the whole batch, so every task is measured
    # against the same "now".
    now = datetime.datetime.now()
    max_age = datetime.timedelta(minutes=started_more_than_minutes_ago)

    # Check the timestamp of every started task.
    long_ago_started_tasks = []
    for task in last_started_tasks:
        timestamp = task['timestamp'].split('timezone')[0]
        # Presumably the timestamp ends with ",<milliseconds>"; the '000'
        # suffix pads that part to the microseconds expected by %f.
        ts_started = datetime.datetime.strptime(timestamp + '000', '%Y-%m-%d %H:%M:%S,%f')
        if now - ts_started > max_age:
            long_ago_started_tasks.append(task)

    # Among the tasks that have not finished, find those that are not running
    # any more: look in the requests log for records produced by the task in
    # the last 10 minutes - if there are some, everything is fine.
    hang_tasks = []
    for task in long_ago_started_tasks:
        # The id comes from log data, so it is shell-quoted and matched as a
        # fixed string (-F) rather than being interpreted as shell syntax or
        # as a regular expression; -e protects ids starting with '-'.
        cmd = "timetail -n 600 -t java /var/log/mpfs/requests-tskv.log | grep -F -e %s" % (
            shell_quote(task['request_id']),
        )
        requests_log_lines = execute_shell(cmd)
        if not requests_log_lines:
            hang_tasks.append(task)

    return hang_tasks


def alarm_hang_tasks(last_started_tasks):
    """Print every hung task among *last_started_tasks* as one JSON line.

    The hung tasks are selected by ``get_hang_tasks`` with its default
    threshold; nothing is printed when there are none.
    """
    for hung_task in get_hang_tasks(last_started_tasks):
        # A single parenthesised argument prints the same text whether
        # ``print`` is a statement or a function.
        print(json.dumps(hung_task))


if __name__ == '__main__':
    # Start from the tasks remembered by the previous run, keeping only those
    # whose worker process is still alive.
    started_tasks = load_started_tasks_from_cache()
    alive_started_tasks = filter_tasks_by_active_pids(started_tasks)

    last_started_task_by_pid = {}
    for t in alive_started_tasks:
        last_started_task_by_pid[t['pid']] = t

    start_task_re = re.compile(r'Task .* started \(try .*\), name:')
    finish_task_re = re.compile(r'Task .* (OK|FAIL) \(try .*\), name:')

    # Last rotated log according to ``ls`` ordering; empty when no rotated
    # log exists (the error message of ``ls`` is discarded in that case).
    DEFAULT_LOG_PATH_LAST_GZIPPED = execute_shell('ls %s-* 2>/dev/null | tail -n1' % DEFAULT_LOG_PATH)

    # Replay the logs in chronological order: the rotated (older) file first,
    # then the current one.  Start/finish matching keeps only the latest
    # state per pid, so replaying the older file last would let stale
    # records override newer ones.
    log_sources = []
    if DEFAULT_LOG_PATH_LAST_GZIPPED:
        log_sources.append((gzip.open, DEFAULT_LOG_PATH_LAST_GZIPPED))
    log_sources.append((open, DEFAULT_LOG_PATH))

    for open_log, log_path in log_sources:
        f = open_log(log_path)
        try:
            for line in f:
                # A line matching both patterns is treated as a start record.
                is_start = start_task_re.search(line) is not None
                if not is_start and finish_task_re.search(line) is None:
                    continue
                task = parse_tskv_line(line)
                if task is None or 'pid' not in task or 'request_id' not in task:
                    # Not a tskv record, or it lacks the fields needed to
                    # pair a start with its finish.
                    continue
                if is_start:
                    last_started_task_by_pid[task['pid']] = task
                else:
                    started_task = last_started_task_by_pid.get(task['pid'])
                    # Only the finish of the very task remembered for this
                    # pid clears it.
                    if started_task and started_task.get('request_id') == task['request_id']:
                        last_started_task_by_pid.pop(task['pid'])
        finally:
            # Close the log even when parsing raises.
            f.close()

    # Tasks that started, never finished and whose process still exists.
    last_started_tasks = filter_tasks_by_active_pids(last_started_task_by_pid.values())
    alarm_hang_tasks(last_started_tasks)
    save_started_tasks_from_cache(last_started_tasks)
