#!/usr/bin/env python
# -*-coding: utf-8 -*-
# vim: sw=4 ts=4 expandtab ai

import os
import sys
import datetime
import time
import socket
import json

# Path to the graphite-sender monitoring file
SENDER_MONFILE = '/tmp/yabs-graphite-sender-monitoring'
# Maximum graphite-sender run time, seconds
MAX_RUN_TIME = 30
# Time when the check is active: list of (min_hour, max_hour) pairs,
# both ends inclusive. Outside these hours the script runs in "night" mode.
CHECK_TIME = [(10, 22)]
# Maximum allowed graphite-sender inactivity time, seconds
NOACTION_TIMEOUT = 30*60
# Maximum allowed time without sending to the graphite servers, seconds
NOTSEND_TIMEOUT = 60*60*2
# Temporary file used by the monitoring check
TMPFILE = '/tmp/mon-graphite-sender-tmp'
# Port the sender listens on
SENDER_PORT = 42000
# Number of attempts to connect to the sender socket
CONNECT_TRY_COUNT = 20
# File where dropped metrics are stored.
DROPPED_METRIC_FILE = '/tmp/yabs-graphite-sender-dropped'
# Period during which dropped metrics are reported, seconds
DROPPED_METRIC_TIMEOUT = 3600
# File whose lines are counted as unique metrics
UNIQ_MONITORING_FILE = '/tmp/yabs-graphite-sender-metrics'
# Line count in UNIQ_MONITORING_FILE above which a warning is raised
MAX_UNIQ_METRICS = 96000


def read_mon_data(fname, night, ext_out):
    """Load the JSON document stored in the monitoring file ``fname``.

    Args:
        fname: path of the monitoring file to read.
        night: when true, problems found here get status 1 (warning)
            instead of status 2 (critical).
        ext_out: list of ``(status, message)`` tuples collected so far.
            It is copied and never modified.

    Returns:
        A ``(mon_data, messages)`` tuple.  ``mon_data`` is the decoded JSON
        value, or None when the file is too old or cannot be read/parsed.
        ``messages`` is a copy of ``ext_out`` plus any problem found here.

    Side effects:
        * If ``fname`` cannot be stat'ed, the error is printed and the
          process terminates through ``sys.exit()``.
        * A file older than NOACTION_TIMEOUT is removed (best effort).
    """
    level = 1 if night else 2
    int_out = ext_out[:]
    mon_data = None

    try:
        mf_age = time.time() - os.stat(fname).st_mtime
    except Exception as err:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        if night:
            print('1; %s' % str(err))
        else:
            print('2;  %s' % str(err))
        sys.exit()

    if mf_age < NOACTION_TIMEOUT:
        try:
            # "with" closes the file only if it was actually opened; the
            # previous "finally: f.close()" crashed with AttributeError when
            # open() itself failed, hiding the real error.
            with open(fname) as mon_file:
                mon_data = json.loads(mon_file.read())
        except Exception as err:
            mon_data = None
            int_out.append((level, 'Can\'t parse mon file %s' % str(err)))
    else:
        int_out.append((level, 'Too long no action'))
        try:
            os.remove(fname)
        except OSError:
            # Not enough permissions to delete the file -- ignore.
            pass

    return mon_data, int_out


def _severity(night):
    """Return the status for a problem: 1 (warning) at night, else 2."""
    return 1 if night else 2


def _is_night():
    """Return True when the current hour is outside every CHECK_TIME range."""
    hour = datetime.datetime.today().hour
    return not any(mint <= hour <= maxt for mint, maxt in CHECK_TIME)


def _check_socket(out):
    """Append an error to ``out`` if nothing accepts on localhost:SENDER_PORT.

    Each round tries IPv4 and then IPv6, sleeping one second between rounds.
    Every failed connect consumes one of CONNECT_TRY_COUNT attempts.  The
    error is reported once, and only when a whole round failed after the
    budget was used up (previously it could be reported although the other
    address family connected, or be reported twice).
    """
    tries_left = CONNECT_TRY_COUNT
    while True:
        last_err = None
        exhausted = False
        for family in (socket.AF_INET, socket.AF_INET6):
            sock = None
            try:
                sock = socket.socket(family, socket.SOCK_STREAM)
                sock.settimeout(1.0)
                sock.connect(('localhost', SENDER_PORT))
                return
            except Exception as err:
                last_err = err
                if tries_left <= 0:
                    exhausted = True
                else:
                    tries_left -= 1
            finally:
                if sock is not None:
                    sock.close()

        if exhausted:
            out.append((2, 'Can\'t connect to localhost:%d : %s' %
                        (SENDER_PORT, str(last_err))))
            return
        time.sleep(1)


def _read_sender_data(night, out, tries=10, delay=5):
    """Read SENDER_MONFILE, retrying while it yields no data.

    Returns ``(sender_data, messages)``.  Only the outcome of the last
    attempt is kept, so a transient failure followed by a successful read
    leaves no error behind and failures are not duplicated per retry.
    """
    sender_data, messages = None, out
    for attempt in range(tries):
        sender_data, messages = read_mon_data(SENDER_MONFILE, night, out)
        if sender_data:
            break
        # read_mon_data removes a stale file and terminates the process when
        # the file cannot be stat'ed, so retrying after the file is gone
        # would throw away every message collected so far.
        if attempt + 1 == tries or not os.path.exists(SENDER_MONFILE):
            break
        time.sleep(delay)
    return sender_data, messages


def _load_timestamp(path):
    """Return the float stored in ``path``; None if missing or invalid."""
    try:
        with open(path) as marker:
            return float(marker.read().strip())
    except (IOError, OSError, ValueError):
        return None


def _report_send_outage(out):
    """Track in TMPFILE how long no graphite server has accepted data.

    The first failing run only records the current time and reports nothing.
    Later runs report a warning, or a critical once the outage has lasted
    longer than NOTSEND_TIMEOUT.  An unreadable or corrupt marker is treated
    like a missing one and is rewritten.
    """
    now = time.time()
    outage_start = _load_timestamp(TMPFILE)
    if outage_start is None:
        try:
            with open(TMPFILE, 'w') as marker:
                marker.write(str(now))
        except (IOError, OSError) as err:
            out.append((1, 'Can\'t write %s: %s' % (TMPFILE, str(err))))
    elif (now - outage_start) > NOTSEND_TIMEOUT:
        out.append((2, 'Too long not send to all graphite srv'))
    else:
        out.append((1, 'Error send to all graphite srv'))


def _check_sender_stats(sender_data, night, out):
    """Evaluate the statistics reported by graphite-sender.

    Reads 'send_time' and 'sucess_srv' from ``sender_data``.  A partial
    failure (fewer successful servers than 'total_srv') is not reported.
    """
    try:
        exec_time = sender_data['send_time']
        sucess_srv = sender_data['sucess_srv']
    except (KeyError, TypeError) as err:
        # Without this the script died with a traceback and produced no
        # result line at all.
        out.append((_severity(night), 'Bad data in monitoring file %s: %s' %
                    (SENDER_MONFILE, str(err))))
        return

    # Check the number of servers the data was sent to successfully.
    if sucess_srv:
        if os.path.exists(TMPFILE):
            os.remove(TMPFILE)
    else:
        _report_send_outage(out)

    # Check run time.
    if exec_time > MAX_RUN_TIME:
        out.append((_severity(night), 'Send too long: %.3f' % exec_time))


def _count_dropped_metrics(night, out):
    """Return the number of recently dropped metrics (0 if unavailable).

    Only a DROPPED_METRIC_FILE modified within DROPPED_METRIC_TIMEOUT is
    read.  ``out`` is passed to read_mon_data, which copies it, so nothing
    is added to the caller's list.
    """
    try:
        age = time.time() - os.stat(DROPPED_METRIC_FILE).st_mtime
    except OSError:
        # Missing file (or it vanished meanwhile): nothing was dropped.
        return 0
    if age >= DROPPED_METRIC_TIMEOUT:
        return 0
    try:
        return len(read_mon_data(DROPPED_METRIC_FILE, night, out)[0])
    except (Exception, SystemExit):
        # Best effort: read_mon_data may return no data (len(None) fails) or
        # call sys.exit(); neither must abort the whole check.
        # KeyboardInterrupt is no longer swallowed.
        return 0


def _check_uniq_metrics(out):
    """Warn when UNIQ_MONITORING_FILE has more than MAX_UNIQ_METRICS lines."""
    if not os.path.exists(UNIQ_MONITORING_FILE):
        return
    try:
        with open(UNIQ_MONITORING_FILE) as metrics_file:
            uniq_keys = sum(1 for _ in metrics_file)
    except Exception as err:
        out.append((1, "Cannot read uniq metric counter %s" % str(err)))
        return
    if uniq_keys > MAX_UNIQ_METRICS:
        out.append((1, 'There is %d uniq metrics' % uniq_keys))


def main():
    """Run every graphite-sender check and print the passive-check result.

    Collects ``(status, message)`` tuples from the socket, sender statistics
    and unique-metric checks, then hands them to ``out_msg``, which prints
    the result line and exits.
    """
    # Outside the CHECK_TIME hours most problems are downgraded to warnings.
    night = _is_night()
    out = []

    # Check socket
    _check_socket(out)

    # Check sender
    sender_data, out = _read_sender_data(night, out)
    if sender_data:
        _check_sender_stats(sender_data, night, out)
    else:
        out.append((_severity(night),
                    'No data in monitoring file %s' % SENDER_MONFILE))

    # Check dropped metrics.
    # NOTE: the count is computed but currently not added to the result.
    _count_dropped_metrics(night, out)

    # Check uniq metrics
    _check_uniq_metrics(out)

    out_msg(out)


def out_msg(messages_list):
    """Print the passive-check result line and exit with code 0.

    Args:
        messages_list: list of ``(status, message)`` tuples.  An empty list
            means everything is fine.  The list is not modified.

    The printed line is ``PASSIVE-CHECK:graphite-client;<status>;<text>``
    where ``<status>`` is the highest status in the list and ``<text>`` is
    all messages joined with ', ', ordered by descending
    ``(status, message)``.  With no messages the line ends in ``0;OK``.
    """
    if not messages_list:
        message = '0;OK'
    else:
        # Sort a copy: the caller's list used to be reordered in place.
        ordered = sorted(messages_list, reverse=True)
        worst = max(msg[0] for msg in ordered)
        text = ', '.join(msg[1] for msg in ordered).strip()
        message = '%d;%s' % (worst, text)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('PASSIVE-CHECK:graphite-client;%s' % message)
    sys.exit(0)


# Run the check only when this file is executed as a script.
if __name__ == '__main__':
    main()

