#!/usr/bin/env python

import sys
import os
from datetime import timedelta
from fcntl import flock, LOCK_EX, LOCK_NB
import errno
import argparse
import yaml

# Command-line interface. Options are parsed at import time and the
# resulting ``args`` namespace is read by the rest of the module.
parser = argparse.ArgumentParser()

# Lag thresholds; both are turned into timedelta(seconds=...) when the
# replication lag is evaluated.
parser.add_argument('-w', '--warn', default=7200, type=int,
                    help='Warning limit')
parser.add_argument('-c', '--crit', default=14400, type=int,
                    help='Critical limit')

# The check reports OK straight away when this init-script is absent.
parser.add_argument('-i', '--init', default='/etc/init.d/mongodb-backup',
                    type=str, help='Init-script path')

# '{port}' is substituted with the port of each instance being checked.
parser.add_argument('-u', '--uri', type=str, help='URI to connect',
                    default='mongodb://monitor:monitor@localhost:{port}/admin')

args = parser.parse_args()

###############################################################################
# How to use this toolkit ##################################################
# The toolkit operates on "databases" -- an abstraction representing a
# collection of shards -- and "instances" -- collections of actual mongod
# daemons. This abstraction is mandatory: even if you need to back up/monitor
# a single instance (e.g. a shardless mongodb installation), you still need to
# adhere to the "database"-"instance" hierarchy.
#
# Before you can actually use it, you need to make sure that the following two
# requirements are met:
# 1. "Database"-"instance" configuration resides at
#    /etc/mongodb/<database>/<instance{1,2,...}>.conf
#    This is important, as all scripts take these filenames as instance names
#    and use them in their work.
#    e.g.: /etc/mongodb/docviewer/doc1.conf /etc/mongodb/docviewer/doc2.conf
#          /etc/mongodb/abook/moko4-1.conf
# 2. Configuration must keep its data, pidfile etc. at the following locations:
#    pidfilepath = /var/run/mongodb/<database>/<instance>.pid
#    logpath = /var/log/mongodb/<database>/<instance>.log
#    dbpath = /srv/mongodb/data/<database>/<instance>
#
# As long as you follow these requirements, you can add an unlimited number
# of "databases" and "instances".
#
# Questions? Patches? Wanna punch my face for losing your data?
# Please write to arhipov@yandex-team.ru.


class StatusException(Exception):
    """A check failure carrying a status code and a human-readable text.

    Attributes:
        code: numeric severity; the checks in this file use 0, 1 and 2.
        text: message that ends up in the final report line.
    """

    def __init__(self, code, text):
        # Initialise the base class explicitly so the exception carries
        # its arguments the standard way.
        super(StatusException, self).__init__(code, text)
        self.code = code
        self.text = text

    def __str__(self):
        # Without this, str(e) renders as a tuple repr such as
        # "(2, 'text')", which is useless in a generic handler's message.
        return str(self.text)

class Status(object):
    """Accumulates check results and emits the final one-line report.

    Attributes:
        code: highest status code seen so far (starts at 0).
        text: list of message fragments collected via append().
    """

    def __init__(self):
        self.code = 0
        self.text = []

    def set_code(self, new_code):
        """Raise the stored code to new_code; it is never lowered."""
        if new_code > self.code:
            self.code = new_code

    def append(self, new_text):
        """Add one message fragment to the report."""
        self.text.append(new_text)

    def report(self, retcode=0, message=None):
        """Print '<code>;<message>' and terminate the process.

        Args:
            retcode: status code to merge in; only applied if it is higher
                than the code already stored.
            message: text to print. When None, all fragments collected via
                append() are joined with ', '.

        The process always exits with status 0; the check result is carried
        by the printed line only.
        NOTE(review): presumably the consumer parses the output line rather
        than the exit status -- confirm before changing the exit code.
        """
        # concatenate all received statuses
        if message is None:
            message = ', '.join(self.text)
        # Check if code is above current setting
        self.set_code(retcode)

        if not message and self.code == 0:
            message = 'OK'
        # Parenthesised single-argument form prints the same text under
        # both the Python 2 print statement and the Python 3 function.
        print('%d;%s' % (self.code, message))
        sys.exit(0)

def parse_mongod_yaml(config_path):
    """Load a YAML mongod config file and return the parsed document.

    Args:
        config_path: path to the YAML configuration file.

    Returns:
        Whatever the YAML document deserialises to.

    Raises:
        IOError: if the file cannot be opened.
        Exception: if the content cannot be parsed; the message includes
            the path and the underlying parser error.
    """
    with open(config_path) as f:
        try:
            # safe_load only builds plain Python objects; a bare yaml.load()
            # can construct arbitrary objects and, on recent PyYAML releases,
            # refuses to run without an explicit Loader.
            return yaml.safe_load(f)
        except Exception as exc:
            raise Exception("unable to parse config file '%s': %s"
                            % (config_path, exc))

def parse_mongod_config(config_path):
    """Parse a 'key = value' style mongod config into a dict of strings.

    Everything after a '#' on a line is treated as a comment. Lines without
    an '=' (after comment removal) are ignored. Only the first '=' separates
    the option from its value, so values may themselves contain '='.

    Args:
        config_path: path to the configuration file.

    Returns:
        dict mapping option names to their (stripped) string values,
        e.g. { 'conf': 'value' }.

    Raises:
        IOError: if the file cannot be opened.
        Exception: if no non-empty 'port' option is present.
    """
    options = {}
    # The context manager guarantees the file is closed even on errors.
    with open(config_path, 'r') as conf_file:
        for line in conf_file:
            # Drop whole-line and inline comments in one go.
            setting = line.split('#', 1)[0]
            if '=' not in setting:
                continue
            opt, val = setting.split('=', 1)
            options[opt.strip()] = val.strip()
    # .get() so that a missing option produces the intended message
    # instead of a bare KeyError.
    if not options.get('port'):
        raise Exception("config file does not have a 'port' directive!")
    return options


def get_dbs_configs(config_root='/etc/mongodb'):
    """Scan config_root and return the parsed configs of every instance.

    Expected layout: <config_root>/<database>/<instance>.conf

    Args:
        config_root: directory holding one sub-directory per database.

    Returns:
        dict of the form {database: {instance: parsed_config}}, where
        parsed_config is whatever parse_mongod_yaml() returns for
        <config_root>/<database>/<instance>.conf. A database directory
        without any matching file maps to an empty dict. An empty dict is
        returned when config_root is not a directory.
    """
    conf = {}
    if not os.path.isdir(config_root):
        return conf
    for db in os.listdir(config_root):
        db_dir = os.path.join(config_root, db)
        # Stray plain files next to the database directories are not
        # databases; listing them would raise OSError.
        if not os.path.isdir(db_dir):
            continue
        conf[db] = {}
        for instance in os.listdir(db_dir):
            # Only visible '*.conf' files describe instances.
            if instance.startswith('.') or not instance.endswith('.conf'):
                continue
            instance_name = os.path.splitext(instance)[0]
            conf[db][instance_name] = parse_mongod_yaml(
                os.path.join(db_dir, instance))
    return conf


def check_lockfile(lockfile_path):
    """Inspect the backup lockfile and raise if it signals a problem.

    Outcomes:
      * lockfile absent: nothing to report, returns True.
      * lockfile present and already locked by someone else: a backup is
        running -> StatusException with code 1.
      * lockfile present and we manage to lock it: nobody holds it, so the
        lockfile is stale and the backup script probably exited
        abnormally -> StatusException with code 2.
      * any other I/O error while opening or locking: the lockfile state is
        unknown -> StatusException with code 1 carrying the error text.

    Args:
        lockfile_path: path of the lockfile to probe.

    Returns:
        True when no lockfile exists.

    Raises:
        StatusException: in every other case, as described above.
    """
    try:
        lockfile_fd = open(lockfile_path, 'r')
    except IOError as e:
        # 'No such file or directory': no lockfile, we can proceed safely.
        if e.errno == errno.ENOENT:
            return True
        raise StatusException(1, "cannot check lockfile: %s" % e)

    try:
        # Exclusive lock in a non-blocking fashion.
        flock(lockfile_fd, LOCK_EX | LOCK_NB)
    except IOError as e:
        # Somebody else holds the lock: backup is currently in progress.
        # Light WARN so if it takes too long, it will be noticed.
        if e.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
            raise StatusException(1, "backup in progress")
        raise StatusException(1, "cannot check lockfile: %s" % e)
    finally:
        # Closing the file also drops a lock we may just have taken, so the
        # probe never keeps holding the lockfile after returning.
        lockfile_fd.close()

    # We could both open and lock the file: nobody owns it any more.
    raise StatusException(2, "stale lockfile found. " +
                             "backup probably failed")

def connect(uri, port):
    """Return a handle to the 'admin' database of the mongod on `port`.

    Args:
        uri: connection URI template; '{port}' is substituted with `port`.
        port: port of the instance to connect to.

    The URI-based client is created with 10000 ms socket, connect and wait
    queue timeouts. If pymongo raises ConfigurationError, a plain
    'localhost:<port>' client without extra options is used instead.
    """
    timeouts = {
        'socketTimeoutMS': 10000,
        'connectTimeoutMS': 10000,
        'waitQueueTimeoutMS': 10000,
    }
    try:
        client = pymongo.MongoClient(uri.format(port=port), **timeouts)
        return client['admin']
    except pymongo.errors.ConfigurationError:
        fallback_address = 'localhost:{p}'.format(p=port)
        return pymongo.MongoClient(fallback_address)['admin']

def check_mongod_health(dbs, lag_warn=args.warn, lag_crit=args.crit, uri=args.uri):
    """Check replica-set state and replication lag of one mongod instance.

    Args:
        dbs: parsed config of the instance; only dbs['net']['port'] is read.
        lag_warn: lag in seconds above which a warning (code 1) is raised.
        lag_crit: lag in seconds above which a critical (code 2) is raised.
        uri: connection URI template passed on to connect().

    Returns:
        dict with 'lag' (timedelta between the primary's and this member's
        optimeDate) and 'state' (this member's stateStr).

    Raises:
        StatusException: code 0 if this member is REMOVED; code 2 if the
            replica set has no primary, this member is missing from the
            status, this member is itself PRIMARY, or the lag exceeds
            lag_crit; code 1 if the lag exceeds lag_warn only.
    """
    db = connect(uri, dbs['net']['port'])
    # Get a big picture: status of all members and sync information.
    status = db.command('replSetGetStatus')

    # Get master and my own status
    master = None
    own = None
    for member in status['members']:
        if member['stateStr'] == 'PRIMARY':
            master = member
        if 'self' in member:
            own = member

    # Removed copies should not raise alarms, so this is checked before
    # anything that could turn into a critical status.
    if own is not None and own['stateStr'] == 'REMOVED':
        raise StatusException(0, "replica removed")

    if master is None:
        raise StatusException(2, "No master in RS")

    if own is None:
        raise StatusException(2, "own member not found in RS status")

    # Backup copy should never become a primary
    if own['stateStr'] == 'PRIMARY':
        raise StatusException(2, "Backup replica became PRIMARY!")

    lag = master['optimeDate'] - own['optimeDate']
    lag_seconds = (lag.days * 86400 + lag.seconds)

    # Compare against the stricter threshold first; otherwise the critical
    # branch can never be reached.
    if lag > timedelta(seconds=lag_crit):
        raise StatusException(2, "replication lag: %s" % lag_seconds)
    if lag > timedelta(seconds=lag_warn):
        raise StatusException(1, "replication lag: %s" % lag_seconds)

    return {'lag': lag, 'state': own['stateStr']}

if __name__ == "__main__":
    status = Status()

    # Without the init-script this host most likely serves no backup
    # replicas at all, so there is nothing to check: report OK and leave.
    if not os.path.isfile(args.init):
        status.report(0)

    # A failing pymongo import or unreadable configs are fatal: there is
    # nothing sensible to check without them, so report critical and exit.
    try:
        import pymongo
        dbs_config = get_dbs_configs()
    except Exception as failure:
        status.report(2, 'error at config parse: %s' % failure)

    # Walk over every database and every mongod instance found in configs.
    for database, instances in dbs_config.items():
        # See /etc/cron.yandex/mongodb-instances-backup
        lockfile = "/var/lock/mongodb-backup-%s.lock" % database

        # check_lockfile() raises StatusException when the lockfile shows a
        # running or a failed backup; in that case the instances of this
        # database are not examined at all.
        try:
            check_lockfile(lockfile)
        except StatusException as lock_problem:
            status.append('%s: %s' % (database, lock_problem.text))
            status.set_code(lock_problem.code)
            continue

        for instance, instance_config in instances.items():
            # Only the port is needed, but the whole instance config is
            # handed over. Healthy instances add nothing to the report, so
            # the alarm text is not polluted with diagnostic noise.
            try:
                check_mongod_health(instance_config)
            except StatusException as problem:
                # mongod state or its numbers are off.
                status.append('%s-%s %s' % (database, instance, problem.text))
                status.set_code(problem.code)
            except Exception as error:
                # A general exception likely indicates a critical state,
                # e.g., a connection failure.
                status.append('%s-%s error: %s' % (database, instance, error))
                status.set_code(2)

    # Asked everybody, been everywhere, ready to report back.
    status.report()
