#!/skynet/python/bin/python

import os
import fcntl
import time
import socket
import subprocess
import argparse
from pprint import pprint
import msgpack
import re

# Exit statuses that the check_* functions report as "Timed out":
#   None - presumably "no return code available" (TODO confirm: run_command waits for
#          the child before returning, so it is not expected to produce this value)
#   -9   - the child was terminated by SIGKILL; this is what run_command ends up with
#          after calling p.kill() on a command that exceeded its timeout
#   255  - presumably the status ssh uses for its own failures - TODO confirm
TIMEOUT_STATUSES = [None, -9, 255]

def run_command(args, raise_failed=True, timeout=None, sleep_timeout=0.1, cwd=None, close_fds=False, stdin=None):
    """
        Run an external command and collect its exit status, stdout and stderr

        :param args: command to run, passed to subprocess.Popen as is
        :param raise_failed (bool): raise Exception on a start failure, a non-zero exit status or
            a timeout instead of reporting them through the return value
        :param timeout (float): seconds to wait for the command; None means wait until it exits
        :param sleep_timeout (float): pause between polls of the child (used only with timeout)
        :param cwd: working directory for the child, passed to subprocess.Popen
        :param close_fds (bool): passed to subprocess.Popen
        :param stdin: data to send to the child's stdin; None leaves stdin inherited

        :return (tuple): (status, stdout, stderr). When the command could not be started and
            raise_failed is False the result is (1, <error text>, ''). When the timeout expired and
            raise_failed is False the child is killed and its resulting return code is reported
            (-9 on POSIX) together with the output collected so far.
        :raises Exception: only if raise_failed is True, see above
    """
    try:
        p = subprocess.Popen(args, stdin=(None if stdin is None else subprocess.PIPE), stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, cwd=cwd, close_fds=close_fds)
    except Exception as e:
        if raise_failed:
            raise Exception("subpocess.Popen for <<%s>> failed: %s" % (args, e))
        return (1, 'Got unknown error %s' % e, '')

    out_chunks, err_chunks = [], []
    try:
        if timeout is None:
            # communicate() feeds stdin and drains both pipes at the same time, so neither a large
            # input nor a large output can dead-lock the parent against the child
            out, err = p.communicate(stdin)
            if p.returncode != 0 and raise_failed:
                raise Exception("Command <<%s>> returned %d\nStdout:%s\nStderr:%s" % (
                    args, p.returncode, out, err))
            return p.returncode, out, err

        if stdin is not None:
            # NOTE: blocks if the child does not consume its input and the pipe buffer fills up
            p.stdin.write(stdin)
            p.stdin.close()

        wait_till = time.time() + timeout

        # Reads must not block, otherwise a silent child would make the deadline unreachable
        for pipe in (p.stdout, p.stderr):
            fcntl.fcntl(pipe, fcntl.F_SETFL, fcntl.fcntl(pipe, fcntl.F_GETFL) | os.O_NONBLOCK)

        while time.time() < wait_till:
            # poll() goes first: once it reports the exit, the reads below still pick up
            # everything the child itself has written
            p.poll()
            _read_available(p.stdout, out_chunks)
            _read_available(p.stderr, err_chunks)

            if p.returncode is not None:
                out, err = b''.join(out_chunks), b''.join(err_chunks)
                if p.returncode != 0 and raise_failed:
                    raise Exception("Command <<%s>> returned %d\nStdout:%s\nStderr:%s" % (
                        args, p.returncode, out, err))
                return (p.returncode, out, err)

            time.sleep(sleep_timeout)

        if raise_failed:
            raise Exception("Command <<%s>> timed out (%f seconds)" % (args, timeout))
    finally:
        # Never leave a running child or open descriptors behind, whatever the outcome is
        if p.returncode is None:
            p.kill()
            p.wait()
        _close_pipes(p)

    # Reached only after a timeout with raise_failed=False
    return p.returncode, b''.join(out_chunks), b''.join(err_chunks)

def _read_available(pipe, chunks, chunk_size=1024):
    """
        Move everything that can be read right now from a non-blocking pipe into a list

        :param pipe: file object whose descriptor is already in O_NONBLOCK mode
        :param chunks (list): list the received pieces are appended to
        :param chunk_size (int): maximum size of a single os.read() request
    """
    fd = pipe.fileno()
    try:
        while True:
            piece = os.read(fd, chunk_size)
            if not piece:  # EOF - the writing side is closed
                break
            chunks.append(piece)
    except OSError:
        # Nothing more is available at the moment (EAGAIN on a non-blocking descriptor)
        pass

def _close_pipes(p):
    """
        Close whichever of the child's stdin/stdout/stderr pipes exist; errors are ignored
    """
    for pipe in (p.stdin, p.stdout, p.stderr):
        if pipe is None:
            continue
        try:
            pipe.close()
        except (EnvironmentError, ValueError):
            # Best-effort cleanup: must not mask the real result or exception
            pass

class ERunInMulti:
    """
        Names of the modes a check can be run in
    """
    THREAD, PROCESS, SKYNET = 'THREAD', 'PROCESS', 'SKYNET'

    # Every known mode, in declaration order
    ALL = [THREAD, PROCESS, SKYNET]

def _gen_check_command(command_str, host, params):
    """
        Build the argument list that executes a shell snippet for a check

        In SKYNET mode the snippet is run by a local bash; in any other mode it is sent to
        host over ssh with password authentication switched off.

        :param command_str (str): shell snippet to execute
        :param host (str): host to ssh to (not used in SKYNET mode)
        :param params (dict): check parameters, 'mode' is required

        :return (list): command line arguments
    """
    if params['mode'] != ERunInMulti.SKYNET:
        return ["ssh", "-o", "PasswordAuthentication=false", host, command_str]

    return ["bash", "-c", command_str]

def _check_via_proto(host, proto):
    """
        Check that something answers on port 22 of host over the given address family

        :param host (str): host name or address to connect to
        :param proto: address family of the socket (socket.AF_INET or socket.AF_INET6)

        :return (bool): True if the connection was established and one byte was received,
            False if the peer closed the connection without sending anything

        Every socket operation is limited to 5 seconds. Connection errors and timeouts are
        propagated to the caller.
    """
    # Created outside of try: if this call fails there is nothing to clean up yet
    s = socket.socket(proto, socket.SOCK_STREAM)
    try:
        s.settimeout(5)
        s.connect((host, 22))

        return len(s.recv(1)) == 1
    finally:
        # Release the descriptor on every path, including connect/recv failures
        s.close()

def _get_raids_info(host, params):
    """
        Return list of disks for every raid device along with raid type

        Runs lsblk and parses its KEY="value" output. Every line whose TYPE starts with 'raid'
        is attributed to the closest preceding line of TYPE 'disk'.

        :param host (str): host to check
        :param params (dict): check parameters, passed to _gen_check_command

        :return (dict): raid devices info, { <raid KNAME> : { 'raid_type' : <TYPE>, 'disks' : [ <disk>, ... ] } }
            where <disk> is { 'name' : <KNAME>, 'model' : <MODEL>, 'size' : <int SIZE>, 'rota' : <int ROTA> }
        :raises Exception: if lsblk did not finish with status 0 or its output is inconsistent
    """

    def _to_dict(line):
        # One lsblk -P line is a sequence of space separated KEY="value" pairs
        return dict(re.findall(r'([^\s=]+)="(.*?)"', line))

    args = _gen_check_command("lsblk -io KNAME,TYPE,SIZE,MODEL,ROTA -b -P", host, params)
    status, out, err = run_command(args, raise_failed=False, sleep_timeout=0.1, timeout=10)
    if status != 0:
        raise Exception("Got status <%s>" % status)

    raids_info = {}
    last_disk = None
    for line in out.splitlines():
        if not line.strip():
            # Blank lines carry no device; skipping them keeps empty output from failing on d['TYPE']
            continue

        d = _to_dict(line)
        dev_type = d['TYPE']

        if dev_type == 'disk':
            last_disk = {'name': d['KNAME'], 'model': d['MODEL'], 'size': int(d['SIZE']), 'rota': int(d['ROTA'])}
        elif dev_type.startswith('raid'):
            # Explicit raises instead of assert: the validation must survive python -O
            if last_disk is None:
                raise Exception("Raid device <%s> is listed before any disk" % d['KNAME'])

            raid = raids_info.setdefault(d['KNAME'], {'raid_type': dev_type, 'disks': []})
            if raid['raid_type'] != dev_type:
                raise Exception("Raid device <%s> is reported both as <%s> and as <%s>" % (
                    d['KNAME'], raid['raid_type'], dev_type))

            raid['disks'].append(last_disk)

    return raids_info

def check_badraid10(host, params):
    """
        Check that no raid10 device mixes disks with different 'rota' values

        :return (bool): False if some raid10 device contains disks with more than one distinct
            'rota' value, True otherwise
    """
    for raid in _get_raids_info(host, params).values():
        if raid['raid_type'] != 'raid10':
            continue

        distinct_rota = set(disk['rota'] for disk in raid['disks'])
        if len(distinct_rota) > 1:  # have both ssd and hdd in one raid10 device
            return False

    return True

def check_fastbone(host, params):
    """
        Check that ifconfig shows a 2a02:6b8 address next to a vlan interface

        The command counts the lines containing '2a02:6b8' among the ifconfig lines that
        mention 'vlan' and the 4 lines following each of them.

        :return (bool): True if the count is positive
        :raises Exception: "Timed out" if the command status is one of TIMEOUT_STATUSES
    """
    command = "ifconfig | grep -A4 vlan | grep -c 2a02:6b8"
    status, out, err = run_command(_gen_check_command(command, host, params),
                                   raise_failed=False, sleep_timeout=0.1, timeout=10)
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    matched_lines = int(out)
    return matched_lines > 0

def check_lowfreq(host, params):
    """
        Check the 'cpu MHz' values reported by /proc/cpuinfo

        :param params (dict): check parameters; optional 'timeout' (seconds, 10 by default)

        :return (bool): False if the highest value, or the middle value of the sorted list,
            is below 1300; True otherwise
        :raises Exception: raised by run_command if the command fails or times out
    """
    command = "cat /proc/cpuinfo | grep 'cpu MHz' | awk '{print $4}'"
    args = _gen_check_command(command, host, params)

    status, out, err = run_command(args, sleep_timeout=0.1, timeout=params.get('timeout', 10))
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    freqs = [float(value) for value in out.strip().split('\n')]
    freqs.sort()

    highest = freqs[-1]
    middle = freqs[len(freqs) // 2]
    return not (highest < 1300 or middle < 1300)

def check_lowfreq_intel_pstate(host, params):
    """
        Check the value of /sys/devices/system/cpu/intel_pstate/max_perf_pct

        :param params (dict): check parameters; optional 'timeout' (seconds, 20 by default)

        :return (bool): True if the value is at least 100
        :raises Exception: raised by run_command if the command fails or times out
    """
    command = "cat /sys/devices/system/cpu/intel_pstate/max_perf_pct"
    status, out, err = run_command(_gen_check_command(command, host, params),
                                   sleep_timeout=0.1, timeout=params.get('timeout', 20))

    return int(out) >= 100

def check_lowfreq_unsafe(host, params):
    """
        Check the 'cpu MHz' values of /proc/cpuinfo while the heatup.Linux binary is running

        The command downloads heatup with 'sky get' into a temporary directory, starts it in
        background, reads /proc/cpuinfo after 3 seconds and then kills heatup.Linux.

        :param params (dict): check parameters; optional 'timeout' (seconds, 20 by default)

        :return (bool): False if the lowest reported value is below 2000, True otherwise
        :raises Exception: raised by run_command if the command fails or times out
    """
    command = "(export HEATUP_DIR=`mktemp -d` && chmod 777 $HEATUP_DIR && cd $HEATUP_DIR && sky get -w rbtorrent:b9fb6467ea74d6bbfa6b405fb6ee3e6ec0547879 && cd heatup && ./heatup.Linux 100000 &); sleep 3; cat /proc/cpuinfo | grep 'cpu MHz' | awk '{print $4}'; killall heatup.Linux"
    args = _gen_check_command(command, host, params)

    status, out, err = run_command(args, sleep_timeout=0.1, timeout=params.get('timeout', 20))
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    freqs = sorted(float(value) for value in out.strip().split('\n'))
    lowest = freqs[0]
    return not lowest < 2000

def check_memleak(host, params):
    """
        Check the share of memory that /proc/vmstat anonymous page counters do not account for

        The command takes the nr_inactive_anon, nr_active_anon, nr_anon_pages and nr_shmem lines
        of /proc/vmstat, computes (1st + 2nd - 3rd - 4th) * 4096 from their values and divides
        it by MemTotal of /proc/meminfo (converted to bytes).
        NOTE(review): the sign of each counter depends on the order of the lines in /proc/vmstat,
        and 4096 is presumably the page size - confirm for the target hosts.

        :return (bool): False if the resulting ratio is greater than 0.1, True otherwise
        :raises Exception: raised by run_command if the command fails or times out
    """
    # '\\|' spells out the backslash that grep needs for alternation; the previous '\|' produced the
    # same string only because Python passes unknown escape sequences through unchanged
    args = _gen_check_command("export TOTAL=`cat /proc/meminfo | grep MemTotal | awk '{print $2 * 1024}'`; cat /proc/vmstat  | grep \"nr_inactive_anon \\|nr_active_anon \\|nr_anon_pages \\|nr_shmem\" | xargs | awk -v TOTAL=$TOTAL '{print ($2 + $4 - $6 - $8) * 4096 / TOTAL}'", host, params)
    status, out, err = run_command(args, sleep_timeout=0.1, timeout=10)
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    if float(out) > 0.1:
        return False
    return True

def check_open_ssh_port(host, params):
    """
        Check that the ssh port of host answers over IPv4 or, if that attempt fails, over IPv6

        IPv6 is tried only when the IPv4 attempt raises; an IPv4 connection that was
        established but produced no data yields False right away.

        :param host (str): host to connect to
        :param params (dict): not used

        :return (bool): result of the first attempt that did not raise, False if both raised
    """
    for proto in (socket.AF_INET, socket.AF_INET6):
        try:
            return _check_via_proto(host, proto)
        except Exception:
            # Not reachable this way; Exception (rather than a bare except) keeps
            # KeyboardInterrupt and SystemExit working
            continue

    return False

def check_overheat(host, params):
    """
        Check /var/log/mcelog for a recent 'Throttling enabled' record

        The command looks through the last 500 lines of the log, takes the largest TIME value
        found right before a 'Throttling enabled' line and exits with status 1 if that time
        plus 600 is not less than the current time.

        :return (bool): True if the command exited with status 0
        :raises Exception: "Timed out" if the command status is one of TIMEOUT_STATUSES
    """
    command = "export LAST_THROTTLE=`tail -500 /var/log/mcelog | grep -B 1 'Throttling enabled' | grep TIME | awk '{print $2}' | sort -g | tail -1`; export NOW=`date +%s`; if (( $LAST_THROTTLE + 600 >= $NOW )); then exit 1; fi"
    status, out, err = run_command(_gen_check_command(command, host, params),
                                   raise_failed=False, sleep_timeout=0.1, timeout=10)
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    return status == 0

def check_pcapped(host, params):
    """
        Check the 4th byte of the answer to a raw ipmitool request

        The request is sent only if at most 2 ipmitool processes are found by ps; otherwise
        the command prints 0, i.e. the check passes.
        NOTE(review): judging by the name the byte presumably reports power capping - confirm.

        :param params (dict): check parameters; optional 'timeout' (seconds, 10 by default)

        :return (bool): True if the printed value is 0
        :raises Exception: raised by run_command if the command fails or times out
    """
    command = "export CCC=`ps auwwx | grep ipmitool | grep -v grep | wc -l`; if [ $CCC -le 2 ]; then sudo modprobe ipmi_devintf && sudo modprobe ipmi_si && sudo ipmitool -t 0x2c -b 0x06 raw 0x2e 0xd3 0x57 0x01 0x00 0x00 | awk '{print strtonum(\"0x\"$4);}'; else echo 0; fi"
    args = _gen_check_command(command, host, params)

    status, out, err = run_command(args, sleep_timeout=0.1, timeout=params.get('timeout', 10))
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    return int(out) == 0

def check_slownet(host, params):
    """
        Check that the host has a network link of at least 1000 Mb/s

        The command prints one number per line: the 'Speed: <N>Mb/s' value of every interface
        among eth0, eth1 and eth2 for which ethtool reports a numeric speed.

        :return (bool): True if the fastest reported link is at least 1000 Mb/s
        :raises Exception: "Timed out" if the command status is one of TIMEOUT_STATUSES
        :raises ValueError: if no numeric speed was reported at all
    """
    args = _gen_check_command(r"(ethtool eth0; ethtool eth1; ethtool eth2) 2>/dev/null | grep -E '\s+Speed:\s+[0-9]+Mb\/s' | sed 's/[\t ]*Speed: \(.*\)Mb\/s/\1/'", host, params)
    status, out, err = run_command(args, raise_failed=False, sleep_timeout=0.1, timeout=10)
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    # Several interfaces may report a speed, so the output is parsed line by line:
    # int() of the whole output fails as soon as there is more than one line
    speeds = [int(line) for line in out.split('\n') if line.strip()]
    if not speeds:
        raise ValueError("No link speed found in ethtool output")

    # NOTE(review): the fastest link decides; confirm this is the intended rule for
    # hosts with several active interfaces
    return max(speeds) >= 1000

def check_ssdmirror(host, params):
    """
        Check that no raid1/raid10 device contains more than one disk with 'rota' equal to 0

        :return (bool): False if such a device exists, True otherwise
    """
    for raid in _get_raids_info(host, params).values():
        if raid['raid_type'] not in ('raid1', 'raid10'):
            continue

        non_rotational = [disk for disk in raid['disks'] if disk['rota'] == 0]
        if len(non_rotational) > 1:
            return False

    return True

def check_tcapped(host, params):
    """
        Check the 5th byte of the answer to a raw ipmitool request

        The request is sent only if at most 2 ipmitool processes are found by ps; otherwise
        the command prints 0, i.e. the check passes.
        NOTE(review): judging by the name the byte presumably reports thermal capping - confirm.

        :param params (dict): check parameters; optional 'timeout' (seconds, 10 by default)

        :return (bool): True if the printed value is 0
        :raises Exception: raised by run_command if the command fails or times out
    """
    command = "export CCC=`ps auwwx | grep ipmitool | grep -v grep | wc -l`; if [ $CCC -le 2 ]; then sudo modprobe ipmi_devintf && sudo modprobe ipmi_si && sudo ipmitool -t 0x2c -b 0x06 raw 0x2e 0xd3 0x57 0x01 0x00 0x00 | awk '{print strtonum(\"0x\"$5);}'; else echo 0; fi"
    args = _gen_check_command(command, host, params)

    status, out, err = run_command(args, sleep_timeout=0.1, timeout=params.get('timeout', 10))
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    return int(out) == 0

def check_wrongname(host, params):
    """
        Check if contents of /etc/hosts corresponds to real host ssh address

        The last field of the last line of /etc/hosts is compared with the part of host
        before the first dot. On mismatch a diagnostic line is written to stderr.

        :return (bool): True if the names are equal
        :raises Exception: "Timed out" if the command status is one of TIMEOUT_STATUSES
    """
    import sys  # local import: needed only for the diagnostic below

    args = _gen_check_command("cat /etc/hosts | tail -1 | awk '{print $NF}'", host, params)
    status, out, err = run_command(args, raise_failed=False, sleep_timeout=0.1, timeout=10)
    if status in TIMEOUT_STATUSES:
        raise Exception("Timed out")

    presented = out.strip()
    if presented == host.partition('.')[0]:
        return True

    # stderr, not stdout: stdout carries the script result (msgpack in binary format)
    # and must not be mixed with diagnostics
    sys.stderr.write("Host <%s> presented as <%s>\n" % (host, presented))
    return False

def get_checkers():
    """
        Return the enabled checks

        :return (dict): check name -> function taking (host, params)
    """
    enabled = (
        ('sshport', check_open_ssh_port),
        ('lowfreq', check_lowfreq),
        # disabled: ('lowfreq_unsafe', check_lowfreq_unsafe),
        ('lowfreq_intel_pstate', check_lowfreq_intel_pstate),
        ('memleak', check_memleak),
        ('fastbone', check_fastbone),
        # disabled: ('xfastbone', check_xfastbone),
        ('overheat', check_overheat),
        ('slownet', check_slownet),
        ('pcapped', check_pcapped),
        ('tcapped', check_tcapped),
        ('badraid10', check_badraid10),
        ('wrongname', check_wrongname),
        ('ssdmirror', check_ssdmirror),
    )
    return dict(enabled)

def detect_unworking():
    """
        Run every enabled check against the local host in SKYNET mode

        The host name is the result of a forward and reverse lookup of socket.gethostname();
        if the lookup fails, socket.gethostname() itself is used.

        :return (dict): check name -> check result; a check that raised is recorded as True
    """
    try:
        hname = socket.gethostbyaddr(socket.gethostbyname(socket.gethostname()))[0]
    except Exception:
        # Resolving failed - fall back to the plain local host name
        hname = socket.gethostname()
    params = {'mode': ERunInMulti.SKYNET}

    result = {}
    # The mapping is built once instead of once per check
    for name, checker in get_checkers().items():
        try:
            result[name] = checker(hname, params)
        except Exception:
            # A check that could not be performed is reported the same way as a passed one.
            # Exception (not a bare except) lets KeyboardInterrupt and SystemExit through.
            result[name] = True

    return result

# ===============================================================================
# main stuff
# ===============================================================================
def parse_args():
    """
        Parse command line options

        :return: argparse namespace whose 'format' is 'pretty' (default) or 'binary'
    """
    format_choices = ('pretty', 'binary')

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--format', choices=format_choices, default=format_choices[0])

    options = parser.parse_args()
    return options

def main():
    """
        Run all checks and print the result in the format chosen on the command line

        'pretty' prints the result dict with pprint, 'binary' prints it packed with msgpack.
    """
    options = parse_args()
    result = detect_unworking()

    if options.format == 'binary':
        print(msgpack.packb(result))
    elif options.format == 'pretty':
        pprint(result)

# Run the checks only when the file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()
