#!/bin/sh
#
# $Id$
#
# Check status of storages based on Dell's PERC controllers
#

# Name of this check: the script's file name without its directory part
# and without its last extension.  It prefixes state files and reports.
me="${0##*/}"
me="${me%.*}"

# Fixed layout of the monitoring agent tree and a known command search path.
HOME="/home/monitor"
BASE="${HOME}/agents"
TMP="${BASE}/tmp"
PATH="/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin"


#-- Subroutines --------------------------------------------------------

# Print the passive-check result line and terminate the script.
# Arguments: $1 - status code reported to the monitoring system
#            $2 - human readable message
# Globals:   me (read)
# The process exit status is always 0; the check status travels in the
# printed line only.
die()
{
	printf 'PASSIVE-CHECK:%s;%s;%s\n' "${me}" "$1" "$2"
	exit 0
}

# Make sure the external commands needed by this check are installed.
# Globals: sudo_cmd, megacli_cmds (read); megacli_cmd (written with the
#          first executable entry of megacli_cmds)
# Leaves through die() with status 1 when sudo or every MegaCli candidate
# is missing.
check_depends()
{
	local _candidate

	[ -x "${sudo_cmd}" ] || die 1 "Sudo command not found !"

	for _candidate in $megacli_cmds; do
		[ -x "$_candidate" ] || continue
		megacli_cmd=$_candidate
		break
	done

	[ -n "${megacli_cmd}" ] || die 1 "Megacli command not found !"
}

# Ask MegaCli how many controllers are present.
# Globals: tool (read); adp_count (written on success)
# Leaves through die() with status 1 when no controller is reported or
# when the reported value is not a plain positive number.
get_adp_count()
{
	local _count

	# The count is printed as "Controller Count: N." - drop the final dot.
	_count=$(${tool} -adpcount -nolog | \
		awk '/Controller Count:/ { sub(/\./,"", $3); printf "%d", $3; }')

	case $_count in
		0)	die 1 "Can't found adapters."
			;;
		# One or two digits: hosts with ten or more controllers are valid
		# too, the per-adapter loop handles any positive count.
		[1-9]|[1-9][0-9])
			;;
		*)	die 1 "Can't get the number of adapters (${_count})."
			;;
	esac

	adp_count=$_count
}

# Append one item to the warning report.
# Arguments: $1 - text to append
# Globals:   warn_msg (read and written); items are separated by ", "
add_to_warn_msg()
{
	if [ -n "${warn_msg}" ]; then
		warn_msg="${warn_msg}, $1"
	else
		warn_msg="$1"
	fi
}

# Append one item to the critical report.
# Arguments: $1 - text to append
# Globals:   crit_msg (read and written); items are separated by ", "
add_to_crit_msg()
{
	if [ -n "${crit_msg}" ]; then
		crit_msg="${crit_msg}, $1"
	else
		crit_msg="$1"
	fi
}

# Count one more consecutive "PD removed" detection and tell whether the
# removal should be treated as real.
# Globals: rmv_pds_counter (path of the counter file, read and rewritten),
#          rmv_pds_detect_threshold (read)
# Returns: 0 - the threshold is reached, or the counter could not be stored
#          1 - below the threshold, the removal is treated as a flap
rmv_pds_detect_threshold_reached()
{
	local _prev _new

	_prev=0
	if [ -r "${rmv_pds_counter}" ]; then
		# An empty file makes read fail; the value is validated below anyway.
		read -r _prev < "${rmv_pds_counter}" || :
	fi

	# Anything that is not a plain decimal number (empty or damaged file,
	# leading zero that would be taken as octal) restarts the count instead
	# of breaking the arithmetic expansion below.
	case "${_prev}" in
		""|*[!0-9]*|0[0-9]*)
			_prev=0
			;;
	esac

	_new=$((${_prev} + 1))

	# Without a stored counter the threshold could never be reached, so a
	# write failure is reported as a real removal.
	if ! echo "${_new}" > "${rmv_pds_counter}"; then
		return 0
	fi

	if [ "${_new}" -ge "${rmv_pds_detect_threshold}" ]; then
		# PDs is "true" removed
		return 0
	fi

	# Flap is detected
	return 1
}

# Print the physical drives of one adapter, one "enclosure:slot" pair per
# line.  Nothing is printed when MegaCli reports no drive.
# Arguments: $1 - adapter index
# Globals:   tool (read)
get_pd_list()
{
	# The enclosure line precedes the slot line of the same drive, so the
	# pair is complete as soon as the slot number is seen.
	${tool} -pdlist -a$1 -nolog | awk '
		/^Enclosure Device ID:/	{ enc = $4 }
		/^Slot Number:/		{ printf "%d:%d\n", enc, $3 }'

	return 0
}

# Decide whether a degraded logical drive (LD) is merely rebuilding.
# Arguments: $1 - adapter index
#            $2 - LD index; the drives are looked up in the section of the
#                 "-cfgdsply" output headed "DISK GROUPS: <LD index>"
# Globals:   tool (read); warn_msg (extended through add_to_warn_msg)
# Returns:   0 - every drive of the group is Online or Rebuild and at least
#                one is rebuilding; one warning per rebuilding drive is added
#            1 - the LD is really degraded (some drive is in another state,
#                or nothing is rebuilding)
check_degraded_ld()
{
	local _adp _ld _ld_is_degraded _pd _rbld_pds _complete

	_adp=$1
	_ld=$2

	# The group number is compared as a number on the header line itself.
	# A regular expression built from the number would also match it as a
	# prefix (LD 1 against groups 10-19, or against the summary line
	# "Number of DISK GROUPS: 1x") and pull in drives of other groups.
	eval $(${tool} -cfgdsply -a${_adp} -nolog | awk -v ld="${_ld}" '
		$1 == "DISK" && $2 == "GROUPS:" {
			active = ($3 + 0 == ld + 0)
			next
		}
		! active { next }

		/Enclosure Device ID:/ { enc = $4 }
		/Slot Number:/ { slot = $3 }
		/Firmware state:/ {
			if ($3 == "Online")
				next
			if ($3 != "Rebuild") {
				degraded = 1
				exit
			}
			if (rbld_pds) rbld_pds = sprintf("%s %d:%d", rbld_pds, enc, slot)
			else rbld_pds = sprintf("%d:%d", enc, slot)
		}
		END {
			if (degraded || ! rbld_pds) print "_ld_is_degraded=1"
			else printf "_rbld_pds=\"%s\"\n", rbld_pds
		}')

	if [ -n "${_ld_is_degraded}" ]; then
		return 1
	fi

	for _pd in $_rbld_pds; do
		# The brackets are quoted so that the shell never treats
		# "[E:S]" as a file name pattern.
		_complete="$(${tool} -pdrbld -showprog -physdrv"[${_pd}]" -a${_adp} -nolog | \
			awk '/Rebuild Progress on Device/ {print $11}')"
		add_to_warn_msg "LD ${_ld} -> PD ${_pd} -> Rebuild (${_complete})"
	done

	return 0
}

# Check the state of every logical drive (LD) of one adapter.
# Arguments: $1 - adapter index
# Globals:   tool (read); crit_msg / warn_msg (extended through the
#            add_to_* helpers and check_degraded_ld)
# Leaves through die() with status 1 when the adapter has no LD, when the
# LD count cannot be parsed, or when an LD reports an unknown state.
check_lds()
{
	local _adp _ld_count _ld _ld_state

	_adp=$1

	_ld_count=$(${tool} -ldgetnum -a${_adp} -nolog | \
		awk '/Number of Virtual Drives/ {printf "%d", $9}')

	# Only 1..999 passes; everything else ends the check.
	case $_ld_count in
		[1-9]|[1-9][0-9]|[1-9][0-9][0-9])
			;;
		0)
			die 1 "Can't found LDs on adp ${_adp}."
			;;
		*)
			die 1 "Can't get the number of LDs (${_ld_count})."
			;;
	esac

	# LDs are addressed by index, 0 .. count-1.
	_ld=0
	while [ $_ld -lt $_ld_count ]; do
		_ld_state="$(${tool} -ldinfo -l${_ld} -a${_adp} -nolog | \
			awk -F": " '$1 ~ /State/ {print $2}')"

		case "${_ld_state}" in
			Optimal)
				;;
			Offline)
				add_to_crit_msg "LD ${_ld} -> ${_ld_state}"
				;;
			Degraded|"Partially Degraded")
				# A rebuild in progress is reported as a warning by
				# check_degraded_ld; only the other cases are critical.
				check_degraded_ld $_adp $_ld || \
					add_to_crit_msg "LD ${_ld} -> ${_ld_state}"
				;;
			*)
				die 1 "LD ${_ld}: unknown state ${_ld_state}."
				;;
		esac

		_ld=$((_ld + 1))
	done
}

# Check the physical drives (PDs) of one adapter: drive states and error
# counters, drives reported as missing, and drives that disappeared from
# the list stored by a previous run.
# Arguments: $1 - adapter index
# Globals:   tool, err_counters_priority, pd_list_prefix (read);
#            failed_pds (assigned by the eval'ed awk output below);
#            warn_msg / crit_msg (extended through the add_to_* helpers)
check_pds()
{
	local _adp _warn_pds_msg _crit_pds_msg _pd_file _pd_file_tmp
	local _add _rmv _num_add _num_rmv

	_adp=$1

	# PD states
	# The awk program prints three shell assignments that are eval'ed here:
	#   failed_pds    - "E:S" pairs whose state is Failed or Unconfigured(bad)
	#   _warn_pds_msg - drives with non-zero Media/Other error counters when
	#                   err_counters_priority matches "warning"
	#   _crit_pds_msg - drives whose firmware state starts with Unconfigured,
	#                   Failed, Missing, Offline or None, drives with error
	#                   counters when err_counters_priority matches
	#                   "critical", and the number of enclosure/slot values
	#                   that are not integers ("bad [E:S]")
	# A drive that has a critical entry is not listed among the warnings.
	eval $(${tool} -pdlist -a${_adp} -nolog | \
		awk -v err_counters_priority="${err_counters_priority}" '
		function join_str(__d, __str, __add) {
			if (__str) __res = sprintf("%s%s%s", __str, __d, __add)
			else __res = __add
			return __res
		}

		BEGIN {
			warn_pds_msg = crit_pds_msg = failed_pds = ""
			max_enc = max_slot = bad_es = 0
		}

		/Enclosure Device ID:/	{ enc  = $4; if (enc  != int(enc))  bad_es++; else if (max_enc  < enc  ) max_enc  = enc;  }
		/Slot Number:/		{ slot = $3; if (slot != int(slot)) bad_es++; else if (max_slot < slot ) max_slot = slot; }
		{
			if (/(Media|Other) Error Count:/ && $4 > 0) {
				if (err_counters_priority ~ /[Cc][Rr][Ii][Tt][Ii][Cc][Aa][Ll]/)
					crit_pd[enc, slot] = join_str(" / ", crit_pd[enc, slot], sprintf("%d %s errors", $4, tolower($1)))
				else if (err_counters_priority ~ /[Ww][Aa][Rr][Nn][Ii][Nn][Gg]/)
					warn_pd[enc, slot] = join_str(" / ", warn_pd[enc, slot], sprintf("%d %s errors", $4, tolower($1)))
			}
			if (/Firmware state: (Unconfigured|Failed|Missing|Offline|None)/)
				crit_pd[enc,slot] = $3;
		}

		END{
			for (i = 0; i <= max_enc; i++)
				for (j = 0; j <= max_slot; j++) {
					if (crit_pd[i, j]) {
						crit_pds_msg = join_str(", ", crit_pds_msg, sprintf("PD %d:%d -> %s", i, j, crit_pd[i,j]))
						if (crit_pd[i, j] ~ /^(Failed|Unconfigured\(bad\))$/)
							failed_pds = join_str(" ", failed_pds, sprintf("%d:%d", i, j))
					}
					else if (warn_pd[i, j])
						warn_pds_msg = join_str(", ", warn_pds_msg, sprintf("PD %d:%d -> %s", i, j, warn_pd[i,j]))
				}
			if (bad_es > 0)
				crit_pds_msg = join_str(", ", crit_pds_msg, sprintf("%d bad [E:S] found", bad_es))
			printf "failed_pds=\"%s\"\n", failed_pds
			printf "_warn_pds_msg=\"%s\"\n", warn_pds_msg
			printf "_crit_pds_msg=\"%s\"\n", crit_pds_msg
		}')


	# Move the per-adapter findings into the global report.
	if [ -n "${_crit_pds_msg}" ]; then
		add_to_crit_msg "${_crit_pds_msg}"
	fi

	if [ -n "${_warn_pds_msg}" ]; then
		add_to_warn_msg "${_warn_pds_msg}"
	fi

	# Missing PDs
	# Any "Missing Physical drives" line in the -pdgetmissing output is
	# reported as critical.
	if [ -n "$(${tool} -pdgetmissing -a${_adp} -nolog | \
		awk '/Missing Physical drives/')" ]
	then
		add_to_crit_msg "missing PD(s) found"
	fi

	# PD List
	# The drive list of the adapter is kept in a file between runs and
	# compared with the current list to notice drives that disappeared.
	_pd_file="${pd_list_prefix}${_adp}"
	_pd_file_tmp="${_pd_file}.tmp"

	if [ ! -s $_pd_file ]; then
		# No stored list yet (or an empty one): store the current list.
		get_pd_list $_adp > $_pd_file
	else
		get_pd_list $_adp > $_pd_file_tmp

		# An empty current list is not compared with the stored one.
		if [ -s $_pd_file_tmp ]; then
			# Count and collect the lines added to / removed from the
			# stored list; the "+++" / "---" file headers of the unified
			# diff are skipped by the patterns.
			eval $(diff -u ${_pd_file} ${_pd_file_tmp} | awk '
				BEGIN{ add = rmv = num_add = num_rmv = 0; }
				/^\+[^\+]/ {
					num_add++;
					sub(/^\+/, "", $1);
					if (add) add = add" "$1;
					else add = $1;
				}
				/^-[^-]/ {
					num_rmv++;
					sub(/^-/, "", $1);
					if (rmv) rmv = rmv", "$1;
					else rmv = $1;
				}
				END{
					printf "_num_add=%d;", num_add;
					printf "_num_rmv=%d;", num_rmv;
					if (add) printf "_add=\"%s\";", add;
					if (rmv) printf "_rmv=\"%s\";", rmv;
				}')

			if [ -n "${_add}" -a -z "${_rmv}" -o \
				-n "${_rmv}" -a ${_num_add} -eq ${_num_rmv} ]
			then
				# ( New drives have been added ) OR
				# ( The numbering has changed: as many drives were
				#   added as removed )
				# Either way the current list becomes the stored one.
				mv $_pd_file_tmp $_pd_file
			elif [ -n "${_rmv}" ]; then
				# Drives disappeared: critical once the removal has been
				# seen often enough, a warning until then.
				if rmv_pds_detect_threshold_reached; then
					add_to_crit_msg "PD(s) ${_rmv} has been removed"
				else
					add_to_warn_msg "PD(s) ${_rmv} has been removed"
				fi
			fi
		fi

		# The temporary list is still there unless it was moved above.
		if [ -f "${_pd_file_tmp}" ]; then
			rm -f $_pd_file_tmp 2>/dev/null
		fi
	fi
}

# Tell whether failed PDs may be handled automatically.
# Globals: handle_failed_pds, failed_pds, pd_ticket_prefix,
#          max_pds_for_handle (all read)
# Returns: 0 - handling is enabled and the number of failed PDs does not
#              exceed max_pds_for_handle
#          1 - otherwise
# NOTE(review): failed_pds is filled by check_pds with every PD currently
# in a failed state, so a PD that already has a ticket file looks to be
# counted in both terms of the sum - confirm that this is intended.
check_depends_for_handle_pds()
{
	local _failed_count _in_list _with_ticket

	case $handle_failed_pds in
		[Yy][Ee][Ss])
			;;
		*)
			return 1
			;;
	esac

	# Failed PDs = New failed PDs + Already existed failed PDs
	_in_list=$(echo $failed_pds | wc -w)
	_with_ticket=$(ls -1 ${pd_ticket_prefix}* 2>/dev/null | wc -l)
	_failed_count=$((${_in_list} + ${_with_ticket}))

	if [ $_failed_count -gt $max_pds_for_handle ]; then
		return 1
	fi

	return 0
}

# Run one state-changing MegaCli action on a physical drive and wait for
# its acknowledgement in the command output.
# Arguments: $1 - action: locate, offline or markmissing
#            $2 - drive as "enclosure:slot"
#            $3 - adapter index
# Globals:   tool, ch_pd_st_try_count, ch_pd_st_try_timeout (read)
# Returns:   0 - the acknowledgement was seen
#            1 - unknown action, or no acknowledgement after
#                ch_pd_st_try_count attempts
ch_pd_state()
{
	local _target _drive _ctl _action _expect _tries

	_target="$1"
	_drive="$2"
	_ctl="$3"

	# MegaCli option and the text that confirms the action
	case $_target in
		locate)
			_action="-pdlocate start"
			_expect="Exit Code: 0x00"
			;;
		offline)
			_action="-pdoffline"
			_expect=" state changed to OffLine."
			;;
		markmissing)
			_action="-pdmarkmissing"
			_expect=" is marked Missing."
			;;
		*)
			return 1
			;;
	esac

	_tries=0

	# Repeat the command, pausing between attempts, until it is confirmed
	# or the attempts are used up.
	until $tool $_action -physdrv"[${_drive}]" -a$_ctl -nolog | \
		grep -q "${_expect}"
	do
		_tries=$(($_tries + 1))
		[ $_tries -lt $ch_pd_st_try_count ] || return 1
		sleep $ch_pd_st_try_timeout
	done

	return 0
}

# Print what the URL in golem_api_url returns for this host, or
# "Unknown_DC" when the request fails or returns nothing.
# Globals: golem_api_url (read)
get_dc()
{
	local _dc

	_dc=$(fetch -q -T 10 -o - $golem_api_url 2>/dev/null)

	if [ -z "${_dc}" ]; then
		_dc="Unknown_DC"
	fi

	# Left unquoted: word splitting folds any whitespace or line breaks
	# of the response into single spaces.
	echo ${_dc}
}

# Handle every failed PD of one adapter: light up the drive, take a
# "Failed" drive offline and mark it missing, then mail a replacement
# ticket and remember it in a ticket tag file.
# Arguments: $1 - adapter index
# Globals:   failed_pds, pd_ticket_prefix, tool, hostname_s and the
#            ticket_* settings (read); ticket_subj, ticket_subj_encoded
#            (written); crit_msg (extended through add_to_crit_msg)
# Nothing is done when check_depends_for_handle_pds refuses.  A PD whose
# tag file exists is only reported; a PD whose state change is not
# acknowledged is skipped without a ticket.
handle_failed_pds()
{
	local _pd _pd_enc _pd_num _pd_type _pd_size _pd_state _pd_model _pd_sn
	local _adp _state _state_targets _ticket_tag
	local _enc_con_pos _enc_con_pos_notavail

	if ! check_depends_for_handle_pds; then
		return
	fi

	_adp=$1

	for _pd in $failed_pds; do
		_ticket_tag="${pd_ticket_prefix}_${_pd}_${_adp}"

		if [ -f "${_ticket_tag}" ]; then
			add_to_crit_msg "Ticket for PD ${_pd} is created"
			continue
		fi

		# The enclosure lookup below assigns these only under certain
		# conditions, so they are cleared for every PD; otherwise the
		# value found for the previous PD would end up in this ticket.
		_enc_con_pos=""
		_enc_con_pos_notavail=""

		# Collect the drive description for the ticket text.
		eval $(${tool} -pdinfo -physdrv"[${_pd}]" -a$_adp -nolog | awk '
			/Enclosure Device ID:/	{ pd_enc  = $4 }
			/Slot Number:/		{ pd_num  = $3 }
			/PD Type:/		{ pd_type = $3 }
			/Raw Size:/	{
				sub(/MB/, "", $3)
				if ($3 > 0) pd_size = sprintf("%d MB", $3)
				else pd_size = "Unknown size"
			}
			/Firmware state:/	{ pd_state = $3 }
			/Inquiry Data:/	{
				for (i = 3; i < NF; i++) {
					if (pd_model) pd_model = sprintf("%s %s", pd_model, $i)
					else pd_model = $i
				}
				pd_sn = $NF
			}
			END{
				printf "_pd_enc=\"%d\";",   pd_enc
				printf "_pd_num=\"%d\";",   pd_num
				printf "_pd_type=\"%s\";",  pd_type
				printf "_pd_size=\"%s\";",  pd_size
				printf "_pd_state=\"%s\";", pd_state
				printf "_pd_model=\"%s\";", pd_model
				printf "_pd_sn=\"%s\";",    pd_sn
			}
		')

		if [ -z "${_pd_enc}" -o -z "${_pd_num}" -o -z "${_pd_type}" -o \
			-z "${_pd_size}" -o -z "${_pd_state}" -o \
			-z "${_pd_model}" -o -z "${_pd_sn}" ]
		then
			add_to_crit_msg "Not enough info about PD ${_pd} for create ticket"
			continue
		fi

		# With more than one enclosure and more than 16 drives the ticket
		# also names the enclosure as "connector:position", or states that
		# it could not be determined.
		eval $(${tool} -encinfo -a$_adp -nolog | awk -v enc="${_pd_enc}" '
			BEGIN{ pos = con = "NULL" }

			/Number of enclosures /		{ num_encs = $8 }
			/Number of Physical Drives/	{ num_pds += $6 }

			/^ +Device ID +:/ && $4 == enc	{ in_enc = 1 }
			/^ +Device ID +:/ && $4 != enc	{ in_enc = 0 }

			/Position/			{ if (in_enc) pos = $3 }
			/Connector Name/		{ if (in_enc) con = $4 }

			END{
				if (num_encs > 1 && num_pds > 16) {
					if (pos ~ /^[0-9]+$/ && con ~ /^[0-9]+$/)
						printf "_enc_con_pos=\"%d:%d\";", con, pos
					else
						printf "_enc_con_pos_notavail=\"yes\";"
				}
			}
		')

		# Actions to perform before the ticket is sent
		case $_pd_state in
			"Failed")
				_state_targets="locate offline markmissing"
				;;
			"Unconfigured(bad)")
				_state_targets="locate"
				;;
			*)
				continue
				;;
		esac

		for _state in $_state_targets; do
			if ! ch_pd_state $_state $_pd $_adp; then
				# Go on with the next PD; no ticket for this one
				continue 2
			fi
		done

		# TODO Tickets have to be sent via bot.yandex-team.ru API

		# NOTE(review): str.encode('base64') exists in Python 2 only -
		# confirm which interpreter "python" resolves to on the hosts.
		ticket_subj=$(echo "${ticket_subj_tpl}" | sed -e "s/<DC>/$(get_dc)/")
		ticket_subj_encoded=$(python -c \
			"print('=?${ticket_encoding}?B?%s?=' % \
				'${ticket_subj}'.encode('base64').strip())")

		# NOTE(review): the exit status of sendmail is not checked, the
		# tag file below is created even when the mail was not accepted.
		sendmail -t <<EOF_TICKET
From: ${ticket_from}
Subject: ${ticket_subj_encoded}
To: ${ticket_to}
Cc: ${ticket_cc}
Content-Type: text/plain; charset=${ticket_encoding}
Content-Transfer-Encoding: 8bit
MIME-Version: 1.0
User-Agent: ${ticket_user_agent:-$0}

Hi;

Нужно заменить диск ${_pd_num} (считая с нуля) в полке ${_enc_con_pos:+"${_enc_con_pos} "}машины ${hostname_s}
на аналогичный:
${_pd_model} (${_pd_type} ${_pd_size})
S/N: ${_pd_sn}

Диск подсвечен.
${_enc_con_pos:+Идентификация полки: (порт контроллера, с нуля):(позиция полки, с нуля).}${_enc_con_pos_notavail:+Номер полки определить не удалось - искать по подсвеченному ${_pd_num}-му диску.}

Спасибо.


-- 
${ticket_signature}
EOF_TICKET

		touch "${_ticket_tag}"
	done
}

# Drop the state kept between runs once nothing is wrong any more.
# Globals: crit_msg, warn_msg, rmv_pds_counter, pd_ticket_prefix (read)
# The removal counter and all ticket tag files are deleted only when both
# the critical and the warning report are empty.
cleanup()
{
	[ -z "${crit_msg}${warn_msg}" ] || return 0

	rm -f $rmv_pds_counter 2>/dev/null
	rm -f ${pd_ticket_prefix}* 2>/dev/null
}

# Print the final result and leave the script through die().
# Globals: crit_msg, warn_msg (read)
# Status 2 with the critical report if there is one, otherwise status 1
# with the warning report if there is one, otherwise status 0 and "Ok".
report()
{
	[ -z "${crit_msg}" ] || die 2 "${crit_msg}."
	[ -z "${warn_msg}" ] || die 1 "${warn_msg}."

	die 0 Ok
}


#-- Variables ----------------------------------------------------------

# Optional settings file; the main section sources it when it is readable.
config="${BASE}/etc/${me}.conf"

# External commands: sudo, and the MegaCli candidates tried in this order.
sudo_cmd="/usr/local/bin/sudo"
megacli_cmds="/usr/local/sbin/MegaCli /usr/local/sbin/megacli"

# State files kept between runs.
pd_list_prefix="${TMP}/${me}.pdlist"
pd_ticket_prefix="${TMP}/${me}.ticket"
rmv_pds_counter="${TMP}/${me}.rmv_cnt"

# Report texts collected during the run.
warn_msg=""
crit_msg=""

# Non-zero Media/Other error counters are reported with this priority
# (WARNING or CRITICAL, matched case-insensitively).
err_counters_priority="WARNING"

# Number of consecutive detections after which a removed PD is critical.
rmv_pds_detect_threshold=3

# Automatic handling of failed PDs: on/off switch and the largest number
# of failed PDs for which it is still attempted.
handle_failed_pds="YES"
max_pds_for_handle=1

# Attempts and pause (seconds, passed to sleep) for one PD state change.
ch_pd_st_try_count=3
ch_pd_st_try_timeout=5

hostname=$(hostname)
hostname_s=$(hostname -s)

# Replacement ticket mail.
ticket_encoding="utf-8"
ticket_from="HW Raid <search-maintenance@yandex-team.ru>"
ticket_to="helpdc@yandex-team.ru"
ticket_cc="search-maintenance@yandex-team.ru"
ticket_subj_tpl="[<DC>] Заменить диск в полке на ${hostname_s}"
ticket_signature="HW Raid at ${hostname_s}"
ticket_user_agent="${me} $(echo '$Revision$' | awk '{printf "%s", $2}')"

# URL queried by get_dc; its answer replaces <DC> in the ticket subject.
golem_api_url="https://golem.yandex-team.ru/api/host_query.sbml?hostname=${hostname}&columns=dc&encoding=${ticket_encoding}"


#-- Main ---------------------------------------------------------------

check_depends

# Every MegaCli call goes through sudo.
tool="${sudo_cmd} ${megacli_cmd}"

# Settings from the optional configuration file replace the defaults set
# in the variables section.
[ ! -r "${config}" ] || . ${config}

get_adp_count

# Adapters are addressed by index, 0 .. adp_count-1.
for adp in $(jot ${adp_count} 0); do
	# Filled by check_pds, consumed by handle_failed_pds
	failed_pds=""

	check_lds $adp
	check_pds $adp
	handle_failed_pds $adp
done

cleanup
report

