#!/bin/sh

#######
## Global variables description
# AUTO: =0 if running with cmdline (admin wants to do smth special), or =1 if not (e.g. from cron)
# BADDISK: name of bad disk (e.g. ada1), used by 'disk_remove' and emails
# BOT: flag, '=1' indicates that bot info is a reliable source of disk data
# CHECKMIRROR_CHILD: flag, unique global var. '=1' indicates that current process is a child and FILE_LCK is locked
# CMDLINE_ACTION: what to do
# CMDLINE_COMMENT: optional comment line to send to bot&email
# CMDLINE_DISKN: disk number for action
# CMDLINE_DISKNAME: disk name for action
# DEBUG: debugging mode (dont call bot, send emails to debug_email_to)
# DISKTYPE: base for disk names (type of disks), e.g. 'ad', 'ada', 'da'
# FILE_LCK: lock file path
# FILE_OFF: this file indicates that script must stop doing smth automatically and wait for admin action
# FORCE: this enables 'force disk removal', which sends ticket to replace even if 'insdisk' script fails
# HDD_COUNT: count of disks in system
# HDD_DATA: multi-line text 'database' (one line per disk, fields delimited by spaces)
#   each line contains disk name, model, serial, and an optional flag (see ORIGIN)
#   filled on script start, and then used by the 'hddinfo' function (generally for emails)
#   if BOT=0, hddinfo doesn't look at this var
#   e.g.:
#   da0 WD5002ABYS WCASYE025888    <-- data was fetched from disk directly (ORIGIN=1)
#   da1 WD5002ABYS WCASY9200375
#   da2 WD5002ABYS WCASYE507425 0  <-- data was fetched from bot by 'guessing' (ORIGIN=0)
# MODEL, SERIAL, DISKN: model, serial and number of currently picked disk
# ORIGIN: flag for picked disk data, indicates that data was fetched from disk directly (=1) or from bot by exclusion (=0)
# SMART_STRICT_CHECK: enable strict check in 'disk_check_smart'
# UPTIME: uptime in seconds
#

# Site helper library. Helpers used below but not defined in this file
# (e.g. is_number, ya_network_info) presumably come from here -- TODO confirm.
. /etc/ya.subr
# files created by this script must be private to the owner
umask 077

# --- mode flags (see the variable description above)
AUTO=1
BOT=0
DEBUG=0
FORCE=0
SMART_STRICT_CHECK="0"

# --- data collected or passed in at run time
CMDLINE_COMMENT=""
HDD_DATA=""

# --- state files
FILE_LCK="/tmp/checkmirror.sh.lck"
FILE_OFF="/tmp/checkmirror.sh.off"

# a single newline, used as a line separator when building multi-line strings
br='
'

# --- mail settings
debug_email_to="dlarionov@yandex-team.ru"
email_to="helpdc@yandex-team.ru"
email_list="seek-hw@yandex-team.ru"
email_cc="${email_list}"
email_replyto="search-maintenance@yandex-team.ru"

h=$(hostname)
# mail signature: the "-- " delimiter line (with its trailing space), host name, two fixed lines
signature="-- ${br}${h}${br}Сервер отдела поисковых систем департамента эксплуатации компании \"Яндекс\"${br}Тел.: +7 495 7397000"

send_to_bot()
{
    # Files a "replace disk" request with the bot HTTP API.
    # Globals read: CMDLINE_COMMENT, DEBUG, br, email_replyto, h, DISKN, MODEL, SERIAL
    # [$1 = "run-assist" -> the output of 'email_hddinfo_assist bot' is appended to the comment]
    # Returns 0 when fetch succeeded and the reply is a number (printed as the task number).
    # Returns 1 otherwise; debug mode only prints the request data and always returns 1,
    # so that the caller goes on and sends the email as well.
    local _reply _note _rc

    _note="$CMDLINE_COMMENT"
    if [ "$1" = "run-assist" ]; then
        _note="${_note}${br}$(email_hddinfo_assist bot)"
    fi

    # a non-empty comment becomes one more parameter of the GET request
    if [ "$_note" != "" ]; then
        _note="&comment=${_note}"
    fi

    if [ "$DEBUG" != "0" ]; then
        echo "DISKN = $DISKN"
        echo "MODEL = $MODEL"
        echo "SERIAL= $SERIAL"
        echo "COMMENT:"
        echo "$_note"
        echo "--"
        # force to send email too
        return 1
    fi

    # the reply body is expected to be the id of the created task
    _reply="$(fetch -q -o - "http://bot.yandex-team.ru/api/request.php?initiator=search&operation=hdd&email=yes&replyto=${email_replyto}&name=${h}&option=changeanalog&slot=${DISKN}&model=${MODEL}&serial=${SERIAL}${_note}" 2>/dev/null)"
    _rc=$?

    if [ "$_rc" -eq 0 ] && is_number $_reply; then
        echo "Task number: ${_reply}"
        return 0
    fi

    echo "Error sending request to bot"
    return 1
}

email_hddinfo_assist()
{
    # Prints the inline "other disks" information used in emails and in the bot comment:
    # model/serial of every disk except the bad one (number $DISKN).
    # [$1 = any non-empty value (callers pass "bot") -> output is for the bot comment;
    #       the guessed data of the bad disk itself is then not appended]
    # Globals read: HDD_COUNT, DISKN, DISKTYPE, BOT, BADDISK
    # Globals changed: MODEL, SERIAL and ORIGIN are overwritten by every 'hddinfo' call
    local _out_bot _num

    if [ "$1" != "" ]; then
        _out_bot="1"
    fi

    echo "Система потеряла доступ к диску."
    echo "Ниже данные других присутствующих дисков:"
    echo
    # -x: compare whole lines, otherwise DISKN=1 would also hide disks 10, 11, ...
    # (with an empty DISKN nothing is excluded)
    for _num in `jot $HDD_COUNT 0 | grep -vx -- "$DISKN"`; do
        if hddinfo ${DISKTYPE}${_num} ; then
            echo "${_num}:"
            echo "Model:  $MODEL"
            echo "Serial: $SERIAL"
        fi
    done
    # if bot info is correct, print hddinfo for baddisk
    # if not, we cant obtain it (have tried already)
    if [ "$BOT" = "1" ] && [ "$_out_bot" != "1" ] && hddinfo $BADDISK ; then
        echo
        echo "По информации в базе, это может быть следующий диск:"
        echo "Model:  $MODEL"
        echo "Serial: $SERIAL"
    fi
    echo " "
}

email_general_raid_error() {
    # Mails the hardware list about a RAID problem that needs human attention.
    # [$1 = disk name; when given, a line about this disk is added to the diagnostics]
    # Nothing is sent in manual mode (AUTO=0) or when stdin is a terminal.
    # After the mail, unless DEBUG is on, FILE_OFF is touched (the script checks this
    # file in auto mode and stays idle while it exists).
    echo "Sending email: general error $1"
    if [ -t 0 ] || [ "$AUTO" = "0" ]; then
        echo "Cancelled (running manual mode or in terminal)"
        return 0
    fi

    sendmail -t <<EOF
From: "$h" <${email_list}>
To: ${email_list}
Reply-To: ${email_replyto}
Subject: bad disk in $h
Content-Type: text/plain; charset="utf-8"
Mime-Version: 1.0

В машине $h что-то не так с RAID-ом.
Стоит отвлечься на машину.

Автоматика отключена на 1 сутки.

PS. Данные для диагностики:
$(
if [ "$1" != "" ]; then
    echo "Проблемы с диском ${1}"
fi
echo "# uptime"         ; uptime
echo "# gmirror status" ; gmirror status
)

${signature}
EOF

    if [ "$DEBUG" = "0" ]; then
        touch $FILE_OFF
    fi
}

email_clean_disk_eject() {
    # Mails the hardware list that a disk which looks healthy has dropped out of the RAID.
    # [$1 = disk name]
    # Nothing is sent in manual mode (AUTO=0) or when stdin is a terminal.
    # After the mail, unless DEBUG is on, FILE_OFF is touched (the script checks this
    # file in auto mode and stays idle while it exists).
    echo "Sending email: clean disk eject $1"
    if [ -t 0 ] || [ "$AUTO" = "0" ]; then
        echo "Cancelled (running manual mode or in terminal)"
        return 0
    fi

    sendmail -t <<EOF
From: "$h" <${email_list}>
To: ${email_list}
Reply-To: ${email_replyto}
Subject: RAID problem on $h
Content-Type: text/plain; charset="utf-8"
Mime-Version: 1.0

На машине $h из RAID-а выпал исправный диск $1
Это может свидетельствовать о сбоях диска, с которого он синхронизировался.
Стоит отвлечься на машину.

Автоматика отключена на 1 сутки.

PS. Данные для диагностики:
$(
echo "# uptime"         ; uptime
echo "# gmirror status" ; gmirror status
)

${signature}
EOF

    if [ "$DEBUG" = "0" ]; then
        touch $FILE_OFF
    fi
}

email_replace() {
    # Requests a replacement of the bad disk whose data is already in the globals
    # (DISKN, MODEL, SERIAL). The bot is tried first; the email below is sent only
    # when 'send_to_bot' fails.
    # Globals read: h, email_list, email_to, email_cc, email_replyto, DC, DISKN, MODEL,
    #               SERIAL, CMDLINE_COMMENT, signature
    local _pad
    # a line consisting of one space, kept in a variable so it survives editors
    _pad=" "

    send_to_bot && return 0

    echo "Sending email: replace to $email_to"
    sendmail -t <<EOF
From: "$h" <${email_list}>
To: ${email_to}
Cc: ${email_cc}
Reply-To: ${email_replyto}
Subject: ${DC}: bad disk in $h
Content-Type: text/plain; charset="utf-8"
Mime-Version: 1.0

Замените, пожалуйста, диск в машине $h на диск аналогичного объема и форм-фактора.
Номер:  $DISKN с 0 (предположительно)
Model:  $MODEL
Serial: $SERIAL
${_pad}
$([ "$CMDLINE_COMMENT" = "" ] || echo "Комментарий: $CMDLINE_COMMENT")

Проверять по активности и серийному номеру.
Выключать сервер не нужно.
Благодарю.

Если Вы получили это письмо, значит bot.yandex-team.ru не смог принять заявку.

PS. Данные для диагностики:
# gmirror status
$(gmirror status)

${signature}
EOF
}

email_replace_unknown() {
    # Requests a replacement of a disk whose own data could not be read reliably:
    # the data of the other disks is attached instead ('email_hddinfo_assist').
    # 'hddinfo' is run for $BADDISK first so that MODEL/SERIAL hold whatever is known
    # about it; then the bot is tried and the email below is sent only if that fails.
    hddinfo $BADDISK
    send_to_bot "run-assist" && return 0

    echo "Sending email: replace_unknown to $email_to"
    sendmail -t <<EOF
From: "$h" <${email_list}>
To: ${email_to}
Cc: ${email_cc}
Reply-To: ${email_replyto}
Subject: ${DC}: bad disk in $h
Content-Type: text/plain; charset="utf-8"
Mime-Version: 1.0

Замените, пожалуйста, диск в машине $h на диск аналогичного объема и форм-фактора.
Номер: $DISKN с 0 (предположительно)

$(
if [ "$CMDLINE_COMMENT" != "" ]; then
    echo "Комментарий: $CMDLINE_COMMENT"
    echo
fi
email_hddinfo_assist
)

Проверять по активности и серийному номеру.
Выключать сервер не нужно.
Благодарю.

Если Вы получили это письмо, значит bot.yandex-team.ru не смог принять заявку.

PS. Данные для диагностики:
# gmirror status
$(gmirror status)

${signature}
EOF
}

gmirror_forget()
{
    # Runs 'gmirror forget' for every mirror that 'gmirror status' reports as DEGRADED.
    # The first column of a matching line looks like "mirror/NAME"; only the part
    # between the first and the second slash (NAME) is handed over to gmirror.
    gmirror status \
        | awk '/DEGRADED/ { split($1, part, "/"); print part[2] }' \
        | xargs gmirror forget
}

hddinfo_fin()
{
    # Normalizes the MODEL and SERIAL globals so that they can be compared with the
    # human-entered bot data:
    #   - '<' and '>' are removed (they occasionally show up for 'da' disks)
    #   - MODEL  becomes its longest whitespace-separated word:
    #       "ATA ST3500320NS SN05" -> "ST3500320NS"
    #   - SERIAL becomes its longest "-"-separated part:
    #       "WD-WCASYD543955" -> "WCASYD543955"
    # When several words have the same length the first one wins.
    local _longest

    # awk program: print the longest field of the line
    _longest='{ best = ""; for (k = 1; k <= NF; k++) { if (length($k) > length(best)) best = $k }; print best }'

    MODEL=$(echo $MODEL | tr -d '<>' | awk "$_longest")
    SERIAL=$(echo $SERIAL | tr -d '<>' | awk -F- "$_longest")
}

hddinfo()
{
    # Sets the globals MODEL / SERIAL / ORIGIN to the data of one disk.
    # $1 = disk name (e.g. ada1)
    # Globals read: BOT, HDD_DATA, DISKTYPE
    # Data source:
    #   BOT=1 -> the line of HDD_DATA whose first field is exactly the disk name
    #   BOT=0 -> the disk itself (atacontrol / camcontrol, depending on DISKTYPE)
    # ORIGIN is 1 only when the data was read from the disk itself, or comes from an
    # HDD_DATA line without the trailing "0" flag.
    # Returns 0 when both MODEL and SERIAL were found. Otherwise returns 1 with
    # ORIGIN=0, so that callers which look only at ORIGIN (disk_remove) treat the
    # disk as "unknown" instead of reporting empty model/serial as authoritative.
    local _diskname _diskdata

    MODEL=""
    SERIAL=""
    ORIGIN=0
    if [ "$1" = "" ]; then
        return 1
    fi
    _diskname=$1

    # if bot info is correct, get info from var
    if [ "$BOT" = "1" ]; then
        # exact match on the name field: a plain grep would also hit substrings
        # (other disk names, or a model/serial containing the name)
        _diskdata=`printf '%s\n' "$HDD_DATA" | awk -v disk="$_diskname" '$1 == disk { print; exit }'`
        if [ "$_diskdata" = "" ]; then
            # nothing is known about this disk
            return 1
        fi

        MODEL=`printf '%s\n' "$_diskdata" | awk '{print $2}'`
        SERIAL=`printf '%s\n' "$_diskdata" | awk '{print $3}'`
        # if data was not fetched from origin ($4=="0"), set var to 0
        ORIGIN=`printf '%s\n' "$_diskdata" | awk '{if ($4=="0") {print 0;} else {print 1;} }'`

        # same rule as for real disks below: incomplete data is an error
        if [ "$MODEL" = "" ] || [ "$SERIAL" = "" ]; then
            ORIGIN=0
            return 1
        fi
        return 0
    fi

    # if disk doesnt even exist, return error
    if [ ! -e /dev/$_diskname ]; then
        return 1
    fi

    # all fetched data below is original only
    ORIGIN=1
    case $DISKTYPE in
        ad)
            _diskdata=`atacontrol cap $_diskname 2>/dev/null`
            MODEL=`echo  "$_diskdata" | grep "device model"  | awk '{print $3" "$4}'`
            SERIAL=`echo "$_diskdata" | grep "serial number" | awk '{print $3}'`
            ;;
        ada)
            _diskdata=`camcontrol identify $_diskname 2>/dev/null`
            MODEL=`echo  "$_diskdata" | grep "device model"  | awk '{print $3" "$4}'`
            SERIAL=`echo "$_diskdata" | grep "serial number" | awk '{print $3}'`
            ;;
        da)
            # ask for inquiry data only if the unit answers at all
            if [ "`camcontrol tur ${_diskname}`" = "Unit is ready" ]; then
                MODEL=`camcontrol  inquiry $_diskname -D 2>/dev/null | awk '{print $2" "$3" "$4}'`
                SERIAL=`camcontrol inquiry $_diskname -S 2>/dev/null | awk '{print $1}'`
            fi
            ;;
    esac

    # finalize disk info
    hddinfo_fin

    # if model or s/n is empty, return error
    if [ "$MODEL" = "" ] || [ "$SERIAL" = "" ]; then
        # the disk did not answer, so nothing was really read from it
        ORIGIN=0
        return 1
    fi
}

disk_remove()
{
    # this function removes $BADDISK from active mirrors and sends suitable report
    #
    # Globals read:    BADDISK, AUTO, DEBUG, FORCE, CMDLINE_DISKN, ORIGIN (set by hddinfo)
    # Globals written: DISKN, SMART_STRICT_CHECK (set to "1" and not restored)
    # Returns 1 without doing anything when BADDISK is empty or (auto mode only) when
    # the disk is still listed in a degraded mirror.
    # On every other path the function does not return: it ends the script with
    # 'exit' (0 after a replacement request was issued, 1 after a problem report).

    if [ "$BADDISK" = "" ]; then
        return 1
    fi


    # 'lookup_degraded' prints the names of degraded (not rebuilding) mirrors; their
    # status lines are then searched for "DEGRADED <BADDISK>"
    if [ "$AUTO" = "1" ] && [ "$DEBUG" = "0" ] && gmirror status -s `lookup_degraded` | egrep -q " DEGRADED *${BADDISK}" ; then
        # check if this disk doesnt exist in any degraded mirror
        # if it does, it means we have just found wrong disk
        return 1
    fi

    # disk number = disk name with everything but digits removed (ada1 -> 1)
    DISKN=`echo $BADDISK | sed 's/[^0-9]//g'`
    gmirror_forget

    # from here on SMART is checked with the strict thresholds
    SMART_STRICT_CHECK="1"
    if [ "$AUTO" = "1" ] && disk_check_smart $BADDISK ; then
        # disk seems to be clean, report a potential problem
        email_clean_disk_eject $BADDISK
        exit 1
    fi

    # hddinfo after detaching "ad" drive will not work
    # so, first try to get it, and then call "insdisk stop"
    hddinfo $BADDISK

    # the helper script is looked up in the directory of this script; skipped in debug mode
    if [ "$DEBUG" = "0" ] && ! $(dirname $0)/insdisk_ws2.sh stop $BADDISK ; then
        # some error occured while removing disk
        if [ "$FORCE" = "0" ] ; then
            email_general_raid_error $BADDISK
            exit 1
        fi
        # force mode: warn and create the ticket anyway
        echo
        echo "CAUTION! Disk remove failed, probably it is still in use."
        echo "Physical disk replacement WILL cause data loss!"
        echo "Generating ticket..."
    fi

    # replace DISKN by CMDLINE_DISKN if specified
    if [ "$AUTO" = "0" ] && ! [ "$CMDLINE_DISKN" = "" ] ; then
        DISKN="$CMDLINE_DISKN"
    fi

    if [ "$ORIGIN" = "0" ]; then
        # disk doesnt respond (hddinfo fails), or its data is fetched from non-authoritative source (needs to print other data)
        email_replace_unknown
        exit 0
    fi

    # none of conditions match - no problems found, report bad disk as usual
    email_replace
    exit 0
}

lookup_degraded()
{
    # Prints the names of mirrors that are DEGRADED and are not being rebuilt.
    # Exit status: 1 if at least one such mirror was printed, 0 otherwise
    # (the status of the last awk is the status of the function).
    #
    # The first awk drops the "mirror/" prefix, so the second one sees:
    #   $1 = mirror name, $2 = status, $3 = component name,
    #   $4 and $5 = component state; a "%" in them means that a rebuild is running
    gmirror status -s \
        | awk -F/ '{ print $2 }' \
        | awk '
            $2 == "DEGRADED" && state[$1] != "rebuild" { state[$1] = "degraded" }
            state[$1] == "degraded" && ($4 $5) ~ /%/   { state[$1] = "rebuild" }
            END {
                rc = 0
                for (name in state) {
                    if (state[name] == "degraded") {
                        print name
                        rc = 1
                    }
                }
                exit rc
            }'
}

get_missing_number()
{
    # Reads disk or partition names (e.g. da1, ada2f) from stdin, one per line, and
    # prints every disk number in 0..HDD_COUNT-1 that was not seen.
    # Exit status = count of printed (missing) numbers, so 0 means "nothing is missing"
    # and 1 means "exactly one disk is missing".
    # Globals read: DISKTYPE (only lines containing it are used), HDD_COUNT
    local _top

    # highest valid disk number; sed keeps only the characters '0'..'<_top>' of a name
    _top=$(($HDD_COUNT - 1))

    grep $DISKTYPE \
        | sed 's/[^0-'${_top}']//g' \
        | awk -v total=$HDD_COUNT '
            # $1 = disk number found in the input
            { seen[$1] = 1 }

            END {
                missing = 0
                for (n = 0; n < total; n++) {
                    if (!(n in seen)) {
                        missing++
                        print n
                    }
                }
                # exit status is the number of missing disks
                exit missing
            }'
}

disk_check_smart()
{
    # Checks the SMART data of a disk.
    # $1 = disk name
    # Globals read: SMART_STRICT_CHECK
    # Returns 1 when no name is given, /dev/<disk> does not exist or smartctl exits
    # with 1..3 (SMART data cannot be read).
    # When the counters exceed the limits, a message is printed and the return value
    # is $SMART_STRICT_CHECK itself: an error (1) in strict mode only, 0 otherwise.
    # Returns 0 when nothing suspicious was found.
    local _dev _report _rc _bad _limit_unc _limit_pending

    if [ "$1" = "" ]; then
        return 1
    fi
    _dev=$1

    # limits for uncorrectable/reallocated and for pending sectors
    if [ "$SMART_STRICT_CHECK" = "1" ]; then
        _limit_unc="1"
        _limit_pending="1"
    else
        _limit_unc="50"
        _limit_pending="3"
    fi

    if [ ! -e /dev/${_dev} ]; then
        echo "Error: /dev/${_dev} doesnt exist"
        return 1
    fi

    _report="$(/usr/local/sbin/smartctl -a /dev/${_dev})"
    _rc="$?"
    case "${_rc}" in
        1|2|3)
            echo "Error: cannot read SMART for ${_dev}, disk is probably down"
            return 1
            ;;
    esac
    _bad=0

    # ATA drive: the last field of an attribute line is compared with the limits
    if echo "$_report" | grep -q "ATA" ; then
        if ! echo "$_report" | awk '
                /Uncorrectable|Realloc/ && $(NF) >= '${_limit_unc}'     { exit 1 }
                /Pending/               && $(NF) >= '${_limit_pending}' { exit 1 }
                /FAILING/                                               { exit 1 }'
        then
            _bad=1
        fi
    fi

    # SCSI or SAS drive: read/write/verify lines of the error counter table
    if echo "$_report" | egrep -q "SCSI|SAS" ; then
        if ! echo "$_report" | grep -A4 uncorrected \
                | awk '/read:|write:|verify:/ && $(NF) >= '${_limit_unc}' { exit 1 }'
        then
            _bad=1
        fi
    fi

    if [ "$_bad" = "0" ]; then
        return 0
    fi

    echo "Error: there are SMART errors (fails or badblocks), don't use this disk"
    # report error only in strict checking mode
    # this must be only temporary solution, but we have to insert disk regardless of errors, because it has been tested
    return $SMART_STRICT_CHECK
}

ata_channel_probe()
{
    # Probes one ATA channel for a disk and, if the disk shows up, checks its SMART.
    # $1 = ata channel number
    # $2 = disk name to look for
    # A channel that already has a disk is re-initialized; an empty channel is
    # detached and attached again.
    # Returns 0 only when the disk name appears in the atacontrol output and
    # 'disk_check_smart' accepts the disk. Debug mode never touches the channel
    # and returns 1.
    local _channel _target _info _was_attached _rc

    if [ "$2" = "" ]; then
        return 1
    fi
    _channel="ata$1"
    _target=$2

    if [ "$DEBUG" = "1" ]; then
        echo "Skipping ATA channel probe: ${_channel}"
        return 1
    fi

    _info="$(atacontrol info ${_channel} 2>/dev/null)" || return 1

    if echo "$_info" | grep -q " ${DISKTYPE}. " ; then
        # channel is not empty
        atacontrol reinit ${_channel} 2>/dev/null | grep -q ${_target} && disk_check_smart ${_target}
        return $?
    fi

    # channel is empty; a successful detach means that it was attached until now
    if atacontrol detach ${_channel} 2>/dev/null ; then
        _was_attached=1
    else
        _was_attached=0
    fi

    atacontrol attach ${_channel} 2>/dev/null | grep -q ${_target} && disk_check_smart ${_target}
    _rc=$?

    if [ "$_was_attached" = "1" ]; then
        return $_rc
    fi

    # the channel was detached before the probe: leave it detached again on failure
    if [ "$_rc" = "0" ]; then
        return 0
    fi
    atacontrol detach ${_channel} 2>/dev/null
    return 1
}

disk_reinit()
    {
    # this procedure calls 'camcontrol rescan' or 'ata_channel_probe' depending on disktype and status.
    # $1 = disk name for search
    # Globals read: DISKTYPE, HDD_COUNT
    # Returns the result of 'disk_check_smart' / 'ata_channel_probe' when the disk was
    # found; prints "Error: disk not found" and returns 1 when it was not.
    # NOTE(review): _diskn is assigned but never read in this function.
    
    local n _result _disk _diskn _channels_all _channels_used

    if [ "$1" = "" ]; then
        return 1
    fi
    _disk=$1
    _diskn="${_disk#${DISKTYPE}}"
    
    case $DISKTYPE in
        ada)
            # nothing to do, just check smart
            disk_check_smart $_disk
            return $?
            ;;
        da)
            # rescan all buses, then the device node must be there
            camcontrol rescan all

            if [ -e /dev/$_disk ]; then
                disk_check_smart $_disk
                return $?
            fi
            ;;
        ad)
            # check if disk exists
            if [ -e /dev/$_disk ]; then
                disk_check_smart $_disk
                return $?
            fi

            _result="`atacontrol list 2>/dev/null`"
            # error executing or empty list
            if [ "$?" != "0" ] || ! echo "$_result" | grep -q " ${DISKTYPE}. "; then
                return 1
            fi
            # take numbers of "all" and "used" ata channels
            # ("used" = a disk line is found within the 2 lines after the channel line)
            _channels_all="` echo "$_result" |                             grep 'ATA channel' | sed 's/[^0-9]//g'`"
            _channels_used="`echo "$_result" | grep -B2 " ${DISKTYPE}. " | grep 'ATA channel' | sed 's/[^0-9]//g'`"
            # take count of ata channels
            # the count of used channels tells the layout (the first matching pattern wins)
            # NOTE(review): echo always prints at least one line, so this count is never 0
            case `echo "$_channels_used" | grep -c ""` in
                0)
                    return 1
                    ;;
                1)
                    # only one channel used
                    # examine every channel
                    for n in ${_channels_all} ; do
                        if ata_channel_probe $n $_disk ; then
                            return 0
                        fi
                    done
                    ;;
                `expr $HDD_COUNT - 1`)
                    # 1 drive per channel
                    # examine every empty channel
                    for n in ${_channels_all} ; do
                        # NOTE(review): substring match, channel 1 also matches a used channel 10
                        if ! echo "${_channels_used}" | grep -q $n ; then
                            if ata_channel_probe $n $_disk ; then
                                return 0
                            fi
                        fi
                    done
                    ;;
                `expr $HDD_COUNT / 2`)
                    # 2 drives per channel
                    # search and examine channel with 1 disk
                    for n in ${_channels_used} ; do
                        # check if channel has vacant slot
                        if echo "${_result}" | grep -A2 "ATA channel $n" | grep -q "no device present"; then
                            # probe channel for drive
                            # only the first channel with a vacant slot is tried
                            ata_channel_probe $n $_disk
                            return $?
                        fi
                    done
                    ;;
            esac
            ;;
    esac
    # if we are here, it seems that nothing found
    echo "Error: disk not found"
    return 1
    }

kill_foreign_mirrors()
    {
    # this procedure looks for mirrors, which come with inserted disk,
    # checks if they are foreign, and kills them
    # $1 = disk name for search
    # Globals read: DEBUG (in debug mode mirrors are only listed, not removed)
    # Returns 0 when no mirror is left on the disk to worry about, 1 when the disk
    # name is empty, the device does not exist, a found mirror is known to gstripe
    # (so it is not foreign) or 'gmirror remove' failed.
    # NOTE(review): _d is declared but never used.
    
    local _d _disk _mirror _mirror_list

    if [ "$1" = "" ] || ! [ -e /dev/${1} ]; then
        return 1
    fi
    _disk=$1

    # "touch" disk (open for writing and just close), it will wake up geom elements for this disk
    # (count=0: dd copies nothing)
    echo "Disk is UP. GEOM info refreshing..."
    dd if=/dev/zero of=/dev/${_disk} count=0 >/dev/null 2>/dev/null
    sleep 5

    # doing 'forget' to all discovered (and possibly degraded) mirrors
    gmirror_forget

    # mirrors whose status line mentions the disk
    _mirror_list=`gmirror status -s | grep ${_disk} | awk '{print $1}'`
    # '_mirror_list' elements look like 'mirror/db1'

    if [ "${_mirror_list}" = "" ]; then
        # no mirrors found
        return 0
    fi

    for _mirror in ${_mirror_list} ; do
        # a gstripe status line ending with " <mirror>" means that the mirror is a stripe component
        if gstripe status -s | egrep -x ".* ${_mirror}" ; then
            # gstripe knows this mirror, we shouldn't kill it.
            # grep will print found element if so.
            echo "It seems that found mirror is not foreign."
            return 1
        fi

        if [ "$DEBUG" = "0" ]; then
            echo "Killing foreign mirror: ${_mirror}"
            # gmirror remove 'mirror_name' 'mirror_elements'
            # name = part after "mirror/", elements = 3rd column of the mirror's status lines
            gmirror remove `echo ${_mirror} | awk -F/ '{print $2}'` `gmirror status -s | grep "${_mirror} " | awk '{print $3}'`
            if [ "$?" -ne 0 ]; then
                echo "'gmirror remove' execution failed"
                return 1
            fi
        else
            echo "Skipping foreign mirror kill: ${_mirror}"
        fi
    done

    # any other problems will be discovered by 'insdisk'
    return 0
    }

raid_autoinsert()
{
    # Finds disks that are not a component of any mirror and inserts them.
    # For every missing disk number: re-initialize the disk, kill foreign mirrors found
    # on it, then run 'insdisk_ws2.sh start' (located next to this script) unless in
    # debug mode.
    # Returns 0 when all drives are present or every missing disk was handled,
    # 1 when at least one disk could not be prepared or inserted.
    # todo: check for incomplete mirrors, run insert for disks with active foreign mirror(s)
    local _name _n _missing _status

    _status=0

    # get_missing_number exits with the count of missing disks
    _missing="$(gmirror status -s | awk '{print $3}' | get_missing_number)"
    if [ "$?" = "0" ]; then
        echo "All drives are present. Please, check config:"
        echo
        echo "# gmirror status"
        gmirror status
        return 0
    fi

    for _n in ${_missing} ; do
        _name="${DISKTYPE}${_n}"
        echo "Drive is missing: ${_name}"

        if disk_reinit ${_name} && kill_foreign_mirrors ${_name} ; then
            if [ "$DEBUG" != "0" ]; then
                echo "Skipping disk insertion: ${_name}"
            else
                echo "Inserting..."
                $(dirname $0)/insdisk_ws2.sh start $_name
                if [ "$?" != "0" ]; then
                    _status=1
                fi
            fi
        else
            # the disk is not usable, go on with the next one
            _status=1
        fi
    done

    return ${_status}
}

#########################################################
#########################################################
# start is here :)

# The tools used below (gmirror, camcontrol, atacontrol, ...) are FreeBSD ones:
# on any other system say so and stop (with status 0).
case `uname` in
    FreeBSD)
        ;;
    *)
        echo "Error: this script is for FreeBSD only!"
        exit 0
        ;;
esac

# checking cmdline

# -h / --help as the first argument: print the usage text and stop
if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
    cat <<EOF
Usage: ./$(basename $0) [-h|--help] | [ACTION|--] [OPTIONS]

ACTIONS
    ins
        Find and insert unused disk into mirrors.

    stop diskname
        Stop (remove) specified disk and send report.
        'diskname' may be one of [ad|da|ada][0-9]
        or 'missing' (auto-replaced by missing disk name)

OPTIONS
    -c "comment line" | --comment="comment line"
        Specify a comment line to send with email or bot request.
        Use UTF-8 encoding for non-english symbols.
        Works only with 'stop' action, otherwise ignored.

    --debug
        Enable debug mode: don't touch system, don't call bot, send emails to
        'email_debug_to', print debug info.

    -f
        Enable 'force remove' mode. Ticket for disk replace will be created regardless
        of removal errors. Works only with 'stop' action, otherwise ignored.
        WARNING !!! NEVER use this on servers with valuable data!
        Running force mode will probably result in data loss and/or system crash!

    -nN
        Override disk number, set probable value by hand.
        E.g. '-n2' means that '2' will be used in bot request as disk number,
        regardless of number in diskname. See also 'missing' keyword.

If running without arguments, it will check current config and automatically
remove bad disk if found, and generate ticket to replace.

EOF
    exit 0
fi

# Everything below needs root: when started by another user, re-run the script
# with the same arguments through sudo and hand its exit status back.
[ "$USER" = "root" ] || {
    echo "current user is not root, running sudo..."
    sudo $0 "$@"
    exit $?
}

# First argument = action. "stop" and "ins" switch to manual mode (AUTO=0);
# no argument or "--" keeps the automatic mode. The action word (and the disk name
# of "stop") is shifted away, so that only options are left for the parser below.
case "$1" in
    stop)
        # remove and report bad disk by hand
        CMDLINE_ACTION="stop"
        AUTO=0
        # the disk name must be a known disk type with a one-digit number, or 'missing'
        case $2 in
            ad[0-9]|da[0-9]|ada[0-9]|missing)
                ;;
            *)
                echo "Invalid diskname - $2. For general help, run with '-h'"
                exit 1
                ;;
        esac
        CMDLINE_DISKNAME=$2
        # drops the action word; the disk name is dropped by the common shift below
        shift
        ;;
    ins)
        # find and insert unused disks by hand
        CMDLINE_ACTION="autoinsert"
        AUTO=0
        ;;
    ""|--)
        ;;
    *)
        echo "Invalid action - $1. For general help, run with '-h'"
        exit 1
        ;;
esac
shift

# searching for additional options in cmdline
# Option parser. Runs until the first empty argument.
# -c / --comment / -n / -f are honoured in manual mode (AUTO=0) only, -f also needs the
# 'stop' action; --debug always works. An unknown option ends the script with status 1.
while [ "$1" != "" ]; do
    case $1 in
        --comment=*)
            if [ "$AUTO" = "0" ]; then
                # strip the prefix inside the shell: passing the text through an
                # unquoted 'echo' would collapse whitespace and expand wildcards
                CMDLINE_COMMENT="${1#--comment=}"
            fi
            ;;
        -c)
            # the comment is the next argument
            shift
            if [ "$AUTO" = "0" ]; then
                CMDLINE_COMMENT="$1"
            fi
            ;;
        -n?)
            if [ "$AUTO" = "0" ]; then
                # the pattern guarantees exactly one character after "-n"; it must be a digit
                CMDLINE_DISKN="${1#-n}"
                case "$CMDLINE_DISKN" in
                    [0-9])
                        ;;
                    *)
                        echo "Invalid disk number. For general help, run with '-h'"
                        exit 1
                        ;;
                esac
            fi
            ;;
        --debug)
            DEBUG=1
            # all mail goes to the debug address
            email_to=$debug_email_to
            email_cc=$debug_email_to
            echo "Enable debug mode"
            ;;
        -f)
            if [ "$AUTO" = "0" ] && [ "$CMDLINE_ACTION" = "stop" ]; then
                echo "WARNING: force mode enabled!"
                FORCE=1
            fi
            ;;
        *)
            echo "Invalid option - $1. For general help, run with '-h'"
            exit 1
            ;;
    esac
    shift
done

# nothing to do when running inside a jail
if [ $(sysctl -n security.jail.jailed) -eq 1 ]; then
    exit 0
fi

# FILE_OFF check - only in auto mode
if [ "$AUTO" = "1" ]; then
    # an "off" file whose change time is past the 2d limit is removed first;
    # if the file is still there afterwards, the automatics stays switched off
    find $(dirname $FILE_OFF) -type f -name $(basename $FILE_OFF) -ctime +2d -delete
    [ -f $FILE_OFF ] && exit 0
fi

# check if there is another copy running - preventing flood if something stalled
# (only in auto mode, and not in debug mode)
# !! it is critical that none of 'gmirror/atacontrol/camcontrol/...' command has been executed before
if [ "$CHECKMIRROR_CHILD" != "1" ] && [ "$AUTO" = "1" ] && [ "$DEBUG" = "0" ]; then
    # this process is the parent: it is not running under lockf yet

    # fork bomb protection: give up when 10 or more processes already match the script name
    if [ $(ps ax | grep $0 | grep -v grep | wc -l) -ge 10 ]; then
        exit 1
    fi

    # mark the environment, run a child copy of the script holding FILE_LCK
    # and finish with the status reported by lockf
    CHECKMIRROR_CHILD="1"
    export CHECKMIRROR_CHILD
    lockf -st 0 $FILE_LCK $0
    exit $?
fi

# detecting rather rarely situation, which must be handled by human only
#   mirror/db1       N/A  N/A
# a mirror with " N/A" in its short status is left to a human:
# report it (grep prints the matching lines) and stop
if gmirror status -s | grep " N/A" ; then
    email_general_raid_error
    exit 1
fi

# auto mode has nothing to do when 'lookup_degraded' finds no
# "degraded and not rebuilding" mirror (it returns 0 in that case)
# todo: find replaced disks and run 'autoinsert' if found
if [ "$AUTO" = "1" ]; then
    if lookup_degraded >/dev/null ; then
        exit 0
    fi
fi

# now, we are here only if there are degraded mirror(s), OR in manual mode
# 'raid_autoinsert' will not run if there are degraded mirror(s)

###
# initialize vars
# boot time in seconds: 5th word of the 'sysctl kern.boottime' line, without the comma
BOOTTIME=$(sysctl kern.boottime | awk '{print $5}' | tr -d ',')
# current time, seconds since the epoch
DATE=$(date "+%s")
UPTIME=$(expr $DATE - $BOOTTIME)
# disk name base: component name ($3) of the first status line, up to its first digit
DISKTYPE=$(gmirror status -s | awk '{print substr($3,1,match($3,"[0-9]")-1) ; exit 0}')
# upper-cased $ya_DC, evaluated in a subshell from rc.conf.local and ya_network_info
DC=$(. /etc/rc.conf.local; eval $(ya_network_info $ipaddr); echo $ya_DC | tr a-z A-Z)

# Collecting disk inventory.
# The HDD list of this host is fetched from bot and compared with the disks that answer
# locally. Results:
#   HDD_COUNT - count of disks (from bot; 4 if the bot count is out of range; guessed
#               from the mirror config if bot didn't answer)
#   HDD_DATA  - one line "DISK MODEL SERIAL" per disk that answered, plus (if exactly one
#               disk is absent and bot knows exactly one more disk) its line with flag "0"
#   BOT       - 1 only if bot info matches the real disks, so HDD_DATA can be trusted
# Progress messages are printed in manual mode or on a terminal only.
if [ "$AUTO" = "0" ] || [ -t 0 ]; then echo -n "Fetching info from bot... " ; fi
# fetch all HDD info from bot, and check it. Replacing <br> with newline
HDD_DATA_BOT="`fetch -q -o - "http://bot.yandex-team.ru/api/consistof.php?name=${h}" 2>/dev/null | sed 's/<br>/\
/g' | grep "HDD"`"
# check if bot returns any info (looking for grep exit code)
if [ "$?" -eq 0 ]; then
    if [ "$AUTO" = "0" ] || [ -t 0 ]; then echo "ok" ; fi
    # check if this info seems to be real
    # (2..10 disks are accepted; otherwise awk prints 4 and exits with 1)
    HDD_COUNT=`echo "$HDD_DATA_BOT" | grep -cw HDD | awk '{if ( $1<2 || $1>10 ) {print 4 ; exit 1; } else {print $1} }'`
    if [ "$?" -eq 0 ]; then
        # presuming bot info is correct
        BOT_CORRECT=1
        # check if all existing disks match bot info
        for num in `jot $HDD_COUNT 0` ; do
            # get info for disk
            # BOT var must be =0 here: hddinfo must look for real disk info
            hddinfo ${DISKTYPE}${num}
            if [ "$?" -eq 0 ]; then
                # every new line looks like "DISK MODEL SERIAL"
                HDD_DATA="${HDD_DATA}${br}${DISKTYPE}${num} ${MODEL} ${SERIAL}"
                # if disk is up, and bot doesnt know s/n, then we cant trust its info more
                # (but, at least, we can use HDD_COUNT)
                if ! echo "$HDD_DATA_BOT" | grep -q $SERIAL ; then
                    BOT_CORRECT=0
                fi
            fi
        done

        if [ "$BOT_CORRECT" = "1" ]; then
            # applying filter: all fields (disktype, model, serial) must be present
            # also cutting empty lines
            HDD_DATA=`echo "$HDD_DATA" | awk '(NF==3)'`

            # count of disks actually present on system
            # NOTE(review): with an empty HDD_DATA echo still prints one (empty) line, so N is 1, not 0
            N=`echo "$HDD_DATA" | grep -c ""`
            # difference between HDD_COUNT and N
            case `expr $HDD_COUNT - $N` in
                0)
                    # no difference - nothing to do
                    ;;
                1)
                    # one disk is absent, try to get info...

                    # get number of absent disk
                    DISKN=`echo "$HDD_DATA" | awk '{print $1}' | get_missing_number`
                    # number of missing elements must be 1
                    # (get_missing_number exits with the count of missing numbers)
                    if [ "$?" -eq 1 ]; then
                        # extracting serial from bot data (excluding known disk data)
                        for num in `jot $HDD_COUNT 0 | grep -v $DISKN` ; do
                            # excluding known data
                            SERIAL=`echo "$HDD_DATA" | grep ${DISKTYPE}${num} | awk '{print $3}'`
                            HDD_DATA_BOT=`echo "$HDD_DATA_BOT" | grep -v $SERIAL`
                        done

                        # check if we have only one disk remaining
                        if [ "`echo "$HDD_DATA_BOT" | grep -c ""`" -eq 1 ]; then
                            # get its model/serial
                            # parsing line looks like:
                            # 860467 HDD WD - WD5002ABYS (500Gb/SATA/3,5/7,2K) S/N:WCASYE025888
                            # model  = last word before the first "("
                            # serial = first word after the first ":"
                            MODEL=`echo  "$HDD_DATA_BOT" | cut -f1 -d'(' | awk '{print $(NF)}'`
                            SERIAL=`echo "$HDD_DATA_BOT" | cut -f2 -d':' | awk '{print $1}'`
                            # finalize disk info
                            hddinfo_fin
                            # check for model/serial
                            if [ "$MODEL" != "" ] && [ "$SERIAL" != "" ]; then
                                # merge info with already collected data
                                # last "0" ($4) indicates that data was not fetched from origin
                                HDD_DATA="${HDD_DATA}${br}${DISKTYPE}${DISKN} ${MODEL} ${SERIAL} 0"
                            fi
                        else
                            # more than one line remaining, cant say which to take
                            BOT_CORRECT=0
                        fi
                    else
                        # too many missing disks
                        BOT_CORRECT=0
                    fi
                    ;;
                *)
                    # other situation (info is not correct, or >1 disks are absent)
                    # setting flag
                    BOT_CORRECT=0
                    ;;
            esac
        fi
        # finally set var to flag state. If =1, then there is HDD_DATA var with actual data.
        BOT=$BOT_CORRECT

        if [ "$BOT" = "0" ] ; then
            if [ "$AUTO" = "0" ] || [ -t 0 ]; then echo "Warning: bot info is missing or not actual!"; fi
        fi
    fi
else
    if [ "$AUTO" = "0" ] || [ -t 0 ]; then echo "error"; fi
    # request to bot has failed, try to guess by current config
    # (highest disk number seen in the mirrors + 1, rounded up to an even count)
    # todo: optimize
    HDD_COUNT=`gmirror status -s | awk '{print $3}' | sed 's/[^0-9]//g' | sort | tail -n1 | awk '{ n=$1+1 ; print n+(n%2) }'`
fi

# Debug dump of the collected state (only when DEBUG=1).
# All fixed "label = value" rows are emitted by a single printf call, one
# argument per output line; HDD_DATA may span several lines and is echoed
# as-is between its header and the closing "--" marker.
if [ "$DEBUG" = "1" ]; then
    printf '%s\n' \
        "AUTO     = $AUTO" \
        "BOT      = $BOT" \
        "HDD_COUNT= $HDD_COUNT" \
        "ACTION   = $CMDLINE_ACTION" \
        "DISKN    = $CMDLINE_DISKN" \
        "DISKNAME = $CMDLINE_DISKNAME" \
        "COMMENT  = $CMDLINE_COMMENT" \
        "HDD_DATA:"
    echo "$HDD_DATA"
    printf '%s\n' "--"
fi

# Info collecting is complete; handle manual (command line) mode.
# Each action recognised here terminates the script on its own; any other
# action falls through to the automatic checks that follow.
if [ "$AUTO" = "0" ]; then
    case "$CMDLINE_ACTION" in
        autoinsert)
            # don't exec if there are degraded mirror(s):
            # a non-zero status from lookup_degraded aborts the action
            if ! DEGRADED="$(lookup_degraded)"; then
                echo "Error, degraded mirror(s) detected: " $DEGRADED
                echo "First solve this."
                exit 1
            fi
            if raid_autoinsert ; then
                exit 0
            fi
            echo "Autoinsert error occured, check output above"
            exit 1
            ;;
        stop)
            if [ "$CMDLINE_DISKNAME" = "missing" ] ; then
                # replace 'missing' keyword by drive name;
                # the exit status of get_missing_number decides what happens next
                DISKN="$(gmirror status -s | awk '{print $3}' | get_missing_number)"
                missing_rc=$?
                if [ "$missing_rc" -eq 0 ]; then
                    echo "Error: no missing drive detected, all drives are present. Please specify diskname."
                    exit 1
                elif [ "$missing_rc" -ne 1 ]; then
                    echo "Error: more than 1 missing drives detected, or something's wrong."
                    exit 1
                fi
                # status 1: all ok, DISKN holds the number of the missing drive
                CMDLINE_DISKNAME="${DISKTYPE}${DISKN}"
                echo "Drive is missing: $CMDLINE_DISKNAME"
            fi
            # disk_remove takes the disk name from BADDISK; its status is not checked here
            BADDISK=$CMDLINE_DISKNAME
            disk_remove
            exit 0
            ;;
    esac
fi

#######
# starting checks

# Boot grace period: uptime <= 1200 seconds (20 minutes).
# gmirror failed at boot, all disks may be alive - so nothing is removed:
# email_general_raid_error and gmirror_forget are called and the script
# exits with status 0.
if [ -z "$UPTIME" ]; then
    # Uptime is unknown. The numeric comparison below would be malformed
    # with an empty operand, so skip the grace-period shortcut explicitly
    # and go on with the regular checks.
    [ "$DEBUG" = "1" ] && echo "UPTIME is not set, skipping uptime check"
elif [ "$UPTIME" -le 1200 ]; then
    email_general_raid_error
    gmirror_forget
    exit 0
fi

# check if there are missing disk(s) in raid config
# a missing disk might have suddenly gone offline and may still be alive, so it needs a check
# The exit status of get_missing_number selects the scenario:
#   0 - nothing is missing, 1 - exactly one disk (its number is in DISKN), anything else - error.
DISKN="$(gmirror status -s | awk '{print $3}' | get_missing_number)"
missing_rc=$?
if [ "$missing_rc" -eq 1 ]; then
    # one disk is missing - if it exists (or is available after reinit) and has smart errors - remove it
    DISK="${DISKTYPE}${DISKN}"
    echo "Missing drive detected: $DISK"
    SMART_STRICT_CHECK="1"

    gmirror_forget
    if ! disk_reinit $DISK ; then
        # reinit checks drive smart status!
        BADDISK=$DISK
        if disk_remove ; then
            exit 0
        fi
        # drive is not ok, and disk_remove has failed - send general error message
        email_general_raid_error $DISK
        exit 1
    fi

    # reinit succeeded, so it seems that smart status is ok
    # probably disk has been just replaced, or has gone offline and back online...

    gmirror_forget # one more time after reinit

    echo "Trying to insert..."
    if ! raid_autoinsert ; then
        echo "Autoinsert failed, removing..."
        BADDISK=$DISK
        disk_remove
    else
        echo "All ok"
    fi

    gmirror_forget # one more time after kill_foreign_mirrors
    exit 0
elif [ "$missing_rc" -ne 0 ]; then
    # 2 or more missing disks (or any other status), report general error
    echo "Too many missing drives"
    email_general_raid_error
    gmirror_forget
    exit 1
fi
# status 0: no results - nothing to do, just continue

# looking for dmesg errors, as more common criteria...
# dont forget that disk has failed during current boot! see above
#
# dmesg is read once; the LAST GEOM_MIRROR line carrying error=5 or error=6
# is used. Such a line looks like:
#   GEOM_MIRROR: Request failed (error=5). ada1f[READ(offset=446397743104, length=65536)]
# The provider is the last word before '['; the disk name is everything up
# to the end of its first run of digits, so multi-digit disk numbers are
# kept whole ("ada1f" -> "ada1", "ada10p2" -> "ada10").
# BADDISK is only touched when such a line exists.
DMESG_ERRLINE=$(dmesg | grep -E "GEOM_MIRROR.*error=[56]" | tail -n1)
if [ -n "$DMESG_ERRLINE" ]; then
    # getting expected disk name:
    BADDISK=$(printf '%s\n' "$DMESG_ERRLINE" | cut -f1 -d'[' | awk '{d=$(NF); if (match(d, /[0-9]+/)) print substr(d, 1, RSTART+RLENGTH-1)}')
    # check that we found exactly a disk name (DISKTYPE followed by digits
    # only), not anything else, just in case
    dmesg_diskn=${BADDISK#"$DISKTYPE"}
    case "$dmesg_diskn" in
        ''|*[!0-9]*)
            # nothing left after the prefix, or the remainder is not a number
            dmesg_diskn=""
            ;;
    esac
    if [ -n "$DISKTYPE" ] && [ -n "$dmesg_diskn" ] && [ "$BADDISK" = "${DISKTYPE}${dmesg_diskn}" ]; then
        [ "$DEBUG" = "1" ] && echo "Found baddisk $BADDISK in dmesg, trying to remove"
        disk_remove && exit 0
    fi
fi
# if we are here, it looks like simple check does not work, so lets go deeper
[ "$DEBUG" = "1" ] && echo "dmesg check failed"

# looking for every degraded partition...
# For each name printed by lookup_degraded, choose the mirrors whose members
# are fed to get_missing_number; when it reports exactly one missing disk
# (exit status 1), try to remove that disk and stop on success.
for part in $(lookup_degraded); do
    BADDISK=""
    # check for mirror type by its name (two or more characters, last one a digit):
    case "$part" in
        ?*[0-9])
            # mirror must be part of raid10
            # 1. get stripe name for mirror
            STRN=$(gstripe status -s | egrep -x "..* UP *mirror/${part}" | awk '{print $1 ; exit }' | awk -F/ '{print $2}')
            # continue only if gstripe knows this mirror (unknown mirror may belong
            # to disk from another server, and must be skipped)
            [ -n "$STRN" ] || continue
            # 2. get mirror names of gstripe members
            MIRRORS=$(gstripe status -s $STRN | awk '{print $3}' | awk -F/ '{print $2}')
            # 3. all of these mirrors are inspected together
            check_mirrors=$MIRRORS
            ;;
        *)
            # system mirror (must be present on every disk): inspect the mirror itself
            check_mirrors=$part
            ;;
    esac

    # get mirror members (disk partitions) in one pipe, and pass them to special function
    DISKN=$(gmirror status -s $check_mirrors | awk '{print $3}' | get_missing_number)
    if [ "$?" -eq 1 ]; then
        BADDISK="${DISKTYPE}${DISKN}"
        if disk_remove ; then
            exit 0
        fi
    fi
done

# If we are here, the deep check has failed as well: neither the dmesg
# check nor the degraded-partition scan above ended with a successful
# disk_remove (each of those paths exits with status 0 on success).
# So just report a general problem and exit with an error status.

email_general_raid_error
gmirror_forget
exit 1
