#!/bin/sh
#
# Script for finding processes which use too much memory on robot
# cluster.
#
# Usage: <script> [collect|aggregate]
#
# $Header$

# Enable errexit here rather than on the shebang line, so that it is also
# in effect when the script is started as "sh <script>" and the shebang
# options are not applied.
set -e

# Fixed search path for every external tool used below.
PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:/Berkanavt/bin
export PATH

#-- Subroutines --------------------------------------------------------

# err status message...
#
# Print "ERROR: <message>" on stderr and terminate the script with the
# given exit status.  All arguments after the first form the message.
err()
{
	local _status

	_status=$1
	shift

	echo "ERROR: $*" 1>&2
	exit $_status
}

# get_swapuse varname
#
# Store the swap usage percentage (an integer, "%" stripped) in the
# variable named by $1.  The value is the last field of the last
# swapinfo line that does not start with "Dev".
# Terminates the script through err if swapinfo fails or no value can be
# produced.
get_swapuse()
{
	local _var _val _out
	_var="$1"
	# Capture the output first: in a "swapinfo | awk" pipeline the shell
	# only sees the status of awk, which always prints a number, so a
	# failing swapinfo would be reported as 0% swap usage.
	_out=$(swapinfo) || err 1 "Can't get swapuse !"
	_val=$(printf '%s\n' "$_out" | awk '
		! /^Dev/ { s = $NF }
		END{ sub(/%/,"",s); printf "%d\n", s }
		') || _val=""
	if [ -z "${_val}" ]; then
		err 1 "Can't get swapuse !"
	fi
	# Assign to the caller-supplied variable name.
	eval $_var=\"\$_val\"
}

# check_depends
#
# Stop the whole script quietly when the current swap usage is below
# ${swapuse_threshold}; otherwise return to the caller.
check_depends()
{
	local _pct

	get_swapuse _pct
	# At or above the threshold: carry on with the caller.
	[ $_pct -lt ${swapuse_threshold} ] || return 0
	exit 0
}

mk_tmp_file()
{
	local _var _f
	_var="$1"
	_f=$(mktemp -t $thiscmd) || err 1 "Can't make temporary file !"
	tmp_files="${tmp_files}${tmp_files:+ }$_f"
	eval $_var=\"\$_f\"
}

# get_stat
#
# Sample the processes of user "webbase" up to ${repeat_times} times,
# sleeping ${repeat_interval} between samples, and stop early once swap
# usage drops below ${swapuse_threshold}.  Only ps lines whose second
# column (rss) compares greater than ${rss_threshold} are kept.  For
# every PID the line with the largest rss is written to ${stat_file},
# preceded by the ps header line, and followed by a
# "PERIOD: [start] [finish]" line.
get_stat()
{
	local _n _swap _t0 _t1

	_t0=$(date "${time_fmt}")
	for _n in $(jot $repeat_times); do
		# NOTE: the status tested here is that of the awk filter (the
		# last command of the pipeline), not that of ps itself.
		ps -o pid -o rss -o vsz -o stat -o start -o %cpu \
			-o time -o command -wwU webbase | \
			awk -v thr=$rss_threshold '$2 > thr' || \
			err 1 "Can't run ps !"
		sleep $repeat_interval
		get_swapuse _swap
		if [ $_swap -lt ${swapuse_threshold} ]; then
			break
		fi
	done | \
	awk '
		# Remember the most recent ps header line.
		$1 == "PID" { header = $0 }
		# Process lines: keep the sample with the largest rss per PID.
		$1 ~ /^[0-9]+$/ {
			if (!peak[$1] || $2 > peak[$1]) {
				peak[$1] = $2
				line[$1] = $0
			}
		}
		END {
			for (p in line) {
				if (!header_done) {
					print header
					header_done = 1
				}
				print line[p]
			}
		}
		' > $stat_file || \
			err 1 "Can't save statistics to ${stat_file} !"

	_t1=$(date "${time_fmt}")
	echo "PERIOD: [${_t0}] [${_t1}]" >> $stat_file || \
		err 1 "Can't save period to ${stat_file} !"
}

send_stat()
{
	if ! $send_cmd $send_opts $stat_file $send_url; then
		err 1 "Can't save statistics to ${send_url} !"
	fi
}

# collect
#
# "collect" mode: run the collection steps in order.
collect()
{
	# Exits the script when swap usage is below the threshold.
	check_depends
	# Creates the temporary file and stores its path in ${stat_file}.
	mk_tmp_file stat_file
	# Fills ${stat_file} with the sampled ps lines.
	get_stat
	# Transfers ${stat_file} to ${send_url}.
	send_stat
}

# set_f_vars
#
# Change into ${recv_dir} and group the files matching *${stat_sfx} by
# the part of their name before the first "_" (the host key).
# Sets:
#   host_list      space-separated list of host keys (empty when there
#                  is nothing to aggregate)
#   files_<host>   space-separated list of the files of that host
# Terminates the script through err if ${recv_dir} cannot be entered.
set_f_vars()
{
	local _f

	if ! cd "$recv_dir"; then
		err 1 "Can't change dir to ${recv_dir} !"
	fi

	# NOTE(security): the awk output is passed to eval and the file names
	# are interpolated into it, so only names made of characters that are
	# safe inside a variable name and a double-quoted string are accepted.
	# Anything else is skipped with a warning on stderr.
	# The glob replaces parsing of ls output; an unmatched glob stays
	# literal and is dropped by the existence test.
	eval "$(
		for _f in *"${stat_sfx}"; do
			[ -e "$_f" ] || continue
			printf '%s\n' "$_f"
		done | LC_ALL=C awk -F_ '
		function append(_list, _el) {
			return (_list == "") ? _el : _list " " _el
		}
		$0 !~ /^[A-Za-z0-9]+_[A-Za-z0-9_.-]+$/ {
			printf "WARNING: skipping unsafe file name: %s\n", $0 > "/dev/stderr"
			next
		}
		{
			files[$1] = append(files[$1], $0)
		}
		END{
			for (h in files) {
				h_list = append(h_list, h)
				printf "files_%s=\"%s\";\n", h, files[h]
			}
			printf "host_list=\"%s\";\n", h_list
		}
		'
	)"
}

# sort_keys key...
#
# Print the given keys one per line in sorted order.
# Calls err if the pipeline reports failure.
sort_keys()
{
	echo $* | tr ' ' '\n' | sort || err 1 "Can't sort keys !"
}

# aggregate_by_hosts
#
# For every host key in ${host_list} (in sorted order) build
# ${aggr_dir}/<host>${aggr_sfx} from the files listed in files_<host>:
# a "HOST: <host>" line, the ps header line, then one line per
# (PID, start) pair - the one with the largest rss - sorted by rss in
# descending order.  The processed input files are moved to
# ${recv_bak_dir} and the result file is appended to ${aggr_f_list}.
# The input file names are used as given, relative to the current
# directory.
aggregate_by_hosts()
{
	local _host _inputs _out

	for _host in $(sort_keys $host_list); do
		# Fetch the file list stored in the per-host variable.
		eval _inputs=\"\${files_${_host}}\"
		_out="${aggr_dir}/${_host}${aggr_sfx}"

		echo "HOST: $_host" > $_out || \
			err 1 "Can't write to ${_out} !"

		# awk appends the header to the result file itself so that it
		# does not pass through sort; only the data lines are piped.
		awk -v _aggr_file=$_out '
			$1 == "PID" { header = $0 }
			$1 ~ /^[0-9]+$/ {
				id = sprintf("%d_%s", $1, $5)
				if (!peak[id] || $2 > peak[id]) {
					peak[id] = $2
					line[id] = $0
				}
			}
			END {
				for (id in line) {
					if (!header_done) {
						print header >> _aggr_file
						header_done = 1
					}
					print line[id]
				}
			}
			' $_inputs | sort -n -k 2 -r >> $_out || \
			err 1 "Can't aggregate files for ${_host} !"

		mv $_inputs $recv_bak_dir || \
			err 1 "Can't move processed files to ${recv_bak_dir} !"

		if [ -n "${aggr_f_list}" ]; then
			aggr_f_list="${aggr_f_list} ${_out}"
		else
			aggr_f_list="${_out}"
		fi
	done
}

represent()
{
	if [ -z "${aggr_f_list}" ]; then
		return
	fi
	if ! cat $aggr_f_list; then
		err 1 "Can't print data from $_aggr_f_list"
	fi
	if ! mv $aggr_f_list $aggr_bak_dir; then
		err 1 "Can't move processed files to ${aggr_bak_dir} !"
	fi
}

# aggregate
#
# "aggregate" mode: run the aggregation steps in order.
aggregate()
{
	# Changes into ${recv_dir} and sets host_list / files_<host>.
	set_f_vars
	# Builds one aggregated file per host and fills ${aggr_f_list}.
	aggregate_by_hosts
	# Prints the aggregated files and moves them to ${aggr_bak_dir}.
	represent
}

# cleanup
#
# Remove every temporary file recorded in ${tmp_files}.  Installed as a
# trap handler; failures of rm are ignored and the status is always 0.
cleanup()
{
	# Nothing was created: nothing to remove.
	[ -n "${tmp_files}" ] || return 0

	# ${tmp_files} is a space-separated list and must be word-split.
	rm -f $tmp_files 2>/dev/null || true
}


#-- Variables ----------------------------------------------------------

# Script name; used as the prefix of temporary files (mk_tmp_file).
thiscmd=$(basename $0)
# Short host name; becomes the first part of the uploaded file name.
hostname_s=$(hostname -s)

# Swap usage, in percent, below which check_depends exits and get_stat
# stops sampling.
swapuse_threshold=50			# %
# get_stat keeps only ps lines whose rss column exceeds this value.
rss_threshold=$((500 * 1024))		# KB

# get_stat takes up to ${repeat_times} samples and sleeps
# ${repeat_interval} (passed to sleep as is) after each one.
repeat_times=10
repeat_interval=30

# Seconds since the epoch at start-up; part of the uploaded file name.
epoch_time=$(date +%s)
# date(1) format of the timestamps in the "PERIOD:" line.
time_fmt="+%F %T"

# File name suffixes of the per-host statistics and aggregated files.
stat_sfx="_stat"
aggr_sfx="_aggr"

# Transfer command, its options (word-split when used) and the
# destination of the statistics file: <host>_<epoch><stat_sfx>.
send_cmd="rsync"
send_opts="--password-file=/usr/local/etc/rsync/secret.webbase \
	--contimeout=30 --timeout=60 -qt"
send_url="rsync://webbase@ya.fb.yandex.ru/berkanavt_rw/swap/recv/${hostname_s}_${epoch_time}${stat_sfx}"

# Directories used by "aggregate": incoming statistics files, processed
# statistics files, aggregated files, printed aggregated files.
recv_dir="/Berkanavt/swap/recv"
recv_bak_dir="/Berkanavt/swap/recv_bak"
aggr_dir="/Berkanavt/swap/aggr"
aggr_bak_dir="/Berkanavt/swap/aggr_bak"


#-- Main ---------------------------------------------------------------

# Remove temporary files on every exit path.  A trapped signal handler
# that merely returns lets the script carry on after its temporary
# files are gone, so INT and QUIT now terminate the script; the EXIT
# trap then performs the cleanup exactly once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 131' QUIT

# Select the mode.  "collect" is the default when no argument is given;
# the function name is spelled out because an empty $1 expands to no
# command at all.
case "$1" in
	collect|"") collect ;;
	aggregate) aggregate ;;
	*) err 1 "Usage: ${thiscmd} [collect|aggregate]" ;;
esac

