#!/bin/bash -E
# Settings and input validation shared by the geodata build/check functions
# defined further down in this file.
#
# Inputs are read from the environment: RESULT_FILE, CURR_DIR_PATH,
# RAM_DRIVE_PATH and RDBMS_KIND (see the assignments below).
#
# Shell options: -a exports every variable assigned from here on, -e aborts on
# an unhandled command failure, pipefail makes a pipeline fail when any of its
# stages fails.
set -aeo pipefail # x

# Directory that holds this script (symlinks resolved by readlink -f).
script_dir="$(dirname $(readlink -f $0))"

# Helper scripts (the *.py checkers/testers) are looked up next to this script.
bin_path=${script_dir}
result_filename="${RESULT_FILE}" # which filename wanted in caller
curr_dir="${CURR_DIR_PATH}"      # where result should be after build
ramdrive_dir="${RAM_DRIVE_PATH}" # ability to speedup build-step for BIG data
db_kind="${RDBMS_KIND}"

# A missing result filename is tolerated: a placeholder name is used instead.
if [ -z "${result_filename}" ]; then
    echo ">>> NB: result filename not specified"
    result_filename="_fake_result_"
fi

# curr_dir and db_kind are mandatory: abort when either one is empty.
if [ -z "${curr_dir}" ]; then
    echo ">>> curr-dir not specified"
    exit 1
fi

if [ -z "${db_kind}" ]; then
    echo ">>> db_kind not specified"
    exit 1
fi

# Working directory for all data files: the RAM drive when one is given,
# otherwise curr_dir.
data_path="${curr_dir}"
if [ -n "${ramdrive_dir}" ]; then
    data_path="${ramdrive_dir}"
fi
result_fname_path="${data_path}/${result_filename}"

# Input data files, all located under data_path.
ipreg_file=IPREG.json
ipreg_datafile="${data_path}/${ipreg_file}"
assets_datafile="${data_path}/assets.txt"
tor_datafile="${data_path}/tor.txt"
isp_datafile="${data_path}/isp.txt"
borders_file="reverse_borders_world.bin"
borders_datafile="${data_path}/${borders_file}"

# Files with connection settings / credentials; db.traits is sourced by the
# generate_geodata*_bin functions, s3mds.credentials is handed to the AWS CLI.
yt_traits_datafile="${data_path}/yt.traits"
db_traits_datafile="${data_path}/db.traits"
s3mds_traits_datafile="${data_path}/s3mds.credentials"

# Generator settings: geodata_check_flag is passed to the generator as-is
# (empty by default); geodata_format is passed as --format with dots removed.
geodata_check_flag=
geodata_format="0.0.0"
geobin_datafile="${data_path}/geodata.bin"

# External tools: the generator is addressed by absolute path, the checker and
# the util by bare command name.
geogen_tool="/opt/yandex/geobase/geobase6-generator"
geocheck_tool="geobase5-checker"
geoutils_tool="geobase5-util"

# Optional tool: check_city_id skips its check when this file is absent.
city_id_checker="/opt/yandex/geobase/geobase6-city-id-checker"

# Verify that a data file exists, is non-empty and (optionally) long enough.
# Arguments:
#   $1 - path of the file to check; an empty path is silently accepted
#   $2 - optional minimum number of lines (default 0 = no line-count check)
# Outputs: "'<path>' - OK" on success, a ">>> BUG: ..." line on failure
# Exits:   terminates the shell with status 1 when a check fails
check_file() {
    local fname=${1:-}
    # Optional resources may be passed as an empty path: nothing to check then.
    if [ -z "${fname}" ]; then
        return 0
    fi

    # -s is false for a missing file and for a zero-length one.
    # The path is quoted so that names with spaces are tested as one operand
    # (unquoted, `[` errors out and the missing file would be reported OK).
    if [ ! -s "${fname}" ]; then
        echo ">>> BUG: '$fname' does not exists"
        exit 1
    fi

    local rows_min_limit=${2:-0}
    if [ "${rows_min_limit}" -ne 0 ]; then
        local rows
        rows=$(wc -l < "${fname}")
        if [ "${rows}" -lt "${rows_min_limit}" ]; then
            echo ">>> BUG: '$fname' rows limit violation; ${rows_min_limit} required; but: ${rows}"
            exit 1
        fi
    fi

    echo "'$fname' - OK"
}

# --- remote data sources ------------------------------------------------------
# Sandbox resource with the borders section.
borders_datafile_src_url="https://proxy.sandbox.yandex-team.ru/464203585/${borders_file}" # geocoder#18.01.19-3
tor_exitnodes_src_url="https://check.torproject.org/exit-addresses"

# Query string appended to every IPREG_EXPORT resource URL.
ipreg_sbr_traits="?owner=IPREG&attrs=%7B%22released%22:%22stable%22%7D"
ipreg_data_base_url="https://proxy.sandbox.yandex-team.ru/last/IPREG_EXPORT"

ipreg_src="${ipreg_data_base_url}/IPv6/${ipreg_file}${ipreg_sbr_traits}"
asdata_src="${ipreg_data_base_url}/IPv6/ip_origin.gz${ipreg_sbr_traits}"
ipreg_last_traits_src="${ipreg_data_base_url}/content.list${ipreg_sbr_traits}"

# Download command shared by the fetch helpers; it is kept in a plain string
# and expanded unquoted at the call sites.
curl_cmd="curl --verbose --insecure --silent --location"
curl_cmd="${curl_cmd} --retry 3 --retry-delay 60 --connect-timeout 60"
curl_cmd="${curl_cmd} --show-error --fail"

# --- lists used by the extra-column checks -----------------------------------
iso_alpha3_column_name="iso_alpha3"
iso_alpha3_fname="${data_path}/iso-alpha3.list"
eu_regions_fname="${data_path}/eu-regions.list"

# Stream the AS-origin dump to stdout: fetch ${asdata_src} and gunzip it.
# Globals: curl_cmd, asdata_src (read)
get_assets() {
    # curl_cmd is a command line stored in a string, hence the unquoted expansion.
    ${curl_cmd} "${asdata_src}" | zcat
}

# Fetch the IPREG dump into ${ipreg_datafile} unless a non-empty copy exists.
# Globals: curl_cmd, ipreg_src, ipreg_datafile (read)
# Outputs: function name, a progress line, `wc -l` of the resulting file
# Returns: non-zero when the download fails; the target file is not created
download_ipregs() {
    echo "${FUNCNAME[0]}"

    if [ ! -s "${ipreg_datafile}" ]; then
        echo "ipreg download [${ipreg_src}]..."
        # Download into a side file and rename on success only: a transfer that
        # dies half-way must not leave a non-empty target behind, because the
        # -s test above would then accept the truncated file on the next run.
        local partial="${ipreg_datafile}.tmp"
        # curl_cmd is a command line stored in a string: splitting is intended.
        if ! ${curl_cmd} "${ipreg_src}" > "${partial}"; then
            rm -f "${partial}"
            return 1
        fi
        mv -f "${partial}" "${ipreg_datafile}"
    fi
    wc -l "${ipreg_datafile}"
}

# Write a placeholder TOR list (one line, "0.0.0.0") into ${tor_datafile}.
# Globals: tor_datafile (read)
# Outputs: function name
prepare_faked_tor() {
    echo "${FUNCNAME[0]}"

    # this data is useless
    # The target is quoted: an unquoted redirect target containing spaces is
    # rejected by bash as an "ambiguous redirect".
    echo "0.0.0.0" > "${tor_datafile}"
}

# Fetch the TOR exit-node list and store the addresses in ${tor_datafile},
# one per line, unless a non-empty copy already exists.
# Globals: curl_cmd, tor_exitnodes_src_url, tor_datafile (read)
# Outputs: function name, a progress line, `wc -l` of the resulting file
# Side effects: the raw download is kept next to the result as
#               "${tor_datafile}.tmp"
# Returns: non-zero when the download fails
download_tor() {
    echo "${FUNCNAME[0]}"

    if [ ! -s "${tor_datafile}" ]; then
        echo "tor-nodes download [${tor_exitnodes_src_url}]..."
        local raw_list="${tor_datafile}.tmp"
        # curl_cmd is a command line stored in a string: splitting is intended.
        ${curl_cmd} "${tor_exitnodes_src_url}" > "${raw_list}" || return 1

        # grep -c exits with 1 when it counts zero matches; that is an expected
        # outcome here, so the status is neutralised explicitly.
        local nodes_qty
        nodes_qty=$(grep -c '^ExitAddress' "${raw_list}" || true)
        if [ "${nodes_qty:-0}" -eq 0 ]; then
            echo ">>> NB: no TOR-data <<<"
            prepare_faked_tor
            return
        fi

        # Second field of every "ExitAddress <ip> <date> <time>" line.
        awk '/^ExitAddress/ { print $2 }' "${raw_list}" > "${tor_datafile}"
    fi
    wc -l "${tor_datafile}"
}

# Print the placeholder ISP section: a single tab-separated record
# "<ip range>	1" whose range starts at "::" and ends at the all-ones address.
fake_isp_data() {
    local whole_range="::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"
    printf '%s\t%s\n' "${whole_range}" 1
}

# Generate the ISP data file from the built-in placeholder (despite the name,
# nothing is downloaded) and print its line count and md5 sum.
# Globals: isp_datafile (read)
download_isp() {
    echo "isp-fake-gen..."
    # Paths are quoted so a data directory with spaces keeps working.
    fake_isp_data > "${isp_datafile}"

    wc -l "${isp_datafile}"
    md5sum "${isp_datafile}"
}

# Fetch the borders section into ${borders_datafile} unless a non-empty copy
# already exists, then list the file.
# Globals: curl_cmd, borders_datafile_src_url, borders_datafile (read)
# Returns: non-zero when the download fails; the target file is not created
download_borders_section() {
    echo "${FUNCNAME[0]}"

    if [ ! -s "${borders_datafile}" ]; then
        echo "borders download [${borders_datafile_src_url}]..."
        # Download into a side file and rename on success only, so that an
        # interrupted transfer cannot pass the -s test above on the next run.
        local partial="${borders_datafile}.tmp"
        # curl_cmd is a command line stored in a string: splitting is intended.
        if ! ${curl_cmd} "${borders_datafile_src_url}" > "${partial}"; then
            rm -f "${partial}"
            return 1
        fi
        mv -f "${partial}" "${borders_datafile}"
    fi
    ls -Al "${borders_datafile}"
}

# Print the placeholder IPREG section: three tab-separated records
# "<ip range>	<json>", each carrying the same JSON payload.
fake_ipreg_data() {
    local payload='{"region_id":10000,"reliability":0}'
    local ip_range
    for ip_range in \
        '::-::fffe:ffff:ffff' \
        '::ffff:0:0-::ffff:ffff:ffff' \
        '::1:0:0:0-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff'; do
        printf '%s\t%s\n' "${ip_range}" "${payload}"
    done
}

# Fill the assets, IPREG and TOR data files with placeholder content.
# Globals: assets_datafile, ipreg_datafile (read)
# Outputs: function name (plus whatever prepare_faked_tor prints)
prepare_faked_resource() {
    echo "${FUNCNAME[0]}"

    # this data is useless
    # Redirect targets are quoted: unquoted, a path with spaces is rejected by
    # bash as an "ambiguous redirect".
    echo "0 0 AS0" > "${assets_datafile}"
    fake_ipreg_data > "${ipreg_datafile}"
    prepare_faked_tor
}

# Validate the mandatory inputs with check_file, each against its minimum
# line count: IPREG, assets, TOR list and the db.traits file.
# Globals: ipreg_datafile, assets_datafile, tor_datafile, db_traits_datafile
check_basic_resources() {
    echo "${FUNCNAME[0]}"

    # Paths are quoted so each one reaches check_file as a single argument.
    check_file "${ipreg_datafile}" 8500000
    check_file "${assets_datafile}" 220000
    check_file "${tor_datafile}" 1  # NB: 2022-05-05 empty output from tor-project
    check_file "${db_traits_datafile}" 5
}

# Validate the extra inputs with check_file: the ISP file (at least one line)
# and the borders file (presence only).
# Globals: isp_datafile, borders_datafile (read)
check_additional_resources() {
    echo "${FUNCNAME[0]}"

    # Paths are quoted so each one reaches check_file as a single argument.
    check_file "${isp_datafile}" 1
    check_file "${borders_datafile}"
}

# Run the generator with the section-style option set (ISP, regions, AS,
# IPREG, TOR and borders sections).
# Arguments: any extra options, appended verbatim to the generator command line
# Globals:   db_traits_datafile (sourced; expected to define db_host, db_user,
#            db_pswd, db_port, db_name), geogen_tool, geodata_check_flag,
#            geodata_format, geobin_datafile and the *_datafile inputs
generate_geodata5_bin() {
    echo "${FUNCNAME[0]}"

    . "${db_traits_datafile}"

    echo "'${geobin_datafile}' generation..."
    # geodata_check_flag stays unquoted on purpose: when it is empty it must
    # vanish from the command line instead of becoming an empty argument.
    # "$@" (not $*) forwards the caller's options without re-splitting them.
    "${geogen_tool}" \
        ${geodata_check_flag} \
        --format="${geodata_format//./}" \
        --output="${geobin_datafile}" \
        --isp-section="${isp_datafile}" \
        --regions-section=@DB \
        --regions-locales-section=@DB \
        --as-section="${assets_datafile},200" \
        --ipreg-section="${ipreg_datafile},200" \
        --tor-section="${tor_datafile}" \
        --borders-section="${borders_datafile}" \
        --dbhost="${db_host}" \
        --dbuser="${db_user}" \
        --dbpass="${db_pswd}" \
        --dbport="${db_port}" \
        --dbname="${db_name}" \
        "$@"
}

# Run the generator with the --asset/--tor/--ipregv6 option set.
# Arguments: any extra options, appended verbatim to the generator command line
# Globals:   db_traits_datafile (sourced; expected to define db_host, db_user,
#            db_pswd, db_port, db_name), geogen_tool, geodata_check_flag,
#            geodata_format, geobin_datafile, assets_datafile, tor_datafile,
#            ipreg_datafile
generate_geodata4_bin() {
    echo "${FUNCNAME[0]}"

    . "${db_traits_datafile}"

    echo "'${geobin_datafile}' generation..."
    # geodata_check_flag stays unquoted on purpose: when it is empty it must
    # vanish from the command line instead of becoming an empty argument.
    # "$@" (not $*) forwards the caller's options without re-splitting them.
    "${geogen_tool}" \
        ${geodata_check_flag} \
        --format="${geodata_format//./}" \
        --output="${geobin_datafile}" \
        --asset="${assets_datafile}" \
        --tor="${tor_datafile}" \
        --ipregv6="${ipreg_datafile}" \
        --dbhost="${db_host}" \
        --dbuser="${db_user}" \
        --dbpass="${db_pswd}" \
        --dbport="${db_port}" \
        --dbname="${db_name}" \
        "$@"
}

# Run the LIB-776 checker on the generated geodata; any line it prints counts
# as a detected problem.
# Globals: bin_path, geobin_datafile (read); dangling_amount (written)
# Side effects: the checker output is saved to
#               "${geobin_datafile}.dangling_regions"
# Exits:   terminates the shell with status 1 when the checker printed anything
check_dangling_regions() {
    echo "${FUNCNAME[0]}"

    local report="${geobin_datafile}.dangling_regions"
    # Paths are quoted: with a space in bin_path the checker would not even
    # start, the report would stay empty and the check would pass silently.
    "${bin_path}/LIB-776.check-dangling-regions.py" "${geobin_datafile}" \
    | tee "${report}"

    # NOTE(review): dangling_amount is left global as before, in case a caller
    # reads it after this function returns -- confirm and make it local if not.
    dangling_amount=$(wc -l < "${report}")
    if [ 0 -ne "${dangling_amount}" ]; then
        echo ">>> BUG: dangling regions were detected !!! <<<"
        exit 1
    fi
}

# Run the LIB-775 checker on the generated geodata and report what it found.
# The finding is only printed; it does not fail the build.
# Globals: bin_path, geobin_datafile (read); detected_amount, subj_text (written)
# Side effects: the checker output is saved to
#               "removed-regions-with-borders.list" in the current directory
check_borders_removed_regions() {
    echo "${FUNCNAME[0]}"

    local report="removed-regions-with-borders.list"
    # NOTE(review): detected_amount and subj_text are left global as before, in
    # case a caller reads them afterwards -- confirm and make them local if not.
    detected_amount=$("${bin_path}/LIB-775.check-removed-regions-borders.py" "${geobin_datafile}" \
        | tee "${report}" | wc -l)
    if [ 0 -lt "${detected_amount}" ]; then
        subj_text="borders of removed regions, ${detected_amount} // ${geobin_datafile}"
        # Quoted so the path is printed verbatim (no re-splitting, no globbing).
        echo "${subj_text}"
        cat "${report}"
        # TODO(dieash) send-email
    fi
}

# Run the LIB-871 linguistics checker on the generated geodata.
# Globals: bin_path, geobin_datafile (read)
check_spaced_linguistics() {
    echo "${FUNCNAME[0]}"

    # Quoted so that paths with spaces reach the checker intact.
    "${bin_path}/LIB-871.check-en-ling.py" --geodata "${geobin_datafile}"
}

# Run the optional city-id checker on the generated geodata; the check is
# skipped (successfully) when the checker file is absent.
# Globals: city_id_checker, geobin_datafile (read)
check_city_id() {
    echo "${FUNCNAME[0]}"

    # Quoted: unquoted, a path with spaces makes `[` fail with a syntax error,
    # which would be taken for "checker present".
    if [ ! -f "${city_id_checker}" ]; then
        echo "no util ${city_id_checker}... SKIP check."
        return
    fi

    "${city_id_checker}" "${geobin_datafile}"
}

# Run the region-tree / linguistics checks one after another:
# city ids, dangling regions, linguistics.
test_tree_lings() {
    printf '%s\n' "${FUNCNAME[0]}"

    # TODO(dieash@) suggest-testing(maybe)
    local probe
    for probe in check_city_id check_dangling_regions check_spaced_linguistics; do
        "${probe}"
    done
}

# Validate the generated geodata: file presence, the external checker, the
# external util (called with -i -f) and finally the tree/linguistics checks.
# Globals: geobin_datafile, geocheck_tool, geoutils_tool (read)
check_geodata_bin() {
    echo "${FUNCNAME[0]}"

    # Quoted so that a path with spaces is passed as one argument everywhere.
    check_file "${geobin_datafile}"
    "${geocheck_tool}" "${geobin_datafile}"
    "${geoutils_tool}" -i -f "${geobin_datafile}"

    test_tree_lings
}

# Download the region export, pipe it through prepare-eu-list.py, store the
# result in ${eu_regions_fname} and print its line count.
# Globals: curl_cmd, bin_path, eu_regions_fname (read)
prepare_eu_list() {
    echo "${FUNCNAME[0]}"
    local export_url="http://geoexport.yandex.ru/?types=_all_&fields=country_id,id"

    # The URL is quoted: it contains "?", which is a glob character when left
    # unquoted. curl_cmd itself is a command line stored in a string, so it is
    # expanded unquoted on purpose.
    ${curl_cmd} "${export_url}" \
        | "${bin_path}/prepare-eu-list.py" \
        | tee "${eu_regions_fname}" \
        | wc -l
}

# Make sure ${iso_alpha3_fname} exists: when it is missing, copy the file of
# the same name that ships next to this script.
# Globals: iso_alpha3_fname, script_dir (read)
prepare_iso_alpha3_list() {
    echo "${FUNCNAME[0]}"

    if [ ! -f "${iso_alpha3_fname}" ]; then
        echo "let's use local file"
        cp "${script_dir}/$(basename "${iso_alpha3_fname}")" "${iso_alpha3_fname}"
    fi

    # NOTE: this function used to carry a remote-download variant placed after
    # an unconditional `return`, i.e. it could never run. The dead code was
    # removed; it wrote the header "#@desc ${iso_alpha3_column_name}@string"
    # into the list file and appended the body fetched from
    #   https://proxy.sandbox.yandex-team.ru/last/GEODATA_VERSIONS?owner=GEOBASE&attrs=%7B%22released%22:%22stable%22,%22data_kind%22:%22iso_alpha3_list%22%7D
}

# Compare the ids listed in a text file with the ids for which the generated
# geodata carries the given field.
# Arguments:
#   $1 - field name passed to extract-id-if-field.py
#   $2 - text list; lines starting with "#" and empty lines are skipped, the
#        first column of the remaining lines is taken as the id
# Side effects: the ids extracted from the geodata are saved to "<$2>_bin"
# Exits:   terminates the shell with status 1 when the two id sets differ
check_new_columns() {
    local field=$1
    local org_data=$2

    echo "${FUNCNAME[0]} // ${field} // ${org_data}"

    # Both lists are sorted numerically and compared by md5 digest.
    # `local var=$(...)` is kept on purpose: it hides the pipeline status, so a
    # list with nothing left after filtering (grep -v exits 1) does not abort
    # the script under `set -e -o pipefail`.
    # Paths are quoted: unquoted, a path with spaces makes both pipelines
    # digest empty input, so a mismatch would go unnoticed.
    local md5sum_txt_list=$(grep -v -e '^#' -e '^$' "${org_data}" | awk '{ print $1 }' | sort -n | md5sum)
    local md5sum_bin_list=$("${bin_path}/extract-id-if-field.py" --field-name "${field}" --geodata "${geobin_datafile}" | tee "${org_data}_bin" | sort -n | md5sum)

    echo "${md5sum_txt_list} vs ${md5sum_bin_list}"
    if [ "${md5sum_txt_list}" != "${md5sum_bin_list}" ]; then
        echo "something wrong with data in column"
        exit 1
    fi
}

# Check the "is_eu" field of the geodata against ${eu_regions_fname}.
check_eu_list() {
    # Quoted so a path with spaces stays a single argument.
    check_new_columns is_eu "${eu_regions_fname}"
}

# Check the ${iso_alpha3_column_name} field of the geodata against
# ${iso_alpha3_fname}.
check_iso_alpha3_list() {
    # Quoted so each value stays a single argument.
    check_new_columns "${iso_alpha3_column_name}" "${iso_alpha3_fname}"
}

# Feed the source data back through the per-section test scripts to verify the
# generated geodata: all assets, all TOR entries, and a sample of IPREG lines.
# Globals: bin_path, geobin_datafile, assets_datafile, tor_datafile,
#          ipreg_datafile (read)
test_geodata_bin() {
    echo "${FUNCNAME[0]}"

    # Inputs are redirected instead of piped through cat; all paths are quoted.
    "${bin_path}/assets-testing.py" --geodata "${geobin_datafile}" --check-full-range < "${assets_datafile}"
    "${bin_path}/tor-testing.py" --geodata "${geobin_datafile}" < "${tor_datafile}"
    # GNU sed "first~step": line 5 and every 100th line after it.
    sed -n 5~100p "${ipreg_datafile}" \
        | "${bin_path}/ipreg-testing.py" --geodata "${geobin_datafile}" --debug-progress-step=50000
}

# Extra verification for the section-style geodata: feed the ISP data through
# isp-testing.py, then run the removed-regions borders check.
# Globals: bin_path, geobin_datafile, isp_datafile (read)
test_geodata5_bin() {
    echo "${FUNCNAME[0]}"

    # Input is redirected instead of piped through cat; all paths are quoted.
    "${bin_path}/isp-testing.py" --geodata "${geobin_datafile}" < "${isp_datafile}"
    check_borders_removed_regions
}

# Verify the prerequisites for the S3 upload. Nothing is checked when
# S3MDS_ENDPOINT is empty.
# Globals: S3MDS_ENDPOINT, s3mds_traits_datafile (read)
# Exits:   terminates the shell with status 1 when the aws command or the
#          credentials file is missing
check_awscli() {
    echo "${FUNCNAME[0]}"

    if [ -z "${S3MDS_ENDPOINT}" ]; then
        return
    fi

    # `command -v` is the shell builtin way to look a command up; unlike
    # `which` it does not depend on an external program being installed.
    if ! command -v aws > /dev/null; then
        echo "no AWS-CLI"
        exit 1
    fi

    # Quoted: unquoted, a path with spaces makes `[` fail with a syntax error,
    # which would be taken for "file present".
    if [ ! -f "${s3mds_traits_datafile}" ]; then
        echo "no auth-file '${s3mds_traits_datafile}'"
        exit 1
    fi
}

# Print a long listing of the script directory and of the data directory.
# Globals: bin_path, data_path (read)
debug_info() {
    echo ">>> ${bin_path} <<<"
    ls -Al "${bin_path}"

    echo ">>> ${data_path} <<<"
    ls -Al "${data_path}"
}

# Print the remote content.list of the latest IPREG export (fetched from
# ${ipreg_last_traits_src}) under a banner line.
view_remote_content() { # debug-only
    printf '%s\n' ">>> last/IPREG_EXPORT/content.list <<<"
    local -a fetch=(curl --insecure --silent)
    "${fetch[@]}" "${ipreg_last_traits_src}"
}

# Print an apt sources list for the yandex-xenial repository: the unstable
# branch entries, an empty line, then the testing branch entries.
get_xenial_apt_source() {
    local repo="deb http://dist.yandex.ru/yandex-xenial"
    printf '%s\n' \
        "#" \
        "${repo} unstable/all/" \
        "${repo} unstable/amd64/" \
        "" \
        "#" \
        "${repo} testing/all/" \
        "${repo} testing/amd64/"
}

# Write the yandex-xenial sources list into the current directory and copy it
# into /etc/apt/sources.list.d/ via sudo.
fix_apt_sources() {
    local list_name="yandex-xenial-fix.list"
    get_xenial_apt_source > "${list_name}"
    sudo cp "${list_name}" /etc/apt/sources.list.d/
}

# Install the given packages with apt-get (non-interactive, via sudo) after
# adding the yandex-xenial sources and refreshing the package index.
# Arguments: package names / apt-get install arguments
install_required_debs() {
    echo "${FUNCNAME[0]}"

    fix_apt_sources

    sudo apt-get update
    # "$@" (not $*) passes every argument through unchanged: no re-splitting
    # and no glob expansion of things like "libfoo*".
    sudo DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install "$@"
}

# Copy the build result from the RAM drive into ${curr_dir}. Despite the name
# the result is copied, not moved. Nothing is done when no RAM drive is used
# or when the data directory already is ${curr_dir}.
# Globals: ramdrive_dir, data_path, curr_dir, result_fname_path (read)
# Exits:   terminates the shell with status 1 when the result does not exist
move_result_file_if_required() {
    echo "${FUNCNAME[0]}"

    if [ -z "${ramdrive_dir}" ]; then
        echo "${FUNCNAME[0]} // nothing to move"
        return
    fi

    if [ "${data_path}" = "${curr_dir}" ]; then
        echo "${FUNCNAME[0]} // equals path; nothing to move"
        return
    fi

    # Quoted: unquoted, a path with spaces breaks both the -e test and cp.
    if [ ! -e "${result_fname_path}" ]; then
        echo "${FUNCNAME[0]} // >>> UNABLE TO DETECT result: [${result_fname_path}]"
        exit 1
    fi

    cp -r "${result_fname_path}" "${curr_dir}"
}

# Pack the generated geodata into ${geobin_datafile_targz}, but only when the
# requested result file name ends in ".gz". The archive member is stored
# without its directory part.
# Globals: result_fname_path, geobin_datafile, geobin_datafile_targz (read;
#          the latter is not assigned in this part of the file)
# Exits:   terminates the shell with status 1 when the archive name is empty
generate_targz() {
    local gz_ext=gz
    local result_ext=${result_fname_path##*.}
    if [ "${gz_ext}" != "${result_ext}" ]; then
        return
    fi

    # Without an archive name tar would take the next word ("--transform") as
    # the archive file, so stop early with a clear message instead.
    if [ -z "${geobin_datafile_targz:-}" ]; then
        echo ">>> BUG: geobin_datafile_targz not specified"
        exit 1
    fi

    echo "'${geobin_datafile_targz}' archive generation..."
    # --transform strips everything up to the last "/" from member names.
    tar czvf "${geobin_datafile_targz}" --transform "s:^.*/::" "${geobin_datafile}"
    check_file "${geobin_datafile_targz}"
}

# Upload the generated geodata to s3://geobase/last/<file name> with the AWS
# CLI. Nothing is done when S3MDS_ENDPOINT is empty.
# Globals: S3MDS_ENDPOINT, geobin_datafile, s3mds_traits_datafile (read)
upload_s3mds() {
    echo "${FUNCNAME[0]}"
    if [ -z "${S3MDS_ENDPOINT}" ]; then
        echo "... do nothing"
        return
    fi

    # Quoted: unquoted, basename receives a path with spaces as several
    # operands and fails.
    local data_fname
    data_fname="$(basename "${geobin_datafile}")"
    local s3_fname_path="s3://geobase/last/${data_fname}"
    echo "... ${geobin_datafile} => ${s3_fname_path}"

    # The credentials file is handed to this one aws invocation only.
    AWS_SHARED_CREDENTIALS_FILE="${s3mds_traits_datafile}" \
        aws --endpoint-url="${S3MDS_ENDPOINT}" s3 cp "${geobin_datafile}" "${s3_fname_path}"
}
