#!/usr/bin/perl

=head1 METADATA

<crontab>
    time: */20 * * * *
    <switchman>
        group: scripts-other
        lockname: ppcCalcIsBSRarelyLoaded
        <leases>
            mem: 2048
        </leases>
    </switchman>
    package: scripts-switchman
</crontab>
<juggler>
    host:   checks_auto.direct.yandex.ru
    ttl:    10h
    tag:    direct_yt
    tag: direct_group_internal_systems
    <notification>
        template: on_status_change
        status: OK
        status: CRIT
        method: telegram
        login: DISMonitoring
    </notification>
</juggler>
<juggler_check>
    host:   checks_auto.direct.yandex.ru
    name:       direct.ppcCalcIsBSRarelyLoaded.check_last_sync_time
    raw_events: direct.ppcCalcIsBSRarelyLoaded.check_last_sync_time.production
    ttl:        5h
    tag: direct_group_internal_systems
    <notification>
        template: on_status_change
        <status>
            from: OK
            to: CRIT
        </status>
        <status>
            from: WARN
            to: CRIT
        </status>
        <status>
            from: CRIT
            to: OK
        </status>
        <status>
            from: CRIT
            to: WARN
        </status>
        method: telegram
        login: DISMonitoring
    </notification>
</juggler_check>

=head1 NAME

    ppcCalcIsBSRarelyLoaded.pl

=head1 DESCRIPTION

    Скрипт для обработки логов "геноцида" движка БК. Актуализирует значения phrases.is_bs_rarely_loaded по этим логам.

    Режимы работы скрипта:
    0. если сменился кластер, то перемещаем старые таблицы на новом кластере в папку tmp с добавлением к имени create_time
    1. нет старого снепшота - создать снепшот, сделать полную синхронизацию его по диапазону pid от 0 до max(pid),
        заменить новым снепшотом старый
    2. есть дифф-табличка - налить дифф в базу (по чанкам из NN строк таблицы),
        удалить дифф и заменить старый снепшот новым
    3. нет дифф-таблички и таблица-исходник свежее чем "предыдущая, использованная для генерации" - сделать новый снепшот,
        посчитать дифф между ними, влить в базу дифф, удалить дифф и заменить старый снепшот новым
    4. нет дифф таблички и таблица-исходник та же (или с меньшей датой создания, чем использованная предыдущая) - ничего не делаем

    Параметры:

        import-path - путь к папке, из которой берем таблицы с логом геноцида, по умолчанию import/genocide_results

=cut

use Direct::Modern;

use File::Basename qw/basename/;
use List::MoreUtils qw/all pairwise/;
use List::Util qw/min/;

use Yandex::DateTime;
use Yandex::DBShards;
use Yandex::DBTools;
use Yandex::ListUtils;
use Yandex::Overshard;
use Yandex::Shell;
use Yandex::TimeCommon;
use Yandex::Trace;
use Yandex::YT::Streaming;
use Yandex::YT::Table;
use Yandex::YT::TableReader;
use Yandex::Advmon;
use Yandex::Retry qw/relaxed_guard/;

use my_inc '..';

use EnvTools qw/is_beta/;
use ScriptHelper 'Yandex::Log' => 'messages';
use Settings;
use ShardingTools qw/ppc_shards/;
use Tools ();
use Property;

=head1 CONSTANTS

=head2 YT_CLUSTER_WITH_BS_DATA

    Default cluster to take the BS data from. Used when the property holds no value or an invalid one.

    Hard-coded to Hahn.
    There is no cluster auto-detection, because a full import from another cluster (different from the previous one)
    takes almost an order of magnitude longer than syncing a delta.

=cut

use constant YT_CLUSTER_WITH_BS_DATA => 'hahn';

=head2 FULL_SYNC_YT_CHUNK_SIZE

    Width of the pid range read from YT per chunk during a full sync.

=cut

use constant FULL_SYNC_YT_CHUNK_SIZE => 50_000_000;

=head2 INCREMENTAL_SYNC_YT_CHUNK_SIZE

    Number of diff-table rows read from YT per chunk during an incremental sync.

=cut

use constant INCREMENTAL_SYNC_YT_CHUNK_SIZE => 2_000_000;

=head2 FULL_SYNC_DB_READ_CHUNK_SIZE

    Width of the pid range selected from the database per sub-chunk during a full sync.

=cut

use constant FULL_SYNC_DB_READ_CHUNK_SIZE => 1_000_000;

=head2 DB_WRITE_CHUNK_SIZE

    Number of pids updated in the database by a single UPDATE statement.

=cut

use constant DB_WRITE_CHUNK_SIZE => 2_500;

# Column of the diff table that is read together with pid/is_bs_rarely_loaded and is
# passed to foreach_shard_parallel as the key used to split rows between shards.
my $CAMP_KEY = 'OrderID';

# Data exchanged with YT streaming is JSON.
$Yandex::YT::Streaming::DEFAULT_FORMAT = 'json';

# Names of the properties that keep the script state between runs.
my $LAST_SYNC_TIME_PROPERTY_NAME = 'ppcCalcIsBSRarelyLoaded_last_sync_time';
my $LAST_YT_CLUSTER_WITH_BS_DATA_PROPERTY_NAME = 'ppcCalcIsBSRarelyLoaded_last_yt_cluster_with_bs_data';
my $CURRENT_YT_CLUSTER_PROPERTY_NAME = 'ppcCalcIsBSRarelyLoaded_current_yt_cluster';
# Property whose value is handed to relaxed_guard (as "times") before writing to the database.
my $RELAXED_COEF_PROP = Property->new('bs_rarely_loaded_import_sleep_coef');

# Name of the YT table attribute used to compare how fresh source tables are.
my $FRESHNESS_TIME = 'freshness_time';

# catch errors coming from yash_system: log 'DIED!' together with the stack trace split into lines
local $SIG{__DIE__} = sub { $log->die({ message => 'DIED!', error => [split(qr/[\r\n]\s+/, Carp::longmess($_[0]) =~ s{[\r\n\s]+$}{}r)] }); };

# YT paths (relative): finished snapshots, work-in-progress tables, and the input directory.
my $export_path = 'export';
my $tmp_path = 'tmp';
my $import_path = 'import/genocide_results';

# On beta every user works inside a private directory under tmp/.
if (is_beta()) {
    $export_path = 'tmp/' . $ENV{USER} . '/export';
    $tmp_path = 'tmp/' . $ENV{USER};
}

# --import-path overrides the default input directory.
extract_script_params(
    "import-path=s" => \$import_path
);

# Main flow of the script: pick the cluster, take the lock, locate the source table and
# then run exactly one of the working modes listed in DESCRIPTION.
$log->out("START");

# The cluster comes from a property; anything except hahn/arnold falls back to the default.
my $cluster = Property->new($CURRENT_YT_CLUSTER_PROPERTY_NAME)->get();
$log->out({yt_cluster_from_property => $cluster});
unless ($cluster && $cluster =~ m/^(hahn|arnold)$/) {
    $log->out("invalid cluster from property, use default");
    $cluster = YT_CLUSTER_WITH_BS_DATA;
}

# The guard is kept in a file-level variable, so it lives until the script exits.
# NOTE(review): the third argument (0) is presumably the lock wait timeout - confirm.
$log->out("getting sql lock");
my $sql_lock_guard = sql_lock_guard(PPCDICT, get_script_name(short => 1), 0);
$log->out("sql lock obtained");

$log->out("setup YT environment: $cluster");
Tools::force_set_yt_environment($cluster);

# Tables the script works with:
#   $source_table - freshest non-empty table from $import_path (may be undef)
#   $old_snapshot - snapshot that has already been applied to the database
#   $new_snapshot - snapshot being built / applied
#   $diff_table   - difference between the old and the new snapshot
my $source_table = get_source_table();
my $old_snapshot = Yandex::YT::Table->new("$export_path/bs_load_probability");
my $new_snapshot = Yandex::YT::Table->new("$tmp_path/bs_load_probability");
my $diff_table = Yandex::YT::Table->new("$export_path/bs_load_probability_snapshots_diff");

$log->out({
    export_path => $export_path,
    tmp_path => $tmp_path,
    source_table => $source_table ? $source_table->node_name() : undef,
});

# Without a source table there is nothing to do: report a warning to juggler and stop.
if (!$source_table) {
    my $msg = 'No suitable source table found. Stop working';
    $log->out($msg);
    juggler_warn(description => $msg);
    exit;
}

# Report how long ago the last successful sync happened (WARN after 4 hours, CRIT after 8 hours).
# When the property is empty the epoch start is used.
my $last_sync_time_property = Property->new($LAST_SYNC_TIME_PROPERTY_NAME);
my $last_sync_time = $last_sync_time_property->get() || '1970-01-01 00:00:00';
$log->out("Last sync time: ".$last_sync_time);
juggler_check(service => 'direct.ppcCalcIsBSRarelyLoaded.check_last_sync_time',
    description => 'Разница (в секундах) с последнего успеха синхронизации в ppcCalcIsBSRarelyLoaded',
    value => (time() - mysql2unix($last_sync_time)),
    warn => 4*3600,
    crit => 8*3600,
);

# Mode 0: cluster switch. When the property is empty the current cluster is assumed.
my $last_yt_cluster_property = Property->new($LAST_YT_CLUSTER_WITH_BS_DATA_PROPERTY_NAME);
my $last_yt_cluster = $last_yt_cluster_property->get() || $cluster;
$log->out("Last sync cluster: ".$last_yt_cluster);
if ($last_yt_cluster ne $cluster) { # If the cluster has changed, archive the old tables on the new cluster
    $log->out("Start archiving old tables in new cluster");
    archive_old_tables_in_new_cluster();
}

# Make sure the working directories exist.
for my $path ($export_path, $tmp_path) {
    Yandex::YT::Table->new($path)->create("map_node");
}

if (!$old_snapshot->exists()) {
    # Mode 1: no previous snapshot - build one and sync the whole pid range.
    $log->out("Previous snapshot doesn't exists. Doing full sync");
    create_first_snapshot();

    set_last_cluster_prop();

    sync_snapshot();

    $log->out("Save snapshot as old");
    $new_snapshot->move($old_snapshot);
} elsif ($diff_table->exists()) {
    # Mode 2: a diff table already exists (left by an earlier run) - apply it.
    $log->out("Found old differences table. Applying again");
    apply_snapshots_diff();
} elsif ($source_table->get_attribute($FRESHNESS_TIME) gt ($old_snapshot->get_attribute('source_createtime') // 0)) {
    # Mode 3: the source table is newer (string comparison) than the one the old snapshot was built from.
    $log->out("Found new source table. Doing incremental sync");
    create_new_snapshot_and_calc_diff();
    set_last_cluster_prop();
    apply_snapshots_diff();
} else {
    # Mode 4: nothing new.
    $log->out("Direct data is fresh. Skip working");
}

juggler_ok();

$log->out("FINISH");

exit;

=head2 set_last_cluster_prop

    Log the cluster used by the current run and store it in the "last cluster" property.

=cut

sub set_last_cluster_prop {
    my $message = "Uptate last_yt_cluster_property to: " . $cluster;
    $log->out($message);
    $last_yt_cluster_property->set($cluster);
}

=head2 get_source_table

    Find the table with the genocide log.

    Picks the freshest of the two contours (key_1 or key_2): lists the nodes under $import_path,
    drops the tables whose row_count is 0 and returns the one with the greatest freshness_time
    attribute (compared as strings). A table without that attribute sorts last.

    Returns a Yandex::YT::Table, or undef when there is no non-empty table.

=cut

sub get_source_table {
    # Read the freshness attribute once per table instead of on every sort comparison.
    my @candidates =
        map  { $_->[1] }
        # newest first
        sort { $b->[0] cmp $a->[0] }
        map  { [ $_->get_attribute($FRESHNESS_TIME) // '', $_ ] }
        grep { $_->get_attribute('row_count') > 0 }
        Yandex::YT::Table->new($import_path)->list();

    return $candidates[0];
}

=head2 create_first_snapshot

    Build the very first snapshot in $tmp_path by running the create_bs_snapshot task.

    Does nothing when the new snapshot table already exists.
    Dies when the table is still missing after the task has finished.

=cut

sub create_first_snapshot {
    if ($new_snapshot->exists()) {
        $log->out("New snapshot already exists. Skip creating");
        return;
    }

    my $freshness = $source_table->get_attribute($FRESHNESS_TIME);

    $log->out("execute task for creating first data snapshot");
    {
        # the profile object is released as soon as the task returns
        my $task_profile = Yandex::Trace::new_profile('ppcCalcIsBSRarelyLoaded:create_first_snapshot');
        yash_system(
            'pyt',
            '--path' => my_inc::path('.'),
            'Direct::YT::Export::BsLoadProbability' => 'create_bs_snapshot',
            '-v' => "tmp_dir=$tmp_path",
            '-v' => "source_table=" . $source_table->node_name(),
            '-v' => "source_createtime=$freshness",
        );
    }

    die {message => "something went wrong - new snapshot was not created"}
        unless $new_snapshot->exists();

    $log->out("First snapshot created sucessfully");
}

=head2 sync_snapshot

    Full synchronisation of phrases.is_bs_rarely_loaded with the snapshot in $tmp_path.

    Walks the pid range 1 .. MAX(pid) in chunks of FULL_SYNC_YT_CHUNK_SIZE. For every chunk the pids
    present in the snapshot are loaded into memory, then every shard (in parallel) reads its phrases
    in sub-chunks of FULL_SYNC_DB_READ_CHUNK_SIZE and updates the rows whose flag differs from the
    snapshot: pid is in the snapshot => 1, otherwise => 0.

    The database is never read beyond the pid range that was read from YT for the same chunk.

    Dies when foreach_shard_parallel reports a failure. Updates the last sync time property at the end.

=cut

sub sync_snapshot {
    # an empty phrases table gives no MAX(pid); treat it as 0 so that the loop below is simply skipped
    my $max_pid = overshard(group => '', max => 'max_pid', get_all_sql(PPC(shard => "all"), 'SELECT MAX(pid) AS max_pid FROM phrases'))->[0]->{max_pid} // 0;

    my %BS_RARELY_LOADED_PIDS;
    # pre-allocate plenty of buckets so that the hash is not rebuilt while it is being filled
    keys (%BS_RARELY_LOADED_PIDS) = 32_000_000;

    for (my ($chunk_start, $chunk_end) = (1, 1 + FULL_SYNC_YT_CHUNK_SIZE - 1);
         $chunk_start <= $max_pid;
         ($chunk_start, $chunk_end) = ($chunk_end + 1, $chunk_end + FULL_SYNC_YT_CHUNK_SIZE)
    ) {
        %BS_RARELY_LOADED_PIDS = ();

        $log->out('processing pids chunk');

        # last pid (inclusive) covered by this chunk; the final chunk is cut at MAX(pid)
        my $last_pid_in_chunk = min($chunk_end, $max_pid);

        my $yt_chunk_start = $chunk_start;
        # the right bound of a YT range is exclusive, so widen the range by 1
        my $yt_chunk_end = $last_pid_in_chunk + 1;
        my $reader = Yandex::YT::TableReader->new("$tmp_path/bs_load_probability{pid}[$yt_chunk_start:$yt_chunk_end]");
        $log->out("read data chunk from YT");
        while (my $r = $reader->next()) {
            # only the presence of the key matters
            $BS_RARELY_LOADED_PIDS{ $r->{pid} } = undef;
        }

        my $shard_results = foreach_shard_parallel shard => [ppc_shards()], sub {
            local *__ANON__ = 'apply_full_sync_chunk_in_shard';

            my $shard = shift;
            Yandex::Trace::restart(\$ScriptHelper::trace, tags => "worker,shard=$shard");

            $log->out({shard => $shard, message => "read data chunk from PPC"});
            my (@to_add, @to_remove);
            for (my ($subchunk_start, $subchunk_end) = ($chunk_start, $chunk_start + FULL_SYNC_DB_READ_CHUNK_SIZE - 1);
                 $subchunk_start <= $last_pid_in_chunk;
                 ($subchunk_start, $subchunk_end) = ($subchunk_end + 1, $subchunk_end + FULL_SYNC_DB_READ_CHUNK_SIZE)
            ) {
                $log->out({shard => $shard, message => "read data sub-chunk from PPC"});

                # never select pids that were not read from YT for this chunk: they would all
                # look like "absent from the snapshot" and could get their flag reset
                my $db_range_end = min($subchunk_end, $last_pid_in_chunk);

                # ordered by cid for locality, to optimise recalculation of aggregated statuses
                my $sth = exec_sql(PPC(shard => $shard), ['SELECT pid, is_bs_rarely_loaded FROM phrases',
                        WHERE => { pid__between__int => [ $subchunk_start, $db_range_end ] },
                        "ORDER BY cid" ]);
                while ( my ($pid, $db_rarely_loaded) = $sth->fetchrow_array() ) {
                    my $bs_rarely_loaded = exists $BS_RARELY_LOADED_PIDS{ $pid } ? 1 : 0;

                    # the database already agrees with the snapshot
                    next if $db_rarely_loaded == $bs_rarely_loaded;

                    if ($bs_rarely_loaded) {
                        push @to_add, $pid;
                    } else {
                        push @to_remove, $pid;
                    }
                }
                $sth->finish();
            }

            $log->out({
                message => 'full sync db chunk stat',
                shard => $shard,
                to_add => scalar(@to_add),
                to_remove => scalar(@to_remove),
                chunk_start => $chunk_start,
                chunk_end => $chunk_end
            });

            _db_chunked_write(shard => $shard, to_add => \@to_add, to_remove => \@to_remove);
        };
        # every value reported for every shard has to be true
        unless (all {all {$_} @$_} values %$shard_results) {
            die {error => "somethind wrong in foreach_shard_parallel", shard_results => $shard_results};
        }
    }
    update_last_sync_time_property();
}

=head2 create_new_snapshot_and_calc_diff

    Run the make_new_snapshot_and_calc_diff task for the current source table.

    Dies when the diff table does not exist after the task has finished.

=cut

sub create_new_snapshot_and_calc_diff {
    my $freshness = $source_table->get_attribute($FRESHNESS_TIME);

    $log->out("execute task for creating new data snapshot and caclulate diff between snapshots");

    {
        # the profile object is released as soon as the task returns
        my $task_profile = Yandex::Trace::new_profile('ppcCalcIsBSRarelyLoaded:create_new_snapshot_and_calc_diff');
        yash_system(
            'pyt',
            '--path' => my_inc::path('.'),
            'Direct::YT::Export::BsLoadProbability' => 'make_new_snapshot_and_calc_diff',
            '-v' => "tmp_dir=$tmp_path",
            '-v' => "export_dir=$export_path",
            '-v' => "source_table=" . $source_table->node_name(),
            '-v' => "source_createtime=$freshness",
        );
    }

    die {message => "something went wrong - diff table was not created"}
        unless $diff_table->exists();

    $log->out("New snapshot and differences tables created sucessfully");
}

=head2 apply_snapshots_diff

    Apply the diff table to the database (incremental sync).

    The diff table is read by row ranges of INCREMENTAL_SYNC_YT_CHUNK_SIZE rows. Rows of every range
    are split between shards by $CAMP_KEY and written in parallel; rows whose shard is zero or
    unknown are only logged. Afterwards the added/removed counters are sent to graphite, the
    replace_snapshot_and_remove_diff task is run and the last sync time property is updated.

    Dies when foreach_shard_parallel reports a failure.

=cut

sub apply_snapshots_diff {
    my $diff_rows_cnt = $diff_table->get_attribute('row_count');
    $log->out("diff table has $diff_rows_cnt rows");

    my $groups_added = 0;
    my $groups_removed = 0;

    # Row ranges are half-open: [#chunk_start:#chunk_end). Row indexes start at 0, so a range that
    # starts at row_count is always empty - hence the strict "<".
    for (my ($chunk_start, $chunk_end) = (0, 0 + INCREMENTAL_SYNC_YT_CHUNK_SIZE);
         $chunk_start < $diff_rows_cnt;
         ($chunk_start, $chunk_end) = ($chunk_end , $chunk_end + INCREMENTAL_SYNC_YT_CHUNK_SIZE)
    ) {
        my @data;

        $log->out("read data chunk from YT");
        my $diff_reader = $diff_table->reader([qw/pid is_bs_rarely_loaded/, $CAMP_KEY], "[#$chunk_start:#$chunk_end]");
        while (my $row = $diff_reader->next()) {
            push @data, $row;
            if ($row->{is_bs_rarely_loaded} eq 'true') {
                $groups_added++;
            } else {
                $groups_removed++;
            }
        }

        $log->out("prepare data chunk");
        my $shard_results = foreach_shard_parallel $CAMP_KEY => \@data, with_undef_shard => 1, chunk_size => 0, sub {
            local *__ANON__ = 'apply_snapshots_diff_in_shard';

            my ($shard, $chunk) = @_;
            Yandex::Trace::restart(\$ScriptHelper::trace, tags => "worker,shard=$shard");

            # rows that could not be mapped to a real shard are dumped to the log and skipped
            if (!$shard) {
                my $suffix = defined $shard ? 'zero_shard_data' : 'unknown_shard_data';
                $log->bulk_out($suffix => $chunk);
                return;
            }

            my (@to_add, @to_remove);
            for my $row (@$chunk) {
                if ($row->{is_bs_rarely_loaded} eq "true") {
                    push @to_add, $row->{pid};
                } else {
                    push @to_remove, $row->{pid};
                }
            }

            $log->out({
                message => 'incremental sync db chunk stat',
                shard => $shard,
                to_add => scalar(@to_add),
                to_remove => scalar(@to_remove),
                chunk_start_row => $chunk_start,
                chunk_end_row => $chunk_end - 1,
            });

            _db_chunked_write(shard => $shard, to_add => \@to_add, to_remove => \@to_remove);
        };
        # every value reported for every shard has to be true
        unless (all {all {$_} @$_} values %$shard_results) {
            die {error => "somethind wrong in foreach_shard_parallel", shard_results => $shard_results};
        }
    }

    _send_to_graphite($groups_added, $groups_removed);

    $log->out("execute task for replace old snapshot with new one and remove diff table");
    yash_system('pyt',
                '--path' => my_inc::path('.'),
                'Direct::YT::Export::BsLoadProbability' => 'replace_snapshot_and_remove_diff',
                '-v' => "tmp_dir=$tmp_path",
                '-v' => "export_dir=$export_path",
    );
    update_last_sync_time_property();
}

=head2 _db_chunked_write

    Write new is_bs_rarely_loaded values for one shard.

    Named parameters:
        shard     - shard number
        to_add    - arrayref of pids that get is_bs_rarely_loaded = 1
        to_remove - arrayref of pids that get is_bs_rarely_loaded = 0

    Removals are written first, then additions; both in portions of DB_WRITE_CHUNK_SIZE pids.
    LastChange is set to NOW() on every updated row.

=cut

sub _db_chunked_write {
    my %args = @_;
    # the "times" value comes from a property, 2 is used when the property is not set
    my $relaxed = relaxed_guard times => $RELAXED_COEF_PROP->get(60) // 2;
    my $shard = $args{shard};
    $log->out({shard => $shard, message => "apply data to PPC"});

    my @steps = (
        { pids => $args{to_remove}, flag => 0, message => "remove is_bs_rarely_loaded for pids chunk" },
        { pids => $args{to_add},    flag => 1, message => "add is_bs_rarely_loaded for pids chunk" },
    );

    for my $step (@steps) {
        for my $pids_chunk (chunks($step->{pids}, DB_WRITE_CHUNK_SIZE)) {
            $log->out({shard => $shard, message => $step->{message}});
            do_update_table(
                PPC(shard => $shard),
                'phrases',
                {is_bs_rarely_loaded => $step->{flag}, LastChange__dont_quote => 'NOW()'},
                where => { pid__int => $pids_chunk },
            );
        }
    }
}

=head2 _send_to_graphite

    Send the counters of added and removed groups to monitoring (monitor_values).

    Positional parameters: number of groups added, number of groups removed.

=cut

sub _send_to_graphite {
    my ($groups_added_cnt, $groups_removed_cnt) = @_;

    # the prefix is overridden only for the duration of this call
    local $Yandex::Advmon::GRAPHITE_PREFIX = sub {
        return ['direct_one_min', 'db_configurations', $Settings::CONFIGURATION];
    };

    my %counters = (
        groups_added   => $groups_added_cnt,
        groups_removed => $groups_removed_cnt,
    );
    monitor_values({ bs_rarely_loaded => \%counters });
}

=head2 update_last_sync_time_property

    Log the current time (as returned by human_datetime) and store it in the last sync time property.

=cut

sub update_last_sync_time_property {
    my $sync_time = human_datetime();
    $log->out("Uptate last_sync_time_property to: $sync_time");
    $last_sync_time_property->set($sync_time);
}

=head2 archive_old_tables_in_new_cluster

    Move the old snapshot, the new snapshot and the diff table (those that exist)
    to $tmp_path, appending the table creation time to the name.

=cut

sub archive_old_tables_in_new_cluster {
    TABLE:
    for my $table ($old_snapshot, $new_snapshot, $diff_table) {
        unless ($table->exists()) {
            $log->out($table->node_name()." doesn't exists for archive, skip");
            next TABLE;
        }

        my $short_name = basename($table->node_name);
        # creation_time comes back formatted like 2014-05-07T23:32:34.878006Z
        my $created_at = iso8601_2_datetime($table->get_attribute("creation_time"));
        my $archive = Yandex::YT::Table->new($tmp_path . '/' . $short_name . '-' . $created_at);

        $log->out("archiving ".$table->node_name());
        $table->move($archive);
        $log->out("archived to ".$archive->node_name());
    }
    $log->out("All tables archived sucessfully");
}
