#!/usr/bin/env perl

#  $Id$

=pod

=encoding utf8

=head1 NAME

ppcResendDomainsBS.pl - получить и посчитать правки доменов с учётом зеркал, отправить их в БК

=cut

use my_inc "..";
use strict;
use warnings;

=head1 $ULIMIT_FSIZE

Размер лимита на размер файла в байтах.
NB: при смене здесь нужно пересчитать значение в cron-секции.

=cut

my $ULIMIT_FSIZE = 22 * 1024 * 1024 * 1024; # 22 GiB

=head1 METADATA

<crontab>
    time:    0 5 * * *
    package: scripts-switchman
    ulimit:  -Sf 46137344   # здесь количество блоков по 512 байт
    <switchman>
        group:      scripts-other
        <leases>
            mem:    22528   # здесь количество MiB
        </leases>
    </switchman>
</crontab>
<juggler>
    host:   checks_auto.direct.yandex.ru
    ttl:            32h
    tag: direct_group_internal_systems
</juggler>

=cut

use File::Basename qw(dirname);
use File::Path qw(mkpath);
use List::MoreUtils qw(uniq any);

use Yandex::DBShards;
use Yandex::DBTools;
use Yandex::IDN qw(is_valid_domain);
use Yandex::ListUtils qw(xsort);
use Yandex::Shell qw(yash_system);
use Yandex::Validate ();
use Yandex::YT::Table;
use Yandex::YT::TableReader;

use Settings;
use ScriptHelper
        'Yandex::Log' => 'messages',
        sigterm => undef,
        ;
use Tools;

use BS::ResyncQueue;
use MirrorsTools;
use Primitives;
use ShardingTools;
use TextTools ();

use constant {
    CID => 0,
    BID => 1,
    PHONE => 2,
};

# Command-line switches and their defaults.
#   --cid=N (repeatable)     restrict processing to the given campaign ids
#   --[no-]update-db         whether main() asks MirrorsTools to update its DB
#   --dry-run                only log the changes, do not write or resync anything
#   --ulimit=N               override $ULIMIT_FSIZE used for the size warning
#   --[no-]get-mirrors       run the "download mirrors file" phase
#   --[no-]resend-domains    run the "recalculate and resend domains" phase
my $GET_MIRRORS    = 1;
my $RESEND_DOMAINS = 1;
my $DRY_RUN        = 0;
my $UPDATE_DB      = 1;
my @ONLY_CIDS;

extract_script_params(
    'cid=i@'          => \@ONLY_CIDS,
    'update-db!'      => \$UPDATE_DB,
    'dry-run'         => \$DRY_RUN,
    'ulimit=i'        => \$ULIMIT_FSIZE,
    'get-mirrors!'    => \$GET_MIRRORS,
    'resend-domains!' => \$RESEND_DOMAINS,
);

# With both phases switched off there is nothing left to do.
if (!$GET_MIRRORS && !$RESEND_DOMAINS) {
    die "no-get-mirrors and no-resend-domains are incompatible!";
}

# Phase 1 (--get-mirrors): build a fresh $Settings::MIRRORS_FILE.
# The file is the gzip of the latest deployed mirrors dump downloaded from YT
# followed by the clones data produced by get_clones_to(). It is written to a
# temporary name first and renamed into place only after the pipeline succeeded.
if ($GET_MIRRORS) {
    my $yt_bin          = "/usr/bin/yt";
    my $yt_dump_root    = "//home/mirrors/dump/";
    my $mirrors_cluster = 'arnold'; # the dump is not available on Hahn yet

    umask 002;

    $log->msg_prefix('get_mirrors');
    $log->out('start');

    my $target     = $Settings::MIRRORS_FILE;
    my $target_dir = dirname($target);

    # make sure the destination directory exists
    unless (-d $target_dir) {
        $log->out("Creating directory: $target_dir");
        mkpath $target_dir;
    }

    # clones are dumped first, into a side file that is appended below
    my $clones_tmp = "$target.clones.tmp";
    get_clones_to($clones_tmp);

    # find out which dump is the currently deployed one
    $log->out("detect last mirrors state");
    Tools::force_set_yt_environment($mirrors_cluster);
    my $last_state = Yandex::YT::Table->new($yt_dump_root)->get_attribute('last_deployed_state');
    $log->out({last_deployed_state => $last_state});

    # download the dump, append the clones and compress everything in one pipeline
    my $yt_mirrors_path = "${yt_dump_root}${last_state}/mirrors.res.prod";
    $log->out("download $yt_mirrors_path");
    my $download_tmp = "$target.tmp";
    my $pipeline = "set -o pipefail; ($yt_bin download --config '{read_progress_bar = {enable = %false}}' $yt_mirrors_path; cat $clones_tmp) | gzip >$download_tmp";
    yash_system("bash", "-e", "-c", $pipeline);

    # move the temporary file into place
    $log->out("rename $download_tmp to $target");
    rename($download_tmp, $target)
        or $log->die("Can't rename $download_tmp => $target: $!");

    $log->out('finish');
}

# Phase 2 (--resend-domains): stop here when the phase is switched off,
# otherwise recalculate filter domains and resend the affected banners.
exit unless $RESEND_DOMAINS;
main();

# main()
#
# Recalculates filter domains for banner domains and vcard phones using the
# mirrors data, stores the changes in the filter_domain table and passes the
# affected banners to bs_resync() with priority PRIORITY_RESEND_DOMAINS_BS.
#
# Reads the script-level switches:
#   @ONLY_CIDS     - when set, only the shards of these campaigns are processed,
#                    the SQL is restricted to them and $UPDATE_DB is forced to 0;
#   $UPDATE_DB     - create MirrorsTools with update_db => 1 and check the DB
#                    file size against $ULIMIT_FSIZE;
#   $DRY_RUN       - only log what would change, no writes and no bs_resync();
#   $GET_MIRRORS   - takes part in deciding whether the juggler event is sent.
#
# Takes no arguments; the return value is not meaningful.
sub main
{
    my $resync_priority = BS::ResyncQueue::PRIORITY_RESEND_DOMAINS_BS;
    # share of $ULIMIT_FSIZE (in percent) occupied by the mirrors DB file
    # above which the juggler status is switched to WARN
    my $FSIZE_PERCENT_WARNING = 90;
    $log->out("start");
    my ($juggler_status, $juggler_description) = ('OK', 'OK');

    my @SHARDS;
    my $CIDS_ONLY_SQL = '1';
    if (@ONLY_CIDS) {
        $log->out('Working only on cids: ' . join (',', @ONLY_CIDS));
        @SHARDS = uniq(grep {defined $_} values %{ get_shard_multi(cid => \@ONLY_CIDS) });
        $log->die('No shards to process') unless @SHARDS;
        $CIDS_ONLY_SQL = sql_condition({'b.cid__int' => \@ONLY_CIDS});
        # turn off processing of the mirrors (gluing) file, because it takes long
        $UPDATE_DB = 0;
    } else {
        @SHARDS = ppc_shards();
    }

    my $mirror;
    if ($UPDATE_DB) {
        $log->out('Updating mirrors DB');
        $mirror = MirrorsTools->new(update_db => 1, log => $log);
        # '--' is only a placeholder for the log line when no limit is set;
        # the numeric comparison below is guarded by $ULIMIT_FSIZE
        my $used_pct = $ULIMIT_FSIZE ? int(100 * $mirror->{db_filesize} / $ULIMIT_FSIZE) : '--';
        my $msg = "$mirror->{db_filename} size: $mirror->{db_filesize}, limit: $ULIMIT_FSIZE, used: $used_pct%";
        $log->out($msg);
        if ($ULIMIT_FSIZE && $used_pct > $FSIZE_PERCENT_WARNING) {
            $juggler_status = 'WARN';
            $juggler_description = $msg;
        }
    } else {
        $mirror = MirrorsTools->new(use_db => 1, dont_load_file => 1);
    }

    my %shards_check = map {$_ => undef} @SHARDS;
    foreach_shard_parallel_verbose($log, sub {
        my ($shard) = @_;
        # Skip shards that hold none of the requested campaigns. Leave the
        # callback with a plain return: "next" here would exit the subroutine
        # through whatever loop the caller happens to run ("Exiting subroutine
        # via next" warning) and depends on the library internals.
        return unless exists $shards_check{$shard};
        $log->out("start process this shard");

        # ---- part 1: banners that have a domain ----
        $log->out("processing domains");
        my $sth_domains = exec_sql(PPC(shard => $shard), "
            SELECT reverse(b.reverse_domain) as domain
                , ifnull(ft.filter_domain, reverse(b.reverse_domain)) as filter_domain
                FROM banners b
                    LEFT JOIN filter_domain ft on ft.domain = reverse(b.reverse_domain)
                WHERE b.reverse_domain IS NOT NULL
                    AND $CIDS_ONLY_SQL
                GROUP BY b.reverse_domain
                ORDER BY null
        ");

        my ($banners_cnt, $domains_cnt, $phones_cnt) = (0, 0, 0);
        while(my ($domain, $filter_domain) = $sth_domains->fetchrow_array) {
            next if !is_valid_domain($domain);
            my $new_filter_domain = $mirror->domain_filter(Yandex::IDN::idn_to_ascii($domain));
            # an undefined result is silently skipped
            if (defined $new_filter_domain && $filter_domain ne $new_filter_domain) {
                # only banners already synced with BS need an explicit resend
                my $banners = get_all_sql(PPC(shard => $shard), "
                                SELECT cid, bid, $resync_priority AS priority
                                  FROM banners
                                 WHERE reverse_domain = ?
                                   AND statusBsSynced = 'Yes'
                                   AND $CIDS_ONLY_SQL
                                ",
                                  reverse_domain($domain)
                    );
                $banners_cnt += @$banners;
                $domains_cnt++;
                $log->out("$domain: $filter_domain => $new_filter_domain, to resend: ".scalar(@$banners)." banners");
                if (!$DRY_RUN) {
                    if ($domain eq $new_filter_domain) {
                        # a domain that filters to itself needs no row at all
                        do_sql(PPC(shard => $shard), "DELETE FROM filter_domain WHERE domain = ?", $domain);
                    } else {
                        do_sql(PPC(shard => $shard), "INSERT INTO filter_domain (domain, filter_domain)
                                     VALUES (?, ?)
                                  ON DUPLICATE KEY UPDATE
                                     filter_domain = values(filter_domain)", $domain, $new_filter_domain);
                    }
                    if (@$banners) {
                        bs_resync($banners);
                    }
                }
            }
        }
        $sth_domains->finish();

        # ---- part 2: banners without a domain but with a vcard phone ----
        $log->out("processing phones");
        # NOTE(review): GROUP BY v.phone yields a single (cid, bid) per phone,
        # so only that one banner is resynced for a phone - confirm this is intended.
        my $sth_phones = exec_sql(PPC(shard => $shard), qq/
                SELECT b.cid, b.bid, v.phone
                  FROM banners b
                       JOIN vcards v ON v.vcard_id = b.vcard_id
                 WHERE b.reverse_domain IS NULL
                       AND v.phone IS NOT NULL
                       AND v.phone != ''
                       AND $CIDS_ONLY_SQL
                 GROUP BY v.phone
                 ORDER BY null
                    /);
        # rows are consumed in chunks of up to 100_000 to bound memory usage
        while (my $banners_chunk = $sth_phones->fetchall_arrayref(undef, 100_000)) {
            # phone => phone domain, for valid phones only
            my %phone2phone_domain;
            for my $banner (@$banners_chunk) {
                next unless Yandex::Validate::is_valid_phone($banner->[PHONE]);
                $phone2phone_domain{ $banner->[PHONE] } //= TextTools::phone_domain($banner->[PHONE]);
            }

            # fetch the stored filter domains for the current chunk
            my $phone_domain2old_filter_domain =
                get_hash_sql(PPC(shard => $shard), [
                                        'SELECT domain, filter_domain FROM filter_domain',
                                        WHERE => { domain => [values %phone2phone_domain] }
                                    ]);

            my @resync_data;
            # per-chunk cache: phone domain => freshly computed filter domain
            my $phone_domain2new_filter_domain = {};
            for my $banner (@$banners_chunk) {
                my $phone_domain = $phone2phone_domain{ $banner->[PHONE] };
                next unless $phone_domain;

                my $new_filter_domain;
                # no stored row means the phone domain filters to itself
                my $old_filter_domain = $phone_domain2old_filter_domain->{ $phone_domain } // $phone_domain;
                if (exists $phone_domain2new_filter_domain->{ $phone_domain }) {
                    # already cached
                    $new_filter_domain = $phone_domain2new_filter_domain->{ $phone_domain };
                } else {
                    # compute the new filter domain
                    $new_filter_domain = $mirror->domain_filter($phone_domain);
                    $phone_domain2new_filter_domain->{ $phone_domain } = $new_filter_domain;

                    # the table is updated once per phone domain, on the first hit
                    if (defined $new_filter_domain && $new_filter_domain ne $old_filter_domain) {
                        $log->out("$phone_domain: $old_filter_domain => $new_filter_domain");
                        if (!$DRY_RUN) {
                            if ($new_filter_domain eq $phone_domain) {
                                do_sql(PPC(shard => $shard), "DELETE FROM filter_domain WHERE domain = ?", $phone_domain);
                            } else {
                                do_sql(PPC(shard => $shard), "
                                               INSERT INTO filter_domain (domain, filter_domain)
                                                    VALUES (?, ?)
                                   ON DUPLICATE KEY UPDATE filter_domain = VALUES(filter_domain)",
                                    $phone_domain, $new_filter_domain,
                                );
                            }
                        }
                    }
                }

                # same policy as for domains: an undefined result is skipped
                # instead of being compared and written as a filter domain
                next unless defined $new_filter_domain;

                if ($new_filter_domain ne $old_filter_domain) {
                    $phones_cnt++;
                    $banners_cnt++;
                    push @resync_data, {cid => $banner->[CID], bid => $banner->[BID], priority => $resync_priority};
                }
            }
            if (@resync_data && !$DRY_RUN) {
                bs_resync(\@resync_data);
            }

        }
        $sth_phones->finish();

        $log->out("to resend $banners_cnt banners, $domains_cnt domains, $phones_cnt phones with new mirrors file");
    });

    $log->msg_prefix(undef);
    # the monitoring event is reported only for a complete, real run
    if ($UPDATE_DB && !@ONLY_CIDS && !$DRY_RUN && $GET_MIRRORS) {
        juggler_event(status => $juggler_status, description => $juggler_description);
    }
    $log->out("finish");
}


# get_clones_to($clones_file)
#
# Reads the clones table from YT (cluster "hahn"), merges clone groups that
# share at least one host and writes the result to $clones_file.
#
# Output format: every group is preceded by "\n\n" and followed by one line
# "0\t0\t<host>\n" per host. The first host of a group is the "main" one:
# a host whose name starts with "<group>." if there is any, otherwise the
# first entry in string order of "<timestamp>:<host>".
#
# Rows whose ContactSource is 'phone' or 'email' are ignored.
# Dies through $log->die when the file cannot be opened, written or closed.
sub get_clones_to {
    my ($clones_file) = @_;

    my $CLONES_YT_TABLE = '//home/globalsearch/clone/export/clones_for_direct';
    # fetch the clones from YT
    $log->out("download $CLONES_YT_TABLE");
    # the export is available on Hahn: https://st.yandex-team.ru/DIRECT-87629
    Tools::force_set_yt_environment("hahn");
    my $reader = Yandex::YT::TableReader->new($CLONES_YT_TABLE, binmode => ":utf8");
    my %groups;          # group => [ "<timestamp>:<host>", ... ]
    my %groups_mintime;  # group => smallest timestamp seen in the group
    my %hosts_groups;    # host  => { group => undef, ... }
    my $cnt = 0;
    while(my $r = $reader->next()) {
        $cnt++;
        my $host  = $r->{Host};
        my $group = $r->{Group};
        my $time  = $r->{Timestamp};
        # a missing ContactSource is treated as "neither phone nor email"
        my $user  = $r->{ContactSource} // '';
        next if $user eq 'phone' || $user eq 'email';
        # all hosts that get here are expected to be in punycode already
        $groups_mintime{$group} = $time if !$groups_mintime{$group} || $groups_mintime{$group} > $time;
        $hosts_groups{$host}{$group} = undef;
        push @{$groups{$group}}, "$time:$host";
    }
    $log->out("Got $cnt rows from $CLONES_YT_TABLE");

    # merge groups that have hosts in common
    $log->out('combine groups');
    for my $group (xsort {$groups_mintime{$_}, $_} keys %groups) {
        # the group may have been merged into an earlier one already
        next if !exists $groups{$group};
        # repeat until no other live group shares a host with this one,
        # because every merge can bring in hosts that link to further groups
        while(1) {
            # Split with a limit so that a host containing ':' stays intact
            # and matches the key used in %hosts_groups.
            my %hosts;
            $hosts{ (split /:/, $_, 2)[1] } = 1 for @{$groups{$group}};
            my @glue_groups =
                grep {exists $groups{$_} && $_ ne $group}
                    # all groups the hosts of this group belong to
                    uniq
                        map {keys %{$hosts_groups{$_}}}
                            keys %hosts;
            last if !@glue_groups;
            for my $g (@glue_groups) {
                # take over only the hosts this group does not have yet
                push @{$groups{$group}}, grep {!$hosts{ (split /:/, $_, 2)[1] }++} @{$groups{$g}};
                delete $groups{$g};
            }
        }
    }

    # write out our data
    $log->out("append clones data to $clones_file");
    open(my $fh, ">", $clones_file) || $log->die("Can't open $clones_file: $!");
    for my $group (keys %groups) {
        print $fh "\n\n" or $log->die($!);
        # try to find the main host - the one whose name matches the group
        my $main_host = [sort grep {/:\Q$group\E\./} @{$groups{$group}}]->[0];
        # when there is none, the first entry in string order goes first
        # (oldest and lexicographically smallest, given equal-length timestamps)
        for my $time_host (grep {defined $_} $main_host, sort grep {!$main_host || $_ ne $main_host} @{$groups{$group}}) {
            my (undef, $host) = split /:/, $time_host, 2;
            print $fh "0\t0\t$host\n" or $log->die($!);
        }
    }
    close($fh) || $log->die("Can't close $clones_file: $!");
}
