#!/usr/bin/perl

use my_inc "../..";

=head1 DESCRIPTION

    Removes duplicate vcards, including those that differ only in the values of nullable fields (in cases where NULL and '' are equivalent values).

    LOG_TEE=1 ./protected/one-shot/drop_duplicate_vcards.pl --only-shard-id=2 --chunk-size=10 --sleep=1 --dry-run

    Parameters:
        chunk-size    -- how many rows to process at a time (default: 100)
        sleep         -- sleep time in seconds (may be fractional) between batches of chunk-size processed rows (default: 1)
        only-shard-id -- run on the specified shard only
        dry-run       -- run without making any changes to the DB

    The program works on every shard in parallel. It can be restarted.

=cut

use Direct::Modern;
use open ':std' => ':utf8';

use Settings;
use ScriptHelper;

use Yandex::DBTools;
use Yandex::DBShards;
use ShardingTools qw/ppc_shards/;
use Time::HiRes qw//;
use Yandex::ListUtils qw/chunks/;

use VCards qw//;

# Command-line options; every one of them is optional.
#   $CHUNK_SIZE    -- number of rows handled per batch
#   $SLEEP         -- pause in seconds (may be fractional) after each batch
#   $ONLY_SHARD_ID -- restrict the run to a single shard
#   $DRY_RUN       -- log what would be done without modifying the DB
my ($ONLY_SHARD_ID, $CHUNK_SIZE, $SLEEP, $DRY_RUN);
extract_script_params(
    'chunk-size:i'    => \$CHUNK_SIZE,
    'sleep:f'         => \$SLEEP,
    'only-shard-id:i' => \$ONLY_SHARD_ID,
    'dry-run'         => \$DRY_RUN,
);

# A missing or zero chunk size falls back to 100.
# Sleep defaults to 1 only when not given at all, so an explicit 0 disables the pause.
$CHUNK_SIZE ||= 100;
$SLEEP //= 1;

# Fail fast on nonsensical values, before any shard is touched.
# Time::HiRes::sleep croaks on a negative argument, which would otherwise abort
# the run only after the first batch had already been modified.
die "--chunk-size must be a positive integer, got '$CHUNK_SIZE'\n" if $CHUNK_SIZE < 0;
die "--sleep must not be negative, got '$SLEEP'\n" if $SLEEP < 0;

# Main body: for every selected shard, find groups of duplicate vcards inside a
# campaign, keep one "main" vcard per group, re-point banners from the other
# ("old") vcards to it, normalise the main vcard and delete the old ones.
$log->out('START');

my $script_name = get_script_name();
# Either the single shard given by --only-shard-id or all shards returned by ppc_shards().
# The callback receives the shard number as its first argument.
my $shard_results = foreach_shard_parallel(shard => [defined $ONLY_SHARD_ID ? $ONLY_SHARD_ID : ppc_shards()], sub {
    my ($shard) = @_;
    # Raise the GROUP_CONCAT length limit for this session so that long
    # vcard_id lists produced by the query below are not truncated.
    do_sql(PPC(shard => $shard), "SET SESSION group_concat_max_len = 1000000");

    # Separate log per shard; every message is prefixed with the shard number.
    my $log_shard = Yandex::Log->new(
        log_file_name => $script_name.".shard_${shard}.log",
        date_suf      => '%Y%m%d',
        msg_prefix    => "[shard:$shard]",
    );
    $log_shard->out('START');

    # Phase 1: collect all duplicate groups of the shard into @rows.
    # Each row holds a cid and a comma-separated list of duplicate vcard_ids.
    my @rows;
    my $uids = get_one_column_sql(PPC(shard => $shard), "SELECT DISTINCT uid FROM vcards WHERE uid > 0");
    for my $uids_chunk (chunks $uids, $CHUNK_SIZE) {
        # Select all repeated vcards, grouped by campaign
        push @rows, @{get_all_sql(PPC(shard => $shard), [
            q{SELECT cid, GROUP_CONCAT(vcard_id SEPARATOR ',') vcard_ids FROM vcards},
            where => {cid__gt => 0, uid => $uids_chunk},
            q{
            GROUP BY
                cid, uid,

                -- INTEGER NULL; 0 and NULL is the same
                ifnull(address_id,0), ifnull(org_details_id,0),

                -- INTEGER NOT NULL
                geo_id,

                -- INTEGER NULL; 0 and NULL have different meaning
                metro,

                -- VARCHAR() NULL; '' and NULL is the same
                ifnull(phone,''), ifnull(name,''), ifnull(city,''), ifnull(contactperson,''),
                ifnull(worktime,''), ifnull(country,''), ifnull(street,''), ifnull(house,''), ifnull(build,''), ifnull(apart,''),
                ifnull(extra_message,''), ifnull(contact_email,''), ifnull(im_client,''), ifnull(im_login,'')

            HAVING
                count(vcard_id) > 1
            }
        ])};
    };

    # NOTE: one row is one duplicate group, so a campaign with several distinct
    # groups contributes several rows to this total.
    my $total = scalar @rows;
    $log_shard->out("Found $total campaign(s) with duplicated vcards");

    # Phase 2: process the duplicate groups in batches of $CHUNK_SIZE.
    my $count = 0;
    for my $rows_chunk (chunks \@rows, $CHUNK_SIZE) {
        # Count the banners linked to each vcard.
        # Needed for the optimization below: the vcards with the fewest banners
        # are the ones whose banners get updated.
        my $vcard_links_count = get_hash_sql(PPC(shard => $shard), [
            "SELECT vcard_id, count(bid) FROM banners",
            where => {vcard_id => [map { split /,/, $_->{vcard_ids} } @$rows_chunk]},
            "GROUP BY vcard_id"
        ]);

        # old vcard_id => main vcard_id that replaces it
        my %case_vcards_unlink;
        my (@main_vcard_ids, @old_vcard_ids);

        for my $row (@$rows_chunk) {
            $count++;

            my $cid = $row->{cid};
            $log_shard->out("Processing campaign $cid ($count/$total)");

            # Sort by banner count, descending (vcards without banners count as 0);
            # the vcard with the most banners becomes the main one, the rest are "old".
            my @vcard_ids = sort { ($vcard_links_count->{$b} // 0) <=> ($vcard_links_count->{$a} // 0) } (split /,/, $row->{vcard_ids});
            my $main_vcard_id = shift @vcard_ids;

            $case_vcards_unlink{$_} = $main_vcard_id for @vcard_ids;
            push @main_vcard_ids, $main_vcard_id;
            push @old_vcard_ids, @vcard_ids;

            $log_shard->out("main_vcard_id: $main_vcard_id; old_vcards: ".join(', ', @vcard_ids));
        }

        # In dry-run mode only the log lines above are produced: every write
        # below (and the sleep) is skipped.
        next if $DRY_RUN;

        # Re-point banners from the old vcards to the main vcards.
        # NOTE(review): sql_case presumably builds a CASE expression mapping each old
        # vcard_id to its main one, and "LastChange = LastChange" presumably keeps the
        # timestamp column from being auto-updated -- confirm against the helper and schema.
        do_sql(PPC(shard => $shard), [
            "UPDATE banners SET LastChange = LastChange, vcard_id = ".sql_case(vcard_id => \%case_vcards_unlink, default__dont_quote => 'vcard_id'),
            where => {vcard_id => \@old_vcard_ids},
        ]);
        do_sql(PPC(shard => $shard), [
            "UPDATE mediaplan_banners SET vcard_id = ".sql_case(vcard_id => \%case_vcards_unlink, default__dont_quote => 'vcard_id'),
            where => {vcard_id => \@old_vcard_ids},
        ]);

        # Update the main vcard: normalise the "empty" values (0 for the id
        # columns, '' for the string columns) to NULL.
        do_sql(PPC(shard => $shard), [q{
            UPDATE vcards SET
                address_id     = NULLIF(address_id, 0),
                org_details_id = NULLIF(org_details_id, 0),
                phone          = NULLIF(phone, ''),
                name           = NULLIF(name, ''),
                city           = NULLIF(city, ''),
                contactperson  = NULLIF(contactperson, ''),
                worktime       = NULLIF(worktime, ''),
                country        = NULLIF(country, ''),
                street         = NULLIF(street, ''),
                house          = NULLIF(house, ''),
                build          = NULLIF(build, ''),
                apart          = NULLIF(apart, ''),
                extra_message  = NULLIF(extra_message, ''),
                contact_email  = NULLIF(contact_email, ''),
                im_client      = NULLIF(im_client, ''),
                im_login       = NULLIF(im_login, ''),
                LastChange     = LastChange
        }, where => {vcard_id => \@main_vcard_ids}]);

        # Delete the old vcards (done last, after the banners have been re-pointed)
        VCards::delete_vcard_from_db(\@old_vcard_ids);

        # Pause between batches; --sleep=0 disables the pause.
        Time::HiRes::sleep($SLEEP) if $SLEEP;
    }

    $log_shard->out('FINISH');
});
$log->out("Per-shard results:", $shard_results);

$log->out('FINISH');
