#!/usr/bin/perl

use my_inc "../..";


=head1 NAME

    check-sharding-metabase.pl - проверка соответствия данных в шардированной базе - данным в метабазе

=head1 SYNOPSIS

    check-sharding-metabase.pl --key uid --key cid
    check-sharding-metabase.pl --fix-absent --fix-chain
    check-sharding-metabase.pl --fix-absent --fix-chain --key mbid --min-id 417793

=head1 DESCRIPTION

    Возможные опции:
    --key - какие ключи проверять, полный список в скрипте
    --fix-absent - добавлять в метабазу информацию о записях, которые есть в базе но отсутствуют в метабазе
    --fix-chain - исправлять привязку id к ClientID, если ClientID находится в том же шарде
    --last-n=N - проверять не все id, а начиная с max(id)-N - удобно для быстрых проверок свежих данных
    --min-id=M - проверять не все id, а начиная с M - удобно для проверки/исправления последних нескольких дней (для ТС, например)

    --chunk-size - приблизительное количество строк для одного селекта, по умолчанию 100_000

    --debug - обработать по одному чанку каждого ключа
    
    --sleep-coef - делать паузы между чанками (по-умолчанию 1 - пауза равна времени обработки чанка)
    
    Не все ошибки могут исправиться автоматически

=cut


use strict;
use warnings;

use POSIX qw/ceil/;
use JSON;
use List::Util qw/max/;

use Yandex::DBTools;
use Yandex::Advmon;
use Yandex::DBShards;
use Yandex::Retry qw/relaxed_guard/;

use Settings;
use ShardingTools;

use ScriptHelper
    'Yandex::Log' => [msg_prefix => "[$$]", tee => $ENV{LOG_TEE}],
    get_file_lock => [0, 'check-sharding-metabase'],
;

# Yandex::DBShards cache tuning (expiry is presumably in seconds - confirm in Yandex::DBShards).
$Yandex::DBShards::CACHE_MAX_SIZE = 1_000_000;
$Yandex::DBShards::CACHE_EXPIRES = 300;

# Boolean switches, all off unless given on the command line.
my ($VERBOSE, $FIX_ABSENT, $FIX_CHAIN, $DEBUG) = (0, 0, 0, 0);

# --key / --table filters; while a list stays empty the main loop applies no filtering by it.
my (@KEYS, @TABLES);

# Id range limits: --last-n stays undefined unless given, a --min-id of 0 is treated as "not set".
my $LAST_N;
my $MIN_ID = 0;

# Pause multiplier handed to relaxed_guard for every chunk.
my $SLEEP_COEF = 1;

# approximate number of rows per query
my $CHUNK_SIZE = 100_000;

extract_script_params(
    'debug'        => \$DEBUG,
    'verbose'      => \$VERBOSE,
    'fix-absent'   => \$FIX_ABSENT,
    'fix-chain'    => \$FIX_CHAIN,
    'key=s'        => \@KEYS,
    'table=s'      => \@TABLES,
    'last-n=i'     => \$LAST_N,
    'min-id=i'     => \$MIN_ID,
    'chunk-size=i' => \$CHUNK_SIZE,
    'sleep-coef=i' => \$SLEEP_COEF,
);


# Check rules, one entry per metabase key.
#
#   key   - metabase key name; passed to get_shard_multi / save_shard by the main loop.
#   table - what to scan in every shard. Either a plain table name, a hashref
#           describing one table, or an arrayref of such names/hashrefs.
#
# Fields of a table hashref:
#   table - table name (used for the min/max and EXPLAIN queries and for --table filtering).
#   col   - column holding the id; defaults to the rule's key when omitted.
#   sql   - query run for every chunk with two bind values (range start, range end).
#           Its result is consumed as "id => chain value"; an undefined chain value is
#           reported as an inconsistent state. When sql is omitted the main loop
#           selects "$col, <shard number> as shard" from the table.
my @RULES = (
    # No custom sql: the default query is used for both tables.
    {
        key => 'ClientID',
        table => ['clients', 'users'],
    },
    # uid => ClientID, read directly from users.
    {
        key => 'uid',
        table => {
            table => 'users',
            sql => "SELECT uid, ClientID FROM users WHERE uid >= ? and uid < ?"
        },
    },
    # login => max(uid) over all users rows with that login; the inner select only
    # picks the logins whose uid falls into the current range. Chunks are walked by uid.
    {
        key => 'login',
        table => {
            table => 'users',
            col => 'uid',
            sql => "SELECT u.login, max(u.uid)
                      FROM (SELECT login
                              FROM users 
                             WHERE uid >= ? and uid < ?
                           ) t 
                           JOIN users u USING (login)
                     GROUP BY u.login",
        },
    },
    # cid => ClientID of the campaign's user (inner join: campaigns without a users row are not returned).
    {
        key => 'cid',  
        table => {
            table => 'campaigns', 
            sql => "SELECT c.cid, u.ClientID
                      FROM campaigns c
                           join users u on c.uid = u.uid
                     WHERE c.cid >= ? and c.cid < ?"
        },
    },
    # OrderID => ClientID of the campaign's user; only positive OrderID values are selected.
    {
        key => 'OrderID',  
        table => {
            table => 'campaigns', 
            sql => "SELECT c.OrderID, u.ClientID
                      FROM campaigns c
                           join users u on c.uid = u.uid
                     WHERE c.OrderID > 0
                       AND c.OrderID >= ? and c.OrderID < ?"
        },
    },
    # Key 'pid' is checked against two sources: phrases.pid and media_groups.mgid.
    # Left joins keep rows with a missing campaign/user, yielding a NULL ClientID.
    {
        key => 'pid',
        table => [
            {
                table => 'phrases',
                sql => "SELECT p.pid, u.ClientID
                          FROM phrases p
                               left join campaigns c on c.cid = p.cid
                               left join users u on c.uid = u.uid 
                         WHERE p.pid >= ? and p.pid < ?"
            },
            {
                table => 'media_groups',
                col => 'mgid', 
                sql => "SELECT mg.mgid, u.ClientID
                          FROM media_groups mg
                               left join campaigns c on c.cid = mg.cid
                               left join users u on c.uid = u.uid
                         WHERE mg.mgid >= ? and mg.mgid < ?"
            },
            ],
    },
    # Key 'bid' is checked against two sources: banners.bid and banner_images.image_id.
    {
        key => 'bid',
        table => [
            {
                table => 'banners', 
                sql => "SELECT b.bid, u.ClientID 
                          FROM banners b 
                               left join phrases p on p.pid = b.pid 
                               left join campaigns c on c.cid = p.cid 
                               left join users u on c.uid = u.uid 
                         WHERE b.bid >= ? and b.bid < ?",
            },
            {
                table => 'banner_images', 
                col => 'image_id', 
                sql => "SELECT bim.image_id, u.ClientID
                          FROM banner_images bim
                               left join banners b on b.bid = bim.bid
                               left join phrases p on p.pid = b.pid
                               left join campaigns c on c.cid = p.cid
                               left join users u on c.uid = u.uid 
                         WHERE bim.image_id >= ? and bim.image_id < ?"
            },
            ],
    },
    # mbid => ClientID resolved through media_groups -> campaigns -> users.
    {
        key => 'mbid',
        table => {
            table => 'media_banners', 
            sql => "SELECT mb.mbid, u.ClientID 
                      FROM media_banners mb
                               left join media_groups mg on mg.mgid = mb.mgid
                               left join campaigns c on c.cid = mg.cid
                               left join users u on c.uid = u.uid 
                     WHERE mb.mbid >= ? and mb.mbid < ?"
        },
    },
    # tag_id => ClientID resolved through campaigns -> users.
    {
        key => 'tag_id',
        table => {
            table => 'tag_campaign_list',
            sql => "SELECT t.tag_id, u.ClientID
                      FROM tag_campaign_list t
                               left join campaigns c on c.cid = t.cid
                               left join users u on c.uid = u.uid
                     WHERE t.tag_id >= ? and t.tag_id < ?",
        },
    },
    # sitelinks_set_id is checked twice: against the ClientID stored in sitelinks_sets
    # and against the owner of every banner that references the set.
    {
        key => 'sitelinks_set_id',
        table => [
            {
                table => 'sitelinks_sets',
                sql => "SELECT t.sitelinks_set_id, t.ClientID
                          FROM sitelinks_sets t
                         WHERE t.sitelinks_set_id >= ? and t.sitelinks_set_id < ?",
            },
            {
                table => 'banners',
                sql => "SELECT t.sitelinks_set_id, u.ClientID
                          FROM banners t
                               LEFT JOIN campaigns c on c.cid = t.cid
                               LEFT JOIN users u on u.uid = c.uid
                         WHERE t.sitelinks_set_id >= ? and t.sitelinks_set_id < ?",
            },
        ],
    },
    # ret_cond_id => ClientID stored in retargeting_conditions itself.
    {
        key => 'ret_cond_id',
        table => {
            table => 'retargeting_conditions',
            sql => "SELECT rc.ret_cond_id, rc.ClientID
                      FROM retargeting_conditions rc
                     WHERE rc.ret_cond_id >= ? AND rc.ret_cond_id < ?",
        },
    },
    # vcard_id => ClientID resolved through campaigns -> users.
    {
        key => 'vcard_id',
        table => {
            table => 'vcards',
            sql => "SELECT vc.vcard_id, u.ClientID
                      FROM vcards vc
                           LEFT JOIN campaigns c ON c.cid = vc.cid
                           LEFT JOIN users u ON u.uid = c.uid
                     WHERE vc.vcard_id >= ? AND vc.vcard_id < ?",
        },
    },
    # org_details_id => ClientID of the user referenced by org_details.uid.
    {
        key => 'org_details_id',
        table => {
            table => 'org_details',
            sql => "SELECT od.org_details_id, u.ClientID
                      FROM org_details od
                           LEFT JOIN users u ON u.uid = od.uid
                     WHERE od.org_details_id >= ? AND od.org_details_id < ?",
        },
    },
    # banner_images_pool_id is stored in the imp_id column; ClientID comes from the same table.
    {
        key => 'banner_images_pool_id',
        table => {
            table => 'banner_images_pool',
            col => 'imp_id',
            sql => "SELECT imp_id as banner_images_pool_id, ClientID
                      FROM banner_images_pool
                     WHERE imp_id >= ? AND imp_id < ?",
        },
    },
    # mediaplan_bid is stored in mediaplan_banners.mbid; ClientID resolved through campaigns -> users.
    {
        key => 'mediaplan_bid',
        table => {
            table => 'mediaplan_banners',
            col => 'mbid', 
            sql => "SELECT mb.mbid, u.ClientID 
                      FROM mediaplan_banners mb
                               left join campaigns c on c.cid = mb.cid
                               left join users u on c.uid = u.uid 
                     WHERE mb.mbid >= ? and mb.mbid < ?"
        },
    },
);

# Main pass: for every rule, table and shard walk the id range of the checked
# column in chunks, compare what the shard's database says with what the
# metabase says, log every mismatch, optionally fix it, and collect counters.
$log->out("start");

# Counters sent to monitoring; summed per key, per table:column, per shard and for all shards.
my %advmon;
for my $rule (@RULES) {
    # --key filter
    if (@KEYS && !grep {$_ eq $rule->{key}} @KEYS) {
        $log->out("skip key $rule->{key}");
        next;
    }
    my $key = $rule->{key};
    # A rule's table may be a single entry or a list of entries
    my @tables_info = ref $rule->{table} eq 'ARRAY' ? @{$rule->{table}} : ($rule->{table});
    for my $tinfo (@tables_info) {
        my $table = ref $tinfo ? $tinfo->{table} : $tinfo;
        # --table filter
        if (@TABLES && !grep {$_ eq $table} @TABLES) {
            $log->out("skip table $table");
            next;
        }
        # The id column defaults to the key name
        my $col = ref $tinfo ? $tinfo->{col} || $key : $key;
        for my $shard (ppc_shards()) {
            # Counters for this particular table/column in this particular shard
            my %stat = (
                inconsistent => 0,
                absent => 0,
                absent_chain => 0,
                incorrect_chain => 0,
                incorrect_shard => 0,
                correct => 0,
                fixes => 0,
            );

            my ($min_val, $max_val) = get_one_line_array_sql(PPC(shard => $shard), "SELECT min($col), max($col) FROM $table");
            if (defined $min_val && defined $max_val) {
                $min_val ||= 1;
                # --last-n: look only at the tail of the id range
                if (defined $LAST_N) {
                    $min_val = max($min_val, $max_val-$LAST_N);
                }
                # --min-id overrides any lower bound computed above
                if ( $MIN_ID ){
                    $min_val = $MIN_ID;
                }

                my $rows_estimate = get_one_line_sql(PPC(shard => $shard), "EXPLAIN SELECT * FROM $table WHERE $col >= ?", $min_val)->{rows};
                # Always keep at least one chunk: a missing or zero row estimate, or a
                # non-positive --chunk-size, must neither divide by zero nor produce a
                # negative step (which would make the loop below run forever).
                my $chunks_count = $CHUNK_SIZE > 0 ? ceil( ($rows_estimate || 0) / $CHUNK_SIZE ) : 0;
                $chunks_count = 1 if $chunks_count < 1;
                my $step = ceil( ($max_val - $min_val) / $chunks_count ) || 1;

                # Without a custom query the shard number itself plays the role of the chain value
                my $sql = ref $tinfo && $tinfo->{sql} ? $tinfo->{sql} : "SELECT $col, $shard as shard FROM $table WHERE $col >= ? AND $col < ?";
                my $chain_key = $key eq 'ClientID' ? 'shard' : $Yandex::DBShards::SHARD_KEYS{$key}->{chain_key};

                # Ranges are half-open [start, start+step), so "<=" is needed to pick up $max_val itself
                for(my $start_val = $min_val; $start_val <= $max_val; $start_val += $step) {
                    # Throttling between chunks, scaled by --sleep-coef
                    my $guard = relaxed_guard times => $SLEEP_COEF;
                    $log->out("start check $table.$col in shard $shard: $start_val - ".($start_val+$step));
                    # id => chain value according to the shard's database
                    my $db_ids = get_hash_sql(PPC(shard=>$shard), $sql, $start_val, $start_val+$step);
                    # id => chain value and id => shard according to the metabase
                    my $chain_info = get_shard_multi($key => [keys %$db_ids], $chain_key);
                    my $shard_info = get_shard_multi($key => [keys %$db_ids]);
                    # chain value => list of ids that must be (re)bound to it
                    my %fixes;
                    for my $id (keys %$db_ids) {
                        if (!defined $db_ids->{$id}) {
                            # The database itself has no chain value for this id - nothing to compare with
                            $log->out("ERROR: Inconsistent state for $key, shard $shard: $table.$col=$id - no $chain_key");
                            $stat{inconsistent}++;
                        } elsif (!$chain_info->{$id}) { # a zero chain value is treated as absent
                            $log->out("ERROR: Absent data for $key, shard $shard: $table.$col=$id in metabase");
                            if ($FIX_ABSENT) {
                                push @{$fixes{$db_ids->{$id}}}, $id;
                            }
                            $stat{absent}++;
                        } elsif ($chain_info->{$id} != $db_ids->{$id}) {
                            $log->out("ERROR: Incorrect chain data for $key, shard $shard: $table.$col=$id, $chain_key=$db_ids->{$id} - metabase.$chain_key=$chain_info->{$id}");
                            # Re-bind only when the metabase does not place the id in a different shard
                            if ($FIX_CHAIN && $chain_key ne 'shard' 
                                && (!defined $shard_info->{$id} || $shard_info->{$id} == $shard)
                            ) {
                                push @{$fixes{$db_ids->{$id}}}, $id;
                            }
                            $stat{incorrect_chain}++;
                        } elsif (!$shard_info->{$id}) {
                            # the chain value is missing from the metabase or points to shard zero
                            $log->out("ERROR: Absent shard data for $key, shard $shard: $table.$col=$id, $chain_key=$db_ids->{$id} in metabase");
                            $stat{absent_chain}++;
                        } elsif ($shard_info->{$id} != $shard) {
                            $log->out("ERROR: Incorrect shard data for $key, shard $shard: $table.$col=$id, $chain_key=$db_ids->{$id} - metabase.shard=$shard_info->{$id}");
                            $stat{incorrect_shard}++;
                        } else {
                            if ($VERBOSE) {
                                $log->out("Correct data for $key, shard $shard: $table.$col=$id, $chain_key=$db_ids->{$id} - metabase.$chain_key=$chain_info->{$id}");
                            }
                            $stat{correct}++;
                        }
                    }
                    # Apply the fixes collected for this chunk, one save_shard call per chain value
                    while(my ($chain_val, $ids) = each %fixes) {
                        $log->out("FIX: set $chain_key=$chain_val for $key => ".join(',', @$ids));
                        save_shard($key => $ids, $chain_key => $chain_val);
                        $stat{fixes} += @$ids;
                    }
                    # --debug: a single chunk per table/shard is enough
                    last if $DEBUG;
                }
            } else {
                # No values at all - the table is empty
                $log->out("skip checking $table.$col in shard $shard: table is empty");
            }

            $log->out("STAT for ppc:$shard:$table.$col: ".to_json(\%stat));
            # Fold the per-shard counters into the monitoring totals
            for my $stat_key (keys %stat) {
                for my $sh ($shard, 'all') {
                    for my $kk ("key.$key", "table.$table:$col") {
                        $advmon{"metabase.consistency.$kk.shard_$sh.$stat_key"} += $stat{$stat_key};
                    }
                }
            }
        }
    }
}
monitor_values(\%advmon);
$log->out('summary_stat: '.to_json(\%advmon));

$log->out('finish');
