#!/usr/bin/perl

use my_inc "../..";


=head1 DESCRIPTION

Removes duplicate records from the bids.id <=> PhraseID history
(the bids_phraseid_associate table).
In 2012 a full run took about 12 hours.

=cut

use strict;
use warnings;

use utf8;
use List::Util qw/min max/;

use Yandex::DBTools;
use Yandex::ListUtils;
use Yandex::Retry;

use lib::abs '..';

use Settings;
use ShardingTools;
use ScriptHelper;

$log->out("start");

# Command-line parameters; the values below are the defaults used when an
# option is not given.
my ($CID_FROM, $CID_TO);
my $SLEEP_COEF = 3;     # handed to relaxed() as "times" around every DELETE
my $STEP = 1000;        # width of the cid window read by one SELECT
my @SHARDS;
extract_script_params(
    "cid-from=i" => \$CID_FROM,
    "cid-to=i" => \$CID_TO,
    "step=i" => \$STEP,
    "shard-id=i" => \@SHARDS,
    "sleep-coef=f" => \$SLEEP_COEF,
);

# A non-positive step never moves $from past $CID_TO, so the main loop below
# would spin forever; refuse to start instead.
die "--step must be a positive integer (got $STEP)\n" if $STEP <= 0;

# Upper bound on the number of rows queued for deletion before a forced flush.
my $MAX_PENDING_ROWS = 1_000_000;

$CID_FROM //= 0;
if (!defined $CID_TO) {
    # Default upper bound: the largest cid over all shards.  The per-shard
    # values may be undef (e.g. NULL from max() over an empty table), so
    # filter them out and fall back to 0 rather than looping up to undef.
    my $max_cids = get_one_column_sql(PPC(shard => 'all'), "select max(cid) from campaigns") || [];
    $CID_TO = max(grep { defined $_ } @$max_cids) // 0;
}
@SHARDS = ppc_shards() if !@SHARDS;

# NOTE(review): mysql_use_result makes DBD::mysql stream result sets instead
# of buffering them, and its documentation says no other statement may run on
# that connection until the result is fully fetched.  _delete_records() is
# called in the middle of the fetch loop with the same PPC(shard => $shard)
# spec -- confirm that the DELETEs really go through a different connection.
for my $shard (@SHARDS) {
    get_dbh(PPC(shard => $shard))->{mysql_use_result} = 1;
}

my $t1 = time;

# Walk the cid range in windows of $STEP campaigns; every window is processed
# on each shard in turn.
for (my $from = $CID_FROM; $from <= $CID_TO; $from += $STEP) {

    my $to = min($from + $STEP - 1, $CID_TO);
    for my $shard (@SHARDS) {
        # The ORDER BY puts rows of one (cid, pid, PhraseID) next to each other
        # in logtime order, so duplicates can be found by comparing neighbours.
        my $sth = exec_sql(PPC(shard => $shard), "
                                SELECT cid, pid, PhraseID, bids_id, logtime
                                  FROM bids_phraseid_associate
                                 WHERE cid between ? and ?
                                 ORDER BY cid, pid, PhraseID, logtime",
                    $from, $to
            );

        my $to_del = {};    # { pid => { PhraseID => [logtime, ...] } } for the current cid
        my $prev = {cid => 0, pid => 0, PhraseID => 0, bids_id => 0};
        my $cnt = 0;        # rows queued for deletion in this window/shard (for the stats line)
        my $del_cnt = 0;    # rows queued since the last flush
        while (my $row = $sth->fetchrow_hashref) {

            # Flush when the campaign changes (so $to_del only ever holds rows
            # of $prev->{cid}) or when too many rows are pending.
            if (($row->{cid} != $prev->{cid} && $del_cnt) || $del_cnt >= $MAX_PENDING_ROWS) {
                _delete_records($shard, $prev->{cid}, $to_del);
                $to_del = {};
                $del_cnt = 0;
            }

            # Same key and same bids_id as the previous row: this row repeats
            # the previous association, so queue its logtime for deletion.
            if ($row->{cid} == $prev->{cid}
                && $row->{pid} == $prev->{pid}
                && $row->{PhraseID} == $prev->{PhraseID}
                && $row->{bids_id} == $prev->{bids_id}
            ) {
                push @{$to_del->{$row->{pid}}->{$row->{PhraseID}}}, $row->{logtime};
                $cnt++;
                $del_cnt++;
            }

            $prev = $row;
        }

        # Flush whatever is still queued for the last campaign of the window.
        _delete_records($shard, $prev->{cid}, $to_del) if $del_cnt;

        # Progress is approximated from EXPLAIN row estimates: the share of the
        # whole requested cid range that lies before the end of this window.
        my $total_rows_est = _rows_estimate($shard, $CID_FROM, $CID_TO);
        my $rest_rows_est = _rows_estimate($shard, $to, $CID_TO);
        $log->out(sprintf("STATS: shard %s, deleted %d rows, ela: %.1fh, rows: %.2f%%",
                  $shard, $cnt, (time - $t1)/3600, 100 * ($total_rows_est - $rest_rows_est) / ($total_rows_est||1)));
    }
}

$log->out("end.");

# Returns the "rows" value that EXPLAIN reports for selecting the rows of
# bids_phraseid_associate whose cid lies between $from and $to (inclusive)
# on the given shard.
#
#   $shard - shard number passed to PPC(shard => ...)
#   $from  - lower cid bound
#   $to    - upper cid bound
sub _rows_estimate {
    my ($shard, $from, $to) = @_;

    my $explain_row = get_one_line_sql(
        PPC(shard => $shard),
        "explain SELECT * FROM bids_phraseid_associate WHERE cid between ? and ?",
        $from, $to,
    );

    return $explain_row->{rows};
}

# Deletes the queued duplicate rows of one campaign from
# bids_phraseid_associate and writes a summary line to the log.
#
#   $shard  - shard number passed to PPC(shard => ...)
#   $cid    - campaign id that all queued rows belong to
#   $to_del - { pid => { PhraseID => [logtime, ...] } }
#
# The logtimes of each (pid, PhraseID) pair are deleted in chunks of at most
# 500 per statement; every statement runs inside relaxed() with
# times => $SLEEP_COEF (presumably a throttle -- see Yandex::Retry).
sub _delete_records
{
    my ($shard, $cid, $to_del) = @_;

    my @per_pid_summary;
    my $deleted_total = 0;

    for my $pid (keys %$to_del) {
        my $logtimes_by_phrase = $to_del->{$pid};

        # "pid:N" where N is the number of logtimes queued under this pid.
        my $queued = 0;
        $queued += scalar @$_ for values %$logtimes_by_phrase;
        push @per_pid_summary, "$pid:$queued";

        for my $phrase_id (keys %$logtimes_by_phrase) {
            for my $batch (chunks($logtimes_by_phrase->{$phrase_id}, 500)) {
                relaxed(times => $SLEEP_COEF, sub {
                    $deleted_total += do_delete_from_table(
                        PPC(shard => $shard), 'bids_phraseid_associate',
                        where => {cid=>$cid, pid=>$pid, PhraseID=>$phrase_id, logtime=>$batch},
                    );
                });
            }
        }
    }

    $log->out("cid=$cid, cnt=$deleted_total ".join(',', @per_pid_summary));
}
