#!/usr/bin/perl

=head1 DEPLOY

# .migr
{
  tasks => [
    {
      type => 'script',
      when => 'after',
      time_estimate => 'примерно 60-70 минут (измерил время исправления 100_000 групп и экстраполировал на общее количество групп)',
      comment => 'Убедиться перед запуском, что таблица //home/direct/tmp/voronov/adgroups_with_not_refined_geo существует на кластере hahn',
    }
  ],
  approved_by => 'liosha'
}

=cut

use my_inc '..';
use Direct::Modern;

use BS::ResyncQueue qw/bs_resync/;
use GeoTools;
use ScriptHelper;
use Settings;
use Tools;
use Yandex::DBShards;
use Yandex::DBTools;
use Yandex::Retry qw/relaxed_guard/;
use Yandex::YT::Table;

$Yandex::YT::Streaming::DEFAULT_FORMAT = 'json';

# Script parameters. Every default below can be overridden from the command
# line (see the extract_script_params call).
my $PIDS       = '';        # --pids: comma-separated adgroup ids; when set, the YT table is not read
my $LIMIT      = 0;         # --limit: stop once more than this many adgroups were updated (0 = no limit)
my $DRYRUN     = 0;         # --dry: log planned changes, skip the DB update and the BS resync
my $YT_CLUSTER = 'hahn';    # --cluster: YT cluster holding the source table
my $CHUNK_SIZE = 1000;      # --chunksize: number of table rows handed to proccess_pids at once
my $TABLE      = '//home/direct/tmp/voronov/adgroups_with_not_refined_geo';    # --table
my $BS_RESYNC  = 'no';      # --bsresync: only the exact value 'yes' enables the resync
my $PRIORITY   = $BS::ResyncQueue::PRIORITY_RESEND_DOMAINS_BS;                 # --priority

$log->out('START');

extract_script_params(
    'pids=s'      => \$PIDS,
    'limit=i'     => \$LIMIT,
    'dry'         => \$DRYRUN,
    'cluster=s'   => \$YT_CLUSTER,
    'table=s'     => \$TABLE,
    'chunksize=i' => \$CHUNK_SIZE,
    'bsresync=s'  => \$BS_RESYNC,
    'priority=i'  => \$PRIORITY,
);

# Run-wide state shared with the subs below.
my $TOTAL_UPDATED_ADGROUPS = 0;
my $IS_LIMIT_REACHED       = 0;

$log->out( sprintf( 'limit on pids amount is given - %s', $LIMIT ) ) if $LIMIT;

if ( !$PIDS ) {
    # No explicit list: stream pids from the YT table and process them in chunks.
    $log->out( sprintf( 'fetch list of pids from [%s] table', $TABLE ) );

    Tools::force_set_yt_environment($YT_CLUSTER);

    my $yt_table = Yandex::YT::Table->new($TABLE);
    $log->die("table $TABLE doesn't exists") if !$yt_table->exists();

    my @batch;
    my $reader = $yt_table->reader();

    ROW:
    while ( my $row = $reader->next() ) {
        push @batch, { pid => $row->{pid} };

        if ( @batch >= $CHUNK_SIZE ) {
            proccess_pids( \@batch );
            @batch = ();
        }

        last ROW if $IS_LIMIT_REACHED;
    }

    # Flush the incomplete last chunk unless the limit has already stopped us.
    proccess_pids( \@batch ) if @batch && !$IS_LIMIT_REACHED;
}
else {
    # Explicit list: all given pids are processed in a single call.
    $log->out( sprintf( 'list of pids is given - %s', $PIDS ) );

    my @requested = map { +{ pid => $_ } } split /,/, $PIDS;
    proccess_pids( \@requested ) if @requested;
}

$log->out( sprintf( 'total %s adgroups updated', $TOTAL_UPDATED_ADGROUPS ) );

$log->out('FINISH');

# Process one batch of adgroups.
#
#   $pids - arrayref of hashes { pid => <adgroup id> }
#
# The batch is handed to foreach_shard, which calls the callback with a shard
# and the items belonging to it. For every resolved shard the adgroups are
# loaded, their geo is refined, changed rows are written back and, when
# --bsresync=yes was given, the changed adgroups are queued for BS resync.
# Items arriving with a false shard are only logged.
sub proccess_pids {
    my $pids = shift;

    foreach_shard pid => $pids, with_undef_shard => 1, sub {
        my ($shard, $chunk) = @_;

        # The limit may have been hit while handling a previous shard of this batch.
        return if $IS_LIMIT_REACHED;

        my @chunk_pids = map { $_->{pid} } @$chunk;

        unless ($shard) {
            $log->out( sprintf("can't guess shard for pids %s", join(', ', @chunk_pids)) );
            return;
        }

        $log->out( sprintf('shard #%s, %s pids', $shard, scalar(@chunk_pids)) );

        # NOTE(review): presumably throttles the DB load when the guard leaves
        # scope - confirm against Yandex::Retry::relaxed_guard.
        my $rg = relaxed_guard times => 1;

        my $adgroups           = get_adgroups_from_db( $shard, \@chunk_pids );
        my $adgroups_to_update = refine_and_check_geo( $adgroups );

        update_geo_for_adgroups( $shard, $adgroups_to_update );

        if ($BS_RESYNC eq 'yes') {
            resync_adgroups_with_bs( $adgroups_to_update );
        }
    };
}

# Load adgroup rows together with their campaign's id and archived flag.
#
#   $shard - shard to query
#   $pids  - arrayref of adgroup ids (phrases.pid)
#
# Returns whatever get_all_sql returns for the query; callers in this file
# use it as an arrayref of hashes with keys pid, geo, LastChange, cid, archived.
sub get_adgroups_from_db {
    my ($shard, $pids) = @_;

    my @query = (
        'select
            p.pid, p.geo, p.LastChange, c.cid, c.archived
        from
            phrases p
                join campaigns c on (p.cid = c.cid)',
        where => { 'p.pid' => $pids },
    );

    return get_all_sql( PPC(shard => $shard), \@query );
}

# Pick the adgroups whose stored geo differs from its refined form.
#
#   $adgroups - arrayref of adgroup hashes (as returned by get_adgroups_from_db)
#
# For every adgroup the geo string is passed through GeoTools::refine_geoid
# (using the 'api' tree). Adgroups whose geo changes get the new value stored
# under the refined_geo key (the input hash is modified in place) and are
# collected; unchanged ones are only logged.
#
# Returns an arrayref with the adgroups that need an update.
sub refine_and_check_geo {
    my ($adgroups) = @_;

    $log->out( sprintf('check geo for %s adgroups', scalar(@$adgroups)) );

    my @needs_update;
    for my $adgroup ( @$adgroups ) {
        my $refined = GeoTools::refine_geoid( $adgroup->{geo}, undef, {tree => 'api'} );

        if ($adgroup->{geo} ne $refined) {
            $adgroup->{refined_geo} = $refined;
            push @needs_update, $adgroup;
        }
        else {
            $log->out( sprintf("adgroup's geo is already refined for adgroup #%s", $adgroup->{pid} ) );
        }
    }

    return \@needs_update;
}

# Write the refined geo back to the phrases table for one shard.
#
#   $shard    - shard holding the adgroups
#   $adgroups - arrayref of adgroup hashes carrying pid, geo, refined_geo
#               and LastChange (output of refine_and_check_geo)
#
# Does nothing for an empty list. In dry-run mode the planned changes are
# logged but nothing is written and nothing is counted, so --limit never
# triggers during a dry run.
#
# Side effects: adds the number of updated rows to $TOTAL_UPDATED_ADGROUPS
# and raises $IS_LIMIT_REACHED once the total reaches $LIMIT.
sub update_geo_for_adgroups {
    my ($shard, $adgroups) = @_;

    return unless scalar @$adgroups;

    my $planned = scalar @$adgroups;

    $log->out(['update geo for '. $planned .' adgroups', [ map { my $grp = $_; +{ map { $_ => $grp->{$_} } qw/pid geo refined_geo/ } } @$adgroups ]]);

    # LastChange is written back with the value that was read.
    # NOTE(review): presumably this keeps the timestamp from being bumped by
    # the update - confirm against the phrases table definition.
    my %pid_to_refined_geo = map { $_->{pid} => { geo => $_->{refined_geo}, LastChange => $_->{LastChange} } } @$adgroups;
    my %pid_to_old_geo     = map { $_->{pid} => $_->{geo} } @$adgroups;

    my $updated_adgroups = 0;
    if ( !$DRYRUN ) {
        # The extra condition compares the current geo with the value we read
        # for the same pid; it looks like a guard against overwriting rows
        # changed since the select - confirm against sql_case semantics.
        $updated_adgroups = do_mass_update_sql(PPC(shard => $shard), 'phrases', 'pid', \%pid_to_refined_geo,
            where => { geo__dont_quote => sql_case( 'pid', \%pid_to_old_geo, default__dont_quote => sql_quote_identifier('geo') ) });

        # Only meaningful after a real update: in dry-run mode the counter is
        # always 0 and the message would be a false alarm for every batch.
        if ( $planned != $updated_adgroups ) {
            $log->out( sprintf( 'number of updated rows %s not equal to number of rows planned for update %s', $updated_adgroups, $planned ) );
        }
    }

    $TOTAL_UPDATED_ADGROUPS += $updated_adgroups;

    # '>=' rather than '>': hitting the limit exactly must stop the run,
    # otherwise one more chunk would be processed.
    if ($LIMIT && $TOTAL_UPDATED_ADGROUPS >= $LIMIT) {
        $IS_LIMIT_REACHED = 1;
    }
}

# Queue the given adgroups for resynchronisation with BS.
#
#   $adgroups - arrayref of adgroup hashes carrying cid, pid and the
#               campaign's archived flag
#
# Adgroups whose campaign has archived = 'Yes' are skipped. The remaining
# ones are logged and, unless running in dry-run mode, passed to bs_resync
# with the priority taken from $PRIORITY.
sub resync_adgroups_with_bs {
    my ($adgroups) = @_;

    return if !@$adgroups;

    my @resync_items;
    for my $adgroup (@$adgroups) {
        next if $adgroup->{archived} eq 'Yes';

        push @resync_items, {
            cid      => $adgroup->{cid},
            pid      => $adgroup->{pid},
            priority => $PRIORITY,
        };
    }

    return if !@resync_items;

    $log->out(['resync with BS '. scalar(@resync_items) .' adgroups ', @resync_items]);

    bs_resync( \@resync_items ) if !$DRYRUN;
}
