#!/usr/bin/perl
use strict;
use warnings;
use 5.010;

# Runs at compile time, i.e. before the `use Settings` line further down is
# processed, so the flag is already set when that module gets loaded.
BEGIN {
    # The variable is only mentioned once in the visible part of this file;
    # silence the "used only once" warning inside this block.
    no warnings qw(once);

    # NOTE(review): presumably makes Settings skip a local settings
    # override -- confirm against the Settings module.
    $Settings::NO_SETTINGS_LOCAL = 1;
}

use Encode qw( encode_utf8 decode_utf8 );
use File::Basename 'basename';
use FindBin '$Bin';
use Parallel::ForkManager;

use Yandex::DBShards;
use Yandex::DBTools;
use Yandex::Shell;

use my_inc '/var/www/ppc.yandex.ru', for => 'protected';

use RedirectCheckQueue;
use Settings;

# Create the output and log directories next to the script (mkdir -p, so
# already-existing directories are fine).
my $mismatch_dir = "$Bin/banner_hrefs_domain_mismatch";
my $log_dir      = "$Bin/logs";
for my $dir ( $mismatch_dir, $log_dir ) {
    yash_system( 'mkdir', '-p', $dir );
}

# Set Yandex::Log's LOG_ROOT package variable to the local logs directory.
$Yandex::Log::LOG_ROOT = $log_dir;

# Running total of input lines counted by all workers reaped so far.
my $total_processed = 0;

# Fork manager limited to 18 concurrent worker processes.
my $pm = Parallel::ForkManager->new(18);

# Called in the parent each time a worker is reaped. $ident is the value
# passed to start() and $data is the structure the worker handed to finish().
$pm->run_on_finish( sub {
    my ( $pid, $exit_code, $ident, $exit_signal, $core_dump, $data ) = @_;

    # A worker that died or was killed never reaches finish(), so no data
    # structure comes back. Report that explicitly instead of letting it
    # surface as "uninitialized value" warnings and a misleading 0 count.
    if ( $exit_code || $exit_signal || !defined $data ) {
        warn sprintf "%s exited abnormally or returned no data (exit code %d, signal %d)\n",
            $ident, $exit_code, $exit_signal;
    }

    my $ads_processed = defined $data ? ( $data->{processed} // 0 ) : 0;
    $total_processed += $ads_processed;

    printf "%s finished, processed %d ads, total processed: %.2f million\n",
        $ident, $ads_processed, $total_processed / 1_000_000;
} );

# Every file in the input directory is handled by its own worker process;
# the worker writes its result to a file of the same name in the
# banner_hrefs_domain_mismatch directory.
my @filenames = glob("$Bin/banner_hrefs_filtered_even_files/*");

for my $filename (@filenames) {
    my $basename = basename($filename);

    # start() forks: in the parent it returns true, so the parent moves on to
    # the next file; in the child it returns false and the code below runs.
    $pm->start($basename) and next;

    my $out_filename = "$Bin/banner_hrefs_domain_mismatch/$basename";

    # Fail loudly: an unchecked open would otherwise yield an empty result
    # that looks like "no mismatches found".
    open my $fh, '<', $filename
        or die "Cannot open $filename for reading: $!";
    open my $out_fh, '>', $out_filename
        or die "Cannot open $out_filename for writing: $!";

    my $ad_count = 0;

    LINE:
    while ( my $line = <$fh> ) {
        $ad_count++;

        chomp $line;

        # Tab-separated columns: bid (not used here), href, domain.
        my ( undef, $href, $domain ) = split /\t/, decode_utf8($line);

        # split drops trailing empty fields, so a short line leaves $href
        # undefined; treat that the same as an empty href.
        next LINE if !defined $href || $href eq '';

        my ( $canonical_domain, $need_check_redirect ) =
            RedirectCheckQueue::domain_need_check_redirect( { href => $href, domain => $domain } );

        # Keep the original (undecoded) line when no redirect check is needed
        # and the domain from the file differs from the one returned above.
        # Undefined values compare as empty strings, without warnings.
        if ( !$need_check_redirect && ( $domain // '' ) ne ( $canonical_domain // '' ) ) {
            print {$out_fh} "$line\n";
        }
    }

    close $fh
        or die "Cannot close $filename: $!";

    # Buffered write errors only surface at close, so check it.
    close $out_fh
        or die "Cannot close $out_filename: $!";

    # Exit the worker, handing the line count back to the parent's
    # run_on_finish callback.
    $pm->finish( 0, { processed => $ad_count } );
}

$pm->wait_all_children;
