#!/usr/bin/perl -w
use strict;

use utf8;
use open ':utf8';

use FindBin;
use lib "$FindBin::Bin/../lib";

use Project;

use Utils::Common;
use Utils::Sys;

# Single-instance guard: if the file lock cannot be obtained the script exits
# with status 0 without doing any work (presumably another run already holds
# the lock -- confirm against Utils::Sys::get_file_lock).
exit(0) unless Utils::Sys::get_file_lock();
# Set up error handling as implemented by Utils::Sys::handle_errors.
Utils::Sys::handle_errors();

my $proj = Project->new({});

# CATALOGIA-629: temporary hack until the feature is fully rolled out to
# production. Appends a title_extension entry to the yt_direct_banners_mapping
# option before the banners format is requested from the project.
push @{ $Utils::Common::options->{yt_direct_banners_mapping} }, {
    yt_field    => 'title_extension',
    mysql_field => 'title_extension',
};

# Retry policy handed to the YT client.
my $tries = 5;
my $sleep_between_tries = 120; # presumably seconds -- confirm in the yt client

# Kept as an ordered list (not a hash) so the key/value pairs reach
# set_params in exactly this order.
my @yt_client_params = (
    pool                => 'catalogia',
    tries               => $tries,
    sleep_between_tries => $sleep_between_tries,
);
my $yt_client = $proj->yt_client->set_params(@yt_client_params);

# --- YT table paths -------------------------------------------------------
# Source banners export.
my $src_table = '//home/direct/export/bm/bm_banners';
# Per-process temporary copy of the source; the PID keeps the name unique.
my $shuffled_src_table = '//tmp/bm_banners_shuffled_' . $$;
# Intermediate output of the map step, before sorting.
my $unsorted_dst_table = '//tmp/banners-recategorized-unsorted';
# Schema prefix that is prepended to the intermediate table path in --dst.
# The pieces are concatenated verbatim; spacing between entries is kept
# exactly as in the original single-line literal.
my $unsorted_schema = join('',
    '<schema=[',
    '{name = bid; type = int64}; ',
    '{name = cid; type = int64};',
    '{name = pid; type = int64};',
    '{name = OrderID; type = int64};',
    '{name = BannerID; type = int64}; ',
    '{name = lang; type = string};',
    '{name = Categories; type = string};',
    '{name = Flags; type = string};',
    '{name = CategoryIDs; type = string};',
    '{name = Mediagroups; type = string};',
    '{name = UpdateTime; type = string};',
    '{name = minicategs_ids; type = string}; ',
    ']>',
);
# Final, sorted result table.
my $dst_table = '//home/catalogia/catalogia-banners-recategorized-fast';

# --- Job resource sizing --------------------------------------------------
my $bytes_per_megabyte = 1 << 20;
my $bytes_per_gigabyte = 1 << 30;
my $tmpfs_size   = 8 * $bytes_per_gigabyte;  # 8 GiB
my $memory_limit = 9 * $bytes_per_gigabyte;  # 9 GiB
# Passed as "user_job_memory_digest_lower_bound" in the mapper spec.
# Original note: ~ (4G tmpfs + 2G proc) / 8G - memory needed in all jobs.
# NOTE(review): those figures do not match the current 8G tmpfs / 9G limit
# or the 0.9 value -- confirm the intended sizing.
my $reserve_lower_bound = 0.9;
# Passed as "memory_reserve_factor" in the mapper spec.
# Original note: ~ (4.5G tmpfs + 2.5G proc) / 8G - enough for most jobs.
# NOTE(review): same mismatch as above -- confirm.
my $reserve_probable = 0.95;
# Input data per map job: 100 MiB.
my $data_size_per_job = 100 * $bytes_per_megabyte;
# Capture the source table's @upload_time attribute up front, before the
# source is shuffled and processed; it is written onto the destination table
# at the end of the script.
my $modification_time = $yt_client->read_cmd("get '$src_table/\@upload_time' --format=dsv");
chomp($modification_time);

# Operation-level spec fragment: pool tree selection and scheduling tag filter.
my $cloud_pool_subspec = join(' ',
    q/"pool_trees"=["physical"];/,
    q/"tentative_pool_trees"=["cloud"];/,
    q/"scheduling_tag_filter"="";/,
);
# Mapper spec fragment: memory limit plus the two reserve factors.
my $memory_subspec = join(' ',
    qq/"memory_limit"=$memory_limit;/,
    qq/"memory_reserve_factor"=$reserve_probable;/,
    qq/"user_job_memory_digest_lower_bound"=$reserve_lower_bound;/,
);
# Mapper spec fragment: the job sandbox (".") is placed on tmpfs.
# Original note: tmpfs without copy, because we remove untarred files - no
# need for an extra copy.
# NOTE(review): that note says "without copy", yet the spec below sets
# "copy_files"=true -- confirm which of the two is intended.
my $tmpfs_subspec = join(' ',
    q/"tmpfs_path"=".";/,
    qq/"tmpfs_size"=$tmpfs_size;/,
    q/"copy_files"=true;/,
);
# Value used for resource_limits/user_slots of the map operation.
my $user_slots = 1000;

# Step 1: write a shuffled copy of the source table to the per-process
# temporary path.
# NOTE(review): nothing in this file removes $shuffled_src_table afterwards;
# confirm that //tmp cleanup is handled elsewhere.
$yt_client->shuffle($src_table, $shuffled_src_table);

# Step 2: run categorize_banners_yt.pl over the shuffled table as a map
# operation, writing into the schema-annotated intermediate table.
my $input_format_arg = "'--input-format=" . $proj->bf->yt_banners_format . "'";
my $output_format_arg = "'--output-format=<enable_string_to_all_conversion=true;columns=[bid;cid;pid;OrderID;BannerID;lang;Categories;Flags;CategoryIDs;Mediagroups;UpdateTime;minicategs_ids;]>schemaful_dsv'";

# Operation spec body; the opening "'--spec={" and closing "}'" are passed as
# separate arguments around it, exactly as before.
my @spec_parts = (
    $cloud_pool_subspec,
    qq/"mapper"={$tmpfs_subspec $memory_subspec};/,
    qq/"data_size_per_job"=$data_size_per_job;/,
    qq/"resource_limits"={"user_slots"=$user_slots;};/,
);
my $spec_body = join(" ", @spec_parts);

$yt_client->do_project_cmd(
    'map',
    "'./mr_perl ./categorize_banners_yt.pl'",
    "--local-file=$Utils::Common::options->{'dirs'}{'scripts'}/banners_categories/categorize_banners_yt.pl",
    "--src=$shuffled_src_table",
    "'--dst=$unsorted_schema$unsorted_dst_table'",
    $input_format_arg,
    $output_format_arg,
    "'--spec={",
    $spec_body,
    "}'",
);

# Step 3: sort the intermediate table by bid into the final destination table.
$yt_client->do_cmd('sort', "--src=$unsorted_dst_table", "--dst=$dst_table", '--sort-by=bid');

# Step 4: record which source snapshot the result was built from, as a
# { <source table path> => <its upload time> } JSON map stored in the
# @sources_modification_time attribute of the destination table.
# JSON is loaded explicitly here: the file itself never imports it, so the
# call below would otherwise depend on some other module having loaded it.
# This is a no-op when JSON is already loaded.
require JSON;
my $value = JSON::to_json({$src_table => $modification_time});
$yt_client->do_cmd(
    'set',
    "$dst_table/\@sources_modification_time",
    '--format=json',
    "--value='$value'",
);

# Mark the destination table's upload time (see the yt client's
# set_upload_time for the exact attribute written).
$yt_client->set_upload_time($dst_table);

# All work is done: release the lock taken at the start of the script.
Utils::Sys::release_file_lock();
