#! /usr/bin/perl -w
use strict;
use utf8;
use 5.010;
use Data::Dumper;
use open ':utf8';
use Encode qw(_utf8_on);
use Getopt::Long;
no warnings 'utf8';

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

use FindBin;
use lib "$FindBin::Bin/../lib";
use lib "$FindBin::Bin/../cpan";
use lib "$FindBin::Bin/../wlib";
use Project;

# Shared Project object: the rest of the script uses it for date helpers,
# options, host name lookup, logging, temp files, shell commands and the YT client.
my $proj = Project->new({});

# Print the command-line help (synopsis, option descriptions, examples) to the
# currently selected output handle.
# Returns the result of print() (true on success); callers rely on that in
# `print_help and exit(0)` and `print_help and die`.
sub print_help {
    my @help_lines = (
        # The synopsis lists every option accepted by GetOptions, including --filename_postfix.
        "Usage: yt_send_last_tskv_offers.pl --help --last_days= --start_date= --end_date= --fields= --filename_postfix= --type= --yt_dir=",
        "Collects last tskv files for dyn or perf tasks and send it to YT",
        "  --help                   print this help and exit",
        "  --start_date             start of date range, when offers in tskv should be collected",
        "  --end_date               end of date range, when offers in tskv should be collected",
        "  --last_days              count of last days, when offers in tskv should be collected (use (start_date, end_date) or last_days)",
        "                           --last_days=1 is range from yesterday to today (inclusive)",
        "  --fields                 list of fields, which should be sent to YT",
        "  --filename_postfix       list of postfix file names (default=tskv_gen,feedtskv,specurls_tmp,tskv_mpd)",
        "  --type                   dyn or perf",
        "  --yt_dir                 yt directory to upload",

        qq!Examples:!,
        qq!  yt_send_last_tskv_offers.pl --last_days=3 --fields=name,url,categpath --type=perf --yt_dir=//home/bannerland/perf_banners!,
        qq!  yt_send_last_tskv_offers.pl --start_date=20171125 --end_date=20171128 --fields=name,url --type=dyn --yt_dir=//home/bannerland/dyn_banners!,
    );
    return print join("\n", @help_lines), "\n";
}

# Parse command-line options into $opt; on a parse error print the help and abort.
my $opt = {};
GetOptions($opt, 'help', 'last_days=i', 'start_date=s', 'end_date=s', 'fields=s', 'filename_postfix=s', 'type=s', 'yt_dir=s') or (print_help and die);

print_help and exit(0) if ($opt->{help});

# Resolve the inclusive date range [$start_date, $end_date] either from
# --last_days or from an explicit --start_date/--end_date pair (8-digit
# dates such as 20171125); the two ways are mutually exclusive.
my $start_date;
my $end_date;
if ($opt->{last_days}) {
    if ($opt->{start_date} || $opt->{end_date}) {
        die 'Use only one of last_days and (start_date, end_date)';
    }
    # A negative count would place the start date after the end date, so no
    # file could ever match the range.
    if ($opt->{last_days} < 0) {
        die 'Error! last_days must be positive';
    }
    $end_date = $proj->dates->cur_date('direct');
    # NOTE(review): the offset is -(last_days) + 1, while the help text says
    # --last_days=1 covers yesterday..today - confirm next_n_date semantics.
    $start_date = $proj->dates->next_n_date(-$opt->{last_days} + 1, 'direct', 'direct', $end_date);
} else {
    unless ($opt->{start_date} && $opt->{end_date}) {
        die 'Set both start_date and end_date or last_days';
    }
    $start_date = $opt->{start_date};
    $end_date = $opt->{end_date};
    # Validate the format first: ordering is only meaningful for well-formed dates.
    for my $date ($start_date, $end_date) {
        die 'Error! incorrect date format' unless $date =~ /\A\d{8}\z/;
    }
    # String comparison, consistent with how file dates are compared later.
    if ($start_date gt $end_date) {
        die 'Error! start_date > end_date';
    }
}

# Task type is mandatory and must be exactly 'dyn' or 'perf'.
my $type = $opt->{type};
unless (defined($type) && grep { $type eq $_ } qw(dyn perf)) {
    die 'Set --type=dyn or --type=perf';
}

# Target YT directory is mandatory as well.
my $yt_dir = $opt->{yt_dir};
die 'Set --yt_dir' if !defined($yt_dir);

# Short host name: everything from the first dot onwards is removed.
my $host = $proj->get_curr_host;
$host =~ s{\..*}{};

# Local root directory that is scanned for this task type.
my $main_dir = '/opt/broadmatching/temp/' . $type . '_banners/';

# Build the list of fields to upload: either the comma-separated --fields
# value or, by default, one field name per line from the 'all_offerparams'
# file in the dicts directory. Three service fields are always appended.
my @FIELDS = ();
if (defined($opt->{fields})) {
    @FIELDS = split(',', $opt->{fields});
} else {
    # TODO: could be a list in options instead of a file, to avoid duplication
    my $fields_file = $proj->options->{dicts}.'/all_offerparams';
    # Fail loudly: with an unreadable dictionary only the service fields
    # below would be uploaded.
    open(my $fh, '<', $fields_file) or die "Can't open $fields_file: $!";
    while (my $row = <$fh>) {
        chomp $row;
        push @FIELDS, $row;
    }
    close($fh);
}
push @FIELDS, 'URLType', 'TaskID', 'host';

# Pattern (kept as a string) for the wanted file-name endings; its single
# capture group yields the matched postfix.
my $filename_re = '(tskv_gen|feedtskv|specurls_tmp|tskv_mpd)$';
if (defined($opt->{filename_postfix})) {
    # The comma-separated list from the command line becomes an alternation.
    (my $alternation = $opt->{filename_postfix}) =~ tr/,/|/;
    $filename_re = '(' . $alternation . ')$';
}

$proj->log(sprintf("uploading %s for dates %s - %s",
    join(',', map { "'" . $_ . "'" } @FIELDS), $start_date, $end_date));

# Walk $main_dir/<domain>/<task>/ and remember, for every task directory and
# every file-name postfix, the greatest (by string comparison) file name whose
# leading 8-digit date lies in [$start_date, $end_date].
# Result: $last_tskv->{$task_dir}{$postfix} = $file_name.
my $last_tskv = {};

opendir(my $main_dh, $main_dir) or die "Can't open directory $main_dir: $!";
while (defined(my $domain = readdir($main_dh))) {
    next if $domain =~ /^\.{1,2}$/;
    my $domain_dir = "$main_dir/$domain";
    next unless (-d $domain_dir);
    opendir(my $domain_dh, $domain_dir) or die "Can't open directory $domain_dir: $!";
    while (defined(my $task = readdir($domain_dh))) {
        next if $task =~ /^\.{1,2}$/;
        # Directories whose name contains one of these words are skipped.
        next if $task =~ /(test|grplog|sources)/;
        my $task_dir = "$domain_dir/$task";
        next unless (-d $task_dir);
        opendir(my $task_dh, $task_dir) or die "Can't open directory $task_dir: $!";
        while (defined(my $file = readdir($task_dh))) {
            next if $file !~ /$filename_re/;
            my $file_end = $1;
            # Require a real 8-digit date prefix: a non-digit prefix such as
            # '201711zz' could otherwise pass the string range check below.
            next unless $file =~ /\A(\d{8})/;
            my $filedate = $1;
            next unless $start_date le $filedate && $filedate le $end_date;
            my $latest_for = ($last_tskv->{$task_dir} //= {});
            if (!defined($latest_for->{$file_end}) || $latest_for->{$file_end} lt $file) {
                $latest_for->{$file_end} = $file;
            }
        }
        closedir($task_dh);
    }
    closedir($domain_dh);
}
closedir($main_dh);
# Flatten the selected tskv files into one tab-separated buffer file: one
# output line per input line, with values in the order of @FIELDS (looked up
# case-insensitively) and an empty string for missing fields.
my $temp_buffer = $proj->get_tempfile('yt_temp_buffer', DIR=>'/tmp', UNLINK=>1);
# Stringify explicitly, as the original interpolation did: a reference passed
# as the third argument of open() could be treated specially.
open(my $buffer_handle, '>', "$temp_buffer") or die "Can't open $temp_buffer for writing: $!";
# Lower-cased lookup keys, computed once instead of for every line.
my @field_keys = map { lc($_) } @FIELDS;
for my $task_dir (keys %$last_tskv) {
    # The last two path components are the domain and the task id.
    my @task_dir_tokens = split('/', $task_dir);
    my $domain = $task_dir_tokens[-2];
    my $task_id = $task_dir_tokens[-1];
    # Explicit dereference: values() on a hash reference is a compile error
    # on Perl 5.24 and later.
    for my $filename (values %{ $last_tskv->{$task_dir} }) {
        my $path = "$task_dir/$filename";
        my $fin;
        unless (open($fin, '<', $path)) {
            # Best effort, as before: an unreadable file is skipped, but the
            # reason is now logged.
            $proj->log("WARN: can't open $path: $!");
            next;
        }

        # The URL type and source letter depend only on the file name.
        my ($url_type, $source_letter);
        if ($filename =~ /tskv_gen$/) {
            $url_type = 'site';
        } elsif ($filename =~ /feedtskv/) {
            ($url_type, $source_letter) = ('feed', 'o');
        } elsif ($filename =~ /tskv_mpd/) {
            $url_type = 'feed';
        } else {
            ($url_type, $source_letter) = ('spec', 'p');
        }

        while (my $line = <$fin>) {
            chomp $line;
            my %row;
            for my $pair (split("\t", $line, -1)) {
                my ($key, $value) = split('=', $pair, 2);
                # Skip empty tokens and tokens without '=': they would
                # otherwise shift every following key/value pair.
                next unless defined($value);
                $row{lc($key)} = $value;
            }
            # Service fields are set after lower-casing the parsed keys, so
            # they deterministically override same-named data keys.
            $row{urltype} = $url_type;
            $row{source_letter} = $source_letter if defined($source_letter);
            $row{taskid} = $task_id;
            $row{host} = $host;
            $row{domain} = $domain;
            # Defined-or keeps legitimate "0" values; only missing fields
            # become empty strings.
            my @res = map { $row{$_} // '' } @field_keys;
            print {$buffer_handle} join("\t", @res), "\n";
        }
        close($fin);
    }
}
# Buffered write errors surface at close, so check it.
close($buffer_handle) or die "Can't close $temp_buffer: $!";

# Translate field names, because YT column names cannot contain Russian characters.
my $translate = {
    'возраст до'       => 'age_to',
    'возраст от'       => 'age_from',
    'возраст ребенка'  => 'child_age',
    'материал игрушек' => 'toy_material',
    'тип'              => 'original_type',
};
my @translated_fields = map { $translate->{$_} // $_ } @FIELDS;
# Any spaces left in the column names are replaced with underscores.
tr/ /_/ for @translated_fields;
my $columns = join(';', @translated_fields);

# Drop one trailing slash from the target directory, if present.
$yt_dir =~ s{/?$}{};
my $yt_log_name = join('_', $type, 'tskv', $host);

# Shell command that reads rows from stdin and writes them to the temporary
# '<name>_tmp' table in schemaful_dsv format with the columns listed above.
my $yt_loader_command = join("\n",
    '',
    '    YT_PROXY=hahn \\',
    '    YT_TOKEN_PATH=/opt/broadmatching/secrets/tokens/yt_plato \\',
    qq{    yt write --table "$yt_dir/${yt_log_name}_tmp" --format '<columns=[$columns]>schemaful_dsv'},
    '',
);

# Upload the buffer to a temporary YT table, stamp its upload time and move it
# over the final table. The whole sequence is retried up to $tries times with
# a growing pause; after the last failed attempt the script dies with the
# last error.
$proj->log("Start send tskv to yt");
my $tries = 3;
for my $try (1 .. $tries) {
    # The block yields 1 only when every step completed, so success does not
    # depend solely on the global $@.
    my $ok = eval {
        # Escape backslashes, otherwise yt write may fail.
        $proj->do_sys_cmd("cat $temp_buffer | sed 's/\\\\/\\\\\\\\/g' | $yt_loader_command");
        $proj->yt_client->set_upload_time("$yt_dir/${yt_log_name}_tmp");
        $proj->yt_client->move("$yt_dir/${yt_log_name}_tmp", "$yt_dir/$yt_log_name");
        1;
    };
    if ($ok) {
        $proj->log("Send tskv to yt OK");
        last;
    }
    my $error = $@ || 'unknown error';
    $proj->log("WARN: retry $try: send tskv to yt failed: $error");
    # Keep the reason in the fatal message instead of dying with an empty string.
    die "Send tskv to yt failed after $tries tries: $error" if ($try == $tries);
    sleep 100 * $try;
}

$proj->log("OK");

exit(0);
