#!/usr/bin/perl -w
use strict;

use utf8;
use open ":utf8";

binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';

use FindBin;
use lib "$FindBin::Bin/lib";
use lib "$FindBin::Bin/wlib";
use CatalogiaMediaProject;
use Utils::Sys qw(
    get_file_lock
    release_file_lock
    handle_errors
    mtime
    lines_count
    do_safely
);
use Utils::Common;
use Utils::Hosts qw( get_hosts get_curr_host );
use Data::Dumper;
use IPC::Open3;
use IO::Handle;
use Encode;
use JSON::XS;
use File::Basename qw(basename);
use BM::YQL::Helpers qw(get_k_random_banners);
use Storable qw(dclone);

# Take the script's file lock; exit quietly when it cannot be obtained
# (presumably because another instance is already running - TODO confirm).
get_file_lock() or exit(0);
handle_errors();

my $proj = CatalogiaMediaProject->new({
    projsrv         => 0,
    nrmsrv          => 0,
    no_auth         => 1,
    no_form         => 1
});

# Tables used below: task metadata (including State), task input and
# task results.
my $tbl_meta = $proj->banners_categories_diff_meta;
my $tbl_input = $proj->banners_categories_diff_input;
my $tbl_results = $proj->banners_categories_diff_results;

# Handle broken tasks: anything still in state "Processing" when this run
# starts is marked as failed, saved, and reported to its owner.
my $broken_tasks = [ grep { should_process_task($_) } @{ $tbl_meta->List({ State => "Processing" }) } ];
if(@$broken_tasks) {
    $proj->log(scalar(@$broken_tasks) . " broken tasks");
    for my $task (@$broken_tasks) {
        $proj->log("processing bad ID=" . $task->{ID});

        # Change the state; there is no error file to attach here.
        set_failed($task);
        save_result($task, $tbl_meta, $tbl_results);
        send_report_to_user($proj, $task);

        # Closing marker, using the same "/ ..." convention as the other
        # log pairs in this script so start and end are distinguishable.
        $proj->log("/ processing bad ID=" . $task->{ID});
    }
}

# Process the tasks in state "New" that are assigned to this host.
my $new_tasks = [ grep { should_process_task($_) } @{ $tbl_meta->List({ State => "New" }) } ];
$proj->log(scalar(@$new_tasks) . " new tasks");

for my $task (@$new_tasks) {
    $proj->log("processing ID=" . $task->{ID});

    # Change the state before doing any work.
    $task->{State} = "Processing";
    $tbl_meta->Add($task, { replace => 1 });

    my $count = $task->{NumBanners};
    my $data_type = $task->{DataType};
    my $diff_type = $task->{DiffType};
    my $lang = $task->{Language};
    my $input_data = $tbl_input->Get({ ID => $task->{ID} })->{Input};
    my $temp_dir = $Utils::Common::options->{dirs}{temp};
    my $fn_data = $temp_dir . "/categs_diff_$data_type"."_set_$count" . ($lang eq "ru" ? "" : "_$lang");
    my $fn_caddphr = $temp_dir . "/categs_diff_caddphr";

    # A cached sample file older than this many hours is regenerated.
    my $max_allowed_data_hours_old = 24;

    if ( ! -e $fn_data
        or  time - mtime($fn_data) > $max_allowed_data_hours_old * 3600
        or lines_count($fn_data) != $count   # wc -l
    ) {
        # Generate a fresh random sample.
        generate_set($fn_data, $count, $data_type, $lang);
    }

    # Prepare the data: dump the task input into the caddphr file.
    # "custom" inputs are written as raw bytes; everything else is decoded
    # from UTF-8 and written through a :utf8 layer.
    $proj->log("prepare caddphr");
    {
        open my $caddphr_fh, '>', $fn_caddphr or die "Cannot write $fn_caddphr: $!";
        if ($diff_type eq 'custom' or $diff_type eq 'custom_light') {
            binmode $caddphr_fh, ':bytes';
            print {$caddphr_fh} $input_data;
        } else {
            binmode $caddphr_fh, ':utf8';
            print {$caddphr_fh} Encode::decode_utf8($input_data);
        }
        # Buffered write errors only surface at close.
        close $caddphr_fh or die "Cannot finish writing $fn_caddphr: $!";
    }
    $proj->log("/ prepare caddphr");

    # Load title/body of every sampled banner, keyed by banner id.
    $proj->log("load banners");
    my $id2data = {};
    {
        open my $data_fh, '<', $fn_data or die "Cannot read $fn_data: $!";
        while (my $line = <$data_fh>) {
            chomp $line;
            if($data_type eq "banner") {
                my $bnr = $proj->banner_factory->text2banner($line);
                $id2data->{$bnr->id} = {
                    title   => $bnr->title,
                    body    => $bnr->body
                }
            }
        }
        close $data_fh;
    }
    $proj->log("/ load banners");

    # Compute categories: one child runs the helper without the caddphr file
    # ("old"), the other with it ("new"); each writes its own result and
    # error file, made unique by a time.pid suffix.
    my $sf = time.".".$$;
    my ($fn_out_old, $fn_out_new) = map{$temp_dir . "/categs_diff_banners_result_$_.$sf"} qw(old new);

    # fork returns undef on failure; that must not be mistaken for "I am the
    # child" (0), otherwise the parent would run the child code and exit.
    my $pid_old = fork;
    die "Cannot fork 'old' categorization child: $!" unless defined $pid_old;
    if (!$pid_old) {
        my $res = do_safely( sub { categorize($fn_data, $fn_out_old, "$fn_out_old.err", $data_type, $diff_type, $lang, undef) }, no_die => 1 );
        exit( $res ? 0 : 1 );
    }

    my $pid_new = fork;
    if (!defined $pid_new) {
        my $fork_error = $!;
        # Do not leave the already started child behind.
        kill 'KILL', $pid_old;
        waitpid($pid_old, 0);
        die "Cannot fork 'new' categorization child: $fork_error";
    }
    if (!$pid_new) {
        my %prm;
        if ($diff_type eq 'beta_t') {
            $prm{bm_path} = "/home/tatiana-oleynik/broadmatching/";
        }
        my $res = do_safely( sub { categorize($fn_data, $fn_out_new, "$fn_out_new.err", $data_type, $diff_type, $lang, $fn_caddphr, %prm) }, no_die => 1 );
        exit( $res ? 0 : 1 );
    }
    $proj->log("pid_old: $pid_old  pid_new: $pid_new");

    if(wait_for_child($pid_new)) {
        # On error do not wait for the second process to finish its work.
        # A named signal is used on purpose: a negative signal number makes
        # Perl's kill target a process group instead of the process.
        kill 'KILL', $pid_old;  # TODO  'kill' before 'kill -9'
        waitpid($pid_old, 0);   # reap the killed child

        $proj->log("'new' child failed");

        set_failed($task, "$fn_out_new.err");
    } elsif(wait_for_child($pid_old)) {
        $proj->log("'old' child failed");

        set_failed($task, "$fn_out_old.err");
    } else {
        my ($oldh, $newh) = map{load_categorized($_)} ($fn_out_old, $fn_out_new);

        # Errors reported by the "new" run, one per line, as an array ref
        # (it is dereferenced when the result is assembled below).
        my $errors = [];
        if (open my $err_fh, '<', "$fn_out_new.err") {
            chomp(my @lines = <$err_fh>);
            close $err_fh;
            $errors = [ grep { length } @lines ];
        }

        # Compute the difference: one tab-separated row per banner whose
        # category list changed between the two runs.
        my $diff = [];
        for my $bid (sort keys %$oldh) {
            my $old = $oldh->{$bid};
            my $new = $newh->{$bid};

            next if !$new || $new->{categs} eq $old->{categs};

            my @diff_phrases = get_diff_phrases($old->{categs}, $new->{categs}, $old->{phrases}, $new->{phrases});
            my @diff_decoded = get_diff_phrases($old->{categs}, $new->{categs}, $old->{decoded}, $new->{decoded});
            # A banner id missing from the sample yields empty title/body.
            my $bnr = $id2data->{$bid} || {};
            push @$diff, join("\t", $bid, $bnr->{title} // '', $bnr->{body} // '', $old->{categs}, $new->{categs}, join(",", sort @diff_phrases), join(",", @diff_decoded), $proj->serial($old->{categs_phrases_hlist}), $proj->serial($new->{categs_phrases_hlist}) );
        }

        # Store the results.
        $task->{Result} = join("\n", @$errors, @$diff);
        $task->{State} = "Done";
    }

    # Everything needed from the per-run temporary files has been read by
    # now, so remove whatever is still left of them.
    unlink grep { -e $_ } $fn_out_old, $fn_out_new, "$fn_out_old.err", "$fn_out_new.err";

    $proj->dbh->reconnect;

    save_result($task, $tbl_meta, $tbl_results);
    send_report_to_user($proj, $task);

    $proj->log("/ processing ID=" . $task->{ID});
}

release_file_lock();
exit(0);

# Marks a task as failed.
#   $task   - task hash ref; its State becomes "Failed" and its Result becomes
#             the contents of the error file (empty string when unavailable)
#   $fn_err - optional path of a file with error lines; may be omitted, and a
#             missing or unreadable file is not an error
sub set_failed {
    my ($task, $fn_err) = @_;
    my @errors;

    # Callers may pass no error file at all, so only try to open a defined path.
    if (defined $fn_err && open(my $err_fh, '<', $fn_err)) {
        @errors = <$err_fh>;
        close $err_fh;
    }

    $task->{Result} = join("", @errors);
    $task->{State} = "Failed";
}

# Returns the sorted list of phrases from $new_phrases that either are not
# present (with a true value) in $old_phrases, or are attached to at least
# one category that occurs in $new_categs but not in $old_categs.
#   $old_categs, $new_categs   - "/"-separated category strings
#   $old_phrases, $new_phrases - hash refs keyed by phrase; the values of
#                                $new_phrases are hash refs keyed by category
sub get_diff_phrases {
    my ($old_categs, $new_categs, $old_phrases, $new_phrases) = @_;

    my %known_categ;
    $known_categ{$_} = 1 for split "/", $old_categs;

    # Categories that appear only on the new side.
    my %added_categ;
    for my $categ (split "/", $new_categs) {
        $added_categ{$categ} = 1 unless $known_categ{$categ};
    }

    my @changed;
    PHRASE:
    for my $phrase (sort keys %$new_phrases) {
        if ($old_phrases->{$phrase}) {
            # Known phrase: keep it only when it touches an added category.
            my $hits = grep { $added_categ{$_} } keys %{ $new_phrases->{$phrase} };
            next PHRASE unless $hits;
        }
        push @changed, $phrase;
    }

    return @changed;
}

# Generates a random sample file.
#   $fn        - destination path of the sample
#   $count     - number of items required
#   $data_type - kind of data; only "banner" is supported
#   $lang      - language passed on to the random-banner query
# The sample is written to a temporary file first and moved to $fn only after
# it has been written completely. Dies on an unknown data type, on I/O
# errors, or when the number of rows read differs from $count.
# Returns 1 on success.
sub generate_set {
    my ($fn, $count, $data_type, $lang) = @_;
    $proj->log("generate_set ($fn, $count, $data_type, $lang) ...");

    my $fn_temp = $proj->get_tempfile("generate_set-$data_type-$count-$lang", UNLINK => 1);

    if($data_type eq "banner") {
        my $json = JSON::XS->new->pretty(0);
        my $yql_result = get_k_random_banners(
            yt_client => $proj->yt_client(),
            banner_count => $count,
            lang => $lang,
        );

        # Dump the resulting table into a local file, one JSON document per line.
        my $yt_data_fn = Utils::Sys::get_tempfile(basename($fn_temp), UNLINK => 1);
        $proj->yt_client()->read_table_to_file($yql_result->{table_path}, $yt_data_fn, "json");

        my $line_count = 0;
        open(my $yt_data_fh, '<', $yt_data_fn) or die "Cannot read $yt_data_fn: $!";
        open(my $result_fh, '>', $fn_temp) or die "Cannot write $fn_temp: $!";
        while (my $line = <$yt_data_fh>) {
            my $data = $json->decode($line);

            # Convert each row into the tab-separated banner text format.
            my $bnr = $proj->bf->lbanner($data);
            print {$result_fh} join("\t", map{$bnr->{$_} || ""} $proj->bf->get_string_fields) . "\n";

            $line_count++;
        }
        close $yt_data_fh;
        # Buffered write errors only surface at close; a truncated sample
        # must never be moved into place.
        close $result_fh or die "Cannot finish writing $fn_temp: $!";

        if ($line_count != $count) {
            die "Could not get $count random banners ($line_count found)";
        }
    } else {
        die("unknown data type '$data_type'");
    }
    $proj->do_sys_cmd("mv $fn_temp $fn");
    $proj->log("generate_set ($fn, $count, $data_type, $lang) done");
    return 1;
}

# Blocks until the child process $pid terminates.
# Returns the raw wait status ($?) when it is non-zero, and an empty string
# when the status is zero.
sub wait_for_child {
    my ($pid) = @_;

    waitpid( $pid, 0 );
    my $status = $?;

    return $status ? $status : "";
}

# Runs scripts/categs_diff_helper.pl and collects the errors it reports.
#   $fn_data    - input sample file
#   $fn_out     - file name passed to the helper as its second argument
#   $fn_err     - file that receives the helper's "ERROR:" lines, one per line
#   $data_type, $diff_type, $lang - passed through to the helper
#   $fn_caddphr - optional extra file, appended to the arguments when true
#   %prm        - bm_path: alternative root directory containing the helper
# Every line of the helper's stderr is logged. Dies when the helper ends with
# a non-zero status; returns 1 otherwise.
sub categorize {
    my ($fn_data, $fn_out, $fn_err, $data_type, $diff_type, $lang, $fn_caddphr, %prm) = @_;

    my $bm_path = $prm{bm_path} // $Utils::Common::options->{dirs}{root};

    # The helper is started from an argument list rather than a command
    # string, so no shell is involved and task-supplied values cannot be
    # interpreted as shell syntax.
    my @cmd = ("$bm_path/scripts/categs_diff_helper.pl", $fn_data, $fn_out, $data_type, $diff_type, $lang);
    push @cmd, $fn_caddphr if $fn_caddphr;
    my $cmd = join(" ", @cmd);   # used for logging only

    my $errors = [];

    $proj->log("executing $cmd");
    # open3 creates the stdin/stdout handles itself, but the stderr handle
    # has to exist beforehand or stderr would be merged into stdout.
    my ($to_helper, $from_helper);
    my $helper_err = IO::Handle->new;
    my $pid = open3($to_helper, $from_helper, $helper_err, @cmd);
    $proj->log("pid: $pid");
    # Only the helper's stderr is consumed.
    close $to_helper;
    close $from_helper;
    binmode $helper_err, ':utf8';
    while (my $line = <$helper_err>) {
        chomp $line;
        $proj->log(" helper: $line");
        push @$errors, $line if $line =~ /ERROR:/;
    }
    close $helper_err;

    # Failing to store the error file is logged but does not abort the run.
    if (open my $err_out, '>', $fn_err) {
        print {$err_out} "$_\n" for @$errors;
        close $err_out or $proj->log("Cannot finish writing $fn_err: $!");
    } else {
        $proj->log("Cannot write $fn_err: $!");
    }

    if (wait_for_child($pid)) {
        die "wait_for_child($pid) failed";
    };

    $proj->log("/ executing $cmd");

    return 1;
}

# Loads a helper result file (one JSON object per line) and removes it.
#   $fn_out - path of the result file
# Returns a hash ref keyed by the "bid" field of every line; each value holds
#   categs               - the "categs" list joined with "/"
#   phrases              - the "orig" field (default: empty hash)
#   decoded              - the "decoded" field (default: empty hash)
#   categs_phrases_hlist - the field of the same name (default: empty list)
# Dies when the file cannot be opened or a line is not valid JSON.
sub load_categorized {
    my ($fn_out) = @_;
    my $h = {};
    my $json = JSON::XS->new;

    open my $in_fh, '<', $fn_out or die "Cannot read $fn_out: $!";
    while (my $line = <$in_fh>) {
        chomp $line;
        my $fields = $json->decode($line);

        $h->{$fields->{bid}} = {
            categs      => join("/", @{$fields->{categs} || []}),
            phrases     => $fields->{orig} || {},
            decoded     => $fields->{decoded} || {},
            categs_phrases_hlist => $fields->{categs_phrases_hlist} || [],
        };
    }
    close $in_fh;

    # The file has been read completely and is not needed any more.
    $proj->do_sys_cmd("rm $fn_out");

    return $h;
}

# Input:  task (hash ref with at least ID and DiffType)
# Output: 1/0 - whether this task has to be processed on the current host
#
# 'beta_t' tasks are taken only on hosts whose name matches
# catalogia-media-dev. All other tasks are spread over the sorted list of
# hosts with the role 'catalogia-media-scripts': a task is taken by the host
# whose position in that list equals ID modulo the number of hosts.
sub should_process_task {
    my ($task) = @_;

    my $host = get_curr_host();
    my $res = 0;

    if ($task->{DiffType} eq 'beta_t') {
        # categorization beta for tatiana-oleynik@ at catalogia-media-dev01e
        # TODO use host_role
        $res = 1 if $host =~ m/catalogia-media-dev/;
    } else {
        my @hosts = sort { $a cmp $b } get_hosts( role => 'catalogia-media-scripts' );
        # Position of the current host in the list; undef when it is absent.
        my ($host_index) = grep { $hosts[$_] eq $host } 0 .. $#hosts;
        if (defined $host_index && ($task->{ID} % @hosts) == $host_index) {
            $res = 1;
        }
    }

    $proj->log("should_process_task $task->{ID} $task->{DiffType} $host : $res");
    return $res;
}

# Mails the outcome of a task to the user who created it.
#   $proj - project object providing log, login2email and SendMail
#   $task - task hash ref (ID, Login, State, Comment, DataType, NumBanners,
#           DiffType)
# The message lists the task fields; links to the input and to the result
# are added only for tasks in state "Done".
# Returns the value returned by SendMail, or undef when no e-mail address
# could be found for the login.
sub send_report_to_user {
    my ($proj, $task) = @_;
    $proj->log("send_report_to_user " . $task->{ID} . " ...");

    my $res;
    my $email = $proj->login2email( $task->{Login} );
    if ($email) {
        $proj->log("send to $email");
        $res = $proj->SendMail({
                from    => 'no_reply@yandex-team.ru',
                to => $email,
                mail_list => "categs_diff_manager",
                subject => 'banners_categs_diff result',
                body    => join("\n",
                    "banners_categs_diff",
                    # The parentheses matter: "." binds tighter than "//", so
                    # without them the default would never be applied.
                    (map {"$_:\t" . ($task->{$_} // '')}  qw[ Comment ID Login DataType NumBanners DiffType State ]),
                    (($task->{State} // '') eq "Done"  ?  join("\n",
                            "Input:\t" . "https://catmedia.yandex.ru/cgi/ind.pl?cmd=banners_categs_diff&get_input=1&ID=".$task->{ID},
                            "Result:\t" . "https://catmedia.yandex.ru/cgi/ind.pl?cmd=banners_categs_diff&get_result=1&ID=".$task->{ID},
                    )  :  ()),
                ),
        });
        unless ($res) {
            $proj->log("ERROR: Could not send email to . " . ($task->{Login} // 'UNDEF'));
        }
    } else {
        $proj->log("ERROR: Void email for " . ($task->{Login} // 'UNDEF'));
    }

    # $res stays undefined when nothing was sent.
    $proj->log("send_report_to_user " . $task->{ID} . " done (" . ($res // '') . ")");
    return $res;
}

# Stores a task: a defined Result goes into the results table, everything
# else into the meta table.
#   1st argument - task hash ref; it is deep-copied, so the caller's hash is
#                  left untouched
#   2nd argument - meta table object (must provide Add)
#   3rd argument - results table object (must provide Add)
# Both rows are written with { replace => 1 }.
sub save_result {
    my ($source_task, $tbl_meta, $tbl_results) = @_;
    my $task = dclone($source_task);

    my $result = $task->{Result};
    if (defined $result) {
        # Keep the result out of the meta row.
        delete $task->{Result};
        $tbl_results->Add({ ID => $task->{ID}, Result => $result }, { replace => 1 });
    }

    $tbl_meta->Add($task, { replace => 1 });
}
