#!/usr/bin/perl -w
use warnings;
use FindBin;
use CGI;
use locale;
use strict;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";


use CatalogiaMediaProject;
use Encode;

use utf8;
use open ":utf8";

use Utils::Worker;
use Utils::Urls;
#binmode(STDIN,':utf8');
#binmode(STDOUT,':utf8');
use Utils::PDL qw(cosine);

# Options handed to the Project constructor (the Project class is presumably
# provided by CatalogiaMediaProject -- confirm).
my %project_opts = (
    load_dicts            => 1,
    load_minicategs_light => 1,
);
my $proj = Project->new({ %project_opts });

# Parameters for the YT client. Kept in an array (not a hash) so that the
# key/value pairs reach set_params in exactly this order.
my @yt_params = (
    pool                => 'catalogia',
    tries               => 1,
    sleep_between_tries => 1,
);

# $yt_client holds whatever set_params returns; read_table is called on it
# further down in the script.
my $yt_client = $proj->yt_client->set_params(@yt_params);

# Feature helper used below for the *_dssm_vector calls.
my $nf = $proj->new_features2;

# Main loop: one banner title per STDIN line.
#
# For every title:
#   1. build a banner/phrase object and collect (categ, subsnorm) lookup keys
#      from its minicategories and "clean" subphrases;
#   2. read candidate phrases for every key from the YT table
#      //home/catalogia/tmp/dse_categ_subphrase_phrase_uniq;
#   3. compute a "dssm_phr2" vector per candidate and arrange the candidates
#      with gen_tree();
#   4. compute a baseline top list by sorting all candidates by
#      cosine(candidate vector, banner vector) in ascending order;
#   5. walk the tree greedily (following best/worst links of already visited
#      nodes) and periodically print "$cnt;$score;$baseline_match" lines that
#      compare the current top with the baseline top;
#   6. dump the baseline top via $proj->dd.
#
# NOTE(review): lower cosine() values are treated as better everywhere in this
# script (ascending sorts feed the "top" lists) -- presumably cosine() is a
# distance; confirm against Utils::PDL.
TITLE: while ( my $title = <STDIN> ) {
    chomp $title;

    my $bnr    = $proj->bf->lbanner({title => $title});
    my $phr    = $bnr->preprocess_title_body;
    my @categs = $phr->get_minicategs;
    my $clean  = $phr->clean_minicategs_subphrases_hash;

    # Collect unique (categ, subsnorm) keys. Different subtexts can normalize
    # to the same word set, so deduplicate to avoid reading the same YT rows
    # more than once (the rows end up in a hash keyed by phrase text anyway).
    my @keys     = ();
    my %seen_key = ();
    foreach my $categ ( @categs ) {
        foreach my $subtext ( keys %$clean ) {
            next unless exists $clean->{$subtext}{$categ};
            my %words    = map { ( $_ => 1 ) } @{ $phr->_decode_phrase($subtext, snorm => 1) };
            my $subsnorm = join( ' ', sort keys %words );
            next if $seen_key{$categ}{$subsnorm}++;
            push @keys, {categ => $categ, subsnorm => $subsnorm };
        }
    }

    # Candidate phrases, keyed by their text.
    # NOTE(review): the table path and format are passed with embedded single
    # quotes and the key values are interpolated verbatim; if read_table hands
    # these strings to a shell, a quote character inside categ/subsnorm would
    # break (or inject into) the command -- confirm how read_table uses them.
    my %phrases = ();
    foreach my $key (@keys) {
        my $table_path = '\'//home/catalogia/tmp/dse_categ_subphrase_phrase_uniq[("'
            . $key->{categ} . '","' . $key->{subsnorm} . '")]\'';
        my @texts = map { $_->{phrase} }
            @{ $yt_client->read_table($table_path, '\'<encode_utf8=false>json\'') };
        $phrases{$_} = {text => $_} foreach @texts;
    }

    # Without candidates there is no tree root to start from.
    unless ( %phrases ) {
        warn "no candidate phrases found for title: $title\n";
        next TITLE;
    }

    # Attach phrase objects and vectors; the same hash refs are shared between
    # %phrases, $source and (after gen_tree) $tree.
    my $source = [];
    foreach my $phrase (keys %phrases) {
        $phrases{$phrase}{phr}    = $proj->phrase($phrase);
        $phrases{$phrase}{vector} = $nf->_phrase_dssm_vector($phrases{$phrase}{phr}, "dssm_phr2");
        push @$source, $phrases{$phrase};
    }
    my $tree = [];
    gen_tree($source, $tree);

    my $bnr_vector = $nf->_banner_dssm_vector($bnr, "dssm_phr2");

    # Score every candidate against the banner exactly once instead of
    # recomputing cosine() inside the sort comparator.
    my %score_of = map { ( $_->{text} => cosine($_->{vector}, $bnr_vector) ) } @$tree;

    my @result    = ();
    my $total_cnt = scalar(keys %phrases);
    # Report roughly 500 times per title. The modulus operator works on
    # integers, so a fractional step below 1 would become 0 and die with
    # "Illegal modulus zero"; clamp the step to at least 1.
    my $chunk_size = int($total_cnt / 500) || 1;
    my $top_count  = 50;

    # The walk starts from the tree root (first node placed by gen_tree).
    my $root = $tree->[0];
    $root->{score} = $score_of{ $root->{text} };
    push @result, $root;
    delete $phrases{ $root->{text} };

    # Baseline: the $top_count candidates with the lowest score overall.
    @$tree = sort { $score_of{ $a->{text} } <=> $score_of{ $b->{text} } } @$tree;
    my @baseline_top = map {$_->{text}} @$tree;
    @baseline_top = splice @baseline_top, 0, $top_count;

    my $cnt = 0;
    OUTER: while ( %phrases ) {
        $cnt++;
        @result = sort {$a->{score} <=> $b->{score}} @result;

        # Expand the best-scored visited node that still has an unvisited
        # best/worst neighbour; exactly one new node is taken per iteration.
        my $advanced = 0;
        for my $best_ind (0..$#result) {
            my $prev = $result[$best_ind];
            my $current_text;
            # defined/exists (not truthiness) so that a phrase such as "0" is
            # still reachable.
            if ( defined $prev->{best} && exists $phrases{$prev->{best}} ) {
                $current_text = $prev->{best};
            }
            elsif ( defined $prev->{worst} && exists $phrases{$prev->{worst}} ) {
                $current_text = $prev->{worst};
            }
            next unless defined $current_text;

            my $current = $phrases{$current_text};
            $current->{score} = $score_of{$current_text};
            push @result, $current;
            delete $phrases{$current->{text}};
            $advanced = 1;

            unless ( $cnt % $chunk_size ) {
                # @result was sorted before the push above, so the newest node
                # sits at the end and is counted only while fewer than
                # $top_count nodes have been visited.
                my @temp_top = ();
                my $score = 0;
                my $max = $top_count - 1;
                $max = $#result if $#result < $max;
                for my $i ( 0..$max ) {
                    push @temp_top, $result[$i]->{text};
                    $score += $result[$i]->{score};
                }
#if ($cnt - $chunk_size > sqrt($total_cnt)) { $proj->dd(@temp_top); last OUTER } 
                my %top_hash = map {$_=>1} @temp_top;
                my $baseline_match = scalar grep { $top_hash{$_} } @baseline_top;
                print "$cnt;$score;$baseline_match\n";
            }

            last;
        }

        # Safety net: if no visited node links to a remaining phrase the loop
        # could never drain %phrases, so stop instead of spinning forever.
        unless ( $advanced ) {
            warn "tree walk stalled with " . scalar(keys %phrases) . " unreachable phrases for title: $title\n";
            last OUTER;
        }
    }

    $proj->dd(@baseline_top);
}

# gen_tree(\@source, \@result)
#
# Recursively arranges phrase nodes into an implicit binary tree and appends
# them to @result in pre-order (node, then its "worst" subtree, then its
# "best" subtree).
#
# Each node is a hash ref with at least {text} and {vector}. The first element
# of @source becomes the current node; the remaining elements are sorted by
# cosine(element vector, current vector) in descending order. The current node
# then gets:
#   {worst} - text of the element with the highest cosine() value,
#   {best}  - text of the element with the lowest cosine() value.
# The sorted list is split in half: the first half (highest values) forms the
# "worst" subtree, the reversed second half (lowest values first) forms the
# "best" subtree, so each subtree's root is exactly the {worst}/{best} node.
# A node with nothing left below it gets neither key.
#
# NOTE(review): "lowest value is best" suggests cosine() is a distance --
# confirm against Utils::PDL.
#
# Side effects: @source is consumed/reordered in place; nodes are modified.
# An empty @source adds nothing to @result. The return value is not meaningful.
sub gen_tree {
    my ($source_arr, $result_arr) = @_;

    # Nothing to place; do not push an undef node onto the result.
    return unless @$source_arr;

    my $current = shift @$source_arr;
    push @$result_arr, $current;

    # Leaf: no remaining elements to link to.
    return unless @$source_arr;

    # Compute each cosine once, then sort by the cached values (descending).
    my %cosine_cache = ();
    $cosine_cache{$_->{text}} = cosine($_->{vector}, $current->{vector}) foreach @$source_arr;
    @$source_arr = sort { $cosine_cache{$b->{text}} <=> $cosine_cache{$a->{text}} } @$source_arr;

    $current->{best}  = $source_arr->[-1]->{text};
    $current->{worst} = $source_arr->[0]->{text};

    # Split in place: the first half stays in $worst_arr, the second half is
    # reversed so that the {best} element is processed first.
    my $worst_arr = $source_arr;
    my $best_arr  = [ reverse splice( @$worst_arr, int( @$worst_arr / 2 ) ) ];

    gen_tree($worst_arr, $result_arr) if @$worst_arr;
    gen_tree($best_arr, $result_arr)  if @$best_arr;
    return;
}
