#!/usr/bin/perl -w
#ранжирование эталонных баннеров на основе близости к лексическому ядру категории

use strict;

use utf8;
use open ":utf8";
use Data::Dumper;

binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';


# Dictionary of per-category lexical cores; the original note names
# dict_make.pl as its producer.  Layout, as consumed below:
#   ctg_id <TAB> size            -- category header
#   <empty> <TAB> word <TAB> freq -- repeated `size` times
# The default ":utf8" layer from `use open` still applies to this handle.
my $dict_path = $ARGV[0];
die "Usage: $0 <dict_file> < banners\n" unless defined $dict_path;
open my $dict_fh, '<', $dict_path
    or die "Cannot open dictionary '$dict_path': $!";

my $ctg_id_curr = "";   # category whose core is currently loaded
my %core;               # lexical core of the current category: word => freq
my $cnt = 0;            # number of input lines read so far

# STDIN (the original note names bnr_text_norm_s): one banner per line,
#   bid <TAB> normalized text (space-separated words) <TAB> ctg_id
# The dictionary is only ever read forward, so categories must appear on
# STDIN in the same order as in the dictionary file.
while (my $line = <STDIN>) {
    chomp $line;
    ++$cnt;

    my ($bid, $bnr_nrm, $ctg_id) = split /\t/, $line;
    unless (defined $ctg_id) {
        warn "Skipping malformed input line $.: "
           . "expected bid<TAB>text<TAB>ctg_id\n";
        next;
    }
    print STDERR "$cnt\t$ctg_id\n" if $cnt % 10000 == 0; # progress indicator

    # Advance through the dictionary until the core of this banner's
    # category is loaded.  Hitting EOF here used to spin forever; now it
    # is reported as an error.
    while ($ctg_id_curr ne $ctg_id) {
        my $header = <$dict_fh>;
        die "Category '$ctg_id' not found in dictionary '$dict_path' "
          . "(end of file reached)"
            unless defined $header;
        chomp $header;

        # 0 - ctg_id, 1 - size (number of words in this category's core)
        my ($dict_ctg_id, $size) = split /\t/, $header;
        $ctg_id_curr = defined $dict_ctg_id ? $dict_ctg_id : "";
        $size = 0 unless defined $size;

        %core = ();
        for (1 .. $size) {
            my $entry = <$dict_fh>;
            die "Dictionary '$dict_path' is truncated inside category "
              . "'$ctg_id_curr'"
                unless defined $entry;
            chomp $entry;

            # 0 - empty, 1 - word, 2 - freq
            my (undef, $word, $freq) = split /\t/, $entry;
            $core{$word} = $freq if defined $word;
        }
    }

    my @words = split / /, $bnr_nrm;

    # Core frequency of every banner word; words missing from the core
    # contribute 0 instead of undef.
    my @freqs = map { defined $core{$_} ? $core{$_} : 0 } @words;

    my $weight = 0;
    $weight += $_ for @freqs;
    if ($weight) {
        # Entropy of the distribution of the banner's weight over its words.
        my $ent = 0;
        for my $freq (@freqs) {
            my $share = $freq / $weight;
            # A zero share adds nothing (0 * log 0 is taken as 0), and
            # log() of a non-positive number is fatal in Perl.
            next if $share <= 0;
            $ent -= $share * log2($share);
        }

        $weight /= @words;                 # average over the words
        $weight *= log2(scalar @words);    # account for the word count
        $weight *= $ent;
    }
    $weight = sprintf("%.2f", $weight);

    print "$bid\t$ctg_id\t$weight\n";
}

close $dict_fh;


#--- binary logarithm ---
# Returns the base-2 logarithm of its single numeric argument, computed
# as the ratio of natural logarithms.
sub log2 {
    my ($value) = @_;
    my $ln_two = log(2);
    return log($value) / $ln_two;
}
