#!/usr/bin/perl -w
# Builds the dictionary of semantic cores for the training banners.

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';
use Data::Dumper;

# Put the three standard streams into UTF-8 mode.
for my $stream (\*STDIN, \*STDOUT, \*STDERR) {
    binmode($stream, ":utf8");
}

# Enable autoflush on the currently selected output handle.
$| = 1;

# Load the per-category word weights from "Train_dict_tf".
# Expected line format: <category> TAB <word> <tf_idf> <word> <tf_idf> ...
# (space-separated pairs; per the original note they are listed in
# descending tf_idf order). Result: $dic{category}{word} = tf_idf.
open my $dict_fh, '<', "Train_dict_tf"
    or die "Cannot open Train_dict_tf: $!";
my %dic;
while (my $line = <$dict_fh>) {
    chomp $line;
    my ($ctg, $wrds) = split /\t/, $line;

    # Blank or malformed lines carry no word list; nothing to store.
    next unless defined $wrds;

    my @wrds = split / /, $wrds;

    # Walk the list pairwise. Stopping before the last index guarantees that
    # $wrds[$i+1] exists, so a dangling word without a weight is ignored
    # instead of being stored with an undefined tf_idf.
    for (my $i = 0; $i < $#wrds; $i += 2) {
        $dic{$ctg}{$wrds[$i]} = $wrds[$i+1];
    }
}
close $dict_fh;

# Read the training banners from "TrainExact_norm" and, for every category
# of every banner, print one line: <core> TAB <other words> TAB <category>.
# Expected input line: <tnorm> TAB <bid> TAB <categories joined by "/"> TAB <bnorm>
open my $train_fh, '<', "TrainExact_norm"
    or die "Cannot open TrainExact_norm: $!";
while (my $line = <$train_fh>) {
    chomp $line;
    my ($tnorm, $bid, $mctgs, $bnorm) = split /\t/, $line;

    # Without a category column there is nothing to emit for this line.
    next unless defined $mctgs;

    # split drops empty trailing fields, so an empty last column arrives
    # as undef; treat it as an empty string.
    $bnorm = '' unless defined $bnorm;

    my @wrds = split / /, "$tnorm $bnorm";

    my @ctgs = split m{/}, $mctgs;
    for my $ctg (@ctgs) {
        $ctg =~ s/^ +//;

        # Fetch the category's weight table once. Reading only one hash
        # level avoids autovivifying $dic{$ctg} for unknown categories.
        my $tfidf_of = $dic{$ctg} || {};

        # Keep only the words that have a (non-zero) tf_idf in this category.
        my %weight_of;
        for my $wrd (@wrds) {
            $weight_of{$wrd} = $tfidf_of->{$wrd} if $tfidf_of->{$wrd};
        }

        # Highest tf_idf first; ties are broken alphabetically.
        my @ranked = sort { $weight_of{$b} <=> $weight_of{$a} || $a cmp $b }
                     keys %weight_of;

        # The semantic core is the top two words. splice returns fewer
        # elements when fewer are available, so the core never contains
        # undefined entries; whatever remains in @ranked is the "trash".
        my @core_words = splice @ranked, 0, 2;
        my $core  = join " ", @core_words;
        my $trash = join " ", @ranked;

        print "$core\t$trash\t$ctg\n";
    }
}
close $train_fh;
