#!/usr/bin/perl -w
# Select unique phrases with TF*IDF

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

use FindBin;
use lib "$FindBin::Bin/../lib";
#use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/wlib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

# Project handle used for the per-category phrase lookups below.
# NOTE(review): the two load_* flags are handed to Project->new unchanged;
# presumably they preload the dictionaries and the light mini-category data
# that get_category_phrases depends on -- confirm against Project.
my $proj = Project->new({
    load_dicts            => 1,
    load_minicategs_light => 1,
});

# Filter STDIN (tab-separated lines; the first two columns are the category
# and the phrase, the input is referred to as zsrt.tfidf) and echo only the
# lines whose phrase is NOT among the phrases that get_category_phrases
# returns for that line's category.
#
# The phrase set is rebuilt every time the category differs from the one on
# the previous line, so input grouped by category costs one lookup per
# category.
my $ctg_prev = "";
my %ctg_phrs;

# Read into a lexical instead of $_: while (<STDIN>) assigns to the global $_
# without localizing it, so any code reached through $proj that also assigns
# $_ would silently replace the line before it is printed.
while (my $line = <STDIN>) {
    chomp $line;

    # Only the category and phrase columns are inspected; the remaining
    # columns are passed through untouched as part of $line.
    my ($ctg, $br_mod) = split /\t/, $line;

    # Blank or short lines leave these undefined. Treat them as empty strings
    # so they compare and hash the same way, without "uninitialized value"
    # warnings under -w.
    $ctg    = '' unless defined $ctg;
    $br_mod = '' unless defined $br_mod;

    if ($ctg_prev ne $ctg) {    # category changed: rebuild the phrase set
        %ctg_phrs = ();
        my $category_phrases = $proj->phrase($ctg)->get_category_phrases;
        for my $category_phrase (@$category_phrases) {
            $ctg_phrs{$category_phrase} = 1;
        }
        $ctg_prev = $ctg;
    }

    # Keep only phrases that the category does not already contain.
    # (Swap "unless" for "if" to emit the already-known phrases instead.)
    print "$line\n" unless $ctg_phrs{$br_mod};
}
