#!/usr/bin/perl -w
# Homonymy resolution using the "parent-child" relation between categories

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
#use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/wlib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

# File-level project handle; ancestors_add() reaches the category tree through it.
my %project_options = (
    load_dicts            => 1,
    load_minicategs_light => 1,
);
my $proj = Project->new(\%project_options);

#awk -F '\t' '{if($21 ~ /\//)print substr($21,7)}' /home/yuryz/scripts/data/bnrs_1kk_good.camp |sort |uniq -c |sort -k1,1nr -k2,2 |less

# Counters, each weighted by the frequency found on the input line:
#   $total - every line that parsed;
#   $hom   - lines whose categories reduce to exactly one leaf category;
#   $cnt   - lines that list more than two categories.
my $total = 0;
my $hom = 0;
my $cnt = 0;

# Each STDIN line is expected to be "<count> <categ>/<categ>/...", e.g. the
# output of `sort | uniq -c`. Leading space padding is optional.
LINE:
while (my $line = <STDIN>) {
    chomp $line;

    # Capture into lexicals and skip lines that do not parse; relying on
    # $1/$2 after a failed match would silently reuse the previous line's data.
    my ($freq, $categs) = $line =~ m{\A *(\d+) (.+)\z};
    if (!defined $freq) {
        warn "line $.: skipped, expected '<count> <categories>': $line\n";
        next LINE;
    }
    $total += $freq;

    my @categs = split m{/}, $categs;
    $cnt += $freq if @categs > 2;

    # Expand every category to its full path (ancestors first) and drop
    # exact duplicates so that each path is considered once.
    my %seen;
    my @paths = grep { !$seen{$_}++ } ancestors_add(\@categs);

    # Keep only paths that have no descendant among the others. The test is a
    # literal prefix check on "$path/": no regex interpolation (category names
    # may contain metacharacters), and the trailing "/" stops "A/B" from being
    # treated as an ancestor of the unrelated sibling "A/Bc".
    my @leaves;
    for my $path (@paths) {
        my $prefix = "$path/";
        next if grep { index($_, $prefix) == 0 } @paths;

        my @branches = split m{/}, $path;
        push @leaves, $branches[-1];    # last branch of the path
    }

    # Exactly one leaf left: the line is unambiguous. Echo the line followed
    # by the surviving category.
    if (@leaves == 1) {
        $hom += $freq;
        print "$line\n";
        print join("/", sort(@leaves)), "\n";
    }
}

print "$hom\n";
print "$total\n";
print "$cnt\n";


#--- Prepend ancestors to each category ---
# Takes an array ref of category names. Returns a list of the same length and
# order in which every category is replaced by its full path: its ancestors
# (outermost first) followed by the category itself, joined with "/".
# Parents are looked up via $proj->categs_tree->get_minicateg_parent and the
# chain is followed until that call returns a false value.
sub ancestors_add {
    my ($categs) = @_;

    my @categs_ancestors;
    for my $categ (@$categs) {
        # Path from the outermost ancestor down to the category itself.
        my @path = ($categ);
        my $ancestor = $categ;
        while ($ancestor = $proj->categs_tree->get_minicateg_parent($ancestor)) {
            unshift @path, $ancestor;
        }

        # Join the whole chain in one go: a category without ancestors becomes
        # "Categ" rather than "/Categ", so it stays a proper prefix of the
        # paths of its own descendants ("Categ/Child").
        push @categs_ancestors, join("/", @path);
    }

    return @categs_ancestors;
}
