#!/usr/bin/perl -w
#сборка словаря ядер для обучающих баннеров (после ctgs_rank_core.pl)

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';
use Data::Dumper;

# Put the :utf8 layer on all three standard streams so that input is decoded
# and output/diagnostics are encoded consistently.
binmode($_, ":utf8") for (\*STDIN, \*STDOUT, \*STDERR);

# Read the tab-separated core file and aggregate it per (core, category) pair.
# Each line is split into three fields: core, "trash" (space-separated words)
# and category.
my $core_file = "TrainExact_core";

# Three-argument open with a lexical handle; the decoding layer still comes
# from the `use open ':utf8'` pragma at the top of the file. A missing or
# unreadable file is now a fatal error instead of silently yielding no output.
open my $in, '<', $core_file
    or die "Cannot open $core_file: $!";

my %dic;  # $dic{$core}{$ctg}  - space-separated trash words collected for the pair
my %size; # $size{$core}{$ctg} - number of input lines seen for the pair (core cluster size)

while (my $line = <$in>) {
    chomp $line;
    my ($core, $trash, $ctg) = split /\t/, $line;

    # Test for the key's existence, not for the truth of the stored value:
    # an empty (or "0") trash field is a legitimate first value and must not
    # make the pair look unseen, otherwise the cluster size is reset to 1 on
    # every subsequent line.
    if (exists $dic{$core}{$ctg}) {
        # Merge the new words into the stored ones, dropping duplicates and
        # empty tokens (the latter appear when one side has no trash words),
        # and keep the merged list sorted.
        my %seen;
        my @uniq = grep { length $_ && !$seen{$_}++ }
                   split / /, $dic{$core}{$ctg} . " $trash";
        $dic{$core}{$ctg} = join " ", sort @uniq;
        $size{$core}{$ctg}++;
    }
    else {
        # First occurrence of the pair: the trash field is stored verbatim
        # (it is only de-duplicated and sorted once a second line is merged).
        $dic{$core}{$ctg}  = $trash;
        $size{$core}{$ctg} = 1;
    }
}
close $in;

# Emit one line per (core, category) pair, ordered by core and then by
# category, with tab-separated fields:
#   core, cluster size, trash words, category
foreach my $core_key (sort keys %dic) {
    my $trash_by_ctg = $dic{$core_key};
    foreach my $ctg_key (sort keys %$trash_by_ctg) {
        my @fields = (
            $core_key,
            $size{$core_key}{$ctg_key},
            $trash_by_ctg->{$ctg_key},
            $ctg_key,
        );
        print join("\t", @fields), "\n";
    }
}
