#!/usr/bin/perl -w
#построение кластеров для кампаний

use strict;
use utf8;
use open ":utf8";
use Data::Dumper;

binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';


my %categs;
my $freq_total;
my $camp_prev = "";
while (<STDIN>) { #/home/yuryz/scripts/data/bnrs_1kk_good.camp
    chomp;
    my ($camp, $id, $text, $url, $categ) = split /\t/;

    if ($camp_prev ne $camp) { #разные кампании
        print "'$camp_prev'\t$freq_total\n==========\n" if %categs;
        cluster_build(\%categs) if %categs;
        %categs = ();
        $freq_total = 0;
        $camp_prev = $camp;
    }

    $categs{$categ}++; #частота категории
     unless ($categ =~ /NO_CATEGS/) {
        $freq_total++;
     }
}
print "'$camp_prev'\t$freq_total\n==========\n" if %categs;
cluster_build(\%categs);


#--- построение кластеров для кампании ---
sub cluster_build {
    my ($categs) = @_;

    my %clust_dist2;
    my $dist = 2; #расстояние до корня
    for (keys %$categs) {
        my @edges = split m{/}, $_;
        my $bound = $dist <= @edges ? $dist-1 : $#edges;
        my $branch = join "/", @edges[0..$bound];
        $clust_dist2{$branch}{FREQ} += $categs{$_}; #общая частота кластера
        $clust_dist2{$branch}{CAT}++; #число категорий в кластере
    }

    my $freq_cumul = 0; #накопленная частота
    my $flag = 1;
    my $freq_max = 0;
    my $prod_max = 0;
    for (sort { $clust_dist2{$b}{FREQ} <=> $clust_dist2{$a}{FREQ} || $clust_dist2{$a} cmp $clust_dist2{$b} } keys %clust_dist2) {
        if ($flag) {
            $freq_max = $clust_dist2{$_}{FREQ};
            $prod_max = $clust_dist2{$_}{FREQ} * $clust_dist2{$_}{CAT};
            $flag = 0;
        }

        unless ($_ =~ /NO_CATEGS/) {
            $freq_cumul += $clust_dist2{$_}{FREQ};
            #my $frac = sprintf "%.2f", $freq_cumul / $freq_total;
            my $frac = sprintf "%.2f", $clust_dist2{$_}{FREQ} / $freq_max;
            my $prod = $clust_dist2{$_}{FREQ} * $clust_dist2{$_}{CAT};
            my $frac2 = sprintf "%.2f", $prod / $prod_max;
            print "$_\t$clust_dist2{$_}{FREQ}\t$frac\t($clust_dist2{$_}{CAT})=$prod\t$frac2\n";
        } else {
            print "$_\t$clust_dist2{$_}{FREQ}\t***\n";
        }
    }
    print "\n";
}
