#!/usr/bin/perl -w
#корректировка категорий для кампании

use strict;
use utf8;
use open ":utf8";
use Data::Dumper;

binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

my %categs;
my $sum_freq = 0;
my $max_freq = -1;
my $max_categ; #категория с max частотой

chomp (my @campaign = <STDIN>); #zid1.sel.srt
for my $bnr (@campaign) {
    my ($id, $text, $url, $categ) = split /\t/, $bnr;
    $categs{$categ}++; #частота
    if ($categ ne 'NO_CATEGS' && $max_freq < $categs{$categ}) {
        $max_freq = $categs{$categ};
        $max_categ = $categ;
    }
    $sum_freq++;
}
printf STDERR "max_freq = $max_freq\n";
printf STDERR "max_categ = $max_categ\n";

#--- определение главного кластера категорий ---
my @categs = sort keys %categs;

my %clust_dist2;
my $freq_dist2 = -1;
my $branch_dist2;

my %clust_dist3;
my $freq_dist3 = -1;
my $branch_dist3;

my $H = 0; #энтропия
for my $i (0..$#categs) {
    my @edges = split m{/}, $categs[$i];

    my $dist = 2; #расстояние до корня
    if ($dist <= @edges) {
        my $branch = join "/", @edges[0..$dist-1];
        $clust_dist2{$branch}{NUM}++; #число категорий в кластере
        $clust_dist2{$branch}{FREQ} += $categs{$categs[$i]}; #общая частота кластера
        if ($freq_dist2 < $clust_dist2{$branch}{FREQ}) {
            $freq_dist2 = $clust_dist2{$branch}{FREQ};
            $branch_dist2 = $branch;
        }
    }

    $dist = 3;
    if ($dist <= @edges) {
        my $branch = join "/", @edges[0..$dist-1];
        $clust_dist3{$branch}{NUM}++; #число категорий в кластере
        $clust_dist3{$branch}{FREQ} += $categs{$categs[$i]}; #общая частота кластера
        if ($freq_dist3 < $clust_dist3{$branch}{FREQ}) {
            $freq_dist3 = $clust_dist3{$branch}{FREQ};
            $branch_dist3 = $branch;
        }
    }

    my $p = $categs{$categs[$i]} / $sum_freq;
    $H += -$p*log2($p);
}

printf STDERR "H = %.3f\n", $H;
print STDERR "branch_dist2=$branch_dist2\tnum_dist2=$clust_dist2{$branch_dist2}{NUM}\tfreq_dist2=$freq_dist2\n";
print STDERR "branch_dist3=$branch_dist3\tnum_dist3=$clust_dist3{$branch_dist3}{NUM}\tfreq_dist3=$freq_dist3\n";

my $branch_main = $branch_dist2; #ветвь главного кластера
if ($branch_dist3 =~ /^\Q$branch_dist2\E/ && ($clust_dist3{$branch_dist3}{NUM} / $clust_dist2{$branch_dist2}{NUM} > 0.5 || !$categs{$branch_dist2})) {
    $branch_main = $branch_dist3;
}
print STDERR "branch_main=$branch_main\n";

#--- корректировка категорий кластера ---
my $MIN_FREQ = 2; #минимальная частота категории при корректировке
my $good_ind = -1;
my $bad_ind;
my $flag = 'GOOD';
while (1) {
    if ($flag eq 'GOOD') {
        last if $good_ind + 1 > $#campaign;
        my ($id, $text, $url, $categ) = split /\t/, $campaign[$good_ind + 1];
        if ($categ =~ /^\Q$branch_main\E/) {
            $good_ind++;
        } else {
            $bad_ind = $good_ind + 1;
            $flag = 'BAD';
        }
    } elsif ($flag eq 'BAD') {
        if ($bad_ind + 1 > $#campaign) {
            $flag = 'CORRECT';
            next;
        }
        my ($id, $text, $url, $categ) = split /\t/, $campaign[$bad_ind + 1];
        unless ($categ =~ /^\Q$branch_main\E/) {
            $bad_ind++;
        } else {
            $flag = 'CORRECT';
        }
    } else { #корректировка категорий
        my $tmp_ind = $good_ind >= 0 ? $good_ind : $bad_ind + 1;
        my ($id1, $text1, $url1, $categ1) = split /\t/, $campaign[$tmp_ind];

        $tmp_ind = $bad_ind < $#campaign ? $bad_ind + 1 : $good_ind;
        my ($id2, $text2, $url2, $categ2) = split /\t/, $campaign[$tmp_ind];

        for (my $i = $good_ind + 1; $i <= $bad_ind; $i++) {
            my ($id, $text, $url, $categ) = split /\t/, $campaign[$i];

            my $simil1 = bnrs_simil($text, $text1);
            my $simil2 = bnrs_simil($text, $text2);
            $categ = $simil1 >= $simil2 ? $categ1 : $categ2;

            $categ = $max_categ if $categs{$categ} <= $MIN_FREQ;

            print "$campaign[$i]\n";
            print join("\t", $id, $text, $url, $categ)."\n";
            print "--\n";
        }

        $good_ind = $bad_ind + 1;
        $flag = 'GOOD';
    }
}


#--- log2 ---
sub log2 {
    my $n = shift;
    return log($n)/log(2);
}


#--- определение сходства текстов баннеров ---
sub bnrs_simil {
    my ($bnr_text1, $bnr_text2) = @_;

    my $dict1 = bnr_dict($bnr_text1);
    my $dict2 = bnr_dict($bnr_text2);
    ($dict1, $dict2) = ($dict2, $dict1) if keys %{$dict1} > keys %{$dict2};

    my $simil = 0; #сходство
    for (keys %{$dict1}) {
        if ($$dict2{$_}) {
            $simil += $$dict1{$_} < $$dict2{$_} ? $$dict1{$_} : $$dict2{$_};
        }
    }

    return $simil;
}


#--- частотный словарь баннера ---
sub bnr_dict {
    my ($bnr_text) = @_;

    my $valid_chars = qr/0-9a-zа-яё \-/; #допустимые символы

    my %dict;
    
    $bnr_text = lc($bnr_text);
    $bnr_text =~ s/\.[$valid_chars]+(?=[^$valid_chars])//g; #чистка
    $bnr_text =~ s/[^$valid_chars]/ /g;
    $bnr_text =~ s/^ +//;
    $bnr_text =~ s/ +$//;
    $bnr_text =~ s/ +/ /g;

    map { $dict{$_}++ } split / /, $bnr_text;
    
    return \%dict;
}
