#!/usr/bin/perl -w
#снятие омонимии с помощью частотных критериев

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
#use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/wlib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

my $proj = Project->new({ 
    load_dicts => 1,
    load_minicategs_light => 1,
});

my $MIN_CAMP = 200; #min число баннеров в кампании

my @camp;
my %camp;
my $camp_prev = "";
while (<STDIN>) { #/home/yuryz/scripts/data/bnrs_10kk.camp
    chomp;

    my @f = split /\t/;
    my $id = $f[0];
    my $camp = $f[1];
    my $text = "$f[3] $f[4]"; #title+body
    my $url = $f[8];
    my $categ = substr($f[20], 6); #mctgs=
    $categ = "NO_CATEGS" unless $categ;

    my $lang = $f[21];
    next unless $lang eq "lang=ru";

    if ($camp_prev ne $camp) { #другая кампания
        if (@camp >= $MIN_CAMP) {
            for my $i (0..$#camp) {
                my ($id, $text, $categ, $url) = split /\t/, $camp[$i];
                my @ctgs = split m{/}, $categ;
                if (@ctgs > 1) { #омонимия
                    print "$id\t$text\t$categ\n";
                    my @ctgs = split m{/}, $categ;
                    for my $ctg (@ctgs) {
                        #print "\t$ctg\t$camp{$ctg}\n";
                        print "\t$ctg\t$camp{$ctg}\t", sprintf("%.3f\n", simil_lev($ctg, $f[3]));
                    }

                    my $bnr = $proj->bf->get_banner_by_id($id);
                    my $phrs = $bnr->phl; #список фраз баннера
                    for my $phr (@$phrs) {
                        my $ctgs1 = $proj->phrase($phr)->get_minicategs;
                        my $ctgs2 = $proj->phrase($phr)->get_minicategs_snippets;
                        #print "\t[$phr]\n";
                        print "\t[$phr]";
                        #print "\t", join("/", @{$ctgs1}), "\n" if $ctgs1;
                        print "\t", $ctgs1, "\n" if $ctgs1;
                        print "\n";
                    }
                }
            }
        }

        @camp = ();
        %camp = ();
        $camp_prev = $camp;
    }

    my @ctgs = split m{/}, $categ;
    for my $ctg (@ctgs) {
        $camp{$ctg}++;
    }
    push @camp, join("\t", $id, $text, $categ, $url);
}
print @camp+0, "\n";


#--- добавление предков для категорий ---
sub ancestors_add {
    my ($categs) = @_;

    my @categs_ancestors;
    for my $categ (@$categs) {
        my @ancestors; #все предки категории
        my $ancestor = $categ;
        while ($ancestor = $proj->categs_tree->get_minicateg_parent($ancestor)) {
            push @ancestors, $ancestor;
        }
        push @categs_ancestors, join("/", reverse @ancestors)."/$categ";
    }

    return @categs_ancestors;
}


#--- очистка строки текста ---
sub text_clean {
    my ($text) = @_;

    my $valid_chars = qr/0-9a-zа-яё \-/; #допустимые символы
    
    $text = lc($text);
    $text =~ s/(?<=[\[\/])\.[$valid_chars]+?(?=[\]\/])/ /g; #атомы
    $text =~ s/-[$valid_chars]+?(?=( |[^$valid_chars]|$))/ /g; #минус слова
    $text =~ s/[^$valid_chars]/ /g;
    $text =~ s/^ +//;
    $text =~ s/ +$//;
    $text =~ s/ +/ /g;
    
    return $text;
}


#--- редакционное расстояние ---
sub levenshtein
{
    my ($a, $b)=@_;

    my @bellman;
    my $la=length $a;
    my $lb=length $b;

    $bellman[$_][0]=$_ for 0..$la;
    $bellman[0][$_]=$_ for 0..$lb;

    for my $j(1..$lb)
    {
        for my $i(1..$la)
        {
            $bellman[$i][$j]=$bellman[$i-1][$j]+1;
            $bellman[$i][$j]=$bellman[$i][$j-1]+1
                if $bellman[$i][$j]>$bellman[$i][$j-1]+1;

            if(substr($a, $i-1, 1) eq substr($b, $j-1, 1))
            {
                $bellman[$i][$j]=$bellman[$i-1][$j-1]
                    if $bellman[$i][$j]>$bellman[$i-1][$j-1];
            }
        }
    }
    return $bellman[$la][$lb];
}


#--- определение сходства текстов ---
sub simil_lev {
    my ($text1, $text2) = @_;

    $text1 = text_clean($text1);
    $text2 = text_clean($text2);

    return 1-levenshtein($text1, $text2)/(length($text1)+length($text2));
}
