#!/usr/bin/perl -w
#категоризация с помощью контекста кампаний

use strict;
use utf8;
use open ":utf8";
use Data::Dumper;

binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';


my @camp;
my $camp_prev = "";
while (<STDIN>) { #/home/yuryz/scripts/data/bnrs_10kk.camp
    chomp;

    my @f = split /\t/;
    my $id = $f[0];
    my $camp = $f[1];
    my $text = "$f[3] $f[4]";
    my $url = $f[8];
    my $categ = substr($f[20], 6); #mctgs=
    $categ = "NO_CATEGS" unless $categ;

    my $lang = $f[21];
    next unless $lang eq "lang=ru";

    if ($camp_prev ne $camp) { #другая кампания
        if (@camp) {
            for my $i (0..$#camp) {
                if ($i > 0 && $i < $#camp) {
                    my ($id, $text, $categ) = split /\t/, $camp[$i];
                    if ($categ eq "NO_CATEGS") {
                        my ($id_up, $text_up, $categ_up) = split /\t/, $camp[$i-1];
                        my ($id_down, $text_down, $categ_down) = split /\t/, $camp[$i+1];
                        #if ($categ_up ne "NO_CATEGS") {
                        if ($categ_up ne "NO_CATEGS" && $categ_down ne "NO_CATEGS") {
                        #if ($categ_up ne "NO_CATEGS" && $categ_down ne "NO_CATEGS" && $id_up + 1 == $id && $id + 1 == $id_down) {
                        #if ($categ_up ne "NO_CATEGS" && $categ_up eq $categ_down) {
                            my $simil_up = bnrs_simil_lev($text, $text_up);
                            my $simil_down = bnrs_simil_lev($text, $text_down);
                            $categ = $simil_up >= $simil_down ? $categ_up : $categ_down;
                            print "$camp[$i-1]\n";
                            print join("\t", $id, $text, "*$categ"), "\n";
                            #print join("\t", $id, $text, $categ_up), "\n"; #############
                            print "$camp[$i+1]\n";
                            print "--\n";
                        }
                    }
                }
            }
        }

        @camp = ();
        $camp_prev = $camp;
    }

    push @camp, join("\t", $id, $text, $categ);
}
print @camp+0, "\n";


#--- определение сходства текстов баннеров ---
sub bnrs_simil_lev {
    my ($bnr1_text, $bnr2_text) = @_;

    $bnr1_text = bnr_text_clean($bnr1_text);
    $bnr2_text = bnr_text_clean($bnr2_text);

    return 1-levenshtein($bnr1_text, $bnr2_text)/(length($bnr1_text)+length($bnr2_text));
}


#--- очистка текста баннера ---
sub bnr_text_clean {
    my ($bnr_text) = @_;

    my $valid_chars = qr/0-9a-zа-яё \-/; #допустимые символы
    
    $bnr_text = lc($bnr_text);
    $bnr_text =~ s/[^$valid_chars]/ /g;
    $bnr_text =~ s/^ +//;
    $bnr_text =~ s/ +$//;
    $bnr_text =~ s/ +/ /g;
    
    return $bnr_text;
}


#--- редакционное расстояние ---
sub levenshtein
{
    my ($a, $b)=@_;

    my @bellman;
    my $la=length $a;
    my $lb=length $b;

    $bellman[$_][0]=$_ for 0..$la;
    $bellman[0][$_]=$_ for 0..$lb;

    for my $j(1..$lb)
    {
        for my $i(1..$la)
        {
            $bellman[$i][$j]=$bellman[$i-1][$j]+1;
            $bellman[$i][$j]=$bellman[$i][$j-1]+1
                if $bellman[$i][$j]>$bellman[$i][$j-1]+1;

            if(substr($a, $i-1, 1) eq substr($b, $j-1, 1))
            {
                $bellman[$i][$j]=$bellman[$i-1][$j-1]
                    if $bellman[$i][$j]>$bellman[$i-1][$j-1];
            }
        }
    }
    return $bellman[$la][$lb];
}
