#!/usr/bin/perl -w
# str_dst.pl (16.11.03)       

#       iso_dig     name_rus
# ----------------------------------
# cur_list.txt  1           2
# okv.txt       2           4
# ----------------------------------
# off_list.txt  4           3
# okcm.txt      2           5

# =====================================================================================
$f1 = "cur_list.txt";   $iso_dig1 = 1;  $name_rus1 = 2; #    
$f2 = "okv.txt";        $iso_dig2 = 2;  $name_rus2 = 4; #    

$t = 'B'; #  : 'Q' - q-, 'L' - , 'B' - 
$q = 3; #  q-
# =====================================================================================

$MAX_LEN_STR = 250; # max  
$ADD = 1; #   
$CNG = 1; #   
$DEL = 1; #   

open F, $f1; @f1 = <F>;
open F, $f2; @f2 = <F>;

for (@f1) {
    tr/-/-ߨ/;
    @s1 = split /[\t\r\n]/; #    
    for (@f2) {
        tr/-/-ߨ/;
        @s2 = split /[\t\r\n]/;
        
        if ($s1[$iso_dig1] == $s2[$iso_dig2] &&
                ($s1[$name_rus1] =~ /$s2[$name_rus2]/ || $s2[$name_rus2] =~ /$s1[$name_rus1]/)) {
            $dist = 0;
        } else {
            if ($t eq 'Q') { # q- 
                $dist = qgr_dst($s1[$name_rus1], $s2[$name_rus2]);
                $dist /= length($s1[$name_rus1]) < length($s2[$name_rus2]) ? length($s1[$name_rus1]) : length($s2[$name_rus2]);
            } elsif ($t eq 'L') { #  
                $dist = lev_dst($s1[$name_rus1], $s2[$name_rus2]);
            } elsif ($t eq 'B') { #  
                $dist = bgr_dst($s1[$name_rus1], $s2[$name_rus2]);
                $dist /= length($s1[$name_rus1]) < length($s2[$name_rus2]) ? length($s1[$name_rus1]) : length($s2[$name_rus2]);
            } else {
                exit 0;
            }
        }
        
        if ($s1[$iso_dig1] == $s2[$iso_dig2]) {
            $match = '*';
            $m_dist{$dist}++;
        } else {
            $match = ' ';
        }
        printf "%s\t%s\t%s\t%s\t%s\t%.2f\n",
            $s1[$iso_dig1], $s1[$name_rus1], $s2[$iso_dig2], $s2[$name_rus2], $match, $dist;
    }
    $s1_cnt++;
    printf STDERR "\r%.0f%%", 100 * $s1_cnt / @f1;
}
print STDERR "\n";

open F, ">stat.txt"; #         iso_dig ('*')
for (sort {$a <=> $b} keys %m_dist) {
    printf F "%.2f   %d\n", $_, $m_dist{$_};
}

# -----  q-  -----
sub qgr_dst {
    my ($s1, $s2) = @_;
    my (%qh, $dst);
    
    qgr_cnt($s1, \%qh, 0, \$dst);
    qgr_cnt($s2, \%qh, 1, \$dst);
    
    return $dst;
}

# -----  q- -----
sub qgr_cnt {
    my ($s, $qh, $flag, $dst) = @_;
    
    my $k = $q - length($s);
    $k = 0 if $k < 0;
    $s .= " " x $k; #    
    
    while ($s =~ /(.{$q,$q})/g) { #    q-
        !$flag ? $$qh{$1}++ : $$qh{$1}--;
        !$flag || $$qh{$1} < 0 ? $$dst++ : $$dst--; # 
        pos($s) -= $q - 1;
    }
}

# -----    -----
sub lev_dst {
    my ($str1, $str2) = @_;
    my @p1 = split //, $str1; #    
    my @p2 = split //, $str2;
    
    my $dist_im1;
    my ($dist_i_j, $dist_i_jm1, $dist_j0);
    my ($i, $j);
    
    for ($i = 1, $dist_im1[0] = 0; $i <= $MAX_LEN_STR; $i++) {
        $dist_im1[$i] = $dist_im1[$i-1] + $ADD;
    }
    $dist_j0 = 0;
    
    for ($i = 1; $i <= @p1; $i++) {
        $dist_i_jm1 = $dist_j0 += $DEL;
        for ($j = 1; $j <= @p2; $j++) {
            $dist_i_j = MIN3($dist_im1[$j-1] + IS_EQUAL($p1[$i-1], $p2[$j-1]),
                             $dist_i_jm1     + $ADD,
                             $dist_im1[$j]   + $DEL);
            $dist_im1[$j-1] = $dist_i_jm1;
            $dist_i_jm1 = $dist_i_j;
        }
        $dist_im1[$j-1] = $dist_i_j;
    }

    return $dist_i_j;
}

# -----  min 3-  -----
sub MIN3 {
    my ($x, $y, $z) = @_;
    return $x < $y ? ($x < $z ? $x : $z) : ($y < $z ? $y : $z);
}

# -----  2-  -----
sub IS_EQUAL {
    my ($ch1, $ch2) = @_;
    return $ch1 eq $ch2 ? 0 : $CNG;
}

# -----    -----
sub bgr_dst {
    my ($str1, $str2) = @_;
    my @p1 = split //, $str1; #    
    my @p2 = split //, $str2;
    
    return 100 if @p1 < 2 || @p2 < 2 || abs(@p1-@p2) > 5;
    my $bcnt = @p1 > @p2 ? @p1 : @p2;
    my $ecnt = $bcnt;
    
    for (my $i = 1; $i < @p1; $i++) {
        last if $ecnt - (@p1-$i) - 1 > 5;
        my $flag = $i < @p2 && $p1[$i-1] eq $p2[$i-1] && $p1[$i] eq $p2[$i] ||
                   $i-2 >= 0 && $i-1 < @p2 && $p1[$i-1] eq $p2[$i-2] && $p1[$i] eq $p2[$i-1] ||
                   $i-3 >= 0 && $i-2 < @p2 && $p1[$i-1] eq $p2[$i-3] && $p1[$i] eq $p2[$i-2] ||
                   $i+1 < @p2 && $p1[$i-1] eq $p2[$i] && $p1[$i] eq $p2[$i+1] ||
                   $i+2 < @p2 && $p1[$i-1] eq $p2[$i+1] && $p1[$i] eq $p2[$i+2];
        $ecnt-- if $flag;
    }
    $ecnt-- if $ecnt != $bcnt && $p1[0] eq $p2[0];
    return 100 if $ecnt > 5;
    return 100 * $ecnt / $bcnt;
}
