#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use open ":utf8";

use Encode;

binmode(STDIN,':utf8');
binmode(STDOUT,':utf8');

use FindBin;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";

use Getopt::Long;
use Data::Dumper;

use Utils::Words;

use Project;

# Shared Project object, constructed with the load_dicts option switched on.
# It is used further down to obtain normalized forms of phrases.
my $proj = Project->new( { load_dicts => 1 } );

# Progress marker on STDERR, printed once the project object exists.
print STDERR "-----------Ready----------\n";

# First we take phrases whose beginning is itself a harmonized phrase (and remember the rest of the phrase).
# This is easy to check in a sorted file by comparing the current line with the previous one.
# The heads and tails of such phrases are collected into hashes.
# Then, for each tail, the name of its most popular head is recorded as the clusterid.
# Then, for each head, we go through its tails and find the most popular clusterid.
# Finally, the mapping of clusterid to heads and to tails is written into two dicts.

# Accumulators filled while reading the input and while clustering:
#   %head2tail       head phrase => array ref of the tails seen after that head
#   %tailclusterid   tail        => { cost => N, id => HEAD } (best head so far)
#   %clusterid2tail  clusterid   => { tail => accumulated weight }
#   %clusterid2head  clusterid   => { head => count }
#   %headcount       head phrase => number of input lines attributed to it
my %head2tail      = ();
my %tailclusterid  = ();
my %clusterid2tail = ();
my %clusterid2head = ();

my %headcount = ();

my $filename = '/home/apovetkin/harmonization_queries_data_sorted';

# Three-argument open: the read mode is explicit, so the path can never be
# interpreted as a mode or pipe specification. The default layers declared by
# "use open" still apply because no layer is given in the mode. The bareword
# handle F is kept because the reading loop further down reads from it.
open F, '<', $filename
  or die "Could not open file '$filename' $!";
 

# Strip trailing stop words from a phrase.
#
# Takes a phrase string, splits it into whitespace-separated words and drops
# words from the end for as long as Utils::Words::stop4norm() reports the last
# word as a stop word. Stop words at the beginning or in the middle are kept.
#
# Returns the remaining words joined by single spaces; an empty string when
# nothing is left or when the argument is undefined.
sub chompstopwords
{
    my $str = shift;
    return '' unless defined $str;

    # split ' ' (awk mode) skips leading whitespace, so a phrase that starts
    # with a blank does not produce an empty first "word" and therefore no
    # result with a leading space.
    my @words = split ' ', $str;

    pop @words while @words && Utils::Words::stop4norm( $words[-1] );

    return join ' ', @words;
}
my $sorted = 1;

# Previously read phrases that are still prefixes ("heads") of the line being
# processed. The approach relies on the input file being sorted, so that a
# phrase is immediately followed by the phrases that extend it.
my @harmheads;

#read from file
if ($sorted) {
    LINE:
    while ( my $line = <F> ) {
        chomp $line;
        next LINE unless length $line;

        # Only the first tab-separated column (the phrase) is used.
        my ($inputstr) = split "\t", $line;
        next LINE unless defined $inputstr;

        $inputstr = chompstopwords($inputstr);

        # Emptiness is tested with length(), not truthiness, so that the
        # phrase "0" is not mistaken for an empty one.
        next LINE unless length $inputstr;

        my @newharmheads;
        my $biggesthead = '';
        my $smallesttail;    # stays undefined while no head matches

        for my $harmhead (@harmheads) {
            my $harmheadspace = $harmhead . ' ';

            # Keep only heads that are a whole-word prefix of the current
            # phrase and leave a non-empty remainder behind them.
            next
              unless index( $inputstr, $harmheadspace ) == 0
              && length($inputstr) > length($harmheadspace);

            # Remember the longest matching head and the tail it leaves.
            # The tail is non-empty by the length check above, so no further
            # test is needed (a tail of "0" is a valid tail).
            if ( length($harmhead) > length($biggesthead) ) {
                $biggesthead  = $harmhead;
                $smallesttail = substr( $inputstr, length($harmheadspace) );
            }

            push @newharmheads, $harmhead;
        }

        if ( defined $smallesttail ) {
            push @{ $head2tail{$biggesthead} }, $smallesttail;

            # Placeholder entry; the real cluster id is chosen after reading.
            $tailclusterid{$smallesttail} = { cost => 0, id => '' }
              unless exists $tailclusterid{$smallesttail};

            $headcount{$biggesthead}++;
        }

        # The current phrase may itself be a head of the following lines.
        push @newharmheads, $inputstr;
        @harmheads = @newharmheads;
    }

    close F;
}

#assign clusters to tails
# For every tail, pick the head with the highest %headcount value among the
# heads it was seen after; that head's text becomes the tail's cluster id.
# Heads with equal counts are resolved in favour of the lexicographically
# smallest head, so the outcome does not depend on hash iteration order
# (which differs from run to run).
foreach my $head ( keys %head2tail ) {
    my $cost = $headcount{$head};

    foreach my $tail ( @{ $head2tail{$head} } ) {
        my $best = $tailclusterid{$tail};

        next
          unless $best->{cost} < $cost
          || ( $best->{cost} == $cost && $head lt $best->{id} );

        $tailclusterid{$tail} = { cost => $cost, id => $head };
    }
}

#assign clusters to heads
# Each head "votes" for the cluster ids of its tails, every vote weighted by
# the cost stored for that tail; the id with the largest total becomes the
# cluster of the head. The head and all of its tails are then registered
# under that cluster id.
foreach my $head ( keys %head2tail ) {
    my $tails = $head2tail{$head};

    my %tailclusters = ();
    foreach my $tail ( @{$tails} ) {
        my $info = $tailclusterid{$tail};
        $tailclusters{ $info->{id} } += $info->{cost};
    }

    # Largest total first; equal totals are ordered by id text so that the
    # winner does not depend on hash iteration order.
    my ($clusterid) =
      sort { $tailclusters{$b} <=> $tailclusters{$a} || $a cmp $b }
      keys %tailclusters;

    $clusterid2head{$clusterid}->{$head}++;

    # Tail weight within the cluster grows by the popularity of each head
    # that brought the tail in.
    foreach my $tail ( @{$tails} ) {
        $clusterid2tail{$clusterid}->{$tail} += $headcount{$head};
    }
}

# Memoization table: phrase text => its normalized form.
my %norm_phr = ();

# Return the normalized form of a phrase as produced by
# $proj->phrase($str)->norm_phr. The result is cached per input string, so
# the project object is consulted only once for each distinct phrase.
sub norm {
    my ($phrase) = @_;

    return $norm_phr{$phrase} if exists $norm_phr{$phrase};

    my $normalized = $proj->phrase($phrase)->norm_phr;
    $norm_phr{$phrase} = $normalized;

    return $normalized;
}

#delete useless clusters
# Remove every cluster that has at most one tail or at most one head. Such a
# cluster is deleted from both %clusterid2tail and %clusterid2head.
# Takes no arguments; works on the file-level cluster hashes.
sub delete_single {
    for my $id ( keys %clusterid2head ) {
        my $tail_total = keys %{ $clusterid2tail{$id} };
        my $head_total = keys %{ $clusterid2head{$id} };

        # A cluster survives only with several tails AND several heads.
        next if $tail_total > 1 && $head_total > 1;

        delete $clusterid2tail{$id};
        delete $clusterid2head{$id};
    }
}

delete_single();

#leave only best harmonization per norm
# Within each cluster, group the tails by their normalized form and keep only
# the heaviest tail of every group. Tails that are already identical to their
# normalized form are dropped altogether.
foreach my $clusterid ( keys %clusterid2tail ) {
    my $weight_of = $clusterid2tail{$clusterid};

    my %normtail2tail;
    foreach my $tail ( keys %{$weight_of} ) {
        my $normtail = norm($tail);
        next if $tail eq $normtail;
        $normtail2tail{$normtail}->{$tail} += $weight_of->{$tail};
    }

    my %besttails = ();
    foreach my $normtail ( keys %normtail2tail ) {
        my $candidates = $normtail2tail{$normtail};

        # Heaviest first; equal weights are ordered by tail text so that the
        # chosen tail does not depend on hash iteration order.
        my ($besttail) =
          sort { $candidates->{$b} <=> $candidates->{$a} || $a cmp $b }
          keys %{$candidates};

        $besttails{$besttail} = $candidates->{$besttail};
    }

    $clusterid2tail{$clusterid} = \%besttails;
}

# The filtering above can leave clusters with a single tail; prune again.
delete_single();

# Output file for the head => clusterid dictionary.
# The error message names the file that actually failed to open
# (previously it reported the input file name instead).
my $headsfilename = '/home/apovetkin/harmonization_headclusters';
open HEADS, '>', $headsfilename
  or die "Could not open file '$headsfilename' $!";

# Output file for the "clusterid:normalized tail" => tail dictionary.
my $tailsfilename = '/home/apovetkin/harmonization_tailclusters';
open TAILS, '>', $tailsfilename
  or die "Could not open file '$tailsfilename' $!";

#prepare heads hash for writing to file
# Invert %clusterid2head into head => clusterid so that the heads can be
# written out in sorted order.
my %headclusterid = ();
foreach my $clusterid ( keys %clusterid2head ) {
    $headclusterid{$_} = $clusterid for keys %{ $clusterid2head{$clusterid} };
}

# One "head<TAB>clusterid" line per head.
foreach my $head ( sort keys %headclusterid ) {
    print HEADS "$head\t$headclusterid{$head}\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time, so the
# result of close is checked instead of being ignored.
close HEADS
  or die "Could not close file '$headsfilename' $!";
# One "clusterid:normalized tail<TAB>tail" line per remaining tail, ordered by
# cluster id and then by normalized tail.
foreach my $clusterid ( sort keys %clusterid2head ) {
    # The values stored under these keys are the tail weights; they could be
    # written out as well, but they are not.
    my %tails;
    $tails{ norm($_) } = $_ for keys %{ $clusterid2tail{$clusterid} };

    foreach my $norm ( sort keys %tails ) {
        my $harm = $tails{$norm};
        print TAILS "$clusterid:$norm\t$harm\n";
    }
}

# Buffered write errors only surface at close time, so the result of close is
# checked instead of being ignored.
close TAILS
  or die "Could not close file '$tailsfilename' $!";
