#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use open ":utf8";

binmode(STDIN,':utf8');
binmode(STDOUT,':utf8');


use FindBin;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";

use Getopt::Long;
use Data::Dumper;

use Utils::Words;

use Project;

# Project object, created with load_dicts enabled. The lookup loop further
# down the file calls ->phrase(...)->norm_phr on it to normalize input.
my $proj = Project->new({
        load_dicts => 1,
    });


# Cluster data file. Every non-empty line is split on TAB into
#   clusterid, headstr, tailstr
# headstr and tailstr are comma-separated lists whose entries are split on
# ':' into (norm, text, count).
my $filename = '/home/apovetkin/harmonization_tails_data';

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and the handle is not a package global.
# No layer is named here, so the default set by `use open ":utf8"` applies.
open my $data_fh, '<', $filename
  or die "Could not open file '$filename' $!";

# Lookup tables filled while reading the file.
my %head2clusterid = ();      # never filled: its assignment below is commented out
my %norm2clusterid = ();      # norm head => { head => clusterid }
my %tail2clusterid = ();      # tail => [ clusterid, ... ]
my %clusterid2normtails = (); # clusterid => { norm tail => { tail => count } }
my %clusterid2tails = ();     # never filled: its assignment below is commented out
my %normtail2tail = ();       # norm tail => [ "tail count", ... ]
my $cnt = 0;

while (my $line = <$data_fh>) {
    chomp $line;
    next unless $line;

    my ($clusterid, $headstr, $tailstr) = split /\t/, $line;

    # A line with fewer than three columns leaves these undefined; treat a
    # missing column as an empty list instead of splitting undef.
    $headstr = '' unless defined $headstr;
    $tailstr = '' unless defined $tailstr;

    for my $entry (split /,/, $headstr) {
        # The third field (count) of a head entry is not used.
        my ($norm, $head) = split /:/, $entry;
        # $head2clusterid{$head} = $clusterid;
        $norm2clusterid{$norm}->{$head} = $clusterid;
    }

    for my $entry (split /,/, $tailstr) {
        my ($norm, $tail, $count) = split /:/, $entry;
        # $clusterid2tails{$clusterid}->{$tail} = $count;
        $clusterid2normtails{$clusterid}->{$norm}->{$tail} = $count;
        push @{ $tail2clusterid{$tail} }, $clusterid;
        push @{ $normtail2tail{$norm} }, $tail . ' ' . $count;
    }

    # Progress report on the 1st, 100001st, ... processed line. The
    # condition increments $cnt before the message is interpolated.
    print STDERR " loading $cnt\n" unless $cnt++ % 100000;
}

# The data is fully in memory now; release the handle.
close $data_fh
  or warn "Could not close file '$filename' $!";

print "\n--------------READY--------------\n";

# Interactive lookup loop.
#
# Each STDIN line is split on ':' into a head and an optional tail. The head
# must be present in %norm2clusterid both by its normalized form and by its
# exact text. The tails stored for the matching cluster are then printed,
# highest count first. When a tail is given, only stored tails whose
# normalized key equals the normalized input tail are listed.
while (my $row = <STDIN>) {
    chomp $row;

    my ($rhead, $rtail) = split /:/, $row;

    # An empty line (or one starting with ':') has no head to look up.
    # Report it with the usual message instead of normalizing nothing.
    unless (defined $rhead && length $rhead) {
        print "\nno normalized head, next\n";
        next;
    }

    # Test for presence explicitly, so that a literal "0" tail still counts
    # as a tail and a missing tail is never handed to the normalizer.
    my $has_tail = defined $rtail && length $rtail;

    my $rnormhead = $proj->phrase($rhead)->norm_phr;
    my $rnormtail = $has_tail ? $proj->phrase($rtail)->norm_phr : undef;

    unless (exists $norm2clusterid{$rnormhead}) {
        print "\nno normalized head, next\n";
        next;
    }

    # Disabled alternative mode: list tails for every head that shares the
    # normalized form, not only for the exact head. Left switched off.
    if (0) {
        my %harmphrs;
        for my $harmhead (keys %{ $norm2clusterid{$rnormhead} }) {
            my $clusterid = $norm2clusterid{$rnormhead}->{$harmhead};
            print "clusterid: $clusterid\n";

            my $tails_of = $clusterid2normtails{$clusterid} || {};
            for my $normtail (keys %$tails_of) {
                next if $has_tail && $normtail ne $rnormtail;
                for my $harmtail (keys %{ $tails_of->{$normtail} }) {
                    $harmphrs{ join(' => ', $harmhead, $harmtail) }
                        = $tails_of->{$normtail}->{$harmtail};
                }
            }
        }

        my @best = sort { $harmphrs{$b} <=> $harmphrs{$a} || $a cmp $b }
                   keys %harmphrs;

        unless (@best) {
            print "\n no tails for this normalized head\n";
        }
        else {
            print "\nnormalized head best: " . $best[0] . "\ntop:\n";
            print join("\n", map { join(' => ', $_, $harmphrs{$_}) } firstten(@best));
        }
    }

    unless (exists $norm2clusterid{$rnormhead}->{$rhead}) {
        print "\nno intact head, next\n";
        next;
    }

    my $clusterid = $norm2clusterid{$rnormhead}->{$rhead};

    print "\nclusterid: $clusterid\n";

    # "head => tail" phrase => count, for every tail that passes the filter.
    my %harmphrs;
    my $tails_of = $clusterid2normtails{$clusterid} || {};
    for my $normtail (keys %$tails_of) {
        next if $has_tail && $normtail ne $rnormtail;
        for my $harmtail (keys %{ $tails_of->{$normtail} }) {
            $harmphrs{ join(' => ', $rhead, $harmtail) }
                = $tails_of->{$normtail}->{$harmtail};
        }
    }

    # Highest count first. Equal counts are ordered by phrase, so the output
    # does not depend on hash iteration order.
    my @best = sort { $harmphrs{$b} <=> $harmphrs{$a} || $a cmp $b }
               keys %harmphrs;

    unless (@best) {
        print "\n no tails for this intact head\n";
        next;
    }

    print "\nintact head best: " . $best[0] . "\ntop:\n";
    print join("\n", map { join(' => ', $_, $harmphrs{$_}) } firstten(@best));
    print "\n";
}

# firstten(@items)
#
# Returns the leading elements of @items, at most 20 of them, in their
# original order. Despite the name, the cap in this code is 20, not 10.
# Undefined elements are dropped; defined values such as "0" are kept.
# In scalar context the number of returned elements is produced.
sub firstten {
    my @arr = @_;

    my $limit = 20;

    # Index of the last element to keep. This is -1 for an empty list,
    # which makes the range below empty.
    my $last = $#arr < $limit - 1 ? $#arr : $limit - 1;

    return grep { defined } @arr[0 .. $last];
}

