#!/usr/bin/perl -w
#снятие омонимии с помощью контекста кампаний и лэндингов

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
#use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/wlib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

my $proj = Project->new({ 
    load_dicts => 1,
    load_minicategs_light => 1,
});

my $MIN_CAMP = 200; #min число баннеров в кампании

my @camp;
my %camp;
my $camp_prev = "";
while (<STDIN>) { #/home/yuryz/scripts/data/bnrs_10kk.camp
    chomp;

    my @f = split /\t/;
    my $id = $f[0];
    my $camp = $f[1];
    my $text = "$f[3] $f[4]";
    my $url = $f[8];
    my $categ = substr($f[20], 6); #mctgs=
    $categ = "NO_CATEGS" unless $categ;

    my $lang = $f[21];
    next unless $lang eq "lang=ru";

    if ($camp_prev ne $camp) { #другая кампания
        if (@camp >= $MIN_CAMP) {
            my $ctg_freq = (sort { $camp{$b} <=>  $camp{$a} || $a cmp $b } keys %camp)[0]; #самая частотная категория
            my @ctg_freq = ancestors_add([$ctg_freq]); #анонимный массив
            print "***$camp\t", @camp+0, "\t$ctg_freq[0]\n";
            for my $i (0..$#camp) {
                my ($id, $text, $categ, $url) = split /\t/, $camp[$i];
                my @ctgs = split m{/}, $categ;
                if (@ctgs > 1) { #омонимия
                    print "\t$id\t$text\t$categ";
                    my @cat_land = $proj->page($url)->get_minicategs; #категоризация по landing page
                    if (@cat_land) {
                        my @ancest_land = ancestors_add(\@cat_land);
                        my $cat_land = join "\t", @ancest_land;
                        #my $cat_land = join "/", @cat_land;
                        print "\t[$cat_land]";
                    }
                    print "\n";
                    my @ancestors = ancestors_add(\@ctgs);
                    for my $i (0..$#ctgs) {
                        print "\t$camp{$ctgs[$i]}\t$ancestors[$i]\n";
                    }
                }
            }
        }

        @camp = ();
        %camp = ();
        $camp_prev = $camp;
    }

    my @ctgs = split m{/}, $categ;
    for my $ctg (@ctgs) {
        $camp{$ctg}++;
    }
    push @camp, join("\t", $id, $text, $categ, $url);
}
print @camp+0, "\n";


#--- добавление предков для категорий ---
sub ancestors_add {
    my ($categs) = @_;

    my @categs_ancestors;
    for my $categ (@$categs) {
        my @ancestors; #все предки категории
        my $ancestor = $categ;
        while ($ancestor = $proj->categs_tree->get_minicateg_parent($ancestor)) {
            push @ancestors, $ancestor;
        }
        push @categs_ancestors, join("/", reverse @ancestors)."/$categ";
    }

    return @categs_ancestors;
}
