#!/usr/bin/perl -w
#снятие омонимии категорий у баннеров

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

# Project handle. The two flags below are passed to the constructor as-is
# (presumably they make it load the dictionaries and the "light"
# mini-category data -- confirm against Project.pm).
my %project_options = (
    load_dicts            => 1,
    load_minicategs_light => 1,
);
my $proj = Project->new({ %project_options });

# Worker object that drives the per-line processing.
my $worker = Utils::Worker->new;

# Alternative data set, kept for reference:
#$worker->{file_input}       = "/home/yuryz/scripts/tatr/uncat_bnrs.tatr.cat";
#$worker->{file_output}      = "/home/yuryz/scripts/tatr/uncat_bnrs.tatr.hom";

# Worker settings, assigned in the same order as individual assignments
# would be: verbose flag, num_processes (presumably the number of parallel
# processes -- confirm against Utils::Worker), input file, output file.
@{$worker}{qw(verbose num_processes file_input file_output)} = (
    1,
    12,
    "/home/yuryz/scripts/tatr/zpart2",
    "/home/yuryz/scripts/tatr/zpart2.hom",
);

# Per-line callback registered on the worker: resolves category ambiguity
# for a single banner.
#
# Arguments:
#   $line - one tab-separated input record:
#           banner id, banner text, candidate categories joined by "/", url
#   $fh   - output handle; one record "id<TAB>text<TAB>category<TAB>url"
#           is printed to it
#
# With a single candidate (or none) that value is written unchanged.
# With several candidates, the mini-categories of every banner of the same
# campaign are counted and the candidate with the highest count wins.
# A candidate that does not occur in the campaign counts as 0, and ties
# keep the earlier candidate.
$worker->{process_line}     = sub {
    my ($line, $fh) = @_;
    chomp $line;

    my ($id, $text, $bnr_categs, $url) = split /\t/, $line;

    # Short or empty lines leave trailing fields undefined. Normalise them
    # to '' so the output is the same but no "uninitialized" warnings fire.
    for my $field ($id, $text, $bnr_categs, $url) {
        $field = '' unless defined $field;
    }

    my @bnr_categs = split m{/}, $bnr_categs;
    my $best_categ = @bnr_categs ? $bnr_categs[0] : '';

    if (@bnr_categs > 1) { # category disambiguation
        # TODO: cache per-campaign frequencies when processing in bulk;
        # the campaign is re-scanned for every ambiguous banner.
        my $bnr      = $proj->bf->get_banner_by_id($id);
        my $campaign = $bnr ? $bnr->campaign_obj : undef;

        my %categ_freq; # category => number of occurrences in the campaign
        if ($campaign) {
            for my $ban (@{ $campaign->bnl || [] }) { # banners of the campaign
                my @ban_categs = $proj->phrase($ban->banner_text_phrase->text)->get_minicategs;
                $categ_freq{$_}++ for @ban_categs;
            }
        } else {
            # NOTE(review): assumes a failed lookup returns a false value --
            # confirm against the banner factory API. Keep the first
            # candidate instead of dying on a method call on undef.
            warn "banner $id: banner or campaign not found, keeping first category\n";
        }

        # Missing entries count as 0, so a candidate that occurs in the
        # campaign can replace a first candidate that never occurs there.
        my $best_freq = $categ_freq{$best_categ} || 0;
        for my $categ (@bnr_categs[1 .. $#bnr_categs]) {
            my $freq = $categ_freq{$categ} || 0;
            next unless $freq > $best_freq;
            ($best_categ, $best_freq) = ($categ, $freq);
        }
    }

    print {$fh} "$id\t$text\t$best_categ\t$url\n";

};

# Start the worker with the settings and process_line callback set above.
$worker->process_data();
