#!/usr/bin/perl -w
#категоризация баннеров с помощью landing page и ближайших соседей

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);


# Project handle, created with the load_dicts and load_minicategs_light
# options switched on.
my $proj = Project->new({
    load_minicategs_light => 1,
    load_dicts            => 1,
});


# Worker object that is later given the per-line callback. Its settings are
# stored as plain hash fields on the object.
my $worker = Utils::Worker->new;
@{$worker}{qw(verbose num_processes)} = (1, 20);

# Input file is produced by get_bnrs_no_ctg.py; the output file receives the
# records printed by the process_line callback (presumably -- confirm in
# Utils::Worker).
@{$worker}{qw(file_input file_output)} = (
    "/home/yuryz/scripts/sample/yt/clast/johnyh/bnrs_no_ctg_10000",
    "/home/yuryz/scripts/sample/yt/clast/johnyh/bnrs_ctg_10000",
);

# Per-line callback: looks up one banner, determines its categories and prints
# a tab-separated record for it.
#
# Arguments:
#   $line - one raw input line of the form "<bid>\t<value>". The second field
#           is echoed as the first output column (presumably the unnamed YQL
#           column emitted by get_bnrs_no_ctg.py -- confirm).
#   $fh   - output filehandle passed in by the worker.
#
# Output record (tab-separated, newline-terminated):
#   value, bid, title, body, normalized banner text,
#   categories sorted and joined with "/", url
#
# The line is skipped (early return) when the bid is missing or empty, when
# the bid starts with ".", or when no banner is found for the bid.
$worker->{process_line}     = sub {
    my ($line, $fh) = @_;
    chomp $line;

    my ($bid, $yql_col) = split(/\t/, $line);

    # Blank or malformed line: there is no id to look up.
    return unless defined $bid && length $bid;

    # These banners were already processed earlier (see top_1000 and top_corr).
    return if $bid =~ /^\./;

    # A line without a second field still yields a record, with an empty
    # first column, instead of an "uninitialized value" warning.
    $yql_col = '' unless defined $yql_col;

    my $bnr = $proj->bf->get_banner_by_id($bid);
    return unless $bnr;

    # Read each accessor once, in scalar context, and reuse the values below.
    my $title = $bnr->title;
    my $body  = $bnr->body;
    my $url   = $bnr->url;

    # Banner prefiltering, then normalization of the prefiltered banner text.
    my $bnr_pre = $proj->phrase($title." ".$body)->get_banner_prefiltered_phrase->text;
    my $bnr_nrm = $proj->phrase($bnr_pre)->norm_phr;

    my @bnr_categs;
    # Dots are escaped so that only the literal host "www.ivi.tv" matches.
    if ($url =~ m{^https://www\.ivi\.tv/}) {
        @bnr_categs = ( "Видеосервисы" );
    } else {
        @bnr_categs = $bnr->get_categs_neighbors;
    }

    # A filter that kept only single-category banners (skipping homonyms) is
    # currently disabled, so every banner found is printed:
    #if (@bnr_categs == 1) {
        print {$fh} join("\t",
            $yql_col,
            $bid,
            $title,
            $body,
            $bnr_nrm,
            join("/", sort @bnr_categs),
            $url,
        )."\n";
    #}
};

# Start the worker. Presumably it reads file_input, calls the process_line
# callback for every line and writes to file_output -- confirm in Utils::Worker.
$worker->process_data();
