#!/usr/bin/perl -w
# Categorizes a banner using its landing page and its nearest neighbours.

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

# Banner to categorize: optional first command-line argument; with no
# argument the sample id that used to be hard-coded here is used, so the
# default run is unchanged. @ARGV itself is left unmodified.
# Other sample ids: 1913691, 5362327619, 5732231575, 4039160808, 4106697144
# (https://yt.yandex-team.ru/hahn/#page=navigation&path=//home/catalogia/users/johnyh/not_categotized_texts_2018-05-07&offsetMode=row)
my $bid = @ARGV ? $ARGV[0] : 4276009278;

# Reject a malformed id before the project (and its dictionaries) is set up.
die "Usage: $0 [banner_id]\n" unless $bid =~ /\A[0-9]+\z/;

my $proj = Project->new({
    load_dicts => 1,
    load_minicategs_light => 1,
});

my $bnr = $proj->bf->get_banner_by_id($bid);
# Fail loudly: a silent successful exit is indistinguishable from a banner
# that simply produced no output.
die "Banner $bid not found\n" unless $bnr;

# Landing-page categorization: the URL first, then one ">"-prefixed
# minicategory per line.
my $url = $bnr->url;
print "$url\n";
print ">$_\n" for $proj->page($url)->get_minicategs;

# Summary line: id, title, body, url and the sorted, "/"-joined categories
# returned by get_categs_neighbors (presumably the nearest-neighbour
# categorization named in the file header -- confirm against BM code).
my @bnr_categs = $bnr->get_categs_neighbors;
print "$bid\t", $bnr->title, "\t", $bnr->body, "\t", $bnr->url, "\t", join("/", sort @bnr_categs), "\n";
#print "$bid\t", $bnr->title, "\t", $bnr->body, "\t", join("/", sort @bnr_categs), "\n";

=z
# NOTE: everything from "=z" down to "=cut" is a POD block, so perl skips it;
# it is kept only as disabled experimental code (categorization by intent and
# by brand+model).
my $h = $bnr->get_intent; #reference to a hash with an "intent" key
my $phr = $proj->phrase($$h{intent});
if ($phr) {
    print "intent=$phr\n";
    print ".$_\n" for $phr->get_minicategs; #categorization by intent
    print "..$_\n" for $phr->get_minicategs_snippets; #categorization by intent
}

my ($brand, $model) = $bnr->parse; #list: (brand, model)
my $br_mod;
if ($brand && $model) {
    $br_mod = "$brand $model";
} elsif ($brand) {
    $br_mod = $brand;
} elsif ($model) {
    $br_mod = $model;
} else {
    $br_mod = "";
}
$phr = $proj->phrase($br_mod);
if ($phr) {
    print "brand=$phr\n";
    print "...$_\n" for $phr->get_minicategs; #categorization by brand+model
    print "....$_\n" for $phr->get_minicategs_snippets; #categorization by brand+model
}
=cut
