#!/usr/bin/perl -w
# Categorize banners using get_minicategs

use strict;
use utf8;
use open ':utf8';
no warnings 'utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#use FindBin;
#use lib "$FindBin::Bin/../lib";
use lib "/home/yuryz/arcadia/rt-research/broadmatching/scripts/lib";

use Utils::Common;
use Project;
use BM::Phrase;
use BM::PhraseList;
use Time::HiRes qw(tv_interval gettimeofday);

# Build the project context used for banner parsing and categorization.
# The two flags are passed to Project->new as-is; judging by their names
# they request loading of the dictionaries and of the lightweight
# minicategs data -- NOTE(review): confirm against Project.
my $proj = Project->new({
    load_dicts => 1,
    load_minicategs_light => 1,
});


# Input lines whose banner gets no minicategs (and has a URL) are copied
# verbatim into this file, created in the current working directory.
# The :utf8 layer is spelled out so the handle keeps the same encoding the
# file-wide "use open ':utf8'" pragma used to apply implicitly.
my $uncat_file = 'uncat_bnrs';
open(my $uncat_fh, '>:utf8', $uncat_file)
    or die "Cannot open '$uncat_file' for writing: $!";

# Main loop: one serialized banner per STDIN line.
# (The original author's sample input was
# /home/yuryz/scripts/data/uncat_bnrs_1kk.active.)
# A lexical $line is used instead of the global $_ so that project code
# called below cannot clobber the text later written to $uncat_file.
while (my $line = <STDIN>) {
    chomp $line;

    my $bnr      = $proj->bf->text2banner($line);
    my $bnr_text = $bnr->banner_text_phrase->text;
    my $bnr_info = join("\t", $bnr->id, $bnr_text, $bnr->url);

    # Categories are computed from the banner text phrase.
    my @bnr_categs = $proj->phrase($bnr_text)->get_minicategs;

    if (@bnr_categs) {
        # Categorized banner: id, text, url and the sorted categories joined
        # with "/" go to STDOUT (the original note "#>cat_bnrs" suggests
        # STDOUT is meant to be redirected to a file named cat_bnrs).
        print "$bnr_info\t", join("/", sort @bnr_categs), "\n";
    } else {
        # Uncategorized banner: keep the raw input line, but only when the
        # banner has a true (non-empty) URL.
        print {$uncat_fh} "$line\n" if $bnr->url;
    }
}

# Buffered write errors (e.g. disk full) only surface at close time.
close($uncat_fh)
    or die "Cannot close '$uncat_file': $!";
