#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use open ":utf8";

# Read STDIN and write STDOUT through Perl's :utf8 layer.
binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';


use FindBin;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";

use Getopt::Long;
use Data::Dumper qw(Dumper);
use CGI;
use FCGI::ProcManager;
use CGI::Fast;

use Project;
use Utils::Urls qw(url_to_punycode);
use Utils::Sys qw(split_csv_line);
use BM::PhraseCategs;
use Utils::XLS qw(spreadsheet2arr);
use HTML::TreeBuilder; 

use Devel::Size qw(total_size);
use Storable;

use JSON qw(to_json from_json);

# Project object; the read loop calls $proj->bf->lbanner on every parsed row.
my $proj = Project->new({
    load_dicts              => 1,
    load_minicategs_light   => 1,
    use_comptrie_subphraser => 1,
    load_languages          => ['ru'],
});

# $row_hash    : id -> record of the kept row (raw line plus its scores).
# $norm_hash   : normalized text -> number of kept rows seen with it so far.
# $domain_hash : domain          -> number of kept rows seen with it so far.
# $categ_hash  : category string -> number of kept rows seen with it so far.
my ($row_hash, $norm_hash, $domain_hash, $categ_hash) = ({}, {}, {}, {});

# "category\tbody" keys already seen; used to drop repeated rows.
my $body_categ_uniq_hash = {};

# Read tab-separated rows from STDIN, drop rows that repeat an already seen
# (category, body) pair, and store one record per id in $row_hash together
# with running frequency scores for its domain, normalized text and category.
#
# Columns used (0-based): 0 id, 4 JSON banner data, 7 manual categories,
# 8 mode, 10 outer categories.  Columns 1-3 (setid, bid, bannerid) and
# 5, 6, 9 are not used.  A field equal to the literal 'NULL' is treated as
# an empty string.
#
# Malformed JSON in column 4 still aborts the run: from_json dies and Perl
# appends the STDIN line number to the message.
my $c = 0;
while (my $line = <STDIN>) {
    $c++;
    chomp $line;

    # The -1 limit keeps trailing empty columns; without it split silently
    # drops them and the last variables come back undef.
    my @fields = map { $_ eq 'NULL' ? '' : $_ } split /\t/, $line, -1;

    # Pad short rows so every column read below is a defined string.
    push @fields, ('') x (11 - @fields) if @fields < 11;

    my ($id, $data, $manualcategs, $mode, $outercategs) = @fields[0, 4, 7, 8, 10];

    # Rows whose id contains no digit are skipped (presumably header or
    # blank lines -- confirm against the input format).
    next unless $id =~ /\d/;

    # Turn a doubled backslash in front of a quote (\\") into a single one (\").
    $data =~ s/\\\\"/\\"/g;
    my $h = from_json($data);
    $h->{lang} = 'ru';
    my $bnr = $proj->bf->lbanner($h);

    # Rows in 'good' mode are classified by the outer categories, all other
    # rows by the manual ones.
    my $categ = $mode eq 'good' ? $outercategs : $manualcategs;

    # De-duplicate on (category, body).  Rows whose body has no word
    # character are never skipped here.
    my $body     = defined $h->{body} ? $h->{body} : '';
    my $uniq_key = $categ . "\t" . $body;
    next if $body =~ /\w/ && exists $body_categ_uniq_hash->{$uniq_key};
    $body_categ_uniq_hash->{$uniq_key} = 1;

    # Prefer the core subphrase; when it has no word character fall back to
    # the unique normalized words that contain no digit.
    my $normtext = $bnr->preprocess_title_body->get_core_subphrase;
    unless (defined $normtext && $normtext =~ /\w/) {
        $normtext = join(' ', grep { $_ !~ /\d/ } split /\s+/, $bnr->preprocess_title_body->norm_phr_uniq);
    }
    $normtext = '' unless defined $normtext;

    # Fall back to the body text when the banner has no domain.
    my $domain = $bnr->domain;
    $domain ||= $h->{body};
    $domain = '' unless defined $domain;

    $domain_hash->{$domain}++;
    $norm_hash->{$normtext}++;
    $categ_hash->{$categ}++;

    # domain_score and norm_score count this row too; categ_score counts only
    # the earlier kept rows of the same category, so it is 0 for the first one.
    $row_hash->{$id} = {
        row          => $line,
        domain       => $domain,
        normtext     => $normtext,
        categ        => $categ,
        domain_score => $domain_hash->{$domain},
        norm_score   => $norm_hash->{$normtext},
        categ_score  => $categ_hash->{$categ} - 1,
    };

    # Progress indicator: number of input lines read so far.
    print STDERR $c . "\n";
}

# Largest domain, normalized-text and category score over all kept rows.
# NOTE(review): nothing in the visible source reads these maxima afterwards;
# confirm whether they are still needed.
my ($max_domain, $max_norm, $max_categ) = (0, 0, 0);

for my $rec (values %$row_hash) {
    $max_domain = $rec->{domain_score} if $rec->{domain_score} > $max_domain;
    $max_norm   = $rec->{norm_score}   if $rec->{norm_score}   > $max_norm;
    $max_categ  = $rec->{categ_score}  if $rec->{categ_score}  > $max_categ;
}

# Ranking criterion for every kept row:
#   crit = categ_score * 2 ** min(norm_score, 30) * domain_score ** 2
# The exponent is capped at 30.
for my $rec (values %$row_hash) {
    my $norm_exp = $rec->{norm_score} < 30 ? $rec->{norm_score} : 30;
    $rec->{crit} = $rec->{categ_score} * (2 ** $norm_exp) * ($rec->{domain_score} ** 2);
}

# Print the original input lines in ascending order of crit.
# Many rows share the same crit (every first row of a category has crit 0),
# and hash key order differs between runs, so ties are broken by id (string
# comparison) to keep the output reproducible.
my @sorted_ids = sort {
    $row_hash->{$a}{crit} <=> $row_hash->{$b}{crit}
        or $a cmp $b
} keys %$row_hash;

for my $id (@sorted_ids) {
    print $row_hash->{$id}{row}, "\n";
}

