#! /usr/bin/env perl

use strict;
use warnings;
use utf8;

use lib 'scripts/lib';

use Project;

use Utils::Urls qw(get_sec_level_domain);
use JSON::XS qw();

# Project object used below for phrase construction, transliteration and
# normalisation.
# NOTE(review): load_dicts presumably loads the dictionaries those phrase
# methods rely on -- confirm in Project.
my $proj = Project->new({
    load_dicts => 1,
});

# TLD => arrayref of spellings of that TLD (raw, transliterated, normalised,
# and each of those with a leading "+"). Built once per distinct TLD.
my $tld_cache = {};

# Reads records from STDIN, one per line, as tab-separated key=value fields.
# Records whose normalised phrase passes filter_norm() are printed to STDOUT
# as one JSON object per line with the keys url, norm, hits and exact_hits.
while (my $line = <STDIN>) {
    chomp($line);

    # Split on the first "=" only, so values may themselves contain "=".
    my $r = { map { $_->[0] => $_->[1] } map { [ split /=/, $_, 2 ] } split /\t/, $line };
    my $domain = get_sec_level_domain($r->{url});

    # The TLD is the last dot-separated label of the domain. A record without
    # a resolvable domain gets an empty TLD and therefore no TLD variants.
    my $tld = defined $domain ? (split /\./, $domain)[-1] : undef;
    $tld = '' unless defined $tld;

    unless (exists $tld_cache->{$tld}) {
        my @tld_variants;
        if (length $tld) {
            @tld_variants = grep { $_ } ($tld, $proj->phrase($tld)->translit_simple);

            # Drop empty normalisation results: otherwise they would add an
            # empty variant here and a bare "+" variant on the next line.
            push @tld_variants, grep { $_ } map { $proj->phrase($_)->norm_phr } @tld_variants;
            push @tld_variants, map { "+$_" } @tld_variants;
        }
        $tld_cache->{$tld} = \@tld_variants;
    }

    # Strip a trailing " ~0" marker before normalising; the record itself
    # keeps the original value, which is what gets printed.
    my $norm = $r->{norm};
    $norm =~ s/\s+~0$// if defined $norm;

    my $normed_phr = $proj->normed_phrase($norm);
    next unless filter_norm({ norm_phr => $normed_phr, tlds => $tld_cache->{$tld} });

    # The counters are numified so that they are not emitted as strings;
    # a missing counter becomes 0.
    print JSON::XS::encode_json({
        (map { $_ => $r->{$_} }              qw(url norm)),
        (map { $_ => ($r->{$_} || 0) + 0 }   qw(hits exact_hits)),
    }), "\n";

    # Alternative tab-separated key=value output (disabled):
    #print join("\t", map {$_ .'='. $r->{$_}} qw(url norm hits exact_hits)), "\n";
}

# Decide whether a normalised phrase should be kept.
#
# Takes a single hashref with the keys:
#   norm_phr - phrase object; its text, is_porno_phrase and look_like_a_model
#              methods are called
#   tlds     - arrayref of TLD spellings for the record's domain (optional)
#
# Returns 0 (drop) when any of the following holds, otherwise 1 (keep):
#   * the text is at most 2 characters long
#   * the text consists only of digits and whitespace
#   * the phrase has exactly two words and one of them is a TLD spelling
#   * the phrase object reports is_porno_phrase
#   * the text matches /^\d+ скидка$/
#   * the phrase is a single word and look_like_a_model is false
#   * the text is shorter than the minimum for its word count
#     (1 word: 5, 2 words: 6, 3 words: 9 characters)
sub filter_norm {
    my ($h) = @_;
    my $phr  = $h->{norm_phr};
    my $norm = $phr->text;
    $norm = '' unless defined $norm;

    # split ' ' skips leading whitespace; split /\s+/ would yield an empty
    # first field for such text and inflate the word count.
    my @words     = split ' ', $norm;
    my $wordcount = scalar @words;

    # Kept inside the sub on purpose: the top-level loop calls this sub before
    # any file-level initialisation placed after the loop would have run.
    my %min_length_for = (
        1 => 5,
        2 => 6,
        3 => 9,
    );
    # Undefined for zero words or more than three: no length limit then.
    my $minlength = $min_length_for{$wordcount};

    return 0 if length($norm) <= 2;
    return 0 if $norm !~ /[^\s\d]/;

    # "<word> <tld>" style phrases merely restate the domain name.
    if ( $wordcount == 2 ) {
        # "|| []" avoids autovivifying the key in the caller's hash.
        my %is_tld = map { $_ => 1 } grep { defined } @{ $h->{tlds} || [] };
        return 0 if grep { $is_tld{$_} } @words;
    }

    return 0 if $phr->is_porno_phrase;
    return 0 if $norm =~ /^\d+ скидка$/;
    return 0 if $wordcount == 1 && !$phr->look_like_a_model;
    return 0 if $minlength && length($norm) < $minlength;

    return 1;
}

