#!/usr/bin/perl -w
use strict;
use utf8;
use open ':utf8';
use v5.10;

use List::Util qw(sum min max shuffle);
use JSON;
use Getopt::Long;
use FindBin;
use lib "$FindBin::Bin/../../lib";
use Project;

# Defaults applied to every option the user did not supply on the command line.
my %default_opt = (
    p => 0,
    c => [qw(text tails banners snippets)],
);
my %opt;
# GetOptions returns false on malformed options (it has already warned on
# STDERR); remember that so we stop instead of running with a partial config.
my $opts_ok = GetOptions(\%opt, 'help|h', 'p=s', 'c=s@', 'no_header');
$opt{$_} //= $default_opt{$_} for keys %default_opt;
# Both positional arguments are required.  With only the input name given, the
# output name would be undef, and a 3-arg open on an undef filename opens an
# anonymous temporary file -- the results would be silently thrown away.
if (!$opts_ok || @ARGV < 2 || $opt{help}) {
    say "Usage: $0 [options] TEXTS TEXT_CATEGORIZED";
    say "Options:";
    say "   -p=s        how to preprocess texts before categorization (default: $default_opt{p})";
    say "                   0 - do nothing";
    say "   -c=s@       how categorize preprocessed texts (default: ".join(",",@{$default_opt{c}}).")";
    say "                   text                categorize as is";
    say "                   tails               find dominating tails category";
    say "                   banners             find dominating banners category";
    say "                   snippets            find snippets category";
    say "                   snippets_tails      find snippets tails category";
    say "   --no_header don't print header";
    # An explicit --help is a success; bad options or missing arguments are not.
    exit($opt{help} ? 0 : 1);
}

# Main flow: read texts line by line, categorize each one in every requested
# mode, and write one tab-separated row per input line.
my ($texts_file, $out_file) = @ARGV;
open my $fF, '<', $texts_file or die "Can't open '$texts_file' for reading: $!";
open my $tF, '>', $out_file   or die "Can't open '$out_file' for writing: $!";
my $proj = Project->new({
    load_dicts => 1,
    load_minicategs_light => 1,
});

# write header: the original text followed by one column per categorization mode
say $tF join("\t", 'initial_text', map {"categorization=$_"} @{$opt{c}}) unless $opt{no_header};
while (my $text = <$fF>) {
    chomp $text;
    my $pp_text = preprocess($proj, $text, $opt{p});
    my @all_categs;
    for my $cat_opt (@{$opt{c}}) {
        my @categs = categorize($proj, $pp_text, $cat_opt);
        # Keep the column non-empty so rows always have the same number of fields.
        if (!@categs) {
            push @categs, 'NOCATEGORY';
        }
        push @all_categs, \@categs;
    }
    # Multiple categories within one column are sorted and joined with "/".
    say $tF join("\t", $text, map {join("/", sort @$_)} @all_categs);
}
close $fF;
# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close $tF or die "Can't finish writing '$out_file': $!";


# subs
# Prepare a text for categorization.
#
# Args:
#   $proj   - project object (not used by the current implementation)
#   $text   - text to preprocess
#   $pp_opt - preprocessing mode; falls back to $default_opt{p} when undefined
# Returns: the preprocessed text.  No mode performs any transformation yet, so
#   the text always comes back unchanged.
sub preprocess {
    my ($proj, $text, $pp_opt) = @_;
    $pp_opt //= $default_opt{p};

    # A false mode (0) means "do nothing".
    if (!$pp_opt) {
        return $text;
    }

    # TODO more
    return $text;
}

# Categorize a text using one of the supported modes.
#
# Args:
#   $proj    - project object; builds the phrase and, in 'banners' mode,
#              provides the banner lookup
#   $text    - text to categorize
#   $cat_opt - mode name: text | tails | banners | snippets | snippets_tails.
#              When undefined, the first mode of $default_opt{c} is used.
# Returns: list of categories for the text (may be empty).
# Dies: on an unknown mode name.
sub categorize {
    my $proj = shift;
    my $text = shift;
    # $default_opt{c} is a *list* of modes, while one call handles exactly one
    # mode, so fall back to its first entry rather than to the array ref itself.
    my $cat_opt = shift // $default_opt{c}[0];

    my $phr = $proj->phrase($text);
    if ($cat_opt eq 'text') {
        return $phr->get_minicategs;
    } elsif ($cat_opt eq 'tails') {
        # Keep only the categories with the highest count.
        my $cnts = $phr->get_cdict_minicategs_counts;
        return argmax($cnts);
    } elsif ($cat_opt eq 'banners') {
        # NOTE(review): 10000 is presumably a cap on the number of ids returned -- confirm.
        my @ids = $proj->banners_bender->find_ids($phr, 10000);
        # Take a small random sample of banners so that categorizing them
        # does not slow things down.
        @ids = shuffle @ids;
        # At most 100 ids; the grep drops the undef padding the slice yields
        # when fewer than 100 ids were found.
        my $bnl = $proj->bf->_ids2bnl([grep {defined} @ids[0..99]]);
        my %cat2cnt;
        for my $bnr (@$bnl) {
            $cat2cnt{$_}++ for $bnr->get_minicategs;
        }
        # The categories seen on the largest number of sampled banners win.
        return argmax(\%cat2cnt);
    } elsif ($cat_opt eq 'snippets') {
        return $phr->get_minicategs_snippets;
    } elsif ($cat_opt eq 'snippets_tails') {
        return $phr->get_minicategs_snippets_tails;
    }

    # Falling off the chain would return the false result of the last
    # comparison, i.e. a one-element list holding an empty string; fail loudly
    # instead of emitting an empty column for every row.
    die "Unknown categorization option '$cat_opt'"
      . " (expected one of: text, tails, banners, snippets, snippets_tails)\n";
}

# Find the keys of a hash that carry the largest value.
#
# Args:
#   $kv - hash reference mapping keys to numeric values; an undefined
#         argument is treated as an empty hash
# Returns: list of every key whose value equals the maximum (all tied keys,
#   in unspecified order); an empty list for an empty hash.
sub argmax {
    my $kv = shift // {};

    # Dereference explicitly: calling keys/values directly on a scalar
    # reference was an experimental feature that has been removed from Perl.
    my $max = max(values %$kv);

    # For an empty hash $max is undef, but then there are no keys either, so
    # the comparison below is never evaluated.
    return grep { $kv->{$_} == $max } keys %$kv;
}

1;
