#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use open ":utf8";

binmode(STDIN,':utf8');
binmode(STDOUT,':utf8');


use FindBin;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";

use Getopt::Long;
use Data::Dumper qw(Dumper);
use CGI;
use FCGI::ProcManager;
use CGI::Fast;

use Project;
use Utils::Urls qw(url_to_punycode);
use Utils::Sys qw(split_csv_line);
use BM::PhraseCategs;
use Utils::XLS qw(spreadsheet2arr);
use HTML::TreeBuilder; 

use Devel::Size qw(total_size);
use Storable;

use JSON qw(to_json from_json);

# Project context; used below to build a banner object from the JSON payload
# of every input row.
my $proj = Project->new({});

# category => [ { row => <raw input line>, domain => <domain> }, ... ]
my $categ_hash = {};
# category => { domain => number of rows in that category with that domain }
my $categ_domain_hash = {};

my $c = 0;
while (my $line = <STDIN>) {
    chomp $line;

    # Tab-separated row. Columns used (0-based):
    #   0 = id, 4 = JSON data, 7 = manual categories, 8 = mode,
    #   10 = outer categories          (1..3 are setid / bid / bannerid).
    # LIMIT -1 keeps trailing empty columns, and missing columns on short
    # rows are normalised to '' just like the literal string NULL, so none
    # of the variables below is ever undef.
    my @fields = split /\t/, $line, -1;
    my ($id, $data, $manualcategs, $mode, $outercategs) =
        map { defined $_ && $_ ne 'NULL' ? $_ : '' } @fields[0, 4, 7, 8, 10];

    # Skip rows whose id contains no digit (e.g. a header or a blank line).
    next unless $id =~ /\d/;

    # Turn a doubly escaped quote (\\") into a singly escaped one (\") before
    # parsing. NOTE(review): presumably the dump escaped the JSON one extra
    # time -- confirm against the producer of the input.
    $data =~ s/\\\\"/\\"/g;

    # Still fail fast on a bad payload, but say which input line caused it.
    my $h = eval { from_json($data) };
    if (ref($h) ne 'HASH') {
        my $reason = $@ || "payload is not a JSON object\n";
        die "input line $.: cannot use banner JSON: $reason";
    }

    # The lang key is always set to 'ru' before the banner object is built.
    $h->{lang} = 'ru';
    my $bnr = $proj->bf->lbanner($h);

    # Rows in mode 'good' are labelled by the outer categories, all other
    # rows by the manual categories.
    my $categ = $mode eq 'good' ? $outercategs : $manualcategs;

    # Grouping key: the banner's domain, falling back to the payload's body
    # field, and finally to '' so it is always a defined hash key.
    my $domain = $bnr->domain;
    $domain ||= $h->{body};
    $domain = '' unless defined $domain;

    push @{$categ_hash->{$categ}}, {row => $line, domain => $domain};
    $categ_domain_hash->{$categ}{$domain}++;

    # Progress counter: one line on STDERR per accepted row.
    print STDERR $c++ . "\n";
}

# Emit the train/test split, one category at a time.
#
# Categories are visited in sorted order: plain `keys` order is randomized
# per process, which made the order of the output lines differ between runs
# on identical input.
#
# Within a category the rows are ordered by how many rows share their domain
# (most frequent first), ties broken by domain name in descending order, so
# rows of one domain sit next to each other. Rows are then consumed in
# rounds of "front -> train, back -> test, front -> train": roughly a 2:1
# split in which train is taken from the frequent-domain end of the list and
# test from the rare-domain end.
foreach my $categ ( sort keys %$categ_hash ) {
    my $count_of = $categ_domain_hash->{$categ};
    my @arr = sort {
        $count_of->{ $b->{domain} } <=> $count_of->{ $a->{domain} }
            || $b->{domain} cmp $a->{domain}
    } @{ $categ_hash->{$categ} };

    while (@arr) {
        # The loop condition guarantees at least one row here; the two
        # remaining picks are skipped once the list runs dry.
        print "train\t" . shift(@arr)->{row} . "\n";
        print "test\t" . pop(@arr)->{row} . "\n" if @arr;
        print "train\t" . shift(@arr)->{row} . "\n" if @arr;
    }
}
