#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use open ":utf8";

# Apply the :utf8 layer to both standard streams used by this script.
binmode($_, ':utf8') for (\*STDIN, \*STDOUT);


use FindBin;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";
use Utils::Sys qw/md5int/;
use List::Util qw(min);

use Getopt::Long;
use CatalogiaMediaProject;
use Project;
use Cmds::Mediaplanners;
use Data::Dumper;
use BM::PhraseCategs;
use Utils::Urls qw(normalize_url);
use Utils::Words qw(stop4norm);


use BM::YQL::Helpers qw(get_k_random_banners);

# Read the whole tab-separated input from STDIN and collect the URL column.
# The URL is taken from the third field of every line; the first two fields
# are ignored.
my @urls = ();
while (my $line = <STDIN>) {
    chomp $line;

    # Only the third column is needed. Taking it with a list slice avoids
    # declaring lexical $a/$b, which would shadow the package variables that
    # sort() comparison blocks rely on.
    my $url = (split /\t/, $line)[2];

    # Blank lines and lines with fewer than three columns carry no URL;
    # skip them instead of queueing an undefined/empty value for processing.
    next unless defined $url && length $url;

    push @urls, $url;
}

# Number of worker processes to start; also the number of shards the URL
# list is divided into.
my $forks = 20;

# Shard index of the current process. Each child leaves the loop with the
# value $forknum had when it was forked (0 .. $forks-1); the parent finishes
# the loop with $forknum == $forks.
my $forknum = 0;
my @child_pids;
for (0..$forks-1) {
    my $pid = fork;

    if (!defined $pid) {
        # fork failed in the parent. Do not fall through as if we were a
        # child (that would silently drop the remaining shards): stop the
        # workers already started and abort loudly.
        my $fork_error = $!;
        kill 'TERM', @child_pids;
        waitpid($_, 0) for @child_pids;
        die "Could not fork worker $forknum: $fork_error\n";
    }

    last unless $pid;    # child: keep the current $forknum and start working

    push @child_pids, $pid;
    $forknum++;
}

if ($forknum >= $forks) {
    # Parent: it does no categorization itself. Wait for every worker so the
    # script only returns once all output files are complete, and report a
    # non-zero exit status if any worker failed.
    my $failed_workers = 0;
    for my $child_pid (@child_pids) {
        waitpid($child_pid, 0);
        $failed_workers++ if $? != 0;
    }
    exit($failed_workers ? 1 : 0);
}

# Options handed to the Project constructor. Their exact meaning is defined
# by the Project class, which is not visible here; they are passed verbatim.
my %project_options = (
    load_dicts                              => 1,
    load_minicategs_light                   => 1,
    allow_lazy_dicts                        => 1,
    use_comptrie_subphraser                 => 1,
    use_sandbox_categories_suppression_dict => 1,
);

my $proj = Project->new({ %project_options });

# Switch on both "never read" and "never write" flags of the categories
# tree cache (presumably so every worker computes categories from scratch
# instead of sharing a cache -- confirm against the categs_tree class).
for my $cache_switch (qw(never_read_categs_cache never_write_categs_cache)) {
    $proj->categs_tree->$cache_switch(1);
}

# Every worker writes to its own output file, named after its shard index.
my $filename = "categorized_hrefs_$forknum";
open(my $fh, '>:encoding(UTF-8)', $filename) or die "Could not open file '$filename' $!";

# Running position in @urls, used to assign URLs to workers round-robin.
my $ind = 0;

for my $url (@urls) {
    # Handle only the URLs whose position modulo $forks equals this worker's
    # shard index; the other positions belong to sibling workers.
    next unless $ind++ % $forks == $forknum;

    my $page = $proj->page($url);
    $page->{no_cache} = 1;

    my @categs = $page->get_minicategs;
    my $flags  = join ",", $proj->categs_tree->get_catalogia_flags_with_asocial(@categs);
    my $categs = join "/", @categs;

    # The download_failed field of the page is reported as the status column;
    # an undefined value is written as an empty string.
    my $status = $page->{download_failed} // '';

    # Output columns: url, "/"-joined categories, ","-joined flags, status.
    print {$fh} "$url\t$categs\t$flags\t$status\n"
        or die "Could not write to file '$filename' $!";
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close must be checked on an output file.
close $fh or die "Could not close file '$filename' $!";
