#!/usr/bin/perl
use strict;

use utf8;
use open ":utf8";

use FindBin;
use lib "$FindBin::Bin/../../lib";
use lib "$FindBin::Bin/../../wlib";


use Utils::Sys qw/
    uncompressfile
/;
use Project;


# Shared project context. It is a file-scoped lexical because the sitemap
# helper subs defined further down in this file use it as well.
my $proj = Project->new({load_dicts => 1, load_minicategs_light => 1});

# Maximum number of pages examined per site. The previous code took the
# slice 0..10 (eleven entries); that count is kept here.
my $max_pages_per_site = 11;

# Main loop: one site per STDIN line.
#   1. Find the sitemap URL from robots.txt, falling back to /sitemap.xml.
#   2. Collect every page URL reachable from that sitemap.
#   3. Keep the highest-priority URLs accepted by dse_tools->filter_url.
#   4. Download those pages and print one tab-separated line per page that
#      yields a non-empty title.
while (my $site = <STDIN>) {
    chomp $site;
    $site =~ s/^\s+|\s+$//g;        # tolerate CRLF input and stray spaces
    next unless length $site;       # skip blank lines instead of querying ''

    my $robots_url = $proj->page($site)->_normurl('/robots.txt');
    my $robots_txt = $proj->page($robots_url)->tt // '';

    # Take the first "Sitemap:" directive that carries an http(s) URL.
    # \S+ stops at whitespace, so a trailing "\r" from a CRLF robots.txt is
    # not glued onto the URL; whitespace around the colon is optional.
    my $sitemap_url;
    for my $line (split /\n/, $robots_txt) {
        if ($line =~ m{^\s*sitemap\s*:\s*(https?://\S+)}i) {
            $sitemap_url = $1;
            last;
        }
    }
    $sitemap_url ||= $proj->page($site)->_normurl('/sitemap.xml');
    print "$sitemap_url\n";

    my $urls = process_sitemap_url_tree($sitemap_url);

    # Highest priority first; ties are broken by URL so the selection does
    # not depend on hash ordering.
    my @sorted_urls =
        sort { $urls->{$b}{priority} <=> $urls->{$a}{priority} or $a cmp $b }
        grep { $proj->dse_tools->filter_url($_) }
        keys %$urls;
    next unless @sorted_urls;       # nothing usable for this site

    # Truncate in place. A fixed slice such as [0..10] would pad a shorter
    # list with undef entries.
    splice(@sorted_urls, $max_pages_per_site)
        if @sorted_urls > $max_pages_per_site;

    my $pgl = $proj->page_list(\@sorted_urls);
    $pgl->zora_batch_download;
    for my $pg ( @$pgl ) {
        my $url = $pg->url;
        # NOTE(review): 56 is presumably a title length limit - confirm
        # against dse_tools->get_title.
        my $name = $proj->dse_tools->get_title({title => $pg->title}, 56);
        next unless $name;
        my $minicategs = join('/', $pg->get_minicategs);
        print "url=$url\tname=$name\tminicategs=$minicategs\n";
    }
}

# Collect the page URLs reachable from a sitemap, following nested sitemap
# references breadth-first.
#
# Arguments:
#   $sitemap_url - URL of the root sitemap (or sitemap index).
#
# Returns a hash reference keyed by page URL; each value is the record hash
# produced for that URL by process_sitemap_url(). When the same URL appears
# in several sitemaps, the record from the one processed last wins.
#
# Every sitemap URL is fetched at most once, so an index that refers back to
# itself or to an ancestor cannot cause an endless loop.
sub process_sitemap_url_tree {
    my $sitemap_url = shift;
    my ($sitemaps, $urls) = process_sitemap_url($sitemap_url);

    # Sitemap URLs already fetched or queued, starting with the root.
    my %seen = ($sitemap_url => 1);
    my @pending = grep { !$seen{$_}++ } keys %$sitemaps;

    while (@pending) {
        my $sub_sitemap_url = shift @pending;
        my ($sub_sitemaps, $sub_urls) = process_sitemap_url($sub_sitemap_url);

        # Queue only nested sitemaps that have not been seen before.
        push @pending, grep { !$seen{$_}++ } keys %$sub_sitemaps;

        # Merge this sitemap's pages into the result with a hash slice.
        @{$urls}{ keys %$sub_urls } = values %$sub_urls;
    }
    return $urls;
}

# Fetch a single sitemap document and extract its entries.
#
# Arguments:
#   $sitemap_url - URL of the sitemap (or sitemap index) to fetch.
#
# Returns a two-element list ($sitemaps, $urls). Both are hash references
# built by process_parsed_file(): the first from the 'sitemap' records, the
# second from the 'url' records.
sub process_sitemap_url {
    my ($sitemap_url) = @_;

    # Scratch files: raw download, unpacked copy, and one extraction output
    # per record type.
    my $download_path = $proj->get_tempfile();
    my $unpacked_path = $proj->get_tempfile();
    my $index_listing = $proj->get_tempfile();
    my $page_listing  = $proj->get_tempfile();

    my $page = $proj->page($sitemap_url);
    $page->result_file($download_path);
    $page->{zora_big_files} = 1;
    # NOTE(review): calling tt presumably triggers the download into the
    # result file set above - confirm against the page class.
    $page->tt;
    uncompressfile($download_path, $unpacked_path);

    # Run one extraction per record type. A failing XMLParser::grep call is
    # deliberately ignored: each call is wrapped in its own eval.
    my @extractions = (
        [ sitemap => $index_listing ],
        [ url     => $page_listing  ],
    );
    for my $extraction (@extractions) {
        my ($tag, $out_path) = @$extraction;
        eval { XMLParser::grep($unpacked_path, $out_path, $tag) };
    }

    return (
        scalar process_parsed_file($index_listing, 'sitemap'),
        scalar process_parsed_file($page_listing,  'url'),
    );
}

# Read a tab-separated extraction file into a hash keyed by "loc".
#
# Each line is split on tabs. Only fields of the form "<prefix>:<key>=<value>"
# are kept, and the "<prefix>:" part is stripped from the key. A record with
# no priority field gets priority 0.5. Records with a missing or false "loc"
# are dropped.
#
# Arguments:
#   $file   - path of the file to read.
#   $prefix - record type whose fields should be kept (e.g. 'url').
#
# Returns a hash reference: loc => { key => value, ... }.
# Dies if the file cannot be opened.
sub process_parsed_file {
    my ($file, $prefix) = @_;

    # Quote the prefix so it is matched literally, and compile it once.
    my $field_prefix_re = qr/^\Q$prefix\E:/;

    my $data = {};
    open(my $fh, '<', $file) or die "Could not open $file: $!";
    while (my $row = <$fh>) {
        # Drop the line terminator; otherwise the last field of every row
        # (possibly the loc itself) would carry a trailing newline.
        $row =~ s/\r?\n\z//;

        my %row_hash;
        for my $kv ( grep { $_ =~ $field_prefix_re } split /\t/, $row ) {
            # Limit 2 keeps any '=' inside the value (e.g. URL query strings).
            my ($key, $value) = split /=/, $kv, 2;
            $key =~ s/$field_prefix_re//;
            $row_hash{$key} = $value;
        }
        $row_hash{priority} //= 0.5;
        $data->{ $row_hash{loc} } = \%row_hash if $row_hash{loc};
    }
    close($fh);
    return $data;
}
