#!/usr/bin/perl

use my_inc "../..";


=head1 DESCRIPTION

    Сравниваем наших рекламодателей с 2гис-овскими

    На входе csv-файл вида <source-id, href, city-name>
    (первая строка заголовок)

=cut

use 5.010;
use strict;
use warnings;
#use autodie; # ругаются юнит-тесты

use lib::abs '..';

use Settings;
use ScriptHelper;

use Yandex::DBTools;
use Yandex::CSV;
use Yandex::MirrorsTools::Hostings;

use GeoTools;

use Encode;
use List::MoreUtils qw/ uniq none /;

use utf8;


# Domains for which the full URL has to be compared, not just the domain name.
our %IS_META_DOMAIN;
$IS_META_DOMAIN{$_} = 1  for qw/
    vk.com
    vkontakte.ru
    m.vk.com
    facebook.com
    odnoklassniki.ru
    avito.ru
    youtube.com
    youtu.be
/;


#$Yandex::DBTools::QUERIES_LOG = 1;
# Command-line options:
#   -o, --output FILE   file for the annotated copy of the input rows
#   -s, --stat   FILE   file for the per-city statistics
# When omitted, both names are derived from the input file name at the
# point where the files are written.
Getopt::Long::GetOptions(
    'o|output=s' => \my $ofile,
    's|stat=s' => \my $sfile,
);

# The single positional argument is the input CSV file.
my $ifile = shift @ARGV
    or die "Usage: $0 [-o|--output FILE] [-s|--stat FILE] INPUT.csv\n";

$log->out('START');

$log->out("loading external advertisers from $ifile");

# autodie is not enabled in this file, so open() has to be checked by hand;
# otherwise a missing or unreadable file goes unreported here.
open my $fh, "<:encoding(utf8)", $ifile
    or die "Cannot open $ifile for reading: $!";

my $csv = Text::CSV->new();
my $extdata = $csv->getline_all($fh);

# getline_all() stops at the first unparsable line; report that instead of
# silently continuing with a truncated data set.
$csv->eof or $csv->error_diag();
close $fh;

# The first line of the file is the header row.
my $header = shift @$extdata;


# Associate every distinct source city name with its chain of geo ids
# (the city's own id first, followed by its ancestors).
$log->out('associating source cities');
my %city_parents;
my @source_cities = uniq map {$_->[2]} @$extdata;
for my $cityname ( @source_cities ) {
    # Normalise the name before the lookup: replace "Ё/ё" with "Е/е" and
    # drop a parenthesised suffix together with the whitespace before it.
    (my $our_city = $cityname) =~ tr/Ёё/Ее/;
    $our_city =~ s/\s* \( .* \)//xms;

    my $geo_id = get_geoid_by_cityname($our_city)
        or die "$cityname not found";

    my @ancestry = get_region_path($geo_id);

    # Workaround for Nizhny Tagil: when the chain has more than six
    # elements, remove its second element.
    if ( @ancestry > 6 ) {
        splice @ancestry, 1, 1;
    }

    $city_parents{$cityname} = \@ancestry;
}
$log->out(scalar(keys %city_parents) . ' cities');

#say Dump \%city_parents;


# Extract the domain from every input URL and group the input rows by it.
# %domain_records maps a lower-cased domain to the list of rows (array refs
# taken from @$extdata, so later modifications of a row show up in both).
$log->out('cleaning up external hrefs');
my %domain_records;
for my $item ( @$extdata ) {
    # A row may be short or have an empty href column.
    my $href = $item->[1] // q{};

    # An optional scheme is skipped so that "http://example.com/" yields
    # "example.com" rather than "http:"; URLs without a scheme are parsed
    # as before.  The "www." prefix is matched case-insensitively because
    # the domain is lower-cased anyway.
    my ($domain) = $href =~ m{
        \A
        (?: [a-z][a-z0-9+.\-]* :// )?   # optional scheme
        (?: www\. )?                    # optional leading www.
        (.*?)                           # the domain itself
        \.?                             # optional trailing dot
        (?: [/\#?] | $ )                # path, fragment, query or end of string
    }xmsi;

    # Without a domain there is nothing to look up; the row stays in
    # @$extdata and is reported as unmatched.
    next if !defined $domain || $domain eq q{};

    push @{ $domain_records{lc $domain} }, $item;
}
$log->out(scalar(keys %domain_records) . ' uniq domains');


$log->out('fetching banners');

# Base query.  Every lookup appends its own reverse_domain condition and a
# row limit to it.
my $banner_sql = q{
    SELECT b.cid, pid, b.bid, href, p.geo
    FROM banners b
    JOIN campaigns c using(cid)
    JOIN phrases p using(pid)
    WHERE 1
        -- AND b.statusActive='Yes' AND c.statusActive='Yes'
};

# Runs the base query on all shards with one additional condition
# (passed as a key/value pair) and returns the rows as an array ref.
my $fetch_banners = sub {
    my (%condition) = @_;
    return get_all_sql(PPC(shard=>'all'), [
            $banner_sql,
            AND => \%condition,
            limit => 100000,
        ]);
};

my %is_hosting = map {( $_ => 1 )} @{ Yandex::MirrorsTools::Hostings::get_hostings() };

for my $domain ( sort keys %domain_records ) {
    $log->out(encode utf8 => $domain);

    my $reversed_domain = scalar reverse($domain);

    # First attempt: the domain exactly as given.
    my $banners = $fetch_banners->( reverse_domain => $reversed_domain );

    # Second attempt: a more specific domain, i.e. a reversed domain that
    # starts with ours followed by a dot (presumably its subdomains).
    if ( !@$banners ) {
        $banners = $fetch_banners->( reverse_domain__starts_with => $reversed_domain . q{.} );
    }

    # Third attempt, only for domains of level 3 and deeper: the more
    # general two-label base domain, unless that one is a known hosting.
    if ( !@$banners && _domain_level($domain) > 2 ) {
        my $base_domain = _get_base_domain($domain);
        if ( defined $base_domain && !$is_hosting{$base_domain} ) {
            $banners = $fetch_banners->( reverse_domain => scalar reverse($base_domain) );
        }
    }

    next if !@$banners;

    $log->out(encode utf8 => "found banners for $domain");

    for my $item ( @{ $domain_records{$domain} } ) {
        my $item_geo_chain = $city_parents{$item->[2]};

        # For meta domains only banners with the very same URL count.
        my $banners_to_check = $IS_META_DOMAIN{$domain}
            ? [ grep {_is_same_href($item->[1], $_->{href})} @$banners ]
            : $banners;
        next if !@$banners_to_check;

        for my $banner_geo ( uniq grep { defined $_ } map {$_->{geo}} @$banners_to_check ) {
            # Walk the chain from the city upwards and stop at the first
            # region that the banner's geo string mentions.
            LEVEL:
            for my $level ( 0 .. 4 ) {
                my $geo_id = $item_geo_chain->[$level];
                last LEVEL if !defined $geo_id;     # chain is shorter than five levels

                # The id must be a whole list element; an optional leading
                # minus marks an excluded region.  A plain \b in front of
                # the minus can never match in a comma-separated list
                # (there is no word boundary between "," and "-"), which
                # made excluded regions look like targeted ones, hence the
                # look-behind.
                next LEVEL if $banner_geo !~ / (?<! [\w\-] ) (\-?) \Q$geo_id\E \b /xms;
                my $is_minus = $1;

                # Columns 4..8 of the row: "+" when the region of that
                # level is targeted (and not excluded) by some banner.
                $item->[4+$level] = '+'  if !$is_minus;
                last LEVEL;
            }
        }

        # Column 3 of the row: there are banners for this domain / URL.
        $item->[3] = '*';
    }

#    say encode( utf8 => Dump [$domain, $domain_records{$domain}, $banners] );
#    last if our $lim++ > 300;
}


$log->out('saving result');
# Default output name: the input name with "-result" inserted before the
# extension.
$ofile //= $ifile =~ s/ (?= (?: \. [^\/]+ )? $ )/-result/xmsr;
data2csv $extdata, { bom_header => 1, header_row => $header, output_file => $ofile };


$log->out('calculating and writing statistics');

# Per-city statistics, one array per city:
#   [0]     city name
#   [1]     number of input rows for the city
#   [2..6]  rows with a mark in match column 0..4 (row columns 4..8)
#   [7]     rows with a mark in row column 3
#   [8]     rows without any mark in the match columns
my %stat;
for my $item ( @$extdata ) {
    my (undef, undef, $city, $is_domain, @match) = @$item;
    my $stat_item = $stat{$city} //= [$city];
    $stat_item->[1] ++;

    # Five match columns are filled per row (levels 0..4).  Counting only
    # 0..3 left statistics column 6 permanently empty.
    for my $i ( 0 .. 4 ) {
        $stat_item->[2+$i]++  if $match[$i];
    }

    $stat_item->[7]++  if $is_domain;
    $stat_item->[8]++  if none {$_} @match;
}

# Default statistics name: the input name with "-stat" inserted before the
# extension.
$sfile //= $ifile =~ s/ (?= (?: \. [^\/]+ )? $ )/-stat/xmsr;
data2csv [ sort {$a->[0] cmp $b->[0]} values %stat ],
    { bom_header => 1, output_file => $sfile };




$log->out('FINISH');


# Tells whether two URLs are the same apart from the leading part.
#
# From a copy of each argument the fragment (everything from the first "#")
# is removed, then the leading run of non-slash characters.  What is left
# is compared case-insensitively.  The query string takes part in the
# comparison.
#
# Returns a true value when the remainders are equal, a false one otherwise.
sub _is_same_href {
    my @rest = @_[0, 1];

    for my $url ( @rest ) {
        $url =~ s/ \# .* //xms;
        $url =~ s/ ^ [^\/]+ //xms;
    }

    return lc($rest[0]) eq lc($rest[1]);
}

# Returns the level of a domain name: the number of dots in it plus one
# ("example.com" is level 2, "shop.example.com" is level 3).
sub _domain_level {
    my ($domain) = @_;

    # Count the matches by assigning the match list to an empty list.
    my $dot_count = () = $domain =~ /\./gxms;

    return $dot_count + 1;
}

# Returns the last two dot-separated labels of a domain name
# ("shop.example.com" gives "example.com"), or undef when the name does not
# end with two non-empty labels.
sub _get_base_domain {
    my ($domain) = @_;

    return $domain =~ /([^\.]+ \. [^\.]+) $/xms ? $1 : undef;
}

# Borrowed from the geocontext code.
#
# The block is a BEGIN block, so $lookup is assigned at compile time; the
# script body calls get_region_path() before execution would otherwise
# reach this assignment.
BEGIN {

my $lookup = GeoTools::get_geobase_lookup();

# Looks up a region by id.
# Returns the filled geobase5::region object, or nothing (undef in scalar
# context) when region_by_id() reports failure.
sub get_region_object
{
    my ($id) = @_;

    my $region = geobase5::region->new();
    if ( !$lookup->region_by_id($id, $region) ) {
        return;
    }
    return $region;
}

# Builds the chain of region ids from the given region up through its
# parents: the given id comes first.  The walk stops when a parent id is
# false or negative, and after 101 steps at the latest.
# Returns the chain as a list, or nothing when some region of the chain
# cannot be looked up.
sub get_region_path
{
    my ($current_id) = @_;

    my @chain;
    my $steps_left = 101;
    while ( $steps_left-- > 0 ) {
        push @chain, $current_id;

        my $region = get_region_object($current_id);
        return if !$region;

        $current_id = $region->parent_id();
        last if !$current_id || $current_id < 0;
    }
    return @chain;
}
}

