#!/usr/bin/perl

use Direct::Modern;

=head1 METADATA

<crontab>
    time: 10 2 * * *
    <switchman>
        group:  scripts-other
        <leases>
            mem: 6000
        </leases>
    </switchman>
    package: scripts-switchman
    ulimit: -v 12000000
</crontab>

<juggler>
    host:   checks_auto.direct.yandex.ru
    ttl:            2d8h
    tag: direct_group_internal_systems
</juggler>

=cut

#
#   Get user's geo by ip from cmd_log
#
#   If the user has used Direct from more than one IP, the region that accounts for at least $WINNER_RATIO = 0.6 of the Direct calls is treated as the user's region
#   results are stored in the clients_geo_ip table (written per shard via PPC)
#
#   There are too many cmdlog_ tables in the ppclog DB, so data aggregated by (uid, ip) is cached in tables like monitor.clients_geo_ip_cache_2_______
#
#   To clear cached results simply remove cache tables from monitor_db
#
#   $Id$
#
#

use my_inc "..";

use Settings;
use ScriptHelper;

use List::Util qw/max min sum maxstr/;
use Date::Calc qw/Today/;
use Memoize;

use GeoTools;
use Yandex::DBTools;
use Yandex::DBShards;
use Yandex::ListUtils;
use geo_regions;
use Property;
use PrimitivesIds;
use YandexOffice;

use Tools 'get_clickhouse_handler';

# Per-process uid => ClientID cache; the main flow also pre-fills it in bulk.
my %clientid_by_uid_cache = ();

# Return the ClientID for $uid, calling get_clientid() only on the first request.
# The answer is cached even when it is false, so unknown uids are not re-queried.
sub get_clientid_by_uid_cached {
    my ($uid) = @_;

    return $clientid_by_uid_cache{$uid} if exists $clientid_by_uid_cache{$uid};

    return $clientid_by_uid_cache{$uid} = get_clientid(uid => $uid);
}

# get_direct_region_from_geo() is called once per (client, ip) pair; cache results by argument
memoize('get_direct_region_from_geo');

# forward declaration: load_cache has a prototype and is called before its definition
sub load_cache($$);

# if at least WINNER_RATIO = 0.6 of the ppclog_cmd lines correspond to region X,
# X is treated as the user's home region
our $WINNER_RATIO = 0.6;

# The depth of the region tree is not known in advance, so derive the range of
# region types from the tree itself
my $min_type = min map { $_->{type}||0 } values %geo_regions::GEOREG;
my $max_type = max map { $_->{type}||0 } values %geo_regions::GEOREG;

# Check if the region relations in the tree are correct: a node's type must be
# greater than the smallest type among its parents.
# Iterate over a snapshot of the keys (not each()): looking up a parent that is
# missing from the tree autovivifies an entry, which must not happen mid-each().
# NOTE(review): the check compares against min() of the parent types, so only the
# shallowest parent is considered - confirm this is the intended strictness.
for my $k (keys %geo_regions::GEOREG) {
    my $v = $geo_regions::GEOREG{$k};
    my @parents = @{$v->{parents}||[]};
    next unless @parents;

    if ( ($v->{type}||0) <= min map {$geo_regions::GEOREG{$_}{type}||0} @parents ) {
        # Dumper must be called with the node, otherwise only HASH(0x...) is printed
        warn "geo_regions childs' type is less or equal to parent node type: id:", $k, " ", Dumper($v);
    }
}

# Main flow:
#   1. add every not yet processed log date (strictly before today) to the cache table
#   2. build the ClientID => [uids] relation for all cached uids
#   3. recalculate and store the region of every client, 100 clients at a time
my $today = sprintf "%04d-%02d-%02d", Today;

$log->out("Running ppcMakeRegionsFromIp.pl for date: $today");

my $dates = get_dates($today);
if (!@$dates) {
    $log->out('Work already done');
    juggler_ok(description => 'Work already done');
    exit 0;
}
$log->out('Running from day ' . $dates->[0]);

for my $date (@$dates) {
    $log->out("Process date $date");

    my $simple_ag_data = {};
    fetch_log_data($simple_ag_data, $date);

    $log->out("Data fetched");

    # flatten { uid => { ip => hits } } into [uid, ip, hits] rows, skipping empty counters
    my @data;
    for my $uid (keys %$simple_ag_data) {
        my $hits_by_ip = $simple_ag_data->{$uid};
        push @data,
            map  { [$uid, $_, $hits_by_ip->{$_}] }
            grep { $hits_by_ip->{$_} }
            keys %$hits_by_ip;
    }

    do_in_transaction {
        do_mass_insert_sql(
            MONITOR,
            "INSERT INTO clients_geo_ip_cache (uid, ip, hits_cnt) VALUES %s
                ON DUPLICATE KEY UPDATE hits_cnt = hits_cnt + values(hits_cnt)",
            \@data,
            { max_row_for_insert => 1000 }
            );
    };

    # remember the last date added to the cache; get_dates() continues after it
    Property->new("clients_geo_ip_cache_date")->set($date);
}
$log->out('cache is filled up');

$log->out('get client->uid relations');

my $uids = get_one_column_sql(MONITOR, 'SELECT DISTINCT uid FROM clients_geo_ip_cache');
my %client2uids;
for my $uid_chunk (chunks($uids, 2_500)) {
    my $clientid_by_uid = get_key2clientid(uid => $uid_chunk);
    for my $uid (@$uid_chunk) {
        my $ClientID = $clientid_by_uid->{$uid};
        # warm up the lookup cache used by load_cache, including uids without a client
        $clientid_by_uid_cache{$uid} = $ClientID;
        next unless $ClientID;
        push @{ $client2uids{$ClientID} }, $uid;
    }
}
undef $uids; # the full uid list is no longer needed, release the memory

$log->out('calc geo of clients');

my @client_ids = keys %client2uids;
my $total_clients     = scalar @client_ids;
my $processed_clients = 0;
$log->out('total clients: ' . $total_clients);

while (my @portion = splice(@client_ids, 0, 100)) {
    $processed_clients += scalar @portion;

    # delete the uid lists while collecting them, to release memory as we go
    my @uids = map { @{ delete $client2uids{$_} } } @portion;

    my $ag_data = {};
    load_cache($ag_data, \@uids);

    $log->out("calc geo");
    calc_geo_by_ip($ag_data);                   # calc user's regions

    $log->out("update results table");
    update_clients_geo_ip($ag_data);            # save results

    $log->out("progress: processed $processed_clients clients of $total_clients");
}

juggler_ok();

$log->out('Finish!');

exit 0;

#
#
#   Get the log dates that are not in the cache yet
#   get_dates($end_date)
#
#   $end_date - upper bound (exclusive) for log_date
#   The lower bound (exclusive) is the date stored in the
#   clients_geo_ip_cache_date property, if there is one.
#   return: arrayref of dates found in ppclog_cmd, sorted as strings
#
sub get_dates {
    my ($end_date) = @_;

    my $start_date = Property->new("clients_geo_ip_cache_date")->get();
    # a value stored as YYYYMMDD is converted to YYYY-MM-DD
    $start_date =~ s/(\d{4})(\d{2})(\d{2})/$1-$2-$3/ if $start_date;

    my %date_range = (log_date__lt => $end_date);
    $date_range{log_date__gt} = $start_date if $start_date;

    my $clh = get_clickhouse_handler('cloud');

    my $query = $clh->format(['
        SELECT
            log_date as date
        FROM
            ppclog_cmd
        WHERE',
        \%date_range,
        'GROUP BY
            date
    ']);

    $clh->query_format('JSON');

    my $res = $clh->query($query)->json->{data};

    my @found_dates = map { $_->{date} } @$res;
    return [ sort @found_dates ];
}

#
#   Aggregate ppclog_cmd hits for one day by (uid, ip)
#   fetch_log_data($simple_ag_data, $date)
#
#   $simple_ag_data - dst hashref, filled as { uid => { ip => hits count } }
#   $date           - log_date to select
#   Only rows where cluid holds exactly one element equal to uid are counted,
#   and only dotted-quad (IPv4-looking) addresses are kept.
#
sub fetch_log_data {
    my ($simple_ag_data, $date) = @_;

    $log->out("select from ppclog_cmd for $date");

    my $clh = get_clickhouse_handler('cloud');

    my $query = $clh->format(["
        SELECT
            ip, uid, count(*) as count
        FROM
            ppclog_cmd
        WHERE
                log_date = '$date'
            AND length(cluid) = 1
            AND cluid[1] = uid
        GROUP BY
            ip, uid
    "]);

    $clh->query_format('JSON');

    my $res = $clh->query($query)->json->{data};

    for my $row (@$res) {
        my $ip = $row->{ip};
        # skip everything that does not look like an IPv4 address
        next unless $ip =~ m/^\d{1,3}(\.\d{1,3}){3}$/;

        $simple_ag_data->{ $row->{uid} }->{ $ip } += $row->{count};
    }
}

#
#   Load cached stats for the given uids and aggregate them by client
#   load_cache($ag_data, $uids)
#
#   $ag_data - dst hashref, filled as
#              { ClientID => { ClientID, uid, ips => { ip => { hits_cnt } } } }
#              (uid is the first uid met for the client)
#   $uids    - arrayref of uids to select from clients_geo_ip_cache
#   return: nothing; a failure is logged and not rethrown, so $ag_data may
#           stay empty or partially filled
#
sub load_cache($$) {
    my ($ag_data, $uids) = @_;
    $log->out("debug: start load_cache");

    my $loaded = eval {
        my $data = get_all_sql(
            MONITOR,
            [
                'SELECT uid, ip, hits_cnt FROM clients_geo_ip_cache',
                WHERE => { uid => $uids }
            ]
        ) or die "no cache data selected";

        for my $row (@$data) {
            my $ClientID = get_clientid_by_uid_cached($row->{uid});
            next unless $ClientID;

            $ag_data->{$ClientID} ||= { ClientID=>$ClientID, uid=>$row->{uid} };
            $ag_data->{$ClientID}->{ips}->{$row->{ip}}->{hits_cnt} += $row->{hits_cnt};
        }
        1;
    };
    # Capture the error right away: $@ is a global and the logging call below
    # may reset it before it is inspected.
    my $error = $loaded ? undef : ($@ || 'unknown error');

    $log->out("debug: finish load_cache");
    if (defined $error) {
        $log->out("ERROR: $error");
    }
    return;
}

#
#   Find the deepest region (the one with the biggest type) that holds at least
#   $WINNER_RATIO of all hits
#
#   Args: list of { geo => region id, hits_cnt => number } hashrefs;
#         entries whose geo is not in %geo_regions::GEOREG are ignored
#   return: region id, or 0 if there is nothing to choose from / no winner
#
sub get_winner_region {
    my @known = grep { exists $geo_regions::GEOREG{$_->{geo}} } @_;

    return 0 unless @known;
    return $known[0]->{geo} if @known == 1;

    my $deepest_type = max map { $geo_regions::GEOREG{$_->{geo}}->{type}||0 } @known;
    my $threshold    = $WINNER_RATIO * sum map { $_->{hits_cnt} } @known;

    # walk from the deepest level up to the top one
    for my $type (reverse($min_type .. $deepest_type)) {
        # project every region onto this level and add up the hits per projection
        my %hits_by_geo;
        for my $region (@known) {
            my $projection = get_soft_geo_projection($region->{geo}, {type=>$type});
            $hits_by_geo{$projection} += $region->{hits_cnt};
        }

        # check if the winner is on this level
        for my $geo (keys %hits_by_geo) {
            return $geo if $hits_by_geo{$geo} >= $threshold;
        }
    }
    return 0;
}

#
#   Iteratively narrow down the user's region: find the winner region, keep only
#   the regions inside it, find the winner among those, and so on until the
#   winner stops changing
#
#   Args: list of { geo => region id, hits_cnt => number } hashrefs;
#         entries whose geo is not in %geo_regions::GEOREG are ignored
#   return: region id (0 if no winner was found); if the search does not
#           settle within $MAX_DEPTH steps, the last winner found is returned
#
sub get_local_winner_region {
    my @regions = grep {exists $geo_regions::GEOREG{$_->{geo}}} @_;

    my $winner_geo;
    my @candidates = @regions;

    my $MAX_DEPTH = 20;
    for my $step (1 .. $MAX_DEPTH) {
        my $new_winner_geo = get_winner_region(@candidates);

        # the winner did not change since the previous step - done
        return $winner_geo if defined $winner_geo and $new_winner_geo eq $winner_geo;
        $winner_geo = $new_winner_geo;

        # next step looks only at the winner itself and at the regions below it
        @candidates = grep {
                my $region = $_;
                $winner_geo eq $region->{geo}
                    or grep { $winner_geo eq $_ } @{$geo_regions::GEOREG{ $region->{geo} }->{parents}||[]}
            } @regions;
    }

    # Explicit fallback: without it the sub would return the (unspecified) value
    # of the for loop when the step limit is hit.
    return $winner_geo;
}

#
#   Project $geo onto the level $opt->{type}: among $geo and its parents, return
#   the one with the biggest type that is still <= $opt->{type}
#
#   return: region id, or 0 if $geo is unknown or nothing fits
#
sub get_soft_geo_projection {
    my ($geo, $opt) =@_;

    return 0 unless exists $geo_regions::GEOREG{$geo};

    # the region itself and all its parents, deepest first
    my @lineage = sort {
        ($geo_regions::GEOREG{$b}->{type}||0) <=> ($geo_regions::GEOREG{$a}->{type}||0)
    } ($geo, @{$geo_regions::GEOREG{$geo}->{parents} || []});

    foreach my $candidate (@lineage) {
        my $candidate_type = $geo_regions::GEOREG{$candidate}->{type} || 0;
        return $candidate if $candidate_type <= $opt->{type};
    }
    return 0;
}

#
#   Resolve every IP of every client to a Direct region and choose the client's region
#
#   $ag_data - hashref of client records ({ ips => { ip => {...} }, ... });
#              modified in place: each ips->{ip} gets {geo}, each record gets {local_geo}
#
sub calc_geo_by_ip {
    my ($ag_data) = @_;

    foreach my $client (values %$ag_data) {
        my $ips = $client->{ips} ||= {};

        for my $ip (keys %$ips) {
            $ips->{$ip}->{geo} = get_direct_region_from_geo( get_geo_from_ip( $ip ) );
        }

        # only the "local" (iteratively narrowed) winner is stored
        $client->{local_geo} = get_local_winner_region(values %$ips);
    }
}

#
#   Save the calculated regions (and the offices matching them) to clients_geo_ip
#
#   $ag_data - hashref of client records with {ClientID} and {local_geo} filled in
#   Rows are written per shard in chunks of 1000 clients; existing rows are updated.
#
sub update_clients_geo_ip {
    my ($ag_data) = @_;

    # one row per client: [ClientID, geo id, office id, office short name]
    my %row_by_client;
    foreach my $client (values %$ag_data) {
        my $client_id = $client->{ClientID};
        my $office    = get_office_by_geo($client->{local_geo});
        $row_by_client{$client_id} = [
            $client_id,
            $client->{local_geo},
            $office->{office_id},
            $office->{office_short_name},
        ];
    }

    foreach_shard(ClientID => [keys %row_by_client], chunk_size => 1000, sub {
        my ($shard, $client_ids) = @_;
        do_mass_insert_sql(PPC(shard => $shard), "
            INSERT INTO clients_geo_ip
                        (ClientID, geo_id_by_ip, office_id_by_ip, office_name)
                        values %s
            ON DUPLICATE KEY UPDATE
                        geo_id_by_ip    = values(geo_id_by_ip),
                        office_id_by_ip = values(office_id_by_ip),
                        office_name     = values(office_name),
                        update_time = NOW()
            ", [ @row_by_client{@$client_ids} ]);
    });
}
