package Direct::YT::Export::MonitorStats;

=head1 NAME

    Direct::YT::Export::MonitorStats - подсчёт различных статистик в YT, замена ppcMonitorDaily

=cut

use strict;
use warnings;

use List::MoreUtils qw/uniq/;
use Digest::MD5 qw/md5/;

use Yandex::YT;

use Direct::YT::monitor_stats;
use Direct::YT::monitor_stats::adgroups;
use Direct::YT::monitor_stats::banners;
use Direct::YT::monitor_stats::banner_images;
use Direct::YT::monitor_stats::vcards;
use Direct::YT::monitor_stats::sitelinks;
use Direct::YT::monitor_stats::mobile_multiplier_pct;
use Direct::YT::monitor_stats::hierarchical_multipliers;

# Use JSON as the default Yandex::YT streaming format.
$Yandex::YT::Streaming::DEFAULT_FORMAT = 'json';

# Shared statistics collector; it also knows which columns of every table it needs.
my $mon_stat = Direct::YT::monitor_stats->get_singleton();
my $used_fields = $mon_stat->used_fields;

# Operation spec shared by all map_reduce tasks of the job (same string for every task).
my $porto_spec = 'mapper={layer_paths=["//porto_layers/yt_packages_from_banach.tar.gz"]};'
                .'reducer={layer_paths=["//porto_layers/yt_packages_from_banach.tar.gz"]};'
                .'scheduling_tag_filter=porto';

# Job 'main': three chained map_reduce tasks followed by removal of the intermediate tables.
#   1. materialize_table_index + first_reduce: source tables -> $tmp_dir/MonitorStats-1
#   2. uniq_buckets + uniq_reduce: MonitorStats-1 -> $tmp_dir/MonitorStats-2
#   3. final_sum (no mapper): MonitorStats-2 -> $export_path
job 'main',
    vars_spec => {
        # The checks are anchored with \A and \z rather than ^ and $: in Perl "$" also matches
        # before a trailing newline, so a value like "//tmp/x\n" would pass the check and then
        # end up inside the '$tmp_dir/MonitorStats-N' paths used by the tasks below.
        export_path => qr/\A[\w\/_-]+\z/,
        tmp_dir => qr/\A[\w\/_-]+\z/,
        # for debugging: optional range in the form [from:to], either bound may be omitted
        cids_range => {default => '', check => qr/\A(|\[(\d+)?:(\d+)?\])\z/},

        # field lists for each table are taken from mon_stat (plus the key columns listed here)
        campaigns_fields => {default => sub {join ",", uniq qw/cid/, @{$used_fields->{campaigns}}}, check => qr/./},
        vcards_fields => {default => sub {join ",", uniq qw/cid vcard_id/, @{$used_fields->{vcards}}}, check => qr/./},

        phrases_fields => {default => sub {join ",", uniq qw/cid pid/, @{$used_fields->{phrases}}}, check => qr/./},
        hierarchical_multipliers_fields => {default => sub {join ",", uniq qw/cid pid type/, @{$used_fields->{hierarchical_multipliers}}}, check => qr/./},
        bids_fields => {default => sub {join ",", uniq qw/cid pid/, @{$used_fields->{bids}}}, check => qr/./},
        banners_fields => {default => sub {join ",", uniq qw/cid pid bid vcard_id/, @{$used_fields->{banners}}}, check => qr/./},
    },
    tasks => [
        # NOTE: first_reduce tells the tables apart by their position in this list
        # (0 = campaigns ... 5 = banners), so the order of the src tables must not change.
        [map_reduce => ['materialize_table_index', 'first_reduce'],
         src => '
                //home/direct/db/campaigns{$campaigns_fields}$cids_range
                //home/direct/db/vcards{$vcards_fields}$cids_range

                //home/direct/db/phrases{$phrases_fields}$cids_range
                //home/direct/db/hierarchical_multipliers{$hierarchical_multipliers_fields}$cids_range
                //home/direct/db/bids{$bids_fields}$cids_range
                //home/direct/db/banners{$banners_fields}$cids_range
                ',
         dst => '$tmp_dir/MonitorStats-1',
         spec => $porto_spec,
        ],
        [map_reduce => ['uniq_buckets', 'uniq_reduce'],
         src => '
                $tmp_dir/MonitorStats-1
                ',
         dst => '$tmp_dir/MonitorStats-2',
         spec => $porto_spec,
        ],
        [map_reduce => [undef, 'final_sum'],
         src => '
                $tmp_dir/MonitorStats-2
                ',
         dst => '$export_path',
         spec => $porto_spec,
        ],
        [remove => '$tmp_dir/MonitorStats-1'],
        [remove => '$tmp_dir/MonitorStats-2'],
    ];

# Turn @table_index (the position of the table in the request) into a regular field "ti",
# so that records can later be sorted by it (reduce-side join).
mapper 'materialize_table_index',
    sub {
        my ($stream) = @_;
        while (my $row = $stream->get()) {
            # the attribute is read for every row: it follows the table the row came from
            $row->{ti} = $stream->{_attrs}{table_index};
            $stream->yield($row);
        }
};


reducer 'first_reduce',
    reduceby => 'cid',
    # YT guarantees that "missing values (values of type null) are less than any value of another type",
    # so within one campaign the records arrive in this order:
    #   campaigns, vcards, hierarchical_multipliers (campaign-level),
    #   then (phrases, hierarchical_multipliers (adgroup-level), bids, banners)* for every adgroup
    sortby => ['pid', 'ti'],
    sub {
        my ($stream, $vars) = @_;

        # Values of the "ti" field, one per kind of record handled below.
        my ($TI_CAMPAIGNS, $TI_VCARDS, $TI_PHRASES, $TI_MULTIPLIERS, $TI_BIDS, $TI_BANNERS) = (0 .. 5);

        my $campaigns_done = 0;
        while (my $group = $stream->get_group()) {
            # The first record of a group has to be the campaigns row; otherwise the group is skipped.
            my $campaign = $stream->get();
            if ($campaign->{ti} != $TI_CAMPAIGNS) {
                $stream->pass_group();
                next;
            }
            $campaign->{hierarchical_multipliers} = {};

            my $current_adgroup;
            my %vcard_by_id;
            while (my $row = $stream->get()) {
                my $ti = $row->{ti};

                if ($ti == $TI_VCARDS) {
                    # vcards are simply collected into a separate hash
                    $vcard_by_id{$row->{vcard_id}} = $row;
                    $row->{campaign} = $campaign;
                    $mon_stat->on_vcard($row);
                    next;
                }
                if ($ti == $TI_MULTIPLIERS && !$row->{pid}) {
                    # campaign-level multiplier (no pid)
                    $campaign->{hierarchical_multipliers}{$row->{type}} = $row;
                    next;
                }

                # Everything from here on belongs to one particular adgroup.
                # A change of pid means the previous adgroup is complete.
                if ($current_adgroup && $current_adgroup->{pid} != $row->{pid}) {
                    call_adgroup_callbacks($mon_stat, $current_adgroup);
                    $current_adgroup = undef;
                }

                if ($ti == $TI_PHRASES) {
                    # the phrases row opens a new adgroup
                    $current_adgroup = $row;
                    @{$current_adgroup}{qw/campaign phrases banners hierarchical_multipliers/}
                        = ($campaign, [], [], {});
                    next;
                }

                # Remaining record kinds are attached only to an already opened adgroup.
                next unless $current_adgroup;

                if ($ti == $TI_MULTIPLIERS) {
                    # adgroup-level multiplier
                    $current_adgroup->{hierarchical_multipliers}{$row->{type}} = $row;
                }
                elsif ($ti == $TI_BIDS) {
                    $row->{adgroup} = $current_adgroup;
                    push @{$current_adgroup->{phrases}}, $row;
                }
                elsif ($ti == $TI_BANNERS) {
                    $row->{adgroup} = $current_adgroup;
                    $row->{vcard} = $vcard_by_id{$row->{vcard_id} // ''};
                    push @{$current_adgroup->{banners}}, $row;
                }
            }

            # The last adgroup of the campaign is still pending at this point.
            call_adgroup_callbacks($mon_stat, $current_adgroup) if $current_adgroup;
            $mon_stat->on_campaign($campaign);
            for my $vcard (values %vcard_by_id) {
                delete $vcard->{campaign};
            }

            # Look at the collector size only on every 100th processed campaign
            # and flush it once it holds at least a million entries.
            if ($campaigns_done++ % 100 == 0 && $mon_stat->stat_cnt >= 1_000_000) {
                $mon_stat->flush($stream);
            }
        }
        $mon_stat->flush($stream);
};

# Feed a fully assembled adgroup to the collector: first the adgroup itself,
# then each of its banners, then each of its phrases.
#   $mon_stat - object providing on_adgroup / on_banner / on_phrase
#   $adgroup  - hashref with 'banners' and 'phrases' array refs
# Both lists are removed from $adgroup afterwards.
sub call_adgroup_callbacks {
    my ($mon_stat, $adgroup) = @_;

    $mon_stat->on_adgroup($adgroup);

    # [list key in the adgroup => collector method called for each item], in call order
    for my $step ([banners => 'on_banner'], [phrases => 'on_phrase']) {
        my ($list_key, $callback) = @$step;
        for my $item (@{$adgroup->{$list_key}}) {
            $mon_stat->$callback($item);
        }
    }

    # break reference cycles (items point back to the adgroup) for gc
    delete @{$adgroup}{qw/banners phrases/};
}


# Fighting monsters: there can be a lot of distinct "uniq" values, so count_distinct is first
# computed within 256 buckets, and the per-bucket results are then simply summed up.
mapper 'uniq_buckets',
    sub {
        my ($stream) = @_;
        while (my $row = $stream->get()) {
            my $uniq = $row->{uniq};
            if (defined $uniq) {
                # md5 is not great, but no other checksum bundled with the perl distribution was found.
                # unpack 'L' reads only the first 4 bytes of the digest.
                $row->{uniq_b} = unpack('L', md5($uniq)) % 256;
            }
            $stream->yield($row);
        }
};

# Aggregate everything that can be aggregated down to the (metric, bucket) level:
# groups with a bucket get the number of distinct "uniq" values in "sum",
# all other groups get their avg_cnt / avg_sum / sum fields added up.
reducer 'uniq_reduce',
    reduceby => ['metric', 'uniq_b'],
    sub {
        my ($stream, $vars) = @_;
        while (my $group = $stream->get_group()) {
            if (defined $group->{uniq_b}) {
                # count distinct values inside this bucket
                my %seen;
                while (my $row = $stream->get()) {
                    $seen{$row->{uniq}} = 1;
                }
                $group->{sum} = scalar keys %seen;
            }
            else {
                while (my $row = $stream->get()) {
                    for my $field (qw/avg_cnt avg_sum sum/) {
                        next unless defined $row->{$field};
                        $group->{$field} += $row->{$field};
                    }
                }
            }
            $stream->yield($group);
        }
};

# Final aggregation down to the metric level, including computation of averages.
# Emits one row per metric: {metric, val, desc}.
reducer 'final_sum',
    reduceby => ['metric'],
    sub {
        my ($stream, $vars) = @_;
        my $metrics = $mon_stat->metrics;
        while (my $group = $stream->get_group()) {
            while (my $row = $stream->get()) {
                for my $field (qw/avg_cnt avg_sum sum/) {
                    next unless defined $row->{$field};
                    $group->{$field} += $row->{$field};
                }
            }

            # A metric with avg_sum is reported as an average (a zero or missing avg_cnt is
            # replaced by 1 to avoid division by zero); any other metric is reported as a sum.
            my $value = defined $group->{avg_sum}
                ? $group->{avg_sum} / ($group->{avg_cnt} || 1)
                : $group->{sum};

            my $metric = $group->{metric};
            $stream->yield({metric => $metric, val => $value, desc => $metrics->{$metric}});
        }
};

1;
