# -*- encoding: utf-8; tab-width: 8 -*-
package Direct::YT::banners_href_substituted_length_yt;
use Direct::Modern;
use Yandex::YT;
use URI::Escape qw/uri_escape_utf8/;

=pod

Общее число баннеров с шаблонами можно подсчитать как:

    mapreduce-yt -dsv -src //home/direct/db/banners -map 'grep -P '"'"'href=[^\t]*#[^\t]*#'"'"' || true' -dst //tmp/binarin-test-banners

Этот скрипт запускается через:

   PERL5LIB=protected:perl/settings pyt Direct::YT::banners_href_substituted_length_yt -v tmp_path=//tmp main

=cut


# Matches one "#...#" template placeholder; the text between the two hash
# signs is captured as group 1.
# Copied verbatim from BannerTemplates, because loading that module on YT
# servers fails with a complaint about a missing Log::Syslog::Fast.
my $TEMPLATE_METKA = qr{(?^ui:\#((?^ui:[abcdefghijklmonpqrstuvwxyzABCDEFGHIJKLMONPQRSTUVWXYZабвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЄІЇєіїҐґ\'ğĞüÜşŞöÖçÇıİ̇ҰҒғӨҢҚҺҮұөңқһүҖҲҳҗӘә\\\[\\\]0123456789\.ҰҒғӨҢҚҺҮұөңқһүҖҲҳҗӘә\-\+\,\.\ \"\!\?\\\(\)\%\$€\;\:\\/\&\'\*_\=\#№«» –—−])*?)\#)};

# Job 'main': expands the href of every banner that contains placeholders
# against every bid sharing its (cid, pid), then aggregates statistics on the
# resulting href lengths.
# '$tmp_path' inside the single-quoted paths is left literal here; presumably
# Yandex::YT substitutes it from the "-v tmp_path=..." option - TODO confirm.
job 'main',
    tasks => [
        # Copy the needed bid columns, tagging every row with type => 'bid'.
        [ map => 'all_bids', src => 'db/bids{id,pid,cid,phrase,param1,param2}', dst => '$tmp_path/bids-for-templates-processing' ],
        # Keep only banners whose href has a "#...#" template or a "{name}"
        # parameter, tagging every row with type => 'banner'.
        [ map => 'find_template_banners', src => 'db/banners{bid,pid,href,cid}', dst => '$tmp_path/banners-with-templates', ],
        # Combine both intermediate tables into one table sorted by (cid, pid),
        # which is the key the following reduce groups by.
        [
            sort => ['cid', 'pid'],
            src => '
                $tmp_path/bids-for-templates-processing
                $tmp_path/banners-with-templates
            ',
            dst => '$tmp_path/banners-with-templates-and-bids',
        ],
        # Expand every banner/bid pair within a group. The reducer yields
        # over-long pairs without a table index and per-length counts with
        # index 1; presumably these map to the two dst tables in the order
        # listed - TODO confirm against Yandex::YT.
        [
            reduce => 'very_long_expansions',
            src => '$tmp_path/banners-with-templates-and-bids',
            dst => '
                $tmp_path/very-long-href-expansions
                $tmp_path/href-expansion-lengths
            ',
        ],
        # No dst is given; presumably the table is sorted in place - TODO confirm.
        [
            sort => ['length'],
            src => '$tmp_path/href-expansion-lengths',
        ],
        # Sum the partial per-length counts into final totals.
        [
            reduce => 'gather_totals',
            src => '$tmp_path/href-expansion-lengths',
            dst => '$tmp_path/href-expansion-totals',
        ],
        # Drop the intermediate tables; very-long-href-expansions and
        # href-expansion-totals are the ones left behind.
        [ remove => '$tmp_path/bids-for-templates-processing' ],
        [ remove => '$tmp_path/banners-with-templates' ],
        [ remove => '$tmp_path/banners-with-templates-and-bids' ],
        [ remove => '$tmp_path/href-expansion-lengths' ],
    ];

# Reducer 'gather_totals': adds up the 'count' field of all incoming records
# per 'length' value and emits one {length, count} record for every distinct
# length this reducer has seen.
reducer 'gather_totals',
    reduceby => 'length',
    sub {
        my ($stream) = @_;

        # length => sum of 'count' over every record carrying that length
        my %count_for;
        while ($stream->get_group) {
            while (my $row = $stream->get) {
                $count_for{ $row->{length} } += $row->{count};
            }
        }

        # Totals are emitted only after every group has been consumed.
        for my $length (keys %count_for) {
            $stream->yield({length => $length, count => $count_for{$length}});
        }
    };

# Reducer 'very_long_expansions': within every (cid, pid) group, expands each
# banner href against each bid of the same group (see expand_href).
#   * Every pair whose expanded href is longer than 8192 (as measured by
#     length()) is yielded as {banner => <banner's bid>, bid => <bid's id>}.
#   * Once all groups are processed, one {length, count} record per distinct
#     expanded length is yielded with a second argument of 1 - presumably the
#     index of the second output table; confirm against Yandex::YT.
reducer 'very_long_expansions',
    reduceby => [qw/cid pid/],
    sub {
        my ($stream) = @_;

        # expanded href length => number of banner/bid pairs producing it
        my %pairs_with_length;

        while ($stream->get_group()) {
            # Split the group's records by their 'type' tag.
            my (@group_banners, @group_bids);
            while (my $row = $stream->get) {
                my $type = $row->{type};
                if ($type =~ /^banner$/) {
                    push @group_banners, $row;
                }
                if ($type =~ /^bid$/) {
                    push @group_bids, $row;
                }
            }

            # Every banner of the group is combined with every bid of the group.
            for my $banner_row (@group_banners) {
                for my $bid_row (@group_bids) {
                    my $expanded = expand_href(
                        banner => $banner_row,
                        bid => $bid_row,
                    );
                    my $expanded_length = length($expanded);

                    $pairs_with_length{$expanded_length}++;
                    next unless $expanded_length > 8192;
                    $stream->yield({banner => $banner_row->{bid}, bid => $bid_row->{id}});
                }
            }
        }

        for my $length (keys %pairs_with_length) {
            $stream->yield({length => $length, count => $pairs_with_length{$length}}, 1);
        }
    };

# Mapper 'all_bids': re-emits every input record restricted to the columns
# pid, cid, phrase, id, param1 and param2, with an added type => 'bid' tag.
mapper 'all_bids', sub {
    my ($stream) = @_;
    my @copied_columns = qw/pid cid phrase id param1 param2/;

    while (my $row = $stream->get) {
        my %tagged = (type => 'bid');
        # Hash slice: copy the listed columns in one assignment.
        @tagged{@copied_columns} = @{$row}{@copied_columns};
        $stream->yield(\%tagged);
    }
};

# Mapper 'find_template_banners': passes through only banners whose href
# contains something to expand - either a "#...#" template ($TEMPLATE_METKA)
# or a "{name}" parameter. Emitted records carry pid, bid, href and cid plus
# a type => 'banner' tag.
mapper 'find_template_banners', sub {
    my ($stream) = @_;
    my @copied_columns = qw/pid bid href cid/;

    while (my $row = $stream->get) {
        my $href = $row->{href};

        # Banners with an empty or missing href have nothing to expand.
        next unless $href;
        next unless $href =~ $TEMPLATE_METKA || $href =~ /\{\w+\}/;

        my %tagged = (type => 'banner');
        @tagged{@copied_columns} = @{$row}{@copied_columns};
        $stream->yield(\%tagged);
    }
};


# Builds the href a banner would get for one particular bid by filling in
# every placeholder, so that the caller can measure the resulting length.
#
# Named arguments:
#   banner - hashref; the keys href, cid, pid and bid are read
#   bid    - hashref; the keys phrase, param1 and param2 are read
#
# Two substitution passes are applied, in this order:
#   1. every "#...#" template ($TEMPLATE_METKA) is replaced with the
#      URI-escaped full phrase;
#   2. every "{name}" whose name (compared case-insensitively) is listed in
#      %substitutions is replaced with the corresponding value; an undefined
#      value expands to the empty string. Unknown "{name}" parameters are left
#      untouched.
#
# Returns the expanded href.
sub expand_href {
    my (%o) = @_;
    my $href = $o{banner}{href};

    # A bid without a phrase expands templates to nothing instead of
    # interpolating undef into the substitution below.
    my $phrase = $o{bid}{phrase} // '';
    my $keyword = uri_escape_utf8($phrase);

    # {keyword} gets only the part of the phrase before the first
    # whitespace-plus-minus sequence; split yields nothing for an empty phrase.
    my ($user_keyword) = split /\s-/, $phrase, 2;
    $user_keyword = uri_escape_utf8($user_keyword // '');

    # BannerTemplates
    $href =~ s/$TEMPLATE_METKA/$keyword/gsi;

    my %substitutions = (
        # process_href_params
        # NOTE(review): the constant strings below look like fixed-width
        # stand-ins for values that are not available in this job - confirm.
        'position' => 'XXXXX',
        'position_type' => 'XXXXX',
        'source' => 'XXXXX',
        'source_type' => 'XXXXX',
        'addphrases' => 'BM',
        'param1' => $o{bid}{param1},
        # Fixed: this used to read param1, so {param2} expanded to the wrong value.
        'param2' => $o{bid}{param2},
        'phraseid' => 'XXXXXXXXXX',
        'phrase_id' => 'XXXXXXXXXX',
        'retargeting_id' => 'XXXXXXXXXX',
        'keyword' => $user_keyword,
        'gbid' => 'GBID',

        # substitute_href_params
        campaignid => $o{banner}{cid},
        bannerid => $o{banner}{bid},
        adid => $o{banner}{bid},
        adgroupid => $o{banner}{pid},
        campaign_id => $o{banner}{cid},
        banner_id => $o{banner}{bid},
        ad_id => $o{banner}{bid},
        adgroup_id => $o{banner}{pid},
    );

    # All keys are lowercase; the match is case-insensitive, hence the lc()
    # on lookup. The closing brace in the pattern makes alternation order
    # irrelevant even though some keys are prefixes of others.
    my $substitution_re = join "|", keys %substitutions;
    $href =~ s!\{($substitution_re)\}!$substitutions{lc $1} // ''!ige;

    return $href;
}

1;
