package Yandex::YT;

=head1 NAME
    
    Yandex::YT - работа из perl с yt

=head1 DESCRIPTION

    todo

=head2 SYNOPSIS

    use Yandex::YT;
    mapper 'mapper_example',
        sub {
            my ($s, $vars) = @_;
            my %stat;
            while(my $rec = $s->get()) {
                $stat{substr($rec->{logtime}, 0, 8)}++ if $rec->{cmd} && $rec->{cmd} eq $vars->{cmd};
            }
            while(my ($date, $cnt) = each %stat) {
                $s->yield({date=>$date, cnt => $cnt});
            }
        }
    ;
    reducer 'reducer_example',
        reduceby => 'date',
        sub {
            my ($s, $vars) = @_;
            while(my $group = $s->get_group()) {
                $group->{cnt} += $_->{cnt} for $s->get_all();
                $s->yield($group);
            }
        }
    ;
    job 'main',
        vars_spec => { 
            log => qr/^\w+$/,
            date_from => qr/^\d+$/,
            date_to => qr/^\d+$/,
        },
        tasks => [
            [map => 'mapper_example', 
                    src => 'logs/$log["$date_from":"$date_to"]', 
                    dst => 'tmp/t1',
                    files => {'stopwords.txt' => '$stpwrd_file'},
            ],
            [reduce => 'reducer_example', src => 'tmp/t1', dst => 'tmp/t2'],
            [format => 'tsv', src => 'tmp/t2'],
        ],
    ;

    do_job(name => 'main');

=cut

use strict;
use warnings;
use Carp qw/croak/;
use Data::Dumper;
use File::Temp qw/tempdir/;
use File::Basename qw/basename dirname/;
use File::Path qw/mkpath/;

use Yandex::Shell qw/yash_qx/;
use Yandex::TimeCommon qw/get_distinct_dates/;

use Yandex::YT::Streaming;
use Yandex::YT::Table::Schema;
use Yandex::YT::Tx;

use base qw/Exporter/;
our @EXPORT = qw/
    yt
    job mapper reducer formatter
/;

our $DEBUG;

=head2 $JOB_ADD_FILES{'stopwords.lst'} = $stopwords_local_path;

    хэш с путями к файлам, которые нужно дополнительно грузить в YT
        ключ - относительный путь к файлу в архиве
        значение - локальный путь к файлу
    
=cut
our %JOB_ADD_FILES;


=head2 $PACK_FILTER_RE

    регулярка, определающая, какие модули из системных путей нужно 
    упаковывать при отправке данных в YT
    
=cut
our $PACK_FILTER_RE ||= qr/Yandex|Yx|CLemmer|clemmer|Direct\/Modern/;

our %_reg;

sub job(@) {
    _reg_handle('job', @_);
}

sub mapper(@) {
    _reg_handle('map', @_);
}

sub reducer(@) {
    _reg_handle('reduce', @_);
}

sub formatter(@) {
    _reg_handle('format', @_);
}

sub _reg_handle {
    my ($type, $name) = (shift, shift);
    my $func = $type =~ /^(map|reduce|format)$/ ? pop : undef;
    my %OPT = @_;
    if (!defined $name) {
        croak '\u$type name is not defined';
    } elsif (ref($name) || $name !~ /^\w+$/) {
        croak "Incorrect $type name: '$name'";
    }

    my $pack = (caller(1))[0];
    my $rec = $_reg{$type}{$pack.'::'.$name} = {
        func => $func,
        pack => $pack,
        type => $type,
        name => $name,
    };

    if ($type =~ /^(map|reduce|format)$/) {
        $rec->{format} = delete $OPT{format} || $Yandex::YT::Streaming::DEFAULT_FORMAT;

        if (ref($func) ne 'CODE') {
            croak "\u$type subroutine is not defined";
        }
        $rec->{func} = $func;
    }

    if ($type =~ /^(map|reduce)$/) {
        if ($type eq 'reduce') {
            _process_fields_list($rec, 'reduceby', \%OPT);
            _process_fields_list($rec, 'sortby', \%OPT);
            croak "Argument 'reduceby' is not defined for $name" if !$rec->{reduceby};
        }
    }

    if ($type =~ /^(job)$/) {
        $rec->{tasks} = delete $OPT{tasks} || croak "Tasks for $type $name is not defined";
        if ($OPT{vars_spec}) {
            $rec->{vars_spec} = delete $OPT{vars_spec};
        }
    }

    if ($type =~ /^(format)$/ && $OPT{params}) {
        croak "Incorrect params for $type $name" if ref($OPT{params}) ne 'HASH';
        $rec->{params} = delete $OPT{params};
    }

    if (my $filter = delete $OPT{filter}) {
        croak "Incorrect filer for $name" if ref $filter;
        $rec->{filter} = $filter;
    }
    
    if (%OPT) {
        croak "Unknown params: ".join(', ', sort keys %OPT);;
    }
}

sub _process_fields_list {
    my ($rec, $name, $opt) = @_;
    return if !exists $opt->{$name};
    my $val = delete $opt->{$name};
    my @vals = ref($val) eq 'ARRAY' ? @$val
        : ref($val) eq '' ? split(/\s*,\s*|\s+/, $val)
        : croak "Incorrect '$name' argument: '$val'";
    croak "Empty '$name' argument" if !@vals;
    croak "Incorrect '$name' argument: ".join(', ', @vals) if grep {!/^\@?\w+$/} @vals;
    $rec->{$name} = \@vals;
}

sub _interpolate {
    my ($str, $vars) = @_;
    return undef if !defined $str;
    my $vv = sub {
        (my $p = shift) =~ s/^\{(\w+)\}/$1/; 
        croak "Variable $p is not defined" if !$vars || !defined $vars->{$p};
        return $vars->{$p};
    };
    $str =~ s/\$(\w+|\{\w+\})/$vv->($1)/ge;
    
    $str =~ s/(?<prefix>\S*)
              \$\{(?<func>\w+) \s+ 
              (?<args>[^\}]+) \}
              (?<suffix>\S*)
             /_interpolate_func(@+{qw!prefix suffix func args!})/gex;

    return $str;
}

sub _interpolate_func {
    my ($prefix, $suffix, $func, $args) = @_;
    my @args = split /\s+/, $args;
    my %funcs = (
        date_range => sub { return map {s/^(\d\d\d\d)(\d\d)(\d\d)/$1-$2-$3/r} get_distinct_dates($args[0], $args[1]); },
        );
    croak "Unexistent interpolate function call: $func" unless exists $funcs{$func};
    my @res = $funcs{$func}->(@args);
    return join "\n", map {"${prefix}${_}${suffix}"} @res;
}

sub _interpolate_schema {
    my ($schema, $vars) = @_;
    return $schema unless $schema && ref $schema eq 'ARRAY';
    my @new_schema;
    for my $column (@$schema) {
        my $new_column = { %$column };
        $new_column->{name} = _interpolate($new_column->{name}, $vars);
        if ($new_column->{group}) {
            $new_column->{group} = _interpolate($new_column->{group}, $vars);
        }
        push @new_schema, $new_column;
    }
    return \@new_schema;
}


sub yt(@) {
    my $tasks = shift;
    my %O = @_;
    my $job;
    if (ref($tasks) eq 'HASH') {
        $job = $tasks;
        $tasks = $job->{tasks};
    }
    my $pack = $job ? $job->{pack} : (caller(0))[0];
    my @yt_cmds;
    for(my $step = 0; $step < @$tasks; $step++) {
        my ($type, $name, %P) = @{$tasks->[$step]};
        croak "Incorrect task type: '$type'" if $type !~ /^(map|reduce|map_reduce|format|create_table|move|remove|merge|sort|set)$/;

        my $name_str = Data::Dumper->new([$name])->Terse(1)->Indent(0)->Dump();

        my $yt_cmd = {type => $type};
        my @shell = ('yt', Yandex::YT::Tx::tx_yt_args());

        my $format = $P{format} || $Yandex::YT::Streaming::DEFAULT_FORMAT;
        my $format_flags = $P{format_flags} // $Yandex::YT::Streaming::DEFAULT_FORMAT_FLAGS->{$format} // '';
        
        my $src = delete($P{src});
        if ($type !~ /^(create_table|remove|set)$/) {
            croak "Source for \u$type $name_str is not defined" if !defined $src;
            $src = _interpolate($src, $O{vars});
        }

        my $dst = delete($P{dst});
        if ($type =~ m/^(map|reduce|map_reduce|move|merge|sort)$/) {
            $dst //= $src if $type =~ /^(merge|sort)$/;
            croak "Dest for \u$type $name_str is not defined" if !defined $dst;
            $dst = _interpolate($dst, $O{vars});
        }
        
        if ($type eq 'remove') {
            my $force = delete $P{force};
            croak "Excess params for remove: %P" if %P;
            push @shell, remove => ($force ? '-f' : ()), _interpolate($name, $O{vars});
        } elsif ($type eq 'format') {
            my $hdl = _get_hdl(type => $type, pack => $pack, name => $name);
            $yt_cmd->{format_hdl} = $hdl;
            push @shell, read => $src, '--format' => "<$format_flags>$format";
        } elsif ($type =~ /^(map|reduce|map_reduce)$/) {
            push @shell, ($type =~ s/_/-/gr);
            push @shell, '--format' => "<$format_flags>$format";

            local %JOB_ADD_FILES = %JOB_ADD_FILES;
            if (my $files = delete $P{files}) {
                for my $fn (keys %$files) {
                    $JOB_ADD_FILES{$fn} = _interpolate($files->{$fn}, $O{vars});
                }
            }
    
            push @shell, map {('--src' => $_)} grep {$_ ne ''} split /\s+/, $src;
            push @shell, map {('--dst' => $_)} grep {$_ ne ''} split /\s+/, $dst;

            my @subtasks = $type eq 'map_reduce' 
                ? ( ($name->[0] ? ['map' => $name->[0]] : ()), ['reduce' => $name->[1]] ) 
                : [$type => $name];
    
            for my $subtask (@subtasks) {
                my ($subtype, $subname) = @$subtask;
    
                my $hdl = _get_hdl(type => $subtype, pack => $pack, name => $subname);
    
                my ($tar, $cmd) = _yt_prog_tar(task => $hdl, vars => $O{vars}, INC => $O{INC}, format => $format);
                $cmd = "$hdl->{filter} | $cmd" if $hdl->{filter};
                if ($subtype eq 'map') {
                    if ($type eq 'map_reduce') {
                        push @shell, '--map-local-file' => $tar, "--mapper" => $cmd;
                    } else {
                        push @shell, '--local-file' => $tar, $cmd;
                    }
                } elsif ($subtype eq 'reduce') {
                    push @shell, map {('--reduce-by' => $_)} @{$hdl->{reduceby}};
                    if ($hdl->{sortby}) {
                        push @shell, map {('--sort-by' => $_)} @{$hdl->{reduceby}}, @{$hdl->{sortby}};
                    }
                    if ($type eq 'map_reduce') {
                        push @shell, '--reduce-local-file' => $tar, "--reducer" => $cmd;
                    } else {
                        push @shell, '--local-file' => $tar, $cmd;
                    }
                }
            }
        } elsif ($type eq 'merge') {
            # используем имя для указания режима объединения
            croak "Invalid merge mode. Should be unordered, ordered or sorted" if $name !~ m/^((un)?ordered|sorted)$/;
            push @shell, 'merge', '--mode' => $name;
            push @shell, map {('--src' => $_)} grep {$_ ne ''} split /\s+/, $src;
            push @shell, '--dst' => $dst;
        } elsif ($type eq 'sort') {
            # используем имя для указания колонок сортировки
            croak "Invalid sort columns" if ref($name) ne 'ARRAY' || !@$name;
            push @shell, 'sort', map {('--sort-by' => $_)} @$name;
            push @shell, map {('--src' => $_)} grep {$_ ne ''} split /\s+/, $src;
            push @shell, '--dst' => $dst;
        } elsif ($type eq 'set') {
            my $table_name = _interpolate($name, $O{vars});
            my $attr = delete($P{attr}) // croak "Undefined attribute name";
            my $value = delete($P{value}) // croak "Undefined attribute value";
            push @shell, 'set', "$table_name/\@$attr", '"' . _interpolate($value, $O{vars}) . '"';
        } elsif ($type eq 'create_table') {
            my $schema = _interpolate_schema(delete $P{schema}, $O{vars});
            if (my $validation_error = Yandex::YT::Table::Schema::validate($schema)) {
                croak "Invalid table schema after interpolating: $validation_error";
            }
            my $schema_string = Yandex::YT::Table::Schema::get_attribute_string($schema);
            my $optimize_for = delete $P{optimize_for};
            croak "Invalid optimize_for value. Should be: lookup, scan" if defined $optimize_for && $optimize_for !~ m/^lookup|scan$/;
            # по умолчанию используем новый поколоночный формат хранения
            $optimize_for //= 'scan';
            push @shell, 'create', '-r', 'table' => _interpolate($name, $O{vars}), '--attributes' => "{ optimize_for = $optimize_for ; schema = $schema_string }";
        } elsif ($type eq 'move') {
            # имя не используем
            croak "Excess params for move: %P" if %P;
            push @shell, 'move', $src, $dst;
        }

        if ($P{spec}) {
            croak "Task type '$type' doesn't support yt specifications" if $type !~ /^(map|reduce|map_reduce|merge|sort|set)$/;
            push @shell, '--spec' => "{$P{spec}}"; 
        }
        for my $job_type (qw/mapper reducer/) {
            next unless $P{"${job_type}_memory_limit"};
            croak "Task type '$type' doesn't support yt ${job_type}_memory_limit parameter" if $type ne 'map_reduce';
            push @shell, "--${job_type}-memory-limit" => $P{"${job_type}_memory_limit"};
        }
        if ($P{memory_limit}) {
            croak "Task type '$type' doesn't support yt memory_limit parameter" if $type !~ /^(map|reduce)$/;
            push @shell, '--memory-limit' => $P{memory_limit};
        }

        $yt_cmd->{shell} = \@shell;
        push @yt_cmds, $yt_cmd;
    }
    for(my $step = 0; $step < @yt_cmds; $step++) {
        if (defined $O{step} 
            && ( !ref($O{step}) && $O{step} != $step
                 || ref($O{step}) && !(grep {$_ == $step} @{$O{step}})
                 )
        ) {
            print STDERR "### step $step - skip\n";
            next;
        }
        print STDERR "### step $step:\n";
        my $yt_cmd = $yt_cmds[$step];
        my @shell = @{$yt_cmd->{shell}};
        print STDERR join(" ", map {(my $r = $_) =~ s/(["\\])/\\$1/g; /(?:[ "\\(){}<>]|(?!\A)--)/ ? qq!"$r"! : $_} @shell)."\n";
        if (!$DEBUG) {
            if ($yt_cmd->{type} eq 'format' and $yt_cmd->{format_hdl}) {
                _read_and_format(\@shell, $yt_cmd->{format_hdl}, vars => $O{vars});
            } else {
                system(@shell) and croak "$!";
            }
        }
    }
}

sub _get_hdl {
    my %O = @_;
    my $pack = $O{pack} || (caller(1))[0];;
    my $fullname = $O{name} =~ /::/             ? $O{name} 
        : $_reg{$O{type}}{$pack.'::'.$O{name}}  ? $pack.'::'.$O{name}
        : __PACKAGE__.'::'.$O{name};
    my $h = $_reg{$O{type}}{$fullname};
    croak "$O{type} $O{name} is not defined" if !$h;
    return $h;
}

sub do_job {
    my %O = @_;
    my $job = _get_hdl(type => 'job', %O);
    $O{vars} ||= {};
    $O{INC} ||= {};
    _validate_job_vars($job->{vars_spec}, $O{vars});
    yt($job, step => $O{step}, vars => $O{vars}, INC => $O{INC});
}

sub _validate_job_vars {
    my ($vars_spec, $vars) = @_;
    for my $vname (sort keys %$vars_spec) {
        my $spec = $vars_spec->{$vname};
        if (ref $spec ne 'HASH') {
            $spec = {check => $spec};
        }
        if (!defined $vars->{$vname}) {
            if ($spec->{default} && ref $spec->{default} eq 'CODE') {
                $vars->{$vname} = $spec->{default}->();
            } elsif (defined $spec->{default}) {
                $vars->{$vname} = $spec->{default};
            }
            croak "Variable $vname is not defined" if !defined $vars->{$vname};
        }
        if (ref($spec->{check}) eq 'Regexp') {
            croak "Incorrect value of variable $vname: '$vars->{$vname}'" if $vars->{$vname} !~ $spec->{check};
        } else {
            croak "Incorrect specification of variable $vname";
        }
    }
}

sub do_map {
    my (%O) = @_;
    my $map = _get_hdl(type => 'map', %O);
    my $s = Yandex::YT::Streaming->new(
        format => $O{format},
        );
    $map->{func}->($s, $O{vars});
    $s->finish();
}

sub do_reduce {
    my (%O) = @_;
    my $reduce = _get_hdl(type => 'reduce', %O);
    my $s = Yandex::YT::Streaming->new(
        reduceby => $reduce->{reduceby},
        format => $O{format}
        );
    $reduce->{func}->($s, $O{vars});
    $s->finish();
}

sub _read_and_format {
    my ($shell, $hdl, %O) = @_;
    $hdl ||= _get_hdl(type => 'format', name => 'tsv');
    open(my $fh, "-|", @$shell) || croak "$!";
    my $s = Yandex::YT::Streaming->new(in_fh => $fh, format => $hdl->{format});
    $hdl->{func}->($s, $O{vars});
    $s->finish();
    close($fh) || croak "$!";
}

{
    my $tmpdir;
    my @mods;
    my @tars;
    sub _yt_prog_tar {
        my (%O) = @_;
        if (!$tmpdir) {
            $tmpdir = tempdir(
                CLEANUP => 1, 
                SUFFIX => '.yt',
                );
            my %files = (
                %JOB_ADD_FILES,
                ($O{INC} ? %{$O{INC}} : ()),
                );

            my $PATHS_RE = join '|', map {"\Q$_\E"} @INC;
            for my $mod (grep { $INC{$_} !~ m#^/usr(/local)?/(share|lib)# || m/$PACK_FILTER_RE/ } keys %INC) {
                $files{$mod} = $INC{$mod};
                if ($mod =~ s/\.pm//) {
                    $mod =~ s/\//::/g;
                    push @mods, $mod;
                }
            }
            
            my @so = grep {/$PACK_FILTER_RE/}
                    grep {defined && /\.so(?:\.\d+)*$/} 
                    map {(split /\s+/)[5]} 
                    split /\n/, _slurp("/proc/$$/maps");
            for my $so (@so) {
                if ($so =~ /^(?:$PATHS_RE)\/*(.*)/) {
                    $files{$1} = $so;
                } else {
                    $files{$so=~s/.*\///r} = $so;
                }
            }

            for my $rel (keys %files) {
                mkpath "$tmpdir/".dirname($rel);
                _slurp_write("$tmpdir/$rel", _slurp($files{$rel}));
            }
        }

        my $worker_cont = $O{prog};
        if (!defined $worker_cont) {
            my $vars_str = Data::Dumper->new([$O{vars}])->Terse(1)->Indent(0)->Dump();
            $worker_cont = "
use FindBin qw/\$Bin/;
use lib \$Bin;
".join("\n", map {"use $_;"} @mods)."
Yandex::YT::do_$O{task}->{type}(pack => '$O{task}->{pack}', name => '$O{task}->{name}', format => '$O{format}', vars => $vars_str);
";
        }
        my $task_name = $O{task}->{name} ? $O{task}->{name} : 'yt_work';
        _slurp_write("$tmpdir/$task_name.pl", $worker_cont);

        my $tar_name = File::Temp->new( 
            CLEANUP => 1, 
            UNLINK => 0,
            SUFFIX => ".yt.tar.gz",
            );
        push @tars, $tar_name;
        system("cd $tmpdir; tar czf $tar_name .") and print "tar failed: $!";

        return ($tar_name->filename, "tar xzf ".basename($tar_name->filename)." -m --strip-components 1 && LD_LIBRARY_PATH=. perl $task_name.pl");
    }
}

sub _slurp {
    open(my $fh, "<", $_[0]) || die "Can't open $_[0]: $!";
    local $/ = undef;
    return scalar <$fh>;
}
sub _slurp_write {
    open(my $fh, ">", $_[0]) || die "Can't open $_[0]: $!";
    print $fh $_[1] or die "Can't write: $!";
}


####################################################
# Default formatters
####################################################

formatter 'tsv',
    sub {
        my ($s, $vars) = @_;
        my $rec = $s->get();
        return if !$rec;
        my @fields;
        if ($vars
            && defined $vars->{fields}
            && (!ref $vars->{fields} || ref $vars->{fields} eq 'ARRAY')
        ) {
            @fields = ref $vars->{fields} ? @{ $vars->{fields} } : $vars->{fields};
        } else {
            @fields = map {
                $_->[0]
            } sort {
                $a->[1] <=> $b->[1]
                || $a->[0] cmp $b->[0]
            } map {
                my $o = /time|date/ ? 0 
                    : /uid/ ? 1
                    : /id/ ? 2
                    : /\d+/ ? 11
                    : 10;
                [$_, $o]
            } keys %$rec;
        }
        print join("\t", @fields)."\n";
        do {
            print join("\t", map {defined $rec->{$_} ? $rec->{$_} : ''} @fields)."\n";
        } while $rec = $s->get();
};

1;
