package BM::File;

use utf8;
use open ':utf8';

use base qw(ObjLib::ProjPart BM::MCached BM::PhraseLongText);

use Utils::Sys qw();

no warnings 'utf8'; # из-за варнингов ловим segfault в bmapi, глушим варнинги до решения DYNSMART-863

#    wc_l                           returns the number of lines in the file

# Read the whole file named by $self->{name} and return it as one string.
#
# Returns: the file contents (an empty string for an empty file).
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub text {
    my ($self) = @_;
    # Three-argument open: the name is used literally, never parsed for
    # mode characters or trimmed of whitespace.
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    # Slurp mode is localized so $/ is restored on every exit path; a plain
    # assignment would leave it undefined for the whole program after a die.
    local $/ = undef;
    my $res = <$fh>;
    close($fh);
    return $res;
}

# Tell whether the file has more than one line.
#
# Looks only at the first two lines (via head_lines) so large files are not
# read in full.
#
# Returns: 1 when there is any content after the first newline, 0 otherwise.
sub is_multiline {
    my ($self) = @_;
    my $head = $self->head_lines(2);
    # A newline followed by at least one more character (including another
    # newline) means a second line exists.  Counting split() fields would
    # miss empty lines, because split drops trailing empty fields.
    return ($head =~ /\n./s) ? 1 : 0;
}

# Read the whole file named by $self->{name} and return a reference to the
# string holding its contents, so a large text is not copied on return.
#
# Returns: a scalar reference to the file contents.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub reftext {
    my ($self) = @_;
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    # Localized slurp mode: $/ is restored even if reading dies.
    local $/ = undef;
    my $res = <$fh>;
    close($fh);
    return \$res;
}

# Return the first lines of the file, reading no more than a fixed amount
# of data from its start.
#
# Parameters:
#   $lines     - maximum number of lines to return (default 1000)
#   $bytelimit - length passed to read() (default 1000000)
#
# Returns: the selected lines joined with "\n", without a trailing newline.
#          The last line may be truncated if the read limit cuts through it.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub head_lines_bytelimit {
    my ($self, $lines, $bytelimit) = @_;
    $lines     ||= 1000;
    $bytelimit ||= 1000000;
    my $text = "";
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    read($fh, $text, $bytelimit);
    close($fh);
    my @res = split /\n/, $text;
    @res = @res[0 .. $lines - 1] if $lines < scalar(@res);
    return join("\n", @res);
}

# Return the last lines of the file, reading no more than $bytelimit bytes
# from its end.
#
# The data is read as raw bytes (binmode + sysread), so the result is not
# decoded, and the first returned line may be truncated when the byte window
# starts in the middle of a line.
#
# Parameters:
#   $lines     - maximum number of lines to return (default 1000)
#   $bytelimit - maximum number of bytes to read (default 1000000)
#
# Returns: the selected lines joined with "\n"; undef if stat() fails.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub tail_lines_bytelimit {
    my ($self, $lines, $bytelimit) = @_;
    $lines     ||= 1000;
    $bytelimit ||= 1000000;
    my $text = "";
    # A lexical handle is closed automatically on the early return below.
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    binmode($fh);

    # "return undef" is kept on purpose: callers get the same one-element
    # list in list context as before.
    my @stat = stat($fh) or return undef;
    my $size = $stat[7];
    my $pos  = $size - $bytelimit;
    $pos = 0 if $pos < 0;

    sysseek($fh, $pos, 0);
    sysread($fh, $text, $bytelimit);
    close($fh);

    my @res = split /\n/, $text;
    @res = @res[-$lines .. -1] if $lines < scalar(@res);
    return join("\n", @res);
}


# Return the first lines of the file, newlines included.
#
# Parameters:
#   $lines - maximum number of lines to return (default 1000)
#   $mode  - pass 'binmode' to read the file without I/O layers
#
# Returns: the lines concatenated into one string.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub head_lines {
    my ($self, $lines, $mode) = @_;
    $lines ||= 1000;
    $mode  ||= '';
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");

    binmode($fh) if $mode eq 'binmode';

    my $res = '';
    # The counter is tested first, so no line beyond the limit is read.
    while ($lines-- && defined(my $line = <$fh>)) {
        $res .= $line;
    }
    close($fh);
    return $res;
}

# Return the last lines of the file without reading it from the start.
#
# The file is read backwards in block-aligned chunks until more than
# $num_lines line separators (CRLF, CR or LF) have been seen or the start of
# the file is reached.
#
# Parameters:
#   $num_lines - maximum number of lines to return (default 1000)
#   $mode      - pass 'binmode' to get raw bytes; otherwise every returned
#                line that is valid UTF-8 is decoded to characters
#
# Returns: the selected lines joined with "\n"; undef if stat() fails.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub tail_lines {
    my ($self, $num_lines, $mode) = @_;
    $num_lines ||= 1000;
    $mode      ||= '';
    my $rec_sep = qr/\015\012|\015|\012/;

    # Always read raw bytes: sysread() is not allowed on :utf8 handles in
    # recent Perls, and byte offsets only make sense on undecoded data.
    # Decoding, when wanted, happens per line at the end.
    open(my $fh, '<:raw', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    # "return undef" is kept so list-context callers see the same value.
    my @stat = stat($fh) or return undef;
    my ($size, $blksize) = @stat[7, 11];
    $blksize ||= 8192;

    my $file  = '';
    my $start = $size;   # byte offset at which the collected text begins
    my $lines = 0;
    # Adapt to long lines by reading more at once.  Chunk sizes are
    # multiples of the block size so reads stay on block boundaries.
    my $exp = 1;
    # First chunk: from the last block boundary to the end of the file.
    my $pos = $size - (($size % $blksize) || $blksize);
    while ($start > 0) {
        $pos = 0 if $pos < 0;
        # Read exactly the gap up to the data already collected, so a chunk
        # clamped to offset 0 never overlaps what was read before.
        my $want = $start - $pos;
        sysseek($fh, $pos, 0);
        my $got = sysread($fh, my $buf, $want);
        last unless $got && $got == $want;   # read error or file changed

        my $new_lines = () = $buf =~ m/$rec_sep/g;
        # A CRLF pair split across the chunk boundary is one separator.
        $new_lines-- if $buf =~ /\015\z/ && $file =~ /\A\012/;

        $file   = $buf . $file;
        $start  = $pos;
        $lines += $new_lines;
        if ($new_lines > 0) {
            $exp-- if $exp > 0;
        }
        else {
            $exp++ if $exp < 10;
        }
        last if $lines > $num_lines;
        $pos -= $blksize * (2 ** $exp);
    }
    close($fh);

    my @file = split /$rec_sep/, $file;
    # Keep only the last $num_lines entries; this also discards a possibly
    # partial first line.
    splice(@file, 0, @file - $num_lines) if @file > $num_lines;
    if ($mode ne 'binmode') {
        # utf8::decode leaves a line untouched when it is not valid UTF-8.
        utf8::decode($_) for @file;
    }
    return join("\n", @file);
}

# Return the lines of the file as a list, each without its line ending.
#
# Dies: "Cant open file '<name>' - <OS error>" when the file cannot be
#       opened.
sub lines {
    my ($self) = @_;
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    chomp(my @arr = <$fh>);
    close($fh);
    return @arr;
}

# Return the file as a list of array references, one per line, each holding
# the line's tab-separated fields (trailing empty fields are dropped by
# split).
#
# Dies: "Cant open file '<name>' - <OS error>" when the file cannot be
#       opened.
sub tabs {
    my ($self) = @_;
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    my @res;
    while (defined(my $line = <$fh>)) {
        chomp $line;
        push @res, [ split("\t", $line) ];
    }
    close($fh);
    return @res;
}

# Return the file as a list of hash references.
#
# The first line is the format line: its tab-separated fields are the keys.
# Every following line becomes one hash mapping those keys to the line's
# tab-separated values (missing values are undef, surplus values are
# dropped).
#
# Dies: "Cant open file '<name>' - <OS error>" when the file cannot be
#       opened.
sub ftabs {
    my ($self) = @_;
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    chomp(my @arr = <$fh>);
    close($fh);

    my $header = shift @arr;
    my @frm = defined($header) ? split("\t", $header) : ();
    my @res = map {
        # Start from a truly empty hash; initializing it from {} would add
        # a stray "HASH(0x...)" key to every row.
        my %h;
        @h{@frm} = split("\t", $_);
        \%h
    } @arr;
    return @res;
}

# Build a phrase_list from the lines of the file.
#
# Passes an array reference with the result of $self->lines to the
# project's phrase_list method and returns whatever that call returns.
sub phrase_list {
    my $self = shift;
    my $proj = $self->proj;
    my @file_lines = $self->lines;
    return $proj->phrase_list(\@file_lines);
}

# Overwrite the file named by $self->{name} with $data.
#
# Parameters:
#   $data - the text to write
#
# Returns: 1 on success.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened, "Cant write file '<name>' - <OS error>" when the data
#          cannot be flushed to disk.
sub save {
    my ($self, $data) = @_;
    open(my $fh, '>', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    print {$fh} $data;
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($fh)
        or die("Cant write file '".$self->{name}."' - $!");
    return 1;
}

# Return the number of lines in the file.
#
# The file is read in raw mode, so no decoding is performed.  A final line
# without a trailing newline is counted as a line.
#
# Dies: "Cant open file '<name>' - <OS error>" when the file cannot be
#       opened.
sub wc_l {
    my ($self) = @_;
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    binmode($fh, ":raw");
    my $cc = 0;
    while (defined(my $line = <$fh>)) {
        $cc++;
    }
    close($fh);
    return $cc;
}

# Return the total length of the file's contents.
#
# Sums length() over every line as delivered by the handle's I/O layers.
# NOTE(review): with the file-level `use open ':utf8'` this presumably
# counts characters rather than bytes -- confirm against callers.
#
# Dies: "Cant open file '<name>' - <OS error>" when the file cannot be
#       opened.
sub wc_c {
    my ($self) = @_;
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    my $cc = 0;
    while (defined(my $line = <$fh>)) {
        $cc += length($line);
    }
    close($fh);
    return $cc;
}

# Call $sb once for every line of the file.
#
# Parameters:
#   $sb - code reference; receives the line (line ending included) as its
#         only argument
#
# Returns: the result of closing the file.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub eachline {
    my ($self, $sb) = @_;
    # A lexical handle keeps the loop intact when the callback itself reads
    # other files through this module.
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    while (defined(my $line = <$fh>)) {
        $sb->($line);
    }
    return close($fh);
}

# Read lines in batches and hand every batch to a callback.
#
# Important: the input is primarily a command, not a file name.  When
# $self->{cmd} is set it is passed to two-argument open as is, so it may be
# a pipe specification such as "zcat file |".  Only without {cmd} is the
# plain file $self->{name} read.
# NOTE(review): {cmd} is interpreted by open/the shell -- it must never be
# built from untrusted input.
#
# Parameters:
#   $pk - batch size (default 1000)
#   $sb - code reference; receives a reference to an array of up to $pk
#         lines (line endings included).  The array is reused and emptied
#         after each call, so the callback must copy what it wants to keep.
#
# Lines for which utf8::valid() fails are reported on STDERR and skipped.
#
# Returns: the number of lines passed to the callback.
# Dies:    when the command or file cannot be opened.
sub each_n_cmd_lines {
    my ($self, $pk, $sb) = @_;
    $pk ||= 1000;

    # Lexical handle: the callback may use this module on other files.
    my $fh;
    if ($self->{cmd}) {
        # Two-argument open on purpose: {cmd} carries its own mode/pipe.
        open($fh, $self->{cmd})
            or die("Cant open cmd '".$self->{cmd}."' - $!");
    }
    else {
        open($fh, '<', $self->{name})
            or die("Cant open file '".$self->{name}."' - $!");
    }

    my $cc  = 0;
    my @arr = ();
    while (defined(my $line = <$fh>)) {
        unless (utf8::valid($line)) { # skip lines with invalid utf8
            print STDERR "BAD UTF8:[$line]\n";
            next;
        }
        # Flush a full batch just before the first line of the next one;
        # $cc == 0 is excluded so the initial empty array is never sent.
        if ($cc && !($cc % $pk)) {
            $sb->(\@arr);
            @arr = ();
        }
        push(@arr, $line);
        $cc++;
    }
    close($fh);
    $sb->(\@arr) if @arr;
    return $cc;
}

# Call $sb once for every line of a tab-separated file.
#
# Parameters:
#   $sb - code reference; receives the line's tab-separated fields as its
#         argument list (the trailing newline is removed first)
#
# Returns: the result of closing the file.
# Dies:    "Cant open file '<name>' - <OS error>" when the file cannot be
#          opened.
sub ts_eachline {
    my ($self, $sb) = @_;
    # A lexical handle keeps the loop intact when the callback itself reads
    # other files through this module.
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    while (defined(my $line = <$fh>)) {
        $line =~ s/\n$//;
        my @fields = split(/\t/, $line);
        $sb->(@fields);
    }
    return close($fh);
}

# Stream this file through a shell pipeline, optionally transform every
# line with a callback, and write the result to one or more output files.
#
# Named parameters (%prm), as read by the code below:
#   cmd_read          - command used to read the input (default 'cat')
#   cmd               - optional shell filter appended to the read pipeline
#   postprocess_uniqc - when true, two sed filters are appended that strip
#                       leading whitespace and turn the first run of spaces
#                       into a tab (presumably to parse `uniq -c` output --
#                       TODO confirm)
#   func              - optional code reference called for every input line
#   get_fieldnames_from_filename
#                     - when true, field names are parsed from the input
#                       file name and func is called as
#                       func($proj, \%row, \@field_names) instead of
#                       func($proj, $line)
#   file_out          - output path, or a hash reference key => output path
#   cmd_postprocess   - optional shell filter applied to every temporary
#                       output before it is moved to its final path
#
# The value produced for a line (func's return value, or the line itself
# when no func is given) may be a string, an array reference of strings, or
# a hash reference mapping file_out keys to either of those.  Non-hash
# values go to the output registered under the key "".
#
# Output is first written to temporary files obtained from
# $proj->get_tempfile and moved into place with `mv` at the end.
# Dies when $self->{name} is undefined or when a pipe/file cannot be opened
# or closed.
sub do_cmd_with_file {
    my ($self, %prm) = @_;

    my $proj = $self->proj;

    #my $file_in = $prm{file_in} // die "ERROR: void file_in";
    my $file_in = $self->{name} // die "ERROR: void file_in";

    # $prm{file_out},
    my $cmd_read = $prm{cmd_read} // 'cat';
    my $cmd = $prm{cmd};
    my $cmd_postprocess = $prm{cmd_postprocess};
    my $func_ref = $prm{func};
    $proj->log("do_cmd_with_file ...");

    # TODO check if @$file_in exist
    # The name may be an array reference of several input files; they are
    # passed to the read command separated by spaces.
    $file_in = join(" ", @$file_in)   if ref($file_in) eq 'ARRAY';

    my $add_cmd = '';
    $add_cmd = ' sed "s/^\s*//" | sed "s/ \+/\t/" '   if $prm{postprocess_uniqc};
    # Every non-blank stage gets " |" appended, so the resulting string ends
    # with a pipe and two-argument open below runs it as a command to read
    # from.
    my $in = join(" ", map {"$_ |"} grep{ defined $_ and /[^ ]/ } ( "$cmd_read $file_in ", $cmd, $add_cmd ));
    $proj->log("in:  $in");
    open my $fh_in,  $in    or die "ERROR: open failed ($in): $!";

    # A plain file_out value is registered under the key "".
    my %files_out2temp;
    my %files_out = ref($prm{file_out}) eq 'HASH'  ?  %{$prm{file_out}}  :  ("" => $prm{file_out});
    my %filenames_temp;
    my %files_temp;
    my $h_out = {};
    # Open one temporary output file per key; its name is the base name of
    # the final output path plus ".tmp".
    for my $key (keys %files_out) {
        my $file_out = $files_out{$key};
        my $file_temp_name = (($file_out =~ m/([^\/]+)$/)[0] // "") . ".tmp";
        my $file_temp = $proj->get_tempfile($file_temp_name);
        $filenames_temp{$key} = $file_temp_name;
        my $out = "> $file_temp";
        $proj->log("out: $out");
        open my $fh_out, $out   or die "ERROR: open failed ($out): $!";
        $files_out2temp{$file_out} = $file_temp;
        $files_temp{$key} = $file_temp;
        $h_out->{$key} = $fh_out;
    }

    # Split a path of the form path/prfx_flds.sfx into its parts; the last
    # returned element is a reference to the list obtained by splitting
    # "flds" on '-'.  Being a named sub, it is defined at package level even
    # though it is written inside this sub.
    sub _parse_filename {
        my ($file_in) = @_;
        my ($path, $file_name, $prfx, $sfx, $flds);
        ($path, $file_name) = ($file_in =~ m!^(.*?)([^\/]+)$! );
        ($flds, $sfx) = ($file_name =~ m!^([^\.]+)(.*?)$! );       # path/prfx_flds.sfx
        ($prfx, $flds) = ($flds =~ m!^(.*?)([^_]+)$! );       # path/prfx_flds.sfx
        my @fld = split /-/, $flds;
        #$proj->dd([ $file_in,   $path, $file_name, $prfx, $sfx, \@fld ] );
        return ($path, $file_name, $prfx, $sfx, \@fld);
    }
    my @fld_names = $prm{get_fieldnames_from_filename}  ?  @{ (_parse_filename($file_in))[4] }  :  ();       # TODO: handle the case where $file_in was built by join(" ", @$file_in) (ref($file_in) eq 'ARRAY')
    #$proj->dd( \@fld_names );

    while (<$fh_in>) {
        chomp;
        # Progress message every 500000 input lines.
        $proj->log("line number: $.")  if ($. % 500000 == 0);

        my $text = $_;
        my $out;
        if ($func_ref) {
            if ($prm{get_fieldnames_from_filename}) {
                # Map the tab-separated values onto the field names taken
                # from the file name.
                my %h;
                @h{ @fld_names } = split /\t/, $text;
                $out = $func_ref->($proj, \%h, \@fld_names);
            } else {
                $out = $func_ref->($proj, $text);
            }
        } else {
            $out = $text;
        }

        # Normalize the result to a hash key => text so every case is
        # written by the same loop.
        $out = {"" => $out}  if (ref($out) ne 'HASH');
        for my $key (keys %$out) {
            my $fh_out = $h_out->{$key};
            my $text = $out->{$key};
            if (defined $text) {
                # grep {$_} skips false values, i.e. empty strings and "0".
                print $fh_out "$_\n"   for  grep {$_}  (ref($text) eq 'ARRAY' ? @$text : $text);
            }
        }
    }

    # Closing the input pipe also reports a failed exit of the pipeline.
    close $fh_in    or die "ERROR: close failed ($in): $!";
    close $h_out->{$_}   or die "ERROR: close failed ($_: " . $files_out{$_} . "): $!"
        for (sort keys %$h_out);

    if ($cmd_postprocess) {
        # Run every temporary output through the postprocess filter into a
        # second temporary file, then move that file to the final path.
        my %files_temp_postprocess = map { $_ => $proj->get_tempfile($filenames_temp{$_})  } keys %files_out;
        for my $key (keys %files_out) {
            $proj->do_sys_cmd("cat $files_temp{$key} | $cmd_postprocess > $files_temp_postprocess{$key}");
            $proj->do_sys_cmd("mv $files_temp_postprocess{$key} $files_out{$key}");
        }
    } else {
        #$proj->do_sys_cmd("mv " . $files_out2temp{$_} . " $_")   for keys %files_out2temp;
        $proj->do_sys_cmd("mv " . $files_temp{$_} . " " . $files_out{$_})   for keys %files_out;
    }

    $proj->log("do_cmd_with_file done");
}

# Return the number of seconds elapsed since the file was last modified.
#
# Dies: "ERROR: file <name> doesn't exist" when the file is missing.
sub seconds_expired {
    my $self = shift;
    my $path = $self->{name};
    unless (-e $path) {
        die "ERROR: file ".$path." doesn't exist\n";
    }
    my $now   = time;
    my $mtime = (stat $path)[9];
    return $now - $mtime;
}

# Return the size of the file.
#
# Parameters:
#   $unit - optional: 'kb' (formatted with two decimals), 'mb' or 'gb'
#           (plain quotients); without a unit the size in bytes is returned
#
# Returns: 0 when the file does not exist.  For any other unit string the
#          sub falls through and yields a false value.
sub size {
    my $self = shift;
    my $unit = shift;
    my $path = $self->{name};
    if (!-e $path) {
        return 0;
    }
    my $bytes = (stat($path))[7];
    if (!$unit) {
        return $bytes;
    }
    if ($unit eq 'kb') {
        return sprintf("%.2f", $bytes / 1024);
    }
    if ($unit eq 'mb') {
        return $bytes / 1048576;
    }
    return $bytes / 1073741824 if $unit eq 'gb';
}

# Sort the lines of the file in place.
#
# The file is sorted with the external `sort` in the C locale into a
# temporary file, which is then moved over the original.
#
# Returns: $self.
sub sort_inplace {
    my ( $self ) = @_;

    my $proj      = $self->proj;
    my $file_temp = $proj->get_tempfile( "file_sorted", UNLINK => 1);

    # Single-quote both paths so names with spaces or shell metacharacters
    # reach sort/mv as one literal argument.
    my $shell_quote = sub {
        my ($path) = @_;
        $path =~ s/'/'\\''/g;
        return "'$path'";
    };
    my $src = $shell_quote->($self->{name});
    my $tmp = $shell_quote->("$file_temp");

    # The locale name is case-sensitive: "C", not "c".
    $proj->do_sys_cmd("LC_ALL=C sort $src > $tmp");
    $proj->do_sys_cmd("mv $tmp $src");

    return $self;
}

# Shuffle the lines of the file into random order, like unix `shuf`.
#
# Parameters:
#   $max_len - line-count limit (default 1000000): a file with $max_len or
#              more lines is left untouched.  Pass 0 explicitly to disable
#              the limit.
#
# The file is shuffled with the external `shuf` into a temporary file,
# which is then moved over the original.
#
# Returns: $self.
sub shuf {
    my ( $self, $max_len ) = @_;
    $max_len //= 1000000;

    # Nothing to shuffle in a file with at most one line.
    return $self unless $self->is_multiline;

    my $len = $self->wc_l;
    return $self if $max_len && $len >= $max_len;

    my $proj      = $self->proj;
    my $file_temp = $proj->get_tempfile( "file_shuf", UNLINK => 1);

    # Single-quote both paths so names with spaces or shell metacharacters
    # reach shuf/mv as one literal argument.
    my $shell_quote = sub {
        my ($path) = @_;
        $path =~ s/'/'\\''/g;
        return "'$path'";
    };
    my $src = $shell_quote->($self->{name});
    my $tmp = $shell_quote->("$file_temp");

    $proj->do_sys_cmd("shuf $src > $tmp");
    $proj->do_sys_cmd("mv $tmp $src");

    return $self;
}

# Return $k random lines of the file without repetition (reservoir
# sampling: the file is read once and at most $k lines are kept in memory).
#
# If the file has fewer than $k lines, all lines are returned.
# Lines are returned without their trailing newline.
#
# Dies: "Cant open file '<name>' - <OS error>" when the file cannot be
#       opened.
sub random_lines {
    my ( $self, $k ) = @_;
    # Lexical state: the sample and the counter must not live in package
    # globals shared by every call.
    my @lines = ();
    open(my $fh, '<', $self->{name})
        or die("Cant open file '".$self->{name}."' - $!");
    my $i = 0;
    while (defined(my $line = <$fh>)) {
        chomp $line;
        if ($i < $k) {
            # Fill the reservoir with the first $k lines.
            push @lines, $line;
        }
        else {
            # Line number $i (0-based) replaces a random slot with
            # probability $k / ($i + 1).
            my $r = int(rand($i + 1));
            $lines[$r] = $line if $r < $k;
        }
        $i++;
    }
    close($fh);
    return @lines;
}

# Return whatever Utils::Sys::get_files_md5 yields for this file's name
# (presumably the hex MD5 digest of the file -- see Utils::Sys).
sub md5_hex {
    my ($self) = @_;
    my $path = $self->{name};
    return Utils::Sys::get_files_md5($path);
}

1;
