package BM::LangRecognize;

use std;

no warnings 'utf8';

use utf8;
use base qw(Exporter);

use File::Basename qw(dirname);

our @EXPORT = (
    'recognize_text_lang',
    'recognize_utf8file_lang',
);

my $lang_recognizer;

# словари берутся из пакета libyxqueryrec
# все использование обернуто в eval на случай, если пакета нет (в таком случае просто продолжаем работать)
sub _init_lang_recognizer {
    require YxQueryrec;  # напрочь ломает open2!
    my $path = dirname($INC{'YxQueryrec.pm'})."/YxQueryrec";
    $lang_recognizer = YxQueryrec::TQueryRecognizer->new("$path/queryrec.dict", "$path/queryrec.weights");
}

sub recognize_text_lang {   
    my ($text, $params) = @_;
    my $weights = {};
    eval {
        $weights = _lang_recognize_weights($text, $params);
    };
    return _process_recognize_weights($weights);
}

sub recognize_utf8file_lang {
    my ($file, $params) = @_;
    my $weights = {};
    eval {
        local $PerlIO::encoding::fallback = Encode::FB_CROAK;
        open my $fh, '<:encoding(utf8)', $file or die $!;
        my $buffer;
        while (read $fh, $buffer, 1000) {
            my $buffer_weights = _lang_recognize_weights($buffer, $params);
            $weights->{$_} += $buffer_weights->{$_} foreach keys %$buffer_weights;
        }
        close $fh;
    };
    return _process_recognize_weights($weights);
}

sub _lang_recognize_weights {
    my ($text, $params) = @_;

    $params //= {};
    $params->{use_regex} //= 1;

    if ($params->{use_regex}) {
        my %letters_hash = map { $_ => 1 } split //, $text;
        my $letters = join('', keys %letters_hash);
        my $KAZ_LETTERS = qq!\x{4B0}\x{492}\x{493}\x{4E8}\x{4A2}\x{49A}\x{4BA}\x{4AE}\x{4B1}\x{4E9}\x{4A3}\x{49B}\x{4BB}\x{4AF}!
            .qq!\x{496}\x{4B2}\x{4B3}\x{497}\x{4D8}\x{4D9}!;
        return {'kk' => 1} if $letters =~ /[$KAZ_LETTERS]/;
    
        my $UKR_NATIVE_LETTERS = qq!\x{404}\x{406}\x{407}\x{454}\x{456}\x{457}\x{490}\x{491}!;
        return {'uk' => 1} if $letters =~ /[$UKR_NATIVE_LETTERS]/;
    
        my $RUS_LETTERS = qq!абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ!;
        return {'ru' => 1} if $letters =~ /[$RUS_LETTERS]/;
    
        my $TR_LETTERS = qq!\x{11F}\x{11E}\x{FC}\x{DC}\x{15F}\x{15E}\x{F6}\x{D6}\x{E7}\x{C7}\x{131}\x{130}\x{307}!;
        return {'tr' => 1} if $letters =~ /[$TR_LETTERS]/;
    
        my $LAT_LETTERS = qq!abcdefghijklmonpqrstuvwxyzABCDEFGHIJKLMONPQRSTUVWXYZ!;
        return {'unknown' => 1} if $letters !~ /[$LAT_LETTERS]/;
    }
    _init_lang_recognizer if !$lang_recognizer;
    my $res = $lang_recognizer->recognize($text);  # хэш { lang => weight }

    return {'unknown' => 1} unless %$res;

    my %rewrite = ('rus' => 'ru', 'eng' => 'en', 'tur' => 'tr', 'ukr' => 'uk');

    $res = { map { $rewrite{$_}//$_ => $res->{$_} } keys %$res };

    return $res;

}

sub _process_recognize_weights {
    my $res = shift;

    my @lang = sort { $res->{$b} <=> $res->{$a} || $a cmp $b } keys %$res; 
    my $lang = $lang[0] or return 'unknown';

    return $lang;

}

1;
