package Lang::Guess;

=head1 DESCRIPTION

Угадывание языка по тексту

=cut

use Direct::Modern;

use Settings;

use JSON qw/encode_json decode_json/;
use Yandex::HashUtils;
use Yandex::HTTP qw/http_fetch/;
use Yandex::SendMail qw/send_alert/;
use Yandex::Trace;
use Yandex::TVM2;

use Direct::Model::BannerImageAd::Constants;
use Direct::Model::BannerCpmAudio::Constants;
use LogTools qw/log_messages/;

use base qw/Exporter/;
our @EXPORT = qw/
    analyze_text_lang
    analyze_text_lang_with_context
/;

# Maps the language codes used by Queryrec (three letters, similar to
# ISO 639-2/B) to the codes used inside Direct (two letters, similar to
# ISO 639-1). Only the languages Direct works with are listed; each line
# below is one "queryrec-code direct-code" pair.
our %QUERYREC_TO_DIRECT_LANG = qw(
    eng en
    kaz kk
    rus ru
    bel be
    ukr uk
    tur tr
    ger de
    uzb uz
);

=head2 %known_strings

    Texts whose language is already known, so queryrec does not have to be
    asked about them. Each key is a complete text ("<title> <body>" built
    from the placeholder constants of a banner model); each value is the
    Direct language code to report for that text.

=cut

our %known_strings = (
    join(' ',
        $Direct::Model::BannerImageAd::Constants::TITLE_PLACEHOLDER,
        $Direct::Model::BannerImageAd::Constants::BODY_PLACEHOLDER,
    ) => 'en',
    join(' ',
        $Direct::Model::BannerCpmAudio::Constants::TITLE_PLACEHOLDER,
        $Direct::Model::BannerCpmAudio::Constants::BODY_PLACEHOLDER,
    ) => 'en',
);

=head2 $Lang::Guess::EXTERNAL_QUERYREC

    Code reference that is invoked to recognise the language of a text
    (call_external_queryrec calls it with the text as the only argument).

    By default it points to queryrec_sub_selector, which dispatches to the
    %QUERYREC_SUBS entry whose name is stored in $SELECT_SUB.

=cut

# Registry of the available queryrec implementations, keyed by name.
our %QUERYREC_SUBS = (java => \&java_queryrec);
# Name of the implementation to use. ||= keeps any true value that was
# assigned to the package variable before this line runs.
our $SELECT_SUB ||= 'java';

# ||= likewise keeps a code reference that was installed earlier
# (presumably so that callers or tests can substitute their own).
our $EXTERNAL_QUERYREC ||= \&queryrec_sub_selector;

=head2 queryrec_sub_selector

    Wrapper around the queryrec call: looks up the implementation named by
    $SELECT_SUB in %QUERYREC_SUBS, passes all of its own arguments to it and
    returns its result. The indirection exists so that the implementation
    can be replaced from outside.

=cut

sub queryrec_sub_selector {
    my $selected_impl = $QUERYREC_SUBS{$SELECT_SUB};
    return $selected_impl->(@_);
}

=head2 analyze_text_lang

    my $lang = analyze_text_lang(@text_parts);

    Analyzes a text and tries to determine its language. All defined
    arguments are joined with a single space and handed to
    analyze_text_lang_with_context without a client id.

    Returns whatever analyze_text_lang_with_context returns: a Direct
    language code (such as kk, uk, be, ru, en, de or tr) or undef when the
    language cannot be determined.

=cut

sub analyze_text_lang {
    my @defined_parts = grep { defined } @_;
    my $text = join ' ', @defined_parts;

    # obj_num records whether there was any text to analyze at all.
    my $has_text = $text ne '' ? 1 : 0;

    # The profile object is held in a lexical until the sub returns,
    # presumably so that the profiling span covers the call below.
    my $profile = Yandex::Trace::new_profile(
        'queryrec:analyze_text_lang',
        tags    => 'analyze_text_lang',
        obj_num => $has_text,
    );

    return analyze_text_lang_with_context(undef, $text);
}

=head2 analyze_text_lang_with_context

    my $lang = analyze_text_lang_with_context($client_id, @text_parts);

    Same as analyze_text_lang, except that a client id is accepted as the
    first argument. According to the original author's note, this lets the
    detection take the client's data into account, not just the text; in
    this module the id is only forwarded to the Java handle.

    The defined text parts are joined with a single space. Then:

      * an empty text yields undef;
      * a text listed in %known_strings yields the language stored there;
      * otherwise the 'enable_proxy_for_analyze_text_lang' property is read
        as a percentage: when it is greater than a random integer in 0..99
        the Java handle is asked (java_analyze_texts_lang) and its answer is
        translated through %QUERYREC_TO_DIRECT_LANG, else the Perl
        algorithm (analyze_text_lang_old) is used.

    Returns a Direct language code or undef. Exceptions thrown by
    java_analyze_texts_lang are not caught here.

=cut

sub analyze_text_lang_with_context {
    my $client_id = shift;
    my $text = join " ", grep {defined $_} @_;

    # "return undef" (rather than a bare return) is kept on purpose: callers
    # using this sub in list context keep getting exactly one element.
    if ($text eq '') {
        return undef;
    }

    if ($known_strings{$text}) {
        return $known_strings{$text};
    }

    state $enable_proxy_for_analyze_text_lang //= Property->new('enable_proxy_for_analyze_text_lang');

    # NOTE(review): the argument of get() is presumably a cache lifetime in
    # seconds - confirm against the Property class.
    if (($enable_proxy_for_analyze_text_lang->get(120) // 0) > int(rand(100))) {
        my $lang = java_analyze_texts_lang($client_id, $text);

        # The handle may answer with an empty array or a null element; do not
        # use undef as a hash key (that emits an "uninitialized value" warning).
        if (!defined $lang) {
            return undef;
        }

        return $QUERYREC_TO_DIRECT_LANG{$lang};
    }

    return analyze_text_lang_old($text);
}

=head2 analyze_text_lang_old($text)

    Perl implementation of the language detection algorithm.

    The text is tested against the letter sets from Settings, in this order;
    the first rule that matches decides:

      1. a Kazakh letter                      -> kk
      2. a letter native to Ukrainian         -> uk
      3. a letter native to Belarusian        -> be
      4. a letter shared by Ukrainian and Belarusian
                                              -> queryrec chooses among
                                                 be/uk/kk/ru, default be
      5. a Russian letter                     -> ru
      6. a letter unique to German            -> de
      7. a letter unique to Turkish           -> tr
      8. the text is listed in %known_strings -> the language stored there
      9. a Latin letter or a letter shared by Turkish and German
                                              -> queryrec chooses among
                                                 en/tr/de, default en

    Returns undef when none of the rules matches.

=cut

sub analyze_text_lang_old {
    my ($text) = @_;

    # A single Kazakh letter is enough to decide for kk
    return 'kk' if $text =~ /[$Settings::KAZ_LETTERS]/;

    # A single Ukrainian-only letter is enough to decide for uk
    return 'uk' if $text =~ /[$Settings::UKR_NATIVE_LETTERS]/;

    return 'be' if $text =~ /[$Settings::BEL_NATIVE_LETTERS]/;

    # Letters shared by Ukrainian and Belarusian do not decide by themselves
    if ($text =~ /[$Settings::COMMON_UKR_BEL_LETTERS]/) {
        return detect_language_ext($text, select_only => [qw/be uk kk ru/], default => 'be');
    }

    # Any Russian letter left at this point means ru
    return 'ru' if $text =~ /[$Settings::RUS_LETTERS]/;

    return 'de' if $text =~ /[$Settings::UNIQUE_DE_LETTERS]/;

    return 'tr' if $text =~ /[$Settings::UNIQUE_TR_LETTERS]/;

    my $known_lang = $known_strings{$text};
    return $known_lang if $known_lang;

    if ($text =~ /[${Settings::LAT_LETTERS}${Settings::GENERAL_TR_DE_LETTERS}]/) {
        return detect_language_ext($text, select_only => [qw/en tr de/], default => 'en');
    }

    return undef;
}

=head2 detect_language_ext($text, %O)

    my $lang = detect_language_ext('hello world', select_only => [qw/en tr de/], default => 'en');

    Asks queryrec (through call_external_queryrec) which languages $text may
    be written in and returns the most probable one as a Direct language code.

    Options:
        select_only - array ref of Direct language codes; when given, only
                      these languages are considered
        default     - value returned when queryrec gave no usable answer or
                      no candidate is left after filtering

    When several candidates have the same probability, the one whose code
    sorts first alphabetically is returned, so the answer does not depend on
    hash key order.

=cut

sub detect_language_ext {
    my ($text, %O) = @_;

    my $select_only = $O{select_only};
    my $default = $O{default};

    my $langs_probs = call_external_queryrec($text);
    unless (ref $langs_probs eq 'HASH') {
        return $default;
    }

    # Keep only the queryrec codes that have a Direct equivalent,
    # then rename the keys to the Direct codes.
    $langs_probs = hash_kmap { $QUERYREC_TO_DIRECT_LANG{$_} }
                    hash_kgrep { defined($QUERYREC_TO_DIRECT_LANG{$_}) }
                        $langs_probs;

    if ($select_only) {
        my %select_hash = map { $_ => 1 } @$select_only;
        $langs_probs = hash_kgrep { $select_hash{$_} } $langs_probs;
    }

    # Highest probability first. The secondary comparison by code makes the
    # order of equally probable languages deterministic: "keys" returns them
    # in an arbitrary order that may differ from one process to another.
    my @langs = sort {
        $langs_probs->{$b} <=> $langs_probs->{$a}
            || $a cmp $b
    } keys %$langs_probs;

    if (!@langs) {
        return $default;
    }
    return $langs[0];
}

=head2 call_external_queryrec($text)

    Calls the external queryrec service through $EXTERNAL_QUERYREC.

    Returns what the callback returned - expected to be a hash ref with the
    languages $text may be written in and their probabilities.

    The call is best-effort: when the callback dies, the error is written to
    the log, an alert is sent (the exception plus a dump of %ENV in which the
    values of Session_id and sessionid2 are masked) and undef is returned.

=cut

sub call_external_queryrec {
    my ($text) = @_;

    my $langs;

    # The block returns 1 on success, so a false result of eval reliably
    # means that an exception was thrown.
    my $success = eval {
        $langs = $EXTERNAL_QUERYREC->($text);
        return 1;
    };

    if (!$success) {
        # $@ is never undef after eval - when the error is lost it is an
        # empty string - so the fallback text has to be chosen with ||.
        my $err = $@ || 'unknown exception';
        log_messages('', "queryrec call failed: $err");

        my $env_data = join("\n", map {$_ . " => " . $ENV{$_}} sort keys %ENV);
        # Do not leak session cookies into the alert
        $env_data =~ s/(Session_id|sessionid2)=[^;\s]+/$1=SECRET/g;
        send_alert(
            join("\n------------\n"
                 , $err
                 , ''
                 , $env_data
            ),
        'Queryrec error');
    }

    return $langs;
}

=head2 java_queryrec($text)

    Calls the queryrec handle of the Java INTAPI.

    POSTs a JSON array with $text as its only element to
    $Settings::QUERYREC_JAVA_HTTP_URL and expects a JSON array whose first
    element is a hash; that hash is returned.

    Dies with "got malformed response: ..." when the decoded response has a
    different shape.

=cut

sub java_queryrec {
    my ($text) = @_;
    my $profile = Yandex::Trace::new_profile("queryrec:recognize_texts:do_http");

    my %fetch_opts = (
        timeout => 3,
        num_attempts => 2,
        ipv6_prefer => 1,
        headers => {
            'Content-Type' => 'application/json',
        },
    );
    my $request_body = encode_json([$text]);

    my $content = http_fetch('POST', $Settings::QUERYREC_JAVA_HTTP_URL, $request_body, %fetch_opts);
    my $response = decode_json($content);

    # Expected shape: [ { ... } ]
    my $is_well_formed = ref($response) eq 'ARRAY' && ref($response->[0]) eq 'HASH';
    die "got malformed response: $content\n" unless $is_well_formed;

    return $response->[0];
}

=head2 java_analyze_texts_lang($client_id, $text)

    Calls the /queryrec/analyze_texts_lang handle of the Java INTAPI.

    POSTs a JSON array with $text as its only element to the URL built by
    make_url_for_analyze_texts_lang($client_id). The request carries a TVM2
    service ticket for $Settings::TVM2_APP_ID{intapi} and an X-Yandex-Trace
    header.

    Returns the first element of the JSON array sent back by the handle
    (undef when the array is empty).

    Dies when no ticket can be obtained or when the decoded response is not
    an array.

=cut

sub java_analyze_texts_lang {
    my ($client_id, $text) = @_;

    my $profile = Yandex::Trace::new_profile("queryrec:analyze_texts_lang:do_http");

    my $ticket = eval { Yandex::TVM2::get_ticket($Settings::TVM2_APP_ID{intapi}) };
    die "Cannot get ticket for $Settings::TVM2_APP_ID{intapi}: $@" unless $ticket;

    # The current span id is sent in both the first and the third position;
    # undefined parts are replaced with 0.
    my $span_id = Yandex::Trace::current_span_id();
    my @trace_fields = ($span_id, Yandex::Trace::generate_traceid(), $span_id, 0);
    my $trace_header = join ',', map { $_ // 0 } @trace_fields;

    my $request_body = encode_json([$text]);
    my %fetch_opts = (
        timeout => 3,
        num_attempts => 2,
        ipv6_prefer => 1,
        headers => {
            'Content-Type' => 'application/json',
            'X-Ya-Service-Ticket' => $ticket,
            'X-Yandex-Trace' => $trace_header,
        },
    );

    my $url = make_url_for_analyze_texts_lang($client_id);
    my $content = http_fetch('POST', $url, $request_body, %fetch_opts);
    my $response = decode_json($content);

    die "got malformed response: $content\n" if ref($response) ne 'ARRAY';

    return $response->[0];
}

=head2 make_url_for_analyze_texts_lang($client_id)

    Builds the URL of the analyze_texts_lang handle from
    $Settings::ANALYZE_TEXTS_LANG_JAVA_HTTP_URL. A true $client_id is added
    as the client_id query parameter; otherwise no parameters are added.

=cut

sub make_url_for_analyze_texts_lang {
    my ($client_id) = @_;

    my %query_params = $client_id ? (client_id => $client_id) : ();

    return Yandex::HTTP::make_url($Settings::ANALYZE_TEXTS_LANG_JAVA_HTTP_URL, \%query_params);
}

1;
