import tikaite from "common.dsl";
import deobfuscator from "common.dsl";
import join_attachments from "common.dsl";
import fast_text from "common.dsl";

# JNI-backed extractor `unperson`: takes one string and yields one string.
# The native library path is substituted from $(LIBUNPERSON). Judging by the key
# names, the remaining entries name the native symbols used to construct and
# destroy the wrapped object, to process the text, and to free returned memory.
# NOTE(review): `main16` presumably selects a UTF-16 entry point — confirm.
create jniwrapper extractor unperson(string str) -> string {
    library = $(LIBUNPERSON)
    ctor = JniWrapperCreateUnperson
    dtor = JniWrapperDestroyUnperson
    main16 = JniWrapperUnpersonText
    free = JniWrapperFree
}

# Subject nearest-neighbour lookup.
# Input:  str - the raw subject string.
# Output (in order): distance, neighbour id and neighbour labels as json objects,
#         the deobfuscated subject, the neighbour's from_domain, and the word
#         count reported by limit_words.
# Nothing is returned when `str` is null (see the guard on the final return).
create chain extractor fast_text_hnsw_subject(string str) -> json_object, json_object, json_object, string, string, long {
    # Deobfuscate the subject, cap it with limit_words(…, 10) and embed the
    # capped text with fast_text. Returns the embedding, the full deobfuscated
    # subject (not the capped one) and the word count.
    create chain extractor fast_text_subject(string subject) -> fast_text_embedding, string, long {
        trace deobfuscator(subject) -> deobfuscated_subject;
        limit_words(deobfuscated_subject, 10) -> truncated_subject, word_count;
        trace fast_text(truncated_subject) -> embedding;
        return embedding;
        return deobfuscated_subject;
        return word_count;
    }

    # Embedding-only wrapper around fast_text_subject; it is the extractor
    # referenced by the hnsw configuration below.
    create chain extractor fast_text_embedding(string subject) -> fast_text_embedding {
        fast_text_subject(subject) -> embedding, deobfuscated_subject, word_count;
        return embedding;
    }

    # HNSW lookup over embeddings. Configured with sample field hdr_subject,
    # distance type `greed`, required label `check_subject`, and the
    # fast_text_embedding extractor declared above.
    # NOTE(review): presumably `extractor` is applied to `sample-field-name` of
    # stored samples to build the index — confirm against the hnsw extractor.
    create hnsw extractor fast_text_hnsw(fast_text_embedding embedding) -> wmd_distance, wmd_neighbour_id, long, neighbour_labels, string {
        sample-field-name = hdr_subject
        distance-type = greed
        required-label = check_subject
        extractor = fast_text_embedding
    }

    fast_text_subject(str) -> embedding, deobfuscated_subject, word_count;
    # neighbour_word_count is bound here but is not part of this chain's return.
    fast_text_hnsw(embedding) -> distance, neighbour_id, neighbour_word_count, neighbour_labels, from_domain;

    return distance.__json_object__, neighbour_id.__json_object__, neighbour_labels.__json_object__, deobfuscated_subject, from_domain, word_count if not is_any_null(str);
}

# Body nearest-neighbour lookup using the body capped by limit_words(…, 30).
# Input:  deobfuscated_pure_body - the body text; the caller in part_processor
#         passes the output of deobfuscator.
# Output (in order): distance and neighbour id as json objects, the neighbour's
#         word count, neighbour labels as a json object, the neighbour's
#         from_domain, and this body's word count.
# Nothing is returned when the input is null.
create chain extractor fast_text_hnsw_pure_body(string deobfuscated_pure_body) -> json_object, json_object, long, json_object, string, long {
    # Cap the (already deobfuscated) body and embed it; also reports the word count.
    create chain extractor fast_text_pure_body(string deobfuscated_pure_body) -> fast_text_embedding, long {
        limit_words(deobfuscated_pure_body, 30) -> truncated_pure_body, word_count;
        trace fast_text(truncated_pure_body) -> embedding;
        return embedding;
        return word_count;
    }

    # Embedding extractor referenced by the hnsw configuration below. Unlike the
    # outer chain, it takes a raw pure_body and runs deobfuscator itself.
    create chain extractor fast_text_embedding(string pure_body) -> fast_text_embedding {
        deobfuscator(pure_body) -> deobfuscated_pure_body;
        fast_text_pure_body(deobfuscated_pure_body) -> embedding, word_count;
        return embedding;
    }

    # NOTE(review): unlike the subject and pure_body10 variants in this file,
    # this hnsw extractor sets neither distance-type nor required-label —
    # confirm that relying on the defaults is intended.
    create hnsw extractor fast_text_hnsw(fast_text_embedding embedding) -> wmd_distance, wmd_neighbour_id, long, neighbour_labels, string {
        sample-field-name = pure_body
        extractor = fast_text_embedding
    }

    fast_text_pure_body(deobfuscated_pure_body) -> embedding, word_count;
    fast_text_hnsw(embedding) -> distance, neighbour_id, neighbour_word_count, neighbour_labels, from_domain;

    return distance.__json_object__, neighbour_id.__json_object__, neighbour_word_count, neighbour_labels.__json_object__, from_domain, word_count if not is_any_null(deobfuscated_pure_body);
}

# Body nearest-neighbour lookup using the body capped by limit_words(…, 10).
# Input:  deobfuscated_pure_body - the body text; the caller in part_processor
#         passes the output of deobfuscator.
# Output (in order): distance, neighbour id and neighbour labels as json objects,
#         and the neighbour's from_domain.
# Nothing is returned when the input is null.
create chain extractor fast_text_hnsw_pure_body10(string deobfuscated_pure_body) -> json_object, json_object, json_object, string {
    # Cap the (already deobfuscated) body and embed it. The word_count produced
    # by limit_words is bound but not returned.
    create chain extractor fast_text_pure_body10(string deobfuscated_pure_body) -> fast_text_embedding {
        limit_words(deobfuscated_pure_body, 10) -> truncated_pure_body, word_count;
        trace fast_text(truncated_pure_body) -> embedding;
        return embedding;
    }

    # Embedding extractor referenced by the hnsw configuration below; it takes a
    # raw pure_body and runs deobfuscator itself before embedding.
    create chain extractor fast_text_embedding(string pure_body) -> fast_text_embedding {
        deobfuscator(pure_body) -> deobfuscated_pure_body;
        fast_text_pure_body10(deobfuscated_pure_body) -> embedding;
        return embedding;
    }

    # HNSW lookup configured with sample field pure_body, distance type `greed`
    # and required label `pure_body10`.
    create hnsw extractor fast_text_hnsw(fast_text_embedding embedding) -> wmd_distance, wmd_neighbour_id, long, neighbour_labels, string {
        sample-field-name = pure_body
        distance-type = greed
        required-label = pure_body10
        extractor = fast_text_embedding
    }

    fast_text_pure_body10(deobfuscated_pure_body) -> embedding;
    # neighbour_word_count is bound here but is not part of this chain's return.
    fast_text_hnsw(embedding) -> distance, neighbour_id, neighbour_word_count, neighbour_labels, from_domain;

    return distance.__json_object__, neighbour_id.__json_object__, neighbour_labels.__json_object__, from_domain if not is_any_null(deobfuscated_pure_body);
}

# `ugc` extractor: takes an HTML body and yields a string plus a string list
# (bound as ugc_id and ugc_deltas by the caller in part_processor).
# Its whole configuration is pulled in from the ugc-conf.d/*.conf files.
create ugc extractor ugc(string html_body) -> string, string_list {
    $(include ugc-conf.d/*.conf)
}

# Post-processes the output of the ugc extractor.
# Input:  ugc_id     - only used for the null guard on the return;
#         ugc_deltas - list of strings that is joined with a single space.
# Output: the deobfuscated joined text and the result of urls() on the joined text.
# Nothing is returned when ugc_id is null.
create chain extractor process_ugc(string ugc_id, string_list ugc_deltas) -> string, string {
    join_string_list(ugc_deltas, " ") -> ugc_content;
    deobfuscator(ugc_content) -> deobfuscated_ugc;
    # urls() is applied to the joined text before deobfuscation, not to deobfuscated_ugc.
    urls(ugc_content) -> ugc_urls;

    return deobfuscated_ugc, ugc_urls if not is_any_null(ugc_id);
}

# Per-document processing chain: computes derived values from one tikaite_doc
# (unperson-ed subject/body, UGC data, sender domain, fast_text/HNSW neighbours
# for subject and body, detected languages) and passes them together with the
# original doc to compose_doc, whose result is returned.
create chain extractor part_processor(tikaite_doc doc) -> tikaite_doc {
    # compose_doc with an empty configuration body; it receives the original doc
    # and every derived value computed below.
    # NOTE(review): presumably each parameter name becomes the name of a field
    # added to the resulting doc — confirm against the compose_doc extractor.
    create compose_doc extractor compose_doc(
        tikaite_doc doc,
        string unperson_subject,
        string unperson_pure_body,
        string ugc_id,
        string deobfuscated_ugc,
        string ugc_urls,
        string from_domain,
        string deobfuscated_subject,
        json_object subject_wmd_distance,
        json_object subject_wmd_neighbour_id,
        json_object subject_wmd_neighbour_labels,
        string subject_wmd_from_domain,
        long subject_word_count,
        string subject_language,
        string deobfuscated_pure_body,
        json_object pure_body_wmd_distance,
        json_object pure_body_wmd_neighbour_id,
        long pure_body_wmd_neighbour_word_count,
        json_object pure_body_wmd_neighbour_labels,
        string pure_body_wmd_from_domain,
        json_object pure_body10_wmd_distance,
        json_object pure_body10_wmd_neighbour_id,
        json_object pure_body10_wmd_neighbour_labels,
        string pure_body10_wmd_from_domain,
        long pure_body_word_count,
        string pure_body_language)
        -> tikaite_doc
    {
    }

    # sed-style substitution: every match of %[^%]+% (a percent sign, one or more
    # non-percent characters, a percent sign) is replaced with an empty replacement.
    create sed extractor erase_tokens(string str) -> string {
        pattern = %[^%]+%
        replacement =
    }

    # unperson is applied to the subject and, under the alias
    # unperson_pure_body with tracing, to the body.
    unperson(doc.hdr_subject.__string__) -> unperson_subject;
    trace unperson as unperson_pure_body(doc.pure_body.__string__) -> unperson_pure_body;
    ugc(doc.html_body.__string__) -> ugc_id, ugc_deltas;
    trace process_ugc(ugc_id, ugc_deltas) -> deobfuscated_ugc, ugc_urls;

    from_domain(doc) -> from_domain;

    # Subject: HNSW lookup, then language detection on the deobfuscated subject
    # after erase_tokens has removed %...% spans.
    trace fast_text_hnsw_subject(doc.hdr_subject.__string__) -> subject_wmd_distance, subject_wmd_neighbour_id, subject_wmd_neighbour_labels, deobfuscated_subject, subject_wmd_from_domain, subject_word_count;
    erase_tokens(deobfuscated_subject) -> clean_subject;
    trace detect_language(clean_subject) -> subject_language;
    # Body: deobfuscate once, feed the same text to both HNSW variants, then
    # detect the language on the text with %...% spans removed.
    trace deobfuscator(doc.pure_body.__string__) -> deobfuscated_pure_body;
    trace fast_text_hnsw_pure_body(deobfuscated_pure_body) -> pure_body_wmd_distance, pure_body_wmd_neighbour_id, pure_body_wmd_neighbour_word_count, pure_body_wmd_neighbour_labels, pure_body_wmd_from_domain, pure_body_word_count;
    trace fast_text_hnsw_pure_body10(deobfuscated_pure_body) -> pure_body10_wmd_distance, pure_body10_wmd_neighbour_id, pure_body10_wmd_neighbour_labels, pure_body10_wmd_from_domain;
    erase_tokens(deobfuscated_pure_body) -> clean_pure_body;
    trace detect_language(clean_pure_body) -> pure_body_language;

    # Arguments are passed positionally in the same order as the declaration above.
    compose_doc(
        doc,
        unperson_subject,
        unperson_pure_body,
        ugc_id,
        deobfuscated_ugc,
        ugc_urls,
        from_domain,
        deobfuscated_subject,
        subject_wmd_distance,
        subject_wmd_neighbour_id,
        subject_wmd_neighbour_labels,
        subject_wmd_from_domain,
        subject_word_count,
        subject_language,
        deobfuscated_pure_body,
        pure_body_wmd_distance,
        pure_body_wmd_neighbour_id,
        pure_body_wmd_neighbour_word_count,
        pure_body_wmd_neighbour_labels,
        pure_body_wmd_from_domain,
        pure_body10_wmd_distance,
        pure_body10_wmd_neighbour_id,
        pure_body10_wmd_neighbour_labels,
        pure_body10_wmd_from_domain,
        pure_body_word_count,
        pure_body_language)
        -> processed_doc;

    return processed_doc;
}

# template_master extractor `sherlock`: takes a text part and an SMTP envelope
# and yields a json object (bound as matched_template by sherlock_wrapper).
# Host and pool/socket timeouts are substituted from TEMPLATE_MASTER_* variables;
# body-fields is set to html_body; stats are reported under the
# `templatemaster` prefix.
create template_master extractor sherlock(tikaite_doc text_part, smtp_envelope envelope) -> json_object {
    body-fields = html_body
    host = $(TEMPLATE_MASTER_HOST)
    connections = 32

    [timeout]
    connect = 200ms
    pool = $(TEMPLATE_MASTER_POOL_TIMEOUT)
    socket = $(TEMPLATE_MASTER_SOCKET_TIMEOUT)

    [stat]
    prefix = templatemaster
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 10, 50, 100, 200, 300, 500, 750, 1000, 2000, 3000
    precise-histogram = false
    processing-time-stats = false
}

# Wraps the sherlock call with a guard on all_from_same_org_id.
# Output: matched_template, returned only when is_false(all_from_same_org_id) holds.
# NOTE(review): the guard is on the return statement; whether the sherlock
# request itself is skipped when the guard fails is not visible here — confirm.
create chain extractor sherlock_wrapper(tikaite_doc text_part, smtp_envelope envelope, boolean all_from_same_org_id) -> json_object {
    trace async sherlock(text_part, envelope) -> matched_template;
    return matched_template if is_false(all_from_same_org_id);
}

# dkim_stats extractor: takes mail meta and DKIM domains and yields a json object.
# It targets the same $(SENDERS_HOST):$(SENDERS_PORT) endpoint and TVM client id
# as the `senders` extractor, uses the SOSEARCH_PROXY_* pool/socket timeouts,
# configures one retry with a 50ms interval for both HTTP and I/O errors, and
# reports stats under the `dkim-stats` prefix.
create dkim_stats extractor dkim_stats(mail_meta meta, dkim_domains domains) -> json_object {
    host = $(SENDERS_HOST):$(SENDERS_PORT)
    connections = 32
    tvm-client-id = $(SENDERS_TVM_CLIENT_ID)

    [timeout]
    connect = 200ms
    pool = $(SOSEARCH_PROXY_POOL_TIMEOUT)
    socket = $(SOSEARCH_PROXY_SOCKET_TIMEOUT)

    [http-error-retries]
    interval = 50ms
    count = 1

    [io-error-retries]
    interval = 50ms
    count = 1

    [stat]
    prefix = dkim-stats
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 10, 50, 100, 200, 300, 500, 750, 1000
    precise-histogram = false
    processing-time-stats = false
}

# senders extractor: takes mail meta, an SMTP envelope and the sender uid and
# yields a `senders` value (main reads sender_info, sender_ml_features and
# sender_ml_embeddings from it).
# Endpoint is $(SENDERS_HOST):$(SENDERS_PORT) with max-references = 5; pool and
# socket timeouts come from SOSEARCH_PROXY_*; one retry with a 50ms interval is
# configured for both HTTP and I/O errors; stats use the `senders` prefix.
create senders extractor senders(mail_meta meta, smtp_envelope envelope, long sender_uid) -> senders {
    host = $(SENDERS_HOST):$(SENDERS_PORT)
    connections = 32
    max-references = 5
    tvm-client-id = $(SENDERS_TVM_CLIENT_ID)

    [timeout]
    connect = 200ms
    pool = $(SOSEARCH_PROXY_POOL_TIMEOUT)
    socket = $(SOSEARCH_PROXY_SOCKET_TIMEOUT)

    [http-error-retries]
    interval = 50ms
    count = 1

    [io-error-retries]
    interval = 50ms
    count = 1

    [stat]
    prefix = senders
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 10, 50, 100, 200, 300, 500, 750, 1000
    precise-histogram = false
    processing-time-stats = false
}

# blackbox_userinfos extractor: takes an SMTP envelope and yields six values.
# main binds them, in order, as resolved_envelope, rcpttos_userinfos,
# mailfrom_userinfo, mailfrom_login, all_from_same_org_id and common_org_id.
# The top-level keys list the requested emails type, dbfields, aliases and
# attributes. Two backends are configured with parallel settings:
# [blackbox] and [corp-blackbox], each with its own host, TVM client id,
# stat prefix, and one retry with a 50ms interval for HTTP and I/O errors.
create blackbox_userinfos extractor blackbox_userinfos(smtp_envelope envelope) -> smtp_envelope, userinfos_map, userinfo, string, boolean, long {
    emails-type = getdefault
    dbfields = suid, reg-date, country
    aliases = pddalias, yandexoid
    attributes = account-is-maillist, phone-confirmation-timestamp, account-connect-organization-ids, account-org-id

    [blackbox]
    host = $(BLACKBOX_HOST)
    connections = 10
    pass-referer = false
    timeout = 200ms
    tvm-client-id = $(BLACKBOX_TVM_CLIENT_ID)

    [blackbox.stat]
    prefix = blackbox
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 1, 10, 20, 50, 100, 150, 200, 300, 500, 750, 1000
    precise-histogram = false
    processing-time-stats = false

    [blackbox.http-error-retries]
    interval = 50ms
    count = 1

    [blackbox.io-error-retries]
    interval = 50ms
    count = 1

    [corp-blackbox]
    host = $(CORP_BLACKBOX_HOST)
    connections = 10
    pass-referer = false
    timeout = 200ms
    tvm-client-id = $(CORP_BLACKBOX_TVM_CLIENT_ID)

    [corp-blackbox.stat]
    prefix = corp-blackbox
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 1, 10, 20, 50, 100, 150, 200, 300, 500, 750, 1000
    precise-histogram = false
    processing-time-stats = false

    [corp-blackbox.http-error-retries]
    interval = 50ms
    count = 1

    [corp-blackbox.io-error-retries]
    interval = 50ms
    count = 1
}

# activity extractor: takes an SMTP envelope and yields a json object.
# Host comes from $(ACTIVITY_HOST) with 5 connections; pool and socket timeouts
# come from SOSEARCH_PROXY_*; one retry with a 50ms interval is configured for
# both HTTP and I/O errors; stats use the `activity-stats` prefix.
create activity extractor activity(smtp_envelope envelope) -> json_object {
    host = $(ACTIVITY_HOST)
    connections = 5

    [timeout]
    connect = 200ms
    pool = $(SOSEARCH_PROXY_POOL_TIMEOUT)
    socket = $(SOSEARCH_PROXY_SOCKET_TIMEOUT)

    [http-error-retries]
    interval = 50ms
    count = 1

    [io-error-retries]
    interval = 50ms
    count = 1

    [stat]
    prefix = activity-stats
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 10, 50, 100, 200, 300, 500, 750, 1000
    precise-histogram = false
    processing-time-stats = false
}

# Looks up the user's uid in bigb and reports the boolean it returns.
# Input:  userinfo - user info record; its uid is obtained via userinfo_uid.
# Output: has_crypta_user_vector, returned only when userinfo is not null.
create chain extractor check_crypta(userinfo userinfo) -> boolean {
    # bigb extractor: takes a uid and yields a boolean plus a crypta_user_vector.
    # Configured with a 300ms timeout, the `bigb` stat prefix, and one retry with
    # a 50ms interval for both HTTP and I/O errors.
    create bigb extractor bigb(long uid) -> boolean, crypta_user_vector {
        host = $(BIGB_HOST)
        connections = 32
        pass-referer = false
        timeout = 300ms
        tvm-client-id = $(BIGB_TVM_CLIENT_ID)

        [stat]
        prefix = bigb
        metrics = httpcodes, requesttimes, requesthist
        histogram-ranges = 0, 1, 10, 20, 50, 100, 150, 200, 300, 500, 750, 1000
        precise-histogram = false
        processing-time-stats = false

        [http-error-retries]
        interval = 50ms
        count = 1

        [io-error-retries]
        interval = 50ms
        count = 1
    }

    userinfo_uid(userinfo) -> uid;
    # crypta_user_vector is bound but only the boolean is returned from this chain.
    trace bigb(uid) -> has_crypta_user_vector, crypta_user_vector;
    return has_crypta_user_vector if not is_any_null(userinfo);
}

# yadisk_info extractor `yadisk`: takes the full set of tikaite docs and yields
# a json object.
# Host and TVM client id come from YADISK_* variables; pool and socket timeouts
# come from SOSEARCH_PROXY_*; one retry with a 50ms interval is configured for
# both HTTP and I/O errors; stats use the `yadisk` prefix.
create yadisk_info extractor yadisk(tikaite_docs docs) -> json_object {
    host = $(YADISK_HOST)
    connections = 32
    pass-referer = false
    tvm-client-id = $(YADISK_TVM_CLIENT_ID)

    [timeout]
    connect = 200ms
    pool = $(SOSEARCH_PROXY_POOL_TIMEOUT)
    socket = $(SOSEARCH_PROXY_SOCKET_TIMEOUT)

    [stat]
    prefix = yadisk
    metrics = httpcodes, requesttimes, requesthist
    histogram-ranges = 0, 1, 10, 20, 50, 100, 150, 200, 300, 500, 750, 1000
    precise-histogram = false
    processing-time-stats = false

    [http-error-retries]
    interval = 50ms
    count = 1

    [io-error-retries]
    interval = 50ms
    count = 1
}

# org_settings extractor: takes the envelope, the mail-from org id and the
# common org id, and yields a boolean and a long (bound by main as
# email_from_org_whitelist and org_daily_limit).
# Settings are read from $(CRETUR_HOST)/get-all-organizations-settings with a
# 10s timeout and update-interval = 1m.
# NOTE(review): the keystore is declared as type PKCS12 while the file name ends
# in .jks — confirm the file's actual format matches the declared type.
# NOTE(review): the keystore password is written inline here rather than
# substituted from a variable like the other settings — confirm this is intended.
create org_settings extractor org_settings(smtp_envelope envelope, long mailfrom_org_id, long common_org_id) -> boolean, long {
    uri = $(CRETUR_HOST)/get-all-organizations-settings
    connections = 2
    timeout = 10s
    update-interval = 1m
    tvm-client-id = $(CRETUR_TVM_ID)

    [https.keystore]
    file = $(ALL_CAS_DIR)/allCAs.jks
    type = PKCS12
    password = allcaspassword
}

# Top-level chain: parses the raw mail, processes every part, gathers sender /
# recipient / organisation / template / activity / DKIM / Disk information from
# the extractors declared in this file, merges everything into one json_map via
# compose_doc, passes it through the `info` log extractor and returns the result.
create chain extractor main(raw_mail mail, smtp_envelope envelope) -> json_map {
    # Applies part_processor to each document of a tikaite_docs collection,
    # with parallel = true.
    create for_each_tikaite_doc extractor parts_processor(tikaite_docs docs) -> tikaite_docs {
        extractor = part_processor
        parallel = true
    }

    # DSSM model loaded from $(MAIL_DSSM); the query_embedding layer is used.
    # [fields-mapping] maps document fields to model inputs; both the reply_to_*
    # and hdr_from_* fields map onto fromname / fromaddr.
    # NOTE(review): which of the two sources wins when both are present is not
    # visible here — confirm against the dssm extractor.
    create dssm extractor mail_dssm(tikaite_doc doc) -> dssm_embedding {
        model = $(MAIL_DSSM)
        layer = query_embedding
        preprocess-fields = true

        [fields-mapping]
        pure_body = body
        hdr_subject = subject
        reply_to_display_name = fromname
        reply_to_normalized = fromaddr
        hdr_from_display_name = fromname
        hdr_from_normalized = fromaddr
    }

    # compose_doc with an empty configuration body; takes a base json_map plus
    # every value gathered below and yields the resulting json_map.
    create compose_doc extractor compose_doc(
        json_map base,
        json_object docs,
        string queue_id,
        string hbf_project_id,
        json_object mail_dssm_embedding,
        json_object text_part_wmd_distance,
        json_object text_part_wmd_neighbour_id,
        json_object text_part_neighbour_labels,
        json_object rcpttos_userinfos,
        json_object mailfrom_userinfo,
        string mailfrom_login,
        boolean all_from_same_org_id,
        long common_org_id,
        boolean email_from_org_whitelist,
        long org_daily_limit,
        json_object senders,
        json_object sender_ml_features,
        json_object sender_ml_embeddings,
        json_object matched_template,
        json_object activity,
        json_object dkim_domains,
        json_object dkim_stats,
        json_object has_crypta_user_vector,
        json_object yadisk_info)
        -> json_map
    {
    }

    # Log extractor: takes the result map plus four values and yields a json_map
    # (returned by this chain as logged_result). The body assigns a text prefix
    # to each of the four values.
    # NOTE(review): the trailing `\\ ` presumably escapes a trailing space in the
    # prefix — keep the whitespace at the end of these lines intact.
    create log extractor info(
        json_map result,
        string queue_id,
        json_object text_part_wmd_distance,
        json_object text_part_wmd_neighbour_id,
        json_object text_part_neighbour_labels)
        -> json_map
    {
        queue_id = QueueID:\\ 
        text_part_wmd_distance = Text part wmd distance:\\ 
        text_part_wmd_neighbour_id = Text part wmd neighbour id:\\ 
        text_part_neighbour_labels = Text part wmd neighbour labels:\\ 
    }

    # Parse the raw mail into its meta information and its documents.
    trace async tikaite(mail) -> meta, docs;

    header(meta, "x-yandex-queueid") -> queue_id;
    parts_processor(docs) -> processed_docs;

    sender_ips(meta, envelope) -> sender_ips;
    hbf_project_id(sender_ips) -> hbf_project_id;

    # The DSSM embedding and sherlock use the text part of the unprocessed docs;
    # the WMD fields below are read from the text part of the processed docs.
    text_part(docs) -> text_part;
    trace mail_dssm(text_part) -> mail_embedding;

    text_part(processed_docs) -> processed_text_part;

    trace blackbox_userinfos(envelope) -> resolved_envelope, rcpttos_userinfos, mailfrom_userinfo, mailfrom_login, all_from_same_org_id, common_org_id;
    identity(mailfrom_userinfo.__json_map__) -> mailfrom_json;
    # org_settings receives the original envelope, not resolved_envelope.
    org_settings(envelope, mailfrom_json.attributes.account_org_id.__long__, common_org_id) -> email_from_org_whitelist, org_daily_limit;
    # has_crypta_user_vector can be true, false or null for external senders
    check_crypta(mailfrom_userinfo) -> has_crypta_user_vector;

    senders(meta, resolved_envelope, mailfrom_json.uid.__long__) -> senders;

    sherlock_wrapper(text_part, resolved_envelope, all_from_same_org_id) -> matched_template;

    trace activity(resolved_envelope) -> activity;

    dkim_domains(meta) -> dkim_domains;
    dkim_stats(meta, dkim_domains) -> dkim_stats;

    yadisk(docs) -> yadisk_info;

    # NOTE(review): `pure_body_neighbours` is read from the processed text part
    # here and in the info() call below, but part_processor's compose_doc
    # declares no parameter with that name (it declares
    # pure_body_wmd_neighbour_labels) — confirm this field exists on the doc.
    create_json_map() -> empty_map;
    compose_doc(
        empty_map,
        processed_docs.__json_object__,
        queue_id,
        hbf_project_id,
        mail_embedding.__json_object__,
        processed_text_part.pure_body_wmd_distance,
        processed_text_part.pure_body_wmd_neighbour_id,
        processed_text_part.pure_body_neighbours,
        rcpttos_userinfos.__json_object__,
        mailfrom_json.__json_object__,
        mailfrom_login,
        all_from_same_org_id,
        common_org_id,
        email_from_org_whitelist,
        org_daily_limit,
        senders.sender_info,
        senders.sender_ml_features,
        senders.sender_ml_embeddings,
        matched_template,
        activity,
        dkim_domains.__json_object__,
        dkim_stats,
        has_crypta_user_vector.__json_object__,
        yadisk_info)
        -> result;

    info(
        result,
        queue_id,
        processed_text_part.pure_body_wmd_distance,
        processed_text_part.pure_body_wmd_neighbour_id,
        processed_text_part.pure_body_neighbours)
        -> logged_result;

    return logged_result;
}

