#include <util/system/defaults.h>
#include <contrib/deprecated/mimepp/mimepp/mimepp.h>
#include <util/string/split.h>
#include <util/str_stl.h>
#include <mail/so/corp/msg_index.h>
#include <mail/so/corp/mkshn.h>
#include <library/cpp/archive/yarchive.h>

// Raw bytes of an archive embedded into the binary; the byte list itself lives
// in the generated "hashes.inc" file.
static const unsigned char STOP_WORDS_RAW_DATA[] = {
#include "hashes.inc"
};

// Reader over the embedded archive. TBlob::NoCopy is used, so the blob refers to
// STOP_WORDS_RAW_DATA directly instead of duplicating it.
// The reader is defined before the dictionaries below because they are
// initialised from it (objects in one translation unit are initialised in
// definition order).
static const TArchiveReader HASHES_READER(TBlob::NoCopy(STOP_WORDS_RAW_DATA, sizeof(STOP_WORDS_RAW_DATA)));
// Word-hash dictionaries shared by every TGlobalContext, each loaded from one
// object of the embedded archive.
const mhash TGlobalContext::dict_rus(HASHES_READER.ObjectBlobByKey("/words.hash"));
const mhash TGlobalContext::dict_tur(HASHES_READER.ObjectBlobByKey("/words_tur.hash"));
const mhash TGlobalContext::dict_stopwords(HASHES_READER.ObjectBlobByKey("/stop-words.UTF8.hash"));

// Parses a tab-separated feature description into a TCatboostDict.
//
// The first line of the stream is read and discarded. Every following line is
// split on '\t' into <index, type, feature>; the index column is parsed but not
// used. Each feature is numbered by insertion order in `features`, and also
// in `floatFeatures` ("Num") or `categFeatures` ("Categ").
//
// Throws yexception (with backtrace) when a line has any other type.
TCatboostDict TCatboostDict::Load(IInputStream& stream) {
    TCatboostDict result;
    TString rawLine;

    // Consume and ignore the first line.
    stream.ReadLine(rawLine);

    while (stream.ReadLine(rawLine)) {
        TStringBuf row(rawLine);
        row.ChopSuffix("\n");
        row.ChopSuffix("\r");

        size_t column{};
        TStringBuf kind;
        TStringBuf name;
        Split(row, '\t', column, kind, name);

        // The global numbering is assigned before the type is validated.
        result.features.emplace(name, result.features.size());

        if (kind == "Num") {
            result.floatFeatures.emplace(name, result.floatFeatures.size());
        } else if (kind == "Categ") {
            result.categFeatures.emplace(name, result.categFeatures.size());
        } else {
            ythrow TWithBackTrace<yexception>() << "unknown feature type " << kind << " with value " << name;
        }
    }

    return result;
}

// Character classification table indexed by the unsigned value of a byte.
// A non-zero entry marks the byte as part of a word; a zero entry marks a
// separator (including the terminating NUL at index 0). The tokenizers in this
// file only test entries for zero / non-zero.
// ASCII digits and letters map to themselves; entries for bytes >= 0x80 are
// 0, 1, 'X', 'c' or 'R'.
// NOTE(review): the high half presumably describes a single-byte Cyrillic code
// page — confirm which one before changing it.
static constexpr char text_chars_yx[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0,
    0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0, 0, 0, 0, 0,
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    0, 'X', 'X', 1, 1, 1, 1, 1, 'X', 'c', 'X', 1, 1, 1, 'R', 'X',
    0, 1, 1, 1, 1, 'X', 1, 1, 'X', 1, 'X', 1, 1, 1, 1, 'X',
    'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',
    'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 1, 'X', 'X', 'X', 'X', 'X',
    'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',
    'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 1, 'X', 'X', 'X', 'X', 'X'};

// Splits `s` into words (maximal runs of bytes whose text_chars_yx entry is
// non-zero) and returns the set of hash_funck() values of those words.
// Unlike the shingle tokenizers, no length or "all digits" filter is applied.
THashSet<ui64> TShinglesCounter::get_words_hash(const TString& s) {
    THashSet<ui64> hashes;
    const char* cur = s.c_str();

    for (;;) {
        // Skip separators up to the next word or the terminating NUL.
        while (*cur != 0 && text_chars_yx[(ui8)*cur] == 0) {
            ++cur;
        }
        if (*cur == 0) {
            break;
        }

        // Entry 0 of the table is zero, so this scan also stops at the NUL.
        const char* const wordBegin = cur;
        while (text_chars_yx[(ui8)*cur] != 0) {
            ++cur;
        }

        const ui64 wordHash = hash_funck(wordBegin, cur - wordBegin);
        hashes.insert(wordHash);
    }

    return hashes;
}

// Computes the textual super-shingle of `doc`, skipping every word whose hash
// occurs in `bw`. Returns an empty string when get_supershingle_2() yields
// no shingle (null result or empty text).
TString TShinglesCounter::get_shingle_2(const TString& doc, const TString& bw) {
    const THashSet<ui64> ignoredHashes(get_words_hash(bw));

    char buffer[255];
    const char* const shingle = get_supershingle_2(doc.c_str(), buffer, ignoredHashes);

    if (shingle == nullptr || *shingle == '\0') {
        return {};
    }
    return shingle;
}

// Returns true when each of the first `l` bytes of `w` is a decimal digit.
// An empty range (l == 0) is reported as a number.
bool is_number(const char* w, size_t l) {
    for (size_t i = 0; i < l; ++i) {
        // isdigit() requires a value representable as unsigned char.
        if (isdigit((unsigned char)w[i]) == 0) {
            return false;
        }
    }
    return true;
}

// Factor by which the oldest word hash is multiplied before it is subtracted
// from the rolling shingle hash (the hash itself is built with multiplier 6151).
// NOTE(review): 6151^9 does not fit into 64 bits, so the literal is presumably
// that power reduced modulo 2^64 — confirm.
static constexpr ui64 upper_wrd = ULL(0x62d00b60619a9747); //6151 ^ 9

bool TShinglesCounter::getnextwordhash(const char*& pos, ui64& to, const THashSet<ui64>& bw) {
    const char* p = pos;
    const char* start_pos = 0;
    ui64 hk = 0;
    TString to_bayes;

next_word:
    while (*p && !text_chars_yx[(ui8)*p])
        p++;
    if (!*p)
        return false;
    hk = 0;

    start_pos = p;
    for (; text_chars_yx[(ui8)*p]; p++)
        ;
    if (((p - start_pos) < 2) || is_number(start_pos, p - start_pos))
        goto next_word;

    hk = hash_funck(start_pos, p - start_pos);
    sh_full_words++;

    // Already done
    if (bw.find(hk) != bw.end())
        goto next_word;

    // Dict check
    switch (language) {
        case LANG_RUS:
        case LANG_UNK_CYR:
        case LANG_ENG:
            if (!GlobalContext.dict_rus.find(hk))
                goto next_word;
            break;
        case LANG_TUR:
            if (!GlobalContext.dict_tur.find(hk))
                goto next_word;
            break;
        default:
            if (!GlobalContext.dict_rus.find(hk))
                goto next_word;
    }

    pos = p;
    to = hk;
    sh_dict_words++;

    return true;
}

// Folds the shingle vector of the text `is` into a single 64-bit value.
//
// When `start_val` is given, folding continues from *start_val and every
// non-zero element of the vector is mixed in. Without it, the first element
// seeds the value and the remaining non-zero elements are mixed in.
// If the text yields no shingles, *start_val (or 0 when it is null) is returned.
ui64 TShinglesCounter::get_supershingle_next(const char* is,
                                     const THashSet<ui64>& bw,
                                     ui64* start_val) {
    TVector<ui64> parts;
    get_doc_shinv(parts, is, bw);

    ui64 acc = start_val ? *start_val : 0;
    if (parts.empty()) {
        return acc;
    }

    size_t first = 0;
    if (!start_val) {
        acc = parts[0];
        first = 1;
    }

    for (size_t i = first; i < parts.size(); ++i) {
        if (parts[i] != 0) {
            acc = acc * 6151 + parts[i];
        }
    }

    return acc;
}

// Builds the shingle vector of the text `is` from dictionary words
// (see getnextwordhash()).
//
// A rolling polynomial hash (multiplier 6151) is kept over a ring buffer of the
// latest word hashes. After every new word the current hash is offered to one
// of 49 buckets:
//   * bucket 0 keeps the smallest hash seen (0 means "empty");
//   * otherwise the hash goes to the first bucket k in 1..48 such that the hash
//     is divisible by 2^k and is smaller than the value stored there (or the
//     bucket is empty).
//
// Returns 0 and stores the 49 buckets in `shv` on success.
// Returns 1 and leaves `shv` empty when the text has no suitable word at all
// or ends before 5 words were read.
int TShinglesCounter::get_doc_shinv(TVector<ui64>& shv,
                            const char* is,
                            const THashSet<ui64>& bw) {
    shv.clear();

    ui64 f_shi;
    if (!TShinglesCounter::getnextwordhash(is, f_shi, bw))
        return 1;

    // Ring buffer of word hashes: slot 0 starts empty, the other nine slots are
    // pre-filled with the first word.
    ui64 cur_word[10];
    cur_word[0] = 0;
    for (int i = 1; i < 10; i++)
        cur_word[i] = f_shi;

    // 6151 is a good prime number between 7 ^ 4 and 7 ^ 5 :)
    ui64 cur_shingle = 0;
    for (int i = 1; i < 10; i++)
        cur_shingle = cur_shingle * 6151 + cur_word[i];

    TVector<ui64> res(49, ui64());
    int n = 0;
    int num_w = 1;
    while (true) {
        // Remove the contribution of the slot that is about to be overwritten.
        cur_shingle -= cur_word[n] * upper_wrd;
        if (!TShinglesCounter::getnextwordhash(is, cur_word[n], bw)) {
            if (num_w < 5)
                return 1;
            break;
        }

        cur_shingle = cur_shingle * 6151 + cur_word[n];
        ++num_w;

        if (++n >= 10)
            n = 0;

        if (cur_shingle < res[0] || res[0] == 0) {
            res[0] = cur_shingle;
            continue;
        }

        for (int k = 1; k < 49; ++k) {
            // The shift must be done in 64 bits: k goes up to 48 and shifting a
            // 32-bit int by that much is undefined behaviour.
            const ui64 modulus = 1ull << k;
            if ((cur_shingle % modulus) == 0) {
                if (cur_shingle < res[k] || res[k] == 0) {
                    res[k] = cur_shingle;
                    break;
                }
            }
        }
    }
    shv = std::move(res);

    return 0;
}

bool TShinglesCounter::getnextwordhash_2(const char*& pos,
                     ui64& to,
                     const THashSet<ui64>& bw) {
    const char* p = pos;
    const char* start_pos = 0;
    ui64 hk = 0;

next_word:
    while (*p && !text_chars_yx[(ui8)*p])
        p++;
    if (!*p)
        return false;
    hk = 0;
    start_pos = p;
    for (; text_chars_yx[(ui8)*p]; p++)
        ;
    if (((p - start_pos) < 2) || is_number(start_pos, p - start_pos))
        goto next_word;

    hk = hash_funck(start_pos, p - start_pos);
    full_words_count++;

    if (bw.find(hk) != bw.end())
        goto next_word;

    // commented out dictionary check for raw text, PWORDS_XX not used anymore
    switch (language) {
        case LANG_RUS:
        case LANG_UNK_CYR:
        case LANG_ENG:
            //            if (!dict_rus.find(hk))
            // goto next_word;
            break;
        case LANG_TUR:
            //            if (!dict_tur.find(hk))
            // goto next_word;
            break;
        default:
            //            if (!dict_rus.find(hk))
            // goto next_word;
            ;
    }

    pos = p;
    to = hk;
    dict_words_count++;

    return true;
}

// Builds the shingle vector of the raw text `is` (see getnextwordhash_2()).
//
// A rolling polynomial hash (multiplier 6151) is kept over a ring buffer of the
// latest word hashes. After every new word the current hash is offered to one
// of 49 buckets:
//   * bucket 0 keeps the smallest hash seen (0 means "empty");
//   * otherwise the hash goes to the first bucket k in 1..48 such that the hash
//     is divisible by 2^k and is smaller than the value stored there (or the
//     bucket is empty).
//
// Returns 0 with the 49 buckets in `shv` on success.
// Returns 1 with `shv` EMPTY when the text has no suitable word at all or ends
// before 5 words were read. Callers that look only at shv.size() therefore see
// the failure as well.
int TShinglesCounter::get_doc_shinv_2(TVector<ui64>& shv,
                  const char* is,
                  const THashSet<ui64>& bw) {
    // Start from a defined state so that every failure path leaves shv empty.
    shv.clear();

    ui64 f_shi;
    if (!getnextwordhash_2(is, f_shi, bw))
        return 1;

    // Ring buffer of word hashes: slot 0 starts empty, the other nine slots are
    // pre-filled with the first word.
    ui64 cur_word[10];
    cur_word[0] = 0;
    for (int i = 1; i < 10; i++)
        cur_word[i] = f_shi;

    // 6151 is a good prime number between 7 ^ 4 and 7 ^ 5 :)
    ui64 cur_shingle = 0;
    for (int i = 1; i < 10; i++)
        cur_shingle = cur_shingle * 6151 + cur_word[i];

    int n = 0;
    int num_w = 1;

    shv.resize(49, 0);
    while (true) {
        // Remove the contribution of the slot that is about to be overwritten.
        cur_shingle -= cur_word[n] * upper_wrd;
        if (!getnextwordhash_2(is, cur_word[n], bw)) {
            if (num_w < 5) {
                // Too few words: drop the partially filled buckets so the
                // output agrees with the failure return code.
                shv.clear();
                return 1;
            }
            break;
        }

        cur_shingle = cur_shingle * 6151 + cur_word[n];
        ++n;
        ++num_w;
        if (n >= 10)
            n = 0;

        if (cur_shingle < shv[0] || shv[0] == 0) {
            shv[0] = cur_shingle;
            continue;
        }

        for (int k = 1; k < 49; ++k) {
            const ui64 modulus = 1ull << k;
            if ((cur_shingle % modulus) == 0) {
                if (cur_shingle < shv[k] || shv[k] == 0) {
                    shv[k] = cur_shingle;
                    break;
                }
            }
        }
    }

    return 0;
}

char* TShinglesCounter::get_supershingle_2(const char* is,
                       char* outbuf,
                       const THashSet<ui64>& bw) {
    TVector<ui64> res;
    get_doc_shinv_2(res, is, bw);
    ui32 n, e = res.size();
    //fprintf(stderr,"shinv %u\n",e);
    if (!e) {
        *outbuf = 0;
        return 0;
    }

    ui64 s = res[0];
    for (n = 1; n < res.size(); n++) {
        if (res[n])
            s = s * 6151 + res[n];
    }
    if (s) {
        sprintf(outbuf, "%015" PRIx64, s);
        return outbuf;
    } else {
        return nullptr;
    }
}

// Initialises the GlobalContext member from the argument and fills `bw` with
// the hashes of all words found in `bw_str` (see get_words_hash()).
TShinglesCounter::TShinglesCounter(const TGlobalContext& GlobalContext, const TString& bw_str)
: GlobalContext(GlobalContext)
, bw(get_words_hash(bw_str)){}

// Computes the shingle of a whole message and stores it in last_shingle.
// The text to hash is produced by shingl_index(); texts shorter than 10 bytes
// are ignored and last_shingle is left unchanged. The words of `serv`, `user`
// and `fname` are excluded from the shingle.
void TShinglesCounter::count_shingle_2(const TLog& logger, const mimepp::Message& msg, const TString& serv, const TString& user, const TString& fname) {
    const TString doc = shingl_index(GlobalContext, logger, msg);
    if (doc.length() >= 10) {
        const TString ignoredWords = TStringBuilder{} << serv << ' ' << user << ' ' << fname;
        last_shingle = get_shingle_2(doc, ignoredWords);
    }
}

void TShinglesCounter::feed(const TString& text) {
    if (first_time) {
        TString sStr(text);
        ToLower(sStr.begin(), sStr.size(), *CodePageByCharset(CODES_KOI8));

        shingle_val = get_supershingle_next(sStr.c_str(), bw, &shingle_val);
        first_time = false;
    } else {
        shingle_val = get_supershingle_next(text.c_str(), bw, &shingle_val);
    }

    if (shingle_val) {
        char outbuf[30];
        sprintf(outbuf, "%015llx", static_cast<unsigned long long int>(shingle_val));
        last_cshingle = outbuf;
    }
}

// Returns sh_dict_words as an integer percentage of sh_full_words,
// or 100 when no words have been counted yet.
long long int TShinglesCounter::fwords() {
    if (sh_full_words == 0) {
        return 100;
    }
    return sh_dict_words * 100 / sh_full_words;
}

// Returns fcrc as zero-padded lowercase hex (at least 15 digits), or an empty
// string while fcrc still holds the initial value FNV64INIT.
[[nodiscard]] TString TContext::count_crc() const {
    if (fcrc == FNV64INIT) {
        return {};
    }
    // "%llx" expects unsigned long long; cast explicitly so the argument matches
    // the format on every platform, as feed() does for the same format.
    return Sprintf("%015llx", static_cast<unsigned long long>(fcrc));
}

// Resets deep_level to 0.
void TContext::init_deep_level() {
    deep_level = 0;
}

// Builds the process-wide context from the daemon configuration:
// loads the zone lists, initialises everything set up by InitAssassin(),
// opens the configured log files and finally starts the solver thread pool.
TGlobalContext::TGlobalContext(const TSoConfig& config)
    : ThreadFactory(config.ThreadStackSize)
    , ThreadPool(TThreadPoolParams(&ThreadFactory)) {

    InitZones(config, SysLogger());
    InitAssassin(config); // dictionaries, tags, splib rules

    // The daemon log is always opened.
    pDaemonLogger = Loggers.GetSpLoggerPtr(SO_DAEMON_LOGGER);
    Openlog(config.fakeSyslog, config.logLevel);
    pDaemonLogger->Openlog(SO_ERRORS_AND_WARNINGS, config.fakeSyslog.c_str());

    // The filter, HTTP and rules logs are opened only when a file name is configured.
    if (config.fnFilterLog) {
        pFilterLogger = Loggers.GetSpLoggerPtr(SO_FILTER_LOGGER);
        pFilterLogger->Openlog(SO_ERRORS_AND_WARNINGS, config.fnFilterLog.c_str());
    }

    if (config.fnHttpLog) {
        pHttpLogger = Loggers.GetSpLoggerPtr(SO_HTTP_LOGGER);
        pHttpLogger->Openlog(SO_ERRORS_AND_WARNINGS, config.fnHttpLog.c_str());
    }

    if (config.fnRulesLog)
        Loggers.GetSpLoggerPtr(SO_RULES_LOGGER)->Openlog(SO_ERRORS_AND_WARNINGS, config.fnRulesLog.c_str());

    // NOTE(review): the arguments are presumably the thread count and the
    // queue size limit — confirm against the thread pool API.
    ThreadPool.Start(config.solverThreads, 2 * config.solverThreads);
}

// Increments the BusyContexts counter.
// NOTE(review): presumably called when a context is taken into use — confirm with callers.
void TGlobalContext::DequeContext() {
    BusyContexts.Inc();
}

// Decrements the BusyContexts counter.
// NOTE(review): presumably called when a context is released — confirm with callers.
void TGlobalContext::EnqueContext() {
    BusyContexts.Dec();
}

void TGlobalContext::UpUserWeights() {
    TUserWeightsPairMap weights;

    if (Pools->UserWeightsNgRequester) {
        if ((weights = Pools->UserWeightsNgRequester->GetUserWeights(SysLogger()))) {
            Syslog(TLOG_INFO) << "fetch " << weights.size() << " new weights";
        }
    }

    if(weights) {
        UserWeights = MakeTrueAtomicShared<TUserWeightsPairMap>(std::move(weights));
    }
}

void TGlobalContext::InitZones(const TSoConfig& spTop, const TLog& logger) {
    try {
        ip_matcher.read_file(spTop.TrustedZonesPath);
    } catch (yexception& e) {
        logger << TLOG_ERR << CurrentExceptionMessageWithBt();
    }

    try {
        intranet_matcher.read_file(spTop.IntranetZonesPath);
    } catch (yexception& e) {
        logger << TLOG_ERR << CurrentExceptionMessageWithBt();
    }
#ifdef SO_CMAIL
    try {
        local_matcher.read_file(spTop.LocalZonesPath);
    } catch (...) {
        logger << TLOG_ERR << CurrentExceptionMessageWithBt();
    }
#endif
}

// Creates the heavy shared components of the filter from the configuration:
// the optional TVM client, the recognizer, the rules holder, the library
// parameters, the optional log backends, the models, the request pools and
// several optional resources (DSSM model, text deobfuscator, domain factors,
// per-uid statistics). Optional parts are created only when the corresponding
// configuration value is set.
void TGlobalContext::InitAssassin(const TSoConfig& spTop) {
    // Stays null when no TVM configuration is present; it is handed to the
    // request pools below either way.
    TAtomicSharedPtr<NTvmAuth::TTvmClient> tvmClient;

    if (const auto &conf = spTop.TvmConfig) {
        NTvmAuth::NTvmApi::TClientSettings tvmBBClientSettings;
        tvmBBClientSettings.SetSelfTvmId(conf->Id); // tvm resource ID for sp-daemon, refer to ABC
        NTvmAuth::NTvmApi::TClientSettings::TDstMap dsts;

        // Copy the configured destination services into the TVM destination map.
        for (const auto &[service, id]: conf->Services) {
            dsts.emplace(service, id);
        }

        tvmBBClientSettings.EnableServiceTicketsFetchOptions(conf->Secret, std::move(dsts));

        tvmClient = MakeAtomicShared<NTvmAuth::TTvmClient>(tvmBBClientSettings,
                                                                 new NTvmAuth::TCerrLogger(7));
    }

    Recoder = MakeHolder<TRecognizer>(spTop.RecorderDictPath);

    RulesHolder = MakeTrueAtomicShared<TRulesHolder>(false,
                                                             spTop.dnRules,
                                                             Loggers,
                                                             spTop.PcreSettings,
                                                             spTop.HsRulesCache,
                                                             spTop.RulesDictPath);

    LibspParams = T_SpParams(spTop, &Loggers);

    // Optional log backends: each one is set up only when its path is configured.
    if (spTop.ShortLog)
        Loggers.ShortLog.ResetBackend(CreateOwningThreadedLogBackend(spTop.ShortLog));

    if(spTop.KasperskyLogger) {
        Loggers.KasperskyLogger.ResetBackend(CreateFilteredOwningThreadedLogBackend(spTop.KasperskyLogger, TLOG_RESOURCES));
    }

    if(spTop.JsonMlLog) {
        Loggers.JsonMlLog.ResetBackend(CreateFilteredOwningThreadedLogBackend(spTop.JsonMlLog, TLOG_RESOURCES));
    }

    if (spTop.ComplLog)
        Loggers.GetSpLoggerPtr(SO_COMPL_LOGGER)->Openlog(SO_ERRORS_AND_WARNINGS, spTop.ComplLog.c_str(),
                                                                       false);
    if (spTop.fnClassification) {
        Loggers.DeliveryLog.ResetBackend(CreateFilteredOwningThreadedLogBackend(spTop.fnClassification, TLOG_RESOURCES));
    }

    if (spTop.MLLog)
        Loggers.GetSpLoggerPtr(SO_ML_LOGGER)->Openlog(SO_ERRORS_AND_WARNINGS, spTop.MLLog.c_str(), false);
    if (spTop.UserRepLog)
        Loggers.GetSpLoggerPtr(SO_USERREP_LOGGER)->Openlog(SO_ERRORS_AND_WARNINGS,
                                                                         spTop.UserRepLog.c_str());

    //////////////////////////////

    Models = MakeTrueAtomicShared<TAppliersMap>(LoadModels(spTop));

    //////////////////////////////

    // The pools receive the TVM client created above (possibly null).
    Pools = MakeTrueAtomicShared<TRengineCurlPools>(LibspParams.pSpTop, tvmClient);

    if (spTop.Text2VecDssm) {
        TextToVecDssm = MakeTrueAtomicShared<NNeuralNetApplier::TModel>(
                NNeuralNetApplier::TModel::FromFile(spTop.Text2VecDssm));
    }

    if (spTop.DeobfuscatorConfigs) {
        // One word deobfuscator is registered per configured entry; the
        // deobfuscator is initialised once after all of them were added.
        NTextDeobfuscate::TTextDeobfuscator deobfuscator;
        for (const auto&[_, config]: spTop.DeobfuscatorConfigs->Configs) {
            deobfuscator.AddWordDeobfuscator(config.RemapPath, config.TriePath);
        }
        deobfuscator.Initialize();
        TextDeobfuscator = MakeTrueAtomicShared<NTextDeobfuscate::TTextDeobfuscator>(std::move(deobfuscator));
    }

    if (spTop.DomenFactorsTrie) {
        const auto blob = TBlob::FromFile(spTop.DomenFactorsTrie);
        DomenFactorsBuilder = MakeTrueAtomicShared<NDomenFactors::TDomenFactorsBuilder>(blob);
    }

    if (spTop.UidsFeaturesPath) {
        // Deserialise the per-uid statistics from the file and publish a copy.
        TFileInput fileInput(spTop.UidsFeaturesPath);
        TUidsStats uidsStats;
        ::Load(&fileInput, uidsStats);
        UidsStats = MakeTrueAtomicShared<TUidsStats>(uidsStats);
    }
}
