#include <util/generic/vector.h>
#include <util/generic/map.h>
#include <util/generic/set.h>
#include <util/stream/file.h>
#include <util/stream/str.h>
#include <util/stream/output.h>
#include <util/string/reverse.h>

#include <library/cpp/containers/comptrie/prefix_iterator.h>

#include <wmconsole/version3/junk/spam_hosts_ml/dataset/dataset.h>

#include <wmconsole/version3/wmcutil/string.h>
#include <wmconsole/version3/wmcutil/url.h>

#include "archive.h"
#include "dataset.h"

// Prepares all lookup tables used by GetFeatures():
//   - TldMap      from LoadTldMap()      (TLD string -> index),
//   - NGrammsMap  from BuildNGrammsMap() (two-character n-gram -> index),
//   - IdfMap      from LoadIdfMap()      (n-gram -> IDF weight),
// and then builds the compact trie over the "/2ld.list" entries.
// NOTE: C++ initializes members in their declaration order in the class,
// which is not necessarily the order they are written in this list.
TClfDataset::TClfDataset()
    : TldMap(LoadTldMap())
    , NGrammsMap(BuildNGrammsMap())
    , IdfMap(LoadIdfMap())
{
    Build2ldListTrie(); // fills TrieStream and points Trie at it
}

// Loads "/tld.list" through LoadDataFromArchive() and maps every line to its
// zero-based line number. When the same line occurs more than once, the
// number of its last occurrence wins.
TMap<TString, size_t> TClfDataset::LoadTldMap() {
    TBlob archiveData;
    LoadDataFromArchive("/tld.list", archiveData);
    TMemoryInput lines(archiveData.Data(), archiveData.Length());

    TMap<TString, size_t> indexByTld;
    size_t lineNumber = 0;
    TString tld;
    while (lines.ReadLine(tld)) {
        indexByTld[tld] = lineNumber;
        ++lineNumber;
    }

    return indexByTld;
}

// Loads "/idf.list" through LoadDataFromArchive(). Every line is parsed as an
// n-gram token followed by its IDF weight; a later line with the same n-gram
// overrides an earlier one.
TMap<TString, double> TClfDataset::LoadIdfMap() {
    TBlob archiveData;
    LoadDataFromArchive("/idf.list", archiveData);
    TMemoryInput lines(archiveData.Data(), archiveData.Length());

    TMap<TString, double> weightByNGramm;
    TString record;
    while (lines.ReadLine(record)) {
        TStringInput fields(record);
        TString key;
        double weight = 0.0;
        fields >> key;
        fields >> weight;
        weightByNGramm[key] = weight;
    }

    return weightByNGramm;
}

// Enumerates every two-character combination over the supported hostname
// alphabet and numbers the combinations in sorted order, starting from zero.
TMap<TString, size_t> TClfDataset::BuildNGrammsMap() {
    const TString alphabet = "qwertyuiopasdfghjklzxcvbnm1234567890-_";

    // The set both deduplicates and sorts the n-grams.
    TSet<TString> sortedNGramms;
    for (const char first : alphabet) {
        for (const char second : alphabet) {
            const char pair[3] = { first, second, 0 };
            sortedNGramms.insert(pair);
        }
    }

    TMap<TString, size_t> indexByNGramm;
    size_t position = 0;
    for (const TString &ngramm : sortedNGramms) {
        indexByNGramm[ngramm] = position;
        ++position;
    }

    return indexByNGramm;
}

void TClfDataset::Build2ldListTrie() {
    TBlob blob;
    LoadDataFromArchive("/2ld.list", blob);
    TMemoryInput inputStream(blob.Data(), blob.Length());

    TCompactTrie<char>::TBuilder trieBuilder;
    for(TString line; inputStream.ReadLine(line);) {
        TString rline = line;
        ReverseInPlace(rline);
        trieBuilder.Add(rline, rline.size());
    }

    TBufferStream data;
    trieBuilder.SaveAndDestroy(data);
    TrieStream.assign(data.Buffer().Data(), data.Buffer().Data() + data.Buffer().Size());
    Trie.Init(&TrieStream[0], TrieStream.size());
}

bool TClfDataset::Is2ldListSubdomain(const TString &host) const {
    TString rhost = host;
    ReverseInPlace(rhost);
    for (auto it = MakePrefixIterator(Trie, rhost.data(), rhost.size()); it; ++it) {
        const TString domain2nd = host.substr(host.size() - it.GetPrefixLen());

        if (NWebmaster::NUtils::IsSubdomain(host, domain2nd)) {
            return true;
        }
    }
    return false;
}

void TClfDataset::GetFeatures(size_t featuresOffset, TString hostname, TVector<float> &featuresList) const {
    hostname = NWebmaster::NUtils::FixDomainPrefix(NWebmaster::NUtils::RemoveScheme(hostname));

    const size_t MEANINGFUL_DOMAIN_COUNT = 5;
    const size_t hostnameLenFeatureIndex = featuresOffset;
    const size_t domainsCountFeatureIndex = hostnameLenFeatureIndex + 1;
    const size_t domains2ldListFeatureIndex = domainsCountFeatureIndex + 1;
    const size_t domainsLenghtsFeaturesOffset = domains2ldListFeatureIndex + 1;
//    const size_t domainsLenghtsFeaturesOffset = domainsCountFeatureIndex + 1;
    const size_t ngrammsFeaturesOffset = domainsLenghtsFeaturesOffset + MEANINGFUL_DOMAIN_COUNT;
    const size_t tldFeaturesOffset = ngrammsFeaturesOffset + NGrammsMap.size();
    const size_t featuresCount = tldFeaturesOffset + TldMap.size();
    featuresList.clear();
    featuresList.resize(featuresCount);

    if (hostname.empty()) {
        return;
    }

    TVector<TStringBuf> domains;//, rest;
    THashMap<TString, double> ngrammsTf;
    double ngrammsCount = 0;
    TString tld;
    EnumNGramms(hostname, domains, tld, [&](const char *ngramm) {
        if (NGrammsMap.contains(ngramm)) {
            ngrammsCount += 1.0;
            ngrammsTf[ngramm] += 1.0;
            //const size_t ngrammIndex = ngrammsFeaturesOffset + NGrammsMap.at(ngramm);
            //featuresList[ngrammIndex] += 1.0;
        }
    });

    for (const auto &obj : ngrammsTf) {
        const TString &ngramm = obj.first;
        if (IdfMap.contains(ngramm)) {
            double tf = obj.second / ngrammsCount;
            double tfIdf = tf * IdfMap.at(ngramm);
            const size_t ngrammIndex = ngrammsFeaturesOffset + NGrammsMap.at(ngramm);
            featuresList[ngrammIndex] = tfIdf;
        }
    }

    if (!TldMap.contains(tld)) {
        tld = "etc";
    }

    const size_t tldFeatureIndex = tldFeaturesOffset + TldMap.at(tld);
    featuresList[tldFeatureIndex] = 1.0;
    featuresList[hostnameLenFeatureIndex] = static_cast<float>(hostname.size());
    featuresList[domainsCountFeatureIndex] = static_cast<float>(domains.size());
    featuresList[domains2ldListFeatureIndex] = static_cast<float>(Is2ldListSubdomain(hostname));

    std::reverse(domains.begin(), domains.end()); //tld will always be first
    for (size_t i = 0; i < domains.size() && i < MEANINGFUL_DOMAIN_COUNT; i++) {
        featuresList[domainsLenghtsFeaturesOffset + i] = domains[i].length();
    }
}

void TClfDataset::GetFeaturesWithTarget(const TString &hostname, int target, TVector<float> &featuresList) const {
    size_t featuresOffset = 1;
    GetFeatures(featuresOffset, hostname, featuresList);
    featuresList[0] = static_cast<float>(target);
}

void TClfDataset::GetFeatures(const TString &hostname, TVector<float> &featuresList) const {
    size_t featuresOffset = 0;
    GetFeatures(featuresOffset, hostname, featuresList);
}
