#include <cmath>

#include <util/generic/vector.h>
#include <util/generic/map.h>
#include <util/generic/set.h>
#include <util/stream/file.h>
#include <util/stream/str.h>
#include <util/stream/output.h>

#include <wmconsole/version3/wmcutil/string.h>
#include <wmconsole/version3/junk/spam_hosts_ml/dataset/dataset.h>

int main(int argc, const char **argv) {
    Y_UNUSED(argc);
    Y_UNUSED(argv);

    TClfDataset clfDataset;

    TMap<TString, size_t> ngrammsByDocuments;
    size_t documentCount = 0;

    TUnbufferedFileInput input("data.shuf");
    for(TString line; input.ReadLine(line);) {
        TString hostname;
        int target = 0;
        TStringInput stream(line);
        stream >> target >> hostname;

        if (hostname.empty()) {
            continue;
        }

        THashSet<TString> ngramms;
        TVector<TStringBuf> domains;
        TString tld;
        TClfDataset::EnumNGramms(hostname, domains, tld, [&](const char *ngramm) {
            ngramms.insert(ngramm);
        });

        for (const TString &ngramm : ngramms) {
            ngrammsByDocuments[ngramm] += 1;
        }

        documentCount++;
    }

    for (const auto &obj : ngrammsByDocuments) {
        double idf = std::log(static_cast<double>(documentCount) / static_cast<double>(obj.second)) + 1.0;
        Cout << obj.first << " " << idf << Endl;
    }
}
