#include <mail/so/libs/unperson/unperson.h>

#include <library/cpp/containers/comptrie/comptrie_builder.h>
#include <library/cpp/getopt/small/last_getopt.h>

#include <util/generic/hash.h>
#include <util/generic/hash_set.h>

// Word (as a UTF-32 string) -> number of times it has been accepted so far.
using TWordsMap = THashMap<TUtf32String, ui64>;
// Trie builder keyed by UTF-32 code points; every stored key carries a bool value.
using TTrieBuilder = TCompactTrieBuilder<wchar32, bool>;

struct TOptions {
    TOptions(int argc, const char* argv[]) {
        TString alphabet;

        NLastGetopt::TOpts opts;
        opts.SetFreeArgsNum(0);

        opts.AddLongOption('a', "alphabet", "Language alphabet").RequiredArgument()
            .StoreResult(&alphabet).Required();

        opts.AddLongOption('n', "min-length", "Minimal word length").RequiredArgument()
            .StoreResult(&MinLength).Required();

        opts.AddLongOption('f', "min-frequency", "Minimal word frequency").RequiredArgument()
            .StoreResult(&MinFrequency).DefaultValue(1);

        opts.AddLongOption('l', "lowercase", "Lowercase words before adding to trie").NoArgument()
            .SetFlag(&Lowercase).DefaultValue(false);

        opts.AddLongOption('c', "compact", "Compact trie").NoArgument()
            .SetFlag(&Compact).DefaultValue(false);

        opts.AddLongOption('v', "verbose", "Output added words to stderr").NoArgument()
            .SetFlag(&Verbose).DefaultValue(false);

        NLastGetopt::TOptsParseResult res(&opts, argc, argv);

        for (auto c: TUtf32String::FromUtf8(alphabet)) {
            Alphabet.emplace(c);
        }
    }

    THashSet<wchar32> Alphabet;
    size_t MinLength;
    size_t MinFrequency;
    bool Verbose;
    bool Lowercase;
    bool Compact;
};

// Counts one occurrence of `str` in `words` if it qualifies as a word.
//
// A word qualifies when it has at least opts.MinLength code points and every
// code point belongs to opts.Alphabet. Returns true only when the word was
// seen for the first time; repeated and rejected words return false.
// With opts.Verbose set, each first-time word is reported on stderr.
static bool AddWord(TWordsMap& words, const TUtf32String& str, const TOptions& opts) {
    if (str.length() < opts.MinLength) {
        return false;
    }

    for (const wchar32 symbol : str) {
        if (!opts.Alphabet.contains(symbol)) {
            return false;
        }
    }

    // operator[] creates a zero counter for a word that was not present yet.
    ui64& occurrences = words[str];
    const bool firstSeen = (occurrences == 0);
    ++occurrences;

    if (firstSeen && opts.Verbose) {
        Cerr << "Added word <" << str << '>' << Endl;
    }
    return firstSeen;
}

int main(int argc, const char* argv[]) {
    TOptions opts(argc, argv);

    TString str;
    TUtf32String buf;
    TWordsMap words;
    size_t counter = 0;
    size_t sinceLast = 0;
    while (Cin.ReadLine(str)) {
        if (opts.Lowercase) {
            str.to_lower();
        }
        bool inWord = false;
        for (auto c: NUnperson::EraseMarks(TUtf32String::FromUtf8(str), true)) {
            if (IsAlnum(c)) {
                inWord = true;
                buf.append(c);
            } else if (inWord) {
                inWord = false;
                sinceLast += AddWord(words, buf, opts);
                buf.clear();
            }
        }
        sinceLast += AddWord(words, buf, opts);
        if (sinceLast > 10000) {
            counter += sinceLast;
            sinceLast = 0;
            Cerr << "Words so far: " << counter << Endl;
        }
        buf.clear();
    }
    Cerr << "Total words count: " << (counter + sinceLast) << Endl;
    TTrieBuilder builder;
    ui64 trieWords = 0;
    for (const auto& x: words) {
        if (x.second >= opts.MinFrequency) {
            ++trieWords;
            builder.Add(x.first, true);
        }
    }
    Cerr << "Trie words count: " << trieWords << Endl;
    if (opts.Compact) {
        CompactTrieMinimize(Cout, builder, true);
    } else {
        builder.Save(Cout);
    }
}

