#include <mail/so/spamstop/tools/text2shingles/lib/text2shingles.h>
#include <mail/so/libs/html_sanitizer_misc/html_sanitizer_misc.h>
#include <mail/so/libs/unperson/lemmercache.h>

#include <mapreduce/yt/interface/operation.h>
#include <mapreduce/yt/interface/init.h>
#include <mapreduce/yt/interface/client.h>

#include <library/cpp/getopt/last_getopt.h>
#include <library/cpp/langs/langs.h>
#include <library/cpp/yson/node/node.h>

#include <util/string/join.h>

// Stream-output specialisation for NYT::TNode so that rows can be written with `<<`
// (TMapper uses this for its Cerr diagnostics). The emitted bytes are whatever
// TNode::Save produces; NOTE(review): the exact serialisation format is not visible
// here, confirm it is readable enough for log output.
Y_DECLARE_OUT_SPEC(inline, NYT::TNode, stream, value) {
    value.Save(&stream);
}

/// Map job that, for every input row, parses the JSON stored in the `FieldName`
/// column, locates the visible part of the parsed answer and appends three columns
/// to the row: joined shingles of the part's body, joined shingles of its subject,
/// and the part's hid.
///
/// Rows that cannot be processed (source column absent or not a string, JSON that
/// fails to parse, no visible part) are reported on stderr and are NOT written to
/// the output table.
class TMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    TString FieldName, TargetBodyFieldName, TargetSubjFieldName, TargetHidFieldName;

public:
    /// Creates a mapper with empty column names; the names are expected to be
    /// restored through Y_SAVELOAD_JOB below.
    TMapper() = default;

    /// @param field              name of the input column holding the JSON text
    /// @param targetBodyField    name of the output column for body shingles
    /// @param targetSubjField    name of the output column for subject shingles
    /// @param targetHidFieldName name of the output column for the part's hid
    explicit TMapper(TString field, TString targetBodyField, TString targetSubjField, TString targetHidFieldName) noexcept
        : FieldName(std::move(field))
        , TargetBodyFieldName(std::move(targetBodyField))
        , TargetSubjFieldName(std::move(targetSubjField))
        , TargetHidFieldName(std::move(targetHidFieldName))
    {
    }

    // Column names are the only job state and are saved/loaded with the job.
    Y_SAVELOAD_JOB(FieldName, TargetBodyFieldName, TargetSubjFieldName, TargetHidFieldName);

    /// Processes all rows of `input`, writing every successfully enriched row to `output`.
    void Do(NYT::TTableReader<NYT::TNode>* input, NYT::TTableWriter<NYT::TNode>* output) override {
        for (; input->IsValid(); input->Next()) {
            auto record = input->MoveRow();
            const NYT::TNode::TMapType& map = record.AsMap();

            // Single lookup: the pointer that is checked is the pointer that is read.
            const NYT::TNode* field = MapFindPtr(map, FieldName);
            if (!field) {
                Cerr << record << " doesn't contain " << FieldName << Endl;
                continue;
            }

            // As<TString>() throws on any other node type (e.g. a null value), which
            // would fail the whole job; treat such rows like the other bad-row cases.
            if (!field->IsString()) {
                Cerr << record << " has non-string " << FieldName << Endl;
                continue;
            }

            NJson::TJsonValue value;

            if (const auto& text = field->As<TString>(); !NJson::ReadJsonFastTree(text, &value, false)) {
                Cerr << " cannot parse json from " << text << Endl;
                continue;
            }

            const auto answer = NHtmlSanMisc::TAnswer::Parse(std::move(value));

            auto part = answer.FindVisiblePart();
            if (!part) {
                Cerr << record << " cannot find visible part" << Endl;
                continue;
            }

            // Body shingles, comma-joined into a single string column.
            {
                const auto shingles = NText2Shingles::Text2Shingles(part->PureBody.Clipped, LANG_UNK, true, std::numeric_limits<size_t>::max());
                record(TargetBodyFieldName, JoinSeq(",", shingles));
            }
            // Subject shingles, same parameters and encoding as for the body.
            {
                const auto shingles = NText2Shingles::Text2Shingles(part->Subject.Clipped, LANG_UNK, true, std::numeric_limits<size_t>::max());
                record(TargetSubjFieldName, JoinSeq(",", shingles));
            }
            record(TargetHidFieldName, part->hid);
            output->AddRow(record);
        }
    }
};

// Registers TMapper with the YT client library.
// NOTE(review): presumably required so that jobs started on the cluster can
// instantiate the class — confirm against the REGISTER_MAPPER documentation.
REGISTER_MAPPER(TMapper)

struct TOptions {
    NYT::TYPath SrcTable, DstTable;
    TString Cluster;
    TString Field;
    TString TargetBodyField, TargetSubjField, TargetHidField;

    static TOptions Read(int argc, const char** argv) {
        TOptions options;
        {
            NLastGetopt::TOpts opts;
            opts.AddLongOption("src", "src table").StoreResult(&options.SrcTable).Required();
            opts.AddLongOption("dst", "dst table").StoreResult(&options.DstTable).Required();
            opts.AddLongOption("field", "field with tikaite res").StoreResult(&options.Field).Required();
            opts.AddLongOption("clst", "cluster").StoreResult(&options.Cluster).DefaultValue("hahn");
            opts.AddLongOption("target_body_field", "target field with hashes").StoreResult(&options.TargetBodyField).DefaultValue("nnbody_full");
            opts.AddLongOption("target_subj_field", "target field with hashes").StoreResult(&options.TargetSubjField).DefaultValue("nnsubj_full");
            opts.AddLongOption("target_hid_field", "target field with hid").StoreResult(&options.TargetHidField).DefaultValue("hid");
            NLastGetopt::TOptsParseResult(&opts, argc, argv);
        }

        return options;
    }
};

int main(int argc, const char** argv) {
    NYT::Initialize(argc, argv);

    const auto options = TOptions::Read(argc, argv);

    auto client = NYT::CreateClient(options.Cluster);

    if (!client->Exists(options.DstTable))
        client->Create(options.DstTable, NYT::NT_TABLE);

    Y_VERIFY(client->Exists(options.DstTable));

    NYT::TMapOperationSpec spec;
    spec
        .AddInput<NYT::TNode>(options.SrcTable)
        .AddOutput<NYT::TNode>(options.DstTable);
    client->Map(spec, new TMapper(options.Field, options.TargetBodyField, options.TargetSubjField, options.TargetHidField));

    return 0;
}
