#include <saas/tools/patents_parser/proto/patentparserconfig.pb.h>

#include <google/protobuf/text_format.h>

#include <library/cpp/getopt/last_getopt.h>
#include <library/cpp/json/json_reader.h>
#include <library/cpp/json/writer/json.h>
#include <library/cpp/xml/document/xml-document.h>

#include <mapreduce/yt/interface/client.h>
#include <mapreduce/yt/interface/io.h>

#include <util/generic/hash_set.h>
#include <util/stream/output.h>
#include <util/system/user.h>
#include <util/stream/file.h>
#include <util/string/split.h>
#include <util/string/strip.h>

using namespace NYT;
using namespace NPatentParser;

// Reducer that converts all rows of one reduce group (one input "key") into a
// JSON message describing a single patent document.
//
// Columns read from every input row: "key", "subkey" and, for the row whose
// subkey is "document.xml", "value".
// Output table 0: {"key": <document url>, "JsonMessage": <serialized JSON>}.
// Output table 1: {"key": <input key>, "url": <document url>}.
class TPatentsReducer
    : public IReducer<TTableReader<TNode>, TTableWriter<TNode>>
{
public:
    Y_SAVELOAD_JOB(ConfigString_);
    TPatentsReducer() = default;
    // `config` is the text-format TPatentParserConfig; it is parsed in Start().
    TPatentsReducer(TString config)
        : ConfigString_(std::move(config))
    { }

    // Parses ConfigString_ into Config_; aborts if the text is not a valid
    // text-format protobuf message.
    void Start(TWriter*) override {
        if (!google::protobuf::TextFormat::ParseFromString(ConfigString_, &Config_)) {
            Y_FAIL("Invalid configuration file");
        }
    }
private:

    // What Do() collects from the rows of one reduce group.
    struct TPatentDocument{
        // Contents of the "value" column of the "document.xml" row.
        TString XmlDocument;
        // True once a "DOCUMENT.PDF" row has been seen. It must start out
        // false: Do() only ever sets it to true, so without an initializer a
        // group with no PDF row would read an indeterminate value.
        bool HasPdf = false;
    };

public:
    // Consumes every row of the current group, builds the JSON message and
    // emits one row to each of the two output tables.
    void Do(TReader* reader, TWriter* writer) override {
        TPatentDocument doc;
        TString key;
        for (; reader->IsValid(); reader->Next()) {
            const auto& row = reader->GetRow();
            if (key.empty()) {
                key = row["key"].AsString();
            }
            const TString& subkey = row["subkey"].AsString();
            if (subkey == "DOCUMENT.PDF"){
                doc.HasPdf = true;
            } else if (subkey == "document.xml"){
                // If several such rows are present, the last one wins.
                doc.XmlDocument = row["value"].AsString();
            }
        }
        if (key.empty()) {
            Y_FAIL("Empty key column is not allowed");
        }
        // NOTE(review): a group without a "document.xml" row reaches the XML
        // parser with an empty string — confirm that is the intended failure mode.
        NJson::TJsonValue parsedDoc = TransformPatentDocument(doc);
        TString url = parsedDoc["docs"][0]["url"].GetString();
        NJsonWriter::TBuf jsonWriter;
        jsonWriter.WriteJsonValue(&parsedDoc);
        TNode outPatentRow;
        outPatentRow["key"] = url;
        outPatentRow["JsonMessage"] = jsonWriter.Str();
        writer->AddRow(outPatentRow, 0);
        TNode outKeyRow;
        outKeyRow["key"] = key;
        outKeyRow["url"] = url;
        writer->AddRow(outKeyRow, 1);
    }

    // Chooses the type tag for a field that is not a zone field:
    // "#pi" for names starting with "i_", "#pl" for "s_", "#p" otherwise.
    TStringBuf GetDefaultType(TStringBuf name){
        if (name.StartsWith("i_")){
            return "#pi";
        }
        if (name.StartsWith("s_")){
            return "#pl";
        }
        return "#p";
    }

    // Builds a one-element JSON array: [{"type": type, "value": value}].
    NJson::TJsonValue ElementJsonArray(TStringBuf type, TString value) {
        NJson::TJsonValue element;
        element.InsertValue("type", NJson::TJsonValue(type));
        element.InsertValue("value", NJson::TJsonValue(std::move(value)));
        NJson::TJsonValue array;
        array.AppendValue(std::move(element));
        return array;
    }

    // Returns the markup used for `node` in a zone field:
    //  - <p>/<P> nodes are serialized as they are;
    //  - <claim-text> nodes have their outermost opening and closing tags
    //    dropped and the remaining content wrapped into <p>...</p>;
    //  - any other node contributes only its stripped text value.
    TString GetNodeHtml(NXml::TConstNode node){
        const auto name = node.Name();
        if (name == "p" || name == "P") {
            return node.ToString();
        } else if (name == "claim-text"){
            auto text = node.ToString();
            // Content lies between the first '>' and the last '<'.
            size_t start = text.find('>') + 1;
            size_t end = text.rfind('<');
            return "<p>" + text.substr(start, end - start) + "</p>";
        }
        return Strip(node.Value<TString>(""));
    }

    // Parses src.XmlDocument and assembles the JSON message:
    //   {"docs": [<document>], "action": "modify", "prefix": 1}
    // where <document> holds one entry per configured field that matched at
    // least one node, plus "url", "img", optionally "pdf", and "options".
    NJson::TJsonValue TransformPatentDocument(const TPatentDocument& src){
        NJson::TJsonValue res;
        NXml::TDocument doc(src.XmlDocument, NXml::TDocument::String);
        auto root = doc.Root();
        for (const auto& field : Config_.GetField()){
            const TString& name = field.GetName();
            // The configured XPath is a space-separated list of expressions.
            const TString& xpathInput = field.GetXPath();
            TVector<TString> xpathVector;
            Split(xpathInput, " ", xpathVector);
            TString plainText;
            TString taggedText;
            // Only fields whose name starts with ZonePreffix carry markup.
            const bool needHtml = name.StartsWith(ZonePreffix);
            for (const auto& xpath : xpathVector) {
                auto nodes = root.XPath(xpath, true);
                for (const auto& node : nodes){
                    plainText.append(Strip(node.Value<TString>("")) + " ");
                    if (needHtml){
                        taggedText.append(GetNodeHtml(node) + " ");
                    }
                }
            }
            // Nothing was appended, i.e. no expression matched any node.
            if (plainText.empty()) {
                continue;
            }
            StripInPlace(plainText);
            StripInPlace(taggedText);
            NJson::TJsonValue array;
            if (needHtml){
                if (plainText != taggedText) {
                    // Markup differs from the plain text: emit both variants.
                    NJson::TJsonValue plainElement;
                    plainElement.InsertValue("type", NJson::TJsonValue("#z"));
                    plainElement.InsertValue("value", NJson::TJsonValue(plainText));
                    array.AppendValue(plainElement);
                    NJson::TJsonValue htmlElement;
                    htmlElement.InsertValue("type", NJson::TJsonValue("#p"));
                    htmlElement.InsertValue("value", NJson::TJsonValue(taggedText));
                    array.AppendValue(htmlElement);
                } else {
                    array = ElementJsonArray("#pz", plainText);
                }
            } else {
                array = ElementJsonArray(GetDefaultType(name), plainText);
            }
            res.InsertValue(name, array);
        }

        // The document url is the concatenation of the B190, B110 and B130
        // node values found under the Russian-language SDOBI/B100 element.
        const TString urlPartsPrefix = "/ru-patent-document/SDOBI[@lang='ru']/B100/";
        TString url;
        url += root.Node(urlPartsPrefix + "B190").Value<TString>();
        url += root.Node(urlPartsPrefix + "B110").Value<TString>();
        url += root.Node(urlPartsPrefix + "B130").Value<TString>();
        res.InsertValue("url", NJson::TJsonValue(url));

        // Space-separated list of image urls, each file name listed once in
        // order of first appearance.
        auto files = root.XPath("//img/@file", true);
        TString img;
        THashSet<TString> addedImages;
        for (const auto& file : files){
            const TString fileName = file.Value<TString>();
            if (addedImages.insert(fileName).second) {
                img.append(url + "/" + fileName + " ");
            }
        }
        StripInPlace(img);
        res.InsertValue("img", ElementJsonArray("#p", img));
        if (src.HasPdf) {
            res.InsertValue("pdf", ElementJsonArray("#p", url + "/pdf"));
        }

        NJson::TJsonValue opts;
        opts.InsertValue("mime_type", NJson::TJsonValue("text/html"));
        opts.InsertValue("charset", NJson::TJsonValue("utf8"));
        opts.InsertValue("language", NJson::TJsonValue("ru"));
        opts.InsertValue("language2", NJson::TJsonValue("en"));
        opts.InsertValue("language_default", NJson::TJsonValue("ru"));
        opts.InsertValue("language_default2", NJson::TJsonValue("en"));
        res.InsertValue("options", opts);

        NJson::TJsonValue docs;
        docs.AppendValue(res);
        NJson::TJsonValue result;
        result.InsertValue("docs", docs);
        result.InsertValue("action", NJson::TJsonValue("modify"));
        result.InsertValue("prefix", NJson::TJsonValue(1));
        return result;
    }

    // Text-format configuration; the only member listed in Y_SAVELOAD_JOB.
    TString ConfigString_;
    // Parsed form of ConfigString_, filled in by Start().
    TPatentParserConfig Config_;
    // Field-name prefix that marks a field as a zone (markup is kept for it).
    const TString ZonePreffix = "z_";
};

// Mapper that adds a "url" column to every input row, using a key -> url
// table read from the local file "patent_urls".
class TPatentUrlsMapper
    : public IMapper<TTableReader<TNode>, TTableWriter<TNode>>
{
public:
    // Loads the key -> url mapping from the "patent_urls" file into UrlsMap_.
    void Start(TWriter*) override {
        TIFStream urlsFile("patent_urls");
        auto urlsReader = CreateTableReader<TNode>(&urlsFile);

        while (urlsReader->IsValid()) {
            const auto& entry = urlsReader->GetRow();
            UrlsMap_[entry["key"].AsString()] = entry["url"].AsString();
            urlsReader->Next();
        }
    }

    // Emits each input row as {key, subkey, value, url}. Nothing is emitted
    // when the mapping is empty.
    void Do(TReader* reader, TWriter* writer) override {
        if (UrlsMap_.empty()) {
            return;
        }
        while (reader->IsValid()) {
            const auto& inRow = reader->GetRow();

            // Start from the row's own url, if any; a mapping entry for the
            // row's key takes precedence over it.
            TString url;
            if (inRow.HasKey("url")){
                url = inRow["url"].AsString();
            }
            const TString& key = inRow["key"].AsString();
            const auto mapped = UrlsMap_.find(key);
            if (mapped != UrlsMap_.end()){
                url = mapped->second;
            }

            // "document.xml" keeps the bare url, the PDF gets "/pdf", every
            // other file gets "/<subkey>".
            const TString& subkey = inRow["subkey"].AsString();
            if (subkey == "DOCUMENT.PDF"){
                url += "/pdf";
            } else if (subkey != "document.xml"){
                url += "/";
                url += subkey;
            }

            TNode result;
            result["key"] = inRow["key"];
            result["subkey"] = inRow["subkey"];
            result["value"] = inRow["value"];
            result["url"] = url;
            writer->AddRow(result);

            reader->Next();
        }
    }
private:
    // Input key -> document url, filled in Start().
    THashMap<TString, TString> UrlsMap_;
};

// Register both job classes with the YT client library.
// NOTE(review): presumably required so the jobs can be instantiated by name
// when the operation runs — confirm against the YT C++ API documentation.
REGISTER_REDUCER(TPatentsReducer);
REGISTER_MAPPER(TPatentUrlsMapper);

int main(int argc, const char** argv) {
    NYT::Initialize(argc, argv);
    NLastGetopt::TOpts options;
    TString configFilePath = "patent-parser.conf";
    TString tmpFolderPath = "//tmp/" + GetUsername();
    TString inputTable = "//home/jupiter-dev/grmammaev/patentsearch/patents";
    TString patentsTable = "//home/saas/test_patents/result_patents";
    TString filesWithUrlsTable;
    TString proxy = "banach";
    options
        .AddCharOption('c', "-- parser config file path")
        .AddLongName("config")
        .DefaultValue(configFilePath)
        .StoreResult(&configFilePath);
    options
        .AddCharOption('o', "-- urled files table - src patents table by default")
        .AddLongName("output")
        .DefaultValue(filesWithUrlsTable)
        .StoreResult(&filesWithUrlsTable);
    options
        .AddCharOption('s', "-- src patents table")
        .AddLongName("src")
        .DefaultValue(inputTable)
        .StoreResult(&inputTable);
    options
        .AddCharOption('r', "-- result table with json messages for indexing")
        .AddLongName("res")
        .DefaultValue(patentsTable)
        .StoreResult(&patentsTable);
    options
        .AddCharOption('p', "-- cluster name - banach for default")
        .AddLongName("proxy")
        .DefaultValue(proxy)
        .StoreResult(&proxy);
    options
        .AddCharOption('t', "-- tmp folder path")
        .DefaultValue(tmpFolderPath)
        .StoreResult(&tmpFolderPath);
    NLastGetopt::TOptsParseResult opts(&options, argc, argv);
    TIFStream fileStream(configFilePath);
    auto client = CreateClient(proxy);
    const TString sortedTmpTable = tmpFolderPath + "/sorted-patents";
    const TString urlsTable = tmpFolderPath + "/patent_urls";
    client->Sort(
        TSortOperationSpec()
            .AddInput(inputTable)
            .Output(sortedTmpTable)
            .SortBy({"key"}));
    client->Reduce(
        TReduceOperationSpec()
            .ReduceBy({"key"})
            .AddInput<TNode>(sortedTmpTable)
            .AddOutput<TNode>(patentsTable)
            .AddOutput<TNode>(urlsTable),
        new TPatentsReducer(fileStream.ReadAll()));
    if (!filesWithUrlsTable) {
        filesWithUrlsTable = inputTable;
    }
    client->Map(
        TMapOperationSpec()
            .AddInput<TNode>(inputTable)
            .MapperSpec(TUserJobSpec()
                .AddFile(TRichYPath(urlsTable).Format("yson")))
            .AddOutput<TNode>(filesWithUrlsTable),
        new TPatentUrlsMapper);
    return 0;
}
