#include "yt_indexing_reducer.h"

#include <saas/rtyserver/config/const.h>
#include <saas/library/yt/file/yt_file_dumper.h>
#include <saas/library/yt/stream/yt_chunked_output.h>
#include <saas/protos/shards.pb.h>

#include <library/cpp/json/json_writer.h>
#include <library/cpp/logger/global/global.h>

#include <util/folder/filelist.h>
#include <util/folder/path.h>
#include <util/system/fs.h>
#include <util/generic/guid.h>
#include <util/generic/ptr.h>
#include <util/stream/file.h>
#include <util/string/vector.h>

// Registers TSaasYTIndexingReducer through the REGISTER_REDUCER macro.
// NOTE(review): presumably this is what lets the YT job framework instantiate the
// reducer on the job side - confirm against the YT C++ API documentation.
REGISTER_REDUCER(TSaasYTIndexingReducer);



// Builds one index segment out of all rows of a single (shard, segment) group and writes
// the produced index files, together with bookkeeping rows, to the output tables.
//
// Every row is required (Y_VERIFY) to carry the same ShardId/SegmentId as the first row.
// Rows are emitted to:
//   * INCORRECT_DOCS_INDEX  - documents that failed to parse or to index (only if FilterIncorrectDocs);
//   * SAMPLE_OUT_INDEX      - at most SampleUrlsPerShard url samples (only if SampleUrls);
//   * BLOB_OUT_INDEX        - the dumped index directory (through TYTChunkedOutput / TYTFileDumper);
//   * SHARD_RESOURCES_INDEX - a single TShardResource row describing the segment.
//
// @param input   table reader; it must already point at a valid row, because GetRow()
//                is called before the first IsValid() check.
// @param output  table writer that receives all of the rows listed above.
//
// With FailOnIndexError set, a parse failure aborts the process via Y_VERIFY and indexing
// goes through IndexUnsafe(); otherwise failures are skipped and optionally reported.
void TSaasYTIndexingReducer::Do(TReader* input, TWriter* output) {
    const TShardId shardId = input->GetRow().GetShardId();
    const ui32 segmentId = input->GetRow().GetSegmentId();
    // segmentId is unsigned, hence %u: with %d ids above INT_MAX would be printed as negative numbers.
    const TString segmentName = Sprintf("index_%010d_%010u", 0, segmentId);

    THolder<TStandaloneIndexer> indexer = MakeHolder<TStandaloneIndexer>(ConfigBundle.Text, ConfigBundle.Variables, RunModules);
    indexer->EnableIndexing();
    INFO_LOG << "Started indexing" << Endl;

    ui32 docsIndexed = 0;
    ui32 urlsInSample = 0;
    NRTYServer::TDocParseContext c;
    ui64 maxTimestamp = 0;
    ui64 maxExtraTimestamp = 0;
    ui64 maxAgeTs = 0;

    // Writes a row describing a rejected document into the incorrect-docs table.
    const auto writeIndexError = [&](const auto& document, const auto& errorMessage) {
        NSaas::TYTIndexError errorRecord;
        errorRecord.SetUrl(document.GetUrl());
        *errorRecord.MutableDocument() = document;
        errorRecord.SetError(errorMessage);
        output->AddRow<NSaas::TYTIndexError>(errorRecord, INCORRECT_DOCS_INDEX);
    };

    for (; input->IsValid(); input->Next()) {
        const auto& inRow = input->GetRow();
        Y_VERIFY(inRow.GetShardId() == shardId);
        Y_VERIFY(inRow.GetSegmentId() == segmentId);

        const auto& document = inRow.GetDocument();

        TParseResult parsedDocument(indexer->GetParsedDocument(document, c));
        // A parse failure is fatal only in the fail-on-error mode.
        Y_VERIFY(parsedDocument.Document != nullptr || !FailOnIndexError,
                "can't parse document %s, fail reason: %s",
                document.GetUrl().c_str(),
                parsedDocument.ErrorMessage.c_str());

        if (parsedDocument.Document == nullptr) {
            NOTICE_LOG << "Skip document " << document.GetUrl() << " because can't parse it" << Endl;
            if (FilterIncorrectDocs) {
                writeIndexError(document, parsedDocument.ErrorMessage);
            }
            continue;
        }

        // True only when indexing failed AND failed documents are being filtered out.
        bool rejected = false;
        if (FailOnIndexError) {
            indexer->IndexUnsafe(*parsedDocument.Document);
        } else {
            const auto indexResult = indexer->Index(*parsedDocument.Document);
            rejected = !indexResult.IndexSuccess && FilterIncorrectDocs;
        }

        if (rejected) {
            NOTICE_LOG << "Skip document " << document.GetUrl() << " because can't index it" << Endl;
            // NOTE(review): the reported error is taken from the parse result, although parsing
            // succeeded on this path - the real indexing failure reason is probably lost here.
            writeIndexError(document, parsedDocument.ErrorMessage);
            // NOTE(review): unlike the parse-failure path there is no `continue`, so a rejected
            // document still reaches RemoveDoc / timestamps / sampling / the counter below.
        } else {
            // Also reached when indexing failed but FilterIncorrectDocs is off.
            maxAgeTs = Max<ui64>(maxAgeTs, document.HasModificationTimestamp() ? document.GetModificationTimestamp() : 0);
        }

        if (inRow.GetIsDeleted()) {
            indexer->RemoveDoc(*parsedDocument.Document);
        }

        const auto& timestamps = document.GetTimestamps();
        if (!timestamps.empty()) {
            // Only the first timestamp entry is considered; bound by reference to avoid copying the message.
            const auto& firstTimestamp = timestamps[0];
            // NOTE(review): ValueEx feeds maxTimestamp and Value feeds maxExtraTimestamp - this looks
            // swapped relative to the variable names; kept as is, confirm the intent.
            maxTimestamp = Max<ui64>(firstTimestamp.GetValueEx(), maxTimestamp);
            maxExtraTimestamp = Max<ui64>(firstTimestamp.GetValue(), maxExtraTimestamp);
        }

        if (SampleUrls && urlsInSample < SampleUrlsPerShard) {
            NSaas::TUrlSample urlSampleRow;
            urlSampleRow.SetUrl(document.GetUrl());
            urlSampleRow.SetShardId(shardId.ToString());
            urlSampleRow.SetSegmentId(segmentId);
            urlSampleRow.SetShardMin(inRow.GetShardMin());
            urlSampleRow.SetShardMax(inRow.GetShardMax());
            urlSampleRow.SetPrefix(document.GetKeyPrefix());
            output->AddRow<NSaas::TUrlSample>(urlSampleRow, SAMPLE_OUT_INDEX);
            ++urlsInSample;
        }

        // Progress report every 100000 processed rows.
        if (++docsIndexed % 100000 == 0) {
            INFO_LOG << "Indexed " << docsIndexed << " documents" << Endl;
        }
    }
    INFO_LOG << "Finished indexing: " << docsIndexed << " documents" << Endl;

    // The directory path is remembered before the indexer is destroyed, since it is needed afterwards.
    // NOTE(review): presumably destroying the indexer finalizes the index files on disk - confirm.
    const TFsPath indexDir = indexer->GetIndexerDir();
    indexer.Destroy();

    INFO_LOG << "Started dumping index to YT table writer" << Endl;
    INFO_LOG << "Dumping format: " << Format << Endl;
    TYTChunkedOutput::TConstructionContext outputContext({
        /* TableWriter =*/ output,
        /* ShardId =*/ shardId,
        /* TableIndex =*/ BLOB_OUT_INDEX,
        /* SegmentId =*/ segmentId,
        /* ChunkSize =*/ DumperChunkSize,
        /* Format =*/ Format
    });

    TYTFileDumper dumper(segmentName, outputContext, /* sortFiles =*/ true);
    dumper.DumpDirectory(indexDir);
    INFO_LOG << "Finished dumping index to YT table writer" << Endl;

    // Describe the produced segment in the shard resources table.
    NRTYServer::TShardResource currentResource;

    TStringBuilder name;
    name << "shard_id: " << shardId.GetMin() << ":" << shardId.GetMax() << "; ";
    name << "timestamp: " << maxTimestamp << "; extra-timestamp: " << maxExtraTimestamp;
    currentResource.SetName(name);
    currentResource.SetTimestamp(maxTimestamp);
    currentResource.SetTimestampEx(maxExtraTimestamp);
    currentResource.SetShardMin(shardId.GetMin());
    currentResource.SetShardMax(shardId.GetMax());
    currentResource.SetSegmentId(segmentId);

    // The stat field is a JSON map holding the largest modification timestamp seen, as a string.
    NJson::TJsonValue js(NJson::JSON_MAP);
    js["ts_max"] = ToString(maxAgeTs);
    TStringStream ss;
    NJson::WriteJson(&ss, &js);
    currentResource.SetStat(ss.Str());

    output->AddRow<NRTYServer::TShardResource>(currentResource, SHARD_RESOURCES_INDEX);

    // The local index directory is no longer needed once it has been dumped.
    indexDir.ForceDelete();
}
