#include "pre_describe_filter_combiner.h"
#include "queue_item.h"

#include <crypta/lib/native/identifiers/lib/generic.h>
#include <crypta/lib/native/identifiers/lib/id_types/all.h>
#include <crypta/lib/proto/user_data/user_data_stats.pb.h>
#include <crypta/graph/soup/config/cpp/soup_config.h>

#include <library/cpp/bloom_filter/bloomfilter.h>
#include <library/cpp/yson/node/node_io.h>

#include <util/generic/queue.h>
#include <util/random/random.h>

using namespace NCrypta;
using namespace NCrypta::NSiberia;

/// Prepares per-job state before any rows are processed.
///
/// Rebuilds SkipFilter from the skip rate found in the sampling options of
/// the job state. The writer is not used at this stage.
void TPreDescribeFilterCombiner::Start(TWriter* writer) {
    Y_UNUSED(writer);

    const auto& samplingOptions = State->GetUserDataStatsOptions().GetSamplingOptions();
    SkipFilter = TFilterIdentifier(samplingOptions.GetSkipRate());
}

/// Folds all rows available from `reader` into at most one TIdsToDescribe row.
///
/// The group ID is taken from the first row only, so the rows of one call are
/// presumably expected to share a group — NOTE(review): confirm against the
/// operation spec. For every row whose identifier has a known type and passes
/// validation, the normalized value is:
///   * counted in the actual segment size,
///   * added to the Bloom filter, when the group's segment options request one
///     (capacity > 0),
///   * offered to a bounded random sample, unless SkipFilter rejects it.
///
/// Nothing is written when the sample ends up empty.
///
/// @param reader  source of input rows
/// @param writer  receives the single combined row, if any
void TPreDescribeFilterCombiner::Do(TReader* reader, TWriter* writer) {
    const auto& segmentOptions = State->GetUserDataStatsOptions().GetSegments();
    const bool hasFixedIdType = State->HasIdType();
    const auto sampleLimit = State->GetSampleSize();

    TIdsToDescribe result;
    TMaybe<TBloomFilter> bloom;
    TPriorityQueue<TQueueItem> sample;
    TId currentId;
    i32 validIdCount = 0;

    for (; reader->IsValid(); reader->Next()) {
        const auto& row = reader->GetRow();

        // One-time setup on the first row: remember the group and create the
        // Bloom filter if this group's options ask for a positive capacity.
        if (!result.HasGroupID()) {
            result.SetGroupID(row.GetGroupID());

            const auto segment = segmentOptions.find(result.GetGroupID());
            if (segment != segmentOptions.end()) {
                const auto& filterOptions = segment->second.GetFilterOptions();
                if (filterOptions.GetCapacity() > 0) {
                    bloom = TBloomFilter(filterOptions.GetCapacity(), filterOptions.GetErrorRate());
                }
            }
        }

        // A type set on the state overrides the per-row type name.
        const auto idType = hasFixedIdType
            ? State->GetIdType()
            : NIdentifiers::GetIdentifierTypeByName(row.GetIdType());
        if (idType == NCrypta::NIdentifiersProto::NIdType::DEFAULT) {
            continue;
        }
        currentId.SetType(NCrypta::NSoup::IdType(idType).GetName());

        // The row stores the value as YSON; decode it to a plain string.
        const TString rawValue = NYT::NodeFromYsonString(row.GetIdValue()).ConvertTo<TString>();
        NIdentifiers::TGenericID parsedId(idType, rawValue);
        if (!parsedId.IsValid()) {
            continue;
        }
        currentId.SetValue(parsedId.Normalize());

        // Counter and Bloom filter see every valid id, sampled or not.
        ++validIdCount;
        if (bloom.Defined()) {
            bloom->Add(currentId.GetValue());
        }

        if (SkipFilter.Filter(currentId.GetValue())) {
            continue;
        }

        // Bounded random sample: tag the id with a random key and evict the
        // queue's top once the limit is exceeded. Which item is "top" is
        // decided by TQueueItem's ordering, defined elsewhere. The queue only
        // grows here, so this is the only place the limit can be crossed.
        sample.push({.Random=RandomNumber<ui32>(), .Id=currentId});
        if (sample.size() > sampleLimit) {
            sample.pop();
        }
    }

    if (sample.empty()) {
        return;
    }

    if (bloom.Defined()) {
        auto& serializedFilter = *result.MutableFilter();
        serializedFilter.MutableOptions()->CopyFrom(segmentOptions.at(result.GetGroupID()).GetFilterOptions());

        TStringStream buffer;
        bloom->Save(&buffer);
        serializedFilter.SetBloomFilter(buffer.Str());
    }

    result.SetActualSegmentSize(validIdCount);

    // Drain the queue into the output in pop order.
    auto* sampledIds = result.MutableIds()->MutableIds();
    for (; !sample.empty(); sample.pop()) {
        *sampledIds->Add() = sample.top().Id;
    }

    writer->AddRow(result);
}
