#include <util/string/cast.h>
#include <util/stream/output.h>
#include <util/system/user.h>
#include <util/generic/size_literals.h>
#include <mapreduce/yt/library/operation_tracker/operation_tracker.h>
#include <mapreduce/yt/interface/client.h>
#include <mapreduce/yt/common/config.h>
#include <datacloud/launcher/lib/grep/fast_grep.h>
#include <datacloud/launcher/lib/grep/data/data.pb.h>


using namespace NYT;


namespace Datacloud {
namespace Grep {

// Schema applied to every output table written by this file:
// three string columns (yuid, title, url) and an int64 timestamp.
const auto outputLogTableSchema = TTableSchema()
    .AddColumn(TColumnSchema().Name("yuid").Type(VT_STRING))
    .AddColumn(TColumnSchema().Name("title").Type(VT_STRING))
    .AddColumn(TColumnSchema().Name("url").Type(VT_STRING))
    .AddColumn(TColumnSchema().Name("timestamp").Type(VT_INT64));

// Compression codec set on every output table path built in this file.
const TString kCompressCodec = "brotli_6";

/// Reducer that filters TSKV log rows by the presence of a row from input
/// table 0 and emits (yuid, title, url, timestamp) records.
///
/// Input table 0 is used purely as a marker: once a row from it is seen, the
/// following rows (read as TTSKVRecord) are processed. In this file table 0 is
/// always the crypta table, because it is the first input added to the spec.
///
/// Only rows whose value starts with "type=TRAFFIC" and that carry a non-empty
/// `url=` field followed (later in the same value) by a non-empty `title=`
/// field are emitted. Consecutive rows with identical uid/title/url are
/// collapsed into the first one.
class TFastGrepReducer : public IReducer<
    TTableReader<::google::protobuf::Message>,
    TTableWriter<::google::protobuf::Message>>
{
public:
    Y_SAVELOAD_JOB(IsMultipleOutTables_);

    TFastGrepReducer() = default;

    /// @param IsMultipleOutTables  when true, a row read from input table N
    ///        (N >= 1) is written to output table N - 1; when false, every
    ///        row is written with the writer's default table index.
    explicit TFastGrepReducer(bool IsMultipleOutTables)
        : IsMultipleOutTables_(IsMultipleOutTables)
    { }

    /// Processes the rows available through `reader` and writes the matching
    /// records to `writer`.
    ///
    /// NOTE(review): the early `break` assumes that rows from table 0 arrive
    /// before rows from the log tables and that `Do` is invoked separately
    /// for each reduce key -- confirm against the YT reduce contract.
    void Do(TReader* reader, TWriter* writer) override
    {
        // Upper bound (in bytes) for both the emitted url and title.
        constexpr size_t maxFieldLength = 2000;

        TString prevTitle, prevUrl, prevUid;
        bool isInteresting = false;
        for (; reader->IsValid(); reader->Next()) {
            const auto tableIndex = reader->GetTableIndex();
            if (tableIndex == 0) {
                // Marker row: it only enables processing of the rows after it.
                isInteresting = true;
                continue;
            }
            if (!isInteresting) {
                // A log row arrived without a preceding marker row.
                break;
            }

            const auto& row = reader->GetRow<TTSKVRecord>();
            const TString& value = row.GetValue();

            TStringBuf valueBuf(value);
            if (!valueBuf.StartsWith("type=TRAFFIC")) {
                continue;
            }

            TString url = TString{TakeField(valueBuf, "url=")};
            // Drop the query string / fragment and cap the length.
            const size_t urlLength = std::min(
                std::min(url.find('?'), url.find('#')), maxFieldLength);
            url = url.substr(0, urlLength);
            if (url.empty()) {
                continue;
            }

            // The title is looked up in the part of the value that follows
            // the url field; a title placed before the url is not found.
            TString title = TString{TakeField(valueBuf, "title=")};
            if (title.empty()) {
                continue;
            }
            title = title.substr(0, maxFieldLength);

            // The uid is the key without its first character. The emptiness
            // guard keeps substr from receiving a position past the end.
            const TString& key = row.GetKey();
            const TString uid = key.empty() ? key : key.substr(1);

            if (title == prevTitle && url == prevUrl && uid == prevUid) {
                continue;
            }

            const TString& subkey = row.GetSubkey();
            const i64 timestamp = FromString<i64>(subkey);

            TLogRecord outputNode;
            outputNode.SetYuid(uid);
            outputNode.SetTitle(title);
            outputNode.SetUrl(url);
            outputNode.SetTimestamp(timestamp);

            if (IsMultipleOutTables_) {
                // Input table 0 is the marker table, so log table N maps to
                // output table N - 1.
                writer->AddRow(outputNode, tableIndex - 1);
            } else {
                writer->AddRow(outputNode);
            }
            prevTitle = title;
            prevUrl = url;
            prevUid = uid;
        }
    }

private:
    /// Advances `record` tab-token by tab-token until it starts with `prefix`
    /// (or is exhausted), consumes the prefix and returns the text up to the
    /// next tab. Returns an empty buffer when the prefix is not found.
    static TStringBuf TakeField(TStringBuf& record, TStringBuf prefix)
    {
        while (record && !record.StartsWith(prefix)) {
            record.NextTok('\t');
        }
        record.NextTok(prefix);
        return record.NextTok('\t');
    }

    // Default-initialized so a default-constructed instance never exposes an
    // indeterminate value to save/load.
    bool IsMultipleOutTables_ = false;
};

REGISTER_REDUCER(TFastGrepReducer);

/// Runs the daily grep for `date` inside a single transaction:
///   1. reduces the crypta table together with the spy and watch logs by
///      "key", writing one output table per log;
///   2. sorts both output tables in place by (yuid, timestamp), with the two
///      sorts started without waiting and then awaited together;
///   3. commits the transaction.
///
/// @param client  YT client used to start the transaction.
/// @param date    date component substituted into the input and output paths.
void grep(NYT::IClientPtr client, const TString& date) {
    // Input tables.
    const TString cryptaTable = "//home/x-products/production/crypta_v2/crypta_db_last/all_interesting_yuid";
    const TString spyLogTable = "//user_sessions/pub/spy_log/daily/" + date + "/clean";
    const TString watchLogTable = "//user_sessions/pub/watch_log_tskv/daily/" + date + "/clean";

    // Output tables.
    const TString root = "//home/x-products/production";
    const TString finalSpyGrep = root + "/datacloud/grep/spy_log" + "/" + date;
    const TString finalWatchGrep = root + "/datacloud/grep/watch_log_tskv" + "/" + date;

    // Both outputs share the same schema and storage attributes.
    const auto makeOutputPath = [](const TString& path) -> TRichYPath {
        return TRichYPath(path)
            .Schema(outputLogTableSchema)
            .OptimizeFor(EOptimizeForAttr::OF_SCAN_ATTR)
            .CompressionCodec(kCompressCodec)
            .ErasureCodec(EErasureCodecAttr::EC_LRC_12_2_2_ATTR);
    };
    const TRichYPath outputSpyLogTable = makeOutputPath(finalSpyGrep);
    const TRichYPath outputWatchLogTable = makeOutputPath(finalWatchGrep);

    // Input order fixes the table indices seen by the reducer:
    // 0 = crypta, 1 = spy log, 2 = watch log. Outputs follow the same order.
    TReduceOperationSpec reduceSpec;
    reduceSpec.DataSizePerJob(4_GB);
    reduceSpec.ReducerSpec(TUserJobSpec().CpuLimit(0.2));
    reduceSpec.ReduceBy({"key"});
    reduceSpec.AddInput<TCryptaRecord>(TRichYPath(cryptaTable));
    reduceSpec.AddInput<TTSKVRecord>(TRichYPath(spyLogTable));
    reduceSpec.AddInput<TTSKVRecord>(TRichYPath(watchLogTable));
    reduceSpec.AddOutput<TLogRecord>(outputSpyLogTable);
    reduceSpec.AddOutput<TLogRecord>(outputWatchLogTable);

    auto transaction = client->StartTransaction(
        NYT::TStartTransactionOptions().Title("Datacloud grep Spy/Watch logs"));

    transaction->Reduce(reduceSpec, new TFastGrepReducer(true));

    // Starts an in-place sort of `table` without waiting for it to finish.
    const auto startSort = [&transaction](const TString& table) {
        return transaction->Sort(
            TSortOperationSpec()
                .AddInput(table)
                .Output(table)
                .SortBy({"yuid", "timestamp"}),
            TOperationOptions().Wait(false));
    };

    TOperationTracker tracker;
    tracker.AddOperation(startSort(finalWatchGrep));
    tracker.AddOperation(startSort(finalSpyGrep));
    tracker.WaitAllCompleted();

    transaction->Commit();
}


/// Creates a YT client for `cluster` authenticated with `yt_token` and runs
/// grep() for `date` with it.
void fast_grep(const TString& yt_token, const TString& cluster, const TString& date) {
    const auto clientOptions = NYT::TCreateClientOptions().Token(yt_token);
    grep(NYT::CreateClient(cluster, clientOptions), date);
}


/// Runs the grep reducer over caller-supplied tables and writes all matching
/// records into a single output table. No sort is performed on the output.
///
/// @param yt_token     token used to authenticate the YT client.
/// @param cluster      YT cluster to connect to.
/// @param cryptaTable  path of the table added as input 0 (the marker table
///                     for the reducer).
/// @param logTables    paths of the log tables, added as inputs 1..N and read
///                     as TTSKVRecord.
/// @param outputTable  path of the single output table.
void custom_fast_grep(
    const TString& yt_token, const TString& cluster,
    const TString& cryptaTable, const TVector<TString>& logTables,
    const TString& outputTable) {
    auto client = NYT::CreateClient(
        cluster, NYT::TCreateClientOptions().Token(yt_token));

    const auto outputLogTable = TRichYPath(outputTable)
        .Schema(outputLogTableSchema)
        .OptimizeFor(EOptimizeForAttr::OF_SCAN_ATTR)
        .CompressionCodec(kCompressCodec)
        .ErasureCodec(EErasureCodecAttr::EC_LRC_12_2_2_ATTR);

    auto reduceSpec = TReduceOperationSpec()
        .DataSizePerJob(4_GB)
        .ReducerSpec(TUserJobSpec().CpuLimit(0.2))
        .ReduceBy({"key"})
        .AddInput<TCryptaRecord>(TRichYPath(cryptaTable));

    for (const auto& table : logTables) {
        reduceSpec.AddInput<TTSKVRecord>(TRichYPath(table));
    }
    reduceSpec.AddOutput<TLogRecord>(outputLogTable);

    auto transaction = client->StartTransaction(NYT::TStartTransactionOptions().Title("Datacloud custom grep"));
    // Single output table, so the reducer must not route rows by input index.
    transaction->Reduce(reduceSpec, new TFastGrepReducer(false));

    // Without an explicit commit the transaction ends uncommitted when the
    // handle goes out of scope and the reduce output is discarded.
    transaction->Commit();
}


}  // namespace Grep
}  // namespace Datacloud
