#include <util/string/join.h>
#include <util/thread/pool.h>
#include <util/generic/set.h>

#include <library/cpp/string_utils/url/url.h>

#include <mapreduce/yt/interface/client.h>

#include <robot/jupiter/protos/acceptance.pb.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include <wmconsole/version3/processors/indexing/conf/yt.h>
#include <wmconsole/version3/processors/indexing/checkurl/conf/config.h>

#include "fields.h"
#include "monitor.h"
#include "schemes.h"
#include "task_update.h"

namespace NWebmaster {

// Tags a row with the kind of input table it came from.
// TCheckUrlSourceMergeReducer stores the value (as i64) in F_CHECKURL_SOURCE_ID and
// TCheckUrlReducer switches on it, so the numeric values are spelled out explicitly.
enum ECheckUrlSourceId {
    E_CHECKURL_SOURCE_URLS          = 0, // requested urls (host, path, url, request id)
    E_CHECKURL_SOURCE_JUPITER       = 1, // Jupiter acceptance rows
    E_CHECKURL_SOURCE_CONTENT_ATTRS = 2, // title / description rows
    E_CHECKURL_SOURCE_FAST_BAN      = 3, // fast-ban rows
    E_CHECKURL_SOURCE_SPREAD        = 4, // spread rows
    E_CHECKURL_SOURCE_TURBO_PAGE    = 5, // row marks the url as a turbo page
};

// Lookup key for a requested url: the url split into Host and Path.
// Only Host and Path take part in serialization (Y_SAVELOAD_DEFINE) and in ordering;
// AccessibleByHost is not part of the serialized state.
struct TCheckUrlConstraint {
    Y_SAVELOAD_DEFINE(Host, Path)

    TCheckUrlConstraint() = default;

    // Fills Host and Path by passing the url through SplitUrlToHostAndPath.
    TCheckUrlConstraint(const TString &url) {
        SplitUrlToHostAndPath(url, Host, Path);
    }

    // Takes an already split host / path pair as is.
    TCheckUrlConstraint(const TString &host, const TString &path)
        : Host(host)
        , Path(path)
    {
    }

    // Orders by Host first; Path breaks ties between equal hosts.
    bool operator<(const TCheckUrlConstraint& rhs) const {
        return Host != rhs.Host ? Host < rhs.Host : Path < rhs.Path;
    }

public:
    TString Host;
    TString Path;
    // Read by GetTablePathWithKeys: true selects a host-only key range,
    // false selects an exact (Host, Path) key range.
    bool AccessibleByHost = true;
};

// Map job: turns every input row with a url into a row carrying the url split into
// F_HOST / F_PATH, the original F_URL and the F_REQUEST_ID (entity when the input has none).
struct TCheckUrlPathConvertMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    void Do(TReader *input, TWriter *output) override;
};

void TCheckUrlPathConvertMapper::Do(TReader *input, TWriter *output) {
    static NYT::TNode entityNode = NYT::TNode::CreateEntity();

    while (input->IsValid()) {
        const NYT::TNode &srcRow = input->GetRow();
        const TString url = srcRow[F_URL].AsString();
        const TCheckUrlConstraint key(url);

        NYT::TNode dstRow;
        dstRow[F_HOST] = key.Host;
        dstRow[F_PATH] = key.Path;
        dstRow[F_URL] = url;
        // The request id is optional in the input; the output column is always written.
        if (srcRow.HasKey(F_REQUEST_ID)) {
            dstRow[F_REQUEST_ID] = srcRow[F_REQUEST_ID];
        } else {
            dstRow[F_REQUEST_ID] = entityNode;
        }
        output->AddRow(dstRow);

        input->Next();
    }
}

REGISTER_MAPPER(TCheckUrlPathConvertMapper)

// Reduce job that builds the final check-url outcome rows (see Do).
// JupiterTimestamp is the only job state; it is saved/loaded via Y_SAVELOAD_JOB
// and written into every outcome row as F_JUPITER_TIMESTAMP.
struct TCheckUrlReducer : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    Y_SAVELOAD_JOB(JupiterTimestamp)

public:
    TCheckUrlReducer() = default;
    TCheckUrlReducer(time_t jupiterTimestamp);

    //reduce by Host, Path
    void Do(TReader *input, TWriter *output) override;

public:
    // Zero-initialized so that a default-constructed reducer never holds an indeterminate value.
    time_t JupiterTimestamp = 0;
};

// Stores the timestamp that Do stamps on every outcome row.
TCheckUrlReducer::TCheckUrlReducer(time_t jupiterTimestamp)
    : JupiterTimestamp(jupiterTimestamp)
{
}

// Reduce by (Host, Path). Folds every source row gathered for one url into a single
// outcome row and writes that row once per request id into both output tables.
// A key that has no E_CHECKURL_SOURCE_URLS row produces no output.
void TCheckUrlReducer::Do(TReader *input, TWriter *output) {
    // Content-attrs columns, referenced by literal name.
    const char *titleColumn = "TitleRawUTF8";
    const char *descriptionColumn = "MetaDescription";
    // Columns copied as is from the chosen Jupiter row into the outcome row.
    const static TSet<TString> jupiterColumns = {
        "AddTime",
        "BeautyUrl",
        "Host",
        "HttpCode",
        "IsFake",
        "IsIndexed",
        "IsSearchable",
        "LastAccess",
        "MainHost",
        "MainMirrorHost",
        "MainPath",
        "MimeType",
        "Path",
        "RedirTarget",
        "RelCanonicalTarget",
        "UrlStatus",
    };
    // Output table indices.
    const int outcomeTable = 0;
    const int outcomeImportantTable = 1;
    static const NYT::TNode entityNode = NYT::TNode::CreateEntity();

    // Rows keyed by last access time; a later row with the same time replaces the earlier one.
    TMap<time_t, NYT::TNode> baseByAccess;
    TMap<time_t, NYT::TNode> spreadByAccess;
    NYT::TNode fastBanRow = entityNode;
    NYT::TNode title, description;
    bool hasContentAttrs = false;
    bool turboPage = false;
    TString host, path, url;
    TDeque<NYT::TNode> requestIds;

    // Pass 1: sort the incoming rows into per-source buckets.
    while (input->IsValid()) {
        const NYT::TNode &row = input->GetRow();

        switch (row[F_CHECKURL_SOURCE_ID].AsInt64()) {
        case E_CHECKURL_SOURCE_URLS:
            host = row[F_HOST].AsString();
            path = row[F_PATH].AsString();
            url = row[F_URL].AsString();
            requestIds.push_back(row[F_REQUEST_ID]);
            break;
        case E_CHECKURL_SOURCE_JUPITER: {
                const time_t accessTime = NYTUtils::FromNodeOrDefault<ui64>(row[F_LAST_ACCESS], 0);
                baseByAccess[accessTime] = row;
            }
            break;
        case E_CHECKURL_SOURCE_CONTENT_ATTRS:
            hasContentAttrs = true;
            title = row[titleColumn];
            description = row[descriptionColumn];
            break;
        case E_CHECKURL_SOURCE_FAST_BAN:
            fastBanRow = row;
            break;
        case E_CHECKURL_SOURCE_SPREAD:
            // Spread rows reported as banned are ignored entirely.
            if (!IsSpreadUrlBannedBySourceId(row)) {
                const time_t accessTime = NYTUtils::FromNodeOrDefault<ui64>(row[F_LAST_ACCESS], 0);
                spreadByAccess[accessTime] = row;
            }
            break;
        case E_CHECKURL_SOURCE_TURBO_PAGE:
            turboPage = true;
            break;
        }

        input->Next();
    }

    // Nobody asked about this url: nothing to report.
    if (requestIds.empty()) {
        return;
    }

    const bool foundBase = !baseByAccess.empty();
    const bool foundSpread = !spreadByAccess.empty();

    // Pass 2: assemble the outcome row.
    NYT::TNode outRow;
    outRow[F_JUPITER_TIMESTAMP] = JupiterTimestamp;
    outRow[F_HOST] = host;
    outRow[F_PATH] = path;
    outRow[F_URL] = url;
    outRow[F_FOUND_BASE] = foundBase;
    outRow[F_FOUND_SPREAD] = foundSpread;
    outRow[F_IS_TURBO_PAGE] = turboPage;
    if (hasContentAttrs) {
        outRow[F_TITLE] = title;
        outRow[F_DESCRIPTION] = description;
    }

    if (foundBase) {
        // rbegin() is the row with the greatest last access time.
        const NYT::TNode &latest = baseByAccess.rbegin()->second;
        const NYT::TNode &status = latest[F_URL_STATUS];

        bool badMimeImage = false;
        if (!NYTUtils::IsNodeNull(status)) {
            badMimeImage = status.AsUint64() == NJupiter::EAcceptanceUrlForWebMasterSimpleStatus::AUFWSS_BAD_MIME_TYPE
                           && IsImagePathExtension(path);
        }

        if (!badMimeImage) {
            for (const auto &column : jupiterColumns) {
                outRow[column] = latest[column];
            }
        } else {
            // An image path with a bad-mime status is reported as not found in the base.
            outRow[F_FOUND_BASE] = false;
        }
    }

    if (foundSpread) {
        const NYT::TNode &latest = spreadByAccess.rbegin()->second;
        // Copies a spread column into its outcome counterpart only when the row has it.
        const auto copyIfPresent = [&latest, &outRow](const auto &srcColumn, const auto &dstColumn) {
            if (latest.HasKey(srcColumn)) {
                outRow[dstColumn] = latest[srcColumn];
            }
        };
        copyIfPresent(F_HTTP_CODE, F_SPREAD_HTTP_CODE);
        copyIfPresent(F_LAST_ACCESS, F_SPREAD_LAST_ACCESS);
        copyIfPresent(F_MIME_TYPE, F_SPREAD_MIME_TYPE);
    }

    // A fast-ban row overrides the searchable flag, http code, status and last access.
    if (!NYTUtils::IsNodeNull(fastBanRow)) {
        outRow[F_IS_SEARCHABLE] = false;
        outRow[F_HTTP_CODE] = fastBanRow[F_HTTP_CODE];
        outRow[F_URL_STATUS] = NJupiter::EAcceptanceUrlForWebMasterSimpleStatus::AUFWSS_HTTP_ERROR;
        outRow[F_LAST_ACCESS] = fastBanRow[F_LAST_ACCESS];
    }

    // Pass 3: one copy of the outcome row per request id, into each output table.
    for (const NYT::TNode &requestId : requestIds) {
        outRow[F_REQUEST_ID] = requestId;
        output->AddRow(outRow, outcomeTable);
        output->AddRow(outRow, outcomeImportantTable);
    }
}

REGISTER_REDUCER(TCheckUrlReducer)

// Reduce job (reduce by Host, Path) that keeps only rows belonging to requested urls
// and tags each kept row with the ECheckUrlSourceId of the input table it was read from.
//
// The table-index mapping below must match the order in which
// TTaskProcessCheckUrl::Process adds the operation inputs:
//   0 - requested urls, 1 - Jupiter acceptance, 2 - content attrs,
//   3 - fast ban, 4 - turbo pages, 5 and above - spread tables.
struct TCheckUrlSourceMergeReducer : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    Y_SAVELOAD_JOB(RequestedUrls)

    TCheckUrlSourceMergeReducer() = default;
    TCheckUrlSourceMergeReducer(const TSet<TCheckUrlConstraint> &requestedUrls)
        : RequestedUrls(requestedUrls)
    {
    }

    // Derives the host-only lookup set from RequestedUrls (it is not serialized itself).
    void Start(TWriter *) override {
        for (const TCheckUrlConstraint &record : RequestedUrls) {
            RequestedHosts.insert(record.Host);
        }
    }

    //ReduceBy Host, Path
    void Do(TReader *input, TWriter *output) override {
        static const int INPUT_TABLENO_SOURCE_URLS      = 0;
        static const int INPUT_TABLENO_JUPITER          = 1;
        static const int INPUT_TABLENO_CONTENT_ATTRS    = 2;
        static const int INPUT_TABLENO_FAST_BAN         = 3;
        static const int INPUT_TABLENO_TURBO_PAGE       = 4;

        // Cheap pre-filter: skip the whole key when its host was never requested.
        const TString host = input->GetRow()[F_HOST].AsString();
        if (!RequestedHosts.contains(host)) {
            return;
        }

        for (; input->IsValid(); input->Next()) {
            const NYT::TNode &srcRow = input->GetRow();
            const TCheckUrlConstraint record(host, srcRow[F_PATH].AsString());
            if (!RequestedUrls.contains(record)) {
                continue;
            }

            ECheckUrlSourceId sourceId = E_CHECKURL_SOURCE_URLS;
            switch(input->GetTableIndex()) {
            case INPUT_TABLENO_SOURCE_URLS:
                sourceId = E_CHECKURL_SOURCE_URLS;
                break;
            case INPUT_TABLENO_JUPITER:
                sourceId = E_CHECKURL_SOURCE_JUPITER;
                break;
            case INPUT_TABLENO_CONTENT_ATTRS:
                sourceId = E_CHECKURL_SOURCE_CONTENT_ATTRS;
                break;
            case INPUT_TABLENO_FAST_BAN:
                sourceId = E_CHECKURL_SOURCE_FAST_BAN;
                break;
            case INPUT_TABLENO_TURBO_PAGE:
                // Without this case turbo rows fell through to "spread" and
                // TCheckUrlReducer never saw E_CHECKURL_SOURCE_TURBO_PAGE.
                sourceId = E_CHECKURL_SOURCE_TURBO_PAGE;
                break;
            default:
                // Every input after the fixed ones is a spread table.
                sourceId = E_CHECKURL_SOURCE_SPREAD;
                break;
            }

            // Copy only rows that passed the filter, then append the source tag.
            NYT::TNode row = srcRow;
            output->AddRow(row
                (F_CHECKURL_SOURCE_ID, static_cast<i64>(sourceId))
            );
        }
    }

public:
    TSet<TCheckUrlConstraint> RequestedUrls;
    TSet<TString> RequestedHosts;
};

REGISTER_REDUCER(TCheckUrlSourceMergeReducer)

// Builds a rich path for `table` restricted to exact-key ranges derived from `urls`:
// one (Host) range per distinct host of the constraints flagged AccessibleByHost,
// followed by one (Host, Path) range per remaining constraint.
// With an empty `urls` no range is added at all.
NYT::TRichYPath GetTablePathWithKeys(const TString &table, const TSet<TCheckUrlConstraint> &urls) {
    TSet<TString> wholeHosts;
    TSet<TCheckUrlConstraint> exactUrls;

    for (const TCheckUrlConstraint &constraint : urls) {
        if (!constraint.AccessibleByHost) {
            exactUrls.insert(constraint);
            continue;
        }
        wholeHosts.insert(constraint.Host);
    }

    NYT::TRichYPath result(table);
    const auto addExactKey = [&result](const NYT::TKey &key) {
        result.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(key)));
    };

    // Host ranges go first, url ranges second; both in set order.
    for (const TString &host : wholeHosts) {
        addExactKey(NYT::TKey(host));
    }
    for (const TCheckUrlConstraint &constraint : exactUrls) {
        addExactKey(NYT::TKey(constraint.Host, constraint.Path));
    }

    return result;
}

// Thread-pool task that processes one batch of income tables inside a single YT transaction:
//   1. maps the income urls to (host, path, url, request id) and sorts them;
//   2. joins them with the Jupiter acceptance, content attrs, fast ban, turbo page and
//      spread tables (TCheckUrlSourceMergeReducer);
//   3. reduces the tagged rows into outcome rows (TCheckUrlReducer).
// Timestamp is used as the suffix of every table the task creates.
struct TTaskProcessCheckUrl : public IObjectInQueue {
    TTaskProcessCheckUrl(NYT::IClientBasePtr client, const TDeque<TString> &inputTables, time_t timestamp)
        : Client(client)
        , InputTables(inputTables)
        , Timestamp(timestamp)
    {
    }

    // Runs the whole pipeline. A yexception is logged and swallowed; in that case
    // tx->Commit() is never reached.
    void Process(void* /*tsr*/) override try {
        const auto &config = NCheckurl::TConfig::CInstance();

        NYT::ITransactionPtr tx = Client->StartTransaction();

        const TString intermediateTable = NYTUtils::JoinPath(config.TABLE_CHECKURL_OUTCOME_PREFIX, "intm-" + ToString(Timestamp));
        const TString outputTable = NYTUtils::JoinPath(config.TABLE_CHECKURL_OUTCOME_PREFIX, "urls-" + ToString(Timestamp));
        const TString outputForMergeTable = NYTUtils::JoinPath(config.TABLE_CHECKURL_OUTCOME_FOR_MERGE_PREFIX, ToString(Timestamp));
        const TString sourcesTable = NYTUtils::JoinPath(config.TABLE_CHECKURL_OUTCOME_PREFIX, "srcs-" + ToString(Timestamp));
        const TString jupiterFastBanTableName = TCommonYTConfigIndexing::CInstance().TABLE_SOURCE_JUPITER_FASTBAN;
        const TString turboTableName = TCommonYTConfigIndexing::CInstance().TABLE_SOURCE_TURBO_PAGE;

        NYT::TTableSchema intermediateSchema;
        intermediateSchema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING));
        intermediateSchema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING));
        intermediateSchema.AddColumn(NYT::TColumnSchema().Name(F_URL).Type(NYT::VT_STRING));
        intermediateSchema.AddColumn(NYT::TColumnSchema().Name(F_REQUEST_ID).Type(NYT::VT_ANY));

        TOpRunner runner(tx);

        // Stage 1: income urls -> intermediate table sorted by (host, path).
        for (const TString &table : InputTables) {
            runner.InputNode(table);
        }

        runner
            .OutputNode(NYT::TRichYPath(intermediateTable).Schema(intermediateSchema))
            .Map(new TCheckUrlPathConvertMapper)
            .SortBy(F_HOST, F_PATH)
            .Sort(intermediateTable)
        ;

        // The income tables are consumed; the drop is issued through the transaction-bound runner.
        for (const TString &table : InputTables) {
            runner.Drop(table);
        }

        // Load the requested (host, path) pairs; they restrict what is read from the source tables.
        size_t processedUrls = 0;
        TSet<TCheckUrlConstraint> requestedUrls;
        auto reader = tx->CreateTableReader<NYT::TNode>(intermediateTable);
        for (; reader->IsValid(); reader->Next()) {
            const NYT::TNode &row = reader->GetRow();
            const TString host = row[F_HOST].AsString();
            const TString path = row[F_PATH].AsString();
            const TString url = row[F_URL].AsString();
            TCheckUrlConstraint record(host, path);
            requestedUrls.insert(record);
            LOG_INFO("checkurl, record TS=%lu Host=%s Path=%s Url=%s", Timestamp, record.Host.data(), record.Path.data(), url.data());
            processedUrls++;
        }

        LOG_INFO("checkurl, loaded %lu urls", requestedUrls.size());

        // NOTE(review): with an empty requestedUrls GetTablePathWithKeys adds no key ranges,
        // which presumably means the source tables are read in full - confirm whether
        // an early exit is wanted for empty batches.
        const NYT::TRichYPath jupiterTable = GetTablePathWithKeys(GetJupiterAcceptanceInProdTable(tx), requestedUrls);
        const NYT::TRichYPath contentAttrsTable = GetTablePathWithKeys(GetJupiterContentAttrsInProdTable(tx), requestedUrls);
        time_t jupiterTimestamp = GetJupiterTsTZFromPath(jupiterTable.Path_);

        // Stage 2: the input order here defines the table indices that
        // TCheckUrlSourceMergeReducer maps to ECheckUrlSourceId values.
        runner
            .InputNode(intermediateTable)
            .InputNode(jupiterTable)
            .InputNode(contentAttrsTable)
            .InputNode(jupiterFastBanTableName)
            .InputNode(turboTableName)
        ;

        TDeque<NYTUtils::TTableInfo> spreadTables;
        LoadSpreadTables(tx, jupiterTimestamp, spreadTables);
        for (const NYTUtils::TTableInfo &table : spreadTables) {
            const NYT::TRichYPath spreadTable = GetTablePathWithKeys(table.Name, requestedUrls);
            runner.InputNode(spreadTable);
        }

        runner
            .OutputNode(sourcesTable)
            .ReduceBy(F_HOST, F_PATH)
            .Reduce(new TCheckUrlSourceMergeReducer(requestedUrls))
            .Drop(intermediateTable)
            .SortBy(F_HOST, F_PATH)
            .Sort(sourcesTable)
        ;

        NYT::TTableSchema schema = CreateCheckUrlOutcomeSchema(tx);

        // Stage 3: tagged source rows -> outcome rows, written to both output tables.
        runner
            .InputNode(sourcesTable)
            .OutputNode(NYT::TRichYPath(outputTable).Schema(schema))
            .OutputNode(NYT::TRichYPath(outputForMergeTable).Schema(schema))
            .ReduceBy(F_HOST, F_PATH)
            .Reduce(new TCheckUrlReducer(jupiterTimestamp))
            .Drop(sourcesTable)
        ;

        tx->Commit();

        LOG_INFO("checkurl, urls processed");

        MonitorPushCheckurlProcessed(config.MONITOR_PERFORMANCE_SUFFIX, processedUrls);
    } catch(const yexception &e) {
        LOG_ERROR("checkurl, something went wrong: %s", e.what());
    }

public:
    NYT::IClientBasePtr Client;
    const TDeque<TString> InputTables;
    time_t Timestamp;
};

// Entry point of the check-url update task.
// Lists the income tables (oldest first), groups them into batches and runs one
// TTaskProcessCheckUrl per batch on a thread pool. A batch is closed when adding the
// next table would bring it to 160 distinct constraints or it already holds 100 tables.
// Tables with 1000 or more records are skipped.
// Always returns 0.
int TaskUpdateCheckUrl(int, const char **) {
    const auto &config = NCheckurl::TConfig::CInstance();

    //TClfPredictor::Instance();

    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST);
    NYTUtils::CreatePath(client, config.TABLE_CHECKURL_INCOME_PREFIX);
    NYTUtils::CreatePath(client, config.TABLE_CHECKURL_OUTCOME_PREFIX);

    // Disabled debug fixture: writes a sample income table.
    #if 0
    auto writer = client->CreateTableWriter<NYT::TNode>(NYTUtils::JoinPath(config.TABLE_CHECKURL_INCOME_PREFIX, "urls-1487674035"));
    writer->AddRow(NYT::TNode()
        (F_URL, "https://lenta.ru/2017/02/18/")
        (F_REQUEST_ID, 123)
    );

    writer->AddRow(NYT::TNode()
        (F_URL, "https://lenta.ru/2017/02/18/")
        (F_REQUEST_ID, 456)
    );

/*    writer->AddRow(NYT::TNode()
        (F_URL, "https://lenta.ru/1999/11/07/")
    );
    writer->AddRow(NYT::TNode()
        (F_URL, "https://lenta.ru/xxx/1999/11/07/")
    );
*/
    writer->Finish();
    #endif
    //if (true) {
        //return 0;
    //}

    TDeque<NYTUtils::TTableInfo> incomeTables;
    if (NYTUtils::GetTableList(client, config.TABLE_CHECKURL_INCOME_PREFIX, incomeTables, Max<int>()) == 0) {
        LOG_INFO("checkurl, there are no income tables");
        return 0;
    }

    std::sort(incomeTables.begin(), incomeTables.end(), [](const NYTUtils::TTableInfo &lhs, const NYTUtils::TTableInfo &rhs) -> bool {
        return lhs.Time < rhs.Time;
    });

    // State of the batch that is currently being assembled.
    TDeque<TString> inputTables;
    THashSet<TString> hostConstrains;
    TSet<TCheckUrlConstraint> urlConstrains;

    TThreadPool checkUrlQueue(TThreadPool::TParams().SetBlocking(true).SetCatching(true));
    checkUrlQueue.Start(8, 8);

    int taskId = 0;
    // Each task gets its own timestamp; the task uses it as a table-name suffix.
    time_t timestamp = Now().Seconds();

    // Queues the current batch as a task and resets the batch state.
    const auto startTask = [&]() {
        checkUrlQueue.SafeAddAndOwn(MakeHolder<TTaskProcessCheckUrl>(client, inputTables, timestamp));
        LOG_INFO("checkurl, started task %d with %lu income tables", taskId, inputTables.size());
        timestamp++;
        taskId++;
        inputTables.clear();
        hostConstrains.clear();
        urlConstrains.clear();
    };

    for (const NYTUtils::TTableInfo &table : incomeTables) {
        if (table.RecordCount >= 1000) {
            continue;
        }

        // Constraints contributed by this table alone.
        THashSet<TString> tableHosts;
        TSet<TCheckUrlConstraint> tableUrls;

        auto reader = client->CreateTableReader<NYT::TNode>(table.Name);
        for (; reader->IsValid(); reader->Next()) {
            const NYT::TNode &row = reader->GetRow();
            const TString url = row[F_URL].AsString();
            TCheckUrlConstraint constraint(url);
            if (/*small host*/ true) { //!!! check for host size
                tableHosts.insert(constraint.Host);
            } else {
                tableUrls.insert(constraint);
            }
        }

        // Adds this table's constraints to the batch totals.
        const auto absorbTable = [&]() {
            for (const TString &host : tableHosts) {
                hostConstrains.insert(host);
            }
            for (const TCheckUrlConstraint &constraint : tableUrls) {
                urlConstrains.insert(constraint);
            }
        };

        absorbTable();
        const bool fits = (hostConstrains.size() + urlConstrains.size()) < 160 && inputTables.size() < 100;

        // The table does not fit into the current batch: close that batch first and
        // open a new one with this table. Previously the table was dropped from the
        // run here, and a first table over the limit started a task with no inputs.
        // A table that exceeds the limit on its own ends up in a batch by itself.
        if (!fits && !inputTables.empty()) {
            startTask();
            absorbTable();
        }

        inputTables.push_back(table.Name);
        LOG_INFO("checkurl, prepare task %d, income table %s", taskId, table.Name.data());
    }

    // Queue whatever is left in the last, not yet closed batch.
    if (inputTables.size() > 0) {
        startTask();
    }

    checkUrlQueue.Stop();

    return 0;
}

} //namespace NWebmaster
