#include <util/draft/datetime.h>
#include <util/charset/wide.h>

#include <library/cpp/regex/pire/regexp.h>
#include <library/cpp/getopt/last_getopt.h>

#include <mapreduce/yt/interface/client.h>
#include <mapreduce/yt/common/config.h>

#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/args.h>
#include <wmconsole/version3/protos/queries2.pb.h>

#include <wmconsole/version3/searchqueries-mr/tools/ya-service-stats/urlstats.pb.h>
#include "config.h"

namespace NWebmaster {

// Short aliases for the protobuf messages and the Pire regex type used throughout this file.
using PositionStats = proto::urlstats::UrlPositionInfo;
using Stats = proto::urlstats::UrlStatsMessage;
using PireRegex = NRegExp::TFsm;

// Number of daily source tables that make up one processed period.
const uint8_t DAYS_IN_WEEK = 7;

// Column names of the TNode rows read and written by the jobs below.
const TString FIELD_GROUP_ID = "group_id";
// Holds the negated total shows of a url; used as a sort column so that the most shown urls come first.
const TString FIELD_NEG_SHOWS = "neg_shows";
// Serialized Stats protobuf message.
const TString FIELD_STATS = "stats";
const TString FIELD_URL = "url";
// Columns of the groups description table (see ReadUrlGroups).
const TString FIELD_URL_PATTERN = "url_pattern";
const TString FIELD_HOST_PATTERN = "host_pattern";

bool MatchRegex(const PireRegex& regex, const TString &s) {
    return NRegExp::TMatcher(regex).Match(s).Final();
}

struct TStatsAggregator {
    TStatsAggregator(Stats *stats)
        : Message(stats)
    {
        for (PositionStats &pos : *stats->mutable_position_stats()) {
            Positions[pos.position()] = &pos;
        }
    }

    void AddAllPositions(const Stats &newStats) {
        for (const PositionStats& pos: newStats.position_stats()) {
            AddPosition(pos);
        }
    }

    void AddPosition(const PositionStats &newPos) {
        PositionStats *addTo;
        if (Positions.contains(newPos.position())) {
            addTo = Positions[newPos.position()];
        } else {
            addTo = Message->add_position_stats();
            addTo->set_position(newPos.position());
            Positions[newPos.position()] = addTo;
        }
        addTo->set_shows_count(addTo->shows_count() + newPos.shows_count());
        addTo->set_clicks_count(addTo->clicks_count() + newPos.clicks_count());
    }

public:
    Stats* Message;
    THashMap<ui32, PositionStats*> Positions;
};

// Description of one url group: its name plus the host and url regex patterns
// (as text) that select the urls belonging to it. Serializable via Save/Load.
struct TUrlGroup {
    TString GroupName;
    TString HostPattern;
    TString UrlPattern;

    TUrlGroup() = default;

    TUrlGroup(const TString &groupName, const TString &hostPattern, const TString &urlPattern)
        : GroupName(groupName)
        , HostPattern(hostPattern)
        , UrlPattern(urlPattern)
    {
    }

    Y_SAVELOAD_DEFINE(GroupName, HostPattern, UrlPattern);
};

// Mapper: reads YaMR rows (key is matched against host patterns, value is a serialized
// QueryMessage) and, for every url group whose host and url patterns both match,
// emits a row {group_id, url, stats} where stats is a serialized Stats message
// with shows/clicks summed per position over all regions of the input row.
struct TMapClassifyServiceUrls : public NYT::IMapper<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TNode>> {
    Y_SAVELOAD_JOB(Groups, AnyGroupPattern);

public:
    TMapClassifyServiceUrls() = default;

    // groups          - url groups to classify rows into.
    // anyGroupPattern - regex text used as a cheap pre-filter on the row key
    //                   before the per-group patterns are tried.
    TMapClassifyServiceUrls(const TVector<TUrlGroup>& groups, const TString &anyGroupPattern)
        : Groups(groups)
        , AnyGroupPattern(anyGroupPattern)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const PireRegex anyGroupRegex(AnyGroupPattern, PireRegex::TOptions().SetSurround(false));

        // Compiled regexes are kept index-aligned with Groups, so the per-row loop
        // needs no hash lookups and every group is matched with its own patterns.
        TVector<TSimpleSharedPtr<PireRegex>> hostRegexes;
        TVector<TSimpleSharedPtr<PireRegex>> urlRegexes;
        hostRegexes.reserve(Groups.size());
        urlRegexes.reserve(Groups.size());
        for (const TUrlGroup &group : Groups) {
            hostRegexes.push_back(TSimpleSharedPtr<PireRegex>(
                new PireRegex(group.HostPattern, PireRegex::TOptions().SetSurround(false))));
            urlRegexes.push_back(TSimpleSharedPtr<PireRegex>(
                new PireRegex(group.UrlPattern, PireRegex::TOptions().SetSurround(false))));
        }

        proto::queries2::QueryMessage queryStats;
        for (; input->IsValid(); input->Next()) {
            const NYT::TYaMRRow &row = input->GetRow();
            const TString key = TString{row.Key};
            if (!MatchRegex(anyGroupRegex, key)) {
                continue;
            }

            // The parse result is deliberately ignored, as before: a row that fails
            // to parse is processed with whatever fields were read.
            Y_PROTOBUF_SUPPRESS_NODISCARD queryStats.ParseFromString(TString{row.Value});
            const TString url = queryStats.url();

            // The serialized stats do not depend on the group, so they are built
            // lazily and at most once per input row.
            bool statsReady = false;
            TString statsString;
            for (size_t i = 0; i < Groups.size(); ++i) {
                if (!MatchRegex(*hostRegexes[i], key) || !MatchRegex(*urlRegexes[i], url)) {
                    continue;
                }
                if (!statsReady) {
                    statsString = SerializePositionStats(queryStats);
                    statsReady = true;
                }

                output->AddRow(NYT::TNode()
                               (FIELD_GROUP_ID, Groups[i].GroupName)
                               (FIELD_URL, url)
                               (FIELD_STATS, statsString)
                               );
            }
        }
    }

private:
    // Sums shows/clicks per position over all regions of `queryStats`
    // and returns the resulting Stats message in serialized form.
    static TString SerializePositionStats(const proto::queries2::QueryMessage &queryStats) {
        Stats valueMsg;
        TStatsAggregator agg(&valueMsg);
        PositionStats posStatMessage;
        for (const proto::queries2::QueryRegionInfo &rawStat : queryStats.reports_by_region()) {
            for (const proto::queries2::QueryPositionInfo &rawPosStat : rawStat.position_info()) {
                posStatMessage.set_shows_count(rawPosStat.shows_count());
                posStatMessage.set_clicks_count(rawPosStat.clicks_count());
                posStatMessage.set_position(rawPosStat.position());
                agg.AddPosition(posStatMessage);
            }
        }

        TString statsString;
        Y_PROTOBUF_SUPPRESS_NODISCARD valueMsg.SerializeToString(&statsString);
        return statsString;
    }

public:
    TVector<TUrlGroup> Groups;
    TString AnyGroupPattern;
};

REGISTER_MAPPER(TMapClassifyServiceUrls);

// Reducer: merges the per-position stats of all rows it receives in one call
// into a single row and adds the negated total shows count to it.
// The job is run with ReduceBy(group_id, url), see Main().
struct TReduceUrlStats : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    void Do(TReader *input, TWriter *output) override {
        // Group id and url are taken from the first row; they are copied because
        // the row reference is only used before the reader is advanced.
        const NYT::TNode &firstRow = input->GetRow();
        const TString group = firstRow[FIELD_GROUP_ID].AsString();
        const TString url = firstRow[FIELD_URL].AsString();

        Stats mergedStats;
        TStatsAggregator aggregator(&mergedStats);

        Stats parsedStats;
        while (input->IsValid()) {
            Y_PROTOBUF_SUPPRESS_NODISCARD parsedStats.ParseFromString(input->GetRow()[FIELD_STATS].AsString());
            aggregator.AddAllPositions(parsedStats);
            input->Next();
        }

        int64_t totalShows = 0;
        for (const PositionStats &position : mergedStats.position_stats()) {
            totalShows += position.shows_count();
        }

        TString serializedStats;
        Y_PROTOBUF_SUPPRESS_NODISCARD mergedStats.SerializeToString(&serializedStats);

        // The shows count is stored negated so it can serve as a sort column
        // that puts the most shown urls first.
        output->AddRow(NYT::TNode()
                       (FIELD_GROUP_ID, group)
                       (FIELD_URL, url)
                       (FIELD_NEG_SHOWS, -totalShows)
                       (FIELD_STATS, serializedStats)
                       );
    }
};

REGISTER_REDUCER(TReduceUrlStats);

// Reducer: copies at most the first 5000 rows it receives in one call to the output,
// keeping only the group id, url and stats columns. The remaining rows are not read.
struct TReduceTopUrlStats : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    void Do(TReader *input, TWriter *output) override {
        const ui16 maxUrls = 5000;

        ui16 emitted = 0;
        while (emitted < maxUrls && input->IsValid()) {
            const NYT::TNode &current = input->GetRow();
            output->AddRow(NYT::TNode()
                           (FIELD_GROUP_ID, current[FIELD_GROUP_ID])
                           (FIELD_URL, current[FIELD_URL])
                           (FIELD_STATS, current[FIELD_STATS])
                           );
            input->Next();
            ++emitted;
        }
    }
};

REGISTER_REDUCER(TReduceTopUrlStats);

void ProcessPattern(const TString &pattern, TUtf16String &appendTo) {
    TUtf16String patternW = UTF8ToWide(pattern);
    for (size_t i = 0; i < patternW.size(); i++) {
        wchar16 ch = patternW[i];
        switch(ch) {
        case '*': appendTo.AppendAscii(".*");
            break;
        default: EscapePireRegexChar(ch, appendTo);
        }
    }
}

// Lists config.TABLE_RAW_STATS_DIR and stores into `tables` every table whose name
// matches the daily clicks/shows naming scheme, keyed by the first date found in the name.
// Existing entries of `tables` are kept; an entry with the same date is overwritten.
void ReadSourceTables(NYT::IClientPtr client, TMap<time_t, TString> &tables) {
    const TConfig &config = TConfig::CInstance();

    TDeque<NYTUtils::TTableInfo> listedTables;
    NYTUtils::GetTableList(client, config.TABLE_RAW_STATS_DIR, listedTables);

    TRegularExpression dailyTableRegex("clicks_shows_(\\d{8})_\\d{8}_for_wmc_web$");
    for (const NYTUtils::TTableInfo &tableInfo : listedTables) {
        TVector<TString> captured;
        const bool matched = dailyTableRegex.GetMatches(tableInfo.Name, captured) > 0;
        if (!matched) {
            continue;
        }

        LOG_INFO("Found table %s", tableInfo.Name.data());
        // captured[0] is used as the date string of the table.
        const time_t tableDate = str2date(captured[0]);
        tables[tableDate] = tableInfo.Name;
    }

    LOG_INFO("Found %lu source tables", tables.size());
}

// Picks the DAYS_IN_WEEK daily tables that form the period to process.
//
// sourceTables    - available daily tables keyed by their date.
// tablesToProcess - receives the names of the selected tables.
//
// When config.WEEK is 0 the most recent run of DAYS_IN_WEEK consecutive days is
// searched for (newest to oldest); otherwise the period ends at config.WEEK.
//
// Returns the date of the last day of the selected period.
// Throws yexception when fewer than DAYS_IN_WEEK tables were collected.
time_t SelectTablesToProcess(const TMap<time_t, TString> &sourceTables, TVector<TString> &tablesToProcess) {
    const TConfig &config = TConfig::CInstance();

    time_t periodEnd = 0;
    if (config.WEEK == 0) {
        const time_t oneDay = TInstant::Days(1).TimeT();

        // Explicit "run started" flag: the first table always opens a new run.
        // Previously the expected date was compared while still uninitialized.
        bool runStarted = false;
        time_t expectTime = 0;
        for (auto it = sourceTables.rbegin(); it != sourceTables.rend(); ++it) {
            const time_t curTime = it->first;
            if (!runStarted || curTime != expectTime) {
                // Gap in dates: drop the collected run and start a new one here.
                tablesToProcess.clear();
                periodEnd = curTime;
                runStarted = true;
            }

            expectTime = curTime - oneDay;
            tablesToProcess.push_back(it->second);
            if (tablesToProcess.size() >= DAYS_IN_WEEK) {
                break;
            }
        }
    } else {
        periodEnd = config.WEEK;
        for (int i = 0; i < DAYS_IN_WEEK; i++) {
            const time_t dayTime = periodEnd - TInstant::Days(i).TimeT();
            const auto found = sourceTables.find(dayTime);
            if (found != sourceTables.end()) {
                tablesToProcess.push_back(found->second);
            }
        }
    }
    if (tablesToProcess.size() < DAYS_IN_WEEK) {
        ythrow yexception() << "Continuous period of " << static_cast<int>(DAYS_IN_WEEK) << " days not found";
    }
    return periodEnd;
}

// Reads the groups table (config.TABLE_GROUPS_NAME) and appends one TUrlGroup per
// distinct group id to `groups`.
//
// Every table row contributes one (host pattern, url pattern) pair to its group.
// For a group the resulting regex texts are:
//   host: (H1)|(H2)|...          where Hi is the converted host pattern
//   url:  ((H1U1)|(H2U2)|...).*  where Ui is the converted url pattern
// "https?://" is put in front of a host pattern that does not contain "://".
// Wildcard conversion and escaping are done by ProcessPattern.
void ReadUrlGroups(NYT::IClientPtr client, TVector<TUrlGroup> &groups) {
    const TConfig &config = TConfig::CInstance();

    NYT::TTableReaderPtr<NYT::TNode> groupsTableReader = client->CreateTableReader<NYT::TNode>(config.TABLE_GROUPS_NAME);
    THashMap<TString, TVector<std::pair<TString, TString>>> groupsMap;
    for (; groupsTableReader->IsValid(); groupsTableReader->Next()) {
        const NYT::TNode &row = groupsTableReader->GetRow();
        const TString groupName = row[FIELD_GROUP_ID].AsString();
        const TString hostPattern = row[FIELD_HOST_PATTERN].AsString();
        const TString urlPattern = row[FIELD_URL_PATTERN].AsString();
        groupsMap[groupName].emplace_back(hostPattern, urlPattern);
    }

    for (const auto &groupDescr : groupsMap) {
        const TString &groupName = groupDescr.first;
        // Bound by reference: the pattern list is only read here.
        const TVector<std::pair<TString, TString>> &patterns = groupDescr.second;
        TUtf16String hostPattern;
        TUtf16String urlPattern;
        urlPattern.append('(');
        for (const auto &hostUrlPatternPair : patterns) {
            // hostPattern is non-empty from the second pair on, so it doubles
            // as the "need an alternation separator" indicator for both patterns.
            if (hostPattern.size() > 0) {
                hostPattern.append('|');
                urlPattern.append('|');
            }
            const bool prependProtocol = !hostUrlPatternPair.first.Contains("://");
            hostPattern.append('(');
            if (prependProtocol) {
                hostPattern.AppendAscii("https?://");
            }
            ProcessPattern(hostUrlPatternPair.first, hostPattern);
            hostPattern.append(')');

            urlPattern.append('(');
            if (prependProtocol) {
                urlPattern.AppendAscii("https?://");
            }
            ProcessPattern(hostUrlPatternPair.first, urlPattern);
            ProcessPattern(hostUrlPatternPair.second, urlPattern);
            urlPattern.append(')');
        }
        urlPattern.AppendAscii(").*");
        groups.emplace_back(groupName, WideToUTF8(hostPattern), WideToUTF8(urlPattern));
    }
}

// Builds one regex text that is the alternation of the distinct host patterns
// of all groups: (P1)|(P2)|... Returns an empty string for an empty group list.
TString BuildAnyGroupPattern(const TVector<TUrlGroup> &groups) {
    // TSet removes duplicates and fixes the order of alternatives.
    TSet<TString> distinctHostPatterns;
    for (const TUrlGroup &urlGroup : groups) {
        distinctHostPatterns.insert(urlGroup.HostPattern);
    }

    TString combined;
    for (const TString &alternative : distinctHostPatterns) {
        if (!combined.empty()) {
            combined.append("|");
        }
        combined.append("(");
        combined.append(alternative);
        combined.append(")");
    }
    return combined;
}

// Keeps at most config.MAX_RESULT_TABLES_COUNT weekly result tables in
// config.TABLE_WEEKLY_STATS_DIR and removes the ones with the earliest dates.
// Only tables whose names match the weekly result naming scheme are considered.
void CleanupOldTables(NYT::IClientPtr client) {
    const TConfig &config = TConfig::CInstance();
    const size_t maxTables = config.MAX_RESULT_TABLES_COUNT;

    TDeque<NYTUtils::TTableInfo> listedTables;
    NYTUtils::GetTableList(client, config.TABLE_WEEKLY_STATS_DIR, listedTables);

    // Result tables ordered by the first date in their name, earliest first.
    TMap<time_t, TString> tablesByDate;
    TRegularExpression resultTableRegex(config.TABLE_WEEKLY_STATS_NAME_PREFIX + "(\\d{8})_\\d{8}$");
    for (const NYTUtils::TTableInfo &tableInfo : listedTables) {
        TVector<TString> captured;
        if (resultTableRegex.GetMatches(tableInfo.Name, captured) != 0) {
            const time_t tableDate = str2date(captured[0]);
            tablesByDate[tableDate] = tableInfo.Name;
        }
    }

    const size_t totalTables = tablesByDate.size();
    const size_t excessTables = totalTables > maxTables ? totalTables - maxTables : 0;

    TVector<TString> obsoleteTables;
    auto oldest = tablesByDate.begin();
    for (size_t taken = 0; taken < excessTables; ++taken, ++oldest) {
        obsoleteTables.push_back(oldest->second);
    }

    if (obsoleteTables.empty()) {
        return;
    }

    LOG_INFO("Will remove %lu old tables", obsoleteTables.size());
    for (const TString &obsolete : obsoleteTables) {
        LOG_INFO("Removing old table %s", obsolete.data());
        client->Remove(obsolete);
    }
}

int Main() {
    const TConfig &config = TConfig::CInstance();

    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST);

    TMap<time_t, TString> timeToTable;
    ReadSourceTables(client, timeToTable);

    LOG_INFO("Found %lu source tables", timeToTable.size());

    TVector<TString> tablesToProcess;
    time_t periodEnd = SelectTablesToProcess(timeToTable, tablesToProcess);
    time_t periodStart = periodEnd - TInstant::Days(DAYS_IN_WEEK - 1).TimeT();
    LOG_INFO("Will count stats for week %s-%s", date2str(periodStart).data(), date2str(periodEnd).data());

    const TString resultTableName = config.TABLE_WEEKLY_STATS_DIR + "/" +
            config.TABLE_WEEKLY_STATS_NAME_PREFIX +
            date2str(periodStart) + "_" + date2str(periodEnd);

    if (!config.FORCE) {
        NYTUtils::TTableInfo resultTableInfo;
        if (NYTUtils::GetTableInfo(client, resultTableName, resultTableInfo)) {
            LOG_INFO("Table %s already exist, no new data to process", resultTableName.data());
            return 0;
        }
    }

    TVector<TUrlGroup> groups;
    ReadUrlGroups(client, groups);

    const TString anyGroupPattern = BuildAnyGroupPattern(groups);

    NYT::ITransactionPtr tx = client->StartTransaction();
    TOpRunner taskRunner(tx);
    for (const TString &inputTable : tablesToProcess) {
        taskRunner.InputYaMR(inputTable);
    }
    const TString intermediateTable = "//tmp/webmaster/ya-service-search-url-stats-" + ToString(Now().MilliSeconds());

    taskRunner
            .OutputNode(intermediateTable)
            .ReduceBy(FIELD_GROUP_ID, FIELD_URL)
            .MapReduce(new TMapClassifyServiceUrls(groups, anyGroupPattern), new TReduceUrlStats);

    taskRunner
            .InputNode(intermediateTable)
            .OutputNode(resultTableName)
            .ReduceBy(FIELD_GROUP_ID)
            .SortBy(FIELD_GROUP_ID, FIELD_NEG_SHOWS)
            .MapReduce(new TReduceTopUrlStats)
            .Drop(intermediateTable);

    tx->Commit();

    CleanupOldTables(client);
    return 0;
}

static void LogInfo(const TString &msg) {
    LOG_INFO("%s", msg.data());
}

} //namespace NWebmaster

int main(int argc, const char **argv) {
    using namespace NWebmaster;

    NYT::Initialize(argc, argv);
    NYTUtils::DisableLogger();
    TOpRunner::LogInfo = LogInfo;

    TArgs::Init(argc, argv);

    TString week;

    TArgs::Opts()
            .AddLongOption('w', "week", "Specify last day of week to count stats for")
            .StoreResult(&week);

    TArgs::Opts()
            .AddLongOption('f', "force", "Force reevaluate stats even if already have stats for this week")
            .NoArgument();

    auto parsedOpts = TArgs::ParseOpts();

    TConfig &config = TConfig::Instance();
    config.Load();
    if (!config.IsGlobalOk()) {
        throw yexception() << "Error during loading configuration files";
    }

    config.FORCE = parsedOpts->Has('f');
    if (week.size() > 0) {
        time_t weekEndTime = str2date(week);
        config.WEEK = weekEndTime;
    }

    int res = 1;
    try {
        res = Main();
    } catch (yexception &e) {
        LOG_CRIT("%s", e.what());
    }

    return res;
}
