#include <util/draft/datetime.h>
#include <util/generic/deque.h>
#include <util/generic/hash_set.h>
#include <util/generic/set.h>
#include <util/generic/size_literals.h>
#include <util/stream/file.h>
#include <util/string/join.h>
#include <util/string/printf.h>
#include <util/string/reverse.h>
#include <util/string/subst.h>
#include <library/cpp/string_utils/url/url.h>
#include <util/system/user.h>

#include <mapreduce/yt/interface/client.h>
#include <mapreduce/yt/interface/protos/yamr.pb.h>

#include <library/cpp/containers/comptrie/comptrie.h>
#include <library/cpp/containers/comptrie/prefix_iterator.h>
#include <library/cpp/getopt/last_getopt.h>

#include <quality/user_search/common/clicks_shows_stats.pb.h>
#include <robot/library/yt/static/command.h>

#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include <wmconsole/version3/searchqueries-mr/batch_matcher.cpp>
#include <wmconsole/version3/searchqueries-mr/tools/sqgrep_raw/queries.pb.h>

namespace NWebmaster {

using namespace NJupiter;
//using TPrefixIterator = TPrefixIterator<TCompactTrie<char>>;

bool IsSubdomain(const TString &subdomain, const TString &domain) {
    size_t pos = subdomain.rfind(domain);

    if (pos == TString::npos) {
        return false;
    }

    if ((pos + domain.size()) != subdomain.size()) {
        return false;
    }

    if (pos == 0) {
        return true;
    }

    char symb = subdomain[pos - 1];

    if (symb == '/' || symb == '.') {
        return true;
    }

    return false;
}

// Mapper over TYamr rows that emits one NProto::TQuery row per position entry
// of every record that passes the host filter and, when Queries is non-empty,
// the query filter.
//
// Only Hosts, TrieStream, PeriodsConfig and Queries are listed in
// Y_SAVELOAD_JOB; Trie and Matcher are rebuilt from them in Start().
struct TMapQueries : public NYT::IMapper<NYT::TTableReader<NYT::TYamr>, NYT::TTableWriter<NProto::TQuery>> {
    Y_SAVELOAD_JOB(Hosts, TrieStream, PeriodsConfig, Queries)

public:
    TMapQueries() = default;

    // hosts         - host strings, indexed by the values stored in the trie
    // trieStream    - serialized TCompactTrie<char> used to look up candidate hosts
    // periodsConfig - value written to the Period field, indexed by input table index
    // queries       - patterns handed to TBatchMatcher; empty disables query filtering
    TMapQueries(const TDeque<TString> &hosts, const TVector<char> &trieStream, const TDeque<time_t> &periodsConfig, const TDeque<TString> &queries)
        : Hosts(hosts)
        , TrieStream(trieStream)
        , PeriodsConfig(periodsConfig)
        , Queries(queries)
    {
    }

public:
    // Rebuilds the members that are not part of the saved job state.
    void Start(TWriter* /*writer*/) override {
        Trie.Init(TrieStream.data(), TrieStream.size());
        Matcher = MakeHolder<TBatchMatcher>(Queries);
    }

    // Walks the whole input, delegating the per-row work to HandleRow().
    void Do(TReader *input, TWriter *output) override {
        while (input->IsValid()) {
            HandleRow(input, output);
            input->Next();
        }
    }

private:
    // Checks whether the row key refers to one of the configured hosts.
    // The key gets an "http://" prefix when it has no http prefix, is lowercased,
    // and its reversed form is matched against the trie prefixes; each candidate
    // is then confirmed with IsSubdomain().
    bool BelongsToKnownHost(const TString &rowKey) {
        TString normalized = rowKey;
        if (GetHttpPrefixSize(normalized) == 0) {
            normalized = TString::Join("http://", normalized);
        }
        normalized.to_lower();

        TString reversed = normalized;
        ReverseInPlace(reversed);

        for (auto candidate = MakePrefixIterator(Trie, reversed.data(), reversed.size()); candidate; ++candidate) {
            size_t hostIndex;
            candidate.GetValue(hostIndex);

            if (IsSubdomain(normalized, Hosts[hostIndex])) {
                return true;
            }
        }

        return false;
    }

    // Filters the current row and writes one output row per position entry.
    void HandleRow(TReader *input, TWriter *output) {
        const auto &row = input->GetRow();
        if (!BelongsToKnownHost(row.GetKey())) {
            return;
        }

        // The parse result is intentionally ignored, as in the rest of this tool.
        NClicksShowsStats::TUrlClicksShowsStatsForWMC stats;
        Y_PROTOBUF_SUPPRESS_NODISCARD stats.ParseFromString(row.GetValue());

        const bool filterByQuery = !Queries.empty();
        if (filterByQuery && !Matcher->Matches(stats.query())) {
            return;
        }

        // The output host is taken from the record URL, not from the row key.
        TString recordHost;
        TString recordPath;
        SplitUrlToHostAndPath(stats.url(), recordHost, recordPath);
        if (GetHttpPrefixSize(recordHost) == 0) {
            recordHost = TString::Join("http://", recordHost);
        }
        recordHost.to_lower();

        // Fields shared by every row produced from this record.
        NProto::TQuery outRow;
        outRow.SetHost(recordHost);
        outRow.SetPath(recordPath);
        outRow.SetQuery(stats.query());
        outRow.SetRegionId(stats.GetRegion());
        outRow.SetIsMobile(stats.GetIsMobile());
        outRow.SetIsPad(stats.GetIsPad());
        outRow.SetPeriod(PeriodsConfig.at(input->GetTableIndex()));

        // Only the per-position fields change between the emitted rows.
        const size_t positionsCount = stats.PerPosStatsSize();
        for (size_t i = 0; i < positionsCount; ++i) {
            const auto &entry = stats.GetPerPosStats(i);
            outRow.SetPosition(entry.GetPos());
            outRow.SetShows(entry.GetShows());
            outRow.SetClicks(entry.GetClicks());
            output->AddRow(outRow);
        }
    }

public:
    TDeque<TString> Hosts;
    TVector<char> TrieStream;
    TCompactTrie<char> Trie;            // view over TrieStream, initialized in Start()
    TDeque<time_t> PeriodsConfig;
    TDeque<TString> Queries;
    THolder<TBatchMatcher> Matcher;     // built from Queries in Start()
};

REGISTER_MAPPER(TMapQueries)

} //namespace NWebmaster

bool MatchPeriod(const TString &tableName, time_t periodBegin, time_t periodEnd, time_t &matched, bool newFormat) {
    if (tableName.empty()) {
        return false;
    }

    TVector<TString> period;
    //NWebmaster::TRegularExpression regex("//home/userfeat-dev/USERSTATS-169/clicks_shows/(\\d+)_(\\d+)/for_wmc/web");
    if (newFormat) {
        NWebmaster::TRegularExpression regex("(\\d+)/\\d+_\\d+/for_wmc/web");
        if (regex.GetMatches(tableName, period) != 1) {
            //Cerr << "ERROR: Unable to parse source table name" << Endl;
            return false;
        }
    } else {
        NWebmaster::TRegularExpression regex("(\\d+)_(\\d+)/for_wmc/web");
        if (regex.GetMatches(tableName, period) != 2) {
            //Cerr << "ERROR: Unable to parse source table name" << Endl;
            return false;
        }
    }

    time_t tableDate = str2date(period[0]);
    if (tableDate >= periodBegin && tableDate <= periodEnd) {
        matched = tableDate;
        return true;
    }

    return false;
}

// Builds the default output table path:
// //tmp/webmaster/sqgrep_raw/<user>/report_<begin>_<end>_<current time in seconds>.
TString GetOutputName(const TString &periodBeginStr, const TString &periodEndStr) {
    TString tablePath = "//tmp/webmaster/sqgrep_raw/";
    tablePath += GetUsername();
    tablePath += "/report_";
    tablePath += periodBeginStr;
    tablePath += "_";
    tablePath += periodEndStr;
    tablePath += "_";
    tablePath += ToString(Now().Seconds());
    return tablePath;
}

int main(int argc, const char **argv) {
    using namespace NWebmaster;

    NYT::Initialize(argc, argv);

    NYTUtils::DisableLogger();

    NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default();

    time_t periodBegin = (Now() - TInstant::Days(30)).Seconds();
    time_t periodEnd = Now().Seconds();

    TString mrServer;
    TString prefix;
    TString periodBeginStr = date2str(periodBegin);
    TString periodEndStr = date2str(periodEnd);
    TString host;
    TString hostsFile;
    TString queryText;
    TString queriesFile;
    const TString initialOutput = GetOutputName(periodBeginStr, periodEndStr);
    TString output = initialOutput;
    TString newFormatStr;

    opts
        .AddLongOption('S', "server", "MR server")
        .StoreResult(&mrServer)
        .DefaultValue("hahn.yt.yandex.net");

    opts
        .AddLongOption('p', "prefix", "converted searchqueries prefix")
        .StoreResult(&prefix)
        .DefaultValue("//home/webmaster/userstats/clicks_shows");

    opts
        .AddLongOption('b', "period-begin", "grep period begin")
        .StoreResult(&periodBeginStr)
        .DefaultValue(periodBeginStr);

    opts
        .AddLongOption('e', "period-end", "grep period end")
        .StoreResult(&periodEndStr)
        .DefaultValue(periodEndStr);

    opts
        .AddLongOption('o', "output", "Table with results")
        .StoreResult(&output)
        .DefaultValue(output);

    opts
        .AddLongOption('h', "hostname", "Hostname")
        .StoreResult(&host);

    opts
        .AddLongOption('H', "hostnames-file", "File with hostnames")
        .StoreResult(&hostsFile);

    opts
        .AddLongOption('q', "query-text", "Query text to grep")
        .StoreResult(&queryText);

    opts
        .AddLongOption('Q', "query-file", "File with queries to grep divided by \\n")
        .StoreResult(&queriesFile);

    opts
        .AddLongOption('n', "new-format", "New table name format (half-hour queries) (1 for new format)")
        .StoreResult(&newFormatStr);

    THolder<NLastGetopt::TOptsParseResult> parsedOpts(new NLastGetopt::TOptsParseResult(&opts, argc, argv));

    periodBegin = str2date(periodBeginStr);
    periodEnd = str2date(periodEndStr);
    bool newFormat = "1" == newFormatStr || "true" == newFormatStr;
    if (output == initialOutput) {
        output = GetOutputName(periodBeginStr, periodEndStr);
    }

    THashSet<TString> hosts;

    if (!hostsFile.empty()) {
        TFileInput input(hostsFile);
        TString host, asciiHost;

        while (input.ReadLine(host)) {
            if (host.empty()) {
                continue;
            }

            if (NUtils::IDNHostToAscii(host, asciiHost)) {
                hosts.insert(asciiHost);
            } else {
                Cerr << "Unable to parse hostname " << host << Endl;
            }
        }
    }

    if (!host.empty()) {
        hosts.insert(host);
    }

    if (hosts.empty()) {
        Cerr << "ERROR: At least one host must be defined" << Endl;
        return 1;
    }

    TDeque<TString> queries;

    if (!queriesFile.empty()) {
        TFileInput input(queriesFile);
        TString line;

        while (input.ReadLine(line)) {
            if (line.empty()) {
                continue;
            }

            queries.push_back(line);
        }
    }

    if (!queryText.empty()) {
        queries.push_back(queryText);
    }

    TCompactTrie<char>::TBuilder trieBuilder;
    TDeque<TString> hostsArray(hosts.size());

    for (const TString &host : hosts) {
        TString rhost = host;
        ReverseInPlace(rhost);
        trieBuilder.Add(rhost, hostsArray.size());
        hostsArray.push_back(host);
    }

    TBufferStream data;
    trieBuilder.SaveAndDestroy(data);
    TVector<char> hostsTrieStream(data.Buffer().Data(), data.Buffer().Data() + data.Buffer().Size());

    //SubstGlobal(output, "://", "_");

    Cerr << "Cluster " << mrServer << Endl;
    Cerr << "InputPrefix " << prefix << Endl;
    Cerr << "PeriodBegin " << periodBeginStr << Endl;
    Cerr << "PeriodEnd " << periodEndStr << Endl;
    Cerr << "OutputTable " << output << Endl;
    Cerr << "Loaded " << hosts.size() << " hosts" << Endl;
    Cerr << "Loaded " << queries.size() << " queries" << Endl;
    Cerr << "New format " << newFormat << Endl;

    if (queries.empty()) {
        Cerr << "  All queries will be extracted" << Endl;
    }

    NYT::IClientPtr client = NYT::CreateClient(mrServer);
    NYT::ITransactionPtr tx = client->StartTransaction();

    TDeque<NYTUtils::TTableInfo> tablesList;
    if (!NYTUtils::GetTableList(tx, prefix, tablesList, Max<int>())) {
        Cerr << "ERROR: No input tables found" << Endl;
        return 1;
    }

    std::sort(tablesList.begin(), tablesList.end(), [](const NYTUtils::TTableInfo &lhs, const NYTUtils::TTableInfo &rhs) -> bool {
        return lhs.Name < rhs.Name;
    });

    TDeque<time_t> periodsConfig;
    TDeque<TTable<NYT::TYamr>> inputTables;

    const TRegularExpression regex("(clicks_shows/\\d+_\\d+/for_wmc/web)$");
    for (const NYTUtils::TTableInfo &table : tablesList) {
        time_t matchedPeriod;

        TVector<TString> hits;
        if (regex.GetMatches(table.Name, hits) != 1) {
            continue;
        }

        if (MatchPeriod(table.Name, periodBegin, periodEnd, matchedPeriod, newFormat)) {
            inputTables.emplace_back(tx, table.Name);
            periodsConfig.push_back(matchedPeriod);
            Cerr << "Input " << table.Name << Endl;
        }
    }
    Cerr << "Output " << output << Endl;

    Cerr << "Extracting(1)..." << Endl;
    TMapCmd<TMapQueries>(tx, new TMapQueries(hostsArray, hostsTrieStream, periodsConfig, queries))
        .MemoryLimit(3_GBs)
        .Inputs(inputTables)
        .Output(TTable<NProto::TQuery>(tx, output))
        .Do()
    ;

    Cerr << "Sorting..." << Endl;
    TSortCmd<NProto::TQuery>(tx, TTable<NProto::TQuery>(tx, output))
        .By({"Host", "Path", "Query", "RegionId", "Position"})
        .Do()
    ;

    tx->Commit();
    Cerr << "Done" << Endl;
}
