#include <util/draft/datetime.h>
#include <util/generic/deque.h>
#include <util/generic/set.h>
#include <util/generic/hash_set.h>
#include <util/stream/file.h>
#include <util/string/join.h>
#include <util/string/printf.h>
#include <util/string/reverse.h>
#include <util/string/subst.h>
#include <library/cpp/string_utils/url/url.h>
#include <util/system/user.h>

#include <mapreduce/yt/interface/client.h>

#include <library/cpp/containers/comptrie/comptrie.h>
#include <library/cpp/containers/comptrie/prefix_iterator.h>
#include <library/cpp/getopt/last_getopt.h>

#include <wmconsole/version3/protos/queries2.pb.h>

#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include <wmconsole/version3/searchqueries-mr/batch_matcher.cpp>

namespace NWebmaster {

//using TPrefixIterator = TPrefixIterator<TCompactTrie<char>>;

bool IsSubdomain(const TString &subdomain, const TString &domain) {
    size_t pos = subdomain.rfind(domain);

    if (pos == TString::npos) {
        return false;
    }

    if ((pos + domain.size()) != subdomain.size()) {
        return false;
    }

    if (pos == 0) {
        return true;
    }

    char symb = subdomain[pos - 1];

    if (symb == '/' || symb == '.') {
        return true;
    }

    return false;
}

// Mapper that filters YaMR rows by host and by query text and flattens every
// accepted QueryMessage into one output row per (region, position) pair.
//
// The fields listed in Y_SAVELOAD_JOB (Hosts, TrieStream, PeriodsConfig, Queries)
// are the job state; Trie and Matcher are not listed there and are rebuilt from
// that state in Start().
struct TMapQueries : public NYT::IMapper<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TNode>> {
    Y_SAVELOAD_JOB(Hosts, TrieStream, PeriodsConfig, Queries)

public:
    TMapQueries() = default;

    // hosts         - host names to keep; an empty list disables host filtering.
    // trieStream    - serialized compact trie; values are used as indices into `hosts`.
    // periodsConfig - per-input-table value written to the "Period" column
    //                 (indexed by the input table index).
    // queries       - query patterns handed to TBatchMatcher; an empty list
    //                 disables query filtering.
    TMapQueries(const TDeque<TString> &hosts, const TVector<char> &trieStream, const TDeque<time_t> &periodsConfig, const TDeque<TString> &queries)
        : Hosts(hosts)
        , TrieStream(trieStream)
        , PeriodsConfig(periodsConfig)
        , Queries(queries)
    {
    }

public:
    // Rebuilds the derived, non-serialized state.
    void Start(TWriter* /*writer*/) override {
        // data() is well defined for an empty vector, unlike &TrieStream[0].
        Trie.Init(TrieStream.data(), TrieStream.size());
        Matcher.Reset(new TBatchMatcher(Queries));
    }

    void Do(TReader *input, TWriter *output) override {
        for (; input->IsValid(); input->Next()) {
            const NYT::TYaMRRow &row = input->GetRow();
            const TString host = TString{row.Key};

            if (!IsWantedHost(host)) {
                continue;
            }

            // The parse result is deliberately ignored, as in the original
            // best-effort behavior.
            proto::queries2::QueryMessage msg;
            Y_PROTOBUF_SUPPRESS_NODISCARD msg.ParseFromArray(row.Value.data(), row.Value.length());

            if (!Queries.empty() && !Matcher->Matches(msg.query())) {
                continue;
            }

            TString urlHost, urlPath;
            SplitUrlToHostAndPath(msg.url(), urlHost, urlPath);

            // The period depends only on the table the row came from.
            const ui64 period = static_cast<ui64>(PeriodsConfig[input->GetTableIndex()]);

            for (int i = 0; i < msg.reports_by_region_size(); i++) {
                const auto &region = msg.reports_by_region(i);
                for (int p = 0; p < region.position_info_size(); p++) {
                    const auto &position = region.position_info(p);

                    output->AddRow(NYT::TNode()
                        ("Host", urlHost)
                        ("Path", urlPath)
                        ("Query", msg.query())
                        ("CorrectedQuery", msg.corrected_query())
                        ("RegionId", region.region_id())
                        ("IsMobile", region.is_mobile())
                        ("IsPad", region.is_pad())
                        ("Position", position.position())
                        ("Shows", position.shows_count())
                        ("Clicks", position.clicks_count())
                        ("Period", period)
                    );
                }
            }
        }
    }

private:
    // Returns true when `host` passes the host filter: either no hosts were
    // configured, or one of the trie entries that is a prefix of the reversed
    // host name refers to a configured host for which IsSubdomain() holds.
    bool IsWantedHost(const TString &host) {
        if (Hosts.empty()) {
            return true;
        }

        TString rhost = host;
        ReverseInPlace(rhost);

        for (TPrefixIterator<TCompactTrie<char>> it = MakePrefixIterator(Trie, rhost.data(), rhost.size()); it; ++it) {
            size_t hostNo = 0;
            it.GetValue(hostNo);

            if (IsSubdomain(host, Hosts[hostNo])) {
                return true;
            }
        }

        return false;
    }

public:
    TDeque<TString> Hosts;
    TVector<char> TrieStream;
    TCompactTrie<char> Trie;
    TDeque<time_t> PeriodsConfig;
    TDeque<TString> Queries;
    THolder<TBatchMatcher> Matcher;
};

REGISTER_MAPPER(TMapQueries)

} //namespace NWebmaster

// Checks whether a source table falls into the requested period.
//
// The table name is matched against "(\d+)_(\d+)"; the first extracted value is
// converted with str2date() and compared against [periodBegin, periodEnd]
// (both bounds inclusive).
//
// tableName   - name of the source table; an empty name never matches.
// periodBegin - lower bound of the period.
// periodEnd   - upper bound of the period.
// matched     - receives the table date; written only when true is returned.
//
// Returns true when the table date lies inside the period. Returns false when
// the name is empty, cannot be parsed (an error is logged to Cerr), or the date
// is outside the period.
bool MatchPeriod(const TString &tableName, time_t periodBegin, time_t periodEnd, time_t &matched) {
    if (tableName.empty()) {
        return false;
    }

    NWebmaster::TRegularExpression regex("(\\d+)_(\\d+)");
    TVector<TString> period;
    if (regex.GetMatches(tableName, period) != 2 || period.empty()) {
        Cerr << "ERROR: Unable to parse source table name" << Endl;
        // Without this return the code below would read period[0] from a
        // vector that may hold no matches.
        return false;
    }

    const time_t tableDate = str2date(period[0]);
    if (tableDate < periodBegin || tableDate > periodEnd) {
        return false;
    }

    matched = tableDate;
    return true;
}

// Entry point: greps converted search-query tables for the given hosts and
// queries within a period and writes a sorted report table.
//
// Steps:
//   1. Parse command-line options (cluster, input prefix, period, output table,
//      host / hosts file, query text / queries file).
//   2. Load the host set and the query list.
//   3. Select the input tables whose names match the period.
//   4. Build a trie of reversed host names and run the map + sort operations
//      inside a single transaction.
//
// Returns 1 when the input table list cannot be obtained, 0 otherwise.
int main(int argc, const char **argv) {
    using namespace NWebmaster;

    NYT::Initialize(argc, argv);

    NYTUtils::DisableLogger();

    NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default();

    // Default period: the last 30 days up to now.
    time_t periodBegin = (Now() - TInstant::Days(30)).Seconds();
    time_t periodEnd = Now().Seconds();

    TString mrServer;
    TString prefix;
    TString periodBeginStr = date2str(periodBegin);
    TString periodEndStr = date2str(periodEnd);
    TString host;
    TString hostsFile;
    TString queryText;
    TString queriesFile;
    TString output = "//tmp/webmaster/sqgrep2/" + GetUsername() + "/report_" + ToString(Now().Seconds());

    opts
        .AddLongOption('S', "server", "MR server")
        .StoreResult(&mrServer)
        .DefaultValue("hahn.yt.yandex.net");

    opts
        .AddLongOption('p', "prefix", "converted searchqueries prefix")
        .StoreResult(&prefix)
        .DefaultValue("//home/webmaster/prod/searchqueries/converted_v4");

    opts
        .AddLongOption('b', "period-begin", "grep period begin")
        .StoreResult(&periodBeginStr)
        .DefaultValue(periodBeginStr);

    opts
        .AddLongOption('e', "period-end", "grep period end")
        .StoreResult(&periodEndStr)
        .DefaultValue(periodEndStr);

    opts
        .AddLongOption('o', "output", "Table with results")
        .StoreResult(&output)
        .DefaultValue(output);

    opts
        .AddLongOption('h', "hostname", "Hostname")
        .StoreResult(&host);

    opts
        .AddLongOption('H', "hostnames-file", "File with hostnames")
        .StoreResult(&hostsFile);

    opts
        .AddLongOption('q', "query-text", "Query text to grep")
        .StoreResult(&queryText);

    opts
        .AddLongOption('Q', "query-file", "File with queries to grep divided by \\n")
        .StoreResult(&queriesFile);

    THolder<NLastGetopt::TOptsParseResult> parsedOpts(new NLastGetopt::TOptsParseResult(&opts, argc, argv));

    periodBegin = str2date(periodBeginStr);
    periodEnd = str2date(periodEndStr);

    // Hosts come from the optional file (one per line, blank lines skipped)
    // plus the optional single host given on the command line.
    THashSet<TString> hosts;

    if (!hostsFile.empty()) {
        TUnbufferedFileInput input(hostsFile);
        TString line;

        while (input.ReadLine(line)) {
            if (line.empty()) {
                continue;
            }

            hosts.insert(line);
        }
    }

    if (!host.empty()) {
        hosts.insert(host);
    }

    // Queries are collected the same way; order and duplicates are preserved.
    TDeque<TString> queries;

    if (!queriesFile.empty()) {
        TUnbufferedFileInput input(queriesFile);
        TString line;

        while (input.ReadLine(line)) {
            if (line.empty()) {
                continue;
            }

            queries.push_back(line);
        }
    }

    if (!queryText.empty()) {
        queries.push_back(queryText);
    }

    //SubstGlobal(output, "://", "_");

    Cerr << "Cluster " << mrServer << Endl;
    Cerr << "InputPrefix " << prefix << Endl;
    Cerr << "PeriodBegin " << periodBeginStr << Endl;
    Cerr << "PeriodEnd " << periodEndStr << Endl;
    Cerr << "OutputTable " << output << Endl;
    Cerr << "Loaded " << hosts.size() << " hosts" << Endl;
    if (hosts.empty()) {
        Cerr << "  All hosts will be extracted" << Endl;
    }

    Cerr << "Loaded " << queries.size() << " queries" << Endl;
    if (queries.empty()) {
        Cerr << "  All queries will be extracted" << Endl;
    }

    NYT::IClientPtr client = NYT::CreateClient(mrServer);
    NYT::ITransactionPtr tx = client->StartTransaction();

    NWebmaster::TOpRunner runner(tx);

    TDeque<NYTUtils::TTableInfo> tables;
    if (!NYTUtils::GetTableList(client, prefix, tables)) {
        Cerr << "ERROR: No input tables found" << Endl;
        return 1;
    }

    std::sort(tables.begin(),tables.end(), [](const NYTUtils::TTableInfo &lhs, const NYTUtils::TTableInfo &rhs) -> bool {
        return lhs.Name < rhs.Name;
    });

    // periodsConfig[i] is the period of the i-th table added as input, so the
    // two must be filled in lock-step.
    TDeque<time_t> periodsConfig;

    for (const NYTUtils::TTableInfo& table : tables) {
        time_t matchedPeriod;
        if (MatchPeriod(table.Name, periodBegin, periodEnd, matchedPeriod)) {
            runner.InputYaMR(table.Name);
            periodsConfig.push_back(matchedPeriod);
        }
    }

    NYT::TTableSchema schema = NYT::TTableSchema()
        .AddColumn(NYT::TColumnSchema().Name("Host").Type(NYT::VT_STRING))
        .AddColumn(NYT::TColumnSchema().Name("Path").Type(NYT::VT_STRING))
        .AddColumn(NYT::TColumnSchema().Name("Query").Type(NYT::VT_STRING))
        .AddColumn(NYT::TColumnSchema().Name("CorrectedQuery").Type(NYT::VT_STRING))
        .AddColumn(NYT::TColumnSchema().Name("RegionId").Type(NYT::VT_UINT64))
        .AddColumn(NYT::TColumnSchema().Name("IsMobile").Type(NYT::VT_BOOLEAN))
        .AddColumn(NYT::TColumnSchema().Name("IsPad").Type(NYT::VT_BOOLEAN))
        .AddColumn(NYT::TColumnSchema().Name("Position").Type(NYT::VT_UINT64))
        .AddColumn(NYT::TColumnSchema().Name("Shows").Type(NYT::VT_UINT64))
        .AddColumn(NYT::TColumnSchema().Name("Clicks").Type(NYT::VT_UINT64))
        .AddColumn(NYT::TColumnSchema().Name("Period").Type(NYT::VT_UINT64))
    ;

    // Each host is stored in the trie under its reversed name; the stored value
    // is the host's index in hostsArray. The array must start empty: pre-sizing
    // it would leave a block of unused empty strings in front of the real hosts.
    TCompactTrie<char>::TBuilder trieBuilder;
    TDeque<TString> hostsArray;

    for (const TString &hostName : hosts) {
        TString rhost = hostName;
        ReverseInPlace(rhost);
        trieBuilder.Add(rhost, hostsArray.size());
        hostsArray.push_back(hostName);
    }

    TBufferStream data;
    trieBuilder.SaveAndDestroy(data);
    TVector<char> hostsTrieStream(data.Buffer().Data(), data.Buffer().Data() + data.Buffer().Size());

    runner
        .OutputNode(NYT::TRichYPath(output).Schema(schema))
        .MemoryLimit(MEMORY_LIMIT_2GB)
        .Map(new TMapQueries(hostsArray, hostsTrieStream, periodsConfig, queries))
        .SortBy("Host", "Query", "Path", "RegionId", "Position")
        .Sort(output)
    ;

    tx->Commit();

    return 0;
}
