#include <library/cpp/containers/comptrie/comptrie_trie.h>
#include <library/cpp/containers/comptrie/prefix_iterator.h>

#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/protos/exported.pb.h>
#include <util/string/reverse.h>
#include <util/thread/pool.h>

#include "host2vec.h"
#include "rivals.h"
#include "source_tables.h"

namespace NWebmaster {

// Reducer (reduce by F_QUERY): sums F_SHOWS and F_CLICKS over all rows of one
// query and emits a single aggregated row, but only when the query collected
// more than one click and more than five shows.
struct TPreparePreFilterReducer : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    void Do(TReader *input, TWriter *output) override {
        // The key value is copied from the first row before the reader advances.
        const TString query = input->GetRow()[F_QUERY].AsString();

        size_t totalShows = 0;
        size_t totalClicks = 0;
        while (input->IsValid()) {
            const NYT::TNode &current = input->GetRow();
            totalClicks += current[F_CLICKS].AsUint64();
            totalShows += current[F_SHOWS].AsUint64();
            input->Next();
        }

        // Queries below either threshold produce no output at all.
        const bool belowThreshold = totalClicks <= 1 || totalShows <= 5;
        if (belowThreshold) {
            return;
        }

        output->AddRow(NYT::TNode()
            (F_QUERY, query)
            (F_SHOWS, totalShows)
            (F_CLICKS, totalClicks)
        );
    }
};

REGISTER_REDUCER(TPreparePreFilterReducer)

// ReduceBy F_QUERY
// Two-table filter join. Rows of table 1 (queries to filter) are passed through
// only after a row of table 0 (good queries) has been seen for the same key;
// table-1 rows that arrive before any table-0 row are dropped.
struct TPreFilterQueriesReducer : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    void Do(TReader *input, TWriter *output) override {
        const size_t TABLENO_GOOD_QUERIES = 0;
        const size_t TABLENO_QUERIES_TO_FILTER = 1;

        bool queryIsGood = false;
        for (; input->IsValid(); input->Next()) {
            const size_t tableNo = input->GetTableIndex();

            if (tableNo == TABLENO_GOOD_QUERIES) {
                // Presence of the key in the "good" table is all that matters.
                queryIsGood = true;
            } else if (tableNo == TABLENO_QUERIES_TO_FILTER) {
                if (queryIsGood) {
                    output->AddRow(input->GetRow());
                }
            } else {
                ythrow yexception() << "unknown table passed";
            }
        }
    }
};

REGISTER_REDUCER(TPreFilterQueriesReducer)

// Returns the set of exact URLs that the extraction mappers skip
// (they test SkipListUrls.contains(msg.url())).
// The set is a function-local static, built on first call and shared afterwards.
// Each URL is listed twice: once with the http scheme and once with https.
const THashSet<TString> &GetUrlsSkipList() {
    const static THashSet<TString> skipListUrls = {
        // http variants
        "http://maps.yandex.ru/geo_wizard",
        "http://yandex.by/imageswizard",
        "http://yandex.by/video/videoblend",
        "http://yandex.com.tr/imageswizard",
        "http://yandex.com.tr/video/videoblend",
        "http://yandex.com/imageswizard",
        "http://yandex.com/video/videoblend",
        "http://yandex.kz/imageswizard",
        "http://yandex.kz/video/videoblend",
        "http://yandex.ru/imageswizard",
        "http://yandex.ru/video/videoblend",
        "http://yandex.ua/imageswizard",
        "http://yandex.ua/video/videoblend",

        // https variants of the same URLs
        "https://maps.yandex.ru/geo_wizard",
        "https://yandex.by/imageswizard",
        "https://yandex.by/video/videoblend",
        "https://yandex.com.tr/imageswizard",
        "https://yandex.com.tr/video/videoblend",
        "https://yandex.com/imageswizard",
        "https://yandex.com/video/videoblend",
        "https://yandex.kz/imageswizard",
        "https://yandex.kz/video/videoblend",
        "https://yandex.ru/imageswizard",
        "https://yandex.ru/video/videoblend",
        "https://yandex.ua/imageswizard",
        "https://yandex.ua/video/videoblend",
    };

    return skipListUrls;
}

// Returns the set of hosts (scheme + hostname) that the extraction mappers skip.
// It is checked both against the row key host and against the host part of the
// result URL. The set is a function-local static, built on first call.
// NOTE(review): the trailing numeric comments are kept from the original;
// presumably a per-host volume figure used to pick these hosts — TODO confirm.
// Entries without a number belong to the scheme variant of the entry above.
const THashSet<TString> &GetHostsSkipList() {
    const static THashSet<TString> skipListHosts = {
        "https://zvooq.online",         //448151512
        "https://my.mail.ru",           //489498566
        "https://www.drive2.ru",        //493962068
        "http://yandex.ua",             //504255526
        "https://yandex.ua",
        "http://muz-color.ru",          //505244948
        "https://muz-color.ru",
        "https://studfiles.net",        //549845033
        "https://www.kakprosto.ru",     //563317940
        "https://dic.academic.ru",      //587226757
        "https://ru-ru.facebook.com",   //612401618
        "https://znanija.com",          //618813646
        "https://www.syl.ru",           //639440230
        "https://yandex.ru",            //698259132
        "http://www.bolshoyvopros.ru",  //784847716
        "https://www.bolshoyvopros.ru",
        "https://www.kinopoisk.ru",     //785748004
        "http://irecommend.ru",         //976565608
        "https://irecommend.ru",
        "http://otzovik.com",           //1005541231
        "https://otzovik.com",
        "https://www.facebook.com",     //1253646221
        "https://market.yandex.ru",     //1306298719
        "https://twitter.com",          //1350820983
        "https://www.instagram.com",    //1378130602
        "https://www.avito.ru",         //1408524912
        "https://news.yandex.ru",       //1611699865
        "https://otvet.mail.ru",        //1682321256
        "https://ok.ru",                //1853164848
        "http://maps.yandex.ru",        //2097511826
        "https://maps.yandex.ru",
        "http://fb.ru",                 //2314880763
        "https://fb.ru",
        "https://ru.wikipedia.org",     //3445565485
        "https://vk.com",               //4982471539
        "https://www.youtube.com",      //5290216353
        "http://yandex.ru",             //14187683857
    };

    return skipListHosts;
}

// Returns the filter patterns used to reject "junk" queries; the mappers feed
// them to a TBatchMatcher and drop every query that matches.
// IMPORTANT: there are no commas between the string literals below, so the
// compiler concatenates them into ONE string. The deque therefore holds a
// single element: an alternation ("a|b|c...") of all the fragments.
// NOTE(review): the fragments look like regular-expression syntax (search
// operators such as "site:", "url:", "&&", a leading "!"); the exact dialect
// depends on TBatchMatcher, which is not visible here — confirm before editing.
const TDeque<TString> &GetQueriesJunkFilters() {
    const static TDeque<TString> filters = {
        "<<"
        "|\\&\\&"
        "|\\~\\~"
        "|/\\+"
        "|/-"
        "|\\|"
        "|title:"
        "|url:"
        "|site:"
        "|inurl:"
        "|host:"
        "|rhost:"
        "|domain:"
        "|mime:"
        "|lang:"
        "|date:"
        "|cat:"
        "|^\\s*!"
        "|^\"\\s*!"
    };

    return filters;
}

// Constructs the extraction mapper.
//   trieStream     - serialized trie of reversed host names; Start() binds Trie to it
//   periodsConfig  - stored in PeriodsConfig (not used by the methods visible in this file section)
//   queries        - patterns for the optional query matcher; when empty, Do() skips that check
//   positionOffset - added to every position before it is weighted by shows
// The URL and host skip lists are taken from the process-wide static sets.
TMapExtractDirectQueries::TMapExtractDirectQueries(const TVector<char> &trieStream, const TDeque<time_t> &periodsConfig,
                                                   const TDeque<TString> &queries, int positionOffset)
    : TrieStream(trieStream)
    , PeriodsConfig(periodsConfig)
    , Queries(queries)
    , SkipListUrls(GetUrlsSkipList())
    , SkipListHosts(GetHostsSkipList())
    , PositionOffset(positionOffset)
{
}

// Initializes the members used by Do(): binds Trie to the serialized bytes kept
// in TrieStream and builds the query matcher from Queries.
void TMapExtractDirectQueries::Start(TWriter* /*writer*/) {
    // data() rather than &TrieStream[0]: indexing an empty vector is undefined
    // behaviour, while data() is valid for any size.
    Trie.Init(TrieStream.data(), TrieStream.size());
    Matcher.Reset(new TBatchMatcher(Queries));
}

// Mapper body. For every input row (key = host, value = serialized QueryMessage)
// decides whether the row is of interest and, if so, emits per-region rows via
// ExtractForGroups(). A row is dropped when:
//   - its host is in the host skip list;
//   - no trie entry is a suffix of the host that also passes NUtils::IsSubdomain;
//   - the message is flagged as a porno query;
//   - the corrected query matches the junk filters;
//   - the result URL cannot be split, or the URL / its host is in a skip list;
//   - Queries is non-empty and the corrected query does not match Matcher.
void TMapExtractDirectQueries::Do(TReader *input, TWriter *output) {
    TBatchMatcher junkMatcher(GetQueriesJunkFilters());

    for (; input->IsValid(); input->Next()) {
        const NYT::TYaMRRow &row = input->GetRow();

        const TString host = TString{row.Key};
        if (SkipListHosts.contains(host)) {
            continue;
        }

        // The trie stores reversed host names, so a trie prefix of the reversed
        // host corresponds to a suffix of the host itself.
        TString rhost = host;
        ReverseInPlace(rhost);

        bool found = false;
        for (auto it = MakePrefixIterator(Trie, rhost.data(), rhost.size()); it; ++it) {
            const TString owner = host.substr(host.size() - it.GetPrefixLen());

            if (NUtils::IsSubdomain(host, owner)) {
                // Only the existence of a matching owner is needed here.
                found = true;
                break;
            }
        }

        if (!found) {
            continue;
        }

        proto::queries2::QueryMessage msg;
        // The parse result is deliberately ignored (best effort).
        Y_PROTOBUF_SUPPRESS_NODISCARD msg.ParseFromArray(row.Value.data(), row.Value.length());

        if (msg.GetIsPornoQuery()) {
            continue;
        }

        if (junkMatcher.Matches(msg.corrected_query())) {
            continue;
        }

        TString urlHost, urlPath;
        if (!NUtils::SplitUrl(msg.url(), urlHost, urlPath)) {
            continue;
        }

        if (SkipListUrls.contains(msg.url())) {
            continue;
        }

        if (SkipListHosts.contains(urlHost)) {
            continue;
        }

        if (!Queries.empty() && !Matcher->Matches(msg.corrected_query())) {
            continue;
        }

        ExtractForGroups(host, msg, output);
    }
}

// Emits one output row per region report of the message. For each region the
// shows and clicks of all position entries are summed, and F_POSITION receives
// the show-weighted position sum: sum((position + PositionOffset) * shows).
// Dividing by the total shows is left to a later stage.
void TMapExtractDirectQueries::ExtractForGroups(const TString &host, const proto::queries2::QueryMessage &msg, TWriter *output) {
    for (const auto &regionReport : msg.reports_by_region()) {
        size_t totalShows = 0;
        size_t totalClicks = 0;
        size_t positionTimesShows = 0;

        for (const auto &info : regionReport.position_info()) {
            totalClicks += info.clicks_count();
            totalShows += info.shows_count();
            positionTimesShows += (info.position() + PositionOffset) * info.shows_count();
        }

        output->AddRow(NYT::TNode()
           (NHost2Vec::F_HOST, host)
           (F_IS_NAV, msg.is_nav())
           (F_QUERY, msg.corrected_query())
           (F_URL, msg.url())
           (F_REGION_ID, regionReport.region_id())
           (F_POSITION, positionTimesShows) // weighted position, not yet normalized
           (F_SHOWS, totalShows)
           (F_CLICKS, totalClicks)
        );
    }
}

REGISTER_MAPPER(TMapExtractDirectQueries)

// Constructs the counting mapper. Parameters mirror TMapExtractDirectQueries:
//   trieStream     - serialized trie of reversed host names; Start() binds Trie to it
//   periodsConfig  - stored in PeriodsConfig (not used by the methods visible in this file section)
//   queries        - patterns for the optional query matcher; when empty, Do() skips that check
//   positionOffset - stored in PositionOffset (Do() of this mapper does not read it)
// The URL and host skip lists are taken from the process-wide static sets.
TMapCountDirectQueries::TMapCountDirectQueries(const TVector<char> &trieStream, const TDeque<time_t> &periodsConfig,
                                                   const TDeque<TString> &queries, int positionOffset)
    : TrieStream(trieStream)
    , PeriodsConfig(periodsConfig)
    , Queries(queries)
    , SkipListUrls(GetUrlsSkipList())
    , SkipListHosts(GetHostsSkipList())
    , PositionOffset(positionOffset)
{
}

// Initializes the members used by Do(): binds Trie to the serialized bytes kept
// in TrieStream and builds the query matcher from Queries.
void TMapCountDirectQueries::Start(TWriter* /*writer*/) {
    // data() rather than &TrieStream[0]: indexing an empty vector is undefined
    // behaviour, while data() is valid for any size.
    Trie.Init(TrieStream.data(), TrieStream.size());
    Matcher.Reset(new TBatchMatcher(Queries));
}

// Mapper body. Applies the same row filters as TMapExtractDirectQueries::Do
// (host/URL skip lists, trie owner match, porno flag, junk filters, optional
// query matcher) and accumulates total shows and clicks per host in memory.
// After the whole input has been read, one row per host is written with
// F_HOST, F_SHOWS and F_CLICKS.
void TMapCountDirectQueries::Do(TReader *input, TWriter *output) {
    TBatchMatcher junkMatcher(GetQueriesJunkFilters());

    struct TRecord {
        size_t Clicks = 0;
        size_t Shows = 0;
    };

    THashMap<TString, TRecord> records;
    for (; input->IsValid(); input->Next()) {
        const NYT::TYaMRRow &row = input->GetRow();

        const TString host = TString{row.Key};
        if (SkipListHosts.contains(host)) {
            continue;
        }

        // The trie stores reversed host names, so a trie prefix of the reversed
        // host corresponds to a suffix of the host itself.
        TString rhost = host;
        ReverseInPlace(rhost);

        bool found = false;
        for (auto it = MakePrefixIterator(Trie, rhost.data(), rhost.size()); it; ++it) {
            const TString owner = host.substr(host.size() - it.GetPrefixLen());

            if (NUtils::IsSubdomain(host, owner)) {
                // Only the existence of a matching owner is needed here.
                found = true;
                break;
            }
        }

        if (!found) {
            continue;
        }

        proto::queries2::QueryMessage msg;
        // The parse result is deliberately ignored (best effort).
        Y_PROTOBUF_SUPPRESS_NODISCARD msg.ParseFromArray(row.Value.data(), row.Value.length());

        if (msg.GetIsPornoQuery()) {
            continue;
        }

        if (junkMatcher.Matches(msg.corrected_query())) {
            continue;
        }

        TString urlHost, urlPath;
        if (!NUtils::SplitUrl(msg.url(), urlHost, urlPath)) {
            continue;
        }

        if (SkipListUrls.contains(msg.url())) {
            continue;
        }

        if (SkipListHosts.contains(urlHost)) {
            continue;
        }

        if (!Queries.empty() && !Matcher->Matches(msg.corrected_query())) {
            continue;
        }

        // A record is created for the host even if the message has no regions.
        TRecord &record = records[host];
        for (int i = 0; i < msg.reports_by_region_size(); i++) {
            const auto &region = msg.reports_by_region(i);
            for (int p = 0; p < region.position_info_size(); p++) {
                const auto &position = region.position_info(p);
                record.Shows += position.shows_count();
                record.Clicks += position.clicks_count();
            }
        }
    }

    for (const auto &obj : records) {
        output->AddRow(NYT::TNode()
           (NHost2Vec::F_HOST, obj.first)
           (F_SHOWS, obj.second.Shows)
           (F_CLICKS, obj.second.Clicks)
        );
    }
}

REGISTER_MAPPER(TMapCountDirectQueries)

//ReduceBy F_HOST, F_QUERY, F_URL, F_REGION_ID
// Collapses all rows of one key into a single row. F_SHOWS, F_CLICKS and
// F_POSITION are replaced by their sums; every other column keeps the value
// from the first row of the group. F_POSITION stays a show-weighted sum here:
// unlike TReduceExtractedQueries::Do, no division by shows is performed.
void TCombineExtractedQueries::Do(TReader *input, TWriter *output) {
    // Copy of the first row, used as the template for the output row.
    NYT::TNode result = input->GetRow();

    size_t shows = 0;
    size_t clicks = 0;
    size_t weightedPosition = 0;
    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &current = input->GetRow();
        shows += current[F_SHOWS].AsUint64();
        clicks += current[F_CLICKS].AsUint64();
        weightedPosition += current[F_POSITION].AsUint64(); // input F_POSITION = weighted position
    }

    output->AddRow(result
       (F_SHOWS, shows)
       (F_CLICKS, clicks)
       (F_POSITION, weightedPosition) // output F_POSITION = still the weighted sum
    );
}

REGISTER_REDUCER(TCombineExtractedQueries)

//ReduceBy F_HOST, F_QUERY, F_URL, F_REGION_ID
// Collapses all rows of one key into a single row. F_SHOWS and F_CLICKS are
// summed, and F_POSITION becomes the integer average position
// (sum of weighted positions / total shows). Every other column keeps the
// value from the first row of the group. Groups with zero shows are dropped,
// which also avoids a division by zero.
void TReduceExtractedQueries::Do(TReader *input, TWriter *output) {
    // Copy of the first row, used as the template for the output row.
    NYT::TNode result = input->GetRow();

    size_t shows = 0;
    size_t clicks = 0;
    size_t weightedPosition = 0;
    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &current = input->GetRow();
        shows += current[F_SHOWS].AsUint64();
        clicks += current[F_CLICKS].AsUint64();
        weightedPosition += current[F_POSITION].AsUint64(); // input F_POSITION = weighted position
    }

    if (shows == 0) {
        return;
    }

    const size_t position = weightedPosition / shows;
    output->AddRow(result
       (F_SHOWS, shows)
       (F_CLICKS, clicks)
       (F_POSITION, position) // output F_POSITION = normalized position
    );
}

REGISTER_REDUCER(TReduceExtractedQueries)

//reduce by any
// Emits only the first row of the key group; the remaining rows of the group
// are never read by this method.
void TReduceUnique::Do(TReader *input, TWriter *output) {
    const NYT::TNode &firstRow = input->GetRow();
    output->AddRow(firstRow);
}

REGISTER_REDUCER(TReduceUnique)

// Constructs the grouping mapper.
//   trieStream    - serialized trie of reversed owner host names; Start() binds Trie to it
//   ownerToGroups - owner -> list of group ids the owner belongs to
//   hashToGroup   - group id -> group name
//   shardNo       - only groups with (groupId % SHARDS_COUNT) == shardNo are emitted by Do()
TMapGroupQueries::TMapGroupQueries(const TVector<char> &trieStream, const THashMap<TString, TVector<ui32>> &ownerToGroups, const THashMap<ui32, TString> &hashToGroup, int shardNo)
    : TrieStream(trieStream)
    , OwnerToGroups(ownerToGroups)
    , HashToGroup(hashToGroup)
    , ShardNo(shardNo)
{
}

// Binds Trie to the serialized bytes kept in TrieStream so that Do() can use it.
void TMapGroupQueries::Start(TWriter* /*writer*/) {
    // data() rather than &TrieStream[0]: indexing an empty vector is undefined
    // behaviour, while data() is valid for any size.
    Trie.Init(TrieStream.data(), TrieStream.size());
}

// Mapper body. For every input row finds the owners of the row's host (trie
// entries that are suffixes of the host and pass NUtils::IsSubdomain), looks up
// the groups of each owner and emits one row per group that belongs to this
// mapper's shard. Query and URL texts are replaced by their 64-bit FNV hashes
// (F_QUERY_ID, F_URL_ID). Title / meta-description hashing is currently
// disabled and not emitted.
void TMapGroupQueries::Do(TReader *input, TWriter *output) {
    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();

        // These nodes are forwarded to the output unchanged.
        const NYT::TNode &clicks = row[F_CLICKS];
        const NYT::TNode &shows = row[F_SHOWS];
        const NYT::TNode &regionId = row[F_REGION_ID];
        const NYT::TNode &position = row[F_POSITION];

        // References stay valid until input->Next() at the end of the iteration.
        const TString &host = row[NHost2Vec::F_HOST].AsString();
        const TString &query = row[F_QUERY].AsString();
        const TString &url = row[F_URL].AsString();
        const bool isNav = row[F_IS_NAV].AsBool();

        // The trie stores reversed names, so its prefixes are suffixes of host.
        TString rhost = host;
        ReverseInPlace(rhost);

        const ui64 queryId = FnvHash<ui64>(query.data(), query.size());
        const ui64 urlId = FnvHash<ui64>(url.data(), url.size());

        for (auto it = MakePrefixIterator(Trie, rhost.data(), rhost.size()); it; ++it) {
            const TString owner = host.substr(host.size() - it.GetPrefixLen());

            if (!NUtils::IsSubdomain(host, owner)) {
                continue;
            }

            const auto groupIt = OwnerToGroups.find(owner);
            if (groupIt == OwnerToGroups.end()) {
                continue;
            }

            for (ui32 groupId : groupIt->second) {
                if ((groupId % SHARDS_COUNT) != ShardNo) {
                    continue; // handled by another shard
                }

                // NOTE(review): operator[] yields an empty name (and inserts it)
                // when groupId is missing from HashToGroup — kept as in the
                // original; confirm that every id is always present.
                const TString &group = HashToGroup[groupId];

                output->AddRow(NYT::TNode()
                    (NHost2Vec::F_GROUP, group)
                    (F_QUERY_ID, queryId)
                    (F_REGION_ID, regionId)
                    (F_POSITION, position)
                    (NHost2Vec::F_HOST, host)
                    (F_SHOWS, shows)
                    (F_CLICKS, clicks)
                    (F_URL_ID, urlId)
                    (F_IS_NAV, isNav)
                );
            }
        }
    }
}

REGISTER_MAPPER(TMapGroupQueries)

//ReduceBy Group, QueryId, RegionId
// For one (group, query, region) key:
//   1. accumulates shows / clicks per host; for hosts that pass
//      NUtils::IsSubdomain(host, group) also tracks the minimal position and the
//      maximum shows per URL id;
//   2. picks the URL id with the most shows among those hosts;
//   3. aggregates "rivals" = all hosts that do NOT pass IsSubdomain(host, group):
//      total shows, total clicks, number of rivals with shows, number with clicks;
//   4. emits one row per host whose host2vec domain equals the group, provided
//      the rivals have at least one click and more than one rival has shows.
//      With FilterQueries set, hosts whose minimal position is below 4 are skipped.
// The rivals aggregate does not depend on the host being emitted, so it is
// computed once instead of once per host.
// NOTE(review): rows are emitted in hostCounters iteration order; presumably the
// order of rows within one key is irrelevant to consumers — confirm.
void TReduceProcessGroups::Do(TReader *input, TWriter *output) {
    struct THostRecord {
        size_t Shows = 0;
        size_t Clicks = 0;
        size_t MinPosition = 49; // starting value for the minimum; origin of 49 is not visible here
    };

    THashMap<TString, THostRecord> hostCounters;
    THashMap<ui64, size_t> urlsIds; // url id -> max shows seen in a single row

    // Key columns and F_IS_NAV are taken from the first row of the group.
    const TString group = input->GetRow()[NHost2Vec::F_GROUP].AsString();
    const bool isNav = input->GetRow()[F_IS_NAV].AsBool();
    const ui64 queryId = input->GetRow()[F_QUERY_ID].AsUint64();
    const ui64 regionId = input->GetRow()[F_REGION_ID].AsUint64();

    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();

        const TString host = row[NHost2Vec::F_HOST].AsString();
        size_t shows = row[F_SHOWS].AsUint64();
        THostRecord &record = hostCounters[host];
        record.Clicks += row[F_CLICKS].AsUint64();
        record.Shows += shows;

        if (NUtils::IsSubdomain(host, group)) {
            record.MinPosition = Min(record.MinPosition, row[F_POSITION].AsUint64());

            size_t &storedShows = urlsIds[row[F_URL_ID].AsUint64()];
            storedShows = Max(shows, storedShows);
        }
    }

    // Select the most popular url; stays 0 when no url had any shows.
    size_t popularIdShows = 0;
    ui64 popularId = 0;
    for (const auto &obj : urlsIds) {
        if (obj.second > popularIdShows) {
            popularIdShows = obj.second;
            popularId = obj.first;
        }
    }

    // The single selected id is stored as its raw 8 bytes.
    const TString urlsIdsStream(reinterpret_cast<const char*>(&popularId), sizeof(popularId));

    // Rivals aggregate. hostCounters keys are unique, so counting hosts is
    // equivalent to the size of a set of their names.
    size_t rivalsShows = 0;
    size_t rivalsClicks = 0;
    size_t uniqRivalsByShows = 0;
    size_t uniqRivalsByClicks = 0;
    for (const auto &hostObj : hostCounters) {
        if (NUtils::IsSubdomain(hostObj.first, group)) {
            continue; // part of the group itself, not a rival
        }

        const THostRecord &hostRecord = hostObj.second;
        rivalsShows += hostRecord.Shows;
        rivalsClicks += hostRecord.Clicks;

        if (hostRecord.Shows > 0) {
            ++uniqRivalsByShows;
        }

        if (hostRecord.Clicks > 0) {
            ++uniqRivalsByClicks;
        }
    }

    // The emit condition is the same for every host of the key.
    if (!(rivalsClicks > 0 && uniqRivalsByShows > 1)) {
        return;
    }

    for (const auto &hostObj : hostCounters) {
        const TString &groupHost = hostObj.first;
        const size_t position = hostObj.second.MinPosition;

        if (FilterQueries && position < 4) { //Filter out too frequent queries
            continue;
        }

        const TString groupDomain = TString{NUtils::GetHost2vecDomain(groupHost)};
        if (groupDomain != group) {
            continue;
        }

        output->AddRow(NYT::TNode()
            (NHost2Vec::F_GROUP, group)
            (F_QUERY_ID, queryId)
            (F_IS_NAV, isNav)
            (F_REGION_ID, regionId)
            (F_POSITION, position)
            (NHost2Vec::F_HOST, groupHost)
            (F_RIVALS_SHOWS, rivalsShows)
            (F_RIVALS_CLICKS, rivalsClicks)
            (F_RIVALS_UNIQ_BY_SHOWS, uniqRivalsByShows)
            (F_RIVALS_UNIQ_BY_CLICKS, uniqRivalsByClicks)
            (F_URLS_HASHES, urlsIdsStream)
        );
    }
}

REGISTER_REDUCER(TReduceProcessGroups)

// Builds a new row that carries over the group, host, region, position,
// navigation flag and the four rivals counters from the given row. Columns not
// listed here are not copied.
NYT::TNode TReduceJoinQueries::GetOutputRow(const NYT::TNode &row) {
    NYT::TNode projected;
    projected(NHost2Vec::F_GROUP, row[NHost2Vec::F_GROUP]);
    projected(F_IS_NAV, row[F_IS_NAV]);
    projected(NHost2Vec::F_HOST, row[NHost2Vec::F_HOST]);
    projected(F_REGION_ID, row[F_REGION_ID]);
    projected(F_POSITION, row[F_POSITION]);
    projected(F_RIVALS_CLICKS, row[F_RIVALS_CLICKS]);
    projected(F_RIVALS_SHOWS, row[F_RIVALS_SHOWS]);
    projected(F_RIVALS_UNIQ_BY_CLICKS, row[F_RIVALS_UNIQ_BY_CLICKS]);
    projected(F_RIVALS_UNIQ_BY_SHOWS, row[F_RIVALS_UNIQ_BY_SHOWS]);
    return projected;
}

//reduce by QueryId
// Joins query texts (table 0) with group rows (any other table). A table-0 row
// only updates the remembered query text. Every group row produces:
//   - one row with F_HASH_SOURCE = 0 and F_HASH = 0;
//   - one row with F_HASH_SOURCE = 3 per ui64 packed into F_URLS_HASHES.
// Group rows seen before any table-0 row get an empty F_QUERY.
// Hash sources 1 (titles) and 2 (meta descriptions) are currently disabled.
void TReduceJoinQueries::Do(TReader *input, TWriter *output) {
    const size_t TABLENO_QUERIES_IDS = 0;

    TString queryText;
    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();

        const bool isQueryTextRow = input->GetTableIndex() == TABLENO_QUERIES_IDS;
        if (isQueryTextRow) {
            queryText = row[F_QUERY].AsString();
            continue;
        }

        // F_URLS_HASHES is a byte string holding consecutive raw ui64 values;
        // trailing bytes that do not form a whole ui64 are ignored.
        const TString urlsStream = row[F_URLS_HASHES].AsString();
        const ui64* const hashesBegin = reinterpret_cast<const ui64*>(urlsStream.data());
        const ui64* const hashesEnd = hashesBegin + urlsStream.size() / sizeof(ui64);

        NYT::TNode dstRow = GetOutputRow(row);
        dstRow[F_QUERY] = queryText;

        // Row without any text hash.
        dstRow[F_HASH_SOURCE] = 0u;
        output->AddRow(dstRow
            (NHost2Vec::F_HASH, 0u)
        );

        // Rows keyed by url hashes.
        dstRow[F_HASH_SOURCE] = 3u;
        for (const ui64* hash = hashesBegin; hash != hashesEnd; ++hash) {
            output->AddRow(dstRow
                (NHost2Vec::F_HASH, *hash)
            );
        }
    }
}

REGISTER_REDUCER(TReduceJoinQueries)

// Joins the given texts with a tab separator and cuts the result to at most
// MAX_ROW_SIZE bytes. The order of the texts follows the set's iteration order.
TString JoinContent(const THashSet<TString> &texts) {
    TString joined = JoinSeq("\t", texts);
    const bool fits = joined.size() <= MAX_ROW_SIZE;
    if (!fits) {
        joined.resize(MAX_ROW_SIZE);
    }
    return joined;
}

// Builds a new row that carries over the group, host, query, navigation flag,
// region, position and the four rivals counters from the given row. Columns
// not listed here (for example the hash columns) are not copied.
NYT::TNode TReduceJoinTextContent::GetOutputRow(const NYT::TNode &row) {
    NYT::TNode projected;
    projected(NHost2Vec::F_GROUP, row[NHost2Vec::F_GROUP]);
    projected(NHost2Vec::F_HOST, row[NHost2Vec::F_HOST]);
    projected(F_QUERY, row[F_QUERY]);
    projected(F_IS_NAV, row[F_IS_NAV]);
    projected(F_REGION_ID, row[F_REGION_ID]);
    projected(F_POSITION, row[F_POSITION]);
    projected(F_RIVALS_CLICKS, row[F_RIVALS_CLICKS]);
    projected(F_RIVALS_SHOWS, row[F_RIVALS_SHOWS]);
    projected(F_RIVALS_UNIQ_BY_CLICKS, row[F_RIVALS_UNIQ_BY_CLICKS]);
    projected(F_RIVALS_UNIQ_BY_SHOWS, row[F_RIVALS_UNIQ_BY_SHOWS]);
    return projected;
}

//reduce by hash
// Joins url texts (table 0) with group rows (any other table) that share the
// same hash. Non-empty urls from table 0 are collected into a set. Every group
// row is emitted through GetOutputRow() with F_URL set to the tab-joined urls
// collected so far, or to an empty string when the row's F_HASH is 0.
// Title / meta-description joining is currently disabled.
void TReduceJoinTextContent::Do(TReader *input, TWriter *output) {
    const size_t TABLENO_URLS = 0;

    THashSet<TString> urls;

    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();

        if (input->GetTableIndex() == TABLENO_URLS) {
            const TString &url = row[F_URL].AsString();
            if (!url.empty()) {
                urls.insert(url);
            }
            continue;
        }

        // Hash 0 marks rows that have no text content attached.
        TString contentUrls;
        if (row[NHost2Vec::F_HASH].AsUint64() != 0) {
            contentUrls = JoinContent(urls);
        }

        output->AddRow(GetOutputRow(row)
            (F_URL, contentUrls)
        );
    }
}

REGISTER_REDUCER(TReduceJoinTextContent)

// ReduceBy F_GROUP, F_HOST, F_QUERY, F_REGION_ID (the KC_OUTPUT key used by EnrichGroups)
// Collapses all rows of one key into a single row: the first row of the group
// with F_URL replaced by the tab-joined set of distinct non-empty F_URL values
// of the whole group. Title / meta-description wrapping is currently disabled.
void TReduceWrapTextContent::Do(TReader *input, TWriter *output) {
    // Copy of the first row, used as the template for the output row.
    NYT::TNode dstRow = input->GetRow();

    THashSet<TString> urls;
    for (; input->IsValid(); input->Next()) {
        // Bound by reference: only valid until the reader advances.
        const TString &url = input->GetRow()[F_URL].AsString();
        if (!url.empty()) {
            urls.insert(url);
        }
    }

    const TString contentUrls = JoinContent(urls);

    output->AddRow(dstRow
        (F_URL, contentUrls)
    );
}

REGISTER_REDUCER(TReduceWrapTextContent)

using TReduceGetColumnHash64 = NHost2Vec::TReduceGetColumnHash<ui64>;
REGISTER_REDUCER(TReduceGetColumnHash64)

void BuildHostsTrie(const THashSet<TString> &words, TVector<char> &dest) {
    TCompactTrie<char>::TBuilder trieBuilder;
    for (const TString& word : words) {
        TString rword = word;
        ReverseInPlace(rword);
        trieBuilder.Add(rword, rword.size());
    }

    TBufferStream data;
    trieBuilder.SaveAndDestroy(data);
    dest.assign(data.Buffer().Data(), data.Buffer().Data() + data.Buffer().Size());
}

// Reads the given column of every row of sourceTable, converts each host name
// with NUtils::IDNHostToAscii and adds the converted form to hosts. Hosts that
// cannot be converted are logged and skipped; existing entries of hosts are kept.
void LoadHosts(NYT::IClientBasePtr client, const TString &sourceTable, const TString &column, THashSet<TString> &hosts) {
    auto reader = client->CreateTableReader<NYT::TNode>(sourceTable);
    while (reader->IsValid()) {
        const TString host = reader->GetRow()[column].AsString();

        TString asciiHost;
        const bool converted = NUtils::IDNHostToAscii(host, asciiHost);
        if (converted) {
            hosts.insert(asciiHost);
        } else {
            LOG_ERROR("direct, unable to parse %s hostname", host.data());
        }

        reader->Next();
    }
}

// Merges every table listed under `prefix` into the single table `dest` and then
// drops the source tables. All steps run inside one transaction that is
// committed at the end.
// Throws yexception when the table list cannot be loaded.
void MergeEnrichedGroups(NYT::IClientBasePtr client, const TString &prefix, const TString &dest) {
    TDeque<NYTUtils::TTableInfo> tables;
    if (!NYTUtils::GetTableList(client, prefix, tables)) {
        ythrow yexception() << "direct, unable to load table list";
    }

    NYT::ITransactionPtr tx = client->StartTransaction();

    TOpRunner runner(tx);
    // Remove a previous result, then register every listed table as merge input.
    runner.Drop(dest);
    for (const NYTUtils::TTableInfo &table : tables) {
        runner.Input(table.Name);
    }

    runner
        .Output(dest)
        //.Map(new TMapFilterNonGroups)
        .Merge()
    ;

    // The inputs are no longer needed once they have been merged.
    for (const NYTUtils::TTableInfo &table : tables) {
        runner.Drop(table.Name);
    }

    tx->Commit();
}

// Returns the strict schema of the enriched groups table: four ascending key
// columns (group, host, query, region id) followed by the value columns.
NYT::TTableSchema GetEnrichedGroupsSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);

    // Adds a column that is part of the sort key.
    const auto addKeyColumn = [&schema](const auto &name, NYT::EValueType type) {
        schema.AddColumn(NYT::TColumnSchema().Name(name).Type(type).SortOrder(NYT::SO_ASCENDING));
    };
    // Adds a plain (unsorted) value column.
    const auto addValueColumn = [&schema](const auto &name, NYT::EValueType type) {
        schema.AddColumn(NYT::TColumnSchema().Name(name).Type(type));
    };

    addKeyColumn(NHost2Vec::F_GROUP, NYT::VT_STRING);
    addKeyColumn(NHost2Vec::F_HOST, NYT::VT_STRING);
    addKeyColumn(F_QUERY, NYT::VT_STRING);
    addKeyColumn(F_REGION_ID, NYT::VT_UINT64);

    addValueColumn(F_IS_NAV, NYT::VT_BOOLEAN);
    addValueColumn(F_POSITION, NYT::VT_UINT64);
    addValueColumn(F_RIVALS_CLICKS, NYT::VT_UINT64);
    addValueColumn(F_RIVALS_SHOWS, NYT::VT_UINT64);
    addValueColumn(F_RIVALS_UNIQ_BY_CLICKS, NYT::VT_UINT64);
    addValueColumn(F_RIVALS_UNIQ_BY_SHOWS, NYT::VT_UINT64);
    addValueColumn(F_URL, NYT::VT_STRING);
    addValueColumn(F_TITLE, NYT::VT_STRING);
    addValueColumn(F_METADESCR, NYT::VT_STRING);

    return schema;
}

// Turns the raw groups table into the enriched groups table inside one
// transaction. Three reduce stages are chained; each one reads and rewrites
// `enrichedGroups`:
//   1. join rawGroups with query texts by F_QUERY_ID (TReduceJoinQueries),
//      then sort by F_HASH;
//   2. join the result with url texts by F_HASH (TReduceJoinTextContent),
//      then sort by (group, host, query, region id);
//   3. collapse rows with the same (group, host, query, region id) key
//      (TReduceWrapTextContent) and write them with the strict schema from
//      GetEnrichedGroupsSchema().
void EnrichGroups(NYT::IClientBasePtr client, const TString &rawGroups, const TString &enrichedGroups, const TRivalsTables &rivalsTables) {
    const NYT::TSortColumns KC_OUTPUT(NHost2Vec::F_GROUP, NHost2Vec::F_HOST, F_QUERY, F_REGION_ID);
    NYT::ITransactionPtr tx = client->StartTransaction();
    TOpRunner(tx)
        // Stage 1: attach query text. The query-ids table is listed first, so it
        // is input table 0, which TReduceJoinQueries treats as the text source.
        .Drop(enrichedGroups)
        .InputNode(rivalsTables.IdsQueries)
        .InputNode(rawGroups)
        .OutputNode(enrichedGroups)
        .ReduceBy(F_QUERY_ID)
        .Reduce(new TReduceJoinQueries)
        .SortBy(NHost2Vec::F_HASH)
        .Sort(enrichedGroups)

        // Stage 2: attach url text. The url-ids table is input table 0, which
        // TReduceJoinTextContent treats as the url source.
        //.InputNode(config.TABLE_RECOMMENDED_IDS_TITLES)
        //.InputNode(config.TABLE_RECOMMENDED_IDS_METADESCRS)
        .InputNode(rivalsTables.IdsUrls)
        .InputNode(enrichedGroups)
        .OutputNode(NYT::TRichYPath(enrichedGroups))
        .ReduceBy(NHost2Vec::F_HASH)
        .Reduce(new TReduceJoinTextContent)
        .SortBy(KC_OUTPUT)
        .Sort(enrichedGroups)

        // Stage 3: one row per output key, written with the final schema.
        .InputNode(enrichedGroups)
        .OutputNode(NYT::TRichYPath(enrichedGroups).Schema(GetEnrichedGroupsSchema()))
        .ReduceBy(KC_OUTPUT)
        .Reduce(new TReduceWrapTextContent)
    ;
    tx->Commit();
}

// Thread-pool task that builds and then enriches the query groups of one shard.
//
// The task stores references to Config, RivalsTables, TrieStream, OwnerToGroups, HashToGroup
// and ProcessedShards; it does not copy them. The creator must keep all of those objects
// alive until the task has finished running.
struct TProcessGroupTask: public IObjectInQueue {
    // client          - YT client used to start the transactions of this shard
    // config          - stored for completeness; not read by Process()
    // rivalsTables    - table paths (EnrichedQueries input, IntmRawGroups / IntmEnrichedGroups outputs)
    // trieStream, ownerToGroups, hashToGroup - lookup data forwarded to TMapGroupQueries
    // filterQueries   - forwarded to TReduceProcessGroups
    // shardNo         - shard index; also used as the two-digit table name suffix
    // processedShards - shared counter, incremented once when the shard completes successfully
    TProcessGroupTask(NYT::IClientBasePtr client, const TConfig &config, const TRivalsTables &rivalsTables,
                      const TVector<char> &trieStream, const THashMap<TString, TVector<ui32>> &ownerToGroups,
                      const THashMap<ui32, TString> &hashToGroup, bool filterQueries,
                      int shardNo, TAtomic &processedShards)
        : Client(client)
        , Config(config)
        , RivalsTables(rivalsTables)
        , TrieStream(trieStream)
        , OwnerToGroups(ownerToGroups)
        , HashToGroup(hashToGroup)
        , filterQueries(filterQueries)
        , ShardNo(shardNo)
        , ProcessedShards(processedShards)
    {
    }

    // Builds <IntmRawGroups>/<NN> from EnrichedQueries, then runs EnrichGroups() to produce
    // <IntmEnrichedGroups>/<NN>. Failures are logged with the shard number and swallowed;
    // the shared counter is bumped only when both steps succeeded, so the caller can detect
    // incomplete runs by comparing the counter with the shard count.
    void Process(void*) override try {
        const TString rawGroups = NYTUtils::JoinPath(RivalsTables.IntmRawGroups, Sprintf("%02d", ShardNo));
        const TString enrichedGroups = NYTUtils::JoinPath(RivalsTables.IntmEnrichedGroups, Sprintf("%02d", ShardNo));

        NYT::ITransactionPtr tx = Client->StartTransaction();
        TOpRunner(tx)
            .InputNode(RivalsTables.EnrichedQueries)
            .OutputNode(rawGroups)
            .MapperMemoryLimit(MEMORY_LIMIT_6GB)
            .ReducerMemoryLimit(MEMORY_LIMIT_4GB)
            .UseTmpfsInMapper()
            .UseTmpfsInReducer()
            .ReduceBy(NHost2Vec::F_GROUP, F_QUERY_ID, F_REGION_ID)
            .MapReduce(new TMapGroupQueries(TrieStream, OwnerToGroups, HashToGroup, ShardNo),
                       new TReduceProcessGroups(filterQueries))
            .SortBy(F_QUERY_ID)
            .Sort(rawGroups)
        ;
        tx->Commit();

        // EnrichGroups opens its own transaction, so the raw groups must be committed first.
        EnrichGroups(Client, rawGroups, enrichedGroups, RivalsTables);
        AtomicIncrement(ProcessedShards);
    } catch (const std::exception &e) {
        // yexception derives from std::exception, so this handler covers both YT/util errors
        // and standard-library ones; every failure gets a log line carrying the shard number.
        LOG_ERROR("direct, process group shard %d error: %s", ShardNo, e.what());
    }

public:
    NYT::IClientBasePtr Client;
    const TConfig &Config;
    const TRivalsTables &RivalsTables;
    const TVector<char> &TrieStream;
    const THashMap<TString, TVector<ui32>> &OwnerToGroups;
    const THashMap<ui32, TString> &HashToGroup;
    const bool filterQueries;
    int ShardNo = 0;
    TAtomic &ProcessedShards;
};

// Builds rivalsTables.ExtractedQueries from the converted source query tables.
//
// Steps, all inside one transaction on `clientQueries`:
//   * load the host set from host2VecTables.HostsHash and serialize it into a trie;
//   * map-reduce every converted source table into ExtractedQueries and sort it;
//   * when `filterQueries` is set, compute per-query totals into ExtractedQueriesFilter and
//     rewrite ExtractedQueries keeping only the rows TPreFilterQueriesReducer lets through.
// Throws yexception when no converted source tables are found.
void TaskBuildQueries(NYT::IClientBasePtr clientQueries, const TConfig &config, const NHost2Vec::TTableConfig &host2VecTables, const TRivalsTables &rivalsTables,
                      int positionOffset, bool filterQueries) {

    // Hosts are read through the same client as the queries.
    NYT::IClientBasePtr hostsClient = clientQueries;

    THashSet<TString> hostSet;
    TVector<char> hostsTrie;
    LoadHosts(hostsClient, host2VecTables.HostsHash, NHost2Vec::F_HOST, hostSet);
    BuildHostsTrie(hostSet, hostsTrie);

    NYT::ITransactionPtr tx = clientQueries->StartTransaction();

    const int SOURCE_DAYS = 90;
    TDeque<TSourceTable> inputTables;
    if (!LoadConvertedTables(tx, config.TABLE_SOURCE_QUERIES_CONVERTED_PREFIX, inputTables, SOURCE_DAYS)) {
        ythrow yexception() << "direct, no input tables found";
    }

    TOpRunner runner(tx);
    runner.Drop(rivalsTables.ExtractedQueries);
    runner.Drop(rivalsTables.EnrichedQueries);

    // Register every source table as an input and remember where its period starts.
    TDeque<time_t> periodStarts;
    for (const auto &source : inputTables) {
        runner.InputYaMR(source.Name);
        periodStarts.push_back(source.PeriodBegin);
    }

    // No query regexes are configured here; the mapper receives an empty list.
    TDeque<TString> queryRegexes;

    runner
        .OutputNode(rivalsTables.ExtractedQueries)
        .MemoryLimit(MEMORY_LIMIT_4GB)
        .UseTmpfs() // avoids the "Detected excessive disk IO" failure
        .ReduceBy(NHost2Vec::F_HOST, F_QUERY, F_URL, F_REGION_ID)
        .MapReduce(new TMapExtractDirectQueries(hostsTrie, periodStarts, queryRegexes, positionOffset),
                   new TCombineExtractedQueries, new TReduceExtractedQueries)
        .SortBy(F_QUERY, NHost2Vec::F_HOST, F_REGION_ID, F_POSITION)
        .Sort(rivalsTables.ExtractedQueries)
    ;

    if (filterQueries) {
        // Remember the current schema of ExtractedQueries: the table is rewritten below
        // and gets this schema re-attached.
        const NYT::TTableSchema extractedSchema = NYTUtils::GetTableSchema(tx, rivalsTables.ExtractedQueries);

        // Strict schema of the per-query totals table: query (sorted), shows, clicks.
        const NYT::TTableSchema filterSchema = [] {
            NYT::TTableSchema schema;
            schema.Strict(true);
            schema.AddColumn(NYT::TColumnSchema().Name(F_QUERY).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
            schema.AddColumn(NYT::TColumnSchema().Name(F_SHOWS).Type(NYT::VT_UINT64));
            schema.AddColumn(NYT::TColumnSchema().Name(F_CLICKS).Type(NYT::VT_UINT64));
            return schema;
        }();

        runner
            // Pass 1: aggregate per query into the filter table.
            .InputNode(rivalsTables.ExtractedQueries)
            .OutputNode(NYT::TRichYPath(rivalsTables.ExtractedQueriesFilter).Schema(filterSchema))
            .ReduceBy(F_QUERY)
            .Reduce(new TPreparePreFilterReducer)

            // Pass 2: filter table goes first (table index 0), the data to filter second.
            .InputNode(rivalsTables.ExtractedQueriesFilter)
            .InputNode(rivalsTables.ExtractedQueries)
            .OutputNode(NYT::TRichYPath(rivalsTables.ExtractedQueries).Schema(extractedSchema))
            .ReduceBy(F_QUERY)
            .Reduce(new TPreFilterQueriesReducer)
        ;
    }

    tx->Commit();
}

// Promotes ExtractedQueries to EnrichedQueries and derives two id tables from it, all inside
// one transaction on `clientQueries`:
//   * rivalsTables.IdsQueries - built by reducing EnrichedQueries by F_QUERY
//     (TReduceUnique + TReduceGetColumnHash64(F_QUERY, F_QUERY_ID)), sorted by F_QUERY_ID;
//   * rivalsTables.IdsUrls    - built by reducing EnrichedQueries by F_URL
//     (TReduceUnique + TReduceGetColumnHash64(F_URL, NHost2Vec::F_HASH)), sorted by F_HASH.
// The text-enrichment reduce and the title/metadescr id tables are currently disabled
// (commented out below), so "enriching" is a plain table move.
// NOTE(review): operations tagged with ASYNC_CTX0 appear to be started asynchronously and
// joined by Wait(ASYNC_CTX0) - confirm against TOpRunner.
void TaskEnrichQueries(NYT::IClientBasePtr clientQueries, const TConfig &config, const TRivalsTables &rivalsTables) {
    // `config` is referenced only by the disabled stages.
    Y_UNUSED(config);

    NYT::ITransactionPtr tx = clientQueries->StartTransaction();

    TOpRunner(tx)
        //.InputNode(config.TABLE_SOURCE_TEXTS)
        //.InputNode(config.TABLE_RECOMMENDED_EXTRACTED_QUERIES)
        //.OutputNode(NYT::TRichYPath(config.TABLE_RECOMMENDED_ENRICHED_QUERIES).SortedBy(NYT::TSortColumns(F_URL)))
        //.ReduceBy(F_URL)
        //.Reduce(new TReduceEnrichQueries)
        //.Drop(config.TABLE_RECOMMENDED_EXTRACTED_QUERIES)
        .Move(rivalsTables.ExtractedQueries, rivalsTables.EnrichedQueries)

        // Query ids table.
        .InputNode(rivalsTables.EnrichedQueries)
        .OutputNode(rivalsTables.IdsQueries)
        .ReduceBy(F_QUERY)
        .MapReduce(nullptr, new TReduceUnique, new TReduceGetColumnHash64(F_QUERY, F_QUERY_ID), ASYNC_CTX0)
/*
        .InputNode(config.TABLE_RECOMMENDED_ENRICHED_QUERIES)
        .OutputNode(config.TABLE_RECOMMENDED_IDS_TITLES)
        .ReduceBy(F_TITLE)
        .MapReduce(nullptr, new TReduceUnique, new TReduceGetColumnHash64(F_TITLE, NHost2Vec::F_HASH), ASYNC_CTX0)

        .InputNode(config.TABLE_RECOMMENDED_ENRICHED_QUERIES)
        .OutputNode(config.TABLE_RECOMMENDED_IDS_METADESCRS)
        .ReduceBy(F_METADESCR)
        .MapReduce(nullptr, new TReduceUnique, new TReduceGetColumnHash64(F_METADESCR, NHost2Vec::F_HASH), ASYNC_CTX0)
*/
        // URL ids table.
        .InputNode(rivalsTables.EnrichedQueries)
        .OutputNode(rivalsTables.IdsUrls)
        .ReduceBy(F_URL)
        .MapReduce(nullptr, new TReduceUnique, new TReduceGetColumnHash64(F_URL, NHost2Vec::F_HASH), ASYNC_CTX0)

        // Both id tables must exist before they are sorted.
        .Wait(ASYNC_CTX0)

        //.SortBy(NHost2Vec::F_HASH)
        //.Sort(config.TABLE_RECOMMENDED_IDS_TITLES, ASYNC_CTX0)
        //.SortBy(NHost2Vec::F_HASH)
        //.Sort(config.TABLE_RECOMMENDED_IDS_METADESCRS, ASYNC_CTX0)
        .SortBy(F_QUERY_ID)
        .Sort(rivalsTables.IdsQueries, ASYNC_CTX0)
        .SortBy(NHost2Vec::F_HASH)
        .Sort(rivalsTables.IdsUrls, ASYNC_CTX0)
        .Wait(ASYNC_CTX0)
    ;

    tx->Commit();
}

// Builds the enriched query groups: processes every shard on a small thread pool (one
// TProcessGroupTask per shard) and, only if all shards completed, merges the per-shard
// results from rivalsTable.IntmEnrichedGroups into rivalsTable.EnrichedGroups.
// When at least one shard failed, an error is logged and no merge is performed.
void TaskGroupQueries(NYT::IClientBasePtr clientQueries, const TConfig &config, const NHost2Vec::TTableConfig &host2VecTables, const TRivalsTables &rivalsTable, bool filterQueries) {
    // Parent nodes for the per-shard intermediate tables.
    NYTUtils::CreatePath(clientQueries, rivalsTable.IntmRawGroups);
    NYTUtils::CreatePath(clientQueries, rivalsTable.IntmEnrichedGroups);

    // Lookup data shared (by reference) between all shard tasks.
    THashMap<ui32, TString> groupByHash;
    NHost2Vec::LoadGroupsHashes(clientQueries, host2VecTables.GroupsHash, groupByHash);

    THashMap<TString, TVector<ui32>> groupsByOwner;
    TVector<char> hostToGroupTrie;
    NHost2Vec::BuildHostToGroupTrie(clientQueries, host2VecTables.HostGroupsHash, groupsByOwner, hostToGroupTrie);

    // Each task increments this counter after its shard has been fully processed.
    TAtomic shardsDone = 0;

    const size_t WORKER_THREADS = 2;
    const size_t QUEUE_LIMIT = 16;
    TThreadPool pool(TThreadPool::TParams().SetBlocking(true).SetCatching(true));
    pool.Start(WORKER_THREADS, QUEUE_LIMIT);
    for (int shardNo = 0; shardNo < TMapGroupQueries::SHARDS_COUNT; ++shardNo) {
        pool.SafeAddAndOwn(MakeHolder<TProcessGroupTask>(clientQueries, config, rivalsTable, hostToGroupTrie, groupsByOwner, groupByHash,
                                                         filterQueries, shardNo, shardsDone));
    }
    // Stop() returns once the queued tasks have been run, so the counter is final afterwards.
    pool.Stop();

    if (shardsDone != TMapGroupQueries::SHARDS_COUNT) {
        LOG_ERROR("direct, some shards were not completed");
        return;
    }

    MergeEnrichedGroups(clientQueries, rivalsTable.IntmEnrichedGroups, rivalsTable.EnrichedGroups);
}

} //namespace NWebmaster
