#include <util/charset/wide.h>
#include <util/thread/pool.h>

#include <library/cpp/string_utils/url/url.h>

#include <robot/jupiter/protos/acceptance.pb.h>

#include <wmconsole/version3/processors/indexing/conf/yt.h>
#include <wmconsole/version3/processors/indexing/important_urls/conf/config.h>

#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/thread.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/backlog.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>

#include <wmconsole/version3/protos/exported.pb.h>
#include <wmconsole/version3/protos/urltree.pb.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>

#include "monitor.h"
#include "task_crawler.h"

namespace NWebmaster {

// File-local string constants: Cypress attribute names (ATTR_*) and
// table column names (F_*) used by the reducers and schemas in this file.
namespace {
// Attribute names.
const char *ATTR_TO_COMPACT = "to_compact";
const char *ATTR_LAST_PROCESSED = "last_processed";

// Column names, kept in alphabetical order of the identifier.
const char *F_BANNED_BY_ROBOTSTXT_SINCE = "BannedByRobotsTxtSince";
const char *F_CHANGE_TYPE               = "ChangeType";
const char *F_COUNT_CHANGED             = "CountChanged";
const char *F_COUNT_CRAWLED             = "CountCrawled";
const char *F_COUNT_NEW                 = "CountNew";
const char *F_HOST                      = "Host";
const char *F_HTTP_CODE                 = "HttpCode";
const char *F_IS_USER_PATH              = "IsUserPath";
const char *F_KEY                       = "key";
const char *F_LAST_ACCESS               = "LastAccess";
const char *F_NAME                      = "Name";
const char *F_NODE_ID                   = "NodeId";
const char *F_PARENT_ID                 = "ParentId";
const char *F_PATH                      = "Path";
const char *F_PREV_HTTP_CODE            = "PrevHttpCode";
const char *F_ROBOTS                    = "Robots";
const char *F_SOURCE_ID                 = "SourceId";
const char *F_SOURCE_NAME               = "SourceName";
const char *F_TIMESTAMP                 = "Timestamp";
const char *F_VALUE                     = "value";
}

// Looks up the output table whose week interval contains `ts`.
// On success stores the table number of the first matching entry (in map
// order) into `tableNo` and returns true; otherwise returns false and
// leaves `tableNo` untouched.
bool FindWeeklyConfig(const TMap<TWeekTableConfig, int> &config, time_t ts, int &tableNo) {
    for (auto it = config.begin(); it != config.end(); ++it) {
        if (!it->first.In(ts)) {
            continue;
        }
        tableNo = it->second;
        return true;
    }
    return false;
}

// Orders events by Timestamp only. Ordered containers keyed by TCrawlerEvent
// (the TSet instances used in this file) therefore treat two events with the
// same Timestamp as equivalent and keep just one of them.
bool TCrawlerEvent::operator<(const TCrawlerEvent &rhs) const {
    return Timestamp < rhs.Timestamp;
}

// Builds an event from a row of a Jupiter base (acceptance) table.
// The event Timestamp comes from the table config; LastAccess and HttpCode
// are copied from the row only when the corresponding column is not null,
// otherwise the default-constructed values of TCrawlerEvent are kept.
TCrawlerEvent TCrawlerEvent::FromJupiterBase(const TUrldatTableConfig &config, const NYT::TNode &row) {
    TCrawlerEvent event;
    event.Timestamp = config.Timestamp;

    const NYT::TNode &lastAccessNode = row[F_LAST_ACCESS];
    if (!NYTUtils::IsNodeNull(lastAccessNode)) {
        event.LastAccess = lastAccessNode.AsUint64();
    }

    const NYT::TNode &httpCodeNode = row[F_HTTP_CODE];
    if (!NYTUtils::IsNodeNull(httpCodeNode)) {
        event.HttpCode = httpCodeNode.AsUint64();
    }

    return event;
}

// Builds an event from a row of a Jupiter spread table.
// The event Timestamp comes from the table config; LastAccess and HttpCode
// are copied from the row only when the corresponding column is not null,
// otherwise the default-constructed values of TCrawlerEvent are kept.
TCrawlerEvent TCrawlerEvent::FromJupiterSpread(const TUrldatTableConfig &config, const NYT::TNode &row) {
    TCrawlerEvent event;
    event.Timestamp = config.Timestamp;

    const NYT::TNode &lastAccessNode = row[F_LAST_ACCESS];
    if (!NYTUtils::IsNodeNull(lastAccessNode)) {
        event.LastAccess = lastAccessNode.AsUint64();
    }

    const NYT::TNode &httpCodeNode = row[F_HTTP_CODE];
    if (!NYTUtils::IsNodeNull(httpCodeNode)) {
        event.HttpCode = httpCodeNode.AsUint64();
    }

    return event;
}

// Restores an event from a row of an event-history ("diff") table.
// Timestamp is mandatory (AsInt64 is called on it directly); LastAccess,
// HttpCode and PrevHttpCode fall back to -1 via FromNodeOrDefault.
TCrawlerEvent TCrawlerEvent::FromDiff(const NYT::TNode &row) {
    const i64 missing = -1;

    TCrawlerEvent event;
    event.Timestamp = row[F_TIMESTAMP].AsInt64();
    event.LastAccess = NYTUtils::FromNodeOrDefault<i64>(row[F_LAST_ACCESS], missing);
    event.HttpCode = NYTUtils::FromNodeOrDefault<i64>(row[F_HTTP_CODE], missing);
    event.PrevHttpCode = NYTUtils::FromNodeOrDefault<i64>(row[F_PREV_HTTP_CODE], missing);
    return event;
}

// Returns how many whole days (hours / 24) separate the unix timestamp `ts`
// (in seconds) from the current time.
static int AgeDays(time_t ts) {
    const auto age = Now() - TInstant::Seconds(ts);
    const auto hours = age.Hours();
    return hours / 24;
}

// Stores the robots.txt payload together with its LastAccess time and the
// BannedByRobotsTxtSince timestamp. The payload is kept exactly as passed in;
// TDisallowByRobots decompresses it before use.
TRobotsRecord::TRobotsRecord(const TString &robots, time_t lastAccess, time_t bannedSince)
    : Robots(robots)
    , LastAccess(lastAccess)
    , BannedByRobotsTxtSince(bannedSince)
{
}

void TRobotsRecord::Release() {
    TString().swap(Robots);
}

// Builds the robots.txt checker for `host` from a stored record.
// NOTE: `robots` is modified: its payload is decompressed in place, handed to
// AddRobotsRecord, and then released, so the record is empty afterwards.
TDisallowByRobots::TDisallowByRobots(const TString &host, TRobotsRecord &robots) {
    NUtils::Decompress(robots.Robots);
    AddRobotsRecord(host, robots.Robots, robots.LastAccess, robots.BannedByRobotsTxtSince);
    // The payload is not needed after AddRobotsRecord; drop it.
    robots.Release();
}

void TDisallowByRobots::AddRobotsRecord(const TString &host, const TString &robots, time_t lastAccess, bool bannedByRobotsTxtSince) {
    NJupiter::THostdat hostDat;
    hostDat.SetHost(host);
    hostDat.SetRobots(robots);
    hostDat.SetLastAccess(lastAccess);
    HostInfo.Reset(new NJupiter::THostInfo(hostDat));
    if (bannedByRobotsTxtSince > 0) {
        BanByDisallowedRootAge = AgeDays(bannedByRobotsTxtSince) >= 14;
    }
}

// Returns true when `path` must be treated as disallowed by robots.txt:
// either the host info reports IsDisallowedByRobotsTxt() and
// BanByDisallowedRootAge is set, or the path itself is disallowed.
// Returns false when there is no host info, and also (after logging to Cerr)
// when the check throws a yexception.
bool TDisallowByRobots::IsPathDisallowed(const TString &path) try {
    if (!HostInfo) {
        return false;
    }

    if (HostInfo->IsDisallowedByRobotsTxt() && BanByDisallowedRootAge) {
        return true;
    }

    if (HostInfo->PathIsDisallowedByRobotsTxt(path)) {
        return true;
    }

    return false;
} catch (yexception &e) {
    Cerr << "IsPathDisallowed(" << path << "): " << e.what() << Endl;
    return false;
}

// Captures the reducer configuration:
//  - tablesInputConfig: per-input-table description, indexed by input table index;
//  - eventSamplesOutputConfig / recentSamplesOutputConfig: week interval -> output table number;
//  - countersOutputConfig: spread timestamp -> counters output table number;
//  - webmasterHosts: hosts to process (rows of other hosts are skipped in Do);
//  - importantUrlsJupiterTableNo / importantUrlsSpreadTableNo: output tables for important URL rows;
//  - robots: per-host robots.txt records used to filter spread rows.
TJupiterUrldatReducer::TJupiterUrldatReducer(const TDeque<TUrldatTableConfig> &tablesInputConfig, const TMap<TWeekTableConfig, int> &eventSamplesOutputConfig,
    const TMap<TWeekTableConfig, int> &recentSamplesOutputConfig, const THashMap<time_t, int> &countersOutputConfig, const THashSet<TString> &webmasterHosts,
    int importantUrlsJupiterTableNo, int importantUrlsSpreadTableNo, const THashMap<TString, TRobotsRecord> &robots)
    : TablesInputConfig(tablesInputConfig)
    , EventSamplesOutputConfig(eventSamplesOutputConfig)
    , RecentSamplesOutputConfig(recentSamplesOutputConfig)
    , CountersOutputConfig(countersOutputConfig)
    , WebmasterHosts(webmasterHosts)
    , ImportantUrlsJupiterTableNo(importantUrlsJupiterTableNo)
    , ImportantUrlsSpreadTableNo(importantUrlsSpreadTableNo)
    , Robots(robots)
{
}

// Processes all rows of one reduce key. Host and Path are read from the first
// row only and reused for every output row, so the operation is presumably
// reduced by (Host, Path) - confirm against the caller.
//
// The reducer:
//  1. skips hosts that are not in WebmasterHosts;
//  2. collects base, spread and diff (event history) events, proxying history
//     rows into the matching weekly output tables;
//  3. emits the most recent "new spread" event as a recent sample;
//  4. emits "new URL" / "HTTP code changed" event samples;
//  5. emits per-timestamp, per-HTTP-code counters.
//
// The event sets are TSet<TCrawlerEvent>, i.e. ordered and de-duplicated by
// Timestamp only (see TCrawlerEvent::operator<).
void TJupiterUrldatReducer::Do(TReader *input, TWriter *output) {
    // NOTE(review): nullNode is not used anywhere in this function.
    NYT::TNode nullNode = NYT::TNode::CreateEntity();
    TSet<TCrawlerEvent> recordsDiff;            // events already stored in the history tables
    TSet<TCrawlerEvent> recordsBase;            // events from the acceptance (base) tables
    TSet<TCrawlerEvent> recordsBaseAndSpread;   // despite the name, filled from spread tables only
    TCrawlerEvent lastNewSpread;                // newest event seen in the "new spread" tables
    bool isImportantUrl = false;

    const TString host = input->GetRow()[F_HOST].AsString();
    const TString path = input->GetRow()[F_PATH].AsString();
    // timestamp -> HTTP code -> counters
    THashMap<time_t, THashMap<int, TCrawlerEventCounter>> httpCounters;

    if (!WebmasterHosts.contains(host)) {
        return;
    }

    // Lazily build (and cache) the robots.txt checker for the host. Creating it
    // consumes the stored record, see the TDisallowByRobots constructor.
    // Banners[host] leaves a null entry for hosts without robots data.
    TDisallowByRobots::Ptr banner;
    with_lock(RobotsMutex) {
        if (Robots.contains(host) && !Banners.contains(host)) {
            Banners[host].Reset(new TDisallowByRobots(host, Robots[host]));
        }
        banner = Banners[host];
    }

    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();
        const TUrldatTableConfig &config = TablesInputConfig[input->GetTableIndex()];

        switch(config.Type) {
        case TUrldatTableConfig::E_TABLE_IMPORTANT_URLS:
            // Only affects rows read after this one, so this presumably relies on
            // the important-URLs table preceding the other inputs - confirm.
            isImportantUrl = true;
            break;
        case TUrldatTableConfig::E_TABLE_JUPITER_ACCEPTANCE: {
                if (isImportantUrl) {
                    output->AddRow(row, ImportantUrlsJupiterTableNo);
                }
                TCrawlerEvent record = TCrawlerEvent::FromJupiterBase(config, row);
                recordsBase.insert(record);
            }
            break;
        case TUrldatTableConfig::E_TABLE_SPREAD_NEW: {
                // `continue` here skips to the next input row of the enclosing loop.
                if (IsSpreadUrlBannedBySourceId(row)) {
                    continue;
                }
                if (banner && banner->IsPathDisallowed(path)) {
                    continue;
                }
                if (isImportantUrl) {
                    output->AddRow(row, ImportantUrlsSpreadTableNo);
                }
                TCrawlerEvent record = TCrawlerEvent::FromJupiterSpread(config, row);
                httpCounters[record.Timestamp][record.HttpCode].Crawled++;
                recordsBaseAndSpread.insert(record);
                if (record.Timestamp > lastNewSpread.Timestamp) {
                    lastNewSpread = record;
                }
            }
            break;
        case TUrldatTableConfig::E_TABLE_SPREAD_PROCESSED:
            // Already processed spread: kept for HTTP code comparison only,
            // no "crawled" counters and no important-URL output.
            if (IsSpreadUrlBannedBySourceId(row)) {
                continue;
            }
            if (banner && banner->IsPathDisallowed(path)) {
                continue;
            }
            recordsBaseAndSpread.insert(TCrawlerEvent::FromJupiterSpread(config, row));
            break;
        case TUrldatTableConfig::E_TABLE_EVENT_HISTORY_WEEKS: {
                TCrawlerEvent record = TCrawlerEvent::FromDiff(row);
                recordsDiff.insert(record);
                int tableNo;
                if (FindWeeklyConfig(EventSamplesOutputConfig, record.Timestamp, tableNo)) { //proxy weekly event data to the updated table
                    output->AddRow(row, tableNo);
                }
            }
            break;
        case TUrldatTableConfig::E_TABLE_EVENT_HISTORY_TAIL: {
                // Tail history takes part in the comparison but is not proxied.
                TCrawlerEvent record = TCrawlerEvent::FromDiff(row);
                recordsDiff.insert(record);
            }
            break;
        case TUrldatTableConfig::E_TABLE_RECENT_HISTORY: {
                int tableNo;
                if (FindWeeklyConfig(RecentSamplesOutputConfig, row[F_TIMESTAMP].AsInt64(), tableNo)) { //proxy weekly recent data to the updated table
                    output->AddRow(row, tableNo);
                }
            }
            break;
        default:
            ythrow yexception() << "unknown input table #" << input->GetTableIndex() << " Type=" << static_cast<int>(config.Type);
        }
    }

    /* Possible scenarios
        * URL is in the spread but neither in the base nor in the diff -> new URL, goes to the diff
        * URL is only in the diff -> gone URL, skip
        * URL is only in the base -> skip
        * URL from the diff is newer than the URL in the base, no spread yet -> skip
        * URL from the diff is newer than the URL in the base and part of the spread -> process the new spread, goes to the diff
    */

    if (recordsBaseAndSpread.empty()) { //there is diff only
        return;
    }

    if (!lastNewSpread.Empty()) {
        int tableNo;
        if (FindWeeklyConfig(RecentSamplesOutputConfig, lastNewSpread.Timestamp, tableNo)) { //proxy weekly recent data to the updated table
            output->AddRow(NYT::TNode()
                (F_HOST, host)
                (F_PATH, path)
                (F_HTTP_CODE, lastNewSpread.HttpCode)
                (F_LAST_ACCESS, lastNewSpread.LastAccess)
                (F_TIMESTAMP, lastNewSpread.Timestamp),
                tableNo
            );
        }
    }

    // The URL is new when it is known neither to the base nor to the history.
    bool isNewUrl = recordsBase.empty() && recordsDiff.empty();

    if (isNewUrl) {
        // The earliest spread event is reported as the "new URL" event.
        const TCrawlerEvent &record = *recordsBaseAndSpread.begin();
        OutputNew(output, record, host, path);
        httpCounters[record.Timestamp][record.HttpCode].New++;
    }

    // Walk spread events in Timestamp order and report every HTTP code change
    // that is not older than the newest history event and not already present
    // in the history (same Timestamp).
    int prevHttpCode = recordsBaseAndSpread.begin()->HttpCode;
    for (const TCrawlerEvent &record : recordsBaseAndSpread) {
        bool isRecordProcessed = !recordsDiff.empty() && recordsDiff.rbegin()->Timestamp > record.Timestamp;
        if (!isRecordProcessed && record.HttpCode != prevHttpCode && !recordsDiff.contains(record)) {
            OutputChange(output, record, host, path, prevHttpCode);
            httpCounters[record.Timestamp][record.HttpCode].Changed++;
        }

        prevHttpCode = record.HttpCode;
    }

    OutputHistoryCounters(output, host, path, httpCounters);
}

// Writes one counters row per (timestamp, HTTP code) pair of `httpCounters`.
// Timestamps that have no entry in CountersOutputConfig are skipped; for the
// others the row goes to the table number configured for that timestamp.
void TJupiterUrldatReducer::OutputHistoryCounters(TWriter *output, const TString &host, const TString &path, const THashMap<time_t, THashMap<int, TCrawlerEventCounter>> &httpCounters) const {
    for (const auto &perTimestamp : httpCounters) {
        const time_t timestamp = perTimestamp.first;

        const auto tableIt = CountersOutputConfig.find(timestamp);
        if (tableIt != CountersOutputConfig.end()) {
            for (const auto &perHttpCode : perTimestamp.second) {
                const int httpCode = perHttpCode.first;
                const TCrawlerEventCounter &counter = perHttpCode.second;

                output->AddRow(NYT::TNode()
                    (F_HOST, host)
                    (F_PATH, path)
                    (F_HTTP_CODE, httpCode)
                    (F_TIMESTAMP, timestamp)
                    (F_COUNT_CHANGED, counter.Changed)
                    (F_COUNT_CRAWLED, counter.Crawled)
                    (F_COUNT_NEW, counter.New),
                    tableIt->second // output table number for this timestamp
                );
            }
        }
    }
}

// Writes an event sample row (new URL / changed HTTP code) for `record`.
// The row is emitted only when the record timestamp has a counters table
// configured (a hack to prevent influence of a changed spread) and a weekly
// event-samples table covers that timestamp. PrevHttpCode is written as a
// null entity when `prevHttpCode` is -1.
void TJupiterUrldatReducer::OutputEventSamples(TWriter *output, const TCrawlerEvent &record, const TString &host, const TString &path, TCrawlerEvent::EEventType diffType, int prevHttpCode) const {
    const static NYT::TNode NullNode = NYT::TNode::CreateEntity();

    if (!CountersOutputConfig.contains(record.Timestamp)) {
        return;
    }

    int weeklyTableNo;
    if (!FindWeeklyConfig(EventSamplesOutputConfig, record.Timestamp, weeklyTableNo)) {
        return;
    }

    output->AddRow(NYT::TNode()
        (F_HOST, host)
        (F_PATH, path)
        (F_TIMESTAMP, record.Timestamp)
        (F_LAST_ACCESS, record.LastAccess)
        (F_HTTP_CODE, record.HttpCode)
        (F_PREV_HTTP_CODE, prevHttpCode != -1 ? prevHttpCode : NullNode)
        (F_CHANGE_TYPE, diffType),
        weeklyTableNo
    );
}

// Emits an E_URL_NEW event sample for `record`. No previous HTTP code is
// passed, so OutputEventSamples uses the default declared for that parameter.
void TJupiterUrldatReducer::OutputNew(TWriter *output, const TCrawlerEvent &record, const TString &host, const TString &path) const {
    OutputEventSamples(output, record, host, path, TCrawlerEvent::E_URL_NEW);
}

// Emits an E_URL_CHANGED event sample for `record`, carrying the HTTP code
// observed before the change in `prevHttpCode`.
void TJupiterUrldatReducer::OutputChange(TWriter *output, const TCrawlerEvent &record, const TString &host, const TString &path, int prevHttpCode) const {
    OutputEventSamples(output, record, host, path, TCrawlerEvent::E_URL_CHANGED, prevHttpCode);
}

REGISTER_REDUCER(TJupiterUrldatReducer)

// Writes one row per HTTP code accumulated in `node` (Data.CrawledHttpCodes)
// into the TABLENO_COUNTERS_HTTP_CODES output table.
void TCrawlerCountersReducer::OutputHttpCodes(TWriter *output, const TString &host, const TString &path, time_t timestamp, const TWrapperNode::Ptr &node, bool isUserNode) {
    for (const auto &codeAndCount : node->Data.CrawledHttpCodes) {
        const int httpCode = codeAndCount.first;
        const size_t crawledUrls = codeAndCount.second;

        output->AddRow(NYT::TNode()
            (F_HOST, host)
            (F_NAME, node->Name)
            (F_PATH, path)
            (F_HTTP_CODE, httpCode)
            (F_TIMESTAMP, timestamp)
            (F_NODE_ID, node->Id)
            (F_PARENT_ID, node->ParentId)
            (F_COUNT_CRAWLED, crawledUrls)
            (F_IS_USER_PATH, isUserNode),
            TABLENO_COUNTERS_HTTP_CODES
        );
    }
}

// Writes the new/changed event counters of `node` (Data.CrawledEvents) as a
// single row into the TABLENO_COUNTERS_EVENTS output table.
// The row is written even when both counters are zero (a filter for that
// case used to be here but is disabled).
void TCrawlerCountersReducer::OutputEvents(TWriter *output, const TString &host, const TString &path, time_t timestamp, const TWrapperNode::Ptr &node, bool isUserNode) {
    const TCrawlerEventCounter &events = node->Data.CrawledEvents;

    output->AddRow(NYT::TNode()
        (F_HOST, host)
        (F_PATH, path)
        (F_NAME, node->Name)
        (F_TIMESTAMP, timestamp)
        (F_NODE_ID, node->Id)
        (F_PARENT_ID, node->ParentId)
        (F_COUNT_NEW, events.New)
        (F_COUNT_CHANGED, events.Changed)
        (F_IS_USER_PATH, isUserNode),
        TABLENO_COUNTERS_EVENTS
    );
}

// Dumps the counters accumulated in the site tree shard: first for every
// regular tree path (nodes that are missing or have an empty name are
// skipped), then for every user pattern node (null entries are skipped).
// Does nothing when the shard was never created.
void TCrawlerCountersReducer::OutputCrawlerCounters(TWriter *output, const TString &host, time_t timestamp, TSiteTreeShard::Ptr sitetreeShardPtr) {
    if (sitetreeShardPtr.Get() == nullptr) {
        return;
    }

    // Regular site tree nodes, reported under their tree path.
    for (const TWrapperNode::TPath &treePath : sitetreeShardPtr->TreePaths) {
        const TWrapperNode::Ptr &treeNode = sitetreeShardPtr->IdNodeMap[treePath.NodeId];
        if (treeNode.Get() != nullptr && !treeNode->Name.empty()) {
            OutputHttpCodes(output, host, treePath.Path, timestamp, treeNode, false);
            OutputEvents(output, host, treePath.Path, timestamp, treeNode, false);
        }
    }

    // User-defined pattern nodes, reported under the node name.
    for (const TWrapperNode::Ptr &patternNode : sitetreeShardPtr->UserPatterns) {
        if (patternNode.Get() != nullptr) {
            OutputHttpCodes(output, host, patternNode->Name, timestamp, patternNode, true);
            OutputEvents(output, host, patternNode->Name, timestamp, patternNode, true);
        }
    }
}

// Aggregates spread counters of one host over its site tree.
// Input table 0 carries the site tree (regular nodes and user patterns),
// input table 1 carries per-path counters. The tree shard is built lazily on
// the first counters row from whatever structure rows were read before it,
// so the site tree rows presumably arrive first - confirm against the caller.
// The timestamp of the first counters row is used for all output rows.
void TCrawlerCountersReducer::Do(TReader *input, TWriter *output) {
    const int TABLENO_SITETREE = 0;
    const int TABLENO_COUNTERS = 1;

    const TString host = input->GetRow()[F_HOST].AsString();

    TVector<TTreeRecord> treeRecords;
    TVector<TString> userPatterns;
    TSiteTreeShard::Ptr shard;
    time_t timestamp = -1;

    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();

        switch(input->GetTableIndex()) {
        case TABLENO_SITETREE: {
                const bool isUserPath = row[F_IS_USER_PATH].AsBool();
                if (isUserPath) {
                    userPatterns.push_back(row[F_PATH].AsString());
                } else {
                    treeRecords.push_back(TTreeRecord(row[F_PATH].AsString(), TTreeData(), row[F_NODE_ID].AsInt64(), row[F_PARENT_ID].AsInt64()));
                }
            }
            break;
        case TABLENO_COUNTERS: {
                if (shard.Get() == nullptr) {
                    TUserPatternMatcher::Ptr matcher;
                    if (!userPatterns.empty()) {
                        matcher.Reset(new TUserPatternMatcher(userPatterns));
                    }
                    // Hosts without a site tree get a single root node.
                    if (treeRecords.empty()) {
                        treeRecords.push_back(TTreeRecord("/", TTreeData(), 1, 0));
                    }
                    shard.Reset(new TSiteTreeShard(treeRecords, matcher, proto::urltree::ACCEPTANCE, proto::urltree::RU_ACCEPTANCE));
                    timestamp = row[F_TIMESTAMP].AsInt64();
                }
                shard->AppendCrawlerEvent(row[F_PATH].AsString(), row[F_HTTP_CODE].AsInt64(), row[F_COUNT_NEW].AsUint64(), row[F_COUNT_CHANGED].AsUint64(), row[F_COUNT_CRAWLED].AsUint64());
            }
            break;
        default:
            ythrow yexception() << "unknown input table";
        }
    }

    OutputCrawlerCounters(output, host, timestamp, shard);
}

REGISTER_REDUCER(TCrawlerCountersReducer)

// Writes the site tree nodes of `nodes` that belong to the RU shard and the
// PRODUCTION search source as (Host, Path, IsUserPath, NodeId, ParentId) rows.
// All other nodes are ignored.
template<class T>
void OutputSitetreeNode(NYT::TTableWriter<NYT::TNode>* output, const T &nodes, const TString &host, bool isUserNodes) {
    for (const proto::urltree::NodeInfo &info : nodes) {
        if (info.shard_id() != proto::urltree::RU || info.search_source_id() != proto::urltree::PRODUCTION) {
            continue;
        }

        output->AddRow(NYT::TNode()
            (F_HOST, host)
            (F_PATH, info.name())
            (F_IS_USER_PATH, isUserNodes)
            (F_NODE_ID, info.node_id())
            (F_PARENT_ID, info.parent_id())
        );
    }
}

// Unpacks the serialized proto::urltree::HostInfo stored in the "value"
// column of the first row of the key and writes its regular and user nodes
// as flat site tree rows. Only the first row is read.
// NOTE(review): the result of ParseFromString is deliberately ignored, so a
// corrupted message yields whatever nodes were parsed (possibly none).
void TCrawlerSitetreePrepareReducer::Do(TReader *input, TWriter *output) {
    const NYT::TNode &firstRow = input->GetRow();
    const TString host = firstRow[F_KEY].AsString();

    proto::urltree::HostInfo hostInfo;
    Y_PROTOBUF_SUPPRESS_NODISCARD hostInfo.ParseFromString(firstRow[F_VALUE].AsString());

    OutputSitetreeNode(output, hostInfo.nodes(), host, false);
    OutputSitetreeNode(output, hostInfo.user_nodes(), host, true);
}

REGISTER_REDUCER(TCrawlerSitetreePrepareReducer)

// Passes every row through with the sign of LastAccess flipped.
// TCrawlerWeeklySamplesReducer flips it back, so this presumably exists to get
// a descending LastAccess order out of an ascending sort - confirm.
void TCrawlerWeeklySamplesMapper::Do(TReader *input, TWriter *output) {
    while (input->IsValid()) {
        NYT::TNode sample = input->GetRow();
        const auto negatedLastAccess = -sample[F_LAST_ACCESS].AsInt64();
        output->AddRow(sample(F_LAST_ACCESS, negatedLastAccess));
        input->Next();
    }
}

REGISTER_MAPPER(TCrawlerWeeklySamplesMapper)

// Keeps at most the first 50000 rows of the key and flips the sign of
// LastAccess back to its original value (the mapper negated it).
void TCrawlerWeeklySamplesReducer::Do(TReader *input, TWriter *output) {
    const size_t LIMIT = 50000;

    size_t written = 0;
    while (input->IsValid() && written < LIMIT) {
        NYT::TNode sample = input->GetRow();
        const auto restoredLastAccess = -sample[F_LAST_ACCESS].AsInt64();
        output->AddRow(sample(F_LAST_ACCESS, restoredLastAccess));

        input->Next();
        written++;
    }
}

REGISTER_REDUCER(TCrawlerWeeklySamplesReducer)

// Collapses all rows of a key into one: the first row is used as a template
// and its CountChanged / CountNew columns are replaced by the sums over all
// rows of the key.
void TCrawlerDailyEventCountersReducer::Do(TReader *input, TWriter *output) {
    NYT::TNode merged = input->GetRow();

    size_t totalChanged = 0;
    size_t totalNew = 0;
    while (input->IsValid()) {
        const NYT::TNode current = input->GetRow();
        totalChanged += current[F_COUNT_CHANGED].AsUint64();
        totalNew += current[F_COUNT_NEW].AsUint64();
        input->Next();
    }

    output->AddRow(merged
        (F_COUNT_CHANGED, totalChanged)
        (F_COUNT_NEW, totalNew)
    );
}

REGISTER_REDUCER(TCrawlerDailyEventCountersReducer)

// Collapses all rows of a key into one: the first row is used as a template
// and its CountCrawled column is replaced by the sum over all rows of the key.
void TCrawlerDailyHttpCountersReducer::Do(TReader *input, TWriter *output) {
    NYT::TNode merged = input->GetRow();

    size_t totalCrawled = 0;
    while (input->IsValid()) {
        const NYT::TNode current = input->GetRow();
        totalCrawled += current[F_COUNT_CRAWLED].AsUint64();
        input->Next();
    }

    output->AddRow(merged
        (F_COUNT_CRAWLED, totalCrawled)
    );
}

REGISTER_REDUCER(TCrawlerDailyHttpCountersReducer)

// Remembers the set of hosts to process; Do() skips every other host.
TCrawlerPrepareRobotsReducer::TCrawlerPrepareRobotsReducer(const THashSet<TString> &webmasterHosts)
    : WebmasterHosts(webmasterHosts)
{
}

// Picks the most recent usable robots.txt of a host.
// For hosts from WebmasterHosts the reducer:
//  - takes the maximum BannedByRobotsTxtSince over the rows of input table 1;
//  - among rows with a non-null LastAccess and a non-null Robots column keeps
//    those for which NJupiter::THostInfo reports HasRobotsTxt();
//  - outputs one row with the robots.txt of the greatest LastAccess,
//    compressed with NUtils::Compress. Nothing is written when no row qualifies.
void TCrawlerPrepareRobotsReducer::Do(TReader *input, TWriter *output) {
    const int TABLENO_HOSTTABLE = 1;
    //const int TABLENO_HOSTDAT = 0;

    const TString host = input->GetRow()[F_HOST].AsString();
    if (!WebmasterHosts.contains(host)) {
        return;
    }

    time_t bannedSince = 0;
    TMap<time_t, TString> robotsByLastAccess;

    for (; input->IsValid(); input->Next()) {
        const NYT::TNode &row = input->GetRow();

        if (input->GetTableIndex() == TABLENO_HOSTTABLE) {
            bannedSince = Max<time_t>(bannedSince, NYTUtils::FromNodeOrDefault<ui64>(row[F_BANNED_BY_ROBOTSTXT_SINCE], 0));
        }

        // protection from wrong LastAccess; rows without robots.txt are of no use either
        if (NYTUtils::IsNodeNull(row[F_LAST_ACCESS]) || NYTUtils::IsNodeNull(row[F_ROBOTS])) {
            continue;
        }

        const TString robotsTxt = row[F_ROBOTS].AsString();
        const time_t accessTime = NYTUtils::FromNodeOrDefault<ui64>(row[F_LAST_ACCESS], 0);

        NJupiter::THostdat hostRecord;
        hostRecord.SetHost(row[F_HOST].AsString());
        hostRecord.SetRobots(robotsTxt);
        hostRecord.SetLastAccess(accessTime);

        NJupiter::THostInfo parsed(hostRecord);
        if (parsed.HasRobotsTxt()) {
            robotsByLastAccess[accessTime] = robotsTxt;
        }
    }

    if (robotsByLastAccess.empty()) {
        return;
    }

    // TMap is ordered by key, so the last element has the greatest LastAccess.
    const auto newest = robotsByLastAccess.rbegin();
    time_t lastAccess = newest->first;
    TString &robots = newest->second;
    NUtils::Compress(robots);

    output->AddRow(NYT::TNode()
        (F_HOST, host)
        (F_LAST_ACCESS, lastAccess)
        (F_ROBOTS, robots)
        (F_BANNED_BY_ROBOTSTXT_SINCE, bannedSince)
    );
}

REGISTER_REDUCER(TCrawlerPrepareRobotsReducer)

// De-duplicating reducer: for every reduce key it writes only the first row
// it receives and ignores the rest, so the output holds one row per key.
struct TReduceUnique : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    //reduce by any
    void Do(TReader *input, TWriter *output) override {
        output->AddRow(input->GetRow());
    }
};

REGISTER_REDUCER(TReduceUnique)

// Strict schema of the event samples tables: sorted by (Host, Path), followed
// by HttpCode, LastAccess, PrevHttpCode, Timestamp and ChangeType (all int64).
NYT::TTableSchema GetEventSamplesTableSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);
    // key columns
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    // value columns
    schema.AddColumn(NYT::TColumnSchema().Name(F_HTTP_CODE).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_LAST_ACCESS).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PREV_HTTP_CODE).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_TIMESTAMP).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_CHANGE_TYPE).Type(NYT::VT_INT64));
    return schema;
}

// Strict schema of the recent samples tables: sorted by (Host, Path), followed
// by HttpCode, LastAccess and Timestamp (all int64).
NYT::TTableSchema GetRecentSamplesTableSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);
    // key columns
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    // value columns
    schema.AddColumn(NYT::TColumnSchema().Name(F_HTTP_CODE).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_LAST_ACCESS).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_TIMESTAMP).Type(NYT::VT_INT64));
    return schema;
}

// Strict schema of the per-spread counters tables. Unlike the other schemas
// in this file it declares no sort order on any column.
NYT::TTableSchema GetSpreadCountersTableSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_HTTP_CODE).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_TIMESTAMP).Type(NYT::VT_INT64));
    // counters
    schema.AddColumn(NYT::TColumnSchema().Name(F_COUNT_CRAWLED).Type(NYT::VT_UINT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_COUNT_CHANGED).Type(NYT::VT_UINT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_COUNT_NEW).Type(NYT::VT_UINT64));
    return schema;
}

// Strict schema of the HTTP code counters tables (one row per site tree node
// and HTTP code). Only Host is declared as a sorted column.
NYT::TTableSchema GetHttpCountersTableSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_NAME).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_HTTP_CODE).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_TIMESTAMP).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_COUNT_CRAWLED).Type(NYT::VT_UINT64));
    // site tree node identity
    schema.AddColumn(NYT::TColumnSchema().Name(F_NODE_ID).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PARENT_ID).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_IS_USER_PATH).Type(NYT::VT_BOOLEAN));
    return schema;
}

// Strict schema of the event counters tables (new/changed URL counts per site
// tree node). Only Host is declared as a sorted column.
NYT::TTableSchema GetEventCountersTableSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_NAME).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_TIMESTAMP).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_COUNT_CHANGED).Type(NYT::VT_UINT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_COUNT_NEW).Type(NYT::VT_UINT64));
    // site tree node identity
    schema.AddColumn(NYT::TColumnSchema().Name(F_NODE_ID).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PARENT_ID).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_IS_USER_PATH).Type(NYT::VT_BOOLEAN));
    return schema;
}

// Strict schema of the flattened site tree tables: Host (sorted), Path,
// IsUserPath, NodeId and ParentId.
NYT::TTableSchema GetSitetreeTableSchema() {
    NYT::TTableSchema schema;
    schema.Strict(true);
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PATH).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_IS_USER_PATH).Type(NYT::VT_BOOLEAN));
    // site tree node identity
    schema.AddColumn(NYT::TColumnSchema().Name(F_NODE_ID).Type(NYT::VT_INT64));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PARENT_ID).Type(NYT::VT_INT64));
    return schema;
}

// Sort/reduce key of the event counters tables: (Host, IsUserPath, Path).
NYT::TSortColumns GetEventCountersKeys() {
    return NYT::TSortColumns(F_HOST, F_IS_USER_PATH, F_PATH);
}

// Sort/reduce key of the HTTP code counters tables:
// (Host, IsUserPath, Path, HttpCode).
NYT::TSortColumns GetHttpCountersKeys() {
    return NYT::TSortColumns(F_HOST, F_IS_USER_PATH, F_PATH, F_HTTP_CODE);
}

// Queue task that turns one per-spread counters table into the HTTP code and
// event counters tables aggregated over the site tree.
// Table paths are stored by value; `Errors` is a shared counter that is
// incremented when the operation chain throws a yexception.
struct TTaskReduceSpreadCounters: public IObjectInQueue {
    TTaskReduceSpreadCounters(NYT::IClientBasePtr client, const TString &sitetreeTable, const TString &inSpreadCountersTable, const TString &outHttpCountersTable, const TString &outEventCountersTable, TAtomic &errors)
        : Client(client)
        , SitetreeTable(sitetreeTable)
        , InSpreadCountersTable(inSpreadCountersTable)
        , OutHttpCountersTable(outHttpCountersTable)
        , OutEventCountersTable(outEventCountersTable)
        , Errors(errors)
    {
    }

    // Sorts the input counters by Host, reduces them together with the site
    // tree (site tree is input 0, counters are input 1, matching
    // TCrawlerCountersReducer), sorts both outputs by their keys and finally
    // drops the input counters table. Failures are logged and counted, not rethrown.
    void Process(void* /*tsr*/) override {
        NYT::TTableSchema httpCounterSchema = GetHttpCountersTableSchema();
        NYT::TTableSchema eventCounterSchema = GetEventCountersTableSchema();

        try {
            TOpRunner(Client)
                .SortBy(F_HOST)
                .Sort(InSpreadCountersTable)
                .InputNode(SitetreeTable)
                .InputNode(InSpreadCountersTable)
                .OutputNode(NYT::TRichYPath(OutHttpCountersTable).Schema(httpCounterSchema))
                .OutputNode(NYT::TRichYPath(OutEventCountersTable).Schema(eventCounterSchema))
                .ReduceBy(F_HOST)
                .Reduce(new TCrawlerCountersReducer)
                .SortBy(GetHttpCountersKeys())
                .Sort(OutHttpCountersTable, ASYNC_CTX0)
                .SortBy(GetEventCountersKeys())
                .Sort(OutEventCountersTable, ASYNC_CTX0)
                .Wait(ASYNC_CTX0)
                .Drop(InSpreadCountersTable)
            ;
        } catch(const yexception &e) {
            LOG_ERROR("crawler, unable to reduce spread counters table %s: %s", InSpreadCountersTable.data(), e.what());
            AtomicIncrement(Errors);
        }
    }

public:
    NYT::IClientBasePtr Client;
    const TString SitetreeTable;
    const TString InSpreadCountersTable;
    const TString OutHttpCountersTable;
    const TString OutEventCountersTable;
    TAtomic &Errors;    // shared error counter owned by the caller
};

template<class TReducer>
struct TTaskReduceDailyCounters : public IObjectInQueue {
    TTaskReduceDailyCounters(NYT::IClientBasePtr client, const TString &dailyTable, const TDeque<TString> &counterTables, const NYT::TTableSchema &schema, const NYT::TSortColumns &reduceBy, TAtomic &errors)
        : Client(client)
        , DailyTable(dailyTable)
        , CounterTables(counterTables)
        , Schema(schema)
        , ReduceBy(reduceBy)
        , Errors(errors)
    {
    }

    void Process(void* /*tsr*/) override {
        try {
            TOpRunner runner(Client);

            const bool dailyTableExists = Client->Exists(DailyTable);
            if (dailyTableExists) {
                runner
                    .SortBy(ReduceBy)
                    .Sort(DailyTable)
                ;
            }

            for (const TString &table : CounterTables) {
                runner.InputNode(table);
            }

            if (dailyTableExists) {
                runner.InputNode(DailyTable);
            }

            runner
                .OutputNode(NYT::TRichYPath(DailyTable).Schema(Schema))
                .ReduceBy(ReduceBy)
                .Reduce(new TReducer)
            ;

            for (const TString &table : CounterTables) {
                runner.Drop(table);
            }
        } catch(const yexception &e) {
            LOG_ERROR("crawler, unable to reduce daily counters table %s: %s", DailyTable.data(), e.what());
            AtomicIncrement(Errors);
        }
    }

public:
    NYT::IClientBasePtr Client;
    const TString &DailyTable;
    const TDeque<TString> &CounterTables;
    const NYT::TTableSchema &Schema;
    const NYT::TSortColumns &ReduceBy;
    TAtomic &Errors;
};

struct TTaskReduceWeeklySamples : public IObjectInQueue {
    TTaskReduceWeeklySamples(NYT::IClientBasePtr client, const TString &weeklyTable, const NYT::TTableSchema &schema, TAtomic &errors)
        : Client(client)
        , WeeklyTable(weeklyTable)
        , Schema(schema)
        , Errors(errors)
    {
    }

    void Process(void* /*tsr*/) override {
        try {
            TOpRunner(Client)
                .InputNode(WeeklyTable)
                .OutputNode(NYT::TRichYPath(WeeklyTable).Schema(Schema))
                .ReduceBy(F_HOST)
                .SortBy(F_HOST, F_LAST_ACCESS)
                .MapReduce(new TCrawlerWeeklySamplesMapper, new TCrawlerWeeklySamplesReducer)
                .SortBy(F_HOST, F_PATH)
                .Sort(WeeklyTable)
            ;
        } catch(const yexception &e) {
            LOG_ERROR("crawler, unable to reduce weekly samples table %s: %s", WeeklyTable.data(), e.what());
            AtomicIncrement(Errors);
        }
    }

public:
    NYT::IClientBasePtr Client;
    const TString &WeeklyTable;
    const NYT::TTableSchema &Schema;
    TAtomic &Errors;
};

// Accumulates the paths of tables that were touched during a run.
// Add() takes Mutex, so it can be called from several threads; Empty() reads Tables without locking.
struct TUpdatedTables {
    // Appends a table path under the lock.
    void Add(const TString &table) {
        with_lock(Mutex) {
            Tables.emplace_back(table);
        }
    }

    // True when no table has been recorded yet.
    bool Empty() const {
        return Tables.size() == 0;
    }

public:
    TDeque<TString> Tables;
    TMutex Mutex;
};

// Wraps a table name into a TRichYPath.
// Debugging hook: enable the AddRange() lines below to restrict the path to a few exact host keys.
static NYT::TRichYPath DebugPath(const TString &table) {
    NYT::TRichYPath richPath(table);
    // richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("http://odobri.ru"))));
    // richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("http://www.bloknotov.ru"))));
    // richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("http://www.pleer.ru"))));
    // richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("http://www.vedomosti.ru"))));
    // richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://lenta.ru"))));
    return richPath;
}

// Returns the greatest (by string comparison) table name under `root` that ends with "/ready-<digits>".
// Throws yexception when no such table exists.
TString GetLatestSitetree(NYT::IClientBasePtr client, const TString &root) {
    TDeque<NYTUtils::TTableInfo> candidates;
    NYTUtils::GetTableList(client, root, candidates, Max<int>());

    TString newest;

    TRegularExpression readyPattern("/ready-(\\d+)$");
    for (const auto &candidate : candidates) {
        TVector<TString> groups;
        if (!readyPattern.GetMatches(candidate.Name, groups)) {
            continue; // not a "ready-N" table
        }
        if (candidate.Name > newest) {
            newest = candidate.Name;
        }
    }

    if (!newest.empty()) {
        return newest;
    }

    ythrow yexception() << "there is no transmitted sitetree";
}

// Rebuilds the crawler's sitetree source table when a newer "ready" sitetree is available.
// The name of the sitetree used last time is kept in the "last_source" attribute of the target table;
// nothing is done when it already equals the latest one.
void UpdateSitetreeTable(NYT::IClientBasePtr client) {
    const TConfig &config = TConfig::CInstance();
    const NYT::TTableSchema targetSchema = GetSitetreeTableSchema();
    const TString newestSource = GetLatestSitetree(client, config.TABLE_SITETREE_ROOT);
    const TString usedSource = NYTUtils::GetAttrOrDefault<TString>(client, config.TABLE_CRAWLER_SOURCE_SITETREE, "last_source", "");

    if (newestSource == usedSource) {
        return; // already built from this sitetree
    }

    TOpRunner runner(client);
    runner.InputNode(DebugPath(newestSource));
    runner.OutputNode(NYT::TRichYPath(config.TABLE_CRAWLER_SOURCE_SITETREE).Schema(targetSchema));
    runner.ReduceBy(F_KEY);
    runner.Reduce(new TCrawlerSitetreePrepareReducer);

    // Remember which sitetree the table was built from.
    NYTUtils::SetAttr(client, config.TABLE_CRAWLER_SOURCE_SITETREE, "last_source", newestSource);
}

// Schedules one TTaskReduceSpreadCounters per updated spread counter table on a thread pool
// and waits for all of them. The HTTP and event output tables are named after the spread table.
// Throws yexception if any task reported an error.
void UpdateCrawlerCounters(NYT::IClientBasePtr client, const TDeque<TString> &updatedSpreadCounters) {
    const TConfig &config = TConfig::CInstance();

    TThreadPool pool(TThreadPool::TParams().SetBlocking(true).SetCatching(true));
    pool.Start(8, 32);

    TAtomic errors = 0;
    for (const TString &spreadTable : updatedSpreadCounters) {
        // The last path component of the spread table is reused as the output table name.
        const TString tableName = NYTUtils::GetTableName(spreadTable);
        const TString httpCounters = NYTUtils::JoinPath(config.TABLE_CRAWLER_RAW_COUNTERS_HTTP, tableName);
        const TString eventCounters = NYTUtils::JoinPath(config.TABLE_CRAWLER_RAW_COUNTERS_EVENT, tableName);

        pool.SafeAddAndOwn(MakeHolder<TTaskReduceSpreadCounters>(
            client, config.TABLE_CRAWLER_SOURCE_SITETREE, spreadTable, httpCounters, eventCounters, errors
        ));
    }

    pool.Stop();

    if (errors > 0) {
        ythrow yexception() << "something is wrong";
    }
}

template<class TReducer>
bool MergeDailyCounters(NYT::IClientBasePtr client, const TString &dailyPrefix, const TString &counterPrefix, const NYT::TTableSchema &schema, const NYT::TSortColumns &reduceBy, TUpdatedTables &updatedTables) try {
    TDeque<NYTUtils::TTableInfo> eventCounters;
    NYTUtils::GetTableList(client, counterPrefix, eventCounters);

    TMap<TString, TDeque<TString>> dailyEventCounters;
    for (const NYTUtils::TTableInfo &table : eventCounters) {
        const TString spreadTimestampStr = GetJupiterStateFromPath(table.Name);
        const time_t spreadTimestamp = GetTsTZFromJupiterState(spreadTimestampStr);
        const TString dailyTable = NYTUtils::JoinPath(dailyPrefix, "day_" + NUtils::Date2StrTZ(spreadTimestamp));
        dailyEventCounters[dailyTable].push_back(table.Name);
    }

    TAtomic errors = 0;
    TThreadPool countersReducerQueue(TThreadPool::TParams().SetBlocking(true).SetCatching(true));
    countersReducerQueue.Start(4, 32);
    for (const auto &dailyObj : dailyEventCounters) {
        const TString &dailyTable = dailyObj.first;
        const TDeque<TString> &counterTables = dailyObj.second;
        countersReducerQueue.SafeAddAndOwn(MakeHolder<TTaskReduceDailyCounters<TReducer>>(client, dailyTable, counterTables, schema, reduceBy, errors));
        updatedTables.Add(dailyTable);
    }
    countersReducerQueue.Stop();

    return errors == 0;
} catch (yexception &e) {
    LOG_ERROR("crawler, unable to merge daily counters: %s", e.what());
    return false;
}

// Merges event and HTTP raw counters into their daily tables via NUtils::RunAsync.
// Returns true only when both merges succeeded.
bool MergeDailyCounters(NYT::IClientBasePtr client, TUpdatedTables &updatedTables) {
    const TConfig &config = TConfig::CInstance();

    bool httpOk = false;
    bool eventOk = false;

    auto mergeEventCounters = [&] {
        const NYT::TTableSchema eventSchema = GetEventCountersTableSchema();
        eventOk = MergeDailyCounters<TCrawlerDailyEventCountersReducer>(
            client,
            config.TABLE_CRAWLER_DAILY_COUNTERS_EVENT,
            config.TABLE_CRAWLER_RAW_COUNTERS_EVENT,
            eventSchema,
            GetEventCountersKeys(),
            updatedTables
        );
    };

    auto mergeHttpCounters = [&] {
        const NYT::TTableSchema httpSchema = GetHttpCountersTableSchema();
        httpOk = MergeDailyCounters<TCrawlerDailyHttpCountersReducer>(
            client,
            config.TABLE_CRAWLER_DAILY_COUNTERS_HTTP,
            config.TABLE_CRAWLER_RAW_COUNTERS_HTTP,
            httpSchema,
            GetHttpCountersKeys(),
            updatedTables
        );
    };

    NUtils::RunAsync(mergeEventCounters, mergeHttpCounters);

    if (!httpOk) {
        return false;
    }
    return eventOk;
}

// Runs TTaskReduceWeeklySamples for every updated event and recent weekly samples table on a
// thread pool. The output schemas have the sort order removed from Host and Path.
// Returns true when no task reported an error; a yexception raised here is logged and turned
// into `false`.
bool MergeSamples(NYT::IClientBasePtr client, const TDeque<TString> &updatedRecentSamples, const TDeque<TString> &updatedEventSamples) {
    try {
        NYT::TTableSchema eventSchema = GetEventSamplesTableSchema();
        NYT::TTableSchema recentSchema = GetRecentSamplesTableSchema();

        // Re-declaring the key columns as plain strings drops their sort order.
        NYTUtils::PatchSchema(eventSchema, F_HOST, NYT::VT_STRING);
        NYTUtils::PatchSchema(eventSchema, F_PATH, NYT::VT_STRING);
        NYTUtils::PatchSchema(recentSchema, F_HOST, NYT::VT_STRING);
        NYTUtils::PatchSchema(recentSchema, F_PATH, NYT::VT_STRING);

        TAtomic errors = 0;
        TThreadPool pool(TThreadPool::TParams().SetBlocking(true).SetCatching(true));
        pool.Start(4, 32);

        // Table names and schemas are passed by reference; both outlive the pool.
        auto enqueueAll = [&](const TDeque<TString> &tables, const NYT::TTableSchema &schema) {
            for (const TString &weekTable : tables) {
                pool.SafeAddAndOwn(MakeHolder<TTaskReduceWeeklySamples>(client, weekTable, schema, errors));
            }
        };

        enqueueAll(updatedEventSamples, eventSchema);
        enqueueAll(updatedRecentSamples, recentSchema);

        pool.Stop();
        return errors == 0;
    } catch (const yexception &e) {
        LOG_ERROR("crawler, unable to merge weekly samples: %s", e.what());
        return false;
    }
}

// Builds `robotsTable` (Host, LastAccess, BannedByRobotsTxtSince, Robots) by reducing the Jupiter
// acceptance host table together with the host spread tables loaded for its timestamp, then
// sorts the result by Host.
void PrepareRobotsTxt(NYT::IClientBasePtr client, const THashSet<TString> &webmasterHosts, const TString &robotsTable) {
    const TString acceptanceHostTable = GetJupiterAcceptanceHostTable(client);
    const time_t acceptanceTs = GetJupiterTsTZFromPath(acceptanceHostTable);

    // Strict output schema; only Host carries a sort order.
    NYT::TTableSchema outSchema;
    outSchema.Strict(true);
    outSchema.AddColumn(NYT::TColumnSchema().Name(F_HOST).Type(NYT::VT_STRING).SortOrder(NYT::SO_ASCENDING));
    outSchema.AddColumn(NYT::TColumnSchema().Name(F_LAST_ACCESS).Type(NYT::VT_INT64));
    outSchema.AddColumn(NYT::TColumnSchema().Name(F_BANNED_BY_ROBOTSTXT_SINCE).Type(NYT::VT_INT64));
    outSchema.AddColumn(NYT::TColumnSchema().Name(F_ROBOTS).Type(NYT::VT_STRING));

    TOpRunner runner(client);
    runner.InputNode(acceptanceHostTable);

    TDeque<NYTUtils::TTableInfo> spreads;
    LoadHostSpreadTables(client, acceptanceTs, spreads);
    for (const NYTUtils::TTableInfo &spread : spreads) {
        runner.InputNode(spread.Name);
    }

    runner.OutputNode(NYT::TRichYPath(robotsTable).Schema(outSchema));
    runner.ReduceBy(F_HOST);
    runner.MemoryLimit(MEMORY_LIMIT_4GB);
    runner.Reduce(new TCrawlerPrepareRobotsReducer(webmasterHosts));

    runner.SortBy(F_HOST);
    runner.Sort(robotsTable);
}

// Reads every row of `robotsTable` into `robots`, keyed by Host.
// A later row with the same Host overwrites an earlier one.
void LoadRobotsTxt(NYT::IClientBasePtr client, const TString &robotsTable, THashMap<TString, TRobotsRecord> &robots) {
    auto rows = client->CreateTableReader<NYT::TNode>(robotsTable);
    while (rows->IsValid()) {
        const NYT::TNode &current = rows->GetRow();
        robots[current[F_HOST].AsString()] = TRobotsRecord(
            current[F_ROBOTS].AsString(),
            current[F_LAST_ACCESS].AsInt64(),
            current[F_BANNED_BY_ROBOTSTXT_SINCE].AsInt64()
        );
        rows->Next();
    }
}

int TaskUpdateCrawlerSamples(int, const char **) {
    const TConfig &config = TConfig::CInstance();

    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST);
    NYT::ITransactionPtr tx = client->StartTransaction();

    //const TString jupiterAcceptanceHostTable = GetJupiterAcceptanceHostTable(tx);
    const TString jupiterAcceptanceTable = GetJupiterAcceptanceTable(tx);
    const time_t jupiterAcceptanceTimestamp = GetJupiterTsTZFromPath(jupiterAcceptanceTable);
    const time_t newDiffTimestamp = Now().Seconds();

    NYTUtils::CreatePath(tx, config.TABLE_CRAWLER_WEEKLY_SAMPLES_RECENT);
    NYTUtils::CreatePath(tx, config.TABLE_CRAWLER_WEEKLY_SAMPLES_EVENT);
    NYTUtils::CreatePath(tx, config.TABLE_CRAWLER_RAW_COUNTERS_SPREAD);
    NYTUtils::CreatePath(tx, config.TABLE_CRAWLER_RAW_COUNTERS_HTTP);
    NYTUtils::CreatePath(tx, config.TABLE_CRAWLER_RAW_COUNTERS_EVENT);

    TUpdatedTables updatedTables;
    TDeque<TString> updatedRecentSamples;
    TDeque<TString> updatedEventSamples;
    TDeque<TString> updatedSpreadCounters;
    TDeque<TUrldatTableConfig> tablesInputConfig;
    THashMap<time_t, int> countersOutputConfig;
    TMap<TWeekTableConfig, int> eventSamplesOutputConfig;
    TMap<TWeekTableConfig, int> recentSamplesOutputConfig;

    const int MAX_SPREAD_TABLES_AT_ONCE = 8;
    TDeque<NYTUtils::TTableInfo> spreadTables;
    NYTUtils::GetTableList(tx, GetJupiterSpreadExportPrefix(), spreadTables);
    std::sort(spreadTables.begin(), spreadTables.end(),
        [] (const NWebmaster::NYTUtils::TTableInfo &lhs, const NWebmaster::NYTUtils::TTableInfo &rhs) -> bool {
            return lhs.Name < rhs.Name;
        }
    );

    NYTUtils::TBacklog processedBacklog(tx, config.TABLE_CRAWLER_PROCESSED_LOG, "ProcessedSpread");

    THashSet<TString> webmasterHosts;
    if (!NYTUtils::LoadWebmastersHosts(tx, config.TABLE_SOURCE_WEBMASTER_HOSTS, webmasterHosts, config.TABLE_SOURCE_WEBMASTER_HOSTS_ROW_COUNT)) {
        ythrow yexception() << "could not load webmaster hosts table";
    }

    int outputTableNo = 0;
    time_t mostRecentSpreadTimestamp = 0;
    size_t processedSpreadsCount = 0;
    TOpRunner runner(tx);

    if (tx->Exists(NImportantUrls::TConfig::CInstance().TABLE_IMPORTANT_URLS_SOURCE_URLS_PREPARED)) {
        tablesInputConfig.push_back(TUrldatTableConfig(TUrldatTableConfig::E_TABLE_IMPORTANT_URLS, 0));
        runner.InputNode(DebugPath(NImportantUrls::TConfig::CInstance().TABLE_IMPORTANT_URLS_SOURCE_URLS_PREPARED)); //!!!Table with ImportantUrls must be first in input!!!
    }

    THolder<NYT::TTableSchema> spreadSchema;

    for (const NYTUtils::TTableInfo &table : spreadTables) {
        if (NYTUtils::GetTableName(table.Name) != "urldat") {
            continue;
        }
        const TString directoryName = NYTUtils::GetDirectoryName(table.Name);
        const TString spreadTimestampStr = GetJupiterStateFromPath(table.Name);
        const time_t spreadTimestamp = GetTsTZFromJupiterState(spreadTimestampStr);

        try {
            if (!NYTUtils::GetAttr(tx, directoryName, "completed").AsBool()) {
                LOG_INFO("crawler, spread %s is not completed", table.Name.data());
                continue;
            }
        } catch(yexception &e) {
            LOG_WARN("crawler, unable to get spread table attribute: %s", e.what());
            continue;
        }

        if (spreadTimestamp < jupiterAcceptanceTimestamp) {
            continue;
        }

        if (!spreadSchema.Get()) {
            spreadSchema.Reset(new NYT::TTableSchema);
            *spreadSchema = NYTUtils::GetTableSchema(tx, table.Name);
            *spreadSchema = NYTUtils::DropSortOrder(*spreadSchema);
            NYTUtils::PatchSchema(*spreadSchema, F_SOURCE_ID, NYT::VT_INT64);
            NYTUtils::PatchSchema(*spreadSchema, F_SOURCE_NAME, NYT::VT_STRING);
        }

        TUrldatTableConfig::ETableType spreadState = TUrldatTableConfig::E_TABLE_SPREAD_NEW;
        if (processedBacklog.Has(spreadTimestamp)) {
            spreadState = TUrldatTableConfig::E_TABLE_SPREAD_PROCESSED;
        } else {
            processedSpreadsCount++;
            processedBacklog.Add(spreadTimestamp);
            TWeekTableConfig weekConfig(spreadTimestamp);

            if (!eventSamplesOutputConfig.contains(weekConfig)) {
                eventSamplesOutputConfig[weekConfig] = outputTableNo++;
                const TString eventTableName = NYTUtils::JoinPath(config.TABLE_CRAWLER_WEEKLY_SAMPLES_EVENT, "week_" + weekConfig.WeekName());
                runner.OutputNode(NYT::TRichYPath(eventTableName).Schema(GetEventSamplesTableSchema()));
                updatedTables.Add(eventTableName);

                recentSamplesOutputConfig[weekConfig] = outputTableNo++;
                const TString recentTableName = NYTUtils::JoinPath(config.TABLE_CRAWLER_WEEKLY_SAMPLES_RECENT, "week_" + weekConfig.WeekName());
                runner.OutputNode(NYT::TRichYPath(recentTableName).Schema(GetRecentSamplesTableSchema()));
                updatedTables.Add(recentTableName);

                updatedRecentSamples.push_back(recentTableName);
                updatedEventSamples.push_back(eventTableName);
            }

            const TString tableName = NYTUtils::JoinPath(config.TABLE_CRAWLER_RAW_COUNTERS_SPREAD, spreadTimestampStr);
            countersOutputConfig[spreadTimestamp] = outputTableNo++;
            runner.OutputNode(NYT::TRichYPath(tableName).Schema(GetSpreadCountersTableSchema()));
            updatedSpreadCounters.push_back(tableName);
        }

        mostRecentSpreadTimestamp = Max(mostRecentSpreadTimestamp, spreadTimestamp);
        tablesInputConfig.push_back(TUrldatTableConfig(spreadState, spreadTimestamp));
        runner.InputNode(DebugPath(table.Name));
        if (updatedSpreadCounters.size() > MAX_SPREAD_TABLES_AT_ONCE) { //to prevent "output tables times job count violation"
            break;
        }
    }

    if (updatedTables.Empty()) {
        LOG_INFO("crawler: there is no tables to update");
        return 0;
    }

    PrepareRobotsTxt(tx, webmasterHosts, config.TABLE_CRAWLER_ROBOTS_TXT);
    THashMap<TString, TRobotsRecord> robots;
    LoadRobotsTxt(tx, config.TABLE_CRAWLER_ROBOTS_TXT, robots);

    NYT::TTableSchema jupiterSchema = NYTUtils::GetTableSchema(tx, jupiterAcceptanceTable);
    int importantUrlsJupiterTableNo = outputTableNo++;
    const TString &sourceJupiterTable = NYTUtils::JoinPath(NImportantUrls::TConfig::CInstance().TABLE_IMPORTANT_URLS_SOURCE_JUPITER, ToString(jupiterAcceptanceTimestamp));
    runner.OutputNode(NYT::TRichYPath(sourceJupiterTable).Schema(jupiterSchema));

    int importantUrlsSpreadTableNo = outputTableNo++;
    const TString &sourceSpreadTable = NYTUtils::JoinPath(NImportantUrls::TConfig::CInstance().TABLE_IMPORTANT_URLS_SOURCE_SPREAD, ToString(mostRecentSpreadTimestamp));
    runner.OutputNode(NYT::TRichYPath(sourceSpreadTable).Schema(*spreadSchema));

    tablesInputConfig.push_back(TUrldatTableConfig(TUrldatTableConfig::E_TABLE_JUPITER_ACCEPTANCE, jupiterAcceptanceTimestamp));
    runner.InputNode(DebugPath(jupiterAcceptanceTable));

    TDeque<NYTUtils::TTableInfo> processedEventWeeks;
    NYTUtils::GetTableList(tx, config.TABLE_CRAWLER_WEEKLY_SAMPLES_EVENT, processedEventWeeks);
    for (const NYTUtils::TTableInfo &table : processedEventWeeks) {
        const time_t diffTimestamp = NYTUtils::GetAttrOrDefault<i64>(tx, table.Name, ATTR_LAST_PROCESSED, 0);
        tablesInputConfig.push_back(TUrldatTableConfig(TUrldatTableConfig::E_TABLE_EVENT_HISTORY_WEEKS, diffTimestamp));
        runner.InputNode(DebugPath(table.Name));
    }

    if (tx->Exists(config.TABLE_CRAWLER_TAIL_SAMPLES_EVENT)) {
        tablesInputConfig.push_back(TUrldatTableConfig(TUrldatTableConfig::E_TABLE_EVENT_HISTORY_TAIL, 0));
        runner.InputNode(DebugPath(config.TABLE_CRAWLER_TAIL_SAMPLES_EVENT));
    }

    TDeque<NYTUtils::TTableInfo> processedRecentWeeks;
    NYTUtils::GetTableList(tx, config.TABLE_CRAWLER_WEEKLY_SAMPLES_RECENT, processedRecentWeeks);
    for (const NYTUtils::TTableInfo &table : processedRecentWeeks) {
        const time_t diffTimestamp = NYTUtils::GetAttrOrDefault<i64>(tx, table.Name, ATTR_LAST_PROCESSED, 0);
        tablesInputConfig.push_back(TUrldatTableConfig(TUrldatTableConfig::E_TABLE_RECENT_HISTORY, diffTimestamp));
        runner.InputNode(DebugPath(table.Name));
    }

    runner
        .MemoryLimit(MEMORY_LIMIT_10GB)
        .ReduceBy(F_HOST, F_PATH)
        .Reduce(new TJupiterUrldatReducer(tablesInputConfig, eventSamplesOutputConfig, recentSamplesOutputConfig, countersOutputConfig, webmasterHosts,
            importantUrlsJupiterTableNo, importantUrlsSpreadTableNo, robots
        ))
        .SortBy(F_HOST, F_PATH)
        .Sort(sourceJupiterTable, ASYNC_CTX0)
        .SortBy(F_HOST, F_PATH)
        .Sort(sourceSpreadTable, ASYNC_CTX0)
        .Wait(ASYNC_CTX0)
    ;

    // for chunk compression
    runner
        .SortBy(F_TIMESTAMP)
        .Sort(config.TABLE_CRAWLER_PROCESSED_LOG);

    bool resultCounters = false;
    bool resultSamples = false;

    UpdateSitetreeTable(tx);
    UpdateCrawlerCounters(tx, updatedSpreadCounters);

    auto taskMergeCounters = [&] {
        resultCounters = MergeDailyCounters(tx, updatedTables);
    };

    auto taskMergeSamples = [&] {
        resultSamples = MergeSamples(tx, updatedRecentSamples, updatedEventSamples);
    };

    NUtils::RunAsync(taskMergeCounters, taskMergeSamples);

    if (!resultCounters || !resultSamples) {
        ythrow yexception() << "crawler: merge daily counters error";
    }

    for (const TString &table : updatedTables.Tables) {
        NYTUtils::SetAttr(tx, table, ATTR_LAST_PROCESSED, newDiffTimestamp);
    }

    tx->Commit();

    MonitorPushCrawlerSpreads(config.MONITOR_PERFORMANCE_SUFFIX, processedSpreadsCount);

    return 0;
}

// Folds the weekly sample tables under `samplesPrefix` that carry a true ATTR_TO_COMPACT attribute
// into `tailTable`:
//   1. MapReduce of the existing tail table (if any) plus the flagged weekly tables into the tail table;
//   2. sort of the tail table by Host, Path, Timestamp;
//   3. TReduceUnique over the tail table by the same key;
//   4. drop of the flagged weekly tables and update of ATTR_LAST_PROCESSED on the tail table.
// Does nothing when no weekly table is flagged.
void CompactCrawlerSamples(NYT::IClientBasePtr client, const TString &samplesPrefix, const TString &tailTable) {
    TDeque<NYTUtils::TTableInfo> weeklyTables;
    NYTUtils::GetTableList(client, samplesPrefix, weeklyTables);

    TDeque<TString> flagged;
    for (const NYTUtils::TTableInfo &weekly : weeklyTables) {
        if (NYTUtils::GetAttrOrDefault(client, weekly.Name, ATTR_TO_COMPACT, false)) {
            flagged.push_back(weekly.Name);
        }
    }

    if (flagged.empty()) {
        return;
    }

    TOpRunner runner(client);

    // Output schema is taken from the first flagged table; re-declaring the key columns
    // with their plain types drops the sort order.
    NYT::TTableSchema tailSchema = NYTUtils::GetTableSchema(client, flagged.front());
    NYTUtils::PatchSchema(tailSchema, F_HOST, NYT::VT_STRING);
    NYTUtils::PatchSchema(tailSchema, F_PATH, NYT::VT_STRING);
    NYTUtils::PatchSchema(tailSchema, F_TIMESTAMP, NYT::VT_INT64);

    if (client->Exists(tailTable)) {
        runner.InputNode(tailTable);
    }

    for (const TString &source : flagged) {
        runner.InputNode(source);
    }

    runner.OutputNode(NYT::TRichYPath(tailTable).Schema(tailSchema));
    runner.ReduceBy(F_HOST);
    runner.SortBy(F_HOST, F_LAST_ACCESS);
    runner.MapReduce(new TCrawlerWeeklySamplesMapper, new TCrawlerWeeklySamplesReducer);

    runner.SortBy(F_HOST, F_PATH, F_TIMESTAMP);
    runner.Sort(tailTable);

    // Re-read the schema of the sorted table and drop the sort order from Timestamp only.
    tailSchema = NYTUtils::GetTableSchema(client, tailTable);
    NYTUtils::PatchSchema(tailSchema, F_TIMESTAMP, NYT::VT_INT64);

    runner.InputNode(tailTable);
    runner.OutputNode(NYT::TRichYPath(tailTable).Schema(tailSchema));
    runner.ReduceBy(F_HOST, F_PATH, F_TIMESTAMP);
    runner.Reduce(new TReduceUnique);

    for (const TString &source : flagged) {
        runner.Drop(source);
    }

    const time_t compactedAt = Now().Seconds();
    NYTUtils::SetAttr(client, tailTable, ATTR_LAST_PROCESSED, compactedAt);
}

int TaskCompactEventSamples(int, const char **) {
    const TConfig &config = TConfig::CInstance();
    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST);
    NYT::ITransactionPtr tx = client->StartTransaction();
    CompactCrawlerSamples(tx, config.TABLE_CRAWLER_WEEKLY_SAMPLES_EVENT, config.TABLE_CRAWLER_TAIL_SAMPLES_EVENT);
    tx->Commit();
    return 0;
}

int TaskCompactRecentSamples(int, const char **) {
    const TConfig &config = TConfig::CInstance();
    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST);
    NYT::ITransactionPtr tx = client->StartTransaction();
    CompactCrawlerSamples(tx, config.TABLE_CRAWLER_WEEKLY_SAMPLES_RECENT, config.TABLE_CRAWLER_TAIL_SAMPLES_RECENT);
    tx->Commit();
    return 0;
}

} //namespace NWebmaster
