package ru.yandex.webmaster3.worker.searchurl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.UUID;
import java.util.function.Predicate;
import java.util.regex.Pattern;

import com.datastax.driver.core.utils.UUIDs;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.google.common.collect.Range;
import lombok.AllArgsConstructor;
import lombok.Setter;
import lombok.Value;
import org.apache.commons.lang3.tuple.Pair;
import org.joda.time.DateTime;
import org.joda.time.Days;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.joda.time.LocalDate;

import ru.yandex.webmaster3.core.WebmasterException;
import ru.yandex.webmaster3.core.checklist.data.SiteProblemState;
import ru.yandex.webmaster3.core.checklist.data.SiteProblemTypeEnum;
import ru.yandex.webmaster3.core.data.WebmasterHostId;
import ru.yandex.webmaster3.core.http.WebmasterErrorResponse;
import ru.yandex.webmaster3.core.util.IdUtils;
import ru.yandex.webmaster3.core.util.RetryUtils;
import ru.yandex.webmaster3.core.worker.client.WorkerClient;
import ru.yandex.webmaster3.core.worker.task.PeriodicTaskType;
import ru.yandex.webmaster3.core.worker.task.Test404ErrorsAbsenceTaskData;
import ru.yandex.webmaster3.proto.Urltree;
import ru.yandex.webmaster3.storage.abt.AbtService;
import ru.yandex.webmaster3.storage.abt.model.Experiment;
import ru.yandex.webmaster3.storage.checklist.data.ProblemSignal;
import ru.yandex.webmaster3.storage.checklist.data.RealTimeSiteProblemInfo;
import ru.yandex.webmaster3.storage.checklist.service.SiteProblemsService;
import ru.yandex.webmaster3.storage.clickhouse.TableType;
import ru.yandex.webmaster3.storage.clickhouse.system.dao.ClickhouseSystemTablesCHDao;
import ru.yandex.webmaster3.storage.host.AllHostsCacheService;
import ru.yandex.webmaster3.storage.host.CommonDataType;
import ru.yandex.webmaster3.storage.jupiter.JupiterUtils;
import ru.yandex.webmaster3.storage.searchurl.history.dao.SiteStructureCHDao;
import ru.yandex.webmaster3.storage.settings.SettingsService;
import ru.yandex.webmaster3.storage.util.clickhouse2.CHTable;
import ru.yandex.webmaster3.storage.util.clickhouse2.ClickhouseHost;
import ru.yandex.webmaster3.storage.util.clickhouse2.ClickhouseQueryContext;
import ru.yandex.webmaster3.storage.util.yt.AsyncTableReader;
import ru.yandex.webmaster3.storage.util.yt.YtNode;
import ru.yandex.webmaster3.storage.util.yt.YtPath;
import ru.yandex.webmaster3.storage.util.yt.YtTableReadDriver;
import ru.yandex.webmaster3.storage.yql.YqlFunctions;
import ru.yandex.webmaster3.storage.yql.YqlQueryBuilder;
import ru.yandex.webmaster3.storage.ytimport.ImportPriority;
import ru.yandex.webmaster3.storage.ytimport.YtClickhouseDataLoad;
import ru.yandex.webmaster3.storage.ytimport.YtClickhouseDataLoadState;
import ru.yandex.webmaster3.storage.ytimport.YtClickhouseDataLoadType;
import ru.yandex.webmaster3.storage.ytimport.YtClickhouseImportCommand;
import ru.yandex.webmaster3.storage.ytimport.YtClickhouseTableRelation;
import ru.yandex.webmaster3.worker.TaskSchedule;
import ru.yandex.webmaster3.worker.turbo.AbstractYqlPrepareImportTask;

/**
 * Created by Oleg Bazdyrev on 04/03/2021.
 */
public class MdbImportSiteStructuresToChTask extends AbstractYqlPrepareImportTask {

    /** Name of the YT node attribute that {@code rename} reads to derive the base date of the imported data. */
    private static final String ATTR_SOURCE = "acceptance_table";
    /** Accepts only table names that match {@code ready-<digits>} in full. */
    private static final Predicate<String> SOURCE_TABLE_PREDICATE = Pattern.compile("(ready-[0-9]+)").asMatchPredicate();
    /** Modulus applied to the host hash to obtain {@code RowId} in the intermediate table. */
    private static final int ROW_COUNT = 256;
    /**
     * YQL preamble with the helpers used by the intermediate-table query:
     * <ul>
     *   <li>{@code $javaStringHash} - Python3 UDF: rolling hash with multiplier 31 over the elements of the
     *       input, truncated to 32 bits and mapped to a signed value.
     *       NOTE(review): presumably meant to reproduce {@code String.hashCode()}; whether it agrees for
     *       non-ASCII input depends on what the UDF iterates over - confirm.</li>
     *   <li>{@code $getNodeFullPath} - Python3 UDF: follows {@code parent_id} links in the supplied dictionary
     *       until id 0 is reached, prepending every visited node's {@code path}.</li>
     *   <li>{@code $listToJson} - YQL lambda: returns {@code '[]'} for a null list, otherwise the list
     *       serialized as JSON and cast to String.</li>
     * </ul>
     */
    private static final String AUX_YQL_FUNCTIONS = "" +
            "\n$script = @@\n" +
            "def JavaStringHash(s):\n" +
            "    res = 0\n" +
            "    for c in s:\n" +
            "        res = (31 * res + c) & 0xffffffff\n" +
            "    if res > 0x7fffffff:\n" +
            "        return (res - 0x100000000)\n" +
            "    else: \n" +
            "        return res\n" +
            "        \n" +
            "def GetNodeFullPath(id, node_path_dict):\n" +
            "    path = ''\n" +
            "    while id != 0:\n" +
            "        path = node_path_dict[id].path + path\n" +
            "        id = node_path_dict[id].parent_id\n" +
            "    return path    \n" +
            "@@;\n" +
            "$javaStringHash = Python3::JavaStringHash(Callable<(String?)->Int32>, $script);\n" +
            "$getNodeFullPath = Python3::GetNodeFullPath(Callable<(Int32?, Dict<Int32?, Struct<parent_id:Int32?,path:Utf8?>>?)->String>, $script);\n" +
            "$listToJson = ($l) -> { return if($l is null, '[]', cast(Yson::SerializeJson(Yson::From($l)) as String)); };\n\n";

    // Collaborators below are populated through Lombok-generated setters.

    /** Used to check whether a host takes part in the NO_404_ERRORS_ALERT experiment. */
    @Setter
    private AbtService abtService;
    /** Used to skip hosts that are not present in the all-hosts cache. */
    @Setter
    private AllHostsCacheService allHostsCacheService;
    /** Used to find out whether the temporary import table exists on a ClickHouse host. */
    @Setter
    private ClickhouseSystemTablesCHDao clickhouseSystemTablesCHDao;
    /** Stores the LAST_IMPORTED_SITE_STRUCTURES value after a successful import. */
    @Setter
    private SettingsService settingsService;
    /** Reads and updates NO_404_ERRORS problems of hosts. */
    @Setter
    private SiteProblemsService siteProblemsService;
    /** Enqueues Test404ErrorsAbsenceTaskData tasks. */
    @Setter
    private WorkerClient workerClient;

    /** YT table with per-host no-404 flags; assigned in {@code init()}. */
    private YtPath no404StatusTable;

    /**
     * Resolves the path of the "no-404-status" table under {@code workDir}.
     * NOTE(review): presumably invoked by the container after the setters have run - confirm in the wiring.
     */
    public void init() throws Exception {
        no404StatusTable = YtPath.path(workDir, "no-404-status");
    }

    /** Returns the shard count reported by {@code clickhouseServer}. */
    @Override
    protected int getShardsCount() {
        return clickhouseServer.getShardsCount();
    }

    /**
     * Picks the next source table to import.
     * <p>
     * Lists the nodes under {@code tablePath}, keeps names accepted by {@link #SOURCE_TABLE_PREDICATE} that are
     * lexicographically greater than the name stored in the previous import (empty string when none), and
     * takes the first of them in sorted order.
     *
     * @param latestImport state of the most recent import
     * @return the import pointed at the chosen table, or {@code latestImport} in state DONE when there is
     *         nothing newer to process
     */
    @Override
    protected YtClickhouseDataLoad init(YtClickhouseDataLoad latestImport) throws Exception {
        return ytService.withoutTransactionQuery(cypress -> {
            List<YtPath> candidates = cypress.list(tablePath);
            final String previous = Objects.requireNonNullElse(latestImport.getData(), "");
            Optional<String> nextTable = candidates.stream()
                    .sorted()
                    .map(YtPath::getName)
                    .filter(name -> SOURCE_TABLE_PREDICATE.test(name) && previous.compareTo(name) < 0)
                    .findFirst();
            if (nextTable.isPresent()) {
                String name = nextTable.get();
                return latestImport.withData(name)
                        .withSourceTable(YtPath.path(tablePath, name), LocalDate.now(), LocalDate.now());
            }
            // nothing newer than the last processed table
            return latestImport.withState(YtClickhouseDataLoadState.DONE);
        });
    }

    /**
     * Builds the YQL script executed before the ClickHouse import. The script contains two statements:
     * <ol>
     *   <li>an insert into the intermediate table that flattens the nodes and user nodes of every host,
     *       renders each node as a tab-separated line and stores the gzip-compressed concatenation of the
     *       lines per (ShardId, RowId);</li>
     *   <li>an insert (with truncate) into {@code no404StatusTable} of a per-host {@code no_404} flag, which
     *       is true when the root node has an http code 200 entry with total &gt; 1 and no http code 404
     *       entry with total &gt; 0.</li>
     * </ol>
     *
     * @param imprt import whose source table is read by both statements
     * @return the query builder holding the complete script
     */
    @Override
    protected YqlQueryBuilder prepareIntermediateTable(YtClickhouseDataLoad imprt) {
        int shardCount = getShardsCount();
        YqlQueryBuilder queryBuilder = new YqlQueryBuilder();
        queryBuilder
                .cluster(tablePath)
                .inferSchema(YqlQueryBuilder.InferSchemaMode.INFER)
                .appendText("PRAGMA yt.MaxRowWeight = '128M';\n")
                .appendText("PRAGMA yt.DefaultMemoryLimit = '4G';\n\n")
                .appendText(AUX_YQL_FUNCTIONS)
                // statement 1: per-shard, per-row compressed chunks of node lines
                .appendText("INSERT INTO " + INTERMEDIATE_TABLE)
                .appendText("SELECT ShardId, RowId, Compress::Gzip(String::JoinFromList(AGGREGATE_LIST(data), ''), 6) as data FROM\n")
                .appendText("(\n")
                .appendText("  SELECT\n")
                .appendText("    (Digest::Fnv64(").appendFCall(YqlFunctions.url2HostId("host")).appendText(") % " + shardCount + ") as ShardId,\n")
                .appendText("    (Digest::CityHash(").appendFCall(YqlFunctions.url2HostId("host")).appendText(") % " + ROW_COUNT + ") as RowId,\n")
                // one tab-separated line per node, terminated by a newline
                .appendText("    (")
                .appendText("    ").appendFCall(YqlFunctions.url2HostId("host")).appendText(" || '\\t' ||\n")
                .appendText("    cast($javaStringHash(if (n.is_user, 'user::' || n.name, $getNodeFullPath(n.node_id, node_path_dict))) as String) || '\\t' ||\n")
                .appendText("    cast(ts as String) || '\\t' ||\n")
                .appendText("    if(n.is_user, '1', '0') || '\\t' ||\n")
                .appendText("    cast($javaStringHash($getNodeFullPath(n.parent_id, node_path_dict)) as String) || '\\t' ||\n")
                .appendText("    String::EscapeC(n.name) || '\\t' ||\n")
                .appendText("    cast(n.num_of_docs as String) || '\\t' ||\n")
                .appendText("    cast(n.num_of_docs_on_search as String) || '\\t' ||\n")
                .appendText("    cast(n.num_of_doubles as String) || '\\t' ||\n")
                .appendText("    cast(n.num_of_new_search_docs as String) || '\\t' ||\n")
                .appendText("    cast(n.num_of_gone_search_docs as String) || '\\t' ||\n")
                .appendText("    $listToJson(ListMap(n.httpcodes, ($p) -> { return AsTuple($p.code, $p.total); })) || '\\t' ||\n")
                .appendText("    $listToJson(ListMap(n.url_statuses, ($p) -> { return AsTuple($p.url_status, $p.total); })) || '\\t' ||\n")
                .appendText("    $listToJson(ListMap(n.turbo_source_info, ($p) -> { return AsTuple($p.source_id, $p.total); })) || '\\n'\n")
                .appendText("    ) as data\n")
                .appendText("  FROM\n")
                .appendText("  (\n")
                // regular nodes and user nodes are merged into one list, tagged with is_user
                .appendText("    SELECT host, message.searchdb_production_timestamp as ts, \n")
                .appendText("      ListExtend(\n")
                .appendText("        ListMap(nvl(message.nodes, AsList()), ($n) -> { return AddMember($n, \"is_user\", False); }),\n")
                .appendText("        ListMap(nvl(message.user_nodes, AsList()), ($n) -> { return AddMember($n, \"is_user\", True); })\n")
                .appendText("      ) as nodes,\n")
                // node_id -> (parent_id, name) lookup consumed by $getNodeFullPath
                .appendText("      ToDict(ListMap(message.nodes, ($n) -> { return AsTuple($n.node_id, AsStruct($n.parent_id as parent_id, cast($n.name as Utf8) as path) ); })) as node_path_dict,\n")
                .appendText("    FROM\n")
                .appendText("    (\n")
                .appendText("      SELECT key as host, ")
                .appendFCall(YqlFunctions.parseProto("NWebmaster.proto.urltree.HostInfo", YqlFunctions.protoMeta(Urltree.getDescriptor()), "value"))
                .appendText(" as message\n")
                .appendText("      FROM").appendTable(imprt.getSourceTable()).appendText("\n")
                .appendText("    )\n")
                .appendText("  )\n")
                .appendText("  FLATTEN LIST BY nodes as n\n")
                .appendText("  WHERE n.shard_id = 0 and n.search_source_id = 0\n")
                .appendText(")\n")
                .appendText("GROUP BY ShardId, RowId;\n")
                .appendText("\n\n")
                // statement 2: per-host no_404 flag computed from the root node ('/' with parent_id 0)
                // NOTE(review): "WITH TRUNCATE" is followed directly by "SELECT" with no whitespace in the
                // literals; this relies on YqlQueryBuilder inserting a separator - confirm.
                .appendText("INSERT INTO").appendTable(no404StatusTable).appendText("WITH TRUNCATE")
                .appendText("SELECT host, (ListHasItems(ListFilter(root_node.httpcodes, ($h) -> { return $h.code == 200 and $h.total > 1; })) and\n")
                .appendText("not ListHasItems(ListFilter(root_node.httpcodes, ($h) -> { return $h.code == 404 and $h.total > 0; })) ) as no_404\n")
                .appendText("FROM\n")
                .appendText("(\n")
                .appendText("  SELECT key as host, ListFilter(")
                .appendFCall(YqlFunctions.parseProto("NWebmaster.proto.urltree.HostInfo", YqlFunctions.protoMeta(Urltree.getDescriptor()), "value"))
                .appendText(".nodes, ($n) -> { return $n.shard_id = 0 and $n.search_source_id = 0 and $n.parent_id = 0 and $n.name == '/'; })[0] as root_node\n")
                .appendText("FROM").appendTable(imprt.getSourceTable()).appendText("\n")
                .appendText(");\n\n")
                .appendText("COMMIT;\n\n");

        return queryBuilder;
    }

    /**
     * Starts the YT to ClickHouse import of the prepared tables.
     * <p>
     * The i-th prepared table is loaded into shard i, into a temporary replicated merge-tree table whose
     * name is derived from the import data with '-' replaced by '_'.
     *
     * @param imprt import with prepared tables, one per shard
     * @return the import carrying the id of the started task, moved to the next state
     */
    @Override
    protected YtClickhouseDataLoad doImport(YtClickhouseDataLoad imprt) throws Exception {
        final int shards = getShardsCount();
        final List<YtPath> prepared = imprt.getPreparedTables();
        final UUID importTaskId = UUIDs.timeBased();
        log.info("Import taskId={}", importTaskId);

        List<YtClickhouseTableRelation> relations = new ArrayList<>();
        for (int shard = 0; shard < shards; shard++) {
            // ClickHouse table names use '_' where the source table name has '-'
            String suffix = imprt.getData().replace("-", "_");
            String spec = getTable().createMergeTreeSpec(-1, suffix);
            String targetName = getTable().replicatedMergeTreeTableName(-1, suffix);
            relations.add(new YtClickhouseTableRelation(prepared.get(shard), shard, targetName, spec));
        }

        ytClickhouseImportManager.startImport(new YtClickhouseImportCommand(
                importTaskId,
                relations,
                getTable().getDatabase(),
                getTable().importSpec(),
                ImportPriority.ONLINE
        ));
        return imprt.withImportTaskIds(importTaskId).withNextState();
    }

    /** No work is done at this step; the import is simply moved to the next state. */
    @Override
    protected YtClickhouseDataLoad replicate(YtClickhouseDataLoad imprt) throws Exception {
        return imprt.withNextState();
    }

    /**
     * Moves freshly imported data from the temporary table into the main site-structure table.
     * <p>
     * For every ClickHouse host that has the temporary table: the timestamp range of the new data is read,
     * rows of the main table within that range are deleted, all partitions of the temporary table are
     * attached to the main table and the temporary table is dropped. These three statements are executed
     * under a linear-backoff retry policy. Afterwards the no-404 check is run and, when the source table
     * carries the {@link #ATTR_SOURCE} attribute, the last imported base date is stored in settings.
     *
     * @param imprt import being finalized
     * @return the import moved to the next state
     */
    @Override
    protected YtClickhouseDataLoad rename(YtClickhouseDataLoad imprt) throws Exception {
        final String db = getTable().getDatabase();
        final String stagingTable = getTable().replicatedMergeTreeTableName(-1, imprt.getData().replace("-", "_"));
        final String qualifiedStaging = db + "." + stagingTable;
        final String qualifiedTarget = db + "." + SiteStructureCHDao.FULL_TABLE_NAME;

        // the ATTACH statement does not depend on the host, so it is assembled once
        StringBuilder attach = new StringBuilder("ALTER TABLE ").append(qualifiedTarget);
        for (int p = 0; p < SiteStructureCHDao.PARTITIONS_COUNT; p++) {
            attach.append(p == 0 ? "" : ",")
                    .append(" ATTACH PARTITION '").append(p).append("' FROM ").append(qualifiedStaging);
        }
        final String attachStatement = attach.toString();

        // remove the range being loaded from the main table and pour the fresh data in
        for (ClickhouseHost host : clickhouseServer.getHosts()) {
            boolean stagingMissing =
                    clickhouseSystemTablesCHDao.getTables(host, db, Collections.singleton(stagingTable)).isEmpty();
            if (stagingMissing) {
                continue;
            }
            // minimum and maximum timestamp present in the new table
            ClickhouseQueryContext.Builder ctx = ClickhouseQueryContext.useDefaults().setHost(host).setTimeout(Duration.standardMinutes(10L));
            Pair<Long, Long> tsBounds = clickhouseServer.queryOne(ctx, "SELECT min(timestamp), max(timestamp) FROM " + qualifiedStaging,
                    chRow -> Pair.of(chRow.getLong(0), chRow.getLong(1))).orElseThrow();
            RetryUtils.execute(RetryUtils.linearBackoff(3, Duration.standardMinutes(1L)), () -> {
                log.info("Attach partitions on shard {} DC {}", host.getShard(), host.getDcName());
                // delete the old rows
                clickhouseServer.execute(ctx, "ALTER TABLE " + qualifiedTarget + " DELETE WHERE timestamp between "
                        + tsBounds.getLeft() + " and " + tsBounds.getRight());
                // insert the new ones
                clickhouseServer.execute(ctx, attachStatement);
                // drop the temporary table
                clickhouseServer.execute(ctx, "DROP TABLE " + qualifiedStaging);
            });
        }
        checkForNo404Errors();
        // remember the last imported date
        ytService.withoutTransaction(cypress -> {
            YtNode sourceNode = cypress.getNode(imprt.getSourceTable());
            if (!sourceNode.getNodeMeta().has(ATTR_SOURCE)) {
                return true;
            }
            Instant baseDate = JupiterUtils.getBaseDateFromPath(sourceNode.getNodeMeta().get(ATTR_SOURCE).asText());
            Preconditions.checkState(baseDate != null);
            long epochSeconds = baseDate.getMillis() / 1000L;
            settingsService.update(CommonDataType.LAST_IMPORTED_SITE_STRUCTURES, String.valueOf(epochSeconds));
            return true;
        });
        return imprt.withNextState();
    }

    // TODO WMC-11312
    /**
     * Reads {@code no404StatusTable} and updates NO_404_ERRORS problems of the hosts listed there.
     * <p>
     * Rows are skipped when the host is not in the all-hosts cache or does not take part in the
     * NO_404_ERRORS_ALERT experiment. The remaining hosts are collected into batches and handed to
     * {@link #updateNo404Problems}; the last, possibly incomplete, batch is flushed once the whole table
     * has been read, so no host is left unprocessed.
     *
     * @throws WebmasterException when reading the YT table fails with an I/O error
     */
    private void checkForNo404Errors() {
        final int batchLimit = 500;
        DateTime now = DateTime.now();
        // walk over hosts that have N+ indexed pages but not a single page with http code 404
        ytService.inTransaction(no404StatusTable).execute(cypressService -> {
            AsyncTableReader<No404StatusRow> tableReader = new AsyncTableReader<>(cypressService, no404StatusTable, Range.all(),
                    YtTableReadDriver.createYSONDriver(No404StatusRow.class)).withThreadName("no-404-status-reader");
            Map<WebmasterHostId, Boolean> batch = new HashMap<>();
            try (var iterator = tableReader.read()) {
                while (iterator.hasNext()) {
                    No404StatusRow row = iterator.next();
                    try {
                        WebmasterHostId hostId = IdUtils.urlToHostId(row.getHost());
                        if (!allHostsCacheService.contains(hostId)) {
                            continue;
                        }
                        if (!abtService.isInHashExperiment(hostId, Experiment.NO_404_ERRORS_ALERT)) {
                            continue;
                        }
                        // a missing flag is treated as "has 404 pages"
                        batch.put(hostId, Boolean.TRUE.equals(row.getNo404()));
                        if (batch.size() >= batchLimit) {
                            // updateNo404Problems clears the batch when it is done
                            updateNo404Problems(batch, now);
                        }
                    } catch (IllegalArgumentException e) {
                        log.warn("Bad host {}", row.getHost());
                    }
                }
            } catch (IOException e) {
                throw new WebmasterException("Error reading no-404 statuses table from YT", new WebmasterErrorResponse.YTServiceErrorResponse(getClass(), e), e);
            }
            // flush the tail: hosts accumulated after the last full batch would otherwise be dropped
            if (!batch.isEmpty()) {
                updateNo404Problems(batch, now);
            }
            return true;
        });
    }

    /**
     * Applies one batch of no-404 flags and then empties the batch.
     * <p>
     * Hosts flagged as having no 404 pages get a {@link Test404ErrorsAbsenceTaskData} task when they have
     * no NO_404_ERRORS problem record, the record has no last-update time, or it was last updated more than
     * seven days before {@code now}. Hosts without the flag whose existing problem state reports
     * {@code isPresent()} are signalled as ABSENT.
     *
     * @param batch host id to no-404 flag; cleared before returning
     * @param now   reference time for the staleness check and for the emitted signals
     */
    private void updateNo404Problems(Map<WebmasterHostId, Boolean> batch, DateTime now) {
        Map<WebmasterHostId, RealTimeSiteProblemInfo> knownProblems =
                siteProblemsService.listSitesProblems(batch.keySet(), SiteProblemTypeEnum.NO_404_ERRORS);
        List<Test404ErrorsAbsenceTaskData> recheckTasks = new ArrayList<>();
        Map<WebmasterHostId, Pair<ProblemSignal, RealTimeSiteProblemInfo>> resolved = new HashMap<>();

        for (Map.Entry<WebmasterHostId, Boolean> entry : batch.entrySet()) {
            WebmasterHostId hostId = entry.getKey();
            RealTimeSiteProblemInfo problem = knownProblems.get(hostId);

            if (!entry.getValue()) {
                // the host does serve 404 pages: close the problem if it is currently raised
                if (problem != null && problem.getState().isPresent()) {
                    ProblemSignal absent = new ProblemSignal(SiteProblemTypeEnum.NO_404_ERRORS, SiteProblemState.ABSENT, now);
                    resolved.put(hostId, Pair.of(absent, problem));
                }
                continue;
            }

            // no 404 pages seen: schedule a check unless the problem was refreshed within the last week
            boolean needsCheck = problem == null
                    || problem.getLastUpdate() == null
                    || problem.getLastUpdate().isBefore(now.minus(Days.days(7)));
            if (needsCheck) {
                recheckTasks.add(new Test404ErrorsAbsenceTaskData(hostId));
            }
        }

        siteProblemsService.updateRealTimeProblem(resolved);
        workerClient.enqueueBatch(recheckTasks);
        batch.clear();
    }

    /** No work is done at this step; the import is simply moved to the next state. */
    @Override
    protected YtClickhouseDataLoad createDistributedTables(YtClickhouseDataLoad imprt) throws Exception {
        return imprt.withNextState();
    }

    /** Returns the ClickHouse table descriptor of site structures, {@code SiteStructureCHDao.TABLE}. */
    @Override
    protected CHTable getTable() {
        return SiteStructureCHDao.TABLE;
    }

    /** Returns the table type handled by this task, {@code TableType.SITE_STRUCTURE}. */
    @Override
    protected TableType getTableType() {
        return TableType.SITE_STRUCTURE;
    }

    /** Returns the data-load type of this import, {@code YtClickhouseDataLoadType.SITE_STRUCTURE}. */
    @Override
    protected YtClickhouseDataLoadType getImportType() {
        return YtClickhouseDataLoadType.SITE_STRUCTURE;
    }

    /** Returns the periodic task type of this task, {@code PeriodicTaskType.MDB_IMPORT_SITE_STRUCTURES_CH}. */
    @Override
    public PeriodicTaskType getType() {
        return PeriodicTaskType.MDB_IMPORT_SITE_STRUCTURES_CH;
    }

    /**
     * Returns the cron-based schedule of the task.
     * NOTE(review): the expression has six fields; presumably seconds come first, i.e. the task starts at
     * minute 17 of every hour - confirm against TaskSchedule.startByCron.
     */
    @Override
    public TaskSchedule getSchedule() {
        return TaskSchedule.startByCron("0 17 * * * *");
    }

    /**
     * One row of the no-404 status table, bound by Jackson through the all-args constructor
     * that Lombok generates and marks with {@code @JsonCreator}.
     */
    @Value
    @AllArgsConstructor(onConstructor_ = @JsonCreator)
    private static class No404StatusRow {
        /** Value of the {@code host} column. */
        @JsonProperty("host")
        String host;
        /** Value of the {@code no_404} column; may be null, which readers treat as false. */
        @JsonProperty("no_404")
        Boolean no404;
    }

}
