package ru.yandex.crypta.graph2.soup.workflow;

import ru.yandex.bolts.collection.Cf;
import ru.yandex.crypta.graph2.dao.Dao;
import ru.yandex.crypta.graph2.model.soup.edge.EdgeType;
import ru.yandex.crypta.graph2.model.soup.edge.EdgeTypeActivityStats;
import ru.yandex.crypta.graph2.soup.config.SoupAndStorageProcessingParams;
import ru.yandex.crypta.graph2.soup.workflow.ops.CalculateEdgeTypesStats;
import ru.yandex.crypta.graph2.workflow.Task;
import ru.yandex.inside.yt.kosher.cypress.YPath;

/**
 * Task that calculates activity statistics per edge type from the merged soup table.
 *
 * <p>The task takes the path of the merged soup table as input and produces the
 * {@code dates_count_per_edge_type} table located under the task's working directory.
 */
public class CalculateSoupStatsTask extends Task<YPath, YPath, SoupAndStorageProcessingParams> {

    /** Path of the resulting statistics table; assigned once in the constructor. */
    private YPath outStatsTable;

    /**
     * Creates the task and resolves the output table path as a child of {@code workdir}.
     *
     * @param dao     data access facade handed over to the base task
     * @param workdir directory under which the output table is placed
     * @param params  processing parameters handed over to the base task
     */
    public CalculateSoupStatsTask(Dao dao, YPath workdir, SoupAndStorageProcessingParams params) {
        super(dao, workdir, params);

        this.outStatsTable = workdir.child("dates_count_per_edge_type");
    }

    /**
     * Builds the statistics table from the merged soup table.
     *
     * <p>Ensures the working directory exists, then, inside a single transaction, creates the
     * output table with the {@link EdgeTypeActivityStats} schema, runs a map pass over the soup
     * and finally a map-reduce pass that rewrites the output table in place.
     *
     * @param soupMergedTable path of the merged soup table used as the map input
     */
    @Override
    protected void runImpl(YPath soupMergedTable) {
        // Local alias so the lambda below works with a single captured reference.
        final YPath statsTable = outStatsTable;

        dao.ytCypress().ensureDir(workdir);

        dao.ytTr().withTransactionId(transactionId -> {
            dao.ytCypress().createTableWithSchema(
                    transactionId,
                    statsTable,
                    EdgeTypeActivityStats.class
            );

            // The work is deliberately split into two operations: with a single map-reduce
            // the YT scheduler fails to pick a sensible data-per-job size.

            // Pass 1: map the soup into the stats table.
            dao.ytOps().mapSync(
                    transactionId,
                    Cf.list(soupMergedTable),
                    Cf.list(statsTable),
                    new CalculateEdgeTypesStats.Mapper()
            );

            // Pass 2: reduce the stats table onto itself, keyed by the edge type unique key.
            // NOTE(review): the null presumably fills the mapper slot, since mapping
            // was already done in pass 1 - confirm against the ytOps API.
            dao.ytOps().mapReduceSync(
                    transactionId,
                    Cf.list(statsTable),
                    Cf.list(statsTable),
                    null,
                    EdgeType.EDGE_TYPE_UNIQUE_KEY,
                    new CalculateEdgeTypesStats.Reducer()
            );
        });
    }

    /**
     * Returns the path of the statistics table written by this task.
     */
    @Override
    public YPath getOutput() {
        return outStatsTable;
    }

    /**
     * Returns a short human-readable description of the task.
     */
    @Override
    public String getDescription() {
        return "Calculate activity statistics per edge type";
    }
}
