# -*- coding: utf-8 -*-
"""
Генерация индексов хроносрезов для ADVQ (таблицы из //home/advq/advq/{phits_type}{db}/weeklyhits
и //home/advq/advq/{phits_type}/{db}/monthlyhits.

Поддерживается генерация многотомных архивов и дельта-чанков.
"""
import re
import json
import logging
import tempfile
import os

import sandbox.common.types.task as ctt
from sandbox import sdk2
import sandbox.sdk2.helpers
from sandbox.common.errors import TaskFailure
from sandbox.common.types.task import Semaphores
from sandbox.projects.advq.AdvqGenChronoIndex import AdvqGenChronoIndex, CHRONO_TYPE_MONTH, CHRONO_TYPE_WEEK
from sandbox.projects.advq.artifacts import SUMHITS_GENERATOR
from sandbox.projects.advq.common import validate_arcadia_rev, YT_MINIMAL_REV, DEFAULT_PROCESS_TIMEOUT
from sandbox.projects.advq.common.parameters import releaseTo_params, PhitsParameters, SandboxParameters
from sandbox.projects.advq.common.sandbox_utils import get_sandbox_env_from_parameters
from sandbox.projects.advq.common.yt_utils import get_yt_env_from_parameters
from sandbox.sandboxsdk.environments import PipEnvironment
from sandbox.sdk2 import ResourceData
from sandbox.sdk2.task import WaitTask
from sandbox.sdk2.helpers import subprocess as sp


# Default kill timeouts, in seconds; used as defaults of the child_*_kill_timeout task parameters.
CHRONO_INDEX_TIMEOUT = 48 * 60 * 60  # A build usually takes about 24h, so we allow 48h.
CHRONO_CHUNK_TIMEOUT = 12 * 60 * 60

# Semaphore name template; formatted with the phits type when the task is enqueued.
SEMAPHORE_GENERATION_NAME_TEMPLATE = 'advq_chrono_db_generation_{phits_type}'
# YT path template of the chrono tables. NOTE(review): not referenced in the visible part of this file.
NEW_CHRONO_YT_PATH = '{advq_prefix}/advq/{phits_type}/{advq_db}/{chrono_type}lyhits'

# TODO(monoid): these values are currently ignored; only the `limit` parameter matters, and it is small for now.
BUILDING_LIMIT = {
    CHRONO_TYPE_WEEK: 52,
    CHRONO_TYPE_MONTH: 12,
}

# Matches a six-digit name optionally followed by "-", an optional "delta" marker and a number.
# NOTE(review): not referenced in the visible part of this file.
CHRONO_TABLE_RE = re.compile('^([0-9]{6})(?:-(delta)?([0-9]+))?$')


# Revision passed to validate_arcadia_rev() together with YT_MINIMAL_REV; presumably the first
# revision of the binaries that supports the `list_gen_chrono_db` command -- TODO confirm.
SUMHITS_GENERATOR_LIST_GEN_CHRONO_DB_REV = 3398878


class AdvqGenChronoIndices(sdk2.Task):
    # Parent task: for every requested ADVQ db it runs the `list_gen_chrono_db` command of the
    # sumhits generator, spawns one AdvqGenChronoIndex child per returned description and then
    # waits for all children to finish.
    #
    # NOTE: the description is deliberately kept in comments rather than in a class docstring so
    # that the class' __doc__ stays exactly as it was.

    class Requirements(sdk2.Requirements):
        environments = (
            PipEnvironment("yandex-yt"),
            PipEnvironment("yandex-yt-yson-bindings-skynet")
        )
        cores = 1
        disk_space = 1 * 1024

        # Empty override of the caches requirement.
        # NOTE(review): presumably declares that the task needs no shared caches -- confirm.
        class Caches(sdk2.Requirements.Caches):
            pass

    class Parameters(PhitsParameters):
        sandbox_parameters = SandboxParameters
        dbs = sdk2.parameters.List("List of ADVQ dbs")
        release_new_resource, releaseTo = releaseTo_params()
        advq_yt_prefix = sdk2.parameters.String("YT prefix instead of //home/advq (for testing proposes)")
        # Passed to every child as its own kill_timeout.
        child_non_parallel_kill_timeout = sdk2.parameters.Integer("Child non-parallel kill timeout for db, in seconds", default=CHRONO_INDEX_TIMEOUT)
        child_parallel_kill_timeout = sdk2.parameters.Integer("Child parallel kill timeout for db chunk, in seconds", default=CHRONO_CHUNK_TIMEOUT)
        chunk_size = sdk2.parameters.Float("Floating-point chunk size, GB")
        measure_period = sdk2.parameters.Integer("Number of lines to measure data size", default=1000000)
        # NOTE(review): not read anywhere in this class.
        ignore_legacy = sdk2.parameters.Bool("Ignore huge legacy single-chunk dbs for 'rus'")
        disk_space_limit = sdk2.parameters.Float("Generation disk limit, GB", required=True)
        generate_parallel = sdk2.parameters.Bool("Use parallel generation", required=True, default=False)
        chunk_num_week = sdk2.parameters.Integer("Number of chunks to generate for week", required=True)
        chunk_num_month = sdk2.parameters.Integer("Number of chunks to generate for month", required=True)
        # When set, planned builds are only reported via set_info(); no child task is created.
        dry_run = sdk2.parameters.Bool("Dry run")
        ttl = sdk2.parameters.Integer("TTL for released chunks (days, always; 0 for inf)", default=720, required=True)
        retries_count = sdk2.parameters.Integer("Number of retries for chunk generation subtasks", default=3, required=False)

    def on_enqueue(self):
        """Acquire the per-phits-type generation semaphore, then delegate to the base class."""
        # Set a semaphore whose name depends on phits_type; the value is the same as for the
        # legacy chrono-slice generation.
        self.Requirements.semaphores = Semaphores(
            acquires=[
                Semaphores.Acquire(
                    name=SEMAPHORE_GENERATION_NAME_TEMPLATE.format(phits_type=self.Parameters.advq_phits_type),
                    capacity=1)
            ],
            release=(
                ctt.Status.Group.BREAK, ctt.Status.Group.FINISH
            )
        )
        return super(AdvqGenChronoIndices, self).on_enqueue()

    def on_execute(self):
        """
        Spawn chrono index generation children and wait for them.

        Steps:
          1. validate the revision of the binaries resource;
          2. in the ``gen_tasks`` memoized stage, list the builds for every db and enqueue one
             child per build, storing the child ids in ``Context.tasks_to_wait``;
          3. if any child was created, wait for all of them;
          4. if releasing is requested, fail when any child ended in a BREAK status or FAILURE.

        Raises:
            WaitTask: to suspend the task until the children finish.
            TaskFailure: when releasing is requested and at least one child did not succeed.
        """
        validate_arcadia_rev(self.Parameters.advq_build_binaries, [
            SUMHITS_GENERATOR_LIST_GEN_CHRONO_DB_REV, YT_MINIMAL_REV,
        ])
        env = dict(os.environ)
        env.update(get_yt_env_from_parameters(self.Parameters))

        config_file = None
        if self.Parameters.advq_yt_prefix:
            config_file = self._write_prefix_config(self.Parameters.advq_yt_prefix)
            env['ADVQ_CONFIG_FILE'] = config_file.name

        try:
            with self.memoize_stage.gen_tasks(commit_on_entrance=False):
                self.Context.tasks_to_wait = self._enqueue_children(env)
        finally:
            # The config file is only needed by the listing subprocesses started above;
            # closing the NamedTemporaryFile also removes it from disk.
            if config_file is not None:
                config_file.close()

        if self.Context.tasks_to_wait:
            with self.memoize_stage.wait_for_children:
                raise WaitTask(self.Context.tasks_to_wait, statuses=(ctt.Status.Group.FINISH + ctt.Status.Group.BREAK),
                               wait_all=True)

            # Child results are only checked when a release was requested.
            if self.Parameters.release_new_resource:
                self._raise_if_children_failed()

    def _write_prefix_config(self, advq_yt_prefix):
        """Write a temporary YAML config overriding ADVQ_PREFIX; return the open temporary file."""
        # Imported lazily, as in the original code.
        import yaml

        config_file = tempfile.NamedTemporaryFile()
        yaml.safe_dump({
            'ADVQ_PREFIX': advq_yt_prefix
        }, config_file.file)
        # Make sure the content is on disk before a subprocess reads the file by name.
        config_file.flush()
        return config_file

    def _enqueue_children(self, env):
        """
        List the builds for every db and enqueue a child task per build.

        ``env`` is extended in place with the Sandbox environment variables and is passed to the
        listing subprocess. Returns the list of ids of the enqueued children (empty on dry run).
        """
        if self.Parameters.release_new_resource:
            release_to = self.Parameters.releaseTo
        else:
            release_to = None
        dry_run_msg = 'DRY_RUN ' if self.Parameters.dry_run else ''

        advq_binaries_res = ResourceData(self.Parameters.advq_build_binaries)
        sumhits_gen_bin = advq_binaries_res.path.joinpath(SUMHITS_GENERATOR)
        env.update(get_sandbox_env_from_parameters(self.Parameters.sandbox_parameters))

        task_ids = []
        for db in self.Parameters.dbs:
            gen_args = [
                str(sumhits_gen_bin),
                'list_gen_chrono_db',
                self.Parameters.advq_phits_type,
                db,
            ]
            if release_to:
                gen_args.extend(['--released', release_to])

            for desc in self._list_chrono_builds(gen_args, env, db):
                task = self._spawn_child(desc, release_to, dry_run_msg)
                if task is not None:
                    task_ids.append(task.id)
        return task_ids

    def _list_chrono_builds(self, gen_args, env, db):
        """
        Run the listing command and return its parsed JSON output.

        A non-zero exit code is logged and reported via set_info(), and an empty list is returned
        so that the remaining dbs are still processed.
        """
        # NOTE(review): only CalledProcessError is handled; any other error of the call
        # (e.g. caused by the timeout) propagates to the caller -- confirm this is intended.
        with sandbox.sdk2.helpers.ProcessLog(self, logger=logging.getLogger("list_gen_chrono_db")) as pl:
            try:
                to_build_str = sp.check_output(
                    gen_args,
                    timeout=DEFAULT_PROCESS_TIMEOUT,
                    stderr=pl.stdout,
                    env=env,
                )
            except sp.CalledProcessError:
                logging.exception(
                    "Failed to get chrono db generation info for %s-%s",
                    self.Parameters.advq_phits_type,
                    db
                )
                self.set_info("Failed to get chrono db generation info for {}-{}".format(
                    self.Parameters.advq_phits_type,
                    db
                ))
                return []
            return json.loads(to_build_str)

    def _spawn_child(self, desc, release_to, dry_run_msg):
        """
        Report one planned build and, unless this is a dry run, enqueue its child task.

        ``desc`` is one item of the listing output. Returns the enqueued child task,
        or None on dry run.
        """
        # Read every field up front so that a malformed description fails before anything is reported.
        chrono_type = desc['advq_chrono_type']
        advq_db = desc['advq_db']
        epodate = desc['advq_epodate']
        input_tbl = desc['input_table']
        date = desc['advq_date']
        epoch = desc['advq_epoch']
        is_delta = desc['advq_is_delta']

        descr = ("Generate chrono index for {}_{}_{} from {!r}".format(
            chrono_type, advq_db, epodate, input_tbl))
        self.set_info(dry_run_msg + descr)
        if self.Parameters.dry_run:
            return None

        task = AdvqGenChronoIndex(
            self,
            description=descr,
            kill_timeout=self.Parameters.child_non_parallel_kill_timeout,
            child_parallel_kill_timeout=self.Parameters.child_parallel_kill_timeout,
            yt_proxy=self.Parameters.yt_proxy,
            yt_token_vault_user=self.Parameters.yt_token_vault_user,
            yt_token_vault_name=self.Parameters.yt_token_vault_name,
            advq_phits_type=self.Parameters.advq_phits_type,
            advq_build_binaries=self.Parameters.advq_build_binaries,
            advq_chrono_type=chrono_type,
            advq_db=advq_db,
            date=date,
            epoch=epoch,
            epodate=epodate,
            is_delta=is_delta,
            input_table=input_tbl,
            chunk_size=self.Parameters.chunk_size,
            measure_period=self.Parameters.measure_period,
            generate_parallel=self.Parameters.generate_parallel,
            chunk_num_week=self.Parameters.chunk_num_week,
            chunk_num_month=self.Parameters.chunk_num_month,
            disk_space_limit=self.Parameters.disk_space_limit,
            releaseTo=release_to,
            ttl=self.Parameters.ttl,
            retries_count=self.Parameters.retries_count,
        )
        logging.info("Running %s in %s", descr, task.id)
        task.enqueue()
        return task

    def _raise_if_children_failed(self):
        """Raise TaskFailure if any awaited child ended in a BREAK status or in FAILURE."""
        failed_tasks = []
        for task_id in self.Context.tasks_to_wait:
            task = sdk2.Task[task_id]
            if task.status in ctt.Status.Group.BREAK or task.status == ctt.Status.FAILURE:
                failed_tasks.append(task)
        if failed_tasks:
            logging.error("Some generation task failed: %r", failed_tasks)
            raise TaskFailure("Some generation task failed", failed_tasks)
