import logging
from datetime import datetime, timedelta

from sandbox import sdk2
from sandbox.sandboxsdk.environments import PipEnvironment
from sandbox.projects.websearch.begemot.tasks.BegemotYT.common import CommonYtParameters


class MergeChunksYT(sdk2.Task):
    """Merge chunks of yesterday's table in every subdirectory of a YT directory.

    For each node ``<sub>`` listed under ``Parameters.root`` the task looks at
    the table ``<root>/<sub>/<YYYYMMDD>``, where the date is the day before
    the moment the task runs. Tables that do not exist, or that have fewer
    than ``CHUNK_THRESHOLD`` chunks, are skipped. Every other table is merged
    in place with ``combine_chunks`` enabled. The outcome for each table is
    reported through ``set_info``.
    """

    # Tables with fewer chunks than this are skipped.
    CHUNK_THRESHOLD = 400

    class Parameters(sdk2.Parameters):
        root = sdk2.parameters.String(
            'YT parent directory',
            required=True,
        )
        chunk_size = sdk2.parameters.Integer(
            'Desired chunk size',
            required=True,
            default=4 * (1 << 30),  # 4 GB
        )
        yt_token_vault_name = CommonYtParameters.yt_token_vault_name()
        yt_token_vault_owner = CommonYtParameters.yt_token_vault_owner()
        yt_proxy = CommonYtParameters.yt_proxy()
        yt_pool = CommonYtParameters.yt_pool()

    class Requirements(sdk2.Task.Requirements):
        environments = [
            PipEnvironment('yandex-yt', version='0.10.8'),
        ]

    def on_execute(self):
        """Merge yesterday's table in each subdirectory of ``Parameters.root``.

        Any exception raised by the YT client or by the Vault lookup is not
        caught here and propagates to the caller.
        """
        # Imported here rather than at module level; presumably the package
        # only becomes available through the PipEnvironment declared in
        # Requirements -- confirm before moving the import.
        import yt.wrapper as yt

        token = sdk2.Vault.data(
            self.Parameters.yt_token_vault_owner,
            self.Parameters.yt_token_vault_name,
        )
        yt.config.update_config({
            'proxy': {
                'url': self.Parameters.yt_proxy
            },
            'token': token
        })

        # The table name is yesterday's date as YYYYMMDD, taken from the
        # naive local time returned by datetime.now().
        yesterday = datetime.now() - timedelta(days=1)
        table = yesterday.strftime('%Y%m%d')
        logging.info('Table: %s', table)

        root = self.Parameters.root
        chunk_threshold = self.CHUNK_THRESHOLD

        for sub in yt.list(root):
            path = '{}/{}/{}'.format(root, sub, table)

            if not yt.exists(path):
                self.set_info('{} skipped (does not exist)'.format(path))
                continue

            chunks_before = yt.get_attribute(path, 'chunk_count')
            if chunks_before < chunk_threshold:
                self.set_info(
                    '{} skipped, less than {} chunks (to be exact: {})'.format(
                        path, chunk_threshold, chunks_before
                    )
                )
                continue

            # Source and destination are the same path, so the table is
            # rewritten in place.
            yt.run_merge(
                source_table=path,
                destination_table=path,
                spec={
                    "data_size_per_job": self.Parameters.chunk_size,
                    "mode": "unordered",
                    "pool": self.Parameters.yt_pool,
                    "combine_chunks": True,
                },
            )

            chunks_after = yt.get_attribute(path, 'chunk_count')
            self.set_info('{}: {} -> {} chunks\n'.format(path, chunks_before, chunks_after))
