import os
import stat
import shutil
import logging
import datetime as dt
import itertools
import subprocess as sp
import collections

import six

from sandbox.common import fs as common_fs
from sandbox.common import rest as common_rest
from sandbox.common import config as common_config
from sandbox.common import errors as common_errors
from sandbox.common import itertools as common_it
from sandbox.common.vcs import cache as vcs_cache
import sandbox.common.types.task as ctt
import sandbox.common.types.client as ctc
import sandbox.common.types.misc as ctm
import sandbox.common.types.user as ctu

from sandbox import sdk2
from sandbox.agentr import errors as ar_errors

from sandbox.projects.common.vcs import arc
from sandbox.projects.common import binary_task


# Semaphore acquisition descriptors the task may request, keyed by semaphore name.
SEMAPHORES = dict(
    (sem_name, ctt.Semaphores.Acquire(name=sem_name, capacity=sem_capacity))
    for sem_name, sem_capacity in (
        ("sandbox/cleanup/new", 10),
        ("sandbox/cleanup/regular", 150),
    )
)


class Cleanup2(sdk2.Task, binary_task.LastBinaryTaskRelease):
    """
    Removes any "trash" from the host the task is running on.
    Firstly, it checks the tasks' directories for anything that is not marked as a shared resource.
    Secondly, it drops all the collected core files.
    Thirdly, it drops any Subversion copies whose names do not start with "arcadia".
    Finally, it checks all the resources that are marked as ready on the host and drops those whose
    resource files are absent.
    """

    class Requirements(sdk2.Requirements):
        # NOTE(review): `Tag.VOID` presumably means "no client tag restrictions" - confirm.
        client_tags = ctc.Tag.VOID
        # NOTE(review): the value is presumably expressed in MiB, hence 20 * 1024 for 20 GiB - confirm.
        disk_space = 20 * 1024  # 20 GiB

    class Parameters(sdk2.Parameters):
        # Parameters provided by `binary_task` for selecting the task binary release.
        _lbrp = binary_task.binary_release_parameters(stable=True)  # Non-binary as default

        # Safety limit checked in `__clean_fs`: the task fails when the number of extra task directories
        # plus the number of extra files is greater than this value.
        extra_objects = sdk2.parameters.Integer(
            "Threshold for extra objects (stop the process if the amount of extra tasks and files is greater)",
            default_value=250000
        )
        # Passed as `dry_run` to every cleanup stage in `on_execute`; enabled by default.
        run_in_dry_mode = sdk2.parameters.Bool("Dry mode", default_value=True)
        # Name of a key in the module-level `SEMAPHORES` mapping; looked up in `on_enqueue`.
        semaphore = sdk2.parameters.String("Semaphore", ui=None)

    def on_save(self):
        """Delegate to `binary_task.LastBinaryTaskRelease.on_save`."""
        binary_task.LastBinaryTaskRelease.on_save(self)

    def on_enqueue(self):
        """
        Prepare the task requirements and tags before it is enqueued.

        If the `semaphore` parameter names a known entry of `SEMAPHORES`, that semaphore is requested.
        If a target host is set in the requirements, its name is appended to the task tags (once).
        """
        acquire = SEMAPHORES.get(self.Parameters.semaphore)
        if acquire:
            self.Requirements.semaphores = ctt.Semaphores(acquires=[acquire])

        target_host = self.Requirements.host
        if not target_host:
            return
        if target_host not in self.Parameters.tags:
            self.Parameters.tags += [target_host]

    @staticmethod
    def remove_item(path, dry_mode):
        if dry_mode:
            logging.info("(dry_mode): remove '%s'" % path)
        else:
            try:
                sdk2.paths.remove_path(path)
            except OSError as ex:
                logging.error("Unable to remove '%s': %s", path, ex)

    def __clean_cores(self, dry_run):
        """
        Remove every entry found directly inside the configured coredumps directory.

        :param dry_run: passed to `remove_item`; when true, entries are only logged
        """
        cores_root = common_config.Registry().client.tasks.coredumps_dir
        if not os.path.exists(cores_root):
            return

        logging.info("Cleanup folder %s", cores_root)
        for entry in os.listdir(cores_root):
            self.remove_item(os.path.join(cores_root, entry), dry_run)

    def __clean_vcs_cache(self, dry_run):
        """
        Clean up the VCS caches of the host and refresh the arc cache afterwards.

        Nothing is done in dry mode, nor on hosts tagged MULTISLOT or PORTOD, nor on hosts without
        a tag from the LINUX group or without the GENERIC tag.

        :param dry_run: when true, the method returns immediately
        """
        settings = common_config.Registry()
        host_tags = set(settings.client.tags)

        # Guard clauses, evaluated in the same order as the conditions they replace.
        if {ctc.Tag.MULTISLOT, ctc.Tag.PORTOD} & host_tags:
            return
        if not (set(ctc.Tag.Group.LINUX) & host_tags):
            return
        if ctc.Tag.GENERIC not in host_tags:
            return
        if dry_run:
            return

        cache_root = settings.client.vcs.dirs.base_cache
        if not os.path.exists(cache_root):
            return

        logging.info("VCS caches cleanup %s", cache_root)
        cache = vcs_cache.VCSCache()
        if ctc.Tag.SSD in host_tags:
            cache.cleanup_dir(cache_root, vcs_cache.CacheableVCS.NON_REMOVABLE_FILES)
        cache.clean()

        self.__update_arc_cache(dry_run)

    def __update_arc_cache(self, dry_run):
        """
        Run `arc prefetch-files` inside a freshly initialised bare arc repository.

        The bare repository is created in the `bare` sub-directory of the task working directory.

        :param dry_run: when true, nothing is done
        :raises arc.ArcCommandFailed: if the prefetch command exits with a non-zero code
        """
        if dry_run:
            return

        bare_dir = str(self.path("bare"))
        os.mkdir(bare_dir)
        logging.info("Update arc cache")
        arc_obj = arc.Arc()
        with arc_obj.init_bare(bare_dir) as arc_dir, \
                sdk2.helpers.ProcessLog(self, logger="arc_prefetch") as process_log:
            cmd = [arc_obj.binary_path, "prefetch-files"]
            proc = sp.Popen(cmd, stdout=process_log.stdout, stderr=sp.STDOUT, cwd=arc_dir)
            # NOTE(review): stdout/stderr go to the process log, so `out` and `err` are
            # presumably always None here - confirm against `ProcessLog.stdout`.
            out, err = proc.communicate()
            return_code = proc.returncode
            if return_code:
                message = (
                    "Failed to prefetch files with commnd {} in bare arc repo `{}`. Return code: {}."
                    "\nOUTPUT: {}\nERROR: {}"
                ).format(" ".join(cmd), arc_dir, return_code, out, err)
                raise arc.ArcCommandFailed(message)

    def __git_gc(self):
        """
        Ask AgentR to run garbage collection on the cached git repositories.

        The call is made regardless of dry mode. AgentR errors (`ARException`) are logged with
        a traceback and not re-raised, so a failed gc does not fail the task.
        """
        logging.debug("Start git gc on all cached git repos")
        try:
            self.agentr.git_repos_gc()
        except ar_errors.ARException:
            logging.exception("Failed to gc on git repos")

    def task_dirs(self):
        """
        Scan the two-level `<digit>/<digit>` layout of the tasks data directory.

        Entries whose name parses as an integer are treated as task directories, anything else
        is reported as trash.

        :return: tuple `(tasks, trash)`, where `tasks` is a set of integer task ids and `trash`
            is a list of `(l1, l2, name)` string tuples relative to the tasks data directory
        """
        trash = []
        tasks = set()
        tasks_dir = common_config.Registry().client.tasks.data_dir
        digits = [str(digit) for digit in six.moves.xrange(10)]
        # `product` yields the pairs in the same order as two nested loops would.
        for l1, l2 in itertools.product(digits, repeat=2):
            _dir = os.path.join(tasks_dir, l1, l2)
            if not os.path.exists(_dir):
                continue
            for subdir in os.listdir(_dir):
                try:
                    tasks.add(int(subdir))
                except ValueError:
                    trash.append((l1, l2, subdir))
                    # `logging.warn` is a deprecated alias; use `logging.warning` like the rest of the file.
                    logging.warning("Not a task dir: '%s'", os.path.join(*trash[-1]))
        return tasks, trash

    def resource_files(self):
        """
        Load from the server the resources registered for this host, grouped by owner task.

        Resource ids are fetched in one request, then resource records are loaded in chunks of
        up to 1000 ids. Resources owned by this very task are skipped.

        :return: mapping `task id -> {file name -> resource id}`
        """
        def chunker(data, limit=1000):
            for offset in six.moves.xrange(0, len(data), limit):
                yield data[offset:offset + limit]

        settings = common_config.Registry()
        by_task = collections.defaultdict(dict)
        rest = common_rest.Client()
        if settings.common.installation == ctm.Installation.PRE_PRODUCTION:
            # Dirty hack here - it seems as Nanny's service balancer cannot proxy server response of more than 100Kb
            rest = common_rest.Client(base_url="http://sandbox-preprod0{3,5,6}.search.yandex.net:8080/api/v1.0")
        # Fetch all resource IDs in single call - it is very important!.
        rest.DEFAULT_TIMEOUT = rest.MAX_TIMEOUT
        ids = rest.client[settings.this.id].service.resources.read()
        logging.info("Fetched %d known host's resource IDs from the server", len(ids))

        rest = self.server << self.server.HEADERS({ctm.HTTPHeader.NO_LINKS: "true"})
        for ids_chunk in chunker(ids):
            response = rest.resource.read(id=ids_chunk, limit=len(ids_chunk))
            items = response["items"]
            logging.debug("Loaded chunk of %d objects by chunk of %d IDs", len(items), len(ids_chunk))
            assert len(items) == len(ids_chunk)
            for res in items:
                # Empty entries and resources of this task itself are not of interest.
                if not res:
                    continue
                owner_id = res["task"]["id"]
                if owner_id == self.id:
                    continue
                known_id = by_task[owner_id].setdefault(res["file_name"], res["id"])
                if known_id != res["id"]:
                    logging.warning(
                        "Path %r of task #%r registered for several resources: #%r and #%r",
                        res["file_name"], res["task"]["id"], known_id, res["id"]
                    )
        return by_task

    def check_fs(self, resources, known_tasks):
        """
        Compare the registered resources with what is actually present on disk.

        Resources of tasks that are not in `known_tasks` are reported as missing without touching
        the filesystem. For the other tasks the files are checked locally first; if the local
        check fails for any reason, the check is delegated to AgentR.

        :param resources: mapping `task id -> {file name -> resource id}`
        :param known_tasks: collection of task ids considered present on the host
        :return: tuple `(extra, missing, ok)` of sets: extra `(task id, file name)` pairs,
            missing resources and valid resources
        """
        extra, missing, ok = set(), set(), set()  # Extra files, missing resources, ok resources
        for tid, taskres in six.iteritems(resources):
            if tid not in known_tasks:
                missing.update(six.itervalues(taskres))
                continue

            try:
                add_extra, add_missing, add_ok = common_fs._check_task_files(tid, taskres)
            except Exception:
                # Do not swallow KeyboardInterrupt/SystemExit, and keep the failure reason in the log.
                logging.warning("Try to call AgentR for checking resources for task %s", tid, exc_info=True)
                add_extra, add_missing, add_ok = self.agentr.check_task_files(tid, taskres)
            extra.update((item[0], six.ensure_str(item[1])) for item in add_extra)
            missing.update(add_missing)
            ok.update(add_ok)

        return extra, missing, ok

    @staticmethod
    def chmod(path, write):
        mode = os.stat(path).st_mode
        wall = stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH
        new_mode = mode | wall if write else mode & ~wall
        if new_mode != mode:
            logging.info("%s write privileges for '%s'", 'Add' if write else 'Drop', path)
            os.chmod(path, new_mode)

    def chmod_sp(self, base_cmd, dnames, l2path):
        dnames = sorted(set(dnames) - self.problems.viewkeys())
        cmd = base_cmd + dnames
        logging.info("Execute: %r", cmd)
        try:
            sp.check_call(cmd, cwd=l2path)
        except sp.CalledProcessError as ex:
            if len(dnames) > 1:
                logging.error("Error on bulk chmod. Performing one-by-one.")
                for dname in dnames:
                    self.chmod_sp(base_cmd, [dname], l2path)
            else:
                self.problems[dnames[0]] = l2path
                logging.error("Unable to set privileges on %r: %s", os.path.join(l2path, dnames[0]), str(ex))

    def chmod_tree(self, dirs, tasks_dir, write=True):
        """
        Adjust write permissions on a two-level tree of task directories.

        The first- and second-level directories are updated with `chmod`. When `write` is true,
        the listed task directories are additionally processed recursively through `chmod_sp`.

        :param dirs: mapping `level-1 name -> {level-2 name -> task directory names}`
        :param tasks_dir: root directory the level-1 names are relative to
        :param write: true to add write permissions, false to drop them
        """
        base_cmd = ["chmod", "-R", "a+rwX" if write else "a-w"]
        for top_name, second_level in six.iteritems(dirs):
            top_path = os.path.join(tasks_dir, top_name)
            self.chmod(top_path, write)
            for sub_name, dir_names in six.iteritems(second_level):
                sub_path = os.path.join(top_path, sub_name)
                self.chmod(sub_path, write)
                # The recursive pass only happens when adding permissions.
                if not write or not dir_names:
                    continue
                self.chmod_sp(base_cmd, dir_names, sub_path)

    def __clean_env(self, dry_run):
        """
        Inspect the task environments directory and delete the unusable environments.

        Every sub-directory is classified as having no metadata, outdated (older than two weeks
        according to its metadata), broken (files absent or of a different size) or valid.
        Entries that are not directories are treated as garbage. Unless `dry_run` is set, the
        garbage entries and all non-valid environments are removed. A summary is reported via
        `set_info` in any case.

        :param dry_run: when true, only classify and report, delete nothing
        """
        env_dir = common_config.Registry().client.tasks.env_dir

        def safe_rmdir(path, suffix="_todelete", force=False):
            """Remove directory `path` by renaming it to `path + suffix` and deleting the renamed copy."""
            if not os.path.exists(path):
                return

            del_path = "{}{}".format(path, suffix)
            if os.path.exists(del_path):
                if not force:
                    raise OSError("Directory to rename '{}' exists. Use force=True".format(del_path))
                shutil.rmtree(del_path)
            # Atomic rename
            os.rename(path, del_path)
            shutil.rmtree(del_path)

        valid_envs = set()
        no_meta_envs = set()
        outdated_envs = set()
        broken_envs = set()
        deleted_envs = set()
        not_envs = set()

        with sdk2.helpers.ProgressMeter("Traversing environments folders."):
            # A missing environments directory means there is nothing to inspect.
            entries = os.listdir(env_dir) if os.path.isdir(env_dir) else []
            for subdir in entries:
                subdir_fullpath = os.path.join(env_dir, subdir)
                if not os.path.isdir(subdir_fullpath):
                    logging.info("Not environment '%s' has found.", subdir)
                    not_envs.add(subdir_fullpath)
                    continue
                meta = sdk2.environments.FileSizeMetadata(subdir_fullpath)

                if not meta.exists:
                    no_meta_envs.add(subdir)
                    logging.info("Environment '%s' has no metadata.", subdir)
                elif meta.outdated(dt.timedelta(weeks=2)):
                    outdated_envs.add(subdir)
                    logging.info("Environment '%s' is too old. Last use: '%s'",
                                 subdir, dt.datetime.utcfromtimestamp(meta.mtime))
                else:
                    absent, extra, different = meta.check_files()

                    if extra:
                        logging.warning("Extra files %s in environment '%s'.", list(extra), subdir)

                    if absent or different:
                        broken_envs.add(subdir)
                        logging.info("Environment '%s' broken. Absent: %s. Different size (stored, real): %s",
                                     subdir, list(absent), list(different))
                    else:
                        valid_envs.add(subdir)
                        logging.info("Environment '%s' looks good.", subdir)

        if not dry_run:
            with sdk2.helpers.ProgressMeter("Deleting broken/outdated/without metadata environments."):
                for full_path in not_envs:
                    logging.info("Delete '%s'", full_path)
                    os.remove(full_path)
                for subdir in itertools.chain(no_meta_envs, outdated_envs, broken_envs):
                    logging.info("Delete '%s'.", subdir)
                    # force=True: a stale `<name>_todelete` directory left by an interrupted run
                    # is itself garbage and must not abort the whole cleanup.
                    safe_rmdir(os.path.join(env_dir, subdir), force=True)
                    # Record the environment only once it has really been removed.
                    deleted_envs.add(subdir)

            if deleted_envs:
                self.set_info(
                    "Deleted environments: {}.".format(sorted(deleted_envs))
                )

        self.set_info(
            "Totally observed {} valid, {} broken, {} outdated environments and {} w/o metadata. Deleted: {}".format(
                len(valid_envs), len(broken_envs), len(outdated_envs), len(no_meta_envs), len(deleted_envs)
            )
        )

    @staticmethod
    def __executing_task_ids():
        """Yield the ids of the tasks listed in the server-side info of this client."""
        api = common_rest.Client()
        this_client = common_config.Registry().this.id
        client_info = api.client[this_client].read()
        listed_tasks = client_info.get("tasks", [])
        for entry in listed_tasks:
            yield entry["id"]

    @staticmethod
    def __get_task_abs_path(task_id):
        """Return the path of the task's directory inside the configured tasks data directory."""
        data_dir = common_config.Registry().client.tasks.data_dir
        relative_parts = ctt.relpath(task_id)
        return os.path.join(data_dir, *relative_parts)

    def __clean_fs(self, dry_run):
        """
        Reconcile the task directories on disk with the resources registered for this host.

        Stages: scan the task directories, load the registered resources, check the resource
        files, report the findings via `set_info`, and finally (unless `dry_run`) drop host
        records of missing resources and remove extra files, extra task directories and trash.

        :param dry_run: when true, only analyse and report, remove nothing
        :raises common_errors.TaskFailure: if the number of extra task directories plus extra
            files is greater than the `extra_objects` parameter
        """
        tasks_dir = common_config.Registry().client.tasks.data_dir

        executing_tasks = set(self.__executing_task_ids())
        with sdk2.helpers.ProgressMeter("Traversing tasks folders."):
            tasks, trash = self.task_dirs()
            self.set_info("Totally observed {} task directories and {} trash items.".format(len(tasks), len(trash)))
            logging.debug("Trash task directories to drop: %r", sorted(trash))

        with sdk2.helpers.ProgressMeter("Querying the database."):
            resources = self.resource_files()
            self.set_info("Totally know about {} task directories and {} resource files on host.".format(
                len(resources), sum(map(len, six.itervalues(resources)))
            ))

        # The set of executing tasks is re-queried and accumulated after each stage
        # (presumably to also cover tasks started while the previous stage was running).
        executing_tasks |= set(self.__executing_task_ids())
        # Task directories on disk that own no registered resource and are not executing.
        extra_tasks = tasks - set(resources) - executing_tasks
        if extra_tasks:
            logging.info("%r extra task directories found: %r", len(extra_tasks), sorted(extra_tasks))

        with sdk2.helpers.ProgressMeter("Traversing resources' files."):
            extra_files, missing_files, ok = self.check_fs(resources, set(tasks) | executing_tasks)
            self.set_info(
                "Totally observed {} extra task directories, {} extra/unknown files, "
                "{} missing resources/files, {} resources are valid.".format(
                    len(extra_tasks), len(extra_files), len(missing_files), len(ok)
                )
            )
            executing_tasks |= set(self.__executing_task_ids())
            logging.info("Skip executing tasks: %s", executing_tasks)
            if extra_files:
                # Files of executing tasks are never considered extra.
                extra_files = [_ for _ in extra_files if _[0] not in executing_tasks]
                logging.info(
                    "%r unknown files found: %r",
                    len(extra_files), sorted(os.path.join(str(tid), fname) for tid, fname in extra_files)
                )
            if missing_files:
                # Resources of executing tasks are never considered missing.
                executing_resources = set(common_it.chain(*(
                    six.itervalues(resources.get(_, {})) for _ in executing_tasks
                )))
                missing_files -= executing_resources
                self.set_info("Missing resources: {!r}".format(sorted(missing_files)))

            # Safety limit: refuse to continue when too many objects would be removed.
            threshold = self.Parameters.extra_objects or 0
            if len(extra_tasks or []) + len(extra_files or []) > threshold:
                raise common_errors.TaskFailure(
                    "Amount of extra tasks ({}) and files ({}) greater than threshold {}".format(
                        len(extra_tasks), len(extra_files), threshold
                    )
                )

        # Everything below modifies server records or the filesystem.
        if dry_run:
            return
        with sdk2.helpers.ProgressMeter("Dropping host records for missing resources."):
            def chunker(data, size=1000):
                while data:
                    chunk, data = data[:size], data[size:]
                    yield chunk

            # Resource ids are sent as strings, in chunks of up to 1000 ids per request.
            for chunk in chunker(list(str(_) for _ in missing_files)):
                self.server.client[self.host].service.resources.drop.create(chunk)

        with sdk2.helpers.ProgressMeter("Removing extra files"):
            self.agentr.dropper([os.path.join(self.__get_task_abs_path(tid), fname) for tid, fname in extra_files])
        with sdk2.helpers.ProgressMeter("Removing extra task working  directories"):
            self.agentr.dropper([self.__get_task_abs_path(tid) for tid in extra_tasks], workers=30)
        with sdk2.helpers.ProgressMeter("Removing unknown task working directories"):
            self.agentr.dropper([os.path.join(tasks_dir, l1, l2, name) for l1, l2, name in trash], workers=30)

    def on_execute(self):
        """
        Run all cleanup stages on the host.

        On hosts tagged NEW_LAYOUT the filesystem maintenance is delegated to AgentR, otherwise
        `__clean_fs` is used. Then core files, VCS caches, cached git repositories and task
        environments are processed. Finally, if the host carries the NEW tag and the current
        user is an administrator, the NEW tag is removed from the host.

        :raises Exception: if privileges could not be changed on some directories
            (`self.problems` is not empty)
        """
        binary_task.LastBinaryTaskRelease.on_execute(self)

        # # Run strace and tcpdump when using of Arcadia
        # settings = common.config.Registry()
        # settings.client.sdk.svn.arcadia.run_tcpdump = True
        # settings.client.sdk.svn.arcadia.run_strace = True

        self.problems = {}
        dry_run = self.Parameters.run_in_dry_mode
        if ctc.Tag.NEW_LAYOUT in common_config.Registry().client.tags:
            self.agentr.backup()
            self.agentr.maintain(dry=dry_run)
        else:
            self.__clean_fs(dry_run)

        with sdk2.helpers.ProgressMeter("Cleaning up tasks core files."):
            self.__clean_cores(dry_run)

        with sdk2.helpers.ProgressMeter("Cleaning up vcs cache."):
            self.__clean_vcs_cache(dry_run)

        with sdk2.helpers.ProgressMeter("Run git GC on cached repos."):
            self.__git_gc()

        with sdk2.helpers.ProgressMeter("Cleaning up environments."):
            self.__clean_env(dry_run)

        if self.problems:
            # Group the problematic directory names by their parent directory for the report.
            problems = {}
            for tid, d in six.iteritems(self.problems):
                problems.setdefault(d, []).append(tid)
            msg = "Unable to change privileges on the following directories:\n" + "".join(
                "\t'{}': {}\n".format(d, " ".join(sorted(problems[d]))) for d in sorted(problems)
            )
            self.set_info(msg)
            raise Exception("Unable to operate some directories (see task info for details)")

        client_id = common_config.Registry().this.id
        # The server may return no "tags" key at all; treat that as an empty tag list.
        tags = self.server.client[client_id].read().get("tags") or []
        if ctc.Tag.NEW in tags:
            # The format string needs a placeholder for the tags argument, otherwise
            # logging reports a formatting error instead of the message.
            logging.info("There is tag NEW in host tags: %s", common_config.Registry().client.tags)
            if self.server.user.current.read()["role"] == ctu.Role.ADMINISTRATOR:
                logging.info("Current user is administrator, try to remove NEW tag.")
                new_tags = [tag for tag in tags if tag != ctc.Tag.NEW]
                self.server.client[client_id].tags.update(new_tags)
