from __future__ import print_function
from __future__ import absolute_import

import os
import sys
import copy
import time
import uuid
import errno
import socket
import inspect
import logging
import httplib
import platform
import threading
import contextlib
import collections
import multiprocessing as mp

import Queue as queue
import itertools as it
import functools as ft
import subprocess as sp

from sandbox.common import os as common_os
from sandbox.common import fs as common_fs
from sandbox.common import enum as common_enum
from sandbox.common import rest as common_rest
from sandbox.common import config as common_config
from sandbox.common import format as common_format
from sandbox.common import system as common_system
from sandbox.common import package as common_package
from sandbox.common import patterns as common_patterns
from sandbox.common import threading as common_threading

import sandbox.common.types.misc as ctm
import sandbox.common.types.task as ctt
import sandbox.common.types.client as ctc

import sandbox.agentr.client
import sandbox.agentr.errors as ar_errors

from . import base, errors, system
from . import commands, platforms


logger = logging.getLogger(__name__)


# Maximum number of sequential executor failures; as soon as the counter reaches it, the client is shut down
MAX_EXECUTOR_FAILS = 3

# Maximum interval between retries of dropping jobs, in seconds
DROP_JOB_INTERVAL_MAX = 900

# Sum of the resources requested by registered commands: CPU cores and RAM
# NOTE(review): RAM is compared against `SHARED_RAM`, so it is presumably measured in MiB -- confirm
DynamicSlotsUsage = collections.namedtuple("DynamicSlotsUsage", "cores ram")

# Total RAM on host, in MiB (page size multiplied by the number of physical pages, `>> 20` converts bytes to MiB)
TOTAL_RAM = (os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")) >> 20
# Total CPU cores on host
TOTAL_CORES = mp.cpu_count()
# RAM shared by dynamic slots, 1GiB reserved, in MiB
SHARED_RAM = TOTAL_RAM - 1024
# CPU cores shared by dynamic slots, 1 core reserved
SHARED_CORES = TOTAL_CORES - 1


class Event(common_enum.Enum):
    """ Reason of an event which is put to the client's events queue. """
    # NOTE(review): all members are declared as `None`, presumably `common_enum.Enum` assigns the actual values
    JOB_COMPLETED = None
    # Queued by the status reporter when the server asks for a reset or the status update fails
    VALIDATE = None
    # Queued by `wakeup()` and on every connection accepted by the socket wakeup thread
    WAKEUP = None
    # Queued by `stop()` and on a fatal error in a service thread
    STOP = None
    DUMP = None


class PingSandboxServerThread(object):
    """
    Service thread which pings the server and gets information about what the client should do
    """

    __metaclass__ = common_patterns.SingletonMeta

    class Timer(common_patterns.Abstract):
        # Timing state of the client: `tick` is used as the job getter's wait timeout,
        # `idle` accumulates the ticks spent without getting a job.
        # NOTE(review): `__defs__` presumably provides the default value (0) for each slot -- confirm in `Abstract`.
        __slots__ = ("tick", "updated", "idle")
        __defs__ = [0] * 3

    # version of protocol used in XMLRPC-method client_ping
    CLIENT_AGE = 17

    last_system_status = {}
    sys_params = None

    rest = None
    token = None
    revision_checker = common_package.RevisionChecker()

    stopped = threading.Event()
    _events = queue.Queue()
    _timer = Timer()

    executor_fails = 0

    def __init__(self):
        """
        Create synchronization primitives, the AgentR client and all the service threads.

        Threads are only constructed here, none of them is started.
        """
        registry = common_config.Registry()
        # This mostly required by unit-tests to be able to run client's loop manually.
        self.node_id = registry.this.id

        self._lock_sock = None
        self._getajob_bell = threading.Event()
        self._dropajob = queue.Queue()
        self.last_clear_command = None

        # The socket loop runs as a plain thread, without the generic service loop wrapper
        self._service_threads = {
            self._sock_loop.__name__: threading.Thread(target=self._sock_loop, name="Socket wakeup thread"),
        }

        # Entries are (function, thread name, watchdog TTL); non-positive TTL means no watchdog for the thread
        looped = [
            (self._jobgetter, "Job getter thread", 0),
            (self._jobdropper, "Job dropper thread", 600),
            (self._status_report, "Status self reporting thread", 0),
        ]
        lxc_settings = registry.client.lxc
        if lxc_settings.enabled:
            looped.append((self._lxc_destroyer, "LXC container destroyer", 0))
            if lxc_settings.network.type == ctm.Network.Type.MACVLAN:
                looped.append((self._lxc_ip_router, "LXC IP routes updater", 0))

        for func, title, watchdog in looped:
            self._service_threads[func.__name__] = threading.Thread(
                target=ft.partial(self._service_thread_loop, func, watchdog), name=title
            )

        self._kamikadze_thread = common_threading.KamikadzeThread(registry.client.idle_time * 100, logger)
        self._status_report_timeouted = False
        self.agentr = sandbox.agentr.client.Service(logger)
        super(PingSandboxServerThread, self).__init__()

    def set_stopped(self):
        """ Raise the `stopped` flag, trying to save it in AgentR first. Does nothing if the flag is already set. """
        if self.stopped.is_set():
            return
        # Best effort: a failure to persist the flag is only logged and never prevents raising it locally
        try:
            self.agentr.stopped = True
        except:
            logger.exception("Unable to save stopped flag in AgentR")
        self.stopped.set()

    def load_state(self, jobs):
        """
        Restore commands registry from a previously saved state. Does nothing if there are no jobs.

        For every job with a token and a platform but without a liner, an already running liner is looked up
        by the obfuscated token: it is attached to the job which has a PID and terminated otherwise.

        :param jobs: mapping of a registry key to a serialized job, values are decoded with `base.Serializable.decode`
        """
        jobs = {key: base.Serializable.decode(value) for key, value in jobs.iteritems()}
        if not jobs:
            return

        last_command_id = max(j.id for j in jobs.itervalues())

        # we need to initialize ProcMan objects under root so that they can be killed later
        with system.UserPrivileges():
            active_sockets = list(system.TaskLiner.active_sockets)
            for job in jobs.itervalues():
                if job.liner is None and job.token and job.platform:
                    # check if there are any previously launched executors with this token (SANDBOX-5070)
                    truken = common_format.obfuscate_token(job.token)
                    if truken in active_sockets:
                        try:
                            job.liner = system.TaskLiner(truken, job.logger)
                        except socket.error as ex:
                            if ex.errno == errno.ECONNREFUSED:
                                # `job.liner` is still `None` here, so identify the liner by its obfuscated token
                                logger.warning("Liner job %r does not exist", truken)
                                continue
                            raise
                        logger.warning("Found existing liner job %r", job.liner)
                        if not job.pid:
                            # Nothing is executed by the job, so the found liner is useless
                            job.liner.terminate()
                            job.liner = None
                        else:
                            job.save_state()

        commands.Command.registry = jobs
        commands.Command.last_command_id = last_command_id
        # Command identifiers are generated by a counter which starts from the last restored identifier
        commands.Command.next_command_id = common_patterns.classproperty(
            lambda _, g=it.count(last_command_id): g.next()
        )

    @common_patterns.singleton_property
    def _is_dynamic_slots(self):
        """ Whether the client is tagged with `MULTISLOT`. """
        client_tags = common_config.Registry().client.tags
        return ctc.Tag.MULTISLOT in client_tags

    @common_patterns.singleton_property
    def _shared_cores(self):
        """ CPU cores shared by dynamic slots: the configured value or `SHARED_CORES` if it is not set. """
        configured = common_config.Registry().client.dynamic_slots.shared_cores
        return configured if configured else SHARED_CORES

    @common_patterns.singleton_property
    def _shared_ram(self):
        """ RAM shared by dynamic slots: the configured value or `SHARED_RAM` if it is not set. """
        configured = common_config.Registry().client.dynamic_slots.shared_ram
        return configured if configured else SHARED_RAM

    @property
    def _dynamic_slots_usage(self):
        """
        Cores and RAM requested by all the registered commands in total.

        A command without the `cores` or `ram` argument (or with an empty one) contributes 0.

        :rtype: DynamicSlotsUsage
        """
        cores_total = ram_total = 0
        for command in commands.Command.registry.itervalues():
            cores_total += command.args.get("cores") or 0
            ram_total += command.args.get("ram") or 0
        return DynamicSlotsUsage(cores_total, ram_total)

    @property
    def _free_slots(self):
        """
        Amount of free slots, never negative; `None` while a service command is registered.

        With dynamic slots this is the smaller of the unused shared cores and the unused shared RAM,
        otherwise the configured maximum of job slots minus the number of registered commands.
        """
        if commands.Command.registry.get(ctc.ServiceTokens.SERVICE_TOKEN) is not None:
            return None

        if not self._is_dynamic_slots:
            slots_limit = common_config.Registry().client.max_job_slots
            return max(slots_limit - len(commands.Command.registry), 0)

        usage = self._dynamic_slots_usage
        spare_cores = self._shared_cores - usage.cores
        spare_ram = self._shared_ram - usage.ram
        return max(min(spare_cores, spare_ram), 0)

    def start(self):
        """
        Start the socket wakeup thread and the kamikadze thread.

        :return: the object itself
        """
        self._service_threads["_sock_loop"].start()
        self._kamikadze_thread.start()
        return self

    def stop(self, on_signal=False):
        """
        Queue the `STOP` event.

        :param on_signal: passed along with the event; the call is logged only if it is false
        """
        self._events.put((Event.STOP, on_signal))
        if on_signal:
            # Nothing is reported in this case
            return
        logger.info("stop(%r) called.", on_signal)

    def wakeup(self):
        """ Queue the `WAKEUP` event. """
        self._events.put((Event.WAKEUP, None))

    @staticmethod
    def free_space_threshold(free_space_threshold, total_space):
        if free_space_threshold < 1:
            free_space_threshold *= total_space
        else:
            free_space_threshold <<= 20
        return free_space_threshold

    @staticmethod
    def __disk_status(free_space, total_space):
        """
        Classify the free space against the auto cleanup threshold.

        :return: `OK` above the threshold, `WARNING` above a half of it, `CRITICAL` otherwise
        """
        configured = common_config.Registry().client.auto_cleanup.free_space_threshold
        threshold = PingSandboxServerThread.free_space_threshold(configured, total_space)
        if free_space > threshold:
            status = ctc.DiskStatus.OK
        elif free_space > threshold / 2:
            status = ctc.DiskStatus.WARNING
        else:
            status = ctc.DiskStatus.CRITICAL
        return status

    @staticmethod
    def __disk_full(free_space, total_space):
        """ Whether the free space is below the hard free space threshold of the auto cleanup. """
        configured = common_config.Registry().client.auto_cleanup.hard_free_space_threshold
        hard_threshold = PingSandboxServerThread.free_space_threshold(configured, total_space)
        return free_space < hard_threshold

    def __get_status(self):
        """
        Get client status

        System parameters are collected on the first call only and cached in `sys_params`,
        while disk usage, slots usage and revisions are refreshed on every call.

        :return: status, this is the `last_system_status` dictionary itself, which is updated in place
        :rtype: dict
        """
        if self.sys_params is None:
            self.sys_params = common_system.get_sysparams()
            porto_enabled = common_config.Registry().client.porto.enabled and common_os.User.has_root
            self.sys_params.update({
                "age": self.CLIENT_AGE,
                "ram": self.sys_params.pop("physmem"),
                "uuid": self.__client_uuid(),
                "root": common_os.User.has_root,
                "platform": platform.platform(),
                "lxc": common_config.Registry().client.lxc.enabled and common_os.User.has_root,
                "porto": porto_enabled,
                "dc": common_config.Registry().this.dc,
            })
            # Configured dynamic slots resources, if any, override the detected ones
            shared_ram = common_config.Registry().client.dynamic_slots.shared_ram
            for key, value in {
                "ncpu": common_config.Registry().client.dynamic_slots.shared_cores,
                "ram": shared_ram and shared_ram << 20,
            }.items():
                if value:
                    self.sys_params[key] = value
        self.last_system_status.update(self.sys_params)
        df = self.agentr.df()
        # Locked space is not available, so it is excluded from the reported free space
        self.last_system_status["disk"] = {
            "total_space": df.total,
            "free_space": df.free - df.locked,
            "locked_space": df.locked,
            "status": self.__disk_status(df.free - df.locked, df.total),
        }
        max_job_slots = common_config.Registry().client.max_job_slots
        free_slots = self._free_slots
        if self._is_dynamic_slots:
            # Only `None` (a service command is registered) is reported as "nothing is used":
            # `0` free slots means that the client is fully loaded and must not be mixed up with it
            used_slots = 0 if free_slots is None else max(0, max_job_slots - free_slots)
        else:
            used_slots = len(commands.Command.registry)
        self.last_system_status["slots"] = {
            "used": used_slots,
            "total": max_job_slots,
        }
        self.last_system_status.update(self.revision_checker.get_revisions())
        return self.last_system_status

    @staticmethod
    def __client_uuid():
        """
        Get UUID of the client, which is kept in the `client.uuid` file of the client's run directory.

        A new UUID is generated and appended to the file if the file is missing or empty.
        """
        path = os.path.join(common_config.Registry().client.dirs.run, "client.uuid")
        # "a+" mode creates the missing file and keeps the content of an existing one
        with open(path, "a+") as storage:
            storage.seek(0)
            known = storage.read().strip()
            if known:
                return known
            generated = uuid.uuid4().hex
            logger.info("Client UUID is not found, created new UUID: %r", generated)
            storage.write(generated)
            return generated

    def _sock_loop(self):
        """ Accept connections on the lock socket, each of them queues the `WAKEUP` event and is closed at once. """
        logger.info("Socket wakeup thread started with socket %r.", self._lock_sock.getsockname())
        while True:
            if self.stopped.is_set():
                break
            connection, address = self._lock_sock.accept()
            self._events.put((Event.WAKEUP, None))
            logger.info("Woken via socket by %r.", address)
            connection.close()
        logger.info("Socket wakeup thread stopped.")

    @property
    @contextlib.contextmanager
    def lock(self):
        """
        Context manager, which holds a listening socket bound to the client's port on the loopback interface.

        The process exits with code 1 if the socket cannot be set up. The socket is closed on leaving the context.
        """
        port = common_config.Registry().client.port
        try:
            sock = self._lock_sock = socket.socket()
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(('127.0.0.1', port))
            logger.info("Bound on port: %s", port)
            sock.listen(3)
        except socket.error as ex:
            logger.error("Bind on port %s error: %s", port, ex)
            sys.exit(1)

        try:
            yield self
        finally:
            self._lock_sock.close()
            logger.info("Socket lock on port %s is closed", port)

    def _reset(self):
        """
        Clean up the host, terminate leftover liners, maintain LXC/Porto platforms, reset AgentR
        and send the current status to the server to register or update the client.
        """
        # cleanup all
        # An ad-hoc stand-in for a command object: it carries only the attributes which are set below
        cmd = type("Command", (object,), {"logger": logger, "task_id": None, "liner": None})()
        cmd.args = {}
        cmd.token = system.PROCMAN_TAG
        platform = platforms.Platform(cmd)
        # Override cgroup identifier getter on this instance to return an empty string
        setattr(platform, "_cgroup_id", lambda: "")
        platform.cleanup()
        platform.prepare(None)

        for truken in system.TaskLiner.active_sockets:
            logger.info("Found liner %r. Ensure it terminated.", truken)
            try:
                liner = system.TaskLiner(truken)
                liner.terminate()
            except (Exception, errors.InfraError):
                # The liner is unreachable or failed to terminate, so its socket is dropped as a stale one
                system.TaskLiner.drop_stale_socket(truken)

        if common_config.Registry().client.lxc.enabled:
            platforms.PrivilegedLXCPlatform.maintain()
            cmd.args["container"] = {"alias": None, "id": "privileged"}
            with system.UserPrivileges():
                platforms.PrivilegedLXCPlatform(cmd).resume()
            # Restore every running container, the fake command is re-targeted to each of them in turn
            for i, name in enumerate(platforms.LXCPlatform.running.keys()):
                container = platforms.lxc.Container(name=name)
                cmd.args["container"] = {"alias": None, "id": container.resource_id}
                plat = platforms.LXCPlatform(cmd)
                plat._container.instance = container.instance or None
                # `ramdrive` is deleted for the first container only
                if not i:
                    del plat.ramdrive
                plat.restore_system_files()
                plat.cleanup_home()

        if common_config.Registry().client.porto.enabled:
            platforms.PortoPlatform.maintain()

        # The status is collected before AgentR is reset
        status = self.__get_status()
        self.agentr("reset")
        status.update({"uuid": self.__client_uuid(), "tags": common_config.Registry().client.tags})
        # Any response other than `NO_CONTENT` is treated as a registration of a new client
        new = self.rest.client[self.node_id](status) != self.rest.NO_CONTENT
        logger.info("Server %s this client.", "registered" if new else "updated and reset")
        commands.Command.last_command_id = None

    @property
    def _shutting_down(self):
        """ Whether the registered service command is a shutdown or a reset of the client. """
        service_command = commands.Command.registry.get(ctc.ServiceTokens.SERVICE_TOKEN)
        final_commands = (commands.ShutdownClientCommand, commands.ResetClientCommand)
        return isinstance(service_command, final_commands)

    def _jobgetter(self, rest):
        """
        Single iteration of the job getter: wait for the bell (at most a tick), then ask the server for jobs.

        :param rest: REST client of the service thread
        :return: `True` to keep the service loop running, `None` if the client is stopped or is shutting down
        """
        self._getajob_bell.wait(self._timer.tick)
        self._getajob_bell.clear()
        if self.stopped.is_set() or self._shutting_down:
            return

        settings = common_config.Registry().client
        free_slots = self._free_slots
        logger.info(
            "%s Slots: %r, tick: %.2fs, idle: %.2fs",
            "Asking server for new jobs." if free_slots else "Updating client status.",
            free_slots, self._timer.tick, self._timer.idle
        )

        if not free_slots:
            logger.debug("No free slots to ask for a new job.")
            self._getajob(rest, free_slots)
            # Time spent on a service command is counted as idle
            if commands.Command.registry.get(ctc.ServiceTokens.SERVICE_TOKEN):
                self._timer.idle += self._timer.tick
        else:
            jobs = self._getajob(rest, free_slots)
            if jobs is not None:
                self._timer.idle = 0
            elif free_slots >= settings.max_job_slots:
                # Nothing to do at all: run an idle job as soon as the client was idle long enough
                if self._timer.idle >= settings.idle_time:
                    job = self._idle_job()
                    if isinstance(job, commands.Command.commands[ctc.Command.CLEAR]):
                        self.last_clear_command = job
                    self._timer.tick = settings.idle_time
                self._timer.idle += self._timer.tick

        # Every iteration ends with the tick set to the configured idle time
        self._timer.tick = settings.idle_time
        return True

    def _lxc_ip_router(self, rest):
        """
        Single iteration of the routes updater: push the host's routes to all the running containers
        if they differ from the routes saved in AgentR, then wait for the idle time or for the stop.

        :param rest: REST client of the service thread, not used here
        :return: always `True` to keep the service loop running
        """
        # Get current routes and compare them to saved ones.
        # If they're different -- push new routes to all containers.
        dev, _, _ = platforms.lxc.LXCNetwork.lxc_iface(0)  # Fake argument 0
        # Only the part of a route before the first double space is taken for the comparison
        routes = [route.split("  ")[0] for route in platforms.lxc.LXCNetwork.lxc_route_table(dev)]

        if platforms.LXCPlatform.instances and routes != self.agentr.iproutes:
            logger.info("Detected ip routes change!")
            logger.debug("Previous routes: %s", self.agentr.iproutes)
            logger.debug("Current routes: %s", routes)
            # Persist current routes to AgentR
            self.agentr.iproutes = routes

            for name in platforms.LXCPlatform.running.keys():
                container = platforms.lxc.Container(name=name)
                logger.info("Updating ip routes for container '%s'", container.name)
                with platforms.LXCPlatform._lock, system.UserPrivileges():
                    # Check that container is still running after getting the lock
                    if container.name not in platforms.LXCPlatform.running:
                        logger.debug("Container %s is already stopped", container.name)
                        continue

                    # Create update script to execute inside container
                    with open(container.path("etc", "update_routes.sh"), "w") as f:
                        # Enable verbose mode
                        f.write("set -x\n")
                        # First flush current routes
                        f.write("/sbin/ip -6 route flush scope global dev {}\n".format(dev))
                        # Now add every route from the host; the routing table is re-read here
                        # and its entries are written unabridged
                        for route in platforms.lxc.LXCNetwork.lxc_route_table(dev):
                            f.write("/sbin/ip route add {} || true\n".format(route))

                    logger.debug("Run update command for container '%s'", container.name)
                    update_cmd = [
                        "/usr/bin/lxc-attach", "-n", container.name, "--",
                        "/bin/sh", "/etc/update_routes.sh"
                    ]
                    try:
                        output = sp.check_output(update_cmd, stderr=sp.STDOUT)
                        logger.info("Update command output:\n%s", output)
                    except sp.CalledProcessError as exc:
                        # A failed update of a single container is only logged
                        logger.warning("ip route flush failed: %s", exc.output.strip())

        self.stopped.wait(common_config.Registry().client.idle_time)
        return True

    def _lxc_destroyer(self, rest):
        """
        Single iteration of the container destroyer: take a container from the destroy queue and destroy it.

        :param rest: REST client of the service thread, not used here
        :return: `True` to keep the service loop running, `None` if `None` was taken from the queue
        """
        queued = platforms.LXCPlatform._destroy_queue.get()
        if queued is None:
            return None

        container, options = queued
        logger.debug("Got container to destroy: %s", container)
        try:
            platforms.LXCPlatform._destroy_container(container, **options)
        except errors.ExecutorFailed:
            logger.exception("Error on container %s destroy.", container)
        finally:
            # The instance's cell is zeroed regardless of the outcome
            with platforms.LXCPlatform._lock:
                platforms.LXCPlatform.instances.get(container.template)[int(container.instance)] = 0

        return True

    def _service_thread_loop(self, func, watchdog=0):
        """
        Body of a service thread: call `func` with a thread's own REST client until it returns a false value,
        the client is stopped or the function is removed from the service threads.

        Any exception raised by the function is logged and the `STOP` event is queued.

        :param func: function of a single iteration, it takes the REST client as the only argument
        :param watchdog: TTL of the watchdog thread, which is re-armed after each iteration;
            no watchdog is started for a non-positive value
        """
        name = func.__name__
        logger.info("%r service thread started.", name)

        watchdog_thread = None
        if watchdog > 0:
            watchdog_thread = common_threading.KamikadzeThread(
                ttl=watchdog, logger=logger, name=name + "_watchdog"
            )
            watchdog_thread.start()

        try:
            rest = common_rest.ThreadLocalCachableClient(
                common_config.Registry().client.rest_url,
                auth=self.token,
                component=ctm.Component.CLIENT,
                total_wait=0
            )
            while True:
                if self.stopped.is_set() or name not in self._service_threads:
                    break
                if not func(rest):
                    break
                if watchdog_thread:
                    watchdog_thread.ttl = watchdog
        except Exception:
            logger.exception("Fatal error in service thread %r", name)
            self._events.put((Event.STOP, None))
        finally:
            if watchdog_thread:
                watchdog_thread.stop()
                watchdog_thread.join()
        logger.info("%r service thread stopped.", name)

    def _delete_job_on_server(self, rest, dropajob_info):
        """
        Ask the server to delete a job.

        :param rest: REST client to use
        :param dropajob_info: request payload, its `token` item is used for logging
        :return: `False` if the request should be retried later (timeout or HTTP error other than "not found"),
            `True` otherwise, including the case of an unexpected exception, which is only logged
        """
        # Raw tokens must never get to logs, so every message below uses the obfuscated form
        log_token = common_format.obfuscate_token(dropajob_info.get("token"))
        try:
            rest.client[self.node_id].job.delete(dropajob_info)
        except rest.HTTPError as ex:
            if ex.status != httplib.NOT_FOUND:
                logger.error("Error dropping job %r: %s", log_token, ex)
                return False
            # The session is already gone, nothing is left to delete
            logger.warning("There is no session %r", log_token)
        except rest.TimeoutExceeded as ex:
            logger.error("Error dropping job %r: %s", log_token, ex)
            return False
        except Exception as ex:
            # Unexpected errors are not retried
            logger.error("Error dropping job %r: %s", log_token, ex)
        else:
            logger.debug("Job %r successfully dropped", log_token)
        return True

    def __call_with_retries(self, func, *args, **kwargs):
        for _ in range(50):
            try:
                func(*args, **kwargs)
                break
            except Exception:
                logger.exception("Exception in AgentR call")
                time.sleep(2)
        else:
            return False
        return True

    def _jobdropper(self, rest):
        """
        Single iteration of the job dropper.

        First retry dropping of the jobs saved in AgentR, then take a completed job from the queue
        (waiting up to 30 seconds), clean up its platform, finish its AgentR session, remove it from the registry
        and delete it on the server. Information about the job being dropped is saved in AgentR
        before the deletion and removed after it, so a failed deletion is retried on next iterations.

        :param rest: REST client of the service thread
        :return: `True` to keep the service loop running, `None` if `None` was taken from the queue
        """
        try:
            dropajobs_info = self.agentr.dropajobs_info
        except Exception:
            logger.exception("Exception on taking jobs to drop from AgentR")
            return True

        # Setter wrapped into a function to be passed to `__call_with_retries`
        def set_dropajobs_info(dropajobs_info):
            self.agentr.dropajobs_info = dropajobs_info

        # Retry the deletions which failed on previous iterations
        if dropajobs_info:
            finished_tokens = set()
            for job_token, info in dropajobs_info.iteritems():
                logger.debug("Finish dropping job %s", common_format.obfuscate_token(job_token))
                if self._delete_job_on_server(rest, info):
                    finished_tokens.add(job_token)

            dropajobs_info = {k: v for k, v in dropajobs_info.iteritems() if k not in finished_tokens}
            if not self.__call_with_retries(set_dropajobs_info, dropajobs_info):
                return True

        try:
            job = self._dropajob.get(timeout=30)
            logger.debug("Job dropper new object: %r", job)
        except queue.Empty:
            # Give a chance for `dropajobs` in agentr state
            return True

        # `None` in the queue makes the iteration return a false value, which ends the service loop
        if job is None:
            return

        try:
            if job.platform:
                job.platform.cleanup()
                if job.status is errors.ContainerError:
                    job.platform.on_system_error()
                job.platform = None
        except (Exception, errors.InfraError):
            logger.exception("Error on job %r cancelling.", job)

        dropajob_info = {}
        if not isinstance(job, commands.ServiceCommand):
            # NOTE(review): `job.status` is handled as a class for "restart" below, for such a value
            # `isinstance(job.status, errors.InvalidJob)` is false -- confirm whether `issubclass` was meant
            dropajob_info = {
                "token": job.token,
                "reject": job.status not in ctt.Status and not isinstance(job.status, errors.InvalidJob),
                "reason": job.status_message,
                "reject_type": job.reject_type,
                "restart": inspect.isclass(job.status) and issubclass(
                    job.status, (errors.InfraError, errors.ShuttingDown, errors.ContainerError)
                ),
                "target_status": job.status if job.status in ctt.Status else None,
                "wait_targets": job.executor_ctx.get("wait_targets")
            }
            # Save the request in AgentR before going any further
            dropajobs_info[job.token] = dropajob_info
            if not self.__call_with_retries(set_dropajobs_info, dropajobs_info):
                return True

        try:
            # NOTE(review): `job` cannot be `None` here because of the early return above
            if job is not None and job.agentr:
                job.agentr.finished(mark_as_ready=None)
        except ar_errors.NoTaskSession:
            pass
        except ar_errors.UmountError as e:
            if e.lxc:
                logger.warning("Got umount error from agentr, while dropping a job. Destroy container %s", e.lxc)
                platforms.LXCPlatform._destroy_queue.put((e.lxc, dict(check=False, stop=True)))
            else:
                logger.warning("Got umount error from agentr, while dropping a job.")
        except ar_errors.ARException:
            logger.exception("Error on communicating with AgentR.")

        commands.Command.registry.pop(job.token, None)

        if not isinstance(job, commands.ServiceCommand):
            obfuscated_token = common_format.obfuscate_token(job.token)
            logger.debug("Drop job %s in status %s", obfuscated_token, job.status)
            if self._delete_job_on_server(rest, dropajob_info):
                dropajobs_info.pop(job.token, None)
                if not self.__call_with_retries(set_dropajobs_info, dropajobs_info):
                    return True
            else:
                # The request stays in AgentR and is retried at the beginning of next iterations
                logger.warning(
                    "Can't drop job %s. Try to drop it later.", obfuscated_token
                )

            # A slot has been released, wake up the job getter
            self._getajob_bell.set()
        else:
            if job.status == ctt.Status.SUCCESS:
                self._getajob_bell.set()

        # Count sequential executor failures; any job which is not counted here resets the counter
        if job.status is errors.ExecutorFailed and not self.stopped.is_set() and not self._shutting_down:
            # FIXME: SANDBOX-6163: Temporary use `lower` here
            if str(job.exec_type).lower() in map(str.lower, ctt.ImageType.Group.REGULAR):
                self.executor_fails += 1
                logger.error("Executor failed (%s/%s). Job: %r", self.executor_fails, MAX_EXECUTOR_FAILS, job)
                if self.executor_fails >= MAX_EXECUTOR_FAILS and not self._shutting_down:
                    logger.error(
                        "Executor failures %d exceeds maximum %d. Shutting down.",
                        self.executor_fails, MAX_EXECUTOR_FAILS
                    )
                    commands.ShutdownClientCommand.emergency_shutdown()
            else:
                logger.error(
                    "Job's %r executor of type '%s' failed. Do not increment failure count (%s).",
                    common_format.obfuscate_token(job.token), job.exec_type, self.executor_fails
                )
        else:
            self.executor_fails = 0

        return True

    def _status_report(self, rest):
        """
        Single iteration of the status reporter: wait for the idle time (5 seconds after a timed out report),
        then send the client's status with the list of jobs to the server.

        The `VALIDATE` event is queued if the server answers with `RESET` or the update fails with an HTTP error.

        :param rest: REST client of the service thread
        :return: `True` to keep the service loop running, `None` if the client is stopped or is shutting down
        """
        # NOTE(review): presumably the TTL is raised for the time of the report and lowered back at the end -- confirm
        self._kamikadze_thread.ttl = common_config.Registry().client.idle_time * 10

        if self._status_report_timeouted:
            self.stopped.wait(timeout=5)  # Wait a little bit after timeout to decrease server load
            self._status_report_timeouted = False
        else:
            self.stopped.wait(timeout=common_config.Registry().client.idle_time)

        if self.stopped.is_set() or self._shutting_down:
            return

        status = self.__get_status()
        # Work with a copy of the registry
        jobs = commands.Command.registry.copy()

        status["jobs"] = [
            {"id": token, "state": str(job.session_state)}
            for token, job in jobs.iteritems() if token != ctc.ServiceTokens.SERVICE_TOKEN
        ]
        # The logged copy of the status differs by obfuscated tokens only
        log_status = copy.copy(status)
        log_status["jobs"] = [
            {"id": common_format.obfuscate_token(token), "state": str(job.session_state)}
            for token, job in jobs.iteritems() if token != ctc.ServiceTokens.SERVICE_TOKEN
        ]

        logger.info(
            "Slots: %r, tick: %.2fs, idle: %.2fs, jobs: %r, status update: %r",
            self._free_slots, self._timer.tick, self._timer.idle, len(jobs), log_status
        )
        logger.debug(
            "Getajob tokens: %s", map(common_format.obfuscate_token, (self.agentr.getajob_token or "").split())
        )
        logger.debug(
            "Dropajob token: %s",
            ", ".join(
                [common_format.obfuscate_token(token) for token in self.agentr.dropajobs_info.iterkeys()]
            )
        )

        # Don't retry longer than default timeout to avoid reporting outdated status
        rest.total_wait = rest.DEFAULT_TIMEOUT

        try:
            reset = rest.client[self.node_id].update(status) == rest.RESET
        except rest.TimeoutExceeded:
            logger.error("Status report timeouted, try again later")
            # Makes the next iteration wait for 5 seconds instead of the idle time
            self._status_report_timeouted = True
            reset = False
        except rest.HTTPError:
            logger.exception("Error on status update. Re-checking jobs.")
            reset = True

        if reset:
            self._events.put((Event.VALIDATE, None))

        self._kamikadze_thread.ttl = common_config.Registry().client.idle_time * 3
        return True

    def _validate_jobs(self):
        """
        Synchronize the local jobs with the list of the client's jobs on the server.

        Jobs known by the server only are deleted there (unless they are being received or dropped right now),
        session state of the jobs known by both sides is checked, jobs known locally only are cancelled
        with `errors.SessionExpired`.
        """
        # Tokens of the local jobs, service tokens are not subject of validation
        current = set(commands.Command.registry.keys())
        current.discard(ctc.ServiceTokens.SERVICE_TOKEN)
        current.discard(ctc.ServiceTokens.TASKBOX_TOKEN)

        actual = self.rest.client[self.node_id].job[:]
        log_actual = {}
        # `RESET` response is handled as an empty list of jobs
        if actual == self.rest.RESET:
            actual = {}
        else:
            actual = {_["id"]: (_["state"], _.get("options")) for _ in actual}
            log_actual = {common_format.obfuscate_token(key): value for key, value in actual.iteritems()}

        logger.debug("Actual jobs list: %r", log_actual)

        getajob_tokens = (self.agentr.getajob_token or "").split()
        for token, (state, options) in actual.iteritems():
            # Skip active sessions whose tokens are listed in AgentR's `getajob_token`
            if token in getajob_tokens and state == ctt.SessionState.ACTIVE:
                continue

            job = commands.Command.registry.get(token, ctm.NotExists)
            if job is ctm.NotExists:
                # check that this job is not currently being dropped
                if token in self.agentr.dropajobs_info:
                    continue

                logger.error("There's no record about a job with ID %r", common_format.obfuscate_token(token))
                try:
                    self.rest.client[self.node_id].job.delete({
                        "token": token,
                        "reject": True,
                        "reason": "Unknown job",
                        "restart": True,
                    })
                except self.rest.HTTPError as ex:
                    if ex.status == httplib.NOT_FOUND:
                        logger.warning("Job %r is already deleted", common_format.obfuscate_token(token))
                    else:
                        raise
                continue
            # The token is registered, but there is no job object for it
            if job is None:
                continue
            job.check_session_state(state, options)

        # Local jobs which are unknown to the server
        drop = sorted(current - actual.viewkeys())
        log_drop = list(map(common_format.obfuscate_token, drop))
        if drop:
            logger.info("Jobs list from server side: %r, drop: %r", sorted(log_actual), log_drop)
        for token in drop:
            # The registry is re-checked, because it could be changed since the snapshot was taken
            job = commands.Command.registry.get(token, ctm.NotExists)
            if job is ctm.NotExists:
                logger.warning("Job %r is already cancelled, skip dropping it", common_format.obfuscate_token(token))
                continue
            logger.warning("Cancelling job %r", job)
            job.cancel(errors.SessionExpired)

    def _on_job_completed(self, job):
        """
        Handle a finished job: log the outcome, then either launch its follow-up job or queue it for dropping.

        A job is considered failed when its status is an exception class. A failed job is never asked
        for a follow-up (`on_terminate()` is not called) and goes straight to the drop queue.

        :param job: finished command; its status must be set and must differ from `SUSPENDED`
        """
        assert job.status and job.status != ctt.Status.SUSPENDED

        # Service commands are reported in logs without a token.
        if isinstance(job, commands.ServiceCommand):
            log_token = None
        else:
            log_token = job.token

        failed = isinstance(job.status, type) and issubclass(job.status, BaseException)
        report = logger.error if failed else logger.info
        report("Job %r %r completed with status %r", common_format.obfuscate_token(log_token), job, job.status)

        follow_up = None if failed else job.on_terminate()
        if follow_up is None:
            self._dropajob.put(job)
            return

        logger.info("Running subsequent job %r %r.", common_format.obfuscate_token(log_token), follow_up)
        # The follow-up job inherits the token (and the service credentials, if any) of the finished one.
        follow_up.token = job.token
        service_auth = job.args.get("service_auth")
        if service_auth:
            follow_up.args["service_auth"] = service_auth
        follow_up.start()

    @staticmethod
    def _filter_response_for_logs(resp):
        if type(resp) is dict and "id" in resp:
            resp = copy.deepcopy(resp)
            resp["id"] = common_format.obfuscate_token(resp["id"])
        return resp

    def _getajob(self, rest, free_slots=None):
        """
        Report the client state to the server and, when allowed, fetch and start new jobs.

        :param rest: REST client object used for the requests
        :param free_slots: number of job slots available for new jobs; a falsy value means
            that only the client status is reported
        :return: list of started commands, or `None` if no job has been taken
        """
        disk = self.agentr.df()
        disk_full = self.__disk_full(disk.free, disk.total)

        postpone = False
        if free_slots:
            last_clear = self.last_clear_command
            if last_clear is None:
                # No clear command has been registered so far: a full disk alone blocks new jobs.
                postpone = bool(disk_full)
            else:
                # The disk is full and the last clear command started longer than `hard_clear_interval` ago.
                overdue_clear = disk_full and (
                    time.time() - last_clear.timestamp_start >
                    common_config.Registry().client.hard_clear_interval
                )
                # Cleanup is required, at least `max_job_slots` slots are free
                # and the last clear command has finished successfully.
                pending_cleanup = (
                    self.need_cleanup() and
                    free_slots >= common_config.Registry().client.max_job_slots and
                    last_clear.status == ctt.Status.SUCCESS
                )
                postpone = bool(overdue_clear or pending_cleanup)

        if not free_slots or postpone:
            # Only refresh the client status on the server side, no tokens are sent.
            try:
                rest.client[self.node_id].job(age=self.CLIENT_AGE, disk_free=disk.free - disk.locked)
            except rest.TimeoutExceeded as error:
                logger.error("Error on updating client status: %s", error)
            except rest.HTTPError:
                logger.exception("Error on updating client status")
            return

        # One token per free slot: reuse the stored ones and generate the missing ones.
        tokens = (self.agentr.getajob_token or "").split()
        missing = free_slots - len(tokens)
        if missing > 0:
            tokens += [uuid.uuid4().hex for _ in xrange(missing)]
            self.agentr.getajob_token = " ".join(tokens)

        try:
            usage = self._dynamic_slots_usage
            if self._is_dynamic_slots:
                free_resources = {
                    "ram": self._shared_ram - usage.ram,
                    "cores": self._shared_cores - usage.cores,
                }
            else:
                free_resources = None
            jobs = rest.client[self.node_id].job(
                age=self.CLIENT_AGE, disk_free=disk.free - disk.locked, tokens=tokens, free=free_resources
            )
        except rest.TimeoutExceeded as error:
            logger.error("Error on asking for a new job: %s", error)
            return
        except rest.HTTPError:
            logger.exception("Error on asking for a new job")
            return

        if jobs == rest.RESET:
            logger.warning("Resetting token(s)")
            self.agentr.getajob_token = None
            return

        if jobs == rest.NO_CONTENT:
            return

        started = []
        for job in jobs:
            logger.info("New job for task #%s from server: %r", job.get("task_id"), self._filter_response_for_logs(job))
            cmd = commands.Command(job.pop("command"), job)
            if isinstance(cmd, commands.ServiceCommand):
                is_reset = isinstance(cmd, commands.ResetClientCommand)
                has_jobs = bool(commands.Command.registry)
                if is_reset and not has_jobs:
                    logger.debug("Got RESET command, but job registry is already empty")
                    return None
                if not is_reset and has_jobs:
                    logger.warning("Service command for busy client!")
                    return None
                cmd.args["service_auth"] = self.token or self.__client_uuid()
            cmd.start()
            started.append(cmd)
            # A token taken by a started command must not be offered to the server again.
            taken = cmd.token
            if taken in tokens:
                tokens.remove(taken)

        self.agentr.getajob_token = " ".join(tokens) or None
        return started

    @classmethod
    def need_cleanup(cls):
        """
        Check whether the client should be cleaned up because of lack of free disk space.

        The check is driven by `client.auto_cleanup.free_space_threshold` setting:

        - a falsy value disables the check;
        - a value below 1 is a fraction of the total disk space which has to stay free;
        - a value above 1 is an absolute amount, shifted left by 20 bits before being compared
          with the free space (i.e. presumably MiB against a free space in bytes -- confirm).

        :return: human-readable reason (`"<actual> < <threshold>"`) if cleanup is required, `None` otherwise
        """
        th = common_config.Registry().client.auto_cleanup.free_space_threshold
        if not th:
            return None

        disk = cls.last_system_status["disk"]
        fs = disk["free_space"]
        if th < 1:
            # Force float arithmetic: under Python 2 `fs * 100 / ts` on integers truncates
            # the percentage, which distorts both the comparison and the "%.2f" report.
            us = fs * 100.0 / disk["total_space"]
            if us < th * 100:
                return "%.2f%% < %.2f%%" % (us, th * 100)
        elif th > 1 and fs < th << 20:
            return "%s < %s" % (common_format.size2str(fs), common_format.size2str(th << 20))
        return None

    def _idle_job(self):
        """
        Start a self-service command: `CLEAR` if the client lacks free disk space, `IDLE` otherwise.

        :return: the started command
        """
        logger.debug("Servicing self.")
        # Lack of free disk space turns the idle job into a cleanup.
        cleanup_reason = self.need_cleanup()
        if not cleanup_reason:
            command_name = ctc.Command.IDLE
        else:
            logger.info("Cleaning up the client because of lack of free space (%s).", cleanup_reason)
            command_name = ctc.Command.CLEAR

        service_cmd = commands.Command(command_name)
        service_cmd.args["service_auth"] = self.token or self.__client_uuid()
        service_cmd.start()
        return service_cmd

    def _serve_events(self):
        """
        Take a single `(event, job)` pair from the events queue and process it.

        Waits for an event for up to one second and returns silently if the queue stays empty.
        """
        try:
            item = self._events.get(True, 1)
        except queue.Empty:
            return
        event, job = item

        logger.info(
            "Processing event %r. Slots: %r, tick: %.2fs, idle: %.2fs",
            event, self._free_slots, self._timer.tick, self._timer.idle
        )

        if event == Event.WAKEUP:
            logger.info("B-Z-Z-Z-Z-Z-Z!!!")
            self._getajob_bell.set()
            # From here on a wakeup is processed exactly like an empty event.
            event = None

        if event is None:
            self._getajob_bell.set()
            return

        if event == Event.STOP:
            if job:
                logger.info("Stopping by a signal.")
            self.wait_stop()
            return

        if event == Event.JOB_COMPLETED:
            job.join()
            self._on_job_completed(job)
            return

        if event == Event.VALIDATE:
            # Validation is pointless while the client is stopping:
            # chances are the registry is missing some jobs.
            if self.stopped.is_set():
                logger.info("Client is stopping, skip %r event", event)
            else:
                self._validate_jobs()
            return

        logger.warning("Unhandled event %r", event)

    def wait_stop(self, final=False):
        """
        Switch the client to the stopped state and wait for service and job threads to finish.

        :param final: if true, waiting for job threads is logged as a warning instead of an info message
        """
        self._kamikadze_thread.ttl = common_config.Registry().client.idle_time

        self.set_stopped()
        # `None` put to the queues presumably acts as a termination marker for their consumers -- confirm.
        self._dropajob.put(None)
        platforms.LXCPlatform._destroy_queue.put(None)
        self._getajob_bell.set()

        # Make a short-lived connection to the local client port (presumably to wake up a listener
        # blocked on it -- confirm). The socket is closed right away instead of being left
        # to the garbage collector; any socket error here is deliberately ignored.
        try:
            socket.create_connection(("127.0.0.1", common_config.Registry().client.port), 1).close()
        except socket.error:
            pass

        logger.info("Waiting for service threads stop.")
        for t in self._service_threads.itervalues():
            # Daemon threads are not waited for.
            if not t.daemon:
                t.join()

        if commands.Command.registry:
            log_fn = logger.warning if final else logger.info
            log_fn("Waiting for job threads stop.")

            # `items()` gives a snapshot here, so the registry can be modified while iterating.
            for token, job_thread in commands.Command.registry.items():
                log_fn("Waiting for job %s", job_thread)
                job_thread.join()
                commands.Command.registry.pop(token, None)

    def run(self):
        """
        Entry point of the pinger thread.

        Sets up the REST client, then either resets the client (empty job registry) or restores
        the previously registered jobs, starts the inner service threads and serves events until
        the client is stopped. Finally waits for all the threads and stops the kamikadze thread.
        """
        try:
            logger.info("Pinger thread started. Max slots: %r", common_config.Registry().client.max_job_slots)
            self._timer.tick = common_config.Registry().client.min_tick
            # OAuth token is read from the configured file, if any; otherwise requests go without it
            self.token = (
                common_fs.read_settings_value_from_file(common_config.Registry().client.auth.oauth_token)
                if common_config.Registry().client.auth.oauth_token else
                None
            )
            self.rest = common_rest.Client(
                common_config.Registry().client.rest_url,
                auth=self.token, component=ctm.Component.CLIENT
            )
            if not commands.Command.registry:
                # No jobs registered -- start from a clean state
                self._reset()
                self._timer.updated = time.time()
            else:
                if common_config.Registry().client.lxc.enabled:
                    platforms.LXCPlatform.maintain()  # Do not try to destroy privileged container

                if common_config.Registry().client.porto.enabled:
                    platforms.PortoPlatform.maintain()

                # Ask the server about the jobs of this client before starting the registered ones
                if self.rest.client[self.node_id].job[:] == self.rest.RESET:
                    logger.info("Got RESET before jobs started. Client state is outdated, do initial reset.")
                    commands.ResetClientCommand.initial_reset()
                else:
                    self.__get_status()
                    for job in commands.Command.registry.values():
                        job.start()

            inner_service_threads = (
                "_jobgetter", "_jobdropper", "_status_report",
                "_lxc_ip_router", "_lxc_destroyer"
            )

            if not self.stopped.is_set():
                # Start only those inner service threads which are actually registered
                for k in inner_service_threads:
                    if k in self._service_threads:
                        self._service_threads[k].start()
                # Serve events until the client is stopped and the events queue is drained
                while True:
                    self._serve_events()
                    if self.stopped.is_set() and self._events.empty():
                        break
            else:
                # these threads haven't been started, so avoid joining them
                for k in inner_service_threads:
                    self._service_threads.pop(k, None)

            logger.info("Pinger thread stopped.")
        # NOTE(review): bare `except` also catches `SystemExit` and `KeyboardInterrupt`;
        # the error is only logged and the shutdown sequence below is performed anyway.
        except:
            logger.exception("Fatal error at the main loop.")

        self.wait_stop(final=True)

        self._kamikadze_thread.stop()
        self._kamikadze_thread.join()
