import random

import inject
import itertools
import monotonic
import six
import time
from datetime import datetime, timedelta
from sepelib.core import config as appconfig
from six.moves import zip

from awacs import resolver
from awacs.lib import ctlmanager, nannyclient, context, gutils
from awacs.lib.strutils import to_full_id, join_full_uids
from awacs.model import events, db, cache, util, errors
from awacs.model.balancer import generator
from awacs.model.balancer.generator import Resolution
from awacs.model.balancer.vector import BackendVersion
from awacs.model.dao import IDao, Dao
from awacs.model.util import make_system_endpoint_set_id
from awacs.model.zk import IZkStorage, ZkStorage
from infra.awacs.proto import model_pb2
from infra.swatlib import metrics


# Metrics registry path ('backend-ctl') from which all BackendCtl counters below are obtained.
BACKEND_CTL_REGISTRY = metrics.ROOT_REGISTRY.path('backend-ctl')


def load_yp_sd_timestamps(yp_sd_timestamps):
    """
    Turn flat u'<first>/<rest>' keys into tuple keys; values are kept as is.

    Every key is split on its first u'/' only, so further slashes stay in the second element
    (a key without any slash becomes a one-element tuple).

    :type yp_sd_timestamps: dict[six.text_type, int]
    :rtype: dict[tuple[six.text_type, six.text_type], int]
    """
    return {
        tuple(flat_key.split(u'/', 1)): timestamp
        for flat_key, timestamp in six.iteritems(yp_sd_timestamps)
    }


def dump_yp_sd_timestamps(yp_sd_timestamps):
    """
    Turn tuple keys into flat keys by joining their elements with u'/'; values are kept as is.

    :type yp_sd_timestamps: dict[tuple[six.text_type, six.text_type], int]
    :rtype: dict[six.text_type, int]
    """
    return {
        u'/'.join(id_parts): timestamp
        for id_parts, timestamp in six.iteritems(yp_sd_timestamps)
    }


def exception_to_status_message(e):
    """
    Build a human-readable text for an exception.

    A truthy `status_message` attribute of the exception wins; otherwise the exception itself
    is converted to text.

    :type e: Exception
    :rtype: six.text_type
    """
    status_message = getattr(e, 'status_message', None)
    return six.text_type(status_message if status_message else e)


class BackendCtl(ctlmanager.ContextedCtl):
    """
    Controller of a single backend, identified by a namespace id and a backend id.

    A processing pass resolves the backend's selector into a list of instances and creates or
    updates the endpoint set that has the same id as the backend. The outcome of a pass is
    stored in the backend's resolver status.
    """
    # Collaborators obtained through `inject`.
    _cache = inject.attr(cache.IAwacsCache)  # type: cache.AwacsCache
    _db = inject.attr(db.IMongoStorage)  # type: db.MongoStorage
    _zk = inject.attr(IZkStorage)  # type: ZkStorage
    _dao = inject.attr(IDao)  # type: Dao
    _nanny_client = inject.attr(nannyclient.INannyClient)  # type: nannyclient.NannyClient

    # Counters of specific resolving failures, incremented in _resolve().
    _yp_endpoint_set_does_not_exist_error_counter = BACKEND_CTL_REGISTRY.get_counter(
        'yp-endpoint-set-does-not-exist-error')
    _yp_sd_obsolete_response_error_counter = BACKEND_CTL_REGISTRY.get_counter('yp-sd-obsolete-response-error')
    _yp_endpoint_set_is_empty_error_counter = BACKEND_CTL_REGISTRY.get_counter('yp-endpoint-set-is-empty-error')
    _instances_list_is_empty_error_counter = BACKEND_CTL_REGISTRY.get_counter('instances-list-is-empty-error')
    # Overall outcome of a processing pass, incremented in _process_and_update_counters().
    _resolve_error_counter = BACKEND_CTL_REGISTRY.get_counter('resolve-error')
    _resolve_success_counter = BACKEND_CTL_REGISTRY.get_counter('resolve-success')
    # Which way YP endpoint sets were resolved: without or with service discovery (see _resolve()).
    _yp_endpoint_set_master_resolve_counter = BACKEND_CTL_REGISTRY.get_counter('yp-endpoint-set-master-resolve')
    _yp_endpoint_set_sd_resolve_counter = BACKEND_CTL_REGISTRY.get_counter('yp-endpoint-set-sd-resolve')
    # Incremented after every successful self._dao.update_endpoint_set() call.
    _updated_endpoint_sets_counter = BACKEND_CTL_REGISTRY.get_counter('updated-endpoint-sets')

    # Seconds between processing passes; a random jitter of +/- PROCESS_INTERVAL_JITTER seconds
    # is added to every interval (see _reset_processing_timers()).
    PROCESS_INTERVAL = 90
    PROCESS_INTERVAL_JITTER = 15
    # NOTE(review): the two values below are not used in the visible part of the class;
    # presumably they are in seconds as well -- confirm against their users.
    SELF_DELETION_CHECK_INTERVAL = 30
    SLEEP_AFTER_EXCEPTION_TIMEOUT = 30

    # Half of the shorter of the two intervals above.
    EVENTS_QUEUE_GET_TIMEOUT = min(PROCESS_INTERVAL, SELF_DELETION_CHECK_INTERVAL) // 2

    # a number of seconds to wait before actual deletion after backend marked as deleted
    # introduced to avoid races with sluggish balancer controllers
    SELF_DELETION_COOLDOWN_PERIOD = 60 * 5  # 5 minutes

    def __init__(self, namespace_id, backend_id):
        """
        :param namespace_id: id of the namespace the backend belongs to
        :param backend_id: id of the backend handled by this controller
        """
        super(BackendCtl, self).__init__('backend-ctl("{}:{}")'.format(namespace_id, backend_id))

        self._namespace_id = namespace_id
        self._backend_id = backend_id

        # the most recent backend version handled by _process_backend_update(), if any
        self._seen_backend_version = None

        # both deadlines start at "now"
        self._processing_deadline = monotonic.monotonic()
        self._self_deletion_check_deadline = monotonic.monotonic()

    def _accept_event(self, event):
        """
        Tell whether an event concerns the backend (or its endpoint set) handled by this controller.

        Only BackendUpdate and EndpointSetUpdate events whose meta carries this controller's
        namespace id and backend id are accepted.

        :type event: events.*
        :rtype: bool
        """
        if not isinstance(event, (events.BackendUpdate, events.EndpointSetUpdate)):
            return False
        meta_pb = event.pb.meta
        return meta_pb.namespace_id == self._namespace_id and meta_pb.id == self._backend_id

    def _start(self, ctx):
        """
        Run one processing pass, bind self._callback to the cache and set both deadlines to "now".

        Exceptions listed in ctlmanager.UNEXPECTED_EXCEPTIONS raised by the initial pass are
        logged and not re-raised.

        :type ctx: context.OpCtx
        """
        try:
            self._process(ctx)
            # let's not count on-start failures or successes to
            # _resolve_error_counter and _resolve_success_counter to avoid peaks during awacs redeployment
        except ctlmanager.UNEXPECTED_EXCEPTIONS as e:
            ctx.log.exception('failed to process backend on start: %s', e)
        self._cache.bind(self._callback)
        # _process() may have moved the processing deadline into the future;
        # here both deadlines are explicitly set to the current moment
        self._processing_deadline = monotonic.monotonic()
        self._self_deletion_check_deadline = monotonic.monotonic()

    def _stop(self):
        """
        Unbind self._callback from the cache (the counterpart of the bind() call in _start()).
        """
        self._cache.unbind(self._callback)

    def _create_endpoint_set(self, backend_meta_pb, backend_selector_pb, instance_pbs, is_global_pb):
        """
        Create the endpoint set of this backend (same namespace id and id) from resolved instances.

        :param backend_meta_pb: backend meta; its version and is_system are recorded in the new meta
        :param backend_selector_pb: selector the instances were resolved from
        :param instance_pbs: resolved instances to copy into the spec
        :param is_global_pb: value copied into spec.is_global
        :returns: the result of self._dao.create_endpoint_set()
        """
        copied_fields = ('host', 'port', 'ipv4_addr', 'ipv6_addr', 'weight')
        now = datetime.utcnow()

        new_meta_pb = model_pb2.EndpointSetMeta(id=self._backend_id, namespace_id=self._namespace_id)
        new_meta_pb.ctime.FromDatetime(now)
        new_meta_pb.mtime.FromDatetime(now)
        # the STAFF enum value is reached through a throwaway EndpointSet message
        new_meta_pb.auth.type = model_pb2.EndpointSet().meta.auth.STAFF
        new_meta_pb.backend_versions.append(backend_meta_pb.version)
        new_meta_pb.resolved_from.CopyFrom(backend_selector_pb)
        new_meta_pb.is_system.CopyFrom(backend_meta_pb.is_system)

        new_spec_pb = model_pb2.EndpointSetSpec()
        for src_pb in instance_pbs:
            new_spec_pb.instances.add(**{name: getattr(src_pb, name) for name in copied_fields})
        new_spec_pb.is_global.CopyFrom(is_global_pb)

        return self._dao.create_endpoint_set(
            meta_pb=new_meta_pb,
            spec_pb=new_spec_pb,
            login=util.NANNY_ROBOT_LOGIN)

    def _update_endpoint_set(self, backend_meta_pb, backend_selector_pb, endpoint_set_pb,
                             instance_pbs=None, is_global_pb=None):
        """
        Store an updated spec for the existing endpoint set and bump the update counter.

        The new spec is a clone of the current one in which is_global and/or the instances list
        are replaced, depending on which optional arguments are given.

        :param backend_meta_pb: backend meta; its version and is_system are passed to the dao
        :param backend_selector_pb: selector the instances were resolved from
        :param endpoint_set_pb: current endpoint set; its meta.version is passed to the dao
        :param instance_pbs: if truthy, replaces the instances list; otherwise instances stay as is
        :param is_global_pb: if truthy, is copied into spec.is_global; otherwise it stays as is
        """
        copied_fields = ('host', 'port', 'ipv4_addr', 'ipv6_addr', 'weight')
        robot_login = util.NANNY_ROBOT_LOGIN

        new_spec_pb = util.clone_pb(endpoint_set_pb.spec)
        if is_global_pb:
            new_spec_pb.is_global.CopyFrom(is_global_pb)
        if instance_pbs:
            new_spec_pb.ClearField('instances')
            for src_pb in instance_pbs:
                new_spec_pb.instances.add(**{name: getattr(src_pb, name) for name in copied_fields})

        self._dao.update_endpoint_set(
            namespace_id=self._namespace_id,
            endpoint_set_id=self._backend_id,
            version=endpoint_set_pb.meta.version,
            comment='Updated by {}'.format(robot_login),
            login=robot_login,
            backend_version=backend_meta_pb.version,
            backend_selector_pb=backend_selector_pb,
            updated_spec_pb=new_spec_pb,
            updated_is_system_pb=backend_meta_pb.is_system,
        )
        self._updated_endpoint_sets_counter.inc()

    def _fill_snapshot_ids(self, selector_pb):
        """
        Fill snapshot_id of every Nanny snapshot of the selector, in place.

        For each service the target runtime attrs id is used; if Nanny reports none (None),
        the current runtime attrs id is used instead. Snapshot ids are expected to be empty
        on entry.

        :type selector_pb: model_pb2.BackendSelector
        :raises: context.CtxTimeoutCancelled, context.CtxTimeoutExceeded
        """
        for snapshot_pb in selector_pb.nanny_snapshots:
            assert not snapshot_pb.snapshot_id

            snapshot_id = self._nanny_client.get_target_runtime_attrs_id(snapshot_pb.service_id)
            if snapshot_id is None:
                snapshot_id = self._nanny_client.get_current_runtime_attrs_id(snapshot_pb.service_id)
            snapshot_pb.snapshot_id = snapshot_id

    def _resolve(self, ctx, selector_pb, prev_yp_sd_timestamps, use_sd=False):
        """
        Resolve a backend selector into a list of instances, under a forced 5 minute timeout.

        NANNY_SNAPSHOTS and GENCFG_GROUPS selectors are resolved by the matching generator
        functions. YP_ENDPOINT_SETS and BALANCERS selectors are resolved as YP endpoint sets
        (for BALANCERS, they are derived from the balancers found in the cache), either with
        or without service discovery depending on `use_sd`.

        :type ctx: context.OpCtx
        :type selector_pb: awacs.proto.model_pb2.BackendSelector
        :type prev_yp_sd_timestamps: dict[tuple[six.text_type, six.text_type], int]
        :type use_sd: bool
        :rtype: Resolution
        :raises: context.CtxTimeoutCancelled, context.CtxTimeoutExceeded
        :raises RuntimeError: if the selector type is not supported
        :raises resolver.ResolvingError: if the resolved instances list is empty
        """
        selector_types = model_pb2.BackendSelector
        # stays empty unless service discovery is used
        resolved_yp_sd_timestamps = {}
        with ctx.with_forced_timeout(60 * 5):
            selector_type = selector_pb.type
            if selector_type == selector_types.NANNY_SNAPSHOTS:
                instance_pbs = generator.resolve_nanny_snapshot_pbs(
                    selector_pb.nanny_snapshots,
                    default_port=selector_pb.port,
                    default_use_mtn=selector_pb.use_mtn)
            elif selector_type == selector_types.GENCFG_GROUPS:
                instance_pbs = generator.resolve_gencfg_group_pbs(
                    selector_pb.gencfg_groups,
                    default_port=selector_pb.port,
                    default_use_mtn=selector_pb.use_mtn)
            elif selector_type in (selector_types.YP_ENDPOINT_SETS, selector_types.BALANCERS):
                if selector_type == selector_types.BALANCERS:
                    # every balancer is represented by the system endpoint set of its Nanny service
                    yp_endpoint_sets = []
                    for selector_balancer_pb in selector_pb.balancers:
                        balancer_pb = self._cache.must_get_balancer(self._namespace_id, selector_balancer_pb.id)
                        service_id = balancer_pb.spec.config_transport.nanny_static_file.service_id
                        yp_endpoint_sets.append(selector_types.YpEndpointSet(
                            endpoint_set_id=make_system_endpoint_set_id(service_id),
                            cluster=balancer_pb.meta.location.yp_cluster.lower()))
                else:
                    yp_endpoint_sets = selector_pb.yp_endpoint_sets

                # rejection of empty endpoint sets is enabled globally and can be waived per selector
                reject_empty = (appconfig.get_value('run.reject_empty_yp_endpoint_sets', default=False) and
                                not selector_pb.allow_empty_yp_endpoint_sets)
                try:
                    if use_sd:
                        treat_not_exists_as_empty = appconfig.get_value(
                            'run.enable_sd.treat_not_exists_as_empty', default=False)
                        self._yp_endpoint_set_sd_resolve_counter.inc()
                        sd_resolution = generator.resolve_yp_endpoint_set_pbs_using_sd(
                            yp_endpoint_sets,
                            prev_yp_sd_timestamps=prev_yp_sd_timestamps,
                            default_port=selector_pb.port,
                            treat_not_exists_as_empty=treat_not_exists_as_empty,
                            reject_empty=reject_empty)
                        instance_pbs = sd_resolution.instance_pbs
                        resolved_yp_sd_timestamps = sd_resolution.yp_sd_timestamps
                    else:
                        self._yp_endpoint_set_master_resolve_counter.inc()
                        instance_pbs = generator.resolve_yp_endpoint_set_pbs(
                            yp_endpoint_sets,
                            default_port=selector_pb.port,
                            reject_empty=reject_empty)
                # the handlers below only count the error kind and let the exception through
                except resolver.YpSdObsoleteResponseError:
                    self._yp_sd_obsolete_response_error_counter.inc()
                    raise
                except resolver.YpEndpointSetDoesNotExistError:
                    self._yp_endpoint_set_does_not_exist_error_counter.inc()
                    raise
                except resolver.YpEndpointSetIsEmptyError:
                    self._yp_endpoint_set_is_empty_error_counter.inc()
                    raise
            else:
                raise RuntimeError('{}: unsupported backend selector type: {}'.format(ctx.id(), selector_pb.type))

            if not instance_pbs:
                self._instances_list_is_empty_error_counter.inc()
                raise resolver.ResolvingError('{}: got an empty instances list'.format(ctx.id()))
            return Resolution(instance_pbs=instance_pbs, yp_sd_timestamps=resolved_yp_sd_timestamps)

    @staticmethod
    def _are_instance_lists_equal(ctx, xs, ys):
        """
        Compare two instance lists position by position using generator.instances_cmp().

        Lists of different length are never equal. The first differing pair is logged
        at debug level.

        :type ctx: context.OpCtx
        :rtype: bool
        """
        if len(xs) != len(ys):
            return False
        for left, right in zip(xs, ys):
            if generator.instances_cmp(left, right) != 0:
                ctx.log.debug('differing instances: %s, %s', left, right)
                return False
        return True

    def _set_attempt(self, ctx, current_resolver_status_pb, updated_last_attempt_pb):
        """
        Store a new resolving attempt in the backend's resolver status, unless it can be skipped.

        The update is skipped when nothing meaningful changed (same `succeeded` condition,
        revision id and YP SD timestamps), and is throttled when the backend keeps failing
        on the same revision.

        :type ctx: context.OpCtx
        :type current_resolver_status_pb: model_pb2.BackendResolverStatus
        :type updated_last_attempt_pb: model_pb2.BackendResolverAttempt
        """
        prev_attempt_pb = current_resolver_status_pb.last_attempt
        same_revision = prev_attempt_pb.revision_id == updated_last_attempt_pb.revision_id

        if (same_revision and
                prev_attempt_pb.succeeded == updated_last_attempt_pb.succeeded and
                prev_attempt_pb.yp_sd_timestamps == updated_last_attempt_pb.yp_sd_timestamps):
            # There is nothing to update
            return

        has_successful_attempt = current_resolver_status_pb.HasField('last_successful_attempt')

        keeps_failing = (same_revision and
                         prev_attempt_pb.succeeded.status == 'False' and
                         updated_last_attempt_pb.succeeded.status == 'False')
        if keeps_failing:
            # If backend resolve has failed, remains failed and the only things to update are
            # message and started/finished times in resolver status, we don't want to do that too often.
            # It generates a stream of Zookeeper watches and can affect balancer ctls, which
            # postpone their validation after *every* backend update.
            now = datetime.utcnow()

            if has_successful_attempt:
                since_success = now - current_resolver_status_pb.last_successful_attempt.finished_at.ToDatetime()
                # (time since the last success is below N minutes, minimal update interval in minutes):
                # the longer the backend stays broken, the rarer its status is rewritten
                for age_limit_minutes, interval_minutes in ((10, 0), (60, 10), (60 * 24, 30)):
                    if since_success < timedelta(minutes=age_limit_minutes):
                        min_update_interval = timedelta(minutes=interval_minutes)
                        break
                else:
                    min_update_interval = timedelta(minutes=60)
            else:
                # Backend has never been successfully resolved, let's use 10 minutes for now
                min_update_interval = timedelta(minutes=10)

            if now - prev_attempt_pb.finished_at.ToDatetime() < min_update_interval:
                return

        new_status_pb = util.clone_pb(current_resolver_status_pb)
        new_status_pb.last_attempt.CopyFrom(updated_last_attempt_pb)
        if updated_last_attempt_pb.succeeded.status == 'True':
            # a successful attempt also becomes the last successful one
            new_status_pb.last_successful_attempt.CopyFrom(updated_last_attempt_pb)
        elif has_successful_attempt:
            new_status_pb.last_successful_attempt.CopyFrom(current_resolver_status_pb.last_successful_attempt)

        stored_backend_pb = self._dao.update_backend(
            namespace_id=self._namespace_id,
            backend_id=self._backend_id,
            updated_resolver_status_pb=new_status_pb)
        ctx.log.debug('updated resolver status, new meta.generation: %s', stored_backend_pb.meta.generation)

    def _reset_processing_timers(self):
        """
        Schedule the next processing pass PROCESS_INTERVAL seconds from now,
        shifted by a random whole number of seconds within +/- PROCESS_INTERVAL_JITTER.
        """
        jitter = random.randint(-self.PROCESS_INTERVAL_JITTER, self.PROCESS_INTERVAL_JITTER)
        self._processing_deadline = monotonic.monotonic() + self.PROCESS_INTERVAL + jitter

    def _process(self, ctx):
        """
        Run a single processing pass for the backend.

        The backend is read from the cache. Unless there is nothing to do (MANUAL and
        YP_ENDPOINT_SETS_SD selectors, deleted backends), its selector is resolved into
        instances and the endpoint set with the same id is created or updated.
        The outcome is written to the backend's resolver status through _set_attempt().

        :type ctx: context.OpCtx
        :rtype: bool
        :returns: False if ctx was cancelled, or if reading Nanny snapshot ids or resolving
                  instances failed; True otherwise
        """
        backend_pb = self._cache.must_get_backend(namespace_id=self._namespace_id,
                                                  backend_id=self._backend_id)
        ctx.log.debug('processing version %s, generation %s',
                      backend_pb.meta.version.split('-', 1)[0], backend_pb.meta.generation)

        selector_pb = backend_pb.spec.selector
        if selector_pb.type == selector_pb.MANUAL:
            ctx.log.debug('backend is MANUAL, nothing to process')
            self._reset_processing_timers()
            return True

        if selector_pb.type == selector_pb.YP_ENDPOINT_SETS_SD:
            ctx.log.debug('backend is YP_ENDPOINT_SETS_SD, nothing to process')
            self._reset_processing_timers()
            return True

        attempt_pb = model_pb2.BackendResolverAttempt()
        attempt_pb.revision_id = backend_pb.meta.version
        attempt_pb.started_at.GetCurrentTime()

        if backend_pb.spec.deleted:
            self._reset_processing_timers()
            msg = 'Backend is deleted, nothing to process'
            self._finish_attempt(attempt_pb, 'True', msg)
            ctx.log.debug(msg)
            self._set_attempt(ctx, backend_pb.resolver_status, attempt_pb)
            return True

        if selector_pb.type == model_pb2.BackendSelector.NANNY_SNAPSHOTS:
            # snapshot ids are filled in a copy, the cached backend is left untouched
            filled_selector_pb = util.clone_pb(selector_pb)
            try:
                self._fill_snapshot_ids(filled_selector_pb)
            except (errors.NannyApiError, nannyclient.NannyApiRequestException, context.CtxTimeoutExceeded) as e:
                self._reset_processing_timers()
                msg = 'Failed to read current Nanny snapshot identifiers: {}'.format(exception_to_status_message(e))
                self._finish_attempt(attempt_pb, 'False', msg)
                ctx.log.warn('failed to read current Nanny snapshot identifiers: %s', e)
                self._set_attempt(ctx, backend_pb.resolver_status, attempt_pb)
                return False
            except context.CtxTimeoutCancelled:
                ctx.log.debug('ctx is cancelled: %s, returning...', ctx.error())
                return False
        else:
            filled_selector_pb = selector_pb

        is_yp_based = selector_pb.type in (model_pb2.BackendSelector.YP_ENDPOINT_SETS,
                                           model_pb2.BackendSelector.BALANCERS)
        if is_yp_based:
            prev_yp_sd_timestamps = load_yp_sd_timestamps(
                backend_pb.resolver_status.last_successful_attempt.yp_sd_timestamps)
            use_sd = self._should_use_sd(selector_pb)
        else:
            prev_yp_sd_timestamps = {}
            use_sd = False

        endpoint_set_pb = self._cache.get_endpoint_set(namespace_id=self._namespace_id,
                                                       endpoint_set_id=self._backend_id)

        if endpoint_set_pb:
            current_backend_version = backend_pb.meta.version
            current_endpoint_set_version = endpoint_set_pb.meta.version

            is_already_resolved = current_backend_version in endpoint_set_pb.meta.backend_versions
            selector_changed = endpoint_set_pb.meta.resolved_from != filled_selector_pb

            # YP-based selectors are re-resolved on every pass, the others only when
            # the backend version or the (filled) selector is new
            if not is_already_resolved or selector_changed or is_yp_based:
                ctx.log.debug('resolving instances from %s, use_sd: %s', filled_selector_pb, use_sd)
                resolution = self._resolve_or_record_failure(
                    ctx, backend_pb, attempt_pb, filled_selector_pb,
                    prev_yp_sd_timestamps=prev_yp_sd_timestamps,
                    use_sd=use_sd)
                if resolution is None:
                    return False

                instances_not_changed = self._are_instance_lists_equal(ctx, endpoint_set_pb.spec.instances,
                                                                       resolution.instance_pbs)

                if instances_not_changed:
                    ctx.log.debug('instances not changed')
                    is_global_changed = endpoint_set_pb.spec.is_global.value != backend_pb.spec.is_global.value
                    if (not is_already_resolved) or selector_changed:
                        added = self._dao.update_resolved_from_and_add_backend_version_to_endpoint_set_rev(
                            namespace_id=self._namespace_id,
                            endpoint_set_id=self._backend_id,
                            backend_version_to_add=current_backend_version,
                            endpoint_set_version=current_endpoint_set_version,
                            resolved_from_pb=filled_selector_pb)
                        if added:
                            ctx.log.debug('added backend version to endpoint set rev')
                        else:
                            ctx.log.debug('did not add backend version to endpoint set rev')
                    if is_global_changed:
                        self._update_endpoint_set(backend_meta_pb=backend_pb.meta,
                                                  backend_selector_pb=filled_selector_pb,
                                                  endpoint_set_pb=endpoint_set_pb,
                                                  instance_pbs=None,
                                                  is_global_pb=backend_pb.spec.is_global)
                        ctx.log.debug('updated is_global for endpoint set')
                else:
                    ctx.log.debug('instances changed, creating new endpoint set revision...')
                    self._update_endpoint_set(backend_meta_pb=backend_pb.meta,
                                              backend_selector_pb=filled_selector_pb,
                                              endpoint_set_pb=endpoint_set_pb,
                                              instance_pbs=resolution.instance_pbs,
                                              is_global_pb=backend_pb.spec.is_global)
                    ctx.log.debug('created new endpoint set revision...')

                self._finish_attempt(attempt_pb, 'True', '')
                # the previous timestamps are kept as long as the instances stay the same
                attempt_pb.yp_sd_timestamps.update(dump_yp_sd_timestamps(
                    prev_yp_sd_timestamps if instances_not_changed else resolution.yp_sd_timestamps))
                self._set_attempt(ctx, backend_pb.resolver_status, attempt_pb)
            else:
                if ctx.done():
                    ctx.log.debug('ctx is cancelled: %s, returning...', ctx.error())
                    return False

                # is already resolved and selector has not changed
                if backend_pb.resolver_status.last_attempt.succeeded.status != 'True':
                    self._finish_attempt(attempt_pb, 'True', '')
                    attempt_pb.yp_sd_timestamps.update(dump_yp_sd_timestamps(prev_yp_sd_timestamps))
                    self._set_attempt(ctx, backend_pb.resolver_status, attempt_pb)

        else:
            ctx.log.debug('endpoint set does not exist, use_sd: %s, creating...', use_sd)
            resolution = self._resolve_or_record_failure(
                ctx, backend_pb, attempt_pb, filled_selector_pb,
                prev_yp_sd_timestamps=prev_yp_sd_timestamps,
                use_sd=use_sd)
            if resolution is None:
                return False

            self._create_endpoint_set(backend_meta_pb=backend_pb.meta,
                                      backend_selector_pb=filled_selector_pb,
                                      instance_pbs=resolution.instance_pbs,
                                      is_global_pb=backend_pb.spec.is_global)

            self._finish_attempt(attempt_pb, 'True', '')
            attempt_pb.yp_sd_timestamps.update(dump_yp_sd_timestamps(resolution.yp_sd_timestamps))
            self._set_attempt(ctx, backend_pb.resolver_status, attempt_pb)

        self._reset_processing_timers()
        ctx.log.debug('processed')
        return True

    @staticmethod
    def _finish_attempt(attempt_pb, status, message):
        """
        Stamp the attempt with the current time as finished_at and set its `succeeded` condition.

        :type attempt_pb: model_pb2.BackendResolverAttempt
        :param status: 'True' or 'False'
        :param message: text stored in succeeded.message
        """
        attempt_pb.finished_at.GetCurrentTime()
        attempt_pb.succeeded.status = status
        attempt_pb.succeeded.message = message

    def _should_use_sd(self, selector_pb):
        """
        Decide whether a YP_ENDPOINT_SETS or BALANCERS selector is resolved via service discovery.

        Service discovery is used if the namespace is listed in `run.enable_sd.namespace_ids`, or
        if at least one of the selector's clusters is listed in `run.enable_sd.clusters` and
        crc32 of "<namespace id>/<backend id>" modulo 100 is below `run.enable_sd.percent`.

        :type selector_pb: model_pb2.BackendSelector
        :rtype: bool
        :raises AssertionError: if the selector is of any other type
        """
        if selector_pb.type == model_pb2.BackendSelector.YP_ENDPOINT_SETS:
            clusters = {es_pb.cluster for es_pb in selector_pb.yp_endpoint_sets}
        elif selector_pb.type == model_pb2.BackendSelector.BALANCERS:
            clusters = set()
            for b_pb in selector_pb.balancers:
                balancer_pb = self._cache.must_get_balancer(self._namespace_id, b_pb.id)
                clusters.add(balancer_pb.meta.location.yp_cluster.lower())
        else:
            raise AssertionError()

        matched_by_namespace_id = self._namespace_id in appconfig.get_value(u'run.enable_sd.namespace_ids', [])
        matched_by_cluster = bool(clusters & set(appconfig.get_value(u'run.enable_sd.clusters', [])))
        # the checksum of the full id gives every backend a fixed bucket in [0, 100)
        full_id_crc = util.crc32(u'{}/{}'.format(self._namespace_id, self._backend_id).encode(u'utf-8'))
        matched_by_percent = (full_id_crc % 100 < appconfig.get_value(u'run.enable_sd.percent', 0))
        return matched_by_namespace_id or (matched_by_cluster and matched_by_percent)

    def _resolve_or_record_failure(self, ctx, backend_pb, attempt_pb, filled_selector_pb,
                                   prev_yp_sd_timestamps, use_sd):
        """
        Call _resolve() and handle its failures on behalf of _process().

        * ctx cancellation is only logged;
        * an obsolete YP SD response is logged and the processing timers are reset, but the
          attempt is not recorded;
        * any other error resets the processing timers and is recorded as a failed attempt.

        :type ctx: context.OpCtx
        :type backend_pb: model_pb2.Backend
        :type attempt_pb: model_pb2.BackendResolverAttempt
        :type filled_selector_pb: model_pb2.BackendSelector
        :type prev_yp_sd_timestamps: dict[tuple[six.text_type, six.text_type], int]
        :type use_sd: bool
        :rtype: Resolution | None
        :returns: the resolution, or None if resolving did not succeed
        """
        try:
            return self._resolve(ctx, filled_selector_pb,
                                 prev_yp_sd_timestamps=prev_yp_sd_timestamps,
                                 use_sd=use_sd)
        except context.CtxTimeoutCancelled:
            ctx.log.debug('ctx is cancelled: %s, returning...', ctx.error())
        except resolver.YpSdObsoleteResponseError as e:
            self._reset_processing_timers()
            ctx.log.debug('failed to resolve instances: %s', e)
        except (Exception, context.CtxTimeoutExceeded) as e:
            self._reset_processing_timers()
            msg = 'Failed to resolve instances: {}'.format(exception_to_status_message(e))
            self._finish_attempt(attempt_pb, 'False', msg)
            ctx.log.debug('failed to resolve instances: %s', e)
            self._set_attempt(ctx, backend_pb.resolver_status, attempt_pb)
        return None

    def _process_and_update_counters(self, ctx):
        """
        Run _process() and count its outcome.

        The success counter is incremented when _process() returns a truthy value; the error
        counter is incremented when it returns a falsy value or raises one of
        ctlmanager.UNEXPECTED_EXCEPTIONS (which is then re-raised).

        :type ctx: context.OpCtx
        """
        try:
            succeeded = self._process(ctx)
        except ctlmanager.UNEXPECTED_EXCEPTIONS:
            self._resolve_error_counter.inc()
            raise
        outcome_counter = self._resolve_success_counter if succeeded else self._resolve_error_counter
        outcome_counter.inc()

    def _maybe_process(self, ctx):
        """
        Run a processing pass if the processing deadline has been reached, otherwise do nothing.

        :type ctx: context.OpCtx
        """
        if monotonic.monotonic() < self._processing_deadline:
            return
        self._process_and_update_counters(ctx)

    def _process_backend_update(self, ctx, backend_pb):
        """
        Process the backend if its version is newer than the last one seen by this method.

        The seen version is remembered only after the processing pass returns; older and
        already seen versions are just logged.

        :type ctx: context.OpCtx
        :type backend_pb: model_pb2.Backend
        """
        version = BackendVersion.from_pb(backend_pb)
        seen_version = self._seen_backend_version

        if seen_version is None or version > seen_version:
            self._process_and_update_counters(ctx)
            self._seen_backend_version = version
            return

        if version != seen_version:
            ctx.log.warn(u'skipping processing, version (%s) < seen version (%s)',
                         version.short_ver(), seen_version.short_ver())
            return

        ctx.log.info(u'already processed this backend version, nothing to do')

    @staticmethod
    def _get_keys_w_statuses(m):
        """
        Collect the keys of a mapping whose values have a non-empty `statuses` attribute.

        :param m: mapping of keys to objects exposing `statuses`
        :rtype: set
        """
        return {key for key, value_pb in six.iteritems(m) if value_pb.statuses}

    @staticmethod
    def _get_keys_w_l3_statuses(m):
        """
        Collect the keys of a mapping whose values have a non-empty `l3_statuses` attribute.

        :param m: mapping of keys to objects exposing `l3_statuses`
        :rtype: set
        """
        return {key for key, value_pb in six.iteritems(m) if value_pb.l3_statuses}

    def _is_backend_used_in_l3_balancers(self, ctx):
        """
        Check whether any L3 balancer state of this namespace has L3 statuses for this backend.

        L3 balancer states are synced from Zookeeper first; the whole check runs under
        a forced 5 minute timeout.

        :type ctx: context.OpCtx
        :rtype: bool | None
        :returns: True or False, or None if the check timed out or ctx was cancelled
        """
        ctx.log.info('started _is_backend_used_in_l3_balancers()')
        wanted_full_id = (self._namespace_id, self._backend_id)
        is_used = False
        try:
            with ctx.with_forced_timeout(60 * 5):
                self._zk.sync_l3_balancer_states(self._namespace_id)
                for l3_balancer_pb in self._cache.list_all_l3_balancers(self._namespace_id):
                    state_pb = self._zk.must_get_l3_balancer_state(self._namespace_id,
                                                                   l3_balancer_pb.meta.id)
                    used_backend_ids = self._get_keys_w_l3_statuses(state_pb.backends)
                    is_used = any(to_full_id(self._namespace_id, used_backend_id) == wanted_full_id
                                  for used_backend_id in used_backend_ids)
                    if is_used:
                        break
        except context.CtxTimeoutExceeded:
            ctx.log.warn('_is_backend_used_in_l3_balancers() timed out, returning...')
            return None
        except context.CtxTimeoutCancelled:
            ctx.log.debug('_is_backend_used_in_l3_balancers(): ctx is cancelled: %s, returning...', ctx.error())
            return None
        ctx.log.info('finished _is_backend_used_in_l3_balancers(), result is %s', is_used)
        return is_used

    def _is_backend_used_in_dns_balancers(self, ctx):
        """
        Check whether any DNS record state of this namespace has statuses for this backend.

        DNS record states are synced from Zookeeper first; the whole check runs under
        a forced 5 minute timeout.

        :type ctx: context.OpCtx
        :rtype: bool | None
        :returns: True or False, or None if the check timed out or ctx was cancelled
        """
        ctx.log.info('started _is_backend_used_in_dns_balancers()')
        wanted_full_id = (self._namespace_id, self._backend_id)
        is_used = False
        try:
            with ctx.with_forced_timeout(60 * 5):
                self._zk.sync_dns_record_states(self._namespace_id)
                for dns_record_pb in self._cache.list_all_dns_records(namespace_id=self._namespace_id):
                    state_pb = self._zk.must_get_dns_record_state(self._namespace_id, dns_record_pb.meta.id)
                    used_backend_ids = self._get_keys_w_statuses(state_pb.backends)
                    is_used = any(to_full_id(self._namespace_id, used_backend_id) == wanted_full_id
                                  for used_backend_id in used_backend_ids)
                    if is_used:
                        break
        except context.CtxTimeoutExceeded:
            ctx.log.warn('_is_backend_used_in_dns_balancers() timed out, returning...')
            return None
        except context.CtxTimeoutCancelled:
            ctx.log.debug('_is_backend_used_in_dns_balancers(): ctx is cancelled: %s, returning...', ctx.error())
            return None
        ctx.log.info('finished _is_backend_used_in_dns_balancers(), result is %s', is_used)
        return is_used

    def _is_backend_used_in_l7_balancers(self, ctx):
        """
        Check whether any balancer state of this backend's own namespace lists the backend.

        Calls `self._zk.sync_balancer_states` for the namespace first, then walks the cached
        balancers of the namespace and inspects the state of each one. The whole scan runs
        under `ctx.with_forced_timeout(60 * 5)` (presumably seconds -- confirm against OpCtx).

        :type ctx: context.OpCtx
        :rtype: bool | None
        :returns: True/False when the scan completes; None if it timed out or ctx was cancelled
        """
        ctx.log.info('started _is_backend_used_in_l7_balancers()')
        namespace_id = self._namespace_id
        target_full_id = (namespace_id, self._backend_id)
        is_used = False
        try:
            with ctx.with_forced_timeout(60 * 5):
                self._zk.sync_balancer_states(namespace_id)
                for l7_balancer_pb in self._cache.list_all_balancers(namespace_id):
                    state_pb = self._zk.must_get_balancer_state(namespace_id, l7_balancer_pb.meta.id)
                    # any() stops at the first matching key, just like an explicit break would
                    is_used = any(to_full_id(namespace_id, key) == target_full_id
                                  for key in self._get_keys_w_statuses(state_pb.backends))
                    if is_used:
                        break
        except context.CtxTimeoutExceeded:
            ctx.log.warn('_is_backend_used_in_l7_balancers() timed out, returning...')
            return None
        except context.CtxTimeoutCancelled:
            ctx.log.debug('_is_backend_used_in_l7_balancers(): ctx is cancelled: %s, returning...', ctx.error())
            return None
        ctx.log.info('finished _is_backend_used_in_l7_balancers(), result is %s', is_used)
        return is_used

    def _is_backend_used_in_all_l7_balancers(self, ctx):
        """
        Check whether any balancer state in ANY namespace lists this backend.

        For every cached namespace, calls `self._zk.sync_balancer_states` and then inspects the
        state of each cached balancer of that namespace. The scan stops at the first match:
        no further balancers or namespaces are read once the backend has been found.
        The whole scan runs under `ctx.with_forced_timeout(60 * 40)` (presumably seconds --
        confirm against OpCtx).

        :type ctx: context.OpCtx
        :rtype: bool | None
        :returns: True/False when the scan completes; None if it timed out or ctx was cancelled
        """
        ctx.log.info('started _is_backend_used_in_all_l7_balancers()')
        full_backend_id = (self._namespace_id, self._backend_id)
        rv = False
        try:
            with ctx.with_forced_timeout(60 * 40):
                for namespace_pb in gutils.gevent_idle_iter(self._cache.list_all_namespaces()):
                    namespace_id = namespace_pb.meta.id
                    self._zk.sync_balancer_states(namespace_id)
                    for balancer_pb in self._cache.list_all_balancers(namespace_id):
                        balancer_state_pb = self._zk.must_get_balancer_state(namespace_id, balancer_pb.meta.id)
                        rv = any(to_full_id(namespace_id, backend_id) == full_backend_id
                                 for backend_id in self._get_keys_w_statuses(balancer_state_pb.backends))
                        if rv:
                            # leave the balancer loop too, otherwise the remaining balancers of
                            # this namespace would still be fetched although the answer is known
                            break
                    if rv:
                        break
        except context.CtxTimeoutExceeded:
            ctx.log.warn('_is_backend_used_in_all_l7_balancers() timed out, returning...')
            return None
        except context.CtxTimeoutCancelled:
            ctx.log.debug('_is_backend_used_in_all_l7_balancers(): ctx is cancelled: %s, returning...', ctx.error())
            return None
        ctx.log.info('finished _is_backend_used_in_all_l7_balancers(), result is %s', rv)
        return rv

    def _self_delete(self, ctx, backend_pb):
        """
        Run the full usage checks and delete the backend only if every one of them
        positively reports that the backend is not used.

        The checks run in order: L3 balancers, DNS records, L7 balancers. The L7 check covers
        all namespaces when `backend_pb.spec.is_global.value` is set, and only the backend's
        own namespace otherwise. The first check that fails stops the sequence.

        The `_is_backend_used_in_*` helpers return None when they time out or ctx is cancelled.
        Such an inconclusive answer must not be mistaken for "not used", so it aborts the
        deletion instead of letting it through.

        :type ctx: context.OpCtx
        :param backend_pb: backend message; only `spec.is_global.value` is read here
        :raises RuntimeError: if the backend is still referenced, or if a usage check
                              did not run to completion
        """
        ctx.log.info(u'starting full self deletion checks')
        if backend_pb.spec.is_global.value:
            is_used_in_l7 = self._is_backend_used_in_all_l7_balancers
        else:
            is_used_in_l7 = self._is_backend_used_in_l7_balancers
        checks = (
            ('L3 balancers', self._is_backend_used_in_l3_balancers),
            ('DNS records', self._is_backend_used_in_dns_balancers),
            ('L7 balancers', is_used_in_l7),
        )
        for used_in, is_used in checks:
            result = is_used(ctx)
            if result is None:
                # timed out or cancelled: we do not know whether the backend is referenced
                raise RuntimeError(u"Refusing to delete backend: usage check for {} did not complete "
                                   u"(timed out or cancelled)".format(used_in))
            if result:
                raise RuntimeError(u"Critical error: would delete a referenced backend if it wasn't for this raise. "
                                   u"Backend is used in some {}".format(used_in))
        self._dao.delete_backend(self._namespace_id, self._backend_id)

    def _maybe_self_delete(self, ctx):
        """
        Delete this backend if it is marked as deleted and the cache shows no references to it.

        Returns without doing anything when:
          * the cached backend is missing or its spec is not marked deleted;
          * `self._self_deletion_check_deadline` has not been reached yet;
          * less than SELF_DELETION_COOLDOWN_PERIOD has elapsed since the backend's mtime.

        If the cache still lists balancers, L3 balancers or DNS records for the backend, the
        next check is postponed by SELF_DELETION_CHECK_INTERVAL. Otherwise `_self_delete`
        is called.

        :type ctx: context.OpCtx
        """
        pb = self._cache.get_backend(self._namespace_id, self._backend_id)
        if not (pb and pb.spec.deleted):
            return

        if monotonic.monotonic() < self._self_deletion_check_deadline:
            return

        seconds_since_mtime = time.time() - pb.meta.mtime.ToSeconds()
        if seconds_since_mtime < self.SELF_DELETION_COOLDOWN_PERIOD:
            return

        ctx.log.info(u'starting cached self deletion checks')
        namespace_id, backend_id = self._namespace_id, self._backend_id
        # order matters only for the log message below: L7 balancers, L3 balancers, DNS records
        referencing_ids = (
            self._cache.list_full_balancer_ids_for_backend(namespace_id, backend_id),
            self._cache.list_full_l3_balancer_ids_for_backend(namespace_id, backend_id),
            self._cache.list_full_dns_record_ids_for_backend(namespace_id, backend_id),
        )
        if any(referencing_ids):
            ctx.log.info(u'decided to not self-delete, used in %s',
                         join_full_uids(itertools.chain(*referencing_ids)))
            self._self_deletion_check_deadline = monotonic.monotonic() + self.SELF_DELETION_CHECK_INTERVAL
            return

        self._self_delete(ctx, pb)
        ctx.log.info(u'successfully self deleted')

    def _process_event(self, ctx, event):
        """
        Dispatch a single event and then run the self-deletion check.

        `events.BackendUpdate` is handed to `_process_backend_update` together with its `pb`;
        `events.EndpointSetUpdate` triggers `_maybe_process`. Any other event type is rejected.

        :type ctx: context.OpCtx
        :param event: the event to handle
        :raises RuntimeError: if the event is of an unsupported type
        """
        is_backend_update = isinstance(event, events.BackendUpdate)
        if not is_backend_update and not isinstance(event, events.EndpointSetUpdate):
            raise RuntimeError('{}: Unsupported event: {}'.format(ctx.id(), type(event)))

        if is_backend_update:
            self._process_backend_update(ctx, event.pb)
        else:
            self._maybe_process(ctx)
        self._maybe_self_delete(ctx)

    def _process_empty_queue(self, ctx):
        """
        Run the regular processing step followed by the self-deletion check.

        Called without an event (presumably when the event queue is empty -- the caller
        is outside this part of the file).

        :type ctx: context.OpCtx
        """
        for step in (self._maybe_process, self._maybe_self_delete):
            step(ctx)
