from __future__ import unicode_literals

import gevent.monkey

gevent.monkey.patch_all(subprocess=False)

from grpc.experimental.gevent import init_gevent

init_gevent()

import os
import random
import logging
import logging.config
import datetime
from urlparse import urlparse
import gevent
import gevent.event
import gevent.pool
import raven.conf
import raven.handlers.logging
import nanny_rpc_client
from nanny_repo import repo_api_stub
from collections import namedtuple
from infra.swatlib.gevent.geventutil import force_kill_greenlet
from infra.swatlib.zk import client
from infra.watchdog.src.lib import missed_completed_taskgroups, eviction_requested_pods
from infra.watchdog.src.lib.alemate_client import AlemateClient
from infra.watchdog.src.lib.metrics import Metrics
from infra.watchdog.src.lib.push_client import PushClient
from infra.watchdog.src.lib.snapshots import check_awaited_snapshot_states, collect_snapshots_info
from infra.watchdog.src.lib.zk_client import ZkClient
from infra.watchdog.src.lib.yp_client import YpClient
from infra.watchdog.src.lib.yp_pod_set import PodSet
from infra.nanny.yp_lite_api.py_stubs import admin_api_stub
from sepelib.core import config

# Pause in seconds between iterations of the snapshot, alemate and
# eviction requested pods metrics collectors.
ITERATION_PERIOD = 10

# Pause in seconds between iterations of the missed completed taskgroups collector.
MISSED_COMPLETED_TASKGROUPS_ITERATION_PERIOD = 21600  # 6 hours


class OrphanedYpPodSetsCollectorSettings(object):
    """
    Parsed settings of the orphaned YP pod sets collector.

    Built from a plain mapping. Time periods are given in that mapping as
    ``datetime.timedelta`` keyword arguments (e.g. ``{'hours': 2}``).
    """

    class DomainSettings(namedtuple('DomainSettings', ['process', 'last_modification_timedelta'])):
        """
        Settings of a single deploy engine url domain: the ``process`` flag and
        the ``last_modification_timedelta`` period.
        """

    def __init__(self, data):
        """
        :param data: mapping with the collector settings; ``metrics_ttl`` is
            optional and defaults to the iteration period in seconds
        :type data: dict
        """
        self.sleep_before_first_run = data['sleep_before_first_run']
        period = datetime.timedelta(**data['iteration_period'])
        # The period is kept as a whole number of seconds.
        self.iteration_period = int(period.total_seconds())
        self.max_marked_pod_sets_per_iteration = data['max_marked_pod_sets_per_iteration']
        self.mark_label_name = data['mark_label_name']
        self.metrics_ttl = data.get('metrics_ttl', self.iteration_period)

        # Explicitly listed domains, keyed by domain name.
        known_domains = {}
        for domain_data in data['match_deploy_engine_url_domains']:
            known_domains[domain_data['name']] = self._build_domain_settings(domain_data)
        self._domains_settings = known_domains

        # Fallback used for every domain that is not listed above.
        fallback_data = data['unmatched_deploy_engine_url_domains']
        self._unmatched_domains_settings = self._build_domain_settings(fallback_data)

    def _build_domain_settings(self, data):
        """
        Build ``DomainSettings`` from one domain section of the settings.

        A missing ``last_modification_timedelta`` yields a zero timedelta.

        :type data: dict
        :rtype: OrphanedYpPodSetsCollectorSettings.DomainSettings
        """
        timedelta_kwargs = data.get('last_modification_timedelta', {})
        return self.DomainSettings(
            process=data['process'],
            last_modification_timedelta=datetime.timedelta(**timedelta_kwargs),
        )

    def get_domain_settings(self, domain):
        """
        Return settings of the given domain, or the settings for unmatched
        domains when the domain is not listed explicitly.

        :type domain: six.text_type
        :rtype: OrphanedYpPodSetsCollectorSettings.DomainSettings
        """
        try:
            return self._domains_settings[domain]
        except KeyError:
            return self._unmatched_domains_settings


class Application(object):
    """
    God object, managing service lifetime.
    """

    main_log = logging.getLogger('watchdog')

    name = 'watchdog'

    @staticmethod
    def setup_logging(sentry_dsn=None):
        if sentry_dsn:
            c = raven.Client(dsn=sentry_dsn)
            c.transport_options['maximum_outstanding_requests'] = 50
            handler = raven.handlers.logging.SentryHandler(client, level=logging.WARN)
            raven.conf.setup_logging(handler)

    def __init__(self, instance_id):
        """
        Set up logging and create every client used by the metrics collectors.

        :param instance_id: identifier of this watchdog instance; it is stored
            on the application and passed to the ZooKeeper client as identifier
        """
        # Logging goes first so that later initialization can already report to Sentry.
        self.setup_logging(sentry_dsn=config.get_value('sentry.dsn', default=None))

        # Init coordination service client
        # Several instances can run on one host, use port for distinction.
        self.instance_id = instance_id
        coord_cfg = {
            'hosts': config.get_value('coord.hosts'),
            'zk_root': config.get_value('coord.root'),
            'read_only': False,
            'log_debug': config.get_value('coord.log_debug'),
        }
        self.zk_client = ZkClient(client.ZookeeperClient(cfg=coord_cfg, identifier=instance_id))

        # RPC transports: one for the nanny repo API, one for the YP lite API.
        nanny_rpc = nanny_rpc_client.RetryingRpcClient(
            rpc_url=config.get_value('nanny.repo_url'),
            oauth_token=config.get_value('nanny.token'),
            request_timeout=config.get_value('nanny.rpc_timeout', default=30),
        )
        yp_lite_rpc = nanny_rpc_client.RetryingRpcClient(
            rpc_url=config.get_value('yp_lite_api.repo_url'),
            oauth_token=config.get_value('yp_lite_api.token'),
            request_timeout=config.get_value('yp_lite_api.rpc_timeout', default=30),
        )
        self.repo_client = repo_api_stub.RepoServiceStub(nanny_rpc)
        self.admin_api_client = admin_api_stub.YpLiteAdminServiceStub(yp_lite_rpc)

        self.alemate_client = AlemateClient(
            nanny_url=config.get_value('nanny.url'),
            token=config.get_value('nanny.token'),
        )
        # Host and tag values of the push client are taken from the process environment.
        self.push_client = PushClient(
            yasm_host=os.getenv('YP_NODE_FQDN'),
            yasm_port=config.get_value('metrics.yasm_port'),
            metrics_ttl=config.get_value('metrics.ttl'),
            ctype=os.getenv('a_ctype'),
            prj=os.getenv('a_prj'),
            geo=os.getenv('a_dc'),
        )

        self.yp_clients_by_clusters = YpClient.get_clients_by_clusters(config.get_value('yp'))
        orphaned_collector_data = config.get_value('orphaned_yp_pod_sets_collector')
        self.orphaned_collector_settings = OrphanedYpPodSetsCollectorSettings(orphaned_collector_data)

        # Greenlets of the running collectors and the flag that asks them to stop.
        self.metrics_runners = []
        self._stop_flag = gevent.event.Event()

    @staticmethod
    def setup_environment():
        """
        Adjust the requests/urllib3 connection pool for running under gevent.

        Replaces the queue class of the connection pool with the gevent one and
        raises the level of the connection pool logger to ERROR.
        """
        # Patch requests connection pool to use gevent queue
        from requests.packages.urllib3.connectionpool import ConnectionPool
        from gevent.queue import LifoQueue

        ConnectionPool.QueueCls = LifoQueue
        # Disable requests spamming about:
        # Connection pool is full, discarding connection
        # There is nothing we can do about it, so simply mute
        # NOTE(review): the logger name assumes urllib3 is vendored under
        # requests.packages - confirm it matches the installed requests version.
        logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.ERROR)

    def teardown_environment(self):
        """
        Release resources acquired for the service run: stops the ZooKeeper client.
        """
        coordinator = self.zk_client
        coordinator.stop()

    def run(self):
        """
        Start application.
        Blocks until stop was called.

        Starts the ZooKeeper client, spawns one greenlet per metrics collector
        and then waits for the stop flag. Any exception raised while waiting
        (KeyboardInterrupt included) also leads to ``stop()``.
        """

        self.main_log.info('starting service...')
        self.setup_environment()
        self.zk_client.start()

        collectors = (
            (self.run_alemate_metrics, 'Started alemate metrics collector'),
            (self.run_snapshot_metrics, 'Started snapshot metrics collector'),
            (self.run_orphaned_yp_pod_sets_metrics, 'Started orphaned yp pod sets metrics collector'),
            (self.run_missed_completed_taskgroups_metrics, 'Started missed completed taskgroups metrics collector'),
            (self.run_eviction_requested_pods_metrics, 'Started eviction requested pods metrics collector'),
        )
        for collector, started_message in collectors:
            runner = gevent.spawn(collector)
            # If the greenlet dies with an exception, run the collector once more.
            # `restart=collector` binds the current collector; a plain closure
            # would see only the last one of the loop.
            runner.link_exception(lambda _, restart=collector: restart())
            self.metrics_runners.append(runner)
            self.main_log.info(started_message)

        try:
            # Block on the event instead of spinning on a zero-length sleep:
            # the main greenlet wakes up only when the stop flag is set.
            self._stop_flag.wait()
            return self.stop()
        except BaseException:
            # Deliberately broad: whatever interrupts the wait (or a first stop
            # attempt) must still end in a shutdown.
            return self.stop()

    def run_snapshot_metrics(self):
        log = logging.getLogger('watchdog-snapshots')
        while True:
            if self._stop_flag.is_set():
                return
            metrics = Metrics()
            pool = gevent.pool.Pool(100)
            try:
                log.info('Started iteration')
                log.info('Collecting snapshot metrics')
                service_states_list = self.zk_client.get_service_states_list()
                imap = pool.imap_unordered(self._process_service_snapshots, service_states_list)
                for stuck_snapshots, unprocessed_cleanup_snapshots in imap:
                    if stuck_snapshots:
                        metrics.stuck_snapshots.inc(stuck_snapshots.get())
                    if unprocessed_cleanup_snapshots:
                        metrics.unprocessed_cleanup_snapshots.inc(unprocessed_cleanup_snapshots.get())
            except KeyboardInterrupt:
                return self._stop_flag.set()
            except Exception:
                log.exception('Collecting metrics failed')
            else:
                log.info('Sending metrics')
                self.send_metrics(metrics, log)
            log.info('Sleeping for {} seconds'.format(ITERATION_PERIOD))
            gevent.sleep(ITERATION_PERIOD)

    def _process_service_snapshots(self, s_id):
        """
        Collect snapshots info of one service.

        :param s_id: service identifier used to look up the cleanup policy and
            the service state in ZooKeeper
        :return: result of ``collect_snapshots_info`` or ``(None, None)`` when
            the service has no cleanup policy or no state
        """
        cleanup_policy = self.zk_client.get_cleanup_policy(s_id)
        if cleanup_policy:
            # The state is fetched only for services that have a cleanup policy.
            service_state = self.zk_client.get_service_state(s_id)
            if service_state:
                return collect_snapshots_info(cleanup_policy, service_state)
        return None, None

    def run_alemate_metrics(self):
        log = logging.getLogger('watchdog-alemate')
        while True:
            if self._stop_flag.is_set():
                return
            metrics = Metrics()
            pool = gevent.pool.Pool(10)
            try:
                log.info('Started iteration')
                log.info('Collecting taskgroups and tasks info')
                # Check #5
                metrics.status_ctl_unprocessed_taskgroups.inc(self.alemate_client.get_unprocessed_taskgroups_count())
                # Info for checks #3 and #4
                expected_snapshot_states, enqueued_tasks = self.alemate_client.collect_tasks_info(pool)
                metrics.enqueued_tasks = enqueued_tasks
                log.info('Checking snapshot statuses')
                metrics.stalled_meta_tasks = check_awaited_snapshot_states(self.zk_client, expected_snapshot_states)
            except KeyboardInterrupt:
                return self._stop_flag.set()
            except Exception:
                log.exception('Collecting metrics failed')
            else:
                log.info('Sending metrics')
                self.send_metrics(metrics, log)
            log.info('Sleeping for {} seconds'.format(ITERATION_PERIOD))
            gevent.sleep(ITERATION_PERIOD)

    def _run_orphaned_yp_pod_sets_metrics(self, metrics, log):
        """
        Run one iteration of the orphaned YP pod sets check.

        First pass: list YP_LITE pod sets in every configured YP cluster and
        pick candidates - pod sets that already carry the watchdog mark and
        pod sets whose nanny service id is not among the service ids known to
        ZooKeeper. Second pass: re-check that every candidate still exists,
        count it as orphaned and mark not yet marked ones with the watchdog
        label, at most ``max_marked_pod_sets_per_iteration`` per call.

        :param metrics: metrics object whose counters are updated in place
        :param log: logger to report progress to
        :raises RuntimeError: when no nanny service ids were found
        """
        # Candidates as (cluster, pod_set, nanny_domain) tuples.
        orphaned_pod_sets_ids_w_cluster = []  # type: list[tuple[six.text_type, PodSet, six.text_type]]
        # Remaining number of pod sets that may still be marked in this iteration.
        max_marked_pod_sets_per_iteration = self.orphaned_collector_settings.max_marked_pod_sets_per_iteration

        log.info('Started iteration')
        log.info('Collecting nanny service ids')
        nanny_service_ids = set(self.zk_client.get_service_ids())
        log.info('Found {} nanny service ids'.format(len(nanny_service_ids)))
        # With an empty set every pod set would look orphaned, so abort the iteration.
        if not nanny_service_ids:
            raise RuntimeError('No one nanny service found')

        pod_sets_query = '[/labels/deploy_engine] = "YP_LITE"'
        # NOTE(review): naive UTC "now"; assumes get_attr_last_modification_dt
        # also returns naive UTC datetimes - confirm.
        now = datetime.datetime.utcnow()

        # find orphaned yp pod sets
        for cluster, yp_client in self.yp_clients_by_clusters.iteritems():
            for pod_set in yp_client.list_objects(PodSet, query=pod_sets_query):  # type: PodSet
                # skip empty deploy_engine_url
                if not pod_set.deploy_engine_url:
                    metrics.deploy_engine_url_missing_pod_sets.inc()
                    continue

                nanny_domain = urlparse(pod_set.deploy_engine_url).netloc

                # skip pod sets of domains that are configured as not processed
                domain_settings = self.orphaned_collector_settings.get_domain_settings(nanny_domain)
                if not domain_settings.process:
                    metrics.not_processed_domains_pod_sets.inc()
                    continue

                # collect already marked pod sets
                # (they become candidates without the modification time and service id checks below)
                if self.orphaned_collector_settings.mark_label_name in pod_set.nanny_watchdog_marks:
                    orphaned_pod_sets_ids_w_cluster.append((cluster, pod_set, nanny_domain))
                    continue

                min_modification_period = domain_settings.last_modification_timedelta

                # skip recently updated pod sets
                last_modification_dt = pod_set.get_attr_last_modification_dt('labels')
                if (now - last_modification_dt) < min_modification_period:
                    metrics.last_mtime_skip_pod_sets.inc()
                    continue

                # the pod set refers to a service that nanny does not know about
                if pod_set.nanny_service_id not in nanny_service_ids:
                    orphaned_pod_sets_ids_w_cluster.append((cluster, pod_set, nanny_domain))

        for cluster, pod_set, nanny_domain in orphaned_pod_sets_ids_w_cluster:
            yp_client = self.yp_clients_by_clusters[cluster]
            # check the existence of the pod set again to skip services that are being deleted right now
            pod_set_exists = yp_client.get_object(PodSet, pod_set.obj_id, ignore_nonexistent=True)
            if pod_set_exists is None:
                log.info('{} in {} already removed'.format(pod_set, cluster))
                continue

            log.info("Orphaned pod-set: cluster:{}, id:{}, domain:{}, account_id:{}".format(
                cluster, pod_set.obj_id, nanny_domain, pod_set.account_id))

            # both already marked and newly found pod sets are counted
            metrics.orphaned_pod_sets.inc()

            # skip already marked pod sets
            if self.orphaned_collector_settings.mark_label_name in pod_set.nanny_watchdog_marks:
                continue

            # once the per-iteration budget is used up, pod sets are still counted but not marked
            if max_marked_pod_sets_per_iteration > 0:

                if pod_set.exists_nanny_watchdog_marks:
                    # marks container is reported as existing: set only our key inside it
                    label_name = "/".join([
                        PodSet.NANNY_WATCHDOG_LABELS_PATH,
                        self.orphaned_collector_settings.mark_label_name
                    ])
                    label_value = self.instance_id
                else:
                    # no marks container yet: write the whole container with our key in it
                    label_name = PodSet.NANNY_WATCHDOG_LABELS_PATH
                    label_value = {self.orphaned_collector_settings.mark_label_name: self.instance_id}

                yp_client.set_object_label(PodSet, pod_set.obj_id, label_name, label_value)
                max_marked_pod_sets_per_iteration -= 1

    def run_orphaned_yp_pod_sets_metrics(self):
        log = logging.getLogger('watchdog-orphaned-yp-pod-sets')
        if self.orphaned_collector_settings.sleep_before_first_run:
            first_iteration_delay = random.randint(0, self.orphaned_collector_settings.iteration_period)
            log.info('Sleeping before first run for {} seconds'.format(first_iteration_delay))
            gevent.sleep(first_iteration_delay)

        while True:
            if self._stop_flag.is_set():
                return
            metrics = Metrics()
            metrics.orphaned_pod_sets.set(value=0)
            metrics.deploy_engine_url_missing_pod_sets.set(value=0)
            metrics.not_processed_domains_pod_sets.set(value=0)
            metrics.last_mtime_skip_pod_sets.set(value=0)
            try:
                self._run_orphaned_yp_pod_sets_metrics(metrics, log)
            except KeyboardInterrupt:
                return self._stop_flag.set()
            except Exception:
                log.exception('Collecting metrics failed')
            else:
                log.info('Sending metrics')
                self.send_metrics(metrics, log, ttl=self.orphaned_collector_settings.metrics_ttl)
            log.info('Sleeping for {} seconds'.format(self.orphaned_collector_settings.iteration_period))
            gevent.sleep(self.orphaned_collector_settings.iteration_period)

    def run_missed_completed_taskgroups_metrics(self):
        log = logging.getLogger('watchdog-missed-completed-taskgroups')
        while True:
            if self._stop_flag.is_set():
                return
            metrics = Metrics()
            try:
                log.info('Looking for missed completed taskgroups')
                tgs = missed_completed_taskgroups.find_missed_completed_taskgroups(
                    self.repo_client,
                    self.alemate_client
                )
                if tgs:
                    log.info('Found missed completed taskgroups %s', ', '.join(tg.tg_id for tg in tgs))
                metrics.missed_completed_taskgroups.set(value=len(tgs))
            except KeyboardInterrupt:
                return self._stop_flag.set()
            except Exception:
                log.exception('Collecting metrics failed')
            else:
                log.info('Sending metrics')
                self.send_metrics(metrics, log)
            log.info('Sleeping for %d seconds', MISSED_COMPLETED_TASKGROUPS_ITERATION_PERIOD)
            gevent.sleep(MISSED_COMPLETED_TASKGROUPS_ITERATION_PERIOD)

    def send_metrics(self, metrics, log, ttl=None):
        log.info(metrics)
        try:
            self.push_client.send_metrics(metrics, log, ttl=ttl)
        except Exception:
            log.exception('Metrics sending failed')

    def run_eviction_requested_pods_metrics(self, run_once=False):
        log = logging.getLogger('watchdog-eviction-requested-pods')
        while True:
            if self._stop_flag.is_set():
                return
            metrics = Metrics()
            try:
                for timeout_hours in [24, 48, 72]:
                    log.info('Looking for eviction requested pods for %s', timeout_hours)
                    pods = eviction_requested_pods.list_pod_summary(self.admin_api_client, timeout_hours*3600)
                    if pods:
                        log.info('Found %s eviction requested pods for %s hours: %s',
                                 len(pods), timeout_hours, ', '.join(pod.id for pod in pods))
                    metrics.set('eviction_requested_{}_hours'.format(timeout_hours), len(pods))
            except KeyboardInterrupt:
                return self._stop_flag.set()
            except Exception:
                log.exception('Collecting metrics failed')
            else:
                log.info('Sending metrics')
                self.send_metrics(metrics, log)
            if run_once:
                return metrics
            log.info('Sleeping for %d seconds', ITERATION_PERIOD)
            gevent.sleep(ITERATION_PERIOD)

    def stop(self):
        """
        Gracefully stop application.
        Can block for a long time or throw exception, be ready.
        """
        self.main_log.info('stopping service...')
        self._stop_flag.set()
        for g in self.metrics_runners:
            force_kill_greenlet(g, ignore_greenlet_exit=True, log=self.main_log)
        self.teardown_environment()
        self.main_log.info('=' * 30)
