import json
import pprint
import time
import urllib

import gevent
import requests

from infra.qyp.qdm.src.lib.qnotifier_client import QnotifierNotAuthorized
from .common import check_dc, DC
from ..deblock import Deblock
from ..model import Session, StorageRevision, EvoqLog
from ..utils import BaseLogDbObj


class OpContinue(Exception):
    """Signal that the current operation is not finished and has to be retried.

    Raised by the ``_vmproxy_*`` and ``_op_*`` helpers below when a call failed
    or a wanted condition is not met yet. The code that catches it is not part
    of this section of the file.
    """


class OpReturn(Exception):
    """Stop the current sequence of ops early and carry ``result`` to the catcher.

    In this file it is raised with FSM state names ('done', 'rollback',
    'leave'); the code that catches it is outside this section.

    :param result: value handed over to whoever catches the exception; also
        available as ``args[0]`` so that ``str()``/``repr()`` of the exception
        are informative.
    """

    def __init__(self, result):
        # Forward to Exception so args/str()/repr() are populated
        super(OpReturn, self).__init__(result)
        self.result = result


class OpFail(Exception):
    """Signal that an operation failed for good and must not be retried.

    In this file it is raised when the retry deadline of an op is exhausted or
    when the vm is found in an explicitly unwanted status.
    """


class EvoqLogDbObj(BaseLogDbObj):
    """Logger wrapper that stores log records as ``EvoqLog`` rows of one evoq record.

    NOTE(review): relies on BaseLogDbObj keeping the db handle in ``self._db``
    and calling ``_make_rec`` for every record -- confirm against the base class.
    """

    def __init__(self, db, log, evoq_id):
        """Bind the wrapper to ``evoq_id``; ``db`` and ``log`` go to the base class."""
        super(EvoqLogDbObj, self).__init__(db, log)
        self._evoq_id = evoq_id

    def _make_rec(self, ts_ms, severity, msg, args):
        """Create an ``EvoqLog`` row for this evoq id and save it right away."""
        EvoqLog(self._db, self._evoq_id, ts_ms, severity, msg, args).save()


class EvoqProcess(object):
    # Node maintenance kinds for which _op_check_backup_needed may let the vm
    # stay on its node ('leave' step) instead of backing it up and evicting it.
    NON_DESTRUCTIVE_KINDS = frozenset([
        'temporary-unreachable',
        'reboot',
    ])

    def __init__(self, log, db, key, evoq, qnotifier_client, vmproxy_client, yp_client, disable_vm_leaving):
        """Set up the process for one evoq record.

        :param log: parent logger; wrapped into EvoqLogDbObj bound to ``evoq.id``
        :param db: db handle, passed on to the model objects
        :param key: stored as ``self.key``
        :param evoq: evoq record; ``evoq.vm_id`` must look like ``<pod_id>.<dc>``
        :param qnotifier_client: client used to notify vm owners
        :param vmproxy_client: client used for vmproxy api calls
        :param yp_client: client used to select pods in YP
        :param disable_vm_leaving: when true, _op_check_backup_needed never
            switches to the 'leave' step
        """
        self.db, self.key, self.evoq = db, key, evoq

        self.log = EvoqLogDbObj(db, log, evoq.id)

        self.qnotifier_client = qnotifier_client
        self.vmproxy_client = vmproxy_client
        self.yp_client = yp_client

        vm_id = evoq.vm_id

        self.qnotifier_tags = ['qyp', 'qdm', 'evacuate', 'vm:%s' % (vm_id, )]
        # Cached ticket; refreshed by notify_usergroups_autotvm when qnotifier
        # rejects the current one.
        self._qnotifier_tvm_secret = None

        self._deblocker = Deblock(keepalive=None, logger=log.getChild('deblock'))

        assert '.' in vm_id

        pod_id, dc = vm_id.split('.', 1)
        self.pod_id = pod_id
        self.dc = dc
        self.disable_vm_leaving = disable_vm_leaving

        # FSM states kept in evoq.extra['state'] (see run()):
        # - init      initialised, nothing was done yet (handled by _step_prepare)
        # - upload    qdmupload session initialised, qdmupload action already done
        # - evict     upload finished, pod eviction is needed
        # - leave     do not make a backup, just acknowledge node maintenance
        # - finish    pod evicted, wait until it is in running state
        # - rollback  handled by _step_rollback
        # - done      terminal state

    def notify_user(self, kind, state):
        # Valid kinds:
        # - prepared-fail
        # - prepared
        # - vm-evicted
        # - vm-leaved
        # - finished
        # - cancel

        notifications_sent = self.evoq.extra.setdefault('notifications_sent', [])
        if kind not in notifications_sent:
            message_header = '(This is automatic message, do not reply)\n'

            message_table = '\n'.join([
                'VM props:',
                '  vm       : {vm}'.format(vm=self.pod_id),
                '  dc       : {dc}'.format(dc=self.dc),
                '  node     : {node}'.format(node=state['podinfo']['node_id']),
                '  fqdn (r) : {fqdn}'.format(fqdn=state['podinfo']['fqdn'][0]),
                '  fqdn (t) : {fqdn_fixed}'.format(fqdn_fixed=state['podinfo']['fqdn'][1]),
            ])

            message_footer = '\n'.join([
                '',
                message_table,
                '',
                'Have a nice day!'
            ])

            subject = '[{dc}] QYP: machine "{vm_id}" eviction from underlying node'.format(
                dc=self.dc,
                vm_id=self.pod_id
            )

            state_table = [
                'VM stopped and prepared to evacuation',
                'VM evicted from node it lived on',
                'VM evacuated to new node and is accessible now'
            ]

            def _state_table(n, show_current=True):
                ret = []

                ret.append('')
                ret.append('Evacuation state overview:')

                for idx, info in enumerate(state_table):
                    idx = idx + 1

                    if n == idx and show_current:
                        ret.append('  [x] %-45s <<< WE ARE HERE' % (info, ))
                    elif n >= idx:
                        ret.append('  [x] %s' % (info, ))
                    else:
                        ret.append('  [ ] %s' % (info, ))

                return '\n'.join(ret)

            if kind == 'prepared':
                subject += ' -- PREPARED'
                message = [
                    message_header,
                    'Your QYP vm was requested to evict from node it lives on. We was able to contact your vm and '
                    'will backup all data then move it to other node. You will receive more notifications once '
                    'that will be done.',
                    _state_table(1),
                    message_footer,
                ]

            elif kind == 'prepared-fail':
                subject += ' -- CANT CONTACT NODE'
                message = [
                    message_header,
                    'Your QYP vm was requested to evict from node it lives on. But we were unable to contact '
                    'vmagent, possibly this indicated dead node. We will try more, you will be notified if '
                    'something will change.',
                    _state_table(0),
                    message_footer,
                ]

            elif kind == 'vm-evicted':
                subject += ' -- EVICTED'
                message = [
                    message_header,
                    'Your QYP vm was requested to evict from node it lives on. We already copied all vm data and '
                    'made eviction from node. Now YP will allocate new vm and we will download all your data back.',
                    _state_table(2),
                    message_footer,
                ]

            elif kind == 'vm-leaved':
                subject += ' -- LEAVED'
                message = [
                    message_header,
                    'Your QYP vm is located on a host that requires maintenance. When maintenance will be finished '
                    'your vm will be started again.',
                    message_footer
                ]

            elif kind == 'finished':
                subject += ' -- DONE'
                message = [
                    message_header,
                    'Your QYP vm was evicted successfully. It should now be accessible!',
                    _state_table(3, show_current=False),
                    message_footer,
                ]

            elif kind == 'cancel':
                subject += ' -- CANCELLED'
                message = [
                    message_header,
                    'Your QYP vm was requested to evict from node it lives on. Process was cancelled',
                    message_footer,
                ]

            else:
                message = [
                    message_header,
                    '(no message for kind %r)' % (kind, ),
                    message_footer
                ]
                return

            message = '\n'.join(message)

            self.notify_usergroups_autotvm(
                state['podinfo']['owners']['users'],
                state['podinfo']['owners']['groups'],
                subject, message
            )

            notifications_sent.append(kind)

    def notify_usergroups_autotvm(self, users, groups, subject, message):
        for i in range(1, 11):
            try:
                try:
                    self._deblocker.apply(
                        self.qnotifier_client.notify_usergroups,
                        self._qnotifier_tvm_secret, users, groups, self.qnotifier_tags, subject, message
                    )
                    return True
                except QnotifierNotAuthorized:
                    self._qnotifier_tvm_secret = self._deblocker.apply(
                        self.qnotifier_client.get_qnotifier_tvm_ticket
                    )
                    gevent.sleep(1)  # avoid busy loops
                    continue
            except Exception as ex:
                self.log.warning('Unable to notify users: %s', str(ex))

                import traceback
                self.log.warning(traceback.format_exc())
                gevent.sleep(i * 3)

        return False

    def run(self):
        # run thru all evacuation stages
        #
        # Stages include:
        # 1. update vmagent if needed
        # 1a. wait for vmagent to become alive
        # 2. Initialise upload session
        # 2a. MakeAction - Backup
        # 2b. Wait until it either succedes or fails
        # 3. Issue eviction acknowledge
        # 4. Wait tilll vm will be in either CONFIGURED or RUNNING state

        # User notifications:
        # 1) (only if we fail to grab status) -- evacuation needed, but we unable to get status, will try more
        # 2) evacuation needed -- will attempt to make vm backup
        # 3) backup done, evacuating
        # 5) evacuation done
        # 6) (only if we roll back) -- rollback machine state, evacuation cancelled

        self.log.info('Running evoq process')

        if not self.evoq.extra:
            # Brand new record, initialize all fields

            self.evoq.extra = {
                'state': 'init',
                'state_ts': int(time.time()),
            }
            self.evoq.run_ts = int(time.time())
            self.evoq.run_cnt += 1
            if self.evoq.state != 'run':
                self.log.info('Switching evoq record state %s => %s', self.evoq.state, 'run')
                self.evoq.state = 'run'
            self.evoq.info = 'initializing'
            self.evoq.save()

        else:
            # Record already has extra field, possibly was re-initialized after being in failed state

            # We want to reset all extras but only if we are not in running/finished state
            if self.evoq.state == 'init':
                # Record already has extra, possibly it was failed
                self.log.info(
                    'Switching evoq record state %s => %s (preserving extra %r)',
                    self.evoq.state, 'run', self.evoq.extra
                )

                self.evoq.extra['state'] = 'init'
                self.evoq.extra['state_ts'] = int(time.time())
                self.evoq.run_ts = int(time.time())
                self.evoq.run_cnt += 1
                self.evoq.state = 'run'
                self.evoq.info = 'initializing'
                self.evoq.save()
            else:
                # Do not touch everything in all other states, just increase run_cnt
                self.evoq.run_cnt += 1
                self.evoq.save()

        state = self.evoq.extra['state']

        if state == 'done':
            self.log.warning('Evoq record state already marked as done -- skipping')
            if self.evoq.state != 'done':
                self.evoq.state = 'done'
                self.evoq.save()
            return

        while True:
            assert state in (
                'init', 'upload', 'evict', 'finish', 'done', 'rollback', 'leave'
            ), 'Unknown fsm state: %r' % (state, )

            state = {
                'init': self._step_prepare,
                'upload': self._step_upload,
                'evict': self._step_evict,
                'leave': self._step_leave,
                'finish': self._step_finish,
                'rollback': self._step_rollback
            }[state]()

            if self.evoq.extra['state'] != state:
                self.evoq.extra['state'] = state
                self.evoq.extra['state_ts'] = int(time.time())
                self.evoq.save()

            if state == 'done':
                break

    def _vmproxy_api_call(self, url, data):
        """Call a vmproxy api method of the datacenter this vm lives in.

        ``url`` is joined onto ``DC[self.dc]['vmproxy']`` and the request is made
        through ``self.vmproxy_client.api_call`` (via the deblocker).

        :param url: api path, e.g. '/api/GetStatus/'
        :param data: request payload passed to the client as is
        :return: tuple ``(ok, status_code, result)``; ``ok`` is True only for
            HTTP 200, ``result`` is the decoded JSON body when the response is
            declared as application/json and the raw response text otherwise
        :raises ValueError: when a body declared as JSON cannot be decoded
        """
        # `import urllib` alone does not guarantee that the `parse` submodule is loaded.
        import urllib.parse

        url = urllib.parse.urljoin(DC[self.dc]['vmproxy'], url)

        response = self._deblocker.apply(
            self.vmproxy_client.api_call, url, data
        )

        # Look at the media type only: the header may carry parameters
        # (e.g. "application/json; charset=utf-8") and is case-insensitive.
        content_type = response.headers.get('Content-Type', '') or ''
        media_type = content_type.split(';', 1)[0].strip().lower()

        if media_type == 'application/json':
            result = json.loads(response.text)
        else:
            result = response.text

        ok = response.status_code == 200

        if not ok:
            self.log.error('vmproxy request failed with code %d', response.status_code)
            self.log.error('vmproxy request output')
            self.log.error(response.text)
            self.log.error('vmproxy request headers')
            self.log.error(pprint.pformat(response.headers))

        return ok, response.status_code, result

    def _vmproxy_get_status(self):
        complete, status_code, result = self._vmproxy_api_call(
            '/api/GetStatus/',
            {'vm_id': {'pod_id': self.pod_id}},
        )
        if complete:
            return result

        raise OpContinue(
            'vmproxy api call (GetStatus) failed with code %r, response: %r' % (
                status_code, result
            )
        )

    def _vmproxy_get_vmspec(self):
        complete, status_code, result = self._vmproxy_api_call(
            '/api/ListYpVm/', {'query': {'name': self.pod_id}}
        )

        for vm in result.get('vms', []):
            if vm['meta']['id'] == self.pod_id:
                return vm

        self.log.critical('Unable to get vmspec for vm %r, searched in %r', self.pod_id, result)
        raise OpContinue('Unable to get vmspec, vm %r not found')

    def _vmproxy_stop_vm(self):
        complete, status_code, result = self._vmproxy_api_call(
            '/api/MakeAction/',
            {'vm_id': {'pod_id': self.pod_id}, 'action': 5}
        )

        self.log.debug('vmproxy stop request: %s %s %s', complete, status_code, result)

        if not complete:
            raise OpContinue(
                'vmproxy api call (MakeAction-5, stopvm) failed with code %r, response %r' % (
                    status_code, result
                )
            )

        self.log.info('waiting vm to stop')

        wait_max = 300
        deadline = time.time() + wait_max

        while True:
            vm_status = self._vmproxy_get_status()

            self.log.debug('vmproxy vm status: %s', vm_status)

            if vm_status['state']['type'] in ('STOPPED', 'CONFIGURED'):
                return True

            if time.time() > deadline:
                raise OpContinue('vm stop wait timeout')

            gevent.sleep(10)
            continue

    def _vmproxy_update_vmagent(self, vmspec):
        vmspec['updateVmagent'] = True

        complete, status_code, result = self._vmproxy_api_call(
            '/api/UpdateVm/', vmspec
        )

        self.log.debug('vmproxy update vmagent: %s %s %s', complete, status_code, result)

        if not complete:
            raise OpContinue(
                'vmproxy api call (UpdateVm, update vmagent) failed with code %r, response %r' % (
                    status_code, result
                )
            )

        # Ok, update request succeeded, wait for good status a little
        self.log.info('waiting vm to stop')

        wait_max = 300
        deadline = time.time() + wait_max

        while True:
            try:
                vm_status = self._vmproxy_get_status()
            except Exception as ex:
                self.log.debug('vm status still failed with %s', ex)
                vm_status = None

            if vm_status:
                self.log.debug('vmproxy vm status: %s', vm_status)

                if vm_status['state']['type'] == ('STOPPED', 'CONFIGURED'):
                    return True

            if time.time() > deadline:
                raise OpContinue('vm stop after vmagent update wait timeout')

            gevent.sleep(10)
            continue

    def _vmproxy_qdmupload(self, key):
        complete, status_code, result = self._vmproxy_api_call(
            '/api/MakeAction/', {
                'vm_id': {'pod_id': self.pod_id},
                'action': 'QDMUPLOAD',
                'qdmreq': {'key': key}
            }
        )
        if not complete:
            raise OpContinue(
                'vmproxy api call (MakeAction, QDMUPLOAD) failed with code %r, response %r' % (
                    status_code, result
                )
            )

    def _vmproxy_acknowledge_eviction(self, qdm_key, use_evict=False):
        complete, status_code, result = self._vmproxy_api_call(
            '/api/AcknowledgeEviction/', {
                'vm_id': self.pod_id,
                'qdm_res_id': 'qdm:%s' % (qdm_key, ),
                'use_evict': use_evict,
            }
        )

        if not complete:
            raise OpContinue(
                'vmproxy api call (AcknowledgeEviction) failed with code %r, response %r' % (
                    status_code, result
                )
            )

        result = result.get('result', 'NO_RESULT')

        if result == 'NO_EVICTION':
            self.log.warning('vmproxy eviction ack failed with NO_EVICTION message')
            # We do not raise OpReturn stopped with "rollback" here, because we dont trust
            # vmproxy much. Instead, that will force eviction request checking again and we will
            # cancel and rollback everything if Yp confirms no eviction set for our pod.
            raise OpContinue()

        if result == 'DONE_EVICTION':
            return

        raise OpContinue('vmproxy AcknowledgeEviction returned unexpected result %r', result)

    def _get_pod_info(self):
        # Returns:
        # None -- YP request timedout
        # False -- no such pod in YP
        # {pod_dict}
        #
        # (False, None, error_desc, retryable)
        # (True, podinfo, None, False)

        try:
            query = '[/meta/id] = "%s"' % (self.pod_id, )
            pods = self._deblocker.apply(
                self.yp_client.select_pods, self.dc, query
            )
        except requests.exceptions.ReadTimeout as ex:
            return (False, str(ex), True)

        if len(pods) == 0:
            self.log.info('YP reported 0 pods matching search criteria (dc %s, vm_id %s)', self.dc, self.vm_id)
            return (False, 'no such pod', False)

        assert len(pods) == 1
        podinfo = pods[0]

        return (True, podinfo, False)

    def _op_waiter(self, state, key, amount):
        key_last_try = 'op_waiter_%s_%s' % (key, 'last_try')
        key_cnt_tries = 'op_waiter_%s_%s' % (key, 'cnt_tries')
        key_play_num = 'op_waiter_%s_%s' % (key, 'play_num')

        now = int(time.time())

        current_play = state['play_num']

        last_play_num = state.get(key_play_num, None)

        if last_play_num is None or (last_play_num == current_play):
            # Do not add wait if
            # - we didnt run this key yet, or
            # - we already wait in this play run
            state[key_last_try] = now
            state[key_cnt_tries] = 1
            state[key_play_num] = current_play
            return 0, 1

        state[key_cnt_tries] += 1
        state[key_play_num] = current_play

        prev_try_delta = now - state[key_last_try]  # how much seconds ago was previous try
        state[key_last_try] = now

        wait = min(amount, amount - prev_try_delta)

        if wait > 0:
            self.log.debug(
                'op_waiter: waiting %ds for %s key retry (play: %r)',
                wait, key, current_play
            )
            gevent.sleep(wait)
            return wait, state[key_cnt_tries]
        else:
            self.log.debug(
                'op_waiter: no need to wait %ds for %s key (time already passed, play: %r)',
                wait, key, current_play
            )
            return 0, state[key_cnt_tries]

    def _op_get_podinfo(self, state, deadline=3600):
        self.log.debug('play op: get_podinfo')

        key_deadline = 'op_get_podinfo_deadline'    # deadline for retries
        key_result = 'podinfo'                      # result key

        retry_wait = min(900, deadline / 20)  # 180s if deadline 3600, 900s max
        _, total_tries = self._op_waiter(state, 'get_podinfo', retry_wait)

        if key_deadline not in state:
            state[key_deadline] = time.time() + deadline

        try:
            query = '[/meta/id] = "%s"' % (self.pod_id, )
            pods = self._deblocker.apply(
                self.yp_client.select_pods, self.dc, query
            )
        except Exception as ex:
            timeleft = max(0, int(state[key_deadline] - time.time()))

            if timeleft < 1:
                raise OpFail('op_get_podinfo: all timeouts exhausted, we made %d total tries', total_tries)

            self.log.warning('play op: select_pods failed %s, will retry more for %d secs', ex, timeleft)
            raise OpContinue()

        else:
            state.pop(key_deadline, None)

        if len(pods) == 0:
            self.log.info('YP reported 0 pods matching search criteria (dc %s, vm_id %s)', self.dc, self.vm_id)
            raise OpReturn('done')

        assert len(pods) == 1
        podinfo = pods[0]

        assert isinstance(podinfo, dict)
        state[key_result] = podinfo

        self.log.debug('play op: get_podinfo: complete, found pod')

    def _op_check_eviction_request(self, state):
        self.log.debug('play op: check_eviction_request')

        pod_info = state['podinfo']
        eviction_requested = pod_info['eviction'] and pod_info['eviction']['state'] == 'requested'
        maintenance_requested = pod_info['maintenance'] and pod_info['maintenance']['state'] == 'requested'

        if not eviction_requested and not maintenance_requested:
            self.log.info('YP reported no active eviction process')
            raise OpReturn('rollback')

        if self.pod_id == 'mcsl2':
            return 1

        self.log.info('play op: YP eviction state: %s, playing further ', pod_info['eviction'])
        self.log.info('play op: YP maintenance state: %s, playing further ', pod_info['maintenance'])

    def _op_get_vm_status(self, state, want=None, nowant=None, timeout=3600):
        """
        Example vmproxy vm status:

        {'config': {'accessInfo': {'vncPassword': '<nope>'},
        'autorun': True,
        'disk': {'deltaSize': '20401094656',
                    'path': '/',
                    'resource': {
                    'md5sum': '',
                    'path': '',
                    'rbTorrent': 'qdm:bc0503fd9f049cc512bcc2c3c863c3b12ec297c983d523227dd5cc4a2054a259'
                },
                    'size': '3489726768',
                    'type': 'RAW'},
        'id': 'empty',
        'mem': '8',
        'type': 'LINUX',
        'vcpu': 2},
            'state': {'generation': 2, 'info': '', 'type': 'RUNNING'},
            'vmagentVersion': '0.27'}
        """

        self.log.debug('play op: get_vm_status')

        key_deadline = 'op_get_vm_status_deadline'
        key_result = 'vm_status'

        _, total_tries = self._op_waiter(state, 'vm_status', 300)

        if key_deadline not in state:
            state[key_deadline] = time.time() + timeout

        try:
            vm_status = self._vmproxy_get_status()
            self.log.debug('vm status: %r', vm_status)
            if want is not None:
                assert isinstance(want, (list, tuple))
                found_status = vm_status.get('state', {}).get('type', 'unknown').upper()

                if found_status not in (x.upper() for x in want):
                    self.log.debug('  want one of %r, but we found %r, raising OpContinue...', want, found_status)
                    raise OpContinue()

            if nowant is not None:
                assert isinstance(nowant, (list, tuple))
                found_status = vm_status.get('state', {}).get('type', 'unknown').upper()

                if found_status in (x.upper() for x in nowant):
                    self.log.warning('  found one of unwanted statuses -- %r (one of %r)', found_status, nowant)
                    raise OpFail('get_vm_status: unwanted vm status %r' % (found_status, ))

        except Exception as ex:
            timeleft = max(0, int(state[key_deadline] - time.time()))
            if timeleft < 1:
                raise OpFail('get_vm_status: all timeouts exhausted, we made %d total tries', total_tries)

            if isinstance(ex, OpContinue):
                # If we got opcontinue -- do not log error nor notify user, just reraise
                raise

            self.log.warning('get vmstatus failed %s, will retry more for %d secs', ex, timeleft)

            if state['step'] == 'prepare':
                self.notify_user('prepared-fail', state)

            raise OpContinue()
        else:
            if state['step'] == 'prepare':
                # Special case -- if we got vm status during prepare -- send notification
                # about eviction process begin
                self.notify_user('prepared', state)

            state.pop(key_deadline, None)

        state[key_result] = vm_status
        self.log.debug('play op: get_vm_status: complete')

    def _op_set_finish(self, state):
        # Just send final notification here, that's all
        self.notify_user('finished', state)

    def _op_save_vm_state(self, state):
        # This one is used if we will need to rollback everything

        self.log.debug('play op: save_vm_state')

        vm_status = state['vm_status']

        if not self.evoq.extra.get('initial_state', None):
            if 'state' not in vm_status or 'type' not in vm_status['state']:
                self.log.warning(
                    'play op: save_vm_state: unable to find state-type keys in vm_status, will retry'
                )
                raise OpContinue()

            if vm_status['state']['type'] == 'RUNNING':
                self.evoq.extra['initial_state'] = 'run'
                self.evoq.save()

            elif vm_status['state']['type'] in ('STOPPED', 'CONFIGURED'):
                self.evoq.extra['initial_state'] = 'stop'
                self.evoq.save()
            else:
                self.log.debug(
                    'play op: save_vm_state: unexpected vm state: %s, will retry',
                    vm_status['state']['type']
                )
                raise OpContinue()

            self.log.debug(
                'play op: save_vm_state completed, stored initial state: %s',
                self.evoq.extra['initial_state']
            )
        else:
            self.log.debug(
                'play op: save_vm_state completed, initial state already saved earlier'
            )

    def _op_stop_vm(self, state):
        self.log.debug('play op: stop_vm')

        vm_status = state['vm_status']

        if vm_status['state']['type'] in ('STOPPED', 'CONFIGURED'):
            self.log.debug('play op: stop_vm completed, vm already in stopped state')
            return

        _, total_tries = self._op_waiter(state, 'stop_vm', 60)

        key_deadline = 'op_stop_vm_deadline'

        if key_deadline not in state:
            state[key_deadline] = time.time() + 3600

        try:
            self._vmproxy_stop_vm()
        except Exception as ex:
            timeleft = max(0, int(state[key_deadline] - time.time()))

            if timeleft < 1:
                raise OpFail('stop_vm: all timeouts exhausted, we made %d total tries', total_tries)

            self.log.warning('stop_vm failed: %s, will retry more for %d secs', ex, timeleft)
            raise OpContinue()
        else:
            state.pop(key_deadline, None)

        self.log.debug('play op: stop_vm complete')

    def _op_update_vmagent(self, state):
        self.log.debug('play op: update_vmagent')

        podinfo = state['podinfo']

        try:
            vmagent_version = podinfo['labels']['vmagent_version']
            if vmagent_version:
                vmagent_version_list = [int(x) for x in vmagent_version.split('.', 2)]
                if len(vmagent_version_list) == 2:
                    vmagent_version_list.append(0)
            else:
                vmagent_version_list = [0, 0, 0]
        except Exception as ex:
            self.log.warning('Unable to get current vmagent version: %s', ex)
            vmagent_version = None
            vmagent_version_list = []

        self.log.info('detected vmagent version %r', vmagent_version_list)

        if not vmagent_version_list or vmagent_version_list < [0, 26, 0]:
            _, total_tries = self._op_waiter(state, 'update_vmagent', 60)

            key_deadline = 'op_update_vmagent_deadline'
            if key_deadline not in state:
                state[key_deadline] = time.time() + 3600

            try:
                vmspec = self._vmproxy_get_vmspec()
            except Exception as ex:
                timeleft = max(0, int(state[key_deadline] - time.time()))

                if timeleft < 1:
                    raise OpFail('update_vmagent: all timeouts exhausted, we made %d total tries', total_tries)

                self.log.warning('update_vmagent failed: %s, will retry more for %d secs', ex, timeleft)
                raise OpContinue()

            self.log.debug('vmproxy: got vmspec %r', vmspec)

            try:
                self._vmproxy_update_vmagent(vmspec)
            except Exception as ex:
                timeleft = max(0, int(state[key_deadline] - time.time()))

                if timeleft < 1:
                    raise OpFail('update_vmagent: all timeouts exhausted, we made %d total tries', total_tries)

                self.log.warning('update_vmagent failed: %s, will retry more for %d secs', ex, timeleft)
                raise OpContinue()

        self.log.debug('play op: update_vmagent complete')

    def _op_check_backup_needed(self, state):
        """
        We can acknowledge maintenance only if all conditions are met:
        - eviction requested by hfsm or was not requested
        - node maintenance requested and its kind is nondestructive
        - maintenance acknowledge is enabled in config

        In other cases we need to make backup and evict vm
        """
        pod_info = state['podinfo']
        eviction = not pod_info['eviction'] or pod_info['eviction']['reason'] == 'hfsm'
        maintenance = (pod_info['maintenance'] and
                       pod_info['maintenance']['state'] == 'requested' and
                       pod_info['maintenance']['info'] and
                       pod_info['maintenance']['info']['kind'] in self.NON_DESTRUCTIVE_KINDS)

        if eviction and maintenance and not self.disable_vm_leaving:
            if state['step'] == 'leave':
                return
            else:
                raise OpReturn('leave')

    def _op_make_backup(self, state):
        """Play op: make sure an upload session exists and ask vmagent to upload the vm.

        What happens depends on the session referenced by
        ``self.evoq.session_key``:

        - no session yet, or the session is 'archive': a new session is created
          and QDMUPLOAD is issued
        - session is 'new': the session is reused and QDMUPLOAD is issued
        - session is 'active': the session is reused and nothing is issued

        Retries are throttled through _op_waiter (60 seconds between tries)
        with a one hour budget kept in ``state['op_make_backup_deadline']``.

        :param state: play state dict
        :raises OpContinue: QDMUPLOAD failed and there is time left to retry
        :raises OpFail: QDMUPLOAD failed and the time budget is exhausted
        :raises AssertionError: the existing session is in an unexpected state
        """
        self.log.debug('play op: make backup')

        _, total_tries = self._op_waiter(state, 'make_backup', 60)

        key_deadline = 'op_make_backup_deadline'
        if key_deadline not in state:
            state[key_deadline] = time.time() + 3600

        create_new_session = True
        issue_qdmupload_cmd = True

        if self.evoq.session_key:
            session = Session(self.db)
            session.load(self.evoq.session_key)
            if session.state == 'new':
                # Session exists but the upload has not started: issue the command
                # again for the same session.
                create_new_session = False
            elif session.state == 'active':
                # Upload session is active: keep it and do not issue the command again.
                # NOTE(review): the previous comment here said the command is scheduled
                # "once again", which contradicts the flag below -- confirm which of
                # the two was intended.
                create_new_session = False
                issue_qdmupload_cmd = False
            elif session.state == 'archive':
                # This is possibly an old, already archived session. That could happen if we
                # already scheduled and completed the upload but the previous evoq run failed
                # somehow, or if the upload itself failed.
                #
                # Schedule the qdmupload job once again and also create a new session, so the
                # old one is kept for analytic purposes.
                pass
            else:
                # We actually should not be here. Raised explicitly so the check
                # survives `python -O`, which strips assert statements.
                raise AssertionError(
                    'Session already created with unexpected state: %r' % (session.state, )
                )

        if create_new_session:
            # Generate new qdmupload session
            session = Session(self.db)
            session.generate('upload', 'evoq')
            session.vm_id = self.evoq.vm_id
            session.node_id = self.evoq.node_id
            session.save()

            self.evoq.session_key = session.key
            self.evoq.save()

        if issue_qdmupload_cmd:
            try:
                self.log.info('issuing QDMUPLOAD vmproxy cmd')
                self._vmproxy_qdmupload(self.evoq.session_key)
            except Exception as ex:
                timeleft = max(0, int(state[key_deadline] - time.time()))

                if timeleft < 1:
                    raise OpFail(
                        'make_backup: all timeouts exhausted, we made %d total tries' % (total_tries, )
                    )

                self.log.warning('make_backup failed: %s, will retry more for %d secs', ex, timeleft)
                raise OpContinue()
        else:
            self.log.info('will not issue QDMUPLOAD vmproxy cmd (not needed)')

        self.log.debug('play op: make_backup complete')

    def _op_wait_qdmupload_session(self, state):
        """Wait for the qdmupload session to be archived and its storage revision to become active.

        The session ``self.evoq.session_key`` is reloaded every 5 seconds until its state is
        ``archive``; after that the storage revision it points to is polled the same way until it
        leaves ``draft`` state. Progress is logged at most once per 600 seconds in each phase.
        On success the revision key is stored in ``self.evoq.extra['qdm_revision_key']``.

        :param state: shared ops state dict (not used by this op).
        :raises OpContinue: session has no ``rev_id``, the revision is missing in db, or the
            revision is already archived.
        """
        self.log.debug('play op: wait qdmupload session to complete')

        upload_session = Session(self.db)
        reported_at = None

        # Phase 1: session must reach 'archive' state
        while True:
            if upload_session.state == 'archive':
                break

            if not reported_at or time.time() - reported_at > 600:
                self.log.debug('play op: waiting session to become archived (current state %s)', upload_session.state)
                reported_at = time.time()

            upload_session.load(self.evoq.session_key)
            gevent.sleep(5)

        if not upload_session.rev_id:
            # For some reason we have no storage revision as a result
            self.log.error('session marked as archived, but no rev_id was set')
            raise OpContinue()

        # Phase 2: storage revision produced by the session must become 'active'
        rev = StorageRevision(self.db)
        reported_at = None

        while True:
            if not rev.search(upload_session.vm_id, upload_session.rev_id):
                # For some reason there is no such revision in db
                self.log.error('unable to find storage revision in db')
                raise OpContinue()

            rev_state = rev.state

            if rev_state == 'active':
                self.log.info('play op: qdmupload session archived and storage revision marked as active!')
                self.log.info('play op: revno %d, rev key %s', rev.rev_id, rev.key)
                self.evoq.extra['qdm_revision_key'] = rev.key
                self.evoq.save()
                break

            if rev_state == 'draft':
                # Still in progress
                if not reported_at or time.time() - reported_at > 600:
                    self.log.debug('play op: waiting for revision to become active (current state %s)', rev_state)
                    reported_at = time.time()
                gevent.sleep(5)
                continue

            if rev_state == 'archive':
                self.log.error('Revision marked as archived, will retry')
                raise OpContinue()

            assert False, 'StorageRevision has unexpected state: %r' % (rev_state, )

        self.log.debug('play op: wait qdmupload session done')

    def _op_send_eviction_acknowledge(self, state):
        """Acknowledge pod eviction through vmproxy and notify the user with 'vm-evicted'.

        :param state: shared ops state dict; ``state['podinfo']['eviction']`` is read.
        """
        self.log.debug('play op: send eviction ack')

        eviction = state['podinfo']['eviction']
        # The second argument is False only when eviction info is present and its state is 'requested'
        if eviction and eviction['state'] == 'requested':
            use_evict = False
        else:
            use_evict = True

        revision_key = self.evoq.extra['qdm_revision_key']
        self._vmproxy_acknowledge_eviction(revision_key, use_evict)
        self.notify_user('vm-evicted', state)

        self.log.debug('play op: send eviction ack complete')

    def _op_send_maintenance_acknowledge(self, state):
        """Acknowledge node maintenance via the yp client and notify the user with 'vm-leaved'.

        :param state: shared ops state dict; ``state['podinfo']['maintenance_ts']`` is read.
        """
        self.log.debug('play op: send maintenance ack')

        # The yp call is not made directly, it is handed over to the deblocker
        acknowledge = self.yp_client.acknowledge_maintenance
        maintenance_ts = state['podinfo']['maintenance_ts']
        self._deblocker.apply(acknowledge, self.dc, self.pod_id, maintenance_ts)

        self.notify_user('vm-leaved', state)
        self.log.debug('play op: send maintenance ack complete')

    def _op_rollback_vm_state(self, state):
        """Placeholder op for rolling the vm state back.

        Currently it only writes the start/complete debug records; ``state`` is not used
        and nothing is actually rolled back.
        """
        self.log.debug('play op: attempt to rollback vm state')
        self.log.debug('play op: attempt to rollback vm state complete')

    def _play_ops(self, stepcodename, ops):
        """Play all operations of one step, replaying the whole list on request.

        Every op is called with the same ``state`` dict, which is created once and is kept
        between rounds (``play_num`` is increased on every round).

        - On OpContinue -- rerun all ops from scratch
        - On OpReturn -- stop and return its result immediately
        - On OpFail -- log and reraise the exception
        - On any other Exception -- log and raise OpFail instead

        :param stepcodename: step name, used in log records and stored as ``state['step']``.
        :param ops: list of callables, each taking the state dict as the only argument.
        :return: None if all ops completed, otherwise the ``result`` of the raised OpReturn.
        :raises OpFail: an op failed.
        """
        last_round_ts = None

        state = {
            'play_num': 0,
            'transient': {},
            'step': stepcodename
        }

        while True:
            state['play_num'] += 1
            self.log.info('%s: playing round #%d', stepcodename, state['play_num'])

            # Do not start rounds more often than once per second (timestamps are whole seconds)
            round_ts = int(time.time())
            if last_round_ts is not None and (round_ts - last_round_ts) < 1:
                self.log.warning('%s: slow down, playing too fast', stepcodename)
                gevent.sleep(1)
                round_ts = int(time.time())
            last_round_ts = round_ts

            op = None  # operation being played, kept for log records in handlers below

            try:
                for op in ops:
                    op(state)
                return

            except OpContinue:
                self.log.debug('%s: got OpContinue request during playing %r', stepcodename, op)
                continue

            except OpReturn as ret:
                # Exception is bound to its own name: binding it to `op` would hide the operation
                # and the record below would show the exception instead of it.
                self.log.debug('%s: got OpReturn request during playing %r', stepcodename, op)
                return ret.result

            except OpFail as ex:
                self.log.warning('%s: got OpFail with %s request during playing %r', stepcodename, ex, op)
                raise

            except Exception as ex:
                self.log.warning(
                    '%s: got unhandled Exception with %s request during playing %r',
                    stepcodename, ex, op
                )
                raise OpFail('(reraise from %s: %s)' % (type(ex).__name__, str(ex)))

    def _step_prepare(self):
        """Play the 'prepare' step.

        :return: name of the next step: the one requested by an op through OpReturn,
            or 'upload' by default.
        :raises OpFail: some op failed; rollback step is attempted before reraising.
        """
        check_dc(self.dc)

        ops = [
            # In prepare step we wait a lot for grabbing podinfo, this is to avoid whole job retries
            # if node is down (and send only 1 notification, instead of 1 notification every time)
            lambda state: self._op_get_podinfo(state, deadline=3600 * 24),
            self._op_check_eviction_request,
            self._op_check_backup_needed,
            self._op_get_vm_status,  # notify-send: evoq start or evoq unable to start coz cant get status
            self._op_save_vm_state,
            self._op_stop_vm,
            self._op_update_vmagent,
        ]

        # Let the show begin
        try:
            result = self._play_ops('prepare', ops)
        except OpFail as ex:
            try:
                self._step_rollback()
            except Exception as rollback_ex:
                # Rollback is best effort. Report the rollback error itself (not the original failure)
                # and do not swallow BaseException based signals like greenlet kill.
                self.log.warning(
                    'rollback step failed: %s: %s (that error was ignored)',
                    type(rollback_ex).__name__, rollback_ex
                )
            # Explicit `raise ex`: bare raise after nested handler may reraise rollback error on python2
            raise ex

        if result is not None:
            return result  # some op made OpReturn request for asking next step
        return 'upload'  # default next step after this one

    def _step_upload(self):
        """Play the 'upload' step: make the backup and wait until it is stored.

        :return: name of the next step: the one requested by an op through OpReturn,
            or 'evict' by default.
        :raises OpFail: some op failed; rollback step is attempted before reraising.
        """
        check_dc(self.dc)

        ops = [
            self._op_get_podinfo,
            self._op_check_eviction_request,
            self._op_make_backup,
            self._op_wait_qdmupload_session
        ]

        try:
            result = self._play_ops('upload', ops)
        except OpFail as ex:
            try:
                self._step_rollback()
            except Exception as rollback_ex:
                # Rollback is best effort. Report the rollback error itself (not the original failure)
                # and do not swallow BaseException based signals like greenlet kill.
                self.log.warning('rollback step failed: %s: %s', type(rollback_ex).__name__, rollback_ex)
            # Explicit `raise ex`: bare raise after nested handler may reraise rollback error on python2
            raise ex

        if result is not None:
            return result
        return 'evict'

    def _step_evict(self):
        """Play the 'evict' step: acknowledge pod eviction after the backup was made.

        :return: name of the next step: the one requested by an op through OpReturn,
            or 'finish' by default.
        :raises OpFail: some op failed; rollback step is attempted before reraising.
        """
        ops = [
            self._op_get_podinfo,
            self._op_check_eviction_request,
            self._op_send_eviction_acknowledge  # notify-send: made backup, evacuating machine
        ]

        try:
            result = self._play_ops('evict', ops)
        except OpFail as ex:
            try:
                self._step_rollback()
            except Exception as rollback_ex:
                # Rollback is best effort. Report the rollback error itself (not the original failure)
                # and do not swallow BaseException based signals like greenlet kill.
                self.log.warning('rollback step failed: %s: %s', type(rollback_ex).__name__, rollback_ex)
            # Explicit `raise ex`: bare raise after nested handler may reraise rollback error on python2
            raise ex

        if result is not None:
            return result
        return 'finish'

    def _step_leave(self):
        """Play the 'leave' step: no backup is made, node maintenance is just acknowledged.

        :return: name of the next step: the one requested by an op through OpReturn,
            or 'done' by default.
        :raises OpFail: some op failed; rollback step is attempted before reraising.
        """
        ops = [
            self._op_get_podinfo,
            self._op_check_eviction_request,
            self._op_check_backup_needed,
            self._op_send_maintenance_acknowledge
        ]

        try:
            result = self._play_ops('leave', ops)
        except OpFail as ex:
            try:
                self._step_rollback()
            except Exception as rollback_ex:
                # Rollback is best effort. Report the rollback error itself (not the original failure)
                # and do not swallow BaseException based signals like greenlet kill.
                self.log.warning('rollback step failed: %s: %s', type(rollback_ex).__name__, rollback_ex)
            # Explicit `raise ex`: bare raise after nested handler may reraise rollback error on python2
            raise ex

        if result is not None:
            return result
        return 'done'

    def _step_finish(self):
        """Play the 'finish' step: wait for the vm status check to pass, then mark evoq finished.

        The vm status op is called with ``want=('running', 'configured')``,
        ``nowant=('invalid', )`` and a 6 hour timeout.

        :return: name of the next step: the one requested by an op through OpReturn,
            or 'done' by default.
        :raises OpFail: some op failed; rollback step is attempted before reraising.
        """
        ops = [
            self._op_get_podinfo,
            lambda state: self._op_get_vm_status(
                state,
                want=('running', 'configured'),
                nowant=('invalid', ),
                timeout=6 * 3600,
            ),
            self._op_set_finish  # notify-send: evoq finished
        ]

        try:
            result = self._play_ops('finish', ops)
        except OpFail as ex:
            try:
                self._step_rollback()
            except Exception as rollback_ex:
                # Rollback is best effort. Report the rollback error itself (not the original failure)
                # and do not swallow BaseException based signals like greenlet kill.
                self.log.warning('rollback step failed: %s: %s', type(rollback_ex).__name__, rollback_ex)
            # Explicit `raise ex`: bare raise after nested handler may reraise rollback error on python2
            raise ex

        if result is not None:
            return result
        return 'done'

    def _step_rollback(self):
        """Play the 'rollback' step.

        Rollback can be scheduled in 2 scenarios:
        1) some step got OpFail and attempts to rollback everything. In this case the "done" result
           below does not change anything -- the caller still ends up with its OpFail error
        2) some step scheduled cancel and rollback and scheduled us. In this case the whole evacuation
           continues to the next "done" step after rollback (i.e. success)

        :return: step name requested by an op through OpReturn, or 'done' by default.
        :raises OpFail: rollback ops failed.
        """
        rollback_ops = [
            self._op_get_podinfo,
            self._op_get_vm_status,
            self._op_rollback_vm_state
        ]

        try:
            outcome = self._play_ops('rollback', rollback_ops)
        except OpFail as ex:
            self.log.warning('rollback failed: %s: %s', type(ex).__name__, ex)
            # Sadly, we cant do anything meaningful anymore
            raise ex

        return 'done' if outcome is None else outcome
