Make provision to evacuate all instances

As of now host failure workflow was evacuating instances which were
having vm_state as active, stopped, error and resize. It was ignoring
other vm_states such as shelved, rescued, paused and suspended. Made
provision to evacuate instances which are having vm_states such as
shelved, rescued, paused and suspended by changing its vm_state to
error and after evacuating those instances will be stopped.

NOTE:
On master if the instance is in error or resized state then after
recovery it was becoming active. With this patch error instances
will be stopped and then set to error after recovery. For resized
instance if it's previous power_state is 4(SHUTDOWN) then we can
say that before failure the instance was is stopped state and then
it was resized so masakari will stop that instance to maintain
consistency of instance states as the instance was not fully
resized(resize operation was not confirmed). Resized instance which
was in active state before failure will become active again after
recovery.

Closes-Bug: #1693731
Closes-Bug: #1692435
Closes-Bug: #1690995
Closes-Bug: #1690768
Change-Id: I134e8b6ee7315935bd8ce418ef6241be0b9450b3
This commit is contained in:
dineshbhor 2017-05-26 16:00:53 +05:30 committed by Dinesh Bhor
parent 4ce94ec6ed
commit 4173aaf039
3 changed files with 153 additions and 130 deletions

View File

@ -19,6 +19,7 @@ from eventlet import timeout as etimeout
from oslo_log import log as logging
from oslo_service import loopingcall
from oslo_utils import excutils
from oslo_utils import strutils
import taskflow.engines
from taskflow.patterns import linear_flow
@ -37,6 +38,9 @@ LOG = logging.getLogger(__name__)
ACTION = 'instance:evacuate'
# Instance power_state
SHUTDOWN = 4
class DisableComputeServiceTask(base.MasakariTask):
def __init__(self, novaclient):
@ -95,72 +99,119 @@ class EvacuateInstancesTask(base.MasakariTask):
requires=requires)
self.novaclient = novaclient
def _get_state_and_host_of_instance(self, context, instance):
new_instance = self.novaclient.get_server(context, instance.id)
instance_host = getattr(new_instance,
"OS-EXT-SRV-ATTR:hypervisor_hostname")
old_vm_state = getattr(instance, "OS-EXT-STS:vm_state")
new_vm_state = getattr(new_instance, "OS-EXT-STS:vm_state")
return (old_vm_state, new_vm_state, instance_host)
def _stop_after_evacuation(self, context, instance):
def _wait_for_stop_confirmation():
old_vm_state, new_vm_state, _ = (
self._get_state_and_host_of_instance(context, instance))
if new_vm_state == 'stopped':
raise loopingcall.LoopingCallDone()
periodic_call_stopped = loopingcall.FixedIntervalLoopingCall(
_wait_for_stop_confirmation)
try:
self.novaclient.stop_server(context, instance.id)
# confirm instance is stopped after recovery
periodic_call_stopped.start(interval=CONF.verify_interval)
etimeout.with_timeout(
CONF.wait_period_after_power_off,
periodic_call_stopped.wait)
except etimeout.Timeout:
with excutils.save_and_reraise_exception():
msg = ("Instance '%s' is successfully evacuated but "
"failed to stop.")
LOG.warning(msg, instance.id)
finally:
periodic_call_stopped.stop()
def _evacuate_and_confirm(self, context, instance, host_name,
failed_evacuation_instances, reserved_host=None):
vm_state = getattr(instance, "OS-EXT-STS:vm_state")
if vm_state in ['active', 'error', 'resized', 'stopped']:
# Before locking the instance check whether it is already locked
# by user, if yes don't lock the instance
instance_already_locked = self.novaclient.get_server(
context, instance.id).locked
# Before locking the instance check whether it is already locked
# by user, if yes don't lock the instance
instance_already_locked = self.novaclient.get_server(
context, instance.id).locked
if not instance_already_locked:
# lock the instance so that until evacuation and confirmation
# is not complete, user won't be able to perform any actions
# on the instance.
self.novaclient.lock_server(context, instance.id)
if not instance_already_locked:
# lock the instance so that until evacuation and confirmation
# is not complete, user won't be able to perform any actions
# on the instance.
self.novaclient.lock_server(context, instance.id)
def _wait_for_evacuation_confirmation():
old_vm_state, new_vm_state, instance_host = (
self._get_state_and_host_of_instance(context, instance))
def _wait_for_evacuation():
new_instance = self.novaclient.get_server(context, instance.id)
instance_host = getattr(new_instance,
"OS-EXT-SRV-ATTR:hypervisor_hostname")
old_vm_state = getattr(instance, "OS-EXT-STS:vm_state")
new_vm_state = getattr(new_instance, "OS-EXT-STS:vm_state")
if instance_host != host_name:
if ((old_vm_state == 'error' and
new_vm_state == 'active') or
old_vm_state == new_vm_state):
raise loopingcall.LoopingCallDone()
if instance_host != host_name:
if ((old_vm_state == 'error' and
new_vm_state == 'active') or
old_vm_state == new_vm_state):
raise loopingcall.LoopingCallDone()
try:
vm_state = getattr(instance, "OS-EXT-STS:vm_state")
# Nova evacuates an instance only when vm_state is in active,
# stopped or error state. If an instance is in other than active,
# error and stopped vm_state, masakari resets the instance state
# to *error* so that the instance can be evacuated.
stop_instance = True
if vm_state not in ['active', 'error', 'stopped']:
self.novaclient.reset_instance_state(context, instance.id)
instance = self.novaclient.get_server(context, instance.id)
power_state = getattr(instance, "OS-EXT-STS:power_state")
if vm_state == 'resized' and power_state != SHUTDOWN:
stop_instance = False
vm_state = getattr(instance, "OS-EXT-STS:vm_state")
# evacuate the instance
self.novaclient.evacuate_instance(
context, instance.id,
target=reserved_host.name if reserved_host else None)
periodic_call = loopingcall.FixedIntervalLoopingCall(
_wait_for_evacuation_confirmation)
try:
# Nova evacuates an instance only when vm_state is in active,
# stopped or error state. If an instance is in resized
# vm_state, masakari resets the instance state to *error* so
# that the instance can be evacuated.
if vm_state == 'resized':
self.novaclient.reset_instance_state(context, instance.id)
# add a timeout to the periodic call.
periodic_call.start(interval=CONF.verify_interval)
etimeout.with_timeout(
CONF.wait_period_after_evacuation,
periodic_call.wait)
# evacuate the instance
self.novaclient.evacuate_instance(
context, instance.id,
target=reserved_host.name if reserved_host else None)
periodic_call = loopingcall.FixedIntervalLoopingCall(
_wait_for_evacuation)
try:
# add a timeout to the periodic call.
periodic_call.start(interval=CONF.verify_interval)
etimeout.with_timeout(
CONF.wait_period_after_evacuation,
periodic_call.wait)
except etimeout.Timeout:
# Instance is not evacuated in the expected time_limit.
failed_evacuation_instances.append(instance.id)
finally:
# stop the periodic call, in case of exceptions or
# Timeout.
periodic_call.stop()
except Exception:
# Exception is raised while resetting instance state or
# evacuating the instance itself.
if vm_state not in ['active', 'stopped']:
if stop_instance:
self._stop_after_evacuation(context, instance)
# If the instance was in 'error' state before failure
# it should be set to 'error' after recovery.
if vm_state == 'error':
self.novaclient.reset_instance_state(
context, instance.id)
except etimeout.Timeout:
# Instance is not evacuated in the expected time_limit.
failed_evacuation_instances.append(instance.id)
finally:
if not instance_already_locked:
# Unlock the server after evacuation and confirmation
self.novaclient.unlock_server(context, instance.id)
# stop the periodic call, in case of exceptions or
# Timeout.
periodic_call.stop()
except Exception:
# Exception is raised while resetting instance state or
# evacuating the instance itself.
failed_evacuation_instances.append(instance.id)
finally:
if not instance_already_locked:
# Unlock the server after evacuation and confirmation
self.novaclient.unlock_server(context, instance.id)
def execute(self, context, host_name, instance_list, reserved_host=None):
def _do_evacuate(context, host_name, instance_list,

View File

@ -18,6 +18,7 @@ Unit Tests for host failure TaskFlow
"""
import copy
import ddt
import mock
from masakari.compute import nova
@ -32,6 +33,7 @@ from masakari.tests.unit import fakes
CONF = conf.CONF
@ddt.ddt
@mock.patch.object(nova.API, "enable_disable_service")
@mock.patch.object(nova.API, "lock_server")
@mock.patch.object(nova.API, "unlock_server")
@ -55,8 +57,8 @@ class HostFailureTestCase(test.TestCase):
for server in self.novaclient.get_servers(self.ctxt,
self.instance_host):
instance = self.novaclient.get_server(self.ctxt, server.id)
self.assertEqual('active',
getattr(instance, 'OS-EXT-STS:vm_state'))
self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'),
['active', 'stopped', 'error'])
self.assertNotEqual(self.instance_host,
getattr(instance,
'OS-EXT-SRV-ATTR:hypervisor_hostname'))
@ -155,22 +157,47 @@ class HostFailureTestCase(test.TestCase):
self.assertIn(reserved_host.name,
self.fake_client.aggregates.get('1').hosts)
@ddt.data('active', 'rescued', 'paused', 'shelved', 'suspended',
'error', 'stopped', 'resized')
@mock.patch('masakari.compute.nova.novaclient')
def test_evacuate_instances_task(self, _mock_novaclient, mock_unlock,
mock_lock, mock_enable_disable):
def test_host_failure_flow_all_instances(
self, vm_state, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
# create test data
# create ha_enabled test data
power_state = 4 if vm_state == 'resized' else None
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state="error", ha_enabled=True)
vm_state=vm_state,
power_state=power_state,
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="error", ha_enabled=True)
vm_state=vm_state,
power_state=power_state,
ha_enabled=True)
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute DisableComputeServiceTask
self._test_disable_compute_service(mock_enable_disable)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list()
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_all_instances_active_resized_instance(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state='resized',
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state='resized',
ha_enabled=True)
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@ -219,66 +246,6 @@ class HostFailureTestCase(test.TestCase):
exception.HostRecoveryFailureException,
self._evacuate_instances, instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_resized_instance(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state="resized",
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="resized",
ha_enabled=True)
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_shutdown_instance(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state="stopped",
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="stopped",
ha_enabled=True)
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_instance_in_error(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state="error",
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state="error",
ha_enabled=True)
instance_list = {
"instance_list": self.fake_client.servers.list()
}
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_no_instances_on_host(
self, _mock_novaclient, mock_unlock, mock_lock,

View File

@ -24,12 +24,13 @@ NOW = timeutils.utcnow().replace(microsecond=0)
class FakeNovaClient(object):
class Server(object):
def __init__(self, id=None, uuid=None, host=None, vm_state=None,
ha_enabled=None, locked=False):
power_state=1, ha_enabled=None, locked=False):
self.id = id
self.uuid = uuid or uuidutils.generate_uuid()
self.host = host
setattr(self, 'OS-EXT-SRV-ATTR:hypervisor_hostname', host)
setattr(self, 'OS-EXT-STS:vm_state', vm_state)
setattr(self, 'OS-EXT-STS:power_state', power_state)
self.metadata = {"HA_Enabled": ha_enabled}
self.locked = locked
@ -38,9 +39,10 @@ class FakeNovaClient(object):
self._servers = []
def create(self, id, uuid=None, host=None, vm_state='active',
ha_enabled=False):
power_state=1, ha_enabled=False):
server = FakeNovaClient.Server(id=id, uuid=uuid, host=host,
vm_state=vm_state,
power_state=power_state,
ha_enabled=ha_enabled)
self._servers.append(server)
return server
@ -69,9 +71,12 @@ class FakeNovaClient(object):
if not host:
host = 'fake-host-1'
server = self.get(uuid)
# pretending that instance is evacuated successfully on given host
setattr(server, 'OS-EXT-SRV-ATTR:hypervisor_hostname', host)
setattr(server, 'OS-EXT-STS:vm_state', 'active')
# pretending that instance is evacuated successfully on given host
if getattr(server, "OS-EXT-STS:vm_state") == 'active':
setattr(server, 'OS-EXT-STS:vm_state', 'active')
else:
setattr(server, 'OS-EXT-STS:vm_state', 'stopped')
def stop(self, id):
server = self.get(id)