From 4173aaf0396890ba47ce9c34aa38bf87945e9144 Mon Sep 17 00:00:00 2001 From: dineshbhor Date: Fri, 26 May 2017 16:00:53 +0530 Subject: [PATCH] Make provision to evacuate all instances As of now host failure workflow was evacuating instances which were having vm_state as active, stopped, error and resize. It was ignoring other vm_states such as shelved, rescued, paused and suspended. Made provision to evacuate instances which are having vm_states such as shelved, rescued, paused and suspended by changing its vm_state to error and after evacuating those instances will be stopped. NOTE: On master if the instance is in error or resized state then after recovery it was becoming active. With this patch error instances will be stopped and then set to error after recovery. For resized instance if it's previous power_state is 4(SHUTDOWN) then we can say that before failure the instance was is stopped state and then it was resized so masakari will stop that instance to maintain consistency of instance states as the instance was not fully resized(resize operation was not confirmed). Resized instance which was in active state before failure will become active again after recovery. Closes-Bug: #1693731 Closes-Bug: #1692435 Closes-Bug: #1690995 Closes-Bug: #1690768 Change-Id: I134e8b6ee7315935bd8ce418ef6241be0b9450b3 --- .../engine/drivers/taskflow/host_failure.py | 161 ++++++++++++------ .../taskflow/test_host_failure_flow.py | 109 +++++------- masakari/tests/unit/fakes.py | 13 +- 3 files changed, 153 insertions(+), 130 deletions(-) diff --git a/masakari/engine/drivers/taskflow/host_failure.py b/masakari/engine/drivers/taskflow/host_failure.py index 3b1e003a..9f440ccf 100644 --- a/masakari/engine/drivers/taskflow/host_failure.py +++ b/masakari/engine/drivers/taskflow/host_failure.py @@ -19,6 +19,7 @@ from eventlet import timeout as etimeout from oslo_log import log as logging from oslo_service import loopingcall +from oslo_utils import excutils from oslo_utils import strutils import taskflow.engines from taskflow.patterns import linear_flow @@ -37,6 +38,9 @@ LOG = logging.getLogger(__name__) ACTION = 'instance:evacuate' +# Instance power_state +SHUTDOWN = 4 + class DisableComputeServiceTask(base.MasakariTask): def __init__(self, novaclient): @@ -95,72 +99,119 @@ class EvacuateInstancesTask(base.MasakariTask): requires=requires) self.novaclient = novaclient + def _get_state_and_host_of_instance(self, context, instance): + new_instance = self.novaclient.get_server(context, instance.id) + instance_host = getattr(new_instance, + "OS-EXT-SRV-ATTR:hypervisor_hostname") + old_vm_state = getattr(instance, "OS-EXT-STS:vm_state") + new_vm_state = getattr(new_instance, "OS-EXT-STS:vm_state") + + return (old_vm_state, new_vm_state, instance_host) + + def _stop_after_evacuation(self, context, instance): + def _wait_for_stop_confirmation(): + old_vm_state, new_vm_state, _ = ( + self._get_state_and_host_of_instance(context, instance)) + + if new_vm_state == 'stopped': + raise loopingcall.LoopingCallDone() + + periodic_call_stopped = loopingcall.FixedIntervalLoopingCall( + _wait_for_stop_confirmation) + + try: + self.novaclient.stop_server(context, instance.id) + # confirm instance is stopped after recovery + periodic_call_stopped.start(interval=CONF.verify_interval) + etimeout.with_timeout( + CONF.wait_period_after_power_off, + periodic_call_stopped.wait) + except etimeout.Timeout: + with excutils.save_and_reraise_exception(): + msg = ("Instance '%s' is successfully evacuated but " + "failed to stop.") + LOG.warning(msg, instance.id) + finally: + periodic_call_stopped.stop() + def _evacuate_and_confirm(self, context, instance, host_name, failed_evacuation_instances, reserved_host=None): - vm_state = getattr(instance, "OS-EXT-STS:vm_state") - if vm_state in ['active', 'error', 'resized', 'stopped']: + # Before locking the instance check whether it is already locked + # by user, if yes don't lock the instance + instance_already_locked = self.novaclient.get_server( + context, instance.id).locked - # Before locking the instance check whether it is already locked - # by user, if yes don't lock the instance - instance_already_locked = self.novaclient.get_server( - context, instance.id).locked + if not instance_already_locked: + # lock the instance so that until evacuation and confirmation + # is not complete, user won't be able to perform any actions + # on the instance. + self.novaclient.lock_server(context, instance.id) - if not instance_already_locked: - # lock the instance so that until evacuation and confirmation - # is not complete, user won't be able to perform any actions - # on the instance. - self.novaclient.lock_server(context, instance.id) + def _wait_for_evacuation_confirmation(): + old_vm_state, new_vm_state, instance_host = ( + self._get_state_and_host_of_instance(context, instance)) - def _wait_for_evacuation(): - new_instance = self.novaclient.get_server(context, instance.id) - instance_host = getattr(new_instance, - "OS-EXT-SRV-ATTR:hypervisor_hostname") - old_vm_state = getattr(instance, "OS-EXT-STS:vm_state") - new_vm_state = getattr(new_instance, "OS-EXT-STS:vm_state") + if instance_host != host_name: + if ((old_vm_state == 'error' and + new_vm_state == 'active') or + old_vm_state == new_vm_state): + raise loopingcall.LoopingCallDone() - if instance_host != host_name: - if ((old_vm_state == 'error' and - new_vm_state == 'active') or - old_vm_state == new_vm_state): - raise loopingcall.LoopingCallDone() + try: + vm_state = getattr(instance, "OS-EXT-STS:vm_state") + + # Nova evacuates an instance only when vm_state is in active, + # stopped or error state. If an instance is in other than active, + # error and stopped vm_state, masakari resets the instance state + # to *error* so that the instance can be evacuated. + stop_instance = True + if vm_state not in ['active', 'error', 'stopped']: + self.novaclient.reset_instance_state(context, instance.id) + instance = self.novaclient.get_server(context, instance.id) + power_state = getattr(instance, "OS-EXT-STS:power_state") + if vm_state == 'resized' and power_state != SHUTDOWN: + stop_instance = False + + vm_state = getattr(instance, "OS-EXT-STS:vm_state") + + # evacuate the instance + self.novaclient.evacuate_instance( + context, instance.id, + target=reserved_host.name if reserved_host else None) + + periodic_call = loopingcall.FixedIntervalLoopingCall( + _wait_for_evacuation_confirmation) try: - # Nova evacuates an instance only when vm_state is in active, - # stopped or error state. If an instance is in resized - # vm_state, masakari resets the instance state to *error* so - # that the instance can be evacuated. - if vm_state == 'resized': - self.novaclient.reset_instance_state(context, instance.id) + # add a timeout to the periodic call. + periodic_call.start(interval=CONF.verify_interval) + etimeout.with_timeout( + CONF.wait_period_after_evacuation, + periodic_call.wait) - # evacuate the instance - self.novaclient.evacuate_instance( - context, instance.id, - target=reserved_host.name if reserved_host else None) - - periodic_call = loopingcall.FixedIntervalLoopingCall( - _wait_for_evacuation) - - try: - # add a timeout to the periodic call. - periodic_call.start(interval=CONF.verify_interval) - etimeout.with_timeout( - CONF.wait_period_after_evacuation, - periodic_call.wait) - except etimeout.Timeout: - # Instance is not evacuated in the expected time_limit. - failed_evacuation_instances.append(instance.id) - finally: - # stop the periodic call, in case of exceptions or - # Timeout. - periodic_call.stop() - except Exception: - # Exception is raised while resetting instance state or - # evacuating the instance itself. + if vm_state not in ['active', 'stopped']: + if stop_instance: + self._stop_after_evacuation(context, instance) + # If the instance was in 'error' state before failure + # it should be set to 'error' after recovery. + if vm_state == 'error': + self.novaclient.reset_instance_state( + context, instance.id) + except etimeout.Timeout: + # Instance is not evacuated in the expected time_limit. failed_evacuation_instances.append(instance.id) finally: - if not instance_already_locked: - # Unlock the server after evacuation and confirmation - self.novaclient.unlock_server(context, instance.id) + # stop the periodic call, in case of exceptions or + # Timeout. + periodic_call.stop() + except Exception: + # Exception is raised while resetting instance state or + # evacuating the instance itself. + failed_evacuation_instances.append(instance.id) + finally: + if not instance_already_locked: + # Unlock the server after evacuation and confirmation + self.novaclient.unlock_server(context, instance.id) def execute(self, context, host_name, instance_list, reserved_host=None): def _do_evacuate(context, host_name, instance_list, diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py index d17e2434..45172bcd 100644 --- a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py +++ b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py @@ -18,6 +18,7 @@ Unit Tests for host failure TaskFlow """ import copy +import ddt import mock from masakari.compute import nova @@ -32,6 +33,7 @@ from masakari.tests.unit import fakes CONF = conf.CONF +@ddt.ddt @mock.patch.object(nova.API, "enable_disable_service") @mock.patch.object(nova.API, "lock_server") @mock.patch.object(nova.API, "unlock_server") @@ -55,8 +57,8 @@ class HostFailureTestCase(test.TestCase): for server in self.novaclient.get_servers(self.ctxt, self.instance_host): instance = self.novaclient.get_server(self.ctxt, server.id) - self.assertEqual('active', - getattr(instance, 'OS-EXT-STS:vm_state')) + self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'), + ['active', 'stopped', 'error']) self.assertNotEqual(self.instance_host, getattr(instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname')) @@ -155,22 +157,47 @@ class HostFailureTestCase(test.TestCase): self.assertIn(reserved_host.name, self.fake_client.aggregates.get('1').hosts) + @ddt.data('active', 'rescued', 'paused', 'shelved', 'suspended', + 'error', 'stopped', 'resized') @mock.patch('masakari.compute.nova.novaclient') - def test_evacuate_instances_task(self, _mock_novaclient, mock_unlock, - mock_lock, mock_enable_disable): + def test_host_failure_flow_all_instances( + self, vm_state, _mock_novaclient, mock_unlock, mock_lock, + mock_enable_disable): _mock_novaclient.return_value = self.fake_client - # create test data + # create ha_enabled test data + power_state = 4 if vm_state == 'resized' else None self.fake_client.servers.create(id="1", host=self.instance_host, - vm_state="error", ha_enabled=True) + vm_state=vm_state, + power_state=power_state, + ha_enabled=True) self.fake_client.servers.create(id="2", host=self.instance_host, - vm_state="error", ha_enabled=True) + vm_state=vm_state, + power_state=power_state, + ha_enabled=True) + instance_list = { + "instance_list": self.fake_client.servers.list() + } - # execute DisableComputeServiceTask - self._test_disable_compute_service(mock_enable_disable) + # execute EvacuateInstancesTask + self._evacuate_instances(instance_list, mock_enable_disable) - # execute PrepareHAEnabledInstancesTask - instance_list = self._test_instance_list() + @mock.patch('masakari.compute.nova.novaclient') + def test_host_failure_flow_all_instances_active_resized_instance( + self, _mock_novaclient, mock_unlock, mock_lock, + mock_enable_disable): + _mock_novaclient.return_value = self.fake_client + + # create ha_enabled test data + self.fake_client.servers.create(id="1", host=self.instance_host, + vm_state='resized', + ha_enabled=True) + self.fake_client.servers.create(id="2", host=self.instance_host, + vm_state='resized', + ha_enabled=True) + instance_list = { + "instance_list": self.fake_client.servers.list() + } # execute EvacuateInstancesTask self._evacuate_instances(instance_list, mock_enable_disable) @@ -219,66 +246,6 @@ class HostFailureTestCase(test.TestCase): exception.HostRecoveryFailureException, self._evacuate_instances, instance_list, mock_enable_disable) - @mock.patch('masakari.compute.nova.novaclient') - def test_host_failure_flow_resized_instance( - self, _mock_novaclient, mock_unlock, mock_lock, - mock_enable_disable): - _mock_novaclient.return_value = self.fake_client - - # create ha_enabled test data - self.fake_client.servers.create(id="1", host=self.instance_host, - vm_state="resized", - ha_enabled=True) - self.fake_client.servers.create(id="2", host=self.instance_host, - vm_state="resized", - ha_enabled=True) - instance_list = { - "instance_list": self.fake_client.servers.list() - } - - # execute EvacuateInstancesTask - self._evacuate_instances(instance_list, mock_enable_disable) - - @mock.patch('masakari.compute.nova.novaclient') - def test_host_failure_flow_shutdown_instance( - self, _mock_novaclient, mock_unlock, mock_lock, - mock_enable_disable): - _mock_novaclient.return_value = self.fake_client - - # create ha_enabled test data - self.fake_client.servers.create(id="1", host=self.instance_host, - vm_state="stopped", - ha_enabled=True) - self.fake_client.servers.create(id="2", host=self.instance_host, - vm_state="stopped", - ha_enabled=True) - instance_list = { - "instance_list": self.fake_client.servers.list() - } - - # execute EvacuateInstancesTask - self._evacuate_instances(instance_list, mock_enable_disable) - - @mock.patch('masakari.compute.nova.novaclient') - def test_host_failure_flow_instance_in_error( - self, _mock_novaclient, mock_unlock, mock_lock, - mock_enable_disable): - _mock_novaclient.return_value = self.fake_client - - # create ha_enabled test data - self.fake_client.servers.create(id="1", host=self.instance_host, - vm_state="error", - ha_enabled=True) - self.fake_client.servers.create(id="2", host=self.instance_host, - vm_state="error", - ha_enabled=True) - instance_list = { - "instance_list": self.fake_client.servers.list() - } - - # execute EvacuateInstancesTask - self._evacuate_instances(instance_list, mock_enable_disable) - @mock.patch('masakari.compute.nova.novaclient') def test_host_failure_flow_no_instances_on_host( self, _mock_novaclient, mock_unlock, mock_lock, diff --git a/masakari/tests/unit/fakes.py b/masakari/tests/unit/fakes.py index 4cc7897e..7fd4907c 100644 --- a/masakari/tests/unit/fakes.py +++ b/masakari/tests/unit/fakes.py @@ -24,12 +24,13 @@ NOW = timeutils.utcnow().replace(microsecond=0) class FakeNovaClient(object): class Server(object): def __init__(self, id=None, uuid=None, host=None, vm_state=None, - ha_enabled=None, locked=False): + power_state=1, ha_enabled=None, locked=False): self.id = id self.uuid = uuid or uuidutils.generate_uuid() self.host = host setattr(self, 'OS-EXT-SRV-ATTR:hypervisor_hostname', host) setattr(self, 'OS-EXT-STS:vm_state', vm_state) + setattr(self, 'OS-EXT-STS:power_state', power_state) self.metadata = {"HA_Enabled": ha_enabled} self.locked = locked @@ -38,9 +39,10 @@ class FakeNovaClient(object): self._servers = [] def create(self, id, uuid=None, host=None, vm_state='active', - ha_enabled=False): + power_state=1, ha_enabled=False): server = FakeNovaClient.Server(id=id, uuid=uuid, host=host, vm_state=vm_state, + power_state=power_state, ha_enabled=ha_enabled) self._servers.append(server) return server @@ -69,9 +71,12 @@ class FakeNovaClient(object): if not host: host = 'fake-host-1' server = self.get(uuid) - # pretending that instance is evacuated successfully on given host setattr(server, 'OS-EXT-SRV-ATTR:hypervisor_hostname', host) - setattr(server, 'OS-EXT-STS:vm_state', 'active') + # pretending that instance is evacuated successfully on given host + if getattr(server, "OS-EXT-STS:vm_state") == 'active': + setattr(server, 'OS-EXT-STS:vm_state', 'active') + else: + setattr(server, 'OS-EXT-STS:vm_state', 'stopped') def stop(self, id): server = self.get(id)