diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d3504b8fccf3..c14476764629 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -7541,6 +7541,53 @@ class ComputeManager(manager.Manager): # silently ignore. pass + def _stop_unexpected_shutdown_instance(self, context, vm_state, + db_instance, orig_db_power_state, + vm_power_state): + # this is an exceptional case; make sure our data is up + # to date before slamming through a power off + # TODO(jroll) remove the check for TypeError here; + # it's only here to be able to backport this without + # breaking out-of-tree drivers. + try: + vm_instance = self.driver.get_info(db_instance, + use_cache=False) + vm_power_state = vm_instance.state + except TypeError: + LOG.warning("Your virt driver appears to not support the " + "'use_cache' parameter to the 'get_info' method; " + "please update your virt driver.") + + # if it still looks off, go ahead and call stop() + if vm_power_state in (power_state.SHUTDOWN, + power_state.CRASHED): + + LOG.warning("Instance shutdown by itself. Calling the " + "stop API. Current vm_state: %(vm_state)s, " + "current task_state: %(task_state)s, " + "original DB power_state: %(db_power_state)s, " + "current VM power_state: %(vm_power_state)s", + {'vm_state': vm_state, + 'task_state': db_instance.task_state, + 'db_power_state': orig_db_power_state, + 'vm_power_state': vm_power_state}, + instance=db_instance) + try: + # Note(maoy): here we call the API instead of + # brutally updating the vm_state in the database + # to allow all the hooks and checks to be performed. + if db_instance.shutdown_terminate: + self.compute_api.delete(context, db_instance) + else: + self.compute_api.stop(context, db_instance) + except Exception: + # Note(maoy): there is no need to propagate the error + # because the same power_state will be retrieved next + # time and retried. + # For example, there might be another task scheduled. + LOG.exception("error during stop() in sync_power_state.", + instance=db_instance) + def _sync_instance_power_state(self, context, db_instance, vm_power_state, use_slave=False): """Align instance power state between the database and hypervisor. @@ -7610,31 +7657,9 @@ class ComputeManager(manager.Manager): # The only rational power state should be RUNNING if vm_power_state in (power_state.SHUTDOWN, power_state.CRASHED): - LOG.warning("Instance shutdown by itself. Calling the " - "stop API. Current vm_state: %(vm_state)s, " - "current task_state: %(task_state)s, " - "original DB power_state: %(db_power_state)s, " - "current VM power_state: %(vm_power_state)s", - {'vm_state': vm_state, - 'task_state': db_instance.task_state, - 'db_power_state': orig_db_power_state, - 'vm_power_state': vm_power_state}, - instance=db_instance) - try: - # Note(maoy): here we call the API instead of - # brutally updating the vm_state in the database - # to allow all the hooks and checks to be performed. - if db_instance.shutdown_terminate: - self.compute_api.delete(context, db_instance) - else: - self.compute_api.stop(context, db_instance) - except Exception: - # Note(maoy): there is no need to propagate the error - # because the same power_state will be retrieved next - # time and retried. - # For example, there might be another task scheduled. - LOG.exception("error during stop() in sync_power_state.", - instance=db_instance) + self._stop_unexpected_shutdown_instance( + context, vm_state, db_instance, orig_db_power_state, + vm_power_state) elif vm_power_state == power_state.SUSPENDED: LOG.warning("Instance is suspended unexpectedly. Calling " "the stop API.", instance=db_instance) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 327afa91108e..492b7a6dd64d 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -1791,10 +1791,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, power_state.RUNNING) mock_refresh.assert_called_once_with(use_slave=False) + @mock.patch.object(fake_driver.FakeDriver, 'get_info') @mock.patch.object(objects.Instance, 'refresh') @mock.patch.object(objects.Instance, 'save') def test_sync_instance_power_state_running_stopped(self, mock_save, - mock_refresh): + mock_refresh, + mock_get_info): + mock_get_info.return_value = hardware.InstanceInfo( + state=power_state.SHUTDOWN) instance = self._get_sync_instance(power_state.RUNNING, vm_states.ACTIVE) self.compute._sync_instance_power_state(self.context, instance, @@ -1802,11 +1806,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.assertEqual(instance.power_state, power_state.SHUTDOWN) mock_refresh.assert_called_once_with(use_slave=False) self.assertTrue(mock_save.called) + mock_get_info.assert_called_once_with(instance, use_cache=False) - def _test_sync_to_stop(self, power_state, vm_state, driver_power_state, + def _test_sync_to_stop(self, vm_power_state, vm_state, driver_power_state, stop=True, force=False, shutdown_terminate=False): instance = self._get_sync_instance( - power_state, vm_state, shutdown_terminate=shutdown_terminate) + vm_power_state, vm_state, shutdown_terminate=shutdown_terminate) with test.nested( mock.patch.object(objects.Instance, 'refresh'), @@ -1814,7 +1819,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, mock.patch.object(self.compute.compute_api, 'stop'), mock.patch.object(self.compute.compute_api, 'delete'), mock.patch.object(self.compute.compute_api, 'force_stop'), - ) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force): + mock.patch.object(self.compute.driver, 'get_info') + ) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force, + mock_get_info): + mock_get_info.return_value = hardware.InstanceInfo( + state=driver_power_state) + self.compute._sync_instance_power_state(self.context, instance, driver_power_state) if shutdown_terminate: @@ -1824,6 +1834,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, mock_force.assert_called_once_with(self.context, instance) else: mock_stop.assert_called_once_with(self.context, instance) + if (vm_state == vm_states.ACTIVE and + vm_power_state in (power_state.SHUTDOWN, + power_state.CRASHED)): + mock_get_info.assert_called_once_with(instance, + use_cache=False) mock_refresh.assert_called_once_with(use_slave=False) self.assertTrue(mock_save.called) diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 976474cd89fc..77358432edc5 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -1100,6 +1100,34 @@ class IronicDriverTestCase(test.NoDBTestCase): mock_gbiu.assert_called_once_with(instance.uuid, fields=ironic_driver._NODE_FIELDS) + @mock.patch.object(ironic_driver.IronicDriver, '_get_node_list') + @mock.patch.object(objects.InstanceList, 'get_uuids_by_host') + @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type') + @mock.patch.object(FAKE_CLIENT.node, 'get_by_instance_uuid') + def test_get_info_skip_cache(self, mock_gbiu, mock_svc_by_hv, + mock_uuids_by_host, mock_get_node_list): + properties = {'memory_mb': 512, 'cpus': 2} + power_state = ironic_states.POWER_ON + node = _get_cached_node( + instance_uuid=self.instance_uuid, properties=properties, + power_state=power_state) + + mock_gbiu.return_value = node + mock_svc_by_hv.return_value = [] + mock_get_node_list.return_value = [node] + + # ironic_states.POWER_ON should be mapped to + # nova_states.RUNNING + instance = fake_instance.fake_instance_obj('fake-context', + uuid=self.instance_uuid) + mock_uuids_by_host.return_value = [instance.uuid] + result = self.driver.get_info(instance, use_cache=False) + self.assertEqual(hardware.InstanceInfo(state=nova_states.RUNNING), + result) + # verify we hit the ironic API for fresh data + mock_gbiu.assert_called_once_with(instance.uuid, + fields=ironic_driver._NODE_FIELDS) + @mock.patch.object(ironic_driver.IronicDriver, '_get_node_list') @mock.patch.object(objects.InstanceList, 'get_uuids_by_host') @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type') diff --git a/nova/virt/driver.py b/nova/virt/driver.py index f63652cb7515..6e939cf24c0d 100644 --- a/nova/virt/driver.py +++ b/nova/virt/driver.py @@ -157,10 +157,15 @@ class ComputeDriver(object): """ pass - def get_info(self, instance): + def get_info(self, instance, use_cache=True): """Get the current status of an instance. :param instance: nova.objects.instance.Instance object + :param use_cache: boolean to indicate if the driver should be allowed + to use cached data to return instance status. + This only applies to drivers which cache instance + state information. For drivers that do not use a + cache, this parameter can be ignored. :returns: An InstanceInfo object """ # TODO(Vek): Need to pass context in for access to auth_token diff --git a/nova/virt/fake.py b/nova/virt/fake.py index 6b0f59b11783..4f3abe9a5b91 100644 --- a/nova/virt/fake.py +++ b/nova/virt/fake.py @@ -350,7 +350,7 @@ class FakeDriver(driver.ComputeDriver): raise exception.InterfaceDetachFailed( instance_uuid=instance.uuid) - def get_info(self, instance): + def get_info(self, instance, use_cache=True): if instance.uuid not in self.instances: raise exception.InstanceNotFound(instance_id=instance.uuid) i = self.instances[instance.uuid] diff --git a/nova/virt/hyperv/driver.py b/nova/virt/hyperv/driver.py index a228bbd1cee8..f91f8d724285 100644 --- a/nova/virt/hyperv/driver.py +++ b/nova/virt/hyperv/driver.py @@ -176,7 +176,7 @@ class HyperVDriver(driver.ComputeDriver): """Cleanup after instance being destroyed by Hypervisor.""" self.unplug_vifs(instance, network_info) - def get_info(self, instance): + def get_info(self, instance, use_cache=True): return self._vmops.get_info(instance) def attach_volume(self, context, connection_info, instance, mountpoint, diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 4aecc864d893..15f95f0c6ef0 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -877,15 +877,30 @@ class IronicDriver(virt_driver.ComputeDriver): return node return _sync_node_from_cache() - def get_info(self, instance): + def get_info(self, instance, use_cache=True): """Get the current state and resource usage for this instance. If the instance is not found this method returns (a dictionary with) NOSTATE and all resources == 0. :param instance: the instance object. + :param use_cache: boolean to indicate if the driver should be allowed + to use cached data to return instance status. + If false, pull fresh data from ironic. :returns: an InstanceInfo object """ + def _fetch_from_ironic(self, instance): + try: + node = self._validate_instance_and_node(instance) + return hardware.InstanceInfo( + state=map_power_state(node.power_state)) + except exception.InstanceNotFound: + return hardware.InstanceInfo( + state=map_power_state(ironic_states.NOSTATE)) + + if not use_cache: + return _fetch_from_ironic(self, instance) + # we should already have a cache for our nodes, refreshed on every # RT loop. but if we don't have a cache, generate it. if not self.node_cache: @@ -896,11 +911,7 @@ class IronicDriver(virt_driver.ComputeDriver): break else: # if we can't find the instance, fall back to ironic - try: - node = self._validate_instance_and_node(instance) - except exception.InstanceNotFound: - return hardware.InstanceInfo( - state=map_power_state(ironic_states.NOSTATE)) + return _fetch_from_ironic(self, instance) return hardware.InstanceInfo(state=map_power_state(node.power_state)) diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 6d1603373ed0..9b1a10cc7ac5 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -5525,7 +5525,7 @@ class LibvirtDriver(driver.ComputeDriver): {'xml': xml}, instance=instance) return xml - def get_info(self, instance): + def get_info(self, instance, use_cache=True): """Retrieve information from libvirt for a specific instance. If a libvirt error is encountered during lookup, we might raise a @@ -5533,6 +5533,7 @@ class LibvirtDriver(driver.ComputeDriver): libvirt error is. :param instance: nova.objects.instance.Instance object + :param use_cache: unused in this driver :returns: An InstanceInfo object """ guest = self._host.get_guest(instance) diff --git a/nova/virt/powervm/driver.py b/nova/virt/powervm/driver.py index c3b3939dda79..3f65ba8608b9 100644 --- a/nova/virt/powervm/driver.py +++ b/nova/virt/powervm/driver.py @@ -116,10 +116,11 @@ class PowerVMDriver(driver.ComputeDriver): {'op': op, 'display_name': instance.display_name, 'name': instance.name}, instance=instance) - def get_info(self, instance): + def get_info(self, instance, use_cache=True): """Get the current status of an instance. :param instance: nova.objects.instance.Instance object + :param use_cache: unused in this driver :returns: An InstanceInfo object. """ return vm.get_vm_info(self.adapter, instance) diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py index dba10d2e2097..7ec719d347e6 100644 --- a/nova/virt/vmwareapi/driver.py +++ b/nova/virt/vmwareapi/driver.py @@ -583,7 +583,7 @@ class VMwareVCDriver(driver.ComputeDriver): """Poll for rebooting instances.""" self._vmops.poll_rebooting_instances(timeout, instances) - def get_info(self, instance): + def get_info(self, instance, use_cache=True): """Return info about the VM instance.""" return self._vmops.get_info(instance) diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py index 8ae3de362195..dbe19252e20d 100644 --- a/nova/virt/xenapi/driver.py +++ b/nova/virt/xenapi/driver.py @@ -369,7 +369,7 @@ class XenAPIDriver(driver.ComputeDriver): """Unplug VIFs from networks.""" self._vmops.unplug_vifs(instance, network_info) - def get_info(self, instance): + def get_info(self, instance, use_cache=True): """Return data about VM instance.""" return self._vmops.get_info(instance) diff --git a/nova/virt/zvm/driver.py b/nova/virt/zvm/driver.py index 8405f276bc4d..2d5d5846fc75 100644 --- a/nova/virt/zvm/driver.py +++ b/nova/virt/zvm/driver.py @@ -122,7 +122,7 @@ class ZVMDriver(driver.ComputeDriver): def get_available_nodes(self, refresh=False): return self._hypervisor.get_available_nodes(refresh=refresh) - def get_info(self, instance): + def get_info(self, instance, use_cache=True): _guest = guest.Guest(self._hypervisor, instance) return _guest.get_info() diff --git a/releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml b/releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml new file mode 100644 index 000000000000..f52352506b1e --- /dev/null +++ b/releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixes a race condition that could allow a newly created Ironic + instance to be powered off after deployment, without letting + the user power it back on. +upgrade: + - | + Adds a ``use_cache`` parameter to the virt driver ``get_info`` + method. Out of tree drivers should add support for this + parameter.