ironic: check fresh data when sync_power_state doesn't line up

We return cached data to sync_power_state to avoid pummeling the ironic
API. However, this can lead to a race condition where an instance is
powered on, but nova thinks it should be off and calls stop(). Check
again without the cache when this happens to make sure we don't
unnecessarily kill an instance.

Closes-Bug: #1815791
Change-Id: I907b69eb689cf6c169a4869cfc7889308ca419d5
This commit is contained in:
Jim Rollenhagen 2019-02-13 12:59:53 -05:00
parent 104d79d8dd
commit 19cb828023
13 changed files with 140 additions and 43 deletions

View File

@ -7541,6 +7541,53 @@ class ComputeManager(manager.Manager):
# silently ignore.
pass
def _stop_unexpected_shutdown_instance(self, context, vm_state,
db_instance, orig_db_power_state,
vm_power_state):
# this is an exceptional case; make sure our data is up
# to date before slamming through a power off
# TODO(jroll) remove the check for TypeError here;
# it's only here to be able to backport this without
# breaking out-of-tree drivers.
try:
vm_instance = self.driver.get_info(db_instance,
use_cache=False)
vm_power_state = vm_instance.state
except TypeError:
LOG.warning("Your virt driver appears to not support the "
"'use_cache' parameter to the 'get_info' method; "
"please update your virt driver.")
# if it still looks off, go ahead and call stop()
if vm_power_state in (power_state.SHUTDOWN,
power_state.CRASHED):
LOG.warning("Instance shutdown by itself. Calling the "
"stop API. Current vm_state: %(vm_state)s, "
"current task_state: %(task_state)s, "
"original DB power_state: %(db_power_state)s, "
"current VM power_state: %(vm_power_state)s",
{'vm_state': vm_state,
'task_state': db_instance.task_state,
'db_power_state': orig_db_power_state,
'vm_power_state': vm_power_state},
instance=db_instance)
try:
# Note(maoy): here we call the API instead of
# brutally updating the vm_state in the database
# to allow all the hooks and checks to be performed.
if db_instance.shutdown_terminate:
self.compute_api.delete(context, db_instance)
else:
self.compute_api.stop(context, db_instance)
except Exception:
# Note(maoy): there is no need to propagate the error
# because the same power_state will be retrieved next
# time and retried.
# For example, there might be another task scheduled.
LOG.exception("error during stop() in sync_power_state.",
instance=db_instance)
def _sync_instance_power_state(self, context, db_instance, vm_power_state,
use_slave=False):
"""Align instance power state between the database and hypervisor.
@ -7610,31 +7657,9 @@ class ComputeManager(manager.Manager):
# The only rational power state should be RUNNING
if vm_power_state in (power_state.SHUTDOWN,
power_state.CRASHED):
LOG.warning("Instance shutdown by itself. Calling the "
"stop API. Current vm_state: %(vm_state)s, "
"current task_state: %(task_state)s, "
"original DB power_state: %(db_power_state)s, "
"current VM power_state: %(vm_power_state)s",
{'vm_state': vm_state,
'task_state': db_instance.task_state,
'db_power_state': orig_db_power_state,
'vm_power_state': vm_power_state},
instance=db_instance)
try:
# Note(maoy): here we call the API instead of
# brutally updating the vm_state in the database
# to allow all the hooks and checks to be performed.
if db_instance.shutdown_terminate:
self.compute_api.delete(context, db_instance)
else:
self.compute_api.stop(context, db_instance)
except Exception:
# Note(maoy): there is no need to propagate the error
# because the same power_state will be retrieved next
# time and retried.
# For example, there might be another task scheduled.
LOG.exception("error during stop() in sync_power_state.",
instance=db_instance)
self._stop_unexpected_shutdown_instance(
context, vm_state, db_instance, orig_db_power_state,
vm_power_state)
elif vm_power_state == power_state.SUSPENDED:
LOG.warning("Instance is suspended unexpectedly. Calling "
"the stop API.", instance=db_instance)

View File

@ -1791,10 +1791,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
power_state.RUNNING)
mock_refresh.assert_called_once_with(use_slave=False)
@mock.patch.object(fake_driver.FakeDriver, 'get_info')
@mock.patch.object(objects.Instance, 'refresh')
@mock.patch.object(objects.Instance, 'save')
def test_sync_instance_power_state_running_stopped(self, mock_save,
mock_refresh):
mock_refresh,
mock_get_info):
mock_get_info.return_value = hardware.InstanceInfo(
state=power_state.SHUTDOWN)
instance = self._get_sync_instance(power_state.RUNNING,
vm_states.ACTIVE)
self.compute._sync_instance_power_state(self.context, instance,
@ -1802,11 +1806,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.assertEqual(instance.power_state, power_state.SHUTDOWN)
mock_refresh.assert_called_once_with(use_slave=False)
self.assertTrue(mock_save.called)
mock_get_info.assert_called_once_with(instance, use_cache=False)
def _test_sync_to_stop(self, power_state, vm_state, driver_power_state,
def _test_sync_to_stop(self, vm_power_state, vm_state, driver_power_state,
stop=True, force=False, shutdown_terminate=False):
instance = self._get_sync_instance(
power_state, vm_state, shutdown_terminate=shutdown_terminate)
vm_power_state, vm_state, shutdown_terminate=shutdown_terminate)
with test.nested(
mock.patch.object(objects.Instance, 'refresh'),
@ -1814,7 +1819,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
mock.patch.object(self.compute.compute_api, 'stop'),
mock.patch.object(self.compute.compute_api, 'delete'),
mock.patch.object(self.compute.compute_api, 'force_stop'),
) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force):
mock.patch.object(self.compute.driver, 'get_info')
) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force,
mock_get_info):
mock_get_info.return_value = hardware.InstanceInfo(
state=driver_power_state)
self.compute._sync_instance_power_state(self.context, instance,
driver_power_state)
if shutdown_terminate:
@ -1824,6 +1834,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
mock_force.assert_called_once_with(self.context, instance)
else:
mock_stop.assert_called_once_with(self.context, instance)
if (vm_state == vm_states.ACTIVE and
vm_power_state in (power_state.SHUTDOWN,
power_state.CRASHED)):
mock_get_info.assert_called_once_with(instance,
use_cache=False)
mock_refresh.assert_called_once_with(use_slave=False)
self.assertTrue(mock_save.called)

View File

@ -1100,6 +1100,34 @@ class IronicDriverTestCase(test.NoDBTestCase):
mock_gbiu.assert_called_once_with(instance.uuid,
fields=ironic_driver._NODE_FIELDS)
@mock.patch.object(ironic_driver.IronicDriver, '_get_node_list')
@mock.patch.object(objects.InstanceList, 'get_uuids_by_host')
@mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type')
@mock.patch.object(FAKE_CLIENT.node, 'get_by_instance_uuid')
def test_get_info_skip_cache(self, mock_gbiu, mock_svc_by_hv,
mock_uuids_by_host, mock_get_node_list):
properties = {'memory_mb': 512, 'cpus': 2}
power_state = ironic_states.POWER_ON
node = _get_cached_node(
instance_uuid=self.instance_uuid, properties=properties,
power_state=power_state)
mock_gbiu.return_value = node
mock_svc_by_hv.return_value = []
mock_get_node_list.return_value = [node]
# ironic_states.POWER_ON should be mapped to
# nova_states.RUNNING
instance = fake_instance.fake_instance_obj('fake-context',
uuid=self.instance_uuid)
mock_uuids_by_host.return_value = [instance.uuid]
result = self.driver.get_info(instance, use_cache=False)
self.assertEqual(hardware.InstanceInfo(state=nova_states.RUNNING),
result)
# verify we hit the ironic API for fresh data
mock_gbiu.assert_called_once_with(instance.uuid,
fields=ironic_driver._NODE_FIELDS)
@mock.patch.object(ironic_driver.IronicDriver, '_get_node_list')
@mock.patch.object(objects.InstanceList, 'get_uuids_by_host')
@mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type')

View File

@ -157,10 +157,15 @@ class ComputeDriver(object):
"""
pass
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
"""Get the current status of an instance.
:param instance: nova.objects.instance.Instance object
:param use_cache: boolean to indicate if the driver should be allowed
to use cached data to return instance status.
This only applies to drivers which cache instance
state information. For drivers that do not use a
cache, this parameter can be ignored.
:returns: An InstanceInfo object
"""
# TODO(Vek): Need to pass context in for access to auth_token

View File

@ -350,7 +350,7 @@ class FakeDriver(driver.ComputeDriver):
raise exception.InterfaceDetachFailed(
instance_uuid=instance.uuid)
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
if instance.uuid not in self.instances:
raise exception.InstanceNotFound(instance_id=instance.uuid)
i = self.instances[instance.uuid]

View File

@ -176,7 +176,7 @@ class HyperVDriver(driver.ComputeDriver):
"""Cleanup after instance being destroyed by Hypervisor."""
self.unplug_vifs(instance, network_info)
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
return self._vmops.get_info(instance)
def attach_volume(self, context, connection_info, instance, mountpoint,

View File

@ -877,15 +877,30 @@ class IronicDriver(virt_driver.ComputeDriver):
return node
return _sync_node_from_cache()
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
"""Get the current state and resource usage for this instance.
If the instance is not found this method returns (a dictionary
with) NOSTATE and all resources == 0.
:param instance: the instance object.
:param use_cache: boolean to indicate if the driver should be allowed
to use cached data to return instance status.
If false, pull fresh data from ironic.
:returns: an InstanceInfo object
"""
def _fetch_from_ironic(self, instance):
try:
node = self._validate_instance_and_node(instance)
return hardware.InstanceInfo(
state=map_power_state(node.power_state))
except exception.InstanceNotFound:
return hardware.InstanceInfo(
state=map_power_state(ironic_states.NOSTATE))
if not use_cache:
return _fetch_from_ironic(self, instance)
# we should already have a cache for our nodes, refreshed on every
# RT loop. but if we don't have a cache, generate it.
if not self.node_cache:
@ -896,11 +911,7 @@ class IronicDriver(virt_driver.ComputeDriver):
break
else:
# if we can't find the instance, fall back to ironic
try:
node = self._validate_instance_and_node(instance)
except exception.InstanceNotFound:
return hardware.InstanceInfo(
state=map_power_state(ironic_states.NOSTATE))
return _fetch_from_ironic(self, instance)
return hardware.InstanceInfo(state=map_power_state(node.power_state))

View File

@ -5525,7 +5525,7 @@ class LibvirtDriver(driver.ComputeDriver):
{'xml': xml}, instance=instance)
return xml
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
"""Retrieve information from libvirt for a specific instance.
If a libvirt error is encountered during lookup, we might raise a
@ -5533,6 +5533,7 @@ class LibvirtDriver(driver.ComputeDriver):
libvirt error is.
:param instance: nova.objects.instance.Instance object
:param use_cache: unused in this driver
:returns: An InstanceInfo object
"""
guest = self._host.get_guest(instance)

View File

@ -116,10 +116,11 @@ class PowerVMDriver(driver.ComputeDriver):
{'op': op, 'display_name': instance.display_name,
'name': instance.name}, instance=instance)
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
"""Get the current status of an instance.
:param instance: nova.objects.instance.Instance object
:param use_cache: unused in this driver
:returns: An InstanceInfo object.
"""
return vm.get_vm_info(self.adapter, instance)

View File

@ -583,7 +583,7 @@ class VMwareVCDriver(driver.ComputeDriver):
"""Poll for rebooting instances."""
self._vmops.poll_rebooting_instances(timeout, instances)
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
"""Return info about the VM instance."""
return self._vmops.get_info(instance)

View File

@ -369,7 +369,7 @@ class XenAPIDriver(driver.ComputeDriver):
"""Unplug VIFs from networks."""
self._vmops.unplug_vifs(instance, network_info)
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
"""Return data about VM instance."""
return self._vmops.get_info(instance)

View File

@ -122,7 +122,7 @@ class ZVMDriver(driver.ComputeDriver):
def get_available_nodes(self, refresh=False):
return self._hypervisor.get_available_nodes(refresh=refresh)
def get_info(self, instance):
def get_info(self, instance, use_cache=True):
_guest = guest.Guest(self._hypervisor, instance)
return _guest.get_info()

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a race condition that could allow a newly created Ironic
instance to be powered off after deployment, without letting
the user power it back on.
upgrade:
- |
Adds a ``use_cache`` parameter to the virt driver ``get_info``
method. Out of tree drivers should add support for this
parameter.