Stop stack tracing when trying to auto-stop a stopped instance

Commit cc5388bbe8 added locks to stop_instance and the
_sync_power_states periodic task to try to fix a race between stopping
the instance via the API, where the task_state is set to powering-off,
and the periodic task seeing the instance power_state as shutdown in
_sync_instance_power_state and calling the stop API again. By that
point the task_state is already None from the first stop API call, so
we get an UnexpectedTaskStateError.
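
For context on where the stacktrace comes from:
instance.save(expected_task_state=...) is effectively a compare-and-swap
on task_state. A minimal sketch of that behavior (a hypothetical
simplification, not the real objects.Instance implementation, which does
the check in the DB layer):

    # Hypothetical sketch of the expected_task_state check performed by
    # objects.Instance.save(); the real check happens when updating the DB.
    class UnexpectedTaskStateError(Exception):
        pass


    class FakeInstance(object):
        def __init__(self, task_state):
            self.task_state = task_state  # what the database currently holds

        def save(self, expected_task_state=None):
            # expected_task_state can be a single value or a list of values
            expected = expected_task_state
            if not isinstance(expected, (list, tuple)):
                expected = [expected]
            if self.task_state not in expected:
                # this is the stacktrace the bug reports are about
                raise UnexpectedTaskStateError(
                    'expected %s but found %s' % (expected, self.task_state))
            # ... persist the updated fields ...


    # The user's stop finished and reset task_state to None; when the
    # periodic task's duplicate stop reaches stop_instance, POWERING_OFF
    # no longer matches and save() blows up:
    inst = FakeInstance(task_state=None)
    try:
        inst.save(expected_task_state='powering-off')
    except UnexpectedTaskStateError:
        pass  # the failure mode this change is working around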

The handle_lifecycle_event method gets callbacks from the libvirt
driver on VM state changes and calls _sync_instance_power_state, which
may try to stop the instance asynchronously. That can lead to an
UnexpectedTaskStateError if the instance is already stopped by the time
stop_instance gets the lock and the task_state has changed.
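
Roughly, the path that produces the duplicate stop looks like the
sketch below (hypothetical, heavily simplified names and signatures;
the real methods live in the compute manager, and the stop is an RPC
cast, which is why it can land after the first stop already finished):

    # Sketch only (hypothetical, heavily simplified): the flow from a
    # libvirt lifecycle event to the asynchronous stop cast.
    SHUTDOWN, CRASHED, RUNNING = 'shutdown', 'crashed', 'running'
    ACTIVE = 'active'


    class FakeComputeAPI(object):
        def stop(self, context, instance):
            # In nova this is an RPC cast, so stop_instance runs later,
            # possibly after the user's own stop has already finished.
            print('casting stop for %s' % instance['uuid'])


    class FakeComputeManager(object):
        def __init__(self):
            self.compute_api = FakeComputeAPI()

        def handle_lifecycle_event(self, context, instance, event_power_state):
            # libvirt said the guest changed state; sync it with the DB record
            self._sync_instance_power_state(context, instance,
                                            event_power_state)

        def _sync_instance_power_state(self, context, db_instance,
                                       vm_power_state):
            if (db_instance['vm_state'] == ACTIVE and
                    vm_power_state in (SHUTDOWN, CRASHED)):
                # DB thinks the instance is running but the hypervisor says
                # it is not, so call the stop API to converge.
                self.compute_api.stop(context, db_instance)


    FakeComputeManager().handle_lifecycle_event(
        None, {'uuid': 'fake-uuid', 'vm_state': ACTIVE}, SHUTDOWN)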

Attempting to lock in handle_lifecycle_event just moves the race
around, so this change adds logic to stop_instance: if the instance
says it's active but the virt driver says it's not running, we add None
to the expected_task_state so we don't stacktrace on instance.save().

An alternative and/or additional change would be doing a call rather
than a cast when _sync_instance_power_state calls the stop API, but in
previous testing that didn't appear to make a significant difference in
the race hit in the stop_instance method.

This also adds a bunch of debug logging since this code is inherently
racy and that extra context is needed when looking at failures around
these operations.

Closes-Bug: #1339235
Closes-Bug: #1266611
Related-Bug: #1320628

Change-Id: Ib495a5ab15de88051c5fa7abfb58a5445691dcad
Matt Riedemann 2014-09-02 12:11:55 -07:00
parent ecb3bb3a32
commit aa1792eb4c
2 changed files with 92 additions and 5 deletions


@@ -1071,6 +1071,17 @@ class ComputeManager(manager.Manager):
                         event.get_transition())
         if vm_power_state is not None:
+            LOG.debug('Synchronizing instance power state after lifecycle '
+                      'event "%(event)s"; current vm_state: %(vm_state)s, '
+                      'current task_state: %(task_state)s, current DB '
+                      'power_state: %(db_power_state)s, VM power_state: '
+                      '%(vm_power_state)s',
+                      dict(event=event.get_name(),
+                           vm_state=instance.vm_state,
+                           task_state=instance.task_state,
+                           db_power_state=instance.power_state,
+                           vm_power_state=vm_power_state),
+                      instance_uuid=instance.uuid)
             self._sync_instance_power_state(context,
                                             instance,
                                             vm_power_state)
@@ -2461,6 +2472,31 @@ class ComputeManager(manager.Manager):
         @utils.synchronized(instance.uuid)
         def do_stop_instance():
+            current_power_state = self._get_power_state(context, instance)
+            LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
+                      'current task_state: %(task_state)s, current DB '
+                      'power_state: %(db_power_state)s, current VM '
+                      'power_state: %(current_power_state)s',
+                      dict(vm_state=instance.vm_state,
+                           task_state=instance.task_state,
+                           db_power_state=instance.power_state,
+                           current_power_state=current_power_state),
+                      instance_uuid=instance.uuid)
+            # NOTE(mriedem): If the instance is already powered off, we are
+            # possibly tearing down and racing with other operations, so we can
+            # expect the task_state to be None if something else updates the
+            # instance and we're not locking it.
+            expected_task_state = [task_states.POWERING_OFF]
+            # The list of power states is from _sync_instance_power_state.
+            if current_power_state in (power_state.NOSTATE,
+                                       power_state.SHUTDOWN,
+                                       power_state.CRASHED):
+                LOG.info(_LI('Instance is already powered off in the '
+                             'hypervisor when stop is called.'),
+                         instance=instance)
+                expected_task_state.append(None)
             self._notify_about_instance_usage(context, instance,
                                               "power_off.start")
             self._power_off_instance(context, instance, clean_shutdown)
@@ -2468,7 +2504,7 @@
             instance.power_state = current_power_state
             instance.vm_state = vm_states.STOPPED
             instance.task_state = None
-            instance.save(expected_task_state=task_states.POWERING_OFF)
+            instance.save(expected_task_state=expected_task_state)
             self._notify_about_instance_usage(context, instance,
                                               "power_off.end")
@@ -5691,8 +5727,16 @@ class ComputeManager(manager.Manager):
             # The only rational power state should be RUNNING
             if vm_power_state in (power_state.SHUTDOWN,
                                   power_state.CRASHED):
-                LOG.warn(_("Instance shutdown by itself. Calling "
-                           "the stop API."), instance=db_instance)
+                LOG.warn(_LW("Instance shutdown by itself. Calling the stop "
+                             "API. Current vm_state: %(vm_state)s, current "
+                             "task_state: %(task_state)s, current DB "
+                             "power_state: %(db_power_state)s, current VM "
+                             "power_state: %(vm_power_state)s"),
+                         {'vm_state': vm_state,
+                          'task_state': db_instance.task_state,
+                          'db_power_state': db_power_state,
+                          'vm_power_state': vm_power_state},
+                         instance=db_instance)
                 try:
                     # Note(maoy): here we call the API instead of
                     # brutally updating the vm_state in the database
@@ -5738,8 +5782,16 @@
             if vm_power_state not in (power_state.NOSTATE,
                                       power_state.SHUTDOWN,
                                       power_state.CRASHED):
-                LOG.warn(_("Instance is not stopped. Calling "
-                           "the stop API."), instance=db_instance)
+                LOG.warn(_LW("Instance is not stopped. Calling "
+                             "the stop API. Current vm_state: %(vm_state)s, "
+                             "current task_state: %(task_state)s, "
+                             "current DB power_state: %(db_power_state)s, "
+                             "current VM power_state: %(vm_power_state)s"),
+                         {'vm_state': vm_state,
+                          'task_state': db_instance.task_state,
+                          'db_power_state': db_power_state,
+                          'vm_power_state': vm_power_state},
+                         instance=db_instance)
                 try:
                     # NOTE(russellb) Force the stop, because normally the
                     # compute API would not allow an attempt to stop a stopped


@@ -1872,6 +1872,41 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                 exception.UnexpectedDeletingTaskStateError(expected='foo',
                                                            actual='bar'))
+    def test_stop_instance_task_state_none_power_state_shutdown(self):
+        # Tests that stop_instance doesn't puke when the instance power_state
+        # is shutdown and the task_state is None.
+        instance = fake_instance.fake_instance_obj(
+            self.context, vm_state=vm_states.ACTIVE,
+            task_state=None, power_state=power_state.SHUTDOWN)
+        @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+        @mock.patch.object(objects.InstanceActionEvent,
+                           'event_finish_with_failure')
+        @mock.patch.object(self.compute, '_get_power_state',
+                           return_value=power_state.SHUTDOWN)
+        @mock.patch.object(self.compute, '_notify_about_instance_usage')
+        @mock.patch.object(self.compute, '_power_off_instance')
+        @mock.patch.object(instance, 'save')
+        def do_test(save_mock, power_off_mock, notify_mock, get_state_mock,
+                    event_finish_mock, event_start_mock):
+            # run the code
+            self.compute.stop_instance(self.context, instance)
+            # assert the calls
+            self.assertEqual(2, get_state_mock.call_count)
+            notify_mock.assert_has_calls([
+                mock.call(self.context, instance, 'power_off.start'),
+                mock.call(self.context, instance, 'power_off.end')
+            ])
+            power_off_mock.assert_called_once_with(
+                self.context, instance, True)
+            save_mock.assert_called_once_with(
+                expected_task_state=[task_states.POWERING_OFF, None])
+            self.assertEqual(power_state.SHUTDOWN, instance.power_state)
+            self.assertIsNone(instance.task_state)
+            self.assertEqual(vm_states.STOPPED, instance.vm_state)
+        do_test()
 class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def setUp(self):