Stop stack tracing when trying to auto-stop a stopped instance

Commit cc5388bbe8 added locks to stop_instance and the
_sync_power_states periodic task to try to fix a race between stopping
the instance via the API, where the task_state is set to powering-off,
and the periodic task seeing the instance power_state as shutdown in
_sync_instance_power_state and calling the stop API again. By that
point the task_state is already None from the first stop API call, so
we get an UnexpectedTaskStateError.
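
For context on where the stacktrace comes from:
instance.save(expected_task_state=...) is effectively a compare-and-swap
on task_state. A minimal sketch of that behavior (a hypothetical
simplification, not the real objects.Instance implementation, which does
the check in the DB layer):

    # Hypothetical sketch of the expected_task_state check performed by
    # objects.Instance.save(); the real check happens when updating the DB.
    class UnexpectedTaskStateError(Exception):
        pass


    class FakeInstance(object):
        def __init__(self, task_state):
            self.task_state = task_state  # what the database currently holds

        def save(self, expected_task_state=None):
            # expected_task_state can be a single value or a list of values
            expected = expected_task_state
            if not isinstance(expected, (list, tuple)):
                expected = [expected]
            if self.task_state not in expected:
                # this is the stacktrace the bug reports are about
                raise UnexpectedTaskStateError(
                    'expected %s but found %s' % (expected, self.task_state))
            # ... persist the updated fields ...


    # The user's stop finished and reset task_state to None; when the
    # periodic task's duplicate stop reaches stop_instance, POWERING_OFF
    # no longer matches and save() blows up:
    inst = FakeInstance(task_state=None)
    try:
        inst.save(expected_task_state='powering-off')
    except UnexpectedTaskStateError:
        pass  # the failure mode this change is working around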

The handle_lifecycle_event method gets callbacks from the libvirt
driver on VM state changes and calls _sync_instance_power_state, which
may try to stop the instance asynchronously. That can lead to an
UnexpectedTaskStateError if the instance is already stopped by the time
stop_instance gets the lock and the task_state has changed.
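
Roughly, the path that produces the duplicate stop looks like the
sketch below (hypothetical, heavily simplified names and signatures;
the real methods live in the compute manager, and the stop is an RPC
cast, which is why it can land after the first stop already finished):

    # Sketch only (hypothetical, heavily simplified): the flow from a
    # libvirt lifecycle event to the asynchronous stop cast.
    SHUTDOWN, CRASHED, RUNNING = 'shutdown', 'crashed', 'running'
    ACTIVE = 'active'


    class FakeComputeAPI(object):
        def stop(self, context, instance):
            # In nova this is an RPC cast, so stop_instance runs later,
            # possibly after the user's own stop has already finished.
            print('casting stop for %s' % instance['uuid'])


    class FakeComputeManager(object):
        def __init__(self):
            self.compute_api = FakeComputeAPI()

        def handle_lifecycle_event(self, context, instance, event_power_state):
            # libvirt said the guest changed state; sync it with the DB record
            self._sync_instance_power_state(context, instance,
                                            event_power_state)

        def _sync_instance_power_state(self, context, db_instance,
                                       vm_power_state):
            if (db_instance['vm_state'] == ACTIVE and
                    vm_power_state in (SHUTDOWN, CRASHED)):
                # DB thinks the instance is running but the hypervisor says
                # it is not, so call the stop API to converge.
                self.compute_api.stop(context, db_instance)


    FakeComputeManager().handle_lifecycle_event(
        None, {'uuid': 'fake-uuid', 'vm_state': ACTIVE}, SHUTDOWN)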

Attempting to lock in handle_lifecycle_event just moves the race
around, so this change adds logic to stop_instance: if the instance
says it's active but the virt driver says it's not running, we add None
to the expected_task_state so we don't stacktrace on instance.save().

An alternative and/or additional change would be doing a call rather
than a cast when _sync_instance_power_state calls the stop API, but in
previous testing that didn't appear to make a significant difference in
the race hit in the stop_instance method.

This also adds a bunch of debug logging since this code is inherently
racy and that extra context is needed when looking at failures around
these operations.

Closes-Bug: #1339235
Closes-Bug: #1266611
Related-Bug: #1320628

Change-Id: Ib495a5ab15de88051c5fa7abfb58a5445691dcad
Matt Riedemann 2014-09-02 12:11:55 -07:00
parent ecb3bb3a32
commit aa1792eb4c
2 changed files with 92 additions and 5 deletions


@@ -1071,6 +1071,17 @@ class ComputeManager(manager.Manager):
                         event.get_transition())
         if vm_power_state is not None:
+            LOG.debug('Synchronizing instance power state after lifecycle '
+                      'event "%(event)s"; current vm_state: %(vm_state)s, '
+                      'current task_state: %(task_state)s, current DB '
+                      'power_state: %(db_power_state)s, VM power_state: '
+                      '%(vm_power_state)s',
+                      dict(event=event.get_name(),
+                           vm_state=instance.vm_state,
+                           task_state=instance.task_state,
+                           db_power_state=instance.power_state,
+                           vm_power_state=vm_power_state),
+                      instance_uuid=instance.uuid)
             self._sync_instance_power_state(context,
                                             instance,
                                             vm_power_state)
@@ -2461,6 +2472,31 @@ class ComputeManager(manager.Manager):
         @utils.synchronized(instance.uuid)
         def do_stop_instance():
+            current_power_state = self._get_power_state(context, instance)
+            LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
+                      'current task_state: %(task_state)s, current DB '
+                      'power_state: %(db_power_state)s, current VM '
+                      'power_state: %(current_power_state)s',
+                      dict(vm_state=instance.vm_state,
+                           task_state=instance.task_state,
+                           db_power_state=instance.power_state,
+                           current_power_state=current_power_state),
+                      instance_uuid=instance.uuid)
+            # NOTE(mriedem): If the instance is already powered off, we are
+            # possibly tearing down and racing with other operations, so we can
+            # expect the task_state to be None if something else updates the
+            # instance and we're not locking it.
+            expected_task_state = [task_states.POWERING_OFF]
+            # The list of power states is from _sync_instance_power_state.
+            if current_power_state in (power_state.NOSTATE,
+                                       power_state.SHUTDOWN,
+                                       power_state.CRASHED):
+                LOG.info(_LI('Instance is already powered off in the '
+                             'hypervisor when stop is called.'),
+                         instance=instance)
+                expected_task_state.append(None)
             self._notify_about_instance_usage(context, instance,
                                               "power_off.start")
             self._power_off_instance(context, instance, clean_shutdown)
@@ -2468,7 +2504,7 @@
             instance.power_state = current_power_state
             instance.vm_state = vm_states.STOPPED
             instance.task_state = None
-            instance.save(expected_task_state=task_states.POWERING_OFF)
+            instance.save(expected_task_state=expected_task_state)
             self._notify_about_instance_usage(context, instance,
                                               "power_off.end")
@@ -5691,8 +5727,16 @@ class ComputeManager(manager.Manager):
             # The only rational power state should be RUNNING
             if vm_power_state in (power_state.SHUTDOWN,
                                   power_state.CRASHED):
-                LOG.warn(_("Instance shutdown by itself. Calling "
-                           "the stop API."), instance=db_instance)
+                LOG.warn(_LW("Instance shutdown by itself. Calling the stop "
+                             "API. Current vm_state: %(vm_state)s, current "
+                             "task_state: %(task_state)s, current DB "
+                             "power_state: %(db_power_state)s, current VM "
+                             "power_state: %(vm_power_state)s"),
+                         {'vm_state': vm_state,
+                          'task_state': db_instance.task_state,
+                          'db_power_state': db_power_state,
+                          'vm_power_state': vm_power_state},
+                         instance=db_instance)
                 try:
                     # Note(maoy): here we call the API instead of
                     # brutally updating the vm_state in the database
@@ -5738,8 +5782,16 @@
             if vm_power_state not in (power_state.NOSTATE,
                                       power_state.SHUTDOWN,
                                       power_state.CRASHED):
-                LOG.warn(_("Instance is not stopped. Calling "
-                           "the stop API."), instance=db_instance)
+                LOG.warn(_LW("Instance is not stopped. Calling "
+                             "the stop API. Current vm_state: %(vm_state)s, "
+                             "current task_state: %(task_state)s, "
+                             "current DB power_state: %(db_power_state)s, "
+                             "current VM power_state: %(vm_power_state)s"),
+                         {'vm_state': vm_state,
+                          'task_state': db_instance.task_state,
+                          'db_power_state': db_power_state,
+                          'vm_power_state': vm_power_state},
+                         instance=db_instance)
                 try:
                     # NOTE(russellb) Force the stop, because normally the
                     # compute API would not allow an attempt to stop a stopped


@@ -1872,6 +1872,41 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                 exception.UnexpectedDeletingTaskStateError(expected='foo',
                                                            actual='bar'))
+    def test_stop_instance_task_state_none_power_state_shutdown(self):
+        # Tests that stop_instance doesn't puke when the instance power_state
+        # is shutdown and the task_state is None.
+        instance = fake_instance.fake_instance_obj(
+            self.context, vm_state=vm_states.ACTIVE,
+            task_state=None, power_state=power_state.SHUTDOWN)
+        @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+        @mock.patch.object(objects.InstanceActionEvent,
+                           'event_finish_with_failure')
+        @mock.patch.object(self.compute, '_get_power_state',
+                           return_value=power_state.SHUTDOWN)
+        @mock.patch.object(self.compute, '_notify_about_instance_usage')
+        @mock.patch.object(self.compute, '_power_off_instance')
+        @mock.patch.object(instance, 'save')
+        def do_test(save_mock, power_off_mock, notify_mock, get_state_mock,
+                    event_finish_mock, event_start_mock):
+            # run the code
+            self.compute.stop_instance(self.context, instance)
+            # assert the calls
+            self.assertEqual(2, get_state_mock.call_count)
+            notify_mock.assert_has_calls([
+                mock.call(self.context, instance, 'power_off.start'),
+                mock.call(self.context, instance, 'power_off.end')
+            ])
+            power_off_mock.assert_called_once_with(
+                self.context, instance, True)
+            save_mock.assert_called_once_with(
+                expected_task_state=[task_states.POWERING_OFF, None])
+            self.assertEqual(power_state.SHUTDOWN, instance.power_state)
+            self.assertIsNone(instance.task_state)
+            self.assertEqual(vm_states.STOPPED, instance.vm_state)
+        do_test()
 class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def setUp(self):