From e3b68a1c8bbedd877cfc988898f7e458ab067f28 Mon Sep 17 00:00:00 2001 From: Erik Olof Gunnar Andersson Date: Thu, 25 Jul 2019 20:19:40 -0700 Subject: [PATCH] Cleanup when hitting MaxRetriesExceeded from no host_available Prior to this patch there was a condition where host_available was false and an exception would get raised without first cleaning up the instance. This causes instances to get indefinitely stuck in a scheduling state. This patch fixes this by calling the cleanup function and then exiting build_instances using a return statement. The related functional regression recreate test is updated to show this fixes the bug. NOTE(mriedem): There are three changes in this backport. First, since bug 1819460 and change I78fc2312274471a7bd85a263de12cc5a0b19fd10 do not apply to Rocky, _cleanup_when_reschedule_fails is added here. Second, the test_bug_1837955 setUp needed to stub notifications since change I017d1a31139c9300642dd706eadc265f7c954ca8 is not in Rocky to do that in ProviderUsageBaseTestCase. Third, the unit test is changed to mock _set_vm_state_and_notify since change Ibfb0a6db5920d921c4fc7cabf3f4d2838ea7f421 is not in Rocky for compute utils to call notify_about_compute_task_error. 
Change-Id: I6a2c63a4c33e783100208fd3f45eb52aad49e3d6 Closes-bug: #1837955 (cherry picked from commit b98d4ba6d54f5ca2999a8fe6b6d7dfcc134061df) (cherry picked from commit fcc2b9e33ed2b3d9f469b458e0f46011fe9883ac) --- nova/conductor/manager.py | 32 +++++++++++++---- .../regressions/test_bug_1837955.py | 35 +++++++----------- nova/tests/unit/conductor/test_conductor.py | 36 ++++++++++++------- 3 files changed, 60 insertions(+), 43 deletions(-) diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 1c17e4b1d2dd..c742088a598c 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -534,6 +534,23 @@ class ComputeTaskManager(base.Base): bdm.attachment_id = attachment['id'] bdm.save() + def _cleanup_when_reschedule_fails( + self, context, instance, exception, request_spec, + requested_networks): + """Set the instance state and clean up. + + It is only used in case build_instance fails while rescheduling the + instance + """ + + updates = {'vm_state': vm_states.ERROR, + 'task_state': None} + self._set_vm_state_and_notify( + context, instance.uuid, 'build_instances', updates, exception, + request_spec) + self._cleanup_allocated_networks( + context, instance, requested_networks) + # NOTE(danms): This is never cell-targeted because it is only used for # cellsv1 (which does not target cells directly) and n-cpu reschedules # (which go to the cell conductor and thus are always cell-specific). @@ -614,11 +631,7 @@ class ComputeTaskManager(base.Base): # disabled in those cases. num_attempts = filter_properties.get( 'retry', {}).get('num_attempts', 1) - updates = {'vm_state': vm_states.ERROR, 'task_state': None} for instance in instances: - self._set_vm_state_and_notify( - context, instance.uuid, 'build_instances', updates, - exc, request_spec) # If num_attempts > 1, we're in a reschedule and probably # either hit NoValidHost or MaxRetriesExceeded. 
Either way, # the build request should already be gone and we probably @@ -631,8 +644,9 @@ class ComputeTaskManager(base.Base): self._destroy_build_request(context, instance) except exception.BuildRequestNotFound: pass - self._cleanup_allocated_networks( - context, instance, requested_networks) + self._cleanup_when_reschedule_fails( + context, instance, exc, request_spec, + requested_networks) return elevated = context.elevated() @@ -673,7 +687,11 @@ class ComputeTaskManager(base.Base): msg = ("Exhausted all hosts available for retrying build " "failures for instance %(instance_uuid)s." % {"instance_uuid": instance.uuid}) - raise exception.MaxRetriesExceeded(reason=msg) + exc = exception.MaxRetriesExceeded(reason=msg) + self._cleanup_when_reschedule_fails( + context, instance, exc, request_spec, + requested_networks) + return instance.availability_zone = ( availability_zones.get_host_availability_zone(context, host.service_host)) diff --git a/nova/tests/functional/regressions/test_bug_1837955.py b/nova/tests/functional/regressions/test_bug_1837955.py index e47702b9df97..247999558765 100644 --- a/nova/tests/functional/regressions/test_bug_1837955.py +++ b/nova/tests/functional/regressions/test_bug_1837955.py @@ -29,6 +29,11 @@ class BuildRescheduleClaimFailsTestCase( """ compute_driver = 'fake.SmallFakeDriver' + def setUp(self): + super(BuildRescheduleClaimFailsTestCase, self).setUp() + fake_notifier.stub_notifier(self) + self.addCleanup(fake_notifier.reset) + def _wait_for_unversioned_notification(self, event_type): for x in range(20): # wait up to 10 seconds for notification in fake_notifier.NOTIFICATIONS: @@ -85,31 +90,15 @@ class BuildRescheduleClaimFailsTestCase( image_uuid=fake_image.get_valid_image_id(), networks=[{'port': nova_fixtures.NeutronFixture.port_1['id']}]) server = self.api.post_server({'server': server}) - # FIXME(mriedem): This is bug 1837955 where the status is stuck in - # BUILD rather than the vm_state being set to error and the task_state - # 
being set to None. Uncomment this when the bug is fixed. - # server = self._wait_for_state_change(self.api, server, 'ERROR') + server = self._wait_for_state_change(self.api, server, 'ERROR') # Wait for the MaxRetriesExceeded fault to be recorded. # set_vm_state_and_notify sets the vm_state to ERROR before the fault # is recorded but after the notification is sent. So wait for the # unversioned notification to show up and then get the fault. - # FIXME(mriedem): Uncomment this when bug 1837955 is fixed. - # self._wait_for_unversioned_notification( - # 'compute_task.build_instances') - # server = self.api.get_server(server['id']) - # self.assertIn('fault', server) - # self.assertIn('Exceeded maximum number of retries', - # server['fault']['message']) - - # TODO(mriedem): Remove this when the bug is fixed. We need to assert - # something before the bug is fixed to show the failure so check the - # logs. - for x in range(20): - logs = self.stdlog.logger.output - if 'MaxRetriesExceeded' in logs: - break - time.sleep(.5) - else: - self.fail('Timed out waiting for MaxRetriesExceeded to show up ' - 'in the logs.') + self._wait_for_unversioned_notification( + 'compute_task.build_instances') + server = self.api.get_server(server['id']) + self.assertIn('fault', server) + self.assertIn('Exceeded maximum number of retries', + server['fault']['message']) diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index c92d4fcbd40c..f493a1079fed 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -702,28 +702,38 @@ class _BaseTaskTestCase(object): mock.call(self.context, instances[1].uuid)]) self.assertFalse(mock_get_by_host.called) - @mock.patch("nova.scheduler.utils.claim_resources", return_value=False) + @mock.patch('nova.conductor.manager.ComputeTaskManager.' 
+ '_set_vm_state_and_notify') @mock.patch.object(objects.Instance, 'save') - def test_build_instances_exhaust_host_list(self, _mock_save, mock_claim): + def test_build_instances_exhaust_host_list(self, _mock_save, mock_notify): # A list of three alternate hosts for one instance host_lists = copy.deepcopy(fake_host_lists_alt) instance = fake_instance.fake_instance_obj(self.context) image = {'fake-data': 'should_pass_silently'} - expected_claim_count = len(host_lists[0]) # build_instances() is a cast, we need to wait for it to complete self.useFixture(cast_as_call.CastAsCall(self)) + + self.conductor.build_instances( + context=self.context, + instances=[instance], image=image, + filter_properties={}, + admin_password='admin_password', + injected_files='injected_files', + requested_networks=None, + security_groups='security_groups', + block_device_mapping=None, + legacy_bdm=None, + host_lists=host_lists + ) + # Since claim_resources() is mocked to always return False, we will run - # out of alternate hosts, and MaxRetriesExceeded should be raised. - self.assertRaises(exc.MaxRetriesExceeded, - self.conductor.build_instances, context=self.context, - instances=[instance], image=image, filter_properties={}, - admin_password='admin_password', - injected_files='injected_files', requested_networks=None, - security_groups='security_groups', - block_device_mapping=None, legacy_bdm=None, - host_lists=host_lists) - self.assertEqual(expected_claim_count, mock_claim.call_count) + # out of alternate hosts, and complain about MaxRetriesExceeded. + mock_notify.assert_called_once_with( + self.context, instance.uuid, 'build_instances', + test.MatchType(dict), # updates + test.MatchType(exc.MaxRetriesExceeded), + test.MatchType(dict)) # request_spec @mock.patch.object(conductor_manager.ComputeTaskManager, '_destroy_build_request')