Handle get_host_availability_zone error during reschedule

If a build fails and reschedules to a cell conductor which does
not have access to the API DB, the call to get_host_availability_zone
will fail with a CantStartEngineError because it's trying to do an
"up-call" to the API DB for host aggregate info. The reschedule
fails and the instance is stuck in BUILD status without a fault
injected for determining what went wrong.

This change simply handles the failure and cleans up so the instance
is put into a terminal (ERROR) state.

Change-Id: I6bfa6fa767403fb936a6ae340b8687eb161732fc
Partial-Bug: #1781286
This commit is contained in:
Matt Riedemann 2019-10-01 12:03:17 -04:00
parent 6d5fdb4ef4
commit 38fb7f82ab
2 changed files with 57 additions and 3 deletions

View File

@ -744,9 +744,19 @@ class ComputeTaskManager(base.Base):
context, instance, exc, legacy_request_spec,
requested_networks)
return
instance.availability_zone = (
availability_zones.get_host_availability_zone(context,
host.service_host))
try:
instance.availability_zone = (
availability_zones.get_host_availability_zone(context,
host.service_host))
except Exception as exc:
# Put the instance into ERROR state, set task_state to None,
# inject a fault, etc.
self._cleanup_when_reschedule_fails(
context, instance, exc, legacy_request_spec,
requested_networks)
continue
try:
# NOTE(danms): This saves the az change above, refreshes our
# instance, and tells us if it has been deleted underneath us

View File

@ -3152,6 +3152,50 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
tb = mock_notify.call_args_list[0][0][6]
self.assertIn('Traceback (most recent call last):', tb)
@mock.patch('nova.scheduler.utils.claim_resources', return_value=True)
@mock.patch('nova.scheduler.utils.fill_provider_mapping')
@mock.patch('nova.availability_zones.get_host_availability_zone',
side_effect=db_exc.CantStartEngineError)
@mock.patch('nova.conductor.manager.ComputeTaskManager.'
'_cleanup_when_reschedule_fails')
@mock.patch('nova.objects.Instance.save')
def test_build_reschedule_get_az_error(self, mock_save, mock_cleanup,
mock_get_az, mock_fill, mock_claim):
"""Tests a scenario where rescheduling during a build fails trying to
get the AZ for the selected host will put the instance into a terminal
(ERROR) state.
"""
instance = fake_instance.fake_instance_obj(self.context)
image = objects.ImageMeta()
requested_networks = objects.NetworkRequestList()
request_spec = fake_request_spec.fake_spec_obj()
host_lists = copy.deepcopy(fake_host_lists_alt)
filter_props = {}
# Pre-populate the filter properties with the initial host we tried to
# build on which failed and triggered a reschedule.
host1 = host_lists[0].pop(0)
scheduler_utils.populate_filter_properties(filter_props, host1)
# We have to save off the first alternate we try since build_instances
# modifies the host_lists list.
host2 = host_lists[0][0]
self.conductor.build_instances(
self.context, [instance], image, filter_props,
mock.sentinel.admin_password, mock.sentinel.injected_files,
requested_networks, mock.sentinel.security_groups,
request_spec=request_spec, host_lists=host_lists)
mock_claim.assert_called_once()
mock_fill.assert_called_once()
mock_get_az.assert_called_once_with(self.context, host2.service_host)
mock_cleanup.assert_called_once_with(
self.context, instance,
test.MatchType(db_exc.CantStartEngineError), test.MatchType(dict),
requested_networks)
# Assert that we did not continue processing the instance once we
# handled the error.
mock_save.assert_not_called()
def test_cleanup_allocated_networks_none_requested(self):
# Tests that we don't deallocate networks if 'none' were specifically
# requested.