Handle get_host_availability_zone error during reschedule

If a build fails and reschedules to a cell conductor which does
not have access to the API DB, the call to get_host_availability_zone
will fail with a CantStartEngineError because it's trying to do an
"up-call" to the API DB for host aggregate info. The reschedule
fails and the instance is stuck in BUILD status without a fault
injected for determining what went wrong.
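For context on the "up-call": the AZ lookup reads host aggregate metadata,
and aggregates live in the API database. A simplified sketch of what
nova.availability_zones.get_host_availability_zone does (not the exact
Rocky code):

    def get_host_availability_zone(context, host):
        # Aggregates are stored in the API DB; a cell conductor without
        # [api_database]/connection configured cannot create an engine
        # for this query, so oslo.db raises CantStartEngineError.
        aggregates = objects.AggregateList.get_by_host(
            context, host, key='availability_zone')
        if aggregates:
            return aggregates[0].metadata['availability_zone']
        return CONF.default_availability_zone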

This change simply handles the failure and cleans up so the instance
is put into a terminal (ERROR) state.

Conflicts:
      nova/tests/unit/conductor/test_conductor.py

NOTE(mriedem): The conflict is due to not having change
Ibfb0a6db5920d921c4fc7cabf3f4d2838ea7f421 in Rocky.
Also note that the call to _cleanup_when_reschedule_fails does not
pass a "legacy_request_spec" variable since change
If8a13f74d2b3c99f05365eb49dcfa01d9042fefa is not in Rocky.

Change-Id: I6bfa6fa767403fb936a6ae340b8687eb161732fc
Partial-Bug: #1781286
(cherry picked from commit 38fb7f82ab)
(cherry picked from commit b5e6c389d7)
(cherry picked from commit 53bcf0b1ee)
Author: Matt Riedemann
Date:   2019-10-01 12:03:17 -04:00
Commit: 5a98135650 (parent 20de81c7c0)
2 changed files with 55 additions and 3 deletions

--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -698,9 +698,19 @@ class ComputeTaskManager(base.Base):
                     context, instance, exc, request_spec,
                     requested_networks)
                 return
-            instance.availability_zone = (
-                availability_zones.get_host_availability_zone(context,
-                    host.service_host))
+            try:
+                instance.availability_zone = (
+                    availability_zones.get_host_availability_zone(context,
+                        host.service_host))
+            except Exception as exc:
+                # Put the instance into ERROR state, set task_state to None,
+                # inject a fault, etc.
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, request_spec,
+                    requested_networks)
+                continue
             try:
                 # NOTE(danms): This saves the az change above, refreshes our
                 # instance, and tells us if it has been deleted underneath us
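For reference, a simplified sketch of what the _cleanup_when_reschedule_fails
helper (added earlier in this series) does; the real implementation lives in
nova/conductor/manager.py and differs in detail:

    def _cleanup_when_reschedule_fails(self, context, instance, exception,
                                       request_spec, requested_networks):
        # Flip the instance to ERROR with no task state, notify and record
        # a fault for the exception, and release any allocated networks.
        updates = {'vm_state': vm_states.ERROR, 'task_state': None}
        self._set_vm_state_and_notify(
            context, instance.uuid, 'build_instances', updates, exception,
            request_spec)
        self._cleanup_allocated_networks(
            context, instance, requested_networks)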

--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -2842,6 +2842,48 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
                 requested_networks, mock.sentinel.secgroups)
         mock_save.assert_called_once_with()
 
+    @mock.patch('nova.scheduler.utils.claim_resources', return_value=True)
+    @mock.patch('nova.availability_zones.get_host_availability_zone',
+                side_effect=db_exc.CantStartEngineError)
+    @mock.patch('nova.conductor.manager.ComputeTaskManager.'
+                '_cleanup_when_reschedule_fails')
+    @mock.patch('nova.objects.Instance.save')
+    def test_build_reschedule_get_az_error(self, mock_save, mock_cleanup,
+                                           mock_get_az, mock_claim):
+        """Tests that when rescheduling during a build fails while getting
+        the AZ for the selected host, the instance is put into a terminal
+        (ERROR) state.
+        """
+        instance = fake_instance.fake_instance_obj(self.context)
+        image = objects.ImageMeta()
+        requested_networks = objects.NetworkRequestList()
+        request_spec = fake_request_spec.fake_spec_obj()
+        host_lists = copy.deepcopy(fake_host_lists_alt)
+        filter_props = {}
+        # Pre-populate the filter properties with the initial host we tried
+        # to build on, which failed and triggered a reschedule.
+        host1 = host_lists[0].pop(0)
+        scheduler_utils.populate_filter_properties(filter_props, host1)
+        # We have to save off the first alternate we try since
+        # build_instances modifies the host_lists list.
+        host2 = host_lists[0][0]
+        self.conductor.build_instances(
+            self.context, [instance], image, filter_props,
+            mock.sentinel.admin_password, mock.sentinel.injected_files,
+            requested_networks, mock.sentinel.security_groups,
+            request_spec=request_spec, host_lists=host_lists)
+        mock_claim.assert_called_once()
+        mock_get_az.assert_called_once_with(self.context, host2.service_host)
+        mock_cleanup.assert_called_once_with(
+            self.context, instance,
+            test.MatchType(db_exc.CantStartEngineError), test.MatchType(dict),
+            requested_networks)
+        # Assert that we did not continue processing the instance once we
+        # handled the error.
+        mock_save.assert_not_called()
+
     def test_cleanup_allocated_networks_none_requested(self):
         # Tests that we don't deallocate networks if 'none' were specifically
         # requested.
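
The test.MatchType matcher used above asserts only on the argument's type,
which is useful when the exact exception instance is created inside the code
under test. A minimal sketch of such a matcher (nova ships its own in
nova/test.py):

    class MatchType(object):
        # Equality matches any object of the wanted type, so an instance
        # of this class can stand in for "some CantStartEngineError" in
        # mock call assertions.
        def __init__(self, wanted_type):
            self.wanted_type = wanted_type

        def __eq__(self, other):
            return type(other) == self.wanted_type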