Error out interrupted builds

If the compute service is restarted while build requests are executing
instance_claim or waiting for the COMPUTE_RESOURCE_SEMAPHORE, then those
instances will be stuck in BUILDING state forever. If an instance has
already finished instance_claim, then instance.host is set, and when the
compute service restarts the instance is put into ERROR state.
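To make the race window concrete, here is a small, self-contained toy
model of the ordering described above (illustrative only, not Nova code:
in Nova the claim runs inside ResourceTracker.instance_claim(), which is
synchronized on COMPUTE_RESOURCE_SEMAPHORE and ends by calling
_set_instance_host_and_node() to persist instance.host):

from threading import Semaphore

COMPUTE_RESOURCE_SEMAPHORE = Semaphore(1)

def build(instance, restart_at=None):
    # The scheduler has already created the placement allocation by now.
    if restart_at == 'waiting_or_claiming':
        return instance  # stuck: BUILDING and instance.host is None
    with COMPUTE_RESOURCE_SEMAPHORE:
        instance['host'] = 'compute1'  # the claim sets instance.host
    if restart_at == 'after_claim':
        return instance  # init_host sees host set and errors it out
    instance['vm_state'] = 'active'  # normal path: spawn succeeds
    return instance

# A restart before the claim finishes leaves the instance in BUILDING
# forever (bug 1833581): it holds an allocation but has no host set.
stuck = build({'vm_state': 'building', 'host': None},
              restart_at='waiting_or_claiming')
assert stuck == {'vm_state': 'building', 'host': None}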

This patch changes compute service startup to put instances into
ERROR state if they a) are in the BUILDING state, and b) have
allocations on the compute resource provider, but c) do not have
instance.host set to that compute.

Change-Id: I856a3032c83fc2f605d8c9b6e5aa3bcfa415f96a
Closes-Bug: #1833581
(cherry picked from commit a1a735bc6e)
(cherry picked from commit 06fd7c7301)
(cherry picked from commit cb951cbcb2)
(cherry picked from commit 13bb7ed701)
Balazs Gibizer 2019-06-21 16:48:14 +02:00 committed by Elod Illes
parent 23d65bcf1e
commit 4164b96de9
3 changed files with 278 additions and 27 deletions
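Condensed, the new startup check boils down to the following sketch
(toy data structures rather than the real Nova objects; the actual
implementation is _error_out_instances_whose_build_was_interrupted in
the diff below). Note that condition c) is implicit in the real code:
any instance with instance.host set to this compute was already handled
by init_host and is therefore excluded up front.

BUILDING, ERROR = 'building', 'error'

def error_out_interrupted_builds(instances, allocation_consumers,
                                 already_handled):
    # Consumers holding allocations against this compute node that
    # init_host did not already initialize or destroy.
    for uuid in allocation_consumers - already_handled:
        inst = instances.get(uuid)  # a consumer may not be an instance
        if inst is not None and inst['vm_state'] == BUILDING:
            inst['vm_state'] = ERROR  # conditions a) and b) both hold

# Example: 'a' was interrupted before the claim set its host; 'b' is a
# healthy instance that init_host already handled.
insts = {'a': {'vm_state': BUILDING}, 'b': {'vm_state': 'active'}}
error_out_interrupted_builds(insts, {'a', 'b'}, {'b'})
assert insts['a']['vm_state'] == ERROR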


@@ -644,6 +644,10 @@ class ComputeManager(manager.Manager):
Then allocations are removed from Placement for every instance that is
evacuated from this host, regardless of whether the instance is
reported by the hypervisor or not.
:param context: The request context
:return: A dict keyed by instance uuid mapped to Migration objects
for instances that were migrated away from this host
"""
filters = {
'source_compute': self.host,
@@ -664,7 +668,7 @@ class ComputeManager(manager.Manager):
evacuations = objects.MigrationList.get_by_filters(context,
filters)
if not evacuations:
return
return {}
evacuations = {mig.instance_uuid: mig for mig in evacuations}
# The instances might be deleted in which case we need to avoid
@@ -851,9 +855,8 @@ class ComputeManager(manager.Manager):
# instance has already been scheduled to this particular host.
LOG.debug("Instance failed to spawn correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
return
if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
@@ -864,9 +867,8 @@ class ComputeManager(manager.Manager):
# spawned so set to ERROR state. This is consistent to BUILD
LOG.debug("Instance failed to rebuild correctly, "
"setting to ERROR state", instance=instance)
instance.task_state = None
instance.vm_state = vm_states.ERROR
instance.save()
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
return
if (instance.vm_state != vm_states.ERROR and
@@ -1204,10 +1206,24 @@ class ComputeManager(manager.Manager):
# Initialise instances on the host that are not evacuating
for instance in instances:
if (not evacuated_instances or
instance.uuid not in evacuated_instances):
if instance.uuid not in evacuated_instances:
self._init_instance(context, instance)
# NOTE(gibi): collect all the instance uuids that were already
# handled above in some way, either by _init_instance or by
# _destroy_evacuated_instances. This way we can limit the scope of
# the _error_out_instances_whose_build_was_interrupted call to
# instances that have allocations on this node but were not handled
# by the calls above.
already_handled = {instance.uuid for instance in instances}.union(
evacuated_instances)
# NOTE(gibi): If the slow start time of the ironic and vcenter virt
# drivers becomes problematic here then we should consider adding a
# config option or a driver flag to tell us whether to run this in a
# background thread on startup
self._error_out_instances_whose_build_was_interrupted(
context, already_handled)
finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
@@ -1220,6 +1236,85 @@ class ComputeManager(manager.Manager):
# _sync_scheduler_instance_info periodic task will.
self._update_scheduler_instance_info(context, instances)
def _error_out_instances_whose_build_was_interrupted(
self, context, already_handled_instances):
"""If there are instances in BUILDING state that are not
assigned to this host but have allocations in placement towards
this compute that means the nova-compute service was
restarted while those instances waited for the resource claim
to finish and the _set_instance_host_and_node() to update the
instance.host field. We need to push them to ERROR state here to
prevent keeping them in BUILDING state forever.
:param context: The request context
:param already_handled_instances: The set of instance UUIDs that the
host initialization process already handled in some way.
"""
# Strategy:
# 1) Get the allocations from placement for our compute node(s)
# 2) Remove the already handled instances from the consumer list;
# they are either already initialized or need to be skipped.
# 3) Check which remaining consumer is an instance in BUILDING state
# and push it to ERROR state.
LOG.info(
"Looking for unclaimed instances stuck in BUILDING status for "
"nodes managed by this host")
try:
node_names = self.driver.get_available_nodes()
except exception.VirtDriverNotReady:
LOG.warning(
"Virt driver is not ready. Therefore unable to error out any "
"instances stuck in BUILDING state on this node. If this is "
"the first time this service is starting on this host, then "
"you can ignore this warning.")
return
for node_name in node_names:
try:
cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
context, self.host, node_name).uuid
except exception.ComputeHostNotFound:
LOG.warning(
"Compute node %s not found in the database and therefore "
"unable to error out any instances stuck in BUILDING "
"state on this node. If this is the first time this "
"service is starting on this host, then you can ignore "
"this warning.", node_name)
continue
f = self.reportclient.get_allocations_for_resource_provider
allocations = f(context, cn_uuid)
if not allocations:
LOG.error(
"Could not retrieve compute node resource provider %s and "
"therefore unable to error out any instances stuck in "
"BUILDING state.", cn_uuid)
continue
not_handled_consumers = (set(allocations) -
already_handled_instances)
if not not_handled_consumers:
continue
filters = {
'vm_state': vm_states.BUILDING,
'uuid': not_handled_consumers
}
instances = objects.InstanceList.get_by_filters(
context, filters, expected_attrs=[])
for instance in instances:
LOG.debug(
"Instance spawn was interrupted before instance_claim, "
"setting instance to ERROR state", instance=instance)
self._set_instance_obj_error_state(
context, instance, clean_task_state=True)
def cleanup_host(self):
self.driver.register_event_listener(None)
self.instance_events.cancel_all_events()


@@ -178,21 +178,14 @@ class TestComputeRestartInstanceStuckInBuild(
# instance_claim call we can wait for in the test
fake_notifier.wait_for_versioned_notifications(
'instance.create.start')
self.restart_compute_service(self.compute1)
# This is bug 1833581 as the server remains in BUILD state after the
# compute restart.
self._wait_for_state_change(self.admin_api, server, 'BUILD')
# Not even the periodic task pushes this server to ERROR because the
# server host is still None since the instance_claim didn't set it.
self.flags(instance_build_timeout=1)
self.compute1.manager._check_instance_build_time(
nova_context.get_admin_context())
server = self.admin_api.get_server(server['id'])
self.assertEqual('BUILD', server['status'])
self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
self.restart_compute_service(self.compute1)
# We expect that the instance is pushed to ERROR state during the
# compute restart.
# self._wait_for_state_change(self.admin_api, server, 'ERROR')
self._wait_for_state_change(self.admin_api, server, 'ERROR')
mock_log.assert_called_with(
'Instance spawn was interrupted before instance_claim, setting '
'instance to ERROR state',
instance=mock.ANY)


@@ -622,6 +622,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
return instance_obj._make_instance_list(
self.context, objects.InstanceList(), db_list, None)
@mock.patch.object(manager.ComputeManager,
'_error_out_instances_whose_build_was_interrupted')
@mock.patch.object(fake_driver.FakeDriver, 'init_host')
@mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_on')
@mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_off')
@@ -634,7 +636,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
def _do_mock_calls(mock_update_scheduler, mock_inst_init,
mock_destroy, mock_admin_ctxt, mock_host_get,
mock_filter_off, mock_filter_on, mock_init_host,
defer_iptables_apply):
mock_error_interrupted, defer_iptables_apply):
mock_admin_ctxt.return_value = self.context
inst_list = _make_instance_list(startup_instances)
mock_host_get.return_value = inst_list
@@ -658,6 +660,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
mock_update_scheduler.assert_called_once_with(
self.context, inst_list)
mock_error_interrupted.assert_called_once_with(
self.context, {inst.uuid for inst in inst_list})
# Test with defer_iptables_apply
self.flags(defer_iptables_apply=True)
_do_mock_calls(defer_iptables_apply=True)
@@ -666,6 +671,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.flags(defer_iptables_apply=False)
_do_mock_calls(defer_iptables_apply=False)
@mock.patch('nova.compute.manager.ComputeManager.'
'_error_out_instances_whose_build_was_interrupted')
@mock.patch('nova.objects.InstanceList.get_by_host',
return_value=objects.InstanceList())
@mock.patch('nova.compute.manager.ComputeManager.'
@@ -675,13 +682,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
@mock.patch('nova.compute.manager.ComputeManager.'
'_update_scheduler_instance_info', mock.NonCallableMock())
def test_init_host_no_instances(self, mock_destroy_evac_instances,
mock_get_by_host):
mock_get_by_host, mock_error_interrupted):
"""Tests the case that init_host runs and there are no instances
on this host yet (it's brand new). Uses NonCallableMock for the
methods we assert should not be called.
"""
self.compute.init_host()
mock_error_interrupted.assert_called_once_with(mock.ANY, set())
@mock.patch('nova.objects.InstanceList')
@mock.patch('nova.objects.MigrationList.get_by_filters')
def test_cleanup_host(self, mock_miglist_get, mock_instance_list):
@@ -711,6 +720,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.assertFalse(mock_register.called)
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch.object(manager.ComputeManager,
'_error_out_instances_whose_build_was_interrupted')
@mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
@mock.patch('nova.scheduler.utils.resources_from_flavor')
@mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
@@ -725,7 +736,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
mock_get_node, mock_elevated):
mock_get_node, mock_error_interrupted, mock_elevated):
our_host = self.compute.host
not_our_host = 'not-' + our_host
@@ -775,6 +786,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
mock.ANY, mock.ANY, mock.ANY)
mock_save.assert_called_once_with()
mock_error_interrupted.assert_called_once_with(
self.context, {deleted_instance.uuid})
@mock.patch.object(manager.ComputeManager,
'_error_out_instances_whose_build_was_interrupted')
@mock.patch.object(context, 'get_admin_context')
@mock.patch.object(objects.InstanceList, 'get_by_host')
@mock.patch.object(fake_driver.FakeDriver, 'init_host')
@@ -783,7 +799,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
'_destroy_evacuated_instances')
def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
mock_init_instance, mock_init_host, mock_host_get,
mock_admin_ctxt):
mock_admin_ctxt, mock_error_interrupted):
"""Assert that init_instance is not called for instances that are
evacuating from the host during init_host.
"""
@@ -804,6 +820,145 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
mock_init_instance.assert_called_once_with(
self.context, active_instance)
mock_error_interrupted.assert_called_once_with(
self.context, {active_instance.uuid, evacuating_instance.uuid})
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceList, 'get_by_filters')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build(
self, mock_get_nodes, mock_get_by_host_and_node,
mock_get_allocations, mock_get_instances, mock_instance_save):
mock_get_nodes.return_value = ['fake-node']
mock_get_by_host_and_node.return_value = objects.ComputeNode(
host=self.compute.host, uuid=uuids.cn_uuid)
active_instance = fake_instance.fake_instance_obj(
self.context, host=self.compute.host, uuid=uuids.active_instance)
evacuating_instance = fake_instance.fake_instance_obj(
self.context, host=self.compute.host, uuid=uuids.evac_instance)
interrupted_instance = fake_instance.fake_instance_obj(
self.context, host=None, uuid=uuids.interrupted_instance,
vm_state=vm_states.BUILDING)
# We have 3 different instances. We need a consumer in placement for
# each instance, plus an extra consumer that is not an instance.
allocations = {
uuids.active_instance: "fake-resources-active",
uuids.evac_instance: "fake-resources-evacuating",
uuids.interrupted_instance: "fake-resources-interrupted",
uuids.not_an_instance: "fake-resources-not-an-instance",
}
mock_get_allocations.return_value = allocations
# get_by_filters is called with a uuid filter containing
# interrupted_instance and not_an_instance, but it will only return
# interrupted_instance, as not_an_instance does not match any
# instance in the db.
mock_get_instances.return_value = objects.InstanceList(
self.context, objects=[interrupted_instance])
# interrupted_instance is not in the set passed in because it is not
# assigned to this compute and therefore was not processed by
# init_host and _init_instance
self.compute._error_out_instances_whose_build_was_interrupted(
self.context,
{inst.uuid for inst in [active_instance, evacuating_instance]})
mock_get_by_host_and_node.assert_called_once_with(
self.context, self.compute.host, 'fake-node')
mock_get_allocations.assert_called_once_with(
self.context, uuids.cn_uuid)
mock_get_instances.assert_called_once_with(
self.context,
{'vm_state': 'building',
'uuid': {uuids.interrupted_instance, uuids.not_an_instance}
},
expected_attrs=[])
# this is expected to be called only once for interrupted_instance
mock_instance_save.assert_called_once_with()
self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
@mock.patch.object(manager.LOG, 'warning')
@mock.patch.object(
fake_driver.FakeDriver, 'get_available_nodes',
side_effect=exception.VirtDriverNotReady)
def test_init_host_with_interrupted_instance_build_driver_not_ready(
self, mock_get_nodes, mock_log_warning):
self.compute._error_out_instances_whose_build_was_interrupted(
self.context, set())
mock_log_warning.assert_called_once_with(
"Virt driver is not ready. Therefore unable to error out any "
"instances stuck in BUILDING state on this node. If this is the "
"first time this service is starting on this host, then you can "
"ignore this warning.")
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build_compute_node_not_found(
self, mock_get_nodes, mock_get_by_host_and_node,
mock_get_allocations):
mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
mock_get_by_host_and_node.side_effect = [
exception.ComputeHostNotFound(host='fake-node1'),
objects.ComputeNode(host=self.compute.host, uuid=uuids.cn_uuid)]
self.compute._error_out_instances_whose_build_was_interrupted(
self.context, set())
# check that nova skips the node that is not found in the db and
# continues with the next one
mock_get_by_host_and_node.assert_has_calls(
[
mock.call(self.context, self.compute.host, 'fake-node1'),
mock.call(self.context, self.compute.host, 'fake-node2'),
]
)
# placement is only queried for the existing compute node
mock_get_allocations.assert_called_once_with(
self.context, uuids.cn_uuid)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get_allocations_for_resource_provider')
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
def test_init_host_with_interrupted_instance_build_compute_rp_not_found(
self, mock_get_nodes, mock_get_by_host_and_node,
mock_get_allocations):
mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
mock_get_by_host_and_node.side_effect = [
objects.ComputeNode(host=self.compute.host, uuid=uuids.cn1_uuid),
objects.ComputeNode(host=self.compute.host, uuid=uuids.cn2_uuid),
]
mock_get_allocations.side_effect = [
{},
{uuids.active_instance: "fake-resources"}
]
self.compute._error_out_instances_whose_build_was_interrupted(
self.context, {uuids.active_instance})
# check that nova skips the node that is not found in placement and
# continues with the next one
mock_get_allocations.assert_has_calls(
[
mock.call(self.context, uuids.cn1_uuid),
mock.call(self.context, uuids.cn2_uuid),
]
)
def test_init_instance_with_binding_failed_vif_type(self):
# this instance will plug a 'binding_failed' vif
@@ -3609,6 +3764,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self._do_test_set_admin_password_driver_error(
exc, vm_states.ACTIVE, None, expected_exception)
def test_destroy_evacuated_instances_no_migrations(self):
with mock.patch(
'nova.objects.MigrationList.get_by_filters') as migration_list:
migration_list.return_value = []
result = self.compute._destroy_evacuated_instances(self.context)
self.assertEqual({}, result)
def test_destroy_evacuated_instances(self):
our_host = self.compute.host
flavor = objects.Flavor()