Error out interrupted builds
If the compute service is restarted while build requests are executing the instance_claim or waiting for the COMPUTE_RESOURCE_SEMAPHORE then those instances will be stuck forever in BUILDING state. If the instance already finished instance_claim then instance.host is set and when the compute restarts the instance is put to ERROR state. This patch changes compute service startup to put instances into ERROR state if they a) are in the BUILDING state, and b) have allocations on the compute resource provider, but c) do not have instance.host set to that compute. Change-Id: I856a3032c83fc2f605d8c9b6e5aa3bcfa415f96a Closes-Bug: #1833581
This commit is contained in:
parent
4f261f98e1
commit
a1a735bc6e
|
@ -667,6 +667,10 @@ class ComputeManager(manager.Manager):
|
|||
Then allocations are removed from Placement for every instance that is
|
||||
evacuated from this host regardless if the instance is reported by the
|
||||
hypervisor or not.
|
||||
|
||||
:param context: The request context
|
||||
:return: A dict keyed by instance uuid mapped to Migration objects
|
||||
for instances that were migrated away from this host
|
||||
"""
|
||||
filters = {
|
||||
'source_compute': self.host,
|
||||
|
@ -687,7 +691,7 @@ class ComputeManager(manager.Manager):
|
|||
evacuations = objects.MigrationList.get_by_filters(context,
|
||||
filters)
|
||||
if not evacuations:
|
||||
return
|
||||
return {}
|
||||
evacuations = {mig.instance_uuid: mig for mig in evacuations}
|
||||
|
||||
# TODO(mriedem): We could optimize by pre-loading the joined fields
|
||||
|
@ -923,9 +927,8 @@ class ComputeManager(manager.Manager):
|
|||
# instance has already been scheduled to this particular host.
|
||||
LOG.debug("Instance failed to spawn correctly, "
|
||||
"setting to ERROR state", instance=instance)
|
||||
instance.task_state = None
|
||||
instance.vm_state = vm_states.ERROR
|
||||
instance.save()
|
||||
self._set_instance_obj_error_state(
|
||||
context, instance, clean_task_state=True)
|
||||
return
|
||||
|
||||
if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
|
||||
|
@ -936,9 +939,8 @@ class ComputeManager(manager.Manager):
|
|||
# spawned so set to ERROR state. This is consistent to BUILD
|
||||
LOG.debug("Instance failed to rebuild correctly, "
|
||||
"setting to ERROR state", instance=instance)
|
||||
instance.task_state = None
|
||||
instance.vm_state = vm_states.ERROR
|
||||
instance.save()
|
||||
self._set_instance_obj_error_state(
|
||||
context, instance, clean_task_state=True)
|
||||
return
|
||||
|
||||
if (instance.vm_state != vm_states.ERROR and
|
||||
|
@ -1329,10 +1331,24 @@ class ComputeManager(manager.Manager):
|
|||
|
||||
# Initialise instances on the host that are not evacuating
|
||||
for instance in instances:
|
||||
if (not evacuated_instances or
|
||||
instance.uuid not in evacuated_instances):
|
||||
if instance.uuid not in evacuated_instances:
|
||||
self._init_instance(context, instance)
|
||||
|
||||
# NOTE(gibi): collect all the instance uuids that is in some way
|
||||
# was already handled above. Either by init_instance or by
|
||||
# _destroy_evacuated_instances. This way we can limit the scope of
|
||||
# the _error_out_instances_whose_build_was_interrupted call to look
|
||||
# only for instances that have allocations on this node and not
|
||||
# handled by the above calls.
|
||||
already_handled = {instance.uuid for instance in instances}.union(
|
||||
evacuated_instances)
|
||||
# NOTE(gibi): If ironic and vcenter virt driver slow start time
|
||||
# becomes problematic here then we should consider adding a config
|
||||
# option or a driver flag to tell us if we should thread this out
|
||||
# in the background on startup
|
||||
self._error_out_instances_whose_build_was_interrupted(
|
||||
context, already_handled)
|
||||
|
||||
finally:
|
||||
if CONF.defer_iptables_apply:
|
||||
self.driver.filter_defer_apply_off()
|
||||
|
@ -1345,6 +1361,87 @@ class ComputeManager(manager.Manager):
|
|||
# _sync_scheduler_instance_info periodic task will.
|
||||
self._update_scheduler_instance_info(context, instances)
|
||||
|
||||
def _error_out_instances_whose_build_was_interrupted(
|
||||
self, context, already_handled_instances):
|
||||
"""If there are instances in BUILDING state that are not
|
||||
assigned to this host but have allocations in placement towards
|
||||
this compute that means the nova-compute service was
|
||||
restarted while those instances waited for the resource claim
|
||||
to finish and the _set_instance_host_and_node() to update the
|
||||
instance.host field. We need to push them to ERROR state here to
|
||||
prevent keeping them in BUILDING state forever.
|
||||
|
||||
:param context: The request context
|
||||
:param already_handled_instances: The set of instance UUIDs that the
|
||||
host initialization process already handled in some way.
|
||||
"""
|
||||
|
||||
# Strategy:
|
||||
# 1) Get the allocations from placement for our compute node(s)
|
||||
# 2) Remove the already handled instances from the consumer list;
|
||||
# they are either already initialized or need to be skipped.
|
||||
# 3) Check which remaining consumer is an instance in BUILDING state
|
||||
# and push it to ERROR state.
|
||||
|
||||
LOG.info(
|
||||
"Looking for unclaimed instances stuck in BUILDING status for "
|
||||
"nodes managed by this host")
|
||||
|
||||
try:
|
||||
node_names = self.driver.get_available_nodes()
|
||||
except exception.VirtDriverNotReady:
|
||||
LOG.warning(
|
||||
"Virt driver is not ready. Therefore unable to error out any "
|
||||
"instances stuck in BUILDING state on this node. If this is "
|
||||
"the first time this service is starting on this host, then "
|
||||
"you can ignore this warning.")
|
||||
return
|
||||
|
||||
for node_name in node_names:
|
||||
try:
|
||||
cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
|
||||
context, self.host, node_name).uuid
|
||||
except exception.ComputeHostNotFound:
|
||||
LOG.warning(
|
||||
"Compute node %s not found in the database and therefore "
|
||||
"unable to error out any instances stuck in BUILDING "
|
||||
"state on this node. If this is the first time this "
|
||||
"service is starting on this host, then you can ignore "
|
||||
"this warning.", node_name)
|
||||
continue
|
||||
|
||||
try:
|
||||
f = self.reportclient.get_allocations_for_resource_provider
|
||||
allocations = f(context, cn_uuid).allocations
|
||||
except (exception.ResourceProviderAllocationRetrievalFailed,
|
||||
keystone_exception.ClientException) as e:
|
||||
LOG.error(
|
||||
"Could not retrieve compute node resource provider %s and "
|
||||
"therefore unable to error out any instances stuck in "
|
||||
"BUILDING state. Error: %s", cn_uuid, six.text_type(e))
|
||||
continue
|
||||
|
||||
not_handled_consumers = (set(allocations) -
|
||||
already_handled_instances)
|
||||
|
||||
if not not_handled_consumers:
|
||||
continue
|
||||
|
||||
filters = {
|
||||
'vm_state': vm_states.BUILDING,
|
||||
'uuid': not_handled_consumers
|
||||
}
|
||||
|
||||
instances = objects.InstanceList.get_by_filters(
|
||||
context, filters, expected_attrs=[])
|
||||
|
||||
for instance in instances:
|
||||
LOG.debug(
|
||||
"Instance spawn was interrupted before instance_claim, "
|
||||
"setting instance to ERROR state", instance=instance)
|
||||
self._set_instance_obj_error_state(
|
||||
context, instance, clean_task_state=True)
|
||||
|
||||
def cleanup_host(self):
|
||||
self.driver.register_event_listener(None)
|
||||
self.instance_events.cancel_all_events()
|
||||
|
|
|
@ -176,21 +176,14 @@ class TestComputeRestartInstanceStuckInBuild(
|
|||
# instance_claim call we can wait for in the test
|
||||
fake_notifier.wait_for_versioned_notifications(
|
||||
'instance.create.start')
|
||||
self.restart_compute_service(self.compute1)
|
||||
|
||||
# This is bug 1833581 as the server remains in BUILD state after the
|
||||
# compute restart.
|
||||
self._wait_for_state_change(self.admin_api, server, 'BUILD')
|
||||
|
||||
# Not even the periodic task push this server to ERROR because the
|
||||
# server host is still None since the instance_claim didn't set it.
|
||||
self.flags(instance_build_timeout=1)
|
||||
self.compute1.manager._check_instance_build_time(
|
||||
nova_context.get_admin_context())
|
||||
server = self.admin_api.get_server(server['id'])
|
||||
self.assertEqual('BUILD', server['status'])
|
||||
self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
|
||||
with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
|
||||
self.restart_compute_service(self.compute1)
|
||||
|
||||
# We expect that the instance is pushed to ERROR state during the
|
||||
# compute restart.
|
||||
# self._wait_for_state_change(self.admin_api, server, 'ERROR')
|
||||
self._wait_for_state_change(self.admin_api, server, 'ERROR')
|
||||
mock_log.assert_called_with(
|
||||
'Instance spawn was interrupted before instance_claim, setting '
|
||||
'instance to ERROR state',
|
||||
instance=mock.ANY)
|
||||
|
|
|
@ -58,6 +58,7 @@ from nova.objects import instance as instance_obj
|
|||
from nova.objects import migrate_data as migrate_data_obj
|
||||
from nova.objects import network_request as net_req_obj
|
||||
from nova.pci import request as pci_request
|
||||
from nova.scheduler.client import report
|
||||
from nova import test
|
||||
from nova.tests import fixtures
|
||||
from nova.tests.unit.api.openstack import fakes
|
||||
|
@ -719,6 +720,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
return instance_obj._make_instance_list(
|
||||
self.context, objects.InstanceList(), db_list, None)
|
||||
|
||||
@mock.patch.object(manager.ComputeManager,
|
||||
'_error_out_instances_whose_build_was_interrupted')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'init_host')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_on')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_off')
|
||||
|
@ -734,7 +737,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
mock_validate_pinning,
|
||||
mock_destroy, mock_admin_ctxt, mock_host_get,
|
||||
mock_filter_off, mock_filter_on, mock_init_host,
|
||||
defer_iptables_apply):
|
||||
mock_error_interrupted, defer_iptables_apply):
|
||||
mock_admin_ctxt.return_value = self.context
|
||||
inst_list = _make_instance_list(startup_instances)
|
||||
mock_host_get.return_value = inst_list
|
||||
|
@ -760,6 +763,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
mock_update_scheduler.assert_called_once_with(
|
||||
self.context, inst_list)
|
||||
|
||||
mock_error_interrupted.assert_called_once_with(
|
||||
self.context, {inst.uuid for inst in inst_list})
|
||||
|
||||
# Test with defer_iptables_apply
|
||||
self.flags(defer_iptables_apply=True)
|
||||
_do_mock_calls(defer_iptables_apply=True)
|
||||
|
@ -768,6 +774,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
self.flags(defer_iptables_apply=False)
|
||||
_do_mock_calls(defer_iptables_apply=False)
|
||||
|
||||
@mock.patch('nova.compute.manager.ComputeManager.'
|
||||
'_error_out_instances_whose_build_was_interrupted')
|
||||
@mock.patch('nova.objects.InstanceList.get_by_host',
|
||||
return_value=objects.InstanceList())
|
||||
@mock.patch('nova.compute.manager.ComputeManager.'
|
||||
|
@ -777,13 +785,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
@mock.patch('nova.compute.manager.ComputeManager.'
|
||||
'_update_scheduler_instance_info', mock.NonCallableMock())
|
||||
def test_init_host_no_instances(self, mock_destroy_evac_instances,
|
||||
mock_get_by_host):
|
||||
mock_get_by_host, mock_error_interrupted):
|
||||
"""Tests the case that init_host runs and there are no instances
|
||||
on this host yet (it's brand new). Uses NonCallableMock for the
|
||||
methods we assert should not be called.
|
||||
"""
|
||||
self.compute.init_host()
|
||||
|
||||
mock_error_interrupted.assert_called_once_with(mock.ANY, set())
|
||||
|
||||
@mock.patch('nova.objects.InstanceList')
|
||||
@mock.patch('nova.objects.MigrationList.get_by_filters')
|
||||
def test_cleanup_host(self, mock_miglist_get, mock_instance_list):
|
||||
|
@ -839,6 +849,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
self.compute.init_virt_events()
|
||||
self.assertFalse(mock_register.called)
|
||||
|
||||
@mock.patch.object(manager.ComputeManager,
|
||||
'_error_out_instances_whose_build_was_interrupted')
|
||||
@mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
|
||||
@mock.patch('nova.scheduler.utils.resources_from_flavor')
|
||||
@mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
|
||||
|
@ -853,7 +865,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
|
||||
mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
|
||||
mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
|
||||
mock_get_node):
|
||||
mock_get_node, mock_error_interrupted):
|
||||
our_host = self.compute.host
|
||||
not_our_host = 'not-' + our_host
|
||||
|
||||
|
@ -899,6 +911,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
mock.ANY, mock.ANY, mock.ANY)
|
||||
mock_save.assert_called_once_with()
|
||||
|
||||
mock_error_interrupted.assert_called_once_with(
|
||||
self.context, {deleted_instance.uuid})
|
||||
|
||||
@mock.patch.object(manager.ComputeManager,
|
||||
'_error_out_instances_whose_build_was_interrupted')
|
||||
@mock.patch.object(context, 'get_admin_context')
|
||||
@mock.patch.object(objects.InstanceList, 'get_by_host')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'init_host')
|
||||
|
@ -907,7 +924,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
'_destroy_evacuated_instances')
|
||||
def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
|
||||
mock_init_instance, mock_init_host, mock_host_get,
|
||||
mock_admin_ctxt):
|
||||
mock_admin_ctxt, mock_error_interrupted):
|
||||
"""Assert that init_instance is not called for instances that are
|
||||
evacuating from the host during init_host.
|
||||
"""
|
||||
|
@ -928,6 +945,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
|
||||
mock_init_instance.assert_called_once_with(
|
||||
self.context, active_instance)
|
||||
mock_error_interrupted.assert_called_once_with(
|
||||
self.context, {active_instance.uuid, evacuating_instance.uuid})
|
||||
|
||||
@mock.patch.object(objects.InstanceList, 'get_by_host',
|
||||
new=mock.Mock())
|
||||
|
@ -943,6 +962,145 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
self.assertRaises(exception.InvalidConfiguration,
|
||||
self.compute.init_host)
|
||||
|
||||
@mock.patch.object(objects.Instance, 'save')
|
||||
@mock.patch.object(objects.InstanceList, 'get_by_filters')
|
||||
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
|
||||
'get_allocations_for_resource_provider')
|
||||
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||
def test_init_host_with_interrupted_instance_build(
|
||||
self, mock_get_nodes, mock_get_by_host_and_node,
|
||||
mock_get_allocations, mock_get_instances, mock_instance_save):
|
||||
|
||||
mock_get_nodes.return_value = ['fake-node']
|
||||
mock_get_by_host_and_node.return_value = objects.ComputeNode(
|
||||
host=self.compute.host, uuid=uuids.cn_uuid)
|
||||
|
||||
active_instance = fake_instance.fake_instance_obj(
|
||||
self.context, host=self.compute.host, uuid=uuids.active_instance)
|
||||
evacuating_instance = fake_instance.fake_instance_obj(
|
||||
self.context, host=self.compute.host, uuid=uuids.evac_instance)
|
||||
interrupted_instance = fake_instance.fake_instance_obj(
|
||||
self.context, host=None, uuid=uuids.interrupted_instance,
|
||||
vm_state=vm_states.BUILDING)
|
||||
|
||||
# we have 3 different instances. We need consumers for each instance
|
||||
# in placement and an extra consumer that is not an instance
|
||||
allocations = {
|
||||
uuids.active_instance: "fake-resources-active",
|
||||
uuids.evac_instance: "fake-resources-evacuating",
|
||||
uuids.interrupted_instance: "fake-resources-interrupted",
|
||||
uuids.not_an_instance: "fake-resources-not-an-instance",
|
||||
}
|
||||
mock_get_allocations.return_value = report.ProviderAllocInfo(
|
||||
allocations=allocations)
|
||||
|
||||
# get is called with a uuid filter containing interrupted_instance,
|
||||
# error_instance, and not_an_instance but it will only return the
|
||||
# interrupted_instance as the error_instance is not in building state
|
||||
# and not_an_instance does not match with any instance in the db.
|
||||
mock_get_instances.return_value = objects.InstanceList(
|
||||
self.context, objects=[interrupted_instance])
|
||||
|
||||
# interrupted_instance and error_instance is not in the list passed in
|
||||
# because it is not assigned to the compute and therefore not processed
|
||||
# by init_host and init_instance
|
||||
self.compute._error_out_instances_whose_build_was_interrupted(
|
||||
self.context,
|
||||
{inst.uuid for inst in [active_instance, evacuating_instance]})
|
||||
|
||||
mock_get_by_host_and_node.assert_called_once_with(
|
||||
self.context, self.compute.host, 'fake-node')
|
||||
mock_get_allocations.assert_called_once_with(
|
||||
self.context, uuids.cn_uuid)
|
||||
|
||||
mock_get_instances.assert_called_once_with(
|
||||
self.context,
|
||||
{'vm_state': 'building',
|
||||
'uuid': {uuids.interrupted_instance, uuids.not_an_instance}
|
||||
},
|
||||
expected_attrs=[])
|
||||
|
||||
# this is expected to be called only once for interrupted_instance
|
||||
mock_instance_save.assert_called_once_with()
|
||||
self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
|
||||
|
||||
@mock.patch.object(manager.LOG, 'warning')
|
||||
@mock.patch.object(
|
||||
fake_driver.FakeDriver, 'get_available_nodes',
|
||||
side_effect=exception.VirtDriverNotReady)
|
||||
def test_init_host_with_interrupted_instance_build_driver_not_ready(
|
||||
self, mock_get_nodes, mock_log_warning):
|
||||
self.compute._error_out_instances_whose_build_was_interrupted(
|
||||
self.context, set())
|
||||
|
||||
mock_log_warning.assert_called_once_with(
|
||||
"Virt driver is not ready. Therefore unable to error out any "
|
||||
"instances stuck in BUILDING state on this node. If this is the "
|
||||
"first time this service is starting on this host, then you can "
|
||||
"ignore this warning.")
|
||||
|
||||
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
|
||||
'get_allocations_for_resource_provider')
|
||||
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||
def test_init_host_with_interrupted_instance_build_compute_node_not_found(
|
||||
self, mock_get_nodes, mock_get_by_host_and_node,
|
||||
mock_get_allocations):
|
||||
|
||||
mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
|
||||
mock_get_by_host_and_node.side_effect = [
|
||||
exception.ComputeHostNotFound(host='fake-node1'),
|
||||
objects.ComputeNode(host=self.compute.host, uuid=uuids.cn_uuid)]
|
||||
|
||||
self.compute._error_out_instances_whose_build_was_interrupted(
|
||||
self.context, set())
|
||||
|
||||
# check that nova skip the node that is not found in the db and
|
||||
# continue with the next
|
||||
mock_get_by_host_and_node.assert_has_calls(
|
||||
[
|
||||
mock.call(self.context, self.compute.host, 'fake-node1'),
|
||||
mock.call(self.context, self.compute.host, 'fake-node2'),
|
||||
]
|
||||
)
|
||||
|
||||
# placement only queried for the existing compute
|
||||
mock_get_allocations.assert_called_once_with(
|
||||
self.context, uuids.cn_uuid)
|
||||
|
||||
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
|
||||
'get_allocations_for_resource_provider')
|
||||
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
|
||||
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||
def test_init_host_with_interrupted_instance_build_compute_rp_not_found(
|
||||
self, mock_get_nodes, mock_get_by_host_and_node,
|
||||
mock_get_allocations):
|
||||
|
||||
mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
|
||||
mock_get_by_host_and_node.side_effect = [
|
||||
objects.ComputeNode(host=self.compute.host, uuid=uuids.cn1_uuid),
|
||||
objects.ComputeNode(host=self.compute.host, uuid=uuids.cn2_uuid),
|
||||
]
|
||||
|
||||
mock_get_allocations.side_effect = [
|
||||
exception.ResourceProviderAllocationRetrievalFailed(
|
||||
rp_uuid=uuids.cn1_uuid, error='error'),
|
||||
report.ProviderAllocInfo(allocations={})
|
||||
]
|
||||
|
||||
self.compute._error_out_instances_whose_build_was_interrupted(
|
||||
self.context, set())
|
||||
|
||||
# check that nova skip the node that is not found in placement and
|
||||
# continue with the next
|
||||
mock_get_allocations.assert_has_calls(
|
||||
[
|
||||
mock.call(self.context, uuids.cn1_uuid),
|
||||
mock.call(self.context, uuids.cn2_uuid),
|
||||
]
|
||||
)
|
||||
|
||||
def test_init_instance_with_binding_failed_vif_type(self):
|
||||
# this instance will plug a 'binding_failed' vif
|
||||
instance = fake_instance.fake_instance_obj(
|
||||
|
@ -4300,6 +4458,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
|||
self._do_test_set_admin_password_driver_error(
|
||||
exc, vm_states.ACTIVE, None, expected_exception)
|
||||
|
||||
def test_destroy_evacuated_instances_no_migrations(self):
|
||||
with mock.patch(
|
||||
'nova.objects.MigrationList.get_by_filters') as migration_list:
|
||||
migration_list.return_value = []
|
||||
|
||||
result = self.compute._destroy_evacuated_instances(self.context)
|
||||
self.assertEqual({}, result)
|
||||
|
||||
def test_destroy_evacuated_instances(self):
|
||||
our_host = self.compute.host
|
||||
flavor = objects.Flavor()
|
||||
|
|
Loading…
Reference in New Issue