Make compute auto-disable itself if builds are failing

This implements an auto-disable feature in nova-compute, where we
automatically set our service record to disabled if we consecutively
fail to build a certain number of instances.

While this is a very useful thing to do in general, disabling a failing
compute becomes more important in a future where scheduler retries for
unknown failures may be either impossible or scoped to a single cell.
Since a compute that is consistently failing (and thus never filling up)
will look very attractive to the scheduler, it may become a build magnet
that, in the absence of retries, would effectively kill all builds in a
cloud until it is fixed.
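
In outline, the change amounts to the following (a condensed, illustrative
sketch only; _record_build_result and _disable_own_service are invented
names, and the real changes to ComputeManager follow in the diff):

def _record_build_result(self, result):
    # Sketch only: count consecutive failures, disable our own service
    # record at the configured threshold, and reset on any successful build.
    if result in (build_results.FAILED, build_results.RESCHEDULED):
        self._failed_builds += 1
        limit = CONF.compute.consecutive_build_service_disable_threshold
        if limit and self._failed_builds >= limit:
            self._disable_own_service()  # hypothetical helper; see the real diff
            self._failed_builds = 0      # start fresh once an admin re-enables us
    else:
        self._failed_builds = 0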

Change-Id: I02b7cd87d399d487dd1d650540f503a70bc27749
Author: Dan Smith
Date:   2017-05-09 09:11:27 -07:00
Parent: 5532d6f404
Commit: f93f675a85
4 changed files with 175 additions and 2 deletions


@@ -528,6 +528,7 @@ class ComputeManager(manager.Manager):
                CONF.max_concurrent_live_migrations)
        else:
            self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
        self._failed_builds = 0
        super(ComputeManager, self).__init__(service_name="compute",
                                             *args, **kwargs)
@@ -1688,6 +1689,31 @@ class ComputeManager(manager.Manager):
        return block_device_info

    def _build_failed(self):
        self._failed_builds += 1
        limit = CONF.compute.consecutive_build_service_disable_threshold
        if limit and self._failed_builds >= limit:
            # NOTE(danms): If we're doing a bunch of parallel builds,
            # it is possible (although not likely) that we have already
            # failed N-1 builds before this and we race with a successful
            # build and disable ourselves here when we might've otherwise
            # not.
            LOG.error('Disabling service due to %(fails)i '
                      'consecutive build failures',
                      {'fails': self._failed_builds})
            ctx = nova.context.get_admin_context()
            service = objects.Service.get_by_compute_host(ctx, CONF.host)
            service.disabled = True
            service.disabled_reason = (
                'Auto-disabled due to %i build failures' % self._failed_builds)
            service.save()
            # NOTE(danms): Reset our counter now so that when the admin
            # re-enables us we can start fresh
            self._failed_builds = 0
        elif self._failed_builds > 1:
            LOG.warning('%(fails)i consecutive build failures',
                        {'fails': self._failed_builds})

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_fault
@@ -1704,7 +1730,18 @@ class ComputeManager(manager.Manager):
            # for a while and we want to make sure that nothing else tries
            # to do anything with this instance while we wait.
            with self._build_semaphore:
                self._do_build_and_run_instance(*args, **kwargs)
                try:
                    result = self._do_build_and_run_instance(*args, **kwargs)
                except Exception:
                    result = build_results.FAILED
                    raise
                finally:
                    fails = (build_results.FAILED,
                             build_results.RESCHEDULED)
                    if result in fails:
                        self._build_failed()
                    else:
                        self._failed_builds = 0

        # NOTE(danms): We spawn here to return the RPC worker thread back to
        # the pool. Since what follows could take a really long time, we don't


@@ -23,6 +23,12 @@ from oslo_config import types

from nova.conf import paths

compute_group = cfg.OptGroup(
    'compute',
    title='Compute Manager Options',
    help="""
A collection of options specific to the nova-compute service.
""")

compute_opts = [
    cfg.StrOpt('compute_driver',
        help="""
@@ -637,6 +643,27 @@ Possible values:
""")
]

compute_group_opts = [
    cfg.IntOpt('consecutive_build_service_disable_threshold',
        default=10,
        help="""
Number of consecutive failed builds that result in disabling a compute service.

This option will cause nova-compute to set itself to a disabled state
if a certain number of consecutive build failures occur. This will
prevent the scheduler from continuing to send builds to a compute node that is
consistently failing. Note that all failures qualify and count towards this
score, including reschedules that may have been due to racy scheduler behavior.
Since the failures must be consecutive, it is unlikely that occasional expected
reschedules will actually disable a compute node.

Possible values:

* Any positive integer representing a build failure count.
* Zero to never auto-disable.
"""),
]

interval_opts = [
    cfg.IntOpt('image_cache_manager_interval',
        default=2400,
@@ -1139,7 +1166,10 @@ ALL_OPTS = (compute_opts +

def register_opts(conf):
    conf.register_opts(ALL_OPTS)
    conf.register_group(compute_group)
    conf.register_opts(compute_group_opts, group=compute_group)


def list_opts():
    return {'DEFAULT': ALL_OPTS}
    return {'DEFAULT': ALL_OPTS,
            'compute': compute_opts}
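
For operators, the new option lives in the ``[compute]`` section of nova.conf.
A minimal, illustrative snippet (the values are examples, not recommendations):

[compute]
# Disable this compute service after 10 consecutive build failures
# (10 is also the default). Set to 0 to turn auto-disable off entirely.
consecutive_build_service_disable_threshold = 10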


@@ -4040,6 +4040,95 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                              set_error=True, cleanup_volumes=True,
                              nil_out_host_and_node=True)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_build_failures_disable_service(self, mock_service, mock_dbari):
        mock_dbari.return_value = build_results.FAILED
        instance = objects.Instance(uuid=uuids.instance)
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertTrue(service.disabled)
        self.assertEqual('Auto-disabled due to 10 build failures',
                         service.disabled_reason)
        service.save.assert_called_once_with()
        self.assertEqual(0, self.compute._failed_builds)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_build_failures_not_disable_service(self, mock_service,
                                                mock_dbari):
        self.flags(consecutive_build_service_disable_threshold=0,
                   group='compute')
        mock_dbari.return_value = build_results.FAILED
        instance = objects.Instance(uuid=uuids.instance)
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertFalse(service.save.called)
        self.assertEqual(10, self.compute._failed_builds)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_transient_build_failures_no_disable_service(self, mock_service,
                                                         mock_dbari):
        results = [build_results.FAILED,
                   build_results.ACTIVE,
                   build_results.RESCHEDULED]

        def _fake_build(*a, **k):
            if results:
                return results.pop(0)
            else:
                return build_results.ACTIVE

        mock_dbari.side_effect = _fake_build
        instance = objects.Instance(uuid=uuids.instance)
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertFalse(service.save.called)
        self.assertEqual(0, self.compute._failed_builds)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
        mock_dbari.return_value = build_results.RESCHEDULED
        instance = objects.Instance(uuid=uuids.instance)
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertTrue(service.disabled)
        self.assertEqual('Auto-disabled due to 10 build failures',
                         service.disabled_reason)
        service.save.assert_called_once_with()
        self.assertEqual(0, self.compute._failed_builds)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    @mock.patch('nova.exception_wrapper._emit_exception_notification')
    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
    def test_build_exceptions_disable_service(self, mock_if, mock_notify,
                                              mock_service, mock_dbari):
        mock_dbari.side_effect = test.TestingException()
        instance = objects.Instance(uuid=uuids.instance,
                                    task_state=None)
        for i in range(0, 10):
            self.assertRaises(test.TestingException,
                              self.compute.build_and_run_instance,
                              None, instance, None,
                              None, None)
        service = mock_service.return_value
        self.assertTrue(service.disabled)
        self.assertEqual('Auto-disabled due to 10 build failures',
                         service.disabled_reason)
        service.save.assert_called_once_with()
        self.assertEqual(0, self.compute._failed_builds)

    @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
    @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
    @mock.patch.object(fake_driver.FakeDriver, 'spawn')
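
To exercise just these new tests, something like the following should work
with Nova's usual tox setup; the module path is an assumption, since the
excerpt above does not show the test file's name:

# Module path assumed from the test class shown above
tox -e py27 -- nova.tests.unit.compute.test_compute_mgr.ComputeManagerBuildInstanceTestCase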


@@ -0,0 +1,17 @@
---
features:
  - |
    The ``nova-compute`` worker can automatically disable itself in the
    service database if consecutive build failures exceed a set threshold. The
    ``[compute]/consecutive_build_service_disable_threshold`` configuration
    option allows setting the threshold for this behavior, or disabling it
    entirely if desired.

    The intent is that an admin will examine the issue before manually
    re-enabling the service, which will prevent that compute node from
    becoming a black hole build magnet.
upgrade:
  - |
    The new configuration option
    ``[compute]/consecutive_build_service_disable_threshold``
    defaults to a nonzero value, which means multiple failed builds will
    result in a compute node auto-disabling itself.
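
Once a service has auto-disabled itself, bringing it back is a manual step.
One way to inspect and re-enable it with the standard openstack CLI (the host
name ``compute1`` is a placeholder):

# Show the service status and the auto-disable reason for one host
openstack compute service list --long --host compute1

# After investigating and fixing the cause of the build failures:
openstack compute service set --enable compute1 nova-compute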