Merge "Make compute auto-disable itself if builds are failing"

Jenkins 2017-05-16 17:43:27 +00:00 committed by Gerrit Code Review
commit 722e48fac4
4 changed files with 175 additions and 2 deletions

View File

@@ -528,6 +528,7 @@ class ComputeManager(manager.Manager):
                 CONF.max_concurrent_live_migrations)
         else:
             self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
+        self._failed_builds = 0
 
         super(ComputeManager, self).__init__(service_name="compute",
                                              *args, **kwargs)
@@ -1688,6 +1689,31 @@ class ComputeManager(manager.Manager):
 
         return block_device_info
 
+    def _build_failed(self):
+        self._failed_builds += 1
+        limit = CONF.compute.consecutive_build_service_disable_threshold
+        if limit and self._failed_builds >= limit:
+            # NOTE(danms): If we're doing a bunch of parallel builds,
+            # it is possible (although not likely) that we have already
+            # failed N-1 builds before this and we race with a successful
+            # build and disable ourselves here when we might've otherwise
+            # not.
+            LOG.error('Disabling service due to %(fails)i '
+                      'consecutive build failures',
+                      {'fails': self._failed_builds})
+            ctx = nova.context.get_admin_context()
+            service = objects.Service.get_by_compute_host(ctx, CONF.host)
+            service.disabled = True
+            service.disabled_reason = (
+                'Auto-disabled due to %i build failures' % self._failed_builds)
+            service.save()
+            # NOTE(danms): Reset our counter now so that when the admin
+            # re-enables us we can start fresh
+            self._failed_builds = 0
+        elif self._failed_builds > 1:
+            LOG.warning('%(fails)i consecutive build failures',
+                        {'fails': self._failed_builds})
+
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_fault
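
Worth noting about the guard above: because the check is written as "if limit and ...", a threshold of zero is falsy and the disable branch can never fire, which is what makes 0 the "never auto-disable" setting. A standalone sketch of the same consecutive-counter behavior (the FailureTracker name is hypothetical and purely illustrative; the patch implements this inline in ComputeManager):

    class FailureTracker(object):
        """Toy model of the consecutive build-failure counter."""

        def __init__(self, threshold):
            self.threshold = threshold  # 0 means "never auto-disable"
            self.failures = 0

        def record(self, failed):
            """Return True when the service should disable itself."""
            if not failed:
                self.failures = 0  # any success resets the streak
                return False
            self.failures += 1
            # Mirrors "if limit and self._failed_builds >= limit": a zero
            # threshold short-circuits the check to False.
            return bool(self.threshold) and self.failures >= self.threshold

    tracker = FailureTracker(threshold=3)
    assert not tracker.record(failed=True)   # first consecutive failure
    assert not tracker.record(failed=True)   # second
    assert tracker.record(failed=True)       # third trips the threshold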
@@ -1704,7 +1730,18 @@ class ComputeManager(manager.Manager):
             # for a while and we want to make sure that nothing else tries
             # to do anything with this instance while we wait.
             with self._build_semaphore:
-                self._do_build_and_run_instance(*args, **kwargs)
+                try:
+                    result = self._do_build_and_run_instance(*args, **kwargs)
+                except Exception:
+                    result = build_results.FAILED
+                    raise
+                finally:
+                    fails = (build_results.FAILED,
+                             build_results.RESCHEDULED)
+                    if result in fails:
+                        self._build_failed()
+                    else:
+                        self._failed_builds = 0
 
         # NOTE(danms): We spawn here to return the RPC worker thread back to
         # the pool. Since what follows could take a really long time, we don't
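
The try/except/finally shape above deserves a note: an unexpected exception is first recorded as a FAILED result and then re-raised, and since the finally block runs on both the normal and the exception path, every build outcome updates the counter exactly once. A minimal sketch of the same pattern (run_build, do_build, and the result constants are stand-ins, not nova code):

    FAILED, RESCHEDULED, ACTIVE = 'failed', 'rescheduled', 'active'

    def run_build(do_build, counter):
        # counter is a one-element list standing in for self._failed_builds.
        try:
            result = do_build()
        except Exception:
            # Record the failure, then keep propagating the exception; the
            # finally block below still runs before the raise unwinds.
            result = FAILED
            raise
        finally:
            if result in (FAILED, RESCHEDULED):
                counter[0] += 1
            else:
                counter[0] = 0

    counter = [0]
    run_build(lambda: ACTIVE, counter)
    assert counter == [0]
    run_build(lambda: RESCHEDULED, counter)
    assert counter == [1]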

View File

@@ -23,6 +23,12 @@ from oslo_config import types
 
 from nova.conf import paths
 
+compute_group = cfg.OptGroup(
+    'compute',
+    title='Compute Manager Options',
+    help="""
+A collection of options specific to the nova-compute service.
+""")
 compute_opts = [
     cfg.StrOpt('compute_driver',
         help="""
@@ -637,6 +643,27 @@ Possible values:
 """)
 ]
 
+compute_group_opts = [
+    cfg.IntOpt('consecutive_build_service_disable_threshold',
+               default=10,
+               help="""
+Number of consecutive failed builds that result in disabling a compute service.
+
+This option will cause nova-compute to set itself to a disabled state
+if a certain number of consecutive build failures occur. This will
+prevent the scheduler from continuing to send builds to a compute node that is
+consistently failing. Note that all failures qualify and count towards this
+score, including reschedules that may have been due to racy scheduler behavior.
+Since the failures must be consecutive, it is unlikely that occasional expected
+reschedules will actually disable a compute node.
+
+Possible values:
+
+* Any positive integer representing a build failure count.
+* Zero to never auto-disable.
+"""),
+]
+
 interval_opts = [
     cfg.IntOpt('image_cache_manager_interval',
                default=2400,
@@ -1139,7 +1166,10 @@ ALL_OPTS = (compute_opts +
 
 def register_opts(conf):
     conf.register_opts(ALL_OPTS)
+    conf.register_group(compute_group)
+    conf.register_opts(compute_group_opts, group=compute_group)
 
 
 def list_opts():
-    return {'DEFAULT': ALL_OPTS}
+    return {'DEFAULT': ALL_OPTS,
+            'compute': compute_group_opts}
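
For readers unfamiliar with oslo.config groups: options registered under an OptGroup are read as attributes of that group on the config object, which is why the manager code reads CONF.compute.consecutive_build_service_disable_threshold. A self-contained sketch of the registration pattern used above (standalone example, not nova code):

    from oslo_config import cfg

    conf = cfg.ConfigOpts()
    group = cfg.OptGroup('compute', title='Compute Manager Options')
    conf.register_group(group)
    conf.register_opts(
        [cfg.IntOpt('consecutive_build_service_disable_threshold',
                    default=10)],
        group=group)
    conf([])  # parse an empty argv so the defaults take effect

    # Grouped options are namespaced under the group name:
    assert conf.compute.consecutive_build_service_disable_threshold == 10

Operators set the option under a [compute] section in nova.conf; per the help text above, 0 turns the auto-disable behavior off.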

View File

@@ -4040,6 +4040,95 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             set_error=True, cleanup_volumes=True,
             nil_out_host_and_node=True)
 
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_failures_disable_service(self, mock_service, mock_dbari):
+        mock_dbari.return_value = build_results.FAILED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_failures_not_disable_service(self, mock_service,
+                                                mock_dbari):
+        self.flags(consecutive_build_service_disable_threshold=0,
+                   group='compute')
+        mock_dbari.return_value = build_results.FAILED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertFalse(service.save.called)
+        self.assertEqual(10, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_transient_build_failures_no_disable_service(self, mock_service,
+                                                         mock_dbari):
+        results = [build_results.FAILED,
+                   build_results.ACTIVE,
+                   build_results.RESCHEDULED]
+
+        def _fake_build(*a, **k):
+            if results:
+                return results.pop(0)
+            else:
+                return build_results.ACTIVE
+
+        mock_dbari.side_effect = _fake_build
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertFalse(service.save.called)
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_build_reschedules_disable_service(self, mock_service,
+                                               mock_dbari):
+        mock_dbari.return_value = build_results.RESCHEDULED
+        instance = objects.Instance(uuid=uuids.instance)
+        for i in range(0, 10):
+            self.compute.build_and_run_instance(None, instance, None,
+                                                None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
+    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    @mock.patch('nova.exception_wrapper._emit_exception_notification')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_build_exceptions_disable_service(self, mock_if, mock_notify,
+                                              mock_service, mock_dbari):
+        mock_dbari.side_effect = test.TestingException()
+        instance = objects.Instance(uuid=uuids.instance,
+                                    task_state=None)
+        for i in range(0, 10):
+            self.assertRaises(test.TestingException,
+                              self.compute.build_and_run_instance,
+                              None, instance, None,
+                              None, None)
+        service = mock_service.return_value
+        self.assertTrue(service.disabled)
+        self.assertEqual('Auto-disabled due to 10 build failures',
+                         service.disabled_reason)
+        service.save.assert_called_once_with()
+        self.assertEqual(0, self.compute._failed_builds)
+
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
     @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
     @mock.patch.object(fake_driver.FakeDriver, 'spawn')
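
One wiring detail in the tests above that often trips readers: stacked mock.patch decorators are applied bottom-up, so the decorator closest to the function supplies the first injected argument. That is why mock_service (from the get_by_compute_host patch, listed last) arrives before mock_dbari. A minimal illustration, independent of nova:

    import os
    from unittest import mock

    @mock.patch('os.getcwd')   # outermost decorator: injected second
    @mock.patch('os.getpid')   # innermost decorator: injected first
    def demo(mock_getpid, mock_getcwd):
        mock_getpid.return_value = 42
        mock_getcwd.return_value = '/tmp'
        assert os.getpid() == 42
        assert os.getcwd() == '/tmp'

    demo()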

View File

@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    The ``nova-compute`` worker can automatically disable itself in the
+    service database if consecutive build failures exceed a set threshold. The
+    ``[compute]/consecutive_build_service_disable_threshold`` configuration
+    option allows setting the threshold for this behavior, or disabling it
+    entirely if desired.
+
+    The intent is that an admin will examine the issue before manually
+    re-enabling the service, which prevents the failing compute node from
+    becoming a "black hole" that attracts further builds.
+upgrade:
+  - |
+    The new option ``[compute]/consecutive_build_service_disable_threshold``
+    defaults to a nonzero value (10), which means consecutive failed builds
+    will result in a compute node auto-disabling itself.