Merge "Change consecutive build failure limit to a weigher" into stable/pike
commit 8a08cf332e
@@ -860,6 +860,11 @@ Hosts and cells are weighted based on the following options in the
     - Multiplier used for weighing hosts for group soft-anti-affinity. Only a
       positive value is meaningful. Negative means that the behavior will
       change to the opposite, which is soft-affinity.
   * - [filter_scheduler]
     - ``build_failure_weight_multiplier``
     - Multiplier used for weighing hosts which have recent build failures. A
       positive value increases the significance of build failures reported by
       the host recently, making them less likely to be chosen.
   * - [metrics]
     - ``weight_multiplier``
     - Multiplier for weighting meters. Use a floating-point value.

@@ -532,7 +532,6 @@ class ComputeManager(manager.Manager):
                CONF.max_concurrent_live_migrations)
        else:
            self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
        self._failed_builds = 0

        super(ComputeManager, self).__init__(service_name="compute",
                                             *args, **kwargs)

@@ -1710,29 +1709,15 @@ class ComputeManager(manager.Manager):
        return block_device_info

    def _build_failed(self):
        self._failed_builds += 1
        limit = CONF.compute.consecutive_build_service_disable_threshold
        if limit and self._failed_builds >= limit:
            # NOTE(danms): If we're doing a bunch of parallel builds,
            # it is possible (although not likely) that we have already
            # failed N-1 builds before this and we race with a successful
            # build and disable ourselves here when we might've otherwise
            # not.
            LOG.error('Disabling service due to %(fails)i '
                      'consecutive build failures',
                      {'fails': self._failed_builds})
            ctx = nova.context.get_admin_context()
            service = objects.Service.get_by_compute_host(ctx, CONF.host)
            service.disabled = True
            service.disabled_reason = (
                'Auto-disabled due to %i build failures' % self._failed_builds)
            service.save()
            # NOTE(danms): Reset our counter now so that when the admin
            # re-enables us we can start fresh
            self._failed_builds = 0
        elif self._failed_builds > 1:
            LOG.warning('%(fails)i consecutive build failures',
                        {'fails': self._failed_builds})
        if CONF.compute.consecutive_build_service_disable_threshold:
            rt = self._get_resource_tracker()
            # NOTE(danms): Update our counter, but wait for the next
            # update_available_resource() periodic to flush it to the DB
            rt.stats.build_failed()

    def _build_succeeded(self):
        rt = self._get_resource_tracker()
        rt.stats.build_succeeded()

    @wrap_exception()
    @reverts_task_state

@@ -1783,7 +1768,7 @@ class ComputeManager(manager.Manager):

                self._build_failed()
            else:
                self._failed_builds = 0
                self._build_succeeded()

            # NOTE(danms): We spawn here to return the RPC worker thread back to
            # the pool. Since what follows could take a really long time, we don't

@@ -628,7 +628,13 @@ class ResourceTracker(object):
    def _copy_resources(self, compute_node, resources):
        """Copy resource values to supplied compute_node."""
        # purge old stats and init with anything passed in by the driver
        # NOTE(danms): Preserve 'failed_builds' across the stats clearing,
        # as that is not part of resources
        # TODO(danms): Stop doing this when we get a column to store this
        # directly
        prev_failed_builds = self.stats.get('failed_builds', 0)
        self.stats.clear()
        self.stats['failed_builds'] = prev_failed_builds
        self.stats.digest_stats(resources.get('stats'))
        compute_node.stats = copy.deepcopy(self.stats)

@@ -138,3 +138,11 @@ class Stats(dict):
                          os_type=os_type, project_id=project_id)

        return (vm_state, task_state, os_type, project_id)

    def build_failed(self):
        self['failed_builds'] = self.get('failed_builds', 0) + 1

    def build_succeeded(self):
        # FIXME(danms): Make this more graceful, either by time-based aging or
        # a fixed decline upon success
        self['failed_builds'] = 0

@@ -638,20 +638,20 @@ compute_group_opts = [
    cfg.IntOpt('consecutive_build_service_disable_threshold',
               default=10,
               help="""
Number of consecutive failed builds that result in disabling a compute service.
Enables reporting of build failures to the scheduler.

This option will cause nova-compute to set itself to a disabled state
if a certain number of consecutive build failures occur. This will
prevent the scheduler from continuing to send builds to a compute node that is
consistently failing. Note that all failures qualify and count towards this
score, including reschedules that may have been due to racy scheduler behavior.
Since the failures must be consecutive, it is unlikely that occasional expected
reschedules will actually disable a compute node.
Any nonzero value will enable sending build failure statistics to the
scheduler for use by the BuildFailureWeigher.

Possible values:

* Any positive integer representing a build failure count.
* Zero to never auto-disable.
* Any positive integer enables reporting build failures.
* Zero to disable reporting build failures.

Related options:

* [filter_scheduler]/build_failure_weight_multiplier

"""),
]

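The threshold option above now only gates whether the consecutive-failure counter is maintained and reported. A minimal standalone sketch (not Nova code; the helper names are illustrative) of that gating, mirroring the ComputeManager._build_failed() hunk above and the HostState hunk further below: the counter is only incremented when the threshold is nonzero, and the scheduler treats a missing stat as zero.

def record_build_failure(stats, threshold):
    # Mirrors the gate in ComputeManager._build_failed(): only track the
    # consecutive-failure counter when the threshold option is nonzero.
    if threshold:
        stats['failed_builds'] = stats.get('failed_builds', 0) + 1


def failed_builds_seen_by_scheduler(stats):
    # Mirrors HostState: a compute that never reports the stat counts as zero.
    return int(stats.get('failed_builds', 0))


reporting, silent = {}, {}
record_build_failure(reporting, threshold=10)
record_build_failure(silent, threshold=0)
print(failed_builds_seen_by_scheduler(reporting))  # 1
print(failed_builds_seen_by_scheduler(silent))     # 0
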
@@ -477,6 +477,34 @@ Possible values:
for hosts with group soft anti-affinity. Only a positive value is
meaningful, as negative values would make this behave as a soft affinity
weigher.
"""),
    cfg.FloatOpt(
        "build_failure_weight_multiplier",
        default=1000000.0,
        help="""
Multiplier used for weighing hosts that have had recent build failures.

This option determines how much weight is placed on a compute node with
recent build failures. Build failures may indicate a failing, misconfigured,
or otherwise ailing compute node, and avoiding it during scheduling may be
beneficial. The weight is inversely proportional to the number of recent
build failures the compute node has experienced. This value should be
set to some high value to offset weight given by other enabled weighers
due to available resources. To disable weighing compute hosts by the
number of recent failures, set this to zero.

This option is only used by the FilterScheduler and its subclasses; if you use
a different scheduler, this option has no effect.

Possible values:

* An integer or float value, where the value corresponds to the multiplier
  ratio for this weigher.

Related options:

* [compute]/consecutive_build_service_disable_threshold - Must be nonzero
  for a compute to report data considered by this weigher.
"""),
    # TODO(mikal): replace this option with something involving host aggregates
    cfg.ListOpt("isolated_images",

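To see how this multiplier turns failure counts into host weights, here is a minimal standalone sketch (not Nova code). It assumes the scheduler max-normalizes each weigher's raw values to [0, 1] before applying the negated multiplier, which is consistent with the expected weights in the new unit test further below.

def build_failure_weights(failed_builds_per_host, multiplier=1000.0):
    # Raw value per host is its failed_builds count; normalize by the maximum,
    # then apply the negated multiplier so more failures give a lower weight.
    raw = list(failed_builds_per_host)
    max_raw = max(raw) or 1
    return [-multiplier * (value / float(max_raw)) for value in raw]


print(build_failure_weights([0, 1, 10, 100]))
# [-0.0, -10.0, -100.0, -1000.0]: the host with the most failures sorts last
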
@@ -257,6 +257,9 @@ class HostState(object):
        self.ram_allocation_ratio = compute.ram_allocation_ratio
        self.disk_allocation_ratio = compute.disk_allocation_ratio

        # update failed_builds counter reported by the compute
        self.failed_builds = int(self.stats.get('failed_builds', 0))

    def consume_from_request(self, spec_obj):
        """Incrementally update host state from a RequestSpec object."""

@@ -76,6 +76,9 @@ class IronicNodeState(host_manager.HostState):
        self.ram_allocation_ratio = compute.ram_allocation_ratio
        self.disk_allocation_ratio = compute.disk_allocation_ratio

        # update failed_builds counter reported by the compute
        self.failed_builds = int(self.stats.get('failed_builds', 0))

        self.updated = compute.updated_at

    def _locked_consume_from_request(self, spec_obj):

@@ -0,0 +1,33 @@
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
BuildFailure Weigher. Weigh hosts by the number of recent failed boot attempts.

"""

import nova.conf
from nova.scheduler import weights

CONF = nova.conf.CONF


class BuildFailureWeigher(weights.BaseHostWeigher):
    def weight_multiplier(self):
        """Override the weight multiplier. Note this is negated."""
        return -1 * CONF.filter_scheduler.build_failure_weight_multiplier

    def _weigh_object(self, host_state, weight_properties):
        """Higher weights win. Our multiplier is negative, so reduce our
        weight by number of failed builds.
        """
        return host_state.failed_builds

@@ -324,6 +324,11 @@ class TestCase(testtools.TestCase):

        self.useFixture(nova_fixtures.ForbidNewLegacyNotificationFixture())

        # FIXME(danms): Disable this for all tests by default to avoid breaking
        # any that depend on default/previous ordering
        self.flags(build_failure_weight_multiplier=0.0,
                   group='filter_scheduler')

    def _setup_cells(self):
        """Setup a normal cellsv2 environment.

@@ -58,6 +58,7 @@ class ServersTestBase(integrated_helpers._IntegratedTestBase):
    _min_count_parameter = 'min_count'

    def setUp(self):
        self.computes = {}
        super(ServersTestBase, self).setUp()
        # The network service is called as part of server creates but no
        # networks have been populated in the db, so stub the methods.

@@ -118,6 +119,30 @@ class ServersTest(ServersTestBase):
        for server in servers:
            LOG.debug("server: %s", server)

    def _get_node_build_failures(self):
        ctxt = context.get_admin_context()
        computes = objects.ComputeNodeList.get_all(ctxt)
        return {
            node.hypervisor_hostname: int(node.stats.get('failed_builds', 0))
            for node in computes}

    def _run_periodics(self):
        """Run the update_available_resource task on every compute manager

        This runs periodics on the computes in an undefined order; some child
        classes redefine this function to force a specific order.
        """

        if self.compute.host not in self.computes:
            self.computes[self.compute.host] = self.compute

        ctx = context.get_admin_context()
        for compute in self.computes.values():
            LOG.info('Running periodic for compute (%s)',
                     compute.manager.host)
            compute.manager.update_available_resource(ctx)
        LOG.info('Finished with periodics')

    def test_create_server_with_error(self):
        # Create a server which will enter error state.

@@ -139,6 +164,12 @@ class ServersTest(ServersTestBase):
        self.assertEqual('ERROR', found_server['status'])
        self._delete_server(created_server_id)

        # We should have no (persisted) build failures until we update
        # resources, after which we should have one
        self.assertEqual([0], list(self._get_node_build_failures().values()))
        self._run_periodics()
        self.assertEqual([1], list(self._get_node_build_failures().values()))

    def test_create_and_delete_server(self):
        # Creates and deletes a server.

@@ -4761,24 +4761,19 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                                          nil_out_host_and_node=True)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_build_failures_disable_service(self, mock_service, mock_dbari):
    @mock.patch('nova.compute.stats.Stats.build_failed')
    def test_build_failures_reported(self, mock_failed, mock_dbari):
        mock_dbari.return_value = build_results.FAILED
        instance = objects.Instance(uuid=uuids.instance)
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertTrue(service.disabled)
        self.assertEqual('Auto-disabled due to 10 build failures',
                         service.disabled_reason)
        service.save.assert_called_once_with()
        self.assertEqual(0, self.compute._failed_builds)

        self.assertEqual(10, mock_failed.call_count)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_build_failures_not_disable_service(self, mock_service,
                                                mock_dbari):
    @mock.patch('nova.compute.stats.Stats.build_failed')
    def test_build_failures_not_reported(self, mock_failed, mock_dbari):
        self.flags(consecutive_build_service_disable_threshold=0,
                   group='compute')
        mock_dbari.return_value = build_results.FAILED

@@ -4786,13 +4781,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertFalse(service.save.called)
        self.assertEqual(10, self.compute._failed_builds)

        mock_failed.assert_not_called()

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_transient_build_failures_no_disable_service(self, mock_service,
    @mock.patch.object(manager.ComputeManager, '_build_failed')
    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
    def test_transient_build_failures_no_report(self, mock_succeeded,
                                                mock_failed,
                                                mock_dbari):
        results = [build_results.FAILED,
                   build_results.ACTIVE,

@@ -4809,31 +4805,34 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertFalse(service.save.called)
        self.assertEqual(0, self.compute._failed_builds)

        self.assertEqual(2, mock_failed.call_count)
        self.assertEqual(8, mock_succeeded.call_count)

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
    @mock.patch.object(manager.ComputeManager, '_build_failed')
    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
    def test_build_reschedules_reported(self, mock_succeeded,
                                        mock_failed,
                                        mock_dbari):
        mock_dbari.return_value = build_results.RESCHEDULED
        instance = objects.Instance(uuid=uuids.instance)
        for i in range(0, 10):
            self.compute.build_and_run_instance(None, instance, None,
                                                None, None)
        service = mock_service.return_value
        self.assertTrue(service.disabled)
        self.assertEqual('Auto-disabled due to 10 build failures',
                         service.disabled_reason)
        service.save.assert_called_once_with()
        self.assertEqual(0, self.compute._failed_builds)

        self.assertEqual(10, mock_failed.call_count)
        mock_succeeded.assert_not_called()

    @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
    @mock.patch('nova.objects.Service.get_by_compute_host')
    @mock.patch('nova.exception_wrapper._emit_exception_notification')
    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
    def test_build_exceptions_disable_service(self, mock_if, mock_notify,
                                              mock_service, mock_dbari):
    @mock.patch.object(manager.ComputeManager, '_build_failed')
    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
    def test_build_exceptions_reported(self, mock_succeeded,
                                       mock_failed,
                                       mock_if, mock_notify,
                                       mock_dbari):
        mock_dbari.side_effect = test.TestingException()
        instance = objects.Instance(uuid=uuids.instance,
                                    task_state=None)

@@ -4842,12 +4841,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                              self.compute.build_and_run_instance,
                              None, instance, None,
                              None, None)
        service = mock_service.return_value
        self.assertTrue(service.disabled)
        self.assertEqual('Auto-disabled due to 10 build failures',
                         service.disabled_reason)
        service.save.assert_called_once_with()
        self.assertEqual(0, self.compute._failed_builds)

        self.assertEqual(10, mock_failed.call_count)
        mock_succeeded.assert_not_called()

    @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
    @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')

@@ -1148,7 +1148,7 @@ class TestInitComputeNode(BaseTestCase):
            ram_allocation_ratio=1.0,
            cpu_allocation_ratio=1.0,
            disk_allocation_ratio=1.0,
            stats={},
            stats={'failed_builds': 0},
            pci_device_pools=objects.PciDevicePoolList(objects=[]),
            uuid=uuids.compute_node_uuid
        )

@@ -238,3 +238,19 @@ class StatsTestCase(test.NoDBTestCase):

        self.assertEqual(0, len(self.stats))
        self.assertEqual(0, len(self.stats.states))

    def test_build_failed_succeded(self):
        self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
        self.stats.build_failed()
        self.assertEqual(1, self.stats['failed_builds'])
        self.stats.build_failed()
        self.assertEqual(2, self.stats['failed_builds'])
        self.stats.build_succeeded()
        self.assertEqual(0, self.stats['failed_builds'])
        self.stats.build_succeeded()
        self.assertEqual(0, self.stats['failed_builds'])

    def test_build_succeeded_first(self):
        self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
        self.stats.build_succeeded()
        self.assertEqual(0, self.stats['failed_builds'])

@@ -163,6 +163,7 @@ class CachingSchedulerTestCase(test_scheduler.SchedulerTestCase):
        host_state.ram_allocation_ratio = 1.5
        host_state.disk_allocation_ratio = 1.0
        host_state.metrics = objects.MonitorMetricList(objects=[])
        host_state.failed_builds = 0
        return host_state

    @mock.patch('nova.db.instance_extra_get_by_instance_uuid',

@@ -0,0 +1,57 @@
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
Tests For Scheduler build failure weights.
"""

from nova.scheduler import weights
from nova.scheduler.weights import compute
from nova import test
from nova.tests.unit.scheduler import fakes


class BuildFailureWeigherTestCase(test.NoDBTestCase):
    def setUp(self):
        super(BuildFailureWeigherTestCase, self).setUp()
        self.weight_handler = weights.HostWeightHandler()
        self.weighers = [compute.BuildFailureWeigher()]

    def _get_weighed_host(self, hosts):
        return self.weight_handler.get_weighed_objects(self.weighers,
                                                       hosts, {})

    def _get_all_hosts(self):
        host_values = [
            ('host1', 'node1', {'failed_builds': 0}),
            ('host2', 'node2', {'failed_builds': 1}),
            ('host3', 'node3', {'failed_builds': 10}),
            ('host4', 'node4', {'failed_builds': 100})
        ]
        return [fakes.FakeHostState(host, node, values)
                for host, node, values in host_values]

    def test_build_failure_weigher_disabled(self):
        self.flags(build_failure_weight_multiplier=0.0,
                   group='filter_scheduler')
        hosts = self._get_all_hosts()
        weighed_hosts = self._get_weighed_host(hosts)
        self.assertTrue(all([wh.weight == 0.0
                             for wh in weighed_hosts]))

    def test_build_failure_weigher_scaled(self):
        self.flags(build_failure_weight_multiplier=1000.0,
                   group='filter_scheduler')
        hosts = self._get_all_hosts()
        weighed_hosts = self._get_weighed_host(hosts)
        self.assertEqual([0, -10, -100, -1000],
                         [wh.weight for wh in weighed_hosts])

@@ -0,0 +1,23 @@
---
security:
  - |
    To mitigate potential issues with compute nodes disabling
    themselves in response to failures that were either non-fatal or
    user-generated, the consecutive build failure counter
    functionality in the compute service has been changed to advise
    the scheduler of the count instead of self-disabling the service
    upon exceeding the threshold. The
    ``[compute]/consecutive_build_service_disable_threshold``
    configuration option still controls whether the count is tracked,
    but the action taken on this value has been changed to a scheduler
    weigher. This allows the scheduler to be configured to weigh hosts
    with consecutive failures lower than other hosts, configured by the
    ``[filter_scheduler]/build_failure_weight_multiplier`` option. If
    the compute threshold option is nonzero, computes will report their
    failure count for the scheduler to consider. If the threshold
    value is zero, then computes will not report this value
    and the scheduler will assume the number of failures for
    non-reporting compute nodes to be zero. By default, the scheduler
    weigher is enabled and configured with a very large multiplier to
    ensure that hosts with consecutive failures are scored low by
    default.
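A minimal standalone sketch (not Nova code; the second weigher and its values are illustrative) of why the very large default multiplier mentioned above is effective, assuming the scheduler sums each weigher's multiplier-scaled, normalized score per host:

BUILD_FAILURE_MULTIPLIER = 1000000.0  # default of build_failure_weight_multiplier


def total_weight(normalized_scores, multipliers):
    # Sum multiplier * normalized score across all enabled weighers.
    return sum(multipliers[name] * normalized_scores[name]
               for name in normalized_scores)


# The BuildFailureWeigher negates its multiplier internally, so the host with
# the most recent failures receives the full negative contribution.
multipliers = {'ram': 1.0, 'build_failure': -BUILD_FAILURE_MULTIPLIER}
healthy = {'ram': 0.8, 'build_failure': 0.0}
failing = {'ram': 1.0, 'build_failure': 1.0}
print(total_weight(healthy, multipliers))  # 0.8
print(total_weight(failing, multipliers))  # -999999.0, scheduled last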