Merge "Change consecutive build failure limit to a weigher" into stable/pike

Zuul 2018-06-25 06:23:17 +00:00 committed by Gerrit Code Review
commit 8a08cf332e
17 changed files with 273 additions and 73 deletions

View File

@ -860,6 +860,11 @@ Hosts and cells are weighted based on the following options in the
- Multiplier used for weighing hosts for group soft-anti-affinity. Only a
positive value is meaningful. Negative means that the behavior will
change to the opposite, which is soft-affinity.
* - [filter_scheduler]
- ``build_failure_weight_multiplier``
- Multiplier used for weighing hosts which have recent build failures. A
positive value increases the significance of build failures reported by
the host recently, making them less likely to be chosen.
* - [metrics]
- ``weight_multiplier``
- Multiplier for weighting meters. Use a floating-point value.

View File

@ -532,7 +532,6 @@ class ComputeManager(manager.Manager):
CONF.max_concurrent_live_migrations)
else:
self._live_migration_semaphore = compute_utils.UnlimitedSemaphore()
self._failed_builds = 0
super(ComputeManager, self).__init__(service_name="compute",
*args, **kwargs)
@ -1710,29 +1709,15 @@ class ComputeManager(manager.Manager):
return block_device_info
def _build_failed(self):
self._failed_builds += 1
limit = CONF.compute.consecutive_build_service_disable_threshold
if limit and self._failed_builds >= limit:
# NOTE(danms): If we're doing a bunch of parallel builds,
# it is possible (although not likely) that we have already
# failed N-1 builds before this and we race with a successful
# build and disable ourselves here when we might've otherwise
# not.
LOG.error('Disabling service due to %(fails)i '
'consecutive build failures',
{'fails': self._failed_builds})
ctx = nova.context.get_admin_context()
service = objects.Service.get_by_compute_host(ctx, CONF.host)
service.disabled = True
service.disabled_reason = (
'Auto-disabled due to %i build failures' % self._failed_builds)
service.save()
# NOTE(danms): Reset our counter now so that when the admin
# re-enables us we can start fresh
self._failed_builds = 0
elif self._failed_builds > 1:
LOG.warning('%(fails)i consecutive build failures',
{'fails': self._failed_builds})
if CONF.compute.consecutive_build_service_disable_threshold:
rt = self._get_resource_tracker()
# NOTE(danms): Update our counter, but wait for the next
# update_available_resource() periodic to flush it to the DB
rt.stats.build_failed()
def _build_succeeded(self):
rt = self._get_resource_tracker()
rt.stats.build_succeeded()
@wrap_exception()
@reverts_task_state
@ -1783,7 +1768,7 @@ class ComputeManager(manager.Manager):
self._build_failed()
else:
self._failed_builds = 0
self._build_succeeded()
# NOTE(danms): We spawn here to return the RPC worker thread back to
# the pool. Since what follows could take a really long time, we don't

View File

@ -628,7 +628,13 @@ class ResourceTracker(object):
def _copy_resources(self, compute_node, resources):
"""Copy resource values to supplied compute_node."""
# purge old stats and init with anything passed in by the driver
# NOTE(danms): Preserve 'failed_builds' across the stats clearing,
# as that is not part of resources
# TODO(danms): Stop doing this when we get a column to store this
# directly
prev_failed_builds = self.stats.get('failed_builds', 0)
self.stats.clear()
self.stats['failed_builds'] = prev_failed_builds
self.stats.digest_stats(resources.get('stats'))
compute_node.stats = copy.deepcopy(self.stats)
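
To make the NOTE above concrete, here is a minimal self-contained sketch (illustrative only, not nova code) of why the counter has to be carried across the clear: the stats dict is rebuilt from driver-supplied resources on every periodic pass, while 'failed_builds' is tracked only locally by the resource tracker.

# Illustrative sketch, not nova code: preserve a locally tracked key
# across a clear-and-reseed cycle, mirroring _copy_resources() above.
stats = {'failed_builds': 3, 'num_instances': 2}
prev_failed_builds = stats.get('failed_builds', 0)
stats.clear()                          # wipes everything, including the counter
stats.update({'num_instances': 2})     # stand-in for digest_stats(resources)
stats['failed_builds'] = prev_failed_builds
print(stats)   # {'num_instances': 2, 'failed_builds': 3}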

View File

@ -138,3 +138,11 @@ class Stats(dict):
os_type=os_type, project_id=project_id)
return (vm_state, task_state, os_type, project_id)
def build_failed(self):
self['failed_builds'] = self.get('failed_builds', 0) + 1
def build_succeeded(self):
# FIXME(danms): Make this more graceful, either by time-based aging or
# a fixed decline upon success
self['failed_builds'] = 0

View File

@ -638,20 +638,20 @@ compute_group_opts = [
cfg.IntOpt('consecutive_build_service_disable_threshold',
default=10,
help="""
Number of consecutive failed builds that result in disabling a compute service.
Enables reporting of build failures to the scheduler.
This option will cause nova-compute to set itself to a disabled state
if a certain number of consecutive build failures occur. This will
prevent the scheduler from continuing to send builds to a compute node that is
consistently failing. Note that all failures qualify and count towards this
score, including reschedules that may have been due to racy scheduler behavior.
Since the failures must be consecutive, it is unlikely that occasional expected
reschedules will actually disable a compute node.
Any nonzero value will enable sending build failure statistics to the
scheduler for use by the BuildFailureWeigher.
Possible values:
* Any positive integer representing a build failure count.
* Zero to never auto-disable.
* Any positive integer enables reporting build failures.
* Zero to disable reporting build failures.
Related options:
* [filter_scheduler]/build_failure_weight_multiplier
"""),
]

View File

@ -477,6 +477,34 @@ Possible values:
for hosts with group soft anti-affinity. Only a positive value is
meaningful, as negative values would make this behave as a soft affinity
weigher.
"""),
cfg.FloatOpt(
"build_failure_weight_multiplier",
default=1000000.0,
help="""
Multiplier used for weighing hosts that have had recent build failures.
This option determines how much weight is placed on a compute node with
recent build failures. Build failures may indicate a failing, misconfigured,
or otherwise ailing compute node, and avoiding it during scheduling may be
beneficial. The weight is inversely proportional to the number of recent
build failures the compute node has experienced. This value should be
set to some high value to offset weight given by other enabled weighers
due to available resources. To disable weighing compute hosts by the
number of recent failures, set this to zero.
This option is only used by the FilterScheduler and its subclasses; if you use
a different scheduler, this option has no effect.
Possible values:
* An integer or float value, where the value corresponds to the multiplier
ratio for this weigher.
Related options:
* [compute]/consecutive_build_service_disable_threshold - Must be nonzero
for a compute to report data considered by this weigher.
"""),
# TODO(mikal): replace this option with something involving host aggregates
cfg.ListOpt("isolated_images",

View File

@ -257,6 +257,9 @@ class HostState(object):
self.ram_allocation_ratio = compute.ram_allocation_ratio
self.disk_allocation_ratio = compute.disk_allocation_ratio
# update failed_builds counter reported by the compute
self.failed_builds = int(self.stats.get('failed_builds', 0))
def consume_from_request(self, spec_obj):
"""Incrementally update host state from a RequestSpec object."""

View File

@ -76,6 +76,9 @@ class IronicNodeState(host_manager.HostState):
self.ram_allocation_ratio = compute.ram_allocation_ratio
self.disk_allocation_ratio = compute.disk_allocation_ratio
# update failed_builds counter reported by the compute
self.failed_builds = int(self.stats.get('failed_builds', 0))
self.updated = compute.updated_at
def _locked_consume_from_request(self, spec_obj):

View File

@ -0,0 +1,33 @@
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
BuildFailure Weigher. Weigh hosts by the number of recent failed boot attempts.
"""
import nova.conf
from nova.scheduler import weights
CONF = nova.conf.CONF
class BuildFailureWeigher(weights.BaseHostWeigher):
def weight_multiplier(self):
"""Override the weight multiplier. Note this is negated."""
return -1 * CONF.filter_scheduler.build_failure_weight_multiplier
def _weigh_object(self, host_state, weight_properties):
"""Higher weights win. Our multiplier is negative, so reduce our
weight by number of failed builds.
"""
return host_state.failed_builds
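
As a rough self-contained sketch (not nova code) of how these two methods combine: the weight handler normalizes each weigher's raw values before applying the multiplier, so assuming nova's usual min-max normalization, the penalty scales with the worst failure count among the candidate hosts. With a multiplier of 1000 and failure counts of 0, 1, 10 and 100, this reproduces the values asserted by the weigher unit test later in this change.

# Rough sketch, not nova code: min-max normalization of the raw
# _weigh_object() values followed by the negated multiplier.
failed_builds = [0, 1, 10, 100]        # raw values from _weigh_object()
multiplier = 1000.0                    # build_failure_weight_multiplier
lo, hi = min(failed_builds), max(failed_builds)
normalized = [(v - lo) / (hi - lo) for v in failed_builds]
weights = [-1 * multiplier * n for n in normalized]
print(weights)   # [-0.0, -10.0, -100.0, -1000.0] (equal to 0, -10, -100, -1000)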

View File

@ -324,6 +324,11 @@ class TestCase(testtools.TestCase):
self.useFixture(nova_fixtures.ForbidNewLegacyNotificationFixture())
# FIXME(danms): Disable this for all tests by default to avoid breaking
# any that depend on default/previous ordering
self.flags(build_failure_weight_multiplier=0.0,
group='filter_scheduler')
def _setup_cells(self):
"""Setup a normal cellsv2 environment.

View File

@ -58,6 +58,7 @@ class ServersTestBase(integrated_helpers._IntegratedTestBase):
_min_count_parameter = 'min_count'
def setUp(self):
self.computes = {}
super(ServersTestBase, self).setUp()
# The network service is called as part of server creates but no
# networks have been populated in the db, so stub the methods.
@ -118,6 +119,30 @@ class ServersTest(ServersTestBase):
for server in servers:
LOG.debug("server: %s", server)
def _get_node_build_failures(self):
ctxt = context.get_admin_context()
computes = objects.ComputeNodeList.get_all(ctxt)
return {
node.hypervisor_hostname: int(node.stats.get('failed_builds', 0))
for node in computes}
def _run_periodics(self):
"""Run the update_available_resource task on every compute manager
This runs periodics on the computes in an undefined order; some child
classes redefine this function to force a specific order.
"""
if self.compute.host not in self.computes:
self.computes[self.compute.host] = self.compute
ctx = context.get_admin_context()
for compute in self.computes.values():
LOG.info('Running periodic for compute (%s)',
compute.manager.host)
compute.manager.update_available_resource(ctx)
LOG.info('Finished with periodics')
def test_create_server_with_error(self):
# Create a server which will enter error state.
@ -139,6 +164,12 @@ class ServersTest(ServersTestBase):
self.assertEqual('ERROR', found_server['status'])
self._delete_server(created_server_id)
# We should have no (persisted) build failures until we update
# resources, after which we should have one
self.assertEqual([0], list(self._get_node_build_failures().values()))
self._run_periodics()
self.assertEqual([1], list(self._get_node_build_failures().values()))
def test_create_and_delete_server(self):
# Creates and deletes a server.

View File

@ -4761,24 +4761,19 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
nil_out_host_and_node=True)
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
@mock.patch('nova.objects.Service.get_by_compute_host')
def test_build_failures_disable_service(self, mock_service, mock_dbari):
@mock.patch('nova.compute.stats.Stats.build_failed')
def test_build_failures_reported(self, mock_failed, mock_dbari):
mock_dbari.return_value = build_results.FAILED
instance = objects.Instance(uuid=uuids.instance)
for i in range(0, 10):
self.compute.build_and_run_instance(None, instance, None,
None, None)
service = mock_service.return_value
self.assertTrue(service.disabled)
self.assertEqual('Auto-disabled due to 10 build failures',
service.disabled_reason)
service.save.assert_called_once_with()
self.assertEqual(0, self.compute._failed_builds)
self.assertEqual(10, mock_failed.call_count)
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
@mock.patch('nova.objects.Service.get_by_compute_host')
def test_build_failures_not_disable_service(self, mock_service,
mock_dbari):
@mock.patch('nova.compute.stats.Stats.build_failed')
def test_build_failures_not_reported(self, mock_failed, mock_dbari):
self.flags(consecutive_build_service_disable_threshold=0,
group='compute')
mock_dbari.return_value = build_results.FAILED
@ -4786,14 +4781,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
for i in range(0, 10):
self.compute.build_and_run_instance(None, instance, None,
None, None)
service = mock_service.return_value
self.assertFalse(service.save.called)
self.assertEqual(10, self.compute._failed_builds)
mock_failed.assert_not_called()
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
@mock.patch('nova.objects.Service.get_by_compute_host')
def test_transient_build_failures_no_disable_service(self, mock_service,
mock_dbari):
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
def test_transient_build_failures_no_report(self, mock_succeeded,
mock_failed,
mock_dbari):
results = [build_results.FAILED,
build_results.ACTIVE,
build_results.RESCHEDULED]
@ -4809,31 +4805,34 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
for i in range(0, 10):
self.compute.build_and_run_instance(None, instance, None,
None, None)
service = mock_service.return_value
self.assertFalse(service.save.called)
self.assertEqual(0, self.compute._failed_builds)
self.assertEqual(2, mock_failed.call_count)
self.assertEqual(8, mock_succeeded.call_count)
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
@mock.patch('nova.objects.Service.get_by_compute_host')
def test_build_reschedules_disable_service(self, mock_service, mock_dbari):
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
def test_build_reschedules_reported(self, mock_succeeded,
mock_failed,
mock_dbari):
mock_dbari.return_value = build_results.RESCHEDULED
instance = objects.Instance(uuid=uuids.instance)
for i in range(0, 10):
self.compute.build_and_run_instance(None, instance, None,
None, None)
service = mock_service.return_value
self.assertTrue(service.disabled)
self.assertEqual('Auto-disabled due to 10 build failures',
service.disabled_reason)
service.save.assert_called_once_with()
self.assertEqual(0, self.compute._failed_builds)
self.assertEqual(10, mock_failed.call_count)
mock_succeeded.assert_not_called()
@mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
@mock.patch('nova.objects.Service.get_by_compute_host')
@mock.patch('nova.exception_wrapper._emit_exception_notification')
@mock.patch('nova.compute.utils.add_instance_fault_from_exc')
def test_build_exceptions_disable_service(self, mock_if, mock_notify,
mock_service, mock_dbari):
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
def test_build_exceptions_reported(self, mock_succeeded,
mock_failed,
mock_if, mock_notify,
mock_dbari):
mock_dbari.side_effect = test.TestingException()
instance = objects.Instance(uuid=uuids.instance,
task_state=None)
@ -4842,12 +4841,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.compute.build_and_run_instance,
None, instance, None,
None, None)
service = mock_service.return_value
self.assertTrue(service.disabled)
self.assertEqual('Auto-disabled due to 10 build failures',
service.disabled_reason)
service.save.assert_called_once_with()
self.assertEqual(0, self.compute._failed_builds)
self.assertEqual(10, mock_failed.call_count)
mock_succeeded.assert_not_called()
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')

View File

@ -1148,7 +1148,7 @@ class TestInitComputeNode(BaseTestCase):
ram_allocation_ratio=1.0,
cpu_allocation_ratio=1.0,
disk_allocation_ratio=1.0,
stats={},
stats={'failed_builds': 0},
pci_device_pools=objects.PciDevicePoolList(objects=[]),
uuid=uuids.compute_node_uuid
)

View File

@ -238,3 +238,19 @@ class StatsTestCase(test.NoDBTestCase):
self.assertEqual(0, len(self.stats))
self.assertEqual(0, len(self.stats.states))
def test_build_failed_succeded(self):
self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
self.stats.build_failed()
self.assertEqual(1, self.stats['failed_builds'])
self.stats.build_failed()
self.assertEqual(2, self.stats['failed_builds'])
self.stats.build_succeeded()
self.assertEqual(0, self.stats['failed_builds'])
self.stats.build_succeeded()
self.assertEqual(0, self.stats['failed_builds'])
def test_build_succeeded_first(self):
self.assertEqual('not-set', self.stats.get('failed_builds', 'not-set'))
self.stats.build_succeeded()
self.assertEqual(0, self.stats['failed_builds'])

View File

@ -163,6 +163,7 @@ class CachingSchedulerTestCase(test_scheduler.SchedulerTestCase):
host_state.ram_allocation_ratio = 1.5
host_state.disk_allocation_ratio = 1.0
host_state.metrics = objects.MonitorMetricList(objects=[])
host_state.failed_builds = 0
return host_state
@mock.patch('nova.db.instance_extra_get_by_instance_uuid',

View File

@ -0,0 +1,57 @@
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
Tests For Scheduler build failure weights.
"""
from nova.scheduler import weights
from nova.scheduler.weights import compute
from nova import test
from nova.tests.unit.scheduler import fakes
class BuildFailureWeigherTestCase(test.NoDBTestCase):
def setUp(self):
super(BuildFailureWeigherTestCase, self).setUp()
self.weight_handler = weights.HostWeightHandler()
self.weighers = [compute.BuildFailureWeigher()]
def _get_weighed_host(self, hosts):
return self.weight_handler.get_weighed_objects(self.weighers,
hosts, {})
def _get_all_hosts(self):
host_values = [
('host1', 'node1', {'failed_builds': 0}),
('host2', 'node2', {'failed_builds': 1}),
('host3', 'node3', {'failed_builds': 10}),
('host4', 'node4', {'failed_builds': 100})
]
return [fakes.FakeHostState(host, node, values)
for host, node, values in host_values]
def test_build_failure_weigher_disabled(self):
self.flags(build_failure_weight_multiplier=0.0,
group='filter_scheduler')
hosts = self._get_all_hosts()
weighed_hosts = self._get_weighed_host(hosts)
self.assertTrue(all([wh.weight == 0.0
for wh in weighed_hosts]))
def test_build_failure_weigher_scaled(self):
self.flags(build_failure_weight_multiplier=1000.0,
group='filter_scheduler')
hosts = self._get_all_hosts()
weighed_hosts = self._get_weighed_host(hosts)
self.assertEqual([0, -10, -100, -1000],
[wh.weight for wh in weighed_hosts])

View File

@ -0,0 +1,23 @@
---
security:
- |
To mitigate potential issues with compute nodes disabling
themselves in response to failures that were either non-fatal or
user-generated, the consecutive build failure counter
functionality in the compute service has been changed to advise
the scheduler of the count instead of self-disabling the service
upon exceeding the threshold. The
``[compute]/consecutive_build_service_disable_threshold``
configuration option still controls whether the count is tracked,
but the action taken on this value has been changed to a scheduler
weigher. This allows the scheduler to be configured to weigh hosts
with consecutive failures lower than other hosts, configured by the
``[filter_scheduler]/build_failure_weight_multiplier`` option. If
the compute threshold option is nonzero, computes will report their
failure count for the scheduler to consider. If the threshold
value is zero, then computes will not report this value
and the scheduler will assume the number of failures for
non-reporting compute nodes to be zero. By default, the scheduler
weigher is enabled and configured with a very large multiplier to
ensure that hosts with consecutive failures are scored low by
default.
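
As a hedged illustration of how the two options interact, an operator's nova.conf might look like the following (values shown are the defaults from this change, not recommendations):

[compute]
# Any nonzero value keeps build-failure reporting to the scheduler enabled.
consecutive_build_service_disable_threshold = 10

[filter_scheduler]
# Lower this to soften the penalty, or set it to 0.0 to ignore
# reported build failures entirely.
build_failure_weight_multiplier = 1000000.0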