From 5d86aa7edb60ec35fa43c670ae1f0513dc1e9ad5 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Thu, 17 May 2018 07:28:59 -0700
Subject: [PATCH] Fix interpretation of max_attempts for scheduling alternates

Since max_attempts is not max_retries, we should look for max_attempts-1
alternate hosts. This change makes the num_alts variable actually reflect
the number of alternates we are looking for, and doesn't actually change
the logic at all. This also adds functional tests to verify max_attempts
behavior with 1 (no retries) and 2 (one retry), just to make sure it is
doing what we expect.

Further, we optimize _get_alternate_hosts() to only re-filter/weigh the
host list if we're looking for alternates for multiple instances. Now
that it's clear that num_alts is only nonzero if we are going to look
at the host list again to find those alternates, it can be more clearly
(and accurately) used to optimize out that extra step if we are not
getting alternates.

Change-Id: I16ba29385dd2db5467829e6c17fc395096b5cfd3
---
 nova/scheduler/filter_scheduler.py            | 10 ++---
 nova/tests/functional/test_servers.py         | 40 +++++++++++++++++++
 .../unit/scheduler/test_filter_scheduler.py   |  6 ++-
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/nova/scheduler/filter_scheduler.py b/nova/scheduler/filter_scheduler.py
index a77b5a0bf3c0..5de9ed3aea55 100644
--- a/nova/scheduler/filter_scheduler.py
+++ b/nova/scheduler/filter_scheduler.py
@@ -158,7 +158,8 @@ class FilterScheduler(driver.Scheduler):
         # is based on CONF.scheduler.max_attempts; note that if there are not
         # enough filtered hosts to provide the full number of alternates, the
         # list of hosts may be shorter than this amount.
-        num_alts = CONF.scheduler.max_attempts if return_alternates else 0
+        num_alts = (CONF.scheduler.max_attempts - 1
+                    if return_alternates else 0)
 
         if (instance_uuids is None or
                 not self.USES_ALLOCATION_CANDIDATES or
@@ -332,9 +333,8 @@ class FilterScheduler(driver.Scheduler):
                              num_alts, alloc_reqs_by_rp_uuid=None,
                              allocation_request_version=None):
         # We only need to filter/weigh the hosts again if we're dealing with
-        # more than one instance since the single selected host will get
-        # filtered out of the list of alternates below.
-        if index > 0:
+        # more than one instance and are going to be picking alternates.
+        if index > 0 and num_alts > 0:
             # The selected_hosts have all had resources 'claimed' via
             # _consume_selected_host, so we need to filter/weigh and sort the
             # hosts again to get an accurate count for alternates.
@@ -364,7 +364,7 @@ class FilterScheduler(driver.Scheduler):
             # will have had its resources reduced and will have a much lower
             # chance of being able to fit another instance on it.
             for host in hosts:
-                if len(selected_plus_alts) >= num_alts:
+                if len(selected_plus_alts) >= num_alts + 1:
                     break
                 if host.cell_uuid == cell_uuid and host not in selected_hosts:
                     if alloc_reqs_by_rp_uuid is not None:
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index a9a7887fce93..e292408b54f6 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -154,6 +154,46 @@ class ServersTest(ServersTestBase):
         self.assertEqual('ERROR', found_server['status'])
         self._delete_server(created_server_id)
 
+    def _test_create_server_with_error_with_retries(self):
+        # Create a server which will enter error state.
+
+        fake.set_nodes(['host2'])
+        self.addCleanup(fake.restore_nodes)
+        self.flags(host='host2')
+        self.compute2 = self.start_service('compute', host='host2')
+
+        fails = []
+
+        def throw_error(*args, **kwargs):
+            fails.append('one')
+            raise test.TestingException('Please retry me')
+
+        self.stub_out('nova.virt.fake.FakeDriver.spawn', throw_error)
+
+        server = self._build_minimal_create_server_request()
+        created_server = self.api.post_server({"server": server})
+        created_server_id = created_server['id']
+
+        found_server = self.api.get_server(created_server_id)
+        self.assertEqual(created_server_id, found_server['id'])
+
+        found_server = self._wait_for_state_change(found_server, 'BUILD')
+
+        self.assertEqual('ERROR', found_server['status'])
+        self._delete_server(created_server_id)
+
+        return len(fails)
+
+    def test_create_server_with_error_with_retries(self):
+        self.flags(max_attempts=2, group='scheduler')
+        fails = self._test_create_server_with_error_with_retries()
+        self.assertEqual(2, fails)
+
+    def test_create_server_with_error_with_no_retries(self):
+        self.flags(max_attempts=1, group='scheduler')
+        fails = self._test_create_server_with_error_with_retries()
+        self.assertEqual(1, fails)
+
     def test_create_and_delete_server(self):
         # Creates and deletes a server.
 
diff --git a/nova/tests/unit/scheduler/test_filter_scheduler.py b/nova/tests/unit/scheduler/test_filter_scheduler.py
index 7c3451b363fb..4cbf75caa2fe 100644
--- a/nova/tests/unit/scheduler/test_filter_scheduler.py
+++ b/nova/tests/unit/scheduler/test_filter_scheduler.py
@@ -845,8 +845,10 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
                 instance_uuids, alloc_reqs, None, return_alternates=True)
         self.assertEqual(num_instances, len(dests))
         # Filtering and weighing hosts should be called num_instances + 1 times
-        # unless num_instances == 1.
-        self.assertEqual(num_instances + 1 if num_instances > 1 else 1,
+        # unless we're not getting alternates, and then just num_instances
+        self.assertEqual(num_instances + 1
+                         if num_alternates > 0 and num_instances > 1
+                         else num_instances,
                          mock_sorted.call_count,
                          'Unexpected number of calls to filter hosts for %s '
                          'instances.' % num_instances)