Fix interpretation of max_attempts for scheduling alternates

Since max_attempts is not max_retries, we should look for max_attempts-1
alternate hosts. This change makes the num_alts variable actually reflect
the number of alternates we are looking for, and doesn't actually change
the logic at all. This also adds functional tests to verify max_attempts
behavior with 1 (no retries) and 2 (one retry), just to make sure it is
doing what we expect.

Further, we optimize _get_alternate_hosts() to only re-filter/weigh the
host list if we're looking for alternates for multiple instances. Now
that it's clear that num_alts is only nonzero if we are going to look
at the host list again to find those alternates, it can be more clearly
(and accurately) used to optimize out that extra step if we are not
getting alternates.

Closes-Bug: #1775625
Change-Id: I16ba29385dd2db5467829e6c17fc395096b5cfd3
(cherry picked from commit 5d86aa7edb)
This commit is contained in:
Dan Smith 2018-05-17 07:28:59 -07:00
parent c7d87f6691
commit 4e3dd818a3
3 changed files with 49 additions and 7 deletions

View File

@ -158,7 +158,8 @@ class FilterScheduler(driver.Scheduler):
# is based on CONF.scheduler.max_attempts; note that if there are not
# enough filtered hosts to provide the full number of alternates, the
# list of hosts may be shorter than this amount.
num_alts = CONF.scheduler.max_attempts if return_alternates else 0
num_alts = (CONF.scheduler.max_attempts - 1
if return_alternates else 0)
if (instance_uuids is None or
not self.USES_ALLOCATION_CANDIDATES or
@ -332,9 +333,8 @@ class FilterScheduler(driver.Scheduler):
num_alts, alloc_reqs_by_rp_uuid=None,
allocation_request_version=None):
# We only need to filter/weigh the hosts again if we're dealing with
# more than one instance since the single selected host will get
# filtered out of the list of alternates below.
if index > 0:
# more than one instance and are going to be picking alternates.
if index > 0 and num_alts > 0:
# The selected_hosts have all had resources 'claimed' via
# _consume_selected_host, so we need to filter/weigh and sort the
# hosts again to get an accurate count for alternates.
@ -364,7 +364,7 @@ class FilterScheduler(driver.Scheduler):
# will have had its resources reduced and will have a much lower
# chance of being able to fit another instance on it.
for host in hosts:
if len(selected_plus_alts) >= num_alts:
if len(selected_plus_alts) >= num_alts + 1:
break
if host.cell_uuid == cell_uuid and host not in selected_hosts:
if alloc_reqs_by_rp_uuid is not None:

View File

@ -154,6 +154,46 @@ class ServersTest(ServersTestBase):
self.assertEqual('ERROR', found_server['status'])
self._delete_server(created_server_id)
def _test_create_server_with_error_with_retries(self):
# Create a server which will enter error state.
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
self.flags(host='host2')
self.compute2 = self.start_service('compute', host='host2')
fails = []
def throw_error(*args, **kwargs):
fails.append('one')
raise test.TestingException('Please retry me')
self.stub_out('nova.virt.fake.FakeDriver.spawn', throw_error)
server = self._build_minimal_create_server_request()
created_server = self.api.post_server({"server": server})
created_server_id = created_server['id']
found_server = self.api.get_server(created_server_id)
self.assertEqual(created_server_id, found_server['id'])
found_server = self._wait_for_state_change(found_server, 'BUILD')
self.assertEqual('ERROR', found_server['status'])
self._delete_server(created_server_id)
return len(fails)
def test_create_server_with_error_with_retries(self):
self.flags(max_attempts=2, group='scheduler')
fails = self._test_create_server_with_error_with_retries()
self.assertEqual(2, fails)
def test_create_server_with_error_with_no_retries(self):
self.flags(max_attempts=1, group='scheduler')
fails = self._test_create_server_with_error_with_retries()
self.assertEqual(1, fails)
def test_create_and_delete_server(self):
# Creates and deletes a server.

View File

@ -845,8 +845,10 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase):
instance_uuids, alloc_reqs, None, return_alternates=True)
self.assertEqual(num_instances, len(dests))
# Filtering and weighing hosts should be called num_instances + 1 times
# unless num_instances == 1.
self.assertEqual(num_instances + 1 if num_instances > 1 else 1,
# unless we're not getting alternates, and then just num_instances
self.assertEqual(num_instances + 1
if num_alternates > 0 and num_instances > 1
else num_instances,
mock_sorted.call_count,
'Unexpected number of calls to filter hosts for %s '
'instances.' % num_instances)