Fix an error in _get_host_states when deleting a compute node

_get_host_states returns a generator which closes over seen_nodes, which
is local to the call, and self.host_state_map, which is shared state on
the HostManager. It also modifies self.host_state_map, removing entries
whose compute nodes are no longer present.

If a compute node is deleted while a filter is still evaluating the
generator returned by _get_host_states, the entry in self.host_state_map
will be deleted if _get_host_states is called again. This will cause a
KeyError when the first generator comes to evaluate the entry for the
deleted compute node.

We fix this by modifying the returned generator expression to check
that a host_state_map entry still exists before yielding it. An
existing unit test is modified to exhibit the bug.

Change-Id: Ibb7c43a0abc433f93fc3de71146263e6d5923666
Closes-Bug: #1739323
(cherry picked from commit d72b33b986)
Matthew Booth 2017-12-16 20:27:08 +00:00 committed by Matt Riedemann
parent db243a5f8b
commit b6a1a19295
3 changed files with 20 additions and 7 deletions
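
The race described above is easier to see with the scheduler machinery stripped away. Below is a minimal standalone sketch of the failure mode, using simplified, hypothetical names (host_state_map, get_host_states) rather than the real HostManager: the dict lookup inside the generator expression only runs when the generator is consumed, so a deletion performed by a later call breaks a generator handed out earlier.

# Minimal sketch of the bug pattern (hypothetical names, not nova code).
host_state_map = {'node1': 'state1', 'node2': 'state2'}

def get_host_states(live_nodes):
    # Prune entries whose compute nodes are gone, then hand back a lazy
    # generator over the nodes this call saw.
    seen_nodes = set(live_nodes)
    for node in list(host_state_map):
        if node not in live_nodes:
            del host_state_map[node]
    return (host_state_map[node] for node in seen_nodes)

gen1 = get_host_states(['node1', 'node2'])   # first scheduling request
get_host_states(['node1'])                   # concurrent request: node2 pruned

try:
    list(gen1)   # consuming the first generator now hits the missing key
except KeyError as exc:
    print('KeyError for deleted node:', exc)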

nova/scheduler/filter_scheduler.py

@@ -151,7 +151,7 @@ class FilterScheduler(driver.Scheduler):
# host, we virtually consume resources on it so subsequent
# selections can adjust accordingly.
-# Note: remember, we are using an iterator here. So only
+# Note: remember, we are using a generator-iterator here. So only
# traverse this list once. This can bite you if the hosts
# are being scanned in a filter or weighing function.
hosts = self._get_all_host_states(elevated, spec_obj,
@@ -342,8 +342,8 @@ class FilterScheduler(driver.Scheduler):
# The provider_summaries variable will be an empty dict when the
# Placement API found no providers that match the requested
# constraints, which in turn makes compute_uuids an empty list and
-# get_host_states_by_uuids will return an empty tuple also, which will
-# eventually result in a NoValidHost error.
+# get_host_states_by_uuids will return an empty generator-iterator
+# also, which will eventually result in a NoValidHost error.
compute_uuids = None
if provider_summaries is not None:
compute_uuids = list(provider_summaries.keys())
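The comment tweaks above ('iterator' to 'generator-iterator', 'empty tuple' to 'empty generator-iterator') follow from the return type of _get_host_states. One practical consequence, shown by a generic snippet (plain Python, not nova code), is that the result can only be traversed once:

# A generator-iterator is exhausted after one pass; a second pass yields nothing.
gen = (n * n for n in range(3))
print(list(gen))   # [0, 1, 4]
print(list(gen))   # [] -- already exhausted, hence the warning not to re-scan
                   # the hosts in a filter or weighing function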

nova/scheduler/host_manager.py

@@ -662,7 +662,7 @@ class HostManager(object):
return self._get_host_states(context, compute_nodes, services)
def get_all_host_states(self, context):
"""Returns a list of HostStates that represents all the hosts
"""Returns a generator of HostStates that represents all the hosts
the HostManager knows about. Also, each of the consumable resources
in HostState are pre-populated and adjusted based on data in the db.
"""
@@ -672,7 +672,7 @@
return self._get_host_states(context, compute_nodes, services)
def _get_host_states(self, context, compute_nodes, services):
"""Returns a tuple of HostStates given a list of computes.
"""Returns a generator over HostStates given a list of computes.
Also updates the HostStates internal mapping for the HostManager.
"""
@@ -715,7 +715,13 @@
"from scheduler"), {'host': host, 'node': node})
del self.host_state_map[state_key]
-return (self.host_state_map[host] for host in seen_nodes)
+# NOTE(mriedem): We are returning a generator, which means the global
+# host_state_map could change due to a concurrent scheduling request
+# where a compute node is now considered 'dead' and is removed from
+# the host_state_map, so we have to be sure to check that the next
+# seen_node is still in the map before returning it.
+return (self.host_state_map[host] for host in seen_nodes
+        if host in self.host_state_map)
def _get_aggregates_info(self, host):
return [self.aggs_by_id[agg_id] for agg_id in
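The guard added in this hunk only changes what happens when a stale generator is finally consumed: a host whose entry has since been removed is skipped instead of being looked up. Reusing the simplified sketch from above (hypothetical names, not the real HostManager), the fixed shape behaves like this:

# Same hypothetical setup as the earlier sketch, with the membership guard
# applied at consumption time.
host_state_map = {'node1': 'state1', 'node2': 'state2'}

def get_host_states(live_nodes):
    seen_nodes = set(live_nodes)
    for node in list(host_state_map):
        if node not in live_nodes:
            del host_state_map[node]
    return (host_state_map[node] for node in seen_nodes
            if node in host_state_map)

gen1 = get_host_states(['node1', 'node2'])
get_host_states(['node1'])        # node2 removed by a concurrent request

print(list(gen1))                 # ['state1'] -- node2 is skipped, no KeyError

The trade-off is that a request which already 'saw' a node may simply not consider it any further, which matches the NOTE in the diff: by the time the generator reaches it, the node is treated as dead.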

nova/tests/unit/scheduler/test_host_manager.py

@@ -1023,7 +1023,10 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
context = 'fake_context'
# first call: all nodes
-self.host_manager.get_all_host_states(context)
+hosts = self.host_manager.get_all_host_states(context)
+# get_all_host_states returns a generator so convert the values into
+# an iterator
+host_states1 = iter(hosts)
host_states_map = self.host_manager.host_state_map
self.assertEqual(len(host_states_map), 4)
@@ -1031,6 +1034,10 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
self.host_manager.get_all_host_states(context)
host_states_map = self.host_manager.host_state_map
self.assertEqual(len(host_states_map), 3)
+# Fake a concurrent request that is still processing the first result
+# to make sure we properly handle that node4 is no longer in
+# host_state_map.
+list(host_states1)
@mock.patch('nova.objects.ServiceList.get_by_binary')
@mock.patch('nova.objects.ComputeNodeList.get_all')
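
For reference, the new test flow is: build a generator while four nodes exist, let a second get_all_host_states call shrink host_state_map to three entries, then drain the stale generator. Without the fix the final list(host_states1) raises KeyError for the removed node; with the guard it completes. A hypothetical variant of that final line (not part of this commit) could additionally assert how many HostStates the stale generator still yields:

# Hypothetical replacement for the test's last line, not part of this commit:
# the stale generator should skip the removed node and yield exactly the
# three HostStates still present in host_state_map.
remaining = list(host_states1)
self.assertEqual(3, len(remaining))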