Fix an error in _get_host_states when deleting a compute node
_get_host_states returns a generator which closes over seen_nodes, which
is local, and self.host_state_map, which is global. It also modifies
self.host_state_map, and will remove entries whose compute nodes are no
longer present.
If a compute node is deleted while a filter is still evaluating the
generator returned by _get_host_states, the entry in self.host_state_map
will be deleted if _get_host_states is called again. This will cause a
KeyError when the first generator comes to evaluate the entry for the
deleted compute node.
We fix this by modifying the returned generator expression to check
that a host_state_map entry still exists before returning it. An
existing unit test is modified to exhibit the bug.
Change-Id: Ibb7c43a0abc433f93fc3de71146263e6d5923666
Closes-Bug: #1739323
(cherry picked from commit d72b33b986)
This commit is contained in:
parent
db243a5f8b
commit
b6a1a19295
|
@ -151,7 +151,7 @@ class FilterScheduler(driver.Scheduler):
|
|||
# host, we virtually consume resources on it so subsequent
|
||||
# selections can adjust accordingly.
|
||||
|
||||
# Note: remember, we are using an iterator here. So only
|
||||
# Note: remember, we are using a generator-iterator here. So only
|
||||
# traverse this list once. This can bite you if the hosts
|
||||
# are being scanned in a filter or weighing function.
|
||||
hosts = self._get_all_host_states(elevated, spec_obj,
|
||||
|
@ -342,8 +342,8 @@ class FilterScheduler(driver.Scheduler):
|
|||
# The provider_summaries variable will be an empty dict when the
|
||||
# Placement API found no providers that match the requested
|
||||
# constraints, which in turn makes compute_uuids an empty list and
|
||||
# get_host_states_by_uuids will return an empty tuple also, which will
|
||||
# eventually result in a NoValidHost error.
|
||||
# get_host_states_by_uuids will return an empty generator-iterator
|
||||
# also, which will eventually result in a NoValidHost error.
|
||||
compute_uuids = None
|
||||
if provider_summaries is not None:
|
||||
compute_uuids = list(provider_summaries.keys())
|
||||
|
|
|
@ -662,7 +662,7 @@ class HostManager(object):
|
|||
return self._get_host_states(context, compute_nodes, services)
|
||||
|
||||
def get_all_host_states(self, context):
|
||||
"""Returns a list of HostStates that represents all the hosts
|
||||
"""Returns a generator of HostStates that represents all the hosts
|
||||
the HostManager knows about. Also, each of the consumable resources
|
||||
in HostState are pre-populated and adjusted based on data in the db.
|
||||
"""
|
||||
|
@ -672,7 +672,7 @@ class HostManager(object):
|
|||
return self._get_host_states(context, compute_nodes, services)
|
||||
|
||||
def _get_host_states(self, context, compute_nodes, services):
|
||||
"""Returns a tuple of HostStates given a list of computes.
|
||||
"""Returns a generator over HostStates given a list of computes.
|
||||
|
||||
Also updates the HostStates internal mapping for the HostManager.
|
||||
"""
|
||||
|
@ -715,7 +715,13 @@ class HostManager(object):
|
|||
"from scheduler"), {'host': host, 'node': node})
|
||||
del self.host_state_map[state_key]
|
||||
|
||||
return (self.host_state_map[host] for host in seen_nodes)
|
||||
# NOTE(mriedem): We are returning a generator, which means the global
|
||||
# host_state_map could change due to a concurrent scheduling request
|
||||
# where a compute node is now considered 'dead' and is removed from
|
||||
# the host_state_map, so we have to be sure to check that the next
|
||||
# seen_node is still in the map before returning it.
|
||||
return (self.host_state_map[host] for host in seen_nodes
|
||||
if host in self.host_state_map)
|
||||
|
||||
def _get_aggregates_info(self, host):
|
||||
return [self.aggs_by_id[agg_id] for agg_id in
|
||||
|
|
|
@ -1023,7 +1023,10 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
|
|||
context = 'fake_context'
|
||||
|
||||
# first call: all nodes
|
||||
self.host_manager.get_all_host_states(context)
|
||||
hosts = self.host_manager.get_all_host_states(context)
|
||||
# get_all_host_states returns a generator so convert the values into
|
||||
# an iterator
|
||||
host_states1 = iter(hosts)
|
||||
host_states_map = self.host_manager.host_state_map
|
||||
self.assertEqual(len(host_states_map), 4)
|
||||
|
||||
|
@ -1031,6 +1034,10 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
|
|||
self.host_manager.get_all_host_states(context)
|
||||
host_states_map = self.host_manager.host_state_map
|
||||
self.assertEqual(len(host_states_map), 3)
|
||||
# Fake a concurrent request that is still processing the first result
|
||||
# to make sure we properly handle that node4 is no longer in
|
||||
# host_state_map.
|
||||
list(host_states1)
|
||||
|
||||
@mock.patch('nova.objects.ServiceList.get_by_binary')
|
||||
@mock.patch('nova.objects.ComputeNodeList.get_all')
|
||||
|
|
Loading…
Reference in New Issue