Fix an error in _get_host_states when deleting a compute node
_get_host_states returns a generator which closes over seen_nodes, which is local, and self.host_state_map, which is global. It also modifies self.host_state_map, and will remove entries whose compute nodes are no longer present. If a compute node is deleted while a filter is still evaluating the generator returned by _get_host_states, the entry in self.host_state_map will be deleted if _get_host_states is called again. This will cause a KeyError when the first generator comes to evaluate the entry for the deleted compute node. We fix this by modifying the returned generator expression to check that a host_state_map entry still exists before returning it. An existing unit test is modified to exhibit the bug. Conflicts: nova/scheduler/filter_scheduler.py nova/scheduler/host_manager.py NOTE(mriedem): The conflict in filter_scheduler.py is due to 48268c73e3
in Pike which is not needed here. The conflict in host_manager.py is due to d1de538523
in Pike which is not needed here. Change-Id: Ibb7c43a0abc433f93fc3de71146263e6d5923666 Closes-Bug: #1739323 (cherry picked from commit d72b33b986
) (cherry picked from commit b6a1a19295
)
This commit is contained in:
parent
4f0ff43c79
commit
52679374a4
|
@ -97,7 +97,7 @@ class FilterScheduler(driver.Scheduler):
|
|||
# host, we virtually consume resources on it so subsequent
|
||||
# selections can adjust accordingly.
|
||||
|
||||
# Note: remember, we are using an iterator here. So only
|
||||
# Note: remember, we are using a generator-iterator here. So only
|
||||
# traverse this list once. This can bite you if the hosts
|
||||
# are being scanned in a filter or weighing function.
|
||||
hosts = self._get_all_host_states(elevated, spec_obj)
|
||||
|
|
|
@ -587,7 +587,7 @@ class HostManager(object):
|
|||
return self._get_host_states(context, compute_nodes)
|
||||
|
||||
def get_all_host_states(self, context):
|
||||
"""Returns a list of HostStates that represents all the hosts
|
||||
"""Returns a generator of HostStates that represents all the hosts
|
||||
the HostManager knows about. Also, each of the consumable resources
|
||||
in HostState are pre-populated and adjusted based on data in the db.
|
||||
"""
|
||||
|
@ -595,7 +595,7 @@ class HostManager(object):
|
|||
return self._get_host_states(context, compute_nodes)
|
||||
|
||||
def _get_host_states(self, context, compute_nodes):
|
||||
"""Returns a tuple of HostStates given a list of computes.
|
||||
"""Returns a generator over HostStates given a list of computes.
|
||||
|
||||
Also updates the HostStates internal mapping for the HostManager.
|
||||
"""
|
||||
|
@ -637,7 +637,13 @@ class HostManager(object):
|
|||
"from scheduler"), {'host': host, 'node': node})
|
||||
del self.host_state_map[state_key]
|
||||
|
||||
return (self.host_state_map[host] for host in seen_nodes)
|
||||
# NOTE(mriedem): We are returning a generator, which means the global
|
||||
# host_state_map could change due to a concurrent scheduling request
|
||||
# where a compute node is now considered 'dead' and is removed from
|
||||
# the host_state_map, so we have to be sure to check that the next
|
||||
# seen_node is still in the map before returning it.
|
||||
return (self.host_state_map[host] for host in seen_nodes
|
||||
if host in self.host_state_map)
|
||||
|
||||
def _get_aggregates_info(self, host):
|
||||
return [self.aggs_by_id[agg_id] for agg_id in
|
||||
|
|
|
@ -900,7 +900,10 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
|
|||
context = 'fake_context'
|
||||
|
||||
# first call: all nodes
|
||||
self.host_manager.get_all_host_states(context)
|
||||
hosts = self.host_manager.get_all_host_states(context)
|
||||
# get_all_host_states returns a generator so convert the values into
|
||||
# an iterator
|
||||
host_states1 = iter(hosts)
|
||||
host_states_map = self.host_manager.host_state_map
|
||||
self.assertEqual(len(host_states_map), 4)
|
||||
|
||||
|
@ -908,6 +911,10 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
|
|||
self.host_manager.get_all_host_states(context)
|
||||
host_states_map = self.host_manager.host_state_map
|
||||
self.assertEqual(len(host_states_map), 3)
|
||||
# Fake a concurrent request that is still processing the first result
|
||||
# to make sure we properly handle that node4 is no longer in
|
||||
# host_state_map.
|
||||
list(host_states1)
|
||||
|
||||
@mock.patch('nova.objects.ServiceList.get_by_binary')
|
||||
@mock.patch('nova.objects.ComputeNodeList.get_all')
|
||||
|
|
Loading…
Reference in New Issue