Stop globally caching host states in scheduler HostManager

Currently, in the scheduler HostManager, we cache host states in
a map global to all requests. This used to be okay because we were
always querying the entire compute node list for every request to
pass on to filtering. So we cached the host states globally, updated
them per request, and removed "dead nodes" from the cache (compute
nodes still in the cache that were not returned by
ComputeNodeList.get_all).
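
For illustration only, the old pattern boils down to the following
simplified sketch. The class and the stand-in types here are hypothetical;
the real logic is in the scheduler HostManager code changed in the diff
below:

    from collections import namedtuple

    # Stand-ins for nova's ComputeNode and HostState objects.
    ComputeNode = namedtuple('ComputeNode', ['host', 'hypervisor_hostname'])
    HostState = namedtuple('HostState', ['host', 'nodename'])


    class GlobalCacheHostManager(object):
        """Sketch of the old behavior: one cache shared by all requests."""

        def __init__(self):
            self.host_state_map = {}  # global: shared across every request

        def _get_host_states(self, compute_nodes):
            seen_nodes = set()
            for compute in compute_nodes:
                state_key = (compute.host, compute.hypervisor_hostname)
                if state_key not in self.host_state_map:
                    self.host_state_map[state_key] = HostState(*state_key)
                seen_nodes.add(state_key)
            # Cached nodes not returned by this query are treated as "dead"
            # and evicted, even if a concurrent request still needs them.
            for state_key in set(self.host_state_map) - seen_nodes:
                del self.host_state_map[state_key]
            return (self.host_state_map[key] for key in seen_nodes
                    if key in self.host_state_map)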

As of Ocata, we started filtering our ComputeNodeList query based on
an answer from placement about which resource providers could satisfy
the request, instead of querying the entire compute node list every
time. This is much more efficient (don't consider compute nodes that
can't possibly fulfill the request) BUT it doesn't play well with the
global host state cache. We started seeing "Removing dead compute node"
messages in the logs, signaling that compute nodes were being removed
from the global cache even though they were actually still available.

If request A comes in and all compute nodes can satisfy its request,
and request B arrives concurrently but no compute nodes can satisfy
its request, request B will remove all of the compute nodes from the
global host state cache, and request A will then get "no valid hosts"
at the filtering stage because get_host_states_by_uuids returns a
generator that hands out hosts from the global host state cache.
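
Against the sketch above, the race plays out like this (hypothetical
driver code; the real regression test added below is
test_get_host_states_by_uuids):

    manager = GlobalCacheHostManager()
    nodes = [ComputeNode('host1', 'node1'), ComputeNode('host2', 'node2')]

    # Request A: placement says both nodes are candidates. The generator
    # is created but not consumed yet (filtering has not run).
    hosts_a = manager._get_host_states(nodes)

    # Request B: placement returns no candidates, so every cached node
    # looks "dead" and is evicted from the shared map.
    list(manager._get_host_states([]))

    # Request A now iterates over an empty cache and fails with
    # "no valid hosts" even though both nodes could satisfy it.
    assert list(hosts_a) == []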

This change removes the global host state cache from the scheduler
HostManager and instead generates a fresh host state map per request,
returning hosts to the caller from that per-request map. Because we're
filtering the ComputeNodeList based on a placement query per request,
each request can have a completely different set of compute nodes that
can fulfill it, so we're not gaining much by caching host states anyway.
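
A minimal sketch of the per-request approach, reusing the stand-in types
from the first sketch (illustrative only; the real change is in the
scheduler HostManager diff below):

    class PerRequestHostManager(object):
        """Sketch of the new behavior: a fresh host state map per request."""

        def _get_host_states(self, compute_nodes):
            host_state_map = {}  # local to this request only
            seen_nodes = set()
            for compute in compute_nodes:
                state_key = (compute.host, compute.hypervisor_hostname)
                if state_key not in host_state_map:
                    host_state_map[state_key] = HostState(*state_key)
                seen_nodes.add(state_key)
            # No dead-node pruning and no shared state: a concurrent request
            # with a different candidate set cannot invalidate this generator.
            return (host_state_map[key] for key in seen_nodes)

Running the same two-request scenario against this version yields all
hosts for request A and none for request B, which is what the new
test_get_host_states_by_uuids test asserts.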

Co-Authored-By: Dan Smith <dansmith@redhat.com>

Closes-Bug: #1742827
Related-Bug: #1739323

Change-Id: I40c17ed88f50ecbdedc4daf368fff10e90e7be11
(cherry picked from commit c98ac6adc5)
Author: melanie witt
Date:   2018-01-13 21:49:54 +00:00
Parent: b535f08085
Commit: bf65fdd59e
3 changed files with 86 additions and 52 deletions

nova/scheduler/host_manager.py

@@ -341,7 +341,6 @@ class HostManager(object):
     def __init__(self):
         self.cells = None
-        self.host_state_map = {}
         self.filter_handler = filters.HostFilterHandler()
         filter_classes = self.filter_handler.get_matching_classes(
             CONF.filter_scheduler.available_filters)
@@ -677,6 +676,7 @@ class HostManager(object):
         Also updates the HostStates internal mapping for the HostManager.
         """
         # Get resource usage across the available compute nodes:
+        host_state_map = {}
         seen_nodes = set()
         for cell_uuid, computes in compute_nodes.items():
             for compute in computes:
@@ -690,12 +690,12 @@
                 host = compute.host
                 node = compute.hypervisor_hostname
                 state_key = (host, node)
-                host_state = self.host_state_map.get(state_key)
+                host_state = host_state_map.get(state_key)
                 if not host_state:
                     host_state = self.host_state_cls(host, node,
                                                      cell_uuid,
                                                      compute=compute)
-                    self.host_state_map[state_key] = host_state
+                    host_state_map[state_key] = host_state
                 # We force to update the aggregates info each time a
                 # new request comes in, because some changes on the
                 # aggregates could have been happening after setting
@@ -707,21 +707,7 @@
                 seen_nodes.add(state_key)
 
-        # remove compute nodes from host_state_map if they are not active
-        dead_nodes = set(self.host_state_map.keys()) - seen_nodes
-        for state_key in dead_nodes:
-            host, node = state_key
-            LOG.info(_LI("Removing dead compute node %(host)s:%(node)s "
-                         "from scheduler"), {'host': host, 'node': node})
-            del self.host_state_map[state_key]
-
-        # NOTE(mriedem): We are returning a generator, which means the global
-        # host_state_map could change due to a concurrent scheduling request
-        # where a compute node is now considered 'dead' and is removed from
-        # the host_state_map, so we have to be sure to check that the next
-        # seen_node is still in the map before returning it.
-        return (self.host_state_map[host] for host in seen_nodes
-                if host in self.host_state_map)
+        return (host_state_map[host] for host in seen_nodes)
 
     def _get_aggregates_info(self, host):
         return [self.aggs_by_id[agg_id] for agg_id in

nova/tests/unit/scheduler/test_host_manager.py

@@ -495,9 +495,10 @@ class HostManagerTestCase(test.NoDBTestCase):
         mock_get_by_binary.return_value = fakes.SERVICES
         context = 'fake_context'
 
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
-        self.assertEqual(len(host_states_map), 4)
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           self.host_manager.get_all_host_states(context)}
+        self.assertEqual(4, len(host_states_map))
 
         calls = [
             mock.call(
@@ -560,8 +561,11 @@
         mock_get_by_host.return_value = objects.InstanceList()
         self.host_manager.host_aggregates_map = collections.defaultdict(set)
 
-        self.host_manager.get_all_host_states('fake-context')
-        host_state = self.host_manager.host_state_map[('fake', 'fake')]
+        hosts = self.host_manager.get_all_host_states('fake-context')
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
+        host_state = host_states_map[('fake', 'fake')]
         self.assertEqual([], host_state.aggregates)
 
     @mock.patch.object(nova.objects.InstanceList, 'get_by_host')
@@ -581,8 +585,11 @@
             set, {'fake': set([1])})
         self.host_manager.aggs_by_id = {1: fake_agg}
 
-        self.host_manager.get_all_host_states('fake-context')
-        host_state = self.host_manager.host_state_map[('fake', 'fake')]
+        hosts = self.host_manager.get_all_host_states('fake-context')
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
+        host_state = host_states_map[('fake', 'fake')]
         self.assertEqual([fake_agg], host_state.aggregates)
 
     @mock.patch.object(nova.objects.InstanceList, 'get_by_host')
@@ -605,8 +612,11 @@
             set, {'other': set([1])})
         self.host_manager.aggs_by_id = {1: fake_agg}
 
-        self.host_manager.get_all_host_states('fake-context')
-        host_state = self.host_manager.host_state_map[('fake', 'fake')]
+        hosts = self.host_manager.get_all_host_states('fake-context')
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
+        host_state = host_states_map[('fake', 'fake')]
         self.assertEqual([], host_state.aggregates)
 
     @mock.patch.object(nova.objects.InstanceList, 'get_by_host',
@@ -1002,8 +1012,9 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
         mock_get_by_binary.return_value = fakes.SERVICES
         context = 'fake_context'
 
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           self.host_manager.get_all_host_states(context)}
         self.assertEqual(len(host_states_map), 4)
 
     @mock.patch('nova.objects.ServiceList.get_by_binary')
@@ -1024,20 +1035,16 @@
         # first call: all nodes
         hosts = self.host_manager.get_all_host_states(context)
-        # get_all_host_states returns a generator so convert the values into
-        # an iterator
-        host_states1 = iter(hosts)
-        host_states_map = self.host_manager.host_state_map
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 4)
 
         # second call: just running nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 3)
-
-        # Fake a concurrent request that is still processing the first result
-        # to make sure we properly handle that node4 is no longer in
-        # host_state_map.
-        list(host_states1)
 
     @mock.patch('nova.objects.ServiceList.get_by_binary')
     @mock.patch('nova.objects.ComputeNodeList.get_all')
@@ -1051,15 +1058,48 @@
         context = 'fake_context'
         # first call: all nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 4)
 
         # second call: no nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 0)
 
+    @mock.patch('nova.objects.ServiceList.get_by_binary')
+    @mock.patch('nova.objects.ComputeNodeList.get_all_by_uuids')
+    @mock.patch('nova.objects.InstanceList.get_by_host')
+    def test_get_host_states_by_uuids(self, mock_get_by_host, mock_get_all,
+                                      mock_get_by_binary):
+        mock_get_by_host.return_value = objects.InstanceList()
+        mock_get_all.side_effect = [fakes.COMPUTE_NODES, []]
+        mock_get_by_binary.side_effect = [fakes.SERVICES, fakes.SERVICES]
+        # Request 1: all nodes can satisfy the request
+        hosts1 = self.host_manager.get_host_states_by_uuids(
+            mock.sentinel.ctxt1, mock.sentinel.uuids1, objects.RequestSpec())
+        # get_host_states_by_uuids returns a generator so convert the values
+        # into an iterator
+        host_states1 = iter(hosts1)
+        # Request 2: no nodes can satisfy the request
+        hosts2 = self.host_manager.get_host_states_by_uuids(
+            mock.sentinel.ctxt2, mock.sentinel.uuids2, objects.RequestSpec())
+        host_states2 = iter(hosts2)
+        # Fake a concurrent request that is still processing the first result
+        # to make sure all nodes are still available candidates to Request 1.
+        num_hosts1 = len(list(host_states1))
+        self.assertEqual(4, num_hosts1)
+        # Verify that no nodes are available to Request 2.
+        num_hosts2 = len(list(host_states2))
+        self.assertEqual(0, num_hosts2)
 
 class HostStateTestCase(test.NoDBTestCase):
     """Test case for HostState class."""

nova/tests/unit/scheduler/test_ironic_host_manager.py

@@ -78,9 +78,11 @@ class IronicHostManagerTestCase(test.NoDBTestCase):
         mock_get_by_binary.return_value = ironic_fakes.SERVICES
         context = 'fake_context'
 
-        self.host_manager.get_all_host_states(context)
+        hosts = self.host_manager.get_all_host_states(context)
         self.assertEqual(0, mock_get_by_host.call_count)
-        host_states_map = self.host_manager.host_state_map
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 4)
 
         for i in range(4):
@@ -234,13 +236,16 @@ class IronicHostManagerChangedNodesTestCase(test.NoDBTestCase):
         context = 'fake_context'
         # first call: all nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(4, len(host_states_map))
 
         # second call: just running nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(3, len(host_states_map))
 
     @mock.patch('nova.objects.ServiceList.get_by_binary')
@@ -254,13 +259,16 @@ class IronicHostManagerChangedNodesTestCase(test.NoDBTestCase):
         context = 'fake_context'
         # first call: all nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        # get_all_host_states returns a generator, so make a map from it
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 4)
 
         # second call: no nodes
-        self.host_manager.get_all_host_states(context)
-        host_states_map = self.host_manager.host_state_map
+        hosts = self.host_manager.get_all_host_states(context)
+        host_states_map = {(state.host, state.nodename): state for state in
+                           hosts}
         self.assertEqual(len(host_states_map), 0)
 
     def test_update_from_compute_node(self):