Cache host to cell mapping in HostManager

If the instances per host are not cached in the HostManager,
we look up the HostMapping per candidate compute node during
each scheduling request to get the CellMapping so we can target
that cell database to pull the instance uuids on the given host.

For example, if placement returns 20 compute node allocation
candidates and we don't have the instances cached for any of those,
we'll do 20 queries to the API DB to get host mappings.

We can improve this by caching the host-to-cell uuid after the first
lookup for a given host and, after that, getting the CellMapping
from the cells cache (which is a dict, keyed by cell uuid, of the
CellMapping for that cell).

Change-Id: Ic6b1edfad2e384eb32c6942edc522ee301123cbc
Related-Bug: #1737465
(cherry picked from commit 3a69fdb333)
Matt Riedemann 2019-06-05 13:30:25 -04:00
parent 3055229841
commit 677c95280b
5 changed files with 92 additions and 2 deletions
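
To make the payoff concrete before reading the diff, here is a minimal,
self-contained sketch of the caching pattern described in the commit
message. Everything in it (FakeCellMapping, lookup_host_mapping, the
query counter) is an illustrative stand-in, not nova code; it tallies the
stand-in API DB queries across three scheduling passes over the same 20
hosts.

    # Illustrative sketch of the host-to-cell cache, not the nova code.

    class FakeCellMapping:
        def __init__(self, uuid):
            self.uuid = uuid

    queries = 0
    cell1 = FakeCellMapping('cell1-uuid')

    def lookup_host_mapping(host_name):
        # Stands in for the per-host HostMapping query to the API DB.
        global queries
        queries += 1
        return cell1

    cells = {cell1.uuid: cell1}   # mirrors HostManager.cells (uuid -> cell)
    host_to_cell_uuid = {}        # the new cache (host name -> cell uuid)

    def get_cell_mapping_for_host(host_name):
        # Fast path: cached host whose cell is still in the cells cache.
        cell_uuid = host_to_cell_uuid.get(host_name)
        if cell_uuid in cells:
            return cells[cell_uuid]
        # Slow path: one API DB query, then cache the host -> cell uuid.
        cell_mapping = lookup_host_mapping(host_name)
        host_to_cell_uuid[host_name] = cell_mapping.uuid
        return cell_mapping

    hosts = ['host%d' % i for i in range(20)]
    for _ in range(3):            # three scheduling requests, same hosts
        for host in hosts:
            get_cell_mapping_for_host(host)
    print(queries)                # 20 with the cache; 60 without it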


@@ -327,6 +327,10 @@ Nova Cells v2
    found, 3 if a host with that name is not in a cell with that uuid, 4 if
    a host with that name has instances (host not empty).

.. note::

    The scheduler caches host-to-cell mapping information so when deleting
    a host the scheduler may need to be restarted or sent the SIGHUP signal.

Placement
~~~~~~~~~
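
For what the note above asks of operators, sending SIGHUP from Python
looks like the following hypothetical helper (pid discovery is a
deployment-specific assumption; kill -HUP <pid> from a shell is
equivalent):

    import os
    import signal

    def sighup_scheduler(pid):
        """Ask a running nova-scheduler process to refresh its caches.

        Equivalent to `kill -HUP <pid>`; the pid must be discovered from
        the deployment (ps, systemd, a pid file, ...).
        """
        os.kill(pid, signal.SIGHUP)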


@@ -1765,6 +1765,10 @@ class CellV2Commands(object):
          * The host has instances.

        Returns 0 if the host is deleted successfully.

        NOTE: The scheduler caches host-to-cell mapping information so when
        deleting a host the scheduler may need to be restarted or sent the
        SIGHUP signal.
        """
        ctxt = context.get_admin_context()
        # Find the CellMapping given the uuid.


@@ -675,6 +675,10 @@ class HostManager(object):
                         'cells': ', '.join(
                             [c.identity for c in disabled_cells])})

        # Dict, keyed by host name, to cell UUID to be used to look up the
        # cell a particular host is in (used with self.cells).
        self.host_to_cell_uuid = {}

    def get_host_states_by_uuids(self, context, compute_uuids, spec_obj):
        if not self.cells:
@@ -738,9 +742,40 @@ class HostManager(object):
        return [self.aggs_by_id[agg_id] for agg_id in
                self.host_aggregates_map[host]]

    def _get_cell_mapping_for_host(self, context, host_name):
        """Finds the CellMapping for a particular host name.

        Relies on a cache to quickly fetch the CellMapping if we have looked
        up this host before, otherwise gets the CellMapping via the
        HostMapping record for the given host name.

        :param context: nova auth request context
        :param host_name: compute service host name
        :returns: CellMapping object
        :raises: HostMappingNotFound if the host is not mapped to a cell
        """
        # Check to see if we have the host in our cache.
        if host_name in self.host_to_cell_uuid:
            cell_uuid = self.host_to_cell_uuid[host_name]
            if cell_uuid in self.cells:
                return self.cells[cell_uuid]
            # Something is wrong, so log a warning and fall through to
            # look up the HostMapping.
            LOG.warning('Host %s is expected to be in cell %s but that cell '
                        'uuid was not found in our cache. The service may '
                        'need to be restarted to refresh the cache.',
                        host_name, cell_uuid)

        # We have not cached this host yet, so get the HostMapping, cache
        # the result and return the CellMapping.
        hm = objects.HostMapping.get_by_host(context, host_name)
        cell_mapping = hm.cell_mapping
        self.host_to_cell_uuid[host_name] = cell_mapping.uuid
        return cell_mapping

    def _get_instances_by_host(self, context, host_name):
        try:
-           hm = objects.HostMapping.get_by_host(context, host_name)
+           cm = self._get_cell_mapping_for_host(context, host_name)
        except exception.HostMappingNotFound:
            # It's possible to hit this when the compute service first starts
            # up and casts to update_instance_info with an empty list but
@@ -748,7 +783,7 @@
            LOG.info('Host mapping not found for host %s. Not tracking '
                     'instance info for this host.', host_name)
            return {}
-       with context_module.target_cell(context, hm.cell_mapping) as cctxt:
+       with context_module.target_cell(context, cm) as cctxt:
            uuids = objects.InstanceList.get_uuids_by_host(cctxt, host_name)
            # Putting the context in the otherwise fake Instance object at
            # least allows out of tree filters to lazy-load fields.
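
Note the design choice in _get_cell_mapping_for_host above: there is no
per-host invalidation. A stale entry whose cached cell uuid is missing
from self.cells heals itself by falling through to a fresh HostMapping
query, but a host deleted from one cell and "discovered" in another is
only re-resolved after the whole cache is dropped by a restart or SIGHUP,
which is exactly what the documentation and nova-manage notes above warn
about.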


@@ -85,6 +85,9 @@ class SchedulerManager(manager.Manager):
        # and enabled cells caches in the host manager. So every time an
        # existing cell is disabled or enabled or a new cell is created, a
        # SIGHUP signal has to be sent to the scheduler for proper scheduling.
        # NOTE(mriedem): Similarly there is a host-to-cell cache which should
        # be reset if a host is deleted from a cell and "discovered" in
        # another cell.
        self.driver.host_manager.refresh_cells_caches()

    @messaging.expected_exceptions(exception.NoValidHost)
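
To see how the wholesale reset behaves, here is a self-contained sketch,
with illustrative names rather than the real nova implementation, of a
refresh_cells_caches() that rebuilds the uuid-keyed cells dict and drops
the host-to-cell cache, mirroring what the test changes below assert:

    # Illustrative sketch, not the nova implementation: on SIGHUP the
    # scheduler's reset() calls refresh_cells_caches(), which rebuilds the
    # cells dict and (with this change) empties the host-to-cell cache.

    class MiniHostManager:
        def __init__(self):
            self.cells = {}              # cell uuid -> cell mapping
            self.host_to_cell_uuid = {}  # host name -> cell uuid

        def refresh_cells_caches(self, cell_mappings):
            # Rebuild the uuid-keyed cells cache from fresh mappings.
            self.cells = {cm['uuid']: cm for cm in cell_mappings}
            # Drop every cached host -> cell entry so a host deleted from
            # one cell and "discovered" in another is re-resolved on the
            # next lookup.
            self.host_to_cell_uuid = {}

    mgr = MiniHostManager()
    mgr.host_to_cell_uuid['fake-host'] = 'cell1-uuid'
    mgr.refresh_cells_caches([{'uuid': 'cell1-uuid'}, {'uuid': 'cell2-uuid'}])
    assert mgr.host_to_cell_uuid == {}
    assert set(mgr.cells) == {'cell1-uuid', 'cell2-uuid'}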


@@ -104,6 +104,8 @@ class HostManagerTestCase(test.NoDBTestCase):
                                           transport_url='fake:///mq3',
                                           disabled=False)
        cells = [cell_mapping1, cell_mapping2, cell_mapping3]
        # Throw a random host-to-cell entry in the cache to make sure it
        # gets reset.
        self.host_manager.host_to_cell_uuid['fake-host'] = cell_uuid1
        with mock.patch('nova.objects.CellMappingList.get_all',
                        return_value=cells) as mock_cm:
            self.host_manager.refresh_cells_caches()
@@ -114,6 +116,8 @@ class HostManagerTestCase(test.NoDBTestCase):
        # But it is still in the full list.
        self.assertEqual(3, len(self.host_manager.cells))
        self.assertIn(cell_uuid2, self.host_manager.cells)
        # The host_to_cell_uuid cache should have been reset.
        self.assertEqual({}, self.host_manager.host_to_cell_uuid)

    def test_refresh_cells_caches_except_cell0(self):
        ctxt = nova_context.RequestContext('fake-user', 'fake_project')
@@ -131,6 +135,46 @@ class HostManagerTestCase(test.NoDBTestCase):
        mock_cm.assert_called_once()
        self.assertEqual(0, len(self.host_manager.cells))

    @mock.patch('nova.objects.HostMapping.get_by_host',
                return_value=objects.HostMapping(
                    cell_mapping=objects.CellMapping(uuid=uuids.cell1)))
    def test_get_cell_mapping_for_host(self, mock_get_by_host):
        # Starting with an empty cache, assert that the HostMapping is looked
        # up and the result is cached.
        ctxt = nova_context.get_admin_context()
        host = 'fake-host'
        self.assertEqual({}, self.host_manager.host_to_cell_uuid)
        cm = self.host_manager._get_cell_mapping_for_host(ctxt, host)
        self.assertIs(cm, mock_get_by_host.return_value.cell_mapping)
        self.assertIn(host, self.host_manager.host_to_cell_uuid)
        self.assertEqual(
            uuids.cell1, self.host_manager.host_to_cell_uuid[host])
        mock_get_by_host.assert_called_once_with(ctxt, host)

        # Reset the mock and do it again; assert we do not query the DB.
        mock_get_by_host.reset_mock()
        self.host_manager._get_cell_mapping_for_host(ctxt, host)
        mock_get_by_host.assert_not_called()

        # Mix up the cache such that the host is mapped to a cell that is
        # not in the cache, which will make us query the DB. Also make the
        # HostMapping query raise HostMappingNotFound to make sure that
        # comes up to the caller.
        mock_get_by_host.reset_mock()
        self.host_manager.host_to_cell_uuid[host] = uuids.random_cell
        mock_get_by_host.side_effect = exception.HostMappingNotFound(name=host)
        with mock.patch('nova.scheduler.host_manager.LOG.warning') as warning:
            self.assertRaises(exception.HostMappingNotFound,
                              self.host_manager._get_cell_mapping_for_host,
                              ctxt, host)
            # We should have logged a warning because the host is cached to
            # a cell uuid but that cell uuid is not in the cells cache.
            warning.assert_called_once()
            self.assertIn('Host %s is expected to be in cell %s',
                          warning.call_args[0][0])
            # And we should have also tried to look up the HostMapping in
            # the DB.
            mock_get_by_host.assert_called_once_with(ctxt, host)

    @mock.patch.object(nova.objects.InstanceList, 'get_by_filters')
    @mock.patch.object(nova.objects.ComputeNodeList, 'get_all')
    def test_init_instance_info_batches(self, mock_get_all,