From 3a69fdb333e80f5ad2e025a9823b11483fad55c2 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 5 Jun 2019 13:30:25 -0400 Subject: [PATCH] Cache host to cell mapping in HostManager If the instances per host are not cached in the HostManager we lookup the HostMapping per candidate compute node during each scheduling request to get the CellMapping so we can target that cell database to pull the instance uuids on the given host. For example, if placement returned 20 compute node allocation candidates and we don't have the instances cached for any of those, we'll do 20 queries to the API DB to get host mappings. We can improve this by caching the host to cell uuid after the first lookup for a given host and then after that, get the CellMapping from the cells cache (which is a dict, keyed by cell uuid, to the CellMapping for that cell). Change-Id: Ic6b1edfad2e384eb32c6942edc522ee301123cbc Related-Bug: #1737465 --- doc/source/cli/nova-manage.rst | 4 ++ nova/cmd/manage.py | 4 ++ nova/scheduler/host_manager.py | 39 +++++++++++++++- nova/scheduler/manager.py | 3 ++ .../tests/unit/scheduler/test_host_manager.py | 44 +++++++++++++++++++ 5 files changed, 92 insertions(+), 2 deletions(-) diff --git a/doc/source/cli/nova-manage.rst b/doc/source/cli/nova-manage.rst index bb17990fe470..867154155d72 100644 --- a/doc/source/cli/nova-manage.rst +++ b/doc/source/cli/nova-manage.rst @@ -368,6 +368,10 @@ Nova Cells v2 found, 3 if a host with that name is not in a cell with that uuid, 4 if a host with that name has instances (host not empty). + .. note:: + + The scheduler caches host-to-cell mapping information so when deleting + a host the scheduler may need to be restarted or sent the SIGHUP signal. Placement ~~~~~~~~~ diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 88e965f20481..293d41518bc2 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -1599,6 +1599,10 @@ class CellV2Commands(object): * The host has instances. Returns 0 if the host is deleted successfully. + + NOTE: The scheduler caches host-to-cell mapping information so when + deleting a host the scheduler may need to be restarted or sent the + SIGHUP signal. """ ctxt = context.get_admin_context() # Find the CellMapping given the uuid. diff --git a/nova/scheduler/host_manager.py b/nova/scheduler/host_manager.py index 33a3b7b5d6ae..bb7464e10db8 100644 --- a/nova/scheduler/host_manager.py +++ b/nova/scheduler/host_manager.py @@ -756,6 +756,10 @@ class HostManager(object): 'cells': ', '.join( [c.identity for c in disabled_cells])}) + # Dict, keyed by host name, to cell UUID to be used to look up the + # cell a particular host is in (used with self.cells). + self.host_to_cell_uuid = {} + def get_host_states_by_uuids(self, context, compute_uuids, spec_obj): if not self.cells: @@ -819,9 +823,40 @@ class HostManager(object): return [self.aggs_by_id[agg_id] for agg_id in self.host_aggregates_map[host]] + def _get_cell_mapping_for_host(self, context, host_name): + """Finds the CellMapping for a particular host name + + Relies on a cache to quickly fetch the CellMapping if we have looked + up this host before, otherwise gets the CellMapping via the + HostMapping record for the given host name. + + :param context: nova auth request context + :param host_name: compute service host name + :returns: CellMapping object + :raises: HostMappingNotFound if the host is not mapped to a cell + """ + # Check to see if we have the host in our cache. + if host_name in self.host_to_cell_uuid: + cell_uuid = self.host_to_cell_uuid[host_name] + if cell_uuid in self.cells: + return self.cells[cell_uuid] + # Something is wrong so log a warning and just fall through to + # lookup the HostMapping. + LOG.warning('Host %s is expected to be in cell %s but that cell ' + 'uuid was not found in our cache. The service may ' + 'need to be restarted to refresh the cache.', + host_name, cell_uuid) + + # We have not cached this host yet so get the HostMapping, cache the + # result and return the CellMapping. + hm = objects.HostMapping.get_by_host(context, host_name) + cell_mapping = hm.cell_mapping + self.host_to_cell_uuid[host_name] = cell_mapping.uuid + return cell_mapping + def _get_instances_by_host(self, context, host_name): try: - hm = objects.HostMapping.get_by_host(context, host_name) + cm = self._get_cell_mapping_for_host(context, host_name) except exception.HostMappingNotFound: # It's possible to hit this when the compute service first starts # up and casts to update_instance_info with an empty list but @@ -829,7 +864,7 @@ class HostManager(object): LOG.info('Host mapping not found for host %s. Not tracking ' 'instance info for this host.', host_name) return {} - with context_module.target_cell(context, hm.cell_mapping) as cctxt: + with context_module.target_cell(context, cm) as cctxt: uuids = objects.InstanceList.get_uuids_by_host(cctxt, host_name) # Putting the context in the otherwise fake Instance object at # least allows out of tree filters to lazy-load fields. diff --git a/nova/scheduler/manager.py b/nova/scheduler/manager.py index 67dcf3fc6bc8..bd122f57a70f 100644 --- a/nova/scheduler/manager.py +++ b/nova/scheduler/manager.py @@ -99,6 +99,9 @@ class SchedulerManager(manager.Manager): # and enabled cells caches in the host manager. So every time an # existing cell is disabled or enabled or a new cell is created, a # SIGHUP signal has to be sent to the scheduler for proper scheduling. + # NOTE(mriedem): Similarly there is a host-to-cell cache which should + # be reset if a host is deleted from a cell and "discovered" in another + # cell. self.driver.host_manager.refresh_cells_caches() @messaging.expected_exceptions(exception.NoValidHost) diff --git a/nova/tests/unit/scheduler/test_host_manager.py b/nova/tests/unit/scheduler/test_host_manager.py index 4dc5571a16e4..b320b6addf9e 100644 --- a/nova/tests/unit/scheduler/test_host_manager.py +++ b/nova/tests/unit/scheduler/test_host_manager.py @@ -104,6 +104,8 @@ class HostManagerTestCase(test.NoDBTestCase): transport_url='fake:///mq3', disabled=False) cells = [cell_mapping1, cell_mapping2, cell_mapping3] + # Throw a random host-to-cell in that cache to make sure it gets reset. + self.host_manager.host_to_cell_uuid['fake-host'] = cell_uuid1 with mock.patch('nova.objects.CellMappingList.get_all', return_value=cells) as mock_cm: self.host_manager.refresh_cells_caches() @@ -114,6 +116,8 @@ class HostManagerTestCase(test.NoDBTestCase): # But it is still in the full list. self.assertEqual(3, len(self.host_manager.cells)) self.assertIn(cell_uuid2, self.host_manager.cells) + # The host_to_cell_uuid cache should have been reset. + self.assertEqual({}, self.host_manager.host_to_cell_uuid) def test_refresh_cells_caches_except_cell0(self): ctxt = nova_context.RequestContext('fake-user', 'fake_project') @@ -131,6 +135,46 @@ class HostManagerTestCase(test.NoDBTestCase): mock_cm.assert_called_once() self.assertEqual(0, len(self.host_manager.cells)) + @mock.patch('nova.objects.HostMapping.get_by_host', + return_value=objects.HostMapping( + cell_mapping=objects.CellMapping(uuid=uuids.cell1))) + def test_get_cell_mapping_for_host(self, mock_get_by_host): + # Starting with an empty cache, assert that the HostMapping is looked + # up and the result is cached. + ctxt = nova_context.get_admin_context() + host = 'fake-host' + self.assertEqual({}, self.host_manager.host_to_cell_uuid) + cm = self.host_manager._get_cell_mapping_for_host(ctxt, host) + self.assertIs(cm, mock_get_by_host.return_value.cell_mapping) + self.assertIn(host, self.host_manager.host_to_cell_uuid) + self.assertEqual( + uuids.cell1, self.host_manager.host_to_cell_uuid[host]) + mock_get_by_host.assert_called_once_with(ctxt, host) + + # Reset the mock and do it again, assert we do not query the DB. + mock_get_by_host.reset_mock() + self.host_manager._get_cell_mapping_for_host(ctxt, host) + mock_get_by_host.assert_not_called() + + # Mix up the cache such that the host is mapped to a cell that + # is not in the cache which will make us query the DB. Also make the + # HostMapping query raise HostMappingNotFound to make sure that comes + # up to the caller. + mock_get_by_host.reset_mock() + self.host_manager.host_to_cell_uuid[host] = uuids.random_cell + mock_get_by_host.side_effect = exception.HostMappingNotFound(name=host) + with mock.patch('nova.scheduler.host_manager.LOG.warning') as warning: + self.assertRaises(exception.HostMappingNotFound, + self.host_manager._get_cell_mapping_for_host, + ctxt, host) + # We should have logged a warning because the host is cached to + # a cell uuid but that cell uuid is not in the cells cache. + warning.assert_called_once() + self.assertIn('Host %s is expected to be in cell %s', + warning.call_args[0][0]) + # And we should have also tried to lookup the HostMapping in the DB + mock_get_by_host.assert_called_once_with(ctxt, host) + @mock.patch.object(nova.objects.InstanceList, 'get_by_filters') @mock.patch.object(nova.objects.ComputeNodeList, 'get_all') def test_init_instance_info_batches(self, mock_get_all,