diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 984caeefea4d..87c8b6ac0a91 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -10644,6 +10644,19 @@ class ComputeManager(manager.Manager):
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
+                # if the node could be migrated, we don't delete
+                # the compute node database records
+                if not self.driver.is_node_deleted(cn.hypervisor_hostname):
+                    LOG.warning(
+                        "Found orphan compute node %(id)s "
+                        "hypervisor host is %(hh)s, "
+                        "nodes are %(nodes)s. "
+                        "We are not deleting this as the driver "
+                        "says this node has not been deleted.",
+                        {'id': cn.id, 'hh': cn.hypervisor_hostname,
+                         'nodes': nodenames})
+                    continue
+
                 LOG.info("Deleting orphan compute node %(id)s "
                          "hypervisor host is %(hh)s, "
                          "nodes are %(nodes)s",
diff --git a/nova/tests/functional/regressions/test_bug_1839560.py b/nova/tests/functional/regressions/test_bug_1839560.py
index 54ae3af191f1..005f6537f3b9 100644
--- a/nova/tests/functional/regressions/test_bug_1839560.py
+++ b/nova/tests/functional/regressions/test_bug_1839560.py
@@ -11,6 +11,7 @@
 # under the License.
 
 from oslo_log import log as logging
+from unittest import mock
 
 from nova import context
 from nova.db.main import api as db_api
@@ -20,6 +21,7 @@ from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova import utils
+from nova.virt import fake as fake_driver
 
 LOG = logging.getLogger(__name__)
 
@@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
         # for each node.
         self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
 
-    def test_update_available_resource_node_recreate(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_update_available_resource_node_recreate(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # First we create a compute service to manage a couple of fake nodes.
         compute = self.start_service('compute', 'node1')
         # When start_service runs, it will create the node1 ComputeNode.
diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py
index 5266e6166b7e..c3a220ae5872 100644
--- a/nova/tests/functional/regressions/test_bug_1853009.py
+++ b/nova/tests/functional/regressions/test_bug_1853009.py
@@ -15,6 +15,7 @@ from unittest import mock
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.virt import fake as fake_driver
 
 
 class NodeRebalanceDeletedComputeNodeRaceTestCase(
@@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         self.nodename = 'fake-node'
         self.ctxt = context.get_admin_context()
 
-    def test_node_rebalance_deleted_compute_node_race(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # Simulate a service running and then stopping. host_a runs, creates
         # fake-node, then is stopped. The fake-node compute node is destroyed.
         # This leaves a soft-deleted node in the DB.
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 332e92bbf72a..249d1a5302d5 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             {'node': mock.sentinel.node}
         )
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager, 'LOG')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock, mock_log):
+                                       update_mock, mock_log, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@
 
         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context, startup=True)
         get_db_nodes.assert_called_once_with(self.context, avail_nodes,
                                              use_slave=True, startup=True)
@@ -460,12 +462,13 @@
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource_destroy_rebalance(
-            self, get_db_nodes, get_avail_nodes, update_mock):
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@
         db_nodes[0].destroy.side_effect = exception.ObjectActionError(
             action='destroy', reason='host changed')
         get_avail_nodes.return_value = set()
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context)
         get_db_nodes.assert_called_once_with(self.context, set(),
                                              use_slave=True, startup=False)
@@ -486,6 +490,36 @@
         mock_rt.remove_node.assert_called_once_with('node1')
         rc_mock.invalidate_resource_provider.assert_called_once_with(
             db_nodes[0].uuid)
+        mock_deleted.assert_called_once_with('node1')
+
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_no_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        mock_deleted.return_value = False
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_not_called()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_not_called()
+        rc_mock.invalidate_resource_provider.assert_not_called()
+        mock_deleted.assert_called_once_with('node1')
 
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 0d8da4d25547..f7f3530c638f 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -1156,6 +1156,26 @@ class ComputeDriver(object):
         """
         raise NotImplementedError()
 
+    def is_node_deleted(self, nodename):
+        """Check if this compute node has been deleted.
+
+        This method is called when the compute manager notices that a
+        node that was previously reported as available is no longer
+        available.
+        In this case, we need to know if the node has actually been
+        deleted, or if it is simply no longer managed by this
+        nova-compute service.
+        If True is returned, the database and placement records will
+        be removed.
+
+        :param nodename:
+            node which the caller wants to check for deletion
+        :returns: True if the node is safe to delete
+        """
+        # For most drivers, compute nodes should never get deleted
+        # during a periodic task; they are only deleted via the API.
+        return False
+
     def pre_live_migration(self, context, instance, block_device_info,
                            network_info, disk_info, migrate_data):
         """Prepare an instance for live migration
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 446b0db60756..ce9e0e29063d 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -697,6 +697,14 @@ class IronicDriver(virt_driver.ComputeDriver):
         except sdk_exc.ResourceNotFound:
             return False
 
+    def is_node_deleted(self, nodename):
+        # check if the node is missing in Ironic
+        try:
+            self._get_node(nodename)
+            return False
+        except sdk_exc.ResourceNotFound:
+            return True
+
     def _refresh_hash_ring(self, ctxt):
         # When requesting a shard, we assume each compute service is
         # targeting a separate shard, so hard code peer_list to
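
For illustration only: an out-of-tree virt driver whose node inventory can legitimately shrink at runtime could opt in to the same cleanup by overriding the new hook. A minimal sketch, assuming a hypothetical MyClusterDriver with an assumed _cluster_reports_node helper (neither is part of this patch):

    from nova.virt import driver as virt_driver

    class MyClusterDriver(virt_driver.ComputeDriver):
        def is_node_deleted(self, nodename):
            # Mirrors the ironic override above: a node is only safe to
            # delete once the backing cluster no longer reports it.
            # _cluster_reports_node is an assumed helper, not a nova API.
            return not self._cluster_reports_node(nodename)

Drivers that keep the base-class default (return False) retain the conservative behaviour: orphaned ComputeNode records are logged by the periodic task but never removed.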