Merge "Make compute node rebalance safer"
This commit is contained in:
commit
d29a9b64ee
|
@ -10644,6 +10644,19 @@ class ComputeManager(manager.Manager):
|
||||||
# Delete orphan compute node not reported by driver but still in db
|
# Delete orphan compute node not reported by driver but still in db
|
||||||
for cn in compute_nodes_in_db:
|
for cn in compute_nodes_in_db:
|
||||||
if cn.hypervisor_hostname not in nodenames:
|
if cn.hypervisor_hostname not in nodenames:
|
||||||
|
# if the node could be migrated, we don't delete
|
||||||
|
# the compute node database records
|
||||||
|
if not self.driver.is_node_deleted(cn.hypervisor_hostname):
|
||||||
|
LOG.warning(
|
||||||
|
"Found orphan compute node %(id)s "
|
||||||
|
"hypervisor host is %(hh)s, "
|
||||||
|
"nodes are %(nodes)s. "
|
||||||
|
"We are not deleting this as the driver "
|
||||||
|
"says this node has not been deleted.",
|
||||||
|
{'id': cn.id, 'hh': cn.hypervisor_hostname,
|
||||||
|
'nodes': nodenames})
|
||||||
|
continue
|
||||||
|
|
||||||
LOG.info("Deleting orphan compute node %(id)s "
|
LOG.info("Deleting orphan compute node %(id)s "
|
||||||
"hypervisor host is %(hh)s, "
|
"hypervisor host is %(hh)s, "
|
||||||
"nodes are %(nodes)s",
|
"nodes are %(nodes)s",
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
from nova import context
|
from nova import context
|
||||||
from nova.db.main import api as db_api
|
from nova.db.main import api as db_api
|
||||||
|
@ -20,6 +21,7 @@ from nova.tests import fixtures as nova_fixtures
|
||||||
from nova.tests.functional import fixtures as func_fixtures
|
from nova.tests.functional import fixtures as func_fixtures
|
||||||
from nova.tests.functional import integrated_helpers
|
from nova.tests.functional import integrated_helpers
|
||||||
from nova import utils
|
from nova import utils
|
||||||
|
from nova.virt import fake as fake_driver
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
|
||||||
# for each node.
|
# for each node.
|
||||||
self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
|
self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
|
||||||
|
|
||||||
def test_update_available_resource_node_recreate(self):
|
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
|
||||||
|
def test_update_available_resource_node_recreate(self, mock_delete):
|
||||||
|
# make the fake driver allow nodes to be delete, like ironic driver
|
||||||
|
mock_delete.return_value = True
|
||||||
# First we create a compute service to manage a couple of fake nodes.
|
# First we create a compute service to manage a couple of fake nodes.
|
||||||
compute = self.start_service('compute', 'node1')
|
compute = self.start_service('compute', 'node1')
|
||||||
# When start_service runs, it will create the node1 ComputeNode.
|
# When start_service runs, it will create the node1 ComputeNode.
|
||||||
|
|
|
@ -15,6 +15,7 @@ from unittest import mock
|
||||||
from nova import context
|
from nova import context
|
||||||
from nova import objects
|
from nova import objects
|
||||||
from nova.tests.functional import integrated_helpers
|
from nova.tests.functional import integrated_helpers
|
||||||
|
from nova.virt import fake as fake_driver
|
||||||
|
|
||||||
|
|
||||||
class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||||
|
@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||||
self.nodename = 'fake-node'
|
self.nodename = 'fake-node'
|
||||||
self.ctxt = context.get_admin_context()
|
self.ctxt = context.get_admin_context()
|
||||||
|
|
||||||
def test_node_rebalance_deleted_compute_node_race(self):
|
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
|
||||||
|
def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
|
||||||
|
# make the fake driver allow nodes to be delete, like ironic driver
|
||||||
|
mock_delete.return_value = True
|
||||||
# Simulate a service running and then stopping. host_a runs, creates
|
# Simulate a service running and then stopping. host_a runs, creates
|
||||||
# fake-node, then is stopped. The fake-node compute node is destroyed.
|
# fake-node, then is stopped. The fake-node compute node is destroyed.
|
||||||
# This leaves a soft-deleted node in the DB.
|
# This leaves a soft-deleted node in the DB.
|
||||||
|
|
|
@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||||
{'node': mock.sentinel.node}
|
{'node': mock.sentinel.node}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
|
||||||
@mock.patch.object(manager, 'LOG')
|
@mock.patch.object(manager, 'LOG')
|
||||||
@mock.patch.object(manager.ComputeManager,
|
@mock.patch.object(manager.ComputeManager,
|
||||||
'_update_available_resource_for_node')
|
'_update_available_resource_for_node')
|
||||||
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||||
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
|
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
|
||||||
def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
|
def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
|
||||||
update_mock, mock_log):
|
update_mock, mock_log, mock_deleted):
|
||||||
mock_rt = self._mock_rt()
|
mock_rt = self._mock_rt()
|
||||||
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
|
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
|
||||||
self.compute, 'reportclient')).mock
|
self.compute, 'reportclient')).mock
|
||||||
|
@ -415,6 +416,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||||
|
|
||||||
get_db_nodes.return_value = db_nodes
|
get_db_nodes.return_value = db_nodes
|
||||||
get_avail_nodes.return_value = avail_nodes
|
get_avail_nodes.return_value = avail_nodes
|
||||||
|
mock_deleted.return_value = True
|
||||||
self.compute.update_available_resource(self.context, startup=True)
|
self.compute.update_available_resource(self.context, startup=True)
|
||||||
get_db_nodes.assert_called_once_with(self.context, avail_nodes,
|
get_db_nodes.assert_called_once_with(self.context, avail_nodes,
|
||||||
use_slave=True, startup=True)
|
use_slave=True, startup=True)
|
||||||
|
@ -460,12 +462,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||||
update_mock.assert_not_called()
|
update_mock.assert_not_called()
|
||||||
del_rp_mock.assert_not_called()
|
del_rp_mock.assert_not_called()
|
||||||
|
|
||||||
|
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
|
||||||
@mock.patch.object(manager.ComputeManager,
|
@mock.patch.object(manager.ComputeManager,
|
||||||
'_update_available_resource_for_node')
|
'_update_available_resource_for_node')
|
||||||
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||||
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
|
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
|
||||||
def test_update_available_resource_destroy_rebalance(
|
def test_update_available_resource_destroy_rebalance(
|
||||||
self, get_db_nodes, get_avail_nodes, update_mock):
|
self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
|
||||||
mock_rt = self._mock_rt()
|
mock_rt = self._mock_rt()
|
||||||
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
|
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
|
||||||
self.compute, 'reportclient')).mock
|
self.compute, 'reportclient')).mock
|
||||||
|
@ -476,6 +479,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||||
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
|
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
|
||||||
action='destroy', reason='host changed')
|
action='destroy', reason='host changed')
|
||||||
get_avail_nodes.return_value = set()
|
get_avail_nodes.return_value = set()
|
||||||
|
mock_deleted.return_value = True
|
||||||
self.compute.update_available_resource(self.context)
|
self.compute.update_available_resource(self.context)
|
||||||
get_db_nodes.assert_called_once_with(self.context, set(),
|
get_db_nodes.assert_called_once_with(self.context, set(),
|
||||||
use_slave=True, startup=False)
|
use_slave=True, startup=False)
|
||||||
|
@ -486,6 +490,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||||
mock_rt.remove_node.assert_called_once_with('node1')
|
mock_rt.remove_node.assert_called_once_with('node1')
|
||||||
rc_mock.invalidate_resource_provider.assert_called_once_with(
|
rc_mock.invalidate_resource_provider.assert_called_once_with(
|
||||||
db_nodes[0].uuid)
|
db_nodes[0].uuid)
|
||||||
|
mock_deleted.assert_called_once_with('node1')
|
||||||
|
|
||||||
|
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
|
||||||
|
@mock.patch.object(manager.ComputeManager,
|
||||||
|
'_update_available_resource_for_node')
|
||||||
|
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
|
||||||
|
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
|
||||||
|
def test_update_available_resource_no_destroy_rebalance(
|
||||||
|
self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
|
||||||
|
mock_rt = self._mock_rt()
|
||||||
|
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
|
||||||
|
self.compute, 'reportclient')).mock
|
||||||
|
db_nodes = [self._make_compute_node('node1', 1)]
|
||||||
|
get_db_nodes.return_value = db_nodes
|
||||||
|
# Destroy can fail if nodes were rebalanced between getting the node
|
||||||
|
# list and calling destroy.
|
||||||
|
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
|
||||||
|
action='destroy', reason='host changed')
|
||||||
|
get_avail_nodes.return_value = set()
|
||||||
|
mock_deleted.return_value = False
|
||||||
|
self.compute.update_available_resource(self.context)
|
||||||
|
get_db_nodes.assert_called_once_with(self.context, set(),
|
||||||
|
use_slave=True, startup=False)
|
||||||
|
self.assertEqual(0, update_mock.call_count)
|
||||||
|
|
||||||
|
db_nodes[0].destroy.assert_not_called()
|
||||||
|
self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
|
||||||
|
mock_rt.remove_node.assert_not_called()
|
||||||
|
rc_mock.invalidate_resource_provider.assert_not_called()
|
||||||
|
mock_deleted.assert_called_once_with('node1')
|
||||||
|
|
||||||
@mock.patch('nova.context.get_admin_context')
|
@mock.patch('nova.context.get_admin_context')
|
||||||
def test_pre_start_hook(self, get_admin_context):
|
def test_pre_start_hook(self, get_admin_context):
|
||||||
|
|
|
@ -1156,6 +1156,26 @@ class ComputeDriver(object):
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def is_node_deleted(self, nodename):
|
||||||
|
"""Check this compute node has been deleted.
|
||||||
|
|
||||||
|
This method is called when the compute manager notices that a
|
||||||
|
node that was previously reported as available is no longer
|
||||||
|
available.
|
||||||
|
In this case, we need to know if the node has actually been
|
||||||
|
deleted, or if it is simply no longer managed by this
|
||||||
|
nova-compute service.
|
||||||
|
If True is returned, the database and placement records will
|
||||||
|
be removed.
|
||||||
|
|
||||||
|
:param nodename:
|
||||||
|
node which the caller wants to check if its deleted
|
||||||
|
:returns: True if the node is safe to delete
|
||||||
|
"""
|
||||||
|
# For most driver, compute nodes should never get deleted
|
||||||
|
# during a periodic task, they are only deleted via the API
|
||||||
|
return False
|
||||||
|
|
||||||
def pre_live_migration(self, context, instance, block_device_info,
|
def pre_live_migration(self, context, instance, block_device_info,
|
||||||
network_info, disk_info, migrate_data):
|
network_info, disk_info, migrate_data):
|
||||||
"""Prepare an instance for live migration
|
"""Prepare an instance for live migration
|
||||||
|
|
|
@ -697,6 +697,14 @@ class IronicDriver(virt_driver.ComputeDriver):
|
||||||
except sdk_exc.ResourceNotFound:
|
except sdk_exc.ResourceNotFound:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def is_node_deleted(self, nodename):
|
||||||
|
# check if the node is missing in Ironic
|
||||||
|
try:
|
||||||
|
self._get_node(nodename)
|
||||||
|
return False
|
||||||
|
except sdk_exc.ResourceNotFound:
|
||||||
|
return True
|
||||||
|
|
||||||
def _refresh_hash_ring(self, ctxt):
|
def _refresh_hash_ring(self, ctxt):
|
||||||
# When requesting a shard, we assume each compute service is
|
# When requesting a shard, we assume each compute service is
|
||||||
# targeting a separate shard, so hard code peer_list to
|
# targeting a separate shard, so hard code peer_list to
|
||||||
|
|
Loading…
Reference in New Issue