Avoid deleting allocations for instances being built

The resource tracker's _remove_deleted_instances_allocations() assumes that
InstanceNotFound means that an instance was deleted. That's not quite accurate,
as we would also see that in the window between creating allocations and actually
creating the instance in the cell database. So, the code now will kill
allocations for those instances before they are created.

This change makes us look up the instance with read_deleted=yes, and if we find
it with deleted=True, then we do the allocation removal. This does mean that
someone running a full DB archive at the instant an instance is deleted in some
way that didn't result in allocation removal as well could leak those. However,
we can log that (unlikely) situation.

Closes-Bug: #1729371

Conflicts:
      nova/compute/resource_tracker.py
      nova/tests/unit/compute/test_resource_tracker.py

NOTE(mriedem): Conflicts were due to not having change
1ff1310683 or change
e3b7f43e3a in Pike.

Change-Id: I4482ac2ecf8e07c197fd24c520b7f11fd5a10945
(cherry picked from commit d176175db4)
This commit is contained in:
Dan Smith 2017-11-01 08:49:07 -07:00 committed by Matt Riedemann
parent 60d6e87cac
commit 70ef1fa159
2 changed files with 68 additions and 69 deletions

View File

@ -1156,6 +1156,7 @@ class ResourceTracker(object):
known_instances = set(self.tracked_instances.keys())
allocations = self.reportclient.get_allocations_for_resource_provider(
cn.uuid) or {}
read_deleted_context = context.elevated(read_deleted='yes')
for instance_uuid, alloc in allocations.items():
if instance_uuid in known_instances:
LOG.debug("Instance %s actively managed on this compute host "
@ -1163,9 +1164,23 @@ class ResourceTracker(object):
instance_uuid, alloc)
continue
try:
instance = objects.Instance.get_by_uuid(context, instance_uuid,
instance = objects.Instance.get_by_uuid(read_deleted_context,
instance_uuid,
expected_attrs=[])
except exception.InstanceNotFound:
# The instance isn't even in the database. Either the scheduler
# _just_ created an allocation for it and we're racing with the
# creation in the cell database, or the instance was deleted
# and fully archived before we got a chance to run this. The
# former is far more likely than the latter. Avoid deleting
# allocations for a building instance here.
LOG.info("Instance %(uuid)s has allocations against this "
"compute host but is not found in the database.",
{'uuid': instance_uuid},
exc_info=False)
continue
if instance.deleted:
# The instance is gone, so we definitely want to remove
# allocations associated with it.
# NOTE(jaypipes): This will not be true if/when we support

View File

@ -108,6 +108,7 @@ _INSTANCE_TYPE_FIXTURES = {
'rxtx_factor': 0,
'vcpu_weight': 1,
'extra_specs': {},
'deleted': 0,
},
2: {
'id': 2,
@ -121,6 +122,7 @@ _INSTANCE_TYPE_FIXTURES = {
'rxtx_factor': 0,
'vcpu_weight': 1,
'extra_specs': {},
'deleted': 0,
},
}
@ -129,11 +131,11 @@ _INSTANCE_TYPE_OBJ_FIXTURES = {
1: objects.Flavor(id=1, flavorid='fakeid-1', name='fake1.small',
memory_mb=128, vcpus=1, root_gb=1,
ephemeral_gb=0, swap=0, rxtx_factor=0,
vcpu_weight=1, extra_specs={}),
vcpu_weight=1, extra_specs={}, deleted=False),
2: objects.Flavor(id=2, flavorid='fakeid-2', name='fake1.medium',
memory_mb=256, vcpus=2, root_gb=5,
ephemeral_gb=0, swap=0, rxtx_factor=0,
vcpu_weight=1, extra_specs={}),
vcpu_weight=1, extra_specs={}, deleted=False),
}
@ -193,6 +195,7 @@ _INSTANCE_FIXTURES = [
flavor = _INSTANCE_TYPE_OBJ_FIXTURES[1],
old_flavor = _INSTANCE_TYPE_OBJ_FIXTURES[1],
new_flavor = _INSTANCE_TYPE_OBJ_FIXTURES[1],
deleted = False,
),
objects.Instance(
id=2,
@ -216,6 +219,7 @@ _INSTANCE_FIXTURES = [
flavor = _INSTANCE_TYPE_OBJ_FIXTURES[2],
old_flavor = _INSTANCE_TYPE_OBJ_FIXTURES[2],
new_flavor = _INSTANCE_TYPE_OBJ_FIXTURES[2],
deleted = False,
),
]
@ -478,7 +482,7 @@ class TestUpdateAvailableResources(BaseTestCase):
# parameter that update_available_resource() eventually passes
# to _update().
with mock.patch.object(self.rt, '_update') as update_mock:
self.rt.update_available_resource(mock.sentinel.ctx, _NODENAME)
self.rt.update_available_resource(mock.MagicMock(), _NODENAME)
return update_mock
@mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
@ -534,16 +538,16 @@ class TestUpdateAvailableResources(BaseTestCase):
vd = self.driver_mock
vd.get_available_resource.assert_called_once_with(_NODENAME)
get_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
get_mock.assert_called_once_with(mock.ANY, _HOSTNAME,
_NODENAME,
expected_attrs=[
'system_metadata',
'numa_topology',
'flavor',
'migration_context'])
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME,
_NODENAME)
migr_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
migr_mock.assert_called_once_with(mock.ANY, _HOSTNAME,
_NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
@ -585,8 +589,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
'free_disk_gb': 5, # 6GB avail - 1 GB reserved
@ -633,8 +636,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
'free_disk_gb': 5, # 6 - 1 used
@ -695,8 +697,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
'free_disk_gb': 6,
@ -760,8 +761,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
'free_disk_gb': 5,
@ -820,8 +820,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
'free_disk_gb': 1,
@ -877,8 +876,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
'free_disk_gb': 1,
@ -943,8 +941,7 @@ class TestUpdateAvailableResources(BaseTestCase):
update_mock = self._update_available_resources()
get_cn_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
_NODENAME)
get_cn_mock.assert_called_once_with(mock.ANY, _HOSTNAME, _NODENAME)
expected_resources = copy.deepcopy(_COMPUTE_NODE_FIXTURES[0])
vals = {
# 6 total - 1G existing - 5G new flav - 1G old flav
@ -1729,7 +1726,7 @@ class TestResize(BaseTestCase):
mig_context_obj = _MIGRATION_CONTEXT_FIXTURES[instance.uuid]
instance.migration_context = mig_context_obj
self.rt.update_available_resource(mock.sentinel.ctx, _NODENAME)
self.rt.update_available_resource(mock.MagicMock(), _NODENAME)
migration = objects.Migration(
id=3,
@ -1830,7 +1827,7 @@ class TestResize(BaseTestCase):
ctx = mock.MagicMock()
# Init compute node
self.rt.update_available_resource(mock.sentinel.ctx, _NODENAME)
self.rt.update_available_resource(mock.MagicMock(), _NODENAME)
expected = self.rt.compute_nodes[_NODENAME].obj_clone()
instance = _INSTANCE_FIXTURES[0].obj_clone()
@ -1955,7 +1952,7 @@ class TestResize(BaseTestCase):
migr_mock.return_value = []
get_cn_mock.return_value = _COMPUTE_NODE_FIXTURES[0]
self.rt.update_available_resource(mock.sentinel.ctx, _NODENAME)
self.rt.update_available_resource(mock.MagicMock(), _NODENAME)
instance = _INSTANCE_FIXTURES[0].obj_clone()
instance.task_state = task_states.RESIZE_MIGRATING
@ -2086,7 +2083,7 @@ class TestResize(BaseTestCase):
migr_mock.return_value = []
get_cn_mock.return_value = _COMPUTE_NODE_FIXTURES[0].obj_clone()
self.rt.update_available_resource(mock.sentinel.ctx, _NODENAME)
self.rt.update_available_resource(mock.MagicMock(), _NODENAME)
# Instance #1 is resizing to instance type 2 which has 2 vCPUs, 256MB
# RAM and 5GB root disk.
@ -2222,7 +2219,8 @@ class TestRebuild(BaseTestCase):
migr_mock.return_value = []
get_cn_mock.return_value = _COMPUTE_NODE_FIXTURES[0].obj_clone()
self.rt.update_available_resource(mock.sentinel.ctx, _NODENAME)
ctx = mock.MagicMock()
self.rt.update_available_resource(ctx, _NODENAME)
# Now emulate the evacuate command by calling rebuild_claim() on the
# resource tracker as the compute manager does, supplying a Migration
@ -2440,16 +2438,40 @@ class TestUpdateUsageFromInstance(BaseTestCase):
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
mock_inst_get.side_effect = exc.InstanceNotFound(
instance_id=uuids.deleted)
mock_inst_get.return_value = objects.Instance(
uuid=uuids.deleted, deleted=True)
cn = self.rt.compute_nodes[_NODENAME]
ctx = mock.sentinel.ctx
ctx = mock.MagicMock()
# Call the method.
self.rt._remove_deleted_instances_allocations(ctx, cn)
# Only one call should be made to delete allocations, and that should
# be for the first instance created above
rc.delete_allocation_for_instance.assert_called_once_with(
uuids.deleted)
mock_inst_get.assert_called_once_with(
ctx.elevated.return_value,
uuids.deleted,
expected_attrs=[])
ctx.elevated.assert_called_once_with(read_deleted='yes')
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_building_instance(self,
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance"}
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
mock_inst_get.side_effect = exc.InstanceNotFound(
instance_id=uuids.deleted)
cn = self.rt.compute_nodes[_NODENAME]
ctx = mock.MagicMock()
# Call the method.
self.rt._remove_deleted_instances_allocations(ctx, cn)
# Instance wasn't found in the database at all, so the allocation
# should not have been deleted
self.assertFalse(rc.delete_allocation_for_instance.called)
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_scheduled_instance(self,
@ -2469,7 +2491,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get.side_effect = get_by_uuid
cn = self.rt.compute_nodes[_NODENAME]
ctx = mock.sentinel.ctx
ctx = mock.MagicMock()
# Call the method.
self.rt._remove_deleted_instances_allocations(ctx, cn)
# Scheduled instances should not have their allocations removed
@ -2498,48 +2520,10 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_get.return_value = instance
cn = self.rt.compute_nodes[_NODENAME]
ctx = mock.sentinel.ctx
ctx = mock.MagicMock()
self.rt._remove_deleted_instances_allocations(ctx, cn)
mock_delete_allocs.assert_not_called()
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations_no_instance(self,
mock_inst_get):
# If for some reason an instance is no longer available, but
# there are allocations for it, we want to be sure those
# allocations are removed, not that an InstanceNotFound
# exception is not caught. Here we set up some allocations,
# one of which is for an instance that can no longer be
# found.
rc = self.rt.reportclient
self.rt.tracked_instances = {}
# Create 1 instance
instance = _INSTANCE_FIXTURES[0].obj_clone()
instance.uuid = uuids.inst0
# Mock out the allocation call
allocs = {uuids.scheduled: "fake_scheduled_instance",
uuids.inst0: "fake_instance_gone"}
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
ret = _INSTANCE_FIXTURES[0].obj_clone()
ret.uuid = inst_uuid
if inst_uuid == uuids.scheduled:
ret.host = None
return ret
raise exc.InstanceNotFound(instance_id=inst_uuid)
mock_inst_get.side_effect = get_by_uuid
cn = self.rt.compute_nodes[_NODENAME]
ctx = mock.sentinel.ctx
# Call the method.
self.rt._remove_deleted_instances_allocations(ctx, cn)
# One call should be made to delete allocations, for our
# instance that no longer exists.
rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
def test_delete_allocation_for_shelve_offloaded_instance(self):
instance = _INSTANCE_FIXTURES[0].obj_clone()
instance.uuid = uuids.inst0