Don't delete allocation if instance is being scheduled

There is a potential race if something other than the compute service
creates allocations (say, the scheduler): the self-healing process that
runs periodically on the compute service would delete those allocations
because it would not find a related instance for that host.
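
For illustration, a condensed sketch of the previous self-healing pass
(the remove_deleted_instances() report client method removed below):
any allocation whose consumer is not in tracked_instances was treated
as stale, including one the scheduler just wrote for an instance that
has not landed on any host yet.

    # Condensed from the removed remove_deleted_instances(); not verbatim.
    allocations = self._get_allocations(compute_node.uuid) or {}
    tracked = {inst['uuid'] for inst in instance_uuids}
    for consumer_uuid in set(allocations) - tracked:
        # An allocation written during scheduling belongs to an instance
        # that is not yet tracked on this host, so it fell into this set
        # and was deleted as well -- that is the race.
        self._delete_allocation_for_instance(consumer_uuid)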

Also remove the now-unnecessary method from the report client and
instead do the loop in the resource tracker (RT) code, calling the
report client directly to delete each allocation.
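
A condensed sketch of the resulting ResourceTracker helper (the full
version is in the diff below); the behavioural change is the
instance.host check before deleting:

    def _remove_deleted_instances_allocations(self, context, cn):
        tracked_keys = set(self.tracked_instances.keys())
        allocations = self.reportclient.get_allocations_for_resource_provider(
            cn.uuid) or {}
        for instance_uuid in set(allocations.keys()) - tracked_keys:
            instance = objects.Instance.get_by_uuid(context, instance_uuid,
                                                    expected_attrs=[])
            if not instance.host:
                # No host yet: the allocation was written while the instance
                # is still being scheduled, so leave it alone.
                continue
            LOG.warning('Deleting stale allocation for instance %s',
                        instance_uuid)
            self.reportclient.delete_allocation_for_instance(instance_uuid)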

Co-Authored-By: Ed Leafe <ed@leafe.com>

Blueprint: placement-claims

Change-Id: I80ba844a6e0fcea89f80aa253d57ac73092773ae
Sylvain Bauza 2017-04-26 17:16:03 +02:00 committed by EdLeafe
parent d966d33227
commit 439f67e7bc
4 changed files with 64 additions and 58 deletions


@@ -992,11 +992,28 @@ class ResourceTracker(object):
if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
self._update_usage_from_instance(context, instance, nodename)
self.reportclient.remove_deleted_instances(cn,
self.tracked_instances.values())
# Remove allocations for instances that have been removed.
self._remove_deleted_instances_allocations(context, cn)
cn.free_ram_mb = max(0, cn.free_ram_mb)
cn.free_disk_gb = max(0, cn.free_disk_gb)
def _remove_deleted_instances_allocations(self, context, cn):
tracked_keys = set(self.tracked_instances.keys())
allocations = self.reportclient.get_allocations_for_resource_provider(
cn.uuid) or {}
allocations_to_delete = set(allocations.keys()) - tracked_keys
for instance_uuid in allocations_to_delete:
# Allocations related to instances being scheduled should not be
# deleted if we already wrote the allocation previously.
instance = objects.Instance.get_by_uuid(context, instance_uuid,
expected_attrs=[])
if not instance.host:
continue
LOG.warning('Deleting stale allocation for instance %s',
instance_uuid)
self.reportclient.delete_allocation_for_instance(instance_uuid)
def _find_orphaned_instances(self):
"""Given the set of instances and migrations already account for
by resource tracker, sanity check the hypervisor to determine


@@ -884,7 +884,7 @@ class SchedulerReportClient(object):
return r.status_code == 204
@safe_connect
def _delete_allocation_for_instance(self, uuid):
def delete_allocation_for_instance(self, uuid):
url = '/allocations/%s' % uuid
r = self.delete(url)
if r:
@@ -905,10 +905,10 @@ class SchedulerReportClient(object):
if sign > 0:
self._allocate_for_instance(compute_node.uuid, instance)
else:
self._delete_allocation_for_instance(instance.uuid)
self.delete_allocation_for_instance(instance.uuid)
@safe_connect
def _get_allocations(self, rp_uuid):
def get_allocations_for_resource_provider(self, rp_uuid):
url = '/resource_providers/%s/allocations' % rp_uuid
resp = self.get(url)
if not resp:
@@ -916,20 +916,6 @@ class SchedulerReportClient(object):
else:
return resp.json()['allocations']
def remove_deleted_instances(self, compute_node, instance_uuids):
allocations = self._get_allocations(compute_node.uuid)
if allocations is None:
allocations = {}
instance_dict = {instance['uuid']: instance
for instance in instance_uuids}
removed_instances = set(allocations.keys()) - set(instance_dict.keys())
for uuid in removed_instances:
LOG.warning(_LW('Deleting stale allocation for instance %s'),
uuid)
self._delete_allocation_for_instance(uuid)
@safe_connect
def delete_resource_provider(self, context, compute_node, cascade=False):
"""Deletes the ResourceProvider record for the compute_node.
@@ -951,7 +937,7 @@ class SchedulerReportClient(object):
instances = objects.InstanceList.get_by_host_and_node(context,
host, nodename)
for instance in instances:
self._delete_allocation_for_instance(instance.uuid)
self.delete_allocation_for_instance(instance.uuid)
url = "/resource_providers/%s" % rp_uuid
resp = self.delete(url)
if resp:


@@ -2266,6 +2266,44 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_update_usage.assert_called_once_with(
self.rt._get_usage_dict(self.instance), _NODENAME, sign=-1)
@mock.patch('nova.objects.Instance.get_by_uuid')
def test_remove_deleted_instances_allocations(self, mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
# Create 3 instances
instance_uuids = (uuids.inst0, uuids.inst1, uuids.inst2)
for i in range(3):
instance = _INSTANCE_FIXTURES[0].obj_clone()
instance.uuid = instance_uuids[i]
if i > 0:
# Only add the last two to tracked_instances, to simulate one
# deleted instance
inst_dict = obj_base.obj_to_primitive(instance)
self.rt.tracked_instances[instance.uuid] = inst_dict
# Mock out the allocation call
allocs = {uuids.scheduled: "fake_scheduled_instance"}
for i in range(3):
allocs[instance_uuids[i]] = "fake_instance_%s" % i
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
ret = _INSTANCE_FIXTURES[0].obj_clone()
ret.uuid = inst_uuid
if inst_uuid == uuids.scheduled:
ret.host = None
return ret
mock_inst_get.side_effect = get_by_uuid
cn = self.rt.compute_nodes[_NODENAME]
ctx = mock.sentinel.ctx
# Call the method.
self.rt._remove_deleted_instances_allocations(ctx, cn)
# Only one call should be made to delete allocations, and that should
# be for the first instance created above
rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
class TestInstanceInResizeState(test.NoDBTestCase):
def test_active_suspending(self):


@@ -1482,41 +1482,6 @@ class TestAllocations(SchedulerReportClientTestCase):
self.client.update_instance_allocation(cn, inst, -1)
self.assertTrue(mock_warn.called)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete')
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'get')
@mock.patch('nova.scheduler.client.report.'
'_instance_to_allocations_dict')
def test_remove_deleted_instances(self, mock_a, mock_get,
mock_delete):
cn = objects.ComputeNode(uuid=uuids.cn)
inst1 = objects.Instance(uuid=uuids.inst1)
inst2 = objects.Instance(uuid=uuids.inst2)
fake_allocations = {
'MEMORY_MB': 1024,
'VCPU': 2,
'DISK_GB': 101,
}
mock_get.return_value.json.return_value = {'allocations': {
inst1.uuid: fake_allocations,
inst2.uuid: fake_allocations,
}
}
# One instance still on the node, dict form as the
# RT tracks it
inst3 = {'uuid': 'foo'}
mock_delete.return_value = True
self.client.remove_deleted_instances(cn, [inst3])
mock_get.assert_called_once_with(
'/resource_providers/%s/allocations' % cn.uuid)
expected_calls = [
mock.call('/allocations/%s' % inst1.uuid),
mock.call('/allocations/%s' % inst2.uuid)]
mock_delete.assert_has_calls(expected_calls, any_order=True)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete')
@mock.patch('nova.scheduler.client.report.LOG')
@@ -1532,7 +1497,7 @@ class TestAllocations(SchedulerReportClientTestCase):
# py3 uses __bool__
mock_response.__bool__.return_value = False
mock_delete.return_value = mock_response
self.client._delete_allocation_for_instance(uuids.rp_uuid)
self.client.delete_allocation_for_instance(uuids.rp_uuid)
# make sure we didn't screw up the logic or the mock
mock_log.info.assert_not_called()
# make sure warning wasn't called for the 404
@@ -1541,7 +1506,7 @@ class TestAllocations(SchedulerReportClientTestCase):
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"_delete_allocation_for_instance")
"delete_allocation_for_instance")
@mock.patch("nova.objects.InstanceList.get_by_host_and_node")
def test_delete_resource_provider_cascade(self, mock_by_host,
mock_del_alloc, mock_delete):
@@ -1563,7 +1528,7 @@ class TestAllocations(SchedulerReportClientTestCase):
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete")
@mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"_delete_allocation_for_instance")
"delete_allocation_for_instance")
@mock.patch("nova.objects.InstanceList.get_by_host_and_node")
def test_delete_resource_provider_no_cascade(self, mock_by_host,
mock_del_alloc, mock_delete):