[stable only] Handle quota usage during create/delete races

There is a race during an instance delete while booting where, if the
build request was found, there is still a chance we can fail to look up
the instance record, and if so, we need to go ahead and commit the
quota decrement reservations. We're currently skipping the
quotas.commit() if we *don't* find the instance record. I believe this
is a regression as of commit 361b883831.

There are two reasons why we may fail to find the instance record after
we successfully found the build request:

  a) Conductor didn't create the instance record yet.
  b) Conductor deleted the instance record after finding the build
     request was deleted by us (the API).

In either case, conductor doesn't do anything to decrement quota usage,
so we need to do it in the compute/api.
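
Roughly, the fixed flow looks like the sketch below (condensed from the
nova/compute/api.py hunk further down; cell targeting and error
handling are trimmed, so treat it as an outline rather than the exact
code):

    cell, instance = self._lookup_instance(context, instance.uuid)
    if instance is not None:
        # If instance is None it has already been deleted.
        instance.destroy()
    # Commit the quota decrement whether or not the instance record was
    # found: the build request is already gone, so conductor will not
    # decrement usage for us.
    quotas.commit()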

There is a second race where we find the build request and succeed in
looking up the instance record but then fail to delete the instance
record (instance.destroy() raises InstanceNotFound). This can happen if:

  c) Conductor deleted the instance record (after finding the build
     request was deleted by us) after we looked up the instance but
     before we called instance.destroy().
  d) We (the API) are racing with another delete request.

If c) happens, we need to quotas.commit(). If d) happens, we need to
quotas.rollback(). Since we can't know which case it was when we get
InstanceNotFound, we'll do a quotas.rollback() to get rid of the
reservation record and force a refresh of quota usages to make sure
we end up with correct usage in case rollback was not the right choice.
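
In code, the exception path ends up roughly like the following
(condensed from the same nova/compute/api.py hunk; the
quota.QUOTAS.usage_refresh call and its resource_names argument are
taken from that hunk):

    try:
        # ... look up and destroy the instance as above ...
        quotas.commit()
    except exception.InstanceNotFound:
        # Either we lost a race with another delete request (rollback
        # is correct) or conductor already deleted the record after we
        # looked it up (commit would have been correct). Roll back to
        # drop the reservation, then refresh usage so the totals come
        # out right either way.
        quotas.rollback()
        quota.QUOTAS.usage_refresh(
            context, resource_names=['instances', 'cores', 'ram'])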

Closes-Bug: #1783613

Change-Id: I4d492dcad193ff0b1202dcae97954cc65b75c109
melanie witt 2018-07-12 22:42:19 +00:00
parent 3120f40f63
commit 456b3d68bc
3 changed files with 59 additions and 22 deletions

@@ -77,6 +77,7 @@ from nova.pci import request as pci_request
from nova.policies import servers as servers_policies
import nova.policy
from nova import profiler
from nova import quota
from nova import rpc
from nova.scheduler import client as scheduler_client
from nova.scheduler import utils as scheduler_utils
@@ -1743,16 +1744,11 @@ class API(base.Base):
instance.task_state,
project_id, user_id)
try:
# NOTE(alaski): Though the conductor halts the build process it
# does not currently delete the instance record. This is
# because in the near future the instance record will not be
# created if the buildrequest has been deleted here. For now we
# ensure the instance has been set to deleted at this point.
# Yes this directly contradicts the comment earlier in this
# method, but this is a temporary measure.
# Look up the instance because the current instance object was
# stashed on the buildrequest and therefore not complete enough
# to run .destroy().
# NOTE(melwitt): _lookup_instance doesn't raise
# InstanceNotFound, it returns (None, None) instead.
cell, instance = self._lookup_instance(context, instance.uuid)
if instance is not None:
# If instance is None it has already been deleted.
@@ -1763,9 +1759,37 @@
instance.destroy()
else:
instance.destroy()
quotas.commit()
# NOTE(melwitt): We need to commit the quota decrement whether
# we find the instance or not because we can't get here unless
# we succeeded in deleting the build request earlier in this
# method. So, if we failed to lookup the instance here, it
# means either:
# a) Conductor didn't create the instance record yet.
# b) Conductor has deleted the instance record after finding
# the build request was deleted by us (in the API).
# In either case, conductor doesn't do anything to decrement
# quota usage, so we need to do it here.
quotas.commit()
except exception.InstanceNotFound:
# InstanceNotFound can be raised by instance.destroy() if the
# instance was deleted by another racing delete request. If
# that happens, we should rollback quota here instead of
# committing it.
# We need to either rollback or commit here to get rid of the
# reservation record, in any case.
quotas.rollback()
# InstanceNotFound could also be raised by instance.destroy()
# if conductor deleted the instance record after we looked it
# up but before we attempted to destroy it. If that happened,
# we are racing with a create request and the correct thing to
# do would be to commit the reservations instead of rolling
# them back. Since we can't differentiate between the two
# scenarios for InstanceNotFound, refresh the quota usage for
# instances, cores, and ram to clean things up if rollback()
# wasn't the right choice.
quota.QUOTAS.usage_refresh(context,
resource_names=['instances',
'cores', 'ram'])
return True
return False

@@ -107,6 +107,10 @@ class TestDeleteWhileBootingInstanceNotFound(
current_usage = self.api.get_limits()
self.assertEqual(starting_usage['absolute']['totalInstancesUsed'] + 1,
current_usage['absolute']['totalInstancesUsed'])
self.assertEqual(starting_usage['absolute']['totalCoresUsed'] + 1,
current_usage['absolute']['totalCoresUsed'])
self.assertEqual(starting_usage['absolute']['totalRAMUsed'] + 512,
current_usage['absolute']['totalRAMUsed'])
# Stub out the API to make the instance lookup fail, simulating if
# conductor hadn't created it yet or deleted it after the build
@@ -118,14 +122,15 @@ class TestDeleteWhileBootingInstanceNotFound(
self._delete_server(server['id'])
self._wait_for_instance_delete(server['id'])
# Now check the quota again. Because of the bug, we won't have
# decremented quota.
# Now check the quota again. Since the bug is fixed, ending usage
# should be current usage - 1.
ending_usage = self.api.get_limits()
self.assertEqual(current_usage['absolute']['totalInstancesUsed'],
self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1,
ending_usage['absolute']['totalInstancesUsed'])
# TODO(melwitt): Uncomment this assert when the bug is fixed.
# self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1,
# ending_usage['absolute']['totalInstancesUsed'])
self.assertEqual(current_usage['absolute']['totalCoresUsed'] - 1,
ending_usage['absolute']['totalCoresUsed'])
self.assertEqual(current_usage['absolute']['totalRAMUsed'] - 512,
ending_usage['absolute']['totalRAMUsed'])
def test_delete_while_booting_instance_destroy_fails(self):
# Get the current quota usage
@@ -139,6 +144,10 @@ class TestDeleteWhileBootingInstanceNotFound(
current_usage = self.api.get_limits()
self.assertEqual(starting_usage['absolute']['totalInstancesUsed'] + 1,
current_usage['absolute']['totalInstancesUsed'])
self.assertEqual(starting_usage['absolute']['totalCoresUsed'] + 1,
current_usage['absolute']['totalCoresUsed'])
self.assertEqual(starting_usage['absolute']['totalRAMUsed'] + 512,
current_usage['absolute']['totalRAMUsed'])
# Stub out the API to make the instance destroy raise InstanceNotFound,
# simulating if conductor already deleted it.
@@ -156,11 +165,12 @@ class TestDeleteWhileBootingInstanceNotFound(
self._delete_server(server['id'])
self._wait_for_instance_delete(server['id'])
# Now check the quota again. Because of the bug, we won't have
# decremented quota.
# Now check the quota again. Since the bug is fixed, ending usage
# should be current usage - 1.
ending_usage = self.api.get_limits()
self.assertEqual(current_usage['absolute']['totalInstancesUsed'],
self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1,
ending_usage['absolute']['totalInstancesUsed'])
# TODO(melwitt): Uncomment this assert when the bug is fixed.
# self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1,
# ending_usage['absolute']['totalInstancesUsed'])
self.assertEqual(current_usage['absolute']['totalCoresUsed'] - 1,
ending_usage['absolute']['totalCoresUsed'])
self.assertEqual(current_usage['absolute']['totalRAMUsed'] - 512,
ending_usage['absolute']['totalRAMUsed'])
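
(Note: the stubs these functional tests rely on sit outside the visible
hunks. As a rough illustration only, the second one, which forces
instance.destroy() to raise InstanceNotFound, could be set up with a
plain mock.patch along the lines below; the exact patch target and
fixture style used by the real test are assumptions here.)

    import mock

    from nova import exception

    # Hypothetical stand-in for the test's stub: make Instance.destroy()
    # fail as if conductor had already deleted the record.
    with mock.patch('nova.objects.Instance.destroy',
                    side_effect=exception.InstanceNotFound(
                        instance_id='fake')):
        self._delete_server(server['id'])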

@@ -1467,7 +1467,7 @@ class _ComputeAPIUnitTestMixIn(object):
self.assertTrue(
self.compute_api._delete_while_booting(self.context,
inst))
self.assertFalse(quota_mock.commit.called)
self.assertTrue(quota_mock.commit.called)
test()
@@ -1476,6 +1476,7 @@ class _ComputeAPIUnitTestMixIn(object):
inst = self._create_instance_obj()
quota_mock = mock.MagicMock()
@mock.patch('nova.quota.QUOTAS.usage_refresh')
@mock.patch.object(self.compute_api, '_attempt_delete_of_buildrequest',
return_value=True)
@mock.patch.object(self.compute_api, '_lookup_instance',
@@ -1483,12 +1484,14 @@ class _ComputeAPIUnitTestMixIn(object):
instance_id='fake'))
@mock.patch.object(self.compute_api, '_create_reservations',
return_value=quota_mock)
def test(mock_create_res, mock_lookup, mock_attempt):
def test(mock_create_res, mock_lookup, mock_attempt, mock_refresh):
self.assertTrue(
self.compute_api._delete_while_booting(self.context,
inst))
self.assertFalse(quota_mock.commit.called)
self.assertTrue(quota_mock.rollback.called)
mock_refresh.assert_called_once_with(
self.context, resource_names=['instances', 'cores', 'ram'])
test()