summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormelanie witt <melwittt@gmail.com>2018-07-12 22:42:19 +0000
committermelanie witt <melwittt@gmail.com>2018-08-08 23:02:56 +0000
commit456b3d68bc491a6965316e6a4f3fc11dd509237b (patch)
tree92185ff3cf3d511ae510b18bdbc22073da6951cf
parent3120f40f636ab4cbdcb9f5cd4e9175e99e1e1c63 (diff)
[stable only] Handle quota usage during create/delete racesstable/ocata
There is a race during an instance delete while booting where, if the build request was found, there is still a chance we can fail to lookup the instance record and if so, we need to go ahead and commit the quota decrement reservations. We're currently skipping the quotas.commit() if we *don't* find the instance record. I believe this is a regression as of commit 361b88383130be8da9d474d02a9f62136239d506. There are two reasons why we may fail to find the instance record after we successfully found the build request: a) Conductor didn't create the instance record yet. b) Conductor deleted the instance record after finding the build request was deleted by us (the API) In either case, conductor doesn't do anything to decrement quota usage so we need to do it in the compute/api. There is a second race where we find the build request and succeed in looking up the instance record but then fail to delete the instance record (instance.destroy() raises InstanceNotFound). This can happen if c) Conductor deleted the instance record after finding the build request was deleted by us but after we looked up the instance d) If we (the API) are racing with another delete request If c) happens, we need to quotas.commit(). If d) happens, we need to quotas.rollback(). Since we can't know which case it was when we get InstanceNotFound, we'll do a quotas.rollback() to get rid of the reservation record and force a refresh of quota usages to make sure we end with correct usage in case rollback was not the right choice. Closes-Bug: #1783613 Change-Id: I4d492dcad193ff0b1202dcae97954cc65b75c109
Notes
Notes (review): Code-Review+2: Matt Riedemann <mriedem.os@gmail.com> Code-Review+2: Dan Smith <dms@danplanet.com> Workflow+1: Dan Smith <dms@danplanet.com> Verified+2: Zuul Submitted-by: Zuul Submitted-at: Thu, 09 Aug 2018 22:31:49 +0000 Reviewed-on: https://review.openstack.org/582413 Project: openstack/nova Branch: refs/heads/stable/ocata
-rw-r--r--nova/compute/api.py40
-rw-r--r--nova/tests/functional/regressions/test_bug_1783613.py34
-rw-r--r--nova/tests/unit/compute/test_compute_api.py7
3 files changed, 59 insertions, 22 deletions
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 2401f0e..07babad 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -77,6 +77,7 @@ from nova.pci import request as pci_request
77from nova.policies import servers as servers_policies 77from nova.policies import servers as servers_policies
78import nova.policy 78import nova.policy
79from nova import profiler 79from nova import profiler
80from nova import quota
80from nova import rpc 81from nova import rpc
81from nova.scheduler import client as scheduler_client 82from nova.scheduler import client as scheduler_client
82from nova.scheduler import utils as scheduler_utils 83from nova.scheduler import utils as scheduler_utils
@@ -1743,16 +1744,11 @@ class API(base.Base):
1743 instance.task_state, 1744 instance.task_state,
1744 project_id, user_id) 1745 project_id, user_id)
1745 try: 1746 try:
1746 # NOTE(alaski): Though the conductor halts the build process it
1747 # does not currently delete the instance record. This is
1748 # because in the near future the instance record will not be
1749 # created if the buildrequest has been deleted here. For now we
1750 # ensure the instance has been set to deleted at this point.
1751 # Yes this directly contradicts the comment earlier in this
1752 # method, but this is a temporary measure.
1753 # Look up the instance because the current instance object was 1747 # Look up the instance because the current instance object was
1754 # stashed on the buildrequest and therefore not complete enough 1748 # stashed on the buildrequest and therefore not complete enough
1755 # to run .destroy(). 1749 # to run .destroy().
1750 # NOTE(melwitt): _lookup_instance doesn't raise
1751 # InstanceNotFound, it returns (None, None) instead.
1756 cell, instance = self._lookup_instance(context, instance.uuid) 1752 cell, instance = self._lookup_instance(context, instance.uuid)
1757 if instance is not None: 1753 if instance is not None:
1758 # If instance is None it has already been deleted. 1754 # If instance is None it has already been deleted.
@@ -1763,9 +1759,37 @@ class API(base.Base):
1763 instance.destroy() 1759 instance.destroy()
1764 else: 1760 else:
1765 instance.destroy() 1761 instance.destroy()
1766 quotas.commit() 1762 # NOTE(melwitt): We need to commit the quota decrement whether
1763 # we find the instance or not because we can't get here unless
1764 # we succeeded in deleting the build request earlier in this
1765 # method. So, if we failed to lookup the instance here, it
1766 # means either:
1767 # a) Conductor didn't create the instance record yet.
1768 # b) Conductor has deleted the instance record after finding
1769 # the build request was deleted by us (in the API).
1770 # In either case, conductor doesn't do anything to decrement
1771 # quota usage, so we need to do it here.
1772 quotas.commit()
1767 except exception.InstanceNotFound: 1773 except exception.InstanceNotFound:
1774 # InstanceNotFound can be raised by instance.destroy() if the
1775 # instance was deleted by another racing delete request. If
1776 # that happens, we should rollback quota here instead of
1777 # committing it.
1778 # We need to either rollback or commit here to get rid of the
1779 # reservation record, in any case.
1768 quotas.rollback() 1780 quotas.rollback()
1781 # InstanceNotFound could also be raised by instance.destroy()
1782 # if conductor deleted the instance record after we looked it
1783 # up but before we attempted to destroy it. If that happened,
1784 # we are racing with a create request and the correct thing to
1785 # do would be to commit the reservations instead of rolling
1786 # them back. Since we can't differentiate between the two
1787 # scenarios for InstanceNotFound, refresh the quota usage for
1788 # instances, cores, and ram to clean things up if rollback()
1789 # wasn't the right choice.
1790 quota.QUOTAS.usage_refresh(context,
1791 resource_names=['instances',
1792 'cores', 'ram'])
1769 1793
1770 return True 1794 return True
1771 return False 1795 return False
diff --git a/nova/tests/functional/regressions/test_bug_1783613.py b/nova/tests/functional/regressions/test_bug_1783613.py
index 781e46d..e1714ca 100644
--- a/nova/tests/functional/regressions/test_bug_1783613.py
+++ b/nova/tests/functional/regressions/test_bug_1783613.py
@@ -107,6 +107,10 @@ class TestDeleteWhileBootingInstanceNotFound(
107 current_usage = self.api.get_limits() 107 current_usage = self.api.get_limits()
108 self.assertEqual(starting_usage['absolute']['totalInstancesUsed'] + 1, 108 self.assertEqual(starting_usage['absolute']['totalInstancesUsed'] + 1,
109 current_usage['absolute']['totalInstancesUsed']) 109 current_usage['absolute']['totalInstancesUsed'])
110 self.assertEqual(starting_usage['absolute']['totalCoresUsed'] + 1,
111 current_usage['absolute']['totalCoresUsed'])
112 self.assertEqual(starting_usage['absolute']['totalRAMUsed'] + 512,
113 current_usage['absolute']['totalRAMUsed'])
110 114
111 # Stub out the API to make the instance lookup fail, simulating if 115 # Stub out the API to make the instance lookup fail, simulating if
112 # conductor hadn't yet created it yet or deleted it after the build 116 # conductor hadn't yet created it yet or deleted it after the build
@@ -118,14 +122,15 @@ class TestDeleteWhileBootingInstanceNotFound(
118 self._delete_server(server['id']) 122 self._delete_server(server['id'])
119 self._wait_for_instance_delete(server['id']) 123 self._wait_for_instance_delete(server['id'])
120 124
121 # Now check the quota again. Because of the bug, we won't have 125 # Now check the quota again. Since the bug is fixed, ending usage
122 # decremented quota. 126 # should be current usage - 1.
123 ending_usage = self.api.get_limits() 127 ending_usage = self.api.get_limits()
124 self.assertEqual(current_usage['absolute']['totalInstancesUsed'], 128 self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1,
125 ending_usage['absolute']['totalInstancesUsed']) 129 ending_usage['absolute']['totalInstancesUsed'])
126 # TODO(melwitt): Uncomment this assert when the bug is fixed. 130 self.assertEqual(current_usage['absolute']['totalCoresUsed'] - 1,
127 # self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1, 131 ending_usage['absolute']['totalCoresUsed'])
128 # ending_usage['absolute']['totalInstancesUsed']) 132 self.assertEqual(current_usage['absolute']['totalRAMUsed'] - 512,
133 ending_usage['absolute']['totalRAMUsed'])
129 134
130 def test_delete_while_booting_instance_destroy_fails(self): 135 def test_delete_while_booting_instance_destroy_fails(self):
131 # Get the current quota usage 136 # Get the current quota usage
@@ -139,6 +144,10 @@ class TestDeleteWhileBootingInstanceNotFound(
139 current_usage = self.api.get_limits() 144 current_usage = self.api.get_limits()
140 self.assertEqual(starting_usage['absolute']['totalInstancesUsed'] + 1, 145 self.assertEqual(starting_usage['absolute']['totalInstancesUsed'] + 1,
141 current_usage['absolute']['totalInstancesUsed']) 146 current_usage['absolute']['totalInstancesUsed'])
147 self.assertEqual(starting_usage['absolute']['totalCoresUsed'] + 1,
148 current_usage['absolute']['totalCoresUsed'])
149 self.assertEqual(starting_usage['absolute']['totalRAMUsed'] + 512,
150 current_usage['absolute']['totalRAMUsed'])
142 151
143 # Stub out the API to make the instance destroy raise InstanceNotFound, 152 # Stub out the API to make the instance destroy raise InstanceNotFound,
144 # simulating if conductor already deleted it. 153 # simulating if conductor already deleted it.
@@ -156,11 +165,12 @@ class TestDeleteWhileBootingInstanceNotFound(
156 self._delete_server(server['id']) 165 self._delete_server(server['id'])
157 self._wait_for_instance_delete(server['id']) 166 self._wait_for_instance_delete(server['id'])
158 167
159 # Now check the quota again. Because of the bug, we won't have 168 # Now check the quota again. Since the bug is fixed, ending usage
160 # decremented quota. 169 # should be current usage - 1.
161 ending_usage = self.api.get_limits() 170 ending_usage = self.api.get_limits()
162 self.assertEqual(current_usage['absolute']['totalInstancesUsed'], 171 self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1,
163 ending_usage['absolute']['totalInstancesUsed']) 172 ending_usage['absolute']['totalInstancesUsed'])
164 # TODO(melwitt): Uncomment this assert when the bug is fixed. 173 self.assertEqual(current_usage['absolute']['totalCoresUsed'] - 1,
165 # self.assertEqual(current_usage['absolute']['totalInstancesUsed'] - 1, 174 ending_usage['absolute']['totalCoresUsed'])
166 # ending_usage['absolute']['totalInstancesUsed']) 175 self.assertEqual(current_usage['absolute']['totalRAMUsed'] - 512,
176 ending_usage['absolute']['totalRAMUsed'])
diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py
index 10003de..f10cbde 100644
--- a/nova/tests/unit/compute/test_compute_api.py
+++ b/nova/tests/unit/compute/test_compute_api.py
@@ -1467,7 +1467,7 @@ class _ComputeAPIUnitTestMixIn(object):
1467 self.assertTrue( 1467 self.assertTrue(
1468 self.compute_api._delete_while_booting(self.context, 1468 self.compute_api._delete_while_booting(self.context,
1469 inst)) 1469 inst))
1470 self.assertFalse(quota_mock.commit.called) 1470 self.assertTrue(quota_mock.commit.called)
1471 1471
1472 test() 1472 test()
1473 1473
@@ -1476,6 +1476,7 @@ class _ComputeAPIUnitTestMixIn(object):
1476 inst = self._create_instance_obj() 1476 inst = self._create_instance_obj()
1477 quota_mock = mock.MagicMock() 1477 quota_mock = mock.MagicMock()
1478 1478
1479 @mock.patch('nova.quota.QUOTAS.usage_refresh')
1479 @mock.patch.object(self.compute_api, '_attempt_delete_of_buildrequest', 1480 @mock.patch.object(self.compute_api, '_attempt_delete_of_buildrequest',
1480 return_value=True) 1481 return_value=True)
1481 @mock.patch.object(self.compute_api, '_lookup_instance', 1482 @mock.patch.object(self.compute_api, '_lookup_instance',
@@ -1483,12 +1484,14 @@ class _ComputeAPIUnitTestMixIn(object):
1483 instance_id='fake')) 1484 instance_id='fake'))
1484 @mock.patch.object(self.compute_api, '_create_reservations', 1485 @mock.patch.object(self.compute_api, '_create_reservations',
1485 return_value=quota_mock) 1486 return_value=quota_mock)
1486 def test(mock_create_res, mock_lookup, mock_attempt): 1487 def test(mock_create_res, mock_lookup, mock_attempt, mock_refresh):
1487 self.assertTrue( 1488 self.assertTrue(
1488 self.compute_api._delete_while_booting(self.context, 1489 self.compute_api._delete_while_booting(self.context,
1489 inst)) 1490 inst))
1490 self.assertFalse(quota_mock.commit.called) 1491 self.assertFalse(quota_mock.commit.called)
1491 self.assertTrue(quota_mock.rollback.called) 1492 self.assertTrue(quota_mock.rollback.called)
1493 mock_refresh.assert_called_once_with(
1494 self.context, resource_names=['instances', 'cores', 'ram'])
1492 1495
1493 test() 1496 test()
1494 1497