Add functional regression test for bug 1806064

Change I9269ffa2b80e48db96c622d0dc0817738854f602 in Pike
introduced a race condition where creating multiple
servers concurrently can fail the second instances quota
check which happens in conductor after the instance record
is created in the cell database but its related BDMs and
tags are not stored in the cell DB. When deleting the
server from the API, since the BDMs are not in the cell
database with the instance, they are not "seen" and thus
the volume attachments are not deleted and the volume is
orphaned. As for tags, you should be able to see the tags
on the server in ERROR status from the API before deleting
it.

This change adds a functional regression test to show both
the volume attachment and tag issue when we fail the quota
check in conductor.

Change-Id: I21c2189cc1de6b8e4857de77acd9f1ef8b6ea9f6
Related-Bug: #1806064
(cherry picked from commit 5d514b33e2)
This commit is contained in:
Matt Riedemann 2018-12-03 12:55:11 -05:00 committed by Vlad Gusev
parent 283b959207
commit 5b7e904126
1 changed files with 143 additions and 0 deletions

View File

@ -0,0 +1,143 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from nova.compute import utils as compute_utils
from nova import context as nova_context
from nova import exception
from nova import objects
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.unit import policy_fixture
class BootFromVolumeOverQuotaRaceDeleteTest(
test.TestCase, integrated_helpers.InstanceHelperMixin):
"""Test for regression bug 1806064 introduced in Pike.
This is very similar to regression bug 1404867 where reserved/attached
volumes during a boot from volume request are not detached while deleting
a server that failed to schedule.
In this case, scheduling is successful but the late quota check in
ComputeTaskManager.schedule_and_build_instances fails. In the case of a
scheduling failure, the instance record along with the associated BDMs
are created in the cell0 database and that is where the "local delete"
code in the API finds them to detach the related volumes. In the case of
the quota fail race, the instance has already been created in a selected
cell but the BDMs records have not been and are thus not "seen" during
the API local delete and the volumes are left attached to a deleted server.
An additional issue, covered in the test here, is that tags provided when
creating the server are not retrievable from the API after the late quota
check fails.
"""
def setUp(self):
super(BootFromVolumeOverQuotaRaceDeleteTest, self).setUp()
# We need the cinder fixture for boot from volume testing.
self.cinder_fixture = self.useFixture(
nova_fixtures.CinderFixtureNewAttachFlow(self))
# Use the standard fixtures.
self.useFixture(policy_fixture.RealPolicyFixture())
self.useFixture(nova_fixtures.NeutronFixture(self))
self.useFixture(nova_fixtures.PlacementFixture())
self.api = self.useFixture(nova_fixtures.OSAPIFixture(
api_version='v2.1')).api
# Use microversion 2.52 which allows creating a server with tags.
self.api.microversion = '2.52'
self.start_service('conductor')
self.start_service('scheduler')
self.start_service('compute')
def test_bfv_quota_race_local_delete(self):
# Setup a boot-from-volume request where the API will create a
# volume attachment record for the given pre-existing volume.
# We also tag the server since tags, like BDMs, should be created in
# the cell database along with the instance.
volume_id = nova_fixtures.CinderFixtureNewAttachFlow.IMAGE_BACKED_VOL
server = {
'server': {
'name': 'test_bfv_quota_race_local_delete',
'flavorRef': self.api.get_flavors()[0]['id'],
'imageRef': '',
'block_device_mapping_v2': [{
'boot_index': 0,
'source_type': 'volume',
'destination_type': 'volume',
'uuid': volume_id
}],
'networks': 'auto',
'tags': ['bfv']
}
}
# Now we need to stub out the quota check routine so that we can
# simulate the race where the initial quota check in the API passes
# but fails in conductor once the instance has been created in cell1.
original_quota_check = compute_utils.check_num_instances_quota
def stub_check_num_instances_quota(_self, context, instance_type,
min_count, *args, **kwargs):
# Determine where we are in the flow based on whether or not the
# min_count is 0 (API will pass 1, conductor will pass 0).
if min_count == 0:
raise exception.TooManyInstances(
'test_bfv_quota_race_local_delete')
# We're checking from the API so perform the original quota check.
return original_quota_check(
_self, context, instance_type, min_count, *args, **kwargs)
self.stub_out('nova.compute.utils.check_num_instances_quota',
stub_check_num_instances_quota)
server = self.api.post_server(server)
server = self._wait_for_state_change(self.api, server, 'ERROR')
# At this point, the build request should be gone and the instance
# should have been created in cell1.
context = nova_context.get_admin_context()
self.assertRaises(exception.BuildRequestNotFound,
objects.BuildRequest.get_by_instance_uuid,
context, server['id'])
# The default cell in the functional tests is cell1 but we want to
# specifically target cell1 to make sure the instance exists there
# and we're not just getting lucky somehow due to the fixture.
cell1 = self.cell_mappings[test.CELL1_NAME]
with nova_context.target_cell(context, cell1) as cctxt:
# This would raise InstanceNotFound if the instance isn't in cell1.
instance = objects.Instance.get_by_uuid(cctxt, server['id'])
self.assertIsNone(instance.host, 'instance.host should not be set')
# Make sure we can still view the tags on the server before it is
# deleted.
# FIXME(mriedem): This is bug 1806064 where the tags are not created
# in cell1 along with the instance when the quota check race failure
# occurs. Uncomment once fixed.
# self.assertEqual(['bfv'], server['tags'])
self.assertEqual([], server['tags'])
# Now delete the server which, since it does not have a host, will be
# deleted "locally" from the API.
self.api.delete_server(server['id'])
self._wait_until_deleted(server)
# The volume should have been detached by the API.
# FIXME(mriedem): This is bug 1806064 where the volume is not detached
# because the related BDM record was not created in cell1 along with
# the instance so the API could not "see" it. Uncomment once fixed.
# self.assertIsNone(
# self.cinder_fixture.volume_ids_for_instance(server['id']))
self.assertEqual(volume_id,
# volume_ids_for_instance is a generator so listify
list(self.cinder_fixture.volume_ids_for_instance(
server['id']))[0])