Add functional recreate test for regression bug 1825537

Change I2d9ab06b485f76550dbbff46f79f40ff4c97d12f in Rocky
(and backported through to Pike) added error handling to
the resize_instance and finish_resize methods to revert
allocations in placement when a failure occurs.

This is OK for resize_instance, which runs on the source
compute, as long as the instance.host/node values have not
yet been changed to the dest host/node before RPC casting
to the finish_resize method on the dest compute. It's OK
because the instance is still on the source compute and the
DB says so, so any attempt to recover the instance via hard
reboot or rebuild will be on the source host.

This is not OK for finish_resize because if we fail there
and revert the allocations, the instance host/node values
are already pointing at the dest compute and by reverting
the allocations in placement, placement will be incorrectly
tracking the instance usage with the old flavor against the
source node resource provider rather than the new flavor
against the dest node resource provider - where the instance
is actually running and the nova DB says the instance lives.

This change adds a simple functional regression test to
recreate the bug with a multi-host resize. There is already
a same-host resize functional test, marked with a FIXME in
this change, which will need to be fixed as well.

NOTE(mriedem): The test needed to be modified from Train
because we have to rely on waiting for the task_state to
change to None, rather than the migration status changing
to "error", since change Id6c0a0ee41520dd974052d7cdd17ca35d688f6b0
is not in Stein.

Change-Id: Ie9e294db7e24d0e3cbe83eee847f0fbfb7478900
Related-Bug: #1825537
(cherry picked from commit f4bb672106)
This commit is contained in:
Matt Riedemann 2019-04-19 11:54:07 -04:00
parent acd2daa9dc
commit eaa1fc6159
3 changed files with 90 additions and 0 deletions

View File

@ -0,0 +1,82 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from nova.tests.functional import integrated_helpers
class FinishResizeErrorAllocationCleanupTestCase(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Functional regression test for bug 1825537.

    The bug was introduced in Rocky and backported down to Pike. The test
    performs a resize between two hosts where finish_resize fails on the
    dest compute, then inspects the resource provider allocations and usages
    in placement.

    Until the bug is fixed the test asserts the current (wrong) behaviour;
    see the FIXME in test_finish_resize_fails_allocation_cleanup.
    """

    # Virt driver whose finish_migration always raises, which makes
    # finish_resize fail on the dest compute.
    compute_driver = 'fake.FakeFinishMigrationFailDriver'

    def setUp(self):
        """Set up the base fixtures and pick the two flavors to resize with.

        self.flavor1 is used to boot the server and self.flavor2 is the
        resize target.
        """
        super(FinishResizeErrorAllocationCleanupTestCase, self).setUp()
        available_flavors = self.api.get_flavors()
        self.flavor1 = available_flavors[0]
        self.flavor2 = available_flavors[1]

    def _resize_and_assert_error(self, server, dest_host):
        """Resize the server to flavor2 and verify it fails on dest_host.

        :param server: dict for the server to resize; its 'id' is used for
            the resize request
        :param dest_host: name of the host the server is expected to point
            at once the resize has failed
        """
        # Kick off the resize; finish_migration in the virt driver on the
        # dest host is expected to fail.
        resize_body = {'resize': {'flavorRef': self.flavor2['id']}}
        self.api.post_server_action(server['id'], resize_body)

        # ERROR status is set on the instance before the fault is recorded,
        # so waiting on the status alone would be racy. The task_state only
        # goes back to None after the fault is recorded, so wait for both.
        expected_state = {'status': 'ERROR', 'OS-EXT-STS:task_state': None}
        errored_server = self._wait_for_server_parameter(
            self.admin_api, server, expected_state)

        # resize_instance updates the instance host/node values before it
        # casts to finish_resize on the dest compute, so the server should
        # already be pointing at $dest_host.
        self.assertEqual(dest_host, errored_server['OS-EXT-SRV-ATTR:host'])

        # FakeFinishMigrationFailDriver.finish_migration raises
        # VirtualInterfaceCreateException, which should show up in the fault.
        self.assertIn('Virtual Interface creation failed',
                      errored_server['fault']['message'])

    def test_finish_resize_fails_allocation_cleanup(self):
        """Recreate bug 1825537 with a resize from host1 to host2."""
        # Two computes are needed for a cross-host resize.
        for hostname in ('host1', 'host2'):
            self._start_compute(hostname)

        # Boot on host1 and then attempt the (failing) resize to host2.
        server = self._boot_and_check_allocations(self.flavor1, 'host1')
        self._resize_and_assert_error(server, 'host2')

        # The DB now points the server at the dest host, so the correct
        # outcome would be allocations for the new flavor against the dest
        # node resource provider.
        source_rp_uuid = self._get_provider_uuid_by_host('host1')
        dest_rp_uuid = self._get_provider_uuid_by_host('host2')

        # FIXME(mriedem): This is bug 1825537. The allocations are reverted
        # when finish_resize fails, so the dest node resource provider is
        # left with no allocations while the instance allocations are for
        # the old flavor on the source node resource provider - even though
        # the instance is neither running on the source host nor pointed at
        # it in the DB. Once fixed, the assertions below should become:
        # self.assertFlavorMatchesAllocation(
        #     self.flavor2, server['id'], dest_rp_uuid)
        self.assertEqual(
            {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0},
            self._get_provider_usages(dest_rp_uuid))
        self.assertFlavorMatchesAllocation(
            self.flavor1, server['id'], source_rp_uuid)

View File

@ -3723,6 +3723,7 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
# Ensure the allocation records still exist on the host.
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
# FIXME(mriedem): This is wrong for the _finish_resize case.
# The new_flavor should have been subtracted from the doubled
# allocation which just leaves us with the original flavor.
self.assertFlavorMatchesUsage(source_rp_uuid, self.flavor1)

View File

@ -731,6 +731,13 @@ class MediumFakeDriverWithNestedCustomResources(MediumFakeDriver):
self.child_resources)
class FakeFinishMigrationFailDriver(FakeDriver):
    """FakeDriver variant whose finish_migration always fails.

    Intended for tests that need the finish_migration virt driver call to
    raise an exception.
    """

    def finish_migration(self, *args, **kwargs):
        """Raise VirtualInterfaceCreateException; all arguments are ignored.

        :raises: exception.VirtualInterfaceCreateException, unconditionally
        """
        raise exception.VirtualInterfaceCreateException()
class FakeRescheduleDriver(FakeDriver):
"""FakeDriver derivative that triggers a reschedule on the first spawn
attempt. This is expected to only be used in tests that have more than