diff --git a/nova/compute/manager.py b/nova/compute/manager.py index ee4a1abd4bcf..0d98f33692d4 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3786,6 +3786,7 @@ class ComputeManager(manager.Manager): current_period=True) self._notify_about_instance_usage( context, instance, "resize.prep.start") + failed = False try: self._prep_resize(context, image, instance, instance_type, filter_properties, @@ -3794,14 +3795,31 @@ class ComputeManager(manager.Manager): # instance to be migrated is backed by LVM. # Remove when LVM migration is implemented. except exception.MigrationPreCheckError: + # TODO(mriedem): How is it even possible to get here? + # _prep_resize does not call the driver. The resize_instance + # method does, but we RPC cast to the source node to do that + # so we shouldn't even get this exception... + failed = True raise except Exception: + failed = True # try to re-schedule the resize elsewhere: exc_info = sys.exc_info() self._reschedule_resize_or_reraise(context, image, instance, exc_info, instance_type, request_spec, filter_properties) finally: + if failed: + # Since we hit a failure, we're either rescheduling or dead + # and either way we need to cleanup any allocations created + # by the scheduler for the destination node. Note that for + # a resize to the same host, the scheduler will merge the + # flavors, so here we'd be subtracting the new flavor from + # the allocated resources on this node. + rt = self._get_resource_tracker() + rt.delete_allocation_for_failed_resize( + instance, node, instance_type) + extra_usage_info = dict( new_instance_type=instance_type.name, new_instance_type_id=instance_type.id) diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 437c765d2e04..41a8047f02a4 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1237,6 +1237,30 @@ class ResourceTracker(object): 'on the source node %s', cn.uuid, instance=instance) + def delete_allocation_for_failed_resize(self, instance, node, flavor): + """Delete instance allocations for the node during a failed resize + + :param instance: The instance being resized/migrated. + :param node: The node provider on which the instance should have + allocations to remove. If this is a resize to the same host, then + the new_flavor resources are subtracted from the single allocation. + :param flavor: This is the new_flavor during a resize. + """ + resources = scheduler_utils.resources_from_flavor(instance, flavor) + cn = self.compute_nodes[node] + res = self.reportclient.remove_provider_from_instance_allocation( + instance.uuid, cn.uuid, instance.user_id, instance.project_id, + resources) + if not res: + if instance.instance_type_id == flavor.id: + operation = 'migration' + else: + operation = 'resize' + LOG.error('Failed to clean allocation after a failed ' + '%(operation)s on node %(node)s', + {'operation': operation, 'node': cn.uuid}, + instance=instance) + def _find_orphaned_instances(self): """Given the set of instances and migrations already account for by resource tracker, sanity check the hypervisor to determine diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index dcbd59a74e71..08b5235188f7 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -1014,11 +1014,15 @@ class SchedulerReportClient(object): then PUTs the resulting allocation back to the placement API for the consumer. - We call this method on two occasions: on the source host during a - confirm_resize() operation and on the destination host during a - revert_resize() operation. This is important to reconcile the - "doubled-up" allocation that the scheduler constructs when claiming - resources against the destination host during a move operation. + We call this method on three occasions: + + 1. On the source host during a confirm_resize() operation. + 2. On the destination host during a revert_resize() operation. + 3. On the destination host when prep_resize fails. + + This is important to reconcile the "doubled-up" allocation that the + scheduler constructs when claiming resources against the destination + host during a move operation. If the move was between hosts, the entire allocation for rp_uuid will be dropped. If the move is a resize on the same host, then we will diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index d932906bd518..1e9efb1b56b3 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -1953,39 +1953,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin): self.assertFlavorMatchesAllocation( {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages) - def test_rescheduling_when_migrating_instance(self): - """Tests that allocations are removed from the destination node by - the compute service when a cold migrate / resize fails and a reschedule - request is sent back to conductor. + def _wait_for_prep_resize_fail_completion(self, server, expected_action): + """Polls instance action events for the given instance and action + until it finds the compute_prep_resize action event with an error + result. """ - source_hostname = self.compute1.manager.host - server = self._boot_and_check_allocations( - self.flavor1, source_hostname) - - def fake_prep_resize(*args, **kwargs): - raise test.TestingException('Simulated _prep_resize failure.') - - # Yes this isn't great in a functional test, but it's simple. - self.stub_out('nova.compute.manager.ComputeManager._prep_resize', - fake_prep_resize) - - # Now migrate the server which is going to fail on the destination. - self.api.post_server_action(server['id'], {'migrate': None}) - - # When the destination compute calls conductor and conductor calls - # the scheduler to get another host, it's going to fail since there - # are only two hosts and we started on the first and failed on the - # second, so the scheduler will raise NoValidHost and a fault will - # be recorded on the instance. However, the instance is not put into - # ERROR state since it's still OK on the source host, but faults are - # only shown in the API for ERROR/DELETED instances, so we can't poll - # for the fault. So we poll for instance action events instead. completion_event = None for attempt in range(10): actions = self.api.get_instance_actions(server['id']) # Look for the migrate action. for action in actions: - if action['action'] == 'migrate': + if action['action'] == expected_action: events = ( self.api.api_get( '/servers/%s/os-instance-actions/%s' % @@ -2009,19 +1987,78 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin): self.fail('Timed out waiting for compute_prep_resize failure ' 'event. Current instance actions: %s' % actions) + def test_rescheduling_when_migrating_instance(self): + """Tests that allocations are removed from the destination node by + the compute service when a cold migrate / resize fails and a reschedule + request is sent back to conductor. + """ + source_hostname = self.compute1.manager.host + server = self._boot_and_check_allocations( + self.flavor1, source_hostname) + + def fake_prep_resize(*args, **kwargs): + raise test.TestingException('Simulated _prep_resize failure.') + + # Yes this isn't great in a functional test, but it's simple. + self.stub_out('nova.compute.manager.ComputeManager._prep_resize', + fake_prep_resize) + + # Now migrate the server which is going to fail on the destination. + self.api.post_server_action(server['id'], {'migrate': None}) + + self._wait_for_prep_resize_fail_completion(server, 'migrate') + dest_hostname = self._other_hostname(source_hostname) dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) failed_usages = self._get_provider_usages(dest_rp_uuid) - # FIXME(mriedem): Due to bug 1712850 the allocations are not removed - # from the destination node before the reschedule. Remove this once - # the bug is fixed as it should be: - # # Expects no allocation records on the failed host. - # self.assertFlavorMatchesAllocation( - # {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages) - self.assertFlavorMatchesAllocation(self.flavor1, failed_usages) + # Expects no allocation records on the failed host. + self.assertFlavorMatchesAllocation( + {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages) # Ensure the allocation records still exist on the source host. source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) source_usages = self._get_provider_usages(source_rp_uuid) self.assertFlavorMatchesAllocation(self.flavor1, source_usages) + + def test_resize_to_same_host_prep_resize_fails(self): + """Tests that when we resize to the same host and resize fails in + the prep_resize method, we cleanup the allocations before rescheduling. + """ + # make sure that the test only uses a single host + compute2_service_id = self.admin_api.get_services( + host=self.compute2.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute2_service_id, {'status': 'disabled'}) + + hostname = self.compute1.manager.host + rp_uuid = self._get_provider_uuid_by_host(hostname) + + server = self._boot_and_check_allocations(self.flavor1, hostname) + + def fake_prep_resize(*args, **kwargs): + # Ensure the allocations are doubled now before we fail. + usages = self._get_provider_usages(rp_uuid) + self.assertFlavorsMatchAllocation( + self.flavor1, self.flavor2, usages) + raise test.TestingException('Simulated _prep_resize failure.') + + # Yes this isn't great in a functional test, but it's simple. + self.stub_out('nova.compute.manager.ComputeManager._prep_resize', + fake_prep_resize) + + self.flags(allow_resize_to_same_host=True) + resize_req = { + 'resize': { + 'flavorRef': self.flavor2['id'] + } + } + self.api.post_server_action(server['id'], resize_req) + + self._wait_for_prep_resize_fail_completion(server, 'resize') + + # Ensure the allocation records still exist on the host. + source_rp_uuid = self._get_provider_uuid_by_host(hostname) + source_usages = self._get_provider_usages(source_rp_uuid) + # The new_flavor should have been subtracted from the doubled + # allocation which just leaves us with the original flavor. + self.assertFlavorMatchesAllocation(self.flavor1, source_usages)