From ad20a87028523f0a1fdf2e9319fac4537c9fbbf3 Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Wed, 15 Nov 2017 19:15:44 -0500
Subject: [PATCH] Always deallocate networking before reschedule if using Neutron

When a server build fails on a selected compute host, the compute
service will cast to conductor, which calls the scheduler to select
another host to attempt the build if retries are not exhausted.

With commit 08d24b733ee9f4da44bfbb8d6d3914924a41ccdc, if retries are
exhausted or the scheduler raises NoValidHost, conductor will
deallocate networking for the instance. In the case of neutron, this
means unbinding any ports that the user provided with the server
create request and deleting any ports that nova-compute created during
the allocate_for_instance() operation during server build.

When an instance is deleted, its networking is deallocated in the same
way - unbind pre-existing ports, delete ports that nova created.

The problem is when rescheduling from a failed host: if we successfully
reschedule and build on a secondary host, any ports created on the
original host are not cleaned up until the instance is deleted.

For Ironic or SR-IOV ports, those are always deallocated. The
ComputeDriver.deallocate_networks_on_reschedule() method defaults to
False just so that the Ironic driver could override it, but really we
should always clean up neutron ports before rescheduling.

Looking over the bug report history, there are some mentions of
different networking backends handling reschedules with multiple ports
differently, in that sometimes it works and sometimes it fails.
Regardless of the networking backend, however, we are at worst taking
up port quota for the tenant for ports that will not be bound to
whatever host the instance ends up on.

There could also be legacy reasons for this behavior with nova-network,
so that is side-stepped here by just restricting this check to whether
or not neutron is being used. When we eventually remove nova-network we
can then also remove the deallocate_networks_on_reschedule() method and
the SR-IOV check.

NOTE(mriedem): There are a couple of changes to the unit test for code
that didn't exist in Pike, due to the change for alternate hosts
Iae904afb6cb4fcea8bb27741d774ffbe986a5fb4 and the change to pass the
request spec to conductor Ie5233bd481013413f12e55201588d37a9688ae78.

Change-Id: Ib2abf73166598ff14fce4e935efe15eeea0d4f7d
Closes-Bug: #1597596
(cherry picked from commit 3a503a8f2b934f19049531c5c92130ca7cdd6a7f)
(cherry picked from commit 9203326f84cd35243e9e6a73cd5fac62af27aaf5)
---
 nova/compute/manager.py                     |  9 ++++
 nova/tests/unit/compute/test_compute_mgr.py | 51 +++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index be6c48deb81f..f253a25c72ca 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1880,12 +1880,21 @@ class ComputeManager(manager.Manager):
             retry['exc_reason'] = e.kwargs['reason']
             # NOTE(comstud): Deallocate networks if the driver wants
             # us to do so.
+            # NOTE(mriedem): Always deallocate networking when using Neutron.
+            # This is to unbind any ports that the user supplied in the server
+            # create request, or delete any ports that nova created which were
+            # meant to be bound to this host. This check intentionally bypasses
+            # the result of deallocate_networks_on_reschedule because the
+            # default value in the driver is False, but that method was really
+            # only meant for Ironic and should be removed when nova-network is
+            # removed (since is_neutron() will then always be True).
             # NOTE(vladikr): SR-IOV ports should be deallocated to
             # allow new sriov pci devices to be allocated on a new host.
             # Otherwise, if devices with pci addresses are already allocated
             # on the destination host, the instance will fail to spawn.
             # info_cache.network_info should be present at this stage.
             if (self.driver.deallocate_networks_on_reschedule(instance) or
+                    utils.is_neutron() or
                     self.deallocate_sriov_ports_on_reschedule(instance)):
                 self._cleanup_allocated_networks(context, instance,
                         requested_networks)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index e007d9aea627..7afe72cf03af 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -4315,6 +4315,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def test_rescheduled_exception(self, mock_hooks, mock_build_run,
                                    mock_build, mock_set, mock_nil, mock_save,
                                    mock_start, mock_finish):
+        self.flags(use_neutron=False)
         self._do_build_instance_update(mock_save, reschedule_update=True)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
             instance_uuid=self.instance.uuid)
@@ -4398,6 +4399,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             mock_macs_for_instance, mock_event_finish, mock_event_start,
             mock_ins_save, mock_build_ins,
             mock_build_and_run):
+        self.flags(use_neutron=False)
         instance = fake_instance.fake_instance_obj(self.context,
                 vm_state=vm_states.ACTIVE,
                 system_metadata={'network_allocated': 'True'},
@@ -4433,6 +4435,53 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.admin_pass, self.injected_files, self.requested_networks,
             self.security_groups, self.block_device_mapping)
 
+    @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
+    @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(virt_driver.ComputeDriver, 'macs_for_instance')
+    def test_rescheduled_exception_with_network_allocated_with_neutron(self,
+            mock_macs_for_instance, mock_event_finish, mock_event_start,
+            mock_ins_save, mock_build_ins, mock_cleanup_network,
+            mock_build_and_run):
+        """Tests that we always cleanup allocated networks for the instance
+        when using neutron and before we reschedule off the failed host.
+        """
+        instance = fake_instance.fake_instance_obj(self.context,
+                vm_state=vm_states.ACTIVE,
+                system_metadata={'network_allocated': 'True'},
+                expected_attrs=['metadata', 'system_metadata', 'info_cache'])
+        mock_ins_save.return_value = instance
+        mock_macs_for_instance.return_value = []
+        mock_build_and_run.side_effect = exception.RescheduledException(
+            reason='', instance_uuid=self.instance.uuid)
+
+        self.compute._do_build_and_run_instance(self.context, instance,
+            self.image, request_spec={},
+            filter_properties=self.filter_properties,
+            injected_files=self.injected_files,
+            admin_password=self.admin_pass,
+            requested_networks=self.requested_networks,
+            security_groups=self.security_groups,
+            block_device_mapping=self.block_device_mapping, node=self.node,
+            limits=self.limits)
+
+        mock_build_and_run.assert_called_once_with(self.context,
+            instance,
+            self.image, self.injected_files, self.admin_pass,
+            self.requested_networks, self.security_groups,
+            self.block_device_mapping, self.node, self.limits,
+            self.filter_properties)
+        mock_cleanup_network.assert_called_once_with(
+            self.context, instance, self.requested_networks)
+        mock_build_ins.assert_called_once_with(self.context,
+            [instance], self.image, self.filter_properties,
+            self.admin_pass, self.injected_files, self.requested_networks,
+            self.security_groups, self.block_device_mapping)
+
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
     @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
@@ -4552,6 +4601,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             mock_build_run, mock_build, mock_deallocate, mock_nil,
             mock_clean_net, mock_save, mock_start,
             mock_finish):
+        self.flags(use_neutron=False)
         self._do_build_instance_update(mock_save, reschedule_update=True)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
             instance_uuid=self.instance.uuid)
@@ -4992,6 +5042,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def test_reschedule_on_resources_unavailable(self, mock_claim,
             mock_build, mock_nil, mock_save, mock_start, mock_finish,
             mock_notify):
+        self.flags(use_neutron=False)
         reason = 'resource unavailable'
         exc = exception.ComputeResourcesUnavailable(reason=reason)
         mock_claim.side_effect = exc
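
Reviewer note, not part of the patch: a minimal standalone sketch of the cleanup
decision the manager.py hunk above ends up with, reduced to plain booleans. The
function and parameter names below are illustrative stand-ins for the real nova
calls (ComputeDriver.deallocate_networks_on_reschedule(), utils.is_neutron() and
deallocate_sriov_ports_on_reschedule()), not nova's actual API.

def should_deallocate_networks_on_reschedule(driver_requests_cleanup,
                                             using_neutron,
                                             has_sriov_ports):
    """Return True if allocated networking should be cleaned up before
    the instance is rescheduled to another host.

    driver_requests_cleanup stands in for the driver method, which
    defaults to False and was historically only overridden by Ironic;
    using_neutron stands in for the Neutron-backend check; has_sriov_ports
    stands in for the SR-IOV port check.
    """
    # With this patch, Neutron ports are always unbound/deleted before the
    # reschedule, regardless of what the driver reports.
    return driver_requests_cleanup or using_neutron or has_sriov_ports


if __name__ == '__main__':
    # Neutron backend: cleanup happens even though the driver says no.
    assert should_deallocate_networks_on_reschedule(False, True, False)
    # nova-network without SR-IOV ports keeps the pre-patch behavior.
    assert not should_deallocate_networks_on_reschedule(False, False, False)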