Always deallocate networking before reschedule if using Neutron

When a server build fails on a selected compute host, the compute
service will cast to conductor which calls the scheduler to select
another host to attempt the build if retries are not exhausted.

With commit 08d24b733e, if retries
are exhausted or the scheduler raises NoValidHost, conductor will
deallocate networking for the instance. In the case of neutron, this
means unbinding any ports that the user provided with the server
create request and deleting any ports that nova-compute created during
the allocate_for_instance() operation during server build.

When an instance is deleted, it's networking is deallocated in the same
way - unbind pre-existing ports, delete ports that nova created.

The problem is when rescheduling from a failed host, if we successfully
reschedule and build on a secondary host, any ports created from the
original host are not cleaned up until the instance is deleted. For
Ironic or SR-IOV ports, those are always deallocated.

The ComputeDriver.deallocate_networks_on_reschedule() method defaults
to False just so that the Ironic driver could override it, but really
we should always cleanup neutron ports before rescheduling.

Looking over bug report history, there are some mentions of different
networking backends handling reschedules with multiple ports differently,
in that sometimes it works and sometimes it fails. Regardless of the
networking backend, however, we are at worst taking up port quota for
the tenant for ports that will not be bound to whatever host the instance
ends up on.

There could also be legacy reasons for this behavior with nova-network,
so that is side-stepped here by just restricting this check to whether
or not neutron is being used. When we eventually remove nova-network we
can then also remove the deallocate_networks_on_reschedule() method and
SR-IOV check.

Change-Id: Ib2abf73166598ff14fce4e935efe15eeea0d4f7d
Closes-Bug: #1597596
(cherry picked from commit 3a503a8f2b)
This commit is contained in:
Matt Riedemann 2017-11-15 19:15:44 -05:00
parent 0390d5f0ce
commit 9203326f84
2 changed files with 61 additions and 0 deletions

View File

@ -1865,12 +1865,21 @@ class ComputeManager(manager.Manager):
retry['exc_reason'] = e.kwargs['reason']
# NOTE(comstud): Deallocate networks if the driver wants
# us to do so.
# NOTE(mriedem): Always deallocate networking when using Neutron.
# This is to unbind any ports that the user supplied in the server
# create request, or delete any ports that nova created which were
# meant to be bound to this host. This check intentionally bypasses
# the result of deallocate_networks_on_reschedule because the
# default value in the driver is False, but that method was really
# only meant for Ironic and should be removed when nova-network is
# removed (since is_neutron() will then always be True).
# NOTE(vladikr): SR-IOV ports should be deallocated to
# allow new sriov pci devices to be allocated on a new host.
# Otherwise, if devices with pci addresses are already allocated
# on the destination host, the instance will fail to spawn.
# info_cache.network_info should be present at this stage.
if (self.driver.deallocate_networks_on_reschedule(instance) or
utils.is_neutron() or
self.deallocate_sriov_ports_on_reschedule(instance)):
self._cleanup_allocated_networks(context, instance,
requested_networks)

View File

@ -4724,6 +4724,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def test_rescheduled_exception(self, mock_hooks, mock_build_run,
mock_build, mock_set, mock_nil,
mock_save, mock_start, mock_finish):
self.flags(use_neutron=False)
self._do_build_instance_update(mock_save, reschedule_update=True)
mock_build_run.side_effect = exception.RescheduledException(reason='',
instance_uuid=self.instance.uuid)
@ -4809,6 +4810,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_macs_for_instance, mock_event_finish,
mock_event_start, mock_ins_save,
mock_build_ins, mock_build_and_run):
self.flags(use_neutron=False)
instance = fake_instance.fake_instance_obj(self.context,
vm_state=vm_states.ACTIVE,
system_metadata={'network_allocated': 'True'},
@ -4845,6 +4847,54 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
@mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(virt_driver.ComputeDriver, 'macs_for_instance')
def test_rescheduled_exception_with_network_allocated_with_neutron(self,
mock_macs_for_instance, mock_event_finish, mock_event_start,
mock_ins_save, mock_build_ins, mock_cleanup_network,
mock_build_and_run):
"""Tests that we always cleanup allocated networks for the instance
when using neutron and before we reschedule off the failed host.
"""
instance = fake_instance.fake_instance_obj(self.context,
vm_state=vm_states.ACTIVE,
system_metadata={'network_allocated': 'True'},
expected_attrs=['metadata', 'system_metadata', 'info_cache'])
mock_ins_save.return_value = instance
mock_macs_for_instance.return_value = []
mock_build_and_run.side_effect = exception.RescheduledException(
reason='', instance_uuid=self.instance.uuid)
self.compute._do_build_and_run_instance(self.context, instance,
self.image, request_spec={},
filter_properties=self.filter_properties,
injected_files=self.injected_files,
admin_password=self.admin_pass,
requested_networks=self.requested_networks,
security_groups=self.security_groups,
block_device_mapping=self.block_device_mapping, node=self.node,
limits=self.limits, host_list=fake_host_list)
mock_build_and_run.assert_called_once_with(self.context,
instance,
self.image, self.injected_files, self.admin_pass,
self.requested_networks, self.security_groups,
self.block_device_mapping, self.node, self.limits,
self.filter_properties, {})
mock_cleanup_network.assert_called_once_with(
self.context, instance, self.requested_networks)
mock_build_ins.assert_called_once_with(self.context,
[instance], self.image, self.filter_properties,
self.admin_pass, self.injected_files, self.requested_networks,
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
@ -4965,6 +5015,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock_build_run, mock_build, mock_deallocate, mock_nil,
mock_clean_net, mock_save, mock_start,
mock_finish):
self.flags(use_neutron=False)
self._do_build_instance_update(mock_save, reschedule_update=True)
mock_build_run.side_effect = exception.RescheduledException(reason='',
instance_uuid=self.instance.uuid)
@ -5408,6 +5459,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def test_reschedule_on_resources_unavailable(self, mock_claim,
mock_build, mock_nil, mock_save, mock_start,
mock_finish, mock_notify):
self.flags(use_neutron=False)
reason = 'resource unavailable'
exc = exception.ComputeResourcesUnavailable(reason=reason)
mock_claim.side_effect = exc