Correct instance port binding for rebuilds

The following two scenarios can result in an instance with an incorrect
port binding and cause subsequent rebuilds to fail.

If an evacuation of an instance fails part way through, after the point
where we reassign the port binding to the new host but before we change
the instance host, we end up with the ports assigned to the wrong host.
This change adds a check to determine if there's any port binding host
mismatches and if so trigger setup of instance network.

During recovery of failed hosts, neutron could get overwhelmed and lose
messages, for example when the active controller was powered off in the
middle of instance evacuations. In this case the vif_type was set to
'binding_failed' or 'unbound'. We subsequently hit "Unsupported VIF
type" exception during instance hard_reboot or rebuild, leaving the
instance unrecoverable.

This commit changes the _heal_instance_info_cache periodic task to
update the port binding if evacuation fails due to the above errors so
that the instance can be recovered later.

Closes-Bug: #1659062
Related-Bug: #1784579

Co-Authored-By: Gerry Kopec <gerry.kopec@windriver.com>
Co-Authored-By: Jim Gauld <James.Gauld@windriver.com>
Change-Id: I75fd15ac2a29e420c09499f2c41d11259ca811ae
Signed-off-by: Jack Ding <jack.ding@windriver.com>
This commit is contained in:
Jack Ding 2018-09-19 11:54:44 -04:00
parent c7db20d140
commit 5426350348
5 changed files with 252 additions and 12 deletions

View File

@ -962,11 +962,10 @@ class ComputeManager(manager.Manager):
LOG.debug(e, instance=instance)
except exception.VirtualInterfacePlugException:
# NOTE(mriedem): If we get here, it could be because the vif_type
# in the cache is "binding_failed" or "unbound". The only way to
# fix this is to try and bind the ports again, which would be
# expensive here on host startup. We could add a check to
# _heal_instance_info_cache to handle this, but probably only if
# the instance task_state is None.
# in the cache is "binding_failed" or "unbound".
# The periodic task _heal_instance_info_cache checks for this
# condition. It should fix this by binding the ports again when
# it gets to this instance.
LOG.exception('Virtual interface plugging failed for instance. '
'The port binding:host_id may need to be manually '
'updated.', instance=instance)
@ -7116,6 +7115,25 @@ class ComputeManager(manager.Manager):
action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
phase=fields.NotificationPhase.END)
def _require_nw_info_update(self, context, instance):
    """Detect whether the instance's port bindings need to be healed.

    Returns True if any of the instance's neutron ports is bound to a
    host other than this one (e.g. a partially failed evacuation that
    rebound ports but never updated instance.host), or is left in an
    'unbound' or 'binding_failed' vif_type.

    :param context: request context
    :param instance: the instance whose ports are inspected
    :returns: True if setup_instance_network_on_host should be called
        to fix the bindings, False otherwise (always False when neutron
        is not in use, since only neutron exposes port bindings).
    """
    if not utils.is_neutron():
        return False
    # Only fetch the two binding fields we actually inspect.
    search_opts = {'device_id': instance.uuid,
                   'fields': ['binding:host_id', 'binding:vif_type']}
    ports = self.network_api.list_ports(context, **search_opts)
    for p in ports['ports']:
        if p.get('binding:host_id') != self.host:
            return True
        # Lost neutron messages can leave a port unbound or with a
        # failed binding even though the host matches.
        if p.get('binding:vif_type') in (
                network_model.VIF_TYPE_UNBOUND,
                network_model.VIF_TYPE_BINDING_FAILED):
            return True
    return False
@periodic_task.periodic_task(
spacing=CONF.heal_instance_info_cache_interval)
def _heal_instance_info_cache(self, context):
@ -7194,6 +7212,15 @@ class ComputeManager(manager.Manager):
if instance:
# We have an instance now to refresh
try:
# Fix potential mismatch in port binding if evacuation failed
# after reassigning the port binding to the dest host but
# before the instance host is changed.
# Do this only when instance has no pending task.
if instance.task_state is None and \
self._require_nw_info_update(context, instance):
LOG.info("Updating ports in neutron", instance=instance)
self.network_api.setup_instance_network_on_host(
context, instance, self.host)
# Call to network API to get instance info.. this will
# force an update to the instance's info_cache
self.network_api.get_instance_nw_info(

View File

@ -45,6 +45,8 @@ VIF_TYPE_MACVTAP = 'macvtap'
VIF_TYPE_AGILIO_OVS = 'agilio_ovs'
VIF_TYPE_BINDING_FAILED = 'binding_failed'
VIF_TYPE_VIF = 'vif'
VIF_TYPE_UNBOUND = 'unbound'
# Constants for dictionary keys in the 'vif_details' field in the VIF
# class

View File

@ -3273,13 +3273,16 @@ class API(base_api.NetworkAPI):
pci_mapping = None
port_updates = []
ports = data['ports']
FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND,
network_model.VIF_TYPE_BINDING_FAILED)
for p in ports:
updates = {}
binding_profile = _get_binding_profile(p)
# If the host hasn't changed, like in the case of resizing to the
# same host, there is nothing to do.
if p.get(BINDING_HOST_ID) != host:
# We need to update the port binding if the host has changed or if
# the binding is clearly wrong due to previous lost messages.
vif_type = p.get('binding:vif_type')
if p.get(BINDING_HOST_ID) != host or vif_type in FAILED_VIF_TYPES:
# TODO(gibi): To support ports with resource request during
# server move operations we need to take care of 'allocation'
# key in the binding profile per binding.

View File

@ -7197,9 +7197,81 @@ class ComputeTestCase(BaseTestCase,
mock_is_older.assert_called_once_with(now,
CONF.running_deleted_instance_timeout)
@mock.patch('nova.network.neutronv2.api.API.list_ports')
@mock.patch.object(nova.utils, 'is_neutron')
def test_require_nw_info_update_not_neutron(self, mock_is_neutron,
                                            mock_list_ports):
    # With nova-network the check is a no-op: it must return False
    # without ever listing ports.
    mock_is_neutron.return_value = False
    admin_ctxt = context.get_admin_context()
    fake_inst = self._create_fake_instance_obj()
    self.assertFalse(
        self.compute._require_nw_info_update(admin_ctxt, fake_inst))
    mock_list_ports.assert_not_called()
@mock.patch('nova.network.neutronv2.api.API.list_ports')
@mock.patch.object(nova.utils, 'is_neutron')
def test_require_nw_info_update_host_match(self, mock_is_neutron,
                                           mock_list_ports):
    # A port bound to this host with a healthy vif_type needs no
    # update, so the check must return False.
    mock_is_neutron.return_value = True
    mock_list_ports.return_value = {
        'ports': [{'binding:host_id': self.compute.host,
                   'binding:vif_type': 'foo'}]}
    admin_ctxt = context.get_admin_context()
    fake_inst = self._create_fake_instance_obj()
    self.assertFalse(
        self.compute._require_nw_info_update(admin_ctxt, fake_inst))
    expected_opts = {'device_id': fake_inst.uuid,
                     'fields': ['binding:host_id', 'binding:vif_type']}
    mock_list_ports.assert_called_once_with(admin_ctxt, **expected_opts)
@mock.patch('nova.network.neutronv2.api.API.list_ports')
@mock.patch.object(nova.utils, 'is_neutron')
def test_require_nw_info_update_host_mismatch(self, mock_is_neutron,
                                              mock_list_ports):
    # A port bound to some other host indicates a partially failed
    # move; the check must report that an update is required.
    mock_is_neutron.return_value = True
    mock_list_ports.return_value = {
        'ports': [{'binding:host_id': 'foo', 'binding:vif_type': 'foo'}]}
    admin_ctxt = context.get_admin_context()
    fake_inst = self._create_fake_instance_obj()
    self.assertTrue(
        self.compute._require_nw_info_update(admin_ctxt, fake_inst))
    expected_opts = {'device_id': fake_inst.uuid,
                     'fields': ['binding:host_id', 'binding:vif_type']}
    mock_list_ports.assert_called_once_with(admin_ctxt, **expected_opts)
@mock.patch('nova.network.neutronv2.api.API.list_ports')
@mock.patch.object(nova.utils, 'is_neutron')
def test_require_nw_info_update_failed_vif_types(
        self, mock_is_neutron, mock_list_ports):
    # Even on the right host, a port left 'unbound' or 'binding_failed'
    # must still trigger a network info update.
    mock_is_neutron.return_value = True
    admin_ctxt = context.get_admin_context()
    fake_inst = self._create_fake_instance_obj()
    expected_opts = {'device_id': fake_inst.uuid,
                     'fields': ['binding:host_id', 'binding:vif_type']}
    for bad_vif_type in (network_model.VIF_TYPE_UNBOUND,
                         network_model.VIF_TYPE_BINDING_FAILED):
        mock_list_ports.reset_mock()
        mock_list_ports.return_value = {
            'ports': [{'binding:host_id': self.compute.host,
                       'binding:vif_type': bad_vif_type}]}
        self.assertTrue(
            self.compute._require_nw_info_update(admin_ctxt, fake_inst))
        mock_list_ports.assert_called_once_with(admin_ctxt,
                                                **expected_opts)
def _heal_instance_info_cache(self,
_get_instance_nw_info_raise=False,
_get_instance_nw_info_raise_cache=False):
_get_instance_nw_info_raise_cache=False,
_require_nw_info_update=False,
_task_state_not_none=False):
# Update on every call for the test
self.flags(heal_instance_info_cache_interval=-1)
ctxt = context.get_admin_context()
@ -7211,10 +7283,13 @@ class ComputeTestCase(BaseTestCase,
instance_map[inst_uuid] = fake_instance.fake_db_instance(
uuid=inst_uuid, host=CONF.host, created_at=None)
# These won't be in our instance since they're not requested
if _task_state_not_none:
instance_map[inst_uuid]['task_state'] = 'foo'
instances.append(instance_map[inst_uuid])
call_info = {'get_all_by_host': 0, 'get_by_uuid': 0,
'get_nw_info': 0, 'expected_instance': None}
'get_nw_info': 0, 'expected_instance': None,
'require_nw_info': 0, 'setup_network': 0}
def fake_instance_get_all_by_host(context, host,
columns_to_join, use_slave=False):
@ -7232,6 +7307,20 @@ class ComputeTestCase(BaseTestCase,
columns_to_join)
return instance_map[instance_uuid]
def fake_require_nw_info_update(cls, context, instance):
self.assertEqual(call_info['expected_instance']['uuid'],
instance['uuid'])
if call_info['expected_instance']['task_state'] is None:
call_info['require_nw_info'] += 1
return _require_nw_info_update
def fake_setup_instance_network_on_host(cls, context, instance, host):
self.assertEqual(call_info['expected_instance']['uuid'],
instance['uuid'])
if call_info['expected_instance']['task_state'] is None and \
_require_nw_info_update:
call_info['setup_network'] += 1
# NOTE(comstud): Override the stub in setUp()
def fake_get_instance_nw_info(cls, context, instance, **kwargs):
# Note that this exception gets caught in compute/manager
@ -7252,14 +7341,28 @@ class ComputeTestCase(BaseTestCase,
fake_instance_get_all_by_host)
self.stub_out('nova.db.api.instance_get_by_uuid',
fake_instance_get_by_uuid)
self.stub_out('nova.compute.manager.ComputeManager.'
'_require_nw_info_update',
fake_require_nw_info_update)
if CONF.use_neutron:
self.stub_out(
'nova.network.neutronv2.api.API.get_instance_nw_info',
fake_get_instance_nw_info)
self.stub_out(
'nova.network.neutronv2.api.API.'
'setup_instance_network_on_host',
fake_setup_instance_network_on_host)
else:
self.stub_out('nova.network.api.API.get_instance_nw_info',
fake_get_instance_nw_info)
self.stub_out(
'nova.network.api.API.setup_instance_network_on_host',
fake_setup_instance_network_on_host)
expected_require_nw_info = 0
expect_setup_network = 0
# Make an instance appear to be still Building
instances[0]['vm_state'] = vm_states.BUILDING
# Make an instance appear to be Deleting
@ -7269,12 +7372,26 @@ class ComputeTestCase(BaseTestCase,
self.compute._heal_instance_info_cache(ctxt)
self.assertEqual(1, call_info['get_all_by_host'])
self.assertEqual(0, call_info['get_by_uuid'])
if not _task_state_not_none:
expected_require_nw_info += 1
self.assertEqual(expected_require_nw_info,
call_info['require_nw_info'])
if _require_nw_info_update and not _task_state_not_none:
expect_setup_network += 1
self.assertEqual(expect_setup_network, call_info['setup_network'])
self.assertEqual(1, call_info['get_nw_info'])
call_info['expected_instance'] = instances[3]
self.compute._heal_instance_info_cache(ctxt)
self.assertEqual(1, call_info['get_all_by_host'])
self.assertEqual(1, call_info['get_by_uuid'])
if not _task_state_not_none:
expected_require_nw_info += 1
self.assertEqual(expected_require_nw_info,
call_info['require_nw_info'])
if _require_nw_info_update and not _task_state_not_none:
expect_setup_network += 1
self.assertEqual(expect_setup_network, call_info['setup_network'])
self.assertEqual(2, call_info['get_nw_info'])
# Make an instance switch hosts
@ -7288,6 +7405,13 @@ class ComputeTestCase(BaseTestCase,
self.compute._heal_instance_info_cache(ctxt)
self.assertEqual(1, call_info['get_all_by_host'])
self.assertEqual(4, call_info['get_by_uuid'])
if not _task_state_not_none:
expected_require_nw_info += 1
self.assertEqual(expected_require_nw_info,
call_info['require_nw_info'])
if _require_nw_info_update and not _task_state_not_none:
expect_setup_network += 1
self.assertEqual(expect_setup_network, call_info['setup_network'])
self.assertEqual(3, call_info['get_nw_info'])
# Should be no more left.
self.assertEqual(0, len(self.compute._instance_uuids_to_heal))
@ -7303,6 +7427,9 @@ class ComputeTestCase(BaseTestCase,
# Stays the same because we remove invalid entries from the list
self.assertEqual(4, call_info['get_by_uuid'])
# Stays the same because we didn't find anything to process
self.assertEqual(expected_require_nw_info,
call_info['require_nw_info'])
self.assertEqual(expect_setup_network, call_info['setup_network'])
self.assertEqual(3, call_info['get_nw_info'])
def test_heal_instance_info_cache(self):
@ -7314,6 +7441,14 @@ class ComputeTestCase(BaseTestCase,
def test_heal_instance_info_cache_with_info_cache_exception(self):
    # Exercise the heal path where refreshing the instance info cache
    # raises; the periodic task must tolerate it (see helper flag).
    self._heal_instance_info_cache(_get_instance_nw_info_raise_cache=True)
def test_heal_instance_info_cache_with_port_update(self):
    # Steady-state instance (task_state None) whose port binding looks
    # wrong: the heal task should set up the instance network again.
    self._heal_instance_info_cache(_require_nw_info_update=True)
def test_heal_instance_info_cache_with_port_update_instance_not_steady(
        self):
    # An instance with a pending task_state must be left alone even if
    # its port binding appears to need an update.
    self._heal_instance_info_cache(_require_nw_info_update=True,
                                   _task_state_not_none=True)
@mock.patch('nova.objects.InstanceList.get_by_filters')
@mock.patch('nova.compute.api.API.unrescue')
def test_poll_rescued_instances(self, unrescue, get):

View File

@ -4376,12 +4376,12 @@ class TestNeutronv2WithMock(_TestNeutronv2Common):
# a second port profile attribute 'fake_profile' this can be
# an sriov port profile attribute or a pci_slot attribute, but for
# now we are just using a fake one to show that the code does not
# remove the portbinding_profile it there is one.
# remove the portbinding_profile if there is one.
binding_profile = {'fake_profile': 'fake_data',
neutronapi.MIGRATING_ATTR: 'my-dest-host'}
fake_ports = {'ports': [
{'id': 'fake-port-1',
neutronapi.BINDING_PROFILE: binding_profile,
neutronapi.BINDING_PROFILE: binding_profile,
neutronapi.BINDING_HOST_ID: instance.host}]}
list_ports_mock = mock.Mock(return_value=fake_ports)
get_client_mock.return_value.list_ports = list_ports_mock
@ -4605,6 +4605,79 @@ class TestNeutronv2WithMock(_TestNeutronv2Common):
# No ports should be updated if the port's pci binding did not change.
update_port_mock.assert_not_called()
@mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock())
def test_update_port_bindings_for_instance_with_same_host_failed_vif_type(
        self, get_client_mock):
    """Ports whose vif_type is 'unbound' or 'binding_failed' are rebound
    even when the port's host did not change.
    """
    instance = fake_instance.fake_instance_obj(self.context)
    self.api._has_port_binding_extension = mock.Mock(return_value=True)
    list_ports_mock = mock.Mock()
    update_port_mock = mock.Mock()
    # Both failure states neutron can leave a port in.
    FAILED_VIF_TYPES = (model.VIF_TYPE_UNBOUND,
                        model.VIF_TYPE_BINDING_FAILED)
    for vif_type in FAILED_VIF_TYPES:
        binding_profile = {'fake_profile': 'fake_data',
                           neutronapi.MIGRATING_ATTR: 'my-dest-host'}
        # Two ports already on the instance's host: one healthy, one in
        # the failed binding state under test.
        fake_ports = {'ports': [
            {'id': 'fake-port-1',
             'binding:vif_type': 'fake-vif-type',
             neutronapi.BINDING_PROFILE: binding_profile,
             neutronapi.BINDING_HOST_ID: instance.host},
            {'id': 'fake-port-2',
             'binding:vif_type': vif_type,
             neutronapi.BINDING_PROFILE: binding_profile,
             neutronapi.BINDING_HOST_ID: instance.host}
        ]}
        list_ports_mock.return_value = fake_ports
        get_client_mock.return_value.list_ports = list_ports_mock
        get_client_mock.return_value.update_port = update_port_mock
        # Reset between loop iterations so assert_called_once_with holds.
        update_port_mock.reset_mock()
        self.api._update_port_binding_for_instance(self.context, instance,
                                                   instance.host)
        # Assert that update_port was called on the port with a
        # failed vif_type and MIGRATING_ATTR is removed
        update_port_mock.assert_called_once_with(
            'fake-port-2',
            {'port': {neutronapi.BINDING_HOST_ID: instance.host,
                      neutronapi.BINDING_PROFILE: {
                          'fake_profile': 'fake_data'},
                      'device_owner': 'compute:%s' %
                                      instance.availability_zone
                      }})
@mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock())
def test_update_port_bindings_for_instance_with_diff_host_unbound_vif_type(
        self, get_client_mock):
    """An 'unbound' port on a different host gets its binding:host_id
    moved to the target host and the migrating attribute stripped.
    """
    instance = fake_instance.fake_instance_obj(self.context)
    self.api._has_port_binding_extension = mock.Mock(return_value=True)
    binding_profile = {'fake_profile': 'fake_data',
                       neutronapi.MIGRATING_ATTR: 'my-dest-host'}
    # Single port: bound to the instance's (old) host and unbound.
    fake_ports = {'ports': [
        {'id': 'fake-port-1',
         'binding:vif_type': model.VIF_TYPE_UNBOUND,
         neutronapi.BINDING_PROFILE: binding_profile,
         neutronapi.BINDING_HOST_ID: instance.host},
    ]}
    list_ports_mock = mock.Mock(return_value=fake_ports)
    get_client_mock.return_value.list_ports = list_ports_mock
    update_port_mock = mock.Mock()
    get_client_mock.return_value.update_port = update_port_mock
    self.api._update_port_binding_for_instance(self.context, instance,
                                               'my-host')
    # Assert that update_port was called on the port with a
    # 'unbound' vif_type, host updated and MIGRATING_ATTR is removed
    update_port_mock.assert_called_once_with(
        'fake-port-1', {'port': {neutronapi.BINDING_HOST_ID: 'my-host',
                                 neutronapi.BINDING_PROFILE: {
                                     'fake_profile': 'fake_data'},
                                 'device_owner': 'compute:%s' %
                                                 instance.availability_zone
                                 }})
def test_get_pci_mapping_for_migration(self):
instance = fake_instance.fake_instance_obj(self.context)
instance.migration_context = objects.MigrationContext()