Handle volume API failure in _post_live_migration

Previously, if the call to Cinder in _post_live_migration failed, the
exception went unhandled and prevented us from calling
post_live_migration_at_destination - which is where we set instance
host and task state. This left the system in an inconsistent state,
with the instance actually running on the destination, but
with instance.host still set to the source. This patch simply wraps
the Cinder API calls in a try/except, and logs the exception instead
of blowing up. While "dumb", this has the virtue of being simple and
minimizing potential side effects. A comprehensive refactoring of
when, where and how we set instance host and task state to try to
guarantee consistency is left as a TODO.

Partial-bug: 1628606
Change-Id: Icb0bdaf454935b3713c35339394d260b33520de5
(cherry picked from commit 5513f48dea)
(cherry picked from commit cf3c2f391a)
Artom Lifshitz 2018-10-10 14:53:14 -04:00
parent a48c1123cd
commit 53f9c8e510
2 changed files with 186 additions and 17 deletions
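
Before the diff, a minimal, self-contained sketch of the log-and-continue pattern the commit message describes. Every name in it (cleanup_volume_connections, flaky_detach, the volume id strings) is a hypothetical stand-in for illustration only; the real calls are volume_api.terminate_connection and volume_api.attachment_delete, shown in the diff below.

    import logging

    LOG = logging.getLogger(__name__)


    def cleanup_volume_connections(volume_ids, detach):
        """Log-and-continue cleanup, as the patch does per BDM.

        ``detach`` stands in for the external Cinder call. Any failure is
        logged rather than raised, so whatever runs afterwards (the
        equivalent of post_live_migration_at_destination) is never skipped.
        """
        for volume_id in volume_ids:
            try:
                detach(volume_id)
            except Exception:
                LOG.exception('Volume %s not cleaned up on source host; '
                              'continuing', volume_id)


    def flaky_detach(volume_id):
        # Hypothetical stand-in for the Cinder API: one volume fails, the
        # other succeeds, mirroring the functional tests further down.
        if volume_id == 'broken-volume':
            raise RuntimeError('Fake Cinder error')


    # Logs one error and returns normally instead of raising.
    cleanup_volume_connections(['broken-volume', 'working-volume'],
                               flaky_detach)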


@@ -6289,23 +6289,44 @@ class ComputeManager(manager.Manager):
         connector = self.driver.get_volume_connector(instance)
         for bdm in bdms:
             if bdm.is_volume:
-                if bdm.attachment_id is None:
-                    # Prior to cinder v3.44:
-                    # We don't want to actually mark the volume detached, or
-                    # delete the bdm, just remove the connection from this
-                    # host.
-                    #
-                    # remove the volume connection without detaching from
-                    # hypervisor because the instance is not running anymore
-                    # on the current host
-                    self.volume_api.terminate_connection(ctxt, bdm.volume_id,
-                                                         connector)
-                else:
-                    # cinder v3.44 api flow - delete the old attachment
-                    # for the source host
-                    old_attachment_id = \
-                        migrate_data.old_vol_attachment_ids[bdm.volume_id]
-                    self.volume_api.attachment_delete(ctxt, old_attachment_id)
+                # Detaching volumes is a call to an external API that can fail.
+                # If it does, we need to handle it gracefully so that the call
+                # to post_live_migration_at_destination - where we set instance
+                # host and task state - still happens. We need to rethink the
+                # current approach of setting instance host and task state
+                # AFTER a whole bunch of things that could fail in unhandled
+                # ways, but that is left as a TODO(artom).
+                try:
+                    if bdm.attachment_id is None:
+                        # Prior to cinder v3.44:
+                        # We don't want to actually mark the volume detached,
+                        # or delete the bdm, just remove the connection from
+                        # this host.
+                        #
+                        # remove the volume connection without detaching from
+                        # hypervisor because the instance is not running
+                        # anymore on the current host
+                        self.volume_api.terminate_connection(ctxt,
+                                                             bdm.volume_id,
+                                                             connector)
+                    else:
+                        # cinder v3.44 api flow - delete the old attachment
+                        # for the source host
+                        old_attachment_id = \
+                            migrate_data.old_vol_attachment_ids[bdm.volume_id]
+                        self.volume_api.attachment_delete(ctxt,
+                                                          old_attachment_id)
+                except Exception as e:
+                    if bdm.attachment_id is None:
+                        LOG.error('Connection for volume %s not terminated on '
+                                  'source host %s during post_live_migration: '
+                                  '%s', bdm.volume_id, self.host,
+                                  six.text_type(e), instance=instance)
+                    else:
+                        LOG.error('Volume attachment %s not deleted on source '
+                                  'host %s during post_live_migration: %s',
+                                  old_attachment_id, self.host,
+                                  six.text_type(e), instance=instance)
 
         # Releasing vlan.
         # (not necessary in current implementation?)


@@ -0,0 +1,148 @@
# Copyright 2018 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import mock

from nova.compute import api as compute_api
from nova import exception
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.unit import fake_notifier
from nova.tests import uuidsentinel as uuids
from nova.virt import fake


class FakeCinderError(object):
    """Poor man's Mock because we're stubbing out and not mock.patching. Stubs
    out both terminate_connection and attachment_delete. We keep a raise and
    call count to simulate a single volume error while being able to assert
    that we still got called for all of an instance's volumes.
    """

    def __init__(self):
        self.raise_count = 0
        self.call_count = 0

    def __call__(self, *args, **kwargs):
        self.call_count += 1
        if self.raise_count == 0:
            self.raise_count += 1
            raise exception.CinderConnectionFailed(reason='Fake Cinder error')


class LiveMigrationCinderFailure(integrated_helpers._IntegratedTestBase,
                                 integrated_helpers.InstanceHelperMixin):
    api_major_version = 'v2.1'
    microversion = 'latest'

    def setUp(self):
        super(LiveMigrationCinderFailure, self).setUp()
        fake_notifier.stub_notifier(self)
        self.addCleanup(fake_notifier.reset)
        # Start a second compute node (the first one was started for us by
        # _IntegratedTestBase. set_nodes() is needed to avoid duplicate
        # nodenames. See comments in test_bug_1702454.py.
        fake.set_nodes(['host2'])
        self.addCleanup(fake.restore_nodes)
        self.compute2 = self.start_service('compute', host='host2')

    # To get the old Cinder flow we need to hack the service version, otherwise
    # the new flow is attempted and CinderFixture complains about auth because
    # it's not stubbing out the new flow methods.
    @mock.patch(
        'nova.objects.service.get_minimum_version_all_cells',
        return_value=compute_api.CINDER_V3_ATTACH_MIN_COMPUTE_VERSION - 1)
    def test_live_migrate_terminate_connection_fails(self, _):
        self.useFixture(nova_fixtures.CinderFixture(self))
        server = self.api.post_server({
            'server': {
                'flavorRef': 1,
                'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
                'name': 'live-migrate-terminate-connection-fail-test',
                'networks': 'none',
                'block_device_mapping_v2': [
                    {'boot_index': 0,
                     'uuid': uuids.broken_volume,
                     'source_type': 'volume',
                     'destination_type': 'volume'},
                    {'boot_index': 1,
                     'uuid': uuids.working_volume,
                     'source_type': 'volume',
                     'destination_type': 'volume'}]}})
        server = self._wait_for_state_change(self.api, server, 'ACTIVE')

        source = server['OS-EXT-SRV-ATTR:host']
        if source == self.compute.host:
            dest = self.compute2.host
        else:
            dest = self.compute.host

        post = {
            'os-migrateLive': {
                'host': dest,
                'block_migration': False,
            }
        }
        stub_terminate_connection = FakeCinderError()
        self.stub_out('nova.volume.cinder.API.terminate_connection',
                      stub_terminate_connection)
        self.api.post_server_action(server['id'], post)
        # Live migration should complete despite a volume failing to detach.
        # Waiting for ACTIVE on dest is essentially an assert for just that.
        self._wait_for_server_parameter(self.api, server,
                                        {'OS-EXT-SRV-ATTR:host': dest,
                                         'status': 'ACTIVE'})
        self.assertEqual(2, stub_terminate_connection.call_count)
        self.assertEqual(1, stub_terminate_connection.raise_count)

    def test_live_migrate_attachment_delete_fails(self):
        self.useFixture(nova_fixtures.CinderFixtureNewAttachFlow(self))
        server = self.api.post_server({
            'server': {
                'flavorRef': 1,
                'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6',
                'name': 'live-migrate-attachment-delete-fail-test',
                'networks': 'none',
                'block_device_mapping_v2': [
                    {'boot_index': 0,
                     'uuid': uuids.broken_volume,
                     'source_type': 'volume',
                     'destination_type': 'volume'},
                    {'boot_index': 1,
                     'uuid': uuids.working_volume,
                     'source_type': 'volume',
                     'destination_type': 'volume'}]}})
        server = self._wait_for_state_change(self.api, server, 'ACTIVE')

        source = server['OS-EXT-SRV-ATTR:host']
        if source == self.compute.host:
            dest = self.compute2.host
        else:
            dest = self.compute.host

        post = {
            'os-migrateLive': {
                'host': dest,
                'block_migration': False,
            }
        }
        stub_attachment_delete = FakeCinderError()
        self.stub_out('nova.volume.cinder.API.attachment_delete',
                      stub_attachment_delete)
        self.api.post_server_action(server['id'], post)
        self._wait_for_server_parameter(self.api, server,
                                        {'OS-EXT-SRV-ATTR:host': dest,
                                         'status': 'ACTIVE'})
        self.assertEqual(2, stub_attachment_delete.call_count)
        self.assertEqual(1, stub_attachment_delete.raise_count)