From eae6aa80945f727e182ecef8ae2565dae1b770af Mon Sep 17 00:00:00 2001 From: Lee Yarwood Date: Mon, 21 Nov 2016 15:29:30 +0000 Subject: [PATCH] libvirt: Re-initialise volumes, encryptors, and vifs on hard reboot We call _hard_reboot during reboot, power_on, and resume_state_on_host_boot. It functions essentially by tearing as much of an instance as possible before recreating it, which additionally makes it useful to operators for attempting automated recovery of instances in an inconsistent state. The Libvirt driver would previously only call _destroy and _undefine_domain when hard rebooting an instance. This would leave vifs plugged, volumes connected, and encryptors attached on the host. It also means that when we try to restart the instance, we assume all these things are correctly configured. If they are not, the instance may fail to start at all, or may be incorrectly configured when starting. For example, consider an instance with an encrypted volume after a compute host reboot. When we attempt to start the instance, power_on will call _hard_reboot. The volume will be coincidentally re-attached as a side-effect of calling _get_guest_xml(!), but when we call _create_domain_and_network we pass reboot=True, which tells it not to reattach the encryptor, as it is assumed to be already attached. We are therefore left presenting the encrypted volume data directly to the instance without decryption. The approach in this patch is to ensure we recreate the instance as fully as possible during hard reboot. This means not passing vifs_already_plugged and reboot to _create_domain_and_network, which in turn requires that we fully destroy the instance first. This addresses the specific problem given in the example, but also a whole class of potential volume and vif related issues of inconsistent state. Because we now always tear down volumes, encryptors, and vifs, we are relying on the tear down of these things to be idempotent. This highlighted that detach of the luks and cryptsetup encryptors were not idempotent. We depend on the fixes for those os-brick drivers. NOTE(melwitt): In Ocata, we don't go through os-brick to handle detach of encrypted volumes and instead have our code under nova/volume/encryptors/. We're already ignoring exit code 4 for "not found" in cryptsetup.py and this makes luks.py consistent with that. We need to be able to ignore "already detached" encrypted volumes with this patch because of the re-initialization during hard reboot. Conflicts: nova/tests/unit/virt/libvirt/test_driver.py nova/virt/libvirt/driver.py NOTE(melwitt): The conflicts are due to _create_domain_and_network taking an additional disk_info argument in Ocata and the method for getting instance disk info was named _get_instance_disk_info instead of _get_instance_disk_info_from_config in Ocata. Closes-bug: #1724573 Change-Id: Id188d48609f3d22d14e16c7f6114291d547a8986 (cherry picked from commit 3f8daf080411b84ec0669f0642524ce8a7d19057) --- nova/tests/unit/virt/libvirt/test_driver.py | 13 ++++++------- .../tests/unit/volume/encryptors/test_luks.py | 8 ++++---- nova/virt/libvirt/driver.py | 19 ++++++++++--------- nova/volume/encryptors/luks.py | 5 ++++- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 0efab9ec5053..5cd6cb81d24e 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -11614,15 +11614,14 @@ class LibvirtConnTestCase(test.NoDBTestCase): mock_hard_reboot.assert_called_once_with(self.context, instance, [], None) - @mock.patch('nova.virt.libvirt.LibvirtDriver._undefine_domain') @mock.patch('nova.virt.libvirt.LibvirtDriver.get_info') @mock.patch('nova.virt.libvirt.LibvirtDriver._create_domain_and_network') @mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_xml') @mock.patch('nova.virt.libvirt.LibvirtDriver._get_instance_disk_info') - @mock.patch('nova.virt.libvirt.LibvirtDriver._destroy') + @mock.patch('nova.virt.libvirt.LibvirtDriver.destroy') def test_hard_reboot(self, mock_destroy, mock_get_disk_info, mock_get_guest_xml, mock_create_domain_and_network, - mock_get_info, mock_undefine): + mock_get_info): self.context.auth_token = True # any non-None value will suffice instance = objects.Instance(**self.test_instance) network_info = _fake_network_info(self, 1) @@ -11670,8 +11669,9 @@ class LibvirtConnTestCase(test.NoDBTestCase): for name in ('disk', 'disk.local'): self.assertTrue(disks[name].cache.called) - mock_destroy.assert_called_once_with(instance) - mock_undefine.assert_called_once_with(instance) + mock_destroy.assert_called_once_with(self.context, instance, + network_info, destroy_disks=False, + block_device_info=block_device_info) # Check the structure of disk_info passed to # _create_domain_and_network, but we don't care about any of the values @@ -11686,8 +11686,7 @@ class LibvirtConnTestCase(test.NoDBTestCase): } mock_create_domain_and_network.assert_called_once_with(self.context, dummyxml, instance, network_info, mock_disk_info, - block_device_info=block_device_info, - reboot=True, vifs_already_plugged=True) + block_device_info=block_device_info) @mock.patch('oslo_utils.fileutils.ensure_tree') @mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall') diff --git a/nova/tests/unit/volume/encryptors/test_luks.py b/nova/tests/unit/volume/encryptors/test_luks.py index 398e0050dc64..aebcd95cf2fd 100644 --- a/nova/tests/unit/volume/encryptors/test_luks.py +++ b/nova/tests/unit/volume/encryptors/test_luks.py @@ -163,7 +163,7 @@ class LuksEncryptorTestCase(test_cryptsetup.CryptsetupEncryptorTestCase): mock_execute.assert_has_calls([ mock.call('cryptsetup', 'luksClose', self.dev_name, - attempts=3, run_as_root=True, check_exit_code=True), + attempts=3, run_as_root=True, check_exit_code=[0, 4]), ]) self.assertEqual(1, mock_execute.call_count) @@ -173,7 +173,7 @@ class LuksEncryptorTestCase(test_cryptsetup.CryptsetupEncryptorTestCase): mock_execute.assert_has_calls([ mock.call('cryptsetup', 'luksClose', self.dev_name, - attempts=3, run_as_root=True, check_exit_code=True), + attempts=3, run_as_root=True, check_exit_code=[0, 4]), ]) self.assertEqual(1, mock_execute.call_count) @@ -217,7 +217,7 @@ class LuksEncryptorTestCase(test_cryptsetup.CryptsetupEncryptorTestCase): self.dev_name, process_input=fake_key_mangled, run_as_root=True, check_exit_code=True), mock.call('cryptsetup', 'luksClose', self.dev_name, - run_as_root=True, check_exit_code=True, attempts=3), + run_as_root=True, check_exit_code=[0, 4], attempts=3), mock.call('cryptsetup', 'luksAddKey', self.dev_path, process_input=''.join([fake_key_mangled, '\n', fake_key, @@ -227,7 +227,7 @@ class LuksEncryptorTestCase(test_cryptsetup.CryptsetupEncryptorTestCase): self.dev_name, process_input=fake_key, run_as_root=True, check_exit_code=True), mock.call('cryptsetup', 'luksClose', self.dev_name, - run_as_root=True, check_exit_code=True, attempts=3), + run_as_root=True, check_exit_code=[0, 4], attempts=3), mock.call('cryptsetup', 'luksRemoveKey', self.dev_path, process_input=fake_key_mangled, run_as_root=True, check_exit_code=True), diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index d47971125f63..ff8185a91238 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -2418,12 +2418,15 @@ class LibvirtDriver(driver.ComputeDriver): re-creates the domain to ensure the reboot happens, as the guest OS cannot ignore this action. """ - - self._destroy(instance) - # Domain XML will be redefined so we can safely undefine it - # from libvirt. This ensure that such process as create serial - # console for guest will run smoothly. - self._undefine_domain(instance) + # NOTE(mdbooth): In addition to performing a hard reboot of the domain, + # the hard reboot operation is relied upon by operators to be an + # automated attempt to fix as many things as possible about a + # non-functioning instance before resorting to manual intervention. + # With this goal in mind, we tear down all the aspects of an instance + # we can here without losing data. This allows us to re-initialise from + # scratch, and hopefully fix, most aspects of a non-functioning guest. + self.destroy(context, instance, network_info, destroy_disks=False, + block_device_info=block_device_info) # Convert the system metadata to image metadata # NOTE(mdbooth): This is a workaround for stateless Nova compute @@ -2459,9 +2462,7 @@ class LibvirtDriver(driver.ComputeDriver): # start the instance. self._create_domain_and_network(context, xml, instance, network_info, disk_info, - block_device_info=block_device_info, - reboot=True, - vifs_already_plugged=True) + block_device_info=block_device_info) self._prepare_pci_devices_for_use( pci_manager.get_instance_pci_devs(instance, 'all')) diff --git a/nova/volume/encryptors/luks.py b/nova/volume/encryptors/luks.py index 5b1923d43460..9ad54aafda66 100644 --- a/nova/volume/encryptors/luks.py +++ b/nova/volume/encryptors/luks.py @@ -163,6 +163,9 @@ class LuksEncryptor(cryptsetup.CryptsetupEncryptor): def _close_volume(self, **kwargs): """Closes the device (effectively removes the dm-crypt mapping).""" LOG.debug("closing encrypted volume %s", self.dev_path) + # cryptsetup returns 4 when attempting to destroy a non-active + # luks device. We are going to ignore this error code to avoid raising + # an exception while detaching an already detached volume. utils.execute('cryptsetup', 'luksClose', self.dev_name, - run_as_root=True, check_exit_code=True, + run_as_root=True, check_exit_code=[0, 4], attempts=3)