Allow disabling the evacuate cleanup mechanism in compute manager

This mechanism attempts to destroy any locally-running instances on
startup if instance.host != self.host. The assumption is that the
instance has been evacuated and is safely running elsewhere. This is
a dangerous assumption to make, so this patch adds a configuration
variable to disable this behavior if it's not desired.
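
As a rough illustration only (the [workarounds] group and the option name
are taken from the code below; the value shown is just the opt-out case),
the nova.conf change would look like:

    [workarounds]
    # Defaults to True, which preserves the existing destroy-on-startup
    # behavior. Setting it to False makes nova-compute skip destroying
    # local instances whose instance.host no longer matches this host.
    destroy_after_evacuate = False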

Note that disabling it may have implications for the case where
instances *were* evacuated, given potential shared resources.
To counter that problem, this patch also makes _init_instance()
skip initialization of the instance if it appears to be owned
by another host, logging a prominent warning in that case.

As a result, if you have destroy_after_evacuate=False and you start a
nova-compute with an incorrect hostname, or run it a second time from
another host, the worst that will happen is log warnings about the
instances on the host being ignored. This is an indication that something
is wrong, but it still allows the situation to be fixed without any data
loss. If the option is disabled and a legitimate evacuation does occur,
simply enabling it and restarting the compute service will trigger the
cleanup.
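
For example, recovery after a real evacuation is just the reverse (a
sketch only; since the default is True, removing the override entirely
works as well):

    [workarounds]
    # Re-enable the cleanup, then restart nova-compute so the stale
    # local copies of evacuated instances are destroyed.
    destroy_after_evacuate = True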

This is added to the workarounds config group because it is really
only relevant while evacuate is fundamentally broken in this way.
It needs to be refactored to be more robust, and once that is done,
this should be able to go away.

Conflicts:
        nova/compute/manager.py
        nova/tests/unit/compute/test_compute.py
        nova/tests/unit/compute/test_compute_mgr.py
        nova/utils.py

NOTE: In nova/utils.py a new section has been introduced, but only
the option addressed by this backport has been included.

DocImpact: New configuration option, and peril warning
Partial-Bug: #1419785
(cherry picked from commit 922148ac45)

-- squashed with commit --

Create a 'workarounds' config group.

This group exists for very specific reasons.

If you're:
- working around an issue in a system tool (e.g. libvirt or qemu) where the
  fix is in flight/being discussed in that community, or
- dealing with a tool that can be/is fixed in some distributions, where
  rather than patching the code those distributions can trivially set a
  config option to get the "correct" behavior,
then this is a good place for your workaround.

(cherry picked from commit b1689b5840)

--

Change-Id: Ib9a3c72c096822dd5c65c905117ae14994c73e99
Author:    Tony Breeds, 2015-01-27 11:17:54 -08:00
Committer: Matt Riedemann
Parent:    be8be3916f
Commit:    6f1f9dbc21
4 changed files with 92 additions and 0 deletions

File: nova/compute/manager.py

@@ -241,6 +241,8 @@ CONF.import_opt('enabled', 'nova.rdp', group='rdp')
CONF.import_opt('html5_proxy_base_url', 'nova.rdp', group='rdp')
CONF.import_opt('enabled', 'nova.console.serial', group='serial_console')
CONF.import_opt('base_url', 'nova.console.serial', group='serial_console')
CONF.import_opt('destroy_after_evacuate', 'nova.utils', group='workarounds')

LOG = logging.getLogger(__name__)
@@ -760,6 +762,17 @@ class ComputeManager(manager.Manager):
                               'vm_state': instance.vm_state},
                              instance=instance)
                    continue
                if not CONF.workarounds.destroy_after_evacuate:
                    LOG.warning(_LW('Instance %(uuid)s appears to have been '
                                    'evacuated from this host to %(host)s. '
                                    'Not destroying it locally due to '
                                    'config setting '
                                    '"workarounds.destroy_after_evacuate". '
                                    'If this is not correct, enable that '
                                    'option and restart nova-compute.'),
                                {'uuid': instance.uuid,
                                 'host': instance.host})
                    continue
                LOG.info(_('Deleting instance as its host ('
                           '%(instance_host)s) is not equal to our '
                           'host (%(our_host)s).'),
@@ -850,6 +863,20 @@ class ComputeManager(manager.Manager):

    def _init_instance(self, context, instance):
        '''Initialize this instance during service init.'''

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning(_LW('Instance %(uuid)s appears to not be owned '
                            'by this host, but by %(host)s. Startup '
                            'processing is being skipped.'),
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down, or in an error state can not be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,

File: nova/tests/unit/compute/test_compute.py

@@ -6699,6 +6699,7 @@ class ComputeTestCase(BaseTestCase):
        instance.id = 1
        instance.vm_state = vm_states.DELETED
        instance.deleted = False
        instance.host = self.compute.host

        def fake_partial_deletion(context, instance):
            instance['deleted'] = instance['id']
@@ -6716,6 +6717,7 @@ class ComputeTestCase(BaseTestCase):
        instance.uuid = str(uuid.uuid4())
        instance.vm_state = vm_states.DELETED
        instance.deleted = False
        instance.host = self.compute.host

        self.mox.StubOutWithMock(self.compute, '_complete_partial_deletion')
        self.compute._complete_partial_deletion(

File: nova/tests/unit/compute/test_compute_mgr.py

@@ -371,6 +371,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                power_state=power_state.RUNNING,
                vm_state=vm_states.ACTIVE,
                task_state=None,
                host=self.compute.host,
                expected_attrs=['info_cache'])

        with contextlib.nested(
@@ -394,6 +395,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                power_state=power_state.RUNNING,
                vm_state=vm_states.ACTIVE,
                task_state=None,
                host=self.compute.host,
                expected_attrs=['info_cache'])

        self.flags(resume_guests_state_on_host_boot=True)
@@ -427,6 +429,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                uuid='fake-uuid',
                power_state=power_state.RUNNING,
                vm_state=vm_states.ACTIVE,
                host=self.compute.host,
                task_state=task_states.DELETING)

        self.mox.StubOutWithMock(objects.BlockDeviceMappingList,
@@ -459,6 +462,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                task_state=task_states.RESIZE_MIGRATING,
                power_state=power_state.SHUTDOWN,
                system_metadata=sys_meta,
                host=self.compute.host,
                expected_attrs=['system_metadata'])

        self.mox.StubOutWithMock(compute_utils, 'get_nw_info_for_instance')
@@ -507,6 +511,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                self.context,
                uuid='foo',
                vm_state=vm_states.ACTIVE,
                host=self.compute.host,
                task_state=task_states.MIGRATING)
        with contextlib.nested(
            mock.patch.object(instance, 'save'),
@@ -525,6 +530,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                self.context,
                uuid='foo',
                vm_state=vm_state,
                host=self.compute.host,
                task_state=task_state)
        with mock.patch.object(instance, 'save') as save:
            self.compute._init_instance(self.context, instance)
@@ -547,6 +553,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                                                 vm_state, task_state)

    def _test_init_instance_sets_building_tasks_error(self, instance):
        instance.host = self.compute.host
        with mock.patch.object(instance, 'save') as save:
            self.compute._init_instance(self.context, instance)
            save.assert_called_once_with()
@@ -588,6 +595,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
            self.compute.driver.post_interrupted_snapshot_cleanup = mock.Mock()
            instance.info_cache = None
            instance.power_state = power_state.RUNNING
            instance.host = self.compute.host
            self.compute._init_instance(self.context, instance)
            save.assert_called_once_with()
            self.compute.driver.post_interrupted_snapshot_cleanup.\
@@ -627,6 +635,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.uuid = 'foo'
        instance.vm_state = vm_states.ERROR
        instance.task_state = task_states.IMAGE_UPLOADING
        instance.host = self.compute.host
        self.mox.StubOutWithMock(compute_utils, 'get_nw_info_for_instance')
        self.mox.ReplayAll()
        self.compute._init_instance(self.context, instance)
@@ -637,6 +646,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                self.context,
                uuid='fake',
                vm_state=vm_states.ERROR,
                host=self.compute.host,
                task_state=task_states.DELETING)
        self.mox.StubOutWithMock(objects.BlockDeviceMappingList,
                                 'get_by_instance_uuid')
@@ -677,6 +687,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):

    def _test_init_instance_retries_reboot(self, instance, reboot_type,
                                           return_power_state):
        instance.host = self.compute.host
        with contextlib.nested(
            mock.patch.object(self.compute, '_get_power_state',
                              return_value=return_power_state),
@@ -731,6 +742,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                                              power_state.NOSTATE)

    def _test_init_instance_cleans_reboot_state(self, instance):
        instance.host = self.compute.host
        with contextlib.nested(
            mock.patch.object(self.compute, '_get_power_state',
                              return_value=power_state.RUNNING),
@@ -768,6 +780,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'stop_instance'):
            self.compute._init_instance(self.context, instance)
            call = mock.call(self.context, instance)
@@ -779,6 +792,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_ON
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'start_instance'):
            self.compute._init_instance(self.context, instance)
            call = mock.call(self.context, instance)
@@ -790,6 +804,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_ON
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'start_instance',
                               return_value=Exception):
            init_return = self.compute._init_instance(self.context, instance)
@@ -803,6 +818,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'stop_instance',
                               return_value=Exception):
            init_return = self.compute._init_instance(self.context, instance)
@@ -1822,6 +1838,29 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        self._test_init_host_with_partial_migration(
            vm_state=vm_states.RESIZED)

    @mock.patch('nova.compute.manager.ComputeManager._get_instances_on_driver')
    def test_evacuate_disabled(self, mock_giod):
        self.flags(destroy_after_evacuate=False, group='workarounds')
        inst = mock.MagicMock()
        inst.uuid = 'foo'
        inst.host = self.compute.host + '-alt'
        mock_giod.return_value = [inst]
        with mock.patch.object(self.compute.driver, 'destroy') as mock_d:
            self.compute._destroy_evacuated_instances(mock.MagicMock())
            self.assertFalse(mock_d.called)

    @mock.patch('nova.compute.manager.ComputeManager.'
                '_destroy_evacuated_instances')
    @mock.patch('nova.compute.manager.LOG')
    def test_init_host_foreign_instance(self, mock_log, mock_destroy):
        inst = mock.MagicMock()
        inst.host = self.compute.host + '-alt'
        self.compute._init_instance(mock.sentinel.context, inst)
        self.assertFalse(inst.save.called)
        self.assertTrue(mock_log.warning.called)
        msg = mock_log.warning.call_args_list[0]
        self.assertIn('appears to not be owned by this host', msg[0][0])

    @mock.patch('nova.compute.manager.ComputeManager._instance_update')
    def test_error_out_instance_on_exception_not_implemented_err(self,
                                                       inst_update_mock):

File: nova/utils.py

@@ -77,10 +77,34 @@ utils_opts = [
    cfg.StrOpt('tempdir',
               help='Explicitly specify the temporary working directory'),
]

""" This group is for very specific reasons.

If you're:
 - Working around an issue in a system tool (e.g. libvirt or qemu) where the
   fix is in flight/discussed in that community.
 - The tool can be/is fixed in some distributions and rather than patch the
   code those distributions can trivially set a config option to get the
   "correct" behavior.
This is a good place for your workaround.

Please use with care!
Document the BugID that your workaround is paired with."""

workarounds_opts = [
    cfg.BoolOpt('destroy_after_evacuate',
                default=True,
                help='Whether to destroy instances on startup when we suspect '
                     'they have previously been evacuated. This can result in '
                     'data loss if undesired. See '
                     'https://launchpad.net/bugs/1419785'),
]

CONF = cfg.CONF
CONF.register_opts(monkey_patch_opts)
CONF.register_opts(utils_opts)
CONF.import_opt('network_api_class', 'nova.network')
CONF.register_opts(workarounds_opts, group='workarounds')

LOG = logging.getLogger(__name__)