Allow disabling the evacuate cleanup mechanism in compute manager

This mechanism attempts to destroy any locally-running instances on
startup if instance.host != self.host. The assumption is that the
instance has been evacuated and is safely running elsewhere. This is
a dangerous assumption to make, so this patch adds a configuration
variable to disable this behavior if it's not desired.
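
As a rough illustration only (the [workarounds] group and the option name
are taken from the code below; the value shown is just the opt-out case),
the nova.conf change would look like:

    [workarounds]
    # Defaults to True, which preserves the existing destroy-on-startup
    # behavior. Setting it to False makes nova-compute skip destroying
    # local instances whose instance.host no longer matches this host.
    destroy_after_evacuate = False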

Note that disabling it may have implications for the case where
instances *were* evacuated, given potential shared resources.
To counter that problem, this patch also makes _init_instance()
skip initialization of the instance if it appears to be owned
by another host, logging a prominent warning in that case.

As a result, if you have destroy_after_evacuate=False and you start a
nova-compute with an incorrect hostname, or run it a second time from
another host, the worst that will happen is log warnings about the
instances on the host being ignored. This is an indication that something
is wrong, but it still allows the situation to be fixed without any data
loss. If the option is disabled and a legitimate evacuation does occur,
simply enabling it and restarting the compute service will trigger the
cleanup.
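
For example, recovery after a real evacuation is just the reverse (a
sketch only; since the default is True, removing the override entirely
works as well):

    [workarounds]
    # Re-enable the cleanup, then restart nova-compute so the stale
    # local copies of evacuated instances are destroyed.
    destroy_after_evacuate = True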

This is added to the workarounds config group because it is really
only relevant while evacuate is fundamentally broken in this way.
It needs to be refactored to be more robust, and once that is done,
this should be able to go away.

Conflicts:
        nova/compute/manager.py
        nova/tests/unit/compute/test_compute.py
        nova/tests/unit/compute/test_compute_mgr.py
        nova/utils.py

NOTE: In nova/utils.py a new section has been introduced, but only
the option addressed by this backport has been included.

DocImpact: New configuration option, and peril warning
Partial-Bug: #1419785
(cherry picked from commit 922148ac45)

-- squashed with commit --

Create a 'workarounds' config group.

This group exists for very specific reasons.

If you're:
- working around an issue in a system tool (e.g. libvirt or qemu) where the
  fix is in flight/being discussed in that community, or
- dealing with a tool that can be/is fixed in some distributions, where
  rather than patching the code those distributions can trivially set a
  config option to get the "correct" behavior,
then this is a good place for your workaround.

(cherry picked from commit b1689b5840)

--

Change-Id: Ib9a3c72c096822dd5c65c905117ae14994c73e99
Author:    Tony Breeds, 2015-01-27 11:17:54 -08:00
Committer: Matt Riedemann
Parent:    be8be3916f
Commit:    6f1f9dbc21
4 changed files with 92 additions and 0 deletions

File: nova/compute/manager.py

@@ -241,6 +241,8 @@ CONF.import_opt('enabled', 'nova.rdp', group='rdp')
CONF.import_opt('html5_proxy_base_url', 'nova.rdp', group='rdp')
CONF.import_opt('enabled', 'nova.console.serial', group='serial_console')
CONF.import_opt('base_url', 'nova.console.serial', group='serial_console')
CONF.import_opt('destroy_after_evacuate', 'nova.utils', group='workarounds')

LOG = logging.getLogger(__name__)
@@ -760,6 +762,17 @@ class ComputeManager(manager.Manager):
                               'vm_state': instance.vm_state},
                              instance=instance)
                    continue
                if not CONF.workarounds.destroy_after_evacuate:
                    LOG.warning(_LW('Instance %(uuid)s appears to have been '
                                    'evacuated from this host to %(host)s. '
                                    'Not destroying it locally due to '
                                    'config setting '
                                    '"workarounds.destroy_after_evacuate". '
                                    'If this is not correct, enable that '
                                    'option and restart nova-compute.'),
                                {'uuid': instance.uuid,
                                 'host': instance.host})
                    continue
                LOG.info(_('Deleting instance as its host ('
                           '%(instance_host)s) is not equal to our '
                           'host (%(our_host)s).'),
@@ -850,6 +863,20 @@ class ComputeManager(manager.Manager):

    def _init_instance(self, context, instance):
        '''Initialize this instance during service init.'''

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning(_LW('Instance %(uuid)s appears to not be owned '
                            'by this host, but by %(host)s. Startup '
                            'processing is being skipped.'),
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down, or in an error state can not be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,

File: nova/tests/unit/compute/test_compute.py

@@ -6699,6 +6699,7 @@ class ComputeTestCase(BaseTestCase):
        instance.id = 1
        instance.vm_state = vm_states.DELETED
        instance.deleted = False
        instance.host = self.compute.host

        def fake_partial_deletion(context, instance):
            instance['deleted'] = instance['id']
@@ -6716,6 +6717,7 @@ class ComputeTestCase(BaseTestCase):
        instance.uuid = str(uuid.uuid4())
        instance.vm_state = vm_states.DELETED
        instance.deleted = False
        instance.host = self.compute.host

        self.mox.StubOutWithMock(self.compute, '_complete_partial_deletion')
        self.compute._complete_partial_deletion(

File: nova/tests/unit/compute/test_compute_mgr.py

@@ -371,6 +371,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                power_state=power_state.RUNNING,
                vm_state=vm_states.ACTIVE,
                task_state=None,
                host=self.compute.host,
                expected_attrs=['info_cache'])

        with contextlib.nested(
@@ -394,6 +395,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                power_state=power_state.RUNNING,
                vm_state=vm_states.ACTIVE,
                task_state=None,
                host=self.compute.host,
                expected_attrs=['info_cache'])

        self.flags(resume_guests_state_on_host_boot=True)
@@ -427,6 +429,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                uuid='fake-uuid',
                power_state=power_state.RUNNING,
                vm_state=vm_states.ACTIVE,
                host=self.compute.host,
                task_state=task_states.DELETING)

        self.mox.StubOutWithMock(objects.BlockDeviceMappingList,
@@ -459,6 +462,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                task_state=task_states.RESIZE_MIGRATING,
                power_state=power_state.SHUTDOWN,
                system_metadata=sys_meta,
                host=self.compute.host,
                expected_attrs=['system_metadata'])

        self.mox.StubOutWithMock(compute_utils, 'get_nw_info_for_instance')
@@ -507,6 +511,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                self.context,
                uuid='foo',
                vm_state=vm_states.ACTIVE,
                host=self.compute.host,
                task_state=task_states.MIGRATING)
        with contextlib.nested(
            mock.patch.object(instance, 'save'),
@@ -525,6 +530,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                self.context,
                uuid='foo',
                vm_state=vm_state,
                host=self.compute.host,
                task_state=task_state)
        with mock.patch.object(instance, 'save') as save:
            self.compute._init_instance(self.context, instance)
@@ -547,6 +553,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                                                 vm_state, task_state)

    def _test_init_instance_sets_building_tasks_error(self, instance):
        instance.host = self.compute.host
        with mock.patch.object(instance, 'save') as save:
            self.compute._init_instance(self.context, instance)
            save.assert_called_once_with()
@@ -588,6 +595,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
            self.compute.driver.post_interrupted_snapshot_cleanup = mock.Mock()
            instance.info_cache = None
            instance.power_state = power_state.RUNNING
            instance.host = self.compute.host
            self.compute._init_instance(self.context, instance)
            save.assert_called_once_with()
            self.compute.driver.post_interrupted_snapshot_cleanup.\
@@ -627,6 +635,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.uuid = 'foo'
        instance.vm_state = vm_states.ERROR
        instance.task_state = task_states.IMAGE_UPLOADING
        instance.host = self.compute.host
        self.mox.StubOutWithMock(compute_utils, 'get_nw_info_for_instance')
        self.mox.ReplayAll()
        self.compute._init_instance(self.context, instance)
@@ -637,6 +646,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                self.context,
                uuid='fake',
                vm_state=vm_states.ERROR,
                host=self.compute.host,
                task_state=task_states.DELETING)
        self.mox.StubOutWithMock(objects.BlockDeviceMappingList,
                                 'get_by_instance_uuid')
@@ -677,6 +687,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):

    def _test_init_instance_retries_reboot(self, instance, reboot_type,
                                           return_power_state):
        instance.host = self.compute.host
        with contextlib.nested(
            mock.patch.object(self.compute, '_get_power_state',
                              return_value=return_power_state),
@@ -731,6 +742,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                                              power_state.NOSTATE)

    def _test_init_instance_cleans_reboot_state(self, instance):
        instance.host = self.compute.host
        with contextlib.nested(
            mock.patch.object(self.compute, '_get_power_state',
                              return_value=power_state.RUNNING),
@@ -768,6 +780,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'stop_instance'):
            self.compute._init_instance(self.context, instance)
            call = mock.call(self.context, instance)
@@ -779,6 +792,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_ON
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'start_instance'):
            self.compute._init_instance(self.context, instance)
            call = mock.call(self.context, instance)
@@ -790,6 +804,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_ON
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'start_instance',
                               return_value=Exception):
            init_return = self.compute._init_instance(self.context, instance)
@@ -803,6 +818,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        instance.id = 1
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = task_states.POWERING_OFF
        instance.host = self.compute.host
        with mock.patch.object(self.compute, 'stop_instance',
                               return_value=Exception):
            init_return = self.compute._init_instance(self.context, instance)
@@ -1822,6 +1838,29 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        self._test_init_host_with_partial_migration(
            vm_state=vm_states.RESIZED)

    @mock.patch('nova.compute.manager.ComputeManager._get_instances_on_driver')
    def test_evacuate_disabled(self, mock_giod):
        self.flags(destroy_after_evacuate=False, group='workarounds')
        inst = mock.MagicMock()
        inst.uuid = 'foo'
        inst.host = self.compute.host + '-alt'
        mock_giod.return_value = [inst]
        with mock.patch.object(self.compute.driver, 'destroy') as mock_d:
            self.compute._destroy_evacuated_instances(mock.MagicMock())
            self.assertFalse(mock_d.called)

    @mock.patch('nova.compute.manager.ComputeManager.'
                '_destroy_evacuated_instances')
    @mock.patch('nova.compute.manager.LOG')
    def test_init_host_foreign_instance(self, mock_log, mock_destroy):
        inst = mock.MagicMock()
        inst.host = self.compute.host + '-alt'
        self.compute._init_instance(mock.sentinel.context, inst)
        self.assertFalse(inst.save.called)
        self.assertTrue(mock_log.warning.called)
        msg = mock_log.warning.call_args_list[0]
        self.assertIn('appears to not be owned by this host', msg[0][0])

    @mock.patch('nova.compute.manager.ComputeManager._instance_update')
    def test_error_out_instance_on_exception_not_implemented_err(self,
                                                       inst_update_mock):

File: nova/utils.py

@@ -77,10 +77,34 @@ utils_opts = [
    cfg.StrOpt('tempdir',
               help='Explicitly specify the temporary working directory'),
]

""" This group is for very specific reasons.

If you're:
 - Working around an issue in a system tool (e.g. libvirt or qemu) where the
   fix is in flight/discussed in that community.
 - The tool can be/is fixed in some distributions and rather than patch the
   code those distributions can trivially set a config option to get the
   "correct" behavior.
This is a good place for your workaround.

Please use with care!
Document the BugID that your workaround is paired with."""

workarounds_opts = [
    cfg.BoolOpt('destroy_after_evacuate',
                default=True,
                help='Whether to destroy instances on startup when we suspect '
                     'they have previously been evacuated. This can result in '
                     'data loss if undesired. See '
                     'https://launchpad.net/bugs/1419785'),
]

CONF = cfg.CONF
CONF.register_opts(monkey_patch_opts)
CONF.register_opts(utils_opts)
CONF.import_opt('network_api_class', 'nova.network')
CONF.register_opts(workarounds_opts, group='workarounds')

LOG = logging.getLogger(__name__)