Allow disabling the evacuate cleanup mechanism in compute manager
This mechanism attempts to destroy any locally-running instances on startup if instance.host != self.host. The assumption is that the instance has been evacuated and is safely running elsewhere. This is a dangerous assumption to make, so this patch adds a configuration variable to disable this behavior if it's not desired. Note that disabling it may have implications for the case where instances *were* evacuated, given potential shared resources. To counter that problem, this patch also makes _init_instance() skip initialization of the instance if it appears to be owned by another host, logging a prominent warning in that case. As a result, if you have destroy_after_evacuate=False and you start a nova-compute service with an incorrect hostname, or run it twice from another host, then the worst that will happen is you get log warnings about the instances on the host being ignored. This should be an indication that something is wrong, but still allow for fixing it without any loss. If the configuration option is disabled and a legitimate evacuation does occur, simply enabling it and then restarting the compute service will cause the cleanup to occur. This is added to the workarounds config group because it is really only relevant while evacuate is fundamentally broken in this way. It needs to be refactored to be more robust, and once that is done, this should be able to go away. DocImpact: New configuration option, and peril warning Partial-Bug: #1419785 Change-Id: Ib9a3c72c096822dd5c65c905117ae14994c73e99
This commit is contained in:
parent
10272d83d6
commit
922148ac45
|
@ -240,6 +240,8 @@ CONF.import_opt('enabled', 'nova.rdp', group='rdp')
|
|||
CONF.import_opt('html5_proxy_base_url', 'nova.rdp', group='rdp')
|
||||
CONF.import_opt('enabled', 'nova.console.serial', group='serial_console')
|
||||
CONF.import_opt('base_url', 'nova.console.serial', group='serial_console')
|
||||
CONF.import_opt('destroy_after_evacuate', 'nova.utils', group='workarounds')
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@ -750,6 +752,17 @@ class ComputeManager(manager.Manager):
|
|||
'vm_state': instance.vm_state},
|
||||
instance=instance)
|
||||
continue
|
||||
if not CONF.workarounds.destroy_after_evacuate:
|
||||
LOG.warning(_LW('Instance %(uuid)s appears to have been '
|
||||
'evacuated from this host to %(host)s. '
|
||||
'Not destroying it locally due to '
|
||||
'config setting '
|
||||
'"workarounds.destroy_after_evacuate". '
|
||||
'If this is not correct, enable that '
|
||||
'option and restart nova-compute.'),
|
||||
{'uuid': instance.uuid,
|
||||
'host': instance.host})
|
||||
continue
|
||||
LOG.info(_LI('Deleting instance as its host ('
|
||||
'%(instance_host)s) is not equal to our '
|
||||
'host (%(our_host)s).'),
|
||||
|
@ -841,6 +854,20 @@ class ComputeManager(manager.Manager):
|
|||
def _init_instance(self, context, instance):
|
||||
'''Initialize this instance during service init.'''
|
||||
|
||||
# NOTE(danms): If the instance appears to not be owned by this
|
||||
# host, it may have been evacuated away, but skipped by the
|
||||
# evacuation cleanup code due to configuration. Thus, if that
|
||||
# is a possibility, don't touch the instance in any way, but
|
||||
# log the concern. This will help avoid potential issues on
|
||||
# startup due to misconfiguration.
|
||||
if instance.host != self.host:
|
||||
LOG.warning(_LW('Instance %(uuid)s appears to not be owned '
|
||||
'by this host, but by %(host)s. Startup '
|
||||
'processing is being skipped.'),
|
||||
{'uuid': instance.uuid,
|
||||
'host': instance.host})
|
||||
return
|
||||
|
||||
# Instances that are shut down, or in an error state can not be
|
||||
# initialized and are not attempted to be recovered. The exception
|
||||
# to this are instances that are in RESIZE_MIGRATING or DELETING,
|
||||
|
|
|
@ -6731,6 +6731,7 @@ class ComputeTestCase(BaseTestCase):
|
|||
instance.id = 1
|
||||
instance.vm_state = vm_states.DELETED
|
||||
instance.deleted = False
|
||||
instance.host = self.compute.host
|
||||
|
||||
def fake_partial_deletion(context, instance):
|
||||
instance['deleted'] = instance['id']
|
||||
|
@ -6748,6 +6749,7 @@ class ComputeTestCase(BaseTestCase):
|
|||
instance.uuid = str(uuid.uuid4())
|
||||
instance.vm_state = vm_states.DELETED
|
||||
instance.deleted = False
|
||||
instance.host = self.compute.host
|
||||
|
||||
self.mox.StubOutWithMock(self.compute, '_complete_partial_deletion')
|
||||
self.compute._complete_partial_deletion(
|
||||
|
|
|
@ -470,6 +470,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
power_state=power_state.RUNNING,
|
||||
vm_state=vm_states.ACTIVE,
|
||||
task_state=None,
|
||||
host=self.compute.host,
|
||||
expected_attrs=['info_cache'])
|
||||
|
||||
with contextlib.nested(
|
||||
|
@ -515,6 +516,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
power_state=power_state.RUNNING,
|
||||
vm_state=vm_states.ACTIVE,
|
||||
task_state=None,
|
||||
host=self.compute.host,
|
||||
expected_attrs=['info_cache'])
|
||||
|
||||
self.flags(resume_guests_state_on_host_boot=True)
|
||||
|
@ -548,6 +550,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
uuid='fake-uuid',
|
||||
power_state=power_state.RUNNING,
|
||||
vm_state=vm_states.ACTIVE,
|
||||
host=self.compute.host,
|
||||
task_state=task_states.DELETING)
|
||||
|
||||
self.mox.StubOutWithMock(objects.BlockDeviceMappingList,
|
||||
|
@ -580,6 +583,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
task_state=task_states.RESIZE_MIGRATING,
|
||||
power_state=power_state.SHUTDOWN,
|
||||
system_metadata=sys_meta,
|
||||
host=self.compute.host,
|
||||
expected_attrs=['system_metadata'])
|
||||
|
||||
self.mox.StubOutWithMock(compute_utils, 'get_nw_info_for_instance')
|
||||
|
@ -628,6 +632,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
self.context,
|
||||
uuid='foo',
|
||||
vm_state=vm_states.ACTIVE,
|
||||
host=self.compute.host,
|
||||
task_state=task_states.MIGRATING)
|
||||
with contextlib.nested(
|
||||
mock.patch.object(instance, 'save'),
|
||||
|
@ -646,6 +651,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
self.context,
|
||||
uuid='foo',
|
||||
vm_state=vm_state,
|
||||
host=self.compute.host,
|
||||
task_state=task_state)
|
||||
with mock.patch.object(instance, 'save') as save:
|
||||
self.compute._init_instance(self.context, instance)
|
||||
|
@ -668,6 +674,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
vm_state, task_state)
|
||||
|
||||
def _test_init_instance_sets_building_tasks_error(self, instance):
|
||||
instance.host = self.compute.host
|
||||
with mock.patch.object(instance, 'save') as save:
|
||||
self.compute._init_instance(self.context, instance)
|
||||
save.assert_called_once_with()
|
||||
|
@ -709,6 +716,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
self.compute.driver.post_interrupted_snapshot_cleanup = mock.Mock()
|
||||
instance.info_cache = None
|
||||
instance.power_state = power_state.RUNNING
|
||||
instance.host = self.compute.host
|
||||
self.compute._init_instance(self.context, instance)
|
||||
save.assert_called_once_with()
|
||||
self.compute.driver.post_interrupted_snapshot_cleanup.\
|
||||
|
@ -726,6 +734,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance.power_state = power_state.RUNNING
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = state
|
||||
instance.host = self.compute.host
|
||||
mock_get_power_state.return_value = powerstate
|
||||
|
||||
self.compute._init_instance(self.context, instance)
|
||||
|
@ -794,6 +803,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance.uuid = 'foo'
|
||||
instance.vm_state = vm_states.ERROR
|
||||
instance.task_state = task_states.IMAGE_UPLOADING
|
||||
instance.host = self.compute.host
|
||||
self.mox.StubOutWithMock(compute_utils, 'get_nw_info_for_instance')
|
||||
self.mox.ReplayAll()
|
||||
self.compute._init_instance(self.context, instance)
|
||||
|
@ -804,6 +814,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
self.context,
|
||||
uuid='fake',
|
||||
vm_state=vm_states.ERROR,
|
||||
host=self.compute.host,
|
||||
task_state=task_states.DELETING)
|
||||
self.mox.StubOutWithMock(objects.BlockDeviceMappingList,
|
||||
'get_by_instance_uuid')
|
||||
|
@ -844,6 +855,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
|
||||
def _test_init_instance_retries_reboot(self, instance, reboot_type,
|
||||
return_power_state):
|
||||
instance.host = self.compute.host
|
||||
with contextlib.nested(
|
||||
mock.patch.object(self.compute, '_get_power_state',
|
||||
return_value=return_power_state),
|
||||
|
@ -898,6 +910,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
power_state.NOSTATE)
|
||||
|
||||
def _test_init_instance_cleans_reboot_state(self, instance):
|
||||
instance.host = self.compute.host
|
||||
with contextlib.nested(
|
||||
mock.patch.object(self.compute, '_get_power_state',
|
||||
return_value=power_state.RUNNING),
|
||||
|
@ -935,6 +948,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance.id = 1
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.POWERING_OFF
|
||||
instance.host = self.compute.host
|
||||
with mock.patch.object(self.compute, 'stop_instance'):
|
||||
self.compute._init_instance(self.context, instance)
|
||||
call = mock.call(self.context, instance)
|
||||
|
@ -946,6 +960,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance.id = 1
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.POWERING_ON
|
||||
instance.host = self.compute.host
|
||||
with mock.patch.object(self.compute, 'start_instance'):
|
||||
self.compute._init_instance(self.context, instance)
|
||||
call = mock.call(self.context, instance)
|
||||
|
@ -957,6 +972,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance.id = 1
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.POWERING_ON
|
||||
instance.host = self.compute.host
|
||||
with mock.patch.object(self.compute, 'start_instance',
|
||||
return_value=Exception):
|
||||
init_return = self.compute._init_instance(self.context, instance)
|
||||
|
@ -970,6 +986,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
instance.id = 1
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.POWERING_OFF
|
||||
instance.host = self.compute.host
|
||||
with mock.patch.object(self.compute, 'stop_instance',
|
||||
return_value=Exception):
|
||||
init_return = self.compute._init_instance(self.context, instance)
|
||||
|
@ -1958,6 +1975,29 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
|||
self._test_init_host_with_partial_migration(
|
||||
vm_state=vm_states.RESIZED)
|
||||
|
||||
@mock.patch('nova.compute.manager.ComputeManager._get_instances_on_driver')
def test_evacuate_disabled(self, mock_giod):
    """Evacuate cleanup must be a no-op when the workaround disables it.

    With workarounds.destroy_after_evacuate=False, an instance reported
    by the driver whose host differs from this compute's host (i.e. one
    that looks evacuated) must NOT be destroyed locally by
    _destroy_evacuated_instances.
    """
    self.flags(destroy_after_evacuate=False, group='workarounds')
    inst = mock.MagicMock()
    inst.uuid = 'foo'
    # A host name different from ours makes the instance appear evacuated.
    inst.host = self.compute.host + '-alt'
    mock_giod.return_value = [inst]
    with mock.patch.object(self.compute.driver, 'destroy') as mock_d:
        self.compute._destroy_evacuated_instances(mock.MagicMock())
        # The config option must have prevented the local destroy.
        self.assertFalse(mock_d.called)
|
||||
|
||||
@mock.patch('nova.compute.manager.ComputeManager.'
            '_destroy_evacuated_instances')
@mock.patch('nova.compute.manager.LOG')
def test_init_host_foreign_instance(self, mock_log, mock_destroy):
    """_init_instance must skip instances owned by another host.

    When instance.host != self.host, _init_instance should perform no
    startup processing (in particular, never call instance.save()) and
    should emit a prominent warning explaining that the instance appears
    to belong to a different host.
    """
    inst = mock.MagicMock()
    # A host name different from ours marks the instance as foreign.
    inst.host = self.compute.host + '-alt'
    self.compute._init_instance(mock.sentinel.context, inst)
    # No startup processing: the instance must not be touched/saved.
    self.assertFalse(inst.save.called)
    # A warning must be logged, and its message must identify the cause.
    self.assertTrue(mock_log.warning.called)
    msg = mock_log.warning.call_args_list[0]
    self.assertIn('appears to not be owned by this host', msg[0][0])
|
||||
|
||||
@mock.patch('nova.compute.manager.ComputeManager._instance_update')
|
||||
def test_error_out_instance_on_exception_not_implemented_err(self,
|
||||
inst_update_mock):
|
||||
|
|
|
@ -104,6 +104,12 @@ workarounds_opts = [
|
|||
'mechanism to disable livesnapshot while this is '
|
||||
'resolved. See '
|
||||
'https://bugs.launchpad.net/nova/+bug/1334398'),
|
||||
cfg.BoolOpt('destroy_after_evacuate',
|
||||
default=True,
|
||||
help='Whether to destroy instances on startup when we suspect '
|
||||
'they have previously been evacuated. This can result in '
|
||||
'data loss if undesired. See '
|
||||
'https://launchpad.net/bugs/1419785'),
|
||||
]
|
||||
CONF = cfg.CONF
|
||||
CONF.register_opts(monkey_patch_opts)
|
||||
|
|
Loading…
Reference in New Issue