Power off commands should give guests a chance to shutdown

Currently in libvirt operations which power off an instance such as stop,
shelve, rescue, and resize simply destroy the underlying VM. Some
GuestOS's do not react well to this type of power failure, and so it would
be better if these operations followed the same approach as soft_reboot
and give the guest a chance to shut down gracefully.

The shutdown behavior is defined by two values:

- shutdown_timeout defines the overall period a Guest is allowed to
complete its shutdown.  The default value is set via nova.conf and can be
overridden on a per image basis by image metadata, allowing different types
of guest OS to specify how long they need to shut down cleanly.

- shutdown_retry_interval defines how frequently within that period
the Guest will be signaled to shutdown.  This is a protection against
guests that may not be ready to process the shutdown signal when it
is first issued. (e.g. still booting).  This is defined as a constant.

This is one of a set of changes that will eventually expose the choice
of whether to give the GuestOS a chance to shutdown via the API.

This change implements the libvirt changes to power_off() and adds
a clean shutdown to compute.manager.stop().

Subsequent patches will:
- Add clean shutdown to Shelve
- Add clean shutdown to Rescue
- Convert soft_reboot to use the same approach
- Expose clean shutdown via rpcapi
- Expose clean shutdown via API

Partially-Implements: blueprint user-defined-shutdown
Closes-Bug: #1196924
DocImpact

Change-Id: I432b0b0c09db82797f28deb5617f02ee45a4278c
This commit is contained in:
Phil Day 2014-01-24 15:43:20 +00:00
parent 5c4695741b
commit c07ed15415
14 changed files with 243 additions and 12 deletions

View File

@ -193,6 +193,10 @@ timeout_opts = [
default=0,
help="Automatically confirm resizes after N seconds. "
"Set to 0 to disable."),
cfg.IntOpt("shutdown_timeout",
default=60,
help="Total amount of time to wait in seconds for an instance "
"to perform a clean shutdown."),
]
running_deleted_opts = [
@ -568,6 +572,11 @@ class ComputeManager(manager.Manager):
target = messaging.Target(version='3.32')
# How long to wait in seconds before re-issuing a shutdown
# signal to a instance during power off. The overall
# time to wait is set by CONF.shutdown_timeout.
SHUTDOWN_RETRY_INTERVAL = 10
def __init__(self, compute_driver=None, *args, **kwargs):
"""Load configuration options and connect to the hypervisor."""
self.virtapi = ComputeVirtAPI(self)
@ -2186,6 +2195,25 @@ class ComputeManager(manager.Manager):
instance=instance)
self._set_instance_error_state(context, instance)
def _get_power_off_values(self, context, instance, clean_shutdown):
    """Get the timing configuration for powering down this instance.

    :param context: request context (unused in the body; kept for
        interface parity with the other power-off helpers)
    :param instance: the instance being powered off
    :param clean_shutdown: whether the guest should first be given a
        chance to shut down gracefully
    :returns: a (timeout, retry_interval) tuple, both zero when no
        clean shutdown was requested
    """
    # A forced power off does not wait for the guest at all.
    if not clean_shutdown:
        return 0, 0

    # The overall timeout can be overridden per image via metadata,
    # falling back to the nova.conf default.
    timeout = compute_utils.get_value_from_system_metadata(
        instance, key='image_os_shutdown_timeout', type=int,
        default=CONF.shutdown_timeout)
    return timeout, self.SHUTDOWN_RETRY_INTERVAL
def _power_off_instance(self, context, instance, clean_shutdown=True):
    """Power off an instance on this host.

    :param clean_shutdown: when True the guest is given a chance to
        shut down gracefully (timings come from
        _get_power_off_values); when False it is powered off
        immediately.
    """
    power_off_args = self._get_power_off_values(context, instance,
                                                clean_shutdown)
    self.driver.power_off(instance, *power_off_args)
def _shutdown_instance(self, context, instance,
bdms, requested_networks=None, notify=True,
try_deallocate_networks=True):
@ -2377,14 +2405,14 @@ class ComputeManager(manager.Manager):
@reverts_task_state
@wrap_instance_event
@wrap_instance_fault
def stop_instance(self, context, instance):
def stop_instance(self, context, instance, clean_shutdown=True):
"""Stopping an instance on this host."""
@utils.synchronized(instance.uuid)
def do_stop_instance():
self._notify_about_instance_usage(context, instance,
"power_off.start")
self.driver.power_off(instance)
self._power_off_instance(context, instance, clean_shutdown)
current_power_state = self._get_power_state(context, instance)
instance.power_state = current_power_state
instance.vm_state = vm_states.STOPPED

View File

@ -219,6 +219,25 @@ def get_image_metadata(context, image_api, image_id_or_uri, instance):
return utils.get_image_from_system_metadata(system_meta)
def get_value_from_system_metadata(instance, key, type, default):
    """Get a value of a specified type from an instance's system metadata.

    @param instance: The instance object
    @param key: The name of the property to get
    @param type: The python type the value is to be returned as
    @param default: The value to return if key is not set or not the
        right type
    """
    value = instance.system_metadata.get(key, default)
    try:
        return type(value)
    except (ValueError, TypeError):
        # TypeError covers values the conversion cannot accept at all
        # (e.g. int(None)); ValueError covers badly-formatted strings.
        # Either way we fall back to the caller-supplied default.
        LOG.warning(_LW("Metadata value %(value)s for %(key)s is not of "
                        "type %(type)s. Using default value %(default)s."),
                    {'value': value, 'key': key, 'type': type,
                     'default': default}, instance=instance)
        return default
def notify_usage_exists(notifier, context, instance_ref, current_period=False,
ignore_missing_network_data=True,
system_metadata=None, extra_usage_info=None):

View File

@ -2539,7 +2539,8 @@ class CloudTestCase(test.TestCase):
self.stubs.Set(fake_virt.FakeDriver, 'power_on', fake_power_on)
def fake_power_off(self, instance):
def fake_power_off(self, instance,
shutdown_timeout, shutdown_attempts):
virt_driver['powered_off'] = True
self.stubs.Set(fake_virt.FakeDriver, 'power_off', fake_power_off)

View File

@ -2134,7 +2134,8 @@ class ComputeTestCase(BaseTestCase):
called = {'power_off': False}
def fake_driver_power_off(self, instance):
def fake_driver_power_off(self, instance,
shutdown_timeout, shutdown_attempts):
called['power_off'] = True
self.stubs.Set(nova.virt.fake.FakeDriver, 'power_off',

View File

@ -724,6 +724,28 @@ class ComputeGetImageMetadataTestCase(test.TestCase):
self.assertThat(expected, matchers.DictMatches(image_meta))
class ComputeUtilsGetValFromSysMetadata(test.TestCase):
    """Tests for compute_utils.get_value_from_system_metadata."""

    def test_get_value_from_system_metadata(self):
        instance = fake_instance.fake_instance_obj('fake-context')
        instance.system_metadata = {'int_val': 1,
                                    'int_string': '2',
                                    'not_int': 'Nope'}

        # (metadata key, expected result) when converting to int with
        # a default of 0.
        expectations = [
            ('int_val', 1),      # already an int
            ('int_string', 2),   # numeric string is converted
            ('not_int', 0),      # unconvertible value falls back
        ]
        for key, expected in expectations:
            actual = compute_utils.get_value_from_system_metadata(
                instance, key, int, 0)
            self.assertEqual(expected, actual)
class ComputeUtilsGetNWInfo(test.TestCase):
def test_instance_object_none_info_cache(self):
inst = fake_instance.fake_instance_obj('fake-context',

View File

@ -6193,6 +6193,82 @@ class LibvirtConnTestCase(test.TestCase,
conn._hard_reboot(self.context, instance, network_info,
block_device_info)
def _test_clean_shutdown(self, seconds_to_shutdown,
timeout, retry_interval,
shutdown_attempts, succeeds):
info_tuple = ('fake', 'fake', 'fake', 'also_fake')
shutdown_count = []
def count_shutdowns():
shutdown_count.append("shutdown")
# Mock domain
mock_domain = self.mox.CreateMock(libvirt.virDomain)
mock_domain.info().AndReturn(
(libvirt_driver.VIR_DOMAIN_RUNNING,) + info_tuple)
mock_domain.shutdown().WithSideEffects(count_shutdowns)
retry_countdown = retry_interval
for x in xrange(min(seconds_to_shutdown, timeout)):
mock_domain.info().AndReturn(
(libvirt_driver.VIR_DOMAIN_RUNNING,) + info_tuple)
if retry_countdown == 0:
mock_domain.shutdown().WithSideEffects(count_shutdowns)
retry_countdown = retry_interval
else:
retry_countdown -= 1
if seconds_to_shutdown < timeout:
mock_domain.info().AndReturn(
(libvirt_driver.VIR_DOMAIN_SHUTDOWN,) + info_tuple)
self.mox.ReplayAll()
def fake_lookup_by_name(instance_name):
return mock_domain
def fake_create_domain(**kwargs):
self.reboot_create_called = True
conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
instance = {"name": "instancename", "id": "instanceid",
"uuid": "875a8070-d0b9-4949-8b31-104d125c9a64"}
self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
self.stubs.Set(conn, '_create_domain', fake_create_domain)
result = conn._clean_shutdown(instance, timeout, retry_interval)
self.assertEqual(succeeds, result)
self.assertEqual(shutdown_attempts, len(shutdown_count))
def test_clean_shutdown_first_time(self):
    # Guest stops within the timeout before any retry is due, so a
    # single shutdown signal suffices and the attempt succeeds.
    self._test_clean_shutdown(seconds_to_shutdown=2,
                              timeout=5,
                              retry_interval=3,
                              shutdown_attempts=1,
                              succeeds=True)
def test_clean_shutdown_with_retry(self):
    # Guest takes longer than retry_interval, so the shutdown signal
    # is re-sent once before the guest stops within the timeout.
    self._test_clean_shutdown(seconds_to_shutdown=4,
                              timeout=5,
                              retry_interval=3,
                              shutdown_attempts=2,
                              succeeds=True)
def test_clean_shutdown_failure(self):
    # Guest never stops within the timeout; shutdown is retried but
    # the overall attempt reports failure.
    self._test_clean_shutdown(seconds_to_shutdown=6,
                              timeout=5,
                              retry_interval=3,
                              shutdown_attempts=2,
                              succeeds=False)
def test_clean_shutdown_no_wait(self):
    # A zero timeout sends one shutdown signal but does not wait, so
    # the attempt reports failure immediately.
    self._test_clean_shutdown(seconds_to_shutdown=6,
                              timeout=0,
                              retry_interval=3,
                              shutdown_attempts=1,
                              succeeds=False)
def test_resume(self):
dummyxml = ("<domain type='kvm'><name>instance-0000000a</name>"
"<devices>"

View File

@ -103,7 +103,7 @@ class IronicAPIContractsTestCase(test.NoDBTestCase):
self._check_method(driver.ComputeDriver.power_off,
"ComputeDriver.power_off",
['self', 'instance'])
['self', 'instance', 'timeout', 'retry_interval'])
self._check_method(driver.ComputeDriver.power_on,
"ComputeDriver.power_on",

View File

@ -407,8 +407,9 @@ class BareMetalDriver(driver.ComputeDriver):
"""Cleanup after instance being destroyed."""
pass
def power_off(self, instance, node=None):
def power_off(self, instance, timeout=0, retry_interval=0, node=None):
"""Power off the specified instance."""
# TODO(PhilDay): Add support for timeout (clean shutdown)
if not node:
node = _get_baremetal_node_by_instance_uuid(instance['uuid'])
pm = get_power_manager(node=node, instance=instance)

View File

@ -589,10 +589,13 @@ class ComputeDriver(object):
# TODO(Vek): Need to pass context in for access to auth_token
raise NotImplementedError()
def power_off(self, instance):
def power_off(self, instance, timeout=0, retry_interval=0):
    """Power off the specified instance.

    :param instance: nova.objects.instance.Instance
    :param timeout: time to wait for GuestOS to shutdown
    :param retry_interval: How often to signal guest while
                           waiting for it to shutdown
    :raises NotImplementedError: always; concrete drivers must
        override this method.
    """
    raise NotImplementedError()

View File

@ -184,7 +184,7 @@ class FakeDriver(driver.ComputeDriver):
block_device_info=None):
pass
def power_off(self, instance):
# NOTE(review): the parameter names here (shutdown_timeout,
# shutdown_attempts) differ from the base ComputeDriver.power_off
# signature (timeout, retry_interval) -- confirm whether this
# divergence is intentional before keyword callers rely on it.
def power_off(self, instance, shutdown_timeout=0, shutdown_attempts=0):
    """Pretend to power off the instance; the fake driver is a no-op."""
    pass
def power_on(self, context, instance, network_info, block_device_info):

View File

@ -109,7 +109,8 @@ class HyperVDriver(driver.ComputeDriver):
def resume(self, context, instance, network_info, block_device_info=None):
self._vmops.resume(instance)
def power_off(self, instance):
def power_off(self, instance, timeout=0, retry_interval=0):
    """Power off the instance via the Hyper-V vmops layer.

    timeout and retry_interval are accepted for driver-interface
    compatibility but are not yet honoured by this driver.
    """
    # TODO(PhilDay): Add support for timeout (clean shutdown)
    self._vmops.power_off(instance)
def power_on(self, context, instance, network_info,

View File

@ -2182,8 +2182,85 @@ class LibvirtDriver(driver.ComputeDriver):
dom = self._lookup_by_name(instance['name'])
dom.resume()
def power_off(self, instance):
def _clean_shutdown(self, instance, timeout, retry_interval):
    """Attempt to shutdown the instance gracefully.

    Signals the guest to shut down, then polls its power state once a
    second for up to *timeout* seconds, re-sending the shutdown signal
    every *retry_interval* seconds while waiting.

    :param instance: The instance to be shutdown
    :param timeout: How long to wait in seconds for the instance to
                    shutdown
    :param retry_interval: How often in seconds to signal the instance
                           to shutdown while waiting
    :returns: True if the shutdown succeeded
    """

    # List of states that represent a shutdown instance
    SHUTDOWN_STATES = [power_state.SHUTDOWN,
                       power_state.CRASHED]

    try:
        dom = self._lookup_by_name(instance["name"])
    except exception.InstanceNotFound:
        # If the instance has gone then we don't need to
        # wait for it to shutdown
        return True

    (state, _max_mem, _mem, _cpus, _t) = dom.info()
    state = LIBVIRT_POWER_STATE[state]
    if state in SHUTDOWN_STATES:
        LOG.info(_LI("Instance already shutdown."),
                 instance=instance)
        return True

    LOG.debug("Shutting down instance from state %s", state,
              instance=instance)
    dom.shutdown()
    retry_countdown = retry_interval

    for sec in six.moves.range(timeout):
        # Look the domain up again each second rather than reusing the
        # handle obtained above.
        dom = self._lookup_by_name(instance["name"])
        (state, _max_mem, _mem, _cpus, _t) = dom.info()
        state = LIBVIRT_POWER_STATE[state]

        if state in SHUTDOWN_STATES:
            LOG.info(_LI("Instance shutdown successfully after %d "
                         "seconds."), sec, instance=instance)
            return True

        # Note(PhilD): We can't assume that the Guest was able to process
        #              any previous shutdown signal (for example it may
        #              have still been starting up), so within the overall
        #              timeout we re-trigger the shutdown every
        #              retry_interval seconds.
        if retry_countdown == 0:
            retry_countdown = retry_interval
            # Instance could shutdown at any time, in which case we
            # will get an exception when we call shutdown
            try:
                LOG.debug("Instance in state %s after %d seconds - "
                          "resending shutdown", state, sec,
                          instance=instance)
                dom.shutdown()
            except libvirt.libvirtError:
                # Assume this is because it's now shutdown, so loop
                # one more time to clean up.
                LOG.debug("Ignoring libvirt exception from shutdown "
                          "request.", instance=instance)
                continue
        else:
            retry_countdown -= 1

        time.sleep(1)

    LOG.info(_LI("Instance failed to shutdown in %d seconds."),
             timeout, instance=instance)
    return False
def power_off(self, instance, timeout=0, retry_interval=0):
    """Power off the specified instance.

    :param instance: instance to power off
    :param timeout: number of seconds to wait for a graceful guest
        shutdown before destroying the domain; 0 destroys immediately
    :param retry_interval: how often in seconds to re-signal the guest
        while waiting (forwarded to _clean_shutdown)
    """
    if timeout:
        self._clean_shutdown(instance, timeout, retry_interval)
    # The domain is destroyed regardless of whether the clean
    # shutdown attempt succeeded.
    self._destroy(instance)
def power_on(self, context, instance, network_info,

View File

@ -494,8 +494,9 @@ class VMwareVCDriver(driver.ComputeDriver):
_vmops = self._get_vmops_for_compute_node(instance.node)
_vmops.unrescue(instance)
def power_off(self, instance):
def power_off(self, instance, timeout=0, retry_interval=0):
    """Power off the specified instance.

    timeout and retry_interval are accepted for driver-interface
    compatibility but ignored by this driver for now.
    """
    # TODO(PhilDay): Add support for timeout (clean shutdown)
    _vmops = self._get_vmops_for_compute_node(instance['node'])
    _vmops.power_off(instance)

View File

@ -299,8 +299,9 @@ class XenAPIDriver(driver.ComputeDriver):
"""Unrescue the specified instance."""
self._vmops.unrescue(instance)
def power_off(self, instance):
def power_off(self, instance, timeout=0, retry_interval=0):
    """Power off the specified instance.

    timeout and retry_interval are accepted for driver-interface
    compatibility but ignored by this driver for now.
    """
    # TODO(PhilDay): Add support for timeout (clean shutdown)
    self._vmops.power_off(instance)
def power_on(self, context, instance, network_info,