Make 'error' instances recovery configurable
Currently on master error instances are evacuated from failed compute host and stopped at the destination host after evacuation. Some operators may not want to evacuate error instances in some cases like below: If user is running 1ACT/n SBY application on instances, launching error instances will cause unexpected effect. This patch adds a new config option 'ignore_instances_in_error_state' under [host_failure] section which makes the recovery of error instances configurable. If this config option set to True, masakari will skip the recovery of error instances otherwise it will evacuate error instances from a failed source compute node along with other instances. The default value for this config option is set to False. Change-Id: I24f8282357f28544fd1b56f270da22c7329a9f3d
This commit is contained in:
parent
778cac30cd
commit
a67e0c5038
|
@ -39,6 +39,15 @@ instances which contain 'HA_Enabled=True' metadata key, and then it will
|
|||
evacuate the remaining ones. When set to False, it will evacuate only those
|
||||
instances which contain 'HA_Enabled=True' metadata key."""),
|
||||
|
||||
cfg.BoolOpt('ignore_instances_in_error_state',
|
||||
default=False,
|
||||
help="""
|
||||
Operators can decide whether error instances should be allowed for evacuation
|
||||
from a failed source compute node or not. When set to True, it will ignore
|
||||
error instances from evacuation from a failed source compute node. When set to
|
||||
False, it will evacuate error instances along with other instances from a
|
||||
failed source compute node."""),
|
||||
|
||||
cfg.BoolOpt("add_reserved_host_to_aggregate",
|
||||
default=False,
|
||||
help="""
|
||||
|
|
|
@ -70,17 +70,35 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
|
|||
self.novaclient = novaclient
|
||||
|
||||
def execute(self, context, host_name):
|
||||
def _filter_instances(instance_list):
|
||||
ha_enabled_instances = []
|
||||
non_ha_enabled_instances = []
|
||||
|
||||
for instance in instance_list:
|
||||
is_instance_ha_enabled = strutils.bool_from_string(
|
||||
instance.metadata.get('HA_Enabled', False))
|
||||
if CONF.host_failure.ignore_instances_in_error_state and (
|
||||
getattr(instance, "OS-EXT-STS:vm_state") == "error"):
|
||||
if is_instance_ha_enabled:
|
||||
msg = ("Ignoring recovery of HA_Enabled instance "
|
||||
"'%(instance_id)s' as it is in 'error' state.")
|
||||
LOG.info(msg, {'instance_id': instance.id})
|
||||
continue
|
||||
|
||||
if is_instance_ha_enabled:
|
||||
ha_enabled_instances.append(instance)
|
||||
else:
|
||||
non_ha_enabled_instances.append(instance)
|
||||
|
||||
if CONF.host_failure.evacuate_all_instances:
|
||||
ha_enabled_instances.extend(non_ha_enabled_instances)
|
||||
|
||||
return ha_enabled_instances
|
||||
|
||||
instance_list = self.novaclient.get_servers(context, host_name)
|
||||
|
||||
if CONF.host_failure.evacuate_all_instances:
|
||||
instance_list = sorted(
|
||||
instance_list, key=lambda k: strutils.bool_from_string(
|
||||
k.metadata.get('HA_Enabled', False)), reverse=True)
|
||||
else:
|
||||
instance_list = (
|
||||
[instance for instance in instance_list if
|
||||
strutils.bool_from_string(instance.metadata.get('HA_Enabled',
|
||||
False))])
|
||||
instance_list = _filter_instances(instance_list)
|
||||
|
||||
if not instance_list:
|
||||
msg = _('No instances to evacuate on host: %s.') % host_name
|
||||
LOG.info(msg)
|
||||
|
|
|
@ -53,15 +53,21 @@ class HostFailureTestCase(test.TestCase):
|
|||
self.novaclient = nova.API()
|
||||
self.fake_client = fakes.FakeNovaClient()
|
||||
|
||||
def _verify_instance_evacuated(self):
|
||||
for server in self.novaclient.get_servers(self.ctxt,
|
||||
self.instance_host):
|
||||
def _verify_instance_evacuated(self, old_instance_list):
|
||||
for server in old_instance_list:
|
||||
instance = self.novaclient.get_server(self.ctxt, server.id)
|
||||
self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'),
|
||||
['active', 'stopped', 'error'])
|
||||
self.assertNotEqual(self.instance_host,
|
||||
getattr(instance,
|
||||
'OS-EXT-SRV-ATTR:hypervisor_hostname'))
|
||||
|
||||
if CONF.host_failure.ignore_instances_in_error_state and getattr(
|
||||
server, 'OS-EXT-STS:vm_state') == 'error':
|
||||
self.assertEqual(
|
||||
self.instance_host, getattr(
|
||||
instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
|
||||
else:
|
||||
self.assertNotEqual(
|
||||
self.instance_host, getattr(
|
||||
instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
|
||||
|
||||
def _test_disable_compute_service(self, mock_enable_disable):
|
||||
task = host_failure.DisableComputeServiceTask(self.novaclient)
|
||||
|
@ -70,24 +76,27 @@ class HostFailureTestCase(test.TestCase):
|
|||
mock_enable_disable.assert_called_once_with(
|
||||
self.ctxt, self.instance_host)
|
||||
|
||||
def _test_instance_list(self):
|
||||
def _test_instance_list(self, instances_evacuation_count):
|
||||
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
|
||||
instance_list = task.execute(
|
||||
self.ctxt, self.instance_host)
|
||||
evacuate_all_instances = CONF.host_failure.evacuate_all_instances
|
||||
instance_list = task.execute(self.ctxt, self.instance_host)
|
||||
|
||||
if evacuate_all_instances:
|
||||
self.assertEqual(len(self.fake_client.servers.list()),
|
||||
len(instance_list['instance_list']))
|
||||
else:
|
||||
for instance in instance_list['instance_list']:
|
||||
for instance in instance_list['instance_list']:
|
||||
if CONF.host_failure.ignore_instances_in_error_state:
|
||||
self.assertNotEqual("error",
|
||||
getattr(instance, "OS-EXT-STS:vm_state"))
|
||||
if not CONF.host_failure.evacuate_all_instances:
|
||||
self.assertTrue(instance.metadata.get('HA_Enabled', False))
|
||||
|
||||
self.assertEqual(instances_evacuation_count,
|
||||
len(instance_list['instance_list']))
|
||||
|
||||
return instance_list
|
||||
|
||||
def _evacuate_instances(self, instance_list, mock_enable_disable,
|
||||
reserved_host=None):
|
||||
task = host_failure.EvacuateInstancesTask(self.novaclient)
|
||||
old_instance_list = copy.deepcopy(instance_list['instance_list'])
|
||||
|
||||
if reserved_host:
|
||||
task.execute(self.ctxt, self.instance_host,
|
||||
instance_list['instance_list'],
|
||||
|
@ -99,7 +108,7 @@ class HostFailureTestCase(test.TestCase):
|
|||
self.ctxt, self.instance_host, instance_list['instance_list'])
|
||||
|
||||
# make sure instance is active and has different host
|
||||
self._verify_instance_evacuated()
|
||||
self._verify_instance_evacuated(old_instance_list)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_for_auto_recovery(
|
||||
|
@ -118,7 +127,7 @@ class HostFailureTestCase(test.TestCase):
|
|||
self._test_disable_compute_service(mock_enable_disable)
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
instance_list = self._test_instance_list()
|
||||
instance_list = self._test_instance_list(2)
|
||||
|
||||
# execute EvacuateInstancesTask
|
||||
self._evacuate_instances(instance_list, mock_enable_disable)
|
||||
|
@ -146,7 +155,7 @@ class HostFailureTestCase(test.TestCase):
|
|||
self._test_disable_compute_service(mock_enable_disable)
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
instance_list = self._test_instance_list()
|
||||
instance_list = self._test_instance_list(2)
|
||||
|
||||
# execute EvacuateInstancesTask
|
||||
with mock.patch.object(host_obj.Host, "save") as mock_save:
|
||||
|
@ -182,6 +191,50 @@ class HostFailureTestCase(test.TestCase):
|
|||
# execute EvacuateInstancesTask
|
||||
self._evacuate_instances(instance_list, mock_enable_disable)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_ignore_error_instances(
|
||||
self, _mock_novaclient, mock_unlock, mock_lock,
|
||||
mock_enable_disable):
|
||||
self.override_config("ignore_instances_in_error_state",
|
||||
True, "host_failure")
|
||||
self.override_config("evacuate_all_instances",
|
||||
True, "host_failure")
|
||||
_mock_novaclient.return_value = self.fake_client
|
||||
|
||||
# create ha_enabled test data
|
||||
self.fake_client.servers.create(id="1", host=self.instance_host,
|
||||
vm_state='error',
|
||||
ha_enabled=True)
|
||||
self.fake_client.servers.create(id="2", host=self.instance_host,
|
||||
vm_state='active',
|
||||
ha_enabled=True)
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
instance_list = self._test_instance_list(1)
|
||||
|
||||
# execute EvacuateInstancesTask
|
||||
self._evacuate_instances(instance_list, mock_enable_disable)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_ignore_error_instances_raise_skip_host_recovery(
|
||||
self, _mock_novaclient, mock_unlock, mock_lock,
|
||||
mock_enable_disable):
|
||||
self.override_config("ignore_instances_in_error_state",
|
||||
True, "host_failure")
|
||||
self.override_config("evacuate_all_instances",
|
||||
False, "host_failure")
|
||||
_mock_novaclient.return_value = self.fake_client
|
||||
|
||||
# create ha_enabled test data
|
||||
self.fake_client.servers.create(id="1", host=self.instance_host,
|
||||
vm_state='error',
|
||||
ha_enabled=True)
|
||||
|
||||
# execute PrepareHAEnabledInstancesTask
|
||||
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
|
||||
self.assertRaises(exception.SkipHostRecoveryException, task.execute,
|
||||
self.ctxt, self.instance_host)
|
||||
|
||||
@mock.patch('masakari.compute.nova.novaclient')
|
||||
def test_host_failure_flow_all_instances_active_resized_instance(
|
||||
self, _mock_novaclient, mock_unlock, mock_lock,
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
Operators can decide whether error instances should be allowed for
|
||||
evacuation along with other instances from a failed source compute node
|
||||
or not. Added a new config option ``ignore_instances_in_error_state`` to
|
||||
achieve this. When set to True, masakari will skip the recovery of error
|
||||
instances otherwise it will evacuate error instances as well from a failed
|
||||
source compute node.
|
||||
|
||||
To use this feature, following config option need to be set under
|
||||
``host_failure`` section in 'masakari.conf' file::
|
||||
|
||||
[host_failure]
|
||||
ignore_instances_in_error_state = False
|
||||
|
||||
The default value for this config option is set to False.
|
Loading…
Reference in New Issue