Make 'error' instances recovery configurable

Currently on master error instances are evacuated from failed
compute host and stopped at the destination host after evacuation.
Some operators may not want to evacuate error instances in some
cases like below:
If user is running 1ACT/n SBY application on instances, launching
error instances will cause unexpected effect.

This patch adds a new config option 'ignore_instances_in_error_state'
under [host_failure] section which makes the recovery of error
instances configurable. If this config option set to True, masakari
will skip the recovery of error instances otherwise it will evacuate
error instances from a failed source compute node along with other
instances. The default value for this config option is set to False.

Change-Id: I24f8282357f28544fd1b56f270da22c7329a9f3d
This commit is contained in:
Dinesh Bhor 2017-08-14 18:16:32 +05:30
parent 778cac30cd
commit a67e0c5038
4 changed files with 124 additions and 27 deletions

View File

@ -39,6 +39,15 @@ instances which contain 'HA_Enabled=True' metadata key, and then it will
evacuate the remaining ones. When set to False, it will evacuate only those
instances which contain 'HA_Enabled=True' metadata key."""),
cfg.BoolOpt('ignore_instances_in_error_state',
default=False,
help="""
Operators can decide whether error instances should be allowed for evacuation
from a failed source compute node or not. When set to True, it will ignore
error instances from evacuation from a failed source compute node. When set to
False, it will evacuate error instances along with other instances from a
failed source compute node."""),
cfg.BoolOpt("add_reserved_host_to_aggregate",
default=False,
help="""

View File

@ -70,17 +70,35 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
self.novaclient = novaclient
def execute(self, context, host_name):
def _filter_instances(instance_list):
ha_enabled_instances = []
non_ha_enabled_instances = []
for instance in instance_list:
is_instance_ha_enabled = strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False))
if CONF.host_failure.ignore_instances_in_error_state and (
getattr(instance, "OS-EXT-STS:vm_state") == "error"):
if is_instance_ha_enabled:
msg = ("Ignoring recovery of HA_Enabled instance "
"'%(instance_id)s' as it is in 'error' state.")
LOG.info(msg, {'instance_id': instance.id})
continue
if is_instance_ha_enabled:
ha_enabled_instances.append(instance)
else:
non_ha_enabled_instances.append(instance)
if CONF.host_failure.evacuate_all_instances:
ha_enabled_instances.extend(non_ha_enabled_instances)
return ha_enabled_instances
instance_list = self.novaclient.get_servers(context, host_name)
if CONF.host_failure.evacuate_all_instances:
instance_list = sorted(
instance_list, key=lambda k: strutils.bool_from_string(
k.metadata.get('HA_Enabled', False)), reverse=True)
else:
instance_list = (
[instance for instance in instance_list if
strutils.bool_from_string(instance.metadata.get('HA_Enabled',
False))])
instance_list = _filter_instances(instance_list)
if not instance_list:
msg = _('No instances to evacuate on host: %s.') % host_name
LOG.info(msg)

View File

@ -53,15 +53,21 @@ class HostFailureTestCase(test.TestCase):
self.novaclient = nova.API()
self.fake_client = fakes.FakeNovaClient()
def _verify_instance_evacuated(self):
for server in self.novaclient.get_servers(self.ctxt,
self.instance_host):
def _verify_instance_evacuated(self, old_instance_list):
for server in old_instance_list:
instance = self.novaclient.get_server(self.ctxt, server.id)
self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'),
['active', 'stopped', 'error'])
self.assertNotEqual(self.instance_host,
getattr(instance,
'OS-EXT-SRV-ATTR:hypervisor_hostname'))
if CONF.host_failure.ignore_instances_in_error_state and getattr(
server, 'OS-EXT-STS:vm_state') == 'error':
self.assertEqual(
self.instance_host, getattr(
instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
else:
self.assertNotEqual(
self.instance_host, getattr(
instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
def _test_disable_compute_service(self, mock_enable_disable):
task = host_failure.DisableComputeServiceTask(self.novaclient)
@ -70,24 +76,27 @@ class HostFailureTestCase(test.TestCase):
mock_enable_disable.assert_called_once_with(
self.ctxt, self.instance_host)
def _test_instance_list(self):
def _test_instance_list(self, instances_evacuation_count):
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
instance_list = task.execute(
self.ctxt, self.instance_host)
evacuate_all_instances = CONF.host_failure.evacuate_all_instances
instance_list = task.execute(self.ctxt, self.instance_host)
if evacuate_all_instances:
self.assertEqual(len(self.fake_client.servers.list()),
len(instance_list['instance_list']))
else:
for instance in instance_list['instance_list']:
for instance in instance_list['instance_list']:
if CONF.host_failure.ignore_instances_in_error_state:
self.assertNotEqual("error",
getattr(instance, "OS-EXT-STS:vm_state"))
if not CONF.host_failure.evacuate_all_instances:
self.assertTrue(instance.metadata.get('HA_Enabled', False))
self.assertEqual(instances_evacuation_count,
len(instance_list['instance_list']))
return instance_list
def _evacuate_instances(self, instance_list, mock_enable_disable,
reserved_host=None):
task = host_failure.EvacuateInstancesTask(self.novaclient)
old_instance_list = copy.deepcopy(instance_list['instance_list'])
if reserved_host:
task.execute(self.ctxt, self.instance_host,
instance_list['instance_list'],
@ -99,7 +108,7 @@ class HostFailureTestCase(test.TestCase):
self.ctxt, self.instance_host, instance_list['instance_list'])
# make sure instance is active and has different host
self._verify_instance_evacuated()
self._verify_instance_evacuated(old_instance_list)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_for_auto_recovery(
@ -118,7 +127,7 @@ class HostFailureTestCase(test.TestCase):
self._test_disable_compute_service(mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list()
instance_list = self._test_instance_list(2)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@ -146,7 +155,7 @@ class HostFailureTestCase(test.TestCase):
self._test_disable_compute_service(mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list()
instance_list = self._test_instance_list(2)
# execute EvacuateInstancesTask
with mock.patch.object(host_obj.Host, "save") as mock_save:
@ -182,6 +191,50 @@ class HostFailureTestCase(test.TestCase):
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_ignore_error_instances(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
self.override_config("ignore_instances_in_error_state",
True, "host_failure")
self.override_config("evacuate_all_instances",
True, "host_failure")
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state='error',
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state='active',
ha_enabled=True)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list(1)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_ignore_error_instances_raise_skip_host_recovery(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
self.override_config("ignore_instances_in_error_state",
True, "host_failure")
self.override_config("evacuate_all_instances",
False, "host_failure")
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state='error',
ha_enabled=True)
# execute PrepareHAEnabledInstancesTask
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
self.assertRaises(exception.SkipHostRecoveryException, task.execute,
self.ctxt, self.instance_host)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_all_instances_active_resized_instance(
self, _mock_novaclient, mock_unlock, mock_lock,

View File

@ -0,0 +1,17 @@
---
features:
- |
Operators can decide whether error instances should be allowed for
evacuation along with other instances from a failed source compute node
or not. Added a new config option ``ignore_instances_in_error_state`` to
achieve this. When set to True, masakari will skip the recovery of error
instances otherwise it will evacuate error instances as well from a failed
source compute node.
To use this feature, following config option need to be set under
``host_failure`` section in 'masakari.conf' file::
[host_failure]
ignore_instances_in_error_state = False
The default value for this config option is set to False.