Merge "Make 'error' instances recovery configurable"

This commit is contained in:
Jenkins 2017-08-17 06:08:52 +00:00 committed by Gerrit Code Review
commit 93556d3116
4 changed files with 124 additions and 27 deletions

View File

@ -39,6 +39,15 @@ instances which contain 'HA_Enabled=True' metadata key, and then it will
evacuate the remaining ones. When set to False, it will evacuate only those
instances which contain 'HA_Enabled=True' metadata key."""),
cfg.BoolOpt('ignore_instances_in_error_state',
default=False,
help="""
Operators can decide whether error instances should be allowed for evacuation
from a failed source compute node or not. When set to True, it will ignore
error instances from evacuation from a failed source compute node. When set to
False, it will evacuate error instances along with other instances from a
failed source compute node."""),
cfg.BoolOpt("add_reserved_host_to_aggregate",
default=False,
help="""

View File

@ -70,17 +70,35 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
self.novaclient = novaclient
def execute(self, context, host_name):
def _filter_instances(instance_list):
ha_enabled_instances = []
non_ha_enabled_instances = []
for instance in instance_list:
is_instance_ha_enabled = strutils.bool_from_string(
instance.metadata.get('HA_Enabled', False))
if CONF.host_failure.ignore_instances_in_error_state and (
getattr(instance, "OS-EXT-STS:vm_state") == "error"):
if is_instance_ha_enabled:
msg = ("Ignoring recovery of HA_Enabled instance "
"'%(instance_id)s' as it is in 'error' state.")
LOG.info(msg, {'instance_id': instance.id})
continue
if is_instance_ha_enabled:
ha_enabled_instances.append(instance)
else:
non_ha_enabled_instances.append(instance)
if CONF.host_failure.evacuate_all_instances:
ha_enabled_instances.extend(non_ha_enabled_instances)
return ha_enabled_instances
instance_list = self.novaclient.get_servers(context, host_name)
if CONF.host_failure.evacuate_all_instances:
instance_list = sorted(
instance_list, key=lambda k: strutils.bool_from_string(
k.metadata.get('HA_Enabled', False)), reverse=True)
else:
instance_list = (
[instance for instance in instance_list if
strutils.bool_from_string(instance.metadata.get('HA_Enabled',
False))])
instance_list = _filter_instances(instance_list)
if not instance_list:
msg = _('No instances to evacuate on host: %s.') % host_name
LOG.info(msg)

View File

@ -53,15 +53,21 @@ class HostFailureTestCase(test.TestCase):
self.novaclient = nova.API()
self.fake_client = fakes.FakeNovaClient()
def _verify_instance_evacuated(self):
for server in self.novaclient.get_servers(self.ctxt,
self.instance_host):
def _verify_instance_evacuated(self, old_instance_list):
for server in old_instance_list:
instance = self.novaclient.get_server(self.ctxt, server.id)
self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'),
['active', 'stopped', 'error'])
self.assertNotEqual(self.instance_host,
getattr(instance,
'OS-EXT-SRV-ATTR:hypervisor_hostname'))
if CONF.host_failure.ignore_instances_in_error_state and getattr(
server, 'OS-EXT-STS:vm_state') == 'error':
self.assertEqual(
self.instance_host, getattr(
instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
else:
self.assertNotEqual(
self.instance_host, getattr(
instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
def _test_disable_compute_service(self, mock_enable_disable):
task = host_failure.DisableComputeServiceTask(self.novaclient)
@ -70,24 +76,27 @@ class HostFailureTestCase(test.TestCase):
mock_enable_disable.assert_called_once_with(
self.ctxt, self.instance_host)
def _test_instance_list(self):
def _test_instance_list(self, instances_evacuation_count):
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
instance_list = task.execute(
self.ctxt, self.instance_host)
evacuate_all_instances = CONF.host_failure.evacuate_all_instances
instance_list = task.execute(self.ctxt, self.instance_host)
if evacuate_all_instances:
self.assertEqual(len(self.fake_client.servers.list()),
len(instance_list['instance_list']))
else:
for instance in instance_list['instance_list']:
for instance in instance_list['instance_list']:
if CONF.host_failure.ignore_instances_in_error_state:
self.assertNotEqual("error",
getattr(instance, "OS-EXT-STS:vm_state"))
if not CONF.host_failure.evacuate_all_instances:
self.assertTrue(instance.metadata.get('HA_Enabled', False))
self.assertEqual(instances_evacuation_count,
len(instance_list['instance_list']))
return instance_list
def _evacuate_instances(self, instance_list, mock_enable_disable,
reserved_host=None):
task = host_failure.EvacuateInstancesTask(self.novaclient)
old_instance_list = copy.deepcopy(instance_list['instance_list'])
if reserved_host:
task.execute(self.ctxt, self.instance_host,
instance_list['instance_list'],
@ -99,7 +108,7 @@ class HostFailureTestCase(test.TestCase):
self.ctxt, self.instance_host, instance_list['instance_list'])
# make sure instance is active and has different host
self._verify_instance_evacuated()
self._verify_instance_evacuated(old_instance_list)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_for_auto_recovery(
@ -118,7 +127,7 @@ class HostFailureTestCase(test.TestCase):
self._test_disable_compute_service(mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list()
instance_list = self._test_instance_list(2)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@ -146,7 +155,7 @@ class HostFailureTestCase(test.TestCase):
self._test_disable_compute_service(mock_enable_disable)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list()
instance_list = self._test_instance_list(2)
# execute EvacuateInstancesTask
with mock.patch.object(host_obj.Host, "save") as mock_save:
@ -182,6 +191,50 @@ class HostFailureTestCase(test.TestCase):
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_ignore_error_instances(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
self.override_config("ignore_instances_in_error_state",
True, "host_failure")
self.override_config("evacuate_all_instances",
True, "host_failure")
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state='error',
ha_enabled=True)
self.fake_client.servers.create(id="2", host=self.instance_host,
vm_state='active',
ha_enabled=True)
# execute PrepareHAEnabledInstancesTask
instance_list = self._test_instance_list(1)
# execute EvacuateInstancesTask
self._evacuate_instances(instance_list, mock_enable_disable)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_ignore_error_instances_raise_skip_host_recovery(
self, _mock_novaclient, mock_unlock, mock_lock,
mock_enable_disable):
self.override_config("ignore_instances_in_error_state",
True, "host_failure")
self.override_config("evacuate_all_instances",
False, "host_failure")
_mock_novaclient.return_value = self.fake_client
# create ha_enabled test data
self.fake_client.servers.create(id="1", host=self.instance_host,
vm_state='error',
ha_enabled=True)
# execute PrepareHAEnabledInstancesTask
task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
self.assertRaises(exception.SkipHostRecoveryException, task.execute,
self.ctxt, self.instance_host)
@mock.patch('masakari.compute.nova.novaclient')
def test_host_failure_flow_all_instances_active_resized_instance(
self, _mock_novaclient, mock_unlock, mock_lock,

View File

@ -0,0 +1,17 @@
---
features:
- |
Operators can decide whether error instances should be allowed for
evacuation along with other instances from a failed source compute node
or not. Added a new config option ``ignore_instances_in_error_state`` to
achieve this. When set to True, masakari will skip the recovery of error
instances otherwise it will evacuate error instances as well from a failed
source compute node.
To use this feature, following config option need to be set under
``host_failure`` section in 'masakari.conf' file::
[host_failure]
ignore_instances_in_error_state = False
The default value for this config option is set to False.