summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDinesh Bhor <dinesh.bhor@nttdata.com>2017-08-14 18:16:32 +0530
committerDinesh Bhor <dinesh.bhor@nttdata.com>2017-08-17 10:24:52 +0530
commita67e0c50385a189006568817ff112c286a0048aa (patch)
tree9d886ba521e97f5759600939d87997c0954013ac
parent778cac30cd807ed30253f8695c269f159a509058 (diff)
Make 'error' instances recovery configurable
Currently on master, error instances are evacuated from a failed compute host and stopped at the destination host after evacuation. Some operators may not want to evacuate error instances in cases such as the following: if a user is running a 1ACT/n SBY application on instances, launching error instances will cause unexpected effects. This patch adds a new config option 'ignore_instances_in_error_state' under the [host_failure] section, which makes the recovery of error instances configurable. If this config option is set to True, masakari will skip the recovery of error instances; otherwise, it will evacuate error instances from a failed source compute node along with other instances. The default value for this config option is False. Change-Id: I24f8282357f28544fd1b56f270da22c7329a9f3d
Notes
Notes (review): Code-Review+2: Rikimaru Honjo <honjo.rikimaru@po.ntt-tx.co.jp> Workflow+1: Tushar Patil <tushar.vitthal.patil@gmail.com> Verified+2: Jenkins Submitted-by: Jenkins Submitted-at: Thu, 17 Aug 2017 06:08:52 +0000 Reviewed-on: https://review.openstack.org/493534 Project: openstack/masakari Branch: refs/heads/master
-rw-r--r--masakari/conf/engine_driver.py9
-rw-r--r--masakari/engine/drivers/taskflow/host_failure.py36
-rw-r--r--masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py89
-rw-r--r--releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml17
4 files changed, 124 insertions, 27 deletions
diff --git a/masakari/conf/engine_driver.py b/masakari/conf/engine_driver.py
index 376df76..f1fa6d0 100644
--- a/masakari/conf/engine_driver.py
+++ b/masakari/conf/engine_driver.py
@@ -39,6 +39,15 @@ instances which contain 'HA_Enabled=True' metadata key, and then it will
39evacuate the remaining ones. When set to False, it will evacuate only those 39evacuate the remaining ones. When set to False, it will evacuate only those
40instances which contain 'HA_Enabled=True' metadata key."""), 40instances which contain 'HA_Enabled=True' metadata key."""),
41 41
42 cfg.BoolOpt('ignore_instances_in_error_state',
43 default=False,
44 help="""
45Operators can decide whether error instances should be allowed for evacuation
46from a failed source compute node or not. When set to True, it will ignore
47error instances from evacuation from a failed source compute node. When set to
48False, it will evacuate error instances along with other instances from a
49failed source compute node."""),
50
42 cfg.BoolOpt("add_reserved_host_to_aggregate", 51 cfg.BoolOpt("add_reserved_host_to_aggregate",
43 default=False, 52 default=False,
44 help=""" 53 help="""
diff --git a/masakari/engine/drivers/taskflow/host_failure.py b/masakari/engine/drivers/taskflow/host_failure.py
index 9f440cc..355292b 100644
--- a/masakari/engine/drivers/taskflow/host_failure.py
+++ b/masakari/engine/drivers/taskflow/host_failure.py
@@ -70,17 +70,35 @@ class PrepareHAEnabledInstancesTask(base.MasakariTask):
70 self.novaclient = novaclient 70 self.novaclient = novaclient
71 71
72 def execute(self, context, host_name): 72 def execute(self, context, host_name):
73 def _filter_instances(instance_list):
74 ha_enabled_instances = []
75 non_ha_enabled_instances = []
76
77 for instance in instance_list:
78 is_instance_ha_enabled = strutils.bool_from_string(
79 instance.metadata.get('HA_Enabled', False))
80 if CONF.host_failure.ignore_instances_in_error_state and (
81 getattr(instance, "OS-EXT-STS:vm_state") == "error"):
82 if is_instance_ha_enabled:
83 msg = ("Ignoring recovery of HA_Enabled instance "
84 "'%(instance_id)s' as it is in 'error' state.")
85 LOG.info(msg, {'instance_id': instance.id})
86 continue
87
88 if is_instance_ha_enabled:
89 ha_enabled_instances.append(instance)
90 else:
91 non_ha_enabled_instances.append(instance)
92
93 if CONF.host_failure.evacuate_all_instances:
94 ha_enabled_instances.extend(non_ha_enabled_instances)
95
96 return ha_enabled_instances
97
73 instance_list = self.novaclient.get_servers(context, host_name) 98 instance_list = self.novaclient.get_servers(context, host_name)
74 99
75 if CONF.host_failure.evacuate_all_instances: 100 instance_list = _filter_instances(instance_list)
76 instance_list = sorted( 101
77 instance_list, key=lambda k: strutils.bool_from_string(
78 k.metadata.get('HA_Enabled', False)), reverse=True)
79 else:
80 instance_list = (
81 [instance for instance in instance_list if
82 strutils.bool_from_string(instance.metadata.get('HA_Enabled',
83 False))])
84 if not instance_list: 102 if not instance_list:
85 msg = _('No instances to evacuate on host: %s.') % host_name 103 msg = _('No instances to evacuate on host: %s.') % host_name
86 LOG.info(msg) 104 LOG.info(msg)
diff --git a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py
index 45172bc..1ee2e56 100644
--- a/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py
+++ b/masakari/tests/unit/engine/drivers/taskflow/test_host_failure_flow.py
@@ -53,15 +53,21 @@ class HostFailureTestCase(test.TestCase):
53 self.novaclient = nova.API() 53 self.novaclient = nova.API()
54 self.fake_client = fakes.FakeNovaClient() 54 self.fake_client = fakes.FakeNovaClient()
55 55
56 def _verify_instance_evacuated(self): 56 def _verify_instance_evacuated(self, old_instance_list):
57 for server in self.novaclient.get_servers(self.ctxt, 57 for server in old_instance_list:
58 self.instance_host):
59 instance = self.novaclient.get_server(self.ctxt, server.id) 58 instance = self.novaclient.get_server(self.ctxt, server.id)
60 self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'), 59 self.assertIn(getattr(instance, 'OS-EXT-STS:vm_state'),
61 ['active', 'stopped', 'error']) 60 ['active', 'stopped', 'error'])
62 self.assertNotEqual(self.instance_host, 61
63 getattr(instance, 62 if CONF.host_failure.ignore_instances_in_error_state and getattr(
64 'OS-EXT-SRV-ATTR:hypervisor_hostname')) 63 server, 'OS-EXT-STS:vm_state') == 'error':
64 self.assertEqual(
65 self.instance_host, getattr(
66 instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
67 else:
68 self.assertNotEqual(
69 self.instance_host, getattr(
70 instance, 'OS-EXT-SRV-ATTR:hypervisor_hostname'))
65 71
66 def _test_disable_compute_service(self, mock_enable_disable): 72 def _test_disable_compute_service(self, mock_enable_disable):
67 task = host_failure.DisableComputeServiceTask(self.novaclient) 73 task = host_failure.DisableComputeServiceTask(self.novaclient)
@@ -70,24 +76,27 @@ class HostFailureTestCase(test.TestCase):
70 mock_enable_disable.assert_called_once_with( 76 mock_enable_disable.assert_called_once_with(
71 self.ctxt, self.instance_host) 77 self.ctxt, self.instance_host)
72 78
73 def _test_instance_list(self): 79 def _test_instance_list(self, instances_evacuation_count):
74 task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient) 80 task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
75 instance_list = task.execute( 81 instance_list = task.execute(self.ctxt, self.instance_host)
76 self.ctxt, self.instance_host)
77 evacuate_all_instances = CONF.host_failure.evacuate_all_instances
78 82
79 if evacuate_all_instances: 83 for instance in instance_list['instance_list']:
80 self.assertEqual(len(self.fake_client.servers.list()), 84 if CONF.host_failure.ignore_instances_in_error_state:
81 len(instance_list['instance_list'])) 85 self.assertNotEqual("error",
82 else: 86 getattr(instance, "OS-EXT-STS:vm_state"))
83 for instance in instance_list['instance_list']: 87 if not CONF.host_failure.evacuate_all_instances:
84 self.assertTrue(instance.metadata.get('HA_Enabled', False)) 88 self.assertTrue(instance.metadata.get('HA_Enabled', False))
85 89
90 self.assertEqual(instances_evacuation_count,
91 len(instance_list['instance_list']))
92
86 return instance_list 93 return instance_list
87 94
88 def _evacuate_instances(self, instance_list, mock_enable_disable, 95 def _evacuate_instances(self, instance_list, mock_enable_disable,
89 reserved_host=None): 96 reserved_host=None):
90 task = host_failure.EvacuateInstancesTask(self.novaclient) 97 task = host_failure.EvacuateInstancesTask(self.novaclient)
98 old_instance_list = copy.deepcopy(instance_list['instance_list'])
99
91 if reserved_host: 100 if reserved_host:
92 task.execute(self.ctxt, self.instance_host, 101 task.execute(self.ctxt, self.instance_host,
93 instance_list['instance_list'], 102 instance_list['instance_list'],
@@ -99,7 +108,7 @@ class HostFailureTestCase(test.TestCase):
99 self.ctxt, self.instance_host, instance_list['instance_list']) 108 self.ctxt, self.instance_host, instance_list['instance_list'])
100 109
101 # make sure instance is active and has different host 110 # make sure instance is active and has different host
102 self._verify_instance_evacuated() 111 self._verify_instance_evacuated(old_instance_list)
103 112
104 @mock.patch('masakari.compute.nova.novaclient') 113 @mock.patch('masakari.compute.nova.novaclient')
105 def test_host_failure_flow_for_auto_recovery( 114 def test_host_failure_flow_for_auto_recovery(
@@ -118,7 +127,7 @@ class HostFailureTestCase(test.TestCase):
118 self._test_disable_compute_service(mock_enable_disable) 127 self._test_disable_compute_service(mock_enable_disable)
119 128
120 # execute PrepareHAEnabledInstancesTask 129 # execute PrepareHAEnabledInstancesTask
121 instance_list = self._test_instance_list() 130 instance_list = self._test_instance_list(2)
122 131
123 # execute EvacuateInstancesTask 132 # execute EvacuateInstancesTask
124 self._evacuate_instances(instance_list, mock_enable_disable) 133 self._evacuate_instances(instance_list, mock_enable_disable)
@@ -146,7 +155,7 @@ class HostFailureTestCase(test.TestCase):
146 self._test_disable_compute_service(mock_enable_disable) 155 self._test_disable_compute_service(mock_enable_disable)
147 156
148 # execute PrepareHAEnabledInstancesTask 157 # execute PrepareHAEnabledInstancesTask
149 instance_list = self._test_instance_list() 158 instance_list = self._test_instance_list(2)
150 159
151 # execute EvacuateInstancesTask 160 # execute EvacuateInstancesTask
152 with mock.patch.object(host_obj.Host, "save") as mock_save: 161 with mock.patch.object(host_obj.Host, "save") as mock_save:
@@ -183,6 +192,50 @@ class HostFailureTestCase(test.TestCase):
183 self._evacuate_instances(instance_list, mock_enable_disable) 192 self._evacuate_instances(instance_list, mock_enable_disable)
184 193
185 @mock.patch('masakari.compute.nova.novaclient') 194 @mock.patch('masakari.compute.nova.novaclient')
195 def test_host_failure_flow_ignore_error_instances(
196 self, _mock_novaclient, mock_unlock, mock_lock,
197 mock_enable_disable):
198 self.override_config("ignore_instances_in_error_state",
199 True, "host_failure")
200 self.override_config("evacuate_all_instances",
201 True, "host_failure")
202 _mock_novaclient.return_value = self.fake_client
203
204 # create ha_enabled test data
205 self.fake_client.servers.create(id="1", host=self.instance_host,
206 vm_state='error',
207 ha_enabled=True)
208 self.fake_client.servers.create(id="2", host=self.instance_host,
209 vm_state='active',
210 ha_enabled=True)
211
212 # execute PrepareHAEnabledInstancesTask
213 instance_list = self._test_instance_list(1)
214
215 # execute EvacuateInstancesTask
216 self._evacuate_instances(instance_list, mock_enable_disable)
217
218 @mock.patch('masakari.compute.nova.novaclient')
219 def test_host_failure_flow_ignore_error_instances_raise_skip_host_recovery(
220 self, _mock_novaclient, mock_unlock, mock_lock,
221 mock_enable_disable):
222 self.override_config("ignore_instances_in_error_state",
223 True, "host_failure")
224 self.override_config("evacuate_all_instances",
225 False, "host_failure")
226 _mock_novaclient.return_value = self.fake_client
227
228 # create ha_enabled test data
229 self.fake_client.servers.create(id="1", host=self.instance_host,
230 vm_state='error',
231 ha_enabled=True)
232
233 # execute PrepareHAEnabledInstancesTask
234 task = host_failure.PrepareHAEnabledInstancesTask(self.novaclient)
235 self.assertRaises(exception.SkipHostRecoveryException, task.execute,
236 self.ctxt, self.instance_host)
237
238 @mock.patch('masakari.compute.nova.novaclient')
186 def test_host_failure_flow_all_instances_active_resized_instance( 239 def test_host_failure_flow_all_instances_active_resized_instance(
187 self, _mock_novaclient, mock_unlock, mock_lock, 240 self, _mock_novaclient, mock_unlock, mock_lock,
188 mock_enable_disable): 241 mock_enable_disable):
diff --git a/releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml b/releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml
new file mode 100644
index 0000000..0c1ee97
--- /dev/null
+++ b/releasenotes/notes/add_evacuate_error_instances_conf_option-5b4d1906137395f0.yaml
@@ -0,0 +1,17 @@
1---
2features:
3 - |
4 Operators can decide whether error instances should be allowed for
5 evacuation along with other instances from a failed source compute node
6 or not. Added a new config option ``ignore_instances_in_error_state`` to
7 achieve this. When set to True, masakari will skip the recovery of error
8 instances; otherwise it will evacuate error instances as well from a failed
9 source compute node.
10
11 To use this feature, the following config option needs to be set under the
12 ``host_failure`` section in the 'masakari.conf' file::
13
14 [host_failure]
15 ignore_instances_in_error_state = False
16
17 The default value for this config option is set to False.