Set "disabled reason" for compute service.

Masakari never set a reason when it disabled a compute service.
A configurable "disabled reason" has been added for host and process failures.

Closes-Bug: 1936181
Change-Id: I998f7884195b93927773c7186d61c13670a53662
This commit is contained in:
Mitya_Eremeev 2021-07-14 12:11:01 +03:00
parent f2e830f927
commit c861437b52
7 changed files with 42 additions and 3 deletions

View File

@ -27,6 +27,11 @@ host_recovery_group = cfg.OptGroup(
title='Host failure recovery options',
help="Configuration options for host failure recovery")
# Option group for the ``process_failure`` config section; it holds the
# settings used during process failure recovery.
process_recovery_group = cfg.OptGroup(
'process_failure',
title='Process failure recovery options',
help="Configuration options for process failure recovery")
customized_recovery_flow_group = cfg.OptGroup(
'taskflow_driver_recovery_flows',
title='Customized recovery flow Options',
@ -80,6 +85,10 @@ Operators can decide whether reserved_host should be added to aggregate group
of failed compute host. When set to True, reserved host will be added to the
aggregate group of failed compute host. When set to False, the reserved_host
will not be added to the aggregate group of failed compute host."""),
cfg.StrOpt("service_disable_reason",
default="Masakari detected host failed.",
help="Compute disable reason in case Masakari detects host "
"failure."),
]
instance_failure_options = [
@ -220,14 +229,23 @@ The allowed values for this option is comma separated dictionary of object
names in between ``{`` and ``}``."""))
]
# Options for process failure recovery; register_opts() attaches this list
# to process_recovery_group.
process_failure_opts = [
# Reason string handed to novaclient.enable_disable_service() when the
# compute service is disabled after a process failure.
cfg.StrOpt("service_disable_reason",
default="Masakari detected process failed.",
help="Compute disable reason in case Masakari detects process "
"failure."),
]
def register_opts(conf):
    """Register the recovery option groups and their options on ``conf``.

    Every group is registered first; afterwards each option list is
    attached to its group, following the same group order.

    :param conf: configuration object exposing ``register_group`` and
        ``register_opts``.
    """
    # (group, options) pairs, kept in registration order.
    grouped_opts = (
        (instance_recovery_group, instance_failure_options),
        (host_recovery_group, host_failure_opts),
        (process_recovery_group, process_failure_opts),
        (customized_recovery_flow_group, taskflow_driver_recovery_flows),
        (taskflow_group, taskflow_options),
    )

    for opt_group, _ in grouped_opts:
        conf.register_group(opt_group)

    for opt_group, opts in grouped_opts:
        conf.register_opts(opts, group=opt_group)
@ -237,6 +255,7 @@ def list_opts():
return {
instance_recovery_group.name: instance_failure_options,
host_recovery_group.name: host_failure_opts,
process_recovery_group.name: process_failure_opts,
taskflow_group.name: taskflow_options
}

View File

@ -48,7 +48,8 @@ class DisableComputeServiceTask(base.MasakariTask):
def execute(self, host_name):
msg = "Disabling compute service on host: '%s'" % host_name
self.update_details(msg)
self.novaclient.enable_disable_service(self.context, host_name)
self.novaclient.enable_disable_service(self.context, host_name,
reason=CONF.host_failure.service_disable_reason)
# Sleep until nova-compute service is marked as disabled.
log_msg = ("Sleeping %(wait)s sec before starting recovery "
"thread until nova recognizes the node down.")

View File

@ -45,7 +45,8 @@ class DisableComputeNodeTask(base.MasakariTask):
if not self.novaclient.is_service_disabled(self.context, host_name,
process_name):
# disable compute node on given host
self.novaclient.enable_disable_service(self.context, host_name)
self.novaclient.enable_disable_service(self.context, host_name,
reason=CONF.process_failure.service_disable_reason)
msg = "Disabled compute service on host: '%s'" % host_name
self.update_details(msg, 1.0)
else:

View File

@ -52,6 +52,7 @@ class HostFailureTestCase(test.TestCase):
self.instance_host = "fake-host"
self.novaclient = nova.API()
self.fake_client = fakes.FakeNovaClient()
self.disabled_reason = CONF.host_failure.service_disable_reason
def _verify_instance_evacuated(self, old_instance_list):
for server in old_instance_list:
@ -86,7 +87,7 @@ class HostFailureTestCase(test.TestCase):
task.execute(self.instance_host)
mock_enable_disable.assert_called_once_with(
self.ctxt, self.instance_host)
self.ctxt, self.instance_host, reason=self.disabled_reason)
def _test_instance_list(self, instances_evacuation_count):
task = host_failure.PrepareHAEnabledInstancesTask(self.ctxt,

View File

@ -20,12 +20,15 @@ Unit Tests for process failure TaskFlow
from unittest import mock
from masakari.compute import nova
from masakari import conf
from masakari import context
from masakari.engine.drivers.taskflow import process_failure
from masakari import exception
from masakari import test
from masakari.tests.unit import fakes
CONF = conf.CONF
class ProcessFailureTestCase(test.TestCase):
@ -39,6 +42,7 @@ class ProcessFailureTestCase(test.TestCase):
# overriding 'wait_period_after_service_update' to 2 seconds
# to reduce the wait period.
self.override_config('wait_period_after_service_update', 2)
self.disabled_reason = CONF.process_failure.service_disable_reason
@mock.patch('masakari.compute.nova.novaclient')
@mock.patch('masakari.engine.drivers.taskflow.base.MasakariTask.'

View File

@ -172,6 +172,13 @@ class FakeNovaClient(object):
services.append(service)
return services
# Fake "disable with reason" call: scans self._services for the entry whose
# id equals service_id, sets its status to 'disabled' and stores ``reason``
# in its disabled_reason attribute.
# NOTE(review): indentation is lost in this view, so it is unclear whether
# the two assignments sit inside the ``if`` or after the loop. If they follow
# the loop, an unknown service_id would reach an unbound ``service`` --
# confirm against the real file.
def disable_log_reason(self, service_id, reason):
for _service in self._services:
if _service.id == service_id:
service = _service
service.status = 'disabled'
service.disabled_reason = reason
def __init__(self):
self.servers = FakeNovaClient.ServerManager()
self.services = FakeNovaClient.Services()

View File

@ -0,0 +1,6 @@
---
features:
- |
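Masakari now sets the Nova compute service "disabled reason" when it
disables the service after a host failure or a process failure.
The reason text can be customised per failure type with the
``service_disable_reason`` option in the ``[host_failure]`` and
``[process_failure]`` config sections.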