Repeated check to determine host status

The original test is adapted because the code no longer
overwrites the same status.

Change-Id: Ic77f932f56974a66a092b15b0d211efd73b9fc9c
Implements: bp retry-check-when-host-failure
Co-Authored-By: Radosław Piliszek <radoslaw.piliszek@gmail.com>
suzhengwei 2020-11-06 15:17:20 +08:00 committed by Radosław Piliszek
parent be42d99854
commit 987584a1c6
4 changed files with 125 additions and 13 deletions
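For orientation, the sampling logic this change introduces can be sketched standalone as below. This is a minimal illustration only, not part of the commit: the host name 'compute-1' and the module-level helper names are made up, while deque, monitoring_samples, '_being_collected' and '_uncertain' mirror the diff that follows.

    from collections import deque

    MONITORING_SAMPLES = 3  # stands in for CONF.host.monitoring_samples

    # One bounded history per host; deque(maxlen=N) keeps only the N newest probes.
    monitoring_data = {}

    def update_monitoring_data(hostname, status):
        history = monitoring_data.setdefault(
            hostname, deque([], maxlen=MONITORING_SAMPLES))
        history.append(status)

    def get_stabilised_host_status(hostname):
        history = monitoring_data[hostname]
        if len(history) < MONITORING_SAMPLES:
            return '_being_collected'   # not enough probes collected yet
        if history.count(history[0]) == len(history):
            return history[0]           # all N consecutive probes agree
        return '_uncertain'             # mixed readings, do not notify

    for probe in ['online', 'offline', 'offline', 'offline']:
        update_monitoring_data('compute-1', probe)
        print(get_stabilised_host_status('compute-1'))
    # prints: _being_collected, _being_collected, _uncertain, offline

Only when the bounded history holds monitoring_samples identical consecutive readings does the stabilised status become actionable; mixed readings yield '_uncertain' and the stored host status is left untouched.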

View File

@@ -20,6 +20,14 @@ monitor_host_opts = [
     cfg.IntOpt('monitoring_interval',
                default=60,
                help='Monitoring interval(in seconds) of node status.'),
+    cfg.IntOpt('monitoring_samples',
+               default=1,
+               help='''
+Monitoring probes to collect before making the decision to send Masakari
+notification about the node status. If and only if ``monitoring_samples``
+consecutive reports have the same status, will the Masakari notification
+be sent.
+'''),
     cfg.IntOpt('api_retry_max',
                default=12,
                help='Number of retries for send a notification in'

View File

@@ -14,6 +14,7 @@
 import socket
+from collections import deque
 
 import eventlet
 from oslo_log import log as oslo_logging
 from oslo_utils import timeutils
@@ -56,6 +57,27 @@ class HandleHost(driver.DriverBase):
         self.crmmon_xml_parser = parse_crmmon_xml.ParseCrmMonXml()
         self.status_holder = hold_host_status.HostHoldStatus()
         self.notifier = masakari.SendNotification()
+        self.monitoring_data = {}
+
+    def _update_monitoring_data(self, hostname, status):
+        health_history = self.monitoring_data.setdefault(
+            hostname, deque([], maxlen=CONF.host.monitoring_samples))
+        health_history.append(status)
+
+    def get_stabilised_host_status(self, hostname):
+        health_history = self.monitoring_data.get(hostname)
+
+        if len(health_history) < CONF.host.monitoring_samples:
+            LOG.debug("Not enough monitoring data for host %s.", hostname)
+            return '_being_collected'
+
+        stabilised_status = health_history[0]
+        # If and only if the sequence of host status is consistently the same,
+        # will it return that status.
+        if len(health_history) == health_history.count(stabilised_status):
+            return stabilised_status
+        else:
+            return '_uncertain'
 
     def _check_pacemaker_services(self, target_service):
         try:
@@ -287,8 +309,8 @@ class HandleHost(driver.DriverBase):
             if hostname == self.my_hostname:
                 continue
 
-            # Get current status and old status.
             current_status = node_state_tag.get('crmd')
+            self._update_monitoring_data(hostname, current_status)
             old_status = self.status_holder.get_host_status(hostname)
 
             # If old_status is None, This is first get of host status.
@@ -300,21 +322,28 @@ class HandleHost(driver.DriverBase):
                 self.status_holder.set_host_status(node_state_tag)
                 continue
 
+            stabilised_status = self.get_stabilised_host_status(hostname)
+
             # Output host status.
-            msg = ("'%s' is '%s'.") % (hostname, current_status)
+            msg = ("'%s' is '%s' (current: '%s').") % (hostname,
+                                                       stabilised_status,
+                                                       current_status)
             LOG.info("%s", msg)
 
-            # If host status changed, send a notification.
-            if current_status != old_status:
-                if current_status != 'online' and current_status != 'offline':
-                    # If current_status is not 'online' or 'offline',
+            if stabilised_status == '_being_collected':
+                continue
+
+            # If host stabilised status changed, send a notification.
+            if stabilised_status != old_status:
+                if stabilised_status not in ['online', 'offline']:
+                    # If stabilised_status is not 'online' or 'offline',
                     # hostmonitor doesn't send a notification.
                     msg = ("Since host status is '%s',"
                            " hostmonitor doesn't send a notification.") \
-                        % current_status
+                        % stabilised_status
                     LOG.info("%s", msg)
                 else:
-                    event = self._make_event(hostname, current_status)
+                    event = self._make_event(hostname, stabilised_status)
 
                     # Send a notification.
                     self.notifier.send_notification(
@@ -322,8 +351,9 @@ class HandleHost(driver.DriverBase):
                         CONF.host.api_retry_interval,
                         event)
 
-            # Update host status.
-            self.status_holder.set_host_status(node_state_tag)
+            if stabilised_status != '_uncertain':
+                # Update host status.
+                self.status_holder.set_host_status(node_state_tag)
 
     def _check_host_status_by_crm_mon(self):
         crmmon_xml = self._get_crmmon_xml()

View File

@@ -17,6 +17,7 @@ import testtools
 from unittest import mock
 from xml.etree import ElementTree
+from collections import deque
 
 import eventlet
 from oslo_utils import timeutils
@@ -629,9 +630,7 @@ class TestHandleHost(testtools.TestCase):
                                  mock.call(node3),
                                  mock.call(node4),
                                  mock.call(node5)]
-        calls_set_host_status = [mock.call(node_state_node2),
-                                 mock.call(node_state_node3),
-                                 mock.call(node_state_node4),
+        calls_set_host_status = [mock.call(node_state_node4),
                                  mock.call(node_state_node5)]
         mock_get_host_status.assert_has_calls(calls_get_host_status)
         mock_set_host_status.assert_has_calls(calls_set_host_status)
@@ -639,6 +638,64 @@ class TestHandleHost(testtools.TestCase):
         mock_send_notification.assert_called_once_with(
             CONF.host.api_retry_max, CONF.host.api_retry_interval, test_event)
 
+    @mock.patch.object(masakari.SendNotification, 'send_notification')
+    @mock.patch.object(handle_host.HandleHost, '_make_event')
+    @mock.patch.object(hold_host_status.HostHoldStatus, 'set_host_status')
+    @mock.patch.object(hold_host_status.HostHoldStatus, 'get_host_status')
+    @mock.patch.object(socket, 'gethostname')
+    def test_check_if_status_changed_with_3_samples(
+            self, mock_gethostname, mock_get_host_status, mock_set_host_status,
+            mock_make_event, mock_send_notification):
+        mock_gethostname.return_value = 'node1'
+        mock_get_host_status.side_effect = \
+            [None, 'online', 'online', 'online']
+        mock_set_host_status.return_value = None
+        test_event = {'notification': 'test'}
+        mock_make_event.return_value = test_event
+
+        status_tag = ElementTree.fromstring(STATUS_TAG_XML)
+        node_state_tag_list = list(status_tag)
+
+        CONF.host.monitoring_samples = 3
+        obj = handle_host.HandleHost()
+        obj.monitoring_data = {
+            "node1": deque(['online', 'online', 'online'], maxlen=3),
+            "node2": deque(['offline', 'online', 'online'], maxlen=3),
+            "node3": deque(['offline'], maxlen=3),
+            "node4": deque(['online', 'offline', 'offline'], maxlen=3),
+            "node5": deque(['online', 'online', 'online'], maxlen=3),
+        }
+        obj._check_if_status_changed(node_state_tag_list)
+
+        self.assertEqual(deque(['online', 'online', 'online'], maxlen=3),
+                         obj.monitoring_data.get('node2'))
+        self.assertEqual('online', obj.get_stabilised_host_status('node2'))
+        self.assertIn(mock.call(node_state_tag_list[1]),
+                      mock_set_host_status.mock_calls)
+
+        self.assertEqual(deque(['offline', 'online'], maxlen=3),
+                         obj.monitoring_data.get('node3'))
+        self.assertEqual('_being_collected',
+                         obj.get_stabilised_host_status('node3'))
+        self.assertNotIn(mock.call(node_state_tag_list[2]),
+                         mock_set_host_status.mock_calls)
+
+        self.assertEqual(deque(['offline', 'offline', 'offline'], maxlen=3),
+                         obj.monitoring_data.get('node4'))
+        self.assertEqual('offline', obj.get_stabilised_host_status('node4'))
+        self.assertIn(mock.call(node_state_tag_list[3]),
+                      mock_set_host_status.mock_calls)
+
+        self.assertEqual(deque(['online', 'online', 'other'], maxlen=3),
+                         obj.monitoring_data.get('node5'))
+        self.assertEqual('_uncertain', obj.get_stabilised_host_status('node5'))
+        self.assertNotIn(mock.call(node_state_tag_list[4]),
+                         mock_set_host_status.mock_calls)
+
+        mock_make_event.assert_called_once_with("node4", 'offline')
+        mock_send_notification.assert_called_once_with(
+            CONF.host.api_retry_max, CONF.host.api_retry_interval, test_event)
+
     @mock.patch.object(handle_host.HandleHost, '_check_if_status_changed')
     @mock.patch.object(parse_crmmon_xml.ParseCrmMonXml,
                        'get_node_state_tag_list')

View File

@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    Support for repeated check of node status in hostmonitor.
+
+    Repeated check is more reliable than single check to determine host
+    status, especially when there is network instability in play.
+
+    With this feature, the following config option can be set. ::
+
+      [host]
+      monitoring_samples = 3
+
+    The above means 3 checks will be done before the node status is decided.
+    The default value is 1 which is backwards compatible.
+
+    `Blueprint retry-check-when-host-failure <https://blueprints.launchpad.net/masakari-monitors/+spec/retry-check-when-host-failure>`__
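A rough operator-facing note, using assumed example values rather than anything taken from the change itself: because a notification is only sent once monitoring_samples consecutive probes agree, the effective detection delay scales with the probe interval.

    # Illustrative arithmetic only. monitoring_interval defaults to 60 seconds
    # (see the [host] options above); monitoring_samples = 3 follows the
    # release-note example.
    monitoring_interval = 60
    monitoring_samples = 3

    # Worst case, a host that fails just after a probe is reported roughly
    # monitoring_samples * monitoring_interval seconds later.
    print(monitoring_samples * monitoring_interval)  # 180 seconds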