Repeated check to determine host status

The original test is adapted because the code no longer
overwrites the same status.

Change-Id: Ic77f932f56974a66a092b15b0d211efd73b9fc9c
Implements: bp retry-check-when-host-failure
Co-Authored-By: Radosław Piliszek <radoslaw.piliszek@gmail.com>
suzhengwei 2020-11-06 15:17:20 +08:00 committed by Radosław Piliszek
parent be42d99854
commit 987584a1c6
4 changed files with 125 additions and 13 deletions
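For orientation, the sampling logic this change introduces can be sketched standalone as below. This is a minimal illustration only, not part of the commit: the host name 'compute-1' and the module-level helper names are made up, while deque, monitoring_samples, '_being_collected' and '_uncertain' mirror the diff that follows.

    from collections import deque

    MONITORING_SAMPLES = 3  # stands in for CONF.host.monitoring_samples

    # One bounded history per host; deque(maxlen=N) keeps only the N newest probes.
    monitoring_data = {}

    def update_monitoring_data(hostname, status):
        history = monitoring_data.setdefault(
            hostname, deque([], maxlen=MONITORING_SAMPLES))
        history.append(status)

    def get_stabilised_host_status(hostname):
        history = monitoring_data[hostname]
        if len(history) < MONITORING_SAMPLES:
            return '_being_collected'   # not enough probes collected yet
        if history.count(history[0]) == len(history):
            return history[0]           # all N consecutive probes agree
        return '_uncertain'             # mixed readings, do not notify

    for probe in ['online', 'offline', 'offline', 'offline']:
        update_monitoring_data('compute-1', probe)
        print(get_stabilised_host_status('compute-1'))
    # prints: _being_collected, _being_collected, _uncertain, offline

Only when the bounded history holds monitoring_samples identical consecutive readings does the stabilised status become actionable; mixed readings yield '_uncertain' and the stored host status is left untouched.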

View File

@@ -20,6 +20,14 @@ monitor_host_opts = [
     cfg.IntOpt('monitoring_interval',
                default=60,
                help='Monitoring interval(in seconds) of node status.'),
+    cfg.IntOpt('monitoring_samples',
+               default=1,
+               help='''
+Monitoring probes to collect before making the decision to send Masakari
+notification about the node status. If and only if ``monitoring_samples``
+consecutive reports have the same status, will the Masakari notification
+be sent.
+'''),
     cfg.IntOpt('api_retry_max',
                default=12,
                help='Number of retries for send a notification in'

View File

@@ -14,6 +14,7 @@
 import socket
+from collections import deque
 
 import eventlet
 from oslo_log import log as oslo_logging
 from oslo_utils import timeutils
@@ -56,6 +57,27 @@ class HandleHost(driver.DriverBase):
         self.crmmon_xml_parser = parse_crmmon_xml.ParseCrmMonXml()
         self.status_holder = hold_host_status.HostHoldStatus()
         self.notifier = masakari.SendNotification()
+        self.monitoring_data = {}
+
+    def _update_monitoring_data(self, hostname, status):
+        health_history = self.monitoring_data.setdefault(
+            hostname, deque([], maxlen=CONF.host.monitoring_samples))
+        health_history.append(status)
+
+    def get_stabilised_host_status(self, hostname):
+        health_history = self.monitoring_data.get(hostname)
+
+        if len(health_history) < CONF.host.monitoring_samples:
+            LOG.debug("Not enough monitoring data for host %s.", hostname)
+            return '_being_collected'
+
+        stabilised_status = health_history[0]
+        # If and only if the sequence of host status is consistently the same,
+        # will it return that status.
+        if len(health_history) == health_history.count(stabilised_status):
+            return stabilised_status
+        else:
+            return '_uncertain'
 
     def _check_pacemaker_services(self, target_service):
         try:
@@ -287,8 +309,8 @@ class HandleHost(driver.DriverBase):
             if hostname == self.my_hostname:
                 continue
 
-            # Get current status and old status.
             current_status = node_state_tag.get('crmd')
+            self._update_monitoring_data(hostname, current_status)
             old_status = self.status_holder.get_host_status(hostname)
 
             # If old_status is None, This is first get of host status.
@@ -300,21 +322,28 @@ class HandleHost(driver.DriverBase):
                 self.status_holder.set_host_status(node_state_tag)
                 continue
 
+            stabilised_status = self.get_stabilised_host_status(hostname)
+
             # Output host status.
-            msg = ("'%s' is '%s'.") % (hostname, current_status)
+            msg = ("'%s' is '%s' (current: '%s').") % (hostname,
+                                                       stabilised_status,
+                                                       current_status)
             LOG.info("%s", msg)
 
-            # If host status changed, send a notification.
-            if current_status != old_status:
-                if current_status != 'online' and current_status != 'offline':
-                    # If current_status is not 'online' or 'offline',
+            if stabilised_status == '_being_collected':
+                continue
+
+            # If host stabilised status changed, send a notification.
+            if stabilised_status != old_status:
+                if stabilised_status not in ['online', 'offline']:
+                    # If stabilised_status is not 'online' or 'offline',
                     # hostmonitor doesn't send a notification.
                     msg = ("Since host status is '%s',"
                            " hostmonitor doesn't send a notification.") \
-                        % current_status
+                        % stabilised_status
                     LOG.info("%s", msg)
                 else:
-                    event = self._make_event(hostname, current_status)
+                    event = self._make_event(hostname, stabilised_status)
 
                     # Send a notification.
                     self.notifier.send_notification(
@@ -322,8 +351,9 @@ class HandleHost(driver.DriverBase):
                         CONF.host.api_retry_interval,
                         event)
 
-            # Update host status.
-            self.status_holder.set_host_status(node_state_tag)
+            if stabilised_status != '_uncertain':
+                # Update host status.
+                self.status_holder.set_host_status(node_state_tag)
 
     def _check_host_status_by_crm_mon(self):
         crmmon_xml = self._get_crmmon_xml()

View File

@@ -17,6 +17,7 @@ import testtools
 from unittest import mock
 from xml.etree import ElementTree
+from collections import deque
 
 import eventlet
 from oslo_utils import timeutils
@@ -629,9 +630,7 @@ class TestHandleHost(testtools.TestCase):
                                  mock.call(node3),
                                  mock.call(node4),
                                  mock.call(node5)]
-        calls_set_host_status = [mock.call(node_state_node2),
-                                 mock.call(node_state_node3),
-                                 mock.call(node_state_node4),
+        calls_set_host_status = [mock.call(node_state_node4),
                                  mock.call(node_state_node5)]
         mock_get_host_status.assert_has_calls(calls_get_host_status)
         mock_set_host_status.assert_has_calls(calls_set_host_status)
@@ -639,6 +638,64 @@ class TestHandleHost(testtools.TestCase):
         mock_send_notification.assert_called_once_with(
             CONF.host.api_retry_max, CONF.host.api_retry_interval, test_event)
 
+    @mock.patch.object(masakari.SendNotification, 'send_notification')
+    @mock.patch.object(handle_host.HandleHost, '_make_event')
+    @mock.patch.object(hold_host_status.HostHoldStatus, 'set_host_status')
+    @mock.patch.object(hold_host_status.HostHoldStatus, 'get_host_status')
+    @mock.patch.object(socket, 'gethostname')
+    def test_check_if_status_changed_with_3_samples(
+            self, mock_gethostname, mock_get_host_status, mock_set_host_status,
+            mock_make_event, mock_send_notification):
+        mock_gethostname.return_value = 'node1'
+        mock_get_host_status.side_effect = \
+            [None, 'online', 'online', 'online']
+        mock_set_host_status.return_value = None
+        test_event = {'notification': 'test'}
+        mock_make_event.return_value = test_event
+
+        status_tag = ElementTree.fromstring(STATUS_TAG_XML)
+        node_state_tag_list = list(status_tag)
+
+        CONF.host.monitoring_samples = 3
+        obj = handle_host.HandleHost()
+        obj.monitoring_data = {
+            "node1": deque(['online', 'online', 'online'], maxlen=3),
+            "node2": deque(['offline', 'online', 'online'], maxlen=3),
+            "node3": deque(['offline'], maxlen=3),
+            "node4": deque(['online', 'offline', 'offline'], maxlen=3),
+            "node5": deque(['online', 'online', 'online'], maxlen=3),
+        }
+        obj._check_if_status_changed(node_state_tag_list)
+
+        self.assertEqual(deque(['online', 'online', 'online'], maxlen=3),
+                         obj.monitoring_data.get('node2'))
+        self.assertEqual('online', obj.get_stabilised_host_status('node2'))
+        self.assertIn(mock.call(node_state_tag_list[1]),
+                      mock_set_host_status.mock_calls)
+
+        self.assertEqual(deque(['offline', 'online'], maxlen=3),
+                         obj.monitoring_data.get('node3'))
+        self.assertEqual('_being_collected',
+                         obj.get_stabilised_host_status('node3'))
+        self.assertNotIn(mock.call(node_state_tag_list[2]),
+                         mock_set_host_status.mock_calls)
+
+        self.assertEqual(deque(['offline', 'offline', 'offline'], maxlen=3),
+                         obj.monitoring_data.get('node4'))
+        self.assertEqual('offline', obj.get_stabilised_host_status('node4'))
+        self.assertIn(mock.call(node_state_tag_list[3]),
+                      mock_set_host_status.mock_calls)
+
+        self.assertEqual(deque(['online', 'online', 'other'], maxlen=3),
+                         obj.monitoring_data.get('node5'))
+        self.assertEqual('_uncertain', obj.get_stabilised_host_status('node5'))
+        self.assertNotIn(mock.call(node_state_tag_list[4]),
+                         mock_set_host_status.mock_calls)
+
+        mock_make_event.assert_called_once_with("node4", 'offline')
+        mock_send_notification.assert_called_once_with(
+            CONF.host.api_retry_max, CONF.host.api_retry_interval, test_event)
+
     @mock.patch.object(handle_host.HandleHost, '_check_if_status_changed')
     @mock.patch.object(parse_crmmon_xml.ParseCrmMonXml,
                        'get_node_state_tag_list')

View File

@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    Support for repeated check of node status in hostmonitor.
+
+    Repeated check is more reliable than single check to determine host
+    status, especially when there is network instability in play.
+
+    With this feature, the following config option can be set. ::
+
+      [host]
+      monitoring_samples = 3
+
+    The above means 3 checks will be done before the node status is decided.
+    The default value is 1 which is backwards compatible.
+
+    `Blueprint retry-check-when-host-failure <https://blueprints.launchpad.net/masakari-monitors/+spec/retry-check-when-host-failure>`__
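A rough operator-facing note, using assumed example values rather than anything taken from the change itself: because a notification is only sent once monitoring_samples consecutive probes agree, the effective detection delay scales with the probe interval.

    # Illustrative arithmetic only. monitoring_interval defaults to 60 seconds
    # (see the [host] options above); monitoring_samples = 3 follows the
    # release-note example.
    monitoring_interval = 60
    monitoring_samples = 3

    # Worst case, a host that fails just after a probe is reported roughly
    # monitoring_samples * monitoring_interval seconds later.
    print(monitoring_samples * monitoring_interval)  # 180 seconds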