From e7154f3d77ee4c06eec603a850ec941668eb602f Mon Sep 17 00:00:00 2001 From: sue Date: Wed, 2 Jun 2021 16:38:05 +0800 Subject: [PATCH] Fix hostmonitor hanging forever after certain exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hostmonitor, like other Masakari monitors, starts as an Oslo service (based on eventlet). The main thread is supposed to run a loop that has an internal wait mechanism (instead of reusing periodic_tasks from oslo_service). However, the loop could be broken, if an unexpected exception appeared, and it never ran again but the process was still alive (due to oslo_service not stopping). The example mentioned in the bug report is about unavailability of the Masakari API (and/or Keystone API) before notification sending. This exception is not caught early because SendNotification._make_client is called outside of the try block (unlike the actual notification sending). The exception bubbles up and stops the main loop, leaving a useless hostmonitor process. The user is unaware unless they notice the logs are no longer growing. While the general design begs for a revamp (we might get away with that by using Consul in the first place), the easy fix is to prevent exceptions breaking the loop completely so that the hostmonitor can continue to work and try to regain health. At the very least it will keep posting ERROR messages in the log which is more likely to be spotted in comparison to lack of logs (which is, unfortunately, less commonly considered an alerting situation). This change also fixes, adapts and robustifies the two relevant unit tests. Closes-Bug: #1930361 Co-Authored-By: Radosław Piliszek Change-Id: I7e3447dcddc7998e3e3c30f4f0019d91a99c79ce --- .../hostmonitor/host_handler/handle_host.py | 15 ++++------ .../host_handler/test_handle_host.py | 29 +++++++++---------- .../notes/bug-1930361-fa8ce8e9781ea967.yaml | 5 ++++ 3 files changed, 25 insertions(+), 24 deletions(-) create mode 100644 releasenotes/notes/bug-1930361-fa8ce8e9781ea967.yaml diff --git a/masakarimonitors/hostmonitor/host_handler/handle_host.py b/masakarimonitors/hostmonitor/host_handler/handle_host.py index 8db4373..2bf527a 100644 --- a/masakarimonitors/hostmonitor/host_handler/handle_host.py +++ b/masakarimonitors/hostmonitor/host_handler/handle_host.py @@ -424,10 +424,9 @@ class HandleHost(driver.DriverBase): This method monitors hosts. """ - try: - self.running = True - while self.running: - + self.running = True + while self.running: + try: # Check whether corosync communication between hosts # is normal. ret = self._check_hb_line() @@ -466,9 +465,7 @@ class HandleHost(driver.DriverBase): if status_func() != 0: LOG.warning("hostmonitor skips monitoring hosts.") - eventlet.greenthread.sleep(CONF.host.monitoring_interval) + except Exception as e: + LOG.exception("Exception caught: %s", e) - except Exception as e: - LOG.exception("Exception caught: %s", e) - - return + eventlet.greenthread.sleep(CONF.host.monitoring_interval) diff --git a/masakarimonitors/tests/unit/hostmonitor/host_handler/test_handle_host.py b/masakarimonitors/tests/unit/hostmonitor/host_handler/test_handle_host.py index 5033287..845deb3 100644 --- a/masakarimonitors/tests/unit/hostmonitor/host_handler/test_handle_host.py +++ b/masakarimonitors/tests/unit/hostmonitor/host_handler/test_handle_host.py @@ -859,25 +859,25 @@ class TestHandleHost(testtools.TestCase): def test_monitor_hosts(self, mock_check_hb_line, mock_check_pacemaker_services, - mock_check_host_status_by_cibadmin, mock_check_host_status_by_crmadmin, + mock_check_host_status_by_cibadmin, mock_sleep): mock_check_hb_line.side_effect = \ - [0, 1, 2, 0, Exception("Test exception.")] - mock_check_pacemaker_services.side_effect = [True, False, False] - mock_check_host_status_by_cibadmin.side_effect = [0, 1] + [0, 1, 2, 0, Exception("Test exception."), 0, KeyboardInterrupt()] + mock_check_pacemaker_services.side_effect = [True, False, False, True] mock_check_host_status_by_crmadmin.side_effect = [0, 1] + mock_check_host_status_by_cibadmin.side_effect = [0, 1, 0] mock_sleep.return_value = None obj = handle_host.HandleHost() - obj.monitor_hosts() + self.assertRaises(KeyboardInterrupt, obj.monitor_hosts) - self.assertEqual(5, mock_check_hb_line.call_count) - self.assertEqual(3, mock_check_pacemaker_services.call_count) + self.assertEqual(7, mock_check_hb_line.call_count) + self.assertEqual(4, mock_check_pacemaker_services.call_count) mock_check_pacemaker_services.assert_called_with('pacemaker_remote') - self.assertEqual(2, mock_check_host_status_by_cibadmin.call_count) self.assertEqual(2, mock_check_host_status_by_crmadmin.call_count) + self.assertEqual(3, mock_check_host_status_by_cibadmin.call_count) @mock.patch.object(eventlet.greenthread, 'sleep') @mock.patch.object(handle_host.HandleHost, @@ -892,16 +892,15 @@ class TestHandleHost(testtools.TestCase): CONF.host.restrict_to_remotes = True mock_check_hb_line.side_effect = \ - [0, Exception("Test exception.")] + [0, Exception("Test exception."), 0, KeyboardInterrupt()] mock_check_pacemaker_services.return_value = True - mock_check_host_status_by_crm_mon.side_effect = 0 + mock_check_host_status_by_crm_mon.return_value = 0 mock_sleep.return_value = None obj = handle_host.HandleHost() - obj.monitor_hosts() + self.assertRaises(KeyboardInterrupt, obj.monitor_hosts) - self.assertEqual(1, mock_check_hb_line.call_count) - self.assertEqual(1, mock_check_pacemaker_services.call_count) + self.assertEqual(4, mock_check_hb_line.call_count) + self.assertEqual(2, mock_check_pacemaker_services.call_count) mock_check_pacemaker_services.assert_called_with('pacemaker_remote') - self.assertEqual(1, mock_check_host_status_by_crm_mon.call_count) - mock_check_host_status_by_crm_mon.assert_called_once_with() + self.assertEqual(2, mock_check_host_status_by_crm_mon.call_count) diff --git a/releasenotes/notes/bug-1930361-fa8ce8e9781ea967.yaml b/releasenotes/notes/bug-1930361-fa8ce8e9781ea967.yaml new file mode 100644 index 0000000..e081f66 --- /dev/null +++ b/releasenotes/notes/bug-1930361-fa8ce8e9781ea967.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes hostmonitor hanging forever after certain exceptions. + `LP#1930361 `__