Fix hostmonitor hanging forever after certain exceptions

The hostmonitor, like other Masakari monitors, starts as an
Oslo service (based on eventlet). The main thread is supposed
to run a loop that has an internal wait mechanism (instead of
reusing periodic_tasks from oslo_service). However, the loop
could be broken, if an unexpected exception appeared, and it
never ran again but the process was still alive (due to
oslo_service not stopping). The example mentioned in the bug
report is about unavailability of the Masakari API (and/or
Keystone API) before notification sending. This exception is
not caught early because SendNotification._make_client is
called outside of the try block (unlike the actual notification
sending). The exception bubbles up and stops the main loop,
leaving a useless hostmonitor process. The user is unaware
unless they notice the logs are no longer growing.

While the general design begs for a revamp (we might get away
with that by using Consul in the first place), the easy fix is
to prevent exceptions breaking the loop completely so that the
hostmonitor can continue to work and try to regain health.
At the very least it will keep posting ERROR messages in the log
which is more likely to be spotted in comparison to lack of logs
(which is, unfortunately, less commonly considered an alerting
situation).

This change also fixes, adapts and robustifies the two relevant
unit tests.

Closes-Bug: #1930361
Co-Authored-By: Radosław Piliszek <radoslaw.piliszek@gmail.com>
Change-Id: I7e3447dcddc7998e3e3c30f4f0019d91a99c79ce
This commit is contained in:
sue 2021-06-02 16:38:05 +08:00 committed by Radosław Piliszek
parent 6a496e7871
commit e7154f3d77
3 changed files with 25 additions and 24 deletions

View File

@ -424,10 +424,9 @@ class HandleHost(driver.DriverBase):
This method monitors hosts.
"""
try:
self.running = True
while self.running:
self.running = True
while self.running:
try:
# Check whether corosync communication between hosts
# is normal.
ret = self._check_hb_line()
@ -466,9 +465,7 @@ class HandleHost(driver.DriverBase):
if status_func() != 0:
LOG.warning("hostmonitor skips monitoring hosts.")
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
except Exception as e:
LOG.exception("Exception caught: %s", e)
except Exception as e:
LOG.exception("Exception caught: %s", e)
return
eventlet.greenthread.sleep(CONF.host.monitoring_interval)

View File

@ -859,25 +859,25 @@ class TestHandleHost(testtools.TestCase):
def test_monitor_hosts(self,
mock_check_hb_line,
mock_check_pacemaker_services,
mock_check_host_status_by_cibadmin,
mock_check_host_status_by_crmadmin,
mock_check_host_status_by_cibadmin,
mock_sleep):
mock_check_hb_line.side_effect = \
[0, 1, 2, 0, Exception("Test exception.")]
mock_check_pacemaker_services.side_effect = [True, False, False]
mock_check_host_status_by_cibadmin.side_effect = [0, 1]
[0, 1, 2, 0, Exception("Test exception."), 0, KeyboardInterrupt()]
mock_check_pacemaker_services.side_effect = [True, False, False, True]
mock_check_host_status_by_crmadmin.side_effect = [0, 1]
mock_check_host_status_by_cibadmin.side_effect = [0, 1, 0]
mock_sleep.return_value = None
obj = handle_host.HandleHost()
obj.monitor_hosts()
self.assertRaises(KeyboardInterrupt, obj.monitor_hosts)
self.assertEqual(5, mock_check_hb_line.call_count)
self.assertEqual(3, mock_check_pacemaker_services.call_count)
self.assertEqual(7, mock_check_hb_line.call_count)
self.assertEqual(4, mock_check_pacemaker_services.call_count)
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
self.assertEqual(2, mock_check_host_status_by_cibadmin.call_count)
self.assertEqual(2, mock_check_host_status_by_crmadmin.call_count)
self.assertEqual(3, mock_check_host_status_by_cibadmin.call_count)
@mock.patch.object(eventlet.greenthread, 'sleep')
@mock.patch.object(handle_host.HandleHost,
@ -892,16 +892,15 @@ class TestHandleHost(testtools.TestCase):
CONF.host.restrict_to_remotes = True
mock_check_hb_line.side_effect = \
[0, Exception("Test exception.")]
[0, Exception("Test exception."), 0, KeyboardInterrupt()]
mock_check_pacemaker_services.return_value = True
mock_check_host_status_by_crm_mon.side_effect = 0
mock_check_host_status_by_crm_mon.return_value = 0
mock_sleep.return_value = None
obj = handle_host.HandleHost()
obj.monitor_hosts()
self.assertRaises(KeyboardInterrupt, obj.monitor_hosts)
self.assertEqual(1, mock_check_hb_line.call_count)
self.assertEqual(1, mock_check_pacemaker_services.call_count)
self.assertEqual(4, mock_check_hb_line.call_count)
self.assertEqual(2, mock_check_pacemaker_services.call_count)
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
self.assertEqual(1, mock_check_host_status_by_crm_mon.call_count)
mock_check_host_status_by_crm_mon.assert_called_once_with()
self.assertEqual(2, mock_check_host_status_by_crm_mon.call_count)

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Fixes hostmonitor hanging forever after certain exceptions.
`LP#1930361 <https://launchpad.net/bugs/1930361>`__