Add a jitter to heartbeat retries

Currently, if a heartbeat fails, we reschedule it after 5 seconds.
This is fine for the first retry, but it can cause a thundering herd
problem when a lot of nodes fail to heartbeat at once.

This change adds jitter to the minimum wait of 5 seconds. The jitter is
not applied for forced heartbeats: they still have a minimum wait of
exactly 5 seconds from the last heartbeat.

The code is re-ordered to move the interval calculation to one place.
As a bonus, the next interval is now logged correctly.

The unit tests have been rewritten to test the heartbeat process step by
step and not rely on the exact sequence of the calls.

Closes-Bug: #2038438
Change-Id: I4c4207b15fb3d48b55e340b7b3b54af833f92cb5
This commit is contained in:
Dmitry Tantsur 2023-10-02 17:32:46 +02:00
parent 62041d6d9e
commit 2ab8364649
No known key found for this signature in database
GPG Key ID: 315B2AF9FD216C60
3 changed files with 145 additions and 76 deletions

View File

@ -68,14 +68,24 @@ class IronicPythonAgentStatus(encoding.Serializable):
self.version = version
def _with_jitter(value, min_multiplier, max_multiplier):
interval_multiplier = random.uniform(min_multiplier, max_multiplier)
return value * interval_multiplier
class IronicPythonAgentHeartbeater(threading.Thread):
"""Thread that periodically heartbeats to Ironic."""
# If we could wait at most N seconds between heartbeats (or in case of an
# error) we will instead wait r x N seconds, where r is a random value
# between these multipliers.
# If we could wait at most N seconds between heartbeats, we will instead
# wait r x N seconds, where r is a random value between these multipliers.
min_jitter_multiplier = 0.3
max_jitter_multiplier = 0.6
# Error retry between 5 and 10 seconds, at least 12 retries with
# the default ramdisk_heartbeat_timeout of 300 and the worst case interval
# jitter of 0.6.
min_heartbeat_interval = 5
min_error_jitter_multiplier = 1.0
max_error_jitter_multiplier = 2.0
def __init__(self, agent):
"""Initialize the heartbeat thread.
@ -97,19 +107,39 @@ class IronicPythonAgentHeartbeater(threading.Thread):
LOG.info('Starting heartbeater')
self.agent.set_agent_advertise_addr()
while not self.stop_event.wait(min(self.interval, 5)):
if self._heartbeat_expected():
self.do_heartbeat()
while self._run_next():
eventlet.sleep(0)
def _run_next(self):
    """Wait until the next check and heartbeat if one is expected.

    :returns: False if the stop event was set while waiting, True
        otherwise (whether or not a heartbeat was attempted).
    """
    # Do not wait a fixed 5 seconds regardless of the current interval:
    # that may cause a thundering herd problem when a lot of agents are
    # heartbeating. Essentially, if the next heartbeat is due in
    # 2 seconds, don't wait 5. But if the next one is scheduled in
    # 2 minutes, do wait 5 to account for forced heartbeats.
    wait = min(
        self.min_heartbeat_interval,
        # This operation checks how much of the initially planned interval
        # we have still left. Compare with 0 in case we overshoot the goal.
        max(0, self.interval - (_time() - self.previous_heartbeat)),
    )
    if self.stop_event.wait(wait):
        return False  # done

    if self._heartbeat_expected():
        self.do_heartbeat()
    return True
def _heartbeat_expected(self):
# Decide whether a heartbeat should be sent now: returns True when the
# regular interval has elapsed, or when a heartbeat has been forced and
# the minimum wait since the previous heartbeat has passed.
# NOTE(review): this listing appears to interleave removed and added diff
# lines; the conditions calling _time() directly look like the pre-change
# versions of the `elapsed`-based checks that follow them -- confirm
# against the actual file before relying on this text.
elapsed = _time() - self.previous_heartbeat
# Normal heartbeating
if _time() > self.previous_heartbeat + self.interval:
if elapsed >= self.interval:
return True
# Forced heartbeating, but once in 5 seconds
if (self.heartbeat_forced
and _time() > self.previous_heartbeat + 5):
if self.heartbeat_forced and elapsed > self.min_heartbeat_interval:
return True
def do_heartbeat(self):
@ -121,20 +151,24 @@ class IronicPythonAgentHeartbeater(threading.Thread):
advertise_protocol=self.agent.advertise_protocol,
generated_cert=self.agent.generated_cert,
)
LOG.info('heartbeat successful')
except Exception as exc:
if isinstance(exc, errors.HeartbeatConflictError):
LOG.warning('conflict error sending heartbeat to %s',
self.agent.api_url)
else:
LOG.exception('error sending heartbeat to %s',
self.agent.api_url)
self.interval = _with_jitter(self.min_heartbeat_interval,
self.min_error_jitter_multiplier,
self.max_error_jitter_multiplier)
else:
LOG.debug('heartbeat successful')
self.heartbeat_forced = False
self.previous_heartbeat = _time()
except errors.HeartbeatConflictError:
LOG.warning('conflict error sending heartbeat to %s',
self.agent.api_url)
except Exception:
LOG.exception('error sending heartbeat to %s', self.agent.api_url)
finally:
interval_multiplier = random.uniform(self.min_jitter_multiplier,
self.max_jitter_multiplier)
self.interval = self.agent.heartbeat_timeout * interval_multiplier
LOG.info('sleeping before next heartbeat, interval: %s',
self.interval)
self.interval = _with_jitter(self.agent.heartbeat_timeout,
self.min_jitter_multiplier,
self.max_jitter_multiplier)
self.previous_heartbeat = _time()
LOG.info('sleeping before next heartbeat, interval: %s', self.interval)
def force_heartbeat(self):
self.heartbeat_forced = True

View File

@ -50,6 +50,20 @@ class FakeExtension(base.BaseAgentExtension):
pass
class FakeClock:
    """Manually advanced clock used to drive the heartbeater in tests.

    ``get`` reports the current fake time; ``wait`` records the requested
    interval, advances the fake time by it and returns ``wait_result``.
    """

    # Current fake time.
    current = 0
    # Interval passed to the most recent wait() call, None before any call.
    last_wait = None
    # Value that wait() returns.
    wait_result = False

    def get(self):
        """Return the current fake time."""
        return self.current

    def wait(self, interval):
        """Record ``interval``, advance the clock by it, return wait_result."""
        self.last_wait = interval
        self.current += interval
        return self.wait_result
class TestHeartbeater(ironic_agent_base.IronicAgentTest):
def setUp(self):
super(TestHeartbeater, self).setUp()
@ -64,65 +78,79 @@ class TestHeartbeater(ironic_agent_base.IronicAgentTest):
@mock.patch('ironic_python_agent.agent._time', autospec=True)
@mock.patch('random.uniform', autospec=True)
def test_heartbeat(self, mock_uniform, mock_time):
# Drives the heartbeater one _run_next() step at a time against a
# FakeClock and checks the requested wait, the resulting interval and
# previous_heartbeat after each step.
# NOTE(review): this listing appears to interleave removed and added diff
# lines; the scripted *_responses / expected_stop_calls sections and the
# final run() call with its call-count assertions look like the
# pre-change test body -- confirm against the actual file.
time_responses = []
uniform_responses = []
heartbeat_responses = []
wait_responses = []
expected_stop_calls = []
clock = FakeClock()
mock_time.side_effect = clock.get
self.heartbeater.stop_event.wait.side_effect = clock.wait
# FIRST RUN:
# initial delay is 0
expected_stop_calls.append(mock.call(0))
wait_responses.append(False)
# next heartbeat due at t=100
heartbeat_responses.append(100)
# random interval multiplier is 0.5
uniform_responses.append(0.5)
# time is now 50
time_responses.append(50)
heartbeat_mock = self.heartbeater.api.heartbeat
self.mock_agent.heartbeat_timeout = 20
# SECOND RUN:
expected_stop_calls.append(mock.call(5))
wait_responses.append(False)
# next heartbeat due at t=180
heartbeat_responses.append(180)
# random interval multiplier is 0.4
uniform_responses.append(0.4)
# time is now 80
time_responses.append(80)
# add one response for _time in _heartbeat_expected
time_responses.append(80)
# First run right after start
mock_uniform.return_value = 0.6
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(0, clock.last_wait)
heartbeat_mock.assert_called_once_with(
uuid=self.mock_agent.get_node_uuid.return_value,
advertise_address=self.mock_agent.advertise_address,
advertise_protocol=self.mock_agent.advertise_protocol,
generated_cert=self.mock_agent.generated_cert)
heartbeat_mock.reset_mock()
self.assertEqual(12, self.heartbeater.interval) # 20*0.6
self.assertEqual(0, self.heartbeater.previous_heartbeat)
# THIRD RUN:
expected_stop_calls.append(mock.call(5))
wait_responses.append(False)
# this heartbeat attempt fails
heartbeat_responses.append(Exception('uh oh!'))
# random interval multiplier is 0.5
uniform_responses.append(0.5)
# we check the time to generate a fake deadline, now t=125
time_responses.append(125)
# time is now 125.5
time_responses.append(125.5)
# A few empty runs before reaching the next heartbeat
for ts in [5, 10]:
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(5, clock.last_wait)
self.assertEqual(ts, clock.current)
heartbeat_mock.assert_not_called()
self.assertEqual(0, self.heartbeater.previous_heartbeat)
# FOURTH RUN:
expected_stop_calls.append(mock.call(5))
# Stop now
wait_responses.append(True)
# Second run when the heartbeat is due
mock_uniform.return_value = 0.4
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(2, clock.last_wait) # 12-2*5
self.assertTrue(heartbeat_mock.called)
heartbeat_mock.reset_mock()
self.assertEqual(8, self.heartbeater.interval) # 20*0.4
self.assertEqual(12, self.heartbeater.previous_heartbeat)
# Hook it up and run it
mock_time.side_effect = time_responses
mock_uniform.side_effect = uniform_responses
self.mock_agent.heartbeat_timeout = 50
self.heartbeater.api.heartbeat.side_effect = heartbeat_responses
self.heartbeater.stop_event.wait.side_effect = wait_responses
self.heartbeater.run()
# One empty run before reaching the next heartbeat
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(5, clock.last_wait)
heartbeat_mock.assert_not_called()
self.assertEqual(12, self.heartbeater.previous_heartbeat)
# Validate expectations
self.assertEqual(expected_stop_calls,
self.heartbeater.stop_event.wait.call_args_list)
self.assertEqual(self.heartbeater.api.heartbeat.call_count, 2)
self.assertEqual(mock_time.call_count, 5)
# Failed run resulting in a fast retry
mock_uniform.return_value = 1.2
heartbeat_mock.side_effect = Exception('uh oh!')
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(3, clock.last_wait) # 8-5
self.assertTrue(heartbeat_mock.called)
heartbeat_mock.reset_mock(side_effect=True)
self.assertEqual(6, self.heartbeater.interval) # 5*1.2
self.assertEqual(20, self.heartbeater.previous_heartbeat)
# One empty run because 6>5
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(5, clock.last_wait)
heartbeat_mock.assert_not_called()
self.assertEqual(20, self.heartbeater.previous_heartbeat)
# Retry after the remaining 1 second
mock_uniform.return_value = 0.5
self.assertTrue(self.heartbeater._run_next())
self.assertEqual(1, clock.last_wait)
self.assertTrue(heartbeat_mock.called)
heartbeat_mock.reset_mock()
self.assertEqual(10, self.heartbeater.interval) # 20*0.5
self.assertEqual(26, self.heartbeater.previous_heartbeat)
# Stop on the next empty run
clock.wait_result = True
self.assertFalse(self.heartbeater._run_next())
heartbeat_mock.assert_not_called()
self.assertEqual(26, self.heartbeater.previous_heartbeat)
@mock.patch('ironic_python_agent.agent._time', autospec=True)
def test__heartbeat_expected(self, mock_time):
@ -132,7 +160,7 @@ class TestHeartbeater(ironic_agent_base.IronicAgentTest):
self.heartbeater.interval = 0
self.heartbeater.heartbeat_forced = False
mock_time.return_value = 0
self.assertFalse(self.heartbeater._heartbeat_expected())
self.assertTrue(self.heartbeater._heartbeat_expected())
# 1st cadence
self.heartbeater.previous_heartbeat = 0

View File

@ -0,0 +1,7 @@
---
fixes:
- |
Adds random jitter to retried heartbeats after Ironic returns an error.
Previously, heartbeats would be retried after 5 seconds, potentially
causing a thundering herd problem if many nodes fail to heartbeat at
the same time.