Check keepalived health in the amphora

The health manager did not detect a keepalived failure inside the amphora.
With this patch, the amphora health daemon does not send a health heartbeat
if keepalived is configured but not running.

This patch also allows the health checks to continue after an initial failure.

Change-Id: Id21310bd5ded3747218d3872ab3c966e5ddf5356
Closes-Bug: #1695090
This commit is contained in:
Michael Johnson 2017-06-01 18:55:54 -07:00
parent 29219e4345
commit c7a2babf54
5 changed files with 155 additions and 20 deletions

View File

@ -6,8 +6,9 @@ Wants=network-online.target
[Service]
Type=forking
KillMode=process
ExecStart=/sbin/ip netns exec {{ amphora_nsname }} {{ keepalived_cmd }} -D -d -f {{ keepalived_cfg }}
ExecStart=/sbin/ip netns exec {{ amphora_nsname }} {{ keepalived_cmd }} -D -d -f {{ keepalived_cfg }} -p {{ keepalived_pid }}
ExecReload=/bin/kill -HUP $MAINPID
PIDFile={{ keepalived_pid }}
[Install]
WantedBy=multi-user.target

View File

@ -18,7 +18,7 @@ DAEMON="ip netns exec {{ amphora_nsname }} {{ keepalived_cmd }}"
NAME=octavia-keepalived
DESC=octavia-keepalived
TMPFILES="/tmp/.vrrp /tmp/.healthcheckers"
DAEMON_ARGS="-D -d -f {{ keepalived_cfg }}"
DAEMON_ARGS="-D -d -f {{ keepalived_cfg }} -p {{ keepalived_pid }}"
#includes lsb functions
. /lib/lsb/init-functions
@ -36,7 +36,7 @@ case "$1" in
do
test -e $file && test ! -L $file && rm $file
done
if start-stop-daemon --start --quiet --pidfile /var/run/$NAME.pid \
if start-stop-daemon --start --quiet --pidfile {{ keepalived_pid }} \
--exec $DAEMON -- $DAEMON_ARGS; then
log_end_msg 0
else
@ -45,7 +45,7 @@ case "$1" in
;;
stop)
log_daemon_msg "Stopping $DESC" "$NAME"
if start-stop-daemon --oknodo --stop --quiet --pidfile /var/run/$NAME.pid \
if start-stop-daemon --oknodo --stop --quiet --pidfile {{ keepalived_pid }} \
--exec $DAEMON; then
log_end_msg 0
else
@ -55,7 +55,7 @@ case "$1" in
reload|force-reload)
log_action_begin_msg "Reloading $DESC configuration..."
if start-stop-daemon --stop --quiet --signal 1 --pidfile \
/var/run/$NAME.pid --exec $DAEMON; then
{{ keepalived_pid }} --exec $DAEMON; then
log_end_msg 0
else
log_action_end_msg 1
@ -65,10 +65,10 @@ case "$1" in
log_action_begin_msg "Restarting $DESC" "$NAME"
start-stop-daemon --stop --quiet --pidfile \
/var/run/$NAME.pid --exec $DAEMON || true
{{ keepalived_pid }} --exec $DAEMON || true
sleep 1
if start-stop-daemon --start --quiet --pidfile \
/var/run/$NAME.pid --exec $DAEMON -- $DAEMON_ARGS; then
{{ keepalived_pid }} --exec $DAEMON -- $DAEMON_ARGS; then
log_end_msg 0
else
log_end_msg 1

View File

@ -22,4 +22,4 @@ stop on runlevel [!2345]
respawn
exec /sbin/ip netns exec {{ amphora_nsname }} {{ keepalived_cmd }} -n -D -d -f {{ keepalived_cfg }}
exec /sbin/ip netns exec {{ amphora_nsname }} {{ keepalived_cmd }} -n -D -d -f {{ keepalived_cfg }} -p {{ keepalived_pid }}

View File

@ -14,6 +14,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import errno
import os
import time
@ -49,9 +50,48 @@ def list_sock_stat_files(hadir=None):
def run_sender(cmd_queue):
LOG.info('Health Manager Sender starting.')
sender = health_sender.UDPStatusSender()
keepalived_cfg_path = util.keepalived_cfg_path()
keepalived_pid_path = util.keepalived_pid_path()
while True:
message = build_stats_message()
sender.dosend(message)
try:
# If the keepalived config file is present check
# that it is running, otherwise don't send the health
# heartbeat
if os.path.isfile(keepalived_cfg_path):
# Is there a pid file for keepalived?
with open(keepalived_pid_path, 'r') as pid_file:
pid = int(pid_file.readline())
os.kill(pid, 0)
message = build_stats_message()
sender.dosend(message)
except IOError as e:
# Missing PID file, skip health heartbeat
if e.errno == errno.ENOENT:
LOG.error('Missing keepalived PID file {0}, skipping '
'health heartbeat.'.format(keepalived_pid_path))
else:
LOG.error('Failed to check keepalived and haproxy status '
'due to exception {0}, skipping health '
'heartbeat.'.format(str(e)))
except OSError as e:
# Keepalived is not running, skip health heartbeat
if e.errno == errno.ESRCH:
LOG.error('Keepalived is configured but not running, skipping '
'health heartbeat.'.format(keepalived_pid_path))
else:
LOG.error('Failed to check keepalived and haproxy status '
'due to exception {0}, skipping health '
'heartbeat.'.format(str(e)))
except Exception as e:
LOG.error('Failed to check keepalived and haproxy status '
'due to exception {0}, skipping health '
'heartbeat.'.format(str(e)))
try:
cmd = cmd_queue.get_nowait()
if cmd is 'reload':

View File

@ -153,6 +153,8 @@ class TestHealthDaemon(base.TestCase):
LISTENER_ID2 + '.sock'}
self.assertEqual(files, expected_files)
@mock.patch('os.kill')
@mock.patch('os.path.isfile')
@mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.time.sleep')
@mock.patch('oslo_config.cfg.CONF.reload_config_files')
@ -161,25 +163,31 @@ class TestHealthDaemon(base.TestCase):
@mock.patch('octavia.amphorae.backends.health_daemon.'
'health_sender.UDPStatusSender')
def test_run_sender(self, mock_UDPStatusSender, mock_build_msg,
mock_reload_cfg, mock_sleep):
mock_reload_cfg, mock_sleep, mock_isfile, mock_kill):
sender_mock = mock.MagicMock()
dosend_mock = mock.MagicMock()
sender_mock.dosend = dosend_mock
mock_UDPStatusSender.return_value = sender_mock
mock_build_msg.side_effect = ['TEST', Exception('break')]
mock_build_msg.side_effect = ['TEST']
mock_isfile.return_value = False
test_queue = queue.Queue()
self.assertRaisesRegex(Exception, 'break',
health_daemon.run_sender, test_queue)
with mock.patch('time.sleep') as mock_sleep:
mock_sleep.side_effect = Exception('break')
self.assertRaisesRegex(Exception, 'break',
health_daemon.run_sender, test_queue)
sender_mock.dosend.assert_called_once_with('TEST')
# Test a reload event
mock_build_msg.reset_mock()
mock_build_msg.side_effect = ['TEST', Exception('break')]
mock_build_msg.side_effect = ['TEST']
test_queue.put('reload')
self.assertRaisesRegex(Exception, 'break',
health_daemon.run_sender, test_queue)
with mock.patch('time.sleep') as mock_sleep:
mock_sleep.side_effect = Exception('break')
self.assertRaisesRegex(Exception, 'break',
health_daemon.run_sender, test_queue)
mock_reload_cfg.assert_called_once_with()
# Test the shutdown path
@ -193,10 +201,88 @@ class TestHealthDaemon(base.TestCase):
# Test an unknown command
mock_build_msg.reset_mock()
mock_build_msg.side_effect = ['TEST', Exception('break')]
mock_build_msg.side_effect = ['TEST']
test_queue.put('bogus')
self.assertRaisesRegex(Exception, 'break',
health_daemon.run_sender, test_queue)
with mock.patch('time.sleep') as mock_sleep:
mock_sleep.side_effect = Exception('break')
self.assertRaisesRegex(Exception, 'break',
health_daemon.run_sender, test_queue)
# Test keepalived config, but no PID
mock_build_msg.reset_mock()
dosend_mock.reset_mock()
mock_isfile.return_value = True
with mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.open', mock.mock_open()) as mock_open:
mock_open.side_effect = FileNotFoundError
test_queue.put('shutdown')
health_daemon.run_sender(test_queue)
mock_build_msg.assert_not_called()
dosend_mock.assert_not_called()
# Test keepalived config, but PID file error
mock_build_msg.reset_mock()
dosend_mock.reset_mock()
mock_isfile.return_value = True
with mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.open', mock.mock_open()) as mock_open:
mock_open.side_effect = IOError
test_queue.put('shutdown')
health_daemon.run_sender(test_queue)
mock_build_msg.assert_not_called()
dosend_mock.assert_not_called()
# Test keepalived config, but bogus PID
mock_build_msg.reset_mock()
dosend_mock.reset_mock()
mock_isfile.return_value = True
with mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.open',
mock.mock_open(read_data='foo')) as mock_open:
test_queue.put('shutdown')
health_daemon.run_sender(test_queue)
mock_build_msg.assert_not_called()
dosend_mock.assert_not_called()
# Test keepalived config, but not running
mock_build_msg.reset_mock()
dosend_mock.reset_mock()
mock_isfile.return_value = True
with mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.open',
mock.mock_open(read_data='999999')) as mock_open:
mock_kill.side_effect = ProccessNotFoundError
test_queue.put('shutdown')
health_daemon.run_sender(test_queue)
mock_build_msg.assert_not_called()
dosend_mock.assert_not_called()
# Test keepalived config, but process error
mock_build_msg.reset_mock()
dosend_mock.reset_mock()
mock_isfile.return_value = True
with mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.open',
mock.mock_open(read_data='999999')) as mock_open:
mock_kill.side_effect = OSError
test_queue.put('shutdown')
health_daemon.run_sender(test_queue)
mock_build_msg.assert_not_called()
dosend_mock.assert_not_called()
# Test with happy keepalive
sender_mock.reset_mock()
dosend_mock.reset_mock()
mock_kill.side_effect = [True]
mock_build_msg.reset_mock()
mock_build_msg.side_effect = ['TEST', 'TEST']
mock_isfile.return_value = True
test_queue.put('shutdown')
with mock.patch('octavia.amphorae.backends.health_daemon.'
'health_daemon.open',
mock.mock_open(read_data='999999')) as mock_open:
health_daemon.run_sender(test_queue)
sender_mock.dosend.assert_called_once_with('TEST')
@mock.patch('octavia.amphorae.backends.utils.haproxy_query.HAProxyQuery')
def test_get_stats(self, mock_query):
@ -266,3 +352,11 @@ class TestHealthDaemon(base.TestCase):
msg = health_daemon.build_stats_message()
self.assertEqual(msg['listeners'][LISTENER_ID1]['pools'], {})
# Test helper: an IOError subclass whose errno is fixed at 2. The test uses it
# as the mocked open() side effect for the "keepalived config, but no PID"
# case, where run_sender compares e.errno against errno.ENOENT
# (presumably ENOENT == 2 on the test platform -- confirm).
# NOTE(review): this name shadows the Python 3 builtin of the same name within
# this module; presumably defined for Python 2 compatibility -- confirm.
class FileNotFoundError(IOError):
errno = 2
# Test helper: an OSError subclass whose errno is fixed at 3. The test uses it
# as the mocked os.kill side effect for the "keepalived config, but not
# running" case, where run_sender compares e.errno against errno.ESRCH
# (presumably ESRCH == 3 on the test platform -- confirm).
# NOTE(review): "Proccess" is misspelled; the name is referenced by
# test_run_sender, so any rename must update both places together.
class ProccessNotFoundError(OSError):
errno = 3