Kill neutron-keepalived-state-change gracefully

When HA routers are deleted, neutron-keepalived-state-change was
killed by l3/ha_router using SIGKILL and orphaning its child process
'ip -o monitor'.

This patch implements a way to kill it gracefully through SIGTERM, and
also sending a SIGKILL if keepalived-state-change didn't die after a
10 seconds timeout. The SIGTERM handler in keepalived-state-change
kills the ip monitor process using SIGKILL.

It kills ip_monitor PID using kill instead of through IPMonitor.stop()
because the monitor runs as root and keepalived-state-change doesn't
(it drops its privileges after launching the monitor). Doing so, would
lead to a "Permission denied". We can safely kill the monitor this way
since it was started with respawn_interval set to None which means that
it won't be automatically respawned.

Closes-Bug: #1632099
Change-Id: I7385172a007fdd252ea3a1c03c58064160d07e9e
This commit is contained in:
Daniel Alvarez 2016-12-16 17:52:05 +00:00
parent 4f8a106cb5
commit 17c2b45d91
3 changed files with 53 additions and 1 deletions

View File

@ -14,7 +14,9 @@
import os
import shutil
import signal
import eventlet
import netaddr
from neutron_lib import constants as n_consts
from oslo_log import log as logging
@ -31,6 +33,7 @@ from neutron.extensions import portbindings
LOG = logging.getLogger(__name__)
HA_DEV_PREFIX = 'ha-'
IP_MONITOR_PROCESS_SERVICE = 'ip_monitor'
SIGTERM_TIMEOUT = 10
class HaRouterNamespace(namespaces.RouterNamespace):
@ -342,7 +345,12 @@ class HaRouter(router.RouterInfo):
pm = self._get_state_change_monitor_process_manager()
process_monitor.unregister(
self.router_id, IP_MONITOR_PROCESS_SERVICE)
pm.disable()
pm.disable(sig=str(int(signal.SIGTERM)))
try:
common_utils.wait_until_true(lambda: not pm.active,
timeout=SIGTERM_TIMEOUT)
except eventlet.timeout.Timeout:
pm.disable(sig=str(int(signal.SIGKILL)))
def update_initial_state(self, callback):
ha_device = ip_lib.IPDevice(

View File

@ -13,6 +13,7 @@
# under the License.
import os
import signal
import sys
import httplib2
@ -118,6 +119,21 @@ class MonitorDaemon(daemon.Daemon):
log_exception=False
)
def _kill_monitor(self):
if self.monitor:
# Kill PID instead of calling self.monitor.stop() because the ip
# monitor is running as root while keepalived-state-change is not
# (dropped privileges after launching the ip monitor) and will fail
# with "Permission denied". Also, we can safely do this because the
# monitor was launched with respawn_interval=None so it won't be
# automatically respawned
agent_utils.kill_process(self.monitor.pid, signal.SIGKILL,
run_as_root=True)
def handle_sigterm(self, signum, frame):
self._kill_monitor()
super(MonitorDaemon, self).handle_sigterm(signum, frame)
def configure(conf):
config.init(sys.argv[1:])

View File

@ -12,6 +12,8 @@
# License for the specific language governing permissions and limitations
# under the License.
import signal
import mock
from oslo_utils import uuidutils
@ -84,3 +86,29 @@ class TestBasicRouterOperations(base.BaseTestCase):
ri.remove_floating_ip(device, fip_cidr)
self.assertTrue(super_remove_floating_ip.called)
def test_destroy_state_change_monitor_ok(self):
ri = self._create_router(mock.MagicMock())
with mock.patch.object(ri,
'_get_state_change_monitor_process_manager')\
as m_get_state:
mock_pm = m_get_state.return_value
mock_pm.active = False
ri.destroy_state_change_monitor(mock_pm)
mock_pm.disable.assert_called_once_with(
sig=str(int(signal.SIGTERM)))
def test_destroy_state_change_monitor_force(self):
ri = self._create_router(mock.MagicMock())
with mock.patch.object(ri,
'_get_state_change_monitor_process_manager')\
as m_get_state:
mock_pm = m_get_state.return_value
mock_pm.active = False
with mock.patch.object(ha_router, 'SIGTERM_TIMEOUT', 0):
ri.destroy_state_change_monitor(mock_pm)
calls = ["sig='str(%d)'" % signal.SIGTERM,
"sig='str(%d)'" % signal.SIGKILL]
mock_pm.disable.has_calls(calls)