From 17c2b45d916168fa396caf1a6188e6c8a0c8d19a Mon Sep 17 00:00:00 2001 From: Daniel Alvarez Date: Fri, 16 Dec 2016 17:52:05 +0000 Subject: [PATCH] Kill neutron-keepalived-state-change gracefully When HA routers are deleted, neutron-keepalived-state-change was killed by l3/ha_router using SIGKILL and orphaning its child process 'ip -o monitor'. This patch implements a way to kill it gracefully through SIGTERM, and also sending a SIGKILL if keepalived-state-change didn't die after a 10 seconds timeout. The SIGTERM handler in keepalived-state-change kills the ip monitor process using SIGKILL. It kills ip_monitor PID using kill instead of through IPMonitor.stop() because the monitor runs as root and keepalived-state-change doesn't (it drops its privileges after launching the monitor). Doing so, would lead to a "Permission denied". We can safely kill the monitor this way since it was started with respawn_interval set to None which means that it won't be automatically respawned. Closes-Bug: #1632099 Change-Id: I7385172a007fdd252ea3a1c03c58064160d07e9e --- neutron/agent/l3/ha_router.py | 10 ++++++- neutron/agent/l3/keepalived_state_change.py | 16 +++++++++++ neutron/tests/unit/agent/l3/test_ha_router.py | 28 +++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/neutron/agent/l3/ha_router.py b/neutron/agent/l3/ha_router.py index d68453b1b04..b87929faa57 100644 --- a/neutron/agent/l3/ha_router.py +++ b/neutron/agent/l3/ha_router.py @@ -14,7 +14,9 @@ import os import shutil +import signal +import eventlet import netaddr from neutron_lib import constants as n_consts from oslo_log import log as logging @@ -31,6 +33,7 @@ from neutron.extensions import portbindings LOG = logging.getLogger(__name__) HA_DEV_PREFIX = 'ha-' IP_MONITOR_PROCESS_SERVICE = 'ip_monitor' +SIGTERM_TIMEOUT = 10 class HaRouterNamespace(namespaces.RouterNamespace): @@ -342,7 +345,12 @@ class HaRouter(router.RouterInfo): pm = self._get_state_change_monitor_process_manager() process_monitor.unregister( self.router_id, IP_MONITOR_PROCESS_SERVICE) - pm.disable() + pm.disable(sig=str(int(signal.SIGTERM))) + try: + common_utils.wait_until_true(lambda: not pm.active, + timeout=SIGTERM_TIMEOUT) + except eventlet.timeout.Timeout: + pm.disable(sig=str(int(signal.SIGKILL))) def update_initial_state(self, callback): ha_device = ip_lib.IPDevice( diff --git a/neutron/agent/l3/keepalived_state_change.py b/neutron/agent/l3/keepalived_state_change.py index d6a5eee83b3..040a391b4fe 100644 --- a/neutron/agent/l3/keepalived_state_change.py +++ b/neutron/agent/l3/keepalived_state_change.py @@ -13,6 +13,7 @@ # under the License. import os +import signal import sys import httplib2 @@ -118,6 +119,21 @@ class MonitorDaemon(daemon.Daemon): log_exception=False ) + def _kill_monitor(self): + if self.monitor: + # Kill PID instead of calling self.monitor.stop() because the ip + # monitor is running as root while keepalived-state-change is not + # (dropped privileges after launching the ip monitor) and will fail + # with "Permission denied". Also, we can safely do this because the + # monitor was launched with respawn_interval=None so it won't be + # automatically respawned + agent_utils.kill_process(self.monitor.pid, signal.SIGKILL, + run_as_root=True) + + def handle_sigterm(self, signum, frame): + self._kill_monitor() + super(MonitorDaemon, self).handle_sigterm(signum, frame) + def configure(conf): config.init(sys.argv[1:]) diff --git a/neutron/tests/unit/agent/l3/test_ha_router.py b/neutron/tests/unit/agent/l3/test_ha_router.py index b2c7bc0f976..37e13efe365 100644 --- a/neutron/tests/unit/agent/l3/test_ha_router.py +++ b/neutron/tests/unit/agent/l3/test_ha_router.py @@ -12,6 +12,8 @@ # License for the specific language governing permissions and limitations # under the License. +import signal + import mock from oslo_utils import uuidutils @@ -84,3 +86,29 @@ class TestBasicRouterOperations(base.BaseTestCase): ri.remove_floating_ip(device, fip_cidr) self.assertTrue(super_remove_floating_ip.called) + + def test_destroy_state_change_monitor_ok(self): + ri = self._create_router(mock.MagicMock()) + with mock.patch.object(ri, + '_get_state_change_monitor_process_manager')\ + as m_get_state: + mock_pm = m_get_state.return_value + mock_pm.active = False + ri.destroy_state_change_monitor(mock_pm) + + mock_pm.disable.assert_called_once_with( + sig=str(int(signal.SIGTERM))) + + def test_destroy_state_change_monitor_force(self): + ri = self._create_router(mock.MagicMock()) + with mock.patch.object(ri, + '_get_state_change_monitor_process_manager')\ + as m_get_state: + mock_pm = m_get_state.return_value + mock_pm.active = False + with mock.patch.object(ha_router, 'SIGTERM_TIMEOUT', 0): + ri.destroy_state_change_monitor(mock_pm) + + calls = ["sig='str(%d)'" % signal.SIGTERM, + "sig='str(%d)'" % signal.SIGKILL] + mock_pm.disable.has_calls(calls)