Clean up .ctl/.pid files for both OpenSwan and LibreSwan

Change I5c215d70c348524979b740f882029f74e400e6d7 introduced cleanup
of the pluto ctl/pid files when starting and restarting the pluto
daemon for the LibreSwan driver. However, the problem with managing
these files also affects the OpenSwan driver: the pluto daemon fails
to start if the pid file it tries to create already exists (see the
bug report for details).

This change moves the cleanup functionality into OpenSwanProcess so
that it is used by both the OpenSwan and LibreSwan drivers. It also
fixes a typo in _cleanup_control_files that referenced a
pluto.ctl.ctl file instead of pluto.ctl.
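
For reference, the erroneous formatting appended '.ctl' to a path
that already ended in '.ctl'; the sketch below is excerpted from the
old and new bodies of _cleanup_control_files:

    # Before (removed LibreSwan code): ctl_file is already
    # '<pid_path>.ctl', so the debug message showed 'pluto.ctl.ctl'.
    ctl_file = '%s.ctl' % self.pid_path
    LOG.debug('Removing %(pidfile)s and %(ctlfile)s',
              {'pidfile': self.pid_file,
               'ctlfile': '%s.ctl' % ctl_file})

    # After (shared OpenSwanProcess code): log the already-built
    # ctl_file path, i.e. 'pluto.ctl'.
    LOG.debug('Removing %(pidfile)s and %(ctlfile)s',
              {'pidfile': self.pid_file,
               'ctlfile': ctl_file})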

The 'libreswan' configuration section is renamed to 'pluto'; existing
[libreswan] settings continue to be read via deprecated_group.
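
For deployers, the renamed section in the sample agent configuration
(the first file in this diff) looks like the sketch below; the option
names, defaults and descriptions are taken from pluto_opts in this
change:

    [pluto]
    # Initial interval in seconds for checking if pluto daemon is shutdown
    # shutdown_check_timeout=1
    # The maximum number of retries for checking for pluto daemon shutdown
    # shutdown_check_retries=5
    # A factor to increase the retry interval for each retry
    # shutdown_check_back_off=1.5

With these defaults, if the daemon never stops, restart() sleeps for
1, 1.5, 2.25, 3.375 and roughly 5.06 seconds between successive
checks, then logs a warning and attempts the start anyway.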

DocImpact

Change-Id: I717e8fcc1add35b7099c977235e4eff5da9e093b
Closes-Bug: #1506794
Elena Ezhova 2015-10-16 12:31:48 +03:00
parent 4733b73b7f
commit a71f30b232
4 changed files with 195 additions and 207 deletions


@@ -33,7 +33,7 @@
# Default is for ubuntu use, /etc/strongswan.d
# default_config_area=/etc/strongswan.d
[libreswan]
[pluto]
# Initial interval in seconds for checking if pluto daemon is shutdown
# shutdown_check_timeout=1
#


@@ -20,6 +20,7 @@ import shutil
import six
import socket
import eventlet
import jinja2
import netaddr
from neutron.agent.linux import ip_lib
@@ -28,7 +29,7 @@ from neutron.api.v2 import attributes
from neutron.common import rpc as n_rpc
from neutron.common import utils as n_utils
from neutron import context
from neutron.i18n import _LE
from neutron.i18n import _LE, _LI, _LW
from neutron.plugins.common import constants
from neutron.plugins.common import utils as plugin_utils
from oslo_concurrency import lockutils
@@ -77,6 +78,26 @@ openswan_opts = [
cfg.CONF.register_opts(openswan_opts, 'openswan')
pluto_opts = [
cfg.IntOpt('shutdown_check_timeout',
default=1,
help=_('Initial interval in seconds for checking if pluto '
'daemon is shutdown'),
deprecated_group='libreswan'),
cfg.IntOpt('shutdown_check_retries',
default=5,
help=_('The maximum number of retries for checking for '
'pluto daemon shutdown'),
deprecated_group='libreswan'),
cfg.FloatOpt('shutdown_check_back_off',
default=1.5,
help=_('A factor to increase the retry interval for '
'each retry'),
deprecated_group='libreswan')
]
cfg.CONF.register_opts(pluto_opts, 'pluto')
JINJA_ENV = None
IPSEC_CONNS = 'ipsec_site_connections'
@@ -333,6 +354,7 @@ class OpenSwanProcess(BaseSwanProcess):
self.etc_dir, 'ipsec.conf')
self.pid_path = os.path.join(
self.config_dir, 'var', 'run', 'pluto')
self.pid_file = '%s.pid' % self.pid_path
def _execute(self, cmd, check_exit_code=True, extra_ok_codes=None):
"""Execute command on namespace."""
@@ -357,6 +379,61 @@
self.vpnservice,
0o600)
def _process_running(self):
"""Checks if process is still running."""
# If no PID file, we assume the process is not running.
if not os.path.exists(self.pid_file):
return False
try:
# We take an ask-forgiveness-not-permission approach and rely
# on throwing to tell us something. If the pid file exists,
# delve into the process information and check if it matches
# our expected command line.
with open(self.pid_file, 'r') as f:
pid = f.readline().strip()
with open('/proc/%s/cmdline' % pid) as cmd_line_file:
cmd_line = cmd_line_file.readline()
if self.pid_path in cmd_line and 'pluto' in cmd_line:
# Okay the process is probably a pluto process
# and it contains the pid_path in the command
# line... could be a race. Log to error and return
# that it is *NOT* okay to clean up files. We are
# logging to error instead of debug because it
# indicates something bad has happened and this is
# valuable information for figuring it out.
LOG.error(_LE('Process %(pid)s exists with command '
'line %(cmd_line)s.') %
{'pid': pid, 'cmd_line': cmd_line})
return True
except IOError as e:
# This is logged as "info" instead of error because it simply
# means that we couldn't find the files to check on them.
LOG.info(_LI('Unable to find control files on startup for '
'router %(router)s: %(msg)s'),
{'router': self.id, 'msg': e})
return False
def _cleanup_control_files(self):
try:
ctl_file = '%s.ctl' % self.pid_path
LOG.debug('Removing %(pidfile)s and %(ctlfile)s',
{'pidfile': self.pid_file,
'ctlfile': ctl_file})
if os.path.exists(self.pid_file):
os.remove(self.pid_file)
if os.path.exists(ctl_file):
os.remove(ctl_file)
except OSError as e:
LOG.error(_LE('Unable to remove pluto control '
'files for router %(router)s. %(msg)s'),
{'router': self.id, 'msg': e})
def get_status(self):
return self._execute([self.binary,
'whack',
@@ -366,7 +443,21 @@ class OpenSwanProcess(BaseSwanProcess):
def restart(self):
"""Restart the process."""
# stop() followed immediately by a start() runs the risk that the
# current pluto daemon has not had a chance to shutdown. We check
# the current process information to see if the daemon is still
# running and if so, wait a short interval and retry.
self.stop()
wait_interval = cfg.CONF.pluto.shutdown_check_timeout
for i in range(cfg.CONF.pluto.shutdown_check_retries):
if not self._process_running():
self._cleanup_control_files()
break
eventlet.sleep(wait_interval)
wait_interval *= cfg.CONF.pluto.shutdown_check_back_off
else:
LOG.warning(_LW('Server appears to still be running, restart '
'of router %s may fail'), self.id)
self.start()
return
@@ -419,6 +510,22 @@ class OpenSwanProcess(BaseSwanProcess):
"""
if not self.namespace:
return
# NOTE: The restart operation calls the parent's start() instead of
# this one to avoid having to special case the startup file check.
# If anything is added to this method that needs to run whenever
# a restart occurs, it should be either added to the restart()
# override or things refactored to special-case start() when
# called from restart().
# If, by any reason, ctl and pid files weren't cleaned up, pluto
# won't be able to rewrite them and will fail to start. So we check
# to see if the process is running and if not, attempt a cleanup.
# In either case we fall through to allow the pluto process to
# start or fail in the usual way.
if not self._process_running():
self._cleanup_control_files()
virtual_private = self._virtual_privates()
#start pluto IKE keying daemon
cmd = [self.binary,


@@ -15,33 +15,12 @@
import os
import os.path
import eventlet
from neutron.i18n import _LE, _LI, _LW
from oslo_config import cfg
from oslo_log import log as logging
from neutron_vpnaas.services.vpn.device_drivers import ipsec
LOG = logging.getLogger(__name__)
libreswan_opts = [
cfg.IntOpt('shutdown_check_timeout',
default=1,
help=_('Initial interval in seconds for checking if pluto '
'daemon is shutdown')),
cfg.IntOpt('shutdown_check_retries',
default=5,
help=_('The maximum number of retries for checking for '
'pluto daemon shutdown')),
cfg.FloatOpt('shutdown_check_back_off',
default=1.5,
help=_('A factor to increase the retry interval for '
'each retry'))
]
cfg.CONF.register_opts(libreswan_opts, 'libreswan')
class LibreSwanProcess(ipsec.OpenSwanProcess):
"""Libreswan Process manager class.
@@ -51,7 +30,6 @@ class LibreSwanProcess(ipsec.OpenSwanProcess):
def __init__(self, conf, process_id, vpnservice, namespace):
super(LibreSwanProcess, self).__init__(conf, process_id,
vpnservice, namespace)
self.pid_file = '%s.pid' % self.pid_path
def ensure_configs(self):
"""Generate config files which are needed for Libreswan.
@@ -86,98 +64,6 @@ class LibreSwanProcess(ipsec.OpenSwanProcess):
except RuntimeError:
self._execute([self.binary, 'initnss', self.etc_dir])
def _process_running(self):
"""Checks if process is still running."""
# If no PID file, we assume the process is not running.
if not os.path.exists(self.pid_file):
return False
try:
# We take an ask-forgiveness-not-permission approach and rely
# on throwing to tell us something. If the pid file exists,
# delve into the process information and check if it matches
# our expected command line.
with open(self.pid_file, 'r') as f:
pid = f.readline().strip()
with open('/proc/%s/cmdline' % pid) as cmd_line_file:
cmd_line = cmd_line_file.readline()
if self.pid_path in cmd_line and 'pluto' in cmd_line:
# Okay the process is probably a libreswan process
# and it contains the pid_path in the command
# line... could be a race. Log to error and return
# that it is *NOT* okay to clean up files. We are
# logging to error instead of debug because it
# indicates something bad has happened and this is
# valuable information for figuring it out.
LOG.error(_LE('Process %(pid)s exists with command '
'line %(cmd_line)s.') %
{'pid': pid, 'cmd_line': cmd_line})
return True
except IOError as e:
# This is logged as "info" instead of error because it simply
# means that we couldn't find the files to check on them.
LOG.info(_LI('Unable to find control files on startup for '
'router %(router)s: %(msg)s'),
{'router': self.id, 'msg': e})
return False
def _cleanup_control_files(self):
try:
ctl_file = '%s.ctl' % self.pid_path
LOG.debug('Removing %(pidfile)s and %(ctlfile)s',
{'pidfile': self.pid_file,
'ctlfile': '%s.ctl' % ctl_file})
if os.path.exists(self.pid_file):
os.remove(self.pid_file)
if os.path.exists(ctl_file):
os.remove(ctl_file)
except OSError as e:
LOG.error(_LE('Unable to remove libreswan control '
'files for router %(router)s. %(msg)s'),
{'router': self.id, 'msg': e})
def start(self):
# NOTE: The restart operation calls the parent's start() instead of
# this one to avoid having to special case the startup file check.
# If anything is added to this method that needs to run whenever
# a restart occurs, it should be either added to the restart()
# override or things refactored to special-case start() when
# called from restart().
# LibreSwan's use of the capabilities library may prevent the ctl
# and pid files from being cleaned up, so we check to see if the
# process is running and if not, attempt a cleanup. In either case
# we fall through to allow the LibreSwan process to start or fail
# in the usual way.
if not self._process_running():
self._cleanup_control_files()
super(LibreSwanProcess, self).start()
def restart(self):
# stop() followed immediately by a start() runs the risk that the
# current pluto daemon has not had a chance to shutdown. We check
# the current process information to see if the daemon is still
# running and if so, wait a short interval and retry.
self.stop()
wait_interval = cfg.CONF.libreswan.shutdown_check_timeout
for i in range(cfg.CONF.libreswan.shutdown_check_retries):
if not self._process_running():
self._cleanup_control_files()
break
eventlet.sleep(wait_interval)
wait_interval *= cfg.CONF.libreswan.shutdown_check_back_off
else:
LOG.warning(_LW('Server appears to still be running, restart '
'of router %s may fail'), self.id)
super(LibreSwanProcess, self).start()
class LibreSwanDriver(ipsec.IPsecDriver):
def create_process(self, process_id, vpnservice, namespace):


@@ -690,12 +690,29 @@ class IPSecDeviceDVR(BaseIPsecDeviceDriver):
class TestOpenSwanProcess(BaseIPsecDeviceDriver):
_test_timeout = 1
_test_backoff = 2
_test_retries = 5
def setUp(self, driver=openswan_ipsec.OpenSwanDriver,
ipsec_process=openswan_ipsec.OpenSwanProcess):
super(TestOpenSwanProcess, self).setUp(driver, ipsec_process)
# Insulate tests against changes to configuration defaults.
self.conf.register_opts(openswan_ipsec.openswan_opts,
'openswan')
self.conf.set_override('state_path', '/tmp')
cfg.CONF.register_opts(openswan_ipsec.pluto_opts,
'pluto')
cfg.CONF.set_override('shutdown_check_timeout', self._test_timeout,
group='pluto')
cfg.CONF.set_override('shutdown_check_back_off', self._test_backoff,
group='pluto')
cfg.CONF.set_override('shutdown_check_retries', self._test_retries,
group='pluto')
self.addCleanup(cfg.CONF.reset)
self.os_remove = mock.patch('os.remove').start()
self.process = openswan_ipsec.OpenSwanProcess(self.conf,
'foo-process-id',
@@ -810,84 +827,55 @@ class TestOpenSwanProcess(BaseIPsecDeviceDriver):
self.assertEqual(expected_connection_status_dict,
self.process.connection_status)
class TestLibreSwanProcess(base.BaseTestCase):
_test_timeout = 1
_test_backoff = 2
_test_retries = 5
def setUp(self):
super(TestLibreSwanProcess, self).setUp()
# Insulate tests against changes to configuration defaults.
cfg.CONF.register_opts(libreswan_ipsec.libreswan_opts,
'libreswan')
cfg.CONF.set_override('shutdown_check_timeout', self._test_timeout,
group='libreswan')
cfg.CONF.set_override('shutdown_check_back_off', self._test_backoff,
group='libreswan')
cfg.CONF.set_override('shutdown_check_retries', self._test_retries,
group='libreswan')
self.addCleanup(cfg.CONF.reset)
self.vpnservice = copy.deepcopy(FAKE_VPN_SERVICE)
self.parent_start = mock.patch('neutron_vpnaas.services.'
'vpn.device_drivers.ipsec.'
'OpenSwanProcess.start').start()
self.parent_stop = mock.patch('neutron_vpnaas.services.'
'vpn.device_drivers.ipsec.'
'OpenSwanProcess.stop').start()
self.os_remove = mock.patch('os.remove').start()
self.ipsec_process = libreswan_ipsec.LibreSwanProcess(cfg.CONF,
'foo-process-id',
self.vpnservice,
mock.ANY)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
def test_no_cleanups(self, cleanup_mock):
'ipsec.OpenSwanProcess._get_nexthop',
return_value='172.168.1.2')
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'ipsec.OpenSwanProcess._cleanup_control_files')
def test_no_cleanups(self, cleanup_mock, hop_mock):
# Not an "awesome test" but more of a check box item. Basically,
# what happens if we didn't need to clean up any files.
with mock.patch.object(self.ipsec_process,
with mock.patch.object(self.process,
'_process_running',
return_value=True) as query_mock:
self.ipsec_process.start()
self.assertEqual(1, self.parent_start.call_count)
self.process.start()
self.assertEqual(1, query_mock.call_count)
# This is really what is being tested here. If process is
# running, we shouldn't attempt a cleanup.
self.assertFalse(cleanup_mock.called)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'ipsec.OpenSwanProcess._get_nexthop',
return_value='172.168.1.2')
@mock.patch('os.path.exists', return_value=True)
def test_cleanup_files(self, exists_mock):
def test_cleanup_files(self, exists_mock, hop_mock):
# Tests the 'bones' of things really and kind of check-box-item-bogus
# test - this really needs exercising through a higher level test.
with mock.patch.object(self.ipsec_process,
with mock.patch.object(self.process,
'_process_running',
return_value=False) as query_mock:
fake_path = '/fake/path/run'
self.ipsec_process.pid_path = fake_path
self.ipsec_process.pid_file = '%s.pid' % fake_path
self.ipsec_process.start()
self.assertEqual(1, self.parent_start.call_count)
self.process.pid_path = fake_path
self.process.pid_file = '%s.pid' % fake_path
self.process.start()
self.assertEqual(1, query_mock.call_count)
self.assertEqual(2, self.os_remove.call_count)
self.os_remove.assert_has_calls([mock.call('%s.pid' % fake_path),
mock.call('%s.ctl' % fake_path)])
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._process_running',
return_value=False)
'ipsec.OpenSwanProcess._get_nexthop',
return_value='172.168.1.2')
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
'ipsec.OpenSwanProcess._process_running',
return_value=False)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'ipsec.OpenSwanProcess._cleanup_control_files')
@mock.patch('eventlet.sleep')
def test_restart_process_not_running(self, sleep_mock, cleanup_mock,
query_mock):
self.ipsec_process.restart()
# Lame checks that are really for sanity
self.assertTrue(self.parent_stop.called)
self.assertTrue(self.parent_start.called)
query_mock, hop_mock):
self.process.restart()
# Really what is being tested - retry configuration exists and that
# we do the right things when process check is false.
@@ -896,111 +884,118 @@ class TestLibreSwanProcess(base.BaseTestCase):
self.assertFalse(sleep_mock.called)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._process_running',
return_value=True)
'ipsec.OpenSwanProcess._get_nexthop',
return_value='172.168.1.2')
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
'ipsec.OpenSwanProcess._process_running',
return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'ipsec.OpenSwanProcess._cleanup_control_files')
@mock.patch('eventlet.sleep')
def test_restart_process_doesnt_stop(self, sleep_mock, cleanup_mock,
query_mock):
self.ipsec_process.restart()
# Lame checks that are really for sanity
self.assertTrue(self.parent_stop.called)
self.assertTrue(self.parent_start.called)
query_mock, hop_mock):
self.process.restart()
# Really what is being tested - retry configuration exists and that
# we do the right things when process check is True.
self.assertEqual(5, query_mock.call_count)
self.assertEqual(self._test_retries + 1, query_mock.call_count)
self.assertFalse(cleanup_mock.called)
self.assertEqual(5, sleep_mock.call_count)
self.assertEqual(self._test_retries, sleep_mock.call_count)
calls = [mock.call(1), mock.call(2), mock.call(4),
mock.call(8), mock.call(16)]
sleep_mock.assert_has_calls(calls)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._process_running',
side_effect=[True, True, False])
'ipsec.OpenSwanProcess._get_nexthop',
return_value='172.168.1.2')
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.LibreSwanProcess._cleanup_control_files')
'ipsec.OpenSwanProcess._process_running',
side_effect=[True, True, False, False])
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'ipsec.OpenSwanProcess._cleanup_control_files')
@mock.patch('eventlet.sleep')
def test_restart_process_retry_until_stop(self, sleep_mock, cleanup_mock,
query_mock):
self.ipsec_process.restart()
# Lame checks that are really for sanity
self.assertTrue(self.parent_start.called)
self.assertTrue(self.parent_stop.called)
query_mock, hop_mock):
self.process.restart()
# Really what is being tested - retry configuration exists and that
# we do the right things when process check is True a few times and
# then returns False.
self.assertEqual(3, query_mock.call_count)
self.assertEqual(4, query_mock.call_count)
self.assertTrue(cleanup_mock.called)
self.assertEqual(2, sleep_mock.call_count)
def test_process_running_no_pid(self):
with mock.patch('os.path.exists', return_value=False):
self.assertFalse(
self.ipsec_process._process_running())
self.process._process_running())
# open() is used elsewhere, so we need to inject a mocked open into the
# module to be tested.
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.ipsec.open',
create=True,
side_effect=IOError)
def test_process_running_open_failure(self, mock_open, mock_exists):
self.assertFalse(self.ipsec_process._process_running())
self.assertFalse(self.process._process_running())
self.assertTrue(mock_exists.called)
self.assertTrue(mock_open.called)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.ipsec.open',
create=True,
side_effect=[io.StringIO(u'invalid'),
IOError])
def test_process_running_bogus_pid(self, mock_open, mock_exists):
with mock.patch.object(libreswan_ipsec.LOG, 'error'):
self.assertFalse(self.ipsec_process._process_running())
self.assertFalse(self.process._process_running())
self.assertTrue(mock_exists.called)
self.assertEqual(2, mock_open.call_count)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.ipsec.open',
create=True,
side_effect=[io.StringIO(u'134'), io.StringIO(u'')])
def test_process_running_no_cmdline(self, mock_open, mock_exists):
with mock.patch.object(libreswan_ipsec.LOG, 'error') as log_mock:
self.assertFalse(self.ipsec_process._process_running())
with mock.patch.object(openswan_ipsec.LOG, 'error') as log_mock:
self.assertFalse(self.process._process_running())
self.assertFalse(log_mock.called)
self.assertEqual(2, mock_open.call_count)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.ipsec.open',
create=True,
side_effect=[io.StringIO(u'134'), io.StringIO(u'ps ax')])
def test_process_running_cmdline_mismatch(self, mock_open, mock_exists):
with mock.patch.object(libreswan_ipsec.LOG, 'error') as log_mock:
self.assertFalse(self.ipsec_process._process_running())
with mock.patch.object(openswan_ipsec.LOG, 'error') as log_mock:
self.assertFalse(self.process._process_running())
self.assertFalse(log_mock.called)
self.assertEqual(2, mock_open.call_count)
@mock.patch('os.path.exists', return_value=True)
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.'
'libreswan_ipsec.open',
@mock.patch('neutron_vpnaas.services.vpn.device_drivers.ipsec.open',
create=True,
side_effect=[io.StringIO(u'134'),
io.StringIO(u'/usr/libexec/ipsec/pluto -ctlbase'
'/some/foo/path')])
def test_process_running_cmdline_match(self, mock_open, mock_exists):
self.ipsec_process.pid_path = '/some/foo/path'
with mock.patch.object(libreswan_ipsec.LOG, 'error') as log_mock:
self.assertTrue(self.ipsec_process._process_running())
self.process.pid_path = '/some/foo/path'
with mock.patch.object(openswan_ipsec.LOG, 'error') as log_mock:
self.assertTrue(self.process._process_running())
self.assertTrue(log_mock.called)
class TestLibreSwanProcess(base.BaseTestCase):
def setUp(self):
super(TestLibreSwanProcess, self).setUp()
self.vpnservice = copy.deepcopy(FAKE_VPN_SERVICE)
self.ipsec_process = libreswan_ipsec.LibreSwanProcess(cfg.CONF,
'foo-process-id',
self.vpnservice,
mock.ANY)
@mock.patch('os.remove')
@mock.patch('os.path.exists', return_value=True)
def test_ensure_configs_on_restart(self, exists_mock, remove_mock):