Clean stale lockfiles on service startup : fixes bug 785955
Adds cleanup_file_locks() to nova/utils, which cleans up stale locks left behind after process failures. Adds a call to clean up locks on service startup for nova-api, nova-cert, nova-compute, nova-network, nova-objectstore, and nova-scheduler. Adds tools/clean_file_locks.py, which can be used to manually clean stale locks. Change-Id: I752e0b24d3c7fc5f1dc290da355cbd7f430789b8
This commit is contained in:
parent
48c08d048b
commit
2fbccc0c69
|
@ -158,6 +158,7 @@ class Service(object):
|
||||||
vcs_string = version.version_string_with_vcs()
|
vcs_string = version.version_string_with_vcs()
|
||||||
LOG.audit(_('Starting %(topic)s node (version %(vcs_string)s)'),
|
LOG.audit(_('Starting %(topic)s node (version %(vcs_string)s)'),
|
||||||
{'topic': self.topic, 'vcs_string': vcs_string})
|
{'topic': self.topic, 'vcs_string': vcs_string})
|
||||||
|
utils.cleanup_file_locks()
|
||||||
self.manager.init_host()
|
self.manager.init_host()
|
||||||
self.model_disconnected = False
|
self.model_disconnected = False
|
||||||
ctxt = context.get_admin_context()
|
ctxt = context.get_admin_context()
|
||||||
|
@ -360,6 +361,7 @@ class WSGIService(object):
|
||||||
:returns: None
|
:returns: None
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
utils.cleanup_file_locks()
|
||||||
if self.manager:
|
if self.manager:
|
||||||
self.manager.init_host()
|
self.manager.init_host()
|
||||||
self.server.start()
|
self.server.start()
|
||||||
|
|
|
@ -18,6 +18,8 @@ import __builtin__
|
||||||
import datetime
|
import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
|
import os.path
|
||||||
|
import socket
|
||||||
import StringIO
|
import StringIO
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
@ -811,3 +813,150 @@ class Iso8601TimeTest(test.TestCase):
|
||||||
west = utils.parse_isotime(str)
|
west = utils.parse_isotime(str)
|
||||||
normed = utils.normalize_time(west)
|
normed = utils.normalize_time(west)
|
||||||
self._instaneous(normed, 2012, 2, 13, 23, 53, 07, 0)
|
self._instaneous(normed, 2012, 2, 13, 23, 53, 07, 0)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLockCleanup(test.TestCase):
    """Unit tests for utils.cleanup_file_locks()."""

    def setUp(self):
        super(TestLockCleanup, self).setUp()

        self.pid = os.getpid()
        self.dead_pid = self._get_dead_pid()
        self.lock_name = 'nova-testlock'
        self.lock_file = os.path.join(FLAGS.lock_path,
                                      self.lock_name + '.lock')
        self.hostname = socket.gethostname()
        # Every file a test creates is recorded here so tearDown() can
        # remove it even when an assertion fails part-way through a test.
        self._created = [self.lock_file]
        # Start from a clean slate.  Only "no such file" is ignored; any
        # other failure (e.g. permissions) should fail the test loudly.
        utils.delete_if_exists(self.lock_file)

    def tearDown(self):
        for path in self._created:
            utils.delete_if_exists(path)
        super(TestLockCleanup, self).tearDown()

    def _get_dead_pid(self):
        """Return a pid that has no entry under /proc.

        Searches downwards from our own pid and returns 0 if the search
        reaches pid 1.
        """
        candidate_pid = self.pid - 1
        while os.path.exists(os.path.join('/proc', str(candidate_pid))):
            candidate_pid -= 1
            if candidate_pid == 1:
                return 0
        return candidate_pid

    def _get_sentinel_name(self, hostname, pid, thread='MainThread'):
        """Return the path of a sentinel named <hostname>.<thread>-<pid>."""
        return os.path.join(FLAGS.lock_path,
                            '%s.%s-%d' % (hostname, thread, pid))

    def _create_file(self, path):
        """Create an empty file at path and register it for cleanup."""
        open(path, 'wb').close()
        self._created.append(path)
        return path

    def _create_sentinel(self, hostname, pid, thread='MainThread'):
        """Create an empty sentinel file and return its path."""
        return self._create_file(
            self._get_sentinel_name(hostname, pid, thread))

    def test_clean_stale_locks(self):
        """Verify locks for dead processes are cleaned up."""

        # create sentinels for two processes, us and a 'dead' one
        # no active lock
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_locks_active(self):
        """Verify locks for dead processes are cleaned with an active lock."""

        # create sentinels for two processes, us and a 'dead' one
        # create an active lock for us
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)
        os.link(sentinel1, self.lock_file)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_with_threads(self):
        """Verify locks for multiple threads are cleaned up."""

        # create sentinels for four threads in our process, and a 'dead'
        # process.  no lock.
        sentinel1 = self._create_sentinel(self.hostname, self.pid, 'Default-1')
        sentinel2 = self._create_sentinel(self.hostname, self.pid, 'Default-2')
        sentinel3 = self._create_sentinel(self.hostname, self.pid, 'Default-3')
        sentinel4 = self._create_sentinel(self.hostname, self.pid, 'Default-4')
        sentinel5 = self._create_sentinel(self.hostname, self.dead_pid,
                                          'Default-1')

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertTrue(os.path.exists(sentinel2))
        self.assertTrue(os.path.exists(sentinel3))
        self.assertTrue(os.path.exists(sentinel4))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel5))

    def test_clean_stale_with_threads_active(self):
        """Verify multi-thread locks are cleaned up with an active lock."""

        # create sentinels for four threads in our process, and a 'dead'
        # process, plus an active lock held by one of our threads
        sentinel1 = self._create_sentinel(self.hostname, self.pid, 'Default-1')
        sentinel2 = self._create_sentinel(self.hostname, self.pid, 'Default-2')
        sentinel3 = self._create_sentinel(self.hostname, self.pid, 'Default-3')
        sentinel4 = self._create_sentinel(self.hostname, self.pid, 'Default-4')
        sentinel5 = self._create_sentinel(self.hostname, self.dead_pid,
                                          'Default-1')

        os.link(sentinel1, self.lock_file)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertTrue(os.path.exists(sentinel2))
        self.assertTrue(os.path.exists(sentinel3))
        self.assertTrue(os.path.exists(sentinel4))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel5))

    def test_clean_bogus_lockfiles(self):
        """Verify unlinked nova-*.lock files are cleaned, others are kept."""

        lock1 = self._create_file(
            os.path.join(FLAGS.lock_path, 'nova-testlock1.lock'))
        lock2 = self._create_file(
            os.path.join(FLAGS.lock_path, 'nova-testlock2.lock'))
        # no 'nova-' prefix, so the cleanup must leave this one alone
        lock3 = self._create_file(
            os.path.join(FLAGS.lock_path, 'testlock3.lock'))

        utils.cleanup_file_locks()

        self.assertFalse(os.path.exists(lock1))
        self.assertFalse(os.path.exists(lock2))
        self.assertTrue(os.path.exists(lock3))
|
||||||
|
|
|
@ -26,7 +26,6 @@ import hashlib
|
||||||
import inspect
|
import inspect
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
import lockfile
|
|
||||||
import os
|
import os
|
||||||
import pyclbr
|
import pyclbr
|
||||||
import random
|
import random
|
||||||
|
@ -46,6 +45,7 @@ from eventlet import greenthread
|
||||||
from eventlet import semaphore
|
from eventlet import semaphore
|
||||||
from eventlet.green import subprocess
|
from eventlet.green import subprocess
|
||||||
import iso8601
|
import iso8601
|
||||||
|
import lockfile
|
||||||
import netaddr
|
import netaddr
|
||||||
|
|
||||||
from nova import exception
|
from nova import exception
|
||||||
|
@ -857,6 +857,89 @@ def synchronized(name, external=False):
|
||||||
return wrap
|
return wrap
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_file_locks():
    """Clean up stale locks left behind by process failures.

    The lockfile module, used by @synchronized, can leave stale lockfiles
    behind after process failure.  These locks can cause process hangs
    at startup, when a process deadlocks on a lock which will never
    be unlocked.

    Intended to be called at service startup.  Does nothing when
    FLAGS.disable_process_locking is set.

    :raises OSError: if FLAGS.lock_path cannot be listed, or if a file in
                     it cannot be examined or removed for any reason other
                     than having already disappeared.
    """

    # NOTE(mikeyp) this routine incorporates some internal knowledge
    #              from the lockfile module, and this logic really
    #              should be part of that module.
    #
    # cleanup logic:
    # 1) look for the lockfile module's 'sentinel' files, of the form
    #    hostname.[thread-.*]-pid, extract the pid.
    #    if pid doesn't match a running process, delete the file since
    #    it's from a dead process.
    # 2) check for the actual lockfiles.  if lockfile exists with linkcount
    #    of 1, it's bogus, so delete it.  A link count >= 2 indicates that
    #    there are probably sentinels still linked to it from active
    #    processes.  This check isn't perfect, but there is no way to
    #    reliably tell which sentinels refer to which lock in the
    #    lockfile implementation.

    # Imported locally so this function does not rely on a module-level
    # import of errno.
    import errno

    if FLAGS.disable_process_locking:
        return

    # The hostname is literal text (and usually contains dots), so it is
    # escaped before being embedded in the pattern.
    sentinel_re = re.compile(re.escape(socket.gethostname()) +
                             r'\..*-(\d+)$')
    # Anchored at the end so only real '<name>.lock' files are candidates.
    lockfile_re = re.compile(r'nova-.*\.lock$')
    files = os.listdir(FLAGS.lock_path)

    # cleanup sentinels
    for filename in files:
        match = sentinel_re.match(filename)
        if match is None:
            continue
        pid = match.group(1)
        # The message is translated first and interpolated afterwards, so
        # the catalog lookup sees the untouched template.
        LOG.debug(_('Found sentinel %(filename)s for pid %(pid)s') %
                  {'filename': filename, 'pid': pid})
        # NOTE: liveness is decided solely by the presence of /proc/<pid>.
        if not os.path.exists(os.path.join('/proc', pid)):
            delete_if_exists(os.path.join(FLAGS.lock_path, filename))
            LOG.debug(_('Cleaned sentinel %(filename)s for pid %(pid)s') %
                      {'filename': filename, 'pid': pid})

    # cleanup lock files
    for filename in files:
        if lockfile_re.match(filename) is None:
            continue
        path = os.path.join(FLAGS.lock_path, filename)
        try:
            stat_info = os.stat(path)
        except OSError as exc:
            # The file vanished since the directory was listed.
            if exc.errno == errno.ENOENT:
                continue
            raise
        LOG.debug(_('Found lockfile %(file)s with link count %(count)d') %
                  {'file': filename, 'count': stat_info.st_nlink})
        if stat_info.st_nlink == 1:
            delete_if_exists(path)
            LOG.debug(_('Cleaned lockfile %(file)s with link count '
                        '%(count)d') %
                      {'file': filename, 'count': stat_info.st_nlink})
|
||||||
|
|
||||||
|
|
||||||
|
def delete_if_exists(pathname):
    """Delete a file, ignoring the error raised when it does not exist.

    :param pathname: path of the file to remove.
    :raises OSError: for any failure other than the file being absent
                     (for example, insufficient permissions).
    """
    # Imported locally so this function does not rely on a module-level
    # import of errno.
    import errno

    try:
        os.unlink(pathname)
    except OSError as exc:
        # A missing file is the desired end state; anything else is a
        # real error and is passed on to the caller.
        if exc.errno != errno.ENOENT:
            raise
|
||||||
|
|
||||||
|
|
||||||
def get_from_path(items, path):
|
def get_from_path(items, path):
|
||||||
"""Returns a list of items matching the specified path.
|
"""Returns a list of items matching the specified path.
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,63 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Copyright 2012 La Honda Research Center, Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""clean_file_locks.py - Cleans stale interprocess locks
|
||||||
|
|
||||||
|
This routine can be used to find and delete stale lock files from
|
||||||
|
nova's interprocess synchronization. It can be used safely while services
|
||||||
|
are running.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import optparse
|
||||||
|
|
||||||
|
from nova import flags
|
||||||
|
from nova import utils
|
||||||
|
from nova import log
|
||||||
|
|
||||||
|
|
||||||
|
# The logger is requested under the name 'nova.utils' rather than a name
# of this script's own.
# NOTE(review): presumably so that the level set in main() also governs
# the debug messages logged by utils.cleanup_file_locks() -- confirm
# against the logger name used in nova/utils.py.
LOG = log.getLogger('nova.utils')
|
||||||
|
# Global flag object; main() reads FLAGS.lock_path from it.
FLAGS = flags.FLAGS
|
||||||
|
|
||||||
|
|
||||||
|
def parse_options():
    """Parse the command line.

    Returns the (options, args) pair produced by optparse;
    options.verbose is true when --verbose was given.
    """
    option_parser = optparse.OptionParser('usage: %prog [options]')
    option_parser.add_option('--verbose',
                             help='List lock files found and deleted',
                             action='store_true')
    parsed_options, positional = option_parser.parse_args()
    return parsed_options, positional
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Clean stale lock files, logging progress along the way.

    The logger level is DEBUG when --verbose was given and INFO
    otherwise; the cleanup itself is done by utils.cleanup_file_locks().
    """
    opts = parse_options()[0]

    level = logging.DEBUG if opts.verbose else logging.INFO
    LOG.logger.setLevel(level)

    LOG.info('Cleaning stale locks from %s' % FLAGS.lock_path)
    utils.cleanup_file_locks()
    LOG.info('Finished')
|
||||||
|
|
||||||
|
# Run the cleanup only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
|
||||||
|
main()
|
Loading…
Reference in New Issue