Clean stale lockfiles on service startup : fixes bug 785955
Adds cleanup_file_locks() to nova/utils, which cleans up stale locks left behind after process failures. Adds a call to clean up locks on service startup for nova-api, nova-cert, nova-compute, nova-network, nova-objectstore, and nova-scheduler. Adds tools/clean_file_locks.py, which can be used to manually clean stale locks. Change-Id: I752e0b24d3c7fc5f1dc290da355cbd7f430789b8
This commit is contained in:
parent
48c08d048b
commit
2fbccc0c69
|
@ -158,6 +158,7 @@ class Service(object):
|
|||
vcs_string = version.version_string_with_vcs()
|
||||
LOG.audit(_('Starting %(topic)s node (version %(vcs_string)s)'),
|
||||
{'topic': self.topic, 'vcs_string': vcs_string})
|
||||
utils.cleanup_file_locks()
|
||||
self.manager.init_host()
|
||||
self.model_disconnected = False
|
||||
ctxt = context.get_admin_context()
|
||||
|
@ -360,6 +361,7 @@ class WSGIService(object):
|
|||
:returns: None
|
||||
|
||||
"""
|
||||
utils.cleanup_file_locks()
|
||||
if self.manager:
|
||||
self.manager.init_host()
|
||||
self.server.start()
|
||||
|
|
|
@ -18,6 +18,8 @@ import __builtin__
|
|||
import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import os.path
|
||||
import socket
|
||||
import StringIO
|
||||
import tempfile
|
||||
|
||||
|
@ -811,3 +813,150 @@ class Iso8601TimeTest(test.TestCase):
|
|||
west = utils.parse_isotime(str)
|
||||
normed = utils.normalize_time(west)
|
||||
self._instaneous(normed, 2012, 2, 13, 23, 53, 07, 0)
|
||||
|
||||
|
||||
class TestLockCleanup(test.TestCase):
    """Unit tests for utils.cleanup_file_locks().

    Each test creates 'sentinel' files named ``<hostname>.<thread>-<pid>``
    (and optionally a ``nova-*.lock`` file) inside FLAGS.lock_path, runs
    the cleanup, and checks which files survive.  Every file a test
    creates is recorded and removed again in tearDown, so a failing
    assertion does not leave files behind for the next test.
    """

    def setUp(self):
        super(TestLockCleanup, self).setUp()

        self.pid = os.getpid()
        self.dead_pid = self._get_dead_pid()
        self.lock_name = 'nova-testlock'
        self.lock_file = os.path.join(FLAGS.lock_path,
                                      self.lock_name + '.lock')
        self.hostname = socket.gethostname()

        # Paths removed unconditionally in tearDown.
        self._created_files = [self.lock_file]

        # Start from a clean state.  A missing file is fine; any other
        # error (e.g. permissions) is a real problem and must surface.
        utils.delete_if_exists(self.lock_file)

    def tearDown(self):
        try:
            for path in self._created_files:
                utils.delete_if_exists(path)
        finally:
            super(TestLockCleanup, self).tearDown()

    def _get_dead_pid(self):
        """Return a pid for a process that does not exist.

        Walks downwards from our own pid until a pid without a /proc
        entry is found; gives up and returns 0 on reaching pid 1.
        """
        candidate_pid = self.pid - 1
        while os.path.exists(os.path.join('/proc', str(candidate_pid))):
            candidate_pid -= 1
            if candidate_pid == 1:
                return 0
        return candidate_pid

    def _get_sentinel_name(self, hostname, pid, thread='MainThread'):
        """Return the path of the sentinel file for hostname/thread/pid."""
        return os.path.join(FLAGS.lock_path,
                            '%s.%s-%d' % (hostname, thread, pid))

    def _touch(self, path):
        """Create an empty file at path and register it for cleanup."""
        self._created_files.append(path)
        open(path, 'wb').close()
        return path

    def _create_sentinel(self, hostname, pid, thread='MainThread'):
        """Create an empty sentinel file and return its path."""
        return self._touch(self._get_sentinel_name(hostname, pid, thread))

    def _create_thread_sentinels(self):
        """Create four sentinels for threads of this process.

        Returns the list of their paths.
        """
        return [self._create_sentinel(self.hostname, self.pid,
                                      'Default-%d' % index)
                for index in range(1, 5)]

    def test_clean_stale_locks(self):
        """verify locks for dead processes are cleaned up"""

        # create sentinels for two processes, us and a 'dead' one
        # no active lock
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_locks_active(self):
        """verify locks for dead processes are cleaned with an active lock """

        # create sentinels for two processes, us and a 'dead' one
        # create an active lock for us
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)
        os.link(sentinel1, self.lock_file)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_with_threads(self):
        """verify locks for multiple threads are cleaned up """

        # create sentinels for four threads in our process, and a 'dead'
        # process.  no lock.
        live_sentinels = self._create_thread_sentinels()
        dead_sentinel = self._create_sentinel(self.hostname, self.dead_pid,
                                              'Default-1')

        utils.cleanup_file_locks()

        for sentinel in live_sentinels:
            self.assertTrue(os.path.exists(sentinel))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(dead_sentinel))

    def test_clean_stale_with_threads_active(self):
        """verify multi-thread cleanup leaves an active lock in place """

        # create sentinels for four threads in our process, and a 'dead'
        # process.  the first thread holds the lock.
        live_sentinels = self._create_thread_sentinels()
        dead_sentinel = self._create_sentinel(self.hostname, self.dead_pid,
                                              'Default-1')

        os.link(live_sentinels[0], self.lock_file)

        utils.cleanup_file_locks()

        for sentinel in live_sentinels:
            self.assertTrue(os.path.exists(sentinel))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(dead_sentinel))

    def test_clean_bogus_lockfiles(self):
        """verify lockfiles are cleaned """

        # lock3 lacks the 'nova-' prefix, so the cleanup must leave it alone
        lock1 = self._touch(os.path.join(FLAGS.lock_path,
                                         'nova-testlock1.lock'))
        lock2 = self._touch(os.path.join(FLAGS.lock_path,
                                         'nova-testlock2.lock'))
        lock3 = self._touch(os.path.join(FLAGS.lock_path, 'testlock3.lock'))

        utils.cleanup_file_locks()

        self.assertFalse(os.path.exists(lock1))
        self.assertFalse(os.path.exists(lock2))
        self.assertTrue(os.path.exists(lock3))
|
||||
|
|
|
@ -26,7 +26,6 @@ import hashlib
|
|||
import inspect
|
||||
import itertools
|
||||
import json
|
||||
import lockfile
|
||||
import os
|
||||
import pyclbr
|
||||
import random
|
||||
|
@ -46,6 +45,7 @@ from eventlet import greenthread
|
|||
from eventlet import semaphore
|
||||
from eventlet.green import subprocess
|
||||
import iso8601
|
||||
import lockfile
|
||||
import netaddr
|
||||
|
||||
from nova import exception
|
||||
|
@ -857,6 +857,89 @@ def synchronized(name, external=False):
|
|||
return wrap
|
||||
|
||||
|
||||
def cleanup_file_locks():
    """Clean up stale locks left behind by process failures.

    The lockfile module, used by @synchronized, can leave stale lockfiles
    behind after process failure.  These locks can cause process hangs
    at startup, when a process deadlocks on a lock which will never
    be unlocked.

    Intended to be called at service startup.  Returns immediately when
    FLAGS.disable_process_locking is set.

    :raises OSError: if FLAGS.lock_path cannot be listed, or if a file
                     cannot be examined or removed for any reason other
                     than having already disappeared.

    """
    import errno

    # NOTE(mikeyp) this routine incorporates some internal knowledge
    #              from the lockfile module, and this logic really
    #              should be part of that module.
    #
    # cleanup logic:
    # 1) look for the lockfile module's 'sentinel' files, of the form
    #    hostname.[thread-.*]-pid, extract the pid.
    #    if pid doesn't match a running process, delete the file since
    #    it's from a dead process.
    # 2) check for the actual lockfiles. if lockfile exists with linkcount
    #    of 1, it's bogus, so delete it. A link count >= 2 indicates that
    #    there are probably sentinels still linked to it from active
    #    processes.  This check isn't perfect, but there is no way to
    #    reliably tell which sentinels refer to which lock in the
    #    lockfile implementation.

    if FLAGS.disable_process_locking:
        return

    # The hostname is escaped so that its dots only match literal dots.
    sentinel_re = re.compile(re.escape(socket.gethostname()) +
                             r'\..*-(\d+$)')
    # Anchored at the end so only names that really end in '.lock' match.
    lockfile_re = re.compile(r'nova-.*\.lock$')
    files = os.listdir(FLAGS.lock_path)

    # cleanup sentinels
    for filename in files:
        match = sentinel_re.match(filename)
        if match is None:
            continue
        pid = match.group(1)
        # NOTE: the message is translated first and interpolated
        #       afterwards; interpolating inside _() would make the
        #       catalog lookup miss.
        LOG.debug(_('Found sentinel %(filename)s for pid %(pid)s') %
                  {'filename': filename, 'pid': pid})
        # NOTE(review): liveness is decided by the presence of /proc/<pid>,
        #               so this presumably assumes a Linux-style /proc.
        if not os.path.exists(os.path.join('/proc', pid)):
            delete_if_exists(os.path.join(FLAGS.lock_path, filename))
            LOG.debug(_('Cleaned sentinel %(filename)s for pid %(pid)s') %
                      {'filename': filename, 'pid': pid})

    # cleanup lock files
    for filename in files:
        if lockfile_re.match(filename) is None:
            continue
        lock_path = os.path.join(FLAGS.lock_path, filename)
        try:
            stat_info = os.stat(lock_path)
        except OSError as exc:
            if exc.errno == errno.ENOENT:
                # removed by someone else since the directory was listed
                continue
            raise
        LOG.debug(_('Found lockfile %(file)s with link count %(count)d') %
                  {'file': filename, 'count': stat_info.st_nlink})
        if stat_info.st_nlink == 1:
            # no sentinel is hard-linked to this lock any more
            delete_if_exists(lock_path)
            LOG.debug(_('Cleaned lockfile %(file)s with link count '
                        '%(count)d') %
                      {'file': filename, 'count': stat_info.st_nlink})
|
||||
|
||||
|
||||
def delete_if_exists(pathname):
    """Delete a file, but ignore a 'file not found' error.

    :param pathname: path of the file to remove
    :raises OSError: if the removal fails for any reason other than the
                     file not existing

    """
    import errno

    try:
        os.unlink(pathname)
    except OSError as exc:
        # Only a missing file is tolerated; everything else propagates.
        if exc.errno != errno.ENOENT:
            raise
|
||||
|
||||
|
||||
def get_from_path(items, path):
|
||||
"""Returns a list of items matching the specified path.
|
||||
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2012 La Honda Research Center, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""clean_file_locks.py - Cleans stale interprocess locks
|
||||
|
||||
This routine can be used to find and delete stale lock files from
|
||||
nova's interprocess synchronization. It can be used safely while services
|
||||
are running.
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
import optparse
|
||||
|
||||
from nova import flags
|
||||
from nova import utils
|
||||
from nova import log
|
||||
|
||||
|
||||
# The logger is requested under the name 'nova.utils' rather than this
# script's own name; main() adjusts its level.
# NOTE(review): presumably this is the logger utils.cleanup_file_locks()
#               writes its debug messages to, so --verbose exposes them --
#               confirm against nova/utils.py.
LOG = log.getLogger('nova.utils')
# Global nova flags; main() reads FLAGS.lock_path from it.
FLAGS = flags.FLAGS
|
||||
|
||||
|
||||
def parse_options():
    """Parse the command line.

    Returns the ``(options, args)`` pair produced by optparse; the only
    recognised option is ``--verbose``.
    """
    option_parser = optparse.OptionParser('usage: %prog [options]')
    option_parser.add_option(
        '--verbose',
        action='store_true',
        help='List lock files found and deleted')
    return option_parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """Script entry point.

    Sets the logger level from the --verbose option (DEBUG when given,
    INFO otherwise) and then runs utils.cleanup_file_locks().
    """
    options = parse_options()[0]

    level = logging.DEBUG if options.verbose else logging.INFO
    LOG.logger.setLevel(level)

    LOG.info('Cleaning stale locks from %s' % FLAGS.lock_path)
    utils.cleanup_file_locks()
    LOG.info('Finished')
|
||||
|
||||
# Run the cleanup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue