Clean stale lockfiles on service startup : fixes bug 785955

Adds cleanup_file_locks() to nova/utils, which cleans up stale locks
left behind after process failures.

Adds a call to clean up locks on service startup for nova-api, nova-cert,
nova-compute, nova-network, nova-objectstore, and nova-scheduler.

Adds tools/clean_file_locks.py, which can be used to manually clean
stale locks.

Change-Id: I752e0b24d3c7fc5f1dc290da355cbd7f430789b8
This commit is contained in:
Mike Pittaro 2012-02-24 09:56:26 -08:00
parent 48c08d048b
commit 2fbccc0c69
4 changed files with 298 additions and 1 deletions

View File

@ -158,6 +158,7 @@ class Service(object):
vcs_string = version.version_string_with_vcs()
LOG.audit(_('Starting %(topic)s node (version %(vcs_string)s)'),
{'topic': self.topic, 'vcs_string': vcs_string})
utils.cleanup_file_locks()
self.manager.init_host()
self.model_disconnected = False
ctxt = context.get_admin_context()
@ -360,6 +361,7 @@ class WSGIService(object):
:returns: None
"""
utils.cleanup_file_locks()
if self.manager:
self.manager.init_host()
self.server.start()

View File

@ -18,6 +18,8 @@ import __builtin__
import datetime
import hashlib
import os
import os.path
import socket
import StringIO
import tempfile
@ -811,3 +813,150 @@ class Iso8601TimeTest(test.TestCase):
west = utils.parse_isotime(str)
normed = utils.normalize_time(west)
self._instaneous(normed, 2012, 2, 13, 23, 53, 07, 0)
class TestLockCleanup(test.TestCase):
    """Unit tests for utils.cleanup_file_locks()."""

    def setUp(self):
        super(TestLockCleanup, self).setUp()

        # every file a test creates is recorded here so that tearDown can
        # remove it even when an assertion fails part-way through a test
        self._created = []

        self.pid = os.getpid()
        self.dead_pid = self._get_dead_pid()
        self.lock_name = 'nova-testlock'
        self.lock_file = os.path.join(FLAGS.lock_path,
                                      self.lock_name + '.lock')
        self.hostname = socket.gethostname()

        # start without a leftover lock from an earlier run; a missing
        # file is fine, any other failure is a real error and propagates
        utils.delete_if_exists(self.lock_file)

    def tearDown(self):
        try:
            for path in self._created:
                utils.delete_if_exists(path)
        finally:
            super(TestLockCleanup, self).tearDown()

    def _get_dead_pid(self):
        """Return a pid with no /proc entry, or 0 if none is found."""
        candidate_pid = self.pid - 1
        while os.path.exists(os.path.join('/proc', str(candidate_pid))):
            candidate_pid -= 1
            if candidate_pid == 1:
                return 0
        return candidate_pid

    def _get_sentinel_name(self, hostname, pid, thread='MainThread'):
        """Build the path of a sentinel file: hostname.thread-pid."""
        return os.path.join(FLAGS.lock_path,
                            '%s.%s-%d' % (hostname, thread, pid))

    def _create_file(self, path):
        """Create an empty file and register it for removal in tearDown."""
        open(path, 'wb').close()
        self._created.append(path)
        return path

    def _create_sentinel(self, hostname, pid, thread='MainThread'):
        """Create an empty sentinel file and return its path."""
        return self._create_file(
            self._get_sentinel_name(hostname, pid, thread))

    def _link_lock(self, sentinel):
        """Hard-link the lock file to a sentinel, making the lock active."""
        os.link(sentinel, self.lock_file)
        self._created.append(self.lock_file)

    def test_clean_stale_locks(self):
        """Verify locks for dead processes are cleaned up."""
        # create sentinels for two processes, us and a 'dead' one
        # no active lock
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_locks_active(self):
        """Verify locks for dead processes are cleaned with an active lock."""
        # create sentinels for two processes, us and a 'dead' one
        # create an active lock for us
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)
        self._link_lock(sentinel1)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_with_threads(self):
        """Verify locks for multiple threads are cleaned up."""
        # create sentinels for four threads in our process, and a 'dead'
        # process.  no lock.
        live = [self._create_sentinel(self.hostname, self.pid,
                                      'Default-%d' % i)
                for i in range(1, 5)]
        dead = self._create_sentinel(self.hostname, self.dead_pid,
                                     'Default-1')

        utils.cleanup_file_locks()

        for sentinel in live:
            self.assertTrue(os.path.exists(sentinel))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(dead))

    def test_clean_stale_with_threads_active(self):
        """Verify locks for multiple threads are cleaned, with a lock held."""
        # create sentinels for four threads in our process, and a 'dead'
        # process; the first live sentinel holds the lock
        live = [self._create_sentinel(self.hostname, self.pid,
                                      'Default-%d' % i)
                for i in range(1, 5)]
        dead = self._create_sentinel(self.hostname, self.dead_pid,
                                     'Default-1')
        self._link_lock(live[0])

        utils.cleanup_file_locks()

        for sentinel in live:
            self.assertTrue(os.path.exists(sentinel))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(dead))

    def test_clean_bogus_lockfiles(self):
        """Verify unlinked nova lockfiles are cleaned, others are kept."""
        lock1 = self._create_file(
            os.path.join(FLAGS.lock_path, 'nova-testlock1.lock'))
        lock2 = self._create_file(
            os.path.join(FLAGS.lock_path, 'nova-testlock2.lock'))
        # no 'nova-' prefix, so cleanup must leave this one alone
        lock3 = self._create_file(
            os.path.join(FLAGS.lock_path, 'testlock3.lock'))

        utils.cleanup_file_locks()

        self.assertFalse(os.path.exists(lock1))
        self.assertFalse(os.path.exists(lock2))
        self.assertTrue(os.path.exists(lock3))

View File

@ -26,7 +26,6 @@ import hashlib
import inspect
import itertools
import json
import lockfile
import os
import pyclbr
import random
@ -46,6 +45,7 @@ from eventlet import greenthread
from eventlet import semaphore
from eventlet.green import subprocess
import iso8601
import lockfile
import netaddr
from nova import exception
@ -857,6 +857,89 @@ def synchronized(name, external=False):
return wrap
def cleanup_file_locks():
    """Clean up stale locks left behind by process failures.

    The lockfile module, used by @synchronized, can leave stale lockfiles
    behind after process failure. These locks can cause process hangs
    at startup, when a process deadlocks on a lock which will never
    be unlocked.

    Intended to be called at service startup.  Does nothing when
    FLAGS.disable_process_locking is set.

    :returns: None
    :raises OSError: if FLAGS.lock_path cannot be listed, or a file in it
                     cannot be examined or removed for a reason other
                     than having already disappeared
    """
    # function-scope import so this block does not depend on the module's
    # import list
    import errno

    # NOTE(mikeyp) this routine incorporates some internal knowledge
    #              from the lockfile module, and this logic really
    #              should be part of that module.
    #
    # cleanup logic:
    # 1) look for the lockfile module's 'sentinel' files, of the form
    #    hostname.[thread-.*]-pid, extract the pid.
    #    if pid doesn't match a running process, delete the file since
    #    it's from a dead process.
    # 2) check for the actual lockfiles. if lockfile exists with linkcount
    #    of 1, it's bogus, so delete it. A link count >= 2 indicates that
    #    there are probably sentinels still linked to it from active
    #    processes.  This check isn't perfect, but there is no way to
    #    reliably tell which sentinels refer to which lock in the
    #    lockfile implementation.

    if FLAGS.disable_process_locking:
        return

    hostname = socket.gethostname()
    # escape the hostname: its dots must match literally, not any character
    sentinel_re = re.compile(re.escape(hostname) + r'\..*-(\d+$)')
    # anchored so that only names ending in '.lock' are treated as lockfiles
    lockfile_re = re.compile(r'nova-.*\.lock$')
    files = os.listdir(FLAGS.lock_path)

    # cleanup sentinels
    for filename in files:
        match = sentinel_re.match(filename)
        if match is None:
            continue
        pid = match.group(1)
        # interpolate after _() so the catalogue lookup sees the template
        LOG.debug(_('Found sentinel %(filename)s for pid %(pid)s') %
                  {'filename': filename, 'pid': pid})
        # NOTE: liveness is judged solely by the presence of /proc/<pid>
        if not os.path.exists(os.path.join('/proc', pid)):
            delete_if_exists(os.path.join(FLAGS.lock_path, filename))
            LOG.debug(_('Cleaned sentinel %(filename)s for pid %(pid)s') %
                      {'filename': filename, 'pid': pid})

    # cleanup lock files
    for filename in files:
        if lockfile_re.match(filename) is None:
            continue
        lock_path = os.path.join(FLAGS.lock_path, filename)
        try:
            stat_info = os.stat(lock_path)
        except OSError as e:
            if e.errno == errno.ENOENT:
                # vanished since the directory was listed
                continue
            raise
        LOG.debug(_('Found lockfile %(file)s with link count %(count)d') %
                  {'file': filename, 'count': stat_info.st_nlink})
        if stat_info.st_nlink == 1:
            delete_if_exists(lock_path)
            LOG.debug(_('Cleaned lockfile %(file)s with link count '
                        '%(count)d') %
                      {'file': filename, 'count': stat_info.st_nlink})
def delete_if_exists(pathname):
    """Delete a file, ignoring the error raised when it does not exist.

    :param pathname: path of the file to remove
    :returns: None
    :raises OSError: for any failure other than the file being absent
    """
    # function-scope import so this helper does not depend on the module's
    # import list
    import errno

    try:
        os.unlink(pathname)
    except OSError as e:
        # a missing file is the desired end state; anything else is real
        if e.errno != errno.ENOENT:
            raise
def get_from_path(items, path):
"""Returns a list of items matching the specified path.

63
tools/clean_file_locks.py Executable file
View File

@ -0,0 +1,63 @@
#!/usr/bin/env python
# Copyright 2012 La Honda Research Center, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""clean_file_locks.py - Cleans stale interprocess locks
This routine can be used to find and delete stale lock files from
nova's interprocess synchronization.  It can be used safely while services
are running.
"""
import logging
import optparse
from nova import flags
from nova import utils
from nova import log
# Logger registered under the name 'nova.utils' rather than this script's
# own name.
# NOTE(review): presumably so that the level set in main() also governs the
# messages logged by utils.cleanup_file_locks() -- confirm against
# nova/utils.py.
LOG = log.getLogger('nova.utils')
# Global nova flags object; main() reads lock_path from it.
FLAGS = flags.FLAGS
def parse_options():
    """Parse the command line.

    The only option defined is --verbose.

    :returns: the (options, args) pair produced by optparse
    """
    option_parser = optparse.OptionParser('usage: %prog [options]')
    option_parser.add_option('--verbose', action='store_true',
                             help='List lock files found and deleted')
    return option_parser.parse_args()
def main():
    """Set the log level from --verbose, then clean stale lock files."""
    options, _args = parse_options()

    # --verbose turns on the debug messages emitted during cleanup
    level = logging.DEBUG if options.verbose else logging.INFO
    LOG.logger.setLevel(level)

    LOG.info('Cleaning stale locks from %s' % FLAGS.lock_path)
    utils.cleanup_file_locks()
    LOG.info('Finished')
# Run the cleanup only when executed as a script, not when imported.
if __name__ == '__main__':
    main()