class TestLockCleanup(test.TestCase):
    """Unit tests for utils.cleanup_file_locks()."""

    def setUp(self):
        """Record pids/paths used by the tests and remove any old lock."""
        super(TestLockCleanup, self).setUp()

        self.pid = os.getpid()
        self.dead_pid = self._get_dead_pid()
        self.lock_name = 'nova-testlock'
        self.lock_file = os.path.join(FLAGS.lock_path,
                                      self.lock_name + '.lock')
        self.hostname = socket.gethostname()
        # Every file a test creates is recorded here so tearDown can remove
        # it even when an assertion fails part-way through the test.
        self._created = []
        # Start from a clean slate. Only "file not found" is ignored; any
        # other OSError (e.g. permissions) is a real problem and propagates.
        utils.delete_if_exists(self.lock_file)

    def tearDown(self):
        """Remove whatever the test created, whether it passed or not."""
        for path in self._created:
            utils.delete_if_exists(path)
        super(TestLockCleanup, self).tearDown()

    def _get_dead_pid(self):
        """Get a pid for a process that does not exist.

        Walks downwards from our own pid until a pid without a /proc entry
        is found. Returns 0 if the walk reaches pid 1 first.
        """

        candidate_pid = self.pid - 1
        while os.path.exists(os.path.join('/proc', str(candidate_pid))):
            candidate_pid -= 1
            if candidate_pid == 1:
                return 0
        return candidate_pid

    def _get_sentinel_name(self, hostname, pid, thread='MainThread'):
        """Return the sentinel path for the given host, thread and pid."""
        return os.path.join(FLAGS.lock_path,
                            '%s.%s-%d' % (hostname, thread, pid))

    def _touch(self, path):
        """Create an empty file and register it for removal in tearDown."""
        open(path, 'wb').close()
        self._created.append(path)
        return path

    def _create_sentinel(self, hostname, pid, thread='MainThread'):
        """Create an empty sentinel file and return its path."""
        return self._touch(self._get_sentinel_name(hostname, pid, thread))

    def _link_lock(self, sentinel):
        """Hard-link the test lock file to a sentinel (an 'active' lock)."""
        os.link(sentinel, self.lock_file)
        self._created.append(self.lock_file)

    def _create_thread_sentinels(self):
        """Create four sentinels for our pid and one for the dead pid.

        Returns a (live_sentinels, dead_sentinel) tuple.
        """
        live = [self._create_sentinel(self.hostname, self.pid,
                                      'Default-%d' % index)
                for index in range(1, 5)]
        dead = self._create_sentinel(self.hostname, self.dead_pid,
                                     'Default-1')
        return live, dead

    def test_clean_stale_locks(self):
        """Verify locks for dead processes are cleaned up."""

        # create sentinels for two processes, us and a 'dead' one
        # no active lock
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_locks_active(self):
        """Verify locks for dead processes are cleaned with an active lock."""

        # create sentinels for two processes, us and a 'dead' one
        # create an active lock for us
        sentinel1 = self._create_sentinel(self.hostname, self.pid)
        sentinel2 = self._create_sentinel(self.hostname, self.dead_pid)
        self._link_lock(sentinel1)

        utils.cleanup_file_locks()

        self.assertTrue(os.path.exists(sentinel1))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(sentinel2))

    def test_clean_stale_with_threads(self):
        """Verify locks for multiple threads are cleaned up."""

        # create sentinels for four threads in our process, and a 'dead'
        # process. no lock.
        live, dead = self._create_thread_sentinels()

        utils.cleanup_file_locks()

        for sentinel in live:
            self.assertTrue(os.path.exists(sentinel))
        self.assertFalse(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(dead))

    def test_clean_stale_with_threads_active(self):
        """Verify multi-thread cleanup leaves an active lock in place."""

        # create sentinels for four threads in our process, and a 'dead'
        # process; the first live sentinel holds the lock
        live, dead = self._create_thread_sentinels()
        self._link_lock(live[0])

        utils.cleanup_file_locks()

        for sentinel in live:
            self.assertTrue(os.path.exists(sentinel))
        self.assertTrue(os.path.exists(self.lock_file))
        self.assertFalse(os.path.exists(dead))

    def test_clean_bogus_lockfiles(self):
        """Verify unlinked nova-* lockfiles are cleaned, others are kept."""

        lock1 = self._touch(os.path.join(FLAGS.lock_path,
                                         'nova-testlock1.lock'))
        lock2 = self._touch(os.path.join(FLAGS.lock_path,
                                         'nova-testlock2.lock'))
        # no 'nova-' prefix, so cleanup must leave this one alone
        lock3 = self._touch(os.path.join(FLAGS.lock_path, 'testlock3.lock'))

        utils.cleanup_file_locks()

        self.assertFalse(os.path.exists(lock1))
        self.assertFalse(os.path.exists(lock2))
        self.assertTrue(os.path.exists(lock3))
def cleanup_file_locks():
    """Clean up stale locks left behind by process failures.

    The lockfile module, used by @synchronized, can leave stale lockfiles
    behind after process failure. These locks can cause process hangs
    at startup, when a process deadlocks on a lock which will never
    be unlocked.

    Intended to be called at service startup.

    Does nothing when FLAGS.disable_process_locking is set, or when the
    FLAGS.lock_path directory does not exist.

    :raises OSError: if the lock directory or one of its files cannot be
                     read or removed for a reason other than "not found"
    """

    # NOTE(mikeyp) this routine incorporates some internal knowledge
    #              from the lockfile module, and this logic really
    #              should be part of that module.
    #
    # cleanup logic:
    # 1) look for the lockfile module's 'sentinel' files, of the form
    #    hostname.[thread-.*]-pid, extract the pid.
    #    if pid doesn't match a running process, delete the file since
    #    it's from a dead process.
    # 2) check for the actual lockfiles. if lockfile exists with linkcount
    #    of 1, it's bogus, so delete it. A link count >= 2 indicates that
    #    there are probably sentinels still linked to it from active
    #    processes.  This check isn't perfect, but there is no way to
    #    reliably tell which sentinels refer to which lock in the
    #    lockfile implementation.

    # function-scope import: errno is not known to be imported at module
    # level, and the symbolic constant replaces the magic number 2
    import errno

    if FLAGS.disable_process_locking:
        return

    lock_dir = FLAGS.lock_path
    try:
        files = os.listdir(lock_dir)
    except OSError as e:
        if e.errno == errno.ENOENT:
            # no lock directory means there is nothing stale to clean
            return
        raise

    # The hostname is literal text: escape it so that the dots of a fully
    # qualified name do not act as regex wildcards.
    sentinel_re = re.compile(re.escape(socket.gethostname()) +
                             r'\..*-(\d+)$')
    # Anchored at the end so only real '<name>.lock' files are candidates,
    # not e.g. 'nova-foo.lock.bak'.
    lockfile_re = re.compile(r'nova-.*\.lock$')

    # cleanup sentinels -- must run before the lockfile pass, because
    # removing a dead sentinel is what drops a stale lock's link count to 1
    for filename in files:
        match = sentinel_re.match(filename)
        if match is None:
            continue
        pid = match.group(1)
        # translate the template first, interpolate afterwards
        LOG.debug(_('Found sentinel %(filename)s for pid %(pid)s') %
                  {'filename': filename, 'pid': pid})
        if not os.path.exists(os.path.join('/proc', pid)):
            delete_if_exists(os.path.join(lock_dir, filename))
            LOG.debug(_('Cleaned sentinel %(filename)s for pid %(pid)s') %
                      {'filename': filename, 'pid': pid})

    # cleanup lock files
    for filename in files:
        if lockfile_re.match(filename) is None:
            continue
        lock_path = os.path.join(lock_dir, filename)
        try:
            link_count = os.stat(lock_path).st_nlink
        except OSError as e:
            if e.errno == errno.ENOENT:
                # removed by someone else since the directory was listed
                continue
            raise
        LOG.debug(_('Found lockfile %(file)s with link count %(count)d') %
                  {'file': filename, 'count': link_count})
        if link_count == 1:
            delete_if_exists(lock_path)
            LOG.debug(_('Cleaned lockfile %(file)s with link count '
                        '%(count)d') %
                      {'file': filename, 'count': link_count})


def delete_if_exists(pathname):
    """Delete a file, but ignore file not found error.

    :param pathname: path of the file to remove
    :raises OSError: for any failure other than the file not existing
    """

    import errno

    try:
        os.unlink(pathname)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise
#!/usr/bin/env python

# Copyright 2012 La Honda Research Center, Inc.
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

"""clean_file_locks.py - Cleans stale interprocess locks

This routine can be used to find and delete stale lock files from
nova's interprocess synchronization.  It can be used safely while services
are running.

"""

import logging
import optparse

from nova import flags
from nova import log
from nova import utils


# The 'nova.utils' logger is used so that the level chosen in main() also
# governs the debug messages emitted by utils.cleanup_file_locks().
LOG = log.getLogger('nova.utils')
FLAGS = flags.FLAGS


def parse_options(argv=None):
    """Process command line options.

    :param argv: list of arguments to parse, without the program name;
                 when None, optparse falls back to sys.argv[1:]
    :returns: the (options, args) tuple produced by optparse
    """

    parser = optparse.OptionParser('usage: %prog [options]')
    parser.add_option('--verbose', action='store_true',
                      help='List lock files found and deleted')

    return parser.parse_args(argv)


def main(argv=None):
    """Clean stale lock files from FLAGS.lock_path.

    :param argv: optional argument list forwarded to parse_options()
    """
    options = parse_options(argv)[0]

    # --verbose exposes the per-file debug messages logged during cleanup
    if options.verbose:
        LOG.logger.setLevel(logging.DEBUG)
    else:
        LOG.logger.setLevel(logging.INFO)

    # NOTE(review): nothing in this script parses a flag file or passes
    # argv to FLAGS, so lock_path is presumably the built-in default
    # unless nova.flags loads configuration on its own -- confirm.
    LOG.info('Cleaning stale locks from %s', FLAGS.lock_path)
    utils.cleanup_file_locks()
    LOG.info('Finished')


if __name__ == '__main__':
    main()