Don't try to stop a stack on a dead engine

Check that an engine is still alive before trying to stop the stack
during a stack-delete.

Change-Id: I3ce87dcefd5c4b5c542c3dc0d288655433bf69c4
Closes-Bug: #1279010
This commit is contained in:
Jason Dunsmore 2014-02-11 12:30:31 -06:00
parent 2a2cf5231d
commit de5feffa6a
3 changed files with 54 additions and 17 deletions

View File

@ -577,31 +577,33 @@ class EngineService(service.Service):
lock = stack_lock.StackLock(cnxt, stack, self.engine_id)
acquire_result = lock.try_acquire()
# Successfully acquired lock
if acquire_result is None:
self.thread_group_mgr.start_with_acquired_lock(stack, lock,
stack.delete)
return
elif acquire_result == self.engine_id: # Current engine has the lock
# Current engine has the lock
elif acquire_result == self.engine_id:
self.thread_group_mgr.stop(stack.id)
# If the lock isn't released here, then the call to
# start_with_lock below will raise an ActionInProgress
# exception. Ideally, we wouldn't be calling another
# release() here, since it should be called as soon as the
# ThreadGroup is stopped. But apparently there's a race
# between release() the next call to lock.acquire().
db_api.stack_lock_release(stack.id, self.engine_id)
else: # Another engine has the lock
other_engine_id = acquire_result
stop_result = remote_stop(other_engine_id)
# Another active engine has the lock
elif stack_lock.StackLock.engine_alive(cnxt, acquire_result):
stop_result = remote_stop(acquire_result)
if stop_result is None:
logger.debug(_("Successfully stopped remote task on engine %s")
% other_engine_id)
% acquire_result)
else:
raise exception.StopActionFailed(stack_name=stack.name,
engine_id=other_engine_id)
engine_id=acquire_result)
# If the lock isn't released here, then the call to
# start_with_lock below will raise an ActionInProgress
# exception. Ideally, we wouldn't be calling another
# release() here, since it should be called as soon as the
# ThreadGroup is stopped. But apparently there's a race
# between release() the next call to lock.acquire().
db_api.stack_lock_release(stack.id, acquire_result)
# There may be additional resources that we don't know about
# if an update was in-progress when the stack was stopped, so

View File

@ -34,12 +34,13 @@ class StackLock(object):
self.engine_id = engine_id
self.listener = None
def _engine_alive(self, engine_id):
@staticmethod
def engine_alive(context, engine_id):
topic = engine_id
rpc = proxy.RpcProxy(topic, "1.0")
msg = rpc.make_msg("listening")
try:
return rpc.call(self.context, msg, topic=topic,
return rpc.call(context, msg, topic=topic,
timeout=cfg.CONF.engine_life_check_timeout)
except rpc_common.Timeout:
return False
@ -72,7 +73,7 @@ class StackLock(object):
return
if lock_engine_id == self.engine_id or \
self._engine_alive(lock_engine_id):
self.engine_alive(self.context, lock_engine_id):
logger.debug(_("Lock on stack %(stack)s is owned by engine "
"%(engine)s") % {'stack': self.stack.id,
'engine': lock_engine_id})

View File

@ -728,6 +728,10 @@ class StackServiceCreateUpdateDeleteTest(HeatTestCase):
self.m.StubOutWithMock(stack_lock.StackLock, 'try_acquire')
stack_lock.StackLock.try_acquire().AndReturn("other-engine-fake-uuid")
self.m.StubOutWithMock(stack_lock.StackLock, 'engine_alive')
stack_lock.StackLock.engine_alive(self.ctx, "other-engine-fake-uuid")\
.AndReturn(True)
rpc = proxy.RpcProxy("other-engine-fake-uuid", "1.0")
msg = rpc.make_msg("stop_stack", stack_identity=mox.IgnoreArg())
self.m.StubOutWithMock(proxy.RpcProxy, 'call')
@ -755,6 +759,10 @@ class StackServiceCreateUpdateDeleteTest(HeatTestCase):
self.m.StubOutWithMock(stack_lock.StackLock, 'try_acquire')
stack_lock.StackLock.try_acquire().AndReturn("other-engine-fake-uuid")
self.m.StubOutWithMock(stack_lock.StackLock, 'engine_alive')
stack_lock.StackLock.engine_alive(self.ctx, "other-engine-fake-uuid")\
.AndReturn(True)
rpc = proxy.RpcProxy("other-engine-fake-uuid", "1.0")
msg = rpc.make_msg("stop_stack", stack_identity=mox.IgnoreArg())
self.m.StubOutWithMock(proxy.RpcProxy, 'call')
@ -769,6 +777,32 @@ class StackServiceCreateUpdateDeleteTest(HeatTestCase):
self.assertIsNone(self.man.delete_stack(self.ctx, stack.identifier()))
self.m.VerifyAll()
def test_stack_delete_other_dead_engine_active_lock(self):
stack_name = 'service_delete_test_stack'
stack = get_wordpress_stack(stack_name, self.ctx)
sid = stack.store()
# Insert a fake lock into the db
db_api.stack_lock_create(stack.id, "other-engine-fake-uuid")
st = db_api.stack_get(self.ctx, sid)
self.m.StubOutWithMock(parser.Stack, 'load')
parser.Stack.load(self.ctx, stack=st).MultipleTimes().AndReturn(stack)
self.m.StubOutWithMock(stack_lock.StackLock, 'try_acquire')
stack_lock.StackLock.try_acquire().AndReturn("other-engine-fake-uuid")
self.m.StubOutWithMock(stack_lock.StackLock, 'engine_alive')
stack_lock.StackLock.engine_alive(self.ctx, "other-engine-fake-uuid")\
.AndReturn(False)
self.m.StubOutWithMock(stack_lock.StackLock, 'acquire')
stack_lock.StackLock.acquire().AndReturn(None)
self.m.ReplayAll()
self.assertIsNone(self.man.delete_stack(self.ctx, stack.identifier()))
self.m.VerifyAll()
def test_stack_update(self):
stack_name = 'service_update_test_stack'
params = {'foo': 'bar'}