Avoid package upgrade collisions

During an upgrade to a new cloud archive and a new rabbitmq-server
package, the package update will stop the rabbitmq service. If two or
more units do this at nearly the same time, the services will fail to
restart.

This change adds a call to the cluster_wait function before a package
upgrade to avoid collisions.

Fix relation_get in cluster_with, which was causing extraneous hooks to execute.
Fix nagios stats collection logic.

Change-Id: I8d9aa38d917583fa45b5570eca9a78c813303e2b
Closes-Bug: #1778829
This commit is contained in:
David Ames 2018-07-10 10:48:21 -07:00
parent fd3e48f96c
commit 86edf1e5d8
3 changed files with 12 additions and 5 deletions

View File

@ -468,7 +468,9 @@ def cluster_with():
log('Host already clustered with %s.' % node)
cluster_rid = relation_id('cluster', local_unit())
is_clustered = relation_get(attribute='clustered', rid=cluster_rid)
is_clustered = relation_get(attribute='clustered',
rid=cluster_rid,
unit=local_unit())
log('am I clustered?: %s' % bool(is_clustered), level=DEBUG)
if not is_clustered:

View File

@ -613,12 +613,13 @@ def update_nrpe_checks():
rsync(os.path.join(charm_dir(), 'scripts',
'collect_rabbitmq_stats.sh'), script)
write_file(STATS_CRONFILE, cronjob)
elif os.path.isfile(STATS_CRONFILE):
os.remove(STATS_CRONFILE)
if config('management_plugin'):
rsync(os.path.join(charm_dir(), 'scripts',
'check_rabbitmq_cluster.py'),
os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py'))
elif os.path.isfile(STATS_CRONFILE):
os.remove(STATS_CRONFILE)
# Find out if nrpe set nagios_hostname
hostname = nrpe.get_nagios_hostname()
@ -746,6 +747,10 @@ def config_changed():
# result in an upgrade if applicable only if we change the 'source'
# config option
if rabbit.archive_upgrade_available():
# Avoid package upgrade collisions
# Stopping and attempting to start rabbitmqs at the same time leads to
# failed restarts
rabbit.cluster_wait()
rabbit.install_or_upgrade_packages()
if config('ssl') == 'off':

View File

@ -577,8 +577,8 @@ class RmqBasicDeployment(OpenStackAmuletDeployment):
if ret:
amulet.raise_status(amulet.FAIL, msg=ret)
u.log.debug('Sleeping 70s for 1m cron job to run...')
time.sleep(70)
u.log.debug('Sleeping 2ms for 1m cron job to run...')
time.sleep(120)
# check_rabbitmq_queue monitor
u.log.debug('Checking nrpe check_rabbitmq_queue on units...')