From 86edf1e5d86b97ec071578b1e910cd94b7a019e4 Mon Sep 17 00:00:00 2001 From: David Ames Date: Tue, 10 Jul 2018 10:48:21 -0700 Subject: [PATCH] Avoid package upgrade collisions During upgrade to a new cloud archive and a new rabbitmq-server package, the package update will stop the rabbitmq service. If two or more units do this near the same time the services will fail to restart. This change adds the cluster_wait function before a package upgrade to avoid collisions. Fix relation_get for cluster_with causing extraneous hooks to execute. Fix nagios stats collection logic. Change-Id: I8d9aa38d917583fa45b5570eca9a78c813303e2b Closes-Bug: #1778829 --- hooks/rabbit_utils.py | 4 +++- hooks/rabbitmq_server_relations.py | 9 +++++++-- tests/basic_deployment.py | 4 ++-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/hooks/rabbit_utils.py b/hooks/rabbit_utils.py index ac3142b8..7c090c19 100644 --- a/hooks/rabbit_utils.py +++ b/hooks/rabbit_utils.py @@ -468,7 +468,9 @@ def cluster_with(): log('Host already clustered with %s.' 
% node) cluster_rid = relation_id('cluster', local_unit()) - is_clustered = relation_get(attribute='clustered', rid=cluster_rid) + is_clustered = relation_get(attribute='clustered', + rid=cluster_rid, + unit=local_unit()) log('am I clustered?: %s' % bool(is_clustered), level=DEBUG) if not is_clustered: diff --git a/hooks/rabbitmq_server_relations.py b/hooks/rabbitmq_server_relations.py index ce8cf72f..4036ddb4 100755 --- a/hooks/rabbitmq_server_relations.py +++ b/hooks/rabbitmq_server_relations.py @@ -613,12 +613,13 @@ def update_nrpe_checks(): rsync(os.path.join(charm_dir(), 'scripts', 'collect_rabbitmq_stats.sh'), script) write_file(STATS_CRONFILE, cronjob) + elif os.path.isfile(STATS_CRONFILE): + os.remove(STATS_CRONFILE) + if config('management_plugin'): rsync(os.path.join(charm_dir(), 'scripts', 'check_rabbitmq_cluster.py'), os.path.join(NAGIOS_PLUGINS, 'check_rabbitmq_cluster.py')) - elif os.path.isfile(STATS_CRONFILE): - os.remove(STATS_CRONFILE) # Find out if nrpe set nagios_hostname hostname = nrpe.get_nagios_hostname() @@ -746,6 +747,10 @@ def config_changed(): # result in an upgrade if applicable only if we change the 'source' # config option if rabbit.archive_upgrade_available(): + # Avoid package upgrade collisions + # Stopping and attempting to start rabbitmqs at the same time leads to + # failed restarts + rabbit.cluster_wait() rabbit.install_or_upgrade_packages() if config('ssl') == 'off': diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index a672c233..a2d9a64d 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -577,8 +577,8 @@ class RmqBasicDeployment(OpenStackAmuletDeployment): if ret: amulet.raise_status(amulet.FAIL, msg=ret) - u.log.debug('Sleeping 70s for 1m cron job to run...') - time.sleep(70) + u.log.debug('Sleeping 2ms for 1m cron job to run...') + time.sleep(120) # check_rabbitmq_queue monitor u.log.debug('Checking nrpe check_rabbitmq_queue on units...')