From 8b46dfd637908adeb5a817f85fc97fad249be8a0 Mon Sep 17 00:00:00 2001
From: Aurelien Lourot
Date: Wed, 20 May 2020 10:28:59 +0200
Subject: [PATCH] Fix peer readiness detection

Sharing the admin password with peers over the 'cluster' relation was
needed so that if the leader died, the next leader would still have
that information. This was implemented years ago, before the leader DB
existed. It led to the race condition described in the bug referenced
below and can now be safely removed.

Validated by deploying several keystone and glance units, then removing
the keystone leader, then adding a glance unit and checking that this
new unit gets its service credentials.

Also added useful debug traces, made the linter happy and increased the
fernet token expiration to avoid spurious test failures.

Closes-Bug: #1818113
Change-Id: I004903e50f51e190467d71691982de26518d7149
---
 charmhelpers/contrib/openstack/utils.py    | 19 ++++++---
 charmhelpers/contrib/storage/linux/ceph.py | 48 +++++++++++++++++-----
 charmhelpers/fetch/snap.py                 |  2 +-
 hooks/keystone_hooks.py                    | 11 -----
 hooks/keystone_utils.py                    |  6 +++
 tests/bundles/bionic-queens.yaml           |  2 +-
 tests/bundles/bionic-rocky.yaml            |  2 +-
 tests/bundles/bionic-stein.yaml            |  2 +-
 tests/bundles/bionic-train.yaml            |  2 +-
 tests/bundles/bionic-ussuri.yaml           |  2 +-
 tests/bundles/focal-ussuri.yaml            |  2 +-
 tests/bundles/xenial-ocata.yaml            |  2 +-
 tests/bundles/xenial-pike.yaml             |  2 +-
 tests/bundles/xenial-queens.yaml           |  2 +-
 14 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/charmhelpers/contrib/openstack/utils.py b/charmhelpers/contrib/openstack/utils.py
index e59e0d1e..dbe7595d 100644
--- a/charmhelpers/contrib/openstack/utils.py
+++ b/charmhelpers/contrib/openstack/utils.py
@@ -225,7 +225,7 @@ SWIFT_CODENAMES = OrderedDict([
     ('train',
         ['2.22.0', '2.23.0']),
     ('ussuri',
-        ['2.24.0']),
+        ['2.24.0', '2.25.0']),
 ])
@@ -2227,10 +2227,13 @@ def inform_peers_unit_state(state, relation_name='cluster'):
     if state not in UNIT_STATES:
         raise ValueError(
             "Setting invalid state {} for unit".format(state))
+    this_unit = local_unit()
     for r_id in relation_ids(relation_name):
+        juju_log('Telling peer behind relation {} that {} is {}'.format(
+            r_id, this_unit, state), 'DEBUG')
         relation_set(relation_id=r_id,
                      relation_settings={
-                         get_peer_key(local_unit()): state})
+                         get_peer_key(this_unit): state})
 
 
 def get_peers_unit_state(relation_name='cluster'):
@@ -2262,8 +2265,10 @@ def are_peers_ready(relation_name='cluster'):
     :returns: Whether all units are ready.
     :rtype: bool
     """
-    unit_states = get_peers_unit_state(relation_name)
-    return all(v == UNIT_READY for v in unit_states.values())
+    unit_states = get_peers_unit_state(relation_name).values()
+    juju_log('{} peers are in the following states: {}'.format(
+        relation_name, unit_states), 'DEBUG')
+    return all(state == UNIT_READY for state in unit_states)
 
 
 def inform_peers_if_ready(check_unit_ready_func, relation_name='cluster'):
@@ -2346,7 +2351,9 @@ def get_api_application_status():
     app_state, msg = get_api_unit_status()
     if app_state == WORKLOAD_STATES.ACTIVE:
         if are_peers_ready():
-            return WORKLOAD_STATES.ACTIVE, 'Application Ready'
+            msg = 'Application Ready'
         else:
-            return WORKLOAD_STATES.WAITING, 'Some units are not ready'
+            app_state = WORKLOAD_STATES.WAITING
+            msg = 'Some units are not ready'
+    juju_log(msg, 'DEBUG')
     return app_state, msg
diff --git a/charmhelpers/contrib/storage/linux/ceph.py b/charmhelpers/contrib/storage/linux/ceph.py
index eb31b782..814d5c72 100644
--- a/charmhelpers/contrib/storage/linux/ceph.py
+++ b/charmhelpers/contrib/storage/linux/ceph.py
@@ -92,6 +92,7 @@ DEFAULT_PGS_PER_OSD_TARGET = 100
 DEFAULT_POOL_WEIGHT = 10.0
 LEGACY_PG_COUNT = 200
 DEFAULT_MINIMUM_PGS = 2
+AUTOSCALER_DEFAULT_PGS = 32
 
 
 class OsdPostUpgradeError(Exception):
@@ -399,16 +400,28 @@ class ReplicatedPool(Pool):
 
     def create(self):
         if not pool_exists(self.service, self.name):
+            nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
             # Create it
-            cmd = ['ceph', '--id', self.service, 'osd', 'pool', 'create',
-                   self.name, str(self.pg_num)]
+            if nautilus_or_later:
+                cmd = [
+                    'ceph', '--id', self.service, 'osd', 'pool', 'create',
+                    '--pg-num-min={}'.format(
+                        min(AUTOSCALER_DEFAULT_PGS, self.pg_num)
+                    ),
+                    self.name, str(self.pg_num)
+                ]
+            else:
+                cmd = [
+                    'ceph', '--id', self.service, 'osd', 'pool', 'create',
+                    self.name, str(self.pg_num)
+                ]
+
             try:
                 check_call(cmd)
                 # Set the pool replica size
                 update_pool(client=self.service,
                             pool=self.name,
                             settings={'size': str(self.replicas)})
-                nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
                 if nautilus_or_later:
                     # Ensure we set the expected pool ratio
                     update_pool(client=self.service,
@@ -419,13 +432,13 @@ class ReplicatedPool(Pool):
                                           pool=self.name,
                                           name=self.app_name)
                 except CalledProcessError:
-                    log('Could not set app name for pool {}'.format(self.name, level=WARNING))
+                    log('Could not set app name for pool {}'.format(self.name), level=WARNING)
                 if 'pg_autoscaler' in enabled_manager_modules():
                     try:
                         enable_pg_autoscale(self.service, self.name)
                     except CalledProcessError as e:
                         log('Could not configure auto scaling for pool {}: {}'.format(
-                            self.name, e, level=WARNING))
+                            self.name, e), level=WARNING)
             except CalledProcessError:
                 raise
 
@@ -466,10 +479,24 @@ class ErasurePool(Pool):
             k = int(erasure_profile['k'])
             m = int(erasure_profile['m'])
             pgs = self.get_pgs(k + m, self.percent_data)
+            nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
             # Create it
-            cmd = ['ceph', '--id', self.service, 'osd', 'pool', 'create',
-                   self.name, str(pgs), str(pgs),
-                   'erasure', self.erasure_code_profile]
+            if nautilus_or_later:
+                cmd = [
+                    'ceph', '--id', self.service, 'osd', 'pool', 'create',
+                    '--pg-num-min={}'.format(
+                        min(AUTOSCALER_DEFAULT_PGS, pgs)
+                    ),
+                    self.name, str(pgs), str(pgs),
+                    'erasure', self.erasure_code_profile
+                ]
+            else:
+                cmd = [
+                    'ceph', '--id', self.service, 'osd', 'pool', 'create',
+                    self.name, str(pgs), str(pgs),
+                    'erasure', self.erasure_code_profile
+                ]
+
             try:
                 check_call(cmd)
                 try:
@@ -477,8 +504,7 @@
                                           pool=self.name,
                                           name=self.app_name)
                 except CalledProcessError:
-                    log('Could not set app name for pool {}'.format(self.name, level=WARNING))
-                nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
+                    log('Could not set app name for pool {}'.format(self.name), level=WARNING)
                 if nautilus_or_later:
                     # Ensure we set the expected pool ratio
                     update_pool(client=self.service,
@@ -489,7 +515,7 @@ class ErasurePool(Pool):
                         enable_pg_autoscale(self.service, self.name)
                     except CalledProcessError as e:
                         log('Could not configure auto scaling for pool {}: {}'.format(
-                            self.name, e, level=WARNING))
+                            self.name, e), level=WARNING)
             except CalledProcessError:
                 raise
 
diff --git a/charmhelpers/fetch/snap.py b/charmhelpers/fetch/snap.py
index 395836c7..fc70aa94 100644
--- a/charmhelpers/fetch/snap.py
+++ b/charmhelpers/fetch/snap.py
@@ -69,7 +69,7 @@ def _snap_exec(commands):
                     .format(SNAP_NO_LOCK_RETRY_COUNT))
             return_code = e.returncode
             log('Snap failed to acquire lock, trying again in {} seconds.'
-                .format(SNAP_NO_LOCK_RETRY_DELAY, level='WARN'))
+                .format(SNAP_NO_LOCK_RETRY_DELAY), level='WARN')
             sleep(SNAP_NO_LOCK_RETRY_DELAY)
 
     return return_code
diff --git a/hooks/keystone_hooks.py b/hooks/keystone_hooks.py
index 05100384..23747703 100755
--- a/hooks/keystone_hooks.py
+++ b/hooks/keystone_hooks.py
@@ -109,7 +109,6 @@ from keystone_utils import (
     setup_ipv6,
     send_notifications,
     is_db_initialised,
-    filter_null,
     is_service_present,
     delete_service_entry,
     assess_status,
@@ -147,7 +146,6 @@ from charmhelpers.contrib.openstack.ha.utils import (
 from charmhelpers.payload.execd import execd_preinstall
 
 from charmhelpers.contrib.peerstorage import (
-    peer_retrieve_by_prefix,
     peer_echo,
 )
 from charmhelpers.contrib.openstack.ip import (
@@ -476,15 +474,6 @@ def identity_changed(relation_id=None, remote_unit=None):
                     endpoints_checksum(endpoints[ep])
                 )
     else:
-        # Each unit needs to set the db information otherwise if the unit
-        # with the info dies the settings die with it Bug# 1355848
-        for rel_id in relation_ids('identity-service'):
-            peerdb_settings = peer_retrieve_by_prefix(rel_id)
-            # Ensure the null'd settings are unset in the relation.
-            peerdb_settings = filter_null(peerdb_settings)
-            if 'service_password' in peerdb_settings:
-                relation_set(relation_id=rel_id, **peerdb_settings)
-
         log('Deferring identity_changed() to service leader.')
 
     if notifications_endpoints or notifications_checksums:
diff --git a/hooks/keystone_utils.py b/hooks/keystone_utils.py
index e5b77c22..61492c6c 100644
--- a/hooks/keystone_utils.py
+++ b/hooks/keystone_utils.py
@@ -1520,6 +1520,12 @@ def ensure_initial_admin(config):
             'ldap' and config('ldap-readonly')):
 
         admin_username = config('admin-user')
+
+        # NOTE(lourot): get_admin_passwd() will generate a new password if
+        # the juju config or the leader DB doesn't already contain one. The
+        # set_admin_passwd() callback will then store that password in the
+        # leader DB. So if the leader dies, the new leader will still have
+        # access to the password.
         if get_api_version() > 2:
             passwd = create_user_credentials(admin_username,
                                              get_admin_passwd,
diff --git a/tests/bundles/bionic-queens.yaml b/tests/bundles/bionic-queens.yaml
index ea8c8a75..91807708 100644
--- a/tests/bundles/bionic-queens.yaml
+++ b/tests/bundles/bionic-queens.yaml
@@ -27,7 +27,7 @@ applications:
     num_units: 3
     options:
       token-provider: 'fernet'
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/bionic-rocky.yaml b/tests/bundles/bionic-rocky.yaml
index de9fb809..019872c5 100644
--- a/tests/bundles/bionic-rocky.yaml
+++ b/tests/bundles/bionic-rocky.yaml
@@ -27,7 +27,7 @@ applications:
     num_units: 3
     options:
       openstack-origin: cloud:bionic-rocky
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/bionic-stein.yaml b/tests/bundles/bionic-stein.yaml
index b67e8e83..d9f86dc1 100644
--- a/tests/bundles/bionic-stein.yaml
+++ b/tests/bundles/bionic-stein.yaml
@@ -27,7 +27,7 @@ applications:
     num_units: 3
     options:
       openstack-origin: cloud:bionic-stein
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/bionic-train.yaml b/tests/bundles/bionic-train.yaml
index 8f080334..d4cdb1e4 100644
--- a/tests/bundles/bionic-train.yaml
+++ b/tests/bundles/bionic-train.yaml
@@ -27,7 +27,7 @@ applications:
     num_units: 3
     options:
       openstack-origin: cloud:bionic-train
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/bionic-ussuri.yaml b/tests/bundles/bionic-ussuri.yaml
index 79dabedf..5cbf3df1 100644
--- a/tests/bundles/bionic-ussuri.yaml
+++ b/tests/bundles/bionic-ussuri.yaml
@@ -26,7 +26,7 @@ applications:
     num_units: 3
     options:
       openstack-origin: cloud:bionic-ussuri
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/focal-ussuri.yaml b/tests/bundles/focal-ussuri.yaml
index 81c85536..7c94373b 100644
--- a/tests/bundles/focal-ussuri.yaml
+++ b/tests/bundles/focal-ussuri.yaml
@@ -40,7 +40,7 @@ applications:
     options:
       openstack-origin: *openstack-origin
       token-provider: 'fernet'
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '3'
       - '4'
diff --git a/tests/bundles/xenial-ocata.yaml b/tests/bundles/xenial-ocata.yaml
index 09d3ae4e..64f9d19a 100644
--- a/tests/bundles/xenial-ocata.yaml
+++ b/tests/bundles/xenial-ocata.yaml
@@ -28,7 +28,7 @@ applications:
     options:
       openstack-origin: cloud:xenial-ocata
       token-provider: 'fernet'
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/xenial-pike.yaml b/tests/bundles/xenial-pike.yaml
index bbdaca38..a29adcf4 100644
--- a/tests/bundles/xenial-pike.yaml
+++ b/tests/bundles/xenial-pike.yaml
@@ -28,7 +28,7 @@ applications:
     options:
      openstack-origin: cloud:xenial-pike
       token-provider: 'fernet'
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
diff --git a/tests/bundles/xenial-queens.yaml b/tests/bundles/xenial-queens.yaml
index 802b5acf..1320cfdc 100644
--- a/tests/bundles/xenial-queens.yaml
+++ b/tests/bundles/xenial-queens.yaml
@@ -28,7 +28,7 @@ applications:
     options:
       openstack-origin: cloud:xenial-queens
       token-provider: 'fernet'
-      token-expiration: 60
+      token-expiration: 300
     to:
       - '1'
       - '2'
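
Note (not part of the patch): the peer-readiness helpers touched in
charmhelpers/contrib/openstack/utils.py are driven from charm hooks roughly
as sketched below. Only the charmhelpers imports are real; the hook and the
readiness-check function are illustrative placeholders, not code from this
repository.

    # Minimal sketch, assuming the charmhelpers version synced by this patch.
    from charmhelpers.contrib.openstack.utils import (
        are_peers_ready,
        inform_peers_if_ready,
    )


    def check_unit_ready():
        # Hypothetical readiness probe for this sketch; a real charm would
        # check its own services, migrations and relations here and return a
        # (ready, message) tuple for inform_peers_if_ready() to unpack.
        return True, 'Unit is ready'


    def cluster_changed():
        # Publish this unit's state on the 'cluster' relation so that
        # get_peers_unit_state()/are_peers_ready() on every unit can see it.
        inform_peers_if_ready(check_unit_ready)
        if are_peers_ready():
            # All peers reported UNIT_READY, so get_api_application_status()
            # will report the application as active rather than waiting.
            pass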
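
Note (not part of the patch): on Ceph Nautilus or later the pool-create
commands above pass --pg-num-min clamped to AUTOSCALER_DEFAULT_PGS, so the
PG autoscaler gets a reasonable floor without pinning large pools to their
initial pg_num. A standalone sketch of the clamping arithmetic:

    # Minimal sketch of the '--pg-num-min' clamp used in the diff above.
    AUTOSCALER_DEFAULT_PGS = 32

    for pg_num in (8, 32, 256):
        flag = '--pg-num-min={}'.format(min(AUTOSCALER_DEFAULT_PGS, pg_num))
        print(pg_num, flag)
    # -> 8 --pg-num-min=8
    # -> 32 --pg-num-min=32
    # -> 256 --pg-num-min=32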