Fix peer readiness detection

Sharing the admin password with peers over the 'cluster' relation
was needed so that, if the leader died, the next leader would still
have that information. This was implemented years ago, before the
leader DB existed.

That sharing caused the race condition described in the referenced
bug and can now be safely removed: the leader DB already survives a
leader change.
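
For context, leader storage already gives every unit durable access
to values written by the leader, so nothing needs to be mirrored
onto the 'cluster' relation any more. A minimal, generic sketch
using charmhelpers (the 'shared-secret' key is purely illustrative):

from charmhelpers.core.hookenv import is_leader, leader_get, leader_set
from charmhelpers.core.host import pwgen

if is_leader() and not leader_get('shared-secret'):
    # Only the leader may write; juju replicates the value to all
    # units and it survives the death of the unit that set it.
    leader_set({'shared-secret': pwgen(32)})

# Any unit, including a future leader, can read it back at any time.
secret = leader_get('shared-secret')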

Validated by deploying several keystone and glance units, removing
the keystone leader, then adding a glance unit and checking that
this new unit gets its service credentials.
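
The same validation can be scripted; a rough sketch driving the juju
CLI from Python, with application names and unit numbers as
assumptions and the database/messaging relations a real deployment
needs left out:

import subprocess

def juju(*args):
    # Thin wrapper around the juju CLI; assumes a bootstrapped model.
    subprocess.check_call(('juju',) + args)

juju('deploy', '-n', '3', 'keystone')
juju('deploy', '-n', '2', 'glance')
juju('add-relation', 'keystone', 'glance')  # identity-service
# Once the model has settled, remove the keystone leader...
juju('remove-unit', 'keystone/0')  # assuming keystone/0 is the leader
# ...then add a glance unit and check that it still receives its
# service credentials from the newly elected keystone leader.
juju('add-unit', 'glance')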

Also added useful debug traces, made the linter happy and increased
the fernet token expiration in the test bundles to avoid spurious
test failures.

Closes-Bug: #1818113
Change-Id: I004903e50f51e190467d71691982de26518d7149
Aurelien Lourot 2020-05-20 10:28:59 +02:00
parent cd722cb6fe
commit 8b46dfd637
14 changed files with 66 additions and 38 deletions

View File

@@ -225,7 +225,7 @@ SWIFT_CODENAMES = OrderedDict([
('train',
['2.22.0', '2.23.0']),
('ussuri',
['2.24.0']),
['2.24.0', '2.25.0']),
])
# >= Liberty version->codename mapping
@@ -2227,10 +2227,13 @@ def inform_peers_unit_state(state, relation_name='cluster'):
if state not in UNIT_STATES:
raise ValueError(
"Setting invalid state {} for unit".format(state))
this_unit = local_unit()
for r_id in relation_ids(relation_name):
juju_log('Telling peer behind relation {} that {} is {}'.format(
r_id, this_unit, state), 'DEBUG')
relation_set(relation_id=r_id,
relation_settings={
get_peer_key(local_unit()): state})
get_peer_key(this_unit): state})
def get_peers_unit_state(relation_name='cluster'):
@@ -2262,8 +2265,10 @@ def are_peers_ready(relation_name='cluster'):
:returns: Whether all units are ready.
:rtype: bool
"""
unit_states = get_peers_unit_state(relation_name)
return all(v == UNIT_READY for v in unit_states.values())
unit_states = get_peers_unit_state(relation_name).values()
juju_log('{} peers are in the following states: {}'.format(
relation_name, unit_states), 'DEBUG')
return all(state == UNIT_READY for state in unit_states)
def inform_peers_if_ready(check_unit_ready_func, relation_name='cluster'):
@@ -2346,7 +2351,9 @@ def get_api_application_status():
app_state, msg = get_api_unit_status()
if app_state == WORKLOAD_STATES.ACTIVE:
if are_peers_ready():
return WORKLOAD_STATES.ACTIVE, 'Application Ready'
msg = 'Application Ready'
else:
return WORKLOAD_STATES.WAITING, 'Some units are not ready'
app_state = WORKLOAD_STATES.WAITING
msg = 'Some units are not ready'
juju_log(msg, 'DEBUG')
return app_state, msg
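
A hedged usage sketch of the peer-state helpers touched above, as a
charm hook might call them; check_unit_ready() is a hypothetical
probe, and its (ready, message) return shape is assumed from the
surrounding helpers:

from charmhelpers.contrib.openstack.utils import (
    get_api_application_status,
    inform_peers_if_ready,
)

def check_unit_ready():
    # Hypothetical probe: report whether this unit's API is usable.
    return True, 'unit is ready'

# Publish this unit's state on the 'cluster' relation...
inform_peers_if_ready(check_unit_ready)
# ...and aggregate every peer's state into the workload status.
app_state, msg = get_api_application_status()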

View File

@@ -92,6 +92,7 @@ DEFAULT_PGS_PER_OSD_TARGET = 100
DEFAULT_POOL_WEIGHT = 10.0
LEGACY_PG_COUNT = 200
DEFAULT_MINIMUM_PGS = 2
AUTOSCALER_DEFAULT_PGS = 32
class OsdPostUpgradeError(Exception):
@@ -399,16 +400,28 @@ class ReplicatedPool(Pool):
def create(self):
if not pool_exists(self.service, self.name):
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
# Create it
cmd = ['ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(self.pg_num)]
if nautilus_or_later:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
'--pg-num-min={}'.format(
min(AUTOSCALER_DEFAULT_PGS, self.pg_num)
),
self.name, str(self.pg_num)
]
else:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(self.pg_num)
]
try:
check_call(cmd)
# Set the pool replica size
update_pool(client=self.service,
pool=self.name,
settings={'size': str(self.replicas)})
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
if nautilus_or_later:
# Ensure we set the expected pool ratio
update_pool(client=self.service,
@@ -419,13 +432,13 @@ class ReplicatedPool(Pool):
pool=self.name,
name=self.app_name)
except CalledProcessError:
log('Could not set app name for pool {}'.format(self.name, level=WARNING))
log('Could not set app name for pool {}'.format(self.name), level=WARNING)
if 'pg_autoscaler' in enabled_manager_modules():
try:
enable_pg_autoscale(self.service, self.name)
except CalledProcessError as e:
log('Could not configure auto scaling for pool {}: {}'.format(
self.name, e, level=WARNING))
self.name, e), level=WARNING)
except CalledProcessError:
raise
@@ -466,10 +479,24 @@ class ErasurePool(Pool):
k = int(erasure_profile['k'])
m = int(erasure_profile['m'])
pgs = self.get_pgs(k + m, self.percent_data)
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
# Create it
cmd = ['ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(pgs), str(pgs),
'erasure', self.erasure_code_profile]
if nautilus_or_later:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
'--pg-num-min={}'.format(
min(AUTOSCALER_DEFAULT_PGS, pgs)
),
self.name, str(pgs), str(pgs),
'erasure', self.erasure_code_profile
]
else:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(pgs), str(pgs),
'erasure', self.erasure_code_profile
]
try:
check_call(cmd)
try:
@@ -477,8 +504,7 @@ class ErasurePool(Pool):
pool=self.name,
name=self.app_name)
except CalledProcessError:
log('Could not set app name for pool {}'.format(self.name, level=WARNING))
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
log('Could not set app name for pool {}'.format(self.name), level=WARNING)
if nautilus_or_later:
# Ensure we set the expected pool ratio
update_pool(client=self.service,
@@ -489,7 +515,7 @@ class ErasurePool(Pool):
enable_pg_autoscale(self.service, self.name)
except CalledProcessError as e:
log('Could not configure auto scaling for pool {}: {}'.format(
self.name, e, level=WARNING))
self.name, e), level=WARNING)
except CalledProcessError:
raise

View File

@@ -69,7 +69,7 @@ def _snap_exec(commands):
.format(SNAP_NO_LOCK_RETRY_COUNT))
return_code = e.returncode
log('Snap failed to acquire lock, trying again in {} seconds.'
.format(SNAP_NO_LOCK_RETRY_DELAY, level='WARN'))
.format(SNAP_NO_LOCK_RETRY_DELAY), level='WARN')
sleep(SNAP_NO_LOCK_RETRY_DELAY)
return return_code

View File

@@ -109,7 +109,6 @@ from keystone_utils import (
setup_ipv6,
send_notifications,
is_db_initialised,
filter_null,
is_service_present,
delete_service_entry,
assess_status,
@@ -147,7 +146,6 @@ from charmhelpers.contrib.openstack.ha.utils import (
from charmhelpers.payload.execd import execd_preinstall
from charmhelpers.contrib.peerstorage import (
peer_retrieve_by_prefix,
peer_echo,
)
from charmhelpers.contrib.openstack.ip import (
@@ -476,15 +474,6 @@ def identity_changed(relation_id=None, remote_unit=None):
endpoints_checksum(endpoints[ep])
)
else:
# Each unit needs to set the db information otherwise if the unit
# with the info dies the settings die with it Bug# 1355848
for rel_id in relation_ids('identity-service'):
peerdb_settings = peer_retrieve_by_prefix(rel_id)
# Ensure the null'd settings are unset in the relation.
peerdb_settings = filter_null(peerdb_settings)
if 'service_password' in peerdb_settings:
relation_set(relation_id=rel_id, **peerdb_settings)
log('Deferring identity_changed() to service leader.')
if notifications_endpoints or notifications_checksums:

View File

@@ -1520,6 +1520,12 @@ def ensure_initial_admin(config):
'ldap' and config('ldap-readonly')):
admin_username = config('admin-user')
# NOTE(lourot): get_admin_passwd() will generate a new password if
# the juju config or the leader DB doesn't already contain one. The
# set_admin_passwd() callback will then store that password in the
# leader DB. So if the leader dies, the new leader will still have
# access to the password.
if get_api_version() > 2:
passwd = create_user_credentials(admin_username,
get_admin_passwd,
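
To make the callback flow described in the NOTE above concrete, here
is a hedged sketch (not keystone_utils' actual implementation; the
config key, leader-DB key and pwgen() fallback are assumptions):

from charmhelpers.core.hookenv import config, leader_get, leader_set
from charmhelpers.core.host import pwgen

def get_admin_passwd(user=None):
    # Prefer the operator-provided config, then the leader DB, and
    # only generate a fresh password if neither has one yet.
    return (config('admin-password') or
            leader_get('admin_passwd') or
            pwgen(32))

def set_admin_passwd(passwd, user=None):
    # Called back once the credentials have been applied, so that the
    # next leader reuses the same password instead of creating a new
    # one.
    leader_set({'admin_passwd': passwd})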

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-rocky
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-stein
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-train
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -26,7 +26,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-ussuri
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -40,7 +40,7 @@ applications:
options:
openstack-origin: *openstack-origin
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '3'
- '4'

View File

@@ -28,7 +28,7 @@ applications:
options:
openstack-origin: cloud:xenial-ocata
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -28,7 +28,7 @@ applications:
options:
openstack-origin: cloud:xenial-pike
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -28,7 +28,7 @@ applications:
options:
openstack-origin: cloud:xenial-queens
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'