Fix peer readiness detection

Sharing the admin password with peers over the 'cluster' relation
was needed so that, if the leader died, the next leader would still
have that information. This was implemented years ago, before the
leader DB existed.

That sharing caused the race condition described in the referenced
bug and can now be safely removed: the leader DB already survives a
leader change.
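
For context, leader storage already gives every unit durable access
to values written by the leader, so nothing needs to be mirrored
onto the 'cluster' relation any more. A minimal, generic sketch
using charmhelpers (the 'shared-secret' key is purely illustrative):

from charmhelpers.core.hookenv import is_leader, leader_get, leader_set
from charmhelpers.core.host import pwgen

if is_leader() and not leader_get('shared-secret'):
    # Only the leader may write; juju replicates the value to all
    # units and it survives the death of the unit that set it.
    leader_set({'shared-secret': pwgen(32)})

# Any unit, including a future leader, can read it back at any time.
secret = leader_get('shared-secret')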

Validated by deploying several keystone and glance units, removing
the keystone leader, then adding a glance unit and checking that
this new unit gets its service credentials.
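
The same validation can be scripted; a rough sketch driving the juju
CLI from Python, with application names and unit numbers as
assumptions and the database/messaging relations a real deployment
needs left out:

import subprocess

def juju(*args):
    # Thin wrapper around the juju CLI; assumes a bootstrapped model.
    subprocess.check_call(('juju',) + args)

juju('deploy', '-n', '3', 'keystone')
juju('deploy', '-n', '2', 'glance')
juju('add-relation', 'keystone', 'glance')  # identity-service
# Once the model has settled, remove the keystone leader...
juju('remove-unit', 'keystone/0')  # assuming keystone/0 is the leader
# ...then add a glance unit and check that it still receives its
# service credentials from the newly elected keystone leader.
juju('add-unit', 'glance')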

Also added useful debug traces, made the linter happy and increased
the fernet token expiration in the test bundles to avoid spurious
test failures.

Closes-Bug: #1818113
Change-Id: I004903e50f51e190467d71691982de26518d7149
Aurelien Lourot 2020-05-20 10:28:59 +02:00
parent cd722cb6fe
commit 8b46dfd637
14 changed files with 66 additions and 38 deletions

View File

@@ -225,7 +225,7 @@ SWIFT_CODENAMES = OrderedDict([
('train',
['2.22.0', '2.23.0']),
('ussuri',
['2.24.0']),
['2.24.0', '2.25.0']),
])
# >= Liberty version->codename mapping
@@ -2227,10 +2227,13 @@ def inform_peers_unit_state(state, relation_name='cluster'):
if state not in UNIT_STATES:
raise ValueError(
"Setting invalid state {} for unit".format(state))
this_unit = local_unit()
for r_id in relation_ids(relation_name):
juju_log('Telling peer behind relation {} that {} is {}'.format(
r_id, this_unit, state), 'DEBUG')
relation_set(relation_id=r_id,
relation_settings={
get_peer_key(local_unit()): state})
get_peer_key(this_unit): state})
def get_peers_unit_state(relation_name='cluster'):
@@ -2262,8 +2265,10 @@ def are_peers_ready(relation_name='cluster'):
:returns: Whether all units are ready.
:rtype: bool
"""
unit_states = get_peers_unit_state(relation_name)
return all(v == UNIT_READY for v in unit_states.values())
unit_states = get_peers_unit_state(relation_name).values()
juju_log('{} peers are in the following states: {}'.format(
relation_name, unit_states), 'DEBUG')
return all(state == UNIT_READY for state in unit_states)
def inform_peers_if_ready(check_unit_ready_func, relation_name='cluster'):
@@ -2346,7 +2351,9 @@ def get_api_application_status():
app_state, msg = get_api_unit_status()
if app_state == WORKLOAD_STATES.ACTIVE:
if are_peers_ready():
return WORKLOAD_STATES.ACTIVE, 'Application Ready'
msg = 'Application Ready'
else:
return WORKLOAD_STATES.WAITING, 'Some units are not ready'
app_state = WORKLOAD_STATES.WAITING
msg = 'Some units are not ready'
juju_log(msg, 'DEBUG')
return app_state, msg
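
A hedged usage sketch of the peer-state helpers touched above, as a
charm hook might call them; check_unit_ready() is a hypothetical
probe, and its (ready, message) return shape is assumed from the
surrounding helpers:

from charmhelpers.contrib.openstack.utils import (
    get_api_application_status,
    inform_peers_if_ready,
)

def check_unit_ready():
    # Hypothetical probe: report whether this unit's API is usable.
    return True, 'unit is ready'

# Publish this unit's state on the 'cluster' relation...
inform_peers_if_ready(check_unit_ready)
# ...and aggregate every peer's state into the workload status.
app_state, msg = get_api_application_status()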

View File

@@ -92,6 +92,7 @@ DEFAULT_PGS_PER_OSD_TARGET = 100
DEFAULT_POOL_WEIGHT = 10.0
LEGACY_PG_COUNT = 200
DEFAULT_MINIMUM_PGS = 2
AUTOSCALER_DEFAULT_PGS = 32
class OsdPostUpgradeError(Exception):
@@ -399,16 +400,28 @@ class ReplicatedPool(Pool):
def create(self):
if not pool_exists(self.service, self.name):
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
# Create it
cmd = ['ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(self.pg_num)]
if nautilus_or_later:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
'--pg-num-min={}'.format(
min(AUTOSCALER_DEFAULT_PGS, self.pg_num)
),
self.name, str(self.pg_num)
]
else:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(self.pg_num)
]
try:
check_call(cmd)
# Set the pool replica size
update_pool(client=self.service,
pool=self.name,
settings={'size': str(self.replicas)})
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
if nautilus_or_later:
# Ensure we set the expected pool ratio
update_pool(client=self.service,
@@ -419,13 +432,13 @@ class ReplicatedPool(Pool):
pool=self.name,
name=self.app_name)
except CalledProcessError:
log('Could not set app name for pool {}'.format(self.name, level=WARNING))
log('Could not set app name for pool {}'.format(self.name), level=WARNING)
if 'pg_autoscaler' in enabled_manager_modules():
try:
enable_pg_autoscale(self.service, self.name)
except CalledProcessError as e:
log('Could not configure auto scaling for pool {}: {}'.format(
self.name, e, level=WARNING))
self.name, e), level=WARNING)
except CalledProcessError:
raise
@@ -466,10 +479,24 @@ class ErasurePool(Pool):
k = int(erasure_profile['k'])
m = int(erasure_profile['m'])
pgs = self.get_pgs(k + m, self.percent_data)
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
# Create it
cmd = ['ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(pgs), str(pgs),
'erasure', self.erasure_code_profile]
if nautilus_or_later:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
'--pg-num-min={}'.format(
min(AUTOSCALER_DEFAULT_PGS, pgs)
),
self.name, str(pgs), str(pgs),
'erasure', self.erasure_code_profile
]
else:
cmd = [
'ceph', '--id', self.service, 'osd', 'pool', 'create',
self.name, str(pgs), str(pgs),
'erasure', self.erasure_code_profile
]
try:
check_call(cmd)
try:
@@ -477,8 +504,7 @@ class ErasurePool(Pool):
pool=self.name,
name=self.app_name)
except CalledProcessError:
log('Could not set app name for pool {}'.format(self.name, level=WARNING))
nautilus_or_later = cmp_pkgrevno('ceph-common', '14.2.0') >= 0
log('Could not set app name for pool {}'.format(self.name), level=WARNING)
if nautilus_or_later:
# Ensure we set the expected pool ratio
update_pool(client=self.service,
@@ -489,7 +515,7 @@ class ErasurePool(Pool):
enable_pg_autoscale(self.service, self.name)
except CalledProcessError as e:
log('Could not configure auto scaling for pool {}: {}'.format(
self.name, e, level=WARNING))
self.name, e), level=WARNING)
except CalledProcessError:
raise

View File

@@ -69,7 +69,7 @@ def _snap_exec(commands):
.format(SNAP_NO_LOCK_RETRY_COUNT))
return_code = e.returncode
log('Snap failed to acquire lock, trying again in {} seconds.'
.format(SNAP_NO_LOCK_RETRY_DELAY, level='WARN'))
.format(SNAP_NO_LOCK_RETRY_DELAY), level='WARN')
sleep(SNAP_NO_LOCK_RETRY_DELAY)
return return_code

View File

@@ -109,7 +109,6 @@ from keystone_utils import (
setup_ipv6,
send_notifications,
is_db_initialised,
filter_null,
is_service_present,
delete_service_entry,
assess_status,
@@ -147,7 +146,6 @@ from charmhelpers.contrib.openstack.ha.utils import (
from charmhelpers.payload.execd import execd_preinstall
from charmhelpers.contrib.peerstorage import (
peer_retrieve_by_prefix,
peer_echo,
)
from charmhelpers.contrib.openstack.ip import (
@@ -476,15 +474,6 @@ def identity_changed(relation_id=None, remote_unit=None):
endpoints_checksum(endpoints[ep])
)
else:
# Each unit needs to set the db information otherwise if the unit
# with the info dies the settings die with it Bug# 1355848
for rel_id in relation_ids('identity-service'):
peerdb_settings = peer_retrieve_by_prefix(rel_id)
# Ensure the null'd settings are unset in the relation.
peerdb_settings = filter_null(peerdb_settings)
if 'service_password' in peerdb_settings:
relation_set(relation_id=rel_id, **peerdb_settings)
log('Deferring identity_changed() to service leader.')
if notifications_endpoints or notifications_checksums:

View File

@@ -1520,6 +1520,12 @@ def ensure_initial_admin(config):
'ldap' and config('ldap-readonly')):
admin_username = config('admin-user')
# NOTE(lourot): get_admin_passwd() will generate a new password if
# the juju config or the leader DB doesn't already contain one. The
# set_admin_passwd() callback will then store that password in the
# leader DB. So if the leader dies, the new leader will still have
# access to the password.
if get_api_version() > 2:
passwd = create_user_credentials(admin_username,
get_admin_passwd,
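
To make the callback flow described in the NOTE above concrete, here
is a hedged sketch (not keystone_utils' actual implementation; the
config key, leader-DB key and pwgen() fallback are assumptions):

from charmhelpers.core.hookenv import config, leader_get, leader_set
from charmhelpers.core.host import pwgen

def get_admin_passwd(user=None):
    # Prefer the operator-provided config, then the leader DB, and
    # only generate a fresh password if neither has one yet.
    return (config('admin-password') or
            leader_get('admin_passwd') or
            pwgen(32))

def set_admin_passwd(passwd, user=None):
    # Called back once the credentials have been applied, so that the
    # next leader reuses the same password instead of creating a new
    # one.
    leader_set({'admin_passwd': passwd})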

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-rocky
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-stein
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -27,7 +27,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-train
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -26,7 +26,7 @@ applications:
num_units: 3
options:
openstack-origin: cloud:bionic-ussuri
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -40,7 +40,7 @@ applications:
options:
openstack-origin: *openstack-origin
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '3'
- '4'

View File

@@ -28,7 +28,7 @@ applications:
options:
openstack-origin: cloud:xenial-ocata
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -28,7 +28,7 @@ applications:
options:
openstack-origin: cloud:xenial-pike
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'

View File

@@ -28,7 +28,7 @@ applications:
options:
openstack-origin: cloud:xenial-queens
token-provider: 'fernet'
token-expiration: 60
token-expiration: 300
to:
- '1'
- '2'