apply runtime manifest deadlock waiting for management ip

The fix for "partition deleted immediately after creation"
added a mutex between config_apply_runtime_manifest()
and agent_audit(). However, this introduced a deadlock:
1. config_apply_runtime_manifest() loops (for up to 300s)
   waiting for self._mgmt_ip to be set
2. agent_audit() is what sets self._mgmt_ip, but it cannot run
   while config_apply_runtime_manifest() is running

Move the retry logic for self._mgmt_ip outside of
config_apply_runtime_manifest() so that agent_audit()
can run.

Change-Id: I3b1e2ebdaa684fa16e21662fb703dffffa70abe3
Closes-Bug: #1790159
This commit is contained in:
Daniel Badea 2018-09-12 14:36:22 +00:00
parent b25961082f
commit d2dcb9882c
2 changed files with 39 additions and 34 deletions

View File

@ -36,6 +36,7 @@ Commands (from conductors) are received via RPC calls.
import errno
import fcntl
import os
import retrying
import shutil
import subprocess
import sys
@ -1354,6 +1355,11 @@ class AgentManager(service.PeriodicService):
self._update_config_applied(iconfig_uuid)
self._report_config_applied(context)
def _retry_on_missing_mgmt_ip(ex):
    """Tell the retrying decorator whether a failed attempt may be retried.

    Used as the ``retry_on_exception`` callback of ``@retrying.retry``, which
    invokes it as a plain function with the raised exception as its only
    argument -- hence no ``self`` parameter.

    :param ex: the exception raised by the decorated call.
    :returns: True only for exception.LocalManagementIpNotFound.
    """
    # The parameter must not be named ``exception``: that would shadow the
    # ``exception`` module and turn the lookup below into an attribute
    # access on the raised exception instance.
    return isinstance(ex, exception.LocalManagementIpNotFound)
@retrying.retry(wait_fixed=15 * 1000, stop_max_delay=300 * 1000,
retry_on_exception=_retry_on_missing_mgmt_ip)
@utils.synchronized(LOCK_AGENT_ACTION, external=False)
def config_apply_runtime_manifest(self, context, config_uuid, config_dict):
"""Asynchronously, have the agent apply the runtime manifest with the
@ -1374,44 +1380,37 @@ class AgentManager(service.PeriodicService):
:returns: none ... uses asynchronous cast().
"""
try:
# runtime manifests can not be applied without the initial
# configuration applied
force = config_dict.get('force', False)
if (not force and
not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG)):
# runtime manifests can not be applied without the initial
# configuration applied
force = config_dict.get('force', False)
if (not force and
not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG)):
return
personalities = config_dict.get('personalities')
host_uuids = config_dict.get('host_uuids')
if host_uuids:
# ignore requests that are not intended for this host
if self._ihost_uuid not in host_uuids:
return
personalities = config_dict.get('personalities')
host_uuids = config_dict.get('host_uuids')
if host_uuids:
# ignore requests that are not intended for this host
if self._ihost_uuid not in host_uuids:
return
else:
# ignore requests that are not intended for host personality
for subfunction in self.subfunctions_list_get():
if subfunction in personalities:
break
else:
# ignore requests that are not intended for host personality
for subfunction in self.subfunctions_list_get():
if subfunction in personalities:
break
else:
return
LOG.info("config_apply_runtime_manifest: %s %s %s" % (
config_uuid, config_dict, self._ihost_personality))
time_slept = 0
while not self._mgmt_ip and time_slept < 300:
time.sleep(15)
time_slept += 15
if not self._mgmt_ip:
LOG.warn("config_apply_runtime_manifest: "
" timed out waiting for local management ip"
" %s %s %s" %
(config_uuid, config_dict, self._ihost_personality))
return
if not self._mgmt_ip:
raise exception.LocalManagementIpNotFound(
config_uuid=config_uuid, config_dict=config_dict,
host_personality=self._ihost_personality)
LOG.info("config_apply_runtime_manifest: %s %s %s" % (
config_uuid, config_dict, self._ihost_personality))
try:
if not os.path.exists(tsc.PUPPET_PATH):
# we must be controller-standby or storage, mount /var/run/platform
LOG.info("controller-standby or storage, mount /var/run/platform")

View File

@ -1265,3 +1265,9 @@ class IncompleteCephMonNetworkConfig(CephFailure):
# Error for a helm overrides namespace that is not valid for a chart; the
# message template expects ``namespace`` and ``chart`` values.
class InvalidHelmNamespace(Invalid):
message = _("Invalid helm overrides namespace (%(namespace)s) for chart %(chart)s.")
class LocalManagementIpNotFound(NotFound):
    """Error for a missing local management IP.

    The placeholders in the message template match the keyword arguments
    supplied where this exception is raised: ``config_uuid``,
    ``config_dict`` and ``host_personality``.
    """
    # Every %(name)s placeholder needs its trailing conversion type; without
    # the "s" the template is not a valid %-format string and interpolating
    # it raises ValueError.
    message = _("Local management IP not found: "
                "config_uuid=%(config_uuid)s, config_dict=%(config_dict)s, "
                "host_personality=%(host_personality)s")