Patch Tiller deployment to ensure self-recovery
On node startup, there appears to be a race condition between when kubelet sees a pod and when kubelet sees a service. Due to this race, the environment variables that tiller requires can be missing, preventing tiller from functioning properly. See the comment at https://github.com/kubernetes/kubernetes/blob/v1.18.1/pkg/kubelet/kubelet_pods.go#L566

This change patches the tiller deployment to make sure the four classes of environment variables are present prior to starting tiller. If any class of variables is missing from the environment, the container exits. This recreates the pod, which populates the correct environment for tiller to function. Since the upgrade to v1.18.1, this has been seen in simplex and duplex controller configurations.

Review https://review.opendev.org/#/c/699307/ covers patching during initial provisioning via ansible. This change checks that tiller is patched every time the conductor starts, as part of the tiller upgrade logic. This covers scenarios where tiller is manually removed from the cluster and reinstalled via helm.

This change should be reverted once StarlingX moves to Helm v3.

Also removed dead code: get_k8s_secret()

Change-Id: Icd199ec1b1e10840094c0eae59d53838f32ffd6f
Closes-Bug: #1856078
Signed-off-by: Robert Church <robert.church@windriver.com>
parent 3c6cb9785a
commit 24a0284e3d
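For reviewers who want to confirm the behaviour on a running system, the sketch below shows one way to check whether the deployment already carries the wrapper command. It is illustration only: the tiller_is_patched() helper is hypothetical and not part of this change; it simply reads back the container command that _patch_tiller_deployment() (added in the diff below) installs.

    # Hypothetical verification helper (not part of this change): read back the
    # tiller container command and look for the env-variable guard that the
    # conductor patches in.
    import subprocess

    def tiller_is_patched(kubeconfig='/etc/kubernetes/admin.conf'):
        out = subprocess.check_output(
            ['kubectl', '--kubeconfig={}'.format(kubeconfig),
             '-n', 'kube-system', 'get', 'deployment', 'tiller-deploy',
             '-o', 'jsonpath={.spec.template.spec.containers[0].command}'])
        # The patched command greps for TILLER_DEPLOY, KUBE_DNS, KUBERNETES_PORT
        # and KUBERNETES_SERVICE before exec'ing /tiller.
        return b'TILLER_DEPLOY' in out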
@@ -14,6 +14,7 @@
 from __future__ import absolute_import
 from distutils.version import LooseVersion
 import json
+import os
 import re

 from kubernetes import config
@@ -27,6 +28,9 @@ from sysinv.common import exception

 LOG = logging.getLogger(__name__)

+# Kubernetes Files
+KUBERNETES_ADMIN_CONF = '/etc/kubernetes/admin.conf'
+
 # Possible states for each supported kubernetes version
 KUBE_STATE_AVAILABLE = 'available'
 KUBE_STATE_ACTIVE = 'active'
@@ -113,6 +117,13 @@ def get_kube_networking_upgrade_version(kube_upgrade):
     return kube_upgrade.to_version


+def is_k8s_configured():
+    """Check to see if the k8s admin config file exists."""
+    if os.path.isfile(KUBERNETES_ADMIN_CONF):
+        return True
+    return False
+
+
 class KubeOperator(object):

     def __init__(self):
@@ -121,7 +132,12 @@ class KubeOperator(object):
         self._kube_client_custom_objects = None

     def _load_kube_config(self):
-        config.load_kube_config('/etc/kubernetes/admin.conf')
+        if not is_k8s_configured():
+            raise exception.SysinvException(
+                "Kubernetes is not configured. API operations will not be "
+                "available.")
+
+        config.load_kube_config(KUBERNETES_ADMIN_CONF)

         # Workaround: Turn off SSL/TLS verification
         c = Configuration()
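The new guard changes the failure mode of _load_kube_config(): callers now get a SysinvException up front instead of a confusing kubernetes client error when admin.conf is absent. A minimal sketch of how calling code can react; the helper and its handling are illustrative, only the KubeOperator, SysinvException and is_k8s_configured names come from the tree:

    # Sketch only: react to the new guard around KubeOperator calls.
    from sysinv.common import exception, kubernetes

    def list_node_names():
        if not kubernetes.is_k8s_configured():
            return []  # node not yet provisioned, nothing to report
        try:
            kube = kubernetes.KubeOperator()
            return [node.metadata.name for node in kube.kube_get_nodes()]
        except exception.SysinvException:
            # admin.conf disappeared between the check and the call
            return []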
@@ -17,7 +17,6 @@ import pecan
 from oslo_log import log
 from sysinv.common import constants
 from sysinv.common import exception
-from sysinv.common import utils as cutils

 LOG = log.getLogger(__name__)

@@ -485,20 +484,3 @@ class K8RbdProvisioner(object):

         base_name = 'ceph-pool'
         return str(base_name + '-' + name)
-
-    @staticmethod
-    def get_k8s_secret(secret_name, namespace=None):
-        try:
-            cmd = ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
-                   'get', 'secrets', secret_name]
-            if namespace:
-                cmd.append('--namespace=%s' % namespace)
-            stdout, _ = cutils.execute(*cmd, run_as_root=False)
-        except exception.ProcessExecutionError as e:
-            if "not found" in e.stderr.lower():
-                return None
-            raise exception.SysinvException(
-                "Error getting secret: %s in namespace: %s, "
-                "Details: %s" % (secret_name, namespace, str(e)))
-
-        return stdout
@@ -2757,7 +2757,7 @@ class DockerHelper(object):
             # is a work around the permission issue in Armada container.
             kube_config = os.path.join(constants.APP_SYNCED_ARMADA_DATA_PATH,
                                        'admin.conf')
-            shutil.copy('/etc/kubernetes/admin.conf', kube_config)
+            shutil.copy(kubernetes.KUBERNETES_ADMIN_CONF, kube_config)
             os.chown(kube_config, 1000, grp.getgrnam("sys_protected").gr_gid)

             overrides_dir = common.HELM_OVERRIDES_PATH
@@ -2799,6 +2799,9 @@ class DockerHelper(object):
                     command=None)
                 LOG.info("Armada service started!")
                 return container
+            except IOError as ie:
+                if not kubernetes.is_k8s_configured():
+                    LOG.error("Unable to start Armada service: %s" % ie)
             except OSError as oe:
                 LOG.error("Unable to make kubernetes config accessible to "
                           "armada: %s" % oe)
@@ -68,6 +68,7 @@ from oslo_utils import excutils
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
 from platform_util.license import license
+from ruamel import yaml
 from sqlalchemy.orm import exc
 from six.moves import http_client as httplib
 from sysinv._i18n import _
@@ -5267,6 +5268,52 @@ class ConductorManager(service.PeriodicService):
                 return
             self.reapply_app(context, app_name)

+    def _patch_tiller_deployment(self):
+        """ Ensure tiller is patched with restart logic."""
+        LOG.info("Attempt to patch tiller deployment")
+        try:
+            # We have a race condition that may cause the tiller pod to not have
+            # its environment set up correctly. This will patch the tiller
+            # deployment to ensure that tiller can recover if that occurs. The
+            # deployment is patched during the initial ansible run. This will
+            # re-patch the deployment in the case when tiller has been removed
+            # and reinstalled in the cluster after the system has been
+            # installed. If tiller is already patched then the patch execution
+            # is successful causing no change to the deployment.
+            patch = {
+                'spec': {
+                    'template': {
+                        'spec': {
+                            'containers': [{
+                                'name': 'tiller',
+                                'command': [
+                                    '/bin/sh',
+                                    '-cex',
+                                    '#!/bin/sh\n'
+                                    'env | grep -q -e ^TILLER_DEPLOY || exit\n'
+                                    'env | grep -q -e ^KUBE_DNS || exit\n'
+                                    'env | grep -q -e ^KUBERNETES_PORT || exit\n'
+                                    'env | grep -q -e ^KUBERNETES_SERVICE || exit\n'
+                                    '/tiller\n'
+                                ]
+                            }]
+                        }
+                    }
+                }
+            }
+            cmd = ['kubectl',
+                   '--kubeconfig={}'.format(kubernetes.KUBERNETES_ADMIN_CONF),
+                   'patch', 'deployment', '-n', 'kube-system', 'tiller-deploy',
+                   '-p', yaml.dump(patch)]
+            stdout, stderr = cutils.execute(*cmd, run_as_root=False)
+
+        except exception.ProcessExecutionError as e:
+            raise exception.SysinvException(
+                _("Error patching the tiller deployment, "
+                  "Details: %s") % str(e))
+
+        LOG.info("Tiller deployment has been patched")
+
     def _upgrade_downgrade_kube_components(self):
         self._upgrade_downgrade_tiller()
         self._upgrade_downgrade_kube_networking()
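The strategic-merge patch above is handed to kubectl as a YAML string. For clarity, here is a standalone sketch of what yaml.dump(patch) produces from the same dict; ruamel is used because the conductor imports it, and only the structure matters since kubectl merges it into the existing tiller-deploy deployment:

    # Standalone illustration: render the same patch dict to the YAML string
    # that kubectl receives via -p.
    from ruamel import yaml

    patch = {'spec': {'template': {'spec': {'containers': [{
        'name': 'tiller',
        'command': ['/bin/sh', '-cex',
                    '#!/bin/sh\n'
                    'env | grep -q -e ^TILLER_DEPLOY || exit\n'
                    'env | grep -q -e ^KUBE_DNS || exit\n'
                    'env | grep -q -e ^KUBERNETES_PORT || exit\n'
                    'env | grep -q -e ^KUBERNETES_SERVICE || exit\n'
                    '/tiller\n']}]}}}}

    print(yaml.dump(patch, default_flow_style=False))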
@@ -5360,6 +5407,16 @@ class ConductorManager(service.PeriodicService):
             LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
             return False

+        # Patch tiller to allow restarts if the environment is incomplete
+        #
+        # NOTE: This patch along with this upgrade functionality can be removed
+        # once StarlingX moves to Helm v3
+        try:
+            self._patch_tiller_deployment()
+        except Exception as e:
+            LOG.error("{}. Failed to patch tiller deployment.".format(e))
+            return False
+
         return True

     @retry(retry_on_result=lambda x: x is False,
@@ -5438,7 +5495,8 @@ class ConductorManager(service.PeriodicService):
         :returns: list of namespaces
         """
         try:
-            cmd = ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
+            cmd = ['kubectl',
+                   '--kubeconfig={}'.format(kubernetes.KUBERNETES_ADMIN_CONF),
                    'get', 'namespaces', '-o',
                    'go-template=\'{{range .items}}{{.metadata.name}}\'{{end}}\'']
             stdout, stderr = cutils.execute(*cmd, run_as_root=False)
@@ -20,6 +20,7 @@ from stevedore import extension

 from oslo_log import log as logging
 from sysinv.common import exception
+from sysinv.common import kubernetes
 from sysinv.common import utils
 from sysinv.helm import common

@@ -451,7 +452,7 @@ class HelmOperator(object):
                 cmd.extend(['--set', value_set])

             env = os.environ.copy()
-            env['KUBECONFIG'] = '/etc/kubernetes/admin.conf'
+            env['KUBECONFIG'] = kubernetes.KUBERNETES_ADMIN_CONF

             # Make a temporary directory with a fake chart in it
             try:
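Replacing the hard-coded path with kubernetes.KUBERNETES_ADMIN_CONF keeps the helm subprocess pointed at the same admin kubeconfig. In isolation, the env-propagation pattern looks like this; the helm invocation shown is a placeholder, not the override command from this file:

    # Sketch of the pattern: copy the environment, point KUBECONFIG at the
    # admin kubeconfig, and hand it to the helm child process.
    import os
    import subprocess

    env = os.environ.copy()
    env['KUBECONFIG'] = '/etc/kubernetes/admin.conf'
    subprocess.check_call(['helm', 'version', '--client'], env=env)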
@@ -13,6 +13,7 @@ from eventlet.green import subprocess
 import ruamel.yaml as yaml
 from oslo_log import log as logging
 from sysinv.agent import rpcapi as agent_rpcapi
+from sysinv.common import kubernetes
 from sysinv.common import exception
 from sysinv.openstack.common import context
 import threading
@@ -43,7 +44,7 @@ def retrieve_helm_releases():
     :return: a dict of deployed helm releases
     """
     helm_list = subprocess.Popen(
-        ['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
+        ['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
          'list', '--output', 'yaml'],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    timer = threading.Timer(20, helm_list.kill)
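The threading.Timer/kill pairing used by retrieve_helm_releases() (and by delete_helm_release() below) is a simple subprocess timeout for Python 2, where Popen.communicate() takes no timeout argument. Reduced to its essentials, with illustrative names:

    # Minimal form of the 20-second kill timer around a helm call.
    import subprocess
    import threading

    proc = subprocess.Popen(['helm', 'list', '--output', 'yaml'],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    timer = threading.Timer(20, proc.kill)
    timer.start()
    try:
        out, err = proc.communicate()
    finally:
        timer.cancel()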
@@ -93,7 +94,7 @@ def delete_helm_release(release):
     :param release: the name of the helm release
     """
     helm_cmd = subprocess.Popen(
-        ['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
+        ['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
          'delete', release],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    timer = threading.Timer(20, helm_cmd.kill)
@@ -123,7 +124,7 @@ def delete_helm_release(release):
 def get_openstack_pending_install_charts():
     try:
         return subprocess.check_output(
-            ['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
+            ['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
             'list', '--namespace', 'openstack', '--pending'])
     except Exception as e:
         raise exception.HelmTillerFailure(
@@ -138,18 +139,21 @@ def helm_upgrade_tiller(image):
     # sed command until helm and tiller provide a fix for
     # https://github.com/helm/helm/issues/6374
     workaround_part1 = '--skip-refresh ' \
-                       '--service-account tiller ' \
-                       '--node-selectors "node-role.kubernetes.io/master"="" ' \
-                       '--override spec.template.spec.hostNetwork=true ' \
-                       '--override spec.selector.matchLabels.app=helm ' \
-                       '--override spec.selector.matchLabels.name=tiller ' \
-                       '--output yaml'
+        '--service-account tiller ' \
+        '--node-selectors "node-role.kubernetes.io/master"="" ' \
+        '--override spec.template.spec.hostNetwork=true ' \
+        '--override spec.selector.matchLabels.app=helm ' \
+        '--override spec.selector.matchLabels.name=tiller ' \
+        '--output yaml'
     workaround_part2 = \
         '| sed "s@apiVersion: extensions/v1beta1@apiVersion: apps/v1@" ' \
-        '| kubectl --kubeconfig /etc/kubernetes/admin.conf replace --force -f -'
+        '| kubectl --kubeconfig {} replace --force -f -'.format(
+            kubernetes.KUBERNETES_ADMIN_CONF)

-    cmd = '{} {} {} {}'.format(
-        'helm init --upgrade --kubeconfig /etc/kubernetes/admin.conf --tiller-image',
+    cmd = '{} {} {} {} {} {}'.format(
+        'helm init --upgrade --kubeconfig',
+        kubernetes.KUBERNETES_ADMIN_CONF,
+        '--tiller-image',
         image,
         workaround_part1,
         workaround_part2)
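Since the command is now assembled from six pieces, it can help to see the whole composed string. A sketch with placeholder values; the image tag is hypothetical and the workaround strings are abridged from the ones above:

    # Illustration of the final shell command string; values are placeholders.
    kubeconfig = '/etc/kubernetes/admin.conf'
    image = 'gcr.io/kubernetes-helm/tiller:v2.13.1'  # hypothetical tag
    workaround_part1 = '--skip-refresh --service-account tiller --output yaml'  # abridged
    workaround_part2 = (
        '| sed "s@apiVersion: extensions/v1beta1@apiVersion: apps/v1@" '
        '| kubectl --kubeconfig {} replace --force -f -'.format(kubeconfig))

    cmd = '{} {} {} {} {} {}'.format(
        'helm init --upgrade --kubeconfig', kubeconfig,
        '--tiller-image', image,
        workaround_part1, workaround_part2)
    print(cmd)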
@@ -624,6 +624,13 @@ class TestKubeOperator(base.TestCase):

         self.list_namespaced_pod_result = None

+        def mock_is_k8s_configured():
+            return True
+        self.mocked_is_k8s_configured = mock.patch(
+            'sysinv.common.kubernetes.is_k8s_configured',
+            mock_is_k8s_configured)
+        self.mocked_is_k8s_configured.start()
+
         def mock_list_namespaced_pod(obj, namespace, field_selector=""):
             pod_name = field_selector.split('metadata.name=', 1)[1]
             return self.list_namespaced_pod_result[pod_name]
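The test fixture replaces is_k8s_configured() with a plain function via mock.patch(...).start(). Out of context, the patcher lifecycle looks like this; the stop() call is assumed to live in the test's tearDown or an addCleanup, which is not shown in the hunk:

    # Sketch of the patcher lifecycle used in the test setup.
    import mock

    def fake_is_k8s_configured():
        return True

    patcher = mock.patch('sysinv.common.kubernetes.is_k8s_configured',
                         fake_is_k8s_configured)
    patcher.start()
    # ... code under test now sees is_k8s_configured() returning True ...
    patcher.stop()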