Patch Tiller deployment to ensure self-recovery

On node startup, there appears to be a race condition between when
kubelet sees a pod and when kubelet sees a service. Due to this race,
the environment variables required for tiller to function properly can
be missing from its container.

See the comment at
https://github.com/kubernetes/kubernetes/blob/v1.18.1/pkg/kubelet/kubelet_pods.go#L566
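
When the race is hit, the service-derived variables that kubelet
normally injects for tiller are missing from the container. As an
illustration (the pod name is a placeholder), this can be checked
with:

  kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system \
    exec <tiller-pod> -- env | \
    grep -E '^(TILLER_DEPLOY|KUBE_DNS|KUBERNETES_PORT|KUBERNETES_SERVICE)'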

This change patches the tiller deployment to make sure the four classes
of environment variables are present prior to starting tiller. If any
class of variables is missing from the environment, the container
exits. This recreates the pod and populates the correct environment for
tiller to function.
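
With the patch applied, the tiller container entrypoint is wrapped in a
small shell check before the tiller binary is started; this is roughly
equivalent to running (the exact command is in the conductor change
below):

  /bin/sh -cex '
  env | grep -q -e ^TILLER_DEPLOY || exit
  env | grep -q -e ^KUBE_DNS || exit
  env | grep -q -e ^KUBERNETES_PORT || exit
  env | grep -q -e ^KUBERNETES_SERVICE || exit
  /tiller'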

Since the upgrade to Kubernetes v1.18.1, this has been seen in simplex
and duplex controller configurations.

Review https://review.opendev.org/#/c/699307/ will cover patching
during initial provisioning via Ansible. This change checks that tiller
is patched every time the conductor starts as part of the tiller
upgrade logic. This covers scenarios where tiller is manually removed
from the cluster and reinstalled via helm.
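
One way to confirm that the deployment carries the patched command
(illustrative only; output formatting varies by kubectl version):

  kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system \
    get deployment tiller-deploy \
    -o jsonpath='{.spec.template.spec.containers[0].command}'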

This change should be reverted once StarlingX moves to Helm v3.

Also removed dead code: get_k8s_secret()

Change-Id: Icd199ec1b1e10840094c0eae59d53838f32ffd6f
Closes-Bug: #1856078
Signed-off-by: Robert Church <robert.church@windriver.com>
Robert Church 2020-04-19 06:22:50 -04:00
parent 3c6cb9785a
commit 24a0284e3d
7 changed files with 105 additions and 34 deletions

View File

@@ -14,6 +14,7 @@
from __future__ import absolute_import
from distutils.version import LooseVersion
import json
import os
import re
from kubernetes import config
@@ -27,6 +28,9 @@ from sysinv.common import exception
LOG = logging.getLogger(__name__)
# Kubernetes Files
KUBERNETES_ADMIN_CONF = '/etc/kubernetes/admin.conf'
# Possible states for each supported kubernetes version
KUBE_STATE_AVAILABLE = 'available'
KUBE_STATE_ACTIVE = 'active'
@@ -113,6 +117,13 @@ def get_kube_networking_upgrade_version(kube_upgrade):
return kube_upgrade.to_version
def is_k8s_configured():
"""Check to see if the k8s admin config file exists."""
if os.path.isfile(KUBERNETES_ADMIN_CONF):
return True
return False
class KubeOperator(object):
def __init__(self):
@@ -121,7 +132,12 @@ class KubeOperator(object):
self._kube_client_custom_objects = None
def _load_kube_config(self):
config.load_kube_config('/etc/kubernetes/admin.conf')
if not is_k8s_configured():
raise exception.SysinvException(
"Kubernetes is not configured. API operations will not be "
"available.")
config.load_kube_config(KUBERNETES_ADMIN_CONF)
# Workaround: Turn off SSL/TLS verification
c = Configuration()

View File

@@ -17,7 +17,6 @@ import pecan
from oslo_log import log
from sysinv.common import constants
from sysinv.common import exception
from sysinv.common import utils as cutils
LOG = log.getLogger(__name__)
@@ -485,20 +484,3 @@ class K8RbdProvisioner(object):
base_name = 'ceph-pool'
return str(base_name + '-' + name)
@staticmethod
def get_k8s_secret(secret_name, namespace=None):
try:
cmd = ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
'get', 'secrets', secret_name]
if namespace:
cmd.append('--namespace=%s' % namespace)
stdout, _ = cutils.execute(*cmd, run_as_root=False)
except exception.ProcessExecutionError as e:
if "not found" in e.stderr.lower():
return None
raise exception.SysinvException(
"Error getting secret: %s in namespace: %s, "
"Details: %s" % (secret_name, namespace, str(e)))
return stdout

View File

@@ -2757,7 +2757,7 @@ class DockerHelper(object):
# is a work around the permission issue in Armada container.
kube_config = os.path.join(constants.APP_SYNCED_ARMADA_DATA_PATH,
'admin.conf')
shutil.copy('/etc/kubernetes/admin.conf', kube_config)
shutil.copy(kubernetes.KUBERNETES_ADMIN_CONF, kube_config)
os.chown(kube_config, 1000, grp.getgrnam("sys_protected").gr_gid)
overrides_dir = common.HELM_OVERRIDES_PATH
@@ -2799,6 +2799,9 @@ class DockerHelper(object):
command=None)
LOG.info("Armada service started!")
return container
except IOError as ie:
if not kubernetes.is_k8s_configured():
LOG.error("Unable to start Armada service: %s" % ie)
except OSError as oe:
LOG.error("Unable to make kubernetes config accessible to "
"armada: %s" % oe)

View File

@@ -68,6 +68,7 @@ from oslo_utils import excutils
from oslo_utils import timeutils
from oslo_utils import uuidutils
from platform_util.license import license
from ruamel import yaml
from sqlalchemy.orm import exc
from six.moves import http_client as httplib
from sysinv._i18n import _
@@ -5267,6 +5268,52 @@ class ConductorManager(service.PeriodicService):
return
self.reapply_app(context, app_name)
def _patch_tiller_deployment(self):
""" Ensure tiller is patched with restart logic."""
LOG.info("Attempt to patch tiller deployment")
try:
# We have a race condition that may cause the tiller pod to not have
# its environment set up correctly. This will patch the tiller
# deployment to ensure that tiller can recover if that occurs. The
# deployment is patched during the initial ansible run. This will
# re-patch the deployment in the case when tiller has been removed
# and reinstalled in the cluster after the system has been
# installed. If tiller is already patched then the patch execution
# is successful causing no change to the deployment.
patch = {
'spec': {
'template': {
'spec': {
'containers': [{
'name': 'tiller',
'command': [
'/bin/sh',
'-cex',
'#!/bin/sh\n'
'env | grep -q -e ^TILLER_DEPLOY || exit\n'
'env | grep -q -e ^KUBE_DNS || exit\n'
'env | grep -q -e ^KUBERNETES_PORT || exit\n'
'env | grep -q -e ^KUBERNETES_SERVICE || exit\n'
'/tiller\n'
]
}]
}
}
}
}
cmd = ['kubectl',
'--kubeconfig={}'.format(kubernetes.KUBERNETES_ADMIN_CONF),
'patch', 'deployment', '-n', 'kube-system', 'tiller-deploy',
'-p', yaml.dump(patch)]
stdout, stderr = cutils.execute(*cmd, run_as_root=False)
except exception.ProcessExecutionError as e:
raise exception.SysinvException(
_("Error patching the tiller deployment, "
"Details: %s") % str(e))
LOG.info("Tiller deployment has been patched")
def _upgrade_downgrade_kube_components(self):
self._upgrade_downgrade_tiller()
self._upgrade_downgrade_kube_networking()
@@ -5360,6 +5407,16 @@
LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
return False
# Patch tiller to allow restarts if the environment is incomplete
#
# NOTE: This patch along with this upgrade functionality can be removed
# once StarlingX moves to Helm v3
try:
self._patch_tiller_deployment()
except Exception as e:
LOG.error("{}. Failed to patch tiller deployment.".format(e))
return False
return True
@retry(retry_on_result=lambda x: x is False,
@@ -5438,7 +5495,8 @@
:returns: list of namespaces
"""
try:
cmd = ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
cmd = ['kubectl',
'--kubeconfig={}'.format(kubernetes.KUBERNETES_ADMIN_CONF),
'get', 'namespaces', '-o',
'go-template=\'{{range .items}}{{.metadata.name}}\'{{end}}\'']
stdout, stderr = cutils.execute(*cmd, run_as_root=False)

View File

@@ -20,6 +20,7 @@ from stevedore import extension
from oslo_log import log as logging
from sysinv.common import exception
from sysinv.common import kubernetes
from sysinv.common import utils
from sysinv.helm import common
@@ -451,7 +452,7 @@ class HelmOperator(object):
cmd.extend(['--set', value_set])
env = os.environ.copy()
env['KUBECONFIG'] = '/etc/kubernetes/admin.conf'
env['KUBECONFIG'] = kubernetes.KUBERNETES_ADMIN_CONF
# Make a temporary directory with a fake chart in it
try:

View File

@@ -13,6 +13,7 @@ from eventlet.green import subprocess
import ruamel.yaml as yaml
from oslo_log import log as logging
from sysinv.agent import rpcapi as agent_rpcapi
from sysinv.common import kubernetes
from sysinv.common import exception
from sysinv.openstack.common import context
import threading
@@ -43,7 +44,7 @@ def retrieve_helm_releases():
:return: a dict of deployed helm releases
"""
helm_list = subprocess.Popen(
['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'list', '--output', 'yaml'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
timer = threading.Timer(20, helm_list.kill)
@@ -93,7 +94,7 @@ def delete_helm_release(release):
:param release: the name of the helm release
"""
helm_cmd = subprocess.Popen(
['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'delete', release],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
timer = threading.Timer(20, helm_cmd.kill)
@@ -123,7 +124,7 @@ def get_openstack_pending_install_charts():
def get_openstack_pending_install_charts():
try:
return subprocess.check_output(
['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'list', '--namespace', 'openstack', '--pending'])
except Exception as e:
raise exception.HelmTillerFailure(
@@ -138,18 +139,21 @@ def helm_upgrade_tiller(image):
# sed command until helm and tiller provide a fix for
# https://github.com/helm/helm/issues/6374
workaround_part1 = '--skip-refresh ' \
'--service-account tiller ' \
'--node-selectors "node-role.kubernetes.io/master"="" ' \
'--override spec.template.spec.hostNetwork=true ' \
'--override spec.selector.matchLabels.app=helm ' \
'--override spec.selector.matchLabels.name=tiller ' \
'--output yaml'
'--service-account tiller ' \
'--node-selectors "node-role.kubernetes.io/master"="" ' \
'--override spec.template.spec.hostNetwork=true ' \
'--override spec.selector.matchLabels.app=helm ' \
'--override spec.selector.matchLabels.name=tiller ' \
'--output yaml'
workaround_part2 = \
'| sed "s@apiVersion: extensions/v1beta1@apiVersion: apps/v1@" ' \
'| kubectl --kubeconfig /etc/kubernetes/admin.conf replace --force -f -'
'| kubectl --kubeconfig {} replace --force -f -'.format(
kubernetes.KUBERNETES_ADMIN_CONF)
cmd = '{} {} {} {}'.format(
'helm init --upgrade --kubeconfig /etc/kubernetes/admin.conf --tiller-image',
cmd = '{} {} {} {} {} {}'.format(
'helm init --upgrade --kubeconfig',
kubernetes.KUBERNETES_ADMIN_CONF,
'--tiller-image',
image,
workaround_part1,
workaround_part2)

View File

@@ -624,6 +624,13 @@ class TestKubeOperator(base.TestCase):
self.list_namespaced_pod_result = None
def mock_is_k8s_configured():
return True
self.mocked_is_k8s_configured = mock.patch(
'sysinv.common.kubernetes.is_k8s_configured',
mock_is_k8s_configured)
self.mocked_is_k8s_configured.start()
def mock_list_namespaced_pod(obj, namespace, field_selector=""):
pod_name = field_selector.split('metadata.name=', 1)[1]
return self.list_namespaced_pod_result[pod_name]