Patch Tiller deployment to ensure self-recovery

On node startup, there appears to be a race condition between when
kubelet sees a pod and when kubelet sees a service. Due to this race,
the environment variables required for tiller to function properly can
be missing from its container.

See the comment at
https://github.com/kubernetes/kubernetes/blob/v1.18.1/pkg/kubelet/kubelet_pods.go#L566
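
When the race is hit, the service-derived variables that kubelet
normally injects for tiller are missing from the container. As an
illustration (the pod name is a placeholder), this can be checked
with:

  kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system \
    exec <tiller-pod> -- env | \
    grep -E '^(TILLER_DEPLOY|KUBE_DNS|KUBERNETES_PORT|KUBERNETES_SERVICE)'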

This change patches the tiller deployment to make sure the four classes
of environment variables are present prior to starting tiller. If any
class of variables is missing from the environment, the container
exits. This recreates the pod and populates the correct environment for
tiller to function.
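
With the patch applied, the tiller container entrypoint is wrapped in a
small shell check before the tiller binary is started; this is roughly
equivalent to running (the exact command is in the conductor change
below):

  /bin/sh -cex '
  env | grep -q -e ^TILLER_DEPLOY || exit
  env | grep -q -e ^KUBE_DNS || exit
  env | grep -q -e ^KUBERNETES_PORT || exit
  env | grep -q -e ^KUBERNETES_SERVICE || exit
  /tiller'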

Since the upgrade to Kubernetes v1.18.1, this has been seen in simplex
and duplex controller configurations.

Review https://review.opendev.org/#/c/699307/ will cover patching
during initial provisioning via Ansible. This change checks that tiller
is patched every time the conductor starts as part of the tiller
upgrade logic. This covers scenarios where tiller is manually removed
from the cluster and reinstalled via helm.
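
One way to confirm that the deployment carries the patched command
(illustrative only; output formatting varies by kubectl version):

  kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system \
    get deployment tiller-deploy \
    -o jsonpath='{.spec.template.spec.containers[0].command}'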

This change should be reverted once StarlingX moves to Helm v3.

Also removed dead code: get_k8s_secret()

Change-Id: Icd199ec1b1e10840094c0eae59d53838f32ffd6f
Closes-Bug: #1856078
Signed-off-by: Robert Church <robert.church@windriver.com>
Robert Church 2020-04-19 06:22:50 -04:00
parent 3c6cb9785a
commit 24a0284e3d
7 changed files with 105 additions and 34 deletions

View File

@@ -14,6 +14,7 @@
from __future__ import absolute_import
from distutils.version import LooseVersion
import json
import os
import re
from kubernetes import config
@@ -27,6 +28,9 @@ from sysinv.common import exception
LOG = logging.getLogger(__name__)
# Kubernetes Files
KUBERNETES_ADMIN_CONF = '/etc/kubernetes/admin.conf'
# Possible states for each supported kubernetes version
KUBE_STATE_AVAILABLE = 'available'
KUBE_STATE_ACTIVE = 'active'
@@ -113,6 +117,13 @@ def get_kube_networking_upgrade_version(kube_upgrade):
return kube_upgrade.to_version
def is_k8s_configured():
"""Check to see if the k8s admin config file exists."""
if os.path.isfile(KUBERNETES_ADMIN_CONF):
return True
return False
class KubeOperator(object):
def __init__(self):
@@ -121,7 +132,12 @@ class KubeOperator(object):
self._kube_client_custom_objects = None
def _load_kube_config(self):
config.load_kube_config('/etc/kubernetes/admin.conf')
if not is_k8s_configured():
raise exception.SysinvException(
"Kubernetes is not configured. API operations will not be "
"available.")
config.load_kube_config(KUBERNETES_ADMIN_CONF)
# Workaround: Turn off SSL/TLS verification
c = Configuration()

View File

@@ -17,7 +17,6 @@ import pecan
from oslo_log import log
from sysinv.common import constants
from sysinv.common import exception
from sysinv.common import utils as cutils
LOG = log.getLogger(__name__)
@@ -485,20 +484,3 @@ class K8RbdProvisioner(object):
base_name = 'ceph-pool'
return str(base_name + '-' + name)
@staticmethod
def get_k8s_secret(secret_name, namespace=None):
try:
cmd = ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
'get', 'secrets', secret_name]
if namespace:
cmd.append('--namespace=%s' % namespace)
stdout, _ = cutils.execute(*cmd, run_as_root=False)
except exception.ProcessExecutionError as e:
if "not found" in e.stderr.lower():
return None
raise exception.SysinvException(
"Error getting secret: %s in namespace: %s, "
"Details: %s" % (secret_name, namespace, str(e)))
return stdout

View File

@@ -2757,7 +2757,7 @@ class DockerHelper(object):
# is a work around the permission issue in Armada container.
kube_config = os.path.join(constants.APP_SYNCED_ARMADA_DATA_PATH,
'admin.conf')
shutil.copy('/etc/kubernetes/admin.conf', kube_config)
shutil.copy(kubernetes.KUBERNETES_ADMIN_CONF, kube_config)
os.chown(kube_config, 1000, grp.getgrnam("sys_protected").gr_gid)
overrides_dir = common.HELM_OVERRIDES_PATH
@@ -2799,6 +2799,9 @@ class DockerHelper(object):
command=None)
LOG.info("Armada service started!")
return container
except IOError as ie:
if not kubernetes.is_k8s_configured():
LOG.error("Unable to start Armada service: %s" % ie)
except OSError as oe:
LOG.error("Unable to make kubernetes config accessible to "
"armada: %s" % oe)

View File

@@ -68,6 +68,7 @@ from oslo_utils import excutils
from oslo_utils import timeutils
from oslo_utils import uuidutils
from platform_util.license import license
from ruamel import yaml
from sqlalchemy.orm import exc
from six.moves import http_client as httplib
from sysinv._i18n import _
@@ -5267,6 +5268,52 @@ class ConductorManager(service.PeriodicService):
return
self.reapply_app(context, app_name)
def _patch_tiller_deployment(self):
""" Ensure tiller is patched with restart logic."""
LOG.info("Attempt to patch tiller deployment")
try:
# We have a race condition that may cause the tiller pod to not have
# its environment set up correctly. This will patch the tiller
# deployment to ensure that tiller can recover if that occurs. The
# deployment is patched during the initial ansible run. This will
# re-patch the deployment in the case when tiller has been removed
# and reinstalled in the cluster after the system has been
# installed. If tiller is already patched then the patch execution
# is successful causing no change to the deployment.
patch = {
'spec': {
'template': {
'spec': {
'containers': [{
'name': 'tiller',
'command': [
'/bin/sh',
'-cex',
'#!/bin/sh\n'
'env | grep -q -e ^TILLER_DEPLOY || exit\n'
'env | grep -q -e ^KUBE_DNS || exit\n'
'env | grep -q -e ^KUBERNETES_PORT || exit\n'
'env | grep -q -e ^KUBERNETES_SERVICE || exit\n'
'/tiller\n'
]
}]
}
}
}
}
cmd = ['kubectl',
'--kubeconfig={}'.format(kubernetes.KUBERNETES_ADMIN_CONF),
'patch', 'deployment', '-n', 'kube-system', 'tiller-deploy',
'-p', yaml.dump(patch)]
stdout, stderr = cutils.execute(*cmd, run_as_root=False)
except exception.ProcessExecutionError as e:
raise exception.SysinvException(
_("Error patching the tiller deployment, "
"Details: %s") % str(e))
LOG.info("Tiller deployment has been patched")
def _upgrade_downgrade_kube_components(self):
self._upgrade_downgrade_tiller()
self._upgrade_downgrade_kube_networking()
@@ -5360,6 +5407,16 @@
LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
return False
# Patch tiller to allow restarts if the environment is incomplete
#
# NOTE: This patch along with this upgrade functionality can be removed
# once StarlingX moves to Helm v3
try:
self._patch_tiller_deployment()
except Exception as e:
LOG.error("{}. Failed to patch tiller deployment.".format(e))
return False
return True
@retry(retry_on_result=lambda x: x is False,
@@ -5438,7 +5495,8 @@
:returns: list of namespaces
"""
try:
cmd = ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
cmd = ['kubectl',
'--kubeconfig={}'.format(kubernetes.KUBERNETES_ADMIN_CONF),
'get', 'namespaces', '-o',
'go-template=\'{{range .items}}{{.metadata.name}}\'{{end}}\'']
stdout, stderr = cutils.execute(*cmd, run_as_root=False)

View File

@@ -20,6 +20,7 @@ from stevedore import extension
from oslo_log import log as logging
from sysinv.common import exception
from sysinv.common import kubernetes
from sysinv.common import utils
from sysinv.helm import common
@@ -451,7 +452,7 @@ class HelmOperator(object):
cmd.extend(['--set', value_set])
env = os.environ.copy()
env['KUBECONFIG'] = '/etc/kubernetes/admin.conf'
env['KUBECONFIG'] = kubernetes.KUBERNETES_ADMIN_CONF
# Make a temporary directory with a fake chart in it
try:

View File

@@ -13,6 +13,7 @@ from eventlet.green import subprocess
import ruamel.yaml as yaml
from oslo_log import log as logging
from sysinv.agent import rpcapi as agent_rpcapi
from sysinv.common import kubernetes
from sysinv.common import exception
from sysinv.openstack.common import context
import threading
@@ -43,7 +44,7 @@ def retrieve_helm_releases():
:return: a dict of deployed helm releases
"""
helm_list = subprocess.Popen(
['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'list', '--output', 'yaml'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
timer = threading.Timer(20, helm_list.kill)
@@ -93,7 +94,7 @@ def delete_helm_release(release):
:param release: the name of the helm release
"""
helm_cmd = subprocess.Popen(
['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'delete', release],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
timer = threading.Timer(20, helm_cmd.kill)
@@ -123,7 +124,7 @@ def get_openstack_pending_install_charts():
def get_openstack_pending_install_charts():
try:
return subprocess.check_output(
['helm', '--kubeconfig', '/etc/kubernetes/admin.conf',
['helm', '--kubeconfig', kubernetes.KUBERNETES_ADMIN_CONF,
'list', '--namespace', 'openstack', '--pending'])
except Exception as e:
raise exception.HelmTillerFailure(
@@ -138,18 +139,21 @@ def helm_upgrade_tiller(image):
# sed command until helm and tiller provide a fix for
# https://github.com/helm/helm/issues/6374
workaround_part1 = '--skip-refresh ' \
'--service-account tiller ' \
'--node-selectors "node-role.kubernetes.io/master"="" ' \
'--override spec.template.spec.hostNetwork=true ' \
'--override spec.selector.matchLabels.app=helm ' \
'--override spec.selector.matchLabels.name=tiller ' \
'--output yaml'
'--service-account tiller ' \
'--node-selectors "node-role.kubernetes.io/master"="" ' \
'--override spec.template.spec.hostNetwork=true ' \
'--override spec.selector.matchLabels.app=helm ' \
'--override spec.selector.matchLabels.name=tiller ' \
'--output yaml'
workaround_part2 = \
'| sed "s@apiVersion: extensions/v1beta1@apiVersion: apps/v1@" ' \
'| kubectl --kubeconfig /etc/kubernetes/admin.conf replace --force -f -'
'| kubectl --kubeconfig {} replace --force -f -'.format(
kubernetes.KUBERNETES_ADMIN_CONF)
cmd = '{} {} {} {}'.format(
'helm init --upgrade --kubeconfig /etc/kubernetes/admin.conf --tiller-image',
cmd = '{} {} {} {} {} {}'.format(
'helm init --upgrade --kubeconfig',
kubernetes.KUBERNETES_ADMIN_CONF,
'--tiller-image',
image,
workaround_part1,
workaround_part2)

View File

@@ -624,6 +624,13 @@ class TestKubeOperator(base.TestCase):
self.list_namespaced_pod_result = None
def mock_is_k8s_configured():
return True
self.mocked_is_k8s_configured = mock.patch(
'sysinv.common.kubernetes.is_k8s_configured',
mock_is_k8s_configured)
self.mocked_is_k8s_configured.start()
def mock_list_namespaced_pod(obj, namespace, field_selector=""):
pod_name = field_selector.split('metadata.name=', 1)[1]
return self.list_namespaced_pod_result[pod_name]