Health probe for Ranger-agent pods

Health probe for Ranger-agent pods is used for both liveness and readiness probes. ranger-agent-api and ranger-agent-engine pods: - Sends an RPC call with a known method to the pod's listener queue. The probe is successful if the call returns with no error. If the listener is not reachable or fails to respond in time, failure is returned to the probe. - Checks whether the RPC sockets from the ranger-agent pods to RabbitMQ are in the established state. ranger-agent-api pod: - Launches a call to the pod's open interface. The probe is successful if the call returns; it fails if the response has an error or times out. Change-Id: I7a22fd50d47e58df19b413ed65ab528e2d78d609
2019-09-20 10:07:44 -07:00 · 2019-09-20 10:07:44 -07:00 · 3b9adc2bf0
parent 71fdc5fdb7
commit 3b9adc2bf0
7 changed files with 350 additions and 37 deletions
--- a/ranger-agent/templates/bin/_health-probe.py.tpl
+++ b/ranger-agent/templates/bin/_health-probe.py.tpl
@ -0,0 +1,256 @@
+#!/usr/bin/env python
+
+# Copyright 2019 The Openstack-Helm Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Health probe script for OpenStack service that uses RPC/unix domain socket for
+communication. Checks the RPC tcp socket status on the process, sends a
+message to the service through the rpc call method, and expects a reply.
+
+Script returns failure to Kubernetes only when
+  a. TCP socket for the RPC communication is not established,
+  b. service is not reachable or
+  c. service times out sending a reply.
+
+sys.stderr.write() writes to pod's events on failures.
+
+Usage example for Ranger-agent-engine:
+# python health-probe.py --config-file /etc/ranger-agent/ranger-agent.conf \
+#  --service-name ranger-agent-engine
+
+"""
+
+import psutil
+import requests
+import socket
+import sys
+
+from oslo_config import cfg
+from oslo_context import context
+from oslo_log import log
+import oslo_messaging
+
+try:
+    from configparser import ConfigParser
+except ImportError:
+    from ConfigParser import ConfigParser
+
+# Connection status string that tcp_socket_status() compares against the
+# status reported by psutil for each connection of a process.
+tcp_established = "ESTABLISHED"
+
+
+def check_service_status(transport, service_queue_name):
+    """Verify service status. Return success if service consumes message"""
+    service_error = False
+    try:
+        target = oslo_messaging.Target(topic=service_queue_name,
+                                       server=socket.gethostname())
+        client = oslo_messaging.RPCClient(transport, target,
+                                          timeout=75,
+                                          retry=0)
+        cctxt = client.prepare(version='1.0')
+        results = cctxt.call(context.RequestContext(),
+                             'invoke_health_probe_rpc')
+
+        for value in results.values():
+            if value == 'failed':
+                sys.stderr.write("Health probe detects problem "
+                                 ": %s\n" % results)
+                if not cfg.CONF.liveness_probe:
+                    service_error = True
+                    sys.exit(1)
+                break
+
+    except oslo_messaging.exceptions.MessageDeliveryFailure:
+        # Log to pod events
+        sys.stderr.write("Health probe unable to reach message bus\n")
+        sys.exit(0)  # return success
+    except oslo_messaging.rpc.client.RemoteError as re:
+        message = getattr(re, "message", str(re))
+        if ("Endpoint does not support RPC method" in message) or \
+                ("Endpoint does not support RPC version" in message):
+            sys.exit(0)  # Call reached the service
+        else:
+            sys.stderr.write("Health probe unable to reach service\n")
+            sys.exit(1)  # return failure
+    except oslo_messaging.exceptions.MessagingTimeout:
+        sys.stderr.write("Health probe timed out. Service is down or "
+                         "response timed out\n")
+        sys.exit(1)  # return failure
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Health probe caught exception sending message to "
+                         "service: %s\n" % message)
+        sys.exit(0)
+    except:
+        sys.stderr.write("Health probe caught exception sending message to"
+                         " service\n")
+        if service_error:
+            sys.exit(1)
+        else:
+            sys.exit(0)
+
+
+def tcp_socket_status(process, ports):
+    """Check the tcp socket status on a process"""
+    sock_count = 0
+    parentId = 0
+    for pr in psutil.pids():
+        try:
+            p = psutil.Process(pr)
+            if p.name() in process:
+                if parentId == 0:
+                    parentId = p.pid
+                else:
+                    if p.ppid() == parentId and not cfg.CONF.check_all_pids:
+                        continue
+                pcon = p.connections()
+                for con in pcon:
+                    try:
+                        rport = con.raddr[1]
+                        status = con.status
+                    except IndexError:
+                        continue
+                    if rport in ports and status == tcp_established:
+                        sock_count = sock_count + 1
+        except psutil.NoSuchProcess:
+            continue
+
+    if sock_count == 0:
+        return 0
+    else:
+        return 1
+
+
+def get_rabbitmq_ports():
+    """Get the rabbitmq port from config file"""
+    rabbit_ports = set()
+
+    try:
+        transport_url = oslo_messaging.TransportURL.parse(cfg.CONF)
+        for host in transport_url.hosts:
+            rabbit_ports.add(host.port)
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Health probe caught exception reading "
+                         "RabbitMQ ports: %s" % message)
+        sys.exit(0)  # return success
+
+    return rabbit_ports
+
+
+def test_tcp_socket(service_name):
+    """Check tcp socket to rabbitmq is in Established state"""
+    r_ports = get_rabbitmq_ports()
+
+    # service_name is the same as process name for ranger-agent app
+    proc = cfg.CONF.service_name
+    if r_ports and tcp_socket_status(service_name, r_ports) == 0:
+        sys.stderr.write("RabbitMQ socket not established\n")
+        # Do not kill the pod if RabbitMQ is not reachable/down
+        if not cfg.CONF.liveness_probe:
+            sys.exit(1)
+
+
+def test_ranger_agent_api_reachable():
+    """Test ranger-agent-api for response"""
+
+    # get ranger-agent-api port
+    config = ConfigParser()
+    config.read(cfg.CONF.config_file)
+    port = config.get('api', 'port')
+
+    url = "http://localhost:{}/v1/ord/health_check".format(port)
+    try:
+        response = requests.get(url, timeout=30)
+        if response.status_code != 200:
+            sys.exit(1)
+    except requests.exceptions.ConnectionError as ce:
+        message = getattr(ce, "message", str(ce))
+        sys.stderr.write("Health probe ConnectionError Exp: %s\n" % message)
+        sys.exit(1)
+    except requests.exceptions.ReadTimeout as to:
+        message = getattr(to, "message", str(to))
+        sys.stderr.write("Health probe ReadTimeout Exp: %s\n" % message)
+        sys.exit(1)
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Health probe caught Unknown Exp: %s\n" % message)
+        sys.exit(1)
+
+
+def test_rpc_liveness(rabbit_group, service_queue_name):
+    """Test if service can consume message from queue"""
+    try:
+        transport = oslo_messaging.get_transport(cfg.CONF)
+    except Exception as ex:
+        message = getattr(ex, "message", str(ex))
+        sys.stderr.write("Message bus driver load error: %s" % message)
+        sys.exit(0)  # return success
+
+    if not cfg.CONF.transport_url or \
+            not service_queue_name:
+        sys.stderr.write("Both message bus URL and service's queue name are "
+                         "required for health probe to work")
+        sys.exit(0)  # return success
+
+    try:
+        cfg.CONF.set_override('rabbit_max_retries', 2,
+                              group=rabbit_group)  # 3 attempts
+    except cfg.NoSuchOptError as ex:
+        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
+                              group=rabbit_group)
+
+    check_service_status(transport, service_queue_name)
+
+
+def run_health_check():
+    oslo_messaging.set_transport_defaults(control_exchange='openstack')
+
+    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
+                                title='RabbitMQ options')
+    cfg.CONF.register_group(rabbit_group)
+    cfg.CONF.register_cli_opt(cfg.StrOpt('service-name'))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
+                                          required=False))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('check-all-pids', default=False,
+                                          required=False))
+
+    cfg.CONF(sys.argv[1:])
+
+    log.logging.basicConfig(level=log.ERROR)
+
+    dict_services = {
+        "ranger-agent-engine": "ord-notifier-q",
+        "ranger-agent-api": "ord-listener-q"
+    }
+
+    service_name = cfg.CONF.service_name
+    if service_name in dict_services:
+        service_queue_name = dict_services[service_name]
+    else:
+        sys.stderr.write("Invalid service name: %s\n" % service_name)
+        sys.exit(0)  # return success
+
+    if service_name == 'ranger-agent-api':
+        test_ranger_agent_api_reachable()
+
+    test_tcp_socket(service_name)
+    test_rpc_liveness(rabbit_group, service_queue_name)
+
+
+# Script entry point: run every check; each check terminates the process
+# itself when it decides the probe outcome early.
+if __name__ == "__main__":
+    run_health_check()
+
+    # Reaching this line means no check ended the process with a failure.
+    sys.exit(0)  # return success
--- a/ranger-agent/templates/bin/_ranger-agent-test.sh.tpl
+++ b/ranger-agent/templates/bin/_ranger-agent-test.sh.tpl
@ -21,7 +21,7 @@ set -ex
 # Come up with a ranger agent payload
 region="${REGION_NAME}"
 url="${RANGER_SERVICE_URL}"
-UUID=$(python -c 'import uuid; print uuid.uuid1()')
+UUID=$(python -c 'import uuid; print(uuid.uuid1())')

 PAYLOAD="{\"ord-notifier\":{
    \"request-id\":\"$UUID\",
@ -47,11 +47,11 @@ function assertContains()
       msg="$(curl -s "$url?Id=$UUID")"
     fi
     if echo "$msg" | grep -q "$expected"; then
-       echo "***TEST IS PASSED: EXPECTED=$expected is in Responce"
+       echo "***TEST IS PASSED: EXPECTED=$expected is in Response"
       break
     else
       if [ "$n" == "5" ]; then
-         echo "***FAILED: EXPECTED=$expected in Responce"
+         echo "***FAILED: EXPECTED=$expected in Response"
         exit 1
       fi
       n=$[$n+1]
--- a/ranger-agent/templates/configmap-bin.yaml
+++ b/ranger-agent/templates/configmap-bin.yaml
@ -38,8 +38,8 @@ data:
 {{ tuple "bin/_ranger-agent-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
  ranger-agent-engine.sh: |
 {{ tuple "bin/_ranger-agent-engine.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
-  health-check.sh: |+
-{{ tuple "bin/_health-check.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  health-probe.py: |
+{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
  ranger-agent-test.sh: |+
 {{ tuple "bin/_ranger-agent-test.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
  rabbit-init.sh: |
--- a/ranger-agent/templates/deployment-ranger-agent-api.yaml
+++ b/ranger-agent/templates/deployment-ranger-agent-api.yaml
@ -14,15 +14,37 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */}}

+{{- /* Exec readiness probe for ranger-agent-api: runs /tmp/health-probe.py with the ranger-agent config file and --service-name ranger-agent-api. */ -}}
+{{- define "RangerAgentApiReadinessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-api
+{{- end }}
+{{- /* Exec liveness probe for ranger-agent-api: same command as the readiness probe plus the --liveness-probe flag. */ -}}
+{{- define "RangerAgentApiLivenessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-api
+    - --liveness-probe
+{{- end }}
+
 {{- if .Values.manifests.deployment_ranger_agent_api }}
 {{- $envAll := . }}

 {{- $mounts_ranger_agent_api := .Values.pod.mounts.ranger_agent_api.ranger_agent_api }}
 {{- $mounts_ranger_agent_api_init := .Values.pod.mounts.ranger_agent_api.init_container }}

-
 {{- $serviceAccountName := "ranger-agent-api" }}
 {{ tuple $envAll "api" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
+
 ---
 apiVersion: apps/v1beta1
 kind: Deployment
@ -71,16 +93,8 @@ spec:
          ports:
            - name: ranger-api
              containerPort: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
-          livenessProbe:
-            tcpSocket:
-              port: {{ tuple "ranger-agent" "internal" "api" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
-          readinessProbe:
-           exec:
-            command:
-             - /tmp/health-check.sh
-             - apireadiness
-           initialDelaySeconds: 30
-           timeoutSeconds: 5
+{{ dict "envAll" $envAll "component" "api" "container" "ranger-agent-api" "type" "readiness" "probeTemplate" (include "RangerAgentApiReadinessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
+{{ dict "envAll" $envAll "component" "api" "container" "ranger-agent-api" "type" "liveness" "probeTemplate" (include "RangerAgentApiLivenessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
          volumeMounts:
            - name: pod-etc-ranger-agent
              mountPath: /etc/ranger-agent
@ -89,8 +103,8 @@ spec:
              subPath: ranger-agent-api.sh
              readOnly: true
            - name: ranger-agent-bin
-              mountPath: /tmp/health-check.sh
-              subPath: health-check.sh
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
              readOnly: true
            - name: ranger-agent-etc
              mountPath: /etc/ranger-agent/ranger-agent.conf
--- a/ranger-agent/templates/deployment-ranger-agent-engine.yaml
+++ b/ranger-agent/templates/deployment-ranger-agent-engine.yaml
@ -13,6 +13,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */}}
+
+{{- /* Exec readiness probe for ranger-agent-engine: runs /tmp/health-probe.py with the ranger-agent config file and --service-name ranger-agent-engine. */ -}}
+{{- define "RangerAgentEngineReadinessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-engine
+{{- end }}
+{{- /* Exec liveness probe for ranger-agent-engine: same command as the readiness probe plus the --liveness-probe flag. */ -}}
+{{- define "RangerAgentEngineLivenessProbeTemplate" }}
+exec:
+  command:
+    - python
+    - /tmp/health-probe.py
+    - --config-file
+    - /etc/ranger-agent/ranger-agent.conf
+    - --service-name
+    - ranger-agent-engine
+    - --liveness-probe
+{{- end }}
+
 {{- if .Values.manifests.deployment_ranger_agent_engine }}
 {{- $envAll := . }}

@ -21,6 +44,7 @@ limitations under the License.

 {{- $serviceAccountName := "ranger-agent-engine" }}
 {{ tuple $envAll "engine" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
+
 ---
 apiVersion: apps/v1beta1
 kind: Deployment
@ -92,20 +116,8 @@ spec:
                command:
                  - /tmp/ranger-agent-engine.sh
                  - stop
-          livenessProbe:
-           exec:
-            command:
-             - /tmp/health-check.sh
-             - engineliveness
-           initialDelaySeconds: 30
-           timeoutSeconds: 5
-          readinessProbe:
-           exec:
-            command:
-             - /tmp/health-check.sh
-             - enginereadiness
-           initialDelaySeconds: 30
-           timeoutSeconds: 5
+{{ dict "envAll" $envAll "component" "engine" "container" "ranger-agent-engine" "type" "readiness" "probeTemplate" (include "RangerAgentEngineReadinessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
+{{ dict "envAll" $envAll "component" "engine" "container" "ranger-agent-engine" "type" "liveness" "probeTemplate" (include "RangerAgentEngineLivenessProbeTemplate" $envAll | fromYaml) | include "helm-toolkit.snippets.kubernetes_probe" | indent 10 }}
          volumeMounts:
            - name: pod-etc-ranger-agent
              mountPath: /etc/ranger-agent
@ -114,8 +126,8 @@ spec:
              subPath: ranger-agent-engine.sh
              readOnly: true
            - name: ranger-agent-bin
-              mountPath: /tmp/health-check.sh
-              subPath: health-check.sh
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
              readOnly: true
            - name: ranger-agent-etc
              mountPath: /etc/ranger-agent/ranger-agent.conf
--- a/ranger-agent/values.yaml
+++ b/ranger-agent/values.yaml
@ -27,8 +27,8 @@ images:
    ks_service: docker.io/openstackhelm/heat:newton-ubuntu_xenial
    ks_user: docker.io/openstackhelm/heat:newton-ubuntu_xenial
    rabbit_init: docker.io/rabbitmq:3.7-management
-    ranger-agent_db_sync: quay.io/attcomdev/ranger-agent:60529ac023bf550f0e9cb9e0eb4d4eb3dbf2d5c6
-    ranger_agent: quay.io/attcomdev/ranger-agent:60529ac023bf550f0e9cb9e0eb4d4eb3dbf2d5c6
+    ranger-agent_db_sync: quay.io/attcomdev/ranger-agent:02114b616b50c24e7f1f27d9b1ab3d722b4b20b2
+    ranger_agent: quay.io/attcomdev/ranger-agent:02114b616b50c24e7f1f27d9b1ab3d722b4b20b2
    scripted_test: docker.io/openstackhelm/heat:newton-ubuntu_xenial
  pull_policy: "IfNotPresent"
  local_registry:
@ -264,6 +264,35 @@ pod:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
+  probes:
+    api:
+      ranger-agent-api:
+        readiness:
+          enabled: true
+          params:
+            initialDelaySeconds: 80
+            periodSeconds: 95
+            timeoutSeconds: 85
+        liveness:
+          enabled: true
+          params:
+            initialDelaySeconds: 120
+            periodSeconds: 95
+            timeoutSeconds: 85
+    engine:
+      ranger-agent-engine:
+        readiness:
+          enabled: true
+          params:
+            initialDelaySeconds: 80
+            periodSeconds: 95
+            timeoutSeconds: 85
+        liveness:
+          enabled: true
+          params:
+            initialDelaySeconds: 120
+            periodSeconds: 95
+            timeoutSeconds: 85

 # Names of secrets used  and environmental checks
 secrets:
@ -517,6 +546,7 @@ conf:
      api_paste_config: /etc/ranger-agent/api-paste.ini
      local_repo: ranger_repo
      resource_status_check_wait: 15
+      enable_heat_health_check: true
    api:
      host: 0.0.0.0
    database:
--- a/tools/gate/scripts/070-deploy-ranger-agent.sh
+++ b/tools/gate/scripts/070-deploy-ranger-agent.sh
@ -7,7 +7,8 @@ tee /tmp/ranger-agent.yaml << EOF
 conf:
  ranger_agent:
    DEFAULT:
-      enable_rds_callback_check: False
+      enable_rds_callback_check: false
+      enable_heat_health_check: false
  ssh:
    ssh_key: null
    ssh_config: null