diff --git a/nova/templates/bin/_health-probe.py.tpl b/nova/templates/bin/_health-probe.py.tpl
new file mode 100644
index 0000000000..683387476a
--- /dev/null
+++ b/nova/templates/bin/_health-probe.py.tpl
@@ -0,0 +1,208 @@
+#!/usr/bin/env python2
+
+# Copyright 2019 The Openstack-Helm Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Health probe script for OpenStack services that use an RPC/unix domain socket
+for communication. It checks the RPC TCP socket status on the process, then
+sends a message to the service through an RPC call and expects a reply. The
+reply is expected to be a failure from the service's RPC server, since the
+probed method does not exist.
+
+The script returns failure to Kubernetes only when
+  a. the TCP sockets for RPC communication are not established,
+  b. the service is not reachable, or
+  c. the service times out sending a reply.
+
+sys.stderr.write() writes to the pod's events on failures.
+
+Usage example for Nova Compute:
+# python health-probe.py --config-file /etc/nova/nova.conf \
+#     --service-queue-name compute
+
+"""
+
+import psutil
+import socket
+import sys
+
+from oslo_config import cfg
+from oslo_context import context
+from oslo_log import log
+import oslo_messaging
+
+
+tcp_established = "ESTABLISHED"
+
+
+def check_service_status(transport):
+    """Verify service status. Return success if service consumes message"""
+    try:
+        target = oslo_messaging.Target(topic=cfg.CONF.service_queue_name,
+                                       server=socket.gethostname())
+        client = oslo_messaging.RPCClient(transport, target,
+                                          timeout=60,
+                                          retry=2)
+        client.call(context.RequestContext(),
+                    'pod_health_probe_method_ignore_errors')
+    except oslo_messaging.exceptions.MessageDeliveryFailure:
+        # Log to pod events
+        sys.stderr.write("Health probe unable to reach message bus")
+        sys.exit(0)  # return success
+    except oslo_messaging.rpc.client.RemoteError as re:
+        if ("Endpoint does not support RPC method" in re.message) or \
+                ("Endpoint does not support RPC version" in re.message):
+            sys.exit(0)  # Call reached the service
+        else:
+            sys.stderr.write("Health probe unable to reach service")
+            sys.exit(1)  # return failure
+    except oslo_messaging.exceptions.MessagingTimeout:
+        sys.stderr.write("Health probe timed out. Agent is down or response "
+                         "timed out")
+        sys.exit(1)  # return failure
+    except Exception as ex:
+        sys.stderr.write("Health probe caught exception sending message to "
+                         "service: %s" % ex.message)
+        sys.exit(0)
+    except:
+        sys.stderr.write("Health probe caught exception sending message to"
+                         " service")
+        sys.exit(0)
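
The success criterion above is deliberately inverted: the call to the nonexistent `pod_health_probe_method_ignore_errors` method is expected to fail, because only a live RPC server that actually consumed the message can reply with an "unsupported method/version" error. A minimal sketch of that check in isolation (the helper name and error strings fed to it are illustrative, not part of the chart):

```python
# Sketch of the probe's success criterion: an "unsupported method/version"
# RemoteError proves the service consumed the message, so it counts as
# healthy. The helper name is hypothetical.
def service_consumed_message(remote_error_message):
    return ("Endpoint does not support RPC method" in remote_error_message or
            "Endpoint does not support RPC version" in remote_error_message)


assert service_consumed_message(
    "Endpoint does not support RPC method "
    "pod_health_probe_method_ignore_errors")
assert not service_consumed_message("Connection refused")
```
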
+
+
+def tcp_socket_status(process, port):
+    """Check the tcp socket status on a process"""
+    sock_count = 0
+    parentId = 0
+    for pr in psutil.pids():
+        try:
+            p = psutil.Process(pr)
+            if p.name() == process:
+                # Track the first matching process as the parent and skip
+                # processes forked from it; only the parent's connections
+                # are counted.
+                if parentId == 0:
+                    parentId = p.pid
+                else:
+                    if p.ppid() == parentId:
+                        continue
+                pcon = p.connections()
+                for con in pcon:
+                    try:
+                        rport = con.raddr[1]
+                        status = con.status
+                    except IndexError:
+                        # Socket has no remote address (e.g. listening)
+                        continue
+                    if rport == port and status == tcp_established:
+                        sock_count = sock_count + 1
+        except psutil.NoSuchProcess:
+            # Process exited while we were iterating; ignore it
+            continue
+
+    if sock_count == 0:
+        return 0
+    else:
+        return 1
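
The check boils down to counting ESTABLISHED TCP connections the service process holds to the configured port. A standalone sketch of the same idea, runnable wherever psutil is installed (the process name and port passed at the bottom are illustrative):

```python
# Standalone sketch of the socket check above: count ESTABLISHED TCP
# connections a named process holds to a given remote port.
import psutil


def established_to_port(process_name, remote_port):
    count = 0
    for proc in psutil.process_iter():
        try:
            if proc.name() != process_name:
                continue
            for conn in proc.connections():
                if conn.raddr and conn.raddr[1] == remote_port \
                        and conn.status == psutil.CONN_ESTABLISHED:
                    count += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return count


# Illustrative values: 5672 is the conventional RabbitMQ port.
print(established_to_port("nova-compute", 5672))
```
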
+
+
+def configured_port_in_conf():
+    """Get the rabbitmq/database port configured in the config file"""
+    rabbitmq_port = 0
+    database_port = 0
+    try:
+        # sys.argv[2] is the value passed to --config-file
+        with open(sys.argv[2]) as conf_file:
+            for line in conf_file:
+                if "transport_url" in line:
+                    rabbitmq_port = int(line.split(':', 3)[3].split('/')[0])
+                elif "connection =" in line:
+                    service = line.split(':', 3)[3].split('/')[1].rstrip('\n')
+                    if service == "nova":
+                        database_port = int(
+                            line.split(':', 3)[3].split('/')[0])
+            return rabbitmq_port, database_port
+    except IOError:
+        sys.stderr.write("Nova config file not present")
+        sys.exit(1)
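
The port extraction relies on the fixed shape of the chart-rendered URLs: split on the first three colons, take the last piece, and cut at the first slash. For a transport_url line like the made-up example below, the chain isolates the port:

```python
# Illustrative parse of a chart-style transport_url line; the URL is a
# made-up example with the same shape the probe expects.
line = ("transport_url = rabbit://nova:password@"
        "rabbitmq.openstack.svc.cluster.local:5672/nova")
port = int(line.split(':', 3)[3].split('/')[0])
vhost = line.split(':', 3)[3].split('/')[1]
assert port == 5672
assert vhost == "nova"
```
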
+
+
+def test_tcp_socket(service):
+    """Check that the tcp sockets to rabbitmq/database are established"""
+    dict_services = {
+        "compute": "nova-compute",
+        "conductor": "nova-conductor",
+        # Process names reported by the kernel are truncated to 15
+        # characters, hence "nova-consoleaut"
+        "consoleauth": "nova-consoleaut",
+        "scheduler": "nova-scheduler"
+    }
+    r_port, d_port = configured_port_in_conf()
+
+    if service in dict_services:
+        proc = dict_services[service]
+        if r_port != 0 and tcp_socket_status(proc, r_port) == 0:
+            sys.stderr.write("RabbitMQ socket not established")
+            # Do not kill the pod if RabbitMQ is not reachable/down
+            if not cfg.CONF.liveness_probe:
+                sys.exit(1)
+
+        # let's do the db check
+        if service != "compute":
+            if d_port != 0 and tcp_socket_status(proc, d_port) == 0:
+                sys.stderr.write("Database socket not established")
+                # Do not kill the pod if the database is not reachable/down;
+                # there may also be no socket at all, as connections are
+                # typically closed after an idle timeout.
+                # Just log it to pod events
+                if not cfg.CONF.liveness_probe:
+                    sys.exit(1)
+
+
+def test_rpc_liveness():
+    """Test if service can consume message from queue"""
+    oslo_messaging.set_transport_defaults(control_exchange='nova')
+
+    rabbit_group = cfg.OptGroup(name='oslo_messaging_rabbit',
+                                title='RabbitMQ options')
+    cfg.CONF.register_group(rabbit_group)
+    cfg.CONF.register_cli_opt(cfg.StrOpt('service-queue-name'))
+    cfg.CONF.register_cli_opt(cfg.BoolOpt('liveness-probe', default=False,
+                                          required=False))
+
+    cfg.CONF(sys.argv[1:])
+
+    log.logging.basicConfig(level=log.ERROR)
+
+    try:
+        transport = oslo_messaging.get_transport(cfg.CONF)
+    except Exception as ex:
+        sys.stderr.write("Message bus driver load error: %s" % ex.message)
+        sys.exit(0)  # return success
+
+    if not cfg.CONF.transport_url or \
+            not cfg.CONF.service_queue_name:
+        sys.stderr.write("Both the message bus URL and the service's queue "
+                         "name are required for the health probe to work")
+        sys.exit(0)  # return success
+
+    try:
+        cfg.CONF.set_override('rabbit_max_retries', 2,
+                              group=rabbit_group)  # 3 attempts
+    except cfg.NoSuchOptError:
+        cfg.CONF.register_opt(cfg.IntOpt('rabbit_max_retries', default=2),
+                              group=rabbit_group)
+
+    service = cfg.CONF.service_queue_name
+    test_tcp_socket(service)
+
+    check_service_status(transport)
+
+
+if __name__ == "__main__":
+    test_rpc_liveness()
+
+    sys.exit(0)  # return success
diff --git a/nova/templates/configmap-bin.yaml b/nova/templates/configmap-bin.yaml
index e422b62196..c58b90bd7e 100644
--- a/nova/templates/configmap-bin.yaml
+++ b/nova/templates/configmap-bin.yaml
@@ -51,6 +51,8 @@ data:
   ceph-admin-keyring.sh: |
 {{ tuple "bin/_ceph-admin-keyring.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
+  health-probe.py: |
+{{ tuple "bin/_health-probe.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   nova-api.sh: |
 {{ tuple "bin/_nova-api.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   nova-api-metadata.sh: |
diff --git a/nova/templates/daemonset-compute.yaml b/nova/templates/daemonset-compute.yaml
index 463ea72bb7..35236bdadc 100644
--- a/nova/templates/daemonset-compute.yaml
+++ b/nova/templates/daemonset-compute.yaml
@@ -180,6 +180,31 @@ spec:
            - name: LIBVIRT_CEPH_SECRET_UUID
              value: "{{ .Values.conf.ceph.secret_uuid }}"
 {{ end }}
+          readinessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - compute
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - compute
+                - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-compute.sh
           volumeMounts:
@@ -187,6 +212,10 @@ spec:
              mountPath: /tmp/nova-compute.sh
              subPath: nova-compute.sh
              readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
            - name: nova-etc
              mountPath: /etc/nova/nova.conf
              subPath: nova.conf
diff --git a/nova/templates/deployment-conductor.yaml b/nova/templates/deployment-conductor.yaml
index 33de6413c5..1e66e41932 100644
--- a/nova/templates/deployment-conductor.yaml
+++ b/nova/templates/deployment-conductor.yaml
@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.conductor | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - conductor
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - conductor
+                - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-conductor.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
              mountPath: /tmp/nova-conductor.sh
              subPath: nova-conductor.sh
              readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
            - name: nova-etc
              mountPath: /etc/nova/nova.conf
              subPath: nova.conf
diff --git a/nova/templates/deployment-consoleauth.yaml b/nova/templates/deployment-consoleauth.yaml
index 29832d56a6..75b66e7939 100644
--- a/nova/templates/deployment-consoleauth.yaml
+++ b/nova/templates/deployment-consoleauth.yaml
@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.consoleauth | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - consoleauth
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - consoleauth
+                - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-consoleauth.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
              mountPath: /tmp/nova-consoleauth.sh
              subPath: nova-consoleauth.sh
              readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
            - name: nova-etc
              mountPath: /etc/nova/nova.conf
              subPath: nova.conf
diff --git a/nova/templates/deployment-novncproxy.yaml b/nova/templates/deployment-novncproxy.yaml
index 8d187c8b88..cf9fda0243 100644
--- a/nova/templates/deployment-novncproxy.yaml
+++ b/nova/templates/deployment-novncproxy.yaml
@@ -94,6 +94,14 @@ spec:
         - name: nova-novncproxy
 {{ tuple $envAll "nova_novncproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
 {{ tuple $envAll $envAll.Values.pod.resources.novncproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
+          readinessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
+          livenessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_novnc_proxy" "internal" "novnc_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
           command:
             - /tmp/nova-console-proxy.sh
           ports:
diff --git a/nova/templates/deployment-scheduler.yaml b/nova/templates/deployment-scheduler.yaml
index a3d46e5db0..9611d9509f 100644
--- a/nova/templates/deployment-scheduler.yaml
+++ b/nova/templates/deployment-scheduler.yaml
@@ -60,6 +60,31 @@ spec:
 {{ tuple $envAll $envAll.Values.pod.resources.scheduler | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
           securityContext:
             allowPrivilegeEscalation: false
+          readinessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - scheduler
+            initialDelaySeconds: 80
+            periodSeconds: 90
+            timeoutSeconds: 70
+          livenessProbe:
+            exec:
+              command:
+                - python
+                - /tmp/health-probe.py
+                - --config-file
+                - /etc/nova/nova.conf
+                - --service-queue-name
+                - scheduler
+                - --liveness-probe
+            initialDelaySeconds: 120
+            periodSeconds: 90
+            timeoutSeconds: 70
           command:
             - /tmp/nova-scheduler.sh
           volumeMounts:
@@ -67,6 +92,10 @@ spec:
              mountPath: /tmp/nova-scheduler.sh
              subPath: nova-scheduler.sh
              readOnly: true
+            - name: nova-bin
+              mountPath: /tmp/health-probe.py
+              subPath: health-probe.py
+              readOnly: true
            - name: nova-etc
              mountPath: /etc/nova/nova.conf
              subPath: nova.conf
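
Across the exec probes, timeoutSeconds (70) leaves headroom over the script's internal 60-second RPC timeout, and only the liveness variant passes --liveness-probe, so a RabbitMQ or database outage degrades readiness without restarting the pod. The kubelet looks only at the exit code; a quick way to exercise the probe from inside a pod might look like the sketch below (the paths, flags, and queue name mirror the chart, but the harness itself is illustrative):

```python
# Illustrative harness: run the probe the same way the readinessProbe
# does and interpret the exit code as the kubelet would.
import subprocess

rc = subprocess.call([
    "python", "/tmp/health-probe.py",
    "--config-file", "/etc/nova/nova.conf",
    "--service-queue-name", "scheduler",
])
print("ready" if rc == 0 else "not ready")
```
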
diff --git a/nova/templates/deployment-spiceproxy.yaml b/nova/templates/deployment-spiceproxy.yaml
index b026d753ea..4507bde4ce 100644
--- a/nova/templates/deployment-spiceproxy.yaml
+++ b/nova/templates/deployment-spiceproxy.yaml
@@ -94,6 +94,14 @@ spec:
         - name: nova-spiceproxy
 {{ tuple $envAll "nova_spiceproxy" | include "helm-toolkit.snippets.image" | indent 10 }}
 {{ tuple $envAll $envAll.Values.pod.resources.spiceproxy | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
+          readinessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
+          livenessProbe:
+            tcpSocket:
+              port: {{ tuple "compute_spice_proxy" "internal" "spice_proxy" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
+            initialDelaySeconds: 30
           command:
             - /tmp/nova-console-proxy.sh
           ports:
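
The console proxies (novncproxy and spiceproxy) do not speak RPC, so they get plain tcpSocket probes instead of the exec probe. Functionally that check is just a successful TCP connect against the service port, roughly equivalent to this sketch (host and port are illustrative):

```python
# A tcpSocket probe reduces to a TCP connect attempt; the kubelet marks
# the container ready/live if the connection succeeds.
import socket

try:
    sock = socket.create_connection(("127.0.0.1", 6080), timeout=5)
    sock.close()
    print("ready")
except socket.error:
    print("not ready")
```
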