From dfb4654fba4faf1d8959bfe7d79ed1f6adaf7ae6 Mon Sep 17 00:00:00 2001 From: Steve Wilkerson Date: Wed, 7 Nov 2018 07:26:39 -0600 Subject: [PATCH] Nagios: Configuration updates This moves to update the host used for the ceph health checks, as we should be checking the ceph-mgr service directly for ceph metrics instead of trying to curl the host directly. This also changes the ceph_health_check to use the base-os hostgroup instead of the placeholder ceph-mgr host group, as we're just executing a simple check against the ceph-mgr service. This also adds default configuration values for the max_concurrent_checks (60) and check_workers (4) values instead of leaving them at the defaults Nagios uses (0 and # cores, respectively) Change-Id: Ib4072fcd545d8c05d5e9e4a93085a8330be6dfe0 --- nagios/templates/deployment.yaml | 2 ++ nagios/values.yaml | 27 +++++++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/nagios/templates/deployment.yaml b/nagios/templates/deployment.yaml index 8650bf45b..c4ee3ff7c 100644 --- a/nagios/templates/deployment.yaml +++ b/nagios/templates/deployment.yaml @@ -155,6 +155,8 @@ spec: value: {{ $envAll.Values.conf.nagios.notification.http.primary_target }} - name: REST_NOTIF_SECONDARY_TARGET_URL value: {{ $envAll.Values.conf.nagios.notification.http.secondary_target }} + - name: CEPH_MGR_SERVICE + value: {{ tuple "ceph_mgr" "internal" "metrics" $envAll | include "helm-toolkit.endpoints.host_and_port_endpoint_uri_lookup" }}/metrics - name: PROMETHEUS_SERVICE valueFrom: secretKeyRef: diff --git a/nagios/values.yaml b/nagios/values.yaml index 4a8224324..7fb2b9cf5 100644 --- a/nagios/values.yaml +++ b/nagios/values.yaml @@ -155,6 +155,19 @@ endpoints: port: http: default: 80 + ceph_mgr: + namespace: null + hosts: + default: ceph-mgr + host_fqdn_override: + default: null + port: + mgr: + default: 7000 + metrics: + default: 9283 + scheme: + default: http network: nagios: @@ -366,9 +379,6 @@ conf: - base-os: hostgroup_name: base-os alias: "base-os" - - ceph_mgr_placeholder: - hostgroup_name: ceph_mgr_placeholder - alias: "ceph_mgr_placeholder" commands: - send_service_snmp_trap: command_name: send_service_snmp_trap @@ -456,7 +466,7 @@ conf: command_line: $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_ntp_clock_skew_high' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- NTP clock skew is more than 2 seconds.' --ok_message 'OK- NTP clock skew is less than 2 seconds.' - check_ceph_health: command_name: check_ceph_health - command_line: $USER1$/check_exporter_health_metric.py --exporter_api 'http://$HOSTADDRESS$:9283/metrics' --health_metric ceph_health_status --critical 2 --warning 1 + command_line: $USER1$/check_exporter_health_metric.py --exporter_api $USER10$ --health_metric ceph_health_status --critical 2 --warning 1 - check_prometheus_hosts: command_name: check_prometheus_hosts command_line: $USER1$/check_update_prometheus_hosts.py --prometheus_api $USER2$ --object_file_loc /opt/nagios/etc/objects/prometheus_discovery_objects.cfg @@ -469,14 +479,14 @@ conf: contact_groups: snmp_and_http_notifying_contact_group check_interval: 60 notification_interval: 120 - retry_interval: 15 + retry_interval: 30 register: 0 - check_ceph_health: use: notifying_service - hostgroup_name: ^ceph_mgr.*$ + hostgroup_name: base-os service_description: "CEPH_health" check_command: check_ceph_health - check_interval: 60 + check_interval: 300 - check_hosts_health: use: generic-service hostgroup_name: prometheus-hosts @@ -1075,7 +1085,7 @@ conf: service_interleave_factor: s host_inter_check_delay_method: s max_host_check_spread: 30 - max_concurrent_checks: 0 + max_concurrent_checks: 60 check_result_reaper_frequency: 10 max_check_result_reaper_time: 30 check_result_path: /opt/nagios/var/spool/checkresults @@ -1106,6 +1116,7 @@ conf: retained_contact_host_attribute_mask: 0 retained_contact_service_attribute_mask: 0 interval_length: 1 + check_workers: 4 check_for_updates: 1 bare_update_check: 0 use_aggressive_host_checking: 0