1. Configurable prometheus monitoring persistent storage
* Add monitoring_retention_days magnum label allowing user to specify prometheus server scraped metrics retention days (default: 14) * Add monitoring_retention_size magnum label allowing user to specify prometheus server metrics storage maximum size in GiB (default: 14) * Add monitoring_interval_seconds magnum label allowing user to specify prometheus scrape frequency in seconds (default: 30) * Add monitoring_storage_class_name magnum label allowing user to specify the storageClass to use as external retention for pod fail-over data persistency task: 39509 story: 2006765 Change-Id: I42117837e8e3cd03f3cb723df4d73692ead0d169 Signed-off-by: Diogo Guerra <diogo.filipe.tomas.guerra@cern.ch>
This commit is contained in:
parent
e24bf6252f
commit
37497ccf5b
|
@ -320,6 +320,14 @@ the table are linked to more details elsewhere in the user guide.
|
|||
| `monitoring_enabled`_ | - true | false |
|
||||
| | - false | |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `monitoring_retention_days`_ | see below | see below |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `monitoring_retention_size`_ | see below | see below |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `monitoring_storage_class_name`_ | see below | see below |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `monitoring_interval_seconds`_ | see below | see below |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `prometheus_operator_chart_tag`_ | see below | see below |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `prometheus_adapter_enabled`_ | - true | true |
|
||||
|
@ -1475,6 +1483,25 @@ _`monitoring_enabled`
|
|||
helm_client_tag<v3.0.0.
|
||||
Default: false
|
||||
|
||||
_`monitoring_retention_days`
|
||||
The number of days that prometheus metrics should be kept.
|
||||
Default: 14
|
||||
|
||||
_`monitoring_retention_size`
|
||||
The maximum storage (in GiB) allowed to be used by prometheus server to
|
||||
store metrics.
|
||||
Default: 14
|
||||
|
||||
_`monitoring_interval_seconds`
|
||||
The time interval (in seconds) between consecutive metric scrapings.
|
||||
Default: 30
|
||||
|
||||
_`monitoring_storage_class_name`
|
||||
The kubernetes storage class name to use for the prometheus pvc.
|
||||
Using this label will activate the usage of a pvc instead of local
|
||||
disk space.
|
||||
Default: ""
|
||||
|
||||
_`prometheus_adapter_enabled`
|
||||
Enable installation of cluster custom metrics provided by the
|
||||
stable/prometheus-adapter helm chart. This service depends on
|
||||
|
|
|
@ -59,6 +59,10 @@ VERIFY_CA="$VERIFY_CA"
|
|||
CLUSTER_UUID="$CLUSTER_UUID"
|
||||
MAGNUM_URL="$MAGNUM_URL"
|
||||
MONITORING_ENABLED="$MONITORING_ENABLED"
|
||||
MONITORING_RETENTION_DAYS="$MONITORING_RETENTION_DAYS"
|
||||
MONITORING_RETENTION_SIZE="$MONITORING_RETENTION_SIZE"
|
||||
MONITORING_INTERVAL_SECONDS="$MONITORING_INTERVAL_SECONDS"
|
||||
MONITORING_STORAGE_CLASS_NAME="$MONITORING_STORAGE_CLASS_NAME"
|
||||
PROMETHEUS_OPERATOR_CHART_TAG="$PROMETHEUS_OPERATOR_CHART_TAG"
|
||||
PROMETHEUS_ADAPTER_ENABLED="$PROMETHEUS_ADAPTER_ENABLED"
|
||||
PROMETHEUS_ADAPTER_CHART_TAG="$PROMETHEUS_ADAPTER_CHART_TAG"
|
||||
|
|
|
@ -21,6 +21,11 @@ EOF
|
|||
PROMETHEUS_SERVER_CPU=$(expr 128 + 7 \* ${MAX_NODE_COUNT} )
|
||||
PROMETHEUS_SERVER_RAM=$(expr 256 + 40 \* ${MAX_NODE_COUNT})
|
||||
|
||||
# Because the PVC and Prometheus use different scales for the volume size
|
||||
# conversion is needed. The prometheus-monitoring value (in GB) is the conversion
|
||||
# with a ratio of (1 GiB = 1.073741824 GB) and then truncated to an integer
|
||||
MONITORING_RETENTION_SIZE_GB=$(echo | awk "{print int(${MONITORING_RETENTION_SIZE}*1.073741824)}")
|
||||
|
||||
# Validate if communication node <-> master is secure or insecure
|
||||
PROTOCOL="https"
|
||||
INSECURE_SKIP_VERIFY="False"
|
||||
|
@ -193,6 +198,7 @@ prometheus-operator:
|
|||
|
||||
prometheus:
|
||||
prometheusSpec:
|
||||
scrapeInterval: ${MONITORING_INTERVAL_SECONDS}s  # set from the monitoring_interval_seconds label
|
||||
evaluationInterval: 30s
|
||||
image:
|
||||
|
@ -209,6 +215,8 @@ prometheus-operator:
|
|||
# - kube-controller-manager-certificates
|
||||
# - kube-scheduler-certificates
|
||||
# - kube-proxy-manager-certificates
|
||||
retention: ${MONITORING_RETENTION_DAYS}d
|
||||
retentionSize: ${MONITORING_RETENTION_SIZE_GB}GB
|
||||
resources:
|
||||
requests:
|
||||
cpu: ${PROMETHEUS_SERVER_CPU}m
|
||||
|
@ -216,6 +224,21 @@ prometheus-operator:
|
|||
priorityClassName: "system-cluster-critical"
|
||||
EOF
|
||||
|
||||
#######################
|
||||
# Set up definitions for persistent storage using k8s storageClass
|
||||
if [ "${MONITORING_STORAGE_CLASS_NAME}" != "" ]; then
|
||||
cat << EOF >> ${HELM_CHART_DIR}/values.yaml
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
storageClassName: ${MONITORING_STORAGE_CLASS_NAME}
|
||||
accessModes: ["ReadWriteMany"]
|
||||
resources:
|
||||
requests:
|
||||
storage: ${MONITORING_RETENTION_SIZE}Gi
|
||||
EOF
|
||||
fi #END PERSISTENT STORAGE CONFIG
|
||||
|
||||
#######################
|
||||
# Set up definitions for ingress objects
|
||||
|
||||
|
@ -225,17 +248,17 @@ EOF
|
|||
:
|
||||
elif [ "${INGRESS_CONTROLLER}" == "traefik" ]; then
|
||||
cat << EOF >> ${HELM_CHART_DIR}/values.yaml
|
||||
additionalServiceMonitors:
|
||||
- name: prometheus-traefik-metrics
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: traefik
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
||||
endpoints:
|
||||
- path: /metrics
|
||||
port: metrics
|
||||
additionalServiceMonitors:
|
||||
- name: prometheus-traefik-metrics
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: traefik
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- kube-system
|
||||
endpoints:
|
||||
- path: /metrics
|
||||
port: metrics
|
||||
EOF
|
||||
fi #END INGRESS
|
||||
|
||||
|
|
|
@ -98,6 +98,10 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
|
|||
'metrics_server_enabled',
|
||||
'metrics_server_chart_tag',
|
||||
'monitoring_enabled',
|
||||
'monitoring_retention_days',
|
||||
'monitoring_retention_size',
|
||||
'monitoring_interval_seconds',
|
||||
'monitoring_storage_class_name',
|
||||
'prometheus_operator_chart_tag',
|
||||
'prometheus_adapter_enabled',
|
||||
'prometheus_adapter_chart_tag',
|
||||
|
|
|
@ -694,6 +694,28 @@ parameters:
|
|||
description: Enable or disable prometheus-operator monitoring solution.
|
||||
default: false
|
||||
|
||||
monitoring_retention_days:
|
||||
type: number
|
||||
description: The number of time (in days) that prometheus metrics should be kept.
|
||||
default: 14
|
||||
|
||||
monitoring_retention_size:
|
||||
type: number
|
||||
description: >
|
||||
The maximum memory (in Gi) allowed to be used by prometheus server to store metrics.
|
||||
default: 14
|
||||
|
||||
monitoring_interval_seconds:
|
||||
type: number
|
||||
description: >
|
||||
The time interval (in seconds) between consecutive metric scrapings.
|
||||
default: 30
|
||||
|
||||
monitoring_storage_class_name:
|
||||
type: string
|
||||
description: The kubernetes storage class name to use for the prometheus pvc.
|
||||
default: ""
|
||||
|
||||
prometheus_operator_chart_tag:
|
||||
type: string
|
||||
description: The stable/prometheus-operator chart version to use.
|
||||
|
@ -1228,6 +1250,10 @@ resources:
|
|||
keystone_auth_enabled: {get_param: keystone_auth_enabled}
|
||||
k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
|
||||
monitoring_enabled: {get_param: monitoring_enabled}
|
||||
monitoring_retention_days: {get_param: monitoring_retention_days}
|
||||
monitoring_retention_size: {get_param: monitoring_retention_size}
|
||||
monitoring_interval_seconds: {get_param: monitoring_interval_seconds}
|
||||
monitoring_storage_class_name: {get_param: monitoring_storage_class_name}
|
||||
prometheus_operator_chart_tag: {get_param: prometheus_operator_chart_tag}
|
||||
prometheus_adapter_enabled: {get_param: prometheus_adapter_enabled}
|
||||
prometheus_adapter_chart_tag: {get_param: prometheus_adapter_chart_tag}
|
||||
|
|
|
@ -463,6 +463,24 @@ parameters:
|
|||
type: boolean
|
||||
description: Enable or disable prometheus-operator monitoring solution.
|
||||
|
||||
monitoring_retention_days:
|
||||
type: number
|
||||
description: The number of time (in days) that prometheus metrics should be kept.
|
||||
|
||||
monitoring_retention_size:
|
||||
type: number
|
||||
description: >
|
||||
The maximum memory (in Gi) allowed to be used by prometheus server to store metrics.
|
||||
|
||||
monitoring_interval_seconds:
|
||||
type: number
|
||||
description: >
|
||||
The time interval (in seconds) between consecutive metric scrapings.
|
||||
|
||||
monitoring_storage_class_name:
|
||||
type: string
|
||||
description: The kubernetes storage class name to use for the prometheus pvc.
|
||||
|
||||
prometheus_operator_chart_tag:
|
||||
type: string
|
||||
description: The stable/prometheus-operator chart version to use.
|
||||
|
@ -795,6 +813,10 @@ resources:
|
|||
"$KEYSTONE_AUTH_ENABLED": {get_param: keystone_auth_enabled}
|
||||
"$K8S_KEYSTONE_AUTH_TAG": {get_param: k8s_keystone_auth_tag}
|
||||
"$MONITORING_ENABLED": {get_param: monitoring_enabled}
|
||||
"$MONITORING_RETENTION_DAYS": {get_param: monitoring_retention_days}
|
||||
"$MONITORING_RETENTION_SIZE": {get_param: monitoring_retention_size}
|
||||
"$MONITORING_INTERVAL_SECONDS": {get_param: monitoring_interval_seconds}
|
||||
"$MONITORING_STORAGE_CLASS_NAME": {get_param: monitoring_storage_class_name}
|
||||
"$PROMETHEUS_OPERATOR_CHART_TAG": {get_param: prometheus_operator_chart_tag}
|
||||
"$PROMETHEUS_ADAPTER_ENABLED": {get_param: prometheus_adapter_enabled}
|
||||
"$PROMETHEUS_ADAPTER_CHART_TAG": {get_param: prometheus_adapter_chart_tag}
|
||||
|
|
|
@ -704,6 +704,28 @@ parameters:
|
|||
description: Enable or disable prometheus-operator monitoring solution.
|
||||
default: false
|
||||
|
||||
monitoring_retention_days:
|
||||
type: number
|
||||
description: The number of time (in days) that prometheus metrics should be kept.
|
||||
default: 14
|
||||
|
||||
monitoring_retention_size:
|
||||
type: number
|
||||
description: >
|
||||
The maximum memory (in Gi) allowed to be used by prometheus server to store metrics.
|
||||
default: 14
|
||||
|
||||
monitoring_interval_seconds:
|
||||
type: number
|
||||
description: >
|
||||
The time interval (in seconds) between consecutive metric scrapings.
|
||||
default: 30
|
||||
|
||||
monitoring_storage_class_name:
|
||||
type: string
|
||||
description: The kubernetes storage class name to use for the prometheus pvc.
|
||||
default: ""
|
||||
|
||||
prometheus_operator_chart_tag:
|
||||
type: string
|
||||
description: The stable/prometheus-operator chart version to use.
|
||||
|
@ -1256,6 +1278,10 @@ resources:
|
|||
keystone_auth_enabled: {get_param: keystone_auth_enabled}
|
||||
k8s_keystone_auth_tag: {get_param: k8s_keystone_auth_tag}
|
||||
monitoring_enabled: {get_param: monitoring_enabled}
|
||||
monitoring_retention_days: {get_param: monitoring_retention_days}
|
||||
monitoring_retention_size: {get_param: monitoring_retention_size}
|
||||
monitoring_interval_seconds: {get_param: monitoring_interval_seconds}
|
||||
monitoring_storage_class_name: {get_param: monitoring_storage_class_name}
|
||||
prometheus_operator_chart_tag: {get_param: prometheus_operator_chart_tag}
|
||||
prometheus_adapter_enabled: {get_param: prometheus_adapter_enabled}
|
||||
prometheus_adapter_chart_tag: {get_param: prometheus_adapter_chart_tag}
|
||||
|
|
|
@ -467,6 +467,24 @@ parameters:
|
|||
type: boolean
|
||||
description: Enable or disable prometheus-operator monitoring solution.
|
||||
|
||||
monitoring_retention_days:
|
||||
type: number
|
||||
description: The number of time (in days) that prometheus metrics should be kept.
|
||||
|
||||
monitoring_retention_size:
|
||||
type: number
|
||||
description: >
|
||||
The maximum memory (in Gi) allowed to be used by prometheus server to store metrics.
|
||||
|
||||
monitoring_interval_seconds:
|
||||
type: number
|
||||
description: >
|
||||
The time interval (in seconds) between consecutive metric scrapings.
|
||||
|
||||
monitoring_storage_class_name:
|
||||
type: string
|
||||
description: The kubernetes storage class name to use for the prometheus pvc.
|
||||
|
||||
prometheus_operator_chart_tag:
|
||||
type: string
|
||||
description: The stable/prometheus-operator chart version to use.
|
||||
|
@ -814,6 +832,10 @@ resources:
|
|||
"$KEYSTONE_AUTH_ENABLED": {get_param: keystone_auth_enabled}
|
||||
"$K8S_KEYSTONE_AUTH_TAG": {get_param: k8s_keystone_auth_tag}
|
||||
"$MONITORING_ENABLED": {get_param: monitoring_enabled}
|
||||
"$MONITORING_RETENTION_DAYS": {get_param: monitoring_retention_days}
|
||||
"$MONITORING_RETENTION_SIZE": {get_param: monitoring_retention_size}
|
||||
"$MONITORING_INTERVAL_SECONDS": {get_param: monitoring_interval_seconds}
|
||||
"$MONITORING_STORAGE_CLASS_NAME": {get_param: monitoring_storage_class_name}
|
||||
"$PROMETHEUS_OPERATOR_CHART_TAG": {get_param: prometheus_operator_chart_tag}
|
||||
"$PROMETHEUS_ADAPTER_ENABLED": {get_param: prometheus_adapter_enabled}
|
||||
"$PROMETHEUS_ADAPTER_CHART_TAG": {get_param: prometheus_adapter_chart_tag}
|
||||
|
|
|
@ -539,6 +539,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'k8s_keystone_auth_tag')
|
||||
monitoring_enabled = mock_cluster.labels.get(
|
||||
'monitoring_enabled')
|
||||
monitoring_retention_days = mock_cluster.labels.get(
|
||||
'monitoring_retention_days')
|
||||
monitoring_retention_size = mock_cluster.labels.get(
|
||||
'monitoring_retention_size')
|
||||
monitoring_interval_seconds = mock_cluster.labels.get(
|
||||
'monitoring_interval_seconds')
|
||||
monitoring_storage_class_name = mock_cluster.labels.get(
|
||||
'monitoring_storage_class_name')
|
||||
prometheus_operator_chart_tag = mock_cluster.labels.get(
|
||||
'prometheus_operator_chart_tag')
|
||||
prometheus_adapter_enabled = mock_cluster.labels.get(
|
||||
|
@ -674,6 +682,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'keystone_auth_enabled': keystone_auth_enabled,
|
||||
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
|
||||
'monitoring_enabled': monitoring_enabled,
|
||||
'monitoring_retention_days': monitoring_retention_days,
|
||||
'monitoring_retention_size': monitoring_retention_size,
|
||||
'monitoring_interval_seconds': monitoring_interval_seconds,
|
||||
'monitoring_storage_class_name': monitoring_storage_class_name,
|
||||
'prometheus_operator_chart_tag': prometheus_operator_chart_tag,
|
||||
'prometheus_adapter_enabled': prometheus_adapter_enabled,
|
||||
'prometheus_adapter_chart_tag': prometheus_adapter_chart_tag,
|
||||
|
@ -1070,6 +1082,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'k8s_keystone_auth_tag')
|
||||
monitoring_enabled = mock_cluster.labels.get(
|
||||
'monitoring_enabled')
|
||||
monitoring_retention_days = mock_cluster.labels.get(
|
||||
'monitoring_retention_days')
|
||||
monitoring_retention_size = mock_cluster.labels.get(
|
||||
'monitoring_retention_size')
|
||||
monitoring_interval_seconds = mock_cluster.labels.get(
|
||||
'monitoring_interval_seconds')
|
||||
monitoring_storage_class_name = mock_cluster.labels.get(
|
||||
'monitoring_storage_class_name')
|
||||
prometheus_operator_chart_tag = mock_cluster.labels.get(
|
||||
'prometheus_operator_chart_tag')
|
||||
prometheus_adapter_enabled = mock_cluster.labels.get(
|
||||
|
@ -1208,6 +1228,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'keystone_auth_enabled': keystone_auth_enabled,
|
||||
'k8s_keystone_auth_tag': k8s_keystone_auth_tag,
|
||||
'monitoring_enabled': monitoring_enabled,
|
||||
'monitoring_retention_days': monitoring_retention_days,
|
||||
'monitoring_retention_size': monitoring_retention_size,
|
||||
'monitoring_interval_seconds': monitoring_interval_seconds,
|
||||
'monitoring_storage_class_name': monitoring_storage_class_name,
|
||||
'prometheus_operator_chart_tag': prometheus_operator_chart_tag,
|
||||
'prometheus_adapter_enabled': prometheus_adapter_enabled,
|
||||
'prometheus_adapter_chart_tag': prometheus_adapter_chart_tag,
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
Added monitoring_retention_days magnum label allowing user to specify
|
||||
prometheus server scraped metrics retention days (default: 14).
|
||||
Added monitoring_retention_size magnum label allowing user to specify
|
||||
prometheus server metrics storage maximum size in GiB (default: 14).
|
||||
Added monitoring_interval_seconds magnum label allowing user to specify prometheus
|
||||
scrape frequency in seconds (default: 30).
|
||||
Added monitoring_storage_class_name magnum label allowing user to specify the
|
||||
storageClass to use as external retention for pod fail-over data
|
||||
persistency.
|
Loading…
Reference in New Issue