# openstack-helm-infra/prometheus/values.yaml
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for prometheus.
# This is a YAML-formatted file.
# Declare name/value pairs to be passed into your templates.
# name: value
images:
  tags:
    prometheus: docker.io/prom/prometheus:v2.0.0
    helm_tests: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
    dep_check: quay.io/stackanetes/kubernetes-entrypoint:v0.3.0
    image_repo_sync: docker.io/docker:17.07.0
  pull_policy: IfNotPresent
  local_registry:
    active: false
    exclude:
      - dep_check
      - image_repo_sync

labels:
  node_selector_key: openstack-control-plane
  node_selector_value: enabled

pod:
  affinity:
    anti:
      type:
        default: preferredDuringSchedulingIgnoredDuringExecution
      topologyKey:
        default: kubernetes.io/hostname
  mounts:
    prometheus:
      prometheus:
      init_container: null
  replicas:
    prometheus: 1
  lifecycle:
    upgrades:
      revision_history: 3
      pod_replacement_strategy: RollingUpdate
      rolling_update:
        max_unavailable: 1
        max_surge: 3
    termination_grace_period:
      prometheus:
        timeout: 30
  resources:
    enabled: false
    prometheus:
      limits:
        memory: "1024Mi"
        cpu: "2000m"
      requests:
        memory: "128Mi"
        cpu: "500m"
    jobs:
      image_repo_sync:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
      tests:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
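# The resource requests/limits above are only applied when pod.resources.enabled
# is set to true. As an illustration (release name and chart path below are
# hypothetical, not part of this file), they could be switched on at deploy time
# with an override such as:
#
#   helm upgrade --install prometheus ./prometheus \
#     --namespace=openstack \
#     --set pod.resources.enabled=true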
endpoints:
  cluster_domain_suffix: cluster.local
  local_image_registry:
    name: docker-registry
    namespace: docker-registry
    hosts:
      default: localhost
      internal: docker-registry
      node: localhost
    host_fqdn_override:
      default: null
    port:
      registry:
        node: 5000
  monitoring:
    name: prometheus
    namespace: null
    hosts:
      default: prom-metrics
      public: prometheus
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: 'http'
    port:
      api:
        default: 9090
        public: 80
  alerts:
    name: alertmanager
    namespace: null
    hosts:
      default: alerts-api
      public: alertmanager
    host_fqdn_override:
      default: null
    path:
      default: null
    scheme:
      default: 'http'
    port:
      api:
        default: 9093
        public: 80
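# The endpoint definitions above are consumed by helm-toolkit's endpoint lookup
# functions to build service names, URLs and ingress hosts. As a sketch only
# (the hostname below is hypothetical), the public FQDN used for the Prometheus
# ingress could be pinned with an override like:
#
#   endpoints:
#     monitoring:
#       host_fqdn_override:
#         public: prometheus.example.com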
dependencies:
  dynamic:
    common:
      local_image_registry:
        jobs:
          - prometheus-image-repo-sync
        services:
          - endpoint: node
            service: local_image_registry
  static:
    image_repo_sync:
      services:
        - endpoint: internal
          service: local_image_registry
    prometheus:
      services: null

monitoring:
  prometheus:
    enabled: true
    prometheus:
      scrape: true

network:
  prometheus:
    ingress:
      public: true
      proxy_body_size: 1024M
    node_port:
      enabled: false
      port: 30900

storage:
  enabled: true
  pvc:
    name: prometheus-pvc
    access_mode: [ "ReadWriteOnce" ]
    requests:
      storage: 5Gi
  storage_class: general

manifests:
  configmap_bin: true
  configmap_etc: true
  ingress_prometheus: true
  helm_tests: true
  job_image_repo_sync: true
  service_ingress_prometheus: true
  service: true
  statefulset_prometheus: true
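# Each manifests flag above gates whether the corresponding Kubernetes object is
# rendered at all. For instance (values shown are illustrative overrides, not
# the defaults), a deployment that terminates TLS elsewhere and exposes
# Prometheus via a NodePort could drop the ingress objects entirely:
#
#   manifests:
#     ingress_prometheus: false
#     service_ingress_prometheus: false
#   network:
#     prometheus:
#       node_port:
#         enabled: true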
conf:
  prometheus:
    storage:
      tsdb:
        path: /var/lib/prometheus/data
        retention: 7d
        min_block_duration: 2h
        max_block_duration: 6h
    log:
      format: logger:stdout?json=true
      level: info
    query:
      max_concurrency: 20
      timeout: 2m
    web_admin_api:
      enabled: false
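    # The keys above are intended to be translated into Prometheus command-line
    # flags by the chart's scripts. As a rough illustration only (flag names as
    # defined by Prometheus v2.0; this listing is not generated by the chart),
    # the defaults shown correspond to arguments such as:
    #
    #   prometheus \
    #     --storage.tsdb.path=/var/lib/prometheus/data \
    #     --storage.tsdb.retention=7d \
    #     --storage.tsdb.min-block-duration=2h \
    #     --storage.tsdb.max-block-duration=6h \
    #     --log.format=logger:stdout?json=true \
    #     --log.level=info \
    #     --query.max-concurrency=20 \
    #     --query.timeout=2m
    #
    # web_admin_api.enabled toggles the --web.enable-admin-api flag.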
    scrape_configs:
      global:
        scrape_interval: 60s
        evaluation_interval: 60s
      rule_files:
        - /etc/config/rules/alertmanager.rules
        - /etc/config/rules/etcd3.rules
        - /etc/config/rules/kubernetes.rules
        - /etc/config/rules/kube_apiserver.rules
        - /etc/config/rules/kube_controller_manager.rules
        - /etc/config/rules/kubelet.rules
      scrape_configs:
        - job_name: kubelet
          scheme: https
          # This TLS & bearer token file config is used to connect to the actual scrape
          # endpoints for cluster components. This is separate to discovery auth
          # configuration because discovery & scraping are two separate concerns in
          # Prometheus. The discovery auth config is automatic if Prometheus runs inside
          # the cluster. Otherwise, more config options have to be provided within the
          # <kubernetes_sd_config>.
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          kubernetes_sd_configs:
            - role: node
          scrape_interval: 45s
          relabel_configs:
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            - target_label: __address__
              replacement: kubernetes.default.svc:443
            - source_labels:
                - __meta_kubernetes_node_name
              regex: (.+)
              target_label: __metrics_path__
              replacement: /api/v1/nodes/${1}/proxy/metrics
            - source_labels:
                - __meta_kubernetes_node_name
              action: replace
              target_label: kubernetes_io_hostname
        # Scrape config for Kubelet cAdvisor.
        #
        # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
        # (those whose names begin with 'container_') have been removed from the
        # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
        # retrieve those metrics.
        #
        # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
        # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
        # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
        # the --cadvisor-port=0 Kubelet flag).
        #
        # This job is not necessary and should be removed in Kubernetes 1.6 and
        # earlier versions, or it will cause the metrics to be scraped twice.
        - job_name: 'kubernetes-cadvisor'
          # Default to scraping over https. If required, just disable this or change to
          # `http`.
          scheme: https
          # This TLS & bearer token file config is used to connect to the actual scrape
          # endpoints for cluster components. This is separate to discovery auth
          # configuration because discovery & scraping are two separate concerns in
          # Prometheus. The discovery auth config is automatic if Prometheus runs inside
          # the cluster. Otherwise, more config options have to be provided within the
          # <kubernetes_sd_config>.
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          kubernetes_sd_configs:
            - role: node
          scrape_interval: 45s
          relabel_configs:
            - action: labelmap
              regex: __meta_kubernetes_node_label_(.+)
            - target_label: __address__
              replacement: kubernetes.default.svc:443
            - source_labels:
                - __meta_kubernetes_node_name
              regex: (.+)
              target_label: __metrics_path__
              replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
            - source_labels:
                - __meta_kubernetes_node_name
              action: replace
              target_label: kubernetes_io_hostname
          metric_relabel_configs:
            - action: replace
              source_labels:
                - id
              regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
              target_label: rkt_container_name
              replacement: '${2}-${1}'
            - action: replace
              source_labels:
                - id
              regex: '^/system\.slice/(.+)\.service$'
              target_label: systemd_service_name
              replacement: '${1}'
        # Scrape config for API servers.
        #
        # Kubernetes exposes API servers as endpoints to the default/kubernetes
        # service so this uses `endpoints` role and uses relabelling to only keep
        # the endpoints associated with the default/kubernetes service using the
        # default named port `https`. This works for single API server deployments as
        # well as HA API server deployments.
        - job_name: 'apiserver'
          kubernetes_sd_configs:
            - role: endpoints
          scrape_interval: 45s
          # Default to scraping over https. If required, just disable this or change to
          # `http`.
          scheme: https
          # This TLS & bearer token file config is used to connect to the actual scrape
          # endpoints for cluster components. This is separate to discovery auth
          # configuration because discovery & scraping are two separate concerns in
          # Prometheus. The discovery auth config is automatic if Prometheus runs inside
          # the cluster. Otherwise, more config options have to be provided within the
          # <kubernetes_sd_config>.
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            # If your node certificates are self-signed or use a different CA to the
            # master CA, then disable certificate verification below. Note that
            # certificate verification is an integral part of a secure infrastructure
            # so this should only be disabled in a controlled environment. You can
            # disable certificate verification by uncommenting the line below.
            #
            # insecure_skip_verify: true
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          # Keep only the default/kubernetes service endpoints for the https port. This
          # will add targets for each API server for which Kubernetes adds an endpoint
          # to the default/kubernetes service.
          relabel_configs:
            - source_labels:
                - __meta_kubernetes_namespace
                - __meta_kubernetes_service_name
                - __meta_kubernetes_endpoint_port_name
              action: keep
              regex: default;kubernetes;https
        # Scrape config for service endpoints.
        #
        # The relabeling allows the actual service scrape endpoint to be configured
        # via the following annotations:
        #
        # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
        # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
        #   to set this to `https` & most likely set the `tls_config` of the scrape config.
        # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
        # * `prometheus.io/port`: If the metrics are exposed on a different port to the
        #   service then set this appropriately.
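        #
        # As an illustration only (names and port below are hypothetical, not part
        # of this chart), a Service exposing metrics on port 9113 at the default
        # /metrics path over plain HTTP would be annotated like this:
        #
        #   apiVersion: v1
        #   kind: Service
        #   metadata:
        #     name: example-exporter
        #     annotations:
        #       prometheus.io/scrape: "true"
        #       prometheus.io/port: "9113"
        #   spec:
        #     selector:
        #       application: example-exporter
        #     ports:
        #       - name: metrics
        #         port: 9113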
        - job_name: 'kubernetes-service-endpoints'
          kubernetes_sd_configs:
            - role: endpoints
          scrape_interval: 60s
          relabel_configs:
            - source_labels:
                - __meta_kubernetes_service_annotation_prometheus_io_scrape
              action: keep
              regex: true
            - source_labels:
                - __meta_kubernetes_service_annotation_prometheus_io_scheme
              action: replace
              target_label: __scheme__
              regex: (https?)
            - source_labels:
                - __meta_kubernetes_service_annotation_prometheus_io_path
              action: replace
              target_label: __metrics_path__
              regex: (.+)
            - source_labels:
                - __address__
                - __meta_kubernetes_service_annotation_prometheus_io_port
              action: replace
              target_label: __address__
              regex: ([^:]+)(?::\d+)?;(\d+)
              replacement: $1:$2
            - action: labelmap
              regex: __meta_kubernetes_service_label_(.+)
            - source_labels:
                - __meta_kubernetes_namespace
              action: replace
              target_label: kubernetes_namespace
            - source_labels:
                - __meta_kubernetes_service_name
              action: replace
              target_label: kubernetes_name
            - source_labels:
                - __meta_kubernetes_service_name
              target_label: job
              replacement: ${1}
        # Example scrape config for pods
        #
        # The relabeling allows the actual pod scrape endpoint to be configured via the
        # following annotations:
        #
        # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
        # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
        # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
        #   pod's declared ports (default is a port-free target if none are declared).
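        #
        # As with services, these are plain pod annotations. A purely illustrative
        # pod template snippet (path and port below are hypothetical) would carry:
        #
        #   metadata:
        #     annotations:
        #       prometheus.io/scrape: "true"
        #       prometheus.io/path: "/stats/metrics"
        #       prometheus.io/port: "8080"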
        - job_name: 'kubernetes-pods'
          kubernetes_sd_configs:
            - role: pod
          relabel_configs:
            - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
              action: keep
              regex: true
            - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
              action: replace
              target_label: __metrics_path__
              regex: (.+)
            - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
              action: replace
              regex: ([^:]+)(?::\d+)?;(\d+)
              replacement: $1:$2
              target_label: __address__
            - action: labelmap
              regex: __meta_kubernetes_pod_label_(.+)
            - source_labels: [__meta_kubernetes_namespace]
              action: replace
              target_label: kubernetes_namespace
            - source_labels: [__meta_kubernetes_pod_name]
              action: replace
              target_label: kubernetes_pod_name
        - job_name: calico-etcd
          honor_labels: false
          kubernetes_sd_configs:
            - role: service
          scrape_interval: 20s
          relabel_configs:
            - action: labelmap
              regex: __meta_kubernetes_service_label_(.+)
            - action: keep
              source_labels:
                - __meta_kubernetes_service_name
              regex: "calico-etcd"
            - action: keep
              source_labels:
                - __meta_kubernetes_namespace
              regex: kube-system
              target_label: namespace
            - source_labels:
                - __meta_kubernetes_pod_name
              target_label: pod
            - source_labels:
                - __meta_kubernetes_service_name
              target_label: service
            - source_labels:
                - __meta_kubernetes_service_name
              target_label: job
              replacement: ${1}
            - source_labels:
                - __meta_kubernetes_service_label
              target_label: job
              regex: calico-etcd
              replacement: ${1}
            - target_label: endpoint
              replacement: "calico-etcd"
      alerting:
        alertmanagers:
          - kubernetes_sd_configs:
              - role: pod
            relabel_configs:
              - source_labels:
                  - __meta_kubernetes_pod_label_name
                regex: alertmanager
                action: keep
              - source_labels:
                  - __meta_kubernetes_namespace
                regex: openstack
                action: keep
              - source_labels:
                  - __meta_kubernetes_pod_container_port_number
                regex:
                action: drop
    rules:
      alertmanager:
        groups:
          - name: alertmanager.rules
            rules:
              - alert: AlertmanagerConfigInconsistent
                expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", "alertmanager-$1", "alertmanager", "(.*)") != 1
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
                  summary: Alertmanager configurations are inconsistent
              - alert: AlertmanagerDownOrMissing
                expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
                for: 5m
                labels:
                  severity: warning
                annotations:
                  description: An unexpected number of Alertmanagers are scraped or Alertmanagers disappeared from discovery.
                  summary: Alertmanager down or not discovered
              - alert: FailedReload
                expr: alertmanager_config_last_reload_successful == 0
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
                  summary: Alertmanager configuration reload has failed
      etcd3:
        groups:
          - name: etcd3.rules
            rules:
              - alert: InsufficientMembers
                expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
                for: 3m
                labels:
                  severity: critical
                annotations:
                  description: If one more etcd member goes down, the cluster will be unavailable
                  summary: etcd cluster insufficient members
              - alert: NoLeader
                expr: etcd_server_has_leader{job="etcd"} == 0
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: etcd member {{ $labels.instance }} has no leader
                  summary: etcd member has no leader
              - alert: HighNumberOfLeaderChanges
                expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour
                  summary: a high number of leader changes within the etcd cluster are happening
              - alert: HighNumberOfFailedGRPCRequests
                expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of gRPC requests are failing
              - alert: HighNumberOfFailedGRPCRequests
                expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of gRPC requests are failing
              - alert: GRPCRequestsSlow
                expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: critical
                annotations:
                  description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow
                  summary: slow gRPC requests
              - alert: HighNumberOfFailedHTTPRequests
                expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.01
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of HTTP requests are failing
              - alert: HighNumberOfFailedHTTPRequests
                expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) > 0.05
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
                  summary: a high number of HTTP requests are failing
              - alert: HTTPRequestsSlow
                expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow
                  summary: slow HTTP requests
              - alert: EtcdMemberCommunicationSlow
                expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
                  summary: etcd member communication is slow
              - alert: HighNumberOfFailedProposals
                expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
                  summary: a high number of proposals within the etcd cluster are failing
              - alert: HighFsyncDurations
                expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} fsync durations are high
                  summary: high fsync durations
              - alert: HighCommitDurations
                expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: etcd instance {{ $labels.instance }} commit durations are high
                  summary: high commit durations
      kube_apiserver:
        groups:
          - name: kube-apiserver.rules
            rules:
              - alert: K8SApiserverDown
                expr: absent(up{job="apiserver"} == 1)
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: Prometheus failed to scrape API server(s), or all API servers have disappeared from service discovery.
                  summary: API server unreachable
              - alert: K8SApiServerLatency
                expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
                for: 10m
                labels:
                  severity: warning
                annotations:
                  description: 99th percentile latency for {{ $labels.verb }} requests to the kube-apiserver is higher than 1s.
                  summary: Kubernetes apiserver latency is high
      kube_controller_manager:
        groups:
          - name: kube-controller-manager.rules
            rules:
              - alert: K8SControllerManagerDown
                expr: absent(up{job="kube-controller-manager"} == 1)
                for: 5m
                labels:
                  severity: critical
                annotations:
                  description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
                  runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
                  summary: Controller manager is down
      kubelet:
        groups:
          - name: kubelet.rules
            rules:
              - alert: K8SNodeNotReady
                expr: kube_node_status_ready{condition="true"} == 0
                for: 1h
                labels:
                  severity: warning
                annotations:
                  description: The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour
                  summary: Node status is NotReady
              - alert: K8SManyNodesNotReady
                expr: count(kube_node_status_ready{condition="true"} == 0) > 1 and (count(kube_node_status_ready{condition="true"} == 0) / count(kube_node_status_ready{condition="true"})) > 0.2
                for: 1m
                labels:
                  severity: critical
                annotations:
                  description: '{{ $value }} Kubernetes nodes (more than 20% of the cluster) are in the NotReady state.'
                  summary: Many Kubernetes nodes are Not Ready
              - alert: K8SKubeletDown
                expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.03
                for: 1h
                labels:
                  severity: warning
                annotations:
                  description: Prometheus failed to scrape {{ $value }}% of kubelets.
                  summary: Many Kubelets cannot be scraped
              - alert: K8SKubeletDown
                expr: absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) > 0.1
                for: 1h
                labels:
                  severity: critical
                annotations:
                  description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
                  summary: Many Kubelets cannot be scraped
              - alert: K8SKubeletTooManyPods
                expr: kubelet_running_pod_count > 100
                labels:
                  severity: warning
                annotations:
                  description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 110
                  summary: Kubelet is close to pod limit
      kubernetes:
        groups:
          - name: kubernetes.rules
            rules:
              - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
                expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:spec_cpu_shares
                expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:cpu_usage:rate
                expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:memory_usage:bytes
                expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
                expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:memory_rss:bytes
                expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:memory_cache:bytes
                expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:disk_usage:bytes
                expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name)
              - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
                expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
              - record: cluster_namespace_controller_pod_container:memory_oom:rate
                expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, container_name, scope, type)
              - record: cluster:memory_allocation:percent
                expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
              - record: cluster:memory_used:percent
                expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)
              - record: cluster:cpu_allocation:percent
                expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"} * ON(cluster, instance) machine_cpu_cores) BY (cluster)
              - record: cluster:node_cpu_use:percent
                expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores) BY (cluster)
              - record: cluster_resource_verb:apiserver_latency:quantile_seconds
                expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
                labels:
                  quantile: "0.99"
              - record: cluster_resource_verb:apiserver_latency:quantile_seconds
                expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
                labels:
                  quantile: "0.9"
              - record: cluster_resource_verb:apiserver_latency:quantile_seconds
                expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, cluster, job, resource, verb)) / 1e+06
                labels:
                  quantile: "0.5"
              - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
                expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.99"
              - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
                expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.9"
              - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
                expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.5"
              - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
                expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.99"
              - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
                expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.9"
              - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
                expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.5"
              - record: cluster:scheduler_binding_latency:quantile_seconds
                expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.99"
              - record: cluster:scheduler_binding_latency:quantile_seconds
                expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.9"
              - record: cluster:scheduler_binding_latency:quantile_seconds
                expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le, cluster)) / 1e+06
                labels:
                  quantile: "0.5"
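      # The recording rules above precompute cluster-level aggregates so that
      # dashboards and alerts can query them cheaply. Purely as an illustration
      # (the threshold is hypothetical), an alerting expression built on one of
      # them might look like:
      #
      #   cluster:node_cpu_use:percent > 90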