summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJenkins <jenkins@review.openstack.org>2016-10-14 08:34:15 +0000
committerGerrit Code Review <review@openstack.org>2016-10-14 08:34:15 +0000
commit1c12b277bf8cb902c7444e10d33330a546f04525 (patch)
tree81cf651d704fbeef66d3ab58343a638471d0c08d
parentb064db32b5be73d9e479e58f8d27e508e040a335 (diff)
parentd248ed36d713a3ba7370e6e1f00ea4d27dd16f57 (diff)
Merge "Update documentation for 1.0"1.0rc1
-rw-r--r--doc/user/source/appendix_alarms.rst2240
-rw-r--r--doc/user/source/configure_alarms.rst40
-rw-r--r--doc/user/source/release_notes.rst28
3 files changed, 2291 insertions, 17 deletions
diff --git a/doc/user/source/appendix_alarms.rst b/doc/user/source/appendix_alarms.rst
index e3302cd..8169649 100644
--- a/doc/user/source/appendix_alarms.rst
+++ b/doc/user/source/appendix_alarms.rst
@@ -48,6 +48,50 @@ The following is a list of StackLight built-in alarms::
48 window: 120 48 window: 120
49 periods: 0 49 periods: 0
50 function: avg 50 function: avg
51 - name: 'swap-usage-critical'
52 description: 'There is no more swap free space'
53 severity: 'critical'
54 enabled: 'true'
55 trigger:
56 logical_operator: 'or'
57 rules:
58 - metric: swap_free
59 relational_operator: '=='
60 threshold: 0
61 window: 60
62 periods: 0
63 function: max
64 - name: 'swap-activity-warning'
65 description: 'The swap activity is high'
66 severity: 'warning'
67 enabled: 'true'
68 trigger:
69 logical_operator: 'or'
70 rules:
71 - metric: swap_io_in
72 relational_operator: '>='
73 threshold: 1048576 # 1 Mb/s
74 window: 120
75 periods: 0
76 function: avg
77 - metric: swap_io_out
78 relational_operator: '>='
79 threshold: 1048576 # 1 Mb/s
80 window: 120
81 periods: 0
82 function: avg
83 - name: 'swap-usage-warning'
84 description: 'The swap free space is low'
85 severity: 'warning'
86 enabled: 'true'
87 trigger:
88 rules:
89 - metric: swap_percent_used
90 relational_operator: '>='
91 threshold: 0.8
92 window: 60
93 periods: 0
94 function: avg
51 - name: 'cpu-critical-compute' 95 - name: 'cpu-critical-compute'
52 description: 'The CPU usage is too high (compute node)' 96 description: 'The CPU usage is too high (compute node)'
53 severity: 'critical' 97 severity: 'critical'
@@ -186,6 +230,9 @@ The following is a list of StackLight built-in alarms::
186 - name: 'rabbitmq-disk-limit-critical' 230 - name: 'rabbitmq-disk-limit-critical'
187 description: 'RabbitMQ has reached the free disk threshold. All producers are blocked' 231 description: 'RabbitMQ has reached the free disk threshold. All producers are blocked'
188 severity: 'critical' 232 severity: 'critical'
233 # If the local RabbitMQ instance is down, it will be caught by the
234 # rabbitmq-check alarm
235 no_data_policy: 'okay'
189 enabled: 'true' 236 enabled: 'true'
190 trigger: 237 trigger:
191 logical_operator: 'or' 238 logical_operator: 'or'
@@ -199,6 +246,9 @@ The following is a list of StackLight built-in alarms::
199 - name: 'rabbitmq-disk-limit-warning' 246 - name: 'rabbitmq-disk-limit-warning'
200 description: 'RabbitMQ is getting close to the free disk threshold' 247 description: 'RabbitMQ is getting close to the free disk threshold'
201 severity: 'warning' 248 severity: 'warning'
249 # If the local RabbitMQ instance is down, it will be caught by the
250 # rabbitmq-check alarm
251 no_data_policy: 'okay'
202 enabled: 'true' 252 enabled: 'true'
203 trigger: 253 trigger:
204 logical_operator: 'or' 254 logical_operator: 'or'
@@ -212,6 +262,9 @@ The following is a list of StackLight built-in alarms::
212 - name: 'rabbitmq-memory-limit-critical' 262 - name: 'rabbitmq-memory-limit-critical'
213 description: 'RabbitMQ has reached the memory threshold. All producers are blocked' 263 description: 'RabbitMQ has reached the memory threshold. All producers are blocked'
214 severity: 'critical' 264 severity: 'critical'
265 # If the local RabbitMQ instance is down, it will be caught by the
266 # rabbitmq-check alarm
267 no_data_policy: 'okay'
215 enabled: 'true' 268 enabled: 'true'
216 trigger: 269 trigger:
217 logical_operator: 'or' 270 logical_operator: 'or'
@@ -225,6 +278,9 @@ The following is a list of StackLight built-in alarms::
225 - name: 'rabbitmq-memory-limit-warning' 278 - name: 'rabbitmq-memory-limit-warning'
226 description: 'RabbitMQ is getting close to the memory threshold' 279 description: 'RabbitMQ is getting close to the memory threshold'
227 severity: 'warning' 280 severity: 'warning'
281 # If the local RabbitMQ instance is down, it will be caught by the
282 # rabbitmq-check alarm
283 no_data_policy: 'okay'
228 enabled: 'true' 284 enabled: 'true'
229 trigger: 285 trigger:
230 logical_operator: 'or' 286 logical_operator: 'or'
@@ -238,6 +294,9 @@ The following is a list of StackLight built-in alarms::
238 - name: 'rabbitmq-queue-warning' 294 - name: 'rabbitmq-queue-warning'
239 description: 'The number of outstanding messages is too high' 295 description: 'The number of outstanding messages is too high'
240 severity: 'warning' 296 severity: 'warning'
297 # If the local RabbitMQ instance is down, it will be caught by the
298 # rabbitmq-check alarm
299 no_data_policy: 'okay'
241 enabled: 'true' 300 enabled: 'true'
242 trigger: 301 trigger:
243 logical_operator: 'or' 302 logical_operator: 'or'
@@ -248,6 +307,57 @@ The following is a list of StackLight built-in alarms::
248 window: 120 307 window: 120
249 periods: 0 308 periods: 0
250 function: avg 309 function: avg
310 - name: 'rabbitmq-pacemaker-down'
311 description: 'The RabbitMQ cluster is down'
312 severity: 'down'
313 no_data_policy: 'skip' # the metric is only collected from the DC node
314 enabled: 'true'
315 trigger:
316 logical_operator: 'and'
317 rules:
318 - metric: pacemaker_resource_percent
319 fields:
320 resource: rabbitmq
321 status: up
322 relational_operator: '=='
323 threshold: 0
324 window: 60
325 periods: 0
326 function: last
327 - name: 'rabbitmq-pacemaker-critical'
328 description: 'The RabbitMQ cluster is critical because less than half of the nodes are up'
329 severity: 'critical'
330 no_data_policy: 'skip' # the metric is only collected from the DC node
331 enabled: 'true'
332 trigger:
333 logical_operator: 'and'
334 rules:
335 - metric: pacemaker_resource_percent
336 fields:
337 resource: rabbitmq
338 status: up
339 relational_operator: '<'
340 threshold: 50
341 window: 60
342 periods: 0
343 function: last
344 - name: 'rabbitmq-pacemaker-warning'
345 description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing'
346 severity: 'warning'
347 no_data_policy: 'skip' # the metric is only collected from the DC node
348 enabled: 'true'
349 trigger:
350 logical_operator: 'and'
351 rules:
352 - metric: pacemaker_resource_percent
353 fields:
354 resource: rabbitmq
355 status: up
356 relational_operator: '<'
357 threshold: 100
358 window: 60
359 periods: 0
360 function: last
251 - name: 'apache-warning' 361 - name: 'apache-warning'
252 description: 'There is no Apache idle workers available' 362 description: 'There is no Apache idle workers available'
253 severity: 'warning' 363 severity: 'warning'
@@ -261,6 +371,18 @@ The following is a list of StackLight built-in alarms::
261 window: 60 371 window: 60
262 periods: 0 372 periods: 0
263 function: min 373 function: min
374 - name: 'apache-check'
375 description: 'Apache cannot be checked'
376 severity: 'down'
377 enabled: 'true'
378 trigger:
379 rules:
380 - metric: apache_check
381 relational_operator: '=='
382 threshold: 0
383 window: 60
384 periods: 0
385 function: last
264 - name: 'log-fs-warning' 386 - name: 'log-fs-warning'
265 description: "The log filesystem's free space is low" 387 description: "The log filesystem's free space is low"
266 severity: 'warning' 388 severity: 'warning'
@@ -271,7 +393,7 @@ The following is a list of StackLight built-in alarms::
271 fields: 393 fields:
272 fs: '/var/log' 394 fs: '/var/log'
273 relational_operator: '<' 395 relational_operator: '<'
274 threshold: 10 396 threshold: 10
275 window: 60 397 window: 60
276 periods: 0 398 periods: 0
277 function: min 399 function: min
@@ -285,7 +407,7 @@ The following is a list of StackLight built-in alarms::
285 fields: 407 fields:
286 fs: '/var/log' 408 fs: '/var/log'
287 relational_operator: '<' 409 relational_operator: '<'
288 threshold: 5 410 threshold: 5
289 window: 60 411 window: 60
290 periods: 0 412 periods: 0
291 function: min 413 function: min
@@ -299,7 +421,7 @@ The following is a list of StackLight built-in alarms::
299 fields: 421 fields:
300 fs: '/' 422 fs: '/'
301 relational_operator: '<' 423 relational_operator: '<'
302 threshold: 5 424 threshold: 10
303 window: 60 425 window: 60
304 periods: 0 426 periods: 0
305 function: min 427 function: min
@@ -313,7 +435,7 @@ The following is a list of StackLight built-in alarms::
313 fields: 435 fields:
314 fs: '/' 436 fs: '/'
315 relational_operator: '<' 437 relational_operator: '<'
316 threshold: 2 438 threshold: 5
317 window: 60 439 window: 60
318 periods: 0 440 periods: 0
319 function: min 441 function: min
@@ -327,7 +449,7 @@ The following is a list of StackLight built-in alarms::
327 fields: 449 fields:
328 fs: '/var/lib/mysql' 450 fs: '/var/lib/mysql'
329 relational_operator: '<' 451 relational_operator: '<'
330 threshold: 5 452 threshold: 10
331 window: 60 453 window: 60
332 periods: 0 454 periods: 0
333 function: min 455 function: min
@@ -341,7 +463,7 @@ The following is a list of StackLight built-in alarms::
341 fields: 463 fields:
342 fs: '/var/lib/mysql' 464 fs: '/var/lib/mysql'
343 relational_operator: '<' 465 relational_operator: '<'
344 threshold: 2 466 threshold: 5
345 window: 60 467 window: 60
346 periods: 0 468 periods: 0
347 function: min 469 function: min
@@ -373,6 +495,54 @@ The following is a list of StackLight built-in alarms::
373 window: 60 495 window: 60
374 periods: 0 496 periods: 0
375 function: min 497 function: min
498 - name: 'other-fs-warning'
499 description: "The filesystem's free space is low"
500 severity: 'warning'
501 enabled: 'true'
502 no_data_policy: 'okay'
503 trigger:
504 rules:
505 - metric: fs_space_percent_free
506 fields:
507 fs: '!= /var/lib/nova && != /var/log && != /var/lib/mysql && != / && !~ ceph%-%d+$'
508 group_by: [fs]
509 relational_operator: '<'
510 threshold: 10
511 window: 60
512 periods: 0
513 function: min
514 - name: 'other-fs-critical'
515 description: "The filesystem's free space is too low"
516 severity: 'critical'
517 enabled: 'true'
518 no_data_policy: 'okay'
519 trigger:
520 rules:
521 - metric: fs_space_percent_free
522 fields:
523 fs: '!= /var/lib/nova && != /var/log && != /var/lib/mysql && != / && !~ ceph%-%d+$'
524 group_by: [fs]
525 relational_operator: '<'
526 threshold: 5
527 window: 60
528 periods: 0
529 function: min
530 - name: 'osd-disk-critical'
531 description: "The filesystem's free space is too low (OSD disk)"
532 severity: 'critical'
533 enabled: 'true'
534 trigger:
535 rules:
536 - metric: fs_space_percent_free
537 fields:
538 # The real FS is /var/lib/ceph/osd/ceph-0, but Collectd substitutes '/' with '-', so the name is matched as 'ceph/<number>'
539 fs: '=~ ceph/%d+$'
540 group_by: [fs]
541 relational_operator: '<'
542 threshold: 5
543 window: 60
544 periods: 0
545 function: min
376 - name: 'nova-api-http-errors' 546 - name: 'nova-api-http-errors'
377 description: 'Too many 5xx HTTP errors have been detected on nova-api' 547 description: 'Too many 5xx HTTP errors have been detected on nova-api'
378 severity: 'warning' 548 severity: 'warning'
@@ -391,6 +561,7 @@ The following is a list of StackLight built-in alarms::
391 - name: 'nova-logs-error' 561 - name: 'nova-logs-error'
392 description: 'Too many errors have been detected in Nova logs' 562 description: 'Too many errors have been detected in Nova logs'
393 severity: 'warning' 563 severity: 'warning'
564 no_data_policy: 'okay'
394 enabled: 'true' 565 enabled: 'true'
395 trigger: 566 trigger:
396 logical_operator: 'or' 567 logical_operator: 'or'
@@ -422,6 +593,7 @@ The following is a list of StackLight built-in alarms::
422 - name: 'heat-logs-error' 593 - name: 'heat-logs-error'
423 description: 'Too many errors have been detected in Heat logs' 594 description: 'Too many errors have been detected in Heat logs'
424 severity: 'warning' 595 severity: 'warning'
596 no_data_policy: 'okay'
425 enabled: 'true' 597 enabled: 'true'
426 trigger: 598 trigger:
427 logical_operator: 'or' 599 logical_operator: 'or'
@@ -444,12 +616,29 @@ The following is a list of StackLight built-in alarms::
444 rules: 616 rules:
445 - metric: haproxy_backend_response_5xx 617 - metric: haproxy_backend_response_5xx
446 fields: 618 fields:
447 backend: 'swift-api' 619 backend: 'swift-api || object-storage'
448 relational_operator: '>' 620 relational_operator: '>'
449 threshold: 0 621 threshold: 0
450 window: 60 622 window: 60
451 periods: 1 623 periods: 1
452 function: diff 624 function: diff
625 - name: 'swift-logs-error'
626 description: 'Too many errors have been detected in Swift logs'
627 severity: 'warning'
628 no_data_policy: 'okay'
629 enabled: 'true'
630 trigger:
631 logical_operator: 'or'
632 rules:
633 - metric: log_messages
634 fields:
635 service: 'swift'
636 level: 'error'
637 relational_operator: '>'
638 threshold: 0.1
639 window: 70
640 periods: 0
641 function: max
453 - name: 'cinder-api-http-errors' 642 - name: 'cinder-api-http-errors'
454 description: 'Too many 5xx HTTP errors have been detected on cinder-api' 643 description: 'Too many 5xx HTTP errors have been detected on cinder-api'
455 severity: 'warning' 644 severity: 'warning'
@@ -468,6 +657,7 @@ The following is a list of StackLight built-in alarms::
468 - name: 'cinder-logs-error' 657 - name: 'cinder-logs-error'
469 description: 'Too many errors have been detected in Cinder logs' 658 description: 'Too many errors have been detected in Cinder logs'
470 severity: 'warning' 659 severity: 'warning'
660 no_data_policy: 'okay'
471 enabled: 'true' 661 enabled: 'true'
472 trigger: 662 trigger:
473 logical_operator: 'or' 663 logical_operator: 'or'
@@ -499,6 +689,7 @@ The following is a list of StackLight built-in alarms::
499 - name: 'glance-logs-error' 689 - name: 'glance-logs-error'
500 description: 'Too many errors have been detected in Glance logs' 690 description: 'Too many errors have been detected in Glance logs'
501 severity: 'warning' 691 severity: 'warning'
692 no_data_policy: 'okay'
502 enabled: 'true' 693 enabled: 'true'
503 trigger: 694 trigger:
504 logical_operator: 'or' 695 logical_operator: 'or'
@@ -530,6 +721,7 @@ The following is a list of StackLight built-in alarms::
530 - name: 'neutron-logs-error' 721 - name: 'neutron-logs-error'
531 description: 'Too many errors have been detected in Neutron logs' 722 description: 'Too many errors have been detected in Neutron logs'
532 severity: 'warning' 723 severity: 'warning'
724 no_data_policy: 'okay'
533 enabled: 'true' 725 enabled: 'true'
534 trigger: 726 trigger:
535 logical_operator: 'or' 727 logical_operator: 'or'
@@ -543,6 +735,24 @@ The following is a list of StackLight built-in alarms::
543 window: 70 735 window: 70
544 periods: 0 736 periods: 0
545 function: max 737 function: max
738 - name: 'keystone-response-time-duration'
739 description: 'Keystone API is too slow'
740 severity: 'warning'
741 no_data_policy: 'okay'
742 enabled: 'true'
743 trigger:
744 logical_operator: 'or'
745 rules:
746 - metric: openstack_keystone_http_response_times
747 fields:
748 http_method: '== GET || == POST'
749 http_status: '!= 5xx'
750 relational_operator: '>'
751 threshold: 0.3
752 window: 60
753 periods: 0
754 value: upper_90
755 function: max
546 - name: 'keystone-public-api-http-errors' 756 - name: 'keystone-public-api-http-errors'
547 description: 'Too many 5xx HTTP errors have been detected on keystone-public-api' 757 description: 'Too many 5xx HTTP errors have been detected on keystone-public-api'
548 severity: 'warning' 758 severity: 'warning'
@@ -573,9 +783,25 @@ The following is a list of StackLight built-in alarms::
573 window: 60 783 window: 60
574 periods: 1 784 periods: 1
575 function: diff 785 function: diff
786 - name: 'horizon-web-http-errors'
787 description: 'Too many 5xx HTTP errors have been detected on horizon'
788 severity: 'warning'
789 enabled: 'true'
790 trigger:
791 logical_operator: 'or'
792 rules:
793 - metric: haproxy_backend_response_5xx
794 fields:
795 backend: 'horizon-web || horizon-https'
796 relational_operator: '>'
797 threshold: 0
798 window: 60
799 periods: 1
800 function: diff
576 - name: 'keystone-logs-error' 801 - name: 'keystone-logs-error'
577 description: 'Too many errors have been detected in Keystone logs' 802 description: 'Too many errors have been detected in Keystone logs'
578 severity: 'warning' 803 severity: 'warning'
804 no_data_policy: 'okay'
579 enabled: 'true' 805 enabled: 'true'
580 trigger: 806 trigger:
581 logical_operator: 'or' 807 logical_operator: 'or'
@@ -691,7 +917,7 @@ The following is a list of StackLight built-in alarms::
691 fields: 917 fields:
692 fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-' 918 fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-'
693 relational_operator: '<' 919 relational_operator: '<'
694 threshold: 20 920 threshold: 20 # The low watermark for disk usage is 85% by default
695 window: 60 921 window: 60
696 periods: 0 922 periods: 0
697 function: min 923 function: min
@@ -705,7 +931,7 @@ The following is a list of StackLight built-in alarms::
705 fields: 931 fields:
706 fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-' 932 fs: '/opt/es/data' # Real FS is /opt/es-data but Collectd substituted '/' by '-'
707 relational_operator: '<' 933 relational_operator: '<'
708 threshold: 15 934 threshold: 15 # The high watermark for disk usage is 90% by default
709 window: 60 935 window: 60
710 periods: 0 936 periods: 0
711 function: min 937 function: min
@@ -736,4 +962,1998 @@ The following is a list of StackLight built-in alarms::
736 threshold: 5 962 threshold: 5
737 window: 60 963 window: 60
738 periods: 0 964 periods: 0
739 function: min \ No newline at end of file 965 function: min
966 - name: 'haproxy-check'
967 description: "HAProxy cannot be checked"
968 severity: 'down'
969 enabled: 'true'
970 trigger:
971 rules:
972 - metric: haproxy_check
973 relational_operator: '=='
974 threshold: 0
975 window: 60
976 periods: 0
977 function: last
978 - name: 'rabbitmq-check'
979 description: "RabbitMQ cannot be checked"
980 # This alarm's severity is warning because the effective status of the
981 # RabbitMQ cluster is computed by rabbitmq-pacemaker-* alarms.
982 # This alarm is still useful because it will report the node(s) on which
983 # RabbitMQ isn't running.
984 severity: 'warning'
985 enabled: 'true'
986 trigger:
987 rules:
988 - metric: rabbitmq_check
989 relational_operator: '=='
990 threshold: 0
991 window: 60
992 periods: 0
993 function: last
994 - name: 'ceph-mon-check'
995 description: "Ceph monitor cannot be checked"
996 severity: 'down'
997 enabled: 'true'
998 trigger:
999 rules:
1000 - metric: ceph_mon_check
1001 relational_operator: '=='
1002 threshold: 0
1003 window: 60
1004 periods: 0
1005 function: last
1006 - name: 'ceph-osd-check'
1007 description: "Ceph OSD cannot be checked"
1008 severity: 'down'
1009 enabled: 'true'
1010 trigger:
1011 rules:
1012 - metric: ceph_osd_check
1013 relational_operator: '=='
1014 threshold: 0
1015 window: 80 # The metric collection interval is 60s
1016 periods: 0
1017 function: last
1018 - name: 'pacemaker-check'
1019 description: "Pacemaker cannot be checked"
1020 severity: 'down'
1021 enabled: 'true'
1022 trigger:
1023 rules:
1024 - metric: pacemaker_check
1025 relational_operator: '=='
1026 threshold: 0
1027 window: 60
1028 periods: 0
1029 function: last
1030 - name: 'elasticsearch-check'
1031 description: "Elasticsearch cannot be checked"
1032 severity: 'down'
1033 enabled: 'true'
1034 trigger:
1035 rules:
1036 - metric: elasticsearch_check
1037 relational_operator: '=='
1038 threshold: 0
1039 window: 60
1040 periods: 0
1041 function: last
1042 - name: 'influxdb-check'
1043 description: "InfluxDB cannot be checked"
1044 severity: 'down'
1045 enabled: 'true'
1046 trigger:
1047 rules:
1048 - metric: influxdb_check
1049 relational_operator: '=='
1050 threshold: 0
1051 window: 60
1052 periods: 0
1053 function: last
1054 - name: 'libvirt-check'
1055 description: "Libvirt cannot be checked"
1056 severity: 'down'
1057 enabled: 'true'
1058 trigger:
1059 rules:
1060 - metric: libvirt_check
1061 relational_operator: '=='
1062 threshold: 0
1063 window: 60
1064 periods: 0
1065 function: last
1066 - name: 'memcached-check'
1067 description: "memcached cannot be checked"
1068 severity: 'down'
1069 enabled: 'true'
1070 trigger:
1071 rules:
1072 - metric: memcached_check
1073 relational_operator: '=='
1074 threshold: 0
1075 window: 60
1076 periods: 0
1077 function: last
1078 - name: 'mysql-check'
1079 description: "MySQL cannot be checked"
1080 severity: 'down'
1081 enabled: 'true'
1082 trigger:
1083 rules:
1084 - metric: mysql_check
1085 relational_operator: '=='
1086 threshold: 0
1087 window: 60
1088 periods: 0
1089 function: last
1090 - name: 'network-warning-dropped-rx'
1091 description: "Some received packets have been dropped"
1092 severity: 'warning'
1093 enabled: 'true'
1094 trigger:
1095 rules:
1096 - metric: if_dropped_rx
1097 relational_operator: '>'
1098 threshold: 100
1099 window: 60
1100 periods: 0
1101 function: avg
1102 - name: 'network-critical-dropped-rx'
1103 description: "Too many received packets have been dropped"
1104 severity: 'critical'
1105 enabled: 'true'
1106 trigger:
1107 rules:
1108 - metric: if_dropped_rx
1109 relational_operator: '>'
1110 threshold: 1000
1111 window: 60
1112 periods: 0
1113 function: avg
1114 - name: 'network-warning-dropped-tx'
1115 description: "Some transmitted packets have been dropped"
1116 severity: 'warning'
1117 enabled: 'true'
1118 trigger:
1119 rules:
1120 - metric: if_dropped_tx
1121 relational_operator: '>'
1122 threshold: 100
1123 window: 60
1124 periods: 0
1125 function: avg
1126 - name: 'network-critical-dropped-tx'
1127 description: "Too many transmitted packets have been dropped"
1128 severity: 'critical'
1129 enabled: 'true'
1130 trigger:
1131 rules:
1132 - metric: if_dropped_tx
1133 relational_operator: '>'
1134 threshold: 1000
1135 function: avg
1136 window: 60
1137 - name: 'instance-creation-time-warning'
1138 description: "Instance creation takes too much time"
1139 severity: 'warning'
1140 no_data_policy: 'okay' # This is a sporadic metric
1141 enabled: 'true'
1142 trigger:
1143 rules:
1144 - metric: openstack_nova_instance_creation_time
1145 relational_operator: '>'
1146 threshold: 20
1147 window: 600
1148 periods: 0
1149 function: avg
1150 - name: 'hdd-errors-critical'
1151 description: 'Errors on hard drive(s) have been detected'
1152 severity: 'critical'
1153 enabled: 'true'
1154 no_data_policy: okay
1155 trigger:
1156 rules:
1157 - metric: hdd_errors_rate
1158 group_by: ['device']
1159 relational_operator: '>'
1160 threshold: 0
1161 window: 60
1162 periods: 0
1163 function: max
1164 - name: 'total-nova-free-vcpu-warning'
1165 description: 'There is none VCPU available for new instances'
1166 severity: 'warning'
1167 enabled: 'true'
1168 no_data_policy: skip # the metric is only collected from the aggregator node
1169 trigger:
1170 rules:
1171 - metric: openstack_nova_total_free_vcpus
1172 relational_operator: '=='
1173 threshold: 0
1174 window: 60
1175 periods: 0
1176 function: max
1177 - name: 'total-nova-free-memory-warning'
1178 description: 'There is none memory available for new instances'
1179 severity: 'warning'
1180 enabled: 'true'
1181 no_data_policy: skip # the metric is only collected from the aggregator node
1182 trigger:
1183 rules:
1184 - metric: openstack_nova_total_free_ram
1185 relational_operator: '=='
1186 threshold: 0
1187 window: 60
1188 periods: 0
1189 function: max
1190
1191 # Alarms for the local checks of the OpenStack service endpoints
1192 - name: 'cinder-api-local-endpoint'
1193 description: 'Cinder API is locally down'
1194 severity: 'down'
1195 enabled: 'true'
1196 trigger:
1197 rules:
1198 - metric: openstack_check_local_api
1199 fields:
1200 service: 'cinder-api'
1201 relational_operator: '=='
1202 threshold: 0
1203 window: 60
1204 periods: 0
1205 function: last
1206 - name: 'glance-api-local-endpoint'
1207 description: 'Glance API is locally down'
1208 severity: 'down'
1209 enabled: 'true'
1210 trigger:
1211 rules:
1212 - metric: openstack_check_local_api
1213 fields:
1214 service: 'glance-api'
1215 relational_operator: '=='
1216 threshold: 0
1217 window: 60
1218 periods: 0
1219 function: last
1220 - name: 'heat-api-local-endpoint'
1221 description: 'Heat API is locally down'
1222 severity: 'down'
1223 enabled: 'true'
1224 trigger:
1225 rules:
1226 - metric: openstack_check_local_api
1227 fields:
1228 service: 'heat-api'
1229 relational_operator: '=='
1230 threshold: 0
1231 window: 60
1232 periods: 0
1233 function: last
1234 - name: 'heat-cfn-api-local-endpoint'
1235 description: 'Heat CFN API is locally down'
1236 severity: 'down'
1237 enabled: 'true'
1238 trigger:
1239 rules:
1240 - metric: openstack_check_local_api
1241 fields:
1242 service: 'heat-cfn-api'
1243 relational_operator: '=='
1244 threshold: 0
1245 window: 60
1246 periods: 0
1247 function: last
1248 - name: 'keystone-public-api-local-endpoint'
1249 description: 'Keystone public API is locally down'
1250 severity: 'down'
1251 enabled: 'true'
1252 trigger:
1253 rules:
1254 - metric: openstack_check_local_api
1255 fields:
1256 service: 'keystone-public-api'
1257 relational_operator: '=='
1258 threshold: 0
1259 window: 60
1260 periods: 0
1261 function: last
1262 - name: 'neutron-api-local-endpoint'
1263 description: 'Neutron API is locally down'
1264 severity: 'down'
1265 enabled: 'true'
1266 trigger:
1267 rules:
1268 - metric: openstack_check_local_api
1269 fields:
1270 service: 'neutron-api'
1271 relational_operator: '=='
1272 threshold: 0
1273 window: 60
1274 periods: 0
1275 function: last
1276 - name: 'nova-api-local-endpoint'
1277 description: 'Nova API is locally down'
1278 severity: 'down'
1279 enabled: 'true'
1280 trigger:
1281 rules:
1282 - metric: openstack_check_local_api
1283 fields:
1284 service: 'nova-api'
1285 relational_operator: '=='
1286 threshold: 0
1287 window: 60
1288 periods: 0
1289 function: last
1290 - name: 'swift-api-local-endpoint'
1291 description: 'Swift API is locally down'
1292 severity: 'down'
1293 enabled: 'true'
1294 trigger:
1295 rules:
1296 - metric: openstack_check_local_api
1297 fields:
1298 service: 'swift-api'
1299 relational_operator: '=='
1300 threshold: 0
1301 window: 60
1302 periods: 0
1303 function: last
1304
1305 # The following alarms are the API endpoint checks for the OpenStack
1306 # services and for InfluxDB
1307 - name: 'influxdb-api-check-failed'
1308 description: 'Endpoint check for InfluxDB is failed'
1309 severity: 'down'
1310 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1311 enabled: 'true'
1312 trigger:
1313 rules:
1314 - metric: http_check
1315 fields:
1316 service: 'influxdb-cluster'
1317 relational_operator: '=='
1318 threshold: 0
1319 window: 60
1320 periods: 0
1321 function: last
1322 - name: 'nova-api-check-failed'
1323 description: 'Endpoint check for nova-api is failed'
1324 severity: 'down'
1325 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1326 enabled: 'true'
1327 trigger:
1328 rules:
1329 - metric: openstack_check_api
1330 fields:
1331 service: 'nova-api'
1332 relational_operator: '=='
1333 threshold: 0
1334 window: 60
1335 periods: 0
1336 function: last
1337 - name: 'neutron-api-check-failed'
1338 description: 'Endpoint check for neutron-api is failed'
1339 severity: 'down'
1340 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1341 enabled: 'true'
1342 trigger:
1343 rules:
1344 - metric: openstack_check_api
1345 fields:
1346 service: 'neutron-api'
1347 relational_operator: '=='
1348 threshold: 0
1349 window: 60
1350 periods: 0
1351 function: last
1352 - name: 'cinder-api-check-failed'
1353 description: 'Endpoint check for cinder-api is failed'
1354 severity: 'down'
1355 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1356 enabled: 'true'
1357 trigger:
1358 rules:
1359 - metric: openstack_check_api
1360 fields:
1361 service: 'cinder-api'
1362 relational_operator: '=='
1363 threshold: 0
1364 window: 60
1365 periods: 0
1366 function: last
1367 - name: 'cinder-v2-api-check-failed'
1368 description: 'Endpoint check for cinder-v2-api is failed'
1369 severity: 'down'
1370 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1371 enabled: 'true'
1372 trigger:
1373 rules:
1374 - metric: openstack_check_api
1375 fields:
1376 service: 'cinder-v2-api'
1377 relational_operator: '=='
1378 threshold: 0
1379 window: 60
1380 periods: 0
1381 function: last
1382 - name: 'glance-api-check-failed'
1383 description: 'Endpoint check for glance-api is failed'
1384 severity: 'down'
1385 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1386 enabled: 'true'
1387 trigger:
1388 rules:
1389 - metric: openstack_check_api
1390 fields:
1391 service: 'glance-api'
1392 relational_operator: '=='
1393 threshold: 0
1394 window: 60
1395 periods: 0
1396 function: last
1397 - name: 'heat-api-check-failed'
1398 description: 'Endpoint check for heat-api is failed'
1399 severity: 'down'
1400 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1401 enabled: 'true'
1402 trigger:
1403 rules:
1404 - metric: openstack_check_api
1405 fields:
1406 service: 'heat-api'
1407 relational_operator: '=='
1408 threshold: 0
1409 window: 60
1410 periods: 0
1411 function: last
1412 - name: 'heat-cfn-api-check-failed'
1413 description: 'Endpoint check for heat-cfn-api is failed'
1414 severity: 'down'
1415 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1416 enabled: 'true'
1417 trigger:
1418 rules:
1419 - metric: openstack_check_api
1420 fields:
1421 service: 'heat-cfn-api'
1422 relational_operator: '=='
1423 threshold: 0
1424 window: 60
1425 periods: 0
1426 function: last
1427 - name: 'swift-api-check-failed'
1428 description: 'Endpoint check for swift-api is failed'
1429 severity: 'down'
1430 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1431 enabled: 'true'
1432 trigger:
1433 rules:
1434 - metric: openstack_check_api
1435 fields:
1436 service: 'swift-api'
1437 relational_operator: '=='
1438 threshold: 0
1439 window: 60
1440 periods: 0
1441 function: last
1442 - name: 'swift-s3-api-check-failed'
1443 description: 'Endpoint check for swift-s3-api is failed'
1444 severity: 'down'
1445 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1446 enabled: 'true'
1447 trigger:
1448 rules:
1449 - metric: openstack_check_api
1450 fields:
1451 service: 'swift-s3-api'
1452 relational_operator: '=='
1453 threshold: 0
1454 window: 60
1455 periods: 0
1456 function: last
1457 - name: 'keystone-public-api-check-failed'
1458 description: 'Endpoint check for keystone-public-api is failed'
1459 severity: 'down'
1460 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1461 enabled: 'true'
1462 trigger:
1463 rules:
1464 - metric: openstack_check_api
1465 fields:
1466 service: 'keystone-public-api'
1467 relational_operator: '=='
1468 threshold: 0
1469 window: 60
1470 periods: 0
1471 function: last
1472 - name: 'ceilometer-api-check-failed'
1473 description: 'Endpoint check for ceilometer-api is failed'
1474 severity: 'down'
1475 no_data_policy: 'skip' # the metric is only collected from the controller running the management VIP
1476 enabled: 'true'
1477 trigger:
1478 rules:
1479 - metric: openstack_check_api
1480 fields:
1481 service: 'ceilometer-api'
1482 relational_operator: '=='
1483 threshold: 0
1484 window: 60
1485 periods: 0
1486 function: last
1487
1488 # The following are the AFD alarms generated to check API backends
1489 # All backends are down
1490 - name: 'elasticsearch-api-backends-all-down'
1491 description: 'All Elasticsearch backends are down'
1492 severity: 'down'
1493 enabled: 'true'
1494 trigger:
1495 rules:
1496 - metric: haproxy_backend_servers
1497 fields:
1498 backend: 'elasticsearch-rest'
1499 state: 'up'
1500 relational_operator: '=='
1501 threshold: 0
1502 window: 60
1503 periods: 0
1504 function: last
1505 - name: 'kibana-api-backends-all-down'
1506 description: 'All API backends are down for Kibana'
1507 severity: 'down'
1508 enabled: 'true'
1509 trigger:
1510 rules:
1511 - metric: haproxy_backend_servers
1512 fields:
1513 backend: 'kibana'
1514 state: 'up'
1515 relational_operator: '=='
1516 threshold: 0
1517 window: 60
1518 periods: 0
1519 function: last
1520 - name: 'influxdb-api-backends-all-down'
1521 description: 'All API backends are down for InfluxDB'
1522 severity: 'down'
1523 enabled: 'true'
1524 trigger:
1525 rules:
1526 - metric: haproxy_backend_servers
1527 fields:
1528 backend: 'influxdb'
1529 state: 'up'
1530 relational_operator: '=='
1531 threshold: 0
1532 window: 60
1533 periods: 0
1534 function: last
1535 - name: 'grafana-api-backends-all-down'
1536 description: 'All API backends are down for Grafana'
1537 severity: 'down'
1538 enabled: 'true'
1539 trigger:
1540 rules:
1541 - metric: haproxy_backend_servers
1542 fields:
1543 backend: 'grafana'
1544 state: 'up'
1545 relational_operator: '=='
1546 threshold: 0
1547 window: 60
1548 periods: 0
1549 function: last
1550 - name: 'glance-registry-api-backends-all-down'
1551 description: 'All API backends are down for glance-registry-api'
1552 severity: 'down'
1553 enabled: 'true'
1554 trigger:
1555 rules:
1556 - metric: haproxy_backend_servers
1557 fields:
1558 backend: 'glance-registry-api'
1559 state: 'up'
1560 relational_operator: '=='
1561 threshold: 0
1562 window: 60
1563 periods: 0
1564 function: last
1565 - name: 'nova-api-backends-all-down'
1566 description: 'All API backends are down for nova-api'
1567 severity: 'down'
1568 enabled: 'true'
1569 trigger:
1570 rules:
1571 - metric: haproxy_backend_servers
1572 fields:
1573 backend: 'nova-api'
1574 state: 'up'
1575 relational_operator: '=='
1576 threshold: 0
1577 window: 60
1578 periods: 0
1579 function: last
1580 - name: 'cinder-api-backends-all-down'
1581 description: 'All API backends are down for cinder-api'
1582 severity: 'down'
1583 enabled: 'true'
1584 trigger:
1585 rules:
1586 - metric: haproxy_backend_servers
1587 fields:
1588 backend: 'cinder-api'
1589 state: 'up'
1590 relational_operator: '=='
1591 threshold: 0
1592 window: 60
1593 periods: 0
1594 function: last
1595 - name: 'object-storage-api-backends-all-down'
1596 description: 'All API backends are down for object-storage'
1597 severity: 'down'
1598 enabled: 'true'
1599 trigger:
1600 rules:
1601 - metric: haproxy_backend_servers
1602 fields:
1603 backend: 'object-storage'
1604 state: 'up'
1605 relational_operator: '=='
1606 threshold: 0
1607 window: 60
1608 periods: 0
1609 function: last
1610 - name: 'heat-cfn-api-backends-all-down'
1611 description: 'All API backends are down for heat-cfn-api'
1612 severity: 'down'
1613 enabled: 'true'
1614 trigger:
1615 rules:
1616 - metric: haproxy_backend_servers
1617 fields:
1618 backend: 'heat-cfn-api'
1619 state: 'up'
1620 relational_operator: '=='
1621 threshold: 0
1622 window: 60
1623 periods: 0
1624 function: last
1625 - name: 'horizon-web-api-backends-all-down'
1626 description: 'All API backends are down for horizon-web'
1627 severity: 'down'
1628 enabled: 'true'
1629 trigger:
1630 rules:
1631 - metric: haproxy_backend_servers
1632 fields:
1633 backend: 'horizon-web || horizon-https'
1634 state: 'up'
1635 relational_operator: '=='
1636 threshold: 0
1637 window: 60
1638 periods: 0
1639 function: last
1640 - name: 'nova-novncproxy-websocket-api-backends-all-down'
1641 description: 'All API backends are down for nova-novncproxy-websocket'
1642 severity: 'down'
1643 enabled: 'true'
1644 trigger:
1645 rules:
1646 - metric: haproxy_backend_servers
1647 fields:
1648 backend: 'nova-novncproxy-websocket'
1649 state: 'up'
1650 relational_operator: '=='
1651 threshold: 0
1652 window: 60
1653 periods: 0
1654 function: last
1655 - name: 'heat-api-backends-all-down'
1656 description: 'All API backends are down for heat-api'
1657 severity: 'down'
1658 enabled: 'true'
1659 trigger:
1660 rules:
1661 - metric: haproxy_backend_servers
1662 fields:
1663 backend: 'heat-api'
1664 state: 'up'
1665 relational_operator: '=='
1666 threshold: 0
1667 window: 60
1668 periods: 0
1669 function: last
1670 - name: 'keystone-public-api-backends-all-down'
1671 description: 'All API backends are down for keystone-public-api'
1672 severity: 'down'
1673 enabled: 'true'
1674 trigger:
1675 rules:
1676 - metric: haproxy_backend_servers
1677 fields:
1678 backend: 'keystone-public-api'
1679 state: 'up'
1680 relational_operator: '=='
1681 threshold: 0
1682 window: 60
1683 periods: 0
1684 function: last
1685 - name: 'heat-cloudwatch-api-backends-all-down'
1686 description: 'All API backends are down for heat-cloudwatch-api'
1687 severity: 'down'
1688 enabled: 'true'
1689 trigger:
1690 rules:
1691 - metric: haproxy_backend_servers
1692 fields:
1693 backend: 'heat-cloudwatch-api'
1694 state: 'up'
1695 relational_operator: '=='
1696 threshold: 0
1697 window: 60
1698 periods: 0
1699 function: last
1700 - name: 'nova-metadata-api-backends-all-down'
1701 description: 'All API backends are down for nova-metadata-api'
1702 severity: 'down'
1703 enabled: 'true'
1704 trigger:
1705 rules:
1706 - metric: haproxy_backend_servers
1707 fields:
1708 backend: 'nova-metadata-api'
1709 state: 'up'
1710 relational_operator: '=='
1711 threshold: 0
1712 window: 60
1713 periods: 0
1714 function: last
1715 - name: 'mysqld-tcp-api-backends-all-down'
1716 description: 'All API backends are down for mysqld-tcp'
1717 severity: 'down'
1718 enabled: 'true'
1719 trigger:
1720 rules:
1721 - metric: haproxy_backend_servers
1722 fields:
1723 backend: 'mysqld-tcp'
1724 state: 'up'
1725 relational_operator: '=='
1726 threshold: 0
1727 window: 60
1728 periods: 0
1729 function: last
1730 - name: 'keystone-admin-api-backends-all-down'
1731 description: 'All API backends are down for keystone-admin-api'
1732 severity: 'down'
1733 enabled: 'true'
1734 trigger:
1735 rules:
1736 - metric: haproxy_backend_servers
1737 fields:
1738 backend: 'keystone-admin-api'
1739 state: 'up'
1740 relational_operator: '=='
1741 threshold: 0
1742 window: 60
1743 periods: 0
1744 function: last
1745 - name: 'glance-api-backends-all-down'
1746 description: 'All API backends are down for glance-api'
1747 severity: 'down'
1748 enabled: 'true'
1749 trigger:
1750 rules:
1751 - metric: haproxy_backend_servers
1752 fields:
1753 backend: 'glance-api'
1754 state: 'up'
1755 relational_operator: '=='
1756 threshold: 0
1757 window: 60
1758 periods: 0
1759 function: last
1760 - name: 'neutron-api-backends-all-down'
1761 description: 'All API backends are down for neutron-api'
1762 severity: 'down'
1763 enabled: 'true'
1764 trigger:
1765 rules:
1766 - metric: haproxy_backend_servers
1767 fields:
1768 backend: 'neutron-api'
1769 state: 'up'
1770 relational_operator: '=='
1771 threshold: 0
1772 window: 60
1773 periods: 0
1774 function: last
1775 - name: 'swift-api-backends-all-down'
1776 description: 'All API backends are down for swift-api'
1777 severity: 'down'
1778 enabled: 'true'
1779 trigger:
1780 rules:
1781 - metric: haproxy_backend_servers
1782 fields:
1783 backend: 'swift-api || object-storage'
1784 state: 'up'
1785 relational_operator: '=='
1786 threshold: 0
1787 window: 60
1788 periods: 0
1789 function: last
1790 - name: 'ceilometer-api-backends-all-down'
1791 description: 'All API backends are down for ceilometer-api'
1792 severity: 'down'
1793 enabled: 'true'
1794 trigger:
1795 rules:
1796 - metric: haproxy_backend_servers
1797 fields:
1798 backend: 'ceilometer-api'
1799 state: 'up'
1800 relational_operator: '=='
1801 threshold: 0
1802 window: 60
1803 periods: 0
1804 function: last
1805 # At least one backend is down
1806 - name: 'elasticsearch-api-backends-one-down'
1807 description: 'At least one API backend is down for elasticsearch'
1808 severity: 'warning'
1809 enabled: 'true'
1810 trigger:
1811 rules:
1812 - metric: haproxy_backend_servers
1813 fields:
1814 backend: 'elasticsearch-rest'
1815 state: 'down'
1816 relational_operator: '>'
1817 threshold: 0
1818 window: 60
1819 periods: 0
1820 function: last
1821 - name: 'kibana-api-backends-one-down'
1822 description: 'At least one API backend is down for kibana'
1823 severity: 'warning'
1824 enabled: 'true'
1825 trigger:
1826 rules:
1827 - metric: haproxy_backend_servers
1828 fields:
1829 backend: 'kibana'
1830 state: 'down'
1831 relational_operator: '>'
1832 threshold: 0
1833 window: 60
1834 periods: 0
1835 function: last
1836 - name: 'influxdb-api-backends-one-down'
1837 description: 'At least one API backend is down for influxdb'
1838 severity: 'warning'
1839 enabled: 'true'
1840 trigger:
1841 rules:
1842 - metric: haproxy_backend_servers
1843 fields:
1844 backend: 'influxdb'
1845 state: 'down'
1846 relational_operator: '>'
1847 threshold: 0
1848 window: 60
1849 periods: 0
1850 function: last
1851 - name: 'grafana-api-backends-one-down'
1852 description: 'At least one API backend is down for grafana'
1853 severity: 'warning'
1854 enabled: 'true'
1855 trigger:
1856 rules:
1857 - metric: haproxy_backend_servers
1858 fields:
1859 backend: 'grafana'
1860 state: 'down'
1861 relational_operator: '>'
1862 threshold: 0
1863 window: 60
1864 periods: 0
1865 function: last
1866 - name: 'glance-registry-api-backends-one-down'
1867 description: 'At least one API backend is down for glance-registry-api'
1868 severity: 'warning'
1869 enabled: 'true'
1870 trigger:
1871 rules:
1872 - metric: haproxy_backend_servers
1873 fields:
1874 backend: 'glance-registry-api'
1875 state: 'down'
1876 relational_operator: '>'
1877 threshold: 0
1878 window: 60
1879 periods: 0
1880 function: last
1881 - name: 'nova-api-backends-one-down'
1882 description: 'At least one API backend is down for nova-api'
1883 severity: 'warning'
1884 enabled: 'true'
1885 trigger:
1886 rules:
1887 - metric: haproxy_backend_servers
1888 fields:
1889 backend: 'nova-api'
1890 state: 'down'
1891 relational_operator: '>'
1892 threshold: 0
1893 window: 60
1894 periods: 0
1895 function: last
1896 - name: 'cinder-api-backends-one-down'
1897 description: 'At least one API backend is down for cinder-api'
1898 severity: 'warning'
1899 enabled: 'true'
1900 trigger:
1901 rules:
1902 - metric: haproxy_backend_servers
1903 fields:
1904 backend: 'cinder-api'
1905 state: 'down'
1906 relational_operator: '>'
1907 threshold: 0
1908 window: 60
1909 periods: 0
1910 function: last
1911 - name: 'object-storage-api-backends-one-down'
1912 description: 'At least one API backend is down for object-storage'
1913 severity: 'warning'
1914 enabled: 'true'
1915 trigger:
1916 rules:
1917 - metric: haproxy_backend_servers
1918 fields:
1919 backend: 'object-storage'
1920 state: 'down'
1921 relational_operator: '>'
1922 threshold: 0
1923 window: 60
1924 periods: 0
1925 function: last
1926 - name: 'heat-cfn-api-backends-one-down'
1927 description: 'At least one API backend is down for heat-cfn-api'
1928 severity: 'warning'
1929 enabled: 'true'
1930 trigger:
1931 rules:
1932 - metric: haproxy_backend_servers
1933 fields:
1934 backend: 'heat-cfn-api'
1935 state: 'down'
1936 relational_operator: '>'
1937 threshold: 0
1938 window: 60
1939 periods: 0
1940 function: last
1941 - name: 'horizon-web-api-backends-one-down'
1942 description: 'At least one API backend is down for horizon-web'
1943 severity: 'warning'
1944 enabled: 'true'
1945 trigger:
1946 rules:
1947 - metric: haproxy_backend_servers
1948 fields:
1949 backend: 'horizon-web || horizon-https'
1950 state: 'down'
1951 relational_operator: '>'
1952 threshold: 0
1953 window: 60
1954 periods: 0
1955 function: last
1956 - name: 'nova-novncproxy-websocket-api-backends-one-down'
1957 description: 'At least one API backend is down for nova-novncproxy-websocket'
1958 severity: 'warning'
1959 enabled: 'true'
1960 trigger:
1961 rules:
1962 - metric: haproxy_backend_servers
1963 fields:
1964 backend: 'nova-novncproxy-websocket'
1965 state: 'down'
1966 relational_operator: '>'
1967 threshold: 0
1968 window: 60
1969 periods: 0
1970 function: last
1971 - name: 'heat-api-backends-one-down'
1972 description: 'At least one API backend is down for heat-api'
1973 severity: 'warning'
1974 enabled: 'true'
1975 trigger:
1976 rules:
1977 - metric: haproxy_backend_servers
1978 fields:
1979 backend: 'heat-api'
1980 state: 'down'
1981 relational_operator: '>'
1982 threshold: 0
1983 window: 60
1984 periods: 0
1985 function: last
1986 - name: 'keystone-public-api-backends-one-down'
1987 description: 'At least one API backend is down for keystone-public-api'
1988 severity: 'warning'
1989 enabled: 'true'
1990 trigger:
1991 rules:
1992 - metric: haproxy_backend_servers
1993 fields:
1994 backend: 'keystone-public-api'
1995 state: 'down'
1996 relational_operator: '>'
1997 threshold: 0
1998 window: 60
1999 periods: 0
2000 function: last
2001 - name: 'heat-cloudwatch-api-backends-one-down'
2002 description: 'At least one API backend is down for heat-cloudwatch-api'
2003 severity: 'warning'
2004 enabled: 'true'
2005 trigger:
2006 rules:
2007 - metric: haproxy_backend_servers
2008 fields:
2009 backend: 'heat-cloudwatch-api'
2010 state: 'down'
2011 relational_operator: '>'
2012 threshold: 0
2013 window: 60
2014 periods: 0
2015 function: last
2016 - name: 'nova-metadata-api-backends-one-down'
2017 description: 'At least one API backend is down for nova-metadata-api'
2018 severity: 'warning'
2019 enabled: 'true'
2020 trigger:
2021 rules:
2022 - metric: haproxy_backend_servers
2023 fields:
2024 backend: 'nova-metadata-api'
2025 state: 'down'
2026 relational_operator: '>'
2027 threshold: 0
2028 window: 60
2029 periods: 0
2030 function: last
2031 - name: 'mysqld-tcp-api-backends-one-down'
2032 description: 'At least one API backend is down for mysqld-tcp'
2033 severity: 'warning'
2034 enabled: 'true'
2035 trigger:
2036 rules:
2037 - metric: haproxy_backend_servers
2038 fields:
2039 backend: 'mysqld-tcp'
2040 state: 'down'
2041 relational_operator: '>'
2042 threshold: 0
2043 window: 60
2044 periods: 0
2045 function: last
2046 - name: 'keystone-admin-api-backends-one-down'
2047 description: 'At least one API backend is down for keystone-admin-api'
2048 severity: 'warning'
2049 enabled: 'true'
2050 trigger:
2051 rules:
2052 - metric: haproxy_backend_servers
2053 fields:
2054 backend: 'keystone-admin-api'
2055 state: 'down'
2056 relational_operator: '>'
2057 threshold: 0
2058 window: 60
2059 periods: 0
2060 function: last
2061 - name: 'glance-api-backends-one-down'
2062 description: 'At least one API backend is down for glance-api'
2063 severity: 'warning'
2064 enabled: 'true'
2065 trigger:
2066 rules:
2067 - metric: haproxy_backend_servers
2068 fields:
2069 backend: 'glance-api'
2070 state: 'down'
2071 relational_operator: '>'
2072 threshold: 0
2073 window: 60
2074 periods: 0
2075 function: last
2076 - name: 'neutron-api-backends-one-down'
2077 description: 'At least one API backend is down for neutron-api'
2078 severity: 'warning'
2079 enabled: 'true'
2080 trigger:
2081 rules:
2082 - metric: haproxy_backend_servers
2083 fields:
2084 backend: 'neutron-api'
2085 state: 'down'
2086 relational_operator: '>'
2087 threshold: 0
2088 window: 60
2089 periods: 0
2090 function: last
2091 - name: 'swift-api-backends-one-down'
2092 description: 'At least one API backend is down for swift-api'
2093 severity: 'warning'
2094 enabled: 'true'
2095 trigger:
2096 rules:
2097 - metric: haproxy_backend_servers
2098 fields:
2099 backend: 'swift-api || object-storage'
2100 state: 'down'
2101 relational_operator: '>'
2102 threshold: 0
2103 window: 60
2104 periods: 0
2105 function: last
2106 - name: 'ceilometer-api-backends-one-down'
2107 description: 'At least one API backend is down for ceilometer-api'
2108 severity: 'warning'
2109 enabled: 'true'
2110 trigger:
2111 rules:
2112 - metric: haproxy_backend_servers
2113 fields:
2114 backend: 'ceilometer-api'
2115 state: 'down'
2116 relational_operator: '>'
2117 threshold: 0
2118 window: 60
2119 periods: 0
2120 function: last
2121 # 50% or less of backends are up
2122 - name: 'elasticsearch-api-backends-majority-down'
2123 description: 'Less than 50% of backends are up for elasticsearch'
2124 severity: 'critical'
2125 enabled: 'true'
2126 trigger:
2127 rules:
2128 - metric: haproxy_backend_servers_percent
2129 fields:
2130 backend: 'elasticsearch-rest'
2131 state: 'up'
2132 relational_operator: '<='
2133 threshold: 50
2134 window: 60
2135 periods: 0
2136 function: last
2137 - name: 'kibana-api-backends-majority-down'
2138 description: 'Less than 50% of backends are up for kibana'
2139 severity: 'critical'
2140 enabled: 'true'
2141 trigger:
2142 rules:
2143 - metric: haproxy_backend_servers_percent
2144 fields:
2145 backend: 'kibana'
2146 state: 'up'
2147 relational_operator: '<='
2148 threshold: 50
2149 window: 60
2150 periods: 0
2151 function: last
2152 - name: 'influxdb-api-backends-majority-down'
2153 description: 'Less than 50% of backends are up for influxdb'
2154 severity: 'critical'
2155 enabled: 'true'
2156 trigger:
2157 rules:
2158 - metric: haproxy_backend_servers_percent
2159 fields:
2160 backend: 'influxdb'
2161 state: 'up'
2162 relational_operator: '<='
2163 threshold: 50
2164 window: 60
2165 periods: 0
2166 function: last
2167 - name: 'grafana-api-backends-majority-down'
2168 description: 'Less than 50% of backends are up for grafana'
2169 severity: 'critical'
2170 enabled: 'true'
2171 trigger:
2172 rules:
2173 - metric: haproxy_backend_servers_percent
2174 fields:
2175 backend: 'grafana'
2176 state: 'up'
2177 relational_operator: '<='
2178 threshold: 50
2179 window: 60
2180 periods: 0
2181 function: last
2182 - name: 'glance-registry-api-backends-majority-down'
2183 description: 'Less than 50% of backends are up for glance-registry-api'
2184 severity: 'critical'
2185 enabled: 'true'
2186 trigger:
2187 rules:
2188 - metric: haproxy_backend_servers_percent
2189 fields:
2190 backend: 'glance-registry-api'
2191 state: 'up'
2192 relational_operator: '<='
2193 threshold: 50
2194 window: 60
2195 periods: 0
2196 function: last
2197 - name: 'nova-api-backends-majority-down'
2198 description: 'Less than 50% of backends are up for nova-api'
2199 severity: 'critical'
2200 enabled: 'true'
2201 trigger:
2202 rules:
2203 - metric: haproxy_backend_servers_percent
2204 fields:
2205 backend: 'nova-api'
2206 state: 'up'
2207 relational_operator: '<='
2208 threshold: 50
2209 window: 60
2210 periods: 0
2211 function: last
2212 - name: 'cinder-api-backends-majority-down'
2213 description: 'Less than 50% of backends are up for cinder-api'
2214 severity: 'critical'
2215 enabled: 'true'
2216 trigger:
2217 rules:
2218 - metric: haproxy_backend_servers_percent
2219 fields:
2220 backend: 'cinder-api'
2221 state: 'up'
2222
2223 relational_operator: '<='
2224 threshold: 50
2225 window: 60
2226 periods: 0
2227 function: last
2228 - name: 'object-storage-api-backends-majority-down'
2229 description: 'Less than 50% of backends are up for object-storage'
2230 severity: 'critical'
2231 enabled: 'true'
2232 trigger:
2233 rules:
2234 - metric: haproxy_backend_servers_percent
2235 fields:
2236 backend: 'object-storage'
2237 state: 'up'
2238 relational_operator: '<='
2239 threshold: 50
2240 window: 60
2241 periods: 0
2242 function: last
2243 - name: 'heat-cfn-api-backends-majority-down'
2244 description: 'Less than 50% of backends are up for heat-cfn-api'
2245 severity: 'critical'
2246 enabled: 'true'
2247 trigger:
2248 rules:
2249 - metric: haproxy_backend_servers_percent
2250 fields:
2251 backend: 'heat-cfn-api'
2252 state: 'up'
2253 relational_operator: '<='
2254 threshold: 50
2255 window: 60
2256 periods: 0
2257 function: last
2258 - name: 'horizon-web-api-backends-majority-down'
2259 description: 'Less than 50% of backends are up for horizon-web'
2260 severity: 'critical'
2261 enabled: 'true'
2262 trigger:
2263 rules:
2264 - metric: haproxy_backend_servers_percent
2265 fields:
2266 backend: 'horizon-web || horizon-https'
2267 state: 'up'
2268 relational_operator: '<='
2269 threshold: 50
2270 window: 60
2271 periods: 0
2272 function: last
2273 - name: 'nova-novncproxy-websocket-api-backends-majority-down'
2274 description: 'Less than 50% of backends are up for nova-novncproxy-websocket'
2275 severity: 'critical'
2276 enabled: 'true'
2277 trigger:
2278 rules:
2279 - metric: haproxy_backend_servers_percent
2280 fields:
2281 backend: 'nova-novncproxy-websocket'
2282 state: 'up'
2283 relational_operator: '<='
2284 threshold: 50
2285 window: 60
2286 periods: 0
2287 function: last
2288 - name: 'heat-api-backends-majority-down'
2289 description: 'Less than 50% of backends are up for heat-api'
2290 severity: 'critical'
2291 enabled: 'true'
2292 trigger:
2293 rules:
2294 - metric: haproxy_backend_servers_percent
2295 fields:
2296 backend: 'heat-api'
2297 state: 'up'
2298 relational_operator: '<='
2299 threshold: 50
2300 window: 60
2301 periods: 0
2302 function: last
2303 - name: 'keystone-public-api-backends-majority-down'
2304 description: 'Less than 50% of backends are up for keystone-public-api'
2305 severity: 'critical'
2306 enabled: 'true'
2307 trigger:
2308 rules:
2309 - metric: haproxy_backend_servers_percent
2310 fields:
2311 backend: 'keystone-public-api'
2312 state: 'up'
2313 relational_operator: '<='
2314 threshold: 50
2315 window: 60
2316 periods: 0
2317 function: last
2318 - name: 'heat-cloudwatch-api-backends-majority-down'
2319 description: 'Less than 50% of backends are up for heat-cloudwatch-api'
2320 severity: 'critical'
2321 enabled: 'true'
2322 trigger:
2323 rules:
2324 - metric: haproxy_backend_servers_percent
2325 fields:
2326 backend: 'heat-cloudwatch-api'
2327 state: 'up'
2328 relational_operator: '<='
2329 threshold: 50
2330 window: 60
2331 periods: 0
2332 function: last
2333 - name: 'nova-metadata-api-backends-majority-down'
2334 description: 'Less than 50% of backends are up for nova-metadata-api'
2335 severity: 'critical'
2336 enabled: 'true'
2337 trigger:
2338 rules:
2339 - metric: haproxy_backend_servers_percent
2340 fields:
2341 backend: 'nova-metadata-api'
2342 state: 'up'
2343 relational_operator: '<='
2344 threshold: 50
2345 window: 60
2346 periods: 0
2347 function: last
2348 - name: 'mysqld-tcp-api-backends-majority-down'
2349 description: 'Less than 50% of backends are up for mysqld-tcp'
2350 severity: 'critical'
2351 enabled: 'true'
2352 trigger:
2353 rules:
2354 - metric: haproxy_backend_servers_percent
2355 fields:
2356 backend: 'mysqld-tcp'
2357 state: 'up'
2358 relational_operator: '<='
2359 threshold: 50
2360 window: 60
2361 periods: 0
2362 function: last
2363 - name: 'keystone-admin-api-backends-majority-down'
2364 description: 'Less than 50% of backends are up for keystone-admin-api'
2365 severity: 'critical'
2366 enabled: 'true'
2367 trigger:
2368 rules:
2369 - metric: haproxy_backend_servers_percent
2370 fields:
2371 backend: 'keystone-admin-api'
2372 state: 'up'
2373 relational_operator: '<='
2374 threshold: 50
2375 window: 60
2376 periods: 0
2377 function: last
2378 - name: 'glance-api-backends-majority-down'
2379 description: 'Less than 50% of backends are up for glance-api'
2380 severity: 'critical'
2381 enabled: 'true'
2382 trigger:
2383 rules:
2384 - metric: haproxy_backend_servers_percent
2385 fields:
2386 backend: 'glance-api'
2387 state: 'up'
2388 relational_operator: '<='
2389 threshold: 50
2390 window: 60
2391 periods: 0
2392 function: last
2393 - name: 'neutron-api-backends-majority-down'
2394 description: 'Less than 50% of backends are up for neutron-api'
2395 severity: 'critical'
2396 enabled: 'true'
2397 trigger:
2398 rules:
2399 - metric: haproxy_backend_servers_percent
2400 fields:
2401 backend: 'neutron-api'
2402 state: 'up'
2403 relational_operator: '<='
2404 threshold: 50
2405 window: 60
2406 periods: 0
2407 function: last
2408 - name: 'swift-api-backends-majority-down'
2409 description: 'Less than 50% of backends are up for swift-api'
2410 severity: 'critical'
2411 enabled: 'true'
2412 trigger:
2413 rules:
2414 - metric: haproxy_backend_servers_percent
2415 fields:
2416 backend: 'swift-api || object-storage'
2417 state: 'up'
2418 relational_operator: '<='
2419 threshold: 50
2420 window: 60
2421 periods: 0
2422 function: last
2423 - name: 'ceilometer-api-backends-majority-down'
2424 description: 'Less than 50% of backends are up for ceilometer-api'
2425 severity: 'critical'
2426 enabled: 'true'
2427 trigger:
2428 rules:
2429 - metric: haproxy_backend_servers_percent
2430 fields:
2431 backend: 'ceilometer-api'
2432 state: 'up'
2433 relational_operator: '<='
2434 threshold: 50
2435 window: 60
2436 periods: 0
2437 function: last
2438
2439 # The following are the AFD alarms generated to check workers
2440 # All workers are down
2441 - name: 'nova-scheduler-all-down'
2442 description: 'All Nova schedulers are down'
2443 severity: 'down'
2444 no_data_policy: 'skip' # the metric is only collected from the DC node
2445 enabled: 'true'
2446 trigger:
2447 rules:
2448 - metric: openstack_nova_services
2449 fields:
2450 service: 'scheduler'
2451 state: 'up'
2452 relational_operator: '=='
2453 threshold: 0
2454 window: 60
2455 periods: 0
2456 function: last
2457 - name: 'nova-cert-all-down'
2458 description: 'All Nova certs are down'
2459 severity: 'down'
2460 no_data_policy: 'skip' # the metric is only collected from the DC node
2461 enabled: 'true'
2462 trigger:
2463 rules:
2464 - metric: openstack_nova_services
2465 fields:
2466 service: 'cert'
2467 state: 'up'
2468 relational_operator: '=='
2469 threshold: 0
2470 window: 60
2471 periods: 0
2472 function: last
2473 - name: 'nova-consoleauth-all-down'
2474 description: 'All Nova consoleauths are down'
2475 severity: 'down'
2476 no_data_policy: 'skip' # the metric is only collected from the DC node
2477 enabled: 'true'
2478 trigger:
2479 rules:
2480 - metric: openstack_nova_services
2481 fields:
2482 service: 'consoleauth'
2483 state: 'up'
2484 relational_operator: '=='
2485 threshold: 0
2486 window: 60
2487 periods: 0
2488 function: last
2489 - name: 'nova-compute-all-down'
2490 description: 'All Nova computes are down'
2491 severity: 'down'
2492 no_data_policy: 'skip' # the metric is only collected from the DC node
2493 enabled: 'true'
2494 trigger:
2495 rules:
2496 - metric: openstack_nova_services
2497 fields:
2498 service: 'compute'
2499 state: 'up'
2500 relational_operator: '=='
2501 threshold: 0
2502 window: 60
2503 periods: 0
2504 function: last
2505 - name: 'nova-conductor-all-down'
2506 description: 'All Nova conductors are down'
2507 severity: 'down'
2508 no_data_policy: 'skip' # the metric is only collected from the DC node
2509 enabled: 'true'
2510 trigger:
2511 rules:
2512 - metric: openstack_nova_services
2513 fields:
2514 service: 'conductor'
2515 state: 'up'
2516 relational_operator: '=='
2517 threshold: 0
2518 window: 60
2519 periods: 0
2520 function: last
2521 - name: 'cinder-scheduler-all-down'
2522 description: 'All Cinder schedulers are down'
2523 severity: 'down'
2524 no_data_policy: 'skip' # the metric is only collected from the DC node
2525 enabled: 'true'
2526 trigger:
2527 rules:
2528 - metric: openstack_cinder_services
2529 fields:
2530 service: 'scheduler'
2531 state: 'up'
2532 relational_operator: '=='
2533 threshold: 0
2534 window: 60
2535 periods: 0
2536 function: last
2537 - name: 'cinder-volume-all-down'
2538 description: 'All Cinder volumes are down'
2539 severity: 'down'
2540 no_data_policy: 'skip' # the metric is only collected from the DC node
2541 enabled: 'true'
2542 trigger:
2543 rules:
2544 - metric: openstack_cinder_services
2545 fields:
2546 service: 'volume'
2547 state: 'up'
2548 relational_operator: '=='
2549 threshold: 0
2550 window: 60
2551 periods: 0
2552 function: last
2553 - name: 'neutron-l3-all-down'
2554 description: 'All Neutron L3 agents are down'
2555 severity: 'down'
2556 no_data_policy: 'skip' # the metric is only collected from the DC node
2557 enabled: 'true'
2558 trigger:
2559 rules:
2560 - metric: openstack_neutron_agents
2561 fields:
2562 service: 'l3'
2563 state: 'up'
2564 relational_operator: '=='
2565 threshold: 0
2566 window: 60
2567 periods: 0
2568 function: last
2569 - name: 'neutron-dhcp-all-down'
2570 description: 'All Neutron DHCP agents are down'
2571 severity: 'down'
2572 no_data_policy: 'skip' # the metric is only collected from the DC node
2573 enabled: 'true'
2574 trigger:
2575 rules:
2576 - metric: openstack_neutron_agents
2577 fields:
2578 service: 'dhcp'
2579 state: 'up'
2580 relational_operator: '=='
2581 threshold: 0
2582 window: 60
2583 periods: 0
2584 function: last
2585 - name: 'neutron-metadata-all-down'
2586 description: 'All Neutron metadata agents are down'
2587 severity: 'down'
2588 no_data_policy: 'skip' # the metric is only collected from the DC node
2589 enabled: 'true'
2590 trigger:
2591 rules:
2592 - metric: openstack_neutron_agents
2593 fields:
2594 service: 'metadata'
2595 state: 'up'
2596 relational_operator: '=='
2597 threshold: 0
2598 window: 60
2599 periods: 0
2600 function: last
2601 - name: 'neutron-openvswitch-all-down'
2602 description: 'All Neutron openvswitch agents are down'
2603 severity: 'down'
2604 no_data_policy: 'skip' # the metric is only collected from the DC node
2605 enabled: 'true'
2606 trigger:
2607 rules:
2608 - metric: openstack_neutron_agents
2609 fields:
2610 service: 'openvswitch'
2611 state: 'up'
2612 relational_operator: '=='
2613 threshold: 0
2614 window: 60
2615 periods: 0
2616 function: last
2617 # At least one worker is down
2618 - name: 'nova-scheduler-one-down'
2619 description: 'At least one Nova scheduler is down'
2620 severity: 'warning'
2621 no_data_policy: 'skip' # the metric is only collected from the DC node
2622 enabled: 'true'
2623 trigger:
2624 rules:
2625 - metric: openstack_nova_services
2626 fields:
2627 service: 'scheduler'
2628 state: 'down'
2629 relational_operator: '>'
2630 threshold: 0
2631 window: 60
2632 periods: 0
2633 function: last
2634 - name: 'nova-cert-one-down'
2635 description: 'At least one Nova cert is down'
2636 severity: 'warning'
2637 no_data_policy: 'skip' # the metric is only collected from the DC node
2638 enabled: 'true'
2639 trigger:
2640 rules:
2641 - metric: openstack_nova_services
2642 fields:
2643 service: 'cert'
2644 state: 'down'
2645 relational_operator: '>'
2646 threshold: 0
2647 window: 60
2648 periods: 0
2649 function: last
2650 - name: 'nova-consoleauth-one-down'
2651 description: 'At least one Nova consoleauth is down'
2652 severity: 'warning'
2653 no_data_policy: 'skip' # the metric is only collected from the DC node
2654 enabled: 'true'
2655 trigger:
2656 rules:
2657 - metric: openstack_nova_services
2658 fields:
2659 service: 'consoleauth'
2660 state: 'down'
2661 relational_operator: '>'
2662 threshold: 0
2663 window: 60
2664 periods: 0
2665 function: last
2666 - name: 'nova-compute-one-down'
2667 description: 'At least one Nova compute is down'
2668 severity: 'warning'
2669 no_data_policy: 'skip' # the metric is only collected from the DC node
2670 enabled: 'true'
2671 trigger:
2672 rules:
2673 - metric: openstack_nova_services
2674 fields:
2675 service: 'compute'
2676 state: 'down'
2677 relational_operator: '>'
2678 threshold: 0
2679 window: 60
2680 periods: 0
2681 function: last
2682 - name: 'nova-conductor-one-down'
2683 description: 'At least one Nova conductor is down'
2684 severity: 'warning'
2685 no_data_policy: 'skip' # the metric is only collected from the DC node
2686 enabled: 'true'
2687 trigger:
2688 rules:
2689 - metric: openstack_nova_services
2690 fields:
2691 service: 'conductor'
2692 state: 'down'
2693 relational_operator: '>'
2694 threshold: 0
2695 window: 60
2696 periods: 0
2697 function: last
2698 - name: 'cinder-scheduler-one-down'
2699 description: 'At least one Cinder scheduler is down'
2700 severity: 'warning'
2701 no_data_policy: 'skip' # the metric is only collected from the DC node
2702 enabled: 'true'
2703 trigger:
2704 rules:
2705 - metric: openstack_cinder_services
2706 fields:
2707 service: 'scheduler'
2708 state: 'down'
2709 relational_operator: '>'
2710 threshold: 0
2711 window: 60
2712 periods: 0
2713 function: last
2714 - name: 'cinder-volume-one-down'
2715 description: 'At least one Cinder volume is down'
2716 severity: 'warning'
2717 no_data_policy: 'skip' # the metric is only collected from the DC node
2718 enabled: 'true'
2719 trigger:
2720 rules:
2721 - metric: openstack_cinder_services
2722 fields:
2723 service: 'volume'
2724 state: 'down'
2725 relational_operator: '>'
2726 threshold: 0
2727 window: 60
2728 periods: 0
2729 function: last
2730 - name: 'neutron-l3-one-down'
2731 description: 'At least one L3 agent is down'
2732 severity: 'warning'
2733 no_data_policy: 'skip' # the metric is only collected from the DC node
2734 enabled: 'true'
2735 trigger:
2736 rules:
2737 - metric: openstack_neutron_agents
2738 fields:
2739 service: 'l3'
2740 state: 'down'
2741 relational_operator: '>'
2742 threshold: 0
2743 window: 60
2744 periods: 0
2745 function: last
2746 - name: 'neutron-dhcp-one-down'
2747 description: 'At least one DHCP agent is down'
2748 severity: 'warning'
2749 no_data_policy: 'skip' # the metric is only collected from the DC node
2750 enabled: 'true'
2751 trigger:
2752 rules:
2753 - metric: openstack_neutron_agents
2754 fields:
2755 service: 'dhcp'
2756 state: 'down'
2757 relational_operator: '>'
2758 threshold: 0
2759 window: 60
2760 periods: 0
2761 function: last
2762 - name: 'neutron-metadata-one-down'
2763 description: 'At least one metadata agent is down'
2764 severity: 'warning'
2765 no_data_policy: 'skip' # the metric is only collected from the DC node
2766 enabled: 'true'
2767 trigger:
2768 rules:
2769 - metric: openstack_neutron_agents
2770 fields:
2771 service: 'metadata'
2772 state: 'down'
2773 relational_operator: '>'
2774 threshold: 0
2775 window: 60
2776 periods: 0
2777 function: last
2778 - name: 'neutron-openvswitch-one-down'
2779 description: 'At least one openvswitch agent is down'
2780 severity: 'warning'
2781 no_data_policy: 'skip' # the metric is only collected from the DC node
2782 enabled: 'true'
2783 trigger:
2784 rules:
2785 - metric: openstack_neutron_agents
2786 fields:
2787 service: 'openvswitch'
2788 state: 'down'
2789 relational_operator: '>'
2790 threshold: 0
2791 window: 60
2792 periods: 0
2793 function: last
2794 # At most 50% of the services are up (compared to up and down).
2795 - name: 'nova-scheduler-majority-down'
2796 description: 'Less than 50% of Nova schedulers are up'
2797 severity: 'critical'
2798 enabled: 'true'
2799 trigger:
2800 rules:
2801 - metric: openstack_nova_services_percent
2802 fields:
2803 service: 'scheduler'
2804 state: 'up'
2805 relational_operator: '<='
2806 threshold: 50
2807 window: 60
2808 periods: 0
2809 function: last
2810 - name: 'nova-cert-majority-down'
2811 description: 'Less than 50% of Nova certs are up'
2812 severity: 'critical'
2813 enabled: 'true'
2814 trigger:
2815 rules:
2816 - metric: openstack_nova_services_percent
2817 fields:
2818 service: 'cert'
2819 state: 'up'
2820 relational_operator: '<='
2821 threshold: 50
2822 window: 60
2823 periods: 0
2824 function: last
2825 - name: 'nova-consoleauth-majority-down'
2826 description: 'Less than 50% of Nova consoleauths are up'
2827 severity: 'critical'
2828 enabled: 'true'
2829 trigger:
2830 rules:
2831 - metric: openstack_nova_services_percent
2832 fields:
2833 service: 'consoleauth'
2834 state: 'up'
2835 relational_operator: '<='
2836 threshold: 50
2837 window: 60
2838 periods: 0
2839 function: last
2840 - name: 'nova-compute-majority-down'
2841 description: 'Less than 50% of Nova computes are up'
2842 severity: 'critical'
2843 enabled: 'true'
2844 trigger:
2845 rules:
2846 - metric: openstack_nova_services_percent
2847 fields:
2848 service: 'compute'
2849 state: 'up'
2850 relational_operator: '<='
2851 threshold: 50
2852 window: 60
2853 periods: 0
2854 function: last
2855 - name: 'nova-conductor-majority-down'
2856 description: 'Less than 50% of Nova conductors are up'
2857 severity: 'critical'
2858 enabled: 'true'
2859 trigger:
2860 rules:
2861 - metric: openstack_nova_services_percent
2862 fields:
2863 service: 'conductor'
2864 state: 'up'
2865 relational_operator: '<='
2866 threshold: 50
2867 window: 60
2868 periods: 0
2869 function: last
2870 - name: 'cinder-scheduler-majority-down'
2871 description: 'Less than 50% of Cinder schedulers are up'
2872 severity: 'critical'
2873 enabled: 'true'
2874 trigger:
2875 rules:
2876 - metric: openstack_cinder_services_percent
2877 fields:
2878 service: 'scheduler'
2879 state: 'up'
2880 relational_operator: '<='
2881 threshold: 50
2882 window: 60
2883 periods: 0
2884 function: last
2885 - name: 'cinder-volume-majority-down'
2886 description: 'Less than 50% of Cinder volumes are up'
2887 severity: 'critical'
2888 enabled: 'true'
2889 trigger:
2890 rules:
2891 - metric: openstack_cinder_services_percent
2892 fields:
2893 service: 'volume'
2894 state: 'up'
2895 relational_operator: '<='
2896 threshold: 50
2897 window: 60
2898 periods: 0
2899 function: last
2900 - name: 'neutron-l3-majority-down'
2901 description: 'Less than 50% of Neutron L3 agents are up'
2902 severity: 'critical'
2903 enabled: 'true'
2904 trigger:
2905 rules:
2906 - metric: openstack_neutron_agents_percent
2907 fields:
2908 service: 'l3'
2909 state: 'up'
2910 relational_operator: '<='
2911 threshold: 50
2912 window: 60
2913 periods: 0
2914 function: last
2915 - name: 'neutron-dhcp-majority-down'
2916 description: 'Less than 50% of Neutron DHCP agents are up'
2917 severity: 'critical'
2918 enabled: 'true'
2919 trigger:
2920 rules:
2921 - metric: openstack_neutron_agents_percent
2922 fields:
2923 service: 'dhcp'
2924 state: 'up'
2925 relational_operator: '<='
2926 threshold: 50
2927 window: 60
2928 periods: 0
2929 function: last
2930 - name: 'neutron-metadata-majority-down'
2931 description: 'Less than 50% of Neutron metadata agents are up'
2932 severity: 'critical'
2933 enabled: 'true'
2934 trigger:
2935 rules:
2936 - metric: openstack_neutron_agents_percent
2937 fields:
2938 service: 'metadata'
2939 state: 'up'
2940 relational_operator: '<='
2941 threshold: 50
2942 window: 60
2943 periods: 0
2944 function: last
2945 - name: 'neutron-openvswitch-majority-down'
2946 description: 'Less than 50% of Neutron openvswitch agents are up'
2947 severity: 'critical'
2948 enabled: 'true'
2949 trigger:
2950 rules:
2951 - metric: openstack_neutron_agents_percent
2952 fields:
2953 service: 'openvswitch'
2954 state: 'up'
2955 relational_operator: '<='
2956 threshold: 50
2957 window: 60
2958 periods: 0
2959 function: last
diff --git a/doc/user/source/configure_alarms.rst b/doc/user/source/configure_alarms.rst
index cbc850c..0c302a9 100644
--- a/doc/user/source/configure_alarms.rst
+++ b/doc/user/source/configure_alarms.rst
@@ -368,12 +368,21 @@ file. This file has the following sections:
368 to that category of nodes. For example:: 368 to that category of nodes. For example::
369 369
370 node_cluster_alarms: 370 node_cluster_alarms:
371 controller: 371 controller-nodes:
372 cpu: ['cpu-critical-controller', 'cpu-warning-controller'] 372 apply_to_node: controller
373 root-fs: ['root-fs-critical', 'root-fs-warning'] 373 alerting: enabled
374 log-fs: ['log-fs-critical', 'log-fs-warning'] 374 members:
375 375 cpu:
376 Creates three alarm groups for the cluster of nodes called 'controller': 376 alarms: ['cpu-critical-controller', 'cpu-warning-controller']
377 root-fs:
378 alarms: ['root-fs-critical', 'root-fs-warning']
379 log-fs:
380 alarms: ['log-fs-critical', 'log-fs-warning']
381 hdd-errors:
382 alerting: enabled_with_notification
383 alarms: ['hdd-errors-critical']
384
385 Creates four alarm groups for the cluster of controller nodes:
377 386
378 * The *cpu* alarm group is mapped to two alarms defined in the ``alarms`` 387 * The *cpu* alarm group is mapped to two alarms defined in the ``alarms``
379 section known as the 'cpu-critical-controller' and 388 section known as the 'cpu-critical-controller' and
@@ -388,6 +397,13 @@ file. This file has the following sections:
388 section known as the 'log-fs-critical' and 'log-fs-warning' alarms. These 397 section known as the 'log-fs-critical' and 'log-fs-warning' alarms. These
389 alarms monitor the file system where the logs are created on the 398 alarms monitor the file system where the logs are created on the
390 controller nodes. 399 controller nodes.
400 * The *hdd-errors* alarm group is mapped to the 'hdd-errors-critical' alarm
401 defined in the ``alarms`` section. This alarm monitors the ``kern.log``
402 log entries containing critical IO errors detected by the kernel.
403 The *hdd-errors* alarm group has the *enabled_with_notification* alerting
404 attribute, meaning that the operator will be notified if any of the
405 controller nodes encounters a disk failure. Other alarms do not trigger
406 notifications per node but only at an aggregated cluster level.
391 407
392 .. note:: An *alarm group* is a mere implementation artifact (although it 408 .. note:: An *alarm group* is a mere implementation artifact (although it
393 has functional value) that is primarily used to distribute the alarms 409 has functional value) that is primarily used to distribute the alarms
@@ -425,7 +441,7 @@ structure of that file.
425 important to keep exactly the same copy of 441 important to keep exactly the same copy of
426 ``/etc/hiera/override/gse_filters.yaml`` across all the nodes of the 442 ``/etc/hiera/override/gse_filters.yaml`` across all the nodes of the
427 OpenStack environment including the node(s) where Nagios is installed. 443 OpenStack environment including the node(s) where Nagios is installed.
428 444
429The aggregation rules and correlation policies are defined in the ``/etc/hiera/override/gse_filters.yaml`` configuration file. 445The aggregation rules and correlation policies are defined in the ``/etc/hiera/override/gse_filters.yaml`` configuration file.
430 446
431This file has the following sections: 447This file has the following sections:
@@ -590,6 +606,7 @@ the service cluster aggregation rules::
590 output_metric_name: cluster_service_status 606 output_metric_name: cluster_service_status
591 interval: 10 607 interval: 10
592 warm_up_period: 20 608 warm_up_period: 20
609 alerting: enabled_with_notification
593 clusters: 610 clusters:
594 nova-api: 611 nova-api:
595 policy: highest_severity 612 policy: highest_severity
@@ -638,6 +655,10 @@ Where
638| The number of seconds after a (re)start that the GSE plugin will wait 655| The number of seconds after a (re)start that the GSE plugin will wait
639 before emitting its metric messages. 656 before emitting its metric messages.
640 657
658| alerting
659| Type: string (one of 'disabled', 'enabled' or 'enabled_with_notification').
660| The alerting configuration of the service clusters.
661
641| clusters 662| clusters
642| Type: list 663| Type: list
643| The list of service clusters that the plugin handles. See 664| The list of service clusters that the plugin handles. See
@@ -720,6 +741,7 @@ cluster aggregation rules::
720 output_metric_name: cluster_node_status 741 output_metric_name: cluster_node_status
721 interval: 10 742 interval: 10
722 warm_up_period: 80 743 warm_up_period: 80
744 alerting: enabled_with_notification
723 clusters: 745 clusters:
724 controller: 746 controller:
725 policy: majority_of_members 747 policy: majority_of_members
@@ -768,6 +790,10 @@ Where
768| The number of seconds after a (re)start that the GSE plugin will wait 790| The number of seconds after a (re)start that the GSE plugin will wait
769 before emitting its metric messages. 791 before emitting its metric messages.
770 792
793| alerting
794| Type: string (one of 'disabled', 'enabled' or 'enabled_with_notification').
795| The alerting configuration of the node clusters.
796
771| clusters 797| clusters
772| Type: list 798| Type: list
773| The list of node clusters that the plugin handles. See 799| The list of node clusters that the plugin handles. See
diff --git a/doc/user/source/release_notes.rst b/doc/user/source/release_notes.rst
index fd4e580..838dc27 100644
--- a/doc/user/source/release_notes.rst
+++ b/doc/user/source/release_notes.rst
@@ -10,6 +10,34 @@ Release notes
10Version 1.0.0 10Version 1.0.0
11+++++++++++++ 11+++++++++++++
12 12
13The StackLight Collector plugin 1.0.0 for Fuel contains the following updates:
14
15New alarms:
16
17 * Monitor RabbitMQ from the Pacemaker point of view
18 * Monitor all partitions and OSD disk(s)
19 * Horizon HTTP 5xx errors
20 * Keystone slow response times
21 * HDD errors
22 * SWAP percent usage
23 * Network packet drops
24 * Local OpenStack API checks
25 * Local checks for services: Apache, Memcached, MySQL, RabbitMQ, Pacemaker
26
27Alarm enhancements:
28
29 * Added the ``group by`` attribute support for alarm rules
30 * Added support for ``pattern matching`` to filter metric dimensions
31
32Bug fixes:
33
34 * Fixed the concurrent execution of logrotate.
35 See `#1455104 <https://bugs.launchpad.net/lma-toolchain/+bug/1455104>`_.
36 * Implemented the capability for the Elasticsearch bulk size to increase when
37 required. See `#1617211 <https://bugs.launchpad.net/lma-toolchain/+bug/1617211>`_.
38 * Implemented the capability to use the RabbitMQ management API in place of the
39 :command:`rabbitmqctl` command.
40
13Version 0.10.0 41Version 0.10.0
14++++++++++++++ 42++++++++++++++
15 43