--- # # Copyright (c) 2013-2018 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # ############################################################################ # # ALARM & CUSTOMER LOG DOCUMENTATION # ############################################################################ ############################################################################ # # Record Format ... for documentation # # 100.001: # Type: < Alarm | Log > # Description: < yaml string > # OR # [ < yaml string >, // list of yaml strings # < yaml string > ] # OR # critical: < yaml string > // i.e. dictionary of yaml strings indexed by severity # major: < yaml string > # minor: < yaml string > # warning: < yaml string > # Entity_Instance_ID: < yaml string ... e.g. host=.interface= > # OR # [ < yaml string >, // list of yaml strings # < yaml string > ] # Severity: < critical | major | minor | warning > # OR # [ critical, major ] // list of severity values # Proposed_Repair_Action: < yaml string > // NOTE ALARM ONLY FIELD # OR # critical: < yaml string > // i.e. dictionary of yaml strings indexed by severity # major: < yaml string > # minor: < yaml string > # warning: < yaml string > # Maintenance_Action: < yaml string > // NOTE ALARM ONLY FIELD # OR # critical: < yaml string > // i.e. dictionary of yaml strings indexed by severity # major: < yaml string > # minor: < yaml string > # warning: < yaml string > # Inhibit_Alarms: < True | False > // NOTE ALARM ONLY FIELD # Alarm_Type: < operational-violation | ... > # Probable_Cause: < timing-problem | ... > # OR # [ < timing-problem | ... >, // list of probable-causes # < timing-problem | ... 
> ] # Service_Affecting: < True | False > # Suppression: < True | False > // NOTE ALARM ONLY FIELD # Management_Affecting_Severity: < none | critical | major | minor | warning > # // lowest alarm level of this type that will block forced upgrades & orchestration actions # Degrade_Affecting_Severity: < none | critical | major | minor > # // lowest alarm level of this type sets a host to 'degraded' # # # Other Notes: # - use general record format above # - the only dictionaries allowed are ones indexed by severity # - if there are multiple lists in a record, # then they should all have the same # of items and corresponding list items represent instance of alarm # - if you can't describe the alarm/log based on the above rules, # then you can use a multi-line string format # - DELETING alarms from events.yaml: alarms should only be deleted when going to a new Titanium Cloud release # - if all possible alarm severities are mgmt affecting, the convention is to # use 'warning' as the Management_Affecting_Severity, even if warning is not a possible severity for that alarm # # Testing: # - Testing of events.yaml can be done by running regular make command # and specifying fm-doc: # nice -n 20 ionice -c Idle make -C build fm-doc.rebuild # - When building, events.yaml will be parsed for correct format, and also # to ensure that Alarm IDs defined in constants.py and fmAlarm.h are # listed in events.yaml # ############################################################################ #--------------------------------------------------------------------------- # Monitored Resource Alarms #--------------------------------------------------------------------------- 100.101: Type: Alarm Description: |- Platform CPU threshold exceeded; threshold x%, actual y% . CRITICAL @ 95% MAJOR @ 90% MINOR @ 80% Entity_Instance_ID: host= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." 
Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: major Degrade_Affecting_Severity: critical 100.102: Type: Alarm Description: |- VSwitch CPU threshold exceeded; threshold x%, actual y% . CRITICAL @ 95% MAJOR @ 90% MINOR @ 80% Entity_Instance_ID: host= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none 100.103: Type: Alarm Description: |- Memory threshold exceeded; threshold x%, actual y% . CRITICAL @ 90% MAJOR @ 80% MINOR @ 70% Entity_Instance_ID: host= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support; may require additional memory on Host." Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: critical 100.104: # NOTE This should really be split into two different Alarms. Type: Alarm Description: |- host=.filesystem= File System threshold exceeded; threshold x%, actual y% . CRITICAL @ 90% MAJOR @ 80% MINOR @ 70% OR host=.volumegroup= Monitor and if condition persists, consider adding additional physical volumes to the volume group. Entity_Instance_ID: |- host=.filesystem= OR host=.volumegroup= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." 
Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: critical Degrade_Affecting_Severity: critical #-------- # 100.105: Retired (with R2 release): previously monitored /etc/nova/instances # NFS mount from controller to computes #-------- 100.106: Type: Alarm Description: "'OAM' Port failed." Entity_Instance_ID: host=.port= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 100.107: Type: Alarm Description: |- 'OAM' Interface degraded. OR 'OAM' Interface failed. Entity_Instance_ID: host=.interface= Severity: [critical, major] Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 100.108: Type: Alarm Description: "'MGMT' Port failed." Entity_Instance_ID: host=.port= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 100.109: Type: Alarm Description: |- 'MGMT' Interface degraded. OR 'MGMT' Interface failed. 
Entity_Instance_ID: host=.interface= Severity: [critical, major] Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 100.110: Type: Alarm Description: "'CLUSTER-HOST' Port failed." Entity_Instance_ID: host=.port= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 100.111: Type: Alarm Description: |- 'CLUSTER-HOST' Interface degraded. OR 'CLUSTER-HOST' Interface failed. Entity_Instance_ID: host=.interface= Severity: [critical, major] Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 100.112: Type: Alarm Description: "'DATA-VRS' Port down." Entity_Instance_ID: host=.port= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: major 100.113: Type: Alarm Description: |- 'DATA-VRS' Interface degraded. OR 'DATA-VRS' Interface down. 
Entity_Instance_ID: host=.interface= Severity: [critical, major] Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: major 100.114: Type: Alarm Description: major: "NTP configuration does not contain any valid or reachable NTP servers." minor: "NTP address is not a valid or a reachable NTP server." Entity_Instance_ID: major: host=.ntp minor: host=.ntp= Severity: [major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." Maintenance_Action: none Inhibit_Alarms: Alarm_Type: communication Probable_Cause: unknown Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 100.115: Type: Alarm Description: "VSwitch Memory Usage, processor threshold exceeded; threshold x%, actual y% ." Entity_Instance_ID: host=.processor= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: critical 100.116: Type: Alarm Description: "Cinder LVM Thinpool Usage threshold exceeded; threshold x%, actual y% ." Entity_Instance_ID: host= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." 
Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: critical 100.117: Type: Alarm Description: "Nova LVM Thinpool Usage threshold exceeded; threshold x%, actual y% ." Entity_Instance_ID: host= Severity: [critical, major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." Maintenance_Action: critical: degrade major: degrade Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: True Management_Affecting_Severity: major Degrade_Affecting_Severity: critical 100.118: Type: Alarm Description: Controller cannot establish connection with remote logging server. Entity_Instance_ID: host= Severity: minor Proposed_Repair_Action: "Ensure Remote Log Server IP is reachable from Controller through OAM interface; otherwise contact next level of support." Maintenance_Action: none Inhibit_Alarms: False Alarm_Type: communication Probable_Cause: communication-subsystem-failure Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 100.119: Type: Alarm Description: major: "PTP configuration or out-of-tolerance timestamping conditions" minor: "PTP out-of-tolerance timestamping condition" Entity_Instance_ID: |- host=.ptp OR host=.ptp=no-lock OR host=.ptp=.unsupported=hardware-timestamping OR host=.ptp=.unsupported=software-timestamping OR host=.ptp=.unsupported=legacy-timestamping OR host=.ptp=out-of-tolerance Severity: [major, minor] Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." 
Maintenance_Action: none Inhibit_Alarms: Alarm_Type: communication Probable_Cause: unknown Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # MAINTENANCE #--------------------------------------------------------------------------- 200.001: Type: Alarm Description: was administratively locked to take it out-of-service. Entity_Instance_ID: host= Severity: warning Proposed_Repair_Action: Administratively unlock Host to bring it back in-service. Maintenance_Action: none Inhibit_Alarms: True Alarm_Type: operational-violation Probable_Cause: out-of-service Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 200.004: Type: Alarm Description: |- experienced a service-affecting failure. Host is being auto recovered by Reboot. Entity_Instance_ID: host= Severity: critical Proposed_Repair_Action: If auto-recovery is consistently unable to recover host to the unlocked-enabled state contact next level of support or lock and replace failing host. Maintenance_Action: auto recover Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: application-subsystem-failure Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 200.011: Type: Alarm Description: experienced a configuration failure during initialization. Host is being re-configured by Reboot. Entity_Instance_ID: host= Severity: critical Proposed_Repair_Action: If auto-recovery is consistently unable to recover host to the unlocked-enabled state contact next level of support or lock and replace failing host. 
Maintenance_Action: auto-recover Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: configuration-or-customization-error Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 200.010: Type: Alarm Description: access to board management module has failed. Entity_Instance_ID: host= Severity: warning Proposed_Repair_Action: Check Host's board management configuration and connectivity. Maintenance_Action: auto recover Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: communication-subsystem-failure Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 200.012: Type: Alarm Description: controller function has in-service failure while compute services remain healthy. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Lock and then Unlock host to recover. Avoid using 'Force Lock' action as that will impact compute services running on this host. If lock action fails then contact next level of support to investigate and recover. Maintenance_Action: "degrade - requires manual action" Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: communication-subsystem-failure Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 200.013: Type: Alarm Description: compute service of the only available controller is not operational. Auto-recovery is disabled. Degrading host instead. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Enable second controller and Switch Activity (Swact) over to it as soon as possible. Then Lock and Unlock host to recover its local compute service. 
Maintenance_Action: "degrade - requires manual action" Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: communication-subsystem-failure Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 200.005: Type: Alarm Description: |- Degrade: is experiencing intermittent 'Management Network' communication failures that have exceeded its lower alarming threshold. Failure: is experiencing a persistent critical 'Management Network' communication failure. Entity_Instance_ID: host= Severity: [critical, major] Proposed_Repair_Action: "Check 'Management Network' connectivity and support for multicast messaging. If problem consistently occurs after that and Host is reset, then contact next level of support or lock and replace failing host." Maintenance_Action: auto recover Inhibit_Alarms: False Alarm_Type: communication Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 200.009: Type: Alarm Description: |- Degrade: is experiencing intermittent 'Cluster-host Network' communication failures that have exceeded its lower alarming threshold. Failure: is experiencing a persistent critical 'Cluster-host Network' communication failure. Entity_Instance_ID: host= Severity: [critical, major] Proposed_Repair_Action: "Check 'Cluster-host Network' connectivity and support for multicast messaging. If problem consistently occurs after that and Host is reset, then contact next level of support or lock and replace failing host." Maintenance_Action: auto recover Inhibit_Alarms: False Alarm_Type: communication Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 200.006: Type: Alarm Description: |- Main Process Monitor Daemon Failure (major): 'Process Monitor' (pmond) process is not running or functioning properly. 
The system is trying to recover this process. Monitored Process Failure (critical/major/minor): Critical: critical '' process has failed and could not be auto-recovered gracefully. Auto-recovery progression by host reboot is required and in progress. Major: is degraded due to the failure of its '' process. Auto recovery of this major process is in progress. Minor: '' process has failed. Auto recovery of this minor process is in progress. OR '' process has failed. Manual recovery is required. Entity_Instance_ID: host=.process= Severity: [critical, major, minor] Proposed_Repair_Action: |- If this alarm does not automatically clear after some time and continues to be asserted after Host is locked and unlocked then contact next level of support for root cause analysis and recovery. If problem consistently occurs after Host is locked and unlocked then contact next level of support for root cause analysis and recovery. Maintenance_Action: critical: auto-recover major: degrade minor: Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: critical: True major: True minor: False Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major # 200.006: // NOTE using duplicate ID of a completely analogous Alarm for this # Type: Log # Description: |- # Main Process Monitor Daemon Failure (major) # 'Process Monitor' (pmond) process is not running or functioning properly. # The system is trying to recover this process. # # Monitored Process Failure (critical/major/minor) # critical: critical '' process has failed and could not be auto-recovered gracefully. # Auto-recovery progression by host reboot is required and in progress. # major: is degraded due to the failure of its '' process. Auto recovery of this major process is in progress. # minor: '' process has failed. Auto recovery of this minor process is in progress. # OR # '' process has failed. Manual recovery is required. 
# Entity_Instance_ID: host=.process= # Severity: minor # Alarm_Type: other # Probable_Cause: unspecified-reason # Service_Affecting: True 200.007: Type: Alarm Description: critical: "Host is degraded due to a 'critical' out-of-tolerance reading from the '' sensor" major: "Host is degraded due to a 'major' out-of-tolerance reading from the '' sensor" minor: "Host is reporting a 'minor' out-of-tolerance reading from the '' sensor" Entity_Instance_ID: host=.sensor= Severity: [critical, major, minor] Proposed_Repair_Action: "If problem consistently occurs after Host is power cycled and or reset, contact next level of support or lock and replace failing host." Maintenance_Action: critical: degrade major: degrade minor: auto-recover (polling) Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: critical: True major: False minor: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: critical 200.014: Type: Alarm Description: "The Hardware Monitor was unable to load, configure and monitor one or more hardware sensors." Entity_Instance_ID: host= Severity: minor Proposed_Repair_Action: Check Board Management Controller provisioning. Try reprovisioning the BMC. If problem persists try power cycling the host and then the entire server including the BMC power. If problem persists then contact next level of support. Maintenance_Action: None Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none 200.015: Type: Alarm Description: Unable to read one or more sensor groups from this host's board management controller Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Check board management connectivity and try rebooting the board management controller. If problem persists contact next level of support or lock and replace failing host. 
Maintenance_Action: None Inhibit_Alarms: False Alarm_Type: operational-violation Probable_Cause: unknown Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 200.020: Type: Log Description: [" has been 'discovered' on the network", " has been 'added' to the system", " has 'entered' multi-node failure avoidance", " has 'exited' multi-node failure avoidance"] Entity_Instance_ID: [host=.event=discovered, host=.event=add, host=.event=mnfa_enter, host=.event=mnfa_exit] Severity: warning Alarm_Type: other Probable_Cause: unspecified-reason Service_Affecting: True 200.021: Type: Log Description: [" board management controller has been 'provisioned'", " board management controller has been 're-provisioned'", " board management controller has been 'de-provisioned'", " manual 'unlock' request", " manual 'reboot' request", " manual 'reset' request", " manual 'power-off' request", " manual 'power-on' request", " manual 'reinstall' request", " manual 'force-lock' request", " manual 'delete' request", " manual 'controller switchover' request"] Entity_Instance_ID: [host=.command=provision, host=.command=reprovision, host=.command=deprovision, host=.command=unlock, host=.command=reboot, host=.command=reset, host=.command=power-off, host=.command=power-on, host=.command=reinstall, host=.command=force-lock, host=.command=delete, host=.command=swact] Severity: warning Alarm_Type: other Probable_Cause: unspecified-reason Service_Affecting: False 200.022: Type: Log Description: [" is now 'disabled'", " is now 'enabled'", " is now 'online'", " is now 'offline'", " is 'disabled-failed' to the system", " reinstall failed", " reinstall completed successfully"] Entity_Instance_ID: [host=.state=disabled, host=.state=enabled, host=.status=online, host=.status=offline, host=.status=failed, host=.status=reinstall-failed, host=.status=reinstall-complete] Severity: warning Alarm_Type: other Probable_Cause: unspecified-reason 
Service_Affecting: True #--------------------------------------------------------------------------- # BACKUP AND RESTORE #--------------------------------------------------------------------------- 210.001: Type: Alarm Description: System Backup in progress. Entity_Instance_ID: host=controller Severity: minor Proposed_Repair_Action: No action required. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # SYSTEM CONFIGURATION #--------------------------------------------------------------------------- 250.001: Type: Alarm Description: Configuration is out-of-date. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Administratively lock and unlock to update config. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 250.002: Type: Alarm Description: Ceph cache tiering configuration is out-of-date. Entity_Instance_ID: cluster= Severity: major Proposed_Repair_Action: Apply Ceph service parameter settings. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 250.003: Type: Alarm Description: Kubernetes certificates rotation failed on host . Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Rotate kubernetes certificates manually. 
Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # VM Compute Services #--------------------------------------------------------------------------- 270.001: Type: Alarm Description: "Host compute services failure[, reason = ]" Entity_Instance_ID: host=.services=compute Severity: critical Proposed_Repair_Action: Wait for host services recovery to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 270.101: Type: Log Description: "Host compute services failure[, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 270.102: Type: Log Description: Host compute services enabled Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 270.103: Type: Log Description: Host compute services disabled Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 275.001: Type: Log Description: Host hypervisor is now - Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False #--------------------------------------------------------------------------- # DISTRIBUTED CLOUD #--------------------------------------------------------------------------- 280.001: Type: Alarm Description: is offline Entity_Instance_ID: subcloud= Severity: critical Proposed_Repair_Action: Wait for subcloud to 
become online; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: communication Probable_Cause: loss-of-signal Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 280.002: Type: Alarm Description: sync_status is out-of-sync Entity_Instance_ID: [subcloud=.resource=] Severity: major Proposed_Repair_Action: If problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: other Probable_Cause: application-subsystem-failure Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # NETWORK #--------------------------------------------------------------------------- 300.001: Type: Alarm Description: "'Data' Port failed." Entity_Instance_ID: host=.port= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: loss-of-signal Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 300.002: Type: Alarm Description: |- 'Data' Interface degraded. OR 'Data' Interface failed. Entity_Instance_ID: host=.interface= Severity: [critical, major] Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: loss-of-signal Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: critical 300.003: Type: Alarm Description: Networking Agent not responding. Entity_Instance_ID: host=.agent= Severity: major Proposed_Repair_Action: "If condition persists, attempt to clear issue by administratively locking and unlocking the Host." 
Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 300.004: Type: Alarm Description: No enabled compute host with connectivity to provider network. Entity_Instance_ID: service=networking.providernet= Severity: major Proposed_Repair_Action: Enable compute hosts with required provider network connectivity. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 300.005: Type: Alarm Description: |- Communication failure detected over provider network x% for ranges y% on host z%. OR Communication failure detected over provider network x% on host z%. Entity_Instance_ID: host=.service=networking.providernet= Severity: major Proposed_Repair_Action: Check neighbour switch port VLAN assignments. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 300.010: Type: Alarm Description: |- ML2 Driver Agent non-reachable OR ML2 Driver Agent reachable but non-responsive OR ML2 Driver Agent authentication failure OR ML2 Driver Agent is unable to sync Neutron database Entity_Instance_ID: host=.ml2driver= Severity: major Proposed_Repair_Action: "Monitor and if condition persists, contact next level of support." Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 300.012: Type: Alarm Description: "Openflow Controller connection failed." 
Entity_Instance_ID: host=.openflow-controller= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: loss-of-signal Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: critical 300.013: Type: Alarm Description: |- No active Openflow controller connections found for this network. OR One or more Openflow controller connections in disconnected state for this network. Entity_Instance_ID: host=.openflow-network= Severity: [critical, major] Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: loss-of-signal Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: critical 300.014: Type: Alarm Description: "OVSDB Manager connection failed." Entity_Instance_ID: host=.sdn-controller= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: loss-of-signal Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: critical 300.015: Type: Alarm Description: "No active OVSDB connections found." Entity_Instance_ID: host= Severity: critical Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: loss-of-signal Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: critical 300.016: Type: Alarm Description: "Dynamic routing agent x% lost connectivity to peer y%." 
Entity_Instance_ID: host=,agent=,bgp-peer= Severity: major Proposed_Repair_Action: If condition persists, fix connectivity to peer. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: loss-of-signal Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # HIGH AVAILABILITY #--------------------------------------------------------------------------- 400.001: Type: Alarm Description: |- Service group failure; . OR Service group degraded; . OR Service group warning; . Entity_Instance_ID: service_domain=.service_group=.host= Severity: [critical, major, minor] Proposed_Repair_Action: Contact next level of support. Maintenance_Action: Inhibit_Alarms: False Alarm_Type: processing-error Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: major 400.002: Type: Alarm Description: |- Service group loss of redundancy; expected standby member but only standby member available. OR Service group loss of redundancy; expected standby member but only standby member available. OR Service group loss of redundancy; expected active member but no active members available. OR Service group loss of redundancy; expected active member but only active member available. Entity_Instance_ID: service_domain=.service_group= Severity: major Proposed_Repair_Action: "Bring a controller node back in to service, otherwise contact next level of support." Maintenance_Action: Inhibit_Alarms: False Alarm_Type: processing-error Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 400.003: Type: Alarm Description: |- License key is not installed; a valid license key is required for operation. 
OR License key has expired or is invalid; a valid license key is required for operation. OR Evaluation license key will expire on ; there are days remaining in this evaluation. OR Evaluation license key will expire on ; there is only 1 day remaining in this evaluation. Entity_Instance_ID: host= Severity: critical Proposed_Repair_Action: Contact next level of support to obtain a new license key. Maintenance_Action: Inhibit_Alarms: False Alarm_Type: processing-error Probable_Cause: key-expired Service_Affecting: True Suppression: False Management_Affecting_Severity: critical Degrade_Affecting_Severity: none # 400.004: // NOTE Removed # Type: Alarm # Description: Service group software modification detected; . # Entity_Instance_ID: host= # Severity: major # Proposed_Repair_Action: Contact next level of support. # Maintenance_Action: # Inhibit_Alarms: False # Alarm_Type: processing-error # Probable_Cause: software-program-error # Service_Affecting: True # Suppression: False 400.005: Type: Alarm Description: |- Communication failure detected with peer over port . OR Communication failure detected with peer over port within the last 30 seconds. Entity_Instance_ID: host=.network= Severity: major Proposed_Repair_Action: Check cabling and far-end port configuration and status on adjacent equipment. 
Maintenance_Action: Inhibit_Alarms: False Alarm_Type: communication Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # SM #--------------------------------------------------------------------------- 401.001: Type: Log Description: Service group state change from to on host Entity_Instance_ID: service_domain=.service_group=.host= Severity: critical Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True 401.002: Type: Log Description: |- Service group loss of redundancy; expected standby member but no standby members available or Service group loss of redundancy; expected standby member but only standby member(s) available or Service group has no active members available; expected active member(s) or Service group loss of redundancy; expected active member(s) but only active member(s) available Entity_Instance_ID: service_domain=.service_group= Severity: critical Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True 401.003: Type: Log Description: |- License key has expired or is invalid or Evaluation license key will expire on or License key is valid Entity_Instance_ID: host= Severity: critical Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True 401.005: Type: Log Description: |- Communication failure detected with peer over port on host or Communication failure detected with peer over port on host within the last seconds or Communication established with peer over port on host Entity_Instance_ID: host=.network= Severity: critical Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True 401.007: Type: Log Description: Swact or swact-force Entity_Instance_ID: host= Severity: critical Alarm_Type: processing-error Probable_Cause: unspecified-reason 
Service_Affecting: True #--------------------------------------------------------------------------- # SECURITY #--------------------------------------------------------------------------- 500.100: Type: Alarm Description: TPM initialization failed on host. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Reinstall HTTPS certificate; if problem persists contact next level of support. Maintenance_Action: degrade Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: procedural-error Service_Affecting: True Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 500.101: Type: Alarm Description: Developer patch certificate enabled. Entity_Instance_ID: host=controller Severity: critical Proposed_Repair_Action: Reinstall system to disable developer certificate and remove untrusted patches. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 500.500: Type: Log Description: "Host has IMA Appraisal failure for service when executing , reason = ]" Entity_Instance_ID: host=.service= Severity: major Alarm_Type: integrity-violation Probable_Cause: information-modification-detected Service_Affecting: False #--------------------------------------------------------------------------- # VM #--------------------------------------------------------------------------- 700.001: Type: Alarm Description: |- Instance owned by has failed on host Instance owned by has failed to schedule Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: The system will attempt recovery; no repair action required Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: software-error Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.002: Type: Alarm Description: Instance owned
by is paused on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Unpause the instance Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: procedural-error Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.003: Type: Alarm Description: Instance owned by is suspended on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Resume the instance Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: procedural-error Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.004: Type: Alarm Description: Instance owned by is stopped on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Start the instance Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: procedural-error Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.005: Type: Alarm Description: Instance owned by is rebooting on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Wait for reboot to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.006: Type: Alarm Description: Instance owned by is rebuilding on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Wait for rebuild to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning 
Degrade_Affecting_Severity: none 700.007: Type: Alarm Description: Instance owned by is evacuating from host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Wait for evacuate to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.008: Type: Alarm Description: Instance owned by is live migrating from host Entity_Instance_ID: tenant=.instance= Severity: warning Proposed_Repair_Action: Wait for live migration to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.009: Type: Alarm Description: Instance owned by is cold migrating from host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Wait for cold migration to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.010: Type: Alarm Description: Instance owned by has been cold-migrated to host waiting for confirmation Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Confirm or revert cold-migrate of instance Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.011: Type: Alarm Description: Instance owned by is reverting cold migrate to host Entity_Instance_ID: tenant=.instance= 
Severity: critical Proposed_Repair_Action: "Wait for cold migration revert to complete; if problem persists contact next level of support" Maintenance_Action: Inhibit_Alarms: Alarm_Type: other Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.012: Type: Alarm Description: Instance owned by is resizing on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Wait for resize to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.013: Type: Alarm Description: Instance owned by has been resized on host waiting for confirmation Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: Confirm or revert resize of instance Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.014: Type: Alarm Description: Instance owned by is reverting resize on host Entity_Instance_ID: tenant=.instance= Severity: critical Proposed_Repair_Action: "Wait for resize revert to complete; if problem persists contact next level of support" Maintenance_Action: Inhibit_Alarms: Alarm_Type: other Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.015: Type: Alarm Description: Guest Heartbeat not established for instance owned by on host Entity_Instance_ID: tenant=.instance= Severity: major Proposed_Repair_Action: "Verify that the instance is running the Guest-Client daemon, or disable Guest Heartbeat for the instance if no longer needed, otherwise contact next
level of support" Maintenance_Action: Inhibit_Alarms: Alarm_Type: communication Probable_Cause: procedural-error Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.016: Type: Alarm Description: Multi-Node Recovery Mode Entity_Instance_ID: subsystem=vim Severity: minor Proposed_Repair_Action: "Wait for the system to exit out of this mode" Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 700.017: Type: Alarm Description: Server group policy was not satisfied Entity_Instance_ID: server-group Severity: minor Proposed_Repair_Action: "Migrate instances in an attempt to satisfy the policy; if problem persists contact next level of support" Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: procedural-error Service_Affecting: True Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none 700.101: Type: Log Description: Instance is enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.102: Type: Log Description: Instance owned by has failed[, reason = ] Instance owned by has failed to schedule[, reason = ] Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.103: Type: Log Description: Create issued |by the system> against owned by Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.104: Type: Log Description: Creating instance owned by Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.105: Type: Log Description: "Create 
rejected for instance [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.106: Type: Log Description: "Create cancelled for instance [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.107: Type: Log Description: "Create failed for instance [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.108: Type: Log Description: Instance owned by has been created Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.109: Type: Log Description: "Delete issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.110: Type: Log Description: Deleting instance owned by Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.111: Type: Log Description: "Delete rejected for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.112: Type: Log Description: "Delete cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.113: Type: Log Description: "Delete failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.114: Type: Log Description: Deleted instance owned by
Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.115: Type: Log Description: "Pause issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.116: Type: Log Description: Pause inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.117: Type: Log Description: "Pause rejected for instance enabled on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.118: Type: Log Description: "Pause cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.119: Type: Log Description: "Pause failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.120: Type: Log Description: Pause complete for instance now paused on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.121: Type: Log Description: "Unpause issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.122: Type: Log Description: Unpause inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.123: Type: Log Description: "Unpause 
rejected for instance paused on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.124: Type: Log Description: "Unpause cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.125: Type: Log Description: "Unpause failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.126: Type: Log Description: Unpause complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.127: Type: Log Description: "Suspend issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.128: Type: Log Description: Suspend inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.129: Type: Log Description: "Suspend rejected for instance enabled on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.130: Type: Log Description: "Suspend cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.131: Type: Log Description: "Suspend failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: 
False 700.132: Type: Log Description: Suspend complete for instance now suspended on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.133: Type: Log Description: "Resume issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.134: Type: Log Description: Resume inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.135: Type: Log Description: "Resume rejected for instance suspended on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.136: Type: Log Description: "Resume cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.137: Type: Log Description: "Resume failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.138: Type: Log Description: Resume complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.139: Type: Log Description: "Start issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.140: Type: Log Description: Start inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment 
Probable_Cause: unspecified-reason Service_Affecting: False 700.141: Type: Log Description: "Start rejected for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.142: Type: Log Description: "Start cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.143: Type: Log Description: "Start failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.144: Type: Log Description: Start complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.145: Type: Log Description: "Stop issued |by the system|by the instance> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.146: Type: Log Description: Stop inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.147: Type: Log Description: "Stop rejected for instance enabled on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.148: Type: Log Description: "Stop cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.149: Type: Log Description: "Stop failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= 
Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.150: Type: Log Description: Stop complete for instance now disabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.151: Type: Log Description: "Live-Migrate issued |by the system> against instance owned by from host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.152: Type: Log Description: Live-Migrate inprogress for instance from host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.153: Type: Log Description: "Live-Migrate rejected for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.154: Type: Log Description: "Live-Migrate cancelled for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.155: Type: Log Description: "Live-Migrate failed for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.156: Type: Log Description: Live-Migrate complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.157: Type: Log Description: "Cold-Migrate issued |by the system> against instance owned by from host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.158: Type: 
Log Description: Cold-Migrate inprogress for instance from host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.159: Type: Log Description: "Cold-Migrate rejected for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.160: Type: Log Description: "Cold-Migrate cancelled for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.161: Type: Log Description: "Cold-Migrate failed for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.162: Type: Log Description: Cold-Migrate complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.163: Type: Log Description: "Cold-Migrate-Confirm issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.164: Type: Log Description: Cold-Migrate-Confirm inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.165: Type: Log Description: "Cold-Migrate-Confirm rejected for instance now enabled on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.166: Type: Log Description: "Cold-Migrate-Confirm cancelled for instance on host [, reason = ]" Entity_Instance_ID: 
tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.167: Type: Log Description: "Cold-Migrate-Confirm failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.168: Type: Log Description: Cold-Migrate-Confirm complete for instance enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.169: Type: Log Description: "Cold-Migrate-Revert issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.170: Type: Log Description: Cold-Migrate-Revert inprogress for instance from host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.171: Type: Log Description: "Cold-Migrate-Revert rejected for instance now on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.172: Type: Log Description: "Cold-Migrate-Revert cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.173: Type: Log Description: "Cold-Migrate-Revert failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.174: Type: Log Description: Cold-Migrate-Revert complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason 
Service_Affecting: False 700.175: Type: Log Description: "Evacuate issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.176: Type: Log Description: Evacuating instance owned by from host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.177: Type: Log Description: "Evacuate rejected for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.178: Type: Log Description: "Evacuate cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.179: Type: Log Description: "Evacuate failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.180: Type: Log Description: Evacuate complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.181: Type: Log Description: "Reboot <(soft-reboot)|(hard-reboot)> issued |by the system|by the instance> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.182: Type: Log Description: Reboot inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.183: Type: Log Description: "Reboot rejected for instance on host [, reason = ]" Entity_Instance_ID: 
tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.184: Type: Log Description: "Reboot cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.185: Type: Log Description: "Reboot failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.186: Type: Log Description: Reboot complete for instance now enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.187: Type: Log Description: "Rebuild issued |by the system> against instance using image on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.188: Type: Log Description: Rebuild inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.189: Type: Log Description: "Rebuild rejected for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.190: Type: Log Description: "Rebuild cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.191: Type: Log Description: "Rebuild failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.192: Type: Log Description: Rebuild complete for instance now enabled 
on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.193: Type: Log Description: "Resize issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.194: Type: Log Description: Resize inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.195: Type: Log Description: "Resize rejected for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.196: Type: Log Description: "Resize cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.197: Type: Log Description: "Resize failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.198: Type: Log Description: Resize complete for instance enabled on host waiting for confirmation Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.199: Type: Log Description: "Resize-Confirm issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.200: Type: Log Description: Resize-Confirm inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 
700.201: Type: Log Description: "Resize-Confirm rejected for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.202: Type: Log Description: "Resize-Confirm cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.203: Type: Log Description: "Resize-Confirm failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.204: Type: Log Description: Resize-Confirm complete for instance enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.205: Type: Log Description: "Resize-Revert issued |by the system> against instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.206: Type: Log Description: Resize-Revert inprogress for instance on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.207: Type: Log Description: "Resize-Revert rejected for instance owned by on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.208: Type: Log Description: "Resize-Revert cancelled for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.209: Type: Log Description: "Resize-Revert failed for instance on host [, reason = ]" Entity_Instance_ID: tenant=.instance= 
Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.210: Type: Log Description: Resize-Revert complete for instance enabled on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.211: Type: Log Description: Guest Heartbeat established for instance on host Entity_Instance_ID: tenant=.instance= Severity: major Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.212: Type: Log Description: Guest Heartbeat disconnected for instance on host Entity_Instance_ID: tenant=.instance= Severity: major Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.213: Type: Log Description: "Guest Heartbeat failed for instance [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.214: Type: Log Description: Instance has been renamed to owned by on host Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.215: Type: Log Description: "Guest Health Check failed for instance [, reason = ]" Entity_Instance_ID: tenant=.instance= Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.216: Type: Log Description: "Entered Multi-Node Recovery Mode" Entity_Instance_ID: subsystem=vim Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 700.217: Type: Log Description: "Exited Multi-Node Recovery Mode" Entity_Instance_ID: subsystem=vim Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False #--------------------------------------------------------------------------- # APPLICATION #--------------------------------------------------------------------------- 
750.001: Type: Alarm Description: "Application Upload Failure" Entity_Instance_ID: k8s_application= Severity: warning Proposed_Repair_Action: "Check system inventory log for cause." Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unknown Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none 750.002: Type: Alarm Description: "Application Apply Failure" Entity_Instance_ID: k8s_application= Severity: major Proposed_Repair_Action: "Retry applying the application. If the issue persists, please check system inventory log for cause." Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none 750.003: Type: Alarm Description: "Application Remove Failure" Entity_Instance_ID: k8s_application= Severity: major Proposed_Repair_Action: "Retry removing the application. If the issue persists, please check system inventory log for cause." Maintenance_Action: Inhibit_Alarms: Alarm_Type: processing-error Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none 750.004: Type: Alarm Description: "Application Apply In Progress" Entity_Instance_ID: k8s_application= Severity: warning Proposed_Repair_Action: "No action required." Maintenance_Action: Inhibit_Alarms: Alarm_Type: other Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 750.005: Type: Alarm Description: "Application Update In Progress" Entity_Instance_ID: k8s_application= Severity: warning Proposed_Repair_Action: "No action required." 
Maintenance_Action: Inhibit_Alarms: Alarm_Type: other Probable_Cause: unknown Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 750.006: Type: Alarm Description: "Automatic Application Re-Apply Is Pending" Entity_Instance_ID: k8s_application= Severity: warning Proposed_Repair_Action: "Ensure all hosts are either locked or unlocked. When the system is stable, the application will be automatically reapplied." Maintenance_Action: Inhibit_Alarms: Alarm_Type: other Probable_Cause: unknown Service_Affecting: False Suppression: True Management_Affecting_Severity: none Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # STORAGE #--------------------------------------------------------------------------- 800.001: Type: Alarm Description: |- Storage Alarm Condition: 1 mons down, quorum 1,2 controller-1,storage-0 Entity_Instance_ID: cluster= Severity: [critical, major] Proposed_Repair_Action: "If problem persists, contact next level of support." Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: equipment-malfunction Service_Affecting: critical: True major: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 800.010: Type: Alarm Description: |- Potential data loss. No available OSDs in storage replication group. Entity_Instance_ID: cluster=.peergroup= Severity: [critical] Proposed_Repair_Action: "Ensure storage hosts from replication group are unlocked and available. Check if OSDs of each storage host are up and running. If problem persists, contact next level of support." Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: equipment-malfunction Service_Affecting: critical: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 800.011: Type: Alarm Description: |- Loss of replication in peergroup.
Entity_Instance_ID: cluster=.peergroup= Severity: [major] Proposed_Repair_Action: "Ensure storage hosts from replication group are unlocked and available. Check if OSDs of each storage host are up and running. If problem persists, contact next level of support." Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: equipment-malfunction Service_Affecting: major: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 800.002: Type: Alarm Description: ["Image storage media is full: There is not enough disk space on the image storage media.", "Instance snapshot failed: There is not enough disk space on the image storage media.", "Supplied () and generated from uploaded image () did not match. Setting image status to 'killed'.", "Error in store configuration. Adding images to store is disabled.", "Forbidden upload attempt: ", "Insufficient permissions on image storage media: ", "Denying attempt to upload image larger than bytes.", "Denying attempt to upload image because it exceeds the quota: ", "Received HTTP error while uploading image ", "Client disconnected before sending all data to backend", "Failed to upload image "] Entity_Instance_ID: ["image=, instance=", "tenant=, instance=", "image=, instance=", "image=, instance=", "image=, instance=", "image=, instance=", "image=, instance=", "image=, instance=", "image=, instance=", "image=, instance=", "image=, instance="] Alarm_Type: [physical-violation, physical-violation, integrity-violation, integrity-violation, security-service-or-mechanism-violation, security-service-or-mechanism-violation, security-service-or-mechanism-violation, security-service-or-mechanism-violation, communication, communication, operational-violation] Severity: warning Proposed_Repair_Action: Maintenance_Action: Inhibit_Alarms: Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 800.003:
Type: Alarm Description: |- Storage Alarm Condition: Quota/Space mismatch for the tier. The sum of Ceph pool quotas does not match the tier size. Entity_Instance_ID: cluster=.tier= Severity: minor Proposed_Repair_Action: "Update ceph storage pool quotas to use all available tier space." Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: configuration-out-of-date Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 800.100: Type: Alarm Description: |- Storage Alarm Condition: Cinder I/O Congestion is above normal range and is building Entity_Instance_ID: cinder_io_monitor Severity: major Proposed_Repair_Action: "Reduce the I/O load on the Cinder LVM backend. Use Cinder QoS mechanisms on high usage volumes." Maintenance_Action: Inhibit_Alarms: Alarm_Type: qos Probable_Cause: congestion Service_Affecting: False Suppression: False Management_Affecting_Severity: none Degrade_Affecting_Severity: none 800.101: Type: Alarm Description: |- Storage Alarm Condition: Cinder I/O Congestion is high and impacting guest performance Entity_Instance_ID: cinder_io_monitor Severity: critical Proposed_Repair_Action: "Reduce the I/O load on the Cinder LVM backend. Cinder actions may fail until congestion is reduced. Use Cinder QoS mechanisms on high usage volumes." Maintenance_Action: Inhibit_Alarms: Alarm_Type: qos Probable_Cause: congestion Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 800.102: Type: Alarm Description: |- Storage Alarm Condition: PV configuration on . Reason: . Entity_Instance_ID: pv= Severity: [critical, major] Proposed_Repair_Action: "Remove failed PV and associated Storage Device then recreate them." 
Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: configuration-or-customization-error Service_Affecting: True Suppression: False Management_Affecting_Severity: major Degrade_Affecting_Severity: none 800.103: Type: Alarm Description: |- Storage Alarm Condition: [ Metadata usage for LVM thin pool / exceeded threshold and automatic extension failed, Metadata usage for LVM thin pool / exceeded threshold ]; threshold x%, actual y%. Entity_Instance_ID: .lvmthinpool=/ Severity: critical Proposed_Repair_Action: "Increase Storage Space Allotment for Cinder on the 'lvm' backend. Consult the System Administration Manual for more details. If problem persists, contact next level of support." Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: threshold-crossed Service_Affecting: False Suppression: False Management_Affecting_Severity: major Degrade_Affecting_Severity: none 800.104: Type: Alarm Description: |- Storage Alarm Condition: configuration failed to apply on host: . Entity_Instance_ID: storage_backend= Severity: critical Proposed_Repair_Action: "Update backend setting to reapply configuration. Consult the System Administration Manual for more details. If problem persists, contact next level of support." Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: configuration-or-customization-error Service_Affecting: True Suppression: False Management_Affecting_Severity: major Degrade_Affecting_Severity: none #--------------------------------------------------------------------------- # SOFTWARE #--------------------------------------------------------------------------- 900.001: Type: Alarm Description: Patching operation in progress. Entity_Instance_ID: host=controller Severity: minor Proposed_Repair_Action: Complete reboots of affected hosts. 
Maintenance_Action: Inhibit_Alarms: Alarm_Type: environmental Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.002: Type: Alarm Description: Obsolete patch in system. Entity_Instance_ID: host=controller Severity: warning Proposed_Repair_Action: Remove and delete obsolete patches. Maintenance_Action: Inhibit_Alarms: Alarm_Type: environmental Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.003: Type: Alarm Description: Patch host install failure. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Undo patching operation. Maintenance_Action: Inhibit_Alarms: Alarm_Type: environmental Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.004: Type: Alarm Description: Host version mismatch. Entity_Instance_ID: host= Severity: major Proposed_Repair_Action: Reinstall host to update applied load. Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: True Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.005: Type: Alarm Description: System Upgrade in progress. Entity_Instance_ID: host=controller Severity: minor Proposed_Repair_Action: No action required. 
Maintenance_Action: Inhibit_Alarms: Alarm_Type: operational-violation Probable_Cause: unspecified-reason Service_Affecting: False Suppression: False Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.101: Type: Alarm Description: Software patch auto-apply inprogress Entity_Instance_ID: orchestration=sw-patch Severity: major Proposed_Repair_Action: Wait for software patch auto-apply to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.102: Type: Alarm Description: Software patch auto-apply aborting Entity_Instance_ID: orchestration=sw-patch Severity: major Proposed_Repair_Action: Wait for software patch auto-apply abort to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.103: Type: Alarm Description: Software patch auto-apply failed Entity_Instance_ID: orchestration=sw-patch Severity: critical Proposed_Repair_Action: Attempt to apply software patches manually; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.111: Type: Log Description: Software patch auto-apply start Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.112: Type: Log Description: Software patch auto-apply inprogress Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: 
unspecified-reason Service_Affecting: False 900.113: Type: Log Description: Software patch auto-apply rejected Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.114: Type: Log Description: Software patch auto-apply cancelled Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.115: Type: Log Description: Software patch auto-apply failed Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.116: Type: Log Description: Software patch auto-apply completed Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.117: Type: Log Description: Software patch auto-apply abort Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.118: Type: Log Description: Software patch auto-apply aborting Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.119: Type: Log Description: Software patch auto-apply abort rejected Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.120: Type: Log Description: Software patch auto-apply abort failed Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.121: Type: Log Description: Software patch auto-apply aborted Entity_Instance_ID: orchestration=sw-patch Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.201: Type: Alarm Description: Software upgrade 
auto-apply inprogress Entity_Instance_ID: orchestration=sw-upgrade Severity: major Proposed_Repair_Action: Wait for software upgrade auto-apply to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.202: Type: Alarm Description: Software upgrade auto-apply aborting Entity_Instance_ID: orchestration=sw-upgrade Severity: major Proposed_Repair_Action: Wait for software upgrade auto-apply abort to complete; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.203: Type: Alarm Description: Software upgrade auto-apply failed Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Proposed_Repair_Action: Attempt to apply software upgrade manually; if problem persists contact next level of support Maintenance_Action: Inhibit_Alarms: Alarm_Type: equipment Probable_Cause: underlying-resource-unavailable Service_Affecting: True Suppression: True Management_Affecting_Severity: warning Degrade_Affecting_Severity: none 900.211: Type: Log Description: Software upgrade auto-apply start Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.212: Type: Log Description: Software upgrade auto-apply inprogress Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.213: Type: Log Description: Software upgrade auto-apply rejected Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 
900.214: Type: Log Description: Software upgrade auto-apply cancelled Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.215: Type: Log Description: Software upgrade auto-apply failed Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.216: Type: Log Description: Software upgrade auto-apply completed Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.217: Type: Log Description: Software upgrade auto-apply abort Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.218: Type: Log Description: Software upgrade auto-apply aborting Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.219: Type: Log Description: Software upgrade auto-apply abort rejected Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.220: Type: Log Description: Software upgrade auto-apply abort failed Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False 900.221: Type: Log Description: Software upgrade auto-apply aborted Entity_Instance_ID: orchestration=sw-upgrade Severity: critical Alarm_Type: equipment Probable_Cause: unspecified-reason Service_Affecting: False ...