aodh/aodh/notifier/heat.py

# Copyright 2019 Catalyst Cloud Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from oslo_log import log

from aodh import keystone_client as aodh_keystone
from aodh import notifier

LOG = log.getLogger(__name__)


class TrustHeatAlarmNotifier(notifier.AlarmNotifier):
"""Heat autohealing notifier.
The auto-healing notifier works together with loadbalancer_member_health
evaluator.
Presumably, the end user defines a Heat template which contains an
autoscaling group and all the members in the group are joined in an Octavia
load balancer in order to expose service to the outside, so that when the
stack scales up or scales down, Heat makes sure the new members are joining
the load balancer automatically and the old members are removed.
However, this notifier deals with the situation that when some member
fails, the stack could be recovered by marking the given autoscaling group
member unhealthy, then update Heat stack in place. In order to do that, the
notifier needs to know:
- Heat top/root stack ID.
- Heat autoscaling group ID.
- The failed Octavia pool members.
"""
    def __init__(self, conf):
        super(TrustHeatAlarmNotifier, self).__init__(conf)
        self.conf = conf

    def notify(self, action, alarm_id, alarm_name, severity, previous,
               current, reason, reason_data):
        LOG.info(
            "Notifying alarm %(alarm_name)s %(alarm_id)s of %(severity)s "
            "priority from %(previous)s to %(current)s with action "
            "%(action)s because %(reason)s.",
            {'alarm_name': alarm_name,
             'alarm_id': alarm_id,
             'severity': severity,
             'previous': previous,
             'current': current,
             'action': action.geturl(),
             'reason': reason})
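
        # The trust ID is carried in the userinfo part of the alarm action
        # URL (aodh embeds it there when an alarm with a trust+ scheme is
        # created), so action.username yields it directly.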
trust_id = action.username
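        # stack_id, asg_id and unhealthy_members are provided in reason_data
        # by the loadbalancer_member_health evaluator.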
stack_id = reason_data.get("stack_id")
asg_id = reason_data.get("asg_id")
unhealthy_members = reason_data.get("unhealthy_members", [])
        unhealthy_resources = []

        if not stack_id or not asg_id:
            LOG.error(
                "stack_id and asg_id must exist to notify alarm %s", alarm_id
            )
            return

        heat_client = aodh_keystone.get_heat_client_from_trust(
            self.conf, trust_id
        )
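
        # Map each failed Octavia member back to the Heat resource whose
        # physical_resource_id matches the member ID. nested_depth=3 lets
        # Heat search the root stack, the autoscaling group's nested stack
        # and each member's nested stack (an assumption about the typical
        # template layout).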
for member in unhealthy_members:
target_resources = heat_client.resources.list(
stack_id, nested_depth=3,
filters={"physical_resource_id": member["id"]}
)
if len(target_resources) > 0:
# There should be only one item.
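                # parent_resource is the name of the autoscaling group member
                # resource that owns this physical resource; that name is
                # what mark_unhealthy expects below.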
unhealthy_resources.append(
target_resources[0].parent_resource
)

        if not unhealthy_resources:
LOG.warning("No unhealthy resource found for the alarm %s",
alarm_id)
return
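
        # First mark each affected member resource unhealthy (CHECK_FAILED),
        # then trigger an in-place stack update so Heat replaces them.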
try:
for res in unhealthy_resources:
heat_client.resources.mark_unhealthy(
asg_id,
res,
True,
"unhealthy load balancer member"
)
LOG.info(
"Heat resource %(resource_id)s is marked as unhealthy "
"for alarm %(alarm_id)s",
{"resource_id": res, "alarm_id": alarm_id}
)
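
            # existing=True requests a PATCH update that reuses the stack's
            # current template and parameters; Heat then replaces the
            # resources marked unhealthy above.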
heat_client.stacks.update(stack_id, existing=True)
LOG.info(
"Heat stack %(stack_id)s is updated for alarm "
"%(alarm_id)s",
{"stack_id": stack_id, "alarm_id": alarm_id}
)
except Exception as e:
LOG.exception("Failed to communicate with Heat service for alarm "
"%s, error: %s",
alarm_id, str(e))