From a8285b564db551c4f0131d60212b85b0e6462da4 Mon Sep 17 00:00:00 2001 From: Lingxian Kong Date: Tue, 14 May 2019 22:05:04 +1200 Subject: [PATCH] Support Heat auto-healing notifier The auto-healing notifier works together with the loadbalancer_member_health evaluator. Presumably, the end user defines a Heat template which contains an autoscaling group, and all the members in the group join an Octavia load balancer in order to expose the service to the outside, so that when the stack scales up or scales down, Heat makes sure the new members join the load balancer automatically and the old members are removed. However, this notifier deals with the situation where some member fails: the stack can be recovered by marking the given autoscaling group member unhealthy and then updating the Heat stack in place. Change-Id: I6e92d1fc2125e155bb5068ff2c14fa318b126442 --- aodh/keystone_client.py | 14 +++ aodh/notifier/heat.py | 114 ++++++++++++++++++ aodh/tests/unit/notifier/__init__.py | 0 aodh/tests/unit/notifier/base.py | 27 +++++ aodh/tests/unit/notifier/test_heat.py | 78 ++++++++++++ ...uto-healing-notifier-794b64de776811e9.yaml | 3 + requirements.txt | 1 + setup.cfg | 1 + 8 files changed, 238 insertions(+) create mode 100644 aodh/notifier/heat.py create mode 100644 aodh/tests/unit/notifier/__init__.py create mode 100644 aodh/tests/unit/notifier/base.py create mode 100644 aodh/tests/unit/notifier/test_heat.py create mode 100644 releasenotes/notes/auto-healing-notifier-794b64de776811e9.yaml diff --git a/aodh/keystone_client.py b/aodh/keystone_client.py index 3cf10ca7d..4f3e02e7e 100644 --- a/aodh/keystone_client.py +++ b/aodh/keystone_client.py @@ -15,6 +15,7 @@ import os +from heatclient import client as heatclient from keystoneauth1 import exceptions as ka_exception from keystoneauth1.identity.generic import password from keystoneauth1 import loading as ka_loading @@ -93,6 +94,19 @@ def url_for(conf, **kwargs): return sess.get_endpoint(**kwargs) +def 
get_heat_client_from_trust(conf, trust_id): + ks_client = get_trusted_client(conf, trust_id) + sess = ks_client.session + + endpoint = sess.get_endpoint( + service_type='orchestration', + interface="internal", + region_name=conf.service_credentials.region_name + ) + + return heatclient.Client("1", endpoint=endpoint, session=sess) + + OPTS = [ cfg.StrOpt('region-name', default=os.environ.get('OS_REGION_NAME'), diff --git a/aodh/notifier/heat.py b/aodh/notifier/heat.py new file mode 100644 index 000000000..157ad968f --- /dev/null +++ b/aodh/notifier/heat.py @@ -0,0 +1,114 @@ +# Copyright 2019 Catalyst Cloud Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from oslo_log import log +from oslo_utils import uuidutils +import six + +from aodh import keystone_client as aodh_keystone +from aodh import notifier + +LOG = log.getLogger(__name__) + + +class TrustHeatAlarmNotifier(notifier.AlarmNotifier): + """Heat autohealing notifier. + + The auto-healing notifier works together with loadbalancer_member_health + evaluator. + + Presumably, the end user defines a Heat template which contains an + autoscaling group and all the members in the group are joined in an Octavia + load balancer in order to expose service to the outside, so that when the + stack scales up or scales down, Heat makes sure the new members are joining + the load balancer automatically and the old members are removed. 
+ + However, this notifier deals with the situation that when some member + fails, the stack could be recovered by marking the given autoscaling group + member unhealthy, then update Heat stack in place. In order to do that, the + notifier needs to know: + + - Heat stack ID. + - Heat autoscaling group ID. + - The failed Octavia pool members. + + The resource ID in the autoscaling group is saved in the Octavia member + tags. So, only Octavia stable/stein or later versions are supported. + """ + + def __init__(self, conf): + super(TrustHeatAlarmNotifier, self).__init__(conf) + self.conf = conf + + def notify(self, action, alarm_id, alarm_name, severity, previous, current, + reason, reason_data): + LOG.info( + "Notifying alarm %(alarm_name)s %(alarm_id)s of %(severity)s " + "priority from %(previous)s to %(current)s with action %(action)s" + " because %(reason)s." % + {'alarm_name': alarm_name, + 'alarm_id': alarm_id, + 'severity': severity, + 'previous': previous, + 'current': current, + 'action': action.geturl(), + 'reason': reason} + ) + + trust_id = action.username + stack_id = reason_data.get("stack_id") + asg_id = reason_data.get("asg_id") + + if not stack_id or not asg_id: + LOG.warning( + "stack_id and asg_id must exist to notify alarm %s", alarm_id + ) + return + + resources = [] + unhealthy_members = reason_data.get("unhealthy_members", []) + + for member in unhealthy_members: + for tag in member.get("tags", []): + if uuidutils.is_uuid_like(tag): + resources.append(tag) + + if resources: + try: + heat_client = aodh_keystone.get_heat_client_from_trust( + self.conf, trust_id + ) + + for res in resources: + heat_client.resources.mark_unhealthy( + asg_id, + res, + True, + "unhealthy load balancer member" + ) + LOG.info( + "Heat resource %(resource_id)s is marked as unhealthy " + "for alarm %(alarm_id)s", + {"resource_id": res, "alarm_id": alarm_id} + ) + + heat_client.stacks.update(stack_id, existing=True) + LOG.info( + "Heat stack %(stack_id)s is updated for 
alarm " + "%(alarm_id)s", + {"stack_id": stack_id, "alarm_id": alarm_id} + ) + except Exception as e: + LOG.exception("Failed to communicate with Heat service, " + "error: %s", six.text_type(e)) diff --git a/aodh/tests/unit/notifier/__init__.py b/aodh/tests/unit/notifier/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/aodh/tests/unit/notifier/base.py b/aodh/tests/unit/notifier/base.py new file mode 100644 index 000000000..74e87e45e --- /dev/null +++ b/aodh/tests/unit/notifier/base.py @@ -0,0 +1,27 @@ +# Copyright 2019 Catalyst Cloud Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from oslo_config import fixture +from oslotest import base + +from aodh import service + + +class TestNotifierBase(base.BaseTestCase): + def setUp(self): + super(TestNotifierBase, self).setUp() + + conf = service.prepare_service(argv=[], config_files=[]) + + self.conf = self.useFixture(fixture.Config(conf)).conf diff --git a/aodh/tests/unit/notifier/test_heat.py b/aodh/tests/unit/notifier/test_heat.py new file mode 100644 index 000000000..c889e0fb3 --- /dev/null +++ b/aodh/tests/unit/notifier/test_heat.py @@ -0,0 +1,78 @@ +# Copyright 2019 Catalyst Cloud Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import mock +from oslo_utils import netutils + +from aodh.notifier import heat as heat_notifier +from aodh.tests.unit.notifier import base + + +class TestTrustHeatAlarmNotifier(base.TestNotifierBase): + @mock.patch("aodh.keystone_client.get_heat_client_from_trust") + def test_notify(self, mock_heatclient): + action = netutils.urlsplit("trust+autohealer://fake_trust_id:delete@") + alarm_id = "fake_alarm_id" + alarm_name = "fake_alarm_name" + severity = "low" + previous = "ok" + current = "alarm" + reason = "no good reason" + reason_data = { + "stack_id": "fake_stack_id", + "asg_id": "fake_asg_id", + "unhealthy_members": [ + {"tags": ["3bd8bc5a-7632-11e9-84cd-00224d6b7bc1"]} + ] + } + + notifier = heat_notifier.TrustHeatAlarmNotifier(self.conf) + notifier.notify(action, alarm_id, alarm_name, severity, previous, + current, reason, reason_data) + + mock_heatclient.assert_called_once_with(self.conf, "fake_trust_id") + + mock_client = mock_heatclient.return_value + mock_client.resources.mark_unhealthy.assert_called_once_with( + "fake_asg_id", + "3bd8bc5a-7632-11e9-84cd-00224d6b7bc1", + True, + "unhealthy load balancer member" + ) + + mock_client.stacks.update.assert_called_once_with( + "fake_stack_id", existing=True + ) + + @mock.patch("aodh.keystone_client.get_heat_client_from_trust") + def test_notify_stack_id_missing(self, mock_heatclient): + action = netutils.urlsplit("trust+autohealer://fake_trust_id:delete@") + alarm_id = "fake_alarm_id" + alarm_name = "fake_alarm_name" + severity = "low" + previous = "ok" + current = "alarm" + reason = "no good reason" 
+ reason_data = { + "asg_id": "fake_asg_id", + "unhealthy_members": [ + {"tags": ["3bd8bc5a-7632-11e9-84cd-00224d6b7bc1"]} + ] + } + + notifier = heat_notifier.TrustHeatAlarmNotifier(self.conf) + notifier.notify(action, alarm_id, alarm_name, severity, previous, + current, reason, reason_data) + + self.assertFalse(mock_heatclient.called) diff --git a/releasenotes/notes/auto-healing-notifier-794b64de776811e9.yaml b/releasenotes/notes/auto-healing-notifier-794b64de776811e9.yaml new file mode 100644 index 000000000..cb84da46d --- /dev/null +++ b/releasenotes/notes/auto-healing-notifier-794b64de776811e9.yaml @@ -0,0 +1,3 @@ +features: + - Added a new notifier (``trust+heat``) that works together with the + ``loadbalancer_member_health`` evaluator for auto-healing purposes. diff --git a/requirements.txt b/requirements.txt index 5745eeb2e..8e09a8f3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,3 +37,4 @@ keystoneauth1>=2.1 debtcollector>=1.2.0 # Apache-2.0 python-octaviaclient>=1.8.0 python-dateutil # BSD +python-heatclient>=1.17.0 diff --git a/setup.cfg b/setup.cfg index 62485c63d..9ca36fac3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -91,6 +91,7 @@ aodh.notifier = trust+https = aodh.notifier.trust:TrustRestAlarmNotifier zaqar = aodh.notifier.zaqar:ZaqarAlarmNotifier trust+zaqar = aodh.notifier.zaqar:TrustZaqarAlarmNotifier + trust+heat = aodh.notifier.heat:TrustHeatAlarmNotifier wsgi_scripts = aodh-api = aodh.api.app:build_wsgi_app