Notify if metrics are not defined or not added for hosts

The monitoring driver now receives a notifier instance. If the driver
fails to get data for a certain metric, or cannot find the metric on a
certain host, the notifier is used to tell the admins list (notify-list
in the config file) that something is wrong with the metrics and needs
to be looked at.

The base notification driver now supports the following methods:

1. notify_status: used to notify when a node has failed and has been
   evacuated.
2. notify: used anywhere in the monitoring drivers to notify
   administrators that something is wrong.

Change-Id: I12eddf03d1921a04f5fa9a8101a471bea5f9c507
This commit is contained in:
Saad Zaher 2016-11-21 17:06:23 +00:00
parent 7ca4980a8c
commit 61558d2b33
9 changed files with 95 additions and 27 deletions

View File

@ -26,8 +26,10 @@ def main():
config.configure()
config.setup_logging()
LOG.info('Starting Freezer DR ... ')
# initialize the notification driver as it will be used in many parts
notifier = NotificationManager()
# load and initialize the monitoring driver
monitor = MonitorManager()
monitor = MonitorManager(notifier=notifier.get_driver())
# Do the monitoring procedure
# Monitor, analyse, nodes down ?, wait, double check ? evacuate ..
nodes = monitor.monitor()
@ -40,7 +42,6 @@ def main():
evac = EvacuationManager()
notify_nodes = evac.get_nodes_details(nodes)
evac.evacuate(nodes)
notifier = NotificationManager()
notifier.notify(notify_nodes, 'success')
else:
print "No nodes reported to be down"

View File

@ -28,7 +28,7 @@ class MonitorBaseDriver(object):
"""
_OPTS = []
def __init__(self, backend_name):
def __init__(self, backend_name, notifier):
"""
Initializing the driver. Any monitoring system requires the following
parameters to call it's api. All these parameters can be passed from the
@ -36,9 +36,15 @@ class MonitorBaseDriver(object):
:param backend_name: Name of section in the configuration file that
contains your driver initialization details; like username, password,
endpoint and so on. Variables in this section depends on your driver
:param notifier: Notifier instance which can be used to notify the
admins in case of error or problem happened during the DR process.
You should only call notify method and send it your message to send
it to the admins
"""
CONF.register_opts(self._OPTS, group=backend_name)
self.conf = CONF.get(backend_name)
self.notifier = notifier
@abc.abstractmethod
def get_data(self):

View File

@ -21,12 +21,13 @@ LOG = log.getLogger(__name__)
class MonitorManager(object):
def __init__(self):
def __init__(self, notifier):
monitor = CONF.get('monitoring')
backend_name = monitor['backend_name']
self.driver = importutils.import_object(
monitor.driver,
backend_name=backend_name
backend_name=backend_name,
notifier=notifier
)
driver_info = self.driver.get_info()
LOG.info('Initializing driver %s with version %s found in %s' %

View File

@ -42,8 +42,9 @@ class StandardDriver(MonitorBaseDriver):
' key:value format'),
]
def __init__(self, backend_name):
super(StandardDriver, self).__init__(backend_name=backend_name)
def __init__(self, backend_name, notifier):
super(StandardDriver, self).__init__(backend_name=backend_name,
notifier=notifier)
self.endpoint = self.conf.endpoint
client = OSClient(
authurl=self.conf.endpoint,

View File

@ -11,8 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_config import cfg
from freezer_dr.monitors.common.driver import MonitorBaseDriver
CONF = cfg.CONF
class DummyDriver(MonitorBaseDriver):
"""A monitoring driver that returns a configured list of nodes as failed.
@ -22,11 +27,18 @@ class DummyDriver(MonitorBaseDriver):
monitoring section of the freezer_dr configuration file as follows:
kwargs = nodes_down:hostname1;hostname2
"""
_OPTS = [
cfg.ListOpt('nodes_down',
default=[],
required=True,
help="fake list of failed compute nodes.")
]
def __init__(self, username, password, endpoint, **kwargs):
super(DummyDriver, self).__init__(username, password, endpoint, **kwargs)
def __init__(self, backend_name, notifier):
super(DummyDriver, self).__init__(backend_name=backend_name,
notifier=notifier)
hostnames = kwargs['nodes_down'].split(';')
hostnames = self.conf.get('nodes_down', [])
self.nodes_down = [{'host': n} for n in hostnames]
def get_data(self):
@ -48,5 +60,5 @@ class DummyDriver(MonitorBaseDriver):
return {
'name': 'Freezer DR Dummy Driver',
'version': 1.0,
'author': 'Hewlett-Packard Development Company, L.P'
'author': 'Hewlett-Packard Enterprise Development, L.P'
}

View File

@ -82,8 +82,9 @@ class MonascaDriver(driver.MonitorBaseDriver):
"Default is all")
]
def __init__(self, backend_name):
super(MonascaDriver, self).__init__(backend_name=backend_name)
def __init__(self, backend_name, notifier):
super(MonascaDriver, self).__init__(backend_name=backend_name,
notifier=notifier)
self.monasca_client = client.Client(
"2_0",
self.conf['monasca_url'],
@ -198,7 +199,9 @@ class MonascaDriver(driver.MonitorBaseDriver):
for node, metrics in nodes.iteritems():
node_data = {node: []}
for metric_name, metric_data in metrics.iteritems():
node_data[node].append(self.__process_metric(metric_name, metric_data))
node_data[node].append(
self.__process_metric(node, metric_name, metric_data)
)
nodes_data.append(node_data)
aggregate = self.conf.get('aggregate', 'all')
@ -221,7 +224,7 @@ class MonascaDriver(driver.MonitorBaseDriver):
if True in host.values()
]
def __process_metric(self, metric_name, metric_data):
def __process_metric(self, node, metric_name, metric_data):
metric_conf = CONF[metric_name]
# process UNDETERMINED State and change it to the required state
metric_data = [
@ -229,6 +232,30 @@ class MonascaDriver(driver.MonitorBaseDriver):
metric_conf.get('undetermined', 'ALARM').upper()
for i in metric_data
]
if not metric_data:
message = """
No data found for this metric: {0} <br />
Data returned: {1} <br />
hostname: {2} <br />
Cause might be: <br />
<ul>
<li>Metric is not defined in Monasca </li>
<li>Alarm with this metric name is not set for this host </li>
<li>Check your Monasca configuration and Metric configuration
defined in freezer-dr.conf </li>
</ul>
You can try this command to check: <br />
$ monasca alarm-list --metric-name {3} --metric-dimensions
hostname={2}
<br /> <br />
Freezer-DR
""".format(metric_name, str(metric_data), node,
metric_conf['metric_name'])
self.notifier.notify(message)
LOG.warning("No data found for metric: {0} on host: {1}".format(
metric_name, node
))
exit(1)
# build the decision
aggregate = metric_conf.get('aggregate')
aggregate += "(x=='ALARM' for x in metric_data)"

View File

@ -17,15 +17,13 @@ import abc
@six.add_metaclass(abc.ABCMeta)
class NotifierBaseDriver(object):
"""
Used to notify admins/users at any stage that an error happened or process
completed or something went wrong !
""" Used to notify admins/users at any stage that an error happened or
process completed or something went wrong !
"""
def __init__(self, url, username, password, templates_dir, notify_from,
admin_list=None, **kwargs):
"""
Initialize the notification backend.
""" Initialize the notification backend.
:param url: Notification system backend
:param username: Username
:param password: Password
@ -42,12 +40,20 @@ class NotifierBaseDriver(object):
self.options = kwargs
@abc.abstractmethod
def notify(self, node, status):
"""
Custom notification method. Can be used if you want to send custom
def notify_status(self, node, status):
""" Custom notification method. Can be used if you want to send custom
notification about Tenant, Instance, or go deeper if you want
:param node: Compute Host, Tenant, Instance, ...
:param status: Error, Success, Info
:return: True, False
"""
pass
@abc.abstractmethod
def notify(self, message):
""" This method will be used in different places to notify admins
about certain problem
:param message: String message name
:return:
"""
pass

View File

@ -45,9 +45,10 @@ class NotificationManager(object):
:return:
"""
for node in nodes:
self.driver.notify(node, status)
self.driver.notify_status(node, status)
def get_driver(self):
return self.driver

View File

@ -44,7 +44,7 @@ class StandardEmail(NotifierBaseDriver):
LOG.info('Logged in !')
self.server = server
def notify(self, node, status):
def notify_status(self, node, status):
_template = 'info.jinja'
if status == 'success':
_template = 'user_success.jinja'
@ -102,6 +102,19 @@ class StandardEmail(NotifierBaseDriver):
except Exception as e:
LOG.error(e)
def notify(self, message):
try:
self.send_email(
mail_from=self.notify_from,
mail_to=self.notify_from,
subject="[Freezer-DR] Problem Occurred",
html_msg=message,
cc_list=self.admin_list or []
)
return True
except:
return False
def __exit__(self, exc_type, exc_val, exc_tb):
self.server.quit()