Notify if metrics are not defined or not added for hosts
The monitoring driver now receives a notifier instance. If the driver fails to get data for a certain metric, or cannot find that metric on a certain host, it uses the notifier to tell the admins list (notify-list in the config file) that something is wrong with the metrics and needs to be looked at. The base notification driver changed to support the following methods: 1. notify_status: used to notify when a node has failed and been evacuated. 2. notify: can be used anywhere in the monitoring drivers to notify administrators that something is wrong. Change-Id: I12eddf03d1921a04f5fa9a8101a471bea5f9c507
This commit is contained in:
parent
7ca4980a8c
commit
61558d2b33
|
@ -26,8 +26,10 @@ def main():
|
||||||
config.configure()
|
config.configure()
|
||||||
config.setup_logging()
|
config.setup_logging()
|
||||||
LOG.info('Starting Freezer DR ... ')
|
LOG.info('Starting Freezer DR ... ')
|
||||||
|
# initialize the notification driver as it will be used in many parts
|
||||||
|
notifier = NotificationManager()
|
||||||
# load and initialize the monitoring driver
|
# load and initialize the monitoring driver
|
||||||
monitor = MonitorManager()
|
monitor = MonitorManager(notifier=notifier.get_driver())
|
||||||
# Do the monitoring procedure
|
# Do the monitoring procedure
|
||||||
# Monitor, analyse, nodes down ?, wait, double check ? evacuate ..
|
# Monitor, analyse, nodes down ?, wait, double check ? evacuate ..
|
||||||
nodes = monitor.monitor()
|
nodes = monitor.monitor()
|
||||||
|
@ -40,7 +42,6 @@ def main():
|
||||||
evac = EvacuationManager()
|
evac = EvacuationManager()
|
||||||
notify_nodes = evac.get_nodes_details(nodes)
|
notify_nodes = evac.get_nodes_details(nodes)
|
||||||
evac.evacuate(nodes)
|
evac.evacuate(nodes)
|
||||||
notifier = NotificationManager()
|
|
||||||
notifier.notify(notify_nodes, 'success')
|
notifier.notify(notify_nodes, 'success')
|
||||||
else:
|
else:
|
||||||
print "No nodes reported to be down"
|
print "No nodes reported to be down"
|
||||||
|
|
|
@ -28,7 +28,7 @@ class MonitorBaseDriver(object):
|
||||||
"""
|
"""
|
||||||
_OPTS = []
|
_OPTS = []
|
||||||
|
|
||||||
def __init__(self, backend_name):
|
def __init__(self, backend_name, notifier):
|
||||||
"""
|
"""
|
||||||
Initializing the driver. Any monitoring system requires the following
|
Initializing the driver. Any monitoring system requires the following
|
||||||
parameters to call it's api. All these parameters can be passed from the
|
parameters to call it's api. All these parameters can be passed from the
|
||||||
|
@ -36,9 +36,15 @@ class MonitorBaseDriver(object):
|
||||||
:param backend_name: Name of section in the configuration file that
|
:param backend_name: Name of section in the configuration file that
|
||||||
contains your driver initialization details; like username, password,
|
contains your driver initialization details; like username, password,
|
||||||
endpoint and so on. Variables in this section depends on your driver
|
endpoint and so on. Variables in this section depends on your driver
|
||||||
|
|
||||||
|
:param notifier: Notifier instance which can be used to notify the
|
||||||
|
admins in case of error or problem happened during the DR process.
|
||||||
|
You should only call notify method and send it your message to send
|
||||||
|
it to the admins
|
||||||
"""
|
"""
|
||||||
CONF.register_opts(self._OPTS, group=backend_name)
|
CONF.register_opts(self._OPTS, group=backend_name)
|
||||||
self.conf = CONF.get(backend_name)
|
self.conf = CONF.get(backend_name)
|
||||||
|
self.notifier = notifier
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def get_data(self):
|
def get_data(self):
|
||||||
|
|
|
@ -21,12 +21,13 @@ LOG = log.getLogger(__name__)
|
||||||
|
|
||||||
class MonitorManager(object):
|
class MonitorManager(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, notifier):
|
||||||
monitor = CONF.get('monitoring')
|
monitor = CONF.get('monitoring')
|
||||||
backend_name = monitor['backend_name']
|
backend_name = monitor['backend_name']
|
||||||
self.driver = importutils.import_object(
|
self.driver = importutils.import_object(
|
||||||
monitor.driver,
|
monitor.driver,
|
||||||
backend_name=backend_name
|
backend_name=backend_name,
|
||||||
|
notifier=notifier
|
||||||
)
|
)
|
||||||
driver_info = self.driver.get_info()
|
driver_info = self.driver.get_info()
|
||||||
LOG.info('Initializing driver %s with version %s found in %s' %
|
LOG.info('Initializing driver %s with version %s found in %s' %
|
||||||
|
|
|
@ -42,8 +42,9 @@ class StandardDriver(MonitorBaseDriver):
|
||||||
' key:value format'),
|
' key:value format'),
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, backend_name):
|
def __init__(self, backend_name, notifier):
|
||||||
super(StandardDriver, self).__init__(backend_name=backend_name)
|
super(StandardDriver, self).__init__(backend_name=backend_name,
|
||||||
|
notifier=notifier)
|
||||||
self.endpoint = self.conf.endpoint
|
self.endpoint = self.conf.endpoint
|
||||||
client = OSClient(
|
client = OSClient(
|
||||||
authurl=self.conf.endpoint,
|
authurl=self.conf.endpoint,
|
||||||
|
|
|
@ -11,8 +11,13 @@
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
from oslo_config import cfg
|
||||||
|
|
||||||
from freezer_dr.monitors.common.driver import MonitorBaseDriver
|
from freezer_dr.monitors.common.driver import MonitorBaseDriver
|
||||||
|
|
||||||
|
CONF = cfg.CONF
|
||||||
|
|
||||||
|
|
||||||
class DummyDriver(MonitorBaseDriver):
|
class DummyDriver(MonitorBaseDriver):
|
||||||
"""A monitoring driver that returns a configured list of nodes as failed.
|
"""A monitoring driver that returns a configured list of nodes as failed.
|
||||||
|
@ -22,11 +27,18 @@ class DummyDriver(MonitorBaseDriver):
|
||||||
monitoring section of the freezer_dr configuration file as follows:
|
monitoring section of the freezer_dr configuration file as follows:
|
||||||
kwargs = nodes_down:hostname1;hostname2
|
kwargs = nodes_down:hostname1;hostname2
|
||||||
"""
|
"""
|
||||||
|
_OPTS = [
|
||||||
|
cfg.ListOpt('nodes_down',
|
||||||
|
default=[],
|
||||||
|
required=True,
|
||||||
|
help="fake list of failed compute nodes.")
|
||||||
|
]
|
||||||
|
|
||||||
def __init__(self, username, password, endpoint, **kwargs):
|
def __init__(self, backend_name, notifier):
|
||||||
super(DummyDriver, self).__init__(username, password, endpoint, **kwargs)
|
super(DummyDriver, self).__init__(backend_name=backend_name,
|
||||||
|
notifier=notifier)
|
||||||
|
|
||||||
hostnames = kwargs['nodes_down'].split(';')
|
hostnames = self.conf.get('nodes_down', [])
|
||||||
self.nodes_down = [{'host': n} for n in hostnames]
|
self.nodes_down = [{'host': n} for n in hostnames]
|
||||||
|
|
||||||
def get_data(self):
|
def get_data(self):
|
||||||
|
@ -48,5 +60,5 @@ class DummyDriver(MonitorBaseDriver):
|
||||||
return {
|
return {
|
||||||
'name': 'Freezer DR Dummy Driver',
|
'name': 'Freezer DR Dummy Driver',
|
||||||
'version': 1.0,
|
'version': 1.0,
|
||||||
'author': 'Hewlett-Packard Development Company, L.P'
|
'author': 'Hewlett-Packard Enterprise Development, L.P'
|
||||||
}
|
}
|
||||||
|
|
|
@ -82,8 +82,9 @@ class MonascaDriver(driver.MonitorBaseDriver):
|
||||||
"Default is all")
|
"Default is all")
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, backend_name):
|
def __init__(self, backend_name, notifier):
|
||||||
super(MonascaDriver, self).__init__(backend_name=backend_name)
|
super(MonascaDriver, self).__init__(backend_name=backend_name,
|
||||||
|
notifier=notifier)
|
||||||
self.monasca_client = client.Client(
|
self.monasca_client = client.Client(
|
||||||
"2_0",
|
"2_0",
|
||||||
self.conf['monasca_url'],
|
self.conf['monasca_url'],
|
||||||
|
@ -198,7 +199,9 @@ class MonascaDriver(driver.MonitorBaseDriver):
|
||||||
for node, metrics in nodes.iteritems():
|
for node, metrics in nodes.iteritems():
|
||||||
node_data = {node: []}
|
node_data = {node: []}
|
||||||
for metric_name, metric_data in metrics.iteritems():
|
for metric_name, metric_data in metrics.iteritems():
|
||||||
node_data[node].append(self.__process_metric(metric_name, metric_data))
|
node_data[node].append(
|
||||||
|
self.__process_metric(node, metric_name, metric_data)
|
||||||
|
)
|
||||||
nodes_data.append(node_data)
|
nodes_data.append(node_data)
|
||||||
|
|
||||||
aggregate = self.conf.get('aggregate', 'all')
|
aggregate = self.conf.get('aggregate', 'all')
|
||||||
|
@ -221,7 +224,7 @@ class MonascaDriver(driver.MonitorBaseDriver):
|
||||||
if True in host.values()
|
if True in host.values()
|
||||||
]
|
]
|
||||||
|
|
||||||
def __process_metric(self, metric_name, metric_data):
|
def __process_metric(self, node, metric_name, metric_data):
|
||||||
metric_conf = CONF[metric_name]
|
metric_conf = CONF[metric_name]
|
||||||
# process UNDETERMINED State and change it to the required state
|
# process UNDETERMINED State and change it to the required state
|
||||||
metric_data = [
|
metric_data = [
|
||||||
|
@ -229,6 +232,30 @@ class MonascaDriver(driver.MonitorBaseDriver):
|
||||||
metric_conf.get('undetermined', 'ALARM').upper()
|
metric_conf.get('undetermined', 'ALARM').upper()
|
||||||
for i in metric_data
|
for i in metric_data
|
||||||
]
|
]
|
||||||
|
if not metric_data:
|
||||||
|
message = """
|
||||||
|
No data found for this metric: {0} <br />
|
||||||
|
Data returned: {1} <br />
|
||||||
|
hostname: {2} <br />
|
||||||
|
Cause might be: <br />
|
||||||
|
<ul>
|
||||||
|
<li>Metric is not defined in Monasca </li>
|
||||||
|
<li>Alarm with this metric name is not set for this host </li>
|
||||||
|
<li>Check your Monasca configuration and Metric configuration
|
||||||
|
defined in freezer-dr.conf </li>
|
||||||
|
</ul>
|
||||||
|
You can try this command to check: <br />
|
||||||
|
$ monasca alarm-list --metric-name {3} --metric-dimensions
|
||||||
|
hostname={2}
|
||||||
|
<br /> <br />
|
||||||
|
Freezer-DR
|
||||||
|
""".format(metric_name, str(metric_data), node,
|
||||||
|
metric_conf['metric_name'])
|
||||||
|
self.notifier.notify(message)
|
||||||
|
LOG.warning("No data found for metric: {0} on host: {1}".format(
|
||||||
|
metric_name, node
|
||||||
|
))
|
||||||
|
exit(1)
|
||||||
# build the decision
|
# build the decision
|
||||||
aggregate = metric_conf.get('aggregate')
|
aggregate = metric_conf.get('aggregate')
|
||||||
aggregate += "(x=='ALARM' for x in metric_data)"
|
aggregate += "(x=='ALARM' for x in metric_data)"
|
||||||
|
|
|
@ -17,15 +17,13 @@ import abc
|
||||||
|
|
||||||
@six.add_metaclass(abc.ABCMeta)
|
@six.add_metaclass(abc.ABCMeta)
|
||||||
class NotifierBaseDriver(object):
|
class NotifierBaseDriver(object):
|
||||||
"""
|
""" Used to notify admins/users at any stage that an error happened or
|
||||||
Used to notify admins/users at any stage that an error happened or process
|
process completed or something went wrong !
|
||||||
completed or something went wrong !
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, url, username, password, templates_dir, notify_from,
|
def __init__(self, url, username, password, templates_dir, notify_from,
|
||||||
admin_list=None, **kwargs):
|
admin_list=None, **kwargs):
|
||||||
"""
|
""" Initialize the notification backend.
|
||||||
Initialize the notification backend.
|
|
||||||
:param url: Notification system backend
|
:param url: Notification system backend
|
||||||
:param username: Username
|
:param username: Username
|
||||||
:param password: Password
|
:param password: Password
|
||||||
|
@ -42,12 +40,20 @@ class NotifierBaseDriver(object):
|
||||||
self.options = kwargs
|
self.options = kwargs
|
||||||
|
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def notify(self, node, status):
|
def notify_status(self, node, status):
|
||||||
"""
|
""" Custom notification method. Can be used if you want to send custom
|
||||||
Custom notification method. Can be used if you want to send custom
|
|
||||||
notification about Tenant, Instance, or go deeper if you want
|
notification about Tenant, Instance, or go deeper if you want
|
||||||
:param node: Compute Host, Tenant, Instance, ...
|
:param node: Compute Host, Tenant, Instance, ...
|
||||||
:param status: Error, Success, Info
|
:param status: Error, Success, Info
|
||||||
:return: True, False
|
:return: True, False
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abc.abstractmethod
|
||||||
|
def notify(self, message):
|
||||||
|
""" This method will be used in different places to notify admins
|
||||||
|
about certain problem
|
||||||
|
:param message: String message name
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
|
@ -45,9 +45,10 @@ class NotificationManager(object):
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
self.driver.notify(node, status)
|
self.driver.notify_status(node, status)
|
||||||
|
|
||||||
|
def get_driver(self):
|
||||||
|
return self.driver
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ class StandardEmail(NotifierBaseDriver):
|
||||||
LOG.info('Logged in !')
|
LOG.info('Logged in !')
|
||||||
self.server = server
|
self.server = server
|
||||||
|
|
||||||
def notify(self, node, status):
|
def notify_status(self, node, status):
|
||||||
_template = 'info.jinja'
|
_template = 'info.jinja'
|
||||||
if status == 'success':
|
if status == 'success':
|
||||||
_template = 'user_success.jinja'
|
_template = 'user_success.jinja'
|
||||||
|
@ -102,6 +102,19 @@ class StandardEmail(NotifierBaseDriver):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LOG.error(e)
|
LOG.error(e)
|
||||||
|
|
||||||
|
def notify(self, message):
|
||||||
|
try:
|
||||||
|
self.send_email(
|
||||||
|
mail_from=self.notify_from,
|
||||||
|
mail_to=self.notify_from,
|
||||||
|
subject="[Freezer-DR] Problem Occurred",
|
||||||
|
html_msg=message,
|
||||||
|
cc_list=self.admin_list or []
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
self.server.quit()
|
self.server.quit()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue