# freezer-dr/freezer_dr/monitors/drivers/monasca/driver.py
#
# 340 lines
# 13 KiB
# Python

# (c) Copyright 2016 Hewlett-Packard Development Enterprise, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import http.client
import time
import urllib.parse
from monascaclient import client
from oslo_config import cfg
from oslo_log import log
from freezer_dr.common import utils
from freezer_dr.monitors.common import driver
# Module-level logger, named after this module.
LOG = log.getLogger(__name__)
# Handle on the global oslo.config object; the driver below reads the
# per-metric option groups from it.
CONF = cfg.CONF
class MonascaDriver(driver.MonitorBaseDriver):
    """Monasca monitoring driver to monitor compute nodes.

    The driver queries Monasca alarms to decide whether a compute node is
    down. Every configured metric is looked up per host, using 'hostname'
    as a metric dimension, so alarms must be defined in Monasca for all
    hosts with the required metrics.
    """

    _OPTS = [
        cfg.StrOpt('keystone_url',
                   help="Keystone Url for authentication",
                   required=True),
        cfg.StrOpt('username',
                   help="cloud user used to record monasca alerts and alarms",
                   required=True),
        cfg.StrOpt('password',
                   help="Cloud user's password",
                   required=True),
        cfg.StrOpt('project_name',
                   help='Project/Tenant name. Default is admin',
                   default='admin',
                   required=True),
        cfg.BoolOpt('insecure',
                    help='Use insecure connection.',
                    default=False),
        cfg.StrOpt('project_domain_id',
                   help="Project Domain Id. Default is default",
                   default='default'),
        cfg.StrOpt('user_domain_id',
                   help="User Domain Id. Default is default",
                   default='default'),
        cfg.StrOpt('cacert',
                   help='CA certificate. Default is None',
                   default=None),
        cfg.StrOpt('monasca_url',
                   help='Monasca endpoint URL. This is required to create a '
                        'monasca client instance '
                   ),
        cfg.ListOpt('metrics',
                    help='Monasca Metrics that needs to be checked. Each metric'
                         ' should be defined in a seperate section in the'
                         ' configuration file.',
                    default=['host_alive_status'],
                    required=True
                    ),
        cfg.StrOpt('aggregate',
                   choices=['any', 'all'],
                   default='all',
                   help="If more than one metric used and they reported "
                        "different states i.e.(a:failed, b:success) should we "
                        "evacuate the compute host if only one metric failed "
                        "(any) or only if all failed we evacuate (all). "
                        "Default is all")
    ]

    # Maps the value of an 'aggregate' option to the builtin implementing it.
    # Used instead of eval() so configuration text is never executed.
    _AGGREGATORS = {'any': any, 'all': all}

    # HTML body sent through the notifier when a metric returns no alarms.
    # Placeholders: {0} metric section name, {1} data returned, {2} hostname,
    # {3} Monasca metric name.
    _NO_DATA_MESSAGE = (
        "\n"
        "No data found for this metric: {0} <br />\n"
        "Data returned: {1} <br />\n"
        "hostname: {2} <br />\n"
        "Cause might be: <br />\n"
        "<ul>\n"
        "<li>Metric is not defined in Monasca </li>\n"
        "<li>Alarm with this metric name is not set for this host </li>\n"
        "<li>Check your Monasca configuration and Metric configuration\n"
        "defined in freezer-dr.conf </li>\n"
        "</ul>\n"
        "You can try this command to check: <br /><code>\n"
        "$ monasca alarm-list --metric-name {3} --metric-dimensions\n"
        "hostname={2}\n"
        "</code>\n"
        "<br /> <br />\n"
        "Freezer-DR\n"
    )

    def __init__(self, backend_name, notifier):
        """Create the Monasca client and load nodes and metric options.

        :param backend_name: forwarded to the base driver.
        :param notifier: forwarded to the base driver; used later as
            ``self.notifier`` to report metrics without data.
        """
        super(MonascaDriver, self).__init__(backend_name=backend_name,
                                            notifier=notifier)
        self.monasca_client = client.Client(
            "2_0",
            self.conf['monasca_url'],
            auth_url=self.conf['keystone_url'],
            username=self.conf['username'],
            password=self.conf['password'],
            project_name=self.conf['project_name'],
            # These two keywords used to be misspelled ("doamin"), so the
            # configured domain ids never reached the client under the
            # expected names.
            user_domain_id=self.conf['user_domain_id'],
            project_domain_id=self.conf['project_domain_id'],
            insecure=self.conf.get('insecure'),
            cacert=self.conf.get('cacert', None)
        )
        # Compute nodes might be disabled or set to maintenance mode so
        # freezer-dr needs to process only enabled nodes ...
        self.nodes = [node for node in self.get_compute_nodes()
                      if node['status'] == "enabled"]
        # register metric options in their groups and load their values
        self.__load_metrics()

    @classmethod
    def _aggregator(cls, name):
        """Return the builtin (``any`` or ``all``) for an aggregate option.

        :param name: 'any' or 'all'; a falsy value falls back to 'all'.
        :raises ValueError: if ``name`` is not a supported aggregate.
        """
        try:
            return cls._AGGREGATORS[name or 'all']
        except KeyError:
            raise ValueError(
                "Unsupported aggregate function: {0!r}".format(name)
            ) from None

    def _get_raw_data(self):
        """Return the raw alarm data fetched from Monasca.

        One alarm-list query is issued per enabled node and configured
        metric. You shouldn't call this function directly; use get_data().

        :return: dict contains:
            {
                hostname1: {
                    metric_name1: [{metric value 1}, {metric value 2}]
                    metric_name2: [{metric value 1}, {metric value 2}]
                },
                hostname2: {
                    metric_name1: [{metric value 1}, {metric value 2}]
                    metric_name2: [{metric value 1}, {metric value 2}]
                }
            }
        """
        data = {}
        for node in self.nodes:
            hostname = node['host']
            data[hostname] = {
                metric: self.monasca_client.alarms.list(
                    **self._build_metrics(metric=metric, hostname=hostname)
                )
                for metric in self.conf.metrics
            }
        return data

    def get_data(self):
        """Return normalized monitoring data from Monasca.

        Calls _get_raw_data() and keeps only the 'state' entry of every
        returned alarm.

        :return: dict contains::
            {
                hostname1: {
                    metric_name1: ['OK', 'ALARM', 'UNDETERMINED']
                    metric_name2: ['OK', 'OK', 'OK']
                },
                hostname2: {
                    metric_name1: ['OK', 'ALARM', 'OK']
                    metric_name2: ['ALARM', 'UNDETERMINED', 'OK']
                }
            }
        """
        normalized = {}
        # dict.items() replaces the Python 2 only iteritems().
        for host, metric_results in self._get_raw_data().items():
            normalized[host] = {
                metric_name: [alarm.get('state') for alarm in alarms]
                for metric_name, alarms in metric_results.items()
            }
        return normalized

    def process_failed(self, nodes=None, wait=1):
        """Re-check nodes after a delay and return those still down.

        :param nodes: list of node dicts (each with a 'host' key) to
            re-check; ``None`` is treated as an empty list.
        :param wait: seconds to sleep before querying Monasca again.
        :return: the subset of ``nodes`` whose host is reported down.
        """
        time.sleep(wait)
        data = self.get_data()
        # A set gives O(1) membership tests while filtering below.
        down_hosts = {
            down['host'] for down in self.analyze_nodes(nodes=data)
        }
        return [node for node in (nodes or [])
                if node['host'] in down_hosts]

    def get_metrics(self):
        """Lists all metrics

        :return: List of Metrics
        """
        return self.conf['metrics']

    def _build_metrics(self, metric, hostname=None):
        """Build the query to send to Monasca.

        :param metric: name of the configuration group describing the metric.
        :param hostname: host used as the 'hostname' dimension.
        :return: dict of keyword arguments for the alarm-list call.
        """
        metric_conf = CONF[metric]
        dimensions = {'hostname': hostname}
        # Configured dimensions are applied last, so they take precedence
        # over the default 'hostname' entry if they define the same key.
        dimensions.update(metric_conf.get('dimensions', {}))
        return {
            'metric_dimensions': dimensions,
            'metric_name': metric_conf['metric_name']
        }

    def analyze_nodes(self, nodes):
        """Return the hosts that are considered down.

        Every metric of a node is reduced to a boolean ("is this metric in
        ALARM?") by __process_metric; those booleans are then combined with
        the driver-level 'aggregate' option (any / all). Alarm states are:
            1. OK
            2. ALARM
            3. UNDETERMINED

        :param nodes: normalized data as returned by get_data().
        :return: list of ``{'host': hostname}`` dicts, one per down host.
        """
        aggregate = self._aggregator(self.conf.get('aggregate', 'all'))
        nodes_down = []
        for node, metrics in nodes.items():
            results = [
                self.__process_metric(node, metric_name, metric_data)
                for metric_name, metric_data in metrics.items()
            ]
            if not results:
                LOG.warning('No data available for node: %s', node)
                continue
            if aggregate(results):
                nodes_down.append({'host': node})
        return nodes_down

    def __process_metric(self, node, metric_name, metric_data):
        """Process metric values got from Monasca.

        Handles UNDETERMINED states and changes them to the required state
        (read from the metric's 'undetermined' option). If no metric data is
        found, a message is sent through the notifier, a warning is logged
        and the process exits with status 1.

        :param node: hostname the values belong to.
        :param metric_name: name of the metric's configuration group.
        :param metric_data: list of alarm states for this metric.
        :return: True if the metric counts as failed per its 'aggregate'
            option, False otherwise.
        :raises SystemExit: when ``metric_data`` is empty.
        """
        metric_conf = CONF[metric_name]
        # process UNDETERMINED State and change it to the required state
        fallback = metric_conf.get('undetermined', 'ALARM').upper()
        states = [
            state if state in ('OK', 'ALARM') else fallback
            for state in metric_data
        ]
        if not states:
            message = self._NO_DATA_MESSAGE.format(
                metric_name, str(states), node, metric_conf['metric_name'])
            self.notifier.notify(message)
            LOG.warning("No data found for metric: %s on host: %s",
                        metric_name, node)
            # Same effect as exit(1) without depending on the site module.
            raise SystemExit(1)
        # build the decision
        aggregate = self._aggregator(metric_conf.get('aggregate'))
        return aggregate(state == 'ALARM' for state in states)

    def is_alive(self):
        """Check whether the Monasca endpoint answers HTTP requests.

        Sends a HEAD request to the configured 'monasca_url'.

        :return: True if the response status is 200 or 401, otherwise False
            (including when the connection or the request fails).
        """
        url = urllib.parse.urlparse(self.conf.monasca_url)
        if url.scheme == 'https':
            http_connector = http.client.HTTPSConnection
        else:
            http_connector = http.client.HTTPConnection
        connection = None
        try:
            connection = http_connector(host=url.netloc)
            # An empty path would produce an invalid request line.
            connection.request('HEAD', url=url.path or '/')
            response = connection.getresponse()
        except (OSError, http.client.HTTPException):
            # socket.error is an alias of OSError; HTTPException covers
            # malformed responses and invalid URLs.
            return False
        finally:
            if connection is not None:
                connection.close()
        return getattr(response, 'status', None) in (200, 401)

    def get_info(self):
        """Return static name, version and author of this driver."""
        return {
            'name': 'Monasca Driver',
            'version': 1.0,
            'author': 'Hewlett-Packard Development Enterprise, L.P'
        }

    def get_compute_nodes(self):
        """Get a list of available compute hosts."""
        # Named os_client so it does not shadow the imported
        # monascaclient ``client`` module.
        os_client = utils.get_os_client()
        return os_client.novacomputes()

    def __load_metrics(self):
        """load custom sections created by user"""
        for metric in self.conf.metrics:
            CONF.register_opts(self.__metric_opts, group=metric)

    @property
    def __metric_opts(self):
        """List of options to be used in metric defined sections"""
        return [
            cfg.StrOpt("metric_name",
                       help="Metric Name used to log monitoring information"
                            " in Monasca",
                       required=True),
            cfg.DictOpt("dimensions",
                        default={},
                        help="Dict that contains dimensions information. "
                             "component:nova-compute,service:compute",
                        ),
            cfg.StrOpt("aggregate",
                       choices=["any", "all"],
                       help="How to consider the compute node is down. If you "
                            "metric reports many states, like checking "
                            "different services on the compute host, should we"
                            " consider if one component down all are down or"
                            " only if all components are down. Default is all."
                            " This means if all components fail, freezer-dr"
                            " will consider the host failed",
                       default='all'
                       ),
            cfg.StrOpt("undetermined",
                       choices=['OK', 'ALARM'],
                       default='ALARM',
                       help="How to handle UNDETERMINED states. It can be "
                            "ignored, will be considered OK state or can be "
                            "considered ALARM. Default is ALARM")
        ]