Implemented Monasca Monitoring Driver

Use Monasca to monitor the compute nodes.

Change-Id: If1ac08d6bbafc065796e89836aa6175e57ef0a54
Implements: blueprint implement-monasca-monitoring-driver
Depends-On: Id330e5500f07ff19b7a2f82df236a4e0108668c9
This commit is contained in:
Saad Zaher 2016-11-17 12:19:46 +00:00
parent 061144c6c8
commit 7ca4980a8c
2 changed files with 302 additions and 0 deletions

View File

@ -0,0 +1,302 @@
# (c) Copyright 2016 Hewlett-Packard Development Enterprise, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import httplib
import time
import urlparse
from monascaclient import client
from oslo_config import cfg
from oslo_log import log
from freezer_dr.common import utils
from freezer_dr.monitors.common import driver
# Global oslo.config configuration object. The driver in this module registers
# one option group per configured metric on it and reads those groups back.
CONF = cfg.CONF
# Module-level logger named after this module.
LOG = log.getLogger(__name__)
class MonascaDriver(driver.MonitorBaseDriver):
    """Monitoring driver that decides compute-node health from Monasca alarms.

    For every compute node and every configured metric the driver lists the
    matching Monasca alarms, reduces their states to a single "failed?"
    boolean per metric, and then combines the per-metric booleans into a
    per-node verdict. Both reductions are controlled by ``aggregate``
    options ('any' or 'all').

    Each name listed in the ``metrics`` option must have a configuration
    group of the same name providing the options returned by
    ``_metric_opts``.
    """

    # Driver-level options.
    # NOTE(review): presumably registered/loaded by the base driver, which is
    # not visible in this file -- confirm.
    _OPTS = [
        cfg.StrOpt('keystone_url',
                   help="Keystone Url for authentication",
                   required=True),
        cfg.StrOpt('username',
                   help="cloud user used to record monasca alerts and alarms",
                   required=True),
        cfg.StrOpt('password',
                   help="Cloud user's password",
                   required=True),
        cfg.StrOpt('project_name',
                   help='Project/Tenant name. Default is admin',
                   default='admin',
                   required=True),
        cfg.BoolOpt('insecure',
                    help='Use insecure connection.',
                    default=False),
        cfg.BoolOpt('include_catalog',
                    help='Include service catalog.',
                    default=True),
        cfg.StrOpt('interface',
                   help='Interface (internal, public, admin) ',
                   default='internal'),
        cfg.StrOpt('project_domain_id',
                   help="Project Domain Id. Default is default",
                   default='default'),
        cfg.StrOpt('user_domain_id',
                   help="User Domain Id. Default is default",
                   default='default'),
        cfg.StrOpt('cacert',
                   help='CA certificate. Default is None',
                   default=None),
        cfg.StrOpt('monasca_url',
                   help='Monasca endpoint URL. This is required to create a '
                        'monasca client instance '
                   ),
        cfg.ListOpt('metrics',
                    help='Monasca Metrics that needs to be checked. Each metric'
                         ' should be defined in a seperate section in the'
                         ' configuration file.',
                    default=['host_alive_status'],
                    required=True
                    ),
        cfg.StrOpt('aggregate',
                   choices=['any', 'all'],
                   default='all',
                   help="If more than one metric used and they reported "
                        "different states i.e.(a:failed, b:success) should we "
                        "evacuate the compute host if only one metric failed "
                        "(any) or only if all failed we evacuate (all). "
                        "Default is all")
    ]

    # Maps the permitted values of the 'aggregate' options onto the builtin
    # implementing them, so configuration text never has to be eval()'d.
    _AGGREGATORS = {'any': any, 'all': all}

    def __init__(self, backend_name):
        """Create the Monasca client and load node and metric information.

        :param backend_name: passed through unchanged to the base driver.
        """
        super(MonascaDriver, self).__init__(backend_name=backend_name)
        self.monasca_client = client.Client(
            "2_0",
            self.conf['monasca_url'],
            auth_url=self.conf['keystone_url'],
            username=self.conf['username'],
            password=self.conf['password'],
            project_name=self.conf['project_name'],
            # These two keywords were previously misspelled ("doamin"), so
            # the configured domain ids were not passed under the expected
            # names.
            user_domain_id=self.conf['user_domain_id'],
            project_domain_id=self.conf['project_domain_id'],
            interface=self.conf['interface'],
            include_catalog=self.conf.get('include_catalog'),
            insecure=self.conf.get('insecure'),
            cacert=self.conf.get('cacert', None)
        )
        self.nodes = self.get_compute_nodes()
        # register metric options in their groups and load their values
        self.load_metrics()

    def _get_raw_data(self):
        """Return the unprocessed alarm listings fetched from Monasca.

        Internal helper for ``get_data``; it issues one alarm-list request
        per (node, metric) pair.

        :return: dict shaped like::

            {
                hostname1: {
                    metric_name1: [{alarm 1}, {alarm 2}],
                    metric_name2: [{alarm 1}, {alarm 2}]
                },
                hostname2: {...}
            }
        """
        data = {}
        for node in self.nodes:
            hostname = node['host']
            data[hostname] = {}
            for metric in self.conf.metrics:
                query = self._build_metrics(metric=metric, hostname=hostname)
                data[hostname][metric] = self.monasca_client.alarms.list(
                    **query)
        return data

    def get_data(self):
        """Return the alarm states per host and metric.

        Calls ``_get_raw_data`` and keeps only the ``state`` field of every
        returned alarm.

        :return: dict shaped like::

            {
                hostname1: {
                    metric_name1: ['OK', 'ALARM', 'UNDETERMINED'],
                    metric_name2: ['OK', 'OK', 'OK']
                },
                hostname2: {...}
            }
        """
        normalized = {}
        for host, metric_results in self._get_raw_data().items():
            normalized[host] = {}
            for metric_name, alarms in metric_results.items():
                normalized[host][metric_name] = [
                    alarm.get('state') for alarm in alarms
                ]
        return normalized

    def process_failed(self, nodes=None, wait=1):
        """Re-check previously failed nodes after a delay.

        :param nodes: list of node dicts (each with a 'host' key) to
            re-check. ``None`` or an empty list yields an empty result.
        :param wait: seconds to sleep before querying Monasca again.
        :return: the entries of ``nodes`` that are still considered down.
        """
        if not nodes:
            # Nothing to confirm; the default of None used to raise a
            # TypeError when iterated below.
            return []
        time.sleep(wait)
        nodes_down = self.analyze_nodes(nodes=self.get_data())
        down_hosts = {down['host'] for down in nodes_down}
        return [node for node in nodes if node['host'] in down_hosts]

    def get_metrics(self):
        """Return the configured list of metric names."""
        return self.conf['metrics']

    def _build_metrics(self, metric, hostname=None):
        """Build the keyword arguments for an alarm-list request.

        :param metric: name of the metric's configuration group.
        :param hostname: value for the 'hostname' dimension.
        :return: dict with 'metric_dimensions' and 'metric_name' keys.
        """
        metric_conf = CONF[metric]
        dimensions = {'hostname': hostname}
        # Dimensions from the configuration are applied last, so they win
        # over the hostname set above if both define the same key.
        dimensions.update(metric_conf.get('dimensions', {}))
        return {
            'metric_dimensions': dimensions,
            'metric_name': metric_conf['metric_name']
        }

    def analyze_nodes(self, nodes):
        """Return the hosts that are considered down.

        Every metric of a host is first reduced to a boolean by
        ``__process_metric``; the booleans are then combined with the
        driver-level 'aggregate' option ('any' or 'all'). Alarm states are
        OK, ALARM and UNDETERMINED.

        :param nodes: dict in the shape returned by ``get_data``.
        :return: list of ``{'host': hostname}`` dicts, one per host down.
        """
        aggregate = self._AGGREGATORS[self.conf.get('aggregate', 'all')]
        nodes_down = []
        for hostname, metrics in nodes.items():
            verdicts = [
                self.__process_metric(metric_name, metric_data)
                for metric_name, metric_data in metrics.items()
            ]
            if not verdicts:
                # Without any metric there is nothing to base a decision on,
                # so the host is not reported as down.
                LOG.warning('No data available for node: %s', hostname)
                continue
            if aggregate(verdicts):
                nodes_down.append({'host': hostname})
        return nodes_down

    def __process_metric(self, metric_name, metric_data):
        """Reduce the alarm states of one metric to a single boolean.

        :param metric_name: name of the metric's configuration group.
        :param metric_data: list of alarm state strings.
        :return: True when the metric counts as failed under its own
            'aggregate' option.
        """
        metric_conf = CONF[metric_name]
        # Any state other than OK/ALARM (i.e. UNDETERMINED) is replaced by
        # the state configured through the 'undetermined' option.
        fallback = metric_conf.get('undetermined', 'ALARM').upper()
        states = [
            state if state in ('OK', 'ALARM') else fallback
            for state in metric_data
        ]
        aggregate = self._AGGREGATORS[metric_conf.get('aggregate', 'all')]
        # NOTE(review): an empty state list evaluates to True under 'all'
        # and False under 'any' -- confirm this is the intended outcome when
        # Monasca returns no alarms for a host.
        return aggregate(state == 'ALARM' for state in states)

    def is_alive(self):
        """Check whether the Monasca endpoint answers HTTP requests.

        Sends a HEAD request to ``monasca_url``.

        :return: True if the response status is 200 or 401, else False
            (including when the connection or the HTTP exchange fails).
        """
        url = urlparse.urlparse(self.conf.monasca_url)
        if url.scheme == 'https':
            http_connector = httplib.HTTPSConnection
        else:
            http_connector = httplib.HTTPConnection
        connection = None
        try:
            connection = http_connector(host=url.netloc)
            # An empty path would produce a malformed request line.
            connection.request('HEAD', url=url.path or '/')
            response = connection.getresponse()
            return response.status in (200, 401)
        except (httplib.socket.error, httplib.HTTPException):
            return False
        finally:
            if connection is not None:
                connection.close()

    def get_info(self):
        """Return static descriptive information about this driver."""
        return {
            'name': 'Monasca Driver',
            'version': 1.0,
            'author': 'Hewlett-Packard Development Enterprise, L.P'
        }

    def get_compute_nodes(self):
        """Get a list of available compute hosts."""
        # Named os_client so the imported monascaclient ``client`` module is
        # not shadowed.
        os_client = utils.get_os_client()
        return os_client.novacomputes()

    def load_metrics(self):
        """Register the per-metric options under each configured metric."""
        for metric in self.conf.metrics:
            CONF.register_opts(self._metric_opts, group=metric)

    @property
    def _metric_opts(self):
        """Options expected in every metric's configuration group."""
        return [
            cfg.StrOpt("metric_name",
                       help="Metric Name used to log monitoring information"
                            " in Monasca",
                       required=True),
            cfg.DictOpt("dimensions",
                        default={},
                        help="Dict that contains dimensions information. "
                             "component:nova-compute,service:compute",
                        ),
            cfg.StrOpt("aggregate",
                       choices=["any", "all"],
                       help="How to consider the compute node is down. If you "
                            "metric reports many states, like checking "
                            "different services on the compute host, should we"
                            " consider if one component down all are down or"
                            " only if all components are down. Default is all."
                            " This means if all components fail, freezer-dr"
                            " will consider the host failed",
                       default='all'
                       ),
            cfg.StrOpt("undetermined",
                       choices=['OK', 'ALARM'],
                       default='ALARM',
                       help="How to handle UNDETERMINED states. It can be "
                            "ignored, will be considered OK state or can be "
                            "considered ALARM. Default is ALARM")
        ]