Implemented Monasca Monitoring Driver
Use Monasca to monitor the compute nodes. Change-Id: If1ac08d6bbafc065796e89836aa6175e57ef0a54 Implements: blueprint implement-monasca-monitoring-driver Depends-On: Id330e5500f07ff19b7a2f82df236a4e0108668c9
This commit is contained in:
parent
061144c6c8
commit
7ca4980a8c
|
@ -0,0 +1,302 @@
|
|||
# (c) Copyright 2016 Hewlett-Packard Development Enterprise, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import httplib
|
||||
import time
|
||||
import urlparse
|
||||
|
||||
from monascaclient import client
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
|
||||
from freezer_dr.common import utils
|
||||
from freezer_dr.monitors.common import driver
|
||||
|
||||
CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
class MonascaDriver(driver.MonitorBaseDriver):
|
||||
|
||||
_OPTS = [
|
||||
cfg.StrOpt('keystone_url',
|
||||
help="Keystone Url for authentication",
|
||||
required=True),
|
||||
cfg.StrOpt('username',
|
||||
help="cloud user used to record monasca alerts and alarms",
|
||||
required=True),
|
||||
cfg.StrOpt('password',
|
||||
help="Cloud user's password",
|
||||
required=True),
|
||||
cfg.StrOpt('project_name',
|
||||
help='Project/Tenant name. Default is admin',
|
||||
default='admin',
|
||||
required=True),
|
||||
cfg.BoolOpt('insecure',
|
||||
help='Use insecure connection.',
|
||||
default=False),
|
||||
cfg.BoolOpt('include_catalog',
|
||||
help='Include service catalog.',
|
||||
default=True),
|
||||
cfg.StrOpt('interface',
|
||||
help='Interface (internal, public, admin) ',
|
||||
default='internal'),
|
||||
cfg.StrOpt('project_domain_id',
|
||||
help="Project Domain Id. Default is default",
|
||||
default='default'),
|
||||
cfg.StrOpt('user_domain_id',
|
||||
help="User Domain Id. Default is default",
|
||||
default='default'),
|
||||
cfg.StrOpt('cacert',
|
||||
help='CA certificate. Default is None',
|
||||
default=None),
|
||||
cfg.StrOpt('monasca_url',
|
||||
help='Monasca endpoint URL. This is required to create a '
|
||||
'monasca client instance '
|
||||
),
|
||||
cfg.ListOpt('metrics',
|
||||
help='Monasca Metrics that needs to be checked. Each metric'
|
||||
' should be defined in a seperate section in the'
|
||||
' configuration file.',
|
||||
default=['host_alive_status'],
|
||||
required=True
|
||||
),
|
||||
cfg.StrOpt('aggregate',
|
||||
choices=['any', 'all'],
|
||||
default='all',
|
||||
help="If more than one metric used and they reported "
|
||||
"different states i.e.(a:failed, b:success) should we "
|
||||
"evacuate the compute host if only one metric failed "
|
||||
"(any) or only if all failed we evacuate (all). "
|
||||
"Default is all")
|
||||
]
|
||||
|
||||
def __init__(self, backend_name):
|
||||
super(MonascaDriver, self).__init__(backend_name=backend_name)
|
||||
self.monasca_client = client.Client(
|
||||
"2_0",
|
||||
self.conf['monasca_url'],
|
||||
auth_url=self.conf['keystone_url'],
|
||||
username=self.conf['username'],
|
||||
password=self.conf['password'],
|
||||
project_name=self.conf['project_name'],
|
||||
user_doamin_id=self.conf['user_domain_id'],
|
||||
project_doamin_id=self.conf['project_domain_id'],
|
||||
interface=self.conf['interface'],
|
||||
include_catalog=self.conf.get('include_catalog'),
|
||||
insecure=self.conf.get('insecure'),
|
||||
cacert=self.conf.get('cacert', None)
|
||||
)
|
||||
self.nodes = self.get_compute_nodes()
|
||||
# register metric options in their groups and load their values
|
||||
self.load_metrics()
|
||||
|
||||
def _get_raw_data(self):
|
||||
""" This function returns the raw data we got from Monasca before
|
||||
processing and normalizing. You shouldn't call this function directly.
|
||||
:return: dict contains:
|
||||
{
|
||||
hostname1: {
|
||||
metric_name1: [{metric value 1}, {metric value 2}]
|
||||
metric_name2: [{metric value 1}, {metric value 2}]
|
||||
},
|
||||
hostname2: {
|
||||
metric_name1: [{metric value 1}, {metric value 2}]
|
||||
metric_name2: [{metric value 1}, {metric value 2}]
|
||||
}
|
||||
}
|
||||
"""
|
||||
data = {}
|
||||
for node in self.nodes:
|
||||
data[node['host']] = {}
|
||||
for metric in self.conf.metrics:
|
||||
data[node['host']][metric] = self.monasca_client.alarms.list(
|
||||
**self._build_metrics(
|
||||
metric=metric,
|
||||
hostname=node['host']
|
||||
)
|
||||
)
|
||||
return data
|
||||
|
||||
def get_data(self):
|
||||
"""This function returns monitoring data from Monasca. It calls
|
||||
_get_raw_data to get raw data and then process these data returns
|
||||
a normalized dict
|
||||
:return: doct contains:
|
||||
{
|
||||
hostname1: {
|
||||
metric_name1: ['Ok', 'ALARM', 'UNDETERMINED']
|
||||
metric_name2: ['OK', 'OK', 'OK']
|
||||
},
|
||||
hostname2: {
|
||||
metric_name1: ['Ok', 'ALARM', 'OK']
|
||||
metric_name2: ['ALARM', 'UNDETERMINED', 'OK']
|
||||
}
|
||||
}
|
||||
"""
|
||||
data = self._get_raw_data()
|
||||
data2 = {}
|
||||
for host, metric_results in data.iteritems():
|
||||
data2[host] = {}
|
||||
for metric_name, metric_values in metric_results.iteritems():
|
||||
data2[host][metric_name] = []
|
||||
for metric_value in metric_values:
|
||||
data2[host][metric_name].append(metric_value.get('state'))
|
||||
return data2
|
||||
|
||||
def process_failed(self, nodes=None, wait=1):
|
||||
time.sleep(wait)
|
||||
data = self.get_data()
|
||||
nodes_down = self.analyze_nodes(nodes=data)
|
||||
# Thanks Eldar :) for sets
|
||||
nodes_down_hosts = set([dnode['host'] for dnode in nodes_down])
|
||||
return [node for node in nodes if node['host'] in nodes_down_hosts]
|
||||
|
||||
def get_metrics(self):
|
||||
"""Lists all metrics
|
||||
:return: List of Metrics
|
||||
"""
|
||||
return self.conf['metrics']
|
||||
|
||||
def _build_metrics(self, metric, hostname=None):
|
||||
metric = CONF[metric]
|
||||
dimensions = {'hostname': hostname}
|
||||
dimensions.update(metric.get('dimensions', {}))
|
||||
|
||||
fields = {
|
||||
'metric_dimensions': dimensions,
|
||||
'metric_name': metric['metric_name']
|
||||
}
|
||||
return fields
|
||||
|
||||
def analyze_nodes(self, nodes):
|
||||
"""It will check if the nodes are in 'OK' state or not. If not they
|
||||
will considered down. We have three states as follow:
|
||||
1. OK
|
||||
2. ALARM
|
||||
3. UNDEFINED
|
||||
"""
|
||||
# @todo use list comprehension instead of loops
|
||||
# list below is correct and should return the extact same value like
|
||||
# the two nested for loops
|
||||
# nodes_down = [
|
||||
# {"host": hostname} for hostname, metrics in nodes.iteritems() if
|
||||
# [True for name, values in metrics.iteritems() if 'ALARM' in values]
|
||||
# ]
|
||||
nodes_data = []
|
||||
for node, metrics in nodes.iteritems():
|
||||
node_data = {node: []}
|
||||
for metric_name, metric_data in metrics.iteritems():
|
||||
node_data[node].append(self.__process_metric(metric_name, metric_data))
|
||||
nodes_data.append(node_data)
|
||||
|
||||
aggregate = self.conf.get('aggregate', 'all')
|
||||
aggregate += '({0})'
|
||||
nodes_down = []
|
||||
for node_data in nodes_data:
|
||||
node_info = {}
|
||||
for node, data in node_data.iteritems():
|
||||
if not data:
|
||||
LOG.warning('No data available for node: {0}'.format(node))
|
||||
continue
|
||||
node_info[node] = eval(aggregate.format(data))
|
||||
if node_info:
|
||||
nodes_down.append(node_info)
|
||||
|
||||
if not nodes_down:
|
||||
return []
|
||||
return [
|
||||
{'host': host.keys()[0]} for host in nodes_down
|
||||
if True in host.values()
|
||||
]
|
||||
|
||||
def __process_metric(self, metric_name, metric_data):
|
||||
metric_conf = CONF[metric_name]
|
||||
# process UNDETERMINED State and change it to the required state
|
||||
metric_data = [
|
||||
i if i in ['OK', 'ALARM'] else
|
||||
metric_conf.get('undetermined', 'ALARM').upper()
|
||||
for i in metric_data
|
||||
]
|
||||
# build the decision
|
||||
aggregate = metric_conf.get('aggregate')
|
||||
aggregate += "(x=='ALARM' for x in metric_data)"
|
||||
return eval(aggregate)
|
||||
|
||||
def is_alive(self):
|
||||
url = urlparse.urlparse(self.conf.monasca_url)
|
||||
if url.scheme == 'https':
|
||||
http_connector = httplib.HTTPSConnection
|
||||
else:
|
||||
http_connector = httplib.HTTPConnection
|
||||
try:
|
||||
connection = http_connector(host=url.netloc)
|
||||
connection.request('HEAD', url=url.path)
|
||||
response = connection.getresponse()
|
||||
except httplib.socket.error:
|
||||
return False
|
||||
try:
|
||||
if getattr(response, 'status') in [200, 401]:
|
||||
return True
|
||||
except AttributeError:
|
||||
pass
|
||||
return False
|
||||
|
||||
def get_info(self):
|
||||
return {
|
||||
'name': 'Monasca Driver',
|
||||
'version': 1.0,
|
||||
'author': 'Hewlett-Packard Development Enterprise, L.P'
|
||||
}
|
||||
|
||||
def get_compute_nodes(self):
|
||||
"""Get a list of available compute hosts."""
|
||||
client = utils.get_os_client()
|
||||
return client.novacomputes()
|
||||
|
||||
def load_metrics(self):
|
||||
for metric in self.conf.metrics:
|
||||
CONF.register_opts(self._metric_opts, group=metric)
|
||||
|
||||
@property
|
||||
def _metric_opts(self):
|
||||
return [
|
||||
cfg.StrOpt("metric_name",
|
||||
help="Metric Name used to log monitoring information"
|
||||
" in Monasca",
|
||||
required=True),
|
||||
cfg.DictOpt("dimensions",
|
||||
default={},
|
||||
help="Dict that contains dimensions information. "
|
||||
"component:nova-compute,service:compute",
|
||||
),
|
||||
cfg.StrOpt("aggregate",
|
||||
choices=["any", "all"],
|
||||
help="How to consider the compute node is down. If you "
|
||||
"metric reports many states, like checking "
|
||||
"different services on the compute host, should we"
|
||||
" consider if one component down all are down or"
|
||||
" only if all components are down. Default is all."
|
||||
" This means if all components fail, freezer-dr"
|
||||
" will consider the host failed",
|
||||
default='all'
|
||||
),
|
||||
cfg.StrOpt("undetermined",
|
||||
choices=['OK', 'ALARM'],
|
||||
default='ALARM',
|
||||
help="How to handle UNDETERMINED states. It can be "
|
||||
"ignored, will be considered OK state or can be "
|
||||
"considered ALARM. Default is ALARM")
|
||||
|
||||
]
|
Loading…
Reference in New Issue