Implemented Monasca Monitoring Driver

Use Monasca to monitor the compute nodes. Change-Id: If1ac08d6bbafc065796e89836aa6175e57ef0a54 Implements: blueprint implement-monasca-monitoring-driver Depends-On: Id330e5500f07ff19b7a2f82df236a4e0108668c9
2016-11-17 12:19:46 +00:00 · 2016-11-17 12:19:46 +00:00 · 7ca4980a8c
parent 061144c6c8
commit 7ca4980a8c
2 changed files with 302 additions and 0 deletions
--- a/freezer_dr/monitors/drivers/monasca/init.py
+++ b/freezer_dr/monitors/drivers/monasca/init.py
--- a/freezer_dr/monitors/drivers/monasca/driver.py
+++ b/freezer_dr/monitors/drivers/monasca/driver.py
@ -0,0 +1,302 @@
+# (c) Copyright 2016 Hewlett-Packard Development Enterprise, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import httplib
+import time
+import urlparse
+
+from monascaclient import client
+from oslo_config import cfg
+from oslo_log import log
+
+from freezer_dr.common import utils
+from freezer_dr.monitors.common import driver
+
+CONF = cfg.CONF
+LOG = log.getLogger(__name__)
+
+
+class MonascaDriver(driver.MonitorBaseDriver):
+
+    _OPTS = [
+        cfg.StrOpt('keystone_url',
+                   help="Keystone Url for authentication",
+                   required=True),
+        cfg.StrOpt('username',
+                   help="cloud user used to record monasca alerts and alarms",
+                   required=True),
+        cfg.StrOpt('password',
+                   help="Cloud user's password",
+                   required=True),
+        cfg.StrOpt('project_name',
+                   help='Project/Tenant name. Default is admin',
+                   default='admin',
+                   required=True),
+        cfg.BoolOpt('insecure',
+                    help='Use insecure connection.',
+                    default=False),
+        cfg.BoolOpt('include_catalog',
+                    help='Include service catalog.',
+                    default=True),
+        cfg.StrOpt('interface',
+                   help='Interface (internal, public, admin) ',
+                   default='internal'),
+        cfg.StrOpt('project_domain_id',
+                   help="Project Domain Id. Default is default",
+                   default='default'),
+        cfg.StrOpt('user_domain_id',
+                   help="User Domain Id. Default is default",
+                   default='default'),
+        cfg.StrOpt('cacert',
+                   help='CA certificate. Default is None',
+                   default=None),
+        cfg.StrOpt('monasca_url',
+                   help='Monasca endpoint URL. This is required to create a '
+                        'monasca client instance '
+                   ),
+        cfg.ListOpt('metrics',
+                    help='Monasca Metrics that needs to be checked. Each metric'
+                         ' should be defined in a seperate section in the'
+                         ' configuration file.',
+                    default=['host_alive_status'],
+                    required=True
+                    ),
+        cfg.StrOpt('aggregate',
+                   choices=['any', 'all'],
+                   default='all',
+                   help="If more than one metric used and they reported "
+                        "different states i.e.(a:failed, b:success) should we "
+                        "evacuate the compute host if only one metric failed "
+                        "(any) or only if all failed we evacuate (all). "
+                        "Default is all")
+    ]
+
+    def __init__(self, backend_name):
+        super(MonascaDriver, self).__init__(backend_name=backend_name)
+        self.monasca_client = client.Client(
+            "2_0",
+            self.conf['monasca_url'],
+            auth_url=self.conf['keystone_url'],
+            username=self.conf['username'],
+            password=self.conf['password'],
+            project_name=self.conf['project_name'],
+            user_doamin_id=self.conf['user_domain_id'],
+            project_doamin_id=self.conf['project_domain_id'],
+            interface=self.conf['interface'],
+            include_catalog=self.conf.get('include_catalog'),
+            insecure=self.conf.get('insecure'),
+            cacert=self.conf.get('cacert', None)
+            )
+        self.nodes = self.get_compute_nodes()
+        # register metric options in their groups and load their values
+        self.load_metrics()
+
+    def _get_raw_data(self):
+        """ This function returns the raw data we got from Monasca before
+        processing and normalizing. You shouldn't call this function directly.
+        :return: dict contains:
+        {
+            hostname1: {
+                metric_name1: [{metric value 1}, {metric value 2}]
+                metric_name2: [{metric value 1}, {metric value 2}]
+            },
+            hostname2: {
+                metric_name1: [{metric value 1}, {metric value 2}]
+                metric_name2: [{metric value 1}, {metric value 2}]
+            }
+        }
+        """
+        data = {}
+        for node in self.nodes:
+            data[node['host']] = {}
+            for metric in self.conf.metrics:
+                data[node['host']][metric] = self.monasca_client.alarms.list(
+                    **self._build_metrics(
+                        metric=metric,
+                        hostname=node['host']
+                    )
+                )
+        return data
+
+    def get_data(self):
+        """This function returns monitoring data from Monasca. It calls
+         _get_raw_data to get raw data and then process these data returns
+        a normalized dict
+        :return: doct contains:
+        {
+            hostname1: {
+                metric_name1: ['Ok', 'ALARM', 'UNDETERMINED']
+                metric_name2: ['OK', 'OK', 'OK']
+            },
+            hostname2: {
+                metric_name1: ['Ok', 'ALARM', 'OK']
+                metric_name2: ['ALARM', 'UNDETERMINED', 'OK']
+            }
+        }
+        """
+        data = self._get_raw_data()
+        data2 = {}
+        for host, metric_results in data.iteritems():
+            data2[host] = {}
+            for metric_name, metric_values in metric_results.iteritems():
+                data2[host][metric_name] = []
+                for metric_value in metric_values:
+                    data2[host][metric_name].append(metric_value.get('state'))
+        return data2
+
+    def process_failed(self, nodes=None, wait=1):
+        time.sleep(wait)
+        data = self.get_data()
+        nodes_down = self.analyze_nodes(nodes=data)
+        # Thanks Eldar :) for sets
+        nodes_down_hosts = set([dnode['host'] for dnode in nodes_down])
+        return [node for node in nodes if node['host'] in nodes_down_hosts]
+
+    def get_metrics(self):
+        """Lists all metrics
+        :return: List of Metrics
+        """
+        return self.conf['metrics']
+
+    def _build_metrics(self, metric, hostname=None):
+        metric = CONF[metric]
+        dimensions = {'hostname': hostname}
+        dimensions.update(metric.get('dimensions', {}))
+
+        fields = {
+            'metric_dimensions': dimensions,
+            'metric_name': metric['metric_name']
+        }
+        return fields
+
+    def analyze_nodes(self, nodes):
+        """It will check if the nodes are in 'OK' state or not. If not they
+        will considered down. We have three states as follow:
+            1. OK
+            2. ALARM
+            3. UNDEFINED
+        """
+        # @todo use list comprehension instead of loops
+        # list below is correct and should return the extact same value like
+        # the two nested for loops
+        # nodes_down = [
+        #     {"host": hostname} for hostname, metrics in nodes.iteritems() if
+        #   [True for name, values in metrics.iteritems() if 'ALARM' in values]
+        #     ]
+        nodes_data = []
+        for node, metrics in nodes.iteritems():
+            node_data = {node: []}
+            for metric_name, metric_data in metrics.iteritems():
+                node_data[node].append(self.__process_metric(metric_name, metric_data))
+            nodes_data.append(node_data)
+
+        aggregate = self.conf.get('aggregate', 'all')
+        aggregate += '({0})'
+        nodes_down = []
+        for node_data in nodes_data:
+            node_info = {}
+            for node, data in node_data.iteritems():
+                if not data:
+                    LOG.warning('No data available for node: {0}'.format(node))
+                    continue
+                node_info[node] = eval(aggregate.format(data))
+            if node_info:
+                nodes_down.append(node_info)
+
+        if not nodes_down:
+            return []
+        return [
+            {'host': host.keys()[0]} for host in nodes_down
+            if True in host.values()
+            ]
+
+    def __process_metric(self, metric_name, metric_data):
+        metric_conf = CONF[metric_name]
+        # process UNDETERMINED State and change it to the required state
+        metric_data = [
+            i if i in ['OK', 'ALARM'] else
+            metric_conf.get('undetermined', 'ALARM').upper()
+            for i in metric_data
+        ]
+        # build the decision
+        aggregate = metric_conf.get('aggregate')
+        aggregate += "(x=='ALARM' for x in metric_data)"
+        return eval(aggregate)
+
+    def is_alive(self):
+        url = urlparse.urlparse(self.conf.monasca_url)
+        if url.scheme == 'https':
+            http_connector = httplib.HTTPSConnection
+        else:
+            http_connector = httplib.HTTPConnection
+        try:
+            connection = http_connector(host=url.netloc)
+            connection.request('HEAD', url=url.path)
+            response = connection.getresponse()
+        except httplib.socket.error:
+            return False
+        try:
+            if getattr(response, 'status') in [200, 401]:
+                return True
+        except AttributeError:
+            pass
+        return False
+
+    def get_info(self):
+        return {
+            'name': 'Monasca Driver',
+            'version': 1.0,
+            'author': 'Hewlett-Packard Development Enterprise, L.P'
+                }
+
+    def get_compute_nodes(self):
+        """Get a list of available compute hosts."""
+        client = utils.get_os_client()
+        return client.novacomputes()
+
+    def load_metrics(self):
+        for metric in self.conf.metrics:
+            CONF.register_opts(self._metric_opts, group=metric)
+
+    @property
+    def _metric_opts(self):
+        return [
+            cfg.StrOpt("metric_name",
+                       help="Metric Name used to log monitoring information"
+                            " in Monasca",
+                       required=True),
+            cfg.DictOpt("dimensions",
+                        default={},
+                        help="Dict that contains dimensions information. "
+                             "component:nova-compute,service:compute",
+                        ),
+            cfg.StrOpt("aggregate",
+                       choices=["any", "all"],
+                       help="How to consider the compute node is down. If you "
+                            "metric reports many states, like checking "
+                            "different services on the compute host, should we"
+                            " consider if one component down all are down or"
+                            " only if all components are down. Default is all."
+                            " This means if all components fail, freezer-dr"
+                            " will consider the host failed",
+                       default='all'
+                       ),
+            cfg.StrOpt("undetermined",
+                       choices=['OK', 'ALARM'],
+                       default='ALARM',
+                       help="How to handle UNDETERMINED states. It can be "
+                            "ignored, will be considered OK state or can be "
+                            "considered ALARM. Default is ALARM")
+
+        ]