Building pluggable evacuators - still needs some work!

Change-Id: I03ab6d72cecbb05bed9366e7959cf2b573dd0047
2016-01-06 18:22:57 +00:00 · 2016-01-06 18:22:57 +00:00 · 043a677238
parent da9fbe0ee7
commit 043a677238
13 changed files with 358 additions and 14 deletions
--- a/9
+++ b/9
@ -0,0 +1,9 @@
+# hints for Saad
+We need to split the software so that a monitoring component calls an evacuation component.
+We need to decide whether we will process hosts one by one or all hosts at once.
+Do we need to use threading?
+
+In docs/diagram.pdf: "Stop monitoring this node and alert admin." How can we achieve something like that with the native driver without something that maintains the status of the previous run?
+
+After adding the notification driver, we can handle failed nodes by triggering the driver to send a notification directly to the admin.
+
--- a/etc/osha.conf.sample
+++ b/etc/osha.conf.sample
@ -91,6 +91,30 @@
 #instance_uuid_format = "[instance: %(uuid)s] "


+[evacuation]
+
+#
+# From osha
+#
+
+# Evacuation driver to load, given as the dotted Python path of the driver
+# class (string value)
+#driver = osha.evacuators.drivers.osha.standard.OshaStandardEvacuator
+
+# Time in seconds to wait between retries to disable compute node or put it in
+# maintenance mode. Default 10 seconds (integer value)
+#wait = 10
+
+# Number of retries to put node in maintenance mode before reporting failure to
+# evacuate the node (integer value)
+#retries = 1
+
+# Dict of kwargs to be passed to the evacuator driver. If your evacuator needs
+# additional arguments, list them as key0:value0, key1:value1, ... (dict
+# value)
+#options =
+
+
 [fencer]

 #
@ -125,7 +149,7 @@
 # Openstack auth URI i.e. http://controller:5000 (string value)
 #auth_uri = <None>

-# Openstack auth URL i.e. http://controller:35357 (string value)
+# Openstack auth URL i.e. http://controller:35357/v3 (string value)
 #auth_url = <None>

 # Openstack auth plugin i.e. ( password, token, ...) password is the only
--- a/osha/common/config.py
+++ b/osha/common/config.py
@ -71,7 +71,7 @@ _KEYSTONE_AUTH_TOKEN = [
               help='Openstack auth URI i.e. http://controller:5000',
               dest='auth_uri'),
    cfg.StrOpt('auth_url',
-               help='Openstack auth URL i.e. http://controller:35357',
+               help='Openstack auth URL i.e. http://controller:35357/v3',
               dest='auth_url'),
    cfg.StrOpt('auth_plugin',
               help='Openstack auth plugin i.e. ( password, token, ...) '
@ -106,6 +106,33 @@ _KEYSTONE_AUTH_TOKEN = [
 ]


+# Options registered under the [evacuation] group: which evacuation driver to
+# load and the wait/retries/options values handed to it.
+_EVACUATION = [
+    cfg.StrOpt('driver',
+               default='osha.evacuators.drivers.osha.standard.'
+                       'OshaStandardEvacuator',
+               help='Evacuation driver to load, given as the dotted Python '
+                    'path of the driver class',
+               dest='driver'),
+    cfg.IntOpt('wait',
+               default=10,
+               help='Time in seconds to wait between retries to disable compute'
+                    ' node or put it in maintenance mode. Default 10 seconds',
+               dest='wait'),
+    cfg.IntOpt('retries',
+               default=1,
+               help='Number of retries to put node in maintenance mode before '
+                    'reporting failure to evacuate the node',
+               dest='retries'),
+    cfg.DictOpt('options',
+                default={},
+                help='Dict contains kwargs to be passed to the evacuator driver'
+                     '. In case you have additional args needs to be passed to '
+                     'your evacuator please, list them as key0:value0, '
+                     'key1:value1, ....',
+                dest='options')
+]
+
+
 def build_os_options():
    osclient_opts = [
        cfg.StrOpt('os-username',
@ -200,6 +227,14 @@ def configure():
    CONF.register_group(fencers_grp)
    CONF.register_opts(_FENCER, group='fencer')

+    # Evacuation Section :)
+    evacuators_grp = cfg.OptGroup('evacuation',
+                                  title='Evacuation Options',
+                                  help='Evacuation Driver/plugin opts to be '
+                                       'used to Evacuate compute nodes')
+    CONF.register_group(evacuators_grp)
+    CONF.register_opts(_EVACUATION, group='evacuation')
+
    # Osha Auth
    keystone_grp = cfg.OptGroup('keystone_authtoken',
                                title='Keystone Auth Options',
@ -239,7 +274,8 @@ def list_opts():
        None: _COMMON,
        'monitoring': _MONITORS,
        'keystone_authtoken': _KEYSTONE_AUTH_TOKEN,
-        'fencer': _FENCER
+        'fencer': _FENCER,
+        'evacuation': _EVACUATION
    }

    return _OPTS.items()
--- a/osha/common/osclient.py
+++ b/osha/common/osclient.py
@ -104,14 +104,26 @@ class OSClient:
                                       endpoint_type=self.endpoint_type)
        self.authSession = new_sess
        evacuated_nodes = []
+        print "Nodes", nodes
        for node in nodes:
            hypervisors = nova.hypervisors.search(node.get('host'), True)
+            print "Hypervisor found is:", hypervisors
            for hypervisor in hypervisors:
                host = {'host': node.get('host'), 'servers': hypervisor.servers}
                evacuated_nodes.append(host)
                for server in hypervisor.servers:
-                    output = nova.servers.evacuate(server.get('uuid'),
+                    try:
+                        output = nova.servers.evacuate(server.get('uuid'),
                                                   on_shared_storage=True)
+                    except Exception as e:
+                        print "ERRORORRRROROROROROROROROROROROROROROROROROROROR"
+                        print e
+                    for i in range(0, 100):
+                        print "-",
+                        if i == 50:
+                            print "Evacuation Result !",
+                    print
+
                    print output
                    exit()

@ -136,16 +148,17 @@ class OSClient:
        auth_session = session.Session(auth=self.authSession.auth)
        return auth_session

-    def get_node_status(self, hostname):
+    def get_node_status(self, node):
        """
        Check the node nova-service status and if it's disabled or not
-        :param hostname: of the required node
-        :return: return dict contains the node status if it's disabled or not !
+        :param node: dict contains node info
+        :return: True or False. True => node disabled, False => node is enabled
+        or unknow status !
        """
        nova = novaclient.Client(session=self.authSession,
                                 endpoint_type=self.endpoint_type)
        try:
-            node = nova.services.find(host=hostname)
+            node = nova.services.find(host=node.get('host'))
            print node
        except Exception as e:
            LOG.error(e)
@ -153,14 +166,17 @@ class OSClient:

        if not node:
            return False
-        return node.to_dict()
+        node = node.to_dict()
+        if node.get('status') == 'disabled':
+            return True
+        return False

-    def disable_node(self, hostname):
+    def disable_node(self, node):
        auth_session = session.Session(auth=self.authSession.auth)
        nova = novaclient.Client(session=auth_session,
                                 endpoint_type=self.endpoint_type)
        try:
-            node = nova.services.find(host=hostname)
+            node = nova.services.find(host=node.get('host'))
        except Exception as e:
            LOG.error(e)
            return False
@ -181,3 +197,14 @@ class OSClient:
            return False

        return True
+
+    def get_hypervisor_instances(self, node):
+        """
+        List the servers nova reports for the hypervisor matching a node
+        :param node: dict contains node info; its 'host' value is used as the
+        hypervisor search pattern
+        :return: the 'servers' list of the first matching hypervisor, or an
+        empty list when no hypervisor matches or it reports no servers
+        """
+        auth_session = session.Session(auth=self.authSession.auth)
+        nova = novaclient.Client(session=auth_session,
+                                 endpoint_type=self.endpoint_type)
+        hypervisors = nova.hypervisors.search(node.get('host'), True)
+        if not hypervisors:
+            return []
+        # nova can omit the 'servers' field for a hypervisor that hosts no
+        # instances, in which case the attribute is missing on the resource;
+        # treat that as "no instances" instead of raising AttributeError.
+        return getattr(hypervisors[0], 'servers', [])
+
+
--- a/osha/common/utils.py
+++ b/osha/common/utils.py
@ -13,6 +13,12 @@
 # limitations under the License.

 import os
+from osha.common.osclient import OSClient
+from oslo_config import cfg
+from oslo_log import log
+
+CONF = cfg.CONF
+LOG = log.getLogger(__name__)


 def env(*env_vars, **kwargs):
@ -21,3 +27,24 @@ def env(*env_vars, **kwargs):
        if value:
            return value
    return kwargs.get('default', '')
+
+
+def get_os_client():
+    """
+    Build an OSClient from the [keystone_authtoken] section of the
+    configuration file
+    :return: OSClient instance created with the configured credentials
+    """
+    auth_conf = CONF.get('keystone_authtoken')
+    # OSClient names the auth URL argument 'authurl'; every other argument
+    # shares its name with the configuration option it is read from.
+    client_kwargs = {'authurl': auth_conf.get('auth_url')}
+    for option in ('username', 'password', 'project_name', 'user_domain_id',
+                   'project_domain_id', 'project_domain_name',
+                   'user_domain_name'):
+        client_kwargs[option] = auth_conf.get(option)
+    return OSClient(**client_kwargs)
--- a/osha/evacuators/init.py
+++ b/osha/evacuators/init.py
@ -0,0 +1 @@
+__author__ = 'saad'
--- a/osha/evacuators/common/init.py
+++ b/osha/evacuators/common/init.py
@ -0,0 +1 @@
+__author__ = 'saad'
--- a/osha/evacuators/common/driver.py
+++ b/osha/evacuators/common/driver.py
@ -0,0 +1,90 @@
+# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class EvacuatorBaseDriver(object):
+    """
+    Abstract base class that all evacuation drivers should implement so they
+    expose a unified interface
+    """
+
+    def __init__(self, wait, retries, **kwargs):
+        """
+        Initialize Evacuation driver with the config args
+        :param wait: time in seconds that the evacuator should wait before
+        retrying to disable the node
+        :param retries: Number of times the evacuator will try to disable the
+        compute node
+        :param kwargs: Dict of arguments that any future driver may need to
+        load from the config file; stored as self.options
+        :return: None
+        """
+        self.wait = wait
+        self.retries = retries
+        self.options = kwargs
+
+    @abc.abstractmethod
+    def disable_node(self, node):
+        """
+        Disable the compute node from accepting any new VMs or requests
+        :param node: dict contains node's hostname
+        :return: True or False
+        """
+        pass
+
+    @abc.abstractmethod
+    def is_node_disabled(self, node):
+        """
+        Check if node is already disabled or not
+        :param node: dict contains node's hostname
+        :return: True or False
+        """
+        pass
+
+    @abc.abstractmethod
+    def evacuate_nodes(self, nodes):
+        """
+        Will evacuate all running VMs on the required nodes
+        :param nodes: list of nodes
+        :return: list of nodes with updated status
+        """
+        pass
+
+    @abc.abstractmethod
+    def get_node_instances(self, node):
+        """
+        List instances on a compute host
+        :param node: dict contains node's hostname
+        :return: List contains running VMs on a given node
+        """
+        pass
+
+    def get_info(self):
+        """
+        Get Driver Information
+        NOTE: this method is not abstract; the default implementation here
+        has no body and therefore returns None.
+        :return: Dict contains driver information (None unless overridden)
+        """
+        pass
+
+    @abc.abstractmethod
+    def get_node_status(self, node):
+        """
+        Check the node status and report it
+        :param node: dict contains node's hostname
+        :return: dict with key 'status': 'True or False'
+        NOTE(review): EvacuationManager.evacuate treats a falsy return value
+        as "the node was not disabled", i.e. it uses the result as a boolean;
+        confirm which return type is the intended contract.
+        """
+        pass
--- a/osha/evacuators/common/manager.py
+++ b/osha/evacuators/common/manager.py
@ -0,0 +1,83 @@
+# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from oslo_config import cfg
+from oslo_log import log
+from oslo_utils import importutils
+from osha.fencers.common.manager import FencerManager
+
+CONF = cfg.CONF
+LOG = log.getLogger(__name__)
+
+
+class EvacuationManager(object):
+    """
+    Load the configured evacuation driver and run the evacuation workflow:
+    disable the given compute nodes, optionally fence them, collect their
+    instances and ask the driver to evacuate them.
+    """
+
+    # Seconds to pause right before asking the driver to evacuate.
+    # NOTE(review): presumably gives the fenced nodes time to go down before
+    # their instances are rebuilt elsewhere; confirm the intended value.
+    EVACUATION_DELAY = 30
+
+    def __init__(self, enable_fencing=True):
+        """
+        Import the driver named in the [evacuation] config section
+        :param enable_fencing: when True the disabled nodes are passed through
+        FencerManager before their instances are evacuated
+        """
+        evcuation_conf = CONF.get('evacuation')
+        self.driver = importutils.import_object(
+            evcuation_conf.get('driver'),
+            evcuation_conf.get('wait'),
+            evcuation_conf.get('retries'),
+            **evcuation_conf.get('options')
+        )
+        self.enable_fencing = enable_fencing
+        self.wait = evcuation_conf.get('wait')
+        # NOTE: the attribute name 'retires' (sic) is kept as-is so existing
+        # users of this attribute keep working.
+        self.retires = evcuation_conf.get('retries', 1)
+        # always try at least once
+        if self.retires <= 0:
+            self.retires = 1
+
+    def evacuate(self, nodes):
+        """
+        Disable, optionally fence and then evacuate the given nodes
+        :param nodes: list of node dicts; each dict gets a 'status' key and,
+        when it reaches the evacuation step, an 'instances' key. The list
+        itself is not modified.
+        :return: whatever the driver's evacuate_nodes() returns for the nodes
+        that were disabled successfully
+        """
+        from time import sleep
+
+        failed_nodes = []  # nodes that could not be disabled
+        succeeded_nodes = []
+        # Build new lists instead of removing entries from 'nodes' while
+        # iterating over it; removing during iteration skips the next node.
+        for node in nodes:
+            node['status'] = self._disable_with_retries(node)
+            # make sure the disable request was successful
+            if self.driver.get_node_status(node):
+                succeeded_nodes.append(node)
+            else:
+                # if the node failed at this step there is no reason to move
+                # it to the next step
+                failed_nodes.append(node)
+
+        if failed_nodes:
+            LOG.error('Unable to disable nodes, they will not be evacuated: '
+                      '%s', [node.get('host') for node in failed_nodes])
+
+        nodes = succeeded_nodes
+        if self.enable_fencing:
+            fencer = FencerManager(nodes)
+            nodes = fencer.fence()
+
+        evacuable_nodes = []
+        for node in nodes:
+            node['instances'] = self.driver.get_node_instances(node)
+            evacuable_nodes.append(node)
+
+        sleep(self.EVACUATION_DELAY)
+        return self.driver.evacuate_nodes(evacuable_nodes)
+
+    def _disable_with_retries(self, node):
+        """
+        Try to disable a node up to self.retires times, waiting self.wait
+        seconds between two consecutive attempts
+        :param node: dict contains node's hostname
+        :return: True as soon as one attempt succeeds, False otherwise
+        """
+        from time import sleep
+
+        delay = self.wait or 0
+        for attempt in range(self.retires):
+            # no wait before the first attempt, only between retries
+            if attempt and delay > 0:
+                sleep(delay)
+            if self._disable_node(node):
+                return True
+        return False
+
+    def _disable_node(self, node):
+        """
+        Disable the node unless the driver reports it as already disabled
+        :param node: dict contains node's hostname
+        :return: True when the node was already disabled, otherwise the
+        result of the driver's disable_node()
+        """
+        if self.driver.is_node_disabled(node):
+            return True
+        return self.driver.disable_node(node)
+
--- a/osha/evacuators/drivers/init.py
+++ b/osha/evacuators/drivers/init.py
@ -0,0 +1 @@
+__author__ = 'saad'
--- a/osha/evacuators/drivers/osha/init.py
+++ b/osha/evacuators/drivers/osha/init.py
@ -0,0 +1 @@
+__author__ = 'saad'
--- a/osha/evacuators/drivers/osha/standard.py
+++ b/osha/evacuators/drivers/osha/standard.py
@ -0,0 +1,44 @@
+# (c) Copyright 2014,2015 Hewlett-Packard Development Company, L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from oslo_config import cfg
+from oslo_log import log
+from osha.evacuators.common.driver import EvacuatorBaseDriver
+from osha.common.utils import get_os_client
+CONF = cfg.CONF
+LOG = log.getLogger(__name__)
+
+
+class OshaStandardEvacuator(EvacuatorBaseDriver):
+    """
+    Evacuation driver that delegates every operation to the OS client
+    returned by get_os_client()
+    """
+
+    def __init__(self, wait, retires, **kwargs):
+        # 'retires' (sic) is forwarded to the base class as its 'retries'
+        # argument.
+        super(OshaStandardEvacuator, self).__init__(wait, retires, **kwargs)
+        self.client = get_os_client()
+
+    def get_node_instances(self, node):
+        # Delegates to the client's get_hypervisor_instances().
+        return self.client.get_hypervisor_instances(node)
+
+    def disable_node(self, node):
+        # Delegates to the client's disable_node().
+        return self.client.disable_node(node)
+
+    def get_node_status(self, node):
+        # Delegates to the client's get_node_status().
+        return self.client.get_node_status(node)
+
+    def is_node_disabled(self, node):
+        # Same client call as get_node_status(); the result is used directly
+        # as the "is disabled" answer.
+        return self.client.get_node_status(node)
+
+    def evacuate_nodes(self, nodes):
+        # Delegates to the client's evacuate().
+        return self.client.evacuate(nodes)
+
+
+
--- a/osha/main.py
+++ b/osha/main.py
@ -15,7 +15,7 @@ from osha.common import config
 from oslo_config import cfg
 from oslo_log import log
 from osha.monitors.common.manager import MonitorManager
-from osha.evacuate import EvacuationManager
+from osha.evacuators.common.manager import EvacuationManager

 CONF = cfg.CONF
 LOG = log.getLogger(__name__)
@ -36,8 +36,8 @@ def main():
        # deployments
        # Load Fence driver
        # Shutdown the node
-        evacuator = EvacuationManager(nodes)
-        evacuator.evacuate()
+        evac = EvacuationManager()
+        evac.evacuate(nodes)
        exit()

        print "Fenced nodes are", nodes