Big Bang

Change-Id: I16c69cf1d2ad6ff7c5b05ec5f3528e7cd8785249
2015-12-21 11:35:58 +00:00 · 2015-12-21 11:35:58 +00:00 · 0ee63ad837
parent b635db01e8
commit 0ee63ad837
10 changed files with 687 additions and 0 deletions
--- a/README.rst
+++ b/README.rst
@ -0,0 +1,13 @@
+OpenStack Compute High Availability
+
+Osha provides high availability for OpenStack compute nodes. It monitors all compute nodes in your deployment
+and, if one of them fails, it launches the evacuation tool to evacuate that node and move all of its
+instances to another compute node.
+
+Osha has a pluggable architecture, so you can monitor your compute nodes with any monitoring system you like
+simply by adding a plugin and adjusting your configuration file to use that plugin, or a combination of plugins.
+
+Osha runs as a scheduler in the control plane and communicates with the monitoring system to get the status of the compute nodes.
+To run Osha itself in a highly available setup, it should be deployed in active/passive mode.
+
+
--- a/daemon.py
+++ b/daemon.py
@ -0,0 +1,165 @@
+#!/usr/bin/env python
+ 
+import sys, os, time, atexit
+from signal import SIGTERM
+import logging as log
+
+ 
+class Daemon:
+    """
+    A generic daemon class.
+
+    Usage: subclass the Daemon class and override the run() method
+    """
+    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null',
+                 stderr='/dev/null'):
+        """
+        :param pidfile: path of the file used to record the daemon's PID
+        :param stdin: file the daemon's standard input is redirected from
+        :param stdout: file the daemon's standard output is appended to
+        :param stderr: file the daemon's standard error is appended to
+        """
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.pidfile = pidfile
+
+    def _read_pid(self):
+        """
+        Read the PID recorded in the pidfile.
+
+        :return: the PID as an int, or None when the pidfile cannot be
+        opened or does not contain an integer (e.g. it is empty)
+        """
+        try:
+            with open(self.pidfile, 'r') as pf:
+                return int(pf.read().strip())
+        except (IOError, ValueError):
+            return None
+
+    def _fork(self, number):
+        """
+        Fork once and terminate the parent side.
+
+        :param number: ordinal of the fork, only used in the error message
+        Exits with status 1 when fork() fails.
+        """
+        try:
+            if os.fork() > 0:
+                # parent: its job is done
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write("fork #%d failed: %d (%s)\n" %
+                             (number, e.errno, e.strerror))
+            log.error(e)
+            sys.exit(1)
+
+    def daemonize(self):
+        """
+        do the UNIX double-fork magic, see Stevens' "Advanced
+        Programming in the UNIX Environment" for details
+        (ISBN 0201563177)
+        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
+        """
+        self._fork(1)
+
+        # decouple from parent environment
+        os.chdir("/")
+        os.setsid()
+        os.umask(0)
+
+        # the second fork drops session leadership
+        self._fork(2)
+
+        # redirect standard file descriptors; only the descriptors of the
+        # freshly opened files are used (via dup2), so their Python-level
+        # buffering mode is irrelevant
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        se = open(self.stderr, 'a+')
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+
+        # write pidfile and make sure it is removed on a normal exit
+        atexit.register(self.delpid)
+        with open(self.pidfile, 'w+') as f:
+            f.write("%s\n" % os.getpid())
+
+    def delpid(self):
+        """
+        Remove the pidfile if it is still present.
+        """
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def start(self):
+        """
+        Start the daemon
+
+        Exits with status 1 when the pidfile already holds a PID.
+        """
+        # Check for a pidfile to see if the daemon already runs
+        if self._read_pid():
+            message = "pidfile %s already exist. Daemon" \
+                      " already running?\n"
+            sys.stderr.write(message % self.pidfile)
+            sys.exit(1)
+
+        # Start the daemon
+        self.daemonize()
+        self.run()
+
+    # @todo needs some enhancement like check /proc/%pid/status if it's
+    # really running or not ! may be it's killed by external process
+    # the PID won't be updated !
+    def status(self):
+        """
+        Report on stdout whether a PID is recorded in the pidfile, then
+        exit with status 0 in both cases.
+        """
+        pid = self._read_pid()
+        if pid:
+            message = "pidfile %s already exist. Daemon already " \
+                      "running. PID: %d \n"
+            sys.stdout.write(message % (self.pidfile, pid))
+        else:
+            sys.stdout.write("Service not running !\n")
+        sys.exit(0)
+
+    def stop(self):
+        """
+        Stop the daemon
+
+        Sends SIGTERM repeatedly until the process is gone, then removes
+        the pidfile. Exits with status 1 on any other kill() failure.
+        """
+        import errno
+
+        # Get the pid from the pidfile
+        pid = self._read_pid()
+        if not pid:
+            message = "pidfile %s does not exist." \
+                      " Daemon not running?\n"
+            sys.stderr.write(message % self.pidfile)
+            return  # not an error in a restart
+
+        # Try killing the daemon process
+        try:
+            while 1:
+                os.kill(pid, SIGTERM)
+                time.sleep(0.1)
+        except OSError as err:
+            # ESRCH means the process no longer exists; compare the errno
+            # rather than the (locale dependent) error text
+            if err.errno == errno.ESRCH:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                sys.stdout.write("%s\n" % err)
+                sys.exit(1)
+
+    def restart(self):
+        """
+        Restart the daemon
+        """
+        self.stop()
+        self.start()
+
+    def run(self):
+        """
+        You should override this method when you subclass Daemon.
+        It will be called after the process has been
+        daemonized by start() or restart().
+        """
--- a/evacuate.py
+++ b/evacuate.py
@ -0,0 +1,22 @@
+#__author__ = 'saad'
+from monitor import Monitor
+import osclient
+
+import os
+
+# Connection settings for the OpenStack deployment being monitored.
+# Every value used for authentication can be overridden with the matching
+# OS_* environment variable; the literals are only fallbacks.
+# NOTE(review): the fallback password is a credential committed to source
+# control -- rotate it and drop the default once the environment (or a
+# configuration file) provides it.
+password = os.environ.get('OS_PASSWORD', 'BOMrLNGHsoBb')
+user_id = 'ec2548d6acb54e7ba24f479e2f3cb1a5'  # currently not passed on
+username = os.environ.get('OS_USERNAME', 'admin')
+auth_url = os.environ.get('OS_AUTH_URL', 'http://192.168.245.9:35357/v3')
+project_name = os.environ.get('OS_PROJECT_NAME', 'demo')
+project_id = 'f749b2874b0040aca92ea131210eb774'  # currently not passed on
+user_domain_name = os.environ.get('OS_USER_DOMAIN_NAME', 'Default')
+project_domain_name = os.environ.get('OS_PROJECT_DOMAIN_NAME', 'Default')
+
+client = osclient.OSClient(authurl=auth_url,
+                           username=username,
+                           password=password,
+                           user_domain_name=user_domain_name,
+                           project_name=project_name,
+                           project_domain_name=project_domain_name,
+                           endpoint_type='internal')
+
+# Run a single monitoring pass; failures are re-checked after 1 second
+# before any node is evacuated.
+monitor = Monitor(client, 1)
+monitor.monitor()
--- a/ipmitool.py
+++ b/ipmitool.py
@ -0,0 +1,115 @@
+# __author__ = 'saad'
+import sys
+import subprocess
+from distutils import spawn
+
+
+class IpmiInterface:
+    """
+    Small wrapper around the ``ipmitool`` command line utility that queries
+    and changes the chassis power state of one server.
+    """
+
+    _IPMI = 'ipmitool'
+    _RAW_CMD = '{0} -I {1} -H {2} -U {3} -P {4} '
+    _SUPPORTED_INTERFACES = ['lan', 'lanplus']
+
+    def __init__(self, host, username, password, verbose=False,
+                 interface='lanplus'):
+        """
+        :param host: IPMI IP address of the server
+        :param username: IPMI username
+        :param password: IPMI password
+        :param verbose: print the executed commands and their output
+        :param interface: IPMI interface, one of lan or lanplus
+        :raises Exception: when the interface is not supported
+        """
+        ipmi = spawn.find_executable('ipmitool')
+        if not ipmi:
+            ipmi = spawn.find_executable('ipmitool',
+                                         path=':'.join(sys.path))
+        if ipmi:
+            # when the lookup fails keep the bare class-level name so the
+            # failure is reported when the command is executed, instead of
+            # building a command line that starts with "None"
+            self._IPMI = ipmi
+        if interface not in self._SUPPORTED_INTERFACES:
+            raise Exception("Provided Interface is not supported")
+
+        self._host = host
+        self._username = username
+        self._password = password
+        self._verbose = verbose
+        self._interface = interface
+
+        self._update_cmd_credentials(
+            host=host,
+            username=username,
+            password=password,
+            interface=interface
+        )
+
+    @staticmethod
+    def _quote(value):
+        """
+        Quote a single command line word so that splitting the command
+        string with shlex yields the value unchanged.
+        """
+        try:
+            from shlex import quote
+        except ImportError:  # Python 2
+            from pipes import quote
+        return quote('%s' % value)
+
+    def _update_cmd_credentials(self, host, username, password, interface):
+        """
+        Update credentials to work with different server
+        :param host: IPMI IP address of the server
+        :param username: IPMI username
+        :param password: IPMI password
+        :param interface: IPMI Interface lan, lanplus
+        """
+        # every word is quoted so that values containing spaces or shell
+        # metacharacters stay a single argument
+        self._cmd = self._RAW_CMD.format(
+            self._quote(self._IPMI),
+            self._quote(interface),
+            self._quote(host),
+            self._quote(username),
+            self._quote(password)
+        )
+
+    def get_power_status(self):
+        """
+        get the machine power status
+        :return: 1 if the power is on and 0 if the power is off. otherwise it
+        will return -1 for unknown state
+        """
+        output = self._process_request(self._cmd + ' chassis power status')
+        if self._verbose:
+            print("[Debug]:  %s" % (output,))
+        state = output.lower()
+        if 'is on' in state:
+            return 1
+        if 'is off' in state:
+            return 0
+        return -1  # power status unknown
+
+    def power_down(self):
+        """
+        shutdown the machine
+        :return: output of the ipmitool command
+        """
+        return self._process_request(self._cmd + ' chassis power down')
+
+    def power_reset(self):
+        """
+        restart the machine
+        :return: output of the ipmitool command
+        """
+        return self._process_request(self._cmd + ' chassis power reset')
+
+    def power_on(self):
+        """
+        power on the machine
+        :return: output of the ipmitool command
+        """
+        return self._process_request(self._cmd + ' chassis power on')
+
+    def _process_request(self, cmd):
+        """
+        Run an ipmitool command line and return what it wrote to stdout.
+        :param cmd: complete command line as one string
+        :return: captured standard output of the command
+        :raises Exception: with the captured stderr when the command exits
+        with a non-zero status
+        """
+        import shlex
+
+        # NOTE(review): in verbose mode the command line, including the
+        # IPMI password, is printed -- keep verbose off outside debugging
+        if self._verbose:
+            print("Executing IPMI command:  %s" % (cmd,))
+
+        # the command is split into an argument list and executed without
+        # a shell, so nothing in it is interpreted as shell syntax
+        process = subprocess.Popen(shlex.split(cmd),
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+        output, error = process.communicate()
+
+        if self._verbose:
+            print("[Debug] Process Output:  %s" % (output,))
+            print("[Debug] Process Error:  %s" % (error,))
+
+        if process.returncode:
+            raise Exception(error)
+        return output
+
+    def _custom_cmd(self, cmd):
+        """
+        execute custom ipmitool commands
+        :param cmd: string contains the command, for credentials and interface
+         you should _update_cmd_credentials to update them first
+        :return: output of the command you sent or raise error
+        """
+        return self._process_request(self._cmd + cmd)
--- a/monitor.py
+++ b/monitor.py
@ -0,0 +1,90 @@
+#__author__ = 'saad'
+from time import sleep
+
+
+class Monitor(object):
+    """
+    Detects compute nodes that are down and hands them to the client for
+    evacuation.
+    """
+
+    def __init__(self, client, wait):
+        """
+        :param client: object providing novacomputes(), novahypervisors(),
+        neutronagents() and evacuate()
+        :param wait: seconds to sleep before a failure is re-checked
+        """
+        self.client = client
+        self.wait = wait
+
+    def get_down_nodes(self):
+        """
+        :return: list of hosts reported down by the nova-compute service,
+        the hypervisor list and the neutron agents at the same time
+        """
+        # list all down nova compute
+        nova_down = self.is_nova_service_down()
+        # list all down hypervisors
+        hypervisor_down = self.is_hpyervisor_down()
+        # list all down openvswitch agents
+        agents_down = self.is_neutron_agents_down()
+
+        return [node for node in nova_down
+                if node in hypervisor_down and node in agents_down]
+
+    def monitor(self):
+        """
+        Run one monitoring pass: find the failed nodes, confirm the failure
+        after the configured wait, evacuate them and notify.
+
+        Returns without doing anything when no node is down or when the
+        failure is not confirmed by the second check.
+        :raises Exception: when nodes were handed over for evacuation but
+        none is reported as evacuated
+        """
+        nodes_down = self.get_down_nodes()
+        if not nodes_down:
+            return
+
+        nodes_to_evacuate = self.process_failed_nodes(nodes_down)
+        if not nodes_to_evacuate:
+            # the nodes were no longer down on the second check
+            return
+
+        evacuated_nodes = self.evacuate(nodes_to_evacuate)
+        if not evacuated_nodes:
+            raise Exception("Error: nodes were not evacuated ! %s"
+                            % (nodes_to_evacuate,))
+
+        self.notify(evacuated_nodes)
+
+    # @todo needs to be implemented !
+    def notify(self, nodes):
+        """
+        will be used to notify the admins that there is something went wrong !
+        For the time being it only prints the evacuated nodes.
+        :param nodes: list of dicts, each with at least a 'host' key
+        """
+        if not nodes:
+            return
+        hosts = ', '.join('%s' % node['host'] for node in nodes)
+        print("These nodes %s Evacuated" % hosts)
+        print(nodes)
+
+    def evacuate(self, nodes):
+        """
+        :param nodes: list of hosts to evacuate
+        :return: whatever the client's evacuate() returns
+        """
+        # @todo add shutdown process
+        # maintence mode not working with libvirt
+        # self.client.set_in_maintance(nodes)
+        return self.client.evacuate(nodes)
+
+    def process_failed_nodes(self, nodes):
+        """
+        Wait, check again and keep only the nodes that are still down.
+        :param nodes: hosts found down by the first check
+        :return: list of hosts that are down in both checks
+        """
+        sleep(self.wait)
+        return [node for node in self.get_down_nodes() if node in nodes]
+
+    def is_hpyervisor_down(self):
+        """
+        :return: list of service hosts of the hypervisors whose state is
+        'down'
+        """
+        down_hosts = []
+        for hypervisor in self.client.novahypervisors():
+            if hypervisor.get('state') == 'down':
+                # tolerate entries without a 'service' section
+                service = hypervisor.get('service') or {}
+                down_hosts.append(service.get('host'))
+        return down_hosts
+
+    def is_nova_service_down(self):
+        """
+        :return: list of hosts whose nova-compute service is enabled but
+        reported down
+        """
+        return [node.get('host') for node in self.client.novacomputes()
+                if node.get('state') == 'down'
+                and node.get('status') == 'enabled']
+
+    def is_neutron_agents_down(self):
+        """
+        :return: list of hosts whose neutron agent is administratively up
+        but not alive
+        """
+        return [agent.get('host') for agent in self.client.neutronagents()
+                if agent.get('admin_state_up') and not agent.get('alive')]
--- a/osclient.py
+++ b/osclient.py
@ -0,0 +1,116 @@
+__author__ = 'saad'
+
+from keystoneclient.auth.identity import v3
+from keystoneclient import session
+from novaclient.v2 import client as novaclient
+from neutronclient.v2_0 import client as neutronclient
+
+
+class OSClient:
+    """
+    Helper around the keystone, nova and neutron clients used to inspect
+    and evacuate compute nodes.
+    """
+
+    def __init__(self, authurl, authmethod='password', **kwargs):
+        """
+        Provide Openstack credentials to initalize the connection to Openstack
+        :param authurl: keystone authentication URL
+        :param authmethod: string authmethod should be password or token but
+         currently we support only password !
+        :param kwargs: username, password, user_id, project_name, project_id,
+        user_domain_id, user_domain_name, project_domain_name, endpoint_type
+        :raises ValueError: when authmethod is not 'password'
+        """
+        self.authmethod = authmethod
+        self.authurl = authurl
+        if authmethod != 'password':
+            # fail early: auth() needs the password credentials below
+            raise ValueError("The available authmethod is password for the "
+                             "time being. Please, provide a password "
+                             "credentials")
+
+        self.username = kwargs.get('username', None)
+        self.password = kwargs.get('password')
+        self.project_name = kwargs.get('project_name', None)
+        self.project_id = kwargs.get('project_id', None)
+        self.user_id = kwargs.get('user_id', None)
+        self.user_domain_id = kwargs.get('user_domain_id', None)
+        self.user_domain_name = kwargs.get('user_domain_name', None)
+        self.project_domain_name = kwargs.get('project_domain_name', None)
+        self.endpoint_type = kwargs.get('endpoint_type', 'internal')
+        # hosts running nova-compute, refreshed by novacomputes()
+        self.compute_hosts = []
+
+        self.auth()
+
+    def auth(self):
+        """
+        Build the keystone v3 password session stored in self.authSession.
+        """
+        auth = v3.Password(auth_url=self.authurl,
+                           username=self.username,
+                           password=self.password,
+                           project_name=self.project_name,
+                           user_domain_id=self.user_domain_id,
+                           user_domain_name=self.user_domain_name,
+                           project_domain_name=self.project_domain_name)
+        self.authSession = session.Session(auth=auth)
+
+    def _fresh_session(self):
+        """
+        Create a new session from the current auth plugin and make it the
+        current session.
+        NOTE(review): the reason a new session is needed per call is not
+        visible here -- behaviour kept as it was.
+        """
+        new_sess = session.Session(auth=self.authSession.auth)
+        self.authSession = new_sess
+        return new_sess
+
+    def novacomputes(self):
+        """
+        :return: list of dicts describing the nova-compute services; the
+        hosts of these services are cached in self.compute_hosts
+        """
+        nova = novaclient.Client(session=self.authSession,
+                                 endpoint_type=self.endpoint_type)
+        compute_nodes = []
+        for service in nova.services.list():
+            service = service.to_dict()
+            if service.get('binary') == 'nova-compute':
+                compute_nodes.append(service)
+        self.compute_hosts = [node.get('host') for node in compute_nodes]
+        return compute_nodes
+
+    def novahypervisors(self):
+        """
+        :return: list of dicts, one per hypervisor known to nova
+        """
+        nova = novaclient.Client(session=self.authSession,
+                                 endpoint_type=self.endpoint_type)
+        return [hypervisor.to_dict()
+                for hypervisor in nova.hypervisors.list()]
+
+    def neutronagents(self, hosts=None):
+        """
+        :param hosts: hosts to look at; defaults to the compute hosts found
+        by novacomputes(), which is called first when nothing is cached
+        :return: list of neutron-openvswitch-agent dicts running on hosts
+        """
+        if not hosts:
+            if not getattr(self, 'compute_hosts', None):
+                self.novacomputes()
+            hosts = self.compute_hosts
+        neutron = neutronclient.Client(session=self._fresh_session(),
+                                       endpoint_type=self.endpoint_type)
+        agents = neutron.list_agents()
+        return [agent for agent in agents.get('agents')
+                if agent.get('host') in hosts
+                and agent.get('binary') == 'neutron-openvswitch-agent']
+
+    def evacuate(self, nodes):
+        """
+        Collect the servers hosted on the given nodes.
+        :param nodes: list of hypervisor host names
+        :return: list of {'host': node, 'servers': [...]} dicts, one per
+        matching hypervisor
+        """
+        nova = novaclient.Client(session=self._fresh_session(),
+                                 endpoint_type=self.endpoint_type)
+        evacuated_nodes = []
+        for node in nodes:
+            for hypervisor in nova.hypervisors.search(node, True):
+                # presumably 'servers' is absent when the hypervisor hosts
+                # no instance -- TODO confirm; default to an empty list
+                servers = getattr(hypervisor, 'servers', [])
+                evacuated_nodes.append({'host': node, 'servers': servers})
+                # @todo the evacuation itself is not enabled yet, e.g.
+                # nova.servers.evacuate(server.get('uuid'),
+                #                       on_shared_storage=True)
+        return evacuated_nodes
+
+    def set_in_maintance(self, nodes):
+        """
+        Put every given host in maintenance mode.
+        :param nodes: list of host names
+        :return: list with the result of each successful update
+        """
+        nova = novaclient.Client(session=self._fresh_session(),
+                                 endpoint_type=self.endpoint_type)
+        values = {"maintenance_mode": "enable"}
+        output = []
+        for node in nodes:
+            host = nova.hosts.get(node)[0]
+            try:
+                output.append(host.update(values))
+            except Exception as e:
+                # best effort: report the failure and go on with the
+                # remaining hosts
+                print("ERROR :::  %s" % (e,))
+        return output
--- a/osha.py
+++ b/osha.py
@ -0,0 +1,37 @@
+#__author__ = 'saad'
+from daemon import Daemon
+import sys
+import logging as log
+import time
+# Route log records to the file osha.log; the path is relative, so it is
+# resolved against the working directory at the time this module is loaded.
+log.basicConfig(filename='osha.log')
+
+
+class Osha(Daemon):
+    """
+    The osha service process; only the daemon skeleton exists so far.
+    """
+
+    def run(self):
+        """
+        Body of the daemonized process. Currently a placeholder that
+        sleeps in one second steps and never returns.
+        """
+        # @todo scheduling code goes here ! may be apscheduler or just cron !
+        # just as a test ...
+        while 1:
+            time.sleep(1)
+
+
+if __name__ == '__main__':
+    # won't run unless the folder is already created and have the correct
+    # permissions !
+    osha = Osha('/var/run/osha/osha.pid')
+
+    # one handler per supported command line argument
+    commands = {
+        'start': osha.start,
+        'stop': osha.stop,
+        'restart': osha.restart,
+        'status': osha.status,
+    }
+    usage = "Usage %s start|stop|restart|status" % sys.argv[0]
+
+    if len(sys.argv) != 2:
+        # a wrong number of arguments is a usage error, report it through
+        # the exit status as well
+        print(usage)
+        sys.exit(2)
+
+    handler = commands.get(sys.argv[1])
+    if handler is None:
+        print("Unknown command ")
+        print(usage)
+        sys.exit(2)
+
+    handler()
+    sys.exit(0)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,9 @@
+pbr>=0.6,!=0.7,<1.0
+python-keystoneclient>=1.2.0,<1.4.0
+python-neutronclient>=2.4.0,<2.5.0
+python-novaclient>=2.22.0,<2.24.0
+python-openstackclient>=1.0.3,<1.1.0
+PyYAML>=3.1.0
+oslo.config>=1.9.3,<1.10.0  # Apache-2.0
+oslo.i18n>=1.5.0,<1.6.0  # Apache-2.0
+oslo.log>=1.0.0,<1.1.0  # Apache-2.0
--- a/servers.yml
+++ b/servers.yml
@ -0,0 +1,60 @@
+#
+# (c) Copyright 2015 Hewlett Packard Enterprise Development Company LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+---
+
+  servers:
+
+    - id: deployer
+      ip-addr: 192.168.10.254
+      hostname: padawan-cp1-c0-m1-mgmt
+      ilo-ip: 192.168.9.2
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0001
+      ip-addr: 192.168.10.3
+      hostname: padawan-cp1-c1-m1-mgmt
+      ilo-ip: 192.168.9.3
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0002
+      ip-addr: 192.168.10.4
+      hostname: padawan-cp1-c1-m2-mgmt
+      ilo-ip: 192.168.9.4
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0003
+      ip-addr: 192.168.10.5
+      hostname: padawan-cp1-c1-m3-mgmt
+      ilo-ip: 192.168.9.5
+      ilo-password: password
+      ilo-user: admin
+
+    - id: COMPUTE-0001
+      ip-addr: 192.168.10.6
+      hostname: padawan-ccp-comp0001-mgmt
+      ilo-ip: 192.168.9.6
+      ilo-password: password
+      ilo-user: admin
+
+    - id: COMPUTE-0002
+      ip-addr: 192.168.10.7
+      hostname: padawan-ccp-comp0002-mgmt
+      ilo-ip: 192.168.9.7
+      ilo-password: password
+      ilo-user: admin
--- a/yaml_parser.py
+++ b/yaml_parser.py
@ -0,0 +1,60 @@
+# __author__ = 'saad'
+import yaml
+import os
+
+
+class YamlParser(object):
+    """
+    Loads a YAML servers inventory and looks servers up by their fields.
+    """
+
+    _INDEX = 'servers'
+
+    def __init__(self, yml_file, index='servers'):
+        """
+        Provide Yaml file to parse it and process data
+        :param yml_file: path to yaml file
+        :param index: the key in the .yml file to get all servers listed under
+        this key. the default is 'servers'
+        """
+        self.file = yml_file
+        self._INDEX = index
+        self.data = self.parse()
+
+    def parse(self):
+        """
+        Load the YAML file.
+        :return: the parsed content of the file
+        :raises ValueError: when no file was specified
+        :raises IOError: when the path does not point to an existing file
+        """
+        if not self.file:
+            raise ValueError("No file specified !")
+        # isfile() is False for missing paths as well as for directories
+        if not os.path.isfile(self.file):
+            raise IOError("File doesn't exist: %s" % self.file)
+
+        # safe_load only builds plain Python objects, it never instantiates
+        # arbitrary classes named in the document
+        with open(self.file, 'r') as stream:
+            return yaml.safe_load(stream)
+
+    def find_server_by_ip(self, ip):
+        """
+        get server information ilo username, password and ip
+        :param ip: mgmt ip address of the server, this should be the same like
+        the ip in the .yml file
+        :return: dict contains server information
+        """
+        return self.find_server('ip-addr', ip)
+
+    def find_server_by_hostname(self, hostname):
+        """
+        get server information ilo username, password and ip
+        :param hostname: hostname matches one of the ones in the .yml file
+        :return: dict contains the server information
+        """
+        return self.find_server(key='hostname', value=hostname)
+
+    def find_server(self, key, value):
+        """
+        Generic function to query the .yml file to get server information by any
+        key.
+        :param key: name of the server field to compare
+        :param value: value the field must be equal to
+        :return: the first matching server dict, or None when there is no
+        match or the index key is not present in the file
+        """
+        servers = (self.data or {}).get(self._INDEX) or []
+        for server in servers:
+            if server.get(key) == value:
+                return server
+
+        return None