diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..b41fdd2
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,13 @@
+OpenStack Compute High Availability
+
+Osha provides high availability for OpenStack compute nodes. It monitors all compute nodes in your deployment
+and, if any of them fails, launches the evacuation tool to evacuate that node and move all of its
+instances to another compute node.
+
+Osha has a pluggable architecture, so you can monitor your compute nodes with any monitoring system you want
+just by adding a simple plugin and adjusting your configuration file to use that plugin or a combination of plugins.
+
+Osha runs as a scheduler in the control plane and communicates with the monitoring system to get the compute nodes' status.
+To run Osha itself in high availability mode, it should run in active/passive mode.
+
+
diff --git a/daemon.py b/daemon.py
new file mode 100644
index 0000000..6cbe11c
--- /dev/null
+++ b/daemon.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+
+import sys, os, time, atexit
+import errno
+from signal import SIGTERM
+import logging as log
+
+
+class Daemon:
+    """
+    A generic daemon class.
+
+    Usage: subclass the Daemon class and override the run() method
+    """
+    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null',
+                 stderr='/dev/null'):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.pidfile = pidfile
+
+    def _read_pid(self):
+        """
+        Read the PID recorded in the pidfile
+        :return: the PID as an int, or None if the pidfile is missing or
+        does not contain a number
+        """
+        try:
+            with open(self.pidfile, 'r') as pf:
+                return int(pf.read().strip())
+        except (IOError, ValueError):
+            # ValueError: empty or corrupted pidfile, treat it as "no PID"
+            return None
+
+    def daemonize(self):
+        """
+        do the UNIX double-fork magic, see Stevens' "Advanced
+        Programming in the UNIX Environment" for details
+        (ISBN 0201563177)
+        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
+        """
+        try:
+            pid = os.fork()
+            if pid > 0:
+                # exit first parent
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write("fork #1 failed: %d (%s)\n" %
+                             (e.errno, e.strerror))
+            log.error(e)
+            sys.exit(1)
+
+        # decouple from parent environment
+        os.chdir("/")
+        os.setsid()
+        os.umask(0)
+
+        # do second fork
+        try:
+            pid = os.fork()
+            if pid > 0:
+                # exit from second parent
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write("fork #2 failed: %d (%s)\n"
+                             % (e.errno, e.strerror))
+            log.error(e)
+            sys.exit(1)
+
+        # redirect standard file descriptors
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        se = open(self.stderr, 'a+', 0)
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+
+        # write pidfile
+        atexit.register(self.delpid)
+
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write("%s\n" % pid)
+
+    def delpid(self):
+        # stop() may already have removed the pidfile
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def start(self):
+        """
+        Start the daemon
+        """
+        # Check for a pidfile to see if the daemon already runs
+        pid = self._read_pid()
+
+        if pid:
+            message = "pidfile %s already exist. Daemon" \
+                      " already running?\n"
+            sys.stderr.write(message % self.pidfile)
+            sys.exit(1)
+
+        # Start the daemon
+        self.daemonize()
+        self.run()
+
+    # @todo needs some enhancement like check /proc/%pid/status if it's
+    # really running or not ! may be it's killed by external process
+    # the PID won't be updated !
+    def status(self):
+        """
+        Report whether the pidfile holds a PID, then exit
+        """
+        pid = self._read_pid()
+
+        if pid:
+            message = "pidfile %s already exist. Daemon already " \
+                      "running. PID: %d \n"
+            sys.stdout.write(message % (self.pidfile, pid))
+        else:
+            message = "Service not running !\n"
+            sys.stdout.write(message)
+        sys.exit(0)
+
+    def stop(self):
+        """
+        Stop the daemon
+        """
+        # Get the pid from the pidfile
+        pid = self._read_pid()
+
+        if not pid:
+            message = "pidfile %s does not exist." \
+                      " Daemon not running?\n"
+            sys.stderr.write(message % self.pidfile)
+            return  # not an error in a restart
+
+        # Try killing the daemon process
+        try:
+            while 1:
+                os.kill(pid, SIGTERM)
+                time.sleep(0.1)
+        except OSError as err:
+            # ESRCH: the process is gone. Test errno rather than the
+            # message text, which depends on the locale
+            if err.errno == errno.ESRCH:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+
+    def restart(self):
+        """
+        Restart the daemon
+        """
+        self.stop()
+        self.start()
+
+    def run(self):
+        """
+        You should override this method when you subclass Daemon.
+        It will be called after the process has been
+        daemonized by start() or restart().
+        """
\ No newline at end of file
diff --git a/evacuate.py b/evacuate.py
new file mode 100644
index 0000000..70edc9f
--- /dev/null
+++ b/evacuate.py
@@ -0,0 +1,26 @@
+#__author__ = 'saad'
+import os
+
+from monitor import Monitor
+import osclient
+
+# Credentials are read from the standard OS_* environment variables (for
+# example after sourcing an openrc file) so no secret lives in the source.
+password = os.environ['OS_PASSWORD']
+user_id = os.environ.get('OS_USER_ID')
+username = os.environ.get('OS_USERNAME', 'admin')
+auth_url = os.environ.get('OS_AUTH_URL', 'http://192.168.245.9:35357/v3')
+project_name = os.environ.get('OS_PROJECT_NAME', 'demo')
+project_id = os.environ.get('OS_PROJECT_ID')
+user_domain_name = os.environ.get('OS_USER_DOMAIN_NAME', 'Default')
+project_domain_name = os.environ.get('OS_PROJECT_DOMAIN_NAME', 'Default')
+
+client = osclient.OSClient(authurl=auth_url,
+                           username=username,
+                           password=password,
+                           user_domain_name=user_domain_name,
+                           project_name=project_name,
+                           project_domain_name=project_domain_name,
+                           endpoint_type='internal')
+monitor = Monitor(client, 1)
+monitor.monitor()
\ No newline at end of file
diff --git a/ipmitool.py b/ipmitool.py
new file mode 100644
index 0000000..8a807fa
--- /dev/null
+++ b/ipmitool.py
@@ -0,0 +1,126 @@
+# __author__ = 'saad'
+import shlex
+import sys
+import subprocess
+from distutils import spawn
+
+
+class IpmiInterface:
+
+    _IPMI = 'ipmitool'
+    _SUPPORTED_INTERFACES = ['lan', 'lanplus']
+
+    def __init__(self, host, username, password, verbose=False,
+                 interface='lanplus'):
+        self._IPMI = spawn.find_executable('ipmitool')
+        if not self._IPMI:
+            self._IPMI = spawn.find_executable('ipmitool',
+                                               path=':'.join(sys.path))
+        if not self._IPMI:
+            raise Exception("ipmitool executable not found")
+        if interface not in self._SUPPORTED_INTERFACES:
+            raise Exception("Provided Interface is not supported")
+
+        self._host = host
+        self._username = username
+        self._password = password
+        self._verbose = verbose
+        self._interface = interface
+
+        self._update_cmd_credentials(
+            host=host,
+            username=username,
+            password=password,
+            interface=interface
+        )
+
+    def _update_cmd_credentials(self, host, username, password, interface):
+        """
+        Update credentials to work with different server
+        :param host: IPMI IP address of the server
+        :param username: IPMI username
+        :param password: IPMI password
+        :param interface: IPMI Interface lan, lanplus
+        """
+        # kept as an argument list (not one string) so that it is executed
+        # without a shell and the credentials cannot inject shell syntax
+        self._password = password
+        self._cmd = [
+            self._IPMI,
+            '-I', interface,
+            '-H', host,
+            '-U', username,
+            '-P', password,
+        ]
+
+    def get_power_status(self):
+        """
+        get the machine power status
+        :return: 1 if the power is on and 0 if the power is off. otherwise it
+        will return -1 for unknown state
+        """
+        cmd = self._cmd + ['chassis', 'power', 'status']
+        output = self._process_request(cmd)
+        if self._verbose:
+            print("[Debug]: %s" % output)
+        status = output.lower()
+        if 'is on' in status:
+            return 1
+        elif 'is off' in status:
+            return 0
+        return -1  # power status unknown
+
+    def power_down(self):
+        """
+        shutdown the machine
+        """
+        cmd = self._cmd + ['chassis', 'power', 'down']
+        return self._process_request(cmd)
+
+    def power_reset(self):
+        """
+        restart the machine
+        """
+        cmd = self._cmd + ['chassis', 'power', 'reset']
+        return self._process_request(cmd)
+
+    def power_on(self):
+        """
+        power on the machine
+        """
+        cmd = self._cmd + ['chassis', 'power', 'on']
+        return self._process_request(cmd)
+
+    def _process_request(self, cmd):
+        """
+        run an ipmitool command and return its output
+        :param cmd: list of arguments, starting with the ipmitool executable
+        :return: stdout of the command, raises Exception with the content of
+        stderr if the command exits with a non zero code
+        """
+        if self._verbose:
+            # never echo the IPMI password
+            shown = ['****' if arg == self._password else arg for arg in cmd]
+            print("Executing IPMI command: %s" % ' '.join(shown))
+
+        process = subprocess.Popen(cmd, shell=False,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+        output, error = process.communicate()
+
+        if self._verbose:
+            print("[Debug] Process Output: %s" % output)
+            print("[Debug] Process Error: %s" % error)
+
+        if process.returncode:
+            raise Exception(error)
+        return output
+
+    def _custom_cmd(self, cmd):
+        """
+        execute custom ipmitool commands
+        :param cmd: string contains the command, for credentials and interface
+        you should _update_cmd_credentials to update them first
+        :return: output of the command you sent or raise error
+        """
+        return self._process_request(self._cmd + shlex.split(cmd))
diff --git a/monitor.py b/monitor.py
new file mode 100644
index 0000000..7346ba4
--- /dev/null
+++ b/monitor.py
@@ -0,0 +1,98 @@
+#__author__ = 'saad'
+from time import sleep
+
+
+class Monitor(object):
+    def __init__(self, client, wait):
+        self.client = client
+        self.wait = wait
+
+    def get_down_nodes(self):
+        """
+        :return: list of hosts reported down by nova-compute, by the
+        hypervisor and by the neutron agent at the same time
+        """
+        # list all down nova compute
+        nova_down = self.is_nova_service_down()
+        # list all down hypervisors
+        hypervisor_down = self.is_hpyervisor_down()
+        # list all down openvswitch agents
+        agents_down = self.is_neutron_agents_down()
+
+        return [node for node in nova_down
+                if node in hypervisor_down and node in agents_down]
+
+    def monitor(self):
+        """
+        Run one monitoring pass: detect the down nodes, confirm them after
+        the wait period, evacuate them and notify about the result
+        """
+        nodes_down = self.get_down_nodes()
+        if not nodes_down:
+            return
+
+        nodes_to_evacuate = self.process_failed_nodes(nodes_down)
+        if not nodes_to_evacuate:
+            return
+
+        evacuated_nodes = self.evacuate(nodes_to_evacuate)
+        if not evacuated_nodes:
+            # string exceptions are not valid, raise a real exception
+            raise Exception("Error: node didn't evacuated ! %s"
+                            % (nodes_to_evacuate,))
+
+        self.notify(evacuated_nodes)
+
+    # @todo needs to be implemented !
+    def notify(self, nodes):
+        """
+        will be used to notify the admins that there is something went wrong !
+        """
+        if not nodes:
+            return
+        hosts = ', '.join(node['host'] for node in nodes)
+        print("These nodes %s Evacuated" % hosts)
+        print(nodes)
+
+    def evacuate(self, nodes):
+        # @todo add shutdown process
+        # maintenance mode not working with libvirt
+        # self.client.set_in_maintance(nodes)
+        evacuated = self.client.evacuate(nodes)
+        return evacuated
+
+    def process_failed_nodes(self, nodes):
+        """
+        Wait self.wait seconds and check again, to ignore short glitches
+        :param nodes: hosts found down by the first check
+        :return: hosts from nodes that are still down after the wait
+        """
+        sleep(self.wait)
+        nodes_down = self.get_down_nodes()
+        return [node for node in nodes_down if node in nodes]
+
+    def is_hpyervisor_down(self):
+        """
+        :return: list of hosts whose hypervisor state is 'down'
+        """
+        hypervisors = self.client.novahypervisors()
+        return [hypervisor.get('service').get('host')
+                for hypervisor in hypervisors
+                if hypervisor.get('state') == 'down']
+
+    def is_nova_service_down(self):
+        """
+        :return: list of hosts whose enabled nova-compute service is down
+        """
+        computes = self.client.novacomputes()
+        return [node.get('host') for node in computes
+                if node.get('state') == 'down'
+                and node.get('status') == 'enabled']
+
+    def is_neutron_agents_down(self):
+        """
+        :return: list of hosts whose neutron agent is admin-up but not alive
+        """
+        agents = self.client.neutronagents()
+        return [agent.get('host') for agent in agents
+                if agent.get('admin_state_up') and not agent.get('alive')]
diff --git a/osclient.py b/osclient.py
new file mode 100644
index 0000000..ed92810
--- /dev/null
+++ b/osclient.py
@@ -0,0 +1,123 @@
+__author__ = 'saad'
+
+from keystoneclient.auth.identity import v3
+from keystoneclient import session
+from novaclient.v2 import client as novaclient
+from neutronclient.v2_0 import client as neutronclient
+
+
+class OSClient:
+    def __init__(self, authurl, authmethod='password', **kwargs):
+        """
+        Provide Openstack credentials to initialize the connection to Openstack
+        :param authurl: URL of the keystone v3 endpoint
+        :param authmethod: string authmethod should be password or token but
+        currently we support only password !
+        :param kwargs: username, password, user_id, project_name, project_id,
+        user_domain_id, user_domain_name, project_domain_name, endpoint_type
+        :raises ValueError: if authmethod is not 'password'
+        """
+        if authmethod != 'password':
+            # fail early: auth() needs the attributes set below
+            raise ValueError("The available authmethod is password for the "
+                             "time being. Please, provide a password "
+                             "credentials")
+
+        self.authmethod = authmethod
+        self.authurl = authurl
+        self.username = kwargs.get('username', None)
+        self.password = kwargs.get('password')
+        self.project_name = kwargs.get('project_name', None)
+        self.project_id = kwargs.get('project_id', None)
+        self.user_id = kwargs.get('user_id', None)
+        self.user_domain_id = kwargs.get('user_domain_id', None)
+        self.user_domain_name = kwargs.get('user_domain_name', None)
+        self.project_domain_name = kwargs.get('project_domain_name', None)
+        self.endpoint_type = kwargs.get('endpoint_type', 'internal')
+        # filled by novacomputes(), used as default filter in neutronagents()
+        self.compute_hosts = []
+
+        self.auth()
+
+    def auth(self):
+        auth = v3.Password(auth_url=self.authurl,
+                           username=self.username,
+                           password=self.password,
+                           project_name=self.project_name,
+                           user_domain_id=self.user_domain_id,
+                           user_domain_name=self.user_domain_name,
+                           project_domain_name=self.project_domain_name)
+        self.authSession = session.Session(auth=auth)
+
+    def novacomputes(self):
+        nova = novaclient.Client(session=self.authSession,
+                                 endpoint_type=self.endpoint_type)
+        services = nova.services.list()
+        compute_nodes = []
+        compute_hosts = []
+        for service in services:
+            service = service.to_dict()
+            if service.get('binary') == 'nova-compute':
+                compute_nodes.append(service)
+                compute_hosts.append(service.get('host'))
+        self.compute_hosts = compute_hosts
+        return compute_nodes
+
+    def novahypervisors(self):
+        nova = novaclient.Client(session=self.authSession,
+                                 endpoint_type=self.endpoint_type)
+        return [hypervisor.to_dict()
+                for hypervisor in nova.hypervisors.list()]
+
+    def neutronagents(self, hosts=None):
+        if not hosts:
+            if not self.compute_hosts:
+                # nothing cached yet, load the compute hosts first
+                self.novacomputes()
+            hosts = self.compute_hosts
+        new_sess = session.Session(auth=self.authSession.auth)
+        neutron = neutronclient.Client(session=new_sess,
+                                       endpoint_type=self.endpoint_type)
+        self.authSession = new_sess
+        agents = neutron.list_agents()
+        neutron_agents = []
+        for agent in agents.get('agents'):
+            if agent.get('host') in hosts and agent.get('binary') == \
+                    'neutron-openvswitch-agent':
+                neutron_agents.append(agent)
+
+        return neutron_agents
+
+    def evacuate(self, nodes):
+        new_sess = session.Session(auth=self.authSession.auth)
+        nova = novaclient.Client(session=new_sess,
+                                 endpoint_type=self.endpoint_type)
+        self.authSession = new_sess
+        evacuated_nodes = []
+        for node in nodes:
+            hypervisors = nova.hypervisors.search(node, True)
+            for hypervisor in hypervisors:
+                # NOTE(review): assumes 'servers' is absent when the
+                # hypervisor hosts no instance - confirm against novaclient
+                servers = getattr(hypervisor, 'servers', [])
+                evacuated_nodes.append({'host': node, 'servers': servers})
+                # @todo the evacuation call itself is still disabled:
+                # for server in servers:
+                #     nova.servers.evacuate(server.get('uuid'),
+                #                           on_shared_storage=True)
+        return evacuated_nodes
+
+    def set_in_maintance(self, nodes):
+        new_sess = session.Session(auth=self.authSession.auth)
+        nova = novaclient.Client(session=new_sess,
+                                 endpoint_type=self.endpoint_type)
+        self.authSession = new_sess
+        # one list for all nodes; it used to be reset on every iteration
+        output = []
+        values = {"maintenance_mode": "enable"}
+        for node in nodes:
+            host = nova.hosts.get(node)[0]
+            try:
+                output.append(host.update(values))
+            except Exception as e:
+                print("ERROR ::: %s" % (e,))
+        return output
diff --git a/osha.py b/osha.py
new file mode 100644
index 0000000..c1510ae
--- /dev/null
+++ b/osha.py
@@ -0,0 +1,39 @@
+#__author__ = 'saad'
+from daemon import Daemon
+import sys
+import logging as log
+import time
+log.basicConfig(filename='osha.log')
+
+USAGE = "Usage %s start|stop|restart|status"
+
+
+class Osha(Daemon):
+
+    def run(self):
+        # @todo scheduling code goes here ! may be apscheduler or just cron !
+        # just as a test ...
+        while True:
+            time.sleep(1)
+
+
+if __name__ == '__main__':
+    osha = Osha('/var/run/osha/osha.pid')  # won't run unless the folder is
+    # already created and have the correct permissions !
+    if len(sys.argv) != 2:
+        print(USAGE % sys.argv[0])
+        sys.exit(2)
+
+    commands = {
+        'start': osha.start,
+        'stop': osha.stop,
+        'restart': osha.restart,
+        'status': osha.status,
+    }
+    command = commands.get(sys.argv[1])
+    if command is None:
+        print("Unknown command ")
+        print(USAGE % sys.argv[0])
+        sys.exit(2)
+    command()
+    sys.exit(0)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..99185de
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+pbr>=0.6,!=0.7,<1.0
+python-keystoneclient>=1.2.0,<1.4.0
+python-neutronclient>=2.4.0,<2.5.0
+python-novaclient>=2.22.0,<2.24.0
+python-openstackclient>=1.0.3,<1.1.0
+PyYAML>=3.1.0
+oslo.config>=1.9.3,<1.10.0 # Apache-2.0
+oslo.i18n>=1.5.0,<1.6.0 # Apache-2.0
+oslo.log>=1.0.0,<1.1.0 # Apache-2.0
\ No newline at end of file
diff --git a/servers.yml b/servers.yml
new file mode 100644
index 0000000..dae017f
--- /dev/null
+++ b/servers.yml
@@ -0,0 +1,60 @@
+#
+# (c) Copyright 2015 Hewlett Packard Enterprise Development Company LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+---
+
+  servers:
+
+    - id: deployer
+      ip-addr: 192.168.10.254
+      hostname: padawan-cp1-c0-m1-mgmt
+      ilo-ip: 192.168.9.2
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0001
+      ip-addr: 192.168.10.3
+      hostname: padawan-cp1-c1-m1-mgmt
+      ilo-ip: 192.168.9.3
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0002
+      ip-addr: 192.168.10.4
+      hostname: padawan-cp1-c1-m2-mgmt
+      ilo-ip: 192.168.9.4
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0003
+      ip-addr: 192.168.10.5
+      hostname: padawan-cp1-c1-m3-mgmt
+      ilo-ip: 192.168.9.5
+      ilo-password: password
+      ilo-user: admin
+
+    - id: COMPUTE-0001
+      ip-addr: 192.168.10.6
+      hostname: padawan-ccp-comp0001-mgmt
+      ilo-ip: 192.168.9.6
+      ilo-password: password
+      ilo-user: admin
+
+    - id: COMPUTE-0002
+      ip-addr: 192.168.10.7
+      hostname: padawan-ccp-comp0002-mgmt
+      ilo-ip: 192.168.9.7
+      ilo-password: password
+      ilo-user: admin
\ No newline at end of file
diff --git a/yaml_parser.py b/yaml_parser.py
new file mode 100644
index 0000000..d53da6d
--- /dev/null
+++ b/yaml_parser.py
@@ -0,0 +1,66 @@
+# __author__ = 'saad'
+import yaml
+import os
+
+
+class YamlParser(object):
+
+    _INDEX = 'servers'
+
+    def __init__(self, yml_file, index='servers'):
+        """
+        Provide Yaml file to parse it and process data
+        :param yml_file: path to yaml file
+        :param index: the key in the .yml file to get all servers listed under
+        this key. the default is 'servers'
+        """
+        self.file = yml_file
+        self._INDEX = index
+        self.data = self.parse()
+
+    def parse(self):
+        """
+        Load the yaml file
+        :return: the parsed content of the file
+        :raises ValueError: if no file was specified
+        :raises IOError: if the path does not point to an existing file
+        """
+        if not self.file:
+            raise ValueError("No file specified !")
+        if not os.path.isfile(self.file):
+            raise IOError("File doesn't exist: %s" % self.file)
+
+        # safe_load: the file is plain data, never build arbitrary objects
+        with open(self.file, 'r') as stream:
+            return yaml.safe_load(stream)
+
+    def find_server_by_ip(self, ip):
+        """
+        get server information ilo username, password and ip
+        :param ip: mgmt ip address of the server, this should be the same like
+        the ip in the .yml file
+        :return: dict contains server information
+        """
+        return self.find_server('ip-addr', ip)
+
+    def find_server_by_hostname(self, hostname):
+        """
+        get server information ilo username, password and ip
+        :param hostname: hostname matches one of the ones in the .yml file
+        :return: dict contains the server information
+        """
+        return self.find_server(key='hostname', value=hostname)
+
+    def find_server(self, key, value):
+        """
+        Generic function to query the .yml file to get server information by any
+        key.
+        :param key: name of the field to compare, e.g. 'hostname'
+        :param value: value the field must have
+        :return: dict of the first matching server or None if nothing matches
+        """
+        for server in (self.data or {}).get(self._INDEX) or []:
+            if server.get(key) == value:
+                return server
+
+        return None
\ No newline at end of file