From 0ee63ad837f8fd1433170688bf4a8753f4ce96ad Mon Sep 17 00:00:00 2001
From: Saad Zaher
Date: Mon, 21 Dec 2015 11:35:58 +0000
Subject: [PATCH] Big Bang

Change-Id: I16c69cf1d2ad6ff7c5b05ec5f3528e7cd8785249
---
 README.rst       |  13 ++++
 daemon.py        | 166 +++++++++++++++++++++++++++++++++++++++++++++++
 evacuate.py      |  35 ++++++++++
 ipmitool.py      | 134 ++++++++++++++++++++++++++++++++++++++
 monitor.py       | 113 ++++++++++++++++++++++++++++++++
 osclient.py      | 140 ++++++++++++++++++++++++++++++++++++++++
 osha.py          |  54 +++++++++++++++
 requirements.txt |   9 +++
 servers.yml      |  60 +++++++++++++++++
 yaml_parser.py   |  70 ++++++++++++++++++++
 10 files changed, 794 insertions(+)
 create mode 100644 README.rst
 create mode 100644 daemon.py
 create mode 100644 evacuate.py
 create mode 100644 ipmitool.py
 create mode 100644 monitor.py
 create mode 100644 osclient.py
 create mode 100644 osha.py
 create mode 100644 requirements.txt
 create mode 100644 servers.yml
 create mode 100644 yaml_parser.py

diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..b41fdd2
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,13 @@
+OpenStack Compute High Availability
+
+Osha provides high availability for OpenStack compute nodes. It monitors all compute nodes in your deployment
+and, if one of them fails, launches the evacuation tool to evacuate that node and move all of its
+instances to another compute node.
+
+Osha has a pluggable architecture, so you can monitor your compute nodes with any monitoring system you like
+just by adding a simple plugin and adjusting your configuration file to use that plugin or a combination of plugins.
+
+Osha runs as a scheduler in the control plane and communicates with the monitoring system to get the status of the compute nodes.
+To make osha itself highly available, run it in active/passive mode.
+
+
diff --git a/daemon.py b/daemon.py
new file mode 100644
index 0000000..6cbe11c
--- /dev/null
+++ b/daemon.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+import sys, os, time, atexit
+import errno
+from signal import SIGTERM
+import logging as log
+
+
+class Daemon:
+    """
+    A generic daemon class.
+
+    Usage: subclass the Daemon class and override the run() method
+    """
+    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null',
+                 stderr='/dev/null'):
+        """
+        :param pidfile: path of the file that stores the daemon PID
+        :param stdin: file the daemon's stdin is redirected from
+        :param stdout: file the daemon's stdout is appended to
+        :param stderr: file the daemon's stderr is appended to
+        """
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.pidfile = pidfile
+
+    def _fork_and_exit_parent(self, label):
+        """
+        Fork once; the parent exits with status 0, only the child returns.
+        :param label: fork number, used in the error message on failure
+        """
+        try:
+            if os.fork() > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write("fork #%s failed: %d (%s)\n" %
+                             (label, e.errno, e.strerror))
+            log.error(e)
+            sys.exit(1)
+
+    def daemonize(self):
+        """
+        do the UNIX double-fork magic, see Stevens' "Advanced
+        Programming in the UNIX Environment" for details
+        (ISBN 0201563177)
+        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
+        """
+        self._fork_and_exit_parent(1)
+
+        # decouple from parent environment
+        os.chdir("/")
+        os.setsid()
+        os.umask(0)
+
+        # second fork: the session leader exits so the daemon can never
+        # re-acquire a controlling terminal
+        self._fork_and_exit_parent(2)
+
+        # redirect standard file descriptors
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        se = open(self.stderr, 'a+', 0)
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+
+        # register cleanup first so the pidfile is removed on normal exit
+        atexit.register(self.delpid)
+
+        # write pidfile
+        with open(self.pidfile, 'w+') as f:
+            f.write("%s\n" % os.getpid())
+
+    def delpid(self):
+        """
+        Remove the pidfile; registered with atexit by daemonize()
+        """
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def _read_pid(self):
+        """
+        Read the daemon PID from the pidfile
+        :return: the PID as int, or None if the pidfile is missing,
+        unreadable or does not contain a number
+        """
+        try:
+            with open(self.pidfile, 'r') as pf:
+                return int(pf.read().strip())
+        except (IOError, ValueError):
+            return None
+
+    def start(self):
+        """
+        Start the daemon
+        """
+        # Check for a pidfile to see if the daemon already runs
+        if self._read_pid():
+            message = "pidfile %s already exist. Daemon" \
+                      " already running?\n"
+            sys.stderr.write(message % self.pidfile)
+            sys.exit(1)
+
+        # Start the daemon
+        self.daemonize()
+        self.run()
+
+    # @todo needs some enhancement like check /proc/%pid/status if it's
+    # really running or not ! may be it's killed by external process
+    # the PID won't be updated !
+    def status(self):
+        """
+        Print whether a pidfile with a PID exists, then exit with status 0
+        """
+        pid = self._read_pid()
+        if pid:
+            message = "pidfile %s already exist. Daemon already " \
+                      "running. PID: %d \n"
+            sys.stdout.write(message % (self.pidfile, pid))
+        else:
+            sys.stdout.write("Service not running !\n")
+        sys.exit(0)
+
+    def stop(self):
+        """
+        Stop the daemon
+        """
+        # Get the pid from the pidfile
+        pid = self._read_pid()
+        if not pid:
+            message = "pidfile %s does not exist." \
+                      " Daemon not running?\n"
+            sys.stderr.write(message % self.pidfile)
+            return  # not an error in a restart
+
+        # Keep sending SIGTERM until the process is gone
+        try:
+            while True:
+                os.kill(pid, SIGTERM)
+                time.sleep(0.1)
+        except OSError as err:
+            # ESRCH means the process no longer exists, i.e. it stopped.
+            # Compare errno instead of the localized error text.
+            if err.errno == errno.ESRCH:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                sys.stderr.write("%s\n" % err)
+                sys.exit(1)
+
+    def restart(self):
+        """
+        Restart the daemon
+        """
+        self.stop()
+        self.start()
+
+    def run(self):
+        """
+        You should override this method when you subclass Daemon.
+        It will be called after the process has been
+        daemonized by start() or restart().
+        """
diff --git a/evacuate.py b/evacuate.py
new file mode 100644
index 0000000..70edc9f
--- /dev/null
+++ b/evacuate.py
@@ -0,0 +1,35 @@
+#__author__ = 'saad'
+import os
+import sys
+
+from monitor import Monitor
+import osclient
+
+
+def main():
+    """
+    Build an OSClient from the standard OS_* environment variables and run
+    one monitoring pass.
+    """
+    # The admin password must never be committed to the repository, so it
+    # has no default and has to come from the environment.
+    password = os.environ.get('OS_PASSWORD')
+    if not password:
+        sys.stderr.write("OS_PASSWORD is not set\n")
+        sys.exit(1)
+
+    env = os.environ.get
+    client = osclient.OSClient(
+        authurl=env('OS_AUTH_URL', 'http://192.168.245.9:35357/v3'),
+        username=env('OS_USERNAME', 'admin'),
+        password=password,
+        user_domain_name=env('OS_USER_DOMAIN_NAME', 'Default'),
+        project_name=env('OS_PROJECT_NAME', 'demo'),
+        project_domain_name=env('OS_PROJECT_DOMAIN_NAME', 'Default'),
+        endpoint_type='internal')
+    # second argument: seconds to wait before re-checking failed nodes
+    Monitor(client, 1).monitor()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ipmitool.py b/ipmitool.py
new file mode 100644
index 0000000..8a807fa
--- /dev/null
+++ b/ipmitool.py
@@ -0,0 +1,134 @@
+# __author__ = 'saad'
+import pipes
+import shlex
+import sys
+import subprocess
+from distutils import spawn
+
+
+class IpmiInterface:
+    """
+    Thin wrapper around the ``ipmitool`` command line utility.
+    """
+
+    _IPMI = 'ipmitool'
+    _RAW_CMD = '{0} -I {1} -H {2} -U {3} -P {4} '
+    _SUPPORTED_INTERFACES = ['lan', 'lanplus']
+
+    def __init__(self, host, username, password, verbose=False,
+                 interface='lanplus'):
+        """
+        :param host: IPMI IP address of the server
+        :param username: IPMI username
+        :param password: IPMI password
+        :param verbose: print the commands and their output when True
+        :param interface: IPMI Interface lan, lanplus
+        :raises Exception: if the interface is not supported
+        """
+        if interface not in self._SUPPORTED_INTERFACES:
+            raise Exception("Provided Interface is not supported")
+
+        # Fall back to the bare command name when the binary cannot be
+        # located, so a failure names 'ipmitool' instead of 'None'.
+        self._IPMI = (spawn.find_executable('ipmitool') or
+                      spawn.find_executable('ipmitool',
+                                            path=':'.join(sys.path)) or
+                      IpmiInterface._IPMI)
+
+        self._host = host
+        self._username = username
+        self._password = password
+        self._verbose = verbose
+        self._interface = interface
+
+        self._update_cmd_credentials(
+            host=host,
+            username=username,
+            password=password,
+            interface=interface
+        )
+
+    def _update_cmd_credentials(self, host, username, password, interface):
+        """
+        Update credentials to work with different server
+        :param host: IPMI IP address of the server
+        :param username: IPMI username
+        :param password: IPMI password
+        :param interface: IPMI Interface lan, lanplus
+        """
+        # Quote every value so that _process_request() can split the command
+        # back into exactly one argument per value.
+        self._cmd = self._RAW_CMD.format(
+            *[pipes.quote('%s' % arg) for arg in
+              (self._IPMI, interface, host, username, password)]
+        )
+
+    def get_power_status(self):
+        """
+        get the machine power status
+        :return: 1 if the power is on and 0 if the power is off. otherwise it
+        will return -1 for unknown state
+        """
+        cmd = self._cmd + ' chassis power status'
+        output = self._process_request(cmd)
+        if self._verbose:
+            print("[Debug]:  %s" % output)
+        status = output.lower()
+        if 'is on' in status:
+            return 1
+        elif 'is off' in status:
+            return 0
+        return -1  # power status unknown
+
+    def power_down(self):
+        """
+        shutdown the machine
+        """
+        return self._process_request(self._cmd + ' chassis power down')
+
+    def power_reset(self):
+        """
+        restart the machine
+        """
+        return self._process_request(self._cmd + ' chassis power reset')
+
+    def power_on(self):
+        """
+        power on the machine
+        """
+        return self._process_request(self._cmd + ' chassis power on')
+
+    def _process_request(self, cmd):
+        """
+        Run an ipmitool command line without going through a shell
+        :param cmd: full command line as a single string
+        :return: standard output of the command
+        :raises Exception: with the command's stderr if it exits non-zero
+        """
+        if self._verbose:
+            # NOTE: the command line contains the IPMI password
+            print("Executing IPMI command:  %s" % cmd)
+
+        # shell=False: host/user/password values are never interpreted by
+        # a shell, which closes the command injection hole.
+        process = subprocess.Popen(shlex.split(cmd),
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+        output, error = process.communicate()
+
+        if self._verbose:
+            print("[Debug] Process Output:  %s" % output)
+            print("[Debug] Process Error:  %s" % error)
+
+        if process.returncode:
+            raise Exception(error)
+        return output
+
+    def _custom_cmd(self, cmd):
+        """
+        execute custom ipmitool commands
+        :param cmd: string contains the command, for credentials and interface
+        you should _update_cmd_credentials to update them first
+        :return: output of the command you sent or raise error
+        """
+        return self._process_request(self._cmd + cmd)
diff --git a/monitor.py b/monitor.py
new file mode 100644
index 0000000..7346ba4
--- /dev/null
+++ b/monitor.py
@@ -0,0 +1,113 @@
+#__author__ = 'saad'
+from time import sleep
+
+
+class Monitor(object):
+    """
+    Detects failed compute nodes and triggers their evacuation.
+    """
+    def __init__(self, client, wait):
+        """
+        :param client: object exposing novacomputes(), novahypervisors(),
+        neutronagents() and evacuate(), e.g. osclient.OSClient
+        :param wait: seconds to wait before re-checking failed nodes
+        """
+        self.client = client
+        self.wait = wait
+
+    def get_down_nodes(self):
+        """
+        :return: list of hosts whose nova-compute service, hypervisor and
+        neutron agent are all reported down
+        """
+        # list all down nova compute
+        nova_down = self.is_nova_service_down()
+        # list all down hypervisors
+        hypervisor_down = set(self.is_hpyervisor_down())
+        # list all down openvswitch agents
+        agents_down = set(self.is_neutron_agents_down())
+
+        return [node for node in nova_down
+                if node in hypervisor_down and node in agents_down]
+
+    def monitor(self):
+        """
+        Run one monitoring pass: find the down nodes, confirm them after
+        the wait period, evacuate them and notify.
+        :raises RuntimeError: if the confirmed nodes could not be evacuated
+        """
+        nodes_down = self.get_down_nodes()
+        if not nodes_down:
+            return
+
+        nodes_to_evacuate = self.process_failed_nodes(nodes_down)
+        if not nodes_to_evacuate:
+            return
+
+        evacuated_nodes = self.evacuate(nodes_to_evacuate)
+        if not evacuated_nodes:
+            # raising a plain string is a TypeError, raise a real exception
+            raise RuntimeError("Error: node didn't evacuated ! %s" %
+                               (nodes_to_evacuate,))
+
+        self.notify(evacuated_nodes)
+
+    # @todo needs to be implemented !
+    def notify(self, nodes):
+        """
+        will be used to notify the admins that there is something went wrong !
+        :param nodes: list of dicts with a 'host' key (see evacuate())
+        """
+        if not nodes:
+            return
+        hosts = ', '.join(str(node['host']) for node in nodes)
+        print("These nodes %s Evacuated" % hosts)
+        print(nodes)
+
+    def evacuate(self, nodes):
+        """
+        Evacuate the given hosts through the client
+        :param nodes: list of host names
+        :return: whatever client.evacuate() returns
+        """
+        # @todo add shutdown process
+        # maintence mode not working with libvirt
+        # self.client.set_in_maintance(nodes)
+        return self.client.evacuate(nodes)
+
+    def process_failed_nodes(self, nodes):
+        """
+        Wait self.wait seconds and check again, so that only nodes that
+        are still down are evacuated
+        :param nodes: hosts that were down on the first check
+        :return: the hosts that are down on both checks
+        """
+        sleep(self.wait)
+        return [node for node in self.get_down_nodes() if node in nodes]
+
+    def is_hpyervisor_down(self):
+        """
+        :return: list of hosts whose hypervisor state is 'down'
+        """
+        down_hosts = []
+        for hypervisor in self.client.novahypervisors():
+            if hypervisor.get('state') == 'down':
+                # 'service' may be missing; do not crash on it
+                service = hypervisor.get('service') or {}
+                down_hosts.append(service.get('host'))
+        return down_hosts
+
+    def is_nova_service_down(self):
+        """
+        :return: list of hosts whose enabled nova-compute service is down
+        """
+        return [node.get('host') for node in self.client.novacomputes()
+                if node.get('state') == 'down' and
+                node.get('status') == 'enabled']
+
+    def is_neutron_agents_down(self):
+        """
+        :return: list of hosts whose administratively-up agent is not alive
+        """
+        return [agent.get('host') for agent in self.client.neutronagents()
+                if agent.get('admin_state_up') and not agent.get('alive')]
diff --git a/osclient.py b/osclient.py
new file mode 100644
index 0000000..ed92810
--- /dev/null
+++ b/osclient.py
@@ -0,0 +1,140 @@
+__author__ = 'saad'
+
+from keystoneclient.auth.identity import v3
+from keystoneclient import session
+from novaclient.v2 import client as novaclient
+from neutronclient.v2_0 import client as neutronclient
+
+
+class OSClient:
+    def __init__(self, authurl, authmethod='password', **kwargs):
+        """
+        Provide Openstack credentials to initalize the connection to Openstack
+        :param authurl: keystone v3 authentication URL
+        :param authmethod: string authmethod should be password or token but
+        currently we support only password !
+        :param kwargs: username, password, user_id, project_name, project_id,
+        user_domain_id, user_domain_name, project_domain_name, endpoint_type
+        :raises ValueError: if authmethod is not 'password'
+        """
+        if authmethod != 'password':
+            # auth() needs the password attributes, so fail early instead of
+            # crashing later with an AttributeError
+            raise ValueError("The available authmethod is password for the "
+                             "time being. Please, provide a password "
+                             "credentials :) ")
+
+        self.authmethod = authmethod
+        self.authurl = authurl
+        self.username = kwargs.get('username')
+        self.password = kwargs.get('password')
+        self.project_name = kwargs.get('project_name')
+        self.project_id = kwargs.get('project_id')
+        self.user_id = kwargs.get('user_id')
+        self.user_domain_id = kwargs.get('user_domain_id')
+        self.user_domain_name = kwargs.get('user_domain_name')
+        self.project_domain_name = kwargs.get('project_domain_name')
+        self.endpoint_type = kwargs.get('endpoint_type', 'internal')
+        # filled by novacomputes(); None means "not fetched yet"
+        self.compute_hosts = None
+
+        self.auth()
+
+    def auth(self):
+        """
+        Create the keystone session used by all clients
+        """
+        auth = v3.Password(auth_url=self.authurl,
+                           username=self.username,
+                           password=self.password,
+                           project_name=self.project_name,
+                           user_domain_id=self.user_domain_id,
+                           user_domain_name=self.user_domain_name,
+                           project_domain_name=self.project_domain_name)
+        self.authSession = session.Session(auth=auth)
+
+    def _new_session(self):
+        """
+        Replace self.authSession with a fresh session on the same auth plugin
+        :return: the new session
+        """
+        self.authSession = session.Session(auth=self.authSession.auth)
+        return self.authSession
+
+    def novacomputes(self):
+        """
+        :return: list of nova-compute services as dicts; also refreshes
+        self.compute_hosts with their host names
+        """
+        nova = novaclient.Client(session=self.authSession,
+                                 endpoint_type=self.endpoint_type)
+        services = [service.to_dict() for service in nova.services.list()]
+        compute_nodes = [service for service in services
+                         if service.get('binary') == 'nova-compute']
+        self.compute_hosts = [node.get('host') for node in compute_nodes]
+        return compute_nodes
+
+    def novahypervisors(self):
+        """
+        :return: list of all hypervisors as dicts
+        """
+        nova = novaclient.Client(session=self.authSession,
+                                 endpoint_type=self.endpoint_type)
+        return [hypervisor.to_dict()
+                for hypervisor in nova.hypervisors.list()]
+
+    def neutronagents(self, hosts=None):
+        """
+        :param hosts: host names to look at; defaults to the compute hosts
+        found by novacomputes()
+        :return: list of neutron-openvswitch-agent dicts on these hosts
+        """
+        if not hosts:
+            if self.compute_hosts is None:
+                # novacomputes() was never called, fetch the hosts now
+                self.novacomputes()
+            hosts = self.compute_hosts
+        neutron = neutronclient.Client(session=self._new_session(),
+                                       endpoint_type=self.endpoint_type)
+        agents = neutron.list_agents()
+        return [agent for agent in agents.get('agents')
+                if agent.get('host') in hosts and
+                agent.get('binary') == 'neutron-openvswitch-agent']
+
+    def evacuate(self, nodes):
+        """
+        Collect the servers hosted on the given nodes
+        :param nodes: list of hypervisor host names
+        :return: list of {'host': node, 'servers': [...]} dicts
+        """
+        nova = novaclient.Client(session=self._new_session(),
+                                 endpoint_type=self.endpoint_type)
+        evacuated_nodes = []
+        for node in nodes:
+            for hypervisor in nova.hypervisors.search(node, True):
+                # @todo the evacuation itself is still disabled:
+                # nova.servers.evacuate(server.get('uuid'),
+                #                       on_shared_storage=True)
+                evacuated_nodes.append({'host': node,
+                                        'servers': hypervisor.servers})
+        return evacuated_nodes
+
+    def set_in_maintance(self, nodes):
+        """
+        Enable maintenance mode on the given hosts
+        :param nodes: list of host names
+        :return: list with the result of every successful update
+        """
+        nova = novaclient.Client(session=self._new_session(),
+                                 endpoint_type=self.endpoint_type)
+        # one list for all nodes; it used to be reset on every iteration
+        # and was undefined for an empty node list
+        output = []
+        values = {"maintenance_mode": "enable"}
+        for node in nodes:
+            host = nova.hosts.get(node)[0]
+            try:
+                output.append(host.update(values))
+            except Exception as e:
+                print("ERROR :::  %s" % e)
+        return output
diff --git a/osha.py b/osha.py
new file mode 100644
index 0000000..c1510ae
--- /dev/null
+++ b/osha.py
@@ -0,0 +1,54 @@
+#__author__ = 'saad'
+from daemon import Daemon
+import sys
+import logging as log
+import time
+log.basicConfig(filename='osha.log')
+
+
+class Osha(Daemon):
+    """
+    The osha service daemon.
+    """
+
+    def run(self):
+        """
+        Main loop of the daemon; currently a placeholder that only sleeps
+        """
+        # @todo scheduling code goes here ! may be apscheduler or just cron !
+        # just as a test ...
+        while True:
+            time.sleep(1)
+
+
+def main(argv):
+    """
+    Dispatch the command line action to the daemon
+    :param argv: full argument vector, argv[0] being the program name
+    :return: process exit status, 0 on success and 2 on a usage error
+    """
+    usage = "Usage %s start|stop|restart|status" % argv[0]
+    if len(argv) != 2:
+        print(usage)
+        return 2
+
+    # won't run unless the folder is already created and have the correct
+    # permissions !
+    osha = Osha('/var/run/osha/osha.pid')
+    actions = {
+        'start': osha.start,
+        'stop': osha.stop,
+        'restart': osha.restart,
+        'status': osha.status,
+    }
+    action = actions.get(argv[1])
+    if action is None:
+        print("Unknown command ")
+        print(usage)
+        return 2
+    action()
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..99185de
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+pbr>=0.6,!=0.7,<1.0
+python-keystoneclient>=1.2.0,<1.4.0
+python-neutronclient>=2.4.0,<2.5.0
+python-novaclient>=2.22.0,<2.24.0
+python-openstackclient>=1.0.3,<1.1.0
+PyYAML>=3.1.0
+oslo.config>=1.9.3,<1.10.0 # Apache-2.0
+oslo.i18n>=1.5.0,<1.6.0 # Apache-2.0
+oslo.log>=1.0.0,<1.1.0 # Apache-2.0
\ No newline at end of file
diff --git a/servers.yml b/servers.yml
new file mode 100644
index 0000000..dae017f
--- /dev/null
+++ b/servers.yml
@@ -0,0 +1,60 @@
+#
+# (c) Copyright 2015 Hewlett Packard Enterprise Development Company LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+---
+
+  servers:
+
+    - id: deployer
+      ip-addr: 192.168.10.254
+      hostname: padawan-cp1-c0-m1-mgmt
+      ilo-ip: 192.168.9.2
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0001
+      ip-addr: 192.168.10.3
+      hostname: padawan-cp1-c1-m1-mgmt
+      ilo-ip: 192.168.9.3
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0002
+      ip-addr: 192.168.10.4
+      hostname: padawan-cp1-c1-m2-mgmt
+      ilo-ip: 192.168.9.4
+      ilo-password: password
+      ilo-user: admin
+
+    - id: ccn-0003
+      ip-addr: 192.168.10.5
+      hostname: padawan-cp1-c1-m3-mgmt
+      ilo-ip: 192.168.9.5
+      ilo-password: password
+      ilo-user: admin
+
+    - id: COMPUTE-0001
+      ip-addr: 192.168.10.6
+      hostname: padawan-ccp-comp0001-mgmt
+      ilo-ip: 192.168.9.6
+      ilo-password: password
+      ilo-user: admin
+
+    - id: COMPUTE-0002
+      ip-addr: 192.168.10.7
+      hostname: padawan-ccp-comp0002-mgmt
+      ilo-ip: 192.168.9.7
+      ilo-password: password
+      ilo-user: admin
\ No newline at end of file
diff --git a/yaml_parser.py b/yaml_parser.py
new file mode 100644
index 0000000..d53da6d
--- /dev/null
+++ b/yaml_parser.py
@@ -0,0 +1,70 @@
+# __author__ = 'saad'
+import yaml
+import os
+
+
+class YamlParser(object):
+
+    _INDEX = 'servers'
+
+    def __init__(self, yml_file, index='servers'):
+        """
+        Provide Yaml file to parse it and process data
+        :param yml_file: path to yaml file
+        :param index: the key in the .yml file to get all servers listed under
+        this key. the default is 'servers'
+        """
+        self.file = yml_file
+        self._INDEX = index
+        self.data = self.parse()
+
+    def parse(self):
+        """
+        Load the yaml file
+        :return: the parsed content of the file
+        :raises ValueError: if no file was given
+        :raises IOError: if the path is not an existing regular file
+        """
+        if not self.file:
+            raise ValueError("No file specified !")
+        # isfile() is already False for a path that does not exist
+        if not os.path.isfile(self.file):
+            raise IOError("File doesn't exist: %s" % self.file)
+
+        # safe_load: the file only holds plain data, so never let yaml
+        # construct arbitrary python objects
+        with open(self.file, 'r') as stream:
+            return yaml.safe_load(stream)
+
+    def find_server_by_ip(self, ip):
+        """
+        get server information ilo username, password and ip
+        :param ip: mgmt ip address of the server, this should be the same like
+        the ip in the .yml file
+        :return: dict contains server information
+        """
+        return self.find_server('ip-addr', ip)
+
+    def find_server_by_hostname(self, hostname):
+        """
+        get server information ilo username, password and ip
+        :param hostname: hostname matches one of the ones in the .yml file
+        :return: dict contains the server information
+        """
+        return self.find_server(key='hostname', value=hostname)
+
+    def find_server(self, key, value):
+        """
+        Generic function to query the .yml file to get server information by any
+        key.
+        :param key: name of the server attribute to compare, e.g. 'hostname'
+        :param value: the value the attribute must be equal to
+        :return: the first matching server dict, None if there is none
+        """
+        # an empty file parses to None and the index key may be missing
+        servers = (self.data or {}).get(self._INDEX) or []
+        for server in servers:
+            if server.get(key) == value:
+                return server
+
+        return None