HACluster refactoring

Andres Rodriguez 2012-12-11 07:54:36 -05:00
parent 1c22ba36b4
commit f2b2373497
7 changed files with 191 additions and 81 deletions

TODO
View File

@ -15,23 +15,13 @@ HA Cluster (pacemaker/corosync) Charm
* TODO: Fix disabling of upstart jobs
- sudo sh -c "echo 'manual' > /etc/init/SERVICE.override"
* BIG PROBLEM:
- given that we can only deploy hacluster once, and its config defines
  the corosync configuration options, we need to change how the corosync
  configuration is defined. Possible solution:
- the 'service/charm' that uses hacluster defines the corosync options
- instead of a network address, it defines the interface to use, assuming each ethX
  interface is connected to the same network, and the network address is autodetected
  (see the sketch at the end of this file).
* TODO: on 'juju destroy-service quantum', ha-relation-broken is executed;
  we need to put nodes in standby or delete them.
* ERROR/BUG (discuss with jamespage):
- On add-unit in controller environment:
- subordinate (in added unit) gets the relation data in ha-relation-joined
- On add-unit in openstack
- subordinate (in added unit) *DOESN'T* get the relation data in ha-relation-joined
- This is fine, really, because we don't need to re-add the services.
- However, the problem is that upstart jobs don't get stopped.
update-rc.d -f pacemaker remove
update-rc.d pacemaker start 50 1 2 3 4 5 . stop 01 0 6 .
TODO: The problem seems to be that the peer relation gets executed before the subordinate relation.
In that case, the peer relation would have to put nodes in standby, and then the subordinate relation
would have to put the nodes online and configure the services. Or probably not use it at all.
hanode-relation puts the node in standby.
ha-relation counts the nodes in hanode-relation and, if there are >= 2, onlines them and sets up the cluster.
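A minimal sketch of that approach, assuming a principal charm that relates to hacluster. relation-set is the standard Juju hook tool; the interface name and port below are illustrative, and only the keys corosync_bindiface and corosync_mcastport correspond to what hooks.py reads from the relation.

# Hypothetical hook snippet in a principal charm that uses hacluster as a
# subordinate: it hands over an interface name and multicast port, and
# hacluster derives corosync_bindnetaddr itself with get_network_address().
import subprocess

def ha_relation_joined():
    subprocess.check_call([
        'relation-set',
        'corosync_bindiface=eth0',    # example interface to bind corosync to
        'corosync_mcastport=5405',    # example multicast port for this cluster
    ])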

View File

@ -1,9 +1,4 @@
options:
corosync_bindnetaddr:
type: string
description: |
Network address of the interface on which corosync will communicate
with the other nodes of the cluster.
corosync_mcastaddr:
default: 226.94.1.1
type: string
@ -11,13 +6,6 @@ options:
Multicast IP address to use for exchanging messages over the network.
If multiple clusters are on the same bindnetaddr network, this value
can be changed.
corosync_mcastport:
default: 5405
type: int
description: |
Multicast Port number to use for exchanging messages. If multiple
clusters sit on the same Multicast IP Address, this value needs to
be changed.
corosync_pcmk_ver:
default: 1
type: int
@ -26,6 +14,7 @@ options:
Corosync how to start pacemaker
corosync_key:
type: string
default: corosync-key
description: |
This value will become the Corosync authentication key. To generate
a suitable value use:

View File

@ -0,0 +1 @@
hooks.py

View File

@ -21,27 +21,42 @@ import pcmk
def install():
utils.juju_log('INFO', 'Begin install hook.')
utils.configure_source()
utils.install('corosync', 'pacemaker', 'openstack-resource-agents')
utils.install('corosync', 'pacemaker', 'openstack-resource-agents', 'python-netaddr')
utils.juju_log('INFO', 'End install hook.')
def get_corosync_conf():
for relid in utils.relation_ids('ha'):
for unit in utils.relation_list(relid):
conf = {
'corosync_bindnetaddr': utils.get_network_address(
utils.relation_get('corosync_bindiface',
unit, relid)),
'corosync_mcastport': utils.relation_get('corosync_mcastport',
unit, relid),
'corosync_mcastaddr': utils.config_get('corosync_mcastaddr'),
'corosync_pcmk_ver': utils.config_get('corosync_pcmk_ver'),
}
if None not in conf.itervalues():
return conf
return None
def emit_corosync_conf():
# read config variables
corosync_conf_context = {
'corosync_bindnetaddr': utils.config_get('corosync_bindnetaddr'),
'corosync_mcastaddr': utils.config_get('corosync_mcastaddr'),
'corosync_mcastport': utils.config_get('corosync_mcastport'),
'corosync_pcmk_ver': utils.config_get('corosync_pcmk_ver'),
}
# write /etc/default/corosync file
with open('/etc/default/corosync', 'w') as corosync_default:
corosync_default.write(utils.render_template('corosync', corosync_conf_context))
corosync_conf_context = get_corosync_conf()
# write config file (/etc/corosync/corosync.conf)
with open('/etc/corosync/corosync.conf', 'w') as corosync_conf:
corosync_conf.write(utils.render_template('corosync.conf', corosync_conf_context))
def emit_base_conf():
corosync_default_context = {'corosync_enabled': 'yes'}
# write /etc/default/corosync file
with open('/etc/default/corosync', 'w') as corosync_default:
corosync_default.write(utils.render_template('corosync', corosync_default_context))
# write the authkey
corosync_key=utils.config_get('corosync_key')
with open(corosync_key, 'w') as corosync_key_file:
@ -51,12 +66,6 @@ def emit_corosync_conf():
def config_changed():
utils.juju_log('INFO', 'Begin config-changed hook.')
# validate configuration options
corosync_bindnetaddr = utils.config_get('corosync_bindnetaddr')
if corosync_bindnetaddr == '':
utils.juju_log('CRITICAL', 'No bindnetaddr supplied, cannot proceed.')
sys.exit(1)
corosync_key = utils.config_get('corosync_key')
if corosync_key == '':
utils.juju_log('CRITICAL',
@ -64,7 +73,7 @@ def config_changed():
sys.exit(1)
# Create a new config file
emit_corosync_conf()
emit_base_conf()
utils.juju_log('INFO', 'End config-changed hook.')
@ -81,14 +90,15 @@ def start():
else:
utils.start("corosync")
# TODO: Only start pacemaker after making sure
# Only start pacemaker after making sure
# corosync has been started
# Wait a few seconds for corosync to start.
time.sleep(2)
if utils.running("pacemaker"):
utils.restart("pacemaker")
else:
utils.start("pacemaker")
if utils.running("corosync"):
if utils.running("pacemaker"):
utils.restart("pacemaker")
else:
utils.start("pacemaker")
def stop():
@ -100,14 +110,23 @@ def stop():
def ha_relation():
utils.juju_log('INFO', 'Begin ha relation joined/changed hook')
pcmk.wait_for_pcmk()
if utils.relation_get("corosync_bindiface") is None:
return
elif utils.relation_get("corosync_mcastport") is None:
return
else:
emit_corosync_conf()
utils.restart("corosync")
time.sleep(2)
utils.restart("pacemaker")
cmd = "crm configure property stonith-enabled=false"
pcmk.commit(cmd)
cmd = "crm configure property no-quorum-policy=ignore"
pcmk.commit(cmd)
cmd = 'crm configure rsc_defaults $id="rsc-options" resource-stickiness="100"'
pcmk.commit(cmd)
# Check that there are enough nodes in order to perform the
# configuration of the HA cluster
if len(get_cluster_nodes()) < 2:
return
else:
utils.juju_log('INFO', 'ha-relation: Waiting for PCMK to start')
pcmk.wait_for_pcmk()
# Obtain relation information
import ast
@ -120,6 +139,7 @@ def ha_relation():
init_services = {} if utils.relation_get("init_services") is None else ast.literal_eval(utils.relation_get("init_services"))
# Configuring the Resource
utils.juju_log('INFO', 'ha-relation: Configuring Resources')
for res_name,res_type in resources.iteritems():
# disable the service we are going to put in HA
if res_type.split(':')[0] == "lsb":
@ -131,7 +151,8 @@ def ha_relation():
if utils.running(init_services[res_name]):
utils.stop(init_services[res_name])
# Put the services in HA, if not already done so
if not pcmk.is_resource_present(res_name):
#if not pcmk.is_resource_present(res_name):
if not pcmk.crm_opt_exists(res_name):
if resource_params[res_name] is None:
cmd = 'crm -F configure primitive %s %s' % (res_name, res_type)
else:
@ -140,28 +161,45 @@ def ha_relation():
utils.juju_log('INFO', '%s' % cmd)
# Configuring groups
utils.juju_log('INFO', 'ha-relation: Configuring Groups')
for grp_name, grp_params in groups.iteritems():
cmd = 'crm -F configure group %s %s' % (grp_name, grp_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
if not pcmk.crm_opt_exists(grp_name):
cmd = 'crm -F configure group %s %s' % (grp_name, grp_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
# Configuring ordering
utils.juju_log('INFO', 'ha-relation: Configuring Orders')
for ord_name, ord_params in orders.iteritems():
cmd = 'crm -F configure order %s %s' % (ord_name, ord_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
if not pcmk.crm_opt_exists(ord_name):
cmd = 'crm -F configure order %s %s' % (ord_name, ord_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
# Configuring colocations
utils.juju_log('INFO', 'ha-relation: Configuring Colocations')
for col_name, col_params in colocations.iteritems():
cmd = 'crm -F configure colocation %s %s' % (col_name, col_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
if not pcmk.crm_opt_exists(col_name):
cmd = 'crm -F configure colocation %s %s' % (col_name, col_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
# Configuring clones
utils.juju_log('INFO', 'ha-relation: Configuring Clones')
for cln_name, cln_params in clones.iteritems():
cmd = 'crm -F configure clone %s %s' % (cln_name, cln_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
if not pcmk.crm_opt_exists(cln_name):
cmd = 'crm -F configure clone %s %s' % (cln_name, cln_params)
pcmk.commit(cmd)
utils.juju_log('INFO', '%s' % cmd)
for res_name,res_type in resources.iteritems():
# TODO: This should first check that the resource is running
if len(init_services) != 0 and res_name in init_services:
# If the resource is in HA already and it is a service, restart
# the pcmk resource, as the config file might have been changed by the
# principal charm
cmd = 'crm resource restart %s' % res_name
pcmk.commit(cmd)
utils.juju_log('INFO', 'End ha relation joined/changed hook')
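For reference, a sketch of the relation data those loops expect. The keys (resources, resource_params, groups, init_services) are the ones ha_relation reads; the concrete resource names and parameters below are hypothetical and would normally be published by the principal charm as the str() of each dict, then recovered here with ast.literal_eval().

# Hypothetical relation data for illustration only.
resources       = {'res_foo_vip': 'ocf:heartbeat:IPaddr2',
                   'res_foo_haproxy': 'lsb:haproxy'}
resource_params = {'res_foo_vip': 'params ip="192.168.1.10" nic="eth0"',
                   'res_foo_haproxy': None}
groups          = {'grp_foo': 'res_foo_vip res_foo_haproxy'}
init_services   = {'res_foo_haproxy': 'haproxy'}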
@ -171,19 +209,50 @@ def ha_relation_departed():
# If this happens, and a new relation is created on the same machine
# (which already has a node), then check whether it is in standby and put it
# online. This should be done in ha_relation_joined.
cmd = "crm -F node standby %s" % utils.get_unit_hostname()
pcmk.commit(cmd)
pcmk.standby(utils.get_unit_hostname())
def get_cluster_nodes():
hosts = []
hosts.append('{}:6789'.format(utils.get_host_ip()))
for relid in utils.relation_ids('hanode'):
for unit in utils.relation_list(relid):
hosts.append(
'{}:6789'.format(utils.get_host_ip(
utils.relation_get('private-address',
unit, relid)))
)
hosts.sort()
return hosts
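Note that the returned list contains this unit's own address plus one entry per peer on the hanode relation, so a length of 2 or more means at least one peer has joined. A hypothetical result for a two-node cluster:

# Example return value (addresses are illustrative):
# ['192.168.1.11:6789', '192.168.1.12:6789']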
def hanode_relation():
utils.juju_log('INFO', 'Begin hanode peer relation hook')
if len(get_cluster_nodes()) >= 2:
utils.juju_log('INFO', 'hanode-relation: Waiting for PCMK to start')
pcmk.wait_for_pcmk()
utils.juju_log('INFO', 'hanode-relation: Doing global configuration')
cmd = "crm configure property stonith-enabled=false"
pcmk.commit(cmd)
cmd = "crm configure property no-quorum-policy=ignore"
pcmk.commit(cmd)
cmd = 'crm configure rsc_defaults $id="rsc-options" resource-stickiness="100"'
pcmk.commit(cmd)
utils.do_hooks({
'config-changed': config_changed,
'install': install,
'config-changed': config_changed,
'start': start,
'stop': stop,
'upgrade-charm': upgrade_charm,
'ha-relation-joined': ha_relation,
'ha-relation-changed': ha_relation,
'ha-relation-departed': ha_relation_departed,
'hanode-relation-joined': hanode_relation,
#'hanode-relation-departed': hanode_relation_departed, # TODO: should probably remove nodes from the cluster
})

View File

@ -20,12 +20,33 @@ def commit(cmd):
subprocess.call(cmd.split())
#def wait_for_cluster():
# while (not is_running()):
# time.sleep(3)
def is_resource_present(resource):
(status, output) = commands.getstatusoutput("crm resource status %s" % resource)
if status != 0:
return False
return True
def standby(node=None):
if node is None:
cmd = "crm -F node standby"
else:
cmd = "crm -F node standby %s" % node
commit(cmd)
def online(node=None):
if node is None:
cmd = "crm -F node online"
else:
cmd = "crm -F node online %s" % node
commit(cmd)
def crm_opt_exists(opt_name):
(status, output) = commands.getstatusoutput("crm configure show")
show_re = re.compile(opt_name)
opt = show_re.search(output)
if opt:
return True
return False
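A brief usage sketch for the helpers above (the node and resource names are hypothetical). One caveat worth noting: crm_opt_exists() runs a bare regex search over the output of 'crm configure show', so a name that is a prefix of another configured object will also match.

# Hypothetical usage from a hook:
import pcmk

pcmk.standby('juju-machine-3')   # or pcmk.standby() for the local node
if not pcmk.crm_opt_exists('res_foo_vip'):
    pcmk.commit('crm -F configure primitive res_foo_vip '
                'ocf:heartbeat:IPaddr2 params ip="192.168.1.10"')
pcmk.online('juju-machine-3')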

View File

@ -13,6 +13,8 @@ import re
import subprocess
import socket
import sys
import fcntl
import struct
def do_hooks(hooks):
@ -43,6 +45,12 @@ except ImportError:
install('python-jinja2')
import jinja2
try:
from netaddr import *
except:
install('python-netaddr')
from netaddr import *
def render_template(template_name, context, template_dir=TEMPLATES_DIR):
templates = jinja2.Environment(
@ -223,3 +231,35 @@ def disable_lsb_services(*services):
def enable_lsb_services(*services):
for service in services:
subprocess.call(['update-rc.d','-f',service,'defaults'])
def get_iface_ipaddr(iface):
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
return socket.inet_ntoa(fcntl.ioctl(
s.fileno(),
0x8919, # SIOCGIFADDR
struct.pack('256s', iface[:15])
)[20:24])
def get_iface_netmask(iface):
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
return socket.inet_ntoa(fcntl.ioctl(
s.fileno(),
0x891b, # SIOCGIFNETMASK
struct.pack('256s', iface[:15])
)[20:24])
def get_netmask_cidr(netmask):
netmask = netmask.split('.')
binary_str = ''
for octet in netmask:
binary_str += bin(int(octet))[2:].zfill(8)
return str(len(binary_str.rstrip('0')))
def get_network_address(iface):
network = "%s/%s" % (get_iface_ipaddr(iface), get_netmask_cidr(get_iface_netmask(iface)))
ip = IPNetwork(network)
return str(ip.network)
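As a worked example of the helpers above (interface name and addresses are hypothetical): for an interface holding 192.168.1.23 with netmask 255.255.255.0, the CIDR prefix works out to 24 and the network address to 192.168.1.0, which is what corosync_bindnetaddr ends up set to.

# Illustrative values only.
get_iface_ipaddr('eth0')             # -> '192.168.1.23' (SIOCGIFADDR ioctl)
get_iface_netmask('eth0')            # -> '255.255.255.0' (SIOCGIFNETMASK ioctl)
get_netmask_cidr('255.255.255.0')    # -> '24' (count of leading 1 bits)
get_network_address('eth0')          # -> '192.168.1.0' (netaddr: IPNetwork('192.168.1.23/24').network)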

View File

@ -1,3 +1,3 @@
# Configuration file created by the ha charm
# start corosync at boot [yes|no]
START=yes
START={{ corosync_enabled }}