Add implementation of split-brain prevention

This patch adds an implementation of split-brain prevention.
The part that captures corosync packets is not implemented yet.

Change-Id: I6bedd28928ac53dfa30b889d2442d748edad2f9c
Implements: bp pythonize-host-and-process-monitor
This commit is contained in:
Kengo Takahara 2017-01-26 18:57:49 +09:00
parent 826d909e59
commit 1a0b96caf7
3 changed files with 150 additions and 2 deletions

View File

@ -49,6 +49,33 @@ If ipmi RA is not set in pacemaker, this value should be set True.
cfg.IntOpt('ipmi_retry_interval',
default=10,
help='Retry interval(in seconds) of the ipmitool command.'),
cfg.IntOpt('stonith_wait',
default=30,
help='Standby time(in seconds) until activate STONITH.'),
cfg.IntOpt('tcpdump_timeout',
default=5,
help='Timeout value(in seconds) of the tcpdump command when'
' monitors the corosync communication.'),
cfg.StrOpt('corosync_multicast_interfaces',
help='''
The name of interface that corosync is using for mutual communication
between hosts.
If there are multiple interfaces, specify them in comma-separated
like 'enp0s3,enp0s8'.
The number of interfaces you specify must be equal to the number of
corosync_multicast_ports values and must be in correct order with relevant
ports in corosync_multicast_ports.
'''),
cfg.StrOpt('corosync_multicast_ports',
help='''
The port numbers that corosync is using for mutual communication
between hosts.
If there are multiple port numbers, specify them in comma-separated
like '5405,5406'.
The number of port numbers you specify must be equal to the number of
corosync_multicast_interfaces values and must be in correct order with
relevant interfaces in corosync_multicast_interfaces.
'''),
]

View File

@ -46,6 +46,92 @@ class HandleHost(driver.DriverBase):
self.status_holder = hold_host_status.HostHoldStatus()
self.notifier = masakari.SendNotification()
def _check_pacemaker_services(self, target_service):
    """Report whether ``systemctl status <target_service>`` looks healthy.

    The command is run as root through ``utils.execute``.

    :param target_service: name of the systemd unit to query.
    :returns: True if the command completed without raising and wrote
              nothing to stderr, False in every other case.
    """
    try:
        argv = ('systemctl status ' + target_service).split(' ')
        _stdout, stderr = utils.execute(*argv, run_as_root=True)
        # Any stderr output is treated the same way as a failed command.
        return not stderr
    except Exception:
        # NOTE(review): presumably utils.execute raises when systemctl
        # exits non-zero (unit stopped or unknown) -- confirm.
        return False
def _check_hb_line(self):
    """Check whether the corosync communication is normal.

    The state of the corosync, pacemaker and pacemaker_remote services is
    queried first. When the full stack (corosync and pacemaker) is
    running, each configured interface/port pair is probed with tcpdump
    until one of them shows traffic.

    :returns: 0 if normal, 1 if abnormal, 2 if configuration file is
              wrong or neither pacemaker nor pacemaker-remote is running.
    """
    # Check whether the pacemaker services are normal.
    corosync_status = self._check_pacemaker_services('corosync')
    pacemaker_status = self._check_pacemaker_services('pacemaker')
    pacemaker_remote_status = self._check_pacemaker_services(
        'pacemaker_remote')

    if corosync_status is False or pacemaker_status is False:
        if pacemaker_remote_status is False:
            LOG.error(
                _LE("Neither pacemaker nor pacemaker-remote is running."))
            return 2
        # Only pacemaker-remote is running: the tcpdump probe below is
        # skipped and the line is reported as normal.
        LOG.info(_LI("Works on pacemaker-remote."))
        return 0

    # Check whether the necessary parameters are set.
    if CONF.host.corosync_multicast_interfaces is None or \
            CONF.host.corosync_multicast_ports is None:
        msg = ("corosync_multicast_interfaces or "
               "corosync_multicast_ports is not set.")
        LOG.error(_LE("%s"), msg)
        return 2

    # Both options are comma-separated lists. Surrounding whitespace is
    # stripped so that a value such as 'enp0s3, enp0s8' does not put a
    # blank-padded name into the tcpdump command line.
    interfaces = [
        name.strip()
        for name in CONF.host.corosync_multicast_interfaces.split(',')]
    ports = [
        port.strip()
        for port in CONF.host.corosync_multicast_ports.split(',')]

    if len(interfaces) != len(ports):
        msg = ("Incorrect parameters corosync_multicast_interfaces or "
               "corosync_multicast_ports.")
        LOG.error(_LE("%s"), msg)
        return 2

    # Check whether the corosync communication is normal.
    # Interfaces and ports are paired by position.
    for interface, port in zip(interfaces, ports):
        # 'tcpdump -c 1' exits after capturing one packet; 'timeout'
        # aborts it when nothing arrives within tcpdump_timeout seconds.
        command = ['timeout', str(CONF.host.tcpdump_timeout),
                   'tcpdump', '-n', '-c', '1', '-p',
                   '-i', interface, 'port', port]
        try:
            # Execute tcpdump command.
            utils.execute(*command, run_as_root=True)
        except Exception:
            msg = ("Corosync communication using '%s' is failed.") \
                % interface
            LOG.warning(_LW("%s"), msg)
            continue

        # If command doesn't raise exception, nic is normal. One working
        # interface is enough, so the remaining pairs are not probed.
        msg = ("Corosync communication using '%s' is normal.") \
            % interface
        LOG.info(_LI("%s"), msg)
        return 0

    # No interface showed corosync traffic.
    LOG.error(_LE("Corosync communication is failed."))
    return 1
def _check_host_status_by_crmadmin(self):
try:
# Execute crmadmin command.
@ -256,12 +342,30 @@ class HandleHost(driver.DriverBase):
self.running = True
while self.running:
# Check the host status is stable or unstable by crmadmin.
if self._check_host_status_by_crmadmin() != 0:
# Check whether corosync communication between hosts
# is normal.
ret = self._check_hb_line()
if ret == 1:
# Because my host may be fenced by stonith due to split
# brain condition, sleep for a certain time.
eventlet.greenthread.sleep(CONF.host.stonith_wait)
elif ret == 2:
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
continue
# Check the host status is stable or unstable by crmadmin.
# It only checks when this process runs on the full cluster
# stack of corosync.
pacemaker_remote_status = self._check_pacemaker_services(
'pacemaker_remote')
if pacemaker_remote_status is False:
if self._check_host_status_by_crmadmin() != 0:
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
eventlet.greenthread.sleep(
CONF.host.monitoring_interval)
continue
# Check the host status is online or offline by cibadmin.
if self._check_host_status_by_cibadmin() != 0:
LOG.warning(_LW("hostmonitor skips monitoring hosts."))

View File

@ -35,7 +35,11 @@ class TestHandleHost(testtools.TestCase):
@mock.patch.object(parse_cib_xml.ParseCibXml, 'have_quorum')
@mock.patch.object(parse_cib_xml.ParseCibXml, 'set_cib_xml')
@mock.patch.object(utils, 'execute')
@mock.patch.object(handle_host.HandleHost, '_check_pacemaker_services')
@mock.patch.object(handle_host.HandleHost, '_check_hb_line')
def test_monitor_hosts(self,
mock_check_hb_line,
mock_check_pacemaker_services,
mock_execute,
mock_set_cib_xml,
mock_have_quorum,
@ -43,6 +47,8 @@ class TestHandleHost(testtools.TestCase):
obj = handle_host.HandleHost()
mock_check_hb_line.return_value = 0
mock_check_pacemaker_services.return_value = False
mock_execute.return_value = (EXECUTE_RETURN, '')
mock_set_cib_xml.return_value = None
mock_have_quorum.return_value = 0
@ -50,3 +56,14 @@ class TestHandleHost(testtools.TestCase):
ret = obj.monitor_hosts()
self.assertEqual(None, ret)
@mock.patch.object(utils, 'execute')
def test_check_hb_line(self, mock_execute):
    """_check_hb_line returns 2 when every executed command succeeds.

    NOTE(review): the expected 2 presumably comes from the
    corosync_multicast_* options being unset in the test configuration
    -- confirm.
    """
    handler = handle_host.HandleHost()
    # Empty stdout and stderr for every command run by the handler.
    mock_execute.return_value = ('', '')

    self.assertEqual(2, handler._check_hb_line())