Add implement of preventing split-brain
This patch adds implementation of preventing split-brain. The part that captures packets of corosync is not impelemented yet. Change-Id: I6bedd28928ac53dfa30b889d2442d748edad2f9c Implements: bp pythonize-host-and-process-monitor
This commit is contained in:
parent
826d909e59
commit
1a0b96caf7
|
@ -49,6 +49,33 @@ If ipmi RA is not set in pacemaker, this value should be set True.
|
|||
cfg.IntOpt('ipmi_retry_interval',
|
||||
default=10,
|
||||
help='Retry interval(in seconds) of the ipmitool command.'),
|
||||
cfg.IntOpt('stonith_wait',
|
||||
default=30,
|
||||
help='Standby time(in seconds) until activate STONITH.'),
|
||||
cfg.IntOpt('tcpdump_timeout',
|
||||
default=5,
|
||||
help='Timeout value(in seconds) of the tcpdump command when'
|
||||
' monitors the corosync communication.'),
|
||||
cfg.StrOpt('corosync_multicast_interfaces',
|
||||
help='''
|
||||
The name of interface that corosync is using for mutual communication
|
||||
between hosts.
|
||||
If there are multiple interfaces, specify them in comma-separated
|
||||
like 'enp0s3,enp0s8'.
|
||||
The number of interfaces you specify must be equal to the number of
|
||||
corosync_multicast_ports values and must be in correct order with relevant
|
||||
ports in corosync_multicast_ports.
|
||||
'''),
|
||||
cfg.StrOpt('corosync_multicast_ports',
|
||||
help='''
|
||||
The port numbers that corosync is using for mutual communication
|
||||
between hosts.
|
||||
If there are multiple port numbers, specify them in comma-separated
|
||||
like '5405,5406'.
|
||||
The number of port numbers you specify must be equal to the number of
|
||||
corosync_multicast_interfaces values and must be in correct order with
|
||||
relevant interfaces in corosync_multicast_interfaces.
|
||||
'''),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -46,6 +46,92 @@ class HandleHost(driver.DriverBase):
|
|||
self.status_holder = hold_host_status.HostHoldStatus()
|
||||
self.notifier = masakari.SendNotification()
|
||||
|
||||
def _check_pacemaker_services(self, target_service):
|
||||
try:
|
||||
cmd_str = 'systemctl status ' + target_service
|
||||
command = cmd_str.split(' ')
|
||||
|
||||
# Execute command.
|
||||
out, err = utils.execute(*command, run_as_root=True)
|
||||
|
||||
if err:
|
||||
raise Exception
|
||||
|
||||
return True
|
||||
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _check_hb_line(self):
|
||||
"""Check whether the corosync communication is normal.
|
||||
|
||||
:returns: 0 if normal, 1 if abnormal, 2 if configuration file is
|
||||
wrong or neither pacemaker nor pacemaker-remote is running.
|
||||
"""
|
||||
# Check whether the pacemaker services is normal.
|
||||
corosync_status = self._check_pacemaker_services('corosync')
|
||||
pacemaker_status = self._check_pacemaker_services('pacemaker')
|
||||
pacemaker_remote_status = self._check_pacemaker_services(
|
||||
'pacemaker_remote')
|
||||
|
||||
if corosync_status is False or pacemaker_status is False:
|
||||
if pacemaker_remote_status is False:
|
||||
LOG.error(
|
||||
_LE("Neither pacemaker nor pacemaker-remote is running."))
|
||||
return 2
|
||||
else:
|
||||
LOG.info(_LI("Works on pacemaker-remote."))
|
||||
return 0
|
||||
|
||||
# Check whether the neccesary parameters are set.
|
||||
if CONF.host.corosync_multicast_interfaces is None or \
|
||||
CONF.host.corosync_multicast_ports is None:
|
||||
msg = ("corosync_multicast_interfaces or "
|
||||
"corosync_multicast_ports is not set.")
|
||||
LOG.error(_LE("%s"), msg)
|
||||
return 2
|
||||
|
||||
# Check whether the corosync communication is normal.
|
||||
corosync_multicast_interfaces = \
|
||||
CONF.host.corosync_multicast_interfaces.split(',')
|
||||
corosync_multicast_ports = \
|
||||
CONF.host.corosync_multicast_ports.split(',')
|
||||
|
||||
if len(corosync_multicast_interfaces) != len(corosync_multicast_ports):
|
||||
msg = ("Incorrect parameters corosync_multicast_interfaces or "
|
||||
"corosync_multicast_ports.")
|
||||
LOG.error(_LE("%s"), msg)
|
||||
return 2
|
||||
|
||||
is_nic_normal = False
|
||||
for num in range(0, len(corosync_multicast_interfaces)):
|
||||
cmd_str = ("timeout %s tcpdump -n -c 1 -p -i %s port %s") \
|
||||
% (CONF.host.tcpdump_timeout,
|
||||
corosync_multicast_interfaces[num],
|
||||
corosync_multicast_ports[num])
|
||||
command = cmd_str.split(' ')
|
||||
|
||||
try:
|
||||
# Execute crmadmin command.
|
||||
out, err = utils.execute(*command, run_as_root=True)
|
||||
|
||||
# If command doesn't raise exception, nic is normal.
|
||||
msg = ("Corosync communication using '%s' is normal.") \
|
||||
% corosync_multicast_interfaces[num]
|
||||
LOG.info(_LI("%s"), msg)
|
||||
is_nic_normal = True
|
||||
break
|
||||
except Exception:
|
||||
msg = ("Corosync communication using '%s' is failed.") \
|
||||
% corosync_multicast_interfaces[num]
|
||||
LOG.warning(_LW("%s"), msg)
|
||||
|
||||
if is_nic_normal is False:
|
||||
LOG.error(_LE("Corosync communication is failed."))
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
def _check_host_status_by_crmadmin(self):
|
||||
try:
|
||||
# Execute crmadmin command.
|
||||
|
@ -256,12 +342,30 @@ class HandleHost(driver.DriverBase):
|
|||
self.running = True
|
||||
while self.running:
|
||||
|
||||
# Check the host status is stable or unstable by crmadmin.
|
||||
if self._check_host_status_by_crmadmin() != 0:
|
||||
# Check whether corosync communication between hosts
|
||||
# is normal.
|
||||
ret = self._check_hb_line()
|
||||
if ret == 1:
|
||||
# Because my host may be fenced by stonith due to split
|
||||
# brain condition, sleep for a certain time.
|
||||
eventlet.greenthread.sleep(CONF.host.stonith_wait)
|
||||
elif ret == 2:
|
||||
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
||||
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
||||
continue
|
||||
|
||||
# Check the host status is stable or unstable by crmadmin.
|
||||
# It only checks when this process runs on the full cluster
|
||||
# stack of corosync.
|
||||
pacemaker_remote_status = self._check_pacemaker_services(
|
||||
'pacemaker_remote')
|
||||
if pacemaker_remote_status is False:
|
||||
if self._check_host_status_by_crmadmin() != 0:
|
||||
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
||||
eventlet.greenthread.sleep(
|
||||
CONF.host.monitoring_interval)
|
||||
continue
|
||||
|
||||
# Check the host status is online or offline by cibadmin.
|
||||
if self._check_host_status_by_cibadmin() != 0:
|
||||
LOG.warning(_LW("hostmonitor skips monitoring hosts."))
|
||||
|
|
|
@ -35,7 +35,11 @@ class TestHandleHost(testtools.TestCase):
|
|||
@mock.patch.object(parse_cib_xml.ParseCibXml, 'have_quorum')
|
||||
@mock.patch.object(parse_cib_xml.ParseCibXml, 'set_cib_xml')
|
||||
@mock.patch.object(utils, 'execute')
|
||||
@mock.patch.object(handle_host.HandleHost, '_check_pacemaker_services')
|
||||
@mock.patch.object(handle_host.HandleHost, '_check_hb_line')
|
||||
def test_monitor_hosts(self,
|
||||
mock_check_hb_line,
|
||||
mock_check_pacemaker_services,
|
||||
mock_execute,
|
||||
mock_set_cib_xml,
|
||||
mock_have_quorum,
|
||||
|
@ -43,6 +47,8 @@ class TestHandleHost(testtools.TestCase):
|
|||
|
||||
obj = handle_host.HandleHost()
|
||||
|
||||
mock_check_hb_line.return_value = 0
|
||||
mock_check_pacemaker_services.return_value = False
|
||||
mock_execute.return_value = (EXECUTE_RETURN, '')
|
||||
mock_set_cib_xml.return_value = None
|
||||
mock_have_quorum.return_value = 0
|
||||
|
@ -50,3 +56,14 @@ class TestHandleHost(testtools.TestCase):
|
|||
|
||||
ret = obj.monitor_hosts()
|
||||
self.assertEqual(None, ret)
|
||||
|
||||
@mock.patch.object(utils, 'execute')
|
||||
def test_check_hb_line(self,
|
||||
mock_execute):
|
||||
|
||||
obj = handle_host.HandleHost()
|
||||
|
||||
mock_execute.return_value = ('', '')
|
||||
|
||||
ret = obj._check_hb_line()
|
||||
self.assertEqual(2, ret)
|
||||
|
|
Loading…
Reference in New Issue