Use more correct approach to detect resources status
Closes-bug: 1499696 Change-Id: Ied72bc68ba41e47334796f09b407ebd8a3fb7f40
This commit is contained in:
parent
2cbab19251
commit
3d06b9d8dd
|
@ -11,6 +11,8 @@
|
|||
# under the License.
|
||||
|
||||
import mock
|
||||
import pytest
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from octane.util import maintenance
|
||||
from octane.util import subprocess
|
||||
|
@ -21,6 +23,32 @@ def test_get_crm_services():
|
|||
assert sorted(res) == CRM_XML_PARSE_RESULT
|
||||
|
||||
|
||||
@pytest.mark.parametrize("resource_list,status,expected_result", [
|
||||
(["master_p_rabbitmq-server", "vip__management_old"], False, False),
|
||||
(["master_p_rabbitmq-server", "vip__management_old"], True, False),
|
||||
(["master_p_rabbitmq-server", "p_ceilometer-alarm-evaluator"], False,
|
||||
True),
|
||||
(["clone_p_neutron-metadata-agent", "vip__management_old",
|
||||
"group__zabbix-server"], True, True),
|
||||
(["test1", "vip__management_old"], True, False),
|
||||
(["test1", "test2"], False, True),
|
||||
])
|
||||
def test_resources_synced(resource_list, status, expected_result):
|
||||
res = maintenance.is_resources_synced(resource_list, CRM_XML_STATUS_SAMPLE,
|
||||
status)
|
||||
assert res is expected_result
|
||||
|
||||
|
||||
def test_resources_status():
|
||||
data = ElementTree.fromstring(CRM_XML_STATUS_SAMPLE)
|
||||
resources = next(el for el in data if el.tag == 'resources')
|
||||
|
||||
result = []
|
||||
for resource in resources:
|
||||
result.append(maintenance.is_resource_active(resource))
|
||||
assert result == [True, False, False, True, True]
|
||||
|
||||
|
||||
def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
||||
mock_subprocess, node):
|
||||
get_one_controller = mocker.patch('octane.util.env.get_one_controller')
|
||||
|
@ -34,6 +62,9 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
|||
|
||||
mocker.patch('time.sleep')
|
||||
|
||||
wait_for_services = \
|
||||
mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
|
||||
|
||||
maintenance.stop_corosync_services('env')
|
||||
|
||||
assert not mock_subprocess.called
|
||||
|
@ -41,6 +72,8 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
|||
mock_ssh_call_output.assert_called_once_with(['cibadmin', '--query',
|
||||
'--scope', 'resources'],
|
||||
node=node)
|
||||
assert wait_for_services.call_args_list == \
|
||||
[mock.call('env', ['s1', 's2'], 'stop')]
|
||||
assert mock_ssh_call.call_args_list == [
|
||||
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
|
||||
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
|
||||
|
@ -57,10 +90,16 @@ def test_start_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
|
|||
mock_ssh_call.side_effect = \
|
||||
[None, subprocess.CalledProcessError(1, 'cmd'), None]
|
||||
|
||||
wait_for_services = \
|
||||
mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
|
||||
|
||||
maintenance.start_corosync_services('env')
|
||||
|
||||
mock_ssh_call_output.assert_called_once_with(
|
||||
['cibadmin', '--query', '--scope', 'resources'], node=node)
|
||||
|
||||
assert wait_for_services.call_args_list == \
|
||||
[mock.call('env', ['test_service1', 'test_service2'], 'start')]
|
||||
assert mock_ssh_call.call_args_list == [
|
||||
mock.call(['crm', 'resource', 'start', 'test_service1'], node=node),
|
||||
mock.call(['crm', 'resource', 'start', 'test_service2'], node=node),
|
||||
|
@ -458,12 +497,61 @@ CRM_XML_SAMPLE = """
|
|||
</resources>
|
||||
"""[1:] # noqa
|
||||
CRM_XML_PARSE_RESULT = [
|
||||
'clone_p_dns',
|
||||
'clone_p_haproxy',
|
||||
'clone_p_heat-engine',
|
||||
'clone_p_mysql',
|
||||
'clone_p_neutron-dhcp-agent',
|
||||
'clone_p_neutron-l3-agent',
|
||||
'clone_p_neutron-metadata-agent',
|
||||
'clone_p_neutron-plugin-openvswitch-agent',
|
||||
'clone_p_ntp',
|
||||
'clone_p_vrouter',
|
||||
'group__zabbix-server',
|
||||
'master_p_conntrackd',
|
||||
'master_p_rabbitmq-server',
|
||||
'p_ceilometer-agent-central',
|
||||
'p_ceilometer-alarm-evaluator',
|
||||
'vip__management',
|
||||
'vip__public',
|
||||
'vip__vrouter',
|
||||
'vip__vrouter_pub'
|
||||
]
|
||||
CRM_XML_STATUS_SAMPLE = """
|
||||
<crm_mon version="1.1.12">
|
||||
<resources>
|
||||
<resource id="vip__management_old" resource_agent="ocf::mirantis:ns_IPaddr2" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||
<node name="node-2" id="node-2" cached="false"/>
|
||||
</resource>
|
||||
<resource id="p_ceilometer-alarm-evaluator" resource_agent="ocf::mirantis:ceilometer-alarm-evaluator" role="Started" active="false" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0" />
|
||||
<clone id="master_p_rabbitmq-server" multi_state="true" unique="false" managed="true" failed="false" failure_ignored="false" >
|
||||
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Master" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||
<node name="node-3" id="node-3" cached="false"/>
|
||||
</resource>
|
||||
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Slave" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||
<node name="node-2" id="node-2" cached="false"/>
|
||||
</resource>
|
||||
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Stopped" active="false" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0" />
|
||||
</clone>
|
||||
<clone id="clone_p_neutron-metadata-agent" >
|
||||
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
|
||||
<node name="node-3" id="node-3" cached="false"/>
|
||||
</resource>
|
||||
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
|
||||
<node name="node-2" id="node-2" cached="false"/>
|
||||
</resource>
|
||||
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
|
||||
<node name="node-5" id="node-5" cached="false"/>
|
||||
</resource>
|
||||
</clone>
|
||||
<group id="group__zabbix-server" number_resources="2" >
|
||||
<resource id="vip__zbx_vip_mgmt" resource_agent="ocf::fuel:ns_IPaddr2" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||
<node name="node-6" id="6" cached="false"/>
|
||||
</resource>
|
||||
<resource id="p_zabbix-server" resource_agent="ocf::fuel:zabbix-server" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
|
||||
<node name="node-6" id="6" cached="false"/>
|
||||
</resource>
|
||||
</group>
|
||||
</resources>
|
||||
</crm_mon>
|
||||
"""[1:] # noqa
|
||||
|
|
|
@ -79,29 +79,93 @@ _default_exclude_services = ('p_mysql', 'p_haproxy', 'p_dns', 'p_ntp', 'vip',
|
|||
'clone_p_vrouter')
|
||||
|
||||
|
||||
def get_crm_services(status_out, exclude=_default_exclude_services):
|
||||
def get_crm_services(status_out):
|
||||
data = ElementTree.fromstring(status_out)
|
||||
for resource in data:
|
||||
name = resource.get('id')
|
||||
if any(service in name for service in exclude):
|
||||
continue
|
||||
yield name
|
||||
yield resource.get('id')
|
||||
|
||||
|
||||
def start_corosync_services(env):
|
||||
manage_corosync_services(env, 'start')
|
||||
|
||||
|
||||
def stop_corosync_services(env):
|
||||
manage_corosync_services(env, 'stop')
|
||||
|
||||
|
||||
def manage_corosync_services(env, status):
|
||||
node = env_util.get_one_controller(env)
|
||||
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
|
||||
'resources'], node=node)
|
||||
for service in get_crm_services(status_out):
|
||||
services_list = []
|
||||
for res in get_crm_services(status_out):
|
||||
if any(service in res for service in _default_exclude_services):
|
||||
continue
|
||||
services_list.append(res)
|
||||
|
||||
for service in services_list:
|
||||
while True:
|
||||
try:
|
||||
ssh.call(['crm', 'resource', 'stop', service],
|
||||
ssh.call(['crm', 'resource', status, service],
|
||||
node=node)
|
||||
except subprocess.CalledProcessError:
|
||||
pass
|
||||
# Sometimes pacemaker rejects part of requests what it is
|
||||
# not able to process. Sleep was added to mitigate this risk.
|
||||
time.sleep(1)
|
||||
else:
|
||||
break
|
||||
time.sleep(60)
|
||||
wait_for_corosync_services_sync(env, services_list, status)
|
||||
|
||||
|
||||
def wait_for_corosync_services_sync(env, resource_list, status,
|
||||
timeout=1200, check_freq=20):
|
||||
status_bool = status == 'start'
|
||||
node = env_util.get_one_controller(env)
|
||||
started_at = time.time()
|
||||
while True:
|
||||
crm_out = ssh.call_output(['crm_mon', '--as-xml'], node=node)
|
||||
if is_resources_synced(resource_list, crm_out, status_bool):
|
||||
return
|
||||
if time.time() - started_at >= timeout:
|
||||
raise Exception("Timeout waiting for corosync cluster for env %s"
|
||||
" to be synced" % env.id)
|
||||
time.sleep(check_freq)
|
||||
|
||||
|
||||
def is_resources_synced(resources, crm_out, status):
|
||||
def get_resource(resources, resource_id):
|
||||
for resource in resources:
|
||||
if resource.get('id') == resource_id:
|
||||
return resource
|
||||
return None
|
||||
|
||||
data = ElementTree.fromstring(crm_out)
|
||||
mon_resources = data.find('resources')
|
||||
for resource in resources:
|
||||
res = get_resource(mon_resources, resource)
|
||||
if not (is_resource_active(res) is status):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
# Resources are fetching from the output of 'crm_mon' command. This command
|
||||
# doesn't return resource if it's not started and we can consider 'absent'
|
||||
# resource as disabled.
|
||||
def is_resource_active(resource):
|
||||
if resource is None:
|
||||
return False
|
||||
if resource.tag == 'resource':
|
||||
return is_primitive_active(resource)
|
||||
for primitive in resource:
|
||||
if not is_primitive_active(primitive):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def is_primitive_active(resource):
|
||||
if resource.get('active') == 'true':
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def stop_upstart_services(env):
|
||||
|
@ -129,22 +193,6 @@ def stop_upstart_services(env):
|
|||
ssh.call(['stop', service], node=node)
|
||||
|
||||
|
||||
def start_corosync_services(env):
|
||||
node = next(env_util.get_controllers(env))
|
||||
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
|
||||
'resources'], node=node)
|
||||
for service in get_crm_services(status_out):
|
||||
while True:
|
||||
try:
|
||||
ssh.call(['crm', 'resource', 'start', service],
|
||||
node=node)
|
||||
except subprocess.CalledProcessError:
|
||||
pass
|
||||
else:
|
||||
break
|
||||
time.sleep(60)
|
||||
|
||||
|
||||
def start_upstart_services(env):
|
||||
controllers = list(env_util.get_controllers(env))
|
||||
for node in controllers:
|
||||
|
@ -179,3 +227,15 @@ def start_cluster(env):
|
|||
for node in controllers:
|
||||
for cmd in cmds:
|
||||
ssh.call(cmd, node=node)
|
||||
# When we start cluster we should wait while resources from constant
|
||||
# `_default_exclude_services` become up and running. BTW, We don't touch
|
||||
# these resources in stop/start corosync resources methods at all.
|
||||
node = env_util.get_one_controller(env)
|
||||
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
|
||||
'resources'], node=node)
|
||||
services_list = []
|
||||
for res in get_crm_services(status_out):
|
||||
if any(service in res for service in _default_exclude_services):
|
||||
services_list.append(res)
|
||||
|
||||
wait_for_corosync_services_sync(env, services_list, 'start')
|
||||
|
|
Loading…
Reference in New Issue