Use a more accurate approach to detect resource status

Closes-bug: 1499696
Change-Id: Ied72bc68ba41e47334796f09b407ebd8a3fb7f40
Valyavskiy Viacheslav 2015-10-04 03:14:37 +03:00
parent 2cbab19251
commit 3d06b9d8dd
2 changed files with 173 additions and 25 deletions

octane/tests/test_maintenance.py

@ -11,6 +11,8 @@
# under the License.
import mock
import pytest
from xml.etree import ElementTree
from octane.util import maintenance
from octane.util import subprocess
@ -21,6 +23,32 @@ def test_get_crm_services():
assert sorted(res) == CRM_XML_PARSE_RESULT
@pytest.mark.parametrize("resource_list,status,expected_result", [
(["master_p_rabbitmq-server", "vip__management_old"], False, False),
(["master_p_rabbitmq-server", "vip__management_old"], True, False),
(["master_p_rabbitmq-server", "p_ceilometer-alarm-evaluator"], False,
True),
(["clone_p_neutron-metadata-agent", "vip__management_old",
"group__zabbix-server"], True, True),
(["test1", "vip__management_old"], True, False),
(["test1", "test2"], False, True),
])
def test_resources_synced(resource_list, status, expected_result):
res = maintenance.is_resources_synced(resource_list, CRM_XML_STATUS_SAMPLE,
status)
assert res is expected_result
def test_resources_status():
data = ElementTree.fromstring(CRM_XML_STATUS_SAMPLE)
resources = next(el for el in data if el.tag == 'resources')
result = []
for resource in resources:
result.append(maintenance.is_resource_active(resource))
assert result == [True, False, False, True, True]
def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
mock_subprocess, node):
get_one_controller = mocker.patch('octane.util.env.get_one_controller')
@ -34,6 +62,9 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
mocker.patch('time.sleep')
wait_for_services = \
mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
maintenance.stop_corosync_services('env')
assert not mock_subprocess.called
@ -41,6 +72,8 @@ def test_stop_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
mock_ssh_call_output.assert_called_once_with(['cibadmin', '--query',
'--scope', 'resources'],
node=node)
assert wait_for_services.call_args_list == \
[mock.call('env', ['s1', 's2'], 'stop')]
assert mock_ssh_call.call_args_list == [
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
mock.call(['crm', 'resource', 'stop', 's1'], node=node),
@ -57,10 +90,16 @@ def test_start_corosync_services(mocker, mock_ssh_call, mock_ssh_call_output,
mock_ssh_call.side_effect = \
[None, subprocess.CalledProcessError(1, 'cmd'), None]
wait_for_services = \
mocker.patch.object(maintenance, 'wait_for_corosync_services_sync')
maintenance.start_corosync_services('env')
mock_ssh_call_output.assert_called_once_with(
['cibadmin', '--query', '--scope', 'resources'], node=node)
assert wait_for_services.call_args_list == \
[mock.call('env', ['test_service1', 'test_service2'], 'start')]
assert mock_ssh_call.call_args_list == [
mock.call(['crm', 'resource', 'start', 'test_service1'], node=node),
mock.call(['crm', 'resource', 'start', 'test_service2'], node=node),
@ -458,12 +497,61 @@ CRM_XML_SAMPLE = """
</resources>
"""[1:] # noqa
CRM_XML_PARSE_RESULT = [
'clone_p_dns',
'clone_p_haproxy',
'clone_p_heat-engine',
'clone_p_mysql',
'clone_p_neutron-dhcp-agent',
'clone_p_neutron-l3-agent',
'clone_p_neutron-metadata-agent',
'clone_p_neutron-plugin-openvswitch-agent',
'clone_p_ntp',
'clone_p_vrouter',
'group__zabbix-server',
'master_p_conntrackd',
'master_p_rabbitmq-server',
'p_ceilometer-agent-central',
'p_ceilometer-alarm-evaluator',
'vip__management',
'vip__public',
'vip__vrouter',
'vip__vrouter_pub'
]
CRM_XML_STATUS_SAMPLE = """
<crm_mon version="1.1.12">
<resources>
<resource id="vip__management_old" resource_agent="ocf::mirantis:ns_IPaddr2" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node-2" id="node-2" cached="false"/>
</resource>
<resource id="p_ceilometer-alarm-evaluator" resource_agent="ocf::mirantis:ceilometer-alarm-evaluator" role="Started" active="false" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0" />
<clone id="master_p_rabbitmq-server" multi_state="true" unique="false" managed="true" failed="false" failure_ignored="false" >
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Master" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node-3" id="node-3" cached="false"/>
</resource>
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Slave" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node-2" id="node-2" cached="false"/>
</resource>
<resource id="p_rabbitmq-server" resource_agent="ocf::mirantis:rabbitmq-server" role="Stopped" active="false" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0" />
</clone>
<clone id="clone_p_neutron-metadata-agent" >
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
<node name="node-3" id="node-3" cached="false"/>
</resource>
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
<node name="node-2" id="node-2" cached="false"/>
</resource>
<resource id="p_neutron-metadata-agent" resource_agent="ocf::mirantis:neutron-agent-metadata" active="true" >
<node name="node-5" id="node-5" cached="false"/>
</resource>
</clone>
<group id="group__zabbix-server" number_resources="2" >
<resource id="vip__zbx_vip_mgmt" resource_agent="ocf::fuel:ns_IPaddr2" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node-6" id="6" cached="false"/>
</resource>
<resource id="p_zabbix-server" resource_agent="ocf::fuel:zabbix-server" role="Started" active="true" orphaned="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node-6" id="6" cached="false"/>
</resource>
</group>
</resources>
</crm_mon>
"""[1:] # noqa

octane/util/maintenance.py

@ -79,29 +79,93 @@ _default_exclude_services = ('p_mysql', 'p_haproxy', 'p_dns', 'p_ntp', 'vip',
'clone_p_vrouter')
def get_crm_services(status_out, exclude=_default_exclude_services):
def get_crm_services(status_out):
data = ElementTree.fromstring(status_out)
for resource in data:
name = resource.get('id')
if any(service in name for service in exclude):
continue
yield name
yield resource.get('id')
def start_corosync_services(env):
manage_corosync_services(env, 'start')
def stop_corosync_services(env):
manage_corosync_services(env, 'stop')
def manage_corosync_services(env, status):
node = env_util.get_one_controller(env)
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
'resources'], node=node)
for service in get_crm_services(status_out):
services_list = []
for res in get_crm_services(status_out):
if any(service in res for service in _default_exclude_services):
continue
services_list.append(res)
for service in services_list:
while True:
try:
ssh.call(['crm', 'resource', 'stop', service],
ssh.call(['crm', 'resource', status, service],
node=node)
except subprocess.CalledProcessError:
pass
# Sometimes pacemaker rejects requests that it is not able to
# process. A sleep was added to mitigate this risk.
time.sleep(1)
else:
break
time.sleep(60)
wait_for_corosync_services_sync(env, services_list, status)
def wait_for_corosync_services_sync(env, resource_list, status,
timeout=1200, check_freq=20):
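# Poll 'crm_mon --as-xml' on one controller until every resource in
# resource_list reaches the desired state ('start' means active, any other
# status means inactive), or raise once `timeout` seconds have elapsed.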
status_bool = status == 'start'
node = env_util.get_one_controller(env)
started_at = time.time()
while True:
crm_out = ssh.call_output(['crm_mon', '--as-xml'], node=node)
if is_resources_synced(resource_list, crm_out, status_bool):
return
if time.time() - started_at >= timeout:
raise Exception("Timeout waiting for corosync cluster for env %s"
" to be synced" % env.id)
time.sleep(check_freq)
def is_resources_synced(resources, crm_out, status):
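# Returns True only when every id in `resources` appears in the crm_mon
# output with the expected active state; ids missing from the output are
# treated as inactive.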
def get_resource(resources, resource_id):
for resource in resources:
if resource.get('id') == resource_id:
return resource
return None
data = ElementTree.fromstring(crm_out)
mon_resources = data.find('resources')
for resource in resources:
res = get_resource(mon_resources, resource)
if not (is_resource_active(res) is status):
return False
return True
# Resources are fetched from the output of the 'crm_mon' command. This command
# doesn't return a resource if it is not started, so an absent resource can
# be considered disabled.
def is_resource_active(resource):
if resource is None:
return False
if resource.tag == 'resource':
return is_primitive_active(resource)
for primitive in resource:
if not is_primitive_active(primitive):
return False
return True
def is_primitive_active(resource):
if resource.get('active') == 'true':
return True
return False
def stop_upstart_services(env):
@ -129,22 +193,6 @@ def stop_upstart_services(env):
ssh.call(['stop', service], node=node)
def start_corosync_services(env):
node = next(env_util.get_controllers(env))
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
'resources'], node=node)
for service in get_crm_services(status_out):
while True:
try:
ssh.call(['crm', 'resource', 'start', service],
node=node)
except subprocess.CalledProcessError:
pass
else:
break
time.sleep(60)
def start_upstart_services(env):
controllers = list(env_util.get_controllers(env))
for node in controllers:
@ -179,3 +227,15 @@ def start_cluster(env):
for node in controllers:
for cmd in cmds:
ssh.call(cmd, node=node)
# When we start the cluster, we should wait until the resources listed in the
# `_default_exclude_services` constant are up and running. Note that these
# resources are not touched at all by the corosync stop/start methods.
node = env_util.get_one_controller(env)
status_out = ssh.call_output(['cibadmin', '--query', '--scope',
'resources'], node=node)
services_list = []
for res in get_crm_services(status_out):
if any(service in res for service in _default_exclude_services):
services_list.append(res)
wait_for_corosync_services_sync(env, services_list, 'start')
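
Finally, a brief usage sketch (not part of this change) of how the public helpers drive the new wait logic; the `env` argument is assumed to be the same environment object used throughout octane, and the wrapper function name is made up for illustration:

from octane.util import maintenance

def restart_controller_resources(env):
    # Stop every Pacemaker resource except those in _default_exclude_services
    # and block until crm_mon reports them inactive (1200-second timeout).
    maintenance.stop_corosync_services(env)
    # ... maintenance work on the controllers would happen here ...
    # Start the same resources again and wait until crm_mon reports them active.
    maintenance.start_corosync_services(env)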