Refactoring node-health validation

The node-health validation now uses modules from the openstack.cloud
Ansible collection to obtain information about cloud structure and state.

This allows the validation to be executed in environments with
stricter authentication requirements.
The documentation was updated to provide more information about the
authentication procedure used.

Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
Change-Id: Id3f332b7fd70a54d537dabec447ffa22a36256fd
This commit is contained in:
Jiri Podivin 2022-10-12 15:03:54 +02:00
parent f9e2d6f933
commit bc547f3f14
4 changed files with 61 additions and 33 deletions

View File

@ -4,3 +4,4 @@ collections:
- community.general
- community.crypto
- ansible.posix
- openstack.cloud

View File

@ -2,5 +2,12 @@
node_health
===========
The role is used by the :ref:`pre-upgrade_node-health` validation to verify the state of the overcloud
compute services and the baremetal nodes they are running on.
As the clients contacted require Keystone authentication, the role requires the
relevant values, such as the Keystone endpoint and username, for correct operation.
Otherwise, it will produce an authentication error.
.. ansibleautoplugin::
:role: roles/node_health

View File

@ -1,11 +1,12 @@
---
- hosts: undercloud
- hosts: localhost
vars:
metadata:
name: Node health check
description: |
Check if all overcloud nodes can be connected to before starting a
scale-up or an upgrade.
scale-up or an upgrade. The validation requires cloud authentication details,
in the form of an accessible clouds.yaml file, to execute correctly.
groups:
- pre-upgrade
categories:

View File

@ -1,35 +1,54 @@
---
- name: Collect IPs for allovercloud nodes
set_fact: ansible_host="{{ hostvars[item]['ansible_host'] }}"
register: oc_ips
with_items: "{{ groups.allovercloud }}"
- name: Ping all overcloud nodes
icmp_ping:
host: "{{ item }}"
with_items: "{{ oc_ips.results | map(attribute='ansible_facts.ansible_host') | list }}"
- name: Retrieving compute services
ignore_errors: true
register: ping_results
openstack.cloud.compute_service_info:
cloud: overcloud
register: result
- name: Extract failed pings
set_fact:
failed_ips: "{{ ping_results.results | selectattr('failed', 'equalto', True) | map(attribute='item') | list }}"
- name: Lookup nova servers for each failed IP
set_fact:
servers: "{{ lookup('nova_servers', 'ip', 'ctlplane', failed_ips, wantlist=True) }}"
- name: Extract nova ids
set_fact:
server_ids: "{{ servers | map(attribute='id') | list }}"
- name: Lookup ironic nodes for unreachable nova servers
set_fact:
nodes: "{{ lookup('ironic_nodes', 'instance_uuid', server_ids, wantlist=True) }}"
- name: Fail if there are unreachable nodes
- name: Fail if the compute services can't be queried
fail:
msg: |
{{ lookup('template', './templates/unreachable_nodes.j2',
template_vars=dict(nodes=nodes)) }}
when: nodes|length > 0
msg: Compute services query failed with {{ result.msg }}
when: result.failed
- name: Get nova nodes
set_fact:
nova_nodes: "{{ result.openstack_compute_services | community.general.json_query(query) }}"
vars:
query: "[?contains(name, 'nova')]"
- name: Get failed nova nodes
set_fact:
failed_nodes: "{{ nova_nodes | community.general.json_query(failed_nodes_query) }}"
vars:
failed_nodes_query: "[?state!='up']"
- when: failed_nodes | length > 0
block:
- name: Get baremetal nodes info
become: true
openstack.cloud.baremetal_node_info:
cloud: undercloud
register: result
- name: Get baremetal nodes
set_fact:
baremetal_nodes: "{{ result.baremetal_nodes }}"
- name: Get failed node names
set_fact:
node_names: "{{ item.host.split('.')[0]}}"
with_items: "{{ failed_nodes }}"
- name: Get failed baremetal nodes
set_fact:
failed_baremetal_nodes: "{{ baremetal_nodes | to_json | from_json | community.general.json_query(query) }}"
with_items: "{{ node_names }}"
vars:
query: "[?contains(name, '{{ item }}')]"
- name: Fail if there are unreachable nodes
fail:
msg: |
{{ lookup('template', './templates/unreachable_nodes.j2',
template_vars=dict(nodes=failed_baremetal_nodes)) }}
when: failed_baremetal_nodes|length > 0