Merge "Introduce an Action Plugin to fetch container infos" into stable/train

This commit is contained in:
Zuul 2020-07-15 05:29:09 +00:00 committed by Gerrit Code Review
commit 132b9d20d5
6 changed files with 347 additions and 117 deletions

View File

@ -0,0 +1,334 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import, division, print_function
__metaclass__ = type
import copy
import tenacity
import yaml
from ansible.errors import AnsibleActionFail
from ansible.plugins.action import ActionBase
from ansible.utils.display import Display
DISPLAY = Display()
# Default delay/retries used to fetch containers status and wait for them to be
# finished.
DELAY = 10
RETRIES = 30
TIMEOUT = DELAY * RETRIES
ANSIBLE_METADATA = {
'metadata_version': '1.1',
'status': ['preview'],
'supported_by': 'community'
}
DOCUMENTATION = """
module: container_status
author:
- "TripleO team"
version_added: '2.9'
short_description: Check and report containers status
notes: []
description:
- For each container that isn't an exec or a container supposed to be
controlled by systemd, we expect it to terminate with a return code.
This module will check that code and make sure it's correct. If not, it
will report the failure for easier debug.
requirements:
- None
options:
container_async_results:
description:
- Async results of a podman_container invocation.
type: list
container_data:
description:
- List of dictionaries which have the container configurations.
type: list
valid_exit_codes:
description:
- List of valid container exit codes.
default: []
type: list
debug:
description:
- Whether or not debug is enabled.
default: False
type: boolean
"""
EXAMPLES = """
- name: Check containers status
containers_status:
container_async_results: "{{ create_async_poll_results.results }}"
container_data:
- keystone:
image: docker.io/keystone
- mysql_bootstrap:
image: docker.io/mysql
valid_exit_codes:
- 0
- 2
"""
RETURN = """
changed_containers:
description: List of containers which changed.
returned: always
type: list
sample:
- keystone
- mysql
commands:
description: List of container cli commands that would be run.
returned: always
type: list
sample:
- podman rm -f keystone
- podman run keystone
"""
class ActionModule(ActionBase):
"""Action plugin for container status"""
_VALID_ARGS = yaml.safe_load(DOCUMENTATION)['options']
def _get_args(self):
missing = []
args = {}
for option, vals in self._VALID_ARGS.items():
if 'default' not in vals:
if self._task.args.get(option, None) is None:
missing.append(option)
continue
args[option] = self._task.args.get(option)
else:
args[option] = self._task.args.get(option, vals['default'])
if missing:
raise AnsibleActionFail('Missing required parameters: {}'.format(
', '.join(missing)))
return args
def _get_containers_to_check(self, data):
"""Return a list of containers that we need to check.
Given some container_data, figure out what containers terminate with
a return code so later we can check that code.
:param data: Dictionary of container data.
:returns: List of containers that need to be checked.
"""
containers = []
# loop through container data to get specific container
for container in data:
# get container name and data
for name, values in container.items():
if 'action' in values or 'restart' in values:
continue
if 'image' in values:
# We assume that container configs that don't have a
# restart policy nor action (used for podman exec) but have
# an image set, will run something and then exit with a
# return code.
containers.append(name)
if self.debug and len(containers) > 0:
DISPLAY.display('These containers are supposed to terminate with '
'a valid exit code and will be checked: '
'{}'.format(containers))
return containers
def _get_create_commands(self, results):
"""Return a list of commands that were executed by container tool.
:param results: Ansible task results.
:returns commands: List of commands.
"""
commands = []
for item in results:
if item['changed']:
commands.extend(item['podman_actions'])
return commands
def _is_container_running(self, container):
"""Return True if a container has Running State.
:params container: Dictionary for container infos.
:returns running: Boolean of container running status.
"""
state = container.get('State', {})
running = state.get('Running', False)
return running
def _get_container_infos(self, containers, task_vars):
"""Return container infos.
:params containers: List of containers.
:params task_vars: Dictionary of Ansible tasks variables.
:returns container_results: Dictionary of container infos.
"""
tvars = copy.deepcopy(task_vars)
result = self._execute_module(
module_name='podman_container_info',
module_args=dict(name=containers),
task_vars=tvars
)
return [c for c in result["containers"]]
@tenacity.retry(
reraise=True,
stop=tenacity.stop_after_attempt(RETRIES),
wait=tenacity.wait_fixed(DELAY)
)
def _fetch_container_state(self, containers, task_vars):
"""Return container states of finished containers with retries.
:params containers: List of containers.
:params task_vars: Dictionary of Ansible tasks variables.
:returns container_results: Dictionary of container infos.
"""
containers_results = self._get_container_infos(containers, task_vars)
for container in containers_results:
name = container.get('Name')
if self._is_container_running(container):
raise AnsibleActionFail('Container {} has not finished yet, '
'retrying...'.format(name))
return containers_results
def _check_container_state(self, containers, exit_codes, task_vars):
"""Return a tuple of running and failed containers.
:params containers: List of containers to check.
:params exit_codes: List of valid exit codes.
:params task_vars: Dictionary of Ansible tasks variables.
:returns running, failed: Tuple of lists.
"""
running = []
failed = []
try:
self._fetch_container_state(containers, task_vars)
except AnsibleActionFail:
# We fail at the end with all the other infos
if self.debug:
DISPLAY.display('One or more containers did not finish on '
'time, the failure will be reported later.')
pass
containers_results = self._get_container_infos(containers, task_vars)
for container in containers_results:
container_name = container.get('Name')
container_state = container.get('State')
if self._is_container_running(container):
running.append(container_name)
elif container_state.get('ExitCode') not in exit_codes:
failed.append(container_name)
return (running, failed)
def _check_errors_in_ansible_async_results(self, results):
"""Get a tuple with changed and failed containers.
:param results: Ansible results from "Check podman create status"
:returns: Tuple of containers that changed or failed
"""
changed = []
failed = []
for item in results:
# if Ansible is run in check mode, the async_results items will
# not contain failed or finished keys.
if self._play_context.check_mode:
break
async_result_item = item['create_async_result_item']
if item['changed']:
for name, c in async_result_item['container_data'].items():
changed.append(name)
if (item['failed'] or not item['finished']
or ('stderr' in async_result_item
and async_result_item['stderr'] != '')):
for name, c in async_result_item['container_data'].items():
failed.append(name)
return (changed, failed)
def run(self, tmp=None, task_vars=None):
self._supports_check_mode = True
self.changed = False
self.changed_containers = []
container_commands = []
running = []
failed = []
if task_vars is None:
task_vars = dict()
result = super(ActionModule, self).run(tmp, task_vars)
del tmp
# parse args
args = self._get_args()
async_results = args['container_async_results']
container_data = args['container_data']
valid_exit_codes = args['valid_exit_codes']
self.debug = args['debug']
containers_to_check = self._get_containers_to_check(container_data)
# Check that the containers which are supposed to finish have
# actually finished and also terminated with the right exit code.
if len(valid_exit_codes) > 0 and len(containers_to_check) > 0:
(running, failed) = self._check_container_state(
containers_to_check,
valid_exit_codes,
task_vars)
# Check the Ansible async results for containers which:
# - reported a changed resources (podman_container created or updated
# a container) and return it as self.changed_containers.
# - reported a failed resource (podman_container failed to create
# the container and return it as self.failed_containers.
# - didn't finish on time and return it as self.failed_containers.
(self.changed_containers, async_failed) = (
self._check_errors_in_ansible_async_results(async_results))
if len(failed) > 0:
DISPLAY.error('Container(s) which finished with wrong return code'
': {}'.format(failed))
if len(async_failed) > 0:
DISPLAY.error('Container(s) which failed to be created by '
'podman_container module: {}'.format(async_failed))
if len(running) > 0:
DISPLAY.error('Container(s) which did not finish after {} '
'minutes: {}'.format(TIMEOUT, running))
total_errors = list(set(failed + async_failed + running))
if len(total_errors) > 0:
raise AnsibleActionFail('Failed container(s): {}, check logs in '
'/var/log/containers/'
'stdouts/'.format(total_errors))
container_commands = self._get_create_commands(async_results)
if len(container_commands) > 0 and \
(self._play_context.check_mode or self.debug):
for cmd in container_commands:
DISPLAY.display(cmd)
if len(container_commands) > 0:
self.changed = True
result['changed_containers'] = self.changed_containers
result['commands'] = container_commands
result['changed'] = self.changed
return result

View File

@ -290,7 +290,7 @@
{
"image": "fedora:rawhide",
"net": "host",
"command": "sleep 3600"
"command": "sleep 10"
}
dest: '/tmp/container-configs/fedora_bis.json'
- include_role:

View File

@ -1,51 +0,0 @@
---
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
- name: "Wait for containers to be exit"
podman_container_info:
name: "{{ containers_with_exit_code }}"
register: podman_containers_infos
until: ( podman_containers_infos.containers | selectattr('State.Running', 'equalto', True) |list|length ) == 0
# Retry 30 times every 10 seconds so we wait 5 min in total
retries: 30
delay: 10
# We need to ignore the failures since later we print some debug.
# We can't use "rescue" here because the debug tasks use
# "podman_containers_infos".
failed_when: false
no_log: "{{ not tripleo_container_manage_debug }}"
- name: Create a list of containers which didn't exit
set_fact:
running_containers: >-
{{ podman_containers_infos.containers |
selectattr('State.Running', 'equalto', True) | map(attribute='Name') | list }}
- name: Create a list of containers with bad Exit Codes
set_fact:
broken_containers: >-
{{ podman_containers_infos.containers |
rejectattr('State.ExitCode', 'in', tripleo_container_manage_valid_exit_code) | map(attribute='Name') | list }}
- name: "Print running containers"
fail:
msg: "Container(s) which are still running after 5 min: {{ running_containers }}, check logs in /var/log/containers/stdouts/"
when: running_containers|length != 0
- name: "Print failing containers"
fail:
msg: "Container(s) with bad ExitCode: {{ broken_containers }}, check logs in /var/log/containers/stdouts/"
when: broken_containers|length != 0

View File

@ -96,46 +96,21 @@
when:
- not ansible_check_mode|bool
- name: Check containers status
container_status:
container_async_results: "{{ create_async_poll_results.results }}"
container_data: "{{ batched_container_data }}"
valid_exit_codes: "{{ tripleo_container_manage_valid_exit_code }}"
debug: "{{ tripleo_container_manage_debug | bool }}"
register: container_status_results
- name: "Create fact for containers which changed"
set_fact:
# List of containers which have changed (created or updated)
containers_changed: "{{ create_async_poll_results.results | get_changed_containers | default([]) }}"
containers_changed: "{{ container_status_results.changed_containers | default([]) }}"
- name: "Create fact for containers which failed"
- name: "Append the list of all podman commands that are run for containers with changes"
set_fact:
# List of containers which returned an error when creating or updating them
containers_failed: "{{ create_async_poll_results.results | get_failed_containers | default([]) }}"
- name: "Create fact for containers which require rc check"
set_fact:
# List of containers which would terminate with a return code that needs to be valid.
# We assume that container configs that don't have a restart policy nor action
# (used for podman exec) will run something and then exit with a return code.
containers_to_check: >-
{{ batched_container_data | haskey(attribute='image', excluded_keys=['action', 'restart']) |
list_of_keys | default([]) | difference(containers_failed) }}
- name: Print the containers that failed to start
fail:
msg: "{{ containers_failed }} failed to start, check logs in /var/log/containers/stdouts/"
when:
- containers_failed|length != 0
- name: Block for container commands
include_tasks: podman/get_commands_create.yml
all_containers_commands: "{{ container_status_results.commands | default([]) + (all_containers_commands | default([]) | list) }}"
when:
- ansible_check_mode|bool
- name: "Print the list of containers which changed"
debug:
var: containers_changed
when: tripleo_container_manage_debug | bool
- name: "Block for container exit codes"
when:
- not ansible_check_mode|bool
- tripleo_container_manage_valid_exit_code|length != 0
- containers_to_check|length != 0
include_tasks: podman/check_exit_code.yml
vars:
containers_with_exit_code: "{{ containers_to_check }}"

View File

@ -1,29 +0,0 @@
---
# Copyright 2020 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
- name: "Create a list of podman commands that are run for containers with changes"
set_fact:
containers_commands: >-
{{ create_async_results.results | selectattr('changed', 'equalto', true) |
map(attribute='podman_actions') | default([]) | list }}
- name: "Print the list of commands that are run for containers with changes"
debug:
var: containers_commands
- name: "Append the list of all podman commands that are run for containers with changes"
set_fact:
all_containers_commands: "{{ containers_commands|default([], true) + (all_containers_commands | default([]) | list) }}"

View File

@ -353,6 +353,7 @@
files:
- ^tripleo_ansible/roles/tripleo-container-manage/.*
- ^tripleo_ansible/roles/tripleo-container-rm/.*
- ^tripleo_ansible/ansible_plugins/action/container_status.py$
- ^tripleo_ansible/ansible_plugins/filter/helpers.py$
- ^tripleo_ansible/ansible_plugins/modules/container_config_data.py$
- ^tripleo_ansible/ansible_plugins/modules/container_puppet_config.py$