Burn-in: Add CPU step

Add a clean step for CPU burn-in via stress-ng. Get basic
run parameters from the node's driver_info.

Story: #2007523
Task: #42382

Change-Id: I14fd4164991fb94263757244f716b6bfe8edf875
This commit is contained in:
Arne Wiebalck 2021-04-26 12:00:44 +02:00
parent 9edb13d891
commit 6702fcaa43
6 changed files with 138 additions and 1 deletions

View File

@ -74,6 +74,9 @@ Known limitations:
Clean steps
-----------
``deploy.burnin_cpu``
Stress-test the CPUs of a node via stress-ng for a configurable
amount of time. Disabled by default.
``deploy.erase_devices``
Securely erases all information from all recognized disk devices.
Relatively fast when secure ATA erase is available, otherwise can take

View File

@ -0,0 +1,48 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ironic_lib import utils
from oslo_concurrency import processutils
from oslo_log import log
from ironic_python_agent import errors
LOG = log.getLogger(__name__)
def stress_ng_cpu(node):
    """Burn-in the CPU with stress-ng.

    Run stress-ng on a configurable number of CPUs for a configurable
    amount of time. Both values are read from the node's ``driver_info``:
    ``agent_burnin_cpu_cpu`` is passed to ``--cpu`` (default 0) and
    ``agent_burnin_cpu_timeout`` is passed to ``--timeout`` (default
    86400). Without config use all CPUs and stress them for 24 hours.

    :param node: Ironic node object
    :raises: CommandExecutionError if the execution of stress-ng fails.
    """
    info = node.get('driver_info', {})
    cpu = info.get('agent_burnin_cpu_cpu', 0)
    timeout = info.get('agent_burnin_cpu_timeout', 86400)

    args = ('stress-ng', '--cpu', cpu, '--timeout', timeout,
            '--metrics-brief')
    LOG.debug('Burn-in stress_ng_cpu command: %s', args)

    try:
        _, err = utils.execute(*args)
        # stress-ng reports on stderr only
        LOG.info(err)
    except (processutils.ProcessExecutionError, OSError) as e:
        # Interpolate the message here: passing a (format, mapping) tuple
        # on would log and raise the raw tuple with the placeholder
        # left unexpanded.
        error_msg = "stress-ng (cpu) failed with error %(err)s" % {'err': e}
        LOG.error(error_msg)
        raise errors.CommandExecutionError(error_msg)

View File

@ -38,6 +38,7 @@ import pyudev
import stevedore
import yaml
from ironic_python_agent import burnin
from ironic_python_agent import encoding
from ironic_python_agent import errors
from ironic_python_agent.extensions import base as ext_base
@ -1393,6 +1394,14 @@ class GenericHardwareManager(HardwareManager):
except OSError:
os.remove(filepath)
def burnin_cpu(self, node, ports):
    """Run the CPU burn-in step for a node.

    Hands the node over to ``burnin.stress_ng_cpu``; ``ports`` is
    accepted as part of the step signature but is not used here.

    :param node: Ironic node object
    :param ports: list of Ironic port objects
    """
    burnin.stress_ng_cpu(node)
def _shred_block_device(self, node, block_device):
"""Erase a block device using shred.
@ -1865,7 +1874,14 @@ class GenericHardwareManager(HardwareManager):
'interface': 'raid',
'reboot_requested': False,
'abortable': True
}
},
{
'step': 'burnin_cpu',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
},
]
def get_deploy_steps(self, node, ports):

View File

@ -0,0 +1,56 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from unittest import mock
from ironic_lib import utils
from oslo_concurrency import processutils
from ironic_python_agent import burnin
from ironic_python_agent import errors
from ironic_python_agent.tests.unit import base
@mock.patch.object(utils, 'execute', autospec=True)
class TestBurnin(base.IronicAgentTest):
    """Tests for burnin.stress_ng_cpu with utils.execute mocked out."""

    def test_stress_ng_cpu_default(self, mock_execute):
        """An empty driver_info yields --cpu 0 and --timeout 86400."""
        mock_execute.return_value = ['out', 'err']

        burnin.stress_ng_cpu({'driver_info': {}})

        expected_cmd = ('stress-ng', '--cpu', 0, '--timeout', 86400,
                        '--metrics-brief')
        mock_execute.assert_called_once_with(*expected_cmd)

    def test_stress_ng_cpu_non_default(self, mock_execute):
        """Values set in driver_info are passed through to stress-ng."""
        mock_execute.return_value = ['out', 'err']
        driver_info = {
            'agent_burnin_cpu_cpu': 3,
            'agent_burnin_cpu_timeout': 2911,
        }

        burnin.stress_ng_cpu({'driver_info': driver_info})

        expected_cmd = ('stress-ng', '--cpu', 3, '--timeout', 2911,
                        '--metrics-brief')
        mock_execute.assert_called_once_with(*expected_cmd)

    def test_stress_ng_cpu_no_stress_ng(self, mock_execute):
        """A failing execution is re-raised as CommandExecutionError."""
        # The first call returns normally; the second one raises.
        mock_execute.side_effect = (['out', 'err'],
                                    processutils.ProcessExecutionError())
        node = {'driver_info': {}}

        burnin.stress_ng_cpu(node)

        with self.assertRaises(errors.CommandExecutionError):
            burnin.stress_ng_cpu(node)

View File

@ -149,6 +149,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
'interface': 'raid',
'reboot_requested': False,
'abortable': True
},
{
'step': 'burnin_cpu',
'priority': 0,
'interface': 'deploy',
'reboot_requested': False,
'abortable': True
}
]
clean_steps = self.hardware.get_clean_steps(self.node, [])

View File

@ -0,0 +1,7 @@
---
features:
- |
Adds a burn-in cleaning step 'burnin_cpu' to stress-test CPUs for a
configurable amount of time with stress-ng. To use this step,
stress-ng needs to be installed on the RAM disk.