Revert "Reimplement os_tripleo_baremetal_node_introspection"

This reverts commit 0ed41aeab1.

Change-Id: I2856bfdd47b9e8c5c33270efa7725a81ccd81378
Steve Baker 2020-11-19 22:23:05 +00:00 committed by wes hayutin
parent 3ffc7e85a8
commit 732a872c66
1 changed file with 192 additions and 130 deletions

@@ -1,3 +1,4 @@
+#!/usr/bin/python
 # Copyright (c) 2019 OpenStack Foundation
 # All Rights Reserved.
 #
@@ -12,18 +13,7 @@
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 # License for the specific language governing permissions and limitations
 # under the License.
-from concurrent import futures
-import io
-import logging
-import yaml
-from ansible.module_utils.basic import AnsibleModule
-from ansible.module_utils.openstack import openstack_cloud_from_module
-from ansible.module_utils.openstack import openstack_full_argument_spec
-from ansible.module_utils.openstack import openstack_module_kwargs
-LOG = logging.getLogger('os_tripleo_baremetal_node_introspection')
 __metaclass__ = type
 ANSIBLE_METADATA = {'metadata_version': '1.1',
                     'status': ['preview'],
@@ -137,130 +127,205 @@ EXAMPLES = '''
 '''
+import time
+import yaml
-def _configure_logging():
-    log_fmt = ('%(asctime)s %(levelname)s %(name)s: %(message)s')
-    urllib_level = logging.CRITICAL
-    base_level = logging.INFO
-    log_stream = io.StringIO()
-    handler = logging.StreamHandler(log_stream)
-    logging.basicConfig(level=base_level, format=log_fmt,
-                        handlers=[handler])
-    logging.getLogger('urllib3.connectionpool').setLevel(urllib_level)
-    return log_stream
+from ansible.module_utils.basic import AnsibleModule
+from ansible.module_utils.openstack import openstack_full_argument_spec
+from ansible.module_utils.openstack import openstack_module_kwargs
+from ansible.module_utils.openstack import openstack_cloud_from_module
-def introspect(cloud, node_uuids, node_timeout, retry_timeout, max_retries,
-               concurrency):
-    result = {}
-    if not node_uuids:
-        return result
-    introspect_jobs = []
+class IntrospectionManagement(object):
+    def __init__(self,
+                 cloud,
+                 module,
+                 concurrency,
+                 max_retries,
+                 node_timeout,
+                 retry_timeout):
+        self.client = cloud.baremetal_introspection
+        self.cloud = cloud
+        self.module = module
+        self.concurrency = concurrency
+        self.max_retries = max_retries
+        self.node_timeout = node_timeout
+        self.retry_timeout = retry_timeout
-    with futures.ThreadPoolExecutor(max_workers=concurrency) as p:
-        for node_uuid in node_uuids:
-            introspect_jobs.append(p.submit(
-                introspect_node, cloud, node_uuid,
-                node_timeout, retry_timeout, max_retries
-            ))
-        for job in futures.as_completed(introspect_jobs):
-            result[node_uuid] = job.result()
-    return result
-def introspect_node(cloud, node_uuid, node_timeout, retry_timeout,
-                    max_retries):
-    last_error = None
-    attempt = 0
-    while attempt <= max_retries:
-        attempt += 1
-        # Attempt cleanup from previous error
-        if last_error:
-            LOG.info("Preparing for retry %s for node: %s", attempt, node_uuid)
-            prepare_for_retry(cloud, node_uuid, node_timeout, retry_timeout)
+    def log(self, msg):
+        self.module.log("os_tripleo_baremetal_node_introspection: %s" % msg)
+    def push_next(self, pool, queue):
         try:
-            LOG.info("Introspecting node: %s", node_uuid)
+            next_introspection = next(queue)
+            pool.append(next_introspection)
+        except StopIteration:
+            pass
+        return pool
-            # Start introspection
-            cloud.baremetal.set_node_provision_state(
-                node_uuid, 'inspect', wait=True, timeout=node_timeout)
+    def introspect(self, node_uuids):
-            # Power off the node
-            cloud.baremetal.set_node_power_state(
-                node_uuid, 'power off', wait=True, timeout=node_timeout
-            )
+        result = {}
+        queue = (NodeIntrospection(
+            uuid,
+            self.client,
+            self.cloud,
+            self.node_timeout,
+            self.max_retries,
+            self.retry_timeout,
+            self.log) for uuid in node_uuids)
+        pool = []
-            # Wait for the node lock to be released
-            cloud.baremetal.wait_for_node_reservation(
-                node_uuid, timeout=node_timeout
-            )
+        for i in range(self.concurrency):
+            pool = self.push_next(pool, queue)
-            # Get the introspection data for the result
-            data = cloud.baremetal_introspection.get_introspection_data(
-                node_uuid)
+        while len(pool) > 0:
+            finished = []
+            for intro in pool:
+                if not intro.started:
+                    try:
+                        intro.start_introspection()
+                        continue
+                    except Exception as e:
+                        self.log("ERROR Node %s can't start introspection"
+                                 " because: %s" % (intro.node_id, str(e)))
+                        result[intro.node_id] = {
+                            "error": "Error for introspection node %s: %s " % (
+                                intro.node_id, str(e)),
+                            "failed": True,
+                            "status": ''
+                        }
+                        finished.append(intro)
+                        continue
+                status = intro.get_introspection()
+                if (not status.is_finished and intro.timeouted()) or (
+                    status.is_finished and status.error is not None
+                ):
+                    if status.is_finished:
+                        self.log("ERROR Introspection of node %s "
+                                 "failed: %s" % (
+                                     status.id, str(status.error))
+                                 )
+                    if intro.last_retry():
+                        result[status.id] = (intro.error_msg()
+                                             if status.is_finished
+                                             else intro.timeout_msg())
+                        finished.append(intro)
+                    else:
+                        intro.restart_introspection()
+                if status.is_finished and status.error is None:
+                    result[status.id] = {
+                        'status': intro.get_introspection_data(),
+                        'failed': False,
+                        'error': None}
+                    finished.append(intro)
+            for i in finished:
+                pool.remove(i)
+                pool = self.push_next(pool, queue)
+            # Let's not DDOS Ironic service
+            if pool:
+                time.sleep(min(10, self.node_timeout))
-            LOG.info("Introspecting node complete: %s", node_uuid)
-            # Success
-            return {
-                'status': data,
-                'failed': False,
-                'error': None
-            }
+        return result
+class NodeIntrospection:
+    started = False
+    def __init__(
+            self,
+            node_id,
+            os_client,
+            os_cloud,
+            timeout,
+            max_retries,
+            retry_timeout,
+            log):
+        self.node_id = node_id
+        self.os_client = os_client
+        self.os_cloud = os_cloud
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.log = log
+        self.start = int(time.time())
+        self.retries = 0
+        self.retry_timeout = retry_timeout
+        self.last_status = None
+    def restart_introspection(self):
+        self.retries += 1
+        try:
+            self.os_client.abort_introspection(self.node_id)
         except Exception as e:
-            last_error = str(e)
-            LOG.error("Introspection of node %s failed on attempt %s: "
-                      "%s", node_uuid, attempt, last_error)
+            # Node is locked
+            self.log("ERROR Node %s can't abort introspection: %s" % (
+                self.node_id, str(e)))
+            return
+        # need to wait before restarting introspection till it's aborted
+        # to prevent hanging let's use introspect timeout for that
+        try:
+            self.os_client.wait_for_introspection(
+                self.node_id, timeout=self.timeout, ignore_error=True)
+            # Wait until node is unlocked
+            self.os_cloud.baremetal.wait_for_node_reservation(
+                self.node_id, timeout=self.retry_timeout)
+        except Exception as e:
+            self.log("ERROR Node %s can't restart introspection because can't "
+                     "abort and unlock it: %s" % (self.node_id, str(e)))
+            return
+        self.start = int(time.time())
+        return self.start_introspection(restart=True)
-    message = 'unknown error'
-    status = ''
-    # All attempts failed, fetch node to get the reason
-    try:
-        node = cloud.baremetal.get_node(node_uuid)
-        message = node.last_error
-        status = node.provision_state
-    except Exception:
-        if last_error:
-            # Couldn't fetch the node, use the last exception message instead
-            message = last_error
+    def start_introspection(self, restart=False):
+        self.started = True
+        if restart:
+            self.log("INFO Restarting (try %s of %s) introspection of "
+                     "node %s" % (
+                         self.retries, self.max_retries, self.node_id))
+        else:
+            self.log("INFO Starting introspection of node %s" % (self.node_id))
+        return self.os_client.start_introspection(self.node_id)
-    return {
-        "error": "Error for introspection node %s on attempt %s: %s " %
-                 (node_uuid, attempt, message),
-        "failed": True,
-        "status": status
-    }
+    def get_introspection(self):
+        self.last_status = self.os_client.get_introspection(self.node_id)
+        return self.last_status
+    def get_introspection_data(self):
+        self.log(
+            "Instrospection of node %s finished successfully!" % self.node_id)
+        return self.os_client.get_introspection_data(self.node_id)
-def prepare_for_retry(cloud, node_uuid, node_timeout, retry_timeout):
-    # Attempt to abort any existing introspection
-    try:
-        cloud.baremetal.set_node_provision_state(
-            node_uuid, 'abort', wait=True, timeout=node_timeout)
-    except Exception as e:
-        LOG.warn("Abort introspection of node %s failed: %s",
-                 node_uuid, str(e))
+    def time_elapsed(self):
+        return int(time.time()) - self.start
-    # Attempt to power off the node
-    try:
-        cloud.baremetal.set_node_power_state(
-            node_uuid, 'off', wait=True, timeout=node_timeout
-        )
-    except Exception as e:
-        LOG.warn("Power off of node %s failed: %s",
-                 node_uuid, str(e))
+    def timeouted(self):
+        return self.time_elapsed() > self.timeout
-    # Wait until node is unlocked
-    try:
-        cloud.baremetal.wait_for_node_reservation(
-            node_uuid, timeout=retry_timeout)
-    except Exception as e:
-        LOG.warn("Waiting for node unlock %s failed: %s",
-                 node_uuid, str(e))
+    def last_retry(self):
+        return self.retries >= self.max_retries
+    def timeout_msg(self):
+        self.log(
+            "ERROR Retry limit %s reached for introspection "
+            "node %s: exceeded timeout" % (
+                self.max_retries, self.node_id))
+        return {"error": "Timeout error for introspection node %s: %s "
+                         "sec exceeded max timeout of %s sec" % (
+                             self.node_id, self.time_elapsed(), self.timeout),
+                "failed": True,
+                "status": self.last_status
+                }
+    def error_msg(self):
+        self.log(
+            "ERROR Retry limit %s reached for introspection "
+            "node %s: %s" % (
+                self.max_retries, self.node_id, self.last_status.error))
+        return {"error": "Error for introspection node %s: %s " % (
+            self.node_id, self.last_status.error),
+                "failed": True,
+                "status": self.last_status
+                }
 def main():
@@ -273,9 +338,6 @@ def main():
         supports_check_mode=False,
         **module_kwargs
     )
-    log_stream = _configure_logging()
     auth_type = module.params.get('auth_type')
     ironic_url = module.params.get('ironic_url')
     if auth_type in (None, 'None'):
@@ -289,15 +351,16 @@ def main():
     _, cloud = openstack_cloud_from_module(module)
-    result = introspect(
+    introspector = IntrospectionManagement(
         cloud,
-        node_uuids=module.params["node_uuids"],
-        node_timeout=module.params["node_timeout"],
-        retry_timeout=module.params["retry_timeout"],
-        max_retries=module.params["max_retries"],
-        concurrency=module.params["concurrency"])
+        module,
+        module.params["concurrency"],
+        module.params["max_retries"],
+        module.params["node_timeout"],
+        module.params["retry_timeout"]
+    )
     module_results = {"changed": True}
+    result = introspector.introspect(module.params["node_uuids"])
     failed_nodes = [k for k, v in result.items() if v['failed']]
     passed_nodes = [k for k, v in result.items() if not v['failed']]
     failed = len(failed_nodes)
@@ -317,8 +380,7 @@ def main():
         "introspection_data": result if not module.params['quiet'] else {},
         "failed_nodes": failed_nodes,
         "passed_nodes": passed_nodes,
-        "msg": message,
-        "logging": log_stream.getvalue().split('\n')
+        "msg": message
     })
     module.exit_json(**module_results)
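
For context on what the revert restores: instead of handing each node to a concurrent.futures thread, the class-based code keeps at most `concurrency` NodeIntrospection objects in a pool, polls each one until it finishes or exceeds its timeout, retries failures up to `max_retries`, and sleeps between passes so the Ironic API is not hammered. The sketch below illustrates that bounded-pool polling pattern in isolation; FakeJob, run() and poll_interval are hypothetical stand-ins and nothing here calls the OpenStack SDK or the module above.

# Minimal sketch of a bounded-pool polling loop, assuming a stand-in
# FakeJob class instead of the real baremetal introspection client.
import random
import time


class FakeJob:
    """Hypothetical stand-in for one node's introspection."""

    def __init__(self, node_id, timeout=30):
        self.node_id = node_id
        self.timeout = timeout
        self.started_at = None

    def start(self):
        self.started_at = time.time()

    def is_finished(self):
        # Pretend each poll has some chance of completing.
        return random.random() < 0.3

    def timed_out(self):
        return time.time() - self.started_at > self.timeout


def run(node_ids, concurrency=2, poll_interval=1):
    """Keep at most `concurrency` jobs in flight, polling until all are done."""
    queue = iter(FakeJob(n) for n in node_ids)
    pool, result = [], {}

    def refill():
        # Top the pool back up from the queue, never exceeding concurrency.
        while len(pool) < concurrency:
            try:
                job = next(queue)
            except StopIteration:
                return
            job.start()
            pool.append(job)

    refill()
    while pool:
        for job in list(pool):
            if job.is_finished():
                result[job.node_id] = "ok"
                pool.remove(job)
            elif job.timed_out():
                result[job.node_id] = "timeout"
                pool.remove(job)
        refill()
        if pool:
            time.sleep(poll_interval)  # avoid hammering the API between passes
    return result


if __name__ == "__main__":
    print(run(["node-%d" % i for i in range(5)]))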