Add metrics support to IPA

This utilizes the new metrics support in ironic-lib to allow the agent to
report timing metrics for agent API methods as configured in ironic-lib.

Additionally, this adds developer docs on how to use metrics in IPA,
including some caveats specific to ironic-lib.metrics use in IPA.

Co-Authored-By: Jay Faulkner <jay@jvf.cc>
Co-Authored-By: Alex Weeks <alex.weeks@gmail.com>
Change-Id: Ic08d4ff78b6fb614b474b956a32eac352a14262a
Partial-bug: #1526219
This commit is contained in:
Josh Gachnang 2014-09-08 20:26:51 -07:00 committed by Jay Faulkner
parent ad60806f93
commit fd874652e3
7 changed files with 97 additions and 25 deletions

View File

@ -17,7 +17,8 @@ Index
.. toctree::
troubleshooting
troubleshooting
metrics
How it works
============

53
doc/source/metrics.rst Normal file
View File

@ -0,0 +1,53 @@
.. _metrics:
===============================================
Emitting metrics from Ironic-Python-Agent (IPA)
===============================================
This document describes how to emit metrics from IPA, ranging from timers and
counters placed in code to hardware metrics emitted directly from a custom
HardwareManager.
Overview
========
IPA uses the metrics implementation from ironic-lib, with a few caveats due
to the dynamic configuration done at lookup time. You cannot cache the metrics
instance, because the MetricsLogger returned will change after lookup if
configuration values other than the defaults have been used. This also means
that the method decorator supported by ironic-lib cannot be used in IPA.
Using a context manager
=======================
Using the context manager is the recommended way for sending metrics that time
or count sections of code. However, given that you cannot cache the
MetricsLogger, you have to explicitly call get_metrics_logger() from
ironic-lib every time. For example:
from ironic_lib import metrics_utils
def my_method():
with metrics_utils.get_metrics_logger(__name__).timer('my_method'):
return _do_work()
As a note, these metric collectors do work for custom HardwareManagers as
well. However, you may want to measure the portions of a method that determine
compatibility separately from the portions that actually do work, in order to
ensure the metrics are relevant and useful on all hardware.
Explicitly sending metrics
==========================
A feature that may be particularly helpful for deployers writing custom
HardwareManagers is the ability to explicitly send metrics. As an example,
you could add a cleaning step which would retrieve metrics about a device and
ship them using the provided metrics library. For example:
from ironic_lib import metrics_utils
def my_cleaning_step():
for name, value in _get_smart_data():
metrics_utils.get_metrics_logger(__name__).send_gauge(name, value)
References
==========
For more information, please read the source of the metrics module in
`ironic-lib <http://git.openstack.org/cgit/openstack/ironic-lib/tree/ironic_lib>`_.

View File

@ -20,6 +20,7 @@ import threading
import time
from oslo_concurrency import processutils
from oslo_config import cfg
from oslo_log import log
import pkg_resources
from six.moves.urllib import parse as urlparse
@ -35,7 +36,6 @@ from ironic_python_agent import inspector
from ironic_python_agent import ironic_api_client
from ironic_python_agent import utils
LOG = log.getLogger(__name__)
# Time(in seconds) to wait for any of the interfaces to be up
@ -45,6 +45,9 @@ NETWORK_WAIT_TIMEOUT = 60
# Time(in seconds) to wait before reattempt
NETWORK_WAIT_RETRY = 5
cfg.CONF.import_group('metrics', 'ironic_lib.metrics_utils')
cfg.CONF.import_group('metrics_statsd', 'ironic_lib.metrics_statsd')
def _time():
"""Wraps time.time() for simpler testing."""
@ -340,6 +343,15 @@ class IronicPythonAgent(base.ExecuteCommandMixin):
hardware.cache_node(self.node)
self.heartbeat_timeout = content['heartbeat_timeout']
# Update config with values from Ironic. The lookup response may carry a
# 'config' dict; its 'metrics' and 'metrics_statsd' entries are assumed to be
# dicts mapping option names to values for the config group of the same name
# (TODO confirm against the Ironic lookup payload).
config = content.get('config', {})
if config.get('metrics'):
    # Iterate the group's own options, not the top-level config dict;
    # otherwise the group names themselves would be set as options.
    for opt, val in config['metrics'].items():
        setattr(cfg.CONF.metrics, opt, val)
if config.get('metrics_statsd'):
    for opt, val in config['metrics_statsd'].items():
        setattr(cfg.CONF.metrics_statsd, opt, val)
wsgi = simple_server.make_server(
self.listen_address[0],
self.listen_address[1],

View File

@ -12,9 +12,9 @@
# License for the specific language governing permissions and limitations
# under the License.
from ironic_lib import metrics_utils
import pecan
from pecan import rest
from wsme import types as wtypes
import wsmeext.pecan as wsme_pecan
@ -81,7 +81,8 @@ class RootController(rest.RestController):
# NOTE: The reason why convert() is being called for every
# request is because we need to get the host url from
# the request object to make the links.
return Root.convert()
with metrics_utils.get_metrics_logger(__name__).timer('get'):
return Root.convert()
@pecan.expose()
def _route(self, args):

View File

@ -13,6 +13,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from ironic_lib import metrics_utils
import pecan
from pecan import rest
from wsme import types
@ -78,9 +79,10 @@ class CommandController(rest.RestController):
@wsme_pecan.wsexpose(CommandResultList)
def get_all(self):
"""Get all command results."""
agent = pecan.request.agent
results = agent.list_command_results()
return CommandResultList.from_results(results)
with metrics_utils.get_metrics_logger(__name__).timer('get_all'):
agent = pecan.request.agent
results = agent.list_command_results()
return CommandResultList.from_results(results)
@wsme_pecan.wsexpose(CommandResult, types.text, types.text)
def get_one(self, result_id, wait=None):
@ -91,13 +93,14 @@ class CommandController(rest.RestController):
:returns: a :class:`ironic_python_agent.api.controller.v1.command.
CommandResult` object.
"""
agent = pecan.request.agent
result = agent.get_command_result(result_id)
with metrics_utils.get_metrics_logger(__name__).timer('get_one'):
agent = pecan.request.agent
result = agent.get_command_result(result_id)
if wait and wait.lower() == 'true':
result.join()
if wait and wait.lower() == 'true':
result.join()
return CommandResult.from_result(result)
return CommandResult.from_result(result)
@wsme_pecan.wsexpose(CommandResult, types.text, body=Command)
def post(self, wait=None, command=None):
@ -109,14 +112,15 @@ class CommandController(rest.RestController):
:returns: a :class:`ironic_python_agent.api.controller.v1.command.
CommandResult` object.
"""
# the POST body is always the last arg,
# so command must be a kwarg here
if command is None:
command = Command()
agent = pecan.request.agent
result = agent.execute_command(command.name, **command.params)
with metrics_utils.get_metrics_logger(__name__).timer('post'):
# the POST body is always the last arg,
# so command must be a kwarg here
if command is None:
command = Command()
agent = pecan.request.agent
result = agent.execute_command(command.name, **command.params)
if wait and wait.lower() == 'true':
result.join()
if wait and wait.lower() == 'true':
result.join()
return result
return result

View File

@ -13,6 +13,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from ironic_lib import metrics_utils
import pecan
from pecan import rest
from wsme import types
@ -48,6 +49,7 @@ class StatusController(rest.RestController):
@wsme_pecan.wsexpose(AgentStatus)
def get_all(self):
"""Get current status of the running agent."""
agent = pecan.request.agent
status = agent.get_status()
return AgentStatus.from_agent_status(status)
with metrics_utils.get_metrics_logger(__name__).timer('get_all'):
agent = pecan.request.agent
status = agent.get_status()
return AgentStatus.from_agent_status(status)

View File

@ -533,7 +533,6 @@ class StandbyExtension(base.BaseAgentExtension):
stream_raw_images = image_info.get('stream_raw_images', False)
# don't write image again if already cached
if self.cached_image_id != image_info['id']:
if self.cached_image_id is not None:
LOG.debug('Already had %s cached, overwriting',
self.cached_image_id)