Handle deprecated "cpu_util" metric

The "cpu_util" metric was deprecated a few years ago. We obtain the same result by converting the cumulative CPU time to a percentage, leveraging the rate-of-change aggregation.

Change-Id: I18fe0de6f74c785e674faceea0c48f44055818fe
2023-10-10 14:29:06 +03:00 · 2023-10-10 14:29:06 +03:00 · 00fea975e2
parent 922478fbda
commit 00fea975e2
9 changed files with 74 additions and 31 deletions
--- a/doc/source/configuration/configuring.rst
+++ b/doc/source/configuration/configuring.rst
@ -372,7 +372,7 @@ You can configure and install Ceilometer by following the documentation below :
 #. https://docs.openstack.org/ceilometer/latest

 The built-in strategy 'basic_consolidation' provided by watcher requires
-"**compute.node.cpu.percent**" and "**cpu_util**" measurements to be collected
+"**compute.node.cpu.percent**" and "**cpu**" measurements to be collected
 by Ceilometer.
 The measurements available depend on the hypervisors that OpenStack manages on
 the specific implementation.
--- a/doc/source/contributor/plugin/strategy-plugin.rst
+++ b/doc/source/contributor/plugin/strategy-plugin.rst
@ -300,6 +300,6 @@ Using that you can now query the values for that specific metric:
 .. code-block:: py

    avg_meter = self.datasource_backend.statistic_aggregation(
-        instance.uuid, 'cpu_util', self.periods['instance'],
+        instance.uuid, 'instance_cpu_usage', self.periods['instance'],
        self.granularity,
        aggregation=self.aggregation_method['instance'])
--- a/doc/source/strategies/basic-server-consolidation.rst
+++ b/doc/source/strategies/basic-server-consolidation.rst
@ -26,8 +26,7 @@ metric                       service name plugins comment
                                                  ``compute_monitors`` option
                                                  to ``cpu.virt_driver`` in
                                                  the nova.conf.
-``cpu_util``                 ceilometer_  none    cpu_util has been removed
-                                                  since Stein.
+``cpu``                      ceilometer_  none
 ============================ ============ ======= ===========================

 .. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute
--- a/doc/source/strategies/vm_workload_consolidation.rst
+++ b/doc/source/strategies/vm_workload_consolidation.rst
@ -22,8 +22,7 @@ The *vm_workload_consolidation* strategy requires the following metrics:
 ============================ ============ ======= =========================
 metric                       service name plugins comment
 ============================ ============ ======= =========================
-``cpu_util``                 ceilometer_  none    cpu_util has been removed
-                                                  since Stein.
+``cpu``                      ceilometer_  none
 ``memory.resident``          ceilometer_  none
 ``memory``                   ceilometer_  none
 ``disk.root.size``           ceilometer_  none
--- a/doc/source/strategies/workload-stabilization.rst
+++ b/doc/source/strategies/workload-stabilization.rst
@ -27,9 +27,8 @@ metric                       service name plugins comment
                                                  to ``cpu.virt_driver`` in the
                                                  nova.conf.
 ``hardware.memory.used``     ceilometer_  SNMP_
-``cpu_util``                 ceilometer_  none    cpu_util has been removed
-                                                  since Stein.
-``memory.resident``          ceilometer_  none
+``cpu``                      ceilometer_  none
+``instance_ram_usage``       ceilometer_  none
 ============================ ============ ======= =============================

 .. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute
@ -107,10 +106,10 @@ parameter            type   default Value         description
                                                  period of all received ones.
 ==================== ====== ===================== =============================

-.. |metrics| replace:: ["cpu_util", "memory.resident"]
-.. |thresholds| replace:: {"cpu_util": 0.2, "memory.resident": 0.2}
-.. |weights| replace:: {"cpu_util_weight": 1.0, "memory.resident_weight": 1.0}
-.. |instance_metrics| replace:: {"cpu_util": "compute.node.cpu.percent", "memory.resident": "hardware.memory.used"}
+.. |metrics| replace:: ["instance_cpu_usage", "instance_ram_usage"]
+.. |thresholds| replace:: {"instance_cpu_usage": 0.2, "instance_ram_usage": 0.2}
+.. |weights| replace:: {"instance_cpu_usage_weight": 1.0, "instance_ram_usage_weight": 1.0}
+.. |instance_metrics| replace:: {"instance_cpu_usage": "compute.node.cpu.percent", "instance_ram_usage": "hardware.memory.used"}
 .. |periods| replace:: {"instance": 720, "node": 600}

 Efficacy Indicator
@ -136,8 +135,8 @@ How to use it ?
      at1 workload_balancing --strategy workload_stabilization

    $ openstack optimize audit create -a at1 \
-      -p thresholds='{"memory.resident": 0.05}' \
-      -p metrics='["memory.resident"]'
+      -p thresholds='{"instance_ram_usage": 0.05}' \
+      -p metrics='["instance_ram_usage"]'

 External Links
 --------------
--- a/doc/source/strategies/workload_balance.rst
+++ b/doc/source/strategies/workload_balance.rst
@ -24,8 +24,7 @@ The *workload_balance* strategy requires the following metrics:
 ======================= ============ ======= =========================
 metric                  service name plugins comment
 ======================= ============ ======= =========================
-``cpu_util``            ceilometer_  none    cpu_util has been removed
-                                             since Stein.
+``cpu``                 ceilometer_  none
 ``memory.resident``     ceilometer_  none
 ======================= ============ ======= =========================

@ -65,15 +64,16 @@ Configuration

 Strategy parameters are:

-============== ====== ============= ====================================
-parameter      type   default Value description
-============== ====== ============= ====================================
-``metrics``    String 'cpu_util'    Workload balance base on cpu or ram
-                                    utilization. choice: ['cpu_util',
-                                    'memory.resident']
-``threshold``  Number 25.0          Workload threshold for migration
-``period``     Number 300           Aggregate time period of ceilometer
-============== ====== ============= ====================================
+============== ====== ==================== ====================================
+parameter      type   default Value        description
+============== ====== ==================== ====================================
+``metrics``    String 'instance_cpu_usage' Workload balance based on cpu or ram
+                                           utilization. Choices:
+                                           ['instance_cpu_usage',
+                                           'instance_ram_usage']
+``threshold``  Number 25.0                 Workload threshold for migration
+``period``     Number 300                  Aggregate time period of ceilometer
+============== ====== ==================== ====================================

 Efficacy Indicator
 ------------------
@ -95,7 +95,7 @@ How to use it ?
      at1 workload_balancing --strategy workload_balance

    $ openstack optimize audit create -a at1 -p threshold=26.0 \
-            -p period=310 -p metrics=cpu_util
+            -p period=310 -p metrics=instance_cpu_usage

 External Links
 --------------
--- a/watcher/decision_engine/datasources/gnocchi.py
+++ b/watcher/decision_engine/datasources/gnocchi.py
@ -38,7 +38,7 @@ class GnocchiHelper(base.DataSourceBase):
                      host_inlet_temp='hardware.ipmi.node.temperature',
                      host_airflow='hardware.ipmi.node.airflow',
                      host_power='hardware.ipmi.node.power',
-                      instance_cpu_usage='cpu_util',
+                      instance_cpu_usage='cpu',
                      instance_ram_usage='memory.resident',
                      instance_ram_allocated='memory',
                      instance_l3_cache_usage='cpu_l3_cache',
@ -93,6 +93,25 @@ class GnocchiHelper(base.DataSourceBase):

            resource_id = resources[0]['id']

+        if meter_name == "instance_cpu_usage":
+            if resource_type != "instance":
+                LOG.warning("Unsupported resource type for metric "
+                            "'instance_cpu_usage': ", resource_type)
+                return
+
+            # The "cpu_util" gauge (percentage) metric has been removed.
+            # We're going to obtain the same result by using the rate of change
+            # aggregate operation.
+            if aggregate not in ("mean", "rate:mean"):
+                LOG.warning("Unsupported aggregate for instance_cpu_usage "
+                            "metric: %s. "
+                            "Supported aggregates: mean, rate:mean ",
+                            aggregate)
+                return
+
+            # TODO(lpetrut): consider supporting other aggregates.
+            aggregate = "rate:mean"
+
        raw_kwargs = dict(
            metric=meter,
            start=start_time,
@ -117,6 +136,17 @@ class GnocchiHelper(base.DataSourceBase):
                # Airflow from hardware.ipmi.node.airflow is reported as
                # 1/10 th of actual CFM
                return_value *= 10
+            if meter_name == "instance_cpu_usage":
+                # "rate:mean" can return negative values for migrated vms.
+                return_value = max(0, return_value)
+
+                # We're converting the cumulative cpu time (ns) to cpu usage
+                # percentage.
+                vcpus = resource.vcpus
+                if not vcpus:
+                    LOG.warning("instance vcpu count not set, assuming 1")
+                    vcpus = 1
+                return_value *= 100 / (granularity * 10e+8) / vcpus

        return return_value

--- a/watcher/decision_engine/strategy/strategies/workload_balance.py
+++ b/watcher/decision_engine/strategy/strategies/workload_balance.py
@ -295,7 +295,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy):
                        self.threshold)
            return self.solution

-        # choose the server with largest cpu_util
+        # choose the server with largest cpu usage
        source_nodes = sorted(source_nodes,
                              reverse=True,
                              key=lambda x: (x[self._meter]))
--- a/watcher/tests/decision_engine/datasources/test_gnocchi_helper.py
+++ b/watcher/tests/decision_engine/datasources/test_gnocchi_helper.py
@ -40,17 +40,25 @@ class TestGnocchiHelper(base.BaseTestCase):
        self.addCleanup(stat_agg_patcher.stop)

    def test_gnocchi_statistic_aggregation(self, mock_gnocchi):
+        vcpus = 2
+        mock_instance = mock.Mock(
+            id='16a86790-327a-45f9-bc82-45839f062fdc',
+            vcpus=vcpus)
+
        gnocchi = mock.MagicMock()
+        # cpu time rate of change (ns)
+        mock_rate_measure = 360 * 10e+8 * vcpus * 5.5 / 100
        expected_result = 5.5

-        expected_measures = [["2017-02-02T09:00:00.000000", 360, 5.5]]
+        expected_measures = [
+            ["2017-02-02T09:00:00.000000", 360, mock_rate_measure]]

        gnocchi.metric.get_measures.return_value = expected_measures
        mock_gnocchi.return_value = gnocchi

        helper = gnocchi_helper.GnocchiHelper()
        result = helper.statistic_aggregation(
-            resource=mock.Mock(id='16a86790-327a-45f9-bc82-45839f062fdc'),
+            resource=mock_instance,
            resource_type='instance',
            meter_name='instance_cpu_usage',
            period=300,
@ -59,6 +67,14 @@ class TestGnocchiHelper(base.BaseTestCase):
        )
        self.assertEqual(expected_result, result)

+        gnocchi.metric.get_measures.assert_called_once_with(
+            metric="cpu",
+            start=mock.ANY,
+            stop=mock.ANY,
+            resource_id=mock_instance.uuid,
+            granularity=360,
+            aggregation="rate:mean")
+
    def test_gnocchi_statistic_series(self, mock_gnocchi):
        gnocchi = mock.MagicMock()
        expected_result = {