From bce179882a3c5329290aac359aeb8278c3e8a8e3 Mon Sep 17 00:00:00 2001
From: Claudiu Belu <cbelu@cloudbasesolutions.com>
Date: Wed, 30 Aug 2017 08:50:52 -0700
Subject: [PATCH] Adds metrics collection scenario

This test suite verifies that the instance metrics are properly published
and collected and have non-zero values. The verification is done via the
ceilometer API.

Waiting for the ceilometer compute agent to poll the resources is crucial,
otherwise the test suite will fail due to the fact that no samples would
be found published before checking the samples.

The test suite's polled_metrics_delay should be slightly greater (by
~15-20 seconds) than the ceilometer agent's polling interval. This can
be done in two ways:
a. Configure tempest's polled_metrics_delay, by adding the following
line in tempest.conf, in the hyperv section:
    polled_metrics_delay = <desired value>
b. Set the interval value in pipeline.yaml on the compute node to the
desired value and restart the ceilometer compute agent.

The second method is preferred, as the interval value defined in
pipeline.yaml is 600 seconds, which would mean each test would last
at least 10 minutes.

Change-Id: I2ad696c221fba860a30621686b464b8486bea583
---
 oswin_tempest_plugin/config.py                |  20 ++
 oswin_tempest_plugin/tests/_mixins/migrate.py |   4 +-
 .../tests/_mixins/optional_feature.py         |   2 +-
 oswin_tempest_plugin/tests/_mixins/resize.py  |   2 +-
 .../tests/scenario/test_cluster.py            |   2 +-
 .../tests/scenario/test_disks.py              |   2 +-
 .../tests/scenario/test_metrics_collection.py | 181 ++++++++++++++++++
 oswin_tempest_plugin/tests/test_base.py       |   7 +
 8 files changed, 214 insertions(+), 6 deletions(-)
 create mode 100644 oswin_tempest_plugin/tests/scenario/test_metrics_collection.py
diff --git a/oswin_tempest_plugin/config.py b/oswin_tempest_plugin/config.py
index 082147f..4782a13 100644
--- a/oswin_tempest_plugin/config.py
+++ b/oswin_tempest_plugin/config.py
@@ -14,6 +14,7 @@
 #    under the License.
 
 from oslo_config import cfg
+from oslo_config import types
 from tempest import config
 
 CONF = config.CONF
@@ -61,6 +62,25 @@ HyperVGroup = [
                help="The maximum number of NUMA cells the compute nodes "
                     "have. If it's less than 2, resize negative tests for "
                     "vNUMA will be skipped."),
+    cfg.ListOpt('collected_metrics',
+                item_type=types.String(
+                    choices=('cpu', 'network.outgoing.bytes',
+                             'disk.read.bytes')),
+                default=[],
+                help="The ceilometer metrics to check. If this config value "
+                     "is empty, the telemetry tests are skipped. This config "
+                     "option assumes that the compute nodes are configured "
+                     "and capable of collecting ceilometer metrics. WARNING: "
+                     "neutron-ovs-agent is not capable of enabling network "
+                     "metrics collection."),
+    cfg.IntOpt('polled_metrics_delay',
+               default=620,
+               help="The number of seconds to wait for the metrics to be "
+                    "published by the compute node's ceilometer-polling "
+                    "agent. The value must be greater by ~15-20 seconds "
+                    "than the agent's publish interval, defined in its "
+                    "pipeline.yaml file (typically, the intervals are 600 "
+                    "seconds)."),
 ]
 
 hyperv_host_auth_group = cfg.OptGroup(name='hyperv_host_auth',
diff --git a/oswin_tempest_plugin/tests/_mixins/migrate.py b/oswin_tempest_plugin/tests/_mixins/migrate.py
index 9dad6e1..d5135c2 100644
--- a/oswin_tempest_plugin/tests/_mixins/migrate.py
+++ b/oswin_tempest_plugin/tests/_mixins/migrate.py
@@ -43,7 +43,7 @@ class _MigrateMixin(object):
     def test_migration(self):
         server_tuple = self._create_server()
         self._migrate_server(server_tuple)
-        self._check_server_connectivity(server_tuple)
+        self._check_scenario(server_tuple)
 
 
 class _LiveMigrateMixin(object):
@@ -104,4 +104,4 @@ class _LiveMigrateMixin(object):
     def test_live_migration(self):
         server_tuple = self._create_server()
         self._live_migrate_server(server_tuple)
-        self._check_server_connectivity(server_tuple)
+        self._check_scenario(server_tuple)
diff --git a/oswin_tempest_plugin/tests/_mixins/optional_feature.py b/oswin_tempest_plugin/tests/_mixins/optional_feature.py
index 11651ee..7ae1ea4 100644
--- a/oswin_tempest_plugin/tests/_mixins/optional_feature.py
+++ b/oswin_tempest_plugin/tests/_mixins/optional_feature.py
@@ -71,7 +71,7 @@ class _OptionalFeatureMixin(resize._ResizeUtils):
 
     def test_feature(self):
         server_tuple = self._create_server()
-        self._check_server_connectivity(server_tuple)
+        self._check_scenario(server_tuple)
 
     @testtools.skipUnless(CONF.compute_feature_enabled.resize,
                           'Resize is not available.')
diff --git a/oswin_tempest_plugin/tests/_mixins/resize.py b/oswin_tempest_plugin/tests/_mixins/resize.py
index d8f16cb..b4c4b91 100644
--- a/oswin_tempest_plugin/tests/_mixins/resize.py
+++ b/oswin_tempest_plugin/tests/_mixins/resize.py
@@ -77,7 +77,7 @@ class _ResizeUtils(object):
 
         # assert that the server is still reachable, even if the resize
         # failed.
-        self._check_server_connectivity(server_tuple)
+        self._check_scenario(server_tuple)
 
 
 class _ResizeMixin(_ResizeUtils):
diff --git a/oswin_tempest_plugin/tests/scenario/test_cluster.py b/oswin_tempest_plugin/tests/scenario/test_cluster.py
index c6e10be..fd3022a 100644
--- a/oswin_tempest_plugin/tests/scenario/test_cluster.py
+++ b/oswin_tempest_plugin/tests/scenario/test_cluster.py
@@ -150,4 +150,4 @@ class HyperVClusterTest(migrate._MigrateMixin,
 
     def test_clustered_vm(self):
         server_tuple = self._create_server()
-        self._check_server_connectivity(server_tuple)
+        self._check_scenario(server_tuple)
diff --git a/oswin_tempest_plugin/tests/scenario/test_disks.py b/oswin_tempest_plugin/tests/scenario/test_disks.py
index d78b992..cd44ade 100644
--- a/oswin_tempest_plugin/tests/scenario/test_disks.py
+++ b/oswin_tempest_plugin/tests/scenario/test_disks.py
@@ -49,7 +49,7 @@ class _BaseDiskTestMixin(migrate._MigrateMixin,
 
     def test_disk(self):
         server_tuple = self._create_server()
-        self._check_server_connectivity(server_tuple)
+        self._check_scenario(server_tuple)
 
     @testtools.skipUnless(CONF.compute_feature_enabled.resize,
                           'Resize is not available.')
diff --git a/oswin_tempest_plugin/tests/scenario/test_metrics_collection.py b/oswin_tempest_plugin/tests/scenario/test_metrics_collection.py
new file mode 100644
index 0000000..1375388
--- /dev/null
+++ b/oswin_tempest_plugin/tests/scenario/test_metrics_collection.py
@@ -0,0 +1,181 @@
+# Copyright 2017 Cloudbase Solutions
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import time
+
+try:
+    # NOTE(claudiub): ceilometer might not be installed, it is not mandatory.
+    from ceilometer.tests.tempest.service import client as telemetry_client
+except Exception:
+    telemetry_client = None
+
+from oslo_log import log as logging
+from tempest import clients
+
+from oswin_tempest_plugin import config
+from oswin_tempest_plugin.tests import test_base
+
+CONF = config.CONF
+LOG = logging.getLogger(__name__)
+
+
+class ClientManager(clients.Manager):
+
+    def __init__(self, *args, **kwargs):
+        super(ClientManager, self).__init__(*args, **kwargs)
+
+        self._set_telemetry_clients()
+
+    def _set_telemetry_clients(self):
+        self.telemetry_client = telemetry_client.TelemetryClient(
+            self.auth_provider, **telemetry_client.Manager.telemetry_params)
+
+
+class MetricsCollectionTestCase(test_base.TestBase):
+    """Adds metrics collection scenario tests.
+
+    This test suite verifies that the instance metrics are properly published
+    and collected and have non-zero values. The verification is done via the
+    ceilometer API.
+
+    setup:
+        1. spins a new instance.
+        2. waits until the instance was created successfully (ACTIVE status).
+        3. wait an interval of time which represents the polling period of the
+        ceilometer-polling agent.
+
+    Waiting for the ceilometer-polling agent to poll the resources is crucial,
+    otherwise the test suite will fail due to the fact that no samples
+    would be found published before checking the samples.
+
+    The test suite's polled_metrics_delay must have a greater value than the
+    ceilometer agent's polling interval. This can be done in two ways:
+        a. Configure tempest's polled_metrics_delay, by adding the
+        following line in tempest.conf, in the hyperv section:
+        polled_metrics_delay = <desired value>
+        b. Set the interval value in pipeline.yaml on the compute node to
+        the desired value and restart the ceilometer polling agent. The
+        interval value is set either for the 'meter_source' or for each
+        of the following: 'cpu_source', 'disk_source', 'network_source'.
+
+    Note: If the polled_metrics_delay value is too low, the tests might not
+    find any samples and fail because of this. As a recommendation,
+    polled_metrics_delay's value should be:
+        polled_metrics_delay = <pipeline.yaml interval> + <15-20 seconds>
+
+    tests:
+        1. test_metrics - tests values for the following metrics:
+            - cpu
+            - network.outgoing.bytes
+            - disk.read.bytes
+
+    assumptions:
+        1. Ceilometer agent on the compute node is running.
+        2. Ceilometer agent on the compute node has the polling interval
+        defined in pipeline.yaml lower than the polled_metrics_delay defined
+        in this test suite.
+        3. The compute nodes' nova-compute and neutron-hyperv-agent services
+        have been configured to enable metrics collection.
+    """
+
+    client_manager = ClientManager
+
+    @classmethod
+    def skip_checks(cls):
+        super(MetricsCollectionTestCase, cls).skip_checks()
+
+        if (not CONF.service_available.ceilometer or
+                not CONF.telemetry.deprecated_api_enabled):
+            raise cls.skipException("Ceilometer API support is required.")
+
+        if not CONF.hyperv.collected_metrics:
+            raise cls.skipException("Collected metrics not configured.")
+
+    @classmethod
+    def setup_clients(cls):
+        super(MetricsCollectionTestCase, cls).setup_clients()
+
+        # Telemetry client
+        cls.telemetry_client = cls.os_primary.telemetry_client
+
+    def _telemetry_check_samples(self, resource_id, meter_name):
+        LOG.info("Checking %(meter_name)s for resource %(resource_id)s" % {
+            'meter_name': meter_name, 'resource_id': resource_id})
+
+        samples = self.telemetry_client.list_samples(meter_name)
+        self.assertNotEmpty(samples,
+                            'Telemetry client returned no samples.')
+
+        resource_samples = [s for s in samples if
+                            s['resource_id'] == resource_id]
+        self.assertNotEmpty(
+            resource_samples,
+            'No meter %(meter_name)s samples for resource '
+            '%(resource_id)s found.' % {'meter_name': meter_name,
+                                        'resource_id': resource_id})
+
+        non_zero_valued_samples = [s for s in resource_samples if
+                                   s['counter_volume'] > 0]
+        self.assertNotEmpty(
+            non_zero_valued_samples,
+            'All meter %(meter_name)s samples for resource '
+            '%(resource_id)s are 0.' % {'meter_name': meter_name,
+                                        'resource_id': resource_id})
+
+    def _get_instance_cpu_resource_id(self, server):
+        return server['id']
+
+    def _get_instance_disk_resource_id(self, server):
+        return server['id']
+
+    def _get_instance_port_resource_id(self, server):
+        # NOTE(claudiub): the format for the instance_port_resource_id is:
+        # %(OS-EXT-SRV-ATTR:instance_name)s-%(instance_id)s-%(port_id)s
+        # The instance returned by self.servers_client does not contain the
+        # OS-EXT-SRV-ATTR:instance_name field, which means that the resource_id
+        # must be found in ceilometer's resources.
+        start_res_id = server['id']
+        resources = self.telemetry_client.list_resources()
+        res_ids = [r['resource_id'] for r in resources
+                   if r['resource_id'].startswith('instance-') and
+                   start_res_id in r['resource_id']]
+
+        self.assertEqual(1, len(res_ids))
+        return res_ids[0]
+
+    def _check_scenario(self, server_tuple):
+        server = server_tuple.server
+        LOG.info("Waiting %s seconds for the ceilometer compute agents to "
+                 "publish the samples.", CONF.hyperv.polled_metrics_delay)
+        time.sleep(CONF.hyperv.polled_metrics_delay)
+
+        # TODO(claudiub): Add more metrics.
+
+        if 'cpu' in CONF.hyperv.collected_metrics:
+            cpu_res_id = self._get_instance_cpu_resource_id(server)
+            self._telemetry_check_samples(cpu_res_id, 'cpu')
+
+        if 'network.outgoing.bytes' in CONF.hyperv.collected_metrics:
+            port_res_id = self._get_instance_port_resource_id(server)
+            self._telemetry_check_samples(port_res_id,
+                                          'network.outgoing.bytes')
+
+        if 'disk.read.bytes' in CONF.hyperv.collected_metrics:
+            disk_resource_id = self._get_instance_disk_resource_id(server)
+            self._telemetry_check_samples(disk_resource_id, 'disk.read.bytes')
+
+    def test_metrics(self):
+        server_tuple = self._create_server()
+        self._check_scenario(server_tuple)
diff --git a/oswin_tempest_plugin/tests/test_base.py b/oswin_tempest_plugin/tests/test_base.py
index 7999742..87ccfce 100644
--- a/oswin_tempest_plugin/tests/test_base.py
+++ b/oswin_tempest_plugin/tests/test_base.py
@@ -284,3 +284,10 @@ class TestBase(tempest.test.BaseTestCase):
     def _check_server_connectivity(self, server_tuple):
         # if server connectivity works, an SSH client can be opened.
         self._get_server_client(server_tuple)
+
+    def _check_scenario(self, server_tuple):
+        # NOTE(claudiub): This method is to be used when verifying a
+        # particular scenario. If a scenario test case needs to perform
+        # different validation steps (e.g.: metrics collection), it should
+        # override this method.
+        self._check_server_connectivity(server_tuple)