From 833e5946fe3ceb221414c96fbd7bb743c202d720 Mon Sep 17 00:00:00 2001
From: Matthew Oliver
Date: Tue, 24 Jul 2018 10:09:56 +1000
Subject: [PATCH] Add swift_handoffs check plugin to monasca

A powerful metric to watch for a Swift cluster is the number of handoff
partitions on a drive on a storage node. A build-up of handoff
partitions on a particular server could indicate a disk problem
somewhere in the cluster, or a bottleneck somewhere else. Better still,
it shows when it would be a good time to rebalance the ring (you'd want
to do that when existing backend data movement is at a minimum). So it
turns out to be a great visualisation of the health of a cluster.

That's what this check plugin does. Each instance check takes the
following values:

  ring:
  devices:
  granularity:

To be able to determine primary vs handoff partitions on a drive, the
Swift ring needs to be consulted. If a storage node stores more than
one ring, an instance should be defined for each.

You give Swift a bunch of disks. These disks are placed in what Swift
calls the 'devices' location: a directory containing a mount point for
each mounted Swift drive.

Finally, you can decide on the granularity, which defaults to `server`
if not defined.

Only two metrics are created by this check:

  swift.partitions.primary_count
  swift.partitions.handoff_count

In addition to the hostname dimension, a ring dimension is also set,
allowing the handoff vs primary partitions of each ring to be graphed.
When the granularity is set to device, an additional dimension is added
to the metric: the device name (the name of the device's mount point).
This allows each device in a server to be graphed and monitored if a
finer granularity is required.

Because we need to consult the Swift ring, there is a runtime
requirement on the Python swift module being installed, but this isn't
required for the unit tests. Making it a runtime dependency means that
when the check is loaded it'll log an error and then exit if it can't
import the swift module.

This is the second of two Swift check plugins I've been working on.
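
As an illustration, an instance entry might look like the following
(the paths here are only examples; the full sample lives in
conf.d/swift_handoffs.yaml.example in this patch):

  instances:
      - name: object ring
        ring: /etc/swift/object.ring.gz
        devices: /srv/node
        granularity: server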
For more details see my blog post[1].

[1] - https://oliver.net.au/?p=358

Change-Id: Ie91add9af39f2ab0e5b575390c0c6355563c0bfc
---
 conf.d/swift_handoffs.yaml.example        |  24 ++
 docs/Plugins.md                           |  38 +++
 .../collector/checks_d/swift_handoffs.py  | 114 +++++++
 setup.cfg                                 |   2 +
 tests/checks_d/test_swift_handoffs.py     | 323 ++++++++++++++++++
 5 files changed, 501 insertions(+)
 create mode 100644 conf.d/swift_handoffs.yaml.example
 create mode 100644 monasca_agent/collector/checks_d/swift_handoffs.py
 create mode 100644 tests/checks_d/test_swift_handoffs.py

diff --git a/conf.d/swift_handoffs.yaml.example b/conf.d/swift_handoffs.yaml.example
new file mode 100644
index 00000000..ed840703
--- /dev/null
+++ b/conf.d/swift_handoffs.yaml.example
@@ -0,0 +1,24 @@
+init_config:
+    collect_period: 300
+
+instances:
+    - name: policy_0 ring
+      ring: /etc/swift/object.ring.gz
+      devices: /srv/node
+      granularity: server
+      #granularity: device
+    - name: policy_1 ring
+      ring: /etc/swift/object_1.ring.gz
+      devices: /srv/node
+      granularity: server
+      #granularity: device
+    - name: account ring
+      ring: /etc/swift/account.ring.gz
+      devices: /srv/node
+      granularity: server
+      #granularity: device
+    - name: container ring
+      ring: /etc/swift/container.ring.gz
+      devices: /srv/node
+      granularity: server
+      #granularity: device
diff --git a/docs/Plugins.md b/docs/Plugins.md
index ef2b185a..71c7783c 100644
--- a/docs/Plugins.md
+++ b/docs/Plugins.md
@@ -88,6 +88,7 @@
 - [Swift Diags](#swift-diags)
 - [Swift Recon](#swift-recon)
     - [Sample Config](#sample-config)
+- [Swift Handoffs](#swift-handoffs)
 - [TCP Check](#tcp-check)
 - [Varnish](#varnish)
 - [VCenter](#vcenter)
@@ -2718,6 +2719,43 @@ date/timestamp and float/integer metrics. These include:
 ### Sample Config
 See [the example configuration](https://github.com/openstack/monasca-agent/blob/master/conf.d/swift_recon.yaml.example)
 
+## Swift Handoffs
+This plugin monitors the number of Swift primary and handoff partitions on a server or
+drive (device). This is a powerful metric to watch on a Swift cluster. A build-up of
+handoff partitions on a particular server could indicate a disk problem somewhere in
+the cluster, or a bottleneck somewhere else. Better still, it shows when it would be a
+good time to rebalance the ring, as you'd want to do that when existing backend data
+movement is at a minimum.
+
+So it turns out to be a great visualisation of the health of a cluster.
+
+To be able to determine primary vs handoff partitions on a drive, the Swift ring needs
+to be consulted. If a storage node stores more than one ring, an instance should be
+defined for each.
+
+You give Swift a bunch of disks. These disks are placed in what Swift calls the
+'devices' location: a directory containing a mount point for each mounted Swift drive.
+
+Finally, you can decide on the granularity, which is either `server` or `device`,
+defaulting to `server` if not defined. With `device`, the device (mount point name)
+will be added as a dimension.
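+
+For example, a per-device instance might look like the following (the ring and
+devices paths are illustrative and will vary per deployment):
+
+```
+instances:
+    - name: object ring per device
+      ring: /etc/swift/object.ring.gz
+      devices: /srv/node
+      granularity: device
+```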
+
+### Sample Config
+
+```
+instances:
+    - name: Object Storage Policy 0
+      ring: /etc/swift/object.ring.gz
+      devices: /srv/node
+      granularity: server
+```
+
+### Swift Handoffs Metrics
+
+| Metric Name | Dimensions | Semantics |
+| ----------- | ---------- | --------- |
+| swift.partitions.primary_count | service=swift, ring=ringname, device=device (device granularity only) | The number of primary partitions of a given ring on the server (or device) |
+| swift.partitions.handoff_count | service=swift, ring=ringname, device=device (device granularity only) | The number of handoff partitions of a given ring on the server (or device), i.e. partitions that should live elsewhere |
+
 ## TCP Check
 See [the example configuration](https://github.com/openstack/monasca-agent/blob/master/conf.d/tcp_check.yaml.example)
 for how to configure the TCP Check plugin.
diff --git a/monasca_agent/collector/checks_d/swift_handoffs.py b/monasca_agent/collector/checks_d/swift_handoffs.py
new file mode 100644
index 00000000..84bbd84c
--- /dev/null
+++ b/monasca_agent/collector/checks_d/swift_handoffs.py
@@ -0,0 +1,114 @@
+import errno
+import os
+import sys
+
+from collections import defaultdict
+
+import monasca_agent.collector.checks as checks
+
+
+class StubRing(object):
+    # This is a stub ring class, used as a mock-out point when unit
+    # testing this check plugin: swift is a runtime dependency, but we
+    # don't necessarily want it installed for all tests.
+    pass
+
+
+try:
+    from swift.common.ring import Ring
+    swift_loaded = True
+except ImportError:
+    Ring = StubRing
+    swift_loaded = False
+
+NO_SWIFT_ERROR_EXIT = 1
+
+
+def get_ring_and_datadir(path):
+    """:param path: path to the ring
+
+    :returns: a tuple, (ring, ring_name, datadir)
+    """
+    ring_name = os.path.basename(path).split('.')[0]
+    if '-' in ring_name:
+        datadir, policy_index = ring_name.rsplit('-', 1)
+    else:
+        datadir, policy_index = ring_name, None
+    datadir += 's'
+    if policy_index:
+        datadir += '-{}'.format(policy_index)
+
+    return Ring(path), ring_name, datadir
+
+
+class SwiftHandoffs(checks.AgentCheck):
+
+    def __init__(self, name, init_config, agent_config, instances=None):
+        super(SwiftHandoffs, self).__init__(name, init_config, agent_config,
+                                            instances)
+        global swift_loaded
+        if not swift_loaded:
+            self.log.error('Swift python module not found. The python swift '
+                           'module is a runtime dependency')
+            sys.exit(NO_SWIFT_ERROR_EXIT)
+
+    def check(self, instance):
+        device_root = instance.get('devices', '/srv/node')
+        if not os.path.exists(device_root) or not os.path.isdir(device_root):
+            self.log.error('devices must exist and be a directory')
+            return None
+
+        ring_path = instance.get('ring')
+        if not ring_path or not os.path.exists(ring_path) \
+                or not os.path.isfile(ring_path):
+            self.log.error('ring must exist')
+            return None
+
+        granularity = instance.get('granularity', 'server').lower()
+        if granularity not in ('server', 'device'):
+            self.log.error("granularity must be either 'server' or 'device'")
+            return None
+
+        ring, ring_name, datadir = get_ring_and_datadir(ring_path)
+
+        # Map each device in the ring to the set of partitions whose
+        # primary locations are on that device.
+        dev2parts = defaultdict(set)
+        for replica, part2dev in enumerate(ring._replica2part2dev_id):
+            for part, device_id in enumerate(part2dev):
+                dev2parts[ring.devs[device_id]['device']].add(part)
+
+        # Walk each mounted device and classify the partition directories
+        # found there as primary (expected by the ring) or handoff.
+        primary_count = defaultdict(int)
+        handoffs = defaultdict(set)
+        device_dirs = os.listdir(device_root)
+        for device_dir in device_dirs:
+            parts_dir = os.path.join(device_root, device_dir, datadir)
+            try:
+                parts = os.listdir(parts_dir)
+            except OSError as e:
+                if e.errno == errno.ENOENT:
+                    continue
+                else:
+                    raise
+            for part in parts:
+                if not part.isdigit():
+                    continue
+                part = int(part)
+                if part in dev2parts[device_dir]:
+                    primary_count[device_dir] += 1
+                else:
+                    handoffs[device_dir].add(part)
+
+        dimensions = {u'ring': ring_name, u'service': u'swift'}
+        dimensions = self._set_dimensions(dimensions, instance)
+        if granularity == 'server':
+            self.gauge(u'swift.partitions.primary_count',
+                       sum(primary_count.values()), dimensions)
+            self.gauge(u'swift.partitions.handoff_count',
+                       sum(map(len, handoffs.values())), dimensions)
+        else:
+            for device in device_dirs:
+                dimensions['device'] = device
+                self.gauge(u'swift.partitions.primary_count',
+                           primary_count[device], dimensions)
+                self.gauge(u'swift.partitions.handoff_count',
+                           len(handoffs[device]), dimensions)
diff --git a/setup.cfg b/setup.cfg
index 75e8318f..44c965e7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -67,6 +67,8 @@ prometheus =
 ovs =
     python-novaclient>=9.1.0 # Apache-2.0
     python-neutronclient>=6.3.0 # Apache-2.0
+swift_handoffs =
+    swift >= 2.0.0 # Apache-2.0
 
 [global]
 setup-hooks =
diff --git a/tests/checks_d/test_swift_handoffs.py b/tests/checks_d/test_swift_handoffs.py
new file mode 100644
index 00000000..bb024b1d
--- /dev/null
+++ b/tests/checks_d/test_swift_handoffs.py
@@ -0,0 +1,323 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
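+
+# A note on the fixtures below (a property of these tests, not of the
+# plugin itself): MockRing stands in for swift.common.ring.Ring so the
+# tests can run without the swift package installed. Its
+# _replica2part2dev_id table describes a 3-replica, 16-partition ring
+# across four devices (sdb1-sdb4), which is where the expected
+# primary/handoff sets built in setUp() come from.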
+
+import mock
+import unittest
+from collections import defaultdict
+from tempfile import mkdtemp
+import shutil
+import os
+from array import array
+
+from monasca_agent.collector.checks_d import swift_handoffs
+
+
+class FakeLogger(object):
+    def __init__(self):
+        self.log = {'error': [],
+                    'warning': []}
+
+    def _write_msg(self, msg, key):
+        self.log[key].append(msg)
+
+    def error(self, msg):
+        self._write_msg(msg, 'error')
+
+    def warning(self, msg):
+        self._write_msg(msg, 'warning')
+
+    def get_loglines(self, key):
+        return self.log[key]
+
+
+class MockSwiftHandoffs(swift_handoffs.SwiftHandoffs):
+    def __init__(self):
+        swift_handoffs.swift_loaded = True
+        super(MockSwiftHandoffs, self).__init__(
+            name='swift_handoffs',
+            init_config={},
+            instances=[],
+            agent_config={}
+        )
+        self.log = FakeLogger()
+        self.reset_gauge()
+
+    def gauge(self, key, value, dimensions, *args, **kwargs):
+        self.gauge_called = True
+        self.gauge_calls[key].append(value)
+        for k, v in dimensions.items():
+            self.dimensions[k].add(v)
+
+    def reset_gauge(self):
+        self.gauge_called = False
+        self.gauge_calls = defaultdict(list)
+        self.dimensions = defaultdict(set)
+
+
+class MockRing(object):
+    def __init__(self, *args):
+        self.devs = [
+            {u'device': u'sdb1', u'id': 0, u'ip': u'127.0.0.1',
+             u'meta': u'', u'port': 6010, u'region': 1,
+             u'replication_ip': u'127.0.0.1', u'replication_port': 6010,
+             u'weight': 1.0, u'zone': 1},
+            {u'device': u'sdb2', u'id': 1, u'ip': u'127.0.0.1',
+             u'meta': u'', u'port': 6010, u'region': 1,
+             u'replication_ip': u'127.0.0.1', u'replication_port': 6010,
+             u'weight': 1.0, u'zone': 1},
+            {u'device': u'sdb3', u'id': 2, u'ip': u'127.0.0.2',
+             u'meta': u'', u'port': 6010, u'region': 1,
+             u'replication_ip': u'127.0.0.2', u'replication_port': 6010,
+             u'weight': 1.0, u'zone': 1},
+            {u'device': u'sdb4', u'id': 3, u'ip': u'127.0.0.2',
+             u'meta': u'', u'port': 6010, u'region': 1,
+             u'replication_ip': u'127.0.0.2', u'replication_port': 6010,
+             u'weight': 1.0, u'zone': 1}]
+
+        self._replica2part2dev_id = [
+            array('H', [3, 0, 2, 1, 2, 3, 0, 1, 3, 3, 0, 1, 2, 1, 0, 2]),
+            array('H', [0, 2, 1, 3, 1, 0, 2, 3, 0, 0, 2, 3, 1, 3, 2, 1]),
+            array('H', [2, 1, 3, 0, 3, 2, 1, 0, 2, 2, 1, 0, 3, 0, 1, 3])]
+
+
+class SwiftHandoffsTest(unittest.TestCase):
+    def setUp(self):
+        super(SwiftHandoffsTest, self).setUp()
+        self.swift_handoffs = MockSwiftHandoffs()
+        self.tmpdir = mkdtemp()
+        self.datadir = os.path.join(self.tmpdir, 'datadir')
+        self.ring = os.path.join(self.tmpdir, 'object.ring.gz')
+        os.mkdir(self.datadir)
+        os.mknod(self.ring)
+        self.expected_dev2part = {
+            u'sdb1': {0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14},
+            u'sdb2': {1, 2, 3, 4, 6, 7, 10, 11, 12, 13, 14, 15},
+            u'sdb3': {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 14, 15},
+            u'sdb4': {0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15}}
+
+        self.expected_handoffs = {
+            u'sdb1': {2, 4, 12, 15},
+            u'sdb2': {0, 5, 8, 9},
+            u'sdb3': {3, 7, 11, 13},
+            u'sdb4': {1, 6, 10, 14}}
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir, ignore_errors=True)
+
+    @mock.patch('monasca_agent.collector.checks_d.swift_handoffs.Ring',
+                MockRing)
+    def test_get_ring_and_datadir(self):
+        def do_test(path, expected_ringname, expected_datadir):
+            _ring, name, datadir = swift_handoffs.get_ring_and_datadir(path)
+            self.assertEqual(name, expected_ringname)
+            self.assertEqual(datadir, expected_datadir)
+
+        for prefix in ('/etc/swift/{}', './{}', 'some/other/loc/{}'):
+            test_cases = (
+                (prefix.format('object.ring.gz'), 'object', 'objects'),
+                (prefix.format('object-1.ring.gz'), 'object-1',
+                 'objects-1'),
+                (prefix.format('object-2.ring.gz'), 'object-2',
+                 'objects-2'),
+                (prefix.format('object-50.ring.gz'), 'object-50',
+                 'objects-50'),
+                (prefix.format('container.ring.gz'), 'container',
+                 'containers'),
+                (prefix.format('account.ring.gz'), 'account', 'accounts'))
+            for path, ex_ringname, ex_datadir in test_cases:
+                do_test(path, ex_ringname, ex_datadir)
+
+    def test_check_missing_options(self):
+        # Missing devices (path to the devices mount points), and the
+        # default doesn't exist.
+        instance = {'ring': self.ring}
+        with mock.patch('os.path.exists', return_value=False):
+            self.swift_handoffs.check(instance)
+        self.assertIn('devices must exist and be a directory',
+                      self.swift_handoffs.log.get_loglines('error'))
+        self.swift_handoffs.log = FakeLogger()
+
+        # A devices path that isn't a directory.
+        instance = {'ring': self.ring,
+                    'devices': '{}/random'.format(self.datadir)}
+        with mock.patch('os.path.exists', return_value=True), \
+                mock.patch('os.path.isdir', return_value=False):
+            self.swift_handoffs.check(instance)
+        self.assertIn('devices must exist and be a directory',
+                      self.swift_handoffs.log.get_loglines('error'))
+        self.swift_handoffs.log = FakeLogger()
+
+        # missing ring
+        instance = {'devices': self.datadir}
+        self.swift_handoffs.check(instance)
+        self.assertIn('ring must exist',
+                      self.swift_handoffs.log.get_loglines('error'))
+        self.swift_handoffs.log = FakeLogger()
+
+        instance = {'devices': self.datadir, 'ring': self.ring}
+        with mock.patch('os.path.isfile', return_value=False):
+            self.swift_handoffs.check(instance)
+        self.assertIn('ring must exist',
+                      self.swift_handoffs.log.get_loglines('error'))
+        self.swift_handoffs.log = FakeLogger()
+
+        # granularity defaults to server. If specified it only allows
+        # either 'server' or 'device'. Anything else is an error.
+        instance = {'devices': self.datadir, 'ring': self.ring,
+                    'granularity': 'something else'}
+        self.swift_handoffs.check(instance)
+        self.assertIn("granularity must be either 'server' or 'device'",
+                      self.swift_handoffs.log.get_loglines('error'))
+
+    def setup_partitions(self, devices):
+        for dev in devices:
+            for part in devices[dev]:
+                path = os.path.join(self.datadir, dev, 'objects', str(part))
+                os.makedirs(path)
+
+    @mock.patch('monasca_agent.collector.checks_d.swift_handoffs.Ring',
+                MockRing)
+    def test_all_partitions_in_correct_place(self):
+        self.setup_partitions(self.expected_dev2part)
+        instances = {'devices': self.datadir, 'ring': self.ring,
+                     'granularity': 'device'}
+        self.swift_handoffs.check(instances)
+
+        self.assertTrue(self.swift_handoffs.gauge_called)
+        for metric in ('swift.partitions.primary_count',
+                       'swift.partitions.handoff_count'):
+            # metric was called
+            self.assertIn(metric, self.swift_handoffs.gauge_calls)
+
+            # Each metric was called once per device, so 4 times.
+            self.assertEqual(len(self.swift_handoffs.gauge_calls[metric]), 4)
+
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.primary_count'],
+            [12, 12, 12, 12])
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.handoff_count'],
+            [0, 0, 0, 0])
+
+        # each device should appear in the device dimension
+        self.assertSetEqual(self.swift_handoffs.dimensions['device'],
+                            {'sdb3', 'sdb2', 'sdb1', 'sdb4'})
+
+    @mock.patch('monasca_agent.collector.checks_d.swift_handoffs.Ring',
+                MockRing)
+    def test_all_partitions_and_all_handoffs(self):
+
+        for device in self.expected_dev2part:
+            self.expected_dev2part[device].update(
+                self.expected_handoffs[device])
+        self.setup_partitions(self.expected_dev2part)
+        instances = {'devices': self.datadir, 'ring': self.ring,
+                     'granularity': 'device'}
+        self.swift_handoffs.check(instances)
+
+        self.assertTrue(self.swift_handoffs.gauge_called)
+        for metric in ('swift.partitions.primary_count',
+                       'swift.partitions.handoff_count'):
+            # metric was called
+            self.assertIn(metric, self.swift_handoffs.gauge_calls)
+
+            # Each metric was called once per device, so 4 times.
+            self.assertEqual(len(self.swift_handoffs.gauge_calls[metric]), 4)
+
+        # all primaries were on each drive
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.primary_count'],
+            [12, 12, 12, 12])
+        # and so were 4 handoffs each
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.handoff_count'],
+            [4, 4, 4, 4])
+
+        # each device should appear in the device dimension
+        self.assertSetEqual(self.swift_handoffs.dimensions['device'],
+                            {'sdb3', 'sdb2', 'sdb1', 'sdb4'})
+
+    @mock.patch('monasca_agent.collector.checks_d.swift_handoffs.Ring',
+                MockRing)
+    def test_some_partitions_in_correct_place_no_handoffs(self):
+        # A partition directory will only be created on a drive once an
+        # object in that partition has been PUT into the cluster. So a
+        # partition missing on a drive isn't a failure (though in a real
+        # cluster it would be unusual).
+
+        # Remove a bunch of partitions from each drive.
+        for drive in self.expected_dev2part:
+            self.expected_dev2part[drive].difference_update(
+                list(self.expected_dev2part[drive])[:5])
+        self.setup_partitions(self.expected_dev2part)
+        instances = {'devices': self.datadir, 'ring': self.ring,
+                     'granularity': 'device'}
+        self.swift_handoffs.check(instances)
+
+        self.assertTrue(self.swift_handoffs.gauge_called)
+        for metric in ('swift.partitions.primary_count',
+                       'swift.partitions.handoff_count'):
+            # metric was called
+            self.assertIn(metric, self.swift_handoffs.gauge_calls)
+
+            # Each metric was called once per device, so 4 times.
+            self.assertEqual(len(self.swift_handoffs.gauge_calls[metric]), 4)
+
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.primary_count'],
+            [7, 7, 7, 7])
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.handoff_count'],
+            [0, 0, 0, 0])
+
+        # each device should appear in the device dimension
+        self.assertSetEqual(self.swift_handoffs.dimensions['device'],
+                            {'sdb3', 'sdb2', 'sdb1', 'sdb4'})
+
+    @mock.patch('monasca_agent.collector.checks_d.swift_handoffs.Ring',
+                MockRing)
+    def test_some_partitions_and_some_handoffs_less_devices(self):
+        # A partition directory will only be created on a drive once an
+        # object in that partition has been PUT into the cluster. So a
+        # partition missing on a drive isn't a failure (though in a real
+        # cluster it would be unusual).
+
+        # Remove a bunch of partitions from each drive, and drop 2 of
+        # the devices entirely.
+        for drive in 'sdb1', 'sdb4':
+            self.expected_dev2part.pop(drive)
+
+        for drive in self.expected_dev2part:
+            self.expected_dev2part[drive].difference_update(
+                list(self.expected_dev2part[drive])[:5])
+            self.expected_dev2part[drive].update(
+                list(self.expected_handoffs[drive])[:1])
+        self.setup_partitions(self.expected_dev2part)
+        instances = {'devices': self.datadir, 'ring': self.ring,
+                     'granularity': 'device'}
+        self.swift_handoffs.check(instances)
+
+        self.assertTrue(self.swift_handoffs.gauge_called)
+        for metric in ('swift.partitions.primary_count',
+                       'swift.partitions.handoff_count'):
+            # metric was called
+            self.assertIn(metric, self.swift_handoffs.gauge_calls)
+
+            # Each metric was called once per remaining device, so twice.
+            self.assertEqual(len(self.swift_handoffs.gauge_calls[metric]), 2)
+
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.primary_count'],
+            [7, 7])
+        self.assertListEqual(
+            self.swift_handoffs.gauge_calls['swift.partitions.handoff_count'],
+            [1, 1])
+
+        # each remaining device should appear in the device dimension
+        self.assertSetEqual(self.swift_handoffs.dimensions['device'],
+                            {'sdb3', 'sdb2'})