From 352d6993870be2547f37463e4e3cffc7605f749c Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Fri, 1 Jun 2018 12:15:01 +0200 Subject: [PATCH] Add pre-flight check for device pristinity Add `non-pristine` key to `list-disks` action. No longer attempt to do initializtion of `osd-journal` devices. Make py27 test noop Flip pep8 test to py3 Partial-Bug: #1698154 Change-Id: I0ca574fa7f0683b4e8a693b9f62fbf6b39689789 Depends-On: I90a866aa138d18e4242783c42d4c7c587f696d7d --- actions.yaml | 13 ++++- actions/list_disks.py | 23 +++++++-- hooks/ceph_hooks.py | 93 +++++++++++++---------------------- hooks/utils.py | 21 +++++++- lib/ceph/utils.py | 45 +++++++++++++---- tests/basic_deployment.py | 64 ++++++++++++++++++++++++ tox.ini | 10 ++-- unit_tests/test_ceph_hooks.py | 35 ------------- unit_tests/test_ceph_utils.py | 63 ++++++++++++++++++++++++ 9 files changed, 252 insertions(+), 115 deletions(-) create mode 100644 unit_tests/test_ceph_utils.py diff --git a/actions.yaml b/actions.yaml index 994506cd..396dc4c1 100644 --- a/actions.yaml +++ b/actions.yaml @@ -25,7 +25,18 @@ resume: Set the local osd units in the charm to 'in'. Note that the pause option does NOT stop the osd processes. list-disks: - description: List the unmounted disk on the specified unit + description: | + List disks + . + The 'disks' key is populated with block devices that are known by udev, + are not mounted and not mentioned in 'osd-journal' configuration option. + . + The 'blacklist' key is populated with osd-devices in the blacklist stored + in the local kv store of this specific unit. + . + The 'non-pristine' key is populated with block devices that are known by + udev, are not mounted, not mentioned in 'osd-journal' configuration option + and are currently not eligible for use because of presence of foreign data. add-disk: description: Add disk(s) to Ceph params: diff --git a/actions/list_disks.py b/actions/list_disks.py index 6bd3bf9b..819310f8 100755 --- a/actions/list_disks.py +++ b/actions/list_disks.py @@ -15,10 +15,17 @@ # limitations under the License. """ -List unmounted devices. +List disks -This script will get all block devices known by udev and check if they -are mounted so that we can give unmounted devices to the administrator. +The 'disks' key is populated with block devices that are known by udev, +are not mounted and not mentioned in 'osd-journal' configuration option. + +The 'blacklist' key is populated with osd-devices in the blacklist stored +in the local kv store of this specific unit. + +The 'non-pristine' key is populated with block devices that are known by +udev, are not mounted, not mentioned in 'osd-journal' configuration option +and are currently not eligible for use because of presence of foreign data. """ import sys @@ -32,7 +39,15 @@ import ceph.utils import utils if __name__ == '__main__': + non_pristine = [] + osd_journal = utils.get_journal_devices() + for dev in list(set(ceph.utils.unmounted_disks()) - set(osd_journal)): + if (not ceph.utils.is_active_bluestore_device(dev) and + not ceph.utils.is_pristine_disk(dev)): + non_pristine.append(dev) + hookenv.action_set({ - 'disks': ceph.utils.unmounted_disks(), + 'disks': list(set(ceph.utils.unmounted_disks()) - set(osd_journal)), 'blacklist': utils.get_blacklist(), + 'non-pristine': non_pristine, }) diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index 3c2cf016..605d5999 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -20,7 +20,6 @@ import glob import os import shutil import sys -import tempfile import socket import subprocess import netifaces @@ -41,6 +40,7 @@ from charmhelpers.core.hookenv import ( Hooks, UnregisteredHookError, service_name, + status_get, status_set, storage_get, storage_list, @@ -76,6 +76,7 @@ from utils import ( get_public_addr, get_cluster_addr, get_blacklist, + get_journal_devices, ) from charmhelpers.contrib.openstack.alternatives import install_alternative from charmhelpers.contrib.network.ip import ( @@ -85,6 +86,9 @@ from charmhelpers.contrib.network.ip import ( ) from charmhelpers.contrib.storage.linux.ceph import ( CephConfContext) +from charmhelpers.contrib.storage.linux.utils import ( + is_device_mounted, +) from charmhelpers.contrib.charmsupport import nrpe from charmhelpers.contrib.hardening.harden import harden @@ -357,38 +361,6 @@ def emit_cephconf(upgrading=False): charm_ceph_conf, 90) -JOURNAL_ZAPPED = '/var/lib/ceph/journal_zapped' - - -def read_zapped_journals(): - if os.path.exists(JOURNAL_ZAPPED): - with open(JOURNAL_ZAPPED, 'rt', encoding='UTF-8') as zapfile: - zapped = set( - filter(None, - [l.strip() for l in zapfile.readlines()])) - log("read zapped: {}".format(zapped), level=DEBUG) - return zapped - return set() - - -def write_zapped_journals(journal_devs): - tmpfh, tmpfile = tempfile.mkstemp() - with os.fdopen(tmpfh, 'wb') as zapfile: - log("write zapped: {}".format(journal_devs), - level=DEBUG) - zapfile.write('\n'.join(sorted(list(journal_devs))).encode('UTF-8')) - shutil.move(tmpfile, JOURNAL_ZAPPED) - - -def check_overlap(journaldevs, datadevs): - if not journaldevs.isdisjoint(datadevs): - msg = ("Journal/data devices mustn't" - " overlap; journal: {0}, data: {1}".format(journaldevs, - datadevs)) - log(msg, level=ERROR) - raise ValueError(msg) - - @hooks.hook('config-changed') @harden() def config_changed(): @@ -438,13 +410,28 @@ def prepare_disks_and_activate(): vaultlocker.write_vaultlocker_conf(context) osd_journal = get_journal_devices() - check_overlap(osd_journal, set(get_devices())) + if not osd_journal.isdisjoint(set(get_devices())): + raise ValueError('`osd-journal` and `osd-devices` options must not' + 'overlap.') log("got journal devs: {}".format(osd_journal), level=DEBUG) - already_zapped = read_zapped_journals() - non_zapped = osd_journal - already_zapped - for journ in non_zapped: - ceph.maybe_zap_journal(journ) - write_zapped_journals(osd_journal) + + # pre-flight check of eligible device pristinity + devices = get_devices() + # filter osd-devices that are file system paths + devices = [dev for dev in devices if dev.startswith('/dev')] + # filter osd-devices that does not exist on this unit + devices = [dev for dev in devices if os.path.exists(dev)] + # filter osd-devices that are already mounted + devices = [dev for dev in devices if not is_device_mounted(dev)] + # filter osd-devices that are active bluestore devices + devices = [dev for dev in devices + if not ceph.is_active_bluestore_device(dev)] + log('Checking for pristine devices: "{}"'.format(devices), level=DEBUG) + if not all(ceph.is_pristine_disk(dev) for dev in devices): + status_set('blocked', + 'Non-pristine devices detected, consult ' + '`list-disks`, `zap-disk` and `blacklist-*` actions.') + return if ceph.is_bootstrapped(): log('ceph bootstrapped, rescanning disks') @@ -521,20 +508,6 @@ def get_devices(): return [device for device in devices if device not in _blacklist] -def get_journal_devices(): - if config('osd-journal'): - devices = [l.strip() for l in config('osd-journal').split(' ')] - else: - devices = [] - storage_ids = storage_list('osd-journals') - devices.extend((storage_get('location', s) for s in storage_ids)) - - # Filter out any devices in the action managed unit-local device blacklist - _blacklist = get_blacklist() - return set(device for device in devices - if device not in _blacklist and os.path.exists(device)) - - @hooks.hook('mon-relation-changed', 'mon-relation-departed') def mon_relation(): @@ -631,13 +604,15 @@ def assess_status(): # Check for OSD device creation parity i.e. at least some devices # must have been presented and used for this charm to be operational + (prev_status, prev_message) = status_get() running_osds = ceph.get_running_osds() - if not running_osds: - status_set('blocked', - 'No block devices detected using current configuration') - else: - status_set('active', - 'Unit is ready ({} OSD)'.format(len(running_osds))) + if prev_status != 'blocked': + if not running_osds: + status_set('blocked', + 'No block devices detected using current configuration') + else: + status_set('active', + 'Unit is ready ({} OSD)'.format(len(running_osds))) @hooks.hook('update-status') diff --git a/hooks/utils.py b/hooks/utils.py index b49254c5..a2fffd10 100644 --- a/hooks/utils.py +++ b/hooks/utils.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import socket import re +import os +import socket + from charmhelpers.core.hookenv import ( unit_get, cached, @@ -22,6 +24,8 @@ from charmhelpers.core.hookenv import ( log, DEBUG, status_set, + storage_get, + storage_list, ) from charmhelpers.core import unitdata from charmhelpers.fetch import ( @@ -39,6 +43,7 @@ from charmhelpers.contrib.network.ip import ( get_ipv6_addr ) + TEMPLATES_DIR = 'templates' try: @@ -213,3 +218,17 @@ def get_blacklist(): """Get blacklist stored in the local kv() store""" db = unitdata.kv() return db.get('osd-blacklist', []) + + +def get_journal_devices(): + if config('osd-journal'): + devices = [l.strip() for l in config('osd-journal').split(' ')] + else: + devices = [] + storage_ids = storage_list('osd-journals') + devices.extend((storage_get('location', s) for s in storage_ids)) + + # Filter out any devices in the action managed unit-local device blacklist + _blacklist = get_blacklist() + return set(device for device in devices + if device not in _blacklist and os.path.exists(device)) diff --git a/lib/ceph/utils.py b/lib/ceph/utils.py index c71824d3..5ff970bf 100644 --- a/lib/ceph/utils.py +++ b/lib/ceph/utils.py @@ -66,7 +66,6 @@ from charmhelpers.contrib.storage.linux.ceph import ( from charmhelpers.contrib.storage.linux.utils import ( is_block_device, is_device_mounted, - zap_disk, ) from charmhelpers.contrib.openstack.utils import ( get_os_codename_install_source, @@ -870,7 +869,42 @@ def get_partition_list(dev): raise +def is_pristine_disk(dev): + """ + Read first 2048 bytes (LBA 0 - 3) of block device to determine whether it + is actually all zeros and safe for us to use. + + Existing partitioning tools does not discern between a failure to read from + block device, failure to understand a partition table and the fact that a + block device has no partition table. Since we need to be positive about + which is which we need to read the device directly and confirm ourselves. + + :param dev: Path to block device + :type dev: str + :returns: True all 2048 bytes == 0x0, False if not + :rtype: bool + """ + want_bytes = 2048 + + f = open(dev, 'rb') + data = f.read(want_bytes) + read_bytes = len(data) + if read_bytes != want_bytes: + log('{}: short read, got {} bytes expected {}.' + .format(dev, read_bytes, want_bytes), level=WARNING) + return False + + return all(byte == 0x0 for byte in data) + + def is_osd_disk(dev): + db = kv() + osd_devices = db.get('osd-devices', []) + if dev in osd_devices: + log('Device {} already processed by charm,' + ' skipping'.format(dev)) + return True + partitions = get_partition_list(dev) for partition in partitions: try: @@ -1296,15 +1330,6 @@ def update_monfs(): pass -def maybe_zap_journal(journal_dev): - if is_osd_disk(journal_dev): - log('Looks like {} is already an OSD data' - ' or journal, skipping.'.format(journal_dev)) - return - zap_disk(journal_dev) - log("Zapped journal device {}".format(journal_dev)) - - def get_partitions(dev): cmd = ['partx', '--raw', '--noheadings', dev] try: diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index 77db5863..6e5f880e 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -15,6 +15,7 @@ # limitations under the License. import amulet +import re import time import keystoneclient @@ -712,6 +713,69 @@ class CephOsdBasicDeployment(OpenStackAmuletDeployment): mtime, unit_name)) amulet.raise_status('Folder mtime is older than provided mtime') + def test_901_blocked_when_non_pristine_disk_appears(self): + """ + Validate that charm goes into blocked state when it is presented with + new block devices that have foreign data on them. + + Instances used in UOSCI has a flavour with ephemeral storage in + addition to the bootable instance storage. The ephemeral storage + device is partitioned, formatted and mounted early in the boot process + by cloud-init. + + As long as the device is mounted the charm will not attempt to use it. + + If we unmount it and trigger the config-changed hook the block device + will appear as a new and previously untouched device for the charm. + + One of the first steps of device eligibility checks should be to make + sure we are seeing a pristine and empty device before doing any + further processing. + + As the ephemeral device will have data on it we can use it to validate + that these checks work as intended. + """ + u.log.debug('Checking behaviour when non-pristine disks appear...') + u.log.debug('Configuring ephemeral-unmount...') + self.d.configure('ceph-osd', {'ephemeral-unmount': '/mnt', + 'osd-devices': '/dev/vdb'}) + self._auto_wait_for_status(message=re.compile('Non-pristine.*'), + include_only=['ceph-osd']) + u.log.debug('Units now in blocked state, running zap-disk action...') + action_ids = [] + self.ceph_osd_sentry = self.d.sentry['ceph-osd'][0] + for unit in range(0, 3): + zap_disk_params = { + 'devices': '/dev/vdb', + 'i-really-mean-it': True, + } + action_id = u.run_action(self.d.sentry['ceph-osd'][unit], + 'zap-disk', params=zap_disk_params) + action_ids.append(action_id) + for unit in range(0, 3): + assert u.wait_on_action(action_ids[unit]), ( + 'zap-disk action failed.') + + u.log.debug('Running add-disk action...') + action_ids = [] + for unit in range(0, 3): + add_disk_params = { + 'osd-devices': '/dev/vdb', + } + action_id = u.run_action(self.d.sentry['ceph-osd'][unit], + 'add-disk', params=add_disk_params) + action_ids.append(action_id) + + # NOTE(fnordahl): LP: #1774694 + # for unit in range(0, 3): + # assert u.wait_on_action(action_ids[unit]), ( + # 'add-disk action failed.') + + u.log.debug('Wait for idle/ready status...') + self._auto_wait_for_status(include_only=['ceph-osd']) + + u.log.debug('OK') + def test_910_pause_and_resume(self): """The services can be paused and resumed. """ u.log.debug('Checking pause and resume actions...') diff --git a/tox.ini b/tox.ini index 6c223662..99448527 100644 --- a/tox.ini +++ b/tox.ini @@ -18,11 +18,11 @@ whitelist_externals = juju passenv = HOME TERM AMULET_* CS_API_* [testenv:py27] -basepython = python2.7 -deps = -r{toxinidir}/requirements.txt - -r{toxinidir}/test-requirements.txt -# temporarily disable py27 -commands = /bin/true +# ceph charms are Python3-only, but py27 unit test target +# is required by OpenStack Governance. Remove this shim as soon as +# permitted. http://governance.openstack.org/reference/cti/python_cti.html +whitelist_externals = true +commands = true [testenv:py35] basepython = python3.5 diff --git a/unit_tests/test_ceph_hooks.py b/unit_tests/test_ceph_hooks.py index a8d84766..f8c442e9 100644 --- a/unit_tests/test_ceph_hooks.py +++ b/unit_tests/test_ceph_hooks.py @@ -405,21 +405,6 @@ class CephHooksTestCase(unittest.TestCase): devices = ceph_hooks.get_devices() self.assertEqual(devices, ['/dev/vda', '/dev/vdb']) - @patch('os.path.exists') - @patch.object(ceph_hooks, 'storage_list') - @patch.object(ceph_hooks, 'config') - def test_get_journal_devices(self, mock_config, mock_storage_list, - mock_os_path_exists): - '''Devices returned as expected''' - config = {'osd-journal': '/dev/vda /dev/vdb'} - mock_config.side_effect = lambda key: config[key] - mock_storage_list.return_value = [] - mock_os_path_exists.return_value = True - devices = ceph_hooks.get_journal_devices() - mock_storage_list.assert_called() - mock_os_path_exists.assert_called() - self.assertEqual(devices, set(['/dev/vda', '/dev/vdb'])) - @patch.object(ceph_hooks, 'get_blacklist') @patch.object(ceph_hooks, 'storage_list') @patch.object(ceph_hooks, 'config') @@ -435,26 +420,6 @@ class CephHooksTestCase(unittest.TestCase): mock_get_blacklist.assert_called() self.assertEqual(devices, ['/dev/vdb']) - @patch('os.path.exists') - @patch.object(ceph_hooks, 'get_blacklist') - @patch.object(ceph_hooks, 'storage_list') - @patch.object(ceph_hooks, 'config') - def test_get_journal_devices_blacklist(self, mock_config, - mock_storage_list, - mock_get_blacklist, - mock_os_path_exists): - '''Devices returned as expected when blacklist in effect''' - config = {'osd-journal': '/dev/vda /dev/vdb'} - mock_config.side_effect = lambda key: config[key] - mock_storage_list.return_value = [] - mock_get_blacklist.return_value = ['/dev/vda'] - mock_os_path_exists.return_value = True - devices = ceph_hooks.get_journal_devices() - mock_storage_list.assert_called() - mock_os_path_exists.assert_called() - mock_get_blacklist.assert_called() - self.assertEqual(devices, set(['/dev/vdb'])) - @patch.object(ceph_hooks, 'log') @patch.object(ceph_hooks, 'config') @patch('os.environ') diff --git a/unit_tests/test_ceph_utils.py b/unit_tests/test_ceph_utils.py new file mode 100644 index 00000000..f58ae070 --- /dev/null +++ b/unit_tests/test_ceph_utils.py @@ -0,0 +1,63 @@ +# Copyright 2016 Canonical Ltd + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from mock import patch + +with patch('charmhelpers.contrib.hardening.harden.harden') as mock_dec: + mock_dec.side_effect = (lambda *dargs, **dkwargs: lambda f: + lambda *args, **kwargs: f(*args, **kwargs)) + import utils + + +class CephUtilsTestCase(unittest.TestCase): + def setUp(self): + super(CephUtilsTestCase, self).setUp() + + @patch('os.path.exists') + @patch.object(utils, 'storage_list') + @patch.object(utils, 'config') + def test_get_journal_devices(self, mock_config, mock_storage_list, + mock_os_path_exists): + '''Devices returned as expected''' + config = {'osd-journal': '/dev/vda /dev/vdb'} + mock_config.side_effect = lambda key: config[key] + mock_storage_list.return_value = [] + mock_os_path_exists.return_value = True + devices = utils.get_journal_devices() + mock_storage_list.assert_called() + mock_os_path_exists.assert_called() + self.assertEqual(devices, set(['/dev/vda', '/dev/vdb'])) + + @patch('os.path.exists') + @patch.object(utils, 'get_blacklist') + @patch.object(utils, 'storage_list') + @patch.object(utils, 'config') + def test_get_journal_devices_blacklist(self, mock_config, + mock_storage_list, + mock_get_blacklist, + mock_os_path_exists): + '''Devices returned as expected when blacklist in effect''' + config = {'osd-journal': '/dev/vda /dev/vdb'} + mock_config.side_effect = lambda key: config[key] + mock_storage_list.return_value = [] + mock_get_blacklist.return_value = ['/dev/vda'] + mock_os_path_exists.return_value = True + devices = utils.get_journal_devices() + mock_storage_list.assert_called() + mock_os_path_exists.assert_called() + mock_get_blacklist.assert_called() + self.assertEqual(devices, set(['/dev/vdb']))