From 8e651a2d3dbc6b66cd26b38c13889c6b313f1971 Mon Sep 17 00:00:00 2001 From: Samuel Merritt Date: Mon, 5 Feb 2018 14:06:18 -0800 Subject: [PATCH] Add fallocate_reserve to account and container servers. The object server can be configured to leave a certain amount of disk space free; default is 1%. This is useful in avoiding 100%-full filesystems, as those can get Swift in a state where the filesystem is too full to write tombstones, so you can't delete objects to free up space. When a cluster has accounts/containers and objects on the same disks, then you can wind up with a 100%-full disk since account and container servers don't respect fallocate_reserve. This commit makes account and container servers respect fallocate_reserve so that disks shared between account/container and object rings won't get 100% full. When a disk's free space falls below the configured reserve, account and container PUT, POST, and REPLICATE requests will fail with a 507 status code. These are the operations that can significantly increase the disk space used by a given database. I called the parameter "fallocate_reserve" for consistency with the object server. No actual fallocate() call happens under Swift's control in the account or container servers (sqlite3 might make such a call, but it's out of our hands). Change-Id: I083442eef14bf83c0ea717b1decb3e6b56dbf1d0 --- doc/source/admin_guide.rst | 25 +++++++-- etc/account-server.conf-sample | 7 +++ etc/container-server.conf-sample | 7 +++ swift/account/server.py | 16 +++++- swift/common/db_replicator.py | 7 +++ swift/common/utils.py | 29 ++++++++++ swift/container/server.py | 18 ++++++- test/unit/account/test_server.py | 86 +++++++++++++++++++++++++++++ test/unit/common/test_utils.py | 46 ++++++++++++++++ test/unit/container/test_server.py | 87 ++++++++++++++++++++++++++++++ 10 files changed, 321 insertions(+), 7 deletions(-) diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index 2e3500127e..ce8612eedc 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -298,10 +298,27 @@ Preventing Disk Full Scenarios Prevent disk full scenarios by ensuring that the ``proxy-server`` blocks PUT requests and rsync prevents replication to the specific drives. -You can prevent `proxy-server` PUT requests to low space disks by ensuring -``fallocate_reserve`` is set in the ``object-server.conf``. By default, -``fallocate_reserve`` is set to 1%. This blocks PUT requests that leave the -free disk space below 1% of the disk. +You can prevent `proxy-server` PUT requests to low space disks by +ensuring ``fallocate_reserve`` is set in ``account-server.conf``, +``container-server.conf``, and ``object-server.conf``. By default, +``fallocate_reserve`` is set to 1%. In the object server, this blocks +PUT requests that would leave the free disk space below 1% of the +disk. In the account and container servers, this blocks operations +that will increase account or container database size once the free +disk space falls below 1%. + +Setting ``fallocate_reserve`` is highly recommended to avoid filling +disks to 100%. When Swift's disks are completely full, all requests +involving those disks will fail, including DELETE requests that would +otherwise free up space. This is because object deletion includes the +creation of a zero-byte tombstone (.ts) to record the time of the +deletion for replication purposes; this happens prior to deletion of +the object's data. On a completely-full filesystem, that zero-byte .ts +file cannot be created, so the DELETE request will fail and the disk +will remain completely full. If ``fallocate_reserve`` is set, then the +filesystem will have enough space to create the zero-byte .ts file, +and thus the deletion of the object will succeed and free up some +space. In order to prevent rsync replication to specific drives, firstly setup ``rsync_module`` per disk in your ``object-replicator``. diff --git a/etc/account-server.conf-sample b/etc/account-server.conf-sample index 86200505ea..47df56ecf3 100644 --- a/etc/account-server.conf-sample +++ b/etc/account-server.conf-sample @@ -97,6 +97,13 @@ use = egg:swift#account # Work only with ionice_class. # ionice_class = # ionice_priority = +# +# You can set fallocate_reserve to the number of bytes or percentage +# of disk space you'd like kept free at all times. If the disk's free +# space falls below this value, then PUT, POST, and REPLICATE requests +# will be denied until the disk ha s more space available. Percentage +# will be used if the value ends with a '%'. +# fallocate_reserve = 1% [filter:healthcheck] use = egg:swift#healthcheck diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample index 7d38deb0c5..1c7fdd0b64 100644 --- a/etc/container-server.conf-sample +++ b/etc/container-server.conf-sample @@ -110,6 +110,13 @@ use = egg:swift#container # Work only with ionice_class. # ionice_class = # ionice_priority = +# +# You can set fallocate_reserve to the number of bytes or percentage +# of disk space you'd like kept free at all times. If the disk's free +# space falls below this value, then PUT, POST, and REPLICATE requests +# will be denied until the disk ha s more space available. Percentage +# will be used if the value ends with a '%'. +# fallocate_reserve = 1% [filter:healthcheck] use = egg:swift#healthcheck diff --git a/swift/account/server.py b/swift/account/server.py index cd284aae34..586e7dfab2 100644 --- a/swift/account/server.py +++ b/swift/account/server.py @@ -28,7 +28,8 @@ from swift.common.request_helpers import get_param, \ split_and_validate_path from swift.common.utils import get_logger, hash_path, public, \ Timestamp, storage_directory, config_true_value, \ - json, timing_stats, replication, get_log_line + json, timing_stats, replication, get_log_line, \ + config_fallocate_value, fs_has_free_space from swift.common.constraints import valid_timestamp, check_utf8, check_drive from swift.common import constraints from swift.common.db_replicator import ReplicatorRpc @@ -60,6 +61,8 @@ class AccountController(BaseStorageServer): conf.get('auto_create_account_prefix') or '.' swift.common.db.DB_PREALLOCATION = \ config_true_value(conf.get('db_preallocation', 'f')) + self.fallocate_reserve, self.fallocate_is_percent = \ + config_fallocate_value(conf.get('fallocate_reserve', '1%')) def _get_account_broker(self, drive, part, account, **kwargs): hsh = hash_path(account) @@ -83,6 +86,11 @@ class AccountController(BaseStorageServer): pass return resp(request=req, headers=headers, charset='utf-8', body=body) + def check_free_space(self, drive): + drive_root = os.path.join(self.root, drive) + return fs_has_free_space( + drive_root, self.fallocate_reserve, self.fallocate_is_percent) + @public @timing_stats() def DELETE(self, req): @@ -108,6 +116,8 @@ class AccountController(BaseStorageServer): check_drive(self.root, drive, self.mount_check) except ValueError: return HTTPInsufficientStorage(drive=drive, request=req) + if not self.check_free_space(drive): + return HTTPInsufficientStorage(drive=drive, request=req) if container: # put account container if 'x-timestamp' not in req.headers: timestamp = Timestamp.now() @@ -237,6 +247,8 @@ class AccountController(BaseStorageServer): check_drive(self.root, drive, self.mount_check) except ValueError: return HTTPInsufficientStorage(drive=drive, request=req) + if not self.check_free_space(drive): + return HTTPInsufficientStorage(drive=drive, request=req) try: args = json.load(req.environ['wsgi.input']) except ValueError as err: @@ -255,6 +267,8 @@ class AccountController(BaseStorageServer): check_drive(self.root, drive, self.mount_check) except ValueError: return HTTPInsufficientStorage(drive=drive, request=req) + if not self.check_free_space(drive): + return HTTPInsufficientStorage(drive=drive, request=req) broker = self._get_account_broker(drive, part, account) if broker.is_deleted(): return self._deleted_response(broker, req, HTTPNotFound) diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index 334f80b162..457a06379e 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -918,6 +918,13 @@ class ReplicatorRpc(object): quarantine_db(broker.db_file, broker.db_type) return HTTPNotFound() raise + # TODO(mattoliverau) At this point in the RPC, we have the callers + # replication info and ours, so it would be cool to be able to make + # an educated guess here on the size of the incoming replication (maybe + # average object table row size * difference in ROWIDs or something) + # and the fallocate_reserve setting so we could return a 507. + # This would make db fallocate_reserve more or less on par with the + # object's. if remote_info['metadata']: with self.debug_timing('update_metadata'): broker.update_metadata(remote_info['metadata']) diff --git a/swift/common/utils.py b/swift/common/utils.py index 64717f91d2..88a6cc1e42 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -759,6 +759,35 @@ class FileLikeIter(object): self.closed = True +def fs_has_free_space(fs_path, space_needed, is_percent): + """ + Check to see whether or not a filesystem has the given amount of space + free. Unlike fallocate(), this does not reserve any space. + + :param fs_path: path to a file or directory on the filesystem; typically + the path to the filesystem's mount point + + :param space_needed: minimum bytes or percentage of free space + + :param is_percent: if True, then space_needed is treated as a percentage + of the filesystem's capacity; if False, space_needed is a number of + free bytes. + + :returns: True if the filesystem has at least that much free space, + False otherwise + + :raises OSError: if fs_path does not exist + """ + st = os.statvfs(fs_path) + free_bytes = st.f_frsize * st.f_bavail + if is_percent: + size_bytes = st.f_frsize * st.f_blocks + free_percent = float(free_bytes) / float(size_bytes) * 100 + return free_percent >= space_needed + else: + return free_bytes >= space_needed + + class FallocateWrapper(object): def __init__(self, noop=False): diff --git a/swift/container/server.py b/swift/container/server.py index fc9b60be3c..4bbbf69fab 100644 --- a/swift/container/server.py +++ b/swift/container/server.py @@ -34,8 +34,9 @@ from swift.common.request_helpers import get_param, \ from swift.common.utils import get_logger, hash_path, public, \ Timestamp, storage_directory, validate_sync_to, \ config_true_value, timing_stats, replication, \ - override_bytes_from_content_type, get_log_line, ShardRange, list_from_csv - + override_bytes_from_content_type, get_log_line, \ + config_fallocate_value, fs_has_free_space, list_from_csv, \ + ShardRange from swift.common.constraints import valid_timestamp, check_utf8, check_drive from swift.common import constraints from swift.common.bufferedhttp import http_connect @@ -124,6 +125,8 @@ class ContainerController(BaseStorageServer): self.sync_store = ContainerSyncStore(self.root, self.logger, self.mount_check) + self.fallocate_reserve, self.fallocate_is_percent = \ + config_fallocate_value(conf.get('fallocate_reserve', '1%')) def _get_container_broker(self, drive, part, account, container, **kwargs): """ @@ -298,6 +301,11 @@ class ContainerController(BaseStorageServer): req.environ['swift.leave_relative_location'] = True return HTTPMovedPermanently(headers=headers, request=req) + def check_free_space(self, drive): + drive_root = os.path.join(self.root, drive) + return fs_has_free_space( + drive_root, self.fallocate_reserve, self.fallocate_is_percent) + @public @timing_stats() def DELETE(self, req): @@ -438,6 +446,8 @@ class ContainerController(BaseStorageServer): check_drive(self.root, drive, self.mount_check) except ValueError: return HTTPInsufficientStorage(drive=drive, request=req) + if not self.check_free_space(drive): + return HTTPInsufficientStorage(drive=drive, request=req) requested_policy_index = self.get_and_validate_policy_index(req) broker = self._get_container_broker(drive, part, account, container) if obj: # put container object @@ -726,6 +736,8 @@ class ContainerController(BaseStorageServer): check_drive(self.root, drive, self.mount_check) except ValueError: return HTTPInsufficientStorage(drive=drive, request=req) + if not self.check_free_space(drive): + return HTTPInsufficientStorage(drive=drive, request=req) try: args = json.load(req.environ['wsgi.input']) except ValueError as err: @@ -750,6 +762,8 @@ class ContainerController(BaseStorageServer): check_drive(self.root, drive, self.mount_check) except ValueError: return HTTPInsufficientStorage(drive=drive, request=req) + if not self.check_free_space(drive): + return HTTPInsufficientStorage(drive=drive, request=req) broker = self._get_container_broker(drive, part, account, container) if broker.is_deleted(): return HTTPNotFound(request=req) diff --git a/test/unit/account/test_server.py b/test/unit/account/test_server.py index 4a8f58cb05..61cef2ebe9 100644 --- a/test/unit/account/test_server.py +++ b/test/unit/account/test_server.py @@ -16,6 +16,7 @@ import errno import os import mock +import posix import unittest from tempfile import mkdtemp from shutil import rmtree @@ -192,6 +193,33 @@ class TestAccountController(unittest.TestCase): resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 400) + def test_REPLICATE_insufficient_space(self): + conf = {'devices': self.testdir, + 'mount_check': 'false', + 'fallocate_reserve': '2%'} + account_controller = AccountController(conf) + + req = Request.blank('/sda1/p/a', + environ={'REQUEST_METHOD': 'REPLICATE'}) + statvfs_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 59000, # f_bfree + 57000, # f_bavail (just under 2% free) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', + return_value=statvfs_result) as mock_statvfs: + resp = req.get_response(account_controller) + self.assertEqual(resp.status_int, 507) + self.assertEqual(mock_statvfs.mock_calls, + [mock.call(os.path.join(self.testdir, 'sda1'))]) + def test_REPLICATE_rsync_then_merge_works(self): def fake_rsync_then_merge(self, drive, db_file, args): return HTTPNoContent() @@ -379,6 +407,35 @@ class TestAccountController(unittest.TestCase): self.assertEqual(resp.status_int, 404) self.assertNotIn('X-Account-Status', resp.headers) + def test_PUT_insufficient_space(self): + conf = {'devices': self.testdir, + 'mount_check': 'false', + 'fallocate_reserve': '2%'} + account_controller = AccountController(conf) + + req = Request.blank( + '/sda1/p/a', + environ={'REQUEST_METHOD': 'PUT'}, + headers={'X-Timestamp': '1517612949.541469'}) + statvfs_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 59000, # f_bfree + 57000, # f_bavail (just under 2% free) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', + return_value=statvfs_result) as mock_statvfs: + resp = req.get_response(account_controller) + self.assertEqual(resp.status_int, 507) + self.assertEqual(mock_statvfs.mock_calls, + [mock.call(os.path.join(self.testdir, 'sda1'))]) + def test_PUT(self): req = Request.blank('/sda1/p/a', environ={'REQUEST_METHOD': 'PUT', 'HTTP_X_TIMESTAMP': '0'}) @@ -701,6 +758,35 @@ class TestAccountController(unittest.TestCase): resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 400) + def test_POST_insufficient_space(self): + conf = {'devices': self.testdir, + 'mount_check': 'false', + 'fallocate_reserve': '2%'} + account_controller = AccountController(conf) + + req = Request.blank( + '/sda1/p/a', + environ={'REQUEST_METHOD': 'POST'}, + headers={'X-Timestamp': '1517611584.937603'}) + statvfs_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 59000, # f_bfree + 57000, # f_bavail (just under 2% free) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', + return_value=statvfs_result) as mock_statvfs: + resp = req.get_response(account_controller) + self.assertEqual(resp.status_int, 507) + self.assertEqual(mock_statvfs.mock_calls, + [mock.call(os.path.join(self.testdir, 'sda1'))]) + def test_POST_timestamp_not_float(self): req = Request.blank('/sda1/p/a', environ={'REQUEST_METHOD': 'POST', 'HTTP_X_TIMESTAMP': '0'}, diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index f399e7b98f..f9e9169cf0 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -34,6 +34,7 @@ import logging import platform import os import mock +import posix import pwd import random import re @@ -6680,6 +6681,51 @@ class TestHashForFileFunction(unittest.TestCase): '\n'.join(failures)) +class TestFsHasFreeSpace(unittest.TestCase): + def test_bytes(self): + fake_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 1984802, # f_bfree (free blocks for root) + 1728089, # f_bavail (free blocks for non-root) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', return_value=fake_result): + self.assertTrue(utils.fs_has_free_space("/", 0, False)) + self.assertTrue(utils.fs_has_free_space("/", 1, False)) + # free space left = f_bavail * f_bsize = 7078252544 + self.assertTrue(utils.fs_has_free_space("/", 7078252544, False)) + self.assertFalse(utils.fs_has_free_space("/", 7078252545, False)) + self.assertFalse(utils.fs_has_free_space("/", 2 ** 64, False)) + + def test_percent(self): + fake_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 1984802, # f_bfree (free blocks for root) + 1728089, # f_bavail (free blocks for non-root) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', return_value=fake_result): + self.assertTrue(utils.fs_has_free_space("/", 0, True)) + self.assertTrue(utils.fs_has_free_space("/", 1, True)) + # percentage of free space for the faked statvfs is 60% + self.assertTrue(utils.fs_has_free_space("/", 60, True)) + self.assertFalse(utils.fs_has_free_space("/", 61, True)) + self.assertFalse(utils.fs_has_free_space("/", 100, True)) + self.assertFalse(utils.fs_has_free_space("/", 110, True)) + + class TestSetSwiftDir(unittest.TestCase): def setUp(self): self.swift_dir = tempfile.mkdtemp() diff --git a/test/unit/container/test_server.py b/test/unit/container/test_server.py index 20a2fec7e0..7041de20e5 100644 --- a/test/unit/container/test_server.py +++ b/test/unit/container/test_server.py @@ -16,6 +16,7 @@ import operator import os +import posix import mock import unittest import itertools @@ -426,6 +427,35 @@ class TestContainerController(unittest.TestCase): resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 202) + def test_PUT_insufficient_space(self): + conf = {'devices': self.testdir, + 'mount_check': 'false', + 'fallocate_reserve': '2%'} + container_controller = container_server.ContainerController(conf) + + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'PUT'}, + headers={'X-Timestamp': '1517617825.74832'}) + statvfs_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 59000, # f_bfree + 57000, # f_bavail (just under 2% free) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', + return_value=statvfs_result) as mock_statvfs: + resp = req.get_response(container_controller) + self.assertEqual(resp.status_int, 507) + self.assertEqual(mock_statvfs.mock_calls, + [mock.call(os.path.join(self.testdir, 'sda1'))]) + def test_PUT_simulated_create_race(self): state = ['initial'] @@ -1001,6 +1031,35 @@ class TestContainerController(unittest.TestCase): resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 400) + def test_POST_insufficient_space(self): + conf = {'devices': self.testdir, + 'mount_check': 'false', + 'fallocate_reserve': '2%'} + container_controller = container_server.ContainerController(conf) + + req = Request.blank( + '/sda1/p/a/c', + environ={'REQUEST_METHOD': 'POST'}, + headers={'X-Timestamp': '1517618035.469202'}) + statvfs_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 59000, # f_bfree + 57000, # f_bavail (just under 2% free) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', + return_value=statvfs_result) as mock_statvfs: + resp = req.get_response(container_controller) + self.assertEqual(resp.status_int, 507) + self.assertEqual(mock_statvfs.mock_calls, + [mock.call(os.path.join(self.testdir, 'sda1'))]) + def test_POST_timestamp_not_float(self): req = Request.blank('/sda1/p/a/c', environ={'REQUEST_METHOD': 'PUT', 'HTTP_X_TIMESTAMP': '0'}) @@ -1388,6 +1447,34 @@ class TestContainerController(unittest.TestCase): resp = req.get_response(self.controller) self.assertEqual(resp.status_int, 500) + def test_REPLICATE_insufficient_space(self): + conf = {'devices': self.testdir, + 'mount_check': 'false', + 'fallocate_reserve': '2%'} + container_controller = container_server.ContainerController(conf) + + req = Request.blank( + '/sda1/p/a/', + environ={'REQUEST_METHOD': 'REPLICATE'}) + statvfs_result = posix.statvfs_result([ + 4096, # f_bsize + 4096, # f_frsize + 2854907, # f_blocks + 59000, # f_bfree + 57000, # f_bavail (just under 2% free) + 1280000, # f_files + 1266040, # f_ffree, + 1266040, # f_favail, + 4096, # f_flag + 255, # f_namemax + ]) + with mock.patch('os.statvfs', + return_value=statvfs_result) as mock_statvfs: + resp = req.get_response(container_controller) + self.assertEqual(resp.status_int, 507) + self.assertEqual(mock_statvfs.mock_calls, + [mock.call(os.path.join(self.testdir, 'sda1'))]) + def test_DELETE(self): ts_iter = make_timestamp_iter() req = Request.blank(