From 06cf5d298fb6b103899aa358e1cb4b828f502dc5 Mon Sep 17 00:00:00 2001 From: Clay Gerrard Date: Mon, 29 Oct 2018 14:49:48 -0500 Subject: [PATCH] Add databases_per_second to db daemons Most daemons have a "go as fast as you can then sleep for 30 seconds" strategy towards resource utilization; the object-updater and object-auditor however have some "X_per_second" options that allow operators much better control over how they spend their I/O budget. This change extends that pattern into the account-replicator, container-replicator, and container-sharder which have been known to peg CPUs when they're not IO limited. Partial-Bug: #1784753 Change-Id: Ib7f2497794fa2f384a1a6ab500b657c624426384 --- doc/source/deployment_guide.rst | 350 +++++++++++++------------ etc/account-server.conf-sample | 3 + etc/container-server.conf-sample | 6 + swift/common/db_replicator.py | 11 +- swift/container/sharder.py | 4 +- test/unit/common/test_db_replicator.py | 1 + test/unit/container/test_sharder.py | 56 +++- 7 files changed, 253 insertions(+), 178 deletions(-) diff --git a/doc/source/deployment_guide.rst b/doc/source/deployment_guide.rst index ee580669d4..dbadef4670 100644 --- a/doc/source/deployment_guide.rst +++ b/doc/source/deployment_guide.rst @@ -1173,94 +1173,98 @@ ionice_priority None I/O scheduling priority of ser [container-replicator] ********************** -================== =========================== ============================= -Option Default Description ------------------- --------------------------- ----------------------------- -log_name container-replicator Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -log_address /dev/log Logging directory -per_diff 1000 Maximum number of database - rows that will be sync'd in a - single HTTP replication - request. Databases with less - than or equal to this number - of differing rows will always - be sync'd using an HTTP - replication request rather - than using rsync. -max_diffs 100 Maximum number of HTTP - replication requests attempted - on each replication pass for - any one container. This caps - how long the replicator will - spend trying to sync a given - database per pass so the other - databases don't get starved. -concurrency 8 Number of replication workers - to spawn -interval 30 Time in seconds to wait - between replication passes -node_timeout 10 Request timeout to external - services -conn_timeout 0.5 Connection timeout to external - services -reclaim_age 604800 Time elapsed in seconds before - a container can be reclaimed -rsync_module {replication_ip}::container Format of the rsync module - where the replicator will send - data. The configuration value - can include some variables - that will be extracted from - the ring. Variables must - follow the format {NAME} where - NAME is one of: ip, port, - replication_ip, - replication_port, region, - zone, device, meta. See - etc/rsyncd.conf-sample for - some examples. -rsync_compress no Allow rsync to compress data - which is transmitted to - destination node during sync. - However, this is applicable - only when destination node is - in a different region than the - local one. NOTE: Objects that - are already compressed (for - example: .tar.gz, mp3) might - slow down the syncing process. -recon_cache_path /var/cache/swift Path to recon cache -nice_priority None Scheduling priority of server - processes. Niceness values - range from -20 (most favorable - to the process) to 19 (least - favorable to the process). 
- The default does not modify - priority. -ionice_class None I/O scheduling class of server - processes. I/O niceness class - values are - IOPRIO_CLASS_RT (realtime), - IOPRIO_CLASS_BE (best-effort), - and IOPRIO_CLASS_IDLE (idle). - The default does not modify - class and priority. Linux - supports io scheduling - priorities and classes since - 2.6.13 with the CFQ io - scheduler. - Work only with ionice_priority. -ionice_priority None I/O scheduling priority of - server processes. I/O niceness - priority is a number which goes - from 0 to 7. - The higher the value, the lower - the I/O priority of the process. - Work only with ionice_class. - Ignored if IOPRIO_CLASS_IDLE - is set. -================== =========================== ============================= +==================== =========================== ============================= +Option Default Description +-------------------- --------------------------- ----------------------------- +log_name container-replicator Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +per_diff 1000 Maximum number of database + rows that will be sync'd in a + single HTTP replication + request. Databases with less + than or equal to this number + of differing rows will always + be sync'd using an HTTP + replication request rather + than using rsync. +max_diffs 100 Maximum number of HTTP + replication requests attempted + on each replication pass for + any one container. This caps + how long the replicator will + spend trying to sync a given + database per pass so the other + databases don't get starved. +concurrency 8 Number of replication workers + to spawn +interval 30 Time in seconds to wait + between replication passes +databases_per_second 50 Maximum databases to process + per second. Should be tuned + according to individual + system specs. 0 is unlimited. +node_timeout 10 Request timeout to external + services +conn_timeout 0.5 Connection timeout to external + services +reclaim_age 604800 Time elapsed in seconds before + a container can be reclaimed +rsync_module {replication_ip}::container Format of the rsync module + where the replicator will send + data. The configuration value + can include some variables + that will be extracted from + the ring. Variables must + follow the format {NAME} where + NAME is one of: ip, port, + replication_ip, + replication_port, region, + zone, device, meta. See + etc/rsyncd.conf-sample for + some examples. +rsync_compress no Allow rsync to compress data + which is transmitted to + destination node during sync. + However, this is applicable + only when destination node is + in a different region than the + local one. NOTE: Objects that + are already compressed (for + example: .tar.gz, mp3) might + slow down the syncing process. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values + range from -20 (most favorable + to the process) to 19 (least + favorable to the process). + The default does not modify + priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are + IOPRIO_CLASS_RT (realtime), + IOPRIO_CLASS_BE (best-effort), + and IOPRIO_CLASS_IDLE (idle). + The default does not modify + class and priority. Linux + supports io scheduling + priorities and classes since + 2.6.13 with the CFQ io + scheduler. + Work only with ionice_priority. 
+ionice_priority None I/O scheduling priority of + server processes. I/O niceness + priority is a number which goes + from 0 to 7. + The higher the value, the lower + the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE + is set. +==================== =========================== ============================= ******************* [container-updater] @@ -1524,89 +1528,93 @@ ionice_priority None I/O scheduling priority of server [account-replicator] ******************** -================== ========================= =============================== -Option Default Description ------------------- ------------------------- ------------------------------- -log_name account-replicator Label used when logging -log_facility LOG_LOCAL0 Syslog log facility -log_level INFO Logging level -log_address /dev/log Logging directory -per_diff 1000 Maximum number of database rows - that will be sync'd in a single - HTTP replication request. - Databases with less than or - equal to this number of - differing rows will always be - sync'd using an HTTP replication - request rather than using rsync. -max_diffs 100 Maximum number of HTTP - replication requests attempted - on each replication pass for any - one container. This caps how - long the replicator will spend - trying to sync a given database - per pass so the other databases - don't get starved. -concurrency 8 Number of replication workers - to spawn -interval 30 Time in seconds to wait between - replication passes -node_timeout 10 Request timeout to external - services -conn_timeout 0.5 Connection timeout to external - services -reclaim_age 604800 Time elapsed in seconds before - an account can be reclaimed -rsync_module {replication_ip}::account Format of the rsync module where - the replicator will send data. - The configuration value can - include some variables that will - be extracted from the ring. - Variables must follow the format - {NAME} where NAME is one of: ip, - port, replication_ip, - replication_port, region, zone, - device, meta. See - etc/rsyncd.conf-sample for some - examples. -rsync_compress no Allow rsync to compress data - which is transmitted to - destination node during sync. - However, this is applicable only - when destination node is in a - different region than the local - one. NOTE: Objects that are - already compressed (for example: - .tar.gz, mp3) might slow down - the syncing process. -recon_cache_path /var/cache/swift Path to recon cache -nice_priority None Scheduling priority of server - processes. Niceness values - range from -20 (most favorable - to the process) to 19 (least - favorable to the process). - The default does not modify - priority. -ionice_class None I/O scheduling class of server - processes. I/O niceness class - values are IOPRIO_CLASS_RT - (realtime), IOPRIO_CLASS_BE - (best-effort), and IOPRIO_CLASS_IDLE - (idle). - The default does not modify - class and priority. Linux supports - io scheduling priorities and classes - since 2.6.13 with the CFQ io scheduler. - Work only with ionice_priority. -ionice_priority None I/O scheduling priority of server - processes. I/O niceness priority - is a number which goes from 0 to 7. - The higher the value, the lower - the I/O priority of the process. - Work only with ionice_class. - Ignored if IOPRIO_CLASS_IDLE - is set. 
-================== ========================= =============================== +==================== ========================= =============================== +Option Default Description +-------------------- ------------------------- ------------------------------- +log_name account-replicator Label used when logging +log_facility LOG_LOCAL0 Syslog log facility +log_level INFO Logging level +log_address /dev/log Logging directory +per_diff 1000 Maximum number of database rows + that will be sync'd in a single + HTTP replication request. + Databases with less than or + equal to this number of + differing rows will always be + sync'd using an HTTP replication + request rather than using rsync. +max_diffs 100 Maximum number of HTTP + replication requests attempted + on each replication pass for any + one container. This caps how + long the replicator will spend + trying to sync a given database + per pass so the other databases + don't get starved. +concurrency 8 Number of replication workers + to spawn +interval 30 Time in seconds to wait between + replication passes +databases_per_second 50 Maximum databases to process + per second. Should be tuned + according to individual + system specs. 0 is unlimited. +node_timeout 10 Request timeout to external + services +conn_timeout 0.5 Connection timeout to external + services +reclaim_age 604800 Time elapsed in seconds before + an account can be reclaimed +rsync_module {replication_ip}::account Format of the rsync module where + the replicator will send data. + The configuration value can + include some variables that will + be extracted from the ring. + Variables must follow the format + {NAME} where NAME is one of: ip, + port, replication_ip, + replication_port, region, zone, + device, meta. See + etc/rsyncd.conf-sample for some + examples. +rsync_compress no Allow rsync to compress data + which is transmitted to + destination node during sync. + However, this is applicable only + when destination node is in a + different region than the local + one. NOTE: Objects that are + already compressed (for example: + .tar.gz, mp3) might slow down + the syncing process. +recon_cache_path /var/cache/swift Path to recon cache +nice_priority None Scheduling priority of server + processes. Niceness values + range from -20 (most favorable + to the process) to 19 (least + favorable to the process). + The default does not modify + priority. +ionice_class None I/O scheduling class of server + processes. I/O niceness class + values are IOPRIO_CLASS_RT + (realtime), IOPRIO_CLASS_BE + (best-effort), and IOPRIO_CLASS_IDLE + (idle). + The default does not modify + class and priority. Linux supports + io scheduling priorities and classes + since 2.6.13 with the CFQ io scheduler. + Work only with ionice_priority. +ionice_priority None I/O scheduling priority of server + processes. I/O niceness priority + is a number which goes from 0 to 7. + The higher the value, the lower + the I/O priority of the process. + Work only with ionice_class. + Ignored if IOPRIO_CLASS_IDLE + is set. 
+==================== ========================= =============================== ***************** [account-auditor] diff --git a/etc/account-server.conf-sample b/etc/account-server.conf-sample index dbe13a020a..0a3856256b 100644 --- a/etc/account-server.conf-sample +++ b/etc/account-server.conf-sample @@ -143,6 +143,9 @@ use = egg:swift#recon # run_pause is deprecated, use interval instead # run_pause = 30 # +# Process at most this many databases per second +# databases_per_second = 50 +# # node_timeout = 10 # conn_timeout = 0.5 # diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample index 70b0bf4f2c..5458793a5a 100644 --- a/etc/container-server.conf-sample +++ b/etc/container-server.conf-sample @@ -156,6 +156,9 @@ use = egg:swift#recon # run_pause is deprecated, use interval instead # run_pause = 30 # +# Process at most this many databases per second +# databases_per_second = 50 +# # node_timeout = 10 # conn_timeout = 0.5 # @@ -436,6 +439,9 @@ use = egg:swift#xprofile # Time in seconds to wait between sharder cycles # interval = 30 # +# Process at most this many databases per second +# databases_per_second = 50 +# # The container-sharder accepts the following configuration options as defined # in the container-replicator section: # diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index 478d1f83ba..d0edb4b255 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -33,7 +33,7 @@ from swift.common.utils import get_logger, whataremyips, storage_directory, \ renamer, mkdirs, lock_parent_directory, config_true_value, \ unlink_older_than, dump_recon_cache, rsync_module_interpolation, \ json, parse_override_options, round_robin_iter, Everything, get_db_files, \ - parse_db_filename, quote + parse_db_filename, quote, RateLimitedIterator from swift.common import ring from swift.common.ring.utils import is_local_device from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \ @@ -204,6 +204,8 @@ class Replicator(Daemon): ' to use option %(type)s-replicator/' 'interval.' 
% {'type': self.server_type}) + self.databases_per_second = int( + conf.get('databases_per_second', 50)) self.node_timeout = float(conf.get('node_timeout', 10)) self.conn_timeout = float(conf.get('conn_timeout', 0.5)) self.rsync_compress = config_true_value( @@ -733,6 +735,11 @@ class Replicator(Daemon): def report_up_to_date(self, full_info): return True + def roundrobin_datadirs(self, dirs): + return RateLimitedIterator( + roundrobin_datadirs(dirs), + elements_per_second=self.databases_per_second) + def run_once(self, *args, **kwargs): """Run a replication pass once.""" override_options = parse_override_options(once=True, **kwargs) @@ -789,7 +796,7 @@ class Replicator(Daemon): "file, not replicating", ", ".join(ips), self.port) self.logger.info(_('Beginning replication run')) - for part, object_file, node_id in roundrobin_datadirs(dirs): + for part, object_file, node_id in self.roundrobin_datadirs(dirs): self.cpool.spawn_n( self._replicate_object, part, object_file, node_id) self.cpool.waitall() diff --git a/swift/container/sharder.py b/swift/container/sharder.py index d282d7b6da..ae266736a5 100644 --- a/swift/container/sharder.py +++ b/swift/container/sharder.py @@ -23,7 +23,7 @@ import os import six from eventlet import Timeout -from swift.common import internal_client, db_replicator +from swift.common import internal_client from swift.common.constraints import check_drive from swift.common.direct_client import (direct_put_container, DirectClientException) @@ -1500,7 +1500,7 @@ class ContainerSharder(ContainerReplicator): dirs.append((datadir, node, part_filt)) if not dirs: self.logger.warning('Found no data dirs!') - for part, path, node in db_replicator.roundrobin_datadirs(dirs): + for part, path, node in self.roundrobin_datadirs(dirs): # NB: get_part_nodes always provides an 'index' key; # this will be used in leader selection for primary in self.ring.get_part_nodes(int(part)): diff --git a/test/unit/common/test_db_replicator.py b/test/unit/common/test_db_replicator.py index 09ac8bc28a..9dd40091fa 100644 --- a/test/unit/common/test_db_replicator.py +++ b/test/unit/common/test_db_replicator.py @@ -321,6 +321,7 @@ class TestDBReplicator(unittest.TestCase): # later config should be extended to assert more config options replicator = TestReplicator({'node_timeout': '3.5'}) self.assertEqual(replicator.node_timeout, 3.5) + self.assertEqual(replicator.databases_per_second, 50) def test_repl_connection(self): node = {'replication_ip': '127.0.0.1', 'replication_port': 80, diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py index 751b621ed5..64b127f4a5 100644 --- a/test/unit/container/test_sharder.py +++ b/test/unit/container/test_sharder.py @@ -128,6 +128,7 @@ class TestSharder(BaseTestSharder): expected = { 'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201, 'per_diff': 1000, 'max_diffs': 100, 'interval': 30, + 'databases_per_second': 50, 'cleave_row_batch_size': 10000, 'node_timeout': 10, 'conn_timeout': 5, 'rsync_compress': False, @@ -154,6 +155,7 @@ class TestSharder(BaseTestSharder): conf = { 'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010, 'per_diff': 2000, 'max_diffs': 200, 'interval': 60, + 'databases_per_second': 5, 'cleave_row_batch_size': 3000, 'node_timeout': 20, 'conn_timeout': 1, 'rsync_compress': True, @@ -176,6 +178,7 @@ class TestSharder(BaseTestSharder): expected = { 'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010, 'per_diff': 2000, 'max_diffs': 200, 'interval': 60, + 'databases_per_second': 5, 
'cleave_row_batch_size': 3000, 'node_timeout': 20, 'conn_timeout': 1, 'rsync_compress': True, @@ -485,7 +488,7 @@ class TestSharder(BaseTestSharder): 0, 'text/plain', 'etag', 0) # check only sharding enabled containers are processed - with mock.patch.object( + with mock.patch('eventlet.sleep'), mock.patch.object( sharder, '_process_broker' ) as mock_process_broker: sharder._local_device_ids = {'stale_node_id'} @@ -539,7 +542,7 @@ class TestSharder(BaseTestSharder): "for %s" % broker.path) # check exceptions are handled - with mock.patch.object( + with mock.patch('eventlet.sleep'), mock.patch.object( sharder, '_process_broker', side_effect=mock_processing ) as mock_process_broker: sharder._local_device_ids = {'stale_node_id'} @@ -593,7 +596,7 @@ class TestSharder(BaseTestSharder): for i in range(10): brokers[1].delete_object( 'o%s' % i, next(self.ts_iter).internal) - with mock.patch.object( + with mock.patch('eventlet.sleep'), mock.patch.object( sharder, '_process_broker' ) as mock_process_broker: sharder._local_device_ids = {999} @@ -612,6 +615,53 @@ class TestSharder(BaseTestSharder): expected_candidate_stats, sharder, 'sharding_candidates') self._assert_recon_stats(None, sharder, 'sharding_progress') + def test_ratelimited_roundrobin(self): + n_databases = 100 + + def stub_iter(dirs): + for i in range(n_databases): + yield i, '/srv/node/sda/path/to/container.db', {} + + now = time.time() + clock = { + 'sleeps': [], + 'now': now, + } + + def fake_sleep(t): + clock['sleeps'].append(t) + clock['now'] += t + + def fake_time(): + return clock['now'] + + with self._mock_sharder({'databases_per_second': 1}) as sharder, \ + mock.patch('swift.common.db_replicator.roundrobin_datadirs', + stub_iter), \ + mock.patch('time.time', fake_time), \ + mock.patch('eventlet.sleep', fake_sleep): + list(sharder.roundrobin_datadirs(None)) + # 100 db at 1/s should take ~100s + run_time = sum(clock['sleeps']) + self.assertTrue(97 <= run_time < 100, 'took %s' % run_time) + + n_databases = 1000 + now = time.time() + clock = { + 'sleeps': [], + 'now': now, + } + + with self._mock_sharder({'databases_per_second': 50}) as sharder, \ + mock.patch('swift.common.db_replicator.roundrobin_datadirs', + stub_iter), \ + mock.patch('time.time', fake_time), \ + mock.patch('eventlet.sleep', fake_sleep): + list(sharder.roundrobin_datadirs(None)) + # 1000 db at 50/s + run_time = sum(clock['sleeps']) + self.assertTrue(18 <= run_time < 20, 'took %s' % run_time) + @contextmanager def _mock_sharder(self, conf=None, replicas=3): conf = conf or {}
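
The pacing mechanism behind the new ``databases_per_second`` option is ``swift.common.utils.RateLimitedIterator``, which the patch wraps around ``roundrobin_datadirs(dirs)`` with ``elements_per_second=self.databases_per_second``. For readers who want the gist without opening Swift's utils module, here is a standalone sketch of that style of rate limiting. It is an illustration only, not Swift's actual implementation: the ``rate_limited`` helper, its use of ``time.sleep`` (Swift's daemons would sleep via eventlet), and the timing details are assumptions made for the example; the "0 means unlimited" behaviour and the throughput arithmetic in the comments come from the documentation table and ``test_ratelimited_roundrobin`` above.

```python
# Standalone sketch of the rate-limiting pattern this patch applies to
# roundrobin_datadirs(). This is NOT Swift's RateLimitedIterator; it only
# illustrates capping an iterator at N elements per second, with 0 (or less)
# meaning "unlimited", as the databases_per_second option is documented.
import time


def rate_limited(iterable, elements_per_second):
    """Yield items from ``iterable`` no faster than ``elements_per_second``.

    A value of 0 or less disables rate limiting entirely.
    """
    if elements_per_second <= 0:
        for item in iterable:
            yield item
        return
    interval = 1.0 / elements_per_second
    next_allowed = time.time()
    for item in iterable:
        now = time.time()
        if now < next_allowed:
            # A real daemon would use eventlet.sleep so other greenthreads
            # keep running while this one waits for its next slot.
            time.sleep(next_allowed - now)
        next_allowed = max(next_allowed, now) + interval
        yield item


if __name__ == '__main__':
    # Five fake "databases" at 2/s should take roughly 2 seconds to drain,
    # mirroring the arithmetic the new test checks (100 dbs at 1/s is ~100s,
    # 1000 dbs at 50/s is ~20s).
    start = time.time()
    for db in rate_limited(range(5), elements_per_second=2):
        pass
    print('took %.1fs' % (time.time() - start))
```

The design point is that sleeping a little between databases spreads the replicator's and sharder's work evenly across the cycle instead of bursting through every database and then idling for the ``interval``, which is how these daemons ended up pegging CPUs when they were not I/O limited.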