Add databases_per_second to db daemons

Most daemons take a "go as fast as you can, then sleep for 30 seconds"
approach to resource utilization; the object-updater and
object-auditor, however, have "X_per_second" options that give
operators much finer control over how they spend their I/O budget.

This change extends that pattern to the account-replicator,
container-replicator, and container-sharder, which have been known to
peg CPUs when they're not I/O limited.

Partial-Bug: #1784753
Change-Id: Ib7f2497794fa2f384a1a6ab500b657c624426384
Clay Gerrard 2018-10-29 14:49:48 -05:00 committed by Tim Burke
parent 24bf5eea8c
commit 06cf5d298f
7 changed files with 253 additions and 178 deletions
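
The option generalizes the pacing idea the object daemons already use:
instead of working flat out and then sleeping once per cycle, the daemon
sleeps briefly between items so throughput averages out to a fixed rate.
A minimal sketch of that idea, using only the standard library (Swift's
actual helper is swift.common.utils.RateLimitedIterator, which appears
in the diffs below; the rate_limited generator here is illustrative
only):

import time

def rate_limited(iterable, elements_per_second):
    # Yield items no faster than elements_per_second, mirroring the
    # semantics of databases_per_second; 0 (or less) means unlimited.
    if elements_per_second <= 0:
        for item in iterable:
            yield item
        return
    interval = 1.0 / elements_per_second
    next_allowed = time.time()
    for item in iterable:
        now = time.time()
        if now < next_allowed:
            time.sleep(next_allowed - now)
        next_allowed = max(now, next_allowed) + interval
        yield item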


@@ -1173,94 +1173,98 @@ ionice_priority None I/O scheduling priority of ser
[container-replicator]
**********************
==================== =========================== =============================
Option               Default                     Description
-------------------- --------------------------- -----------------------------
log_name             container-replicator        Label used when logging
log_facility         LOG_LOCAL0                  Syslog log facility
log_level            INFO                        Logging level
log_address          /dev/log                    Logging directory
per_diff             1000                        Maximum number of database
                                                 rows that will be sync'd in a
                                                 single HTTP replication
                                                 request. Databases with less
                                                 than or equal to this number
                                                 of differing rows will always
                                                 be sync'd using an HTTP
                                                 replication request rather
                                                 than using rsync.
max_diffs            100                         Maximum number of HTTP
                                                 replication requests attempted
                                                 on each replication pass for
                                                 any one container. This caps
                                                 how long the replicator will
                                                 spend trying to sync a given
                                                 database per pass so the other
                                                 databases don't get starved.
concurrency          8                           Number of replication workers
                                                 to spawn
interval             30                          Time in seconds to wait
                                                 between replication passes
databases_per_second 50                          Maximum databases to process
                                                 per second. Should be tuned
                                                 according to individual
                                                 system specs. 0 is unlimited.
node_timeout         10                          Request timeout to external
                                                 services
conn_timeout         0.5                         Connection timeout to external
                                                 services
reclaim_age          604800                      Time elapsed in seconds before
                                                 a container can be reclaimed
rsync_module         {replication_ip}::container Format of the rsync module
                                                 where the replicator will send
                                                 data. The configuration value
                                                 can include some variables
                                                 that will be extracted from
                                                 the ring. Variables must
                                                 follow the format {NAME} where
                                                 NAME is one of: ip, port,
                                                 replication_ip,
                                                 replication_port, region,
                                                 zone, device, meta. See
                                                 etc/rsyncd.conf-sample for
                                                 some examples.
rsync_compress       no                          Allow rsync to compress data
                                                 which is transmitted to
                                                 destination node during sync.
                                                 However, this is applicable
                                                 only when destination node is
                                                 in a different region than the
                                                 local one. NOTE: Objects that
                                                 are already compressed (for
                                                 example: .tar.gz, mp3) might
                                                 slow down the syncing process.
recon_cache_path     /var/cache/swift            Path to recon cache
nice_priority        None                        Scheduling priority of server
                                                 processes. Niceness values
                                                 range from -20 (most favorable
                                                 to the process) to 19 (least
                                                 favorable to the process).
                                                 The default does not modify
                                                 priority.
ionice_class         None                        I/O scheduling class of server
                                                 processes. I/O niceness class
                                                 values are
                                                 IOPRIO_CLASS_RT (realtime),
                                                 IOPRIO_CLASS_BE (best-effort),
                                                 and IOPRIO_CLASS_IDLE (idle).
                                                 The default does not modify
                                                 class and priority. Linux
                                                 supports io scheduling
                                                 priorities and classes since
                                                 2.6.13 with the CFQ io
                                                 scheduler.
                                                 Work only with ionice_priority.
ionice_priority      None                        I/O scheduling priority of
                                                 server processes. I/O niceness
                                                 priority is a number which goes
                                                 from 0 to 7.
                                                 The higher the value, the lower
                                                 the I/O priority of the process.
                                                 Work only with ionice_class.
                                                 Ignored if IOPRIO_CLASS_IDLE
                                                 is set.
==================== =========================== =============================
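
An operator-facing sketch of the new option (the value 25 is
illustrative, not a recommendation): in the [container-replicator]
section of container-server.conf, e.g.

[container-replicator]
# process at most 25 databases per second; 0 removes the limit
databases_per_second = 25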
*******************
[container-updater]
@@ -1524,89 +1528,93 @@ ionice_priority None I/O scheduling priority of server
[account-replicator]
********************
==================== ========================= ===============================
Option               Default                   Description
-------------------- ------------------------- -------------------------------
log_name             account-replicator        Label used when logging
log_facility         LOG_LOCAL0                Syslog log facility
log_level            INFO                      Logging level
log_address          /dev/log                  Logging directory
per_diff             1000                      Maximum number of database rows
                                               that will be sync'd in a single
                                               HTTP replication request.
                                               Databases with less than or
                                               equal to this number of
                                               differing rows will always be
                                               sync'd using an HTTP replication
                                               request rather than using rsync.
max_diffs            100                       Maximum number of HTTP
                                               replication requests attempted
                                               on each replication pass for any
                                               one container. This caps how
                                               long the replicator will spend
                                               trying to sync a given database
                                               per pass so the other databases
                                               don't get starved.
concurrency          8                         Number of replication workers
                                               to spawn
interval             30                        Time in seconds to wait between
                                               replication passes
databases_per_second 50                        Maximum databases to process
                                               per second. Should be tuned
                                               according to individual
                                               system specs. 0 is unlimited.
node_timeout         10                        Request timeout to external
                                               services
conn_timeout         0.5                       Connection timeout to external
                                               services
reclaim_age          604800                    Time elapsed in seconds before
                                               an account can be reclaimed
rsync_module         {replication_ip}::account Format of the rsync module where
                                               the replicator will send data.
                                               The configuration value can
                                               include some variables that will
                                               be extracted from the ring.
                                               Variables must follow the format
                                               {NAME} where NAME is one of: ip,
                                               port, replication_ip,
                                               replication_port, region, zone,
                                               device, meta. See
                                               etc/rsyncd.conf-sample for some
                                               examples.
rsync_compress       no                        Allow rsync to compress data
                                               which is transmitted to
                                               destination node during sync.
                                               However, this is applicable only
                                               when destination node is in a
                                               different region than the local
                                               one. NOTE: Objects that are
                                               already compressed (for example:
                                               .tar.gz, mp3) might slow down
                                               the syncing process.
recon_cache_path     /var/cache/swift          Path to recon cache
nice_priority        None                      Scheduling priority of server
                                               processes. Niceness values
                                               range from -20 (most favorable
                                               to the process) to 19 (least
                                               favorable to the process).
                                               The default does not modify
                                               priority.
ionice_class         None                      I/O scheduling class of server
                                               processes. I/O niceness class
                                               values are IOPRIO_CLASS_RT
                                               (realtime), IOPRIO_CLASS_BE
                                               (best-effort), and
                                               IOPRIO_CLASS_IDLE (idle).
                                               The default does not modify
                                               class and priority. Linux
                                               supports io scheduling
                                               priorities and classes since
                                               2.6.13 with the CFQ io scheduler.
                                               Work only with ionice_priority.
ionice_priority      None                      I/O scheduling priority of server
                                               processes. I/O niceness priority
                                               is a number which goes from 0 to 7.
                                               The higher the value, the lower
                                               the I/O priority of the process.
                                               Work only with ionice_class.
                                               Ignored if IOPRIO_CLASS_IDLE
                                               is set.
==================== ========================= ===============================
*****************
[account-auditor]


@@ -143,6 +143,9 @@ use = egg:swift#recon
 # run_pause is deprecated, use interval instead
 # run_pause = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
 #


@@ -156,6 +156,9 @@ use = egg:swift#recon
 # run_pause is deprecated, use interval instead
 # run_pause = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
 #
@@ -436,6 +439,9 @@ use = egg:swift#xprofile
 # Time in seconds to wait between sharder cycles
 # interval = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # The container-sharder accepts the following configuration options as defined
 # in the container-replicator section:
 #
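
As a usage sketch for the sharder hunk above (the value 10 is
illustrative): an operator whose container-sharder pegs a CPU could
throttle it in the [container-sharder] section of container-server.conf:

[container-sharder]
# scan at most 10 databases per second; 0 removes the limit
databases_per_second = 10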


@@ -33,7 +33,7 @@ from swift.common.utils import get_logger, whataremyips, storage_directory, \
     renamer, mkdirs, lock_parent_directory, config_true_value, \
     unlink_older_than, dump_recon_cache, rsync_module_interpolation, \
     json, parse_override_options, round_robin_iter, Everything, get_db_files, \
-    parse_db_filename, quote
+    parse_db_filename, quote, RateLimitedIterator
 from swift.common import ring
 from swift.common.ring.utils import is_local_device
 from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \
@@ -204,6 +204,8 @@ class Replicator(Daemon):
                                 ' to use option %(type)s-replicator/'
                                 'interval.'
                                 % {'type': self.server_type})
+        self.databases_per_second = int(
+            conf.get('databases_per_second', 50))
         self.node_timeout = float(conf.get('node_timeout', 10))
         self.conn_timeout = float(conf.get('conn_timeout', 0.5))
         self.rsync_compress = config_true_value(
@@ -733,6 +735,11 @@ class Replicator(Daemon):
     def report_up_to_date(self, full_info):
         return True

+    def roundrobin_datadirs(self, dirs):
+        return RateLimitedIterator(
+            roundrobin_datadirs(dirs),
+            elements_per_second=self.databases_per_second)
+
     def run_once(self, *args, **kwargs):
         """Run a replication pass once."""
         override_options = parse_override_options(once=True, **kwargs)
@@ -789,7 +796,7 @@ class Replicator(Daemon):
                               "file, not replicating",
                               ", ".join(ips), self.port)
         self.logger.info(_('Beginning replication run'))
-        for part, object_file, node_id in roundrobin_datadirs(dirs):
+        for part, object_file, node_id in self.roundrobin_datadirs(dirs):
             self.cpool.spawn_n(
                 self._replicate_object, part, object_file, node_id)
         self.cpool.waitall()
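
The roundrobin_datadirs override above is the heart of the change: every
database the replicator (and, below, the sharder) visits now flows
through RateLimitedIterator. Assuming a node with Swift installed, the
pacing can be observed directly; a hypothetical shell snippet:

import time
from swift.common.utils import RateLimitedIterator

start = time.time()
list(RateLimitedIterator(range(10), elements_per_second=5))
# 10 elements at 5/s should take roughly two seconds
print('took %.1fs' % (time.time() - start))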


@@ -23,7 +23,7 @@ import os
 import six
 from eventlet import Timeout

-from swift.common import internal_client, db_replicator
+from swift.common import internal_client
 from swift.common.constraints import check_drive
 from swift.common.direct_client import (direct_put_container,
                                         DirectClientException)
@@ -1500,7 +1500,7 @@ class ContainerSharder(ContainerReplicator):
                     dirs.append((datadir, node, part_filt))
         if not dirs:
             self.logger.warning('Found no data dirs!')
-        for part, path, node in db_replicator.roundrobin_datadirs(dirs):
+        for part, path, node in self.roundrobin_datadirs(dirs):
             # NB: get_part_nodes always provides an 'index' key;
             # this will be used in leader selection
             for primary in self.ring.get_part_nodes(int(part)):
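
Because ContainerSharder subclasses ContainerReplicator (visible in the
hunk header above), switching from the module-level
db_replicator.roundrobin_datadirs to self.roundrobin_datadirs picks up
the rate-limited method for free. A simplified sketch of the
relationship (bodies elided; swift.container.replicator is assumed as
the home of ContainerReplicator):

class Replicator(object):
    # swift.common.db_replicator: defines the rate-limited
    # roundrobin_datadirs wrapper shown in the previous file
    pass

class ContainerReplicator(Replicator):
    # assumed to live in swift.container.replicator
    pass

class ContainerSharder(ContainerReplicator):
    # inherits roundrobin_datadirs, so databases_per_second
    # throttles sharding too
    pass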


@@ -321,6 +321,7 @@ class TestDBReplicator(unittest.TestCase):
         # later config should be extended to assert more config options
         replicator = TestReplicator({'node_timeout': '3.5'})
         self.assertEqual(replicator.node_timeout, 3.5)
+        self.assertEqual(replicator.databases_per_second, 50)

     def test_repl_connection(self):
         node = {'replication_ip': '127.0.0.1', 'replication_port': 80,


@@ -128,6 +128,7 @@ class TestSharder(BaseTestSharder):
         expected = {
             'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201,
             'per_diff': 1000, 'max_diffs': 100, 'interval': 30,
+            'databases_per_second': 50,
             'cleave_row_batch_size': 10000,
             'node_timeout': 10, 'conn_timeout': 5,
             'rsync_compress': False,
@@ -154,6 +155,7 @@ class TestSharder(BaseTestSharder):
         conf = {
             'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010,
             'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
+            'databases_per_second': 5,
             'cleave_row_batch_size': 3000,
             'node_timeout': 20, 'conn_timeout': 1,
             'rsync_compress': True,
@@ -176,6 +178,7 @@ class TestSharder(BaseTestSharder):
         expected = {
             'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010,
             'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
+            'databases_per_second': 5,
             'cleave_row_batch_size': 3000,
             'node_timeout': 20, 'conn_timeout': 1,
             'rsync_compress': True,
@@ -485,7 +488,7 @@ class TestSharder(BaseTestSharder):
                 0, 'text/plain', 'etag', 0)

         # check only sharding enabled containers are processed
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker'
         ) as mock_process_broker:
             sharder._local_device_ids = {'stale_node_id'}
@@ -539,7 +542,7 @@ class TestSharder(BaseTestSharder):
                             "for %s" % broker.path)

         # check exceptions are handled
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker', side_effect=mock_processing
         ) as mock_process_broker:
             sharder._local_device_ids = {'stale_node_id'}
@@ -593,7 +596,7 @@ class TestSharder(BaseTestSharder):
         for i in range(10):
             brokers[1].delete_object(
                 'o%s' % i, next(self.ts_iter).internal)
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker'
         ) as mock_process_broker:
             sharder._local_device_ids = {999}
@@ -612,6 +615,53 @@ class TestSharder(BaseTestSharder):
             expected_candidate_stats, sharder, 'sharding_candidates')
         self._assert_recon_stats(None, sharder, 'sharding_progress')

+    def test_ratelimited_roundrobin(self):
+        n_databases = 100
+
+        def stub_iter(dirs):
+            for i in range(n_databases):
+                yield i, '/srv/node/sda/path/to/container.db', {}
+
+        now = time.time()
+        clock = {
+            'sleeps': [],
+            'now': now,
+        }
+
+        def fake_sleep(t):
+            clock['sleeps'].append(t)
+            clock['now'] += t
+
+        def fake_time():
+            return clock['now']
+
+        with self._mock_sharder({'databases_per_second': 1}) as sharder, \
+                mock.patch('swift.common.db_replicator.roundrobin_datadirs',
+                           stub_iter), \
+                mock.patch('time.time', fake_time), \
+                mock.patch('eventlet.sleep', fake_sleep):
+            list(sharder.roundrobin_datadirs(None))
+
+        # 100 db at 1/s should take ~100s
+        run_time = sum(clock['sleeps'])
+        self.assertTrue(97 <= run_time < 100, 'took %s' % run_time)
+
+        n_databases = 1000
+        now = time.time()
+        clock = {
+            'sleeps': [],
+            'now': now,
+        }
+        with self._mock_sharder({'databases_per_second': 50}) as sharder, \
+                mock.patch('swift.common.db_replicator.roundrobin_datadirs',
+                           stub_iter), \
+                mock.patch('time.time', fake_time), \
+                mock.patch('eventlet.sleep', fake_sleep):
+            list(sharder.roundrobin_datadirs(None))
+
+        # 1000 db at 50/s
+        run_time = sum(clock['sleeps'])
+        self.assertTrue(18 <= run_time < 20, 'took %s' % run_time)
+
     @contextmanager
     def _mock_sharder(self, conf=None, replicas=3):
         conf = conf or {}