proxy: emit metric when updating namespaces memcache set fails

Previously, if a memcache set command had an unexpected response then
an error was logged but no exception raised from MemcacheRing.set.
With this patch, when MemcacheRing.set fails in this way, a
MemcacheConnectionError will be raised if 'raise_on_error' is
True. This exception is now handled when the proxy tries to set
updating namespaces in memcache, and a counter metric is emitted with
the name 'object.shard_updating.cache.set_error'. If the set is
successful, an 'object.shard_updating.cache.set' counter metric is
emitted.

The logging of MemcacheRing errors is improved to include the memcache
key being set, in line with exception logging.

Change-Id: Icfc97751c80f4bb4a3373796c01bff6ed5843937
This commit is contained in:
Alistair Coles 2023-10-20 14:40:06 +01:00
parent 195f54ecb5
commit 965908d753
4 changed files with 188 additions and 38 deletions

View File

@ -245,6 +245,13 @@ class MemcacheRing(object):
def memcache_servers(self):
return list(self._client_cache.keys())
def _log_error(self, server, cmd, action, msg):
self.logger.error(
"Error %(action)s to memcached: %(server)s"
": with key_prefix %(key_prefix)s, method %(method)s: %(msg)s",
{'action': action, 'server': server, 'key_prefix': cmd.key_prefix,
'method': cmd.method, 'msg': msg})
"""
Handles exceptions.
@ -367,7 +374,8 @@ class MemcacheRing(object):
self._exception_occurred(server, e, cmd, pool_start_time,
action='connecting', sock=sock)
if not any_yielded:
self.logger.error('All memcached servers error-limited')
self._log_error('ALL', cmd, 'connecting',
'All memcached servers error-limited')
def _return_conn(self, server, fp, sock):
"""Returns a server connection to the pool."""
@ -404,6 +412,14 @@ class MemcacheRing(object):
elif not isinstance(value, bytes):
value = str(value).encode('utf-8')
if 0 <= self.item_size_warning_threshold <= len(value):
self.logger.warning(
"Item size larger than warning threshold: "
"%d (%s) >= %d (%s)", len(value),
human_readable(len(value)),
self.item_size_warning_threshold,
human_readable(self.item_size_warning_threshold))
for (server, fp, sock) in self._get_conns(cmd):
conn_start_time = tm.time()
try:
@ -414,17 +430,7 @@ class MemcacheRing(object):
if msg != b'STORED':
if not six.PY2:
msg = msg.decode('ascii')
self.logger.error(
"Error setting value in memcached: "
"%(server)s: %(msg)s",
{'server': server, 'msg': msg})
if 0 <= self.item_size_warning_threshold <= len(value):
self.logger.warning(
"Item size larger than warning threshold: "
"%d (%s) >= %d (%s)", len(value),
human_readable(len(value)),
self.item_size_warning_threshold,
human_readable(self.item_size_warning_threshold))
raise MemcacheConnectionError('failed set: %s' % msg)
self._return_conn(server, fp, sock)
return
except (Exception, Timeout) as e:

View File

@ -345,6 +345,25 @@ class BaseObjectController(Controller):
# there will be only one shard range in the list if any
return shard_ranges[0] if shard_ranges else None
def _cache_update_namespaces(self, memcache, cache_key, namespaces):
if not memcache:
return
self.logger.info(
'Caching updating shards for %s (%d shards)',
cache_key, len(namespaces.bounds))
try:
memcache.set(
cache_key, namespaces.bounds,
time=self.app.recheck_updating_shard_ranges,
raise_on_error=True)
cache_state = 'set'
except MemcacheConnectionError:
cache_state = 'set_error'
finally:
record_cache_op_metrics(self.logger, self.server_type.lower(),
'shard_updating', cache_state, None)
def _get_update_shard(self, req, account, container, obj):
"""
Find the appropriate shard range for an object update.
@ -386,16 +405,9 @@ class BaseObjectController(Controller):
if shard_ranges:
# only store the list of namespace lower bounds and names into
# infocache and memcache.
cached_namespaces = NamespaceBoundList.parse(
shard_ranges)
infocache[cache_key] = cached_namespaces
if memcache:
self.logger.info(
'Caching updating shards for %s (%d shards)',
cache_key, len(cached_namespaces.bounds))
memcache.set(
cache_key, cached_namespaces.bounds,
time=self.app.recheck_updating_shard_ranges)
namespaces = NamespaceBoundList.parse(shard_ranges)
infocache[cache_key] = namespaces
self._cache_update_namespaces(memcache, cache_key, namespaces)
update_shard = find_namespace(obj, shard_ranges or [])
record_cache_op_metrics(
self.logger, self.server_type.lower(), 'shard_updating',

View File

@ -410,16 +410,50 @@ class TestMemcached(unittest.TestCase):
self.assertAlmostEqual(float(cache_timeout), esttimeout, delta=1)
def test_set_error(self):
memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
logger=self.logger)
memcache_client = memcached.MemcacheRing(
['1.2.3.4:11211'], logger=self.logger,
item_size_warning_threshold=1)
mock = MockMemcached()
memcache_client._client_cache['1.2.3.4:11211'] = MockedMemcachePool(
[(mock, mock)] * 2)
memcache_client.set('too-big', [1, 2, 3])
now = time.time()
with patch('time.time', return_value=now):
memcache_client.set('too-big', [1, 2, 3])
self.assertEqual(
self.logger.get_lines_for_level('error'),
['Error setting value in memcached: 1.2.3.4:11211: '
'SERVER_ERROR object too large for cache'])
['Error talking to memcached: 1.2.3.4:11211: '
'with key_prefix too-big, method set, time_spent 0.0, '
'failed set: SERVER_ERROR object too large for cache'])
warning_lines = self.logger.get_lines_for_level('warning')
self.assertEqual(1, len(warning_lines))
self.assertIn('Item size larger than warning threshold',
warning_lines[0])
self.assertTrue(mock.close_called)
def test_set_error_raise_on_error(self):
    # a failed set() with raise_on_error=True raises
    # MemcacheConnectionError, and still logs the error and the
    # item-size warning before closing the connection
    client = memcached.MemcacheRing(
        ['1.2.3.4:11211'], logger=self.logger,
        item_size_warning_threshold=1)
    mocked = MockMemcached()
    pool = MockedMemcachePool([(mocked, mocked)] * 2)
    client._client_cache['1.2.3.4:11211'] = pool
    now = time.time()
    # freeze time so the logged time_spent is exactly 0.0
    with self.assertRaises(MemcacheConnectionError) as cm, \
            patch('time.time', return_value=now):
        client.set('too-big', [1, 2, 3], raise_on_error=True)
    self.assertIn("No memcached connections succeeded", str(cm.exception))
    self.assertEqual(
        self.logger.get_lines_for_level('error'),
        ['Error talking to memcached: 1.2.3.4:11211: '
         'with key_prefix too-big, method set, time_spent 0.0, '
         'failed set: SERVER_ERROR object too large for cache'])
    warning_lines = self.logger.get_lines_for_level('warning')
    self.assertEqual(1, len(warning_lines))
    self.assertIn('Item size larger than warning threshold',
                  warning_lines[0])
    self.assertTrue(mocked.close_called)
def test_get_failed_connection_mid_request(self):
memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],
@ -620,7 +654,8 @@ class TestMemcached(unittest.TestCase):
'with key_prefix some_key, method set, time_spent 0.0, '
'[Errno 32] Broken pipe',
'Error limiting server 1.2.3.4:11211',
'All memcached servers error-limited',
'Error connecting to memcached: ALL: with key_prefix some_key, '
'method set: All memcached servers error-limited',
])
self.logger.clear()
@ -628,14 +663,16 @@ class TestMemcached(unittest.TestCase):
for _ in range(12):
memcache_client.set('some_key', [1, 2, 3])
self.assertEqual(self.logger.get_lines_for_level('error'), [
'All memcached servers error-limited',
'Error connecting to memcached: ALL: with key_prefix some_key, '
'method set: All memcached servers error-limited',
] * 12)
self.logger.clear()
# and get()s are all a "cache miss"
self.assertIsNone(memcache_client.get('some_key'))
self.assertEqual(self.logger.get_lines_for_level('error'), [
'All memcached servers error-limited',
'Error connecting to memcached: ALL: with key_prefix some_key, '
'method get: All memcached servers error-limited',
])
def test_error_disabled(self):
@ -752,7 +789,8 @@ class TestMemcached(unittest.TestCase):
'with key_prefix some_key, method set, time_spent 0.0, '
'[Errno 32] Broken pipe',
'Error limiting server 1.2.3.5:11211',
'All memcached servers error-limited',
'Error connecting to memcached: ALL: with key_prefix some_key, '
'method set: All memcached servers error-limited',
])
# with default error_limit_time of 60, one call per 6 secs, error limit
@ -776,8 +814,8 @@ class TestMemcached(unittest.TestCase):
'with key_prefix some_key, method set, time_spent 0.0, '
'[Errno 32] Broken pipe',
'Error limiting server 1.2.3.5:11211',
'All memcached servers error-limited',
])
'Error connecting to memcached: ALL: with key_prefix some_key, '
'method set: All memcached servers error-limited'])
# with error_limit_time of 70, one call per 6 secs, error_limit_count
# of 11, 13th call triggers error limit
@ -791,8 +829,8 @@ class TestMemcached(unittest.TestCase):
'with key_prefix some_key, method set, time_spent 0.0, '
'[Errno 32] Broken pipe',
'Error limiting server 1.2.3.5:11211',
'All memcached servers error-limited',
])
'Error connecting to memcached: ALL: with key_prefix some_key, '
'method set: All memcached servers error-limited'])
def test_delete(self):
memcache_client = memcached.MemcacheRing(['1.2.3.4:11211'],

View File

@ -4431,7 +4431,8 @@ class TestReplicatedObjectController(
'account.info.infocache.hit': 2,
'container.info.cache.miss.200': 1,
'container.info.infocache.hit': 1,
'object.shard_updating.cache.miss.200': 1},
'object.shard_updating.cache.miss.200': 1,
'object.shard_updating.cache.set': 1},
stats)
self.assertEqual([], self.app.logger.log_dict['set_statsd_prefix'])
info_lines = self.logger.get_lines_for_level('info')
@ -4795,7 +4796,8 @@ class TestReplicatedObjectController(
'object.shard_updating.cache.hit': 1,
'container.info.cache.hit': 1,
'account.info.cache.hit': 1,
'object.shard_updating.cache.skip.200': 1},
'object.shard_updating.cache.skip.200': 1,
'object.shard_updating.cache.set': 1},
stats)
# verify statsd prefix is not mutated
self.assertEqual([], self.app.logger.log_dict['set_statsd_prefix'])
@ -4865,7 +4867,99 @@ class TestReplicatedObjectController(
'container.info.infocache.hit': 3,
'object.shard_updating.cache.skip.200': 1,
'object.shard_updating.cache.hit': 1,
'object.shard_updating.cache.error.200': 1})
'object.shard_updating.cache.error.200': 1,
'object.shard_updating.cache.set': 2
})
do_test('POST', 'sharding')
do_test('POST', 'sharded')
do_test('DELETE', 'sharding')
do_test('DELETE', 'sharded')
do_test('PUT', 'sharding')
do_test('PUT', 'sharded')
@patch_policies([
StoragePolicy(0, 'zero', is_default=True, object_ring=FakeRing()),
StoragePolicy(1, 'one', object_ring=FakeRing()),
])
def test_backend_headers_update_shard_container_cache_set_error(self):
# verify that backend container update is directed to the shard
# container despite memcache set error
# reset the router post patch_policies
self.app.obj_controller_router = proxy_server.ObjectControllerRouter()
self.app.sort_nodes = lambda nodes, *args, **kwargs: nodes
self.app.recheck_updating_shard_ranges = 3600
self.app.container_updating_shard_ranges_skip_cache = 0.001
def do_test(method, sharding_state):
self.app.logger.clear() # clean capture state
# simulate memcache error when setting updating namespaces;
# expect 4 memcache sets: account info, container info, container
# info again from namespaces GET subrequest, namespaces
cache = FakeMemcache(error_on_set=[False, False, False, True])
req = Request.blank(
'/v1/a/c/o', {'swift.cache': cache},
method=method, body='', headers={'Content-Type': 'text/plain'})
# acct HEAD, cont HEAD, cont shard GET, obj POSTs
status_codes = (200, 200, 200, 202, 202, 202)
resp_headers = {'X-Backend-Storage-Policy-Index': 1,
'x-backend-sharding-state': sharding_state,
'X-Backend-Record-Type': 'shard'}
shard_ranges = [
utils.ShardRange(
'.shards_a/c_not_used', utils.Timestamp.now(), '', 'l'),
utils.ShardRange(
'.shards_a/c_shard', utils.Timestamp.now(), 'l', 'u'),
utils.ShardRange(
'.shards_a/c_nope', utils.Timestamp.now(), 'u', ''),
]
body = json.dumps([
dict(shard_range)
for shard_range in shard_ranges]).encode('ascii')
with mock.patch('random.random', return_value=0), \
mocked_http_conn(*status_codes, headers=resp_headers,
body=body) as fake_conn:
resp = req.get_response(self.app)
self.assertEqual(resp.status_int, 202)
stats = self.app.logger.statsd_client.get_increment_counts()
self.assertEqual({'account.info.cache.miss.200': 1,
'account.info.infocache.hit': 2,
'container.info.cache.miss.200': 1,
'container.info.infocache.hit': 1,
'object.shard_updating.cache.skip.200': 1,
'object.shard_updating.cache.set_error': 1},
stats)
# verify statsd prefix is not mutated
self.assertEqual([], self.app.logger.log_dict['set_statsd_prefix'])
# sanity check: namespaces not in cache
cache_key = 'shard-updating-v2/a/c'
self.assertNotIn(cache_key, req.environ['swift.cache'].store)
# make sure backend requests included expected container headers
container_headers = {}
for backend_request in fake_conn.requests[3:]:
req_headers = backend_request['headers']
device = req_headers['x-container-device']
container_headers[device] = req_headers['x-container-host']
expectations = {
'method': method,
'path': '/0/a/c/o',
'headers': {
'X-Container-Partition': '0',
'Host': 'localhost:80',
'Referer': '%s http://localhost/v1/a/c/o' % method,
'X-Backend-Storage-Policy-Index': '1',
'X-Backend-Quoted-Container-Path': shard_ranges[1].name
},
}
self._check_request(backend_request, **expectations)
expected = {}
for i, device in enumerate(['sda', 'sdb', 'sdc']):
expected[device] = '10.0.0.%d:100%d' % (i, i)
self.assertEqual(container_headers, expected)
do_test('POST', 'sharding')
do_test('POST', 'sharded')