From 2d55960a221c9934680053873bf1355c4690bb19 Mon Sep 17 00:00:00 2001
From: Alistair Coles
Date: Thu, 14 Jan 2016 18:31:21 +0000
Subject: [PATCH] Fix inconsistent suffix hashes after ssync of tombstone

Consider two replicas of the same object whose ondisk files have
diverged due to failures:

  A has t2.ts
  B has t1.data, t4.meta

(The DELETE at t2 did not make it to B. The POST at t4 was rejected
by A.)

After ssync replication the two ondisk file sets will not be
consistent:

  A has t2.ts (ssync cannot POST t4.meta to this node)
  B has t2.ts, t4.meta (ssync should not delete t4.meta, there may be
    a t3.data somewhere)

Consequently the two nodes will report different hashes for the
object's suffix, and replication will repeat, always with the same
inconsistent outcome. This scenario is reproduced by the probe test
added in this patch. (Note that rsync replication does result in
(t2.ts, t4.meta) on both nodes.)

The solution is to change the way that suffix hashes are calculated.
Currently the names of *all* files found in each object dir are added
to the hash. With this patch, the timestamps of only those files that
could be used to construct a valid diskfile are added to the hash.
File extensions are appended to the timestamp so that in most 'normal'
situations the result of the hashing is the same as before this patch.
That avoids a storm of hash mismatches when this patch is deployed in
an existing cluster.

In the problem case described above, t4.meta is no longer added to the
hash, since it is not useful for constructing a diskfile. (Note that
t4.meta is not deleted, because it may become useful should a t3.data
be replicated in the future.)

Closes-Bug: 1534276
Change-Id: I99e88b8d5f5d9bc22b42112a99634ba942415e05
---
 swift/common/direct_client.py                 | 29 ++++++
 swift/obj/diskfile.py                         | 98 ++++++++++++++----
 .../probe/test_object_metadata_replication.py | 66 ++++++++++++-
 test/unit/obj/test_diskfile.py                | 31 ++++++
 4 files changed, 200 insertions(+), 24 deletions(-)

diff --git a/swift/common/direct_client.py b/swift/common/direct_client.py
index bbb1ca4b90..1e7ede92a4 100644
--- a/swift/common/direct_client.py
+++ b/swift/common/direct_client.py
@@ -25,6 +25,7 @@ from time import time
 
 from eventlet import sleep, Timeout
 import six
+import six.moves.cPickle as pickle
 from six.moves.http_client import HTTPException
 
 from swift.common.bufferedhttp import http_connect
@@ -496,6 +497,34 @@ def direct_delete_object(node, part, account, container, obj,
                                     node, part, path, resp)
 
 
+def direct_get_suffix_hashes(node, part, suffixes, conn_timeout=5,
+                             response_timeout=15, headers=None):
+    """
+    Get suffix hashes directly from the object server.
+
+    :param node: node dictionary from the ring
+    :param part: partition the object is on
+    :param conn_timeout: timeout in seconds for establishing the connection
+    :param response_timeout: timeout in seconds for getting the response
+    :param headers: dict to be passed into HTTPConnection headers
+    :returns: dict of suffix hashes
+    :raises ClientException: HTTP REPLICATE request failed
+    """
+    if headers is None:
+        headers = {}
+
+    path = '/%s' % '-'.join(suffixes)
+    with Timeout(conn_timeout):
+        conn = http_connect(node['ip'], node['port'], node['device'], part,
+                            'REPLICATE', path, headers=gen_headers(headers))
+    with Timeout(response_timeout):
+        resp = conn.getresponse()
+    if not is_success(resp.status):
+        raise DirectClientException('Object', 'REPLICATE',
+                                    node, part, path, resp)
+    return pickle.loads(resp.read())
+
+
 def retry(func, *args, **kwargs):
     """
     Helper function to retry a given function a number of times.
diff --git a/swift/obj/diskfile.py b/swift/obj/diskfile.py
index 0928a5b688..6328a79387 100644
--- a/swift/obj/diskfile.py
+++ b/swift/obj/diskfile.py
@@ -678,7 +678,18 @@ class BaseDiskFileManager(object):
         return self.cleanup_ondisk_files(
             hsh_path, reclaim_age=reclaim_age)['files']
 
-    def _hash_suffix_dir(self, path, mapper, reclaim_age):
+    def _update_suffix_hashes(self, hashes, ondisk_info):
+        """
+        Applies policy specific updates to the given dict of md5 hashes for
+        the given ondisk_info.
+
+        :param hashes: a dict of md5 hashes to be updated
+        :param ondisk_info: a dict describing the state of ondisk files, as
+            returned by get_ondisk_files
+        """
+        raise NotImplementedError
+
+    def _hash_suffix_dir(self, path, reclaim_age):
         """
 
         :param path: full path to directory
@@ -694,7 +705,7 @@ class BaseDiskFileManager(object):
         for hsh in path_contents:
             hsh_path = join(path, hsh)
             try:
-                files = self.hash_cleanup_listdir(hsh_path, reclaim_age)
+                ondisk_info = self.cleanup_ondisk_files(hsh_path, reclaim_age)
             except OSError as err:
                 if err.errno == errno.ENOTDIR:
                     partition_path = dirname(path)
@@ -707,14 +718,30 @@ class BaseDiskFileManager(object):
                                               'quar_path': quar_path})
                 continue
             raise
-            if not files:
+            if not ondisk_info['files']:
                 try:
                     os.rmdir(hsh_path)
                 except OSError:
                     pass
-            for filename in files:
-                key, value = mapper(filename)
-                hashes[key].update(value)
+                continue
+
+            # ondisk_info has info dicts containing timestamps for those
+            # files that could determine the state of the diskfile if it were
+            # to be opened. We update the suffix hash with the concatenation of
+            # each file's timestamp and extension. The extension is added to
+            # guarantee distinct hash values from two object dirs that have
+            # different file types at the same timestamp(s).
+            #
+            # Files that may be in the object dir but would have no effect on
+            # the state of the diskfile are not used to update the hash.
+            for key in (k for k in ('meta_info', 'ts_info')
+                        if k in ondisk_info):
+                info = ondisk_info[key]
+                hashes[None].update(info['timestamp'].internal + info['ext'])
+
+            # delegate to subclass for data file related updates...
+            self._update_suffix_hashes(hashes, ondisk_info)
+
         try:
             os.rmdir(path)
         except OSError as e:
@@ -2195,6 +2222,20 @@ class DiskFileManager(BaseDiskFileManager):
         # set results
         results['data_info'] = exts['.data'][0]
 
+    def _update_suffix_hashes(self, hashes, ondisk_info):
+        """
+        Applies policy specific updates to the given dict of md5 hashes for
+        the given ondisk_info.
+
+        :param hashes: a dict of md5 hashes to be updated
+        :param ondisk_info: a dict describing the state of ondisk files, as
+            returned by get_ondisk_files
+        """
+        if 'data_info' in ondisk_info:
+            file_info = ondisk_info['data_info']
+            hashes[None].update(
+                file_info['timestamp'].internal + file_info['ext'])
+
     def _hash_suffix(self, path, reclaim_age):
         """
         Performs reclamation and returns an md5 of all (remaining) files.
@@ -2203,9 +2244,9 @@
         :param path: full path to directory
         :param reclaim_age: age in seconds at which to remove tombstones
         :raises PathNotDir: if given path is not a valid directory
         :raises OSError: for non-ENOTDIR errors
+        :returns: md5 of files in suffix
         """
-        mapper = lambda filename: (None, filename)
-        hashes = self._hash_suffix_dir(path, mapper, reclaim_age)
+        hashes = self._hash_suffix_dir(path, reclaim_age)
         return hashes[None].hexdigest()
@@ -2544,28 +2585,41 @@ class ECDiskFileManager(BaseDiskFileManager):
                 return have_data_file == have_durable
         return False
 
+    def _update_suffix_hashes(self, hashes, ondisk_info):
+        """
+        Applies policy specific updates to the given dict of md5 hashes for
+        the given ondisk_info.
+
+        The only difference between this method and the replication policy
+        function is the way that data files update hashes dict. Instead of all
+        filenames hashed into a single hasher, each data file name will fall
+        into a bucket keyed by its fragment index.
+
+        :param hashes: a dict of md5 hashes to be updated
+        :param ondisk_info: a dict describing the state of ondisk files, as
+            returned by get_ondisk_files
+        """
+        for frag_set in ondisk_info['frag_sets'].values():
+            for file_info in frag_set:
+                fi = file_info['frag_index']
+                hashes[fi].update(file_info['timestamp'].internal)
+        if 'durable_frag_set' in ondisk_info:
+            file_info = ondisk_info['durable_frag_set'][0]
+            hashes[None].update(file_info['timestamp'].internal + '.durable')
+
     def _hash_suffix(self, path, reclaim_age):
         """
-        The only difference between this method and the replication policy
-        function is the way that files are updated on the returned hash.
-
-        Instead of all filenames hashed into a single hasher, each file name
-        will fall into a bucket either by fragment index for datafiles, or
-        None (indicating a durable, metadata or tombstone).
+        Performs reclamation and returns an md5 of all (remaining) files.
 
         :param path: full path to directory
         :param reclaim_age: age in seconds at which to remove tombstones
+        :raises PathNotDir: if given path is not a valid directory
+        :raises OSError: for non-ENOTDIR errors
+        :returns: dict of md5 hex digests
         """
         # hash_per_fi instead of single hash for whole suffix
         # here we flatten out the hashers hexdigest into a dictionary instead
         # of just returning the one hexdigest for the whole suffix
-        def mapper(filename):
-            info = self.parse_on_disk_filename(filename)
-            fi = info['frag_index']
-            if fi is None:
-                return None, filename
-            else:
-                return fi, info['timestamp'].internal
-        hash_per_fi = self._hash_suffix_dir(path, mapper, reclaim_age)
+        hash_per_fi = self._hash_suffix_dir(path, reclaim_age)
         return dict((fi, md5.hexdigest()) for fi, md5 in hash_per_fi.items())
 
diff --git a/test/probe/test_object_metadata_replication.py b/test/probe/test_object_metadata_replication.py
index e7479d5fab..809a493d83 100644
--- a/test/probe/test_object_metadata_replication.py
+++ b/test/probe/test_object_metadata_replication.py
@@ -21,12 +21,15 @@ import unittest
 import os
 import shutil
 import uuid
 
-from swift.common.exceptions import DiskFileDeleted
+from swift.common.direct_client import direct_get_suffix_hashes
+from swift.common.exceptions import DiskFileDeleted
+from swift.common.internal_client import UnexpectedResponse
 from swift.container.backend import ContainerBroker
 from swift.common import internal_client, utils
+from swiftclient import client
 from swift.common.ring import Ring
-from swift.common.utils import Timestamp, get_logger
+from swift.common.utils import Timestamp, get_logger, hash_path
 from swift.obj.diskfile import DiskFileManager
 from swift.common.storage_policy import POLICIES
 
@@ -186,6 +189,20 @@ class Test(ReplProbeTest):
                                          self.container_name,
                                          self.object_name)
 
+    def _assert_consistent_suffix_hashes(self):
+        opart, onodes = self.object_ring.get_nodes(
+            self.account, self.container_name, self.object_name)
+        name_hash = hash_path(
+            self.account, self.container_name, self.object_name)
+        results = []
+        for node in onodes:
+            results.append(
+                (node,
+                 direct_get_suffix_hashes(node, opart, [name_hash[-3:]])))
+        for (node, hashes) in results[1:]:
+            self.assertEqual(results[0][1], hashes,
+                             'Inconsistent suffix hashes found: %s' % results)
+
     def test_object_delete_is_replicated(self):
         self.brain.put_container(policy_index=int(self.policy))
         # put object
@@ -419,6 +436,51 @@ class Test(ReplProbeTest):
         self._assert_consistent_object_metadata()
         self._assert_consistent_container_dbs()
 
+    def test_post_trumped_by_prior_delete(self):
+        # new metadata and content-type posted to subset of nodes should not
+        # cause object to persist after replication of an earlier delete on
+        # other nodes.
+        self.brain.put_container(policy_index=0)
+        # incomplete put
+        self.brain.stop_primary_half()
+        self._put_object(headers={'Content-Type': 'oldest',
+                                  'X-Object-Sysmeta-Test': 'oldest',
+                                  'X-Object-Meta-Test': 'oldest'})
+        self.brain.start_primary_half()
+
+        # incomplete put then delete
+        self.brain.stop_handoff_half()
+        self._put_object(headers={'Content-Type': 'oldest',
+                                  'X-Object-Sysmeta-Test': 'oldest',
+                                  'X-Object-Meta-Test': 'oldest'})
+        self._delete_object()
+        self.brain.start_handoff_half()
+
+        # handoff post
+        self.brain.stop_primary_half()
+        self._post_object(headers={'Content-Type': 'newest',
+                                   'X-Object-Sysmeta-Test': 'ignored',
+                                   'X-Object-Meta-Test': 'newest'})
+
+        # check object metadata
+        metadata = self._get_object_metadata()
+        self.assertEqual(metadata['x-object-sysmeta-test'], 'oldest')
+        self.assertEqual(metadata['x-object-meta-test'], 'newest')
+        self.assertEqual(metadata['content-type'], 'oldest')
+
+        self.brain.start_primary_half()
+
+        # delete trumps later post
+        self.get_to_final_state()
+
+        # check object is now deleted
+        self.assertRaises(UnexpectedResponse, self._get_object_metadata)
+        container_metadata, objs = client.get_container(self.url, self.token,
+                                                        self.container_name)
+        self.assertEqual(0, len(objs))
+        self._assert_consistent_container_dbs()
+        self._assert_consistent_deleted_object()
+        self._assert_consistent_suffix_hashes()
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/unit/obj/test_diskfile.py b/test/unit/obj/test_diskfile.py
index 534882bec3..7057d7a61c 100644
--- a/test/unit/obj/test_diskfile.py
+++ b/test/unit/obj/test_diskfile.py
@@ -4568,6 +4568,37 @@ class TestSuffixHashes(unittest.TestCase):
             }[policy.policy_type]
             self.assertEqual(hashes, expected)
 
+    def test_hash_suffix_one_tombstone_and_one_meta(self):
+        # A tombstone plus a newer meta file can happen if a tombstone is
+        # replicated to a node with a newer meta file but older data file. The
+        # meta file will be ignored when the diskfile is opened so the
+        # effective state of the disk files is equivalent to only having the
+        # tombstone. Replication cannot remove the meta file, and the meta file
+        # cannot be ssync replicated to a node with only the tombstone, so
+        # we want the get_hashes result to be the same as if the meta file was
+        # not there.
+        for policy in self.iter_policies():
+            df_mgr = self.df_router[policy]
+            df = df_mgr.get_diskfile(
+                'sda1', '0', 'a', 'c', 'o', policy=policy)
+            suffix = os.path.basename(os.path.dirname(df._datadir))
+            # write a tombstone
+            timestamp = self.ts()
+            df.delete(timestamp)
+            # write a meta file
+            df.write_metadata({'X-Timestamp': self.ts().internal})
+            # sanity check
+            self.assertEqual(2, len(os.listdir(df._datadir)))
+            tombstone_hash = md5(timestamp.internal + '.ts').hexdigest()
+            hashes = df_mgr.get_hashes('sda1', '0', [], policy)
+            expected = {
+                REPL_POLICY: {suffix: tombstone_hash},
+                EC_POLICY: {suffix: {
+                    # fi is None here because we have a tombstone
+                    None: tombstone_hash}},
+            }[policy.policy_type]
+            self.assertEqual(hashes, expected)
+
     def test_hash_suffix_one_reclaim_tombstone(self):
         for policy in self.iter_policies():
             df_mgr = self.df_router[policy]
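Illustrative sketch (not part of the patch): the hashing rule described in the
commit message can be summarised in a few lines of standalone Python. The
helper name sketch_hash and the bare-filename inputs are invented for this
example; the real implementation works on the ondisk_info dict built by
get_ondisk_files inside BaseDiskFileManager._hash_suffix_dir and the
per-policy _update_suffix_hashes methods.

    from hashlib import md5


    def sketch_hash(filenames):
        """Hash only the files that could determine an opened diskfile's state.

        Timestamps are concatenated with the file extension, mirroring the
        patch; an orphaned .meta (no .data for it to apply to) is kept on
        disk but contributes nothing to the hash.
        """
        by_ext = {}
        for name in filenames:
            ts, ext = name.rsplit('.', 1)
            by_ext.setdefault('.' + ext, []).append(ts)
        hasher = md5()
        if '.ts' in by_ext:
            # a tombstone decides the diskfile state
            hasher.update((max(by_ext['.ts']) + '.ts').encode('ascii'))
        elif '.data' in by_ext:
            newest_data = max(by_ext['.data'])
            hasher.update((newest_data + '.data').encode('ascii'))
            # a .meta counts only if it is newer than the data file
            metas = [m for m in by_ext.get('.meta', []) if m > newest_data]
            if metas:
                hasher.update((max(metas) + '.meta').encode('ascii'))
        return hasher.hexdigest()


    # The failure case from the commit message: node A holds t2.ts, node B
    # holds t2.ts plus an orphaned t4.meta. Their suffix hashes now agree.
    node_a = ['0000000002.00000.ts']
    node_b = ['0000000002.00000.ts', '0000000004.00000.meta']
    assert sketch_hash(node_a) == sketch_hash(node_b)

With node A holding only t2.ts and node B holding t2.ts plus an orphaned
t4.meta, both calls return the same digest, which is why the probe test above
now sees consistent suffix hashes after replication instead of an endless
re-sync loop.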