Delete old tombstones

- Call invalidate_hash in auditor for reclaimable tombstones
- assert changed auditor behavior with a unit test
- driveby test: assert get_hashes behavior with a unit test

Co-Authored-By: Pete Zaitcev <zaitcev@redhat.com>
Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>
Closes-Bug: #1301728
Change-Id: I3e99dc702d55a7424c6482969e03cb4afac854a4
Mahati Chamarthy 2016-07-25 20:10:44 +05:30 committed by Clay Gerrard
parent 561284e3d4
commit 81d4673674
3 changed files with 222 additions and 13 deletions
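The first bullet above is the substance of the patch: when the auditor opens an object and gets DiskFileDeleted (only a tombstone on disk), it now checks whether the tombstone has passed reclaim_age and, if so, invalidates the suffix hash so the replicator will rehash and clean up that suffix on its next pass. A minimal sketch of that logic, paraphrased from the swift/obj/auditor.py hunk below; the standalone helper name handle_reclaimable_tombstone is illustrative and not part of the patch:

import os
import time

def handle_reclaimable_tombstone(df, reclaim_age):
    # ``df`` is an opened diskfile whose only on-disk file is a tombstone;
    # open() recorded the tombstone's timestamp in _ondisk_info.
    ts = df._ondisk_info['ts_info']['timestamp']
    if (time.time() - float(ts)) > reclaim_age:
        # the suffix dir is the parent of the object's hash dir; marking it
        # invalid makes the replicator rehash (and so reclaim) the suffix
        df.manager.invalidate_hash(os.path.dirname(df._datadir))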

swift/obj/auditor.py View File

@@ -19,6 +19,7 @@ import sys
import time
import signal
import re
from os.path import basename, dirname, join
from random import shuffle
from swift import gettext_ as _
from contextlib import closing
@@ -28,7 +29,8 @@ from swift.obj import diskfile, replicator
from swift.common.utils import (
get_logger, ratelimit_sleep, dump_recon_cache, list_from_csv, listdir,
unlink_paths_older_than, readconf, config_auto_int_value)
from swift.common.exceptions import DiskFileQuarantined, DiskFileNotExist
from swift.common.exceptions import DiskFileQuarantined, DiskFileNotExist,\
DiskFileDeleted
from swift.common.daemon import Daemon
from swift.common.storage_policy import POLICIES
@@ -43,7 +45,6 @@ class AuditorWorker(object):
self.conf = conf
self.logger = logger
self.devices = devices
self.diskfile_router = diskfile.DiskFileRouter(conf, self.logger)
self.max_files_per_second = float(conf.get('files_per_second', 20))
self.max_bytes_per_second = float(conf.get('bytes_per_second',
10000000))
@@ -56,17 +57,25 @@ class AuditorWorker(object):
except (KeyError, SystemExit):
# if we can't parse the real config (generally a KeyError on
# __file__, or SystemExit on no object-replicator section) we use
# a very conservative default
default = 86400
# a very conservative default for rsync_timeout
default_rsync_timeout = 86400
else:
replicator_rsync_timeout = int(replicator_config.get(
'rsync_timeout', replicator.DEFAULT_RSYNC_TIMEOUT))
# Here we can do some light math for ops and use the *replicator's*
# rsync_timeout (plus 15 mins to avoid deleting local tempfiles
# before the remote replicator kills its rsync)
default = replicator_rsync_timeout + 900
default_rsync_timeout = replicator_rsync_timeout + 900
# there's not really a good reason to assume the replicator
# section's reclaim_age is more appropriate than the reconstructor
# reclaim_age - but we're already parsing the config so we can set
# the default value in our config if it's not already set
if 'reclaim_age' in replicator_config:
conf.setdefault('reclaim_age',
replicator_config['reclaim_age'])
self.rsync_tempfile_timeout = config_auto_int_value(
self.conf.get('rsync_tempfile_timeout'), default)
self.conf.get('rsync_tempfile_timeout'), default_rsync_timeout)
self.diskfile_router = diskfile.DiskFileRouter(conf, self.logger)
self.auditor_type = 'ALL'
self.zero_byte_only_at_fps = zero_byte_only_at_fps
@@ -251,19 +260,26 @@ class AuditorWorker(object):
incr_by=chunk_len)
self.bytes_processed += chunk_len
self.total_bytes_processed += chunk_len
except DiskFileNotExist:
pass
except DiskFileQuarantined as err:
self.quarantines += 1
self.logger.error(_('ERROR Object %(obj)s failed audit and was'
' quarantined: %(err)s'),
{'obj': location, 'err': err})
except DiskFileDeleted:
# If there is a reclaimable tombstone, we'll invalidate the hash
# to trigger the replicator to rehash/cleanup this suffix
ts = df._ondisk_info['ts_info']['timestamp']
if (time.time() - float(ts)) > df.manager.reclaim_age:
df.manager.invalidate_hash(dirname(df._datadir))
except DiskFileNotExist:
pass
self.passes += 1
# _ondisk_info attr is initialized to None and filled in by open
ondisk_info_dict = df._ondisk_info or {}
if 'unexpected' in ondisk_info_dict:
is_rsync_tempfile = lambda fpath: RE_RSYNC_TEMPFILE.match(
os.path.basename(fpath))
basename(fpath))
rsync_tempfile_paths = filter(is_rsync_tempfile,
ondisk_info_dict['unexpected'])
mtime = time.time() - self.rsync_tempfile_timeout
@@ -282,7 +298,7 @@ class ObjectAuditor(Daemon):
conf.get('zero_byte_files_per_second', 50))
self.recon_cache_path = conf.get('recon_cache_path',
'/var/cache/swift')
self.rcache = os.path.join(self.recon_cache_path, "object.recon")
self.rcache = join(self.recon_cache_path, "object.recon")
self.interval = int(conf.get('interval', 30))
def _sleep(self):

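Beyond the tombstone handling, the hunk above also reworks how AuditorWorker picks its defaults: rsync_tempfile_timeout defaults to the replicator's rsync_timeout plus 15 minutes, and reclaim_age is inherited from the object-replicator section when the auditor's own config does not set it. A condensed, hedged restatement of that resolution; the function name resolve_auditor_defaults is illustrative only:

from swift.obj import replicator

def resolve_auditor_defaults(conf, replicator_config):
    # ``conf`` is the object-auditor section; ``replicator_config`` is the
    # object-replicator section of the same config file, or None when that
    # section could not be parsed (the KeyError/SystemExit case above).
    if replicator_config is None:
        # fall back to a very conservative one-day timeout
        default_rsync_timeout = 86400
    else:
        rsync_timeout = int(replicator_config.get(
            'rsync_timeout', replicator.DEFAULT_RSYNC_TIMEOUT))
        # give the remote replicator 15 extra minutes to kill its rsync
        # before its tempfiles are treated as reclaimable
        default_rsync_timeout = rsync_timeout + 900
        # inherit reclaim_age from the replicator section if the auditor
        # config (including its DEFAULT section) did not already set one
        if 'reclaim_age' in replicator_config:
            conf.setdefault('reclaim_age', replicator_config['reclaim_age'])
    return default_rsync_timeout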
test/unit/obj/test_auditor.py View File

@@ -14,6 +14,7 @@
# limitations under the License.
from test import unit
import six.moves.cPickle as pickle
import unittest
import mock
import os
@@ -23,13 +24,14 @@ from shutil import rmtree
from hashlib import md5
from tempfile import mkdtemp
import textwrap
from os.path import dirname, basename, join
from test.unit import (FakeLogger, patch_policies, make_timestamp_iter,
DEFAULT_TEST_EC_TYPE)
from swift.obj import auditor, replicator
from swift.obj.diskfile import (
DiskFile, write_metadata, invalidate_hash, get_data_dir,
DiskFileManager, ECDiskFileManager, AuditLocation, clear_auditor_status,
get_auditor_status)
get_auditor_status, HASH_FILE, HASH_INVALIDATIONS_FILE)
from swift.common.utils import (
mkdirs, normalize_timestamp, Timestamp, readconf)
from swift.common.storage_policy import (
@@ -328,7 +330,7 @@ class TestAuditor(unittest.TestCase):
[object-auditor]
rsync_tempfile_timeout = auto
"""
with open(os.path.join(self.testdir, 'objserver.conf'), 'w') as f:
with open(config_path, 'w') as f:
f.write(textwrap.dedent(stub_config))
conf = readconf(config_path, 'object-auditor')
auditor_worker = auditor.AuditorWorker(conf, self.logger,
@@ -346,7 +348,7 @@ class TestAuditor(unittest.TestCase):
[object-auditor]
rsync_tempfile_timeout = auto
"""
with open(os.path.join(self.testdir, 'objserver.conf'), 'w') as f:
with open(config_path, 'w') as f:
f.write(textwrap.dedent(stub_config))
conf = readconf(config_path, 'object-auditor')
auditor_worker = auditor.AuditorWorker(conf, self.logger,
@@ -746,6 +748,139 @@ class TestAuditor(unittest.TestCase):
self.auditor.run_audit(**kwargs)
self.assertFalse(os.path.exists(self.disk_file._datadir))
def test_with_tombstone_delete(self):
test_md5 = '098f6bcd4621d373cade4e832627b4f6'
def do_audit(self, timestamp, invalidate=False):
dir_path = self.disk_file._datadir
ts_file = os.path.join(dir_path, '%d.ts' % timestamp)
# Create a .ts file
if not os.path.exists(dir_path):
mkdirs(dir_path)
fp = open(ts_file, 'w')
write_metadata(fp, {'X-Timestamp': '%d' % timestamp})
fp.close()
# Create hashes.pkl
hash = dirname(dirname(ts_file)) # suffix dir that holds the ts file's hash dir
suffix = basename(hash)
hashes_pkl = join(os.path.dirname(hash), HASH_FILE)
with open(hashes_pkl, 'wb') as fp:
pickle.dump({suffix: test_md5}, fp, 0)
# Run auditor
kwargs = {'mode': 'once'}
self.auditor.run_audit(**kwargs)
# Check if hash invalid file exists
hash_invalid = join(dirname(hash), HASH_INVALIDATIONS_FILE)
hash_invalid_exists = os.path.exists(hash_invalid)
# If invalidate, fetch value from hashes.invalid
if invalidate:
with open(hash_invalid, 'rb') as fp:
hash_val = fp.read()
return hash_invalid_exists, hash_val, suffix
return hash_invalid_exists, ts_file
self.auditor = auditor.ObjectAuditor(self.conf)
self.auditor.log_time = 0
now = time.time()
# audit with a recent tombstone
hash_invalid_exists, ts_file = do_audit(self, now - 55)
self.assertFalse(hash_invalid_exists)
os.unlink(ts_file)
# audit with a tombstone that is beyond default reclaim_age
hash_invalid_exists, hash_val, suffix = do_audit(self, now - (604800),
True)
self.assertTrue(hash_invalid_exists)
self.assertEqual(hash_val.strip('\n'), suffix)
def test_auditor_reclaim_age(self):
# if we don't have access to the replicator config section we'll use
# diskfile's default reclaim_age
auditor_worker = auditor.AuditorWorker(self.conf, self.logger,
self.rcache, self.devices)
router = auditor_worker.diskfile_router
for policy in POLICIES:
self.assertEqual(router[policy].reclaim_age, 86400 * 7)
# if the reclaim_age option is set explicitly we use that
self.conf['reclaim_age'] = '1800'
auditor_worker = auditor.AuditorWorker(self.conf, self.logger,
self.rcache, self.devices)
router = auditor_worker.diskfile_router
for policy in POLICIES:
self.assertEqual(router[policy].reclaim_age, 1800)
# if we have a real config we can be a little smarter
config_path = os.path.join(self.testdir, 'objserver.conf')
# if there is no object-replicator section we still have to fall back
# to default because we can't parse the config for that section!
stub_config = """
[object-auditor]
"""
with open(config_path, 'w') as f:
f.write(textwrap.dedent(stub_config))
conf = readconf(config_path, 'object-auditor')
auditor_worker = auditor.AuditorWorker(conf, self.logger,
self.rcache, self.devices)
router = auditor_worker.diskfile_router
for policy in POLICIES:
self.assertEqual(router[policy].reclaim_age, 86400 * 7)
# verify reclaim_age is taken from the auditor config value
stub_config = """
[object-replicator]
[object-auditor]
reclaim_age = 60
"""
with open(config_path, 'w') as f:
f.write(textwrap.dedent(stub_config))
conf = readconf(config_path, 'object-auditor')
auditor_worker = auditor.AuditorWorker(conf, self.logger,
self.rcache, self.devices)
router = auditor_worker.diskfile_router
for policy in POLICIES:
self.assertEqual(router[policy].reclaim_age, 60)
# verify reclaim_age falls back to replicator config value
# if there is no auditor config value
config_path = os.path.join(self.testdir, 'objserver.conf')
stub_config = """
[object-replicator]
reclaim_age = 60
[object-auditor]
"""
with open(config_path, 'w') as f:
f.write(textwrap.dedent(stub_config))
conf = readconf(config_path, 'object-auditor')
auditor_worker = auditor.AuditorWorker(conf, self.logger,
self.rcache, self.devices)
router = auditor_worker.diskfile_router
for policy in POLICIES:
self.assertEqual(router[policy].reclaim_age, 60)
# we'll prefer our own DEFAULT section to the replicator though
self.assertEqual(auditor_worker.rsync_tempfile_timeout,
replicator.DEFAULT_RSYNC_TIMEOUT + 900)
stub_config = """
[DEFAULT]
reclaim_age = 1209600
[object-replicator]
reclaim_age = 1800
[object-auditor]
"""
with open(config_path, 'w') as f:
f.write(textwrap.dedent(stub_config))
conf = readconf(config_path, 'object-auditor')
auditor_worker = auditor.AuditorWorker(conf, self.logger,
self.rcache, self.devices)
router = auditor_worker.diskfile_router
for policy in POLICIES:
self.assertEqual(router[policy].reclaim_age, 1209600)
def test_sleeper(self):
with mock.patch(
'time.sleep', mock.MagicMock()) as mock_sleep:

test/unit/obj/test_diskfile.py View File

@@ -5052,6 +5052,64 @@ class TestSuffixHashes(unittest.TestCase):
hashes = df_mgr.get_hashes('sda1', '0', [], policy)
self.assertEqual(hashes, {})
def test_hash_suffix_one_reclaim_tombstone_with_hash_pkl(self):
for policy in self.iter_policies():
df_mgr = self.df_router[policy]
df = df_mgr.get_diskfile(
'sda1', '0', 'a', 'c', 'o', policy=policy)
suffix_dir = os.path.dirname(df._datadir)
part_dir = os.path.dirname(suffix_dir)
hash_file = os.path.join(part_dir, diskfile.HASH_FILE)
# scale back reclaim age a bit
df_mgr.reclaim_age = 1000
# write a tombstone that's just a *little* older
old_time = time() - 1001
timestamp = Timestamp(old_time)
df.delete(timestamp.internal)
hashes = df_mgr.get_hashes('sda1', '0', [], policy)
# sanity
self.assertEqual(hashes, {})
self.assertFalse(os.path.exists(df._datadir))
hash_timestamp = os.stat(hash_file).st_mtime
# if hashes.pkl exists, that .ts file is not reclaimed
df = df_mgr.get_diskfile(
'sda1', '0', 'a', 'c', 'o', policy=policy)
df.delete(timestamp.internal)
hashes = df_mgr.get_hashes('sda1', '0', [], policy)
# this was a cached value, so the result still looks empty
self.assertEqual(hashes, {})
# and hashes.pkl is not touched
self.assertEqual(hash_timestamp, os.stat(hash_file).st_mtime)
# and we still have the tombstone entry
tombstone = '%s.ts' % timestamp.internal
self.assertTrue(os.path.exists(df._datadir))
self.assertIn(tombstone, os.listdir(df._datadir))
# However if we call invalidate_hash for the suffix dir,
# get_hashes can reclaim the tombstone
with mock.patch('swift.obj.diskfile.lock_path'):
df_mgr.invalidate_hash(suffix_dir)
hashes = df_mgr.get_hashes('sda1', '0', [], policy)
self.assertEqual(hashes, {})
# If we have no other objects in the suffix, get_hashes
# doesn't reclaim anything
self.assertTrue(os.path.exists(df._datadir))
self.assertIn(tombstone, os.listdir(df._datadir))
self.assertEqual(hash_timestamp, os.stat(hash_file).st_mtime)
# *BUT* if the suffix value is given to recalc, it can force a reclaim!
suffix = os.path.basename(suffix_dir)
hashes = df_mgr.get_hashes('sda1', '0', [suffix], policy)
self.assertFalse(os.path.exists(df._datadir))
# hashes.pkl was updated
self.assertGreater(os.stat(hash_file).st_mtime, hash_timestamp)
def test_hash_suffix_one_reclaim_and_one_valid_tombstone(self):
for policy in self.iter_policies():
paths, suffix = find_paths_with_matching_suffixes(2, 1)