swift-recon: Added oldest and most recent repl

I've been doing this with cluster-wide log searches for far too long.
This adds support for reporting the oldest replication pass
completion as well as the most recent. This is quite useful for
finding those odd replicators that have hung up for some reason and
need intervention.

Change-Id: I7fd7260eca162d6b085f3e82aaa3cf90670f2d53
gholt committed 2013-01-12 04:30:16 +00:00
commit a88b412e17 (parent b8626f9667)
5 changed files with 105 additions and 15 deletions
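For a sense of what the new report computes, here is a minimal standalone sketch of the same bookkeeping the patch does while scanning recon responses: remember the smallest and largest replication_last timestamp seen and which host reported it. The hosts and timestamps below are made up for illustration; the real logic is in the hunks that follow.

import time

# Hypothetical replication_last values (epoch seconds) reported per node.
replication_last = {
    '10.1.1.1:6002': 1357969645.25,
    '10.1.1.2:6002': 1357880000.00,
    '10.1.1.3:6002': 1357969000.00,
}

oldest_host = min(replication_last, key=replication_last.get)
newest_host = max(replication_last, key=replication_last.get)
print('Oldest completion: %s, %.0f seconds ago' % (
    oldest_host, time.time() - replication_last[oldest_host]))
print('Most recent completion: %s, %.0f seconds ago' % (
    newest_host, time.time() - replication_last[newest_host]))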


@@ -19,6 +19,21 @@ import sys
 import os
 
 
+def seconds2timeunit(seconds):
+    elapsed = seconds
+    unit = 'seconds'
+    if elapsed >= 60:
+        elapsed = elapsed / 60.0
+        unit = 'minutes'
+        if elapsed >= 60:
+            elapsed = elapsed / 60.0
+            unit = 'hours'
+            if elapsed >= 24:
+                elapsed = elapsed / 24.0
+                unit = 'days'
+    return elapsed, unit
+
+
 class Scout(object):
     """
     Obtain swift recon information
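As a quick sanity check of the helper above (inputs chosen here for illustration, they are not part of the change), the conversion climbs through the units and returns the scaled value together with its unit name:

>>> seconds2timeunit(42)
(42, 'seconds')
>>> seconds2timeunit(120)
(2.0, 'minutes')
>>> seconds2timeunit(7200)
(2.0, 'hours')
>>> seconds2timeunit(172800)
(2.0, 'days')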
@@ -261,6 +276,10 @@ class SwiftRecon(object):
         recon = Scout("replication/%s" % self.server_type, self.verbose,
                       self.suppress_errors, self.timeout)
         print "[%s] Checking on replication" % self._ptime()
+        least_recent_time = 9999999999
+        least_recent_url = None
+        most_recent_time = 0
+        most_recent_url = None
         for url, response, status in self.pool.imap(recon.scout, hosts):
             if status == 200:
                 stats['replication_time'].append(
@@ -269,6 +288,13 @@ class SwiftRecon(object):
                 if repl_stats:
                     for stat_key in ['attempted', 'failure', 'success']:
                         stats[stat_key].append(repl_stats.get(stat_key))
+                last = response.get('replication_last', 0)
+                if last < least_recent_time:
+                    least_recent_time = last
+                    least_recent_url = url
+                if last > most_recent_time:
+                    most_recent_time = last
+                    most_recent_url = url
         for k in stats:
             if stats[k]:
                 if k != 'replication_time':
@@ -282,6 +308,25 @@ class SwiftRecon(object):
                     print "[%s] - No hosts returned valid data." % k
             else:
                 print "[%s] - No hosts returned valid data." % k
+        if least_recent_url is not None:
+            host = urlparse(least_recent_url).netloc
+            if not least_recent_time:
+                print 'Oldest completion was NEVER by %s.' % host
+            else:
+                elapsed = time.time() - least_recent_time
+                elapsed, elapsed_unit = seconds2timeunit(elapsed)
+                print 'Oldest completion was %s (%d %s ago) by %s.' % (
+                    time.strftime('%Y-%m-%d %H:%M:%S',
+                                  time.gmtime(least_recent_time)),
+                    elapsed, elapsed_unit, host)
+        if most_recent_url is not None:
+            host = urlparse(most_recent_url).netloc
+            elapsed = time.time() - most_recent_time
+            elapsed, elapsed_unit = seconds2timeunit(elapsed)
+            print 'Most recent completion was %s (%d %s ago) by %s.' % (
+                time.strftime('%Y-%m-%d %H:%M:%S',
+                              time.gmtime(most_recent_time)),
+                elapsed, elapsed_unit, host)
         print "=" * 79
 
     def object_replication_check(self, hosts):
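With that in place, the tail of the account/container replication report gains two extra lines. The host and timestamps below only illustrate the format produced by the prints above; they are not captured output:

Oldest completion was 2013-01-11 06:30:16 (22 hours ago) by 10.1.1.2:6002.
Most recent completion was 2013-01-12 04:05:16 (25 minutes ago) by 10.1.1.1:6002.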
@@ -295,9 +340,20 @@ class SwiftRecon(object):
         recon = Scout("replication", self.verbose, self.suppress_errors,
                       self.timeout)
         print "[%s] Checking on replication" % self._ptime()
+        least_recent_time = 9999999999
+        least_recent_url = None
+        most_recent_time = 0
+        most_recent_url = None
         for url, response, status in self.pool.imap(recon.scout, hosts):
             if status == 200:
                 stats[url] = response['object_replication_time']
+                last = response.get('object_replication_last', 0)
+                if last < least_recent_time:
+                    least_recent_time = last
+                    least_recent_url = url
+                if last > most_recent_time:
+                    most_recent_time = last
+                    most_recent_url = url
         times = [x for x in stats.values() if x is not None]
         if len(stats) > 0 and len(times) > 0:
             computed = self._gen_stats(times, 'replication_time')
@@ -307,6 +363,25 @@ class SwiftRecon(object):
                 print "[replication_time] - No hosts returned valid data."
         else:
             print "[replication_time] - No hosts returned valid data."
+        if least_recent_url is not None:
+            host = urlparse(least_recent_url).netloc
+            if not least_recent_time:
+                print 'Oldest completion was NEVER by %s.' % host
+            else:
+                elapsed = time.time() - least_recent_time
+                elapsed, elapsed_unit = seconds2timeunit(elapsed)
+                print 'Oldest completion was %s (%d %s ago) by %s.' % (
+                    time.strftime('%Y-%m-%d %H:%M:%S',
+                                  time.gmtime(least_recent_time)),
+                    elapsed, elapsed_unit, host)
+        if most_recent_url is not None:
+            host = urlparse(most_recent_url).netloc
+            elapsed = time.time() - most_recent_time
+            elapsed, elapsed_unit = seconds2timeunit(elapsed)
+            print 'Most recent completion was %s (%d %s ago) by %s.' % (
+                time.strftime('%Y-%m-%d %H:%M:%S',
+                              time.gmtime(most_recent_time)),
+                elapsed, elapsed_unit, host)
         print "=" * 79
 
     def updater_check(self, hosts):


@@ -194,7 +194,8 @@ class Replicator(Daemon):
                          % self.stats)
         dump_recon_cache(
             {'replication_stats': self.stats,
-             'replication_time': time.time() - self.stats['start']},
+             'replication_time': time.time() - self.stats['start'],
+             'replication_last': time.time()},
             self.rcache, self.logger)
         self.logger.info(' '.join(['%s:%s' % item for item in
             self.stats.items() if item[0] in
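After a pass, the recon cache file this writes to (e.g. /var/cache/swift/account.recon, the same path exercised by the tests further down) ends up holding JSON along these lines; the numbers are invented for illustration:

{"replication_stats": {"attempted": 1, "diff": 0, "failure": 0, "no_change": 2,
                       "remove": 0, "rsync": 0, "start": 1357969645.0,
                       "success": 2, "ts_repl": 0},
 "replication_time": 0.26,
 "replication_last": 1357969645.25}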


@@ -122,14 +122,17 @@ class ReconMiddleware(object):
         """get replication info"""
         if recon_type == 'account':
             return self._from_recon_cache(['replication_time',
-                                           'replication_stats'],
+                                           'replication_stats',
+                                           'replication_last'],
                                           self.account_recon_cache)
         elif recon_type == 'container':
             return self._from_recon_cache(['replication_time',
-                                           'replication_stats'],
+                                           'replication_stats',
+                                           'replication_last'],
                                           self.container_recon_cache)
         elif recon_type == 'object':
-            return self._from_recon_cache(['object_replication_time'],
+            return self._from_recon_cache(['object_replication_time',
+                                           'object_replication_last'],
                                           self.object_recon_cache)
         else:
             return None
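Since the recon middleware answers a plain GET with JSON, the new field can also be checked by hand without swift-recon. A rough sketch, with a made-up host (6002 is the usual account-server port); the /recon/replication/account path is my reading of the "replication/%s" request the Scout above builds:

import json
import urllib2  # Python 2, matching the rest of the tree

reply = urllib2.urlopen('http://10.1.1.1:6002/recon/replication/account')
data = json.loads(reply.read())
# Epoch seconds of the most recently completed replication pass on that node.
print(data.get('replication_last'))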


@@ -647,7 +647,8 @@ class ObjectReplicator(Daemon):
         self.logger.info(
             _("Object replication complete (once). (%.02f minutes)"), total)
         if not (override_partitions or override_devices):
-            dump_recon_cache({'object_replication_time': total},
+            dump_recon_cache({'object_replication_time': total,
+                              'object_replication_last': time.time()},
                              self.rcache, self.logger)
 
     def run_forever(self, *args, **kwargs):
@@ -661,7 +662,8 @@ class ObjectReplicator(Daemon):
             total = (time.time() - start) / 60
             self.logger.info(
                 _("Object replication complete. (%.02f minutes)"), total)
-            dump_recon_cache({'object_replication_time': total},
+            dump_recon_cache({'object_replication_time': total,
+                              'object_replication_last': time.time()},
                              self.rcache, self.logger)
             self.logger.debug(_('Replication sleeping for %s seconds.'),
                               self.run_pause)
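Worth noting from the two hunks above: object_replication_time keeps its existing meaning of a pass duration in minutes (total is divided by 60), while the new object_replication_last is a plain epoch timestamp of when the pass finished. The object recon cache therefore ends up looking roughly like this (values borrowed from the test below):

{"object_replication_time": 200.0,
 "object_replication_last": 1357962809.15}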


@@ -388,11 +388,13 @@ class TestReconSuccess(TestCase):
                                    "remove": 0, "rsync": 0,
                                    "start": 1333044050.855202,
                                    "success": 2, "ts_repl": 0 },
-                               "replication_time": 0.2615511417388916}
+                               "replication_time": 0.2615511417388916,
+                               "replication_last": 1357969645.25}
         self.fakecache.fakeout = from_cache_response
         rv = self.app.get_replication_info('account')
         self.assertEquals(self.fakecache.fakeout_calls,
-                          [((['replication_time', 'replication_stats'],
+                          [((['replication_time', 'replication_stats',
+                              'replication_last'],
                              '/var/cache/swift/account.recon'), {})])
         self.assertEquals(rv, {"replication_stats": {
                                    "attempted": 1, "diff": 0,
@@ -402,7 +404,8 @@ class TestReconSuccess(TestCase):
                                    "remove": 0, "rsync": 0,
                                    "start": 1333044050.855202,
                                    "success": 2, "ts_repl": 0 },
-                               "replication_time": 0.2615511417388916})
+                               "replication_time": 0.2615511417388916,
+                               "replication_last": 1357969645.25})
 
     def test_get_replication_info_container(self):
         from_cache_response = {"replication_time": 200.0,
@@ -413,12 +416,14 @@ class TestReconSuccess(TestCase):
                                    "no_change": 358, "remote_merge": 0,
                                    "remove": 0, "rsync": 0,
                                    "start": 5.5, "success": 358,
-                                   "ts_repl": 0}}
+                                   "ts_repl": 0},
+                               "replication_last": 1357969645.25}
         self.fakecache.fakeout_calls = []
         self.fakecache.fakeout = from_cache_response
         rv = self.app.get_replication_info('container')
         self.assertEquals(self.fakecache.fakeout_calls,
-                          [((['replication_time', 'replication_stats'],
+                          [((['replication_time', 'replication_stats',
+                              'replication_last'],
                              '/var/cache/swift/container.recon'), {})])
         self.assertEquals(rv, {"replication_time": 200.0,
                                "replication_stats": {
@@ -428,17 +433,21 @@ class TestReconSuccess(TestCase):
                                    "no_change": 358, "remote_merge": 0,
                                    "remove": 0, "rsync": 0,
                                    "start": 5.5, "success": 358,
-                                   "ts_repl": 0}})
+                                   "ts_repl": 0},
+                               "replication_last": 1357969645.25})
 
     def test_get_replication_object(self):
-        from_cache_response = {"object_replication_time": 200.0}
+        from_cache_response = {"object_replication_time": 200.0,
+                               "object_replication_last": 1357962809.15}
         self.fakecache.fakeout_calls = []
         self.fakecache.fakeout = from_cache_response
         rv = self.app.get_replication_info('object')
         self.assertEquals(self.fakecache.fakeout_calls,
-                          [((['object_replication_time'],
+                          [((['object_replication_time',
+                              'object_replication_last'],
                              '/var/cache/swift/object.recon'), {})])
-        self.assertEquals(rv, {'object_replication_time': 200.0})
+        self.assertEquals(rv, {'object_replication_time': 200.0,
+                               'object_replication_last': 1357962809.15})
 
     def test_get_updater_info_container(self):
         from_cache_response = {"container_updater_sweep": 18.476239919662476}