#! /usr/bin/env python """ cmdline utility to perform cluster reconnaissance """ from eventlet.green import urllib2 from swift.common.ring import Ring import simplejson as json from hashlib import md5 import datetime import eventlet import optparse import sys import os VERBOSE = False SUPPRESS_ERRORS = False def get_devices(zone_filter, ring_file): ring_data = Ring(ring_file) if zone_filter: ips = set((n['ip'], n['port']) for n in ring_data.devs if n \ if n['zone'] == zone_filter) else: ips = set((n['ip'], n['port']) for n in ring_data.devs if n) return ips def scout(base_url, recon_type): global VERBOSE, SUPPRESS_ERRORS url = base_url + recon_type try: body = urllib2.urlopen(url).read() content = json.loads(body) if VERBOSE: print "-> %s: %s" % (url, content) status = 200 except urllib2.HTTPError as e: if not SUPPRESS_ERRORS or VERBOSE: print "-> %s: %s" % (url, e) content = e status = e.code except urllib2.URLError as e: if not SUPPRESS_ERRORS or VERBOSE: print "-> %s: %s" % (url, e) content = e status = -1 return url, content, status def scout_md5(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "ringmd5") return url, content, status def scout_async(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "async") return url, content, status def scout_replication(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "replication") return url, content, status def scout_load(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "load") return url, content, status def scout_du(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "diskusage") return url, content, status def scout_umount(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "unmounted") return url, content, status def scout_quarantine(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "quarantined") return url, content, status def scout_sockstat(host): base_url = "http://%s:%s/recon/" % (host[0], host[1]) url, content, status = scout(base_url, "sockstat") return url, content, status def get_ringmd5(hosts, ringfile): stats = {} matches = 0 errors = 0 md5sum = md5() with open(ringfile, 'rb') as f: block = f.read(4096) while block: md5sum.update(block) block = f.read(4096) ring_sum = md5sum.hexdigest() pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking ring md5sum's on %s hosts..." % (now, len(hosts)) if VERBOSE: print "-> On disk md5sum: %s" % ring_sum for url, response, status in pool.imap(scout_md5, hosts): if status == 200: #fixme - need to grab from config stats[url] = response[ringfile] if response[ringfile] != ring_sum: ringsmatch = False print "!! %s (%s) doesn't match on disk md5sum" % \ (url, response[ringfile]) else: matches = matches + 1 if VERBOSE: print "-> %s matches." % url else: errors = errors + 1 print "%s/%s hosts matched, %s error[s] while checking hosts." % \ (matches, len(hosts), errors) print "=" * 79 def async_check(hosts): stats = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking async pendings on %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_async, hosts): if status == 200: stats[url] = response['async_pending'] if len(stats) > 0: low = min(stats.values()) high = max(stats.values()) total = sum(stats.values()) average = total / len(stats) print "Async stats: low: %d, high: %d, avg: %d, total: %d" % (low, high, average, total) else: print "Error: No hosts available or returned valid information." print "=" * 79 def umount_check(hosts): stats = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Getting unmounted drives from %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_umount, hosts): if status == 200: for i in response: stats[url] = i['device'] for host in stats: print "Not mounted: %s on %s" % (stats[host], host) print "=" * 79 def replication_check(hosts): stats = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking replication times on %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_replication, hosts): if status == 200: stats[url] = response['object_replication_time'] if len(stats) > 0: low = min(stats.values()) high = max(stats.values()) total = sum(stats.values()) average = total / len(stats) print "[Replication Times] shortest: %s, longest: %s, avg: %s" % \ (low, high, average) else: print "Error: No hosts available or returned valid information." print "=" * 79 def load_check(hosts): load1 = {} load5 = {} load15 = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking load avg's on %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_load, hosts): if status == 200: load1[url] = response['1m'] load5[url] = response['5m'] load15[url] = response['15m'] stats = {"1m": load1, "5m": load5, "15m": load15} for item in stats: if len(stats[item]) > 0: low = min(stats[item].values()) high = max(stats[item].values()) total = sum(stats[item].values()) average = total / len(stats[item]) print "[%s load average] lowest: %s, highest: %s, avg: %s" % \ (item, low, high, average) else: print "Error: No hosts available or returned valid information." print "=" * 79 def quarantine_check(hosts): objq = {} conq = {} acctq = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking quarantine dirs on %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_quarantine, hosts): if status == 200: objq[url] = response['objects'] conq[url] = response['containers'] acctq[url] = response['accounts'] stats = {"objects": objq, "containers": conq, "accounts": acctq} for item in stats: if len(stats[item]) > 0: low = min(stats[item].values()) high = max(stats[item].values()) total = sum(stats[item].values()) average = total / len(stats[item]) print "[Quarantined %s] low: %d, high: %d, avg: %d, total: %d" % \ (item, low, high, average, total) else: print "Error: No hosts available or returned valid information." print "=" * 79 def socket_usage(hosts): inuse4 = {} mem = {} inuse6 = {} timewait = {} orphan = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking socket usage on %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_sockstat, hosts): if status == 200: inuse4[url] = response['tcp_in_use'] mem[url] = response['tcp_mem_allocated_bytes'] inuse6[url] = response['tcp6_in_use'] timewait[url] = response['time_wait'] orphan[url] = response['orphan'] stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem, \ "tcp6_in_use": inuse6, "time_wait": timewait, "orphan": orphan} for item in stats: if len(stats[item]) > 0: low = min(stats[item].values()) high = max(stats[item].values()) total = sum(stats[item].values()) average = total / len(stats[item]) print "[%s] low: %d, high: %d, avg: %d, total: %d" % \ (item, low, high, average, total) else: print "Error: No hosts or info available." print "=" * 79 def disk_usage(hosts): stats = {} highs = [] lows = [] averages = [] percents = {} pool = eventlet.GreenPool(20) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print "[%s] Checking disk usage on %s hosts..." % (now, len(hosts)) for url, response, status in pool.imap(scout_du, hosts): if status == 200: hostusage = [] for entry in response: if entry['mounted']: used = float(entry['used']) / float(entry['size']) * 100.0 hostusage.append(round(used, 2)) stats[url] = hostusage for url in stats: if len(stats[url]) > 0: #get per host hi/los for another day low = min(stats[url]) high = max(stats[url]) total = sum(stats[url]) average = total / len(stats[url]) highs.append(high) lows.append(low) averages.append(average) for percent in stats[url]: percents[int(percent)] = percents.get(int(percent), 0) + 1 else: print "-> %s: Error. No drive info available." % url if len(lows) > 0: low = min(lows) high = max(highs) average = sum(averages) / len(averages) #distrib graph shamelessly stolen from https://github.com/gholt/tcod print "Distribution Graph:" mul = 69.0 / max(percents.values()) for percent in sorted(percents): print '% 3d%% % 4d %s' % (percent, percents[percent], \ '*' * int(percents[percent] * mul)) print "Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" % \ (low, high, average) else: print "Error: No hosts available or returned valid information." print "=" * 79 def main(): global VERBOSE, SUPPRESS_ERRORS, swift_dir, pool print "=" * 79 usage = ''' usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--objmd5] ''' args = optparse.OptionParser(usage) args.add_option('--verbose', '-v', action="store_true", help="Print verbose info") args.add_option('--suppress', action="store_true", help="Suppress most connection related errors") args.add_option('--async', '-a', action="store_true", help="Get async stats") args.add_option('--replication', '-r', action="store_true", help="Get replication stats") args.add_option('--unmounted', '-u', action="store_true", help="Check cluster for unmounted devices") args.add_option('--diskusage', '-d', action="store_true", help="Get disk usage stats") args.add_option('--loadstats', '-l', action="store_true", help="Get cluster load average stats") args.add_option('--quarantined', '-q', action="store_true", help="Get cluster quarantine stats") args.add_option('--objmd5', action="store_true", help="Get md5sums of object.ring.gz and compare to local copy") args.add_option('--sockstat', action="store_true", help="Get cluster socket usage stats") args.add_option('--all', action="store_true", help="Perform all checks. Equivalent to -arudlq --objmd5 --sockstat") args.add_option('--zone', '-z', type="int", help="Only query servers in specified zone") args.add_option('--swiftdir', default="/etc/swift", help="Default = /etc/swift") options, arguments = args.parse_args() if len(sys.argv) <= 1: args.print_help() swift_dir = options.swiftdir obj_ring = os.path.join(swift_dir, 'object.ring.gz') con_ring = os.path.join(swift_dir, 'container.ring.gz') acct_ring = os.path.join(swift_dir, 'account.ring.gz') VERBOSE = options.verbose SUPPRESS_ERRORS = options.suppress if options.zone: hosts = get_devices(options.zone, obj_ring) else: hosts = get_devices(None, obj_ring) if options.all: async_check(hosts) umount_check(hosts) replication_check(hosts) load_check(hosts) disk_usage(hosts) get_ringmd5(hosts, obj_ring) quarantine_check(hosts) socket_usage(hosts) else: if options.async: async_check(hosts) if options.unmounted: umount_check(hosts) if options.replication: replication_check(hosts) if options.loadstats: load_check(hosts) if options.diskusage: disk_usage(hosts) if options.objmd5: get_ringmd5(hosts, obj_ring) if options.quarantined: quarantine_check(hosts) if options.sockstat: socket_usage(hosts) if __name__ == '__main__': try: main() except KeyboardInterrupt: print '\n'