Merge "Support null values on recon/replication module"

Authored by Jenkins on 2017-08-16 16:32:32 +00:00, committed by Gerrit Code Review
commit 56cf0e8210
2 changed files with 465 additions and 33 deletions


@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright (C) 2014 Canonical
# Copyright (C) 2014, 2017 Canonical
# All Rights Reserved
# Author: Jacek Nykis
@@ -35,7 +35,7 @@ def check_md5(base_url):
results = []
try:
data = urllib2.urlopen(url).read()
j = json.loads(data)
ringmd5_info = json.loads(data)
except urllib2.URLError:
return [(STATUS_UNKNOWN, "Can't open url: {}".format(url))]
except ValueError:
@@ -43,9 +43,9 @@ def check_md5(base_url):
for ringfile in ringfiles:
try:
if generate_md5(ringfile) != j[ringfile]:
results.append((STATUS_CRIT,
"Ringfile {} MD5 sum mismatch".format(ringfile)))
if generate_md5(ringfile) != ringmd5_info[ringfile]:
results.append((STATUS_CRIT, "Ringfile {} MD5 sum "
"mismatch".format(ringfile)))
except IOError:
results.append(
(STATUS_UNKNOWN, "Can't open ringfile {}".format(ringfile)))
@@ -55,6 +55,41 @@ def check_md5(base_url):
return [(STATUS_OK, "OK")]
def repl_last_timestamp(repl_info):
"""Verifies that values are not null (float expected)
@returns (timedelta, counter)
ie. if /var/cache/swift/* has no swift uid/gid,
syslog will show errors and all replication values will
return None
{
"replication_last": None,
"replication_stats": None,
"object_replication_last": None,
"replication_time": None,
"object_replication_time": None,
}
"""
delta = None
repl_last = (repl_info.get('replication_last') or
repl_info.get('object_replication_last'))
if repl_last:
delta = datetime.datetime.now() - \
datetime.datetime.fromtimestamp(repl_last)
if 'replication_stats' not in repl_info:
# no info shared by swift; considered OK
repl_failures = None
else:
# raise error (-1) if NULL value or 'failure' attr does not exist
repl_failures = (repl_info.get('replication_stats', {})
.get('failure', -1))
return (delta, repl_failures)
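
For illustration only (not part of the commit), here is what the new helper returns for the payload shapes described in its docstring; the import path mirrors the test suite further down and is otherwise an assumption about the charm layout.

```python
# Editorial sketch: expected return values of repl_last_timestamp().
import sys
sys.path.append('files/nrpe-external-master')  # as done in the test suite
from check_swift_storage import repl_last_timestamp

# Object servers expose only a timestamp and no replication_stats section:
# the failure counter comes back as None ("no info shared by swift").
delta, failures = repl_last_timestamp({'object_replication_last': 1493299546.6})
assert failures is None   # delta is a datetime.timedelta (now - last run)

# Account/container servers expose a stats dict; a missing 'failure'
# attribute maps to the -1 sentinel so the caller can go CRITICAL.
assert repl_last_timestamp({'replication_last': 1493299546.6,
                            'replication_stats': {}})[1] == -1

# With no usable timestamp and no stats section at all, both values are empty.
assert repl_last_timestamp({'replication_last': None}) == (None, None)
```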
def check_replication(base_url, limits):
types = ["account", "object", "container"]
results = []
@@ -62,7 +97,7 @@ def check_replication(base_url, limits):
url = base_url + "replication/" + repl
try:
data = urllib2.urlopen(url).read()
j = json.loads(data)
repl_info = json.loads(data)
except urllib2.URLError:
results.append((STATUS_UNKNOWN, "Can't open url: {}".format(url)))
continue
@@ -70,25 +105,34 @@ def check_replication(base_url, limits):
results.append((STATUS_UNKNOWN, "Can't parse status data"))
continue
if "object_replication_last" in j:
repl_last = datetime.datetime.fromtimestamp(j["object_replication_last"])
else:
repl_last = datetime.datetime.fromtimestamp(j["replication_last"])
delta = datetime.datetime.now() - repl_last
if delta.seconds >= limits[1]:
delta, repl_failures = repl_last_timestamp(repl_info)
if not delta:
results.append((STATUS_CRIT,
"'{}' replication lag is {} seconds".format(repl, delta.seconds)))
"'{}' replication lag not working "
'(perms issue? check syslog)'.format(repl)))
elif delta.seconds >= limits[1]:
results.append((STATUS_CRIT,
"'{}' replication lag is {} "
"seconds".format(repl, delta.seconds)))
elif delta.seconds >= limits[0]:
results.append((STATUS_WARN,
"'{}' replication lag is {} seconds".format(repl, delta.seconds)))
if "replication_stats" in j:
errors = j["replication_stats"]["failure"]
if errors >= limits[3]:
results.append(
(STATUS_CRIT, "{} replication failures".format(errors)))
elif errors >= limits[2]:
results.append(
(STATUS_WARN, "{} replication failures".format(errors)))
"'{}' replication lag is {} "
"seconds".format(repl, delta.seconds)))
if not repl_failures:
pass
elif repl_failures >= limits[3]:
results.append((STATUS_CRIT,
"{} replication failures".format(repl_failures)))
elif repl_failures >= limits[2]:
results.append((STATUS_WARN,
"{} replication failures".format(repl_failures)))
elif repl_failures == -1 and delta:
# XXX(aluria): only raised if repl lag err was not triggered
results.append((STATUS_CRIT,
"replication failures counter is NULL "
"(check syslog)"))
if results:
return results
else:
@@ -98,18 +142,19 @@ def check_replication(base_url, limits):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Check swift-storage health')
parser.add_argument('-H', '--host', dest='host', default='localhost',
help='Hostname to query')
parser.add_argument('-p', '--port', dest='port', default='6000',
type=int, help='Port number')
help='Hostname to query')
parser.add_argument('-p', '--port', dest='port', default='6000', type=int,
help='Port number')
parser.add_argument('-r', '--replication', dest='check_replication',
type=int, nargs=4, help='Check replication status',
metavar=('lag_warn', 'lag_crit', 'failures_warn', 'failures_crit'))
type=int, nargs=4, help='Check replication status',
metavar=('lag_warn', 'lag_crit', 'failures_warn',
'failures_crit'))
parser.add_argument('-m', '--md5', dest='check_md5', action='store_true',
help='Compare server rings md5sum with local copy')
help='Compare server rings md5sum with local copy')
args = parser.parse_args()
if not args.check_replication and not args.check_md5:
print "You must use -r or -m switch"
print('You must use -r or -m switch')
sys.exit(STATUS_UNKNOWN)
base_url = "http://{}:{}/recon/".format(args.host, args.port)
@@ -123,14 +168,14 @@ if __name__ == '__main__':
warns = ';'.join([i[1] for i in results if i[0] == STATUS_WARN])
unknowns = ';'.join([i[1] for i in results if i[0] == STATUS_UNKNOWN])
if crits:
print "CRITICAL: " + crits
print("CRITICAL: {}".format(crits))
sys.exit(STATUS_CRIT)
elif warns:
print "WARNING: " + warns
print("WARNING: {}".format(warns))
sys.exit(STATUS_WARN)
elif unknowns:
print "UNKNOWN: " + unknowns
print("UNKNOWN: {}".format(unknowns))
sys.exit(STATUS_UNKNOWN)
else:
print "OK"
print("OK")
sys.exit(0)
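
For reference, an illustrative sketch (not part of the commit) of what the `__main__` block above does when the plugin is invoked as `check_swift_storage.py -H localhost -p 6000 -r 300 600 5 10 -m`; the thresholds are example values, not defaults shipped with the plugin.

```python
# Editorial sketch of the plugin's flow for the example invocation above.
import sys
sys.path.append('files/nrpe-external-master')
from check_swift_storage import check_md5, check_replication

base_url = "http://{}:{}/recon/".format('localhost', 6000)
# -r lag_warn lag_crit failures_warn failures_crit
results = check_replication(base_url, [300, 600, 5, 10])
# -m: compare local ring md5sums against the recon report
results += check_md5(base_url)

# Each entry is a (status, message) tuple; CRITICAL messages are printed
# first (exit 2), then WARNING (1), then UNKNOWN (3), otherwise OK (0).
print(results)
```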


@@ -0,0 +1,387 @@
# Copyright 2017 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import sys
import unittest
import urllib2
from mock import (
patch,
mock_open,
MagicMock,
Mock,
PropertyMock,
)
sys.path.append('files/nrpe-external-master')
from check_swift_storage import (
check_md5,
check_replication,
generate_md5,
repl_last_timestamp,
)
STATUS_OK = 0
STATUS_WARN = 1
STATUS_CRIT = 2
STATUS_UNKNOWN = 3
class NewDate(datetime.datetime):
@classmethod
def now(cls):
"""
Mock for builtin datetime.datetime.now(), to test repl_last_timestamp()
Non-defined methods are inherited from real class
"""
return cls(2017, 4, 27,
13, 25, 46, 629282)
@classmethod
def fromtimestamp(cls, timestamp):
return cls.utcfromtimestamp(timestamp)
datetime.datetime = NewDate
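
A note on the clock pinning above (editorial, not part of the commit): `NewDate.now()` is frozen to the instant that corresponds to the recon timestamp used throughout the fixtures, and `fromtimestamp()` is redirected to `utcfromtimestamp()` so the comparison is timezone-independent; as a result the delta computed by `repl_last_timestamp()` against the fixture data is exactly zero.

```python
# Editorial check of the pinned values: 1493299546.629282 seconds after the
# epoch is 2017-04-27 13:25:46.629282 UTC, i.e. exactly what NewDate.now()
# returns, so now() - fromtimestamp(ts) == timedelta(0) in the tests below.
import datetime

ts = 1493299546.629282
assert (datetime.datetime.utcfromtimestamp(ts) ==
        datetime.datetime(2017, 4, 27, 13, 25, 46, 629282))
```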
class CheckSwiftStorageTestCase(unittest.TestCase):
def test_generate_md5(self):
"""
Ensure md5 checksum is generated from a file content
"""
with patch("__builtin__.open", mock_open(read_data='data')) as \
mock_file:
result = generate_md5('path/to/file')
mock_file.assert_called_with('path/to/file', 'rb')
# md5 hash for 'data' string
self.assertEqual(result,
'8d777f385d3dfec8815d20f7496026dc')
@patch('urllib2.urlopen')
def test_check_md5_unknown_urlerror(self, mock_urlopen):
"""
Force urllib2.URLError to test try-except
"""
base_url = 'http://localhost:6000/recon/'
url = '{}ringmd5'.format(base_url)
error = 'connection refused'
mock_urlopen.side_effect = urllib2.URLError(Mock(return_value=error))
result = check_md5(base_url)
self.assertEqual(result,
[(STATUS_UNKNOWN,
"Can't open url: {}".format(url))])
@patch('urllib2.urlopen')
def test_check_md5_unknown_valueerror1(self, mock_urlopen):
"""
Force ValueError on urllib2 to test try-except
"""
base_url = 'asdfasdf'
url = '{}ringmd5'.format(base_url)
mock_urlopen.side_effect = ValueError(Mock(return_value=''))
result = check_md5(base_url)
mock_urlopen.assert_called_with(url)
self.assertEqual(result,
[(STATUS_UNKNOWN,
"Can't parse status data")])
@patch('urllib2.urlopen')
def test_check_md5_unknown_valueerror2(self, mock_urlopen):
"""
Force ValueError on json to test try-catch
"""
jdata = PropertyMock(return_value='X')
mock_urlopen.return_value = MagicMock(read=jdata)
result = check_md5('.')
mock_urlopen.assert_called_with('.ringmd5')
self.assertEqual(result,
[(STATUS_UNKNOWN,
"Can't parse status data")])
@patch('check_swift_storage.generate_md5')
def test_check_md5_unknown_ioerror(self, mock_generate_md5):
"""
Force IOError (reading file) to test try-catch
"""
jdata = '{"/etc/swift/object.ring.gz": ' \
'"6b4f3a0ef3731f18291ecd053ce0d9b6", ' \
'"/etc/swift/account.ring.gz": ' \
'"93fc4ae496a7343362ebf13988a137e7", ' \
'"/etc/swift/container.ring.gz": ' \
'"0ea1ec9585ef644ce2b5c5b1dced4128"}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_generate_md5.side_effect = IOError()
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_md5('.')
mock_urlopen.assert_called_with('.ringmd5')
expected_result = [(STATUS_UNKNOWN,
"Can't open ringfile "
"/etc/swift/{}.ring.gz".format(name))
for name in ('object', 'account', 'container')]
self.assertEqual(result, expected_result)
@patch('check_swift_storage.generate_md5')
def test_check_md5_crit_md5sum_mismatch(self, mock_generate_md5):
"""
Ensure md5 checksums match, STATUS_CRIT
"""
jdata = '{"/etc/swift/object.ring.gz": ' \
'"6b4f3a0ef3731f18291ecd053ce0d9b6", ' \
'"/etc/swift/account.ring.gz": ' \
'"93fc4ae496a7343362ebf13988a137e7", ' \
'"/etc/swift/container.ring.gz": ' \
'"0ea1ec9585ef644ce2b5c5b1dced4128"}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_generate_md5.return_value = 'xxxx'
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_md5('.')
mock_urlopen.assert_called_with('.ringmd5')
expected_result = [(STATUS_CRIT,
'Ringfile /etc/swift/{}.ring.gz '
'MD5 sum mismatch'.format(name))
for name in ('object', 'account', 'container')]
self.assertEqual(result, expected_result)
@patch('check_swift_storage.generate_md5')
def test_check_md5_ok(self, mock_generate_md5):
"""
Ensure md5 checksums match, STATUS_OK
"""
jdata = '{"/etc/swift/object.ring.gz": ' \
'"6b4f3a0ef3731f18291ecd053ce0d9b6", ' \
'"/etc/swift/account.ring.gz": ' \
'"6b4f3a0ef3731f18291ecd053ce0d9b6", ' \
'"/etc/swift/container.ring.gz": ' \
'"6b4f3a0ef3731f18291ecd053ce0d9b6"}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_generate_md5.return_value = '6b4f3a0ef3731f18291ecd053ce0d9b6'
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_md5('.')
mock_urlopen.assert_called_with('.ringmd5')
self.assertEqual(result,
[(STATUS_OK, 'OK')])
@patch('urllib2.urlopen')
def test_check_replication_unknown_urlerror(self, mock_urlopen):
"""
Force urllib2.URLError to test try-catch
"""
base_url = 'http://localhost:6000/recon/'
url = '{}replication/{}'
error = 'connection refused'
mock_urlopen.side_effect = urllib2.URLError(Mock(return_value=error))
result = check_replication(base_url, 60)
expected_result = [(STATUS_UNKNOWN,
"Can't open url: "
"{}".format(url.format(base_url, name)))
for name in ('account', 'object', 'container')]
self.assertEqual(result, expected_result)
@patch('urllib2.urlopen')
def test_check_replication_unknown_valueerror1(self, mock_urlopen):
"""
Force ValueError on urllib2 to test try-catch
"""
base_url = '.'
mock_urlopen.side_effect = ValueError(Mock(return_value=''))
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
3*[(STATUS_UNKNOWN,
"Can't parse status data")])
@patch('urllib2.urlopen')
def test_check_replication_unknown_valueerror2(self, mock_urlopen):
"""
Force ValueError on json to test try-catch
"""
base_url = 'http://localhost:6000/recon/'
jdata = PropertyMock(return_value='X')
mock_urlopen.return_value = MagicMock(read=jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
3*[(STATUS_UNKNOWN,
"Can't parse status data")])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_crit_lag_notworking(self, mock_timestamp):
"""
Catch NULL replication value, STATUS_CRIT
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (None, 0)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
[(STATUS_CRIT,
"'{}' replication lag not working "
"(perms issue? check syslog)".format(repl))
for repl in ('account', 'object', 'container')])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_crit_lag(self, mock_timestamp):
"""
Replication lag over CRIT threshold, STATUS_CRIT
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (MagicMock(seconds=12), 0)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
[(STATUS_CRIT,
"'{}' replication lag is "
"12 seconds".format(repl))
for repl in ('account', 'object', 'container')])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_crit_failures(self, mock_timestamp):
"""
Replication failures over CRIT threshold, STATUS_CRIT
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (MagicMock(seconds=0), 12)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
3*[(STATUS_CRIT, "12 replication failures")])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_crit_null_failures(self, mock_timestamp):
"""
Catch NULL value on failures stats, STATUS_CRIT
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (MagicMock(seconds=0), -1)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
3*[(STATUS_CRIT,
"replication failures counter is NULL "
"(check syslog)")])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_warn_lag(self, mock_timestamp):
"""
Replication lag over WARN threshold (below CRIT), STATUS_WARN
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (MagicMock(seconds=5), 0)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
[(STATUS_WARN,
"'{}' replication lag is "
"5 seconds".format(repl))
for repl in ('account', 'object', 'container')])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_warn_failures(self, mock_timestamp):
"""
Replication failures over WARN threshold (below CRIT), STATUS_WARN
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (MagicMock(seconds=0), 5)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result,
3*[(STATUS_WARN, "5 replication failures")])
@patch('check_swift_storage.repl_last_timestamp')
def test_check_replication_ok(self, mock_timestamp):
"""
Replication lag and number of failures are below WARN threshold,
STATUS_OK
"""
base_url = 'http://localhost:6000/recon/'
jdata = '{"replication_last": 1493299546.629282, ' \
'"replication_stats": {"no_change": 0, "rsync": 0, ' \
'"success": 0, "failure": 0, "attempted": 0, "ts_repl": 0, ' \
'"remove": 0, "remote_merge": 0, "diff_capped": 0, ' \
'"start": 1493299546.621624, "hashmatch": 0, "diff": 0, ' \
'"empty": 0}, "replication_time": 0.0076580047607421875}'
pmock_jdata = PropertyMock(return_value=jdata)
mock_timestamp.return_value = (MagicMock(seconds=0), 0)
with patch('urllib2.urlopen') as mock_urlopen:
mock_urlopen.return_value = MagicMock(read=pmock_jdata)
result = check_replication(base_url, [4, 10, 4, 10])
self.assertEqual(result, [(STATUS_OK, 'OK')])
def test_repl_last_timestamp(self):
"""
Calculates delta between NOW and last replication date
Also gathers the number of failures
"""
# 1493299546.629282
jdata = {u'replication_last': 1493299546.629282, u'replication_stats':
{u'no_change': 0, u'rsync': 0, u'success': 0, u'start':
1493299546.621624, u'attempted': 0, u'ts_repl': 0, u'remove':
0, u'remote_merge': 0, u'diff_capped': 0, u'failure': 0,
u'hashmatch': 0, u'diff': 0, u'empty': 0},
u'replication_time': 0.0076580047607421875}
result = repl_last_timestamp(jdata)
self.assertEqual(result,
(datetime.timedelta(0), 0))
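
The case above covers a healthy payload; a complementary test for the all-null scenario that motivated this change could look like the following editorial sketch (not part of the commit), added to CheckSwiftStorageTestCase:

```python
    def test_repl_last_timestamp_null(self):
        """
        Sketch only: with no usable timestamp and no replication_stats
        section, repl_last_timestamp() returns empty values
        """
        result = repl_last_timestamp({u'replication_last': None})
        self.assertEqual(result, (None, None))
```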