Add some Nagios check scripts for some services involved with CI systems

These can be set up for NRPE or SSH checks on Jenkins master nodes and
cover a few basic checks like:

- Ensuring a nodepool image age is not greater than some threshold
- Checking a specific job is registered in gearman
- Checking the health score for a specific jenkins job

All of these have tons of room for improvement and are just bare-bones
check scripts (closer to idiot checks), but they can help detect some
common problems with third-party CI systems.

Change-Id: Ic6fbcfbdbf577646ebebf133b492f35535f37b00
This commit is contained in:
Patrick East 2015-05-11 13:46:52 -07:00
parent 98ff04fb0a
commit 1bb3df6c6e
4 changed files with 126 additions and 0 deletions

View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
import argparse
import utils
def check_gearman_status(job_name):
    """Check whether a job is registered with the local gearman server.

    Sends the ``status`` admin command to 127.0.0.1:4730 through
    ``netcat`` and looks for ``job_name`` in the reply.

    :param job_name: name of the job to look for; it is matched as a plain
        substring of the status output, not as an exact function name.
    :returns: tuple of (exit code, message string). The exit code is either
        2 -> critical or 0 -> OK; this check never reports a warning.
    """
    try:
        gearadmin_status = utils.run_command_local(
            '(echo status ; sleep 0.1) | netcat 127.0.0.1 4730 -w 1')
        if job_name not in gearadmin_status:
            return 2, ('Failed to find job registered with gearman!\n'
                       'status:\n%s' % gearadmin_status)
    except Exception as e:
        # str(e) rather than e.message: the latter is deprecated, can be
        # empty or a non-string, and would make the concatenation itself fail.
        return 2, 'Failed to check gearman status: ' + str(e)
    return 0, job_name + ' is registered with gearman'
if __name__ == '__main__':
    # Imported here so this entry point does not depend on the interactive
    # exit() helper that the site module injects.
    import sys

    parser = argparse.ArgumentParser(description='Check gearman job status.')
    parser.add_argument('--job', required=True, type=str,
                        help='the job name to check for')
    args = parser.parse_args()
    code, message = check_gearman_status(args.job)
    # The message goes to stdout and the check result becomes the process
    # exit status (2 -> critical, 0 -> OK).
    print(message)
    sys.exit(code)

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python
import argparse
import urllib
import utils
def check_jenkins_status(job_name, warning_threshold, critial_threshold):
    """Check the health score of a job on the local Jenkins instance.

    Fetches ``http://localhost:8080/job/<job_name>/api/python`` and reads
    the score of the first entry in the job's ``healthReport`` list.

    :param job_name: name of the Jenkins job to query.
    :param warning_threshold: a score at or below this value is a warning.
    :param critial_threshold: a score at or below this value is critical.
    :returns: tuple of (exit code, message string). Exit codes are either
        2 -> critical, 1 -> warning, or 0 -> OK. The code is determined
        from the job health score and the thresholds passed in; any error
        while fetching or reading the report is reported as critical.
    """
    try:
        target_url = 'http://localhost:8080/job/%s/api/python' % job_name
        # NOTE(review): eval() executes whatever the server sends back. This
        # is only acceptable while the endpoint is a trusted local Jenkins;
        # consider ast.literal_eval() for the python API output instead.
        jenkins_volume_job = eval(urllib.urlopen(target_url).read())
        if not jenkins_volume_job:
            # Previously this fell through and returned None, which crashed
            # the caller's tuple unpacking instead of reporting a state.
            return 2, ('Error checking jenkins job status: empty response '
                       'from ' + target_url)
        health_score = jenkins_volume_job['healthReport'][0]['score']
        if health_score <= critial_threshold:
            exit_code = 2
        elif health_score <= warning_threshold:
            exit_code = 1
        else:
            exit_code = 0
        return exit_code, 'Jenkins job health score is ' + str(health_score)
    except Exception as e:
        # str(e) rather than e.message: the latter may be a non-string
        # (e.g. a KeyError key) and would make this concatenation raise.
        return 2, 'Error checking jenkins job status: ' + str(e)
if __name__ == '__main__':
    # Imported here so this entry point does not depend on the interactive
    # exit() helper that the site module injects.
    import sys

    parser = argparse.ArgumentParser(description='Check jenkins job status.')
    parser.add_argument('--job', required=True, type=str,
                        help='the job name to check for')
    parser.add_argument('-w', required=True, type=int,
                        help='warning threshold of health score')
    parser.add_argument('-c', required=True, type=int,
                        help='critical threshold of health score')
    args = parser.parse_args()
    code, message = check_jenkins_status(args.job, args.w, args.c)
    # The message goes to stdout and the check result becomes the process
    # exit status (2 -> critical, 1 -> warning, 0 -> OK).
    print(message)
    sys.exit(code)

View File

@ -0,0 +1,49 @@
#!/usr/bin/env python
import argparse
import re
import utils
def check_nodepool_image_status(warning_threshold, critial_threshold):
    """Check the age of the newest ready nodepool image.

    Runs ``nodepool image-list`` and scans every table row whose last two
    cells are a word and a decimal number, treating them as the image
    state and its age. Only rows whose state is ``ready`` are considered,
    and the smallest age among them is compared with the thresholds.

    :param warning_threshold: an age (hours) above this value is a warning.
    :param critial_threshold: an age (hours) above this value is critical.
    :returns: tuple of (exit code, message string). Exit codes are either
        2 -> critical, 1 -> warning, or 0 -> OK. Critical is also returned
        when no ready image row is found or the command cannot be run.
    """
    try:
        image_list_raw = utils.run_command_local(
            'sudo /usr/local/bin/nodepool image-list')
        row_pattern = re.compile(r'\|\s+(\w+)\s+\|\s+(\d+\.\d+)\s+\|$')
        newest_image_age = None
        for line in image_list_raw.split('\n'):
            match = row_pattern.search(line)
            if not match:
                continue
            status = match.group(1)
            age = float(match.group(2))
            if status == 'ready' and (newest_image_age is None
                                      or age < newest_image_age):
                newest_image_age = age

        # Compare against None explicitly: a freshly built image has an age
        # of 0.0, which is a valid (and healthy) value, not a missing one.
        if newest_image_age is None:
            return 2, 'Error running command, output: ' + image_list_raw

        # The critical check must use the critical threshold; comparing
        # against the warning threshold here made the warning state
        # unreachable and ignored the -c argument entirely.
        if newest_image_age > critial_threshold:
            exit_code = 2
        elif newest_image_age > warning_threshold:
            exit_code = 1
        else:
            exit_code = 0
        return exit_code, 'Nodepool image age (hours): ' + str(newest_image_age)
    except Exception as e:
        return 2, 'Error checking nodepool images: %s' % e
if __name__ == '__main__':
    # Imported here so this entry point does not depend on the interactive
    # exit() helper that the site module injects.
    import sys

    parser = argparse.ArgumentParser(
        description='Check nodepool image status.')
    parser.add_argument('-w', required=True, type=int,
                        help='warning threshold for age of the image in hours')
    parser.add_argument('-c', required=True, type=int,
                        help='critical threshold for age of the image in hours')
    args = parser.parse_args()
    code, message = check_nodepool_image_status(args.w, args.c)
    # The message goes to stdout and the check result becomes the process
    # exit status.
    print(message)
    sys.exit(code)

View File

@ -0,0 +1,8 @@
import subprocess
def run_command_local(command):
    """Run a shell command locally and return its combined output as text.

    The command is executed through the shell (pipes and subshells in
    ``command`` work), with stderr merged into stdout. Because of
    ``shell=True`` the string must never be built from untrusted input.

    This is deliberately best-effort and never raises:

    * on a non-zero exit status, whatever the command printed is returned,
      so callers can still inspect or report it;
    * on any other failure, the text of the exception is returned.

    :param command: shell command line to execute.
    :returns: the command's stdout and stderr as a string.
    """
    try:
        # universal_newlines=True makes the result text rather than bytes,
        # so callers can do substring checks and string concatenation.
        return subprocess.check_output(command, shell=True,
                                       stderr=subprocess.STDOUT,
                                       universal_newlines=True)
    except subprocess.CalledProcessError as e:
        # e.message is always empty for CalledProcessError; the captured
        # output is what actually explains the failure.
        return e.output or ''
    except Exception as e:
        return str(e)