#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 eNovance SAS
#
# Author: Frederic Lepied
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

'''
Nagios checks using ceph df and ceph health.
'''

import json
import subprocess
import sys
import traceback


# Maps the first token printed by "ceph health" to the nagios exit code and
# the prefix of the message reported for it.
_HEALTH_STATES = {
    'HEALTH_OK': (0, 'CEPH OK: '),
    'HEALTH_WARN': (1, 'CEPH WARNING: '),
    'HEALTH_ERR': (2, 'CEPH CRITICAL: '),
}


def per(percent, value):
    '''Return percent % of value.

    The multiplication is done first and the division uses a float so the
    result is not truncated to 0 by integer division under Python 2.
    '''
    return percent * value / 100.0


def remaining(avail, total):
    '''Format the available and total space for the check message.'''
    # NOTE(review): the label says MB but the values are only divided by
    # 1024, so this is MB only if the inputs are expressed in KB -- confirm
    # the units of the *_bytes fields.
    return "(%dMB/%dMB)" % (avail // 1024, total // 1024)


def _read_stat(stats, key, fallback_key):
    '''Return stats[key] as an int, or stats[fallback_key] if key is absent.'''
    if key in stats:
        return int(stats[key])
    return int(stats[fallback_key])


def interpret_output_df(output):
    '''Parse the JSON output of ceph df.

    The warning and critical thresholds are read from sys.argv[1] and
    sys.argv[2] (defaults: 85 and 98). They are percentages of used space:
    the check fires when less than (100 - threshold) % of the total space
    is still available.

    Return an exit code and message compatible with nagios.
    '''
    try:
        data = json.loads(output)
    except Exception:
        return (1, 'CEPH WARNING: unable to parse ceph df %s' %
                traceback.format_exc())

    warn_percent = int(sys.argv[1]) if len(sys.argv) >= 2 else 85
    crit_percent = int(sys.argv[2]) if len(sys.argv) >= 3 else 98

    # Two key namings are supported: the *_bytes one and the older one.
    try:
        stats = data['stats']
        total = _read_stat(stats, 'total_bytes', 'total_space')
        used = _read_stat(stats, 'total_used_bytes', 'total_used')
        avail = _read_stat(stats, 'total_avail_bytes', 'total_avail')
    except (KeyError, TypeError, ValueError):
        # Valid JSON without the expected fields is reported the same way
        # as unparsable output instead of crashing with a traceback.
        return (1, 'CEPH WARNING: unable to parse ceph df %s' %
                traceback.format_exc())

    # Test correctness of values
    if used + avail != total:
        return (1, '[WARN] Used + Avail. != Total space')

    # Critical is tested first because it is the stricter condition: with
    # crit_percent > warn_percent, anything critical is also a warning.
    if avail < per(100 - crit_percent, total):
        return (2, "[ERR] Ceph df avail. critical %s" %
                remaining(avail, total))
    if avail < per(100 - warn_percent, total):
        return (1, "[WARN] Ceph df avail. warning %s" %
                remaining(avail, total))
    return (0, "[OK] Ceph df avail. seems good %s" %
            remaining(avail, total))


def check_ceph_df():
    '''Program entry point of the ceph df check.

    Run "ceph df --format=json" (as the client named by sys.argv[3] when
    given), print the nagios message and exit with the nagios exit code.
    Exit with code 3 if the command fails or cannot be launched.
    '''
    ceph_args = ["ceph", "df", "--format=json"]
    if len(sys.argv) >= 4:
        ceph_args.append('-n')
        ceph_args.append(sys.argv[3])

    try:
        # universal_newlines=True makes the output text instead of bytes
        # under Python 3.
        res = subprocess.check_output(ceph_args,
                                      stderr=subprocess.STDOUT,
                                      universal_newlines=True)
    except subprocess.CalledProcessError as e:
        sys.stdout.write('CEPH UNKNOWN: %s\n' % e.output)
        sys.exit(3)
    except OSError:
        sys.stdout.write('CEPH UNKNOWN: unable to launch ceph df\n')
        sys.exit(3)

    exit_code, message = interpret_output_df(res)
    sys.stdout.write("%s\n" % message)
    sys.exit(exit_code)


def interpret_output_health(output):
    '''Parse the output of ceph health.

    The first space-separated token selects the status; the remaining
    tokens are reported as the detail of the message.

    Return an exit code and message compatible with nagios.
    '''
    tokens = output.split(' ')
    if len(tokens) == 1:
        # Status alone on the line: strip its line ending and use a bare
        # newline as the detail so the message still ends the line.
        tokens = [tokens[0].strip(), '\n']

    status = tokens[0]
    if status in _HEALTH_STATES:
        exit_code, prefix = _HEALTH_STATES[status]
        return (exit_code, prefix + ' '.join(tokens[1:]))
    return (3, 'CEPH UNKNOWN: ' + ' '.join(tokens))


def check_ceph_health():
    '''Program entry point of the ceph health check.

    Run "ceph health" (as the client named by sys.argv[1] when given),
    print the nagios message and exit with the nagios exit code. Exit with
    code 3 if the command fails or cannot be launched.
    '''
    ceph_args = ["ceph", "health"]
    if len(sys.argv) >= 2:
        ceph_args.append('-n')
        ceph_args.append(sys.argv[1])

    try:
        # universal_newlines=True makes the output text instead of bytes
        # under Python 3, which interpret_output_health() requires.
        res = subprocess.check_output(ceph_args,
                                      stderr=subprocess.STDOUT,
                                      universal_newlines=True)
    except subprocess.CalledProcessError as e:
        sys.stdout.write('CEPH UNKNOWN: %s\n' % e.output)
        sys.exit(3)
    except OSError:
        sys.stdout.write('CEPH UNKNOWN: unable to launch ceph health\n')
        sys.exit(3)

    exit_code, message = interpret_output_health(res)
    # The message already carries the line ending of the ceph output.
    sys.stdout.write(message)
    sys.exit(exit_code)