# monasca-vagrant/tests/smoke.py

#!/opt/monasca/bin/python
#
"""smoke
    Runs a smoke test of the monitoring installation on mini-mon to ensure
    the components (other than the UI) are functioning. The code tests these
    components:
       1. Agent - ensures metrics are being sent to API
       2. API - ensures alarm definitions can be created, listed, etc. Ensures
                metrics and alarms can be queried
       3. CLI - used to talk to the API
       4. Persister - ensures metrics and alarm history has been persisted
                      in database because API can query them
       5. Threshold Engine - ensures alarms are created and change state
       6. Notification Engine - ensures email notifications are sent to the
                                local system
    This must be run on either the mini-mon VM for the single VM mode or
    on the kafka VM in the multi VM mode.

    If the tests are to be run in an environment other than mini-mon,
    the environment variables below can be set and the smoke test will use
    those instead of the mini-mon credentials and settings:

        OS_USERNAME
        OS_PASSWORD
        OS_PROJECT_NAME
        OS_AUTH_URL

    TODO:
        1. Add more logic to give ideas of why a particular step failed, for
           example, alarm did not get created because metrics weren't being
           received
"""

from __future__ import print_function
import argparse
import sys
import os
import time
import cli_wrapper
import utils
import datetime
import psutil
import smoke_configs

# Active test configuration. Starts out as the "default" entry of
# smoke_configs.test_config and is rebound by set_config().
config = smoke_configs.test_config["default"]


# parse command line arguments
def parse_commandline_args():
    """Parse sys.argv and return the resulting argparse namespace.

    The only option understood is -c/--config, which names the
    configuration to run with. When the option is left out entirely the
    namespace's config attribute is 'default'.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-c', '--config',
        nargs='?',
        default='default',
        help='select configuration <CONFIG>')
    parsed = arg_parser.parse_args()
    return parsed


def set_config(config_name):
    """Make the named entry of smoke_configs.test_config the active config.

    On success the module-level config is rebound, the choice is announced
    on stdout and True is returned. An unknown name is reported on stderr
    and False is returned, leaving config as it was.
    """
    global config
    try:
        selected = smoke_configs.test_config[config_name]
    except KeyError:
        print('Could not find config "{}"'.format(config_name), file=sys.stderr)
        return False

    config = selected
    print('Using {} Configuration'.format(config_name))
    return True


def get_metrics(name, dimensions, since):
    """Run the CLI 'measurement-list' command for one metric.

    name       -- metric name handed to the CLI
    dimensions -- dict of dimension name to value; both must be strings
                  because they are joined into a 'key=value,key=value'
                  argument for --dimensions
    since      -- start time argument handed to the CLI unchanged

    Returns whatever cli_wrapper.run_mon_cli returns for the command.
    """
    # items() is available on both Python 2 and Python 3, whereas
    # iteritems() only exists on Python 2.
    dimensions_arg = ','.join(
        key + '=' + value for key, value in dimensions.items())
    return cli_wrapper.run_mon_cli(['measurement-list', '--dimensions',
                                    dimensions_arg, name, since])


def cleanup(notification_name, alarm_definition_name):
    """Remove the test's alarm definition and notification through cli_wrapper.

    Both helpers are the "_if_exists" variants. The alarm definition is
    removed first, then the notification.
    """
    cli_wrapper.delete_alarm_definition_if_exists(alarm_definition_name)
    cli_wrapper.delete_notification_if_exists(notification_name)


def wait_for_alarm_state_change(alarm_id, old_state, timeout=250):
    """Poll an alarm once a second until its state differs from old_state.

    alarm_id  -- id of the alarm to poll via cli_wrapper.get_alarm_state
    old_state -- state the alarm is expected to leave
    timeout   -- maximum number of one-second polls (default 250)

    Returns the new state as soon as it differs from old_state, or None
    (after reporting on stderr) when it never changed within the timeout.
    """
    print('Waiting for alarm to change state from {}'.format(old_state))
    # Count from 1: each iteration sleeps a second *before* polling, so the
    # number of seconds waited equals the iteration number, not the 0-based
    # index.
    for elapsed in range(1, timeout + 1):
        time.sleep(1)
        state = cli_wrapper.get_alarm_state(alarm_id)
        if state != old_state:
            print('Alarm state changed to {} in {} seconds'.format(state,
                                                                   elapsed))
            return state
    print('State never changed from {} in {} seconds'.format(old_state,
                                                             timeout),
          file=sys.stderr)
    return None


def check_notifications(alarm_id, state_changes):
    """Compare the notifications found for an alarm with the expected ones.

    alarm_id      -- alarm whose notifications are looked up through
                     utils.find_notifications for the "root" user
    state_changes -- expected states, in order

    Returns True only when the same number of notifications was found and
    each one equals the expected state at the same position. Returns False
    (with a message on stderr) on any mismatch, or when
    /etc/monasca/notification.yaml is not present on this machine.
    """
    print("Checking Notification Engine")
    engine_config = '/etc/monasca/notification.yaml'
    if not os.path.isfile(engine_config):
        print('Notification Engine not installed on this VM,' +
              ' skipping Notifications test',
              file=sys.stderr)
        return False

    found = utils.find_notifications(alarm_id, "root")
    found_count = len(found)
    expected_count = len(state_changes)
    if found_count != expected_count:
        print('Expected {} notifications but only found {}'.format(
              expected_count, found_count), file=sys.stderr)
        return False

    # Lengths are equal at this point, so pairing the two sequences up
    # visits every entry. Positions are reported 1-based.
    for position, (wanted, got) in enumerate(zip(state_changes, found), 1):
        if got != wanted:
            print('Expected {} but found {} for state change {}'.format(
                  wanted, got, position), file=sys.stderr)
            return False

    print('Received email notifications as expected')
    return True


def count_metrics(metric_name, metric_dimensions, since):
    """Return how many measurements get_metrics reports for a metric.

    Only the first entry of the result is counted. When the result is
    empty a message is written to stderr and None is returned.
    """
    results = get_metrics(metric_name, metric_dimensions, since)
    if len(results) > 0:
        first_entry = results[0]
        return len(first_entry['measurements'])

    print('No measurements received for metric {}{} '.format(
          metric_name, metric_dimensions), file=sys.stderr)
    return None


def ensure_at_least(actual, desired):
    """Sleep for the shortfall when actual is below desired.

    Returns immediately, without sleeping, when actual already meets or
    exceeds desired.
    """
    shortfall = desired - actual
    if shortfall > 0:
        time.sleep(shortfall)


def wait_for_alarm_creation(alarm_def_id, timeout=30):
    """Poll once a second until exactly one alarm exists for a definition.

    alarm_def_id -- alarm definition id passed to
                    cli_wrapper.find_alarms_for_definition
    timeout      -- maximum number of one-second polls (default 30)

    Returns the single entry found. Returns None, after reporting on
    stderr, when more than one alarm shows up or when none appears within
    the timeout.
    """
    print('Waiting for alarm to be created for Alarm Definition {}'.format(alarm_def_id))
    # Count from 1: the sleep happens before each poll, so the iteration
    # number is the number of seconds waited so far.
    for elapsed in range(1, timeout + 1):
        time.sleep(1)
        alarms = cli_wrapper.find_alarms_for_definition(alarm_def_id)
        if len(alarms) == 1:
            print('Alarm was created in {} seconds'.format(elapsed))
            return alarms[0]
        elif len(alarms) > 1:
            print('{} Alarms were created. Only expected 1'.format(len(alarms)),
                  file=sys.stderr)
            return None

    print('Alarm was not created for Alarm Definition {} in {} seconds'.format(
          alarm_def_id, timeout), file=sys.stderr)
    return None


def smoke_test():
    """Run the smoke test against the active configuration.

    The steps, in order:
      1. Remove any notification / alarm definition left from an earlier run.
      2. Record how many measurements exist for the configured metric and
         statsd metric over the last hour.
      3. Create a notification and an alarm definition through the CLI.
      4. Wait for the alarm to be created, check it starts UNDETERMINED and
         then moves to ALARM.
      5. Set the alarm to OK through the CLI (and, when command line
         arguments are present, wait for it to return to ALARM).
      6. Check that new measurements arrived, and that the alarm history
         and the notifications match the recorded state sequence.
      7. Wait for new statsd measurements.

    Returns a (passed, message) tuple: (True, '') on success, otherwise
    (False, <reason>).
    """
    notification_name = config['notification']['name']
    notification_addr = config['notification']['addr']
    notification_type = config['notification']['type']
    alarm_definition_name = config['alarm']['name']
    metric_name = config['metric']['name']
    metric_dimensions = config['metric']['dimensions']
    statsd_metric_name = config['statsd_metric']['name']
    statsd_metric_dimensions = config['statsd_metric']['dimensions']

    cleanup(notification_name, alarm_definition_name)

    # Query how many metrics there are for the Alarm
    hour_ago = datetime.datetime.now() - datetime.timedelta(hours=1)
    hour_ago_str = hour_ago.strftime('%Y-%m-%dT%H:%M:%S')
    print('Getting metrics for {}{} '.format(metric_name, metric_dimensions))
    initial_num_metrics = count_metrics(metric_name, metric_dimensions,
                                        hour_ago_str)

    if initial_num_metrics is None or initial_num_metrics == 0:
        msg = ('No metric {} with dimensions {} received in last hour'.format(
               metric_name, metric_dimensions))
        return False, msg

    print('Getting metrics for {}{} '.format(statsd_metric_name, statsd_metric_dimensions))
    initial_statsd_num_metrics = count_metrics(statsd_metric_name, statsd_metric_dimensions, hour_ago_str)

    # statsd metrics may not have been sent yet, which will return None from the CLI wrapper
    if initial_statsd_num_metrics is None:
        initial_statsd_num_metrics = 0

    start_time = time.time()

    # Create Notification through CLI
    notif_id = cli_wrapper.create_notification(notification_name,
                                               notification_addr,
                                               notification_type)

    # Create Alarm through CLI
    expression = config['alarm']['expression']
    description = config['alarm']['description']
    alarm_def_id = cli_wrapper.create_alarm_definition(
        alarm_definition_name,
        expression,
        description=description,
        ok_notif_id=notif_id,
        alarm_notif_id=notif_id,
        undetermined_notif_id=notif_id)

    # Wait for an alarm to be created
    alarm_id = wait_for_alarm_creation(alarm_def_id)

    if alarm_id is None:
        received_num_metrics = count_metrics(metric_name, metric_dimensions,
                                             hour_ago_str)
        # count_metrics returns None when the query comes back empty; report
        # that as "nothing received" instead of failing on None arithmetic.
        if (received_num_metrics is None or
                received_num_metrics == initial_num_metrics):
            print('Did not receive any {}{} metrics while waiting'.format(metric_name,metric_dimensions))
        else:
            delta = received_num_metrics - initial_num_metrics
            print('Received {} {} metrics while waiting'.format(delta, metric_name))
        return False, 'Alarm creation error'

    # Ensure it is created in the right state
    initial_state = 'UNDETERMINED'
    if not utils.check_alarm_state(alarm_id, initial_state):
        msg = 'Alarm is in an invalid initial state'
        return False, msg
    # states accumulates the expected state sequence; it is checked against
    # the alarm history and (minus the first entry) the notifications below.
    states = []
    states.append(initial_state)
    state = wait_for_alarm_state_change(alarm_id, initial_state)
    if state is None:
        msg = 'Alarm is in an invalid state'
        return False, msg

    if state != 'ALARM':
        print('Wrong final state, expected ALARM but was {}'.format(state),
              file=sys.stderr)
        msg = 'Alarm is in an invalid final state'
        return False, msg
    states.append(state)

    new_state = 'OK'
    states.append(new_state)
    if not cli_wrapper.change_alarm_state(alarm_id, new_state):
        msg = 'Unable to change Alarm state'
        return False, msg

    # There is a bug in the API which allows this to work. Soon that
    # will be fixed and this will fail
    # NOTE(review): any command line argument, including -c/--config,
    # enables this branch - confirm that is still intended.
    if len(sys.argv) > 1:
        final_state = 'ALARM'
        states.append(final_state)

        state = wait_for_alarm_state_change(alarm_id, new_state)
        if state is None:
            msg = 'Alarm is in an unknown state'
            return False, msg

        if state != final_state:
            msg = ('Wrong final state, expected {} but was {}'.format(final_state, state))
            return False, msg

    # If the alarm changes state too fast, then there isn't time for the new
    # metric to arrive. Unlikely, but it has been seen
    ensure_at_least(time.time() - start_time, 35)
    change_time = time.time() - start_time

    final_num_metrics = count_metrics(metric_name, metric_dimensions,
                                      hour_ago_str)
    # A None count means the query returned nothing; test for it explicitly
    # rather than relying on how None orders against an int.
    if final_num_metrics is None or final_num_metrics <= initial_num_metrics:
        msg = ('No new metrics received for {}{} in {} seconds'.format(metric_name, metric_dimensions, change_time))
        return False, msg
    print('Received {} metrics in {} seconds'.format((final_num_metrics - initial_num_metrics),  change_time))
    if not utils.check_alarm_history(alarm_id, states):
        msg = 'Invalid alarm history'
        return False, msg

    # Notifications are only sent out for the changes, so omit the first state
    if not check_notifications(alarm_id, states[1:]):
        msg = 'Could not find correct notifications for alarm {}'.format(alarm_id)
        return False, msg

    # Check that monasca statsd is sending metrics
    # Metrics may take some time to arrive
    print('Waiting for statsd metrics')
    for x in range(0,30):
        final_statsd_num_metrics = count_metrics(statsd_metric_name, statsd_metric_dimensions, hour_ago_str)
        # None (no statsd measurements yet) counts as "nothing new".
        if (final_statsd_num_metrics is not None and
                final_statsd_num_metrics > initial_statsd_num_metrics):
            break
        if x >= 29:
            msg = 'No metrics received for statsd metric {}{} in {} seconds'.format(
                  statsd_metric_name, statsd_metric_dimensions, time.time() - start_time)
            return False, msg
        time.sleep(1)
    print('Received {0} metrics for {1}{2} in {3} seconds'.format(final_statsd_num_metrics - initial_statsd_num_metrics,
                                                                  statsd_metric_name,
                                                                  statsd_metric_dimensions,
                                                                  time.time() - start_time))

    msg = ''
    return True, msg


def find_processes():
    """Validate that all required processes are running before the test.

    The required names come from config['system_vars']['expected_processes'].
    A name counts as running when it is a substring of any command line
    argument of any process reported by psutil.

    Returns True when every name was found; otherwise prints the missing
    names and returns False.
    """
    expected_processes = config['system_vars']['expected_processes']

    # Walk the process table once and keep every command line argument,
    # instead of re-scanning all processes for each expected name.
    running_args = []
    for proc in psutil.process_iter():
        try:
            running_args.extend(proc.cmdline())
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited mid-scan or cannot be inspected; it cannot
            # contribute a match, so skip it rather than abort the check.
            continue

    process_missing = [
        process for process in expected_processes
        if not any(process in arg for arg in running_args)]

    if process_missing:   # some processes were not found
        print('Process = {} Not Found'.format(process_missing))
        return False

    print('All Mini-Mon Processes Found')
    return True


def main():
    """Entry point: prepare the environment, run the smoke test, clean up.

    Returns 0 when the test completes and 1 on any failure: notification
    engine check failed, unknown configuration, a required process is
    missing, or the smoke test itself failed. Cleanup of the notification
    and alarm definition only happens on the success path.
    """
    # May be able to delete this test because the find_process check should
    # validate the notification engine present.
    if not utils.ensure_has_notification_engine():
        return 1

    utils.setup_cli()

    # Select the configuration named on the command line.
    cmd_args = parse_commandline_args()
    if not set_config(cmd_args.config):
        return 1

    print('*****VERIFYING HOST ENVIRONMENT*****')
    if not find_processes():
        return 1

    print('*****BEGIN TEST*****')
    passed, failure_msg = smoke_test()
    if not passed:
        print('*****TEST FAILED*****', file=sys.stderr)
        print(failure_msg, file=sys.stderr)
        return 1

    cleanup(config['notification']['name'], config['alarm']['name'])
    print('*****TEST COMPLETE*****')
    return 0


# Run the smoke test only when executed as a script; main()'s return value
# (0 or 1) becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())