Add template for autohealing servers

This is a template to create a Server along with Aodh alarms that
trigger a Mistral workflow (via a Zaqar queue) to get Heat to recreate
the server in the event that it is stopped, deleted or goes into an
error state.

Change-Id: Iab1b206cc06715cfd7dab0dfe7cb0c3de4c698dc
This commit is contained in:
Zane Bitter 2017-02-03 10:22:23 -05:00
parent 15cf0b6ebb
commit c975f6b179
2 changed files with 230 additions and 0 deletions

View File

@ -0,0 +1,60 @@
# Parent template: a scaling group whose members are self-healing servers.
heat_template_version: 2017-02-24
description: >
  A stack containing an Autoscaling Group whose members automatically heal
  themselves if they are stopped, deleted, or go into an error state, using an
  Aodh alarm delivered to a Zaqar queue that triggers a Mistral workflow to
  replace the stopped server. Note that this requires event alarms to be
  enabled in Aodh, following the instructions at
  http://docs.openstack.org/developer/aodh/event-alarm.html - specifically by
  adding the publisher "notifier://?topic=alarm.all" in
  /etc/ceilometer/event_pipeline.yaml.
# Inputs; each one is forwarded unchanged to every member of the scaling group.
parameters:
  flavor:
    type: string
    description: Flavor for the instances to be created
    default: cirros256
    constraints:
      - custom_constraint: nova.flavor
        description: Must be a flavor known to Nova
  image:
    type: string
    description: >
      Name or ID of the image to use for the instances.
    default: cirros-0.3.4-x86_64-uec
    constraints:
      - custom_constraint: glance.image
        description: Must identify an image known to Glance
  network:
    type: string
    description: The network for the VM
    default: private
  port:
    type: number
    description: The port to reply to requests on
    default: 8080
resources:
  # Scaling group in which each member is one instance of the nested
  # autohealing_server.yaml template.
  servers:
    type: OS::Heat::AutoScalingGroup
    properties:
      resource:
        type: autohealing_server.yaml
        properties:
          flavor: {get_param: flavor}
          image: {get_param: image}
          network: {get_param: network}
          port: {get_param: port}
          # Pass this stack's ID down so the nested template knows which stack
          # to update after it marks its server as failed.
          root_stack_id: {get_param: "OS::stack_id"}
      min_size: 1
      desired_capacity: 2
      max_size: 4
# Values aggregated across all current members of the scaling group.
outputs:
  server_ids:
    description: A list of the current server UUIDs
    value: {get_attr: [servers, refs]}
  ip_addresses:
    description: A list of server IP addresses
    value: {get_attr: [servers, outputs_list, first_address]}

View File

@ -0,0 +1,170 @@
# Nested/standalone template: a single server plus the alarms and workflow
# that replace it when it fails.
heat_template_version: 2017-02-24
description: >
  A stack containing a server that is automatically replaced if it is stopped,
  deleted, or goes into an error state, using an Aodh alarm delivered to a
  Zaqar queue that triggers a Mistral workflow. This may either be used
  standalone, or as the scaled unit of a scaling group. When using this from
  inside another template, the 'root_stack_id' parameter should be passed to
  indicate at which stack the stack update should commence after marking the
  server as failed. This should be the root-level stack, to ensure that any
  other resources depending on outputs from this stack are also updated. Note
  that this requires event alarms to be enabled in Aodh, following the
  instructions at http://docs.openstack.org/developer/aodh/event-alarm.html -
  specifically, by adding the publisher "notifier://?topic=alarm.all" in
  /etc/ceilometer/event_pipeline.yaml.
# Inputs describing the server to boot, plus the optional root stack ID.
parameters:
  flavor:
    type: string
    description: Flavor for the instances to be created
    default: cirros256
    constraints:
      - custom_constraint: nova.flavor
        description: Must be a flavor known to Nova
  image:
    type: string
    description: >
      Name or ID of the image to use for the instances.
    default: cirros-0.3.4-x86_64-uec
    constraints:
      - custom_constraint: glance.image
        description: Must identify an image known to Glance
  network:
    type: string
    description: The network for the VM
    default: private
  port:
    type: number
    description: The port to reply to requests on
    default: 8080
  # ID of the stack at which the healing stack update should start. The empty
  # default means "standalone" (see the is_standalone condition), in which
  # case this stack's own ID is used instead.
  root_stack_id:
    type: string
    default: ""
conditions:
  # True when no root_stack_id was supplied, i.e. the parameter still holds
  # its empty-string default.
  is_standalone: {equals: [{get_param: root_stack_id}, ""]}
resources:
  # The server being protected. Its boot script answers every connection on
  # the configured port with an HTTP 200 whose body is the server's hostname.
  server:
    type: OS::Nova::Server
    properties:
      image: {get_param: image}
      flavor: {get_param: flavor}
      networks:
        - network: {get_param: network}
      user_data_format: RAW
      user_data:
        str_replace:
          template: |
            #! /bin/sh -v
            Body=$(hostname)
            Response="HTTP/1.1 200 OK\r\nContent-Length: ${#Body}\r\n\r\n$Body"
            while true ; do echo -e $Response | nc -llp %PORT%; done
          params:
            "%PORT%": {get_param: port}
  # Queue that all three alarms below deliver to and that the Mistral trigger
  # (alarm_subscription) is attached to.
  alarm_queue:
    type: OS::Zaqar::Queue
  # Fires on an instance update event reporting this server as "stopped".
  stop_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          value: {get_resource: server}
          op: eq
        - field: traits.state
          value: stopped
          op: eq
      alarm_queues:
        - {get_resource: alarm_queue}
  # Fires on an instance update event reporting this server as "error".
  error_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          value: {get_resource: server}
          op: eq
        - field: traits.state
          value: error
          op: eq
      alarm_queues:
        - {get_resource: alarm_queue}
  # Fires when deletion of this server starts.
  deleted_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.delete.start
      query:
        - field: traits.instance_id
          value: {get_resource: server}
          op: eq
      alarm_queues:
        - {get_resource: alarm_queue}
  # The Aodh event alarm does not take effect immediately; it may take up to
  # 60s (by default) for the event_alarm_cache_ttl to expire and the tenant's
  # alarm data to be loaded. This resource ensures the stack is not completed
  # until the alarm is active. See https://bugs.launchpad.net/aodh/+bug/1651273
  alarm_cache_wait:
    type: OS::Heat::TestResource
    properties:
      action_wait_secs:
        create: 60
        update: 60
      # Referencing all three alarms makes this resource depend on them.
      value:
        list_join:
          - ''
          - - {get_attr: [stop_event_alarm, show]}
            - {get_attr: [error_event_alarm, show]}
            - {get_attr: [deleted_event_alarm, show]}
  # Runs the autoheal workflow for messages arriving on the alarm queue.
  alarm_subscription:
    type: OS::Zaqar::MistralTrigger
    properties:
      queue_name: {get_resource: alarm_queue}
      workflow_id: {get_resource: autoheal}
      input:
        stack_id: {get_param: "OS::stack_id"}
        # Standalone: update this stack. Nested: update the stack whose ID
        # the parent passed in.
        root_stack_id:
          if:
            - is_standalone
            - {get_param: "OS::stack_id"}
            - {get_param: "root_stack_id"}
  autoheal:
    type: OS::Mistral::Workflow
    properties:
      description: >
        Mark a server as unhealthy and commence a stack update to replace it.
      # Both inputs are declared without values; the trigger above supplies
      # them.
      input:
        stack_id:
        root_stack_id:
      type: direct
      tasks:
        # Step 1: mark the resource unhealthy, identifying it by the
        # instance_id trait taken from the alarm notification.
        - name: resources_mark_unhealthy
          action:
            list_join:
              - ' '
              - - heat.resources_mark_unhealthy
                - stack_id=<% $.stack_id %>
                - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
                - mark_unhealthy=true
                - resource_status_reason='Marked by alarm'
          on_success:
            - stacks_update
        # Step 2: update the root stack, reusing its existing template and
        # parameters (existing=true).
        - name: stacks_update
          action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true
outputs:
  # Only emitted when this template is not standalone, i.e. when a
  # root_stack_id was supplied by a parent template.
  OS::stack_id:
    description: The server UUID
    value: {get_resource: server}
    condition: {not: is_standalone}
  first_address:
    description: The server IP address
    value: {get_attr: [server, first_address]}