tripleo-ansible/playbooks/update_cloud.yml

568 lines
24 KiB
YAML

# Copyright (c) 2014 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Run the plays from step_ping.yml before anything else in this playbook
# (presumably a reachability check of the inventory - confirm in that file).
- include: step_ping.yml
# Prepare the machine that runs Ansible before any remote node is touched.
- name: "Setup local environment for upgrade processes to run"
  hosts: localhost
  gather_facts: false
  # Any failed host aborts the play.
  max_fail_percentage: 0
  tasks:
    - include: update_local_ssh_config.yml
# Run the plays from step_pre_hook.yml after local setup and before any node
# is shut down (presumably a site-specific pre-upgrade hook - confirm).
- include: step_pre_hook.yml
# Stop and disable the undercloud services ahead of the rebuild.
- name: Disable Undercloud
  hosts: undercloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # The explanation has to be handed to fail as msg=; a bare string is not
    # an argument of the module, so the text would not be reported.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on undercloud instances."
      when: online_upgrade is defined
    # The service tasks below test `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # removes= turns the rename into a no-op once mysql.conf no longer exists.
    - command: mv -f /etc/init/mysql.conf /etc/init/mysql-boot-control.conf removes=/etc/init/mysql.conf
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_undercloud_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: undercloud_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    - include: disable_os_collect_config.yml
# Quiesce the compute nodes: stop guests (offline upgrades only) and the
# compute services.
- name: Disable Overcloud Compute
  hosts: nova-compute
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # The explanation has to be handed to fail as msg=; a bare string is not
    # an argument of the module, so the text would not be reported.
    - fail:
        msg: "FAILURE: Cannot perform an online upgrade on nodes that are not in ACTIVE state"
      when: instance_status != "ACTIVE" and online_upgrade is defined
    # Guests are only stopped for an offline upgrade.
    - include: stop_vms.yml
      when: instance_status == "ACTIVE" and online_upgrade is not defined
    # The service tasks below test `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_compute_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_compute_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    # nova-compute is stopped explicitly, independent of the lists above.
    - service:
        name: nova-compute
        state: stopped
        enabled: false
      when: instance_status == "ACTIVE"
    - include: step_stop_ns_metadata_proxy.yml
      when: instance_status == "ACTIVE"
# Stop and disable the services on the swift storage nodes.
- name: swift-storage
  hosts: swift-storage
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # The explanation has to be handed to fail as msg=; a bare string is not
    # an argument of the module, so the text would not be reported.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on swift instances."
      when: online_upgrade is defined
    # The service tasks below test `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_swift_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_swift_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
# Stop guests and services on the VSA nodes. Only a helion service list is
# handled here; nothing is stopped when `helion` is undefined.
- name: "Stop services on VSA"
  hosts: vsa
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
    # The explanation has to be handed to fail as msg=; a bare string is not
    # an argument of the module, so the text would not be reported.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on vsa instances."
      when: online_upgrade is defined
    - include: stop_vms.yml
      when: instance_status == "ACTIVE"
    # The service task below tests `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_vsa_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
# Stop and disable the API-level services on the controllers. MySQL and
# RabbitMQ are handled by a later play.
- name: Disable Overcloud Controller
  hosts: controller
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # The explanation has to be handed to fail as msg=; a bare string is not
    # an argument of the module, so the text would not be reported.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on controller instances."
      when: online_upgrade is defined
    # The service tasks below test `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_controller_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: overcloud_controller_services
      when: helion is not defined and instance_status == "ACTIVE" and item in existing_services
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
    - include: step_stop_ns_metadata_proxy.yml
      when: instance_status == "ACTIVE"
# Stop and disable the API-level services on the bootstrap controller. Only
# a helion service list is handled for this node.
- name: Disable Overcloud Controller Bootstrap node
  hosts: controller-bootstrap
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # The explanation has to be handed to fail as msg=; a bare string is not
    # an argument of the module, so the text would not be reported.
    - fail:
        msg: "Fail if online_upgrade is defined - online upgrades are not supported on controllerMgmt instances."
      when: online_upgrade is defined
    # The service task below tests `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    - service:
        name: "{{ item }}"
        enabled: false
        state: stopped
      with_items: helion_overcloud_bootstrap_controller_services
      when: helion is defined and instance_status == "ACTIVE" and item in existing_services
    - include: stop_tgt.yml
      when: instance_status == "ACTIVE"
    - include: step_stop_ns_metadata_proxy.yml
      when: instance_status == "ACTIVE"
# One node of the Galera cluster has to be the last to go down: the Galera
# bootstrap must later be run on that node, or none of the nodes will come
# back up. For now controller-bootstrap fills that role, so the ordinary
# controllers are stopped first, one at a time (serial: 1). Eventually the
# "special" node could be selected with host facts and conditionals.
- name: Stop MySQL/RabbitMQ on controller nodes
  hosts: controller
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  serial: 1
  tasks:
    # galera_status and galera_cluster_size, used below, are presumably set
    # by this include - confirm.
    - include: galera_status.yml
      when: instance_status == "ACTIVE"
    - name: Stop MySQL under normal circumstances
      include: stop_mysql.yml
      when: instance_status == "ACTIVE" and galera_status == "Synced" and galera_cluster_size != "1"
    - name: Stop MySQL if last node in cluster and single_controller flag has been set.
      include: stop_mysql.yml
      when: instance_status == "ACTIVE" and single_controller is defined and galera_status == "Synced" and galera_cluster_size == "1"
    # The two guards below abort when neither stop case above applied.
    - fail:
        msg: "Galera Replication is out of sync - cannot safely proceed"
      when: single_controller is not defined and instance_status == "ACTIVE" and galera_status == "Out of Sync"
    - fail:
        msg: "Galera Replication - Node appears to be the last node in a cluster - cannot safely proceed unless overriden via single_controller setting - See README.rst"
      when: instance_status == "ACTIVE" and single_controller is not defined and galera_cluster_size == "1"
    # The rabbitmqctl calls are limited to ACTIVE instances, like every other
    # remote task in this play; a node in any other state is skipped here.
    - name: Stop RabbitMQ Application for shutdown
      command: rabbitmqctl stop_app
      when: instance_status == "ACTIVE"
    - name: Remove the node from the RabbitMQ cluster
      command: rabbitmqctl reset
      when: instance_status == "ACTIVE"
    - service:
        name: rabbitmq-server
        state: stopped
      when: instance_status == "ACTIVE"
    - name: "Waiting for MySQL to stop"
      wait_for:
        port: 3307
        state: stopped
        timeout: 60
        delay: 10
      when: instance_status == "ACTIVE" and helion is defined and single_controller is not defined and galera_status == 'Synced'
    - name: "Waiting for rabbitmq-server to stop"
      wait_for:
        port: 5672
        state: stopped
        timeout: 60
        delay: 10
      when: instance_status == "ACTIVE"
    - include: disable_os_collect_config.yml
# The bootstrap controller goes down last. Unless single_controller is set,
# it must be in sync and be the only remaining cluster member.
- name: Stop MySQL/RabbitMQ on Overcloud Controller Bootstrap node
  hosts: controller-bootstrap
  tags: shutdown-cloud
  sudo: true
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: galera_status.yml
      when: instance_status == "ACTIVE"
    - fail:
        msg: "Galera Replication on controller Management is out of sync - cannot safely proceed"
      when: >-
        instance_status == "ACTIVE" and single_controller is not defined
        and galera_status != "Synced"
    - fail:
        msg: "Galera Replication on controller Management - cannot safely proceed as another MySQL cluster node is active."
      when: >-
        instance_status == "ACTIVE" and single_controller is not defined
        and galera_cluster_size != "1"
    - include: stop_mysql.yml
      when: instance_status == "ACTIVE"
    - service:
        name: rabbitmq-server
        enabled: false
        state: stopped
      when: instance_status == "ACTIVE"
    - name: "Waiting for rabbitmq-server to stop"
      wait_for:
        port: 5672
        state: stopped
        timeout: 60
        delay: 10
      when: instance_status == "ACTIVE"
    - include: disable_os_collect_config.yml
# Final sweep over every host outside the `unknown` group: apply
# disable_os_collect_config.yml to each ACTIVE instance.
- hosts: all:!unknown
  tags: shutdown-cloud
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: disable_os_collect_config.yml
      when: instance_status == "ACTIVE"
# Save state from the running undercloud, rebuild it from the new image, and
# wait for SSH to return.
- name: Rebuild and Refresh Undercloud
  hosts: undercloud
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # The preservation steps only run against an ACTIVE instance.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: step_undercloud_backup_tftpboot.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # The rebuild is skipped for an instance already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ undercloud_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Poll port 22 from the Ansible host. wait_for_hostkey selects which
    # banner text is awaited; a failed wait does not stop the play.
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Bring the rebuilt undercloud back: configuration, database, message bus,
# then the service list.
- name: Enable Undercloud
  hosts: undercloud
  sudo: true
  max_fail_percentage: 0
  tasks:
    - service_facts:
      when: instance_status == "ACTIVE"
    - include: step_reset_mnt_state_permissions.yml
    # os-apply-config is invoked directly so the configuration files get
    # written out.
    - include: step_os-apply-config.yml
    - include: step_undercloud_restore_tftpboot.yml
    - include: start_mysql.yml
    - include: start_rabbitmq.yml
    # Ironic reservations are released because of this bug:
    # https://bugs.launchpad.net/ironic/+bug/1382698
    - include: step_undercloud_ironic_release_reservations.yml
    - include: step_run_occ.yml
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_undercloud_services
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: undercloud_services
      when: helion is not defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Save state from the bootstrap controller, rebuild it from the controller
# image, and wait for SSH to return.
- name: Rebuild and Refresh controller-bootstrap
  hosts: controller-bootstrap
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    # The preservation and cleanup steps only run against an ACTIVE instance.
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # The rebuild is skipped for an instance already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controller_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Poll port 22 from the Ansible host. wait_for_hostkey selects which
    # banner text is awaited; a failed wait does not stop the play.
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - pause:
        seconds: 30
        prompt: "Allowing controller-bootstrap node to settle"
# Bring up the first cluster member on the rebuilt bootstrap controller:
# configuration, MySQL (bootstrapped unless single_controller is set),
# keepalived/haproxy, databases, RabbitMQ.
- name: Start initial cluster node
  hosts: controller-bootstrap
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: activate_cinder_volumes.yml
    - include: mysql_init_fix.yml
    - include: rabbitmq_occ_disable.yml
    - include: refresh_config.yml
    - name: "Work around apache2 starting up at boot w/o config..."
      service:
        name: apache2
        enabled: false
        state: stopped
    - name: "Run os-collect-config"
      command: os-collect-config --force --one
    - include: step_reset_mnt_state_permissions.yml
    # os-apply-config is invoked directly so the configuration files get
    # written out.
    - include: step_os-apply-config.yml
    - include: step_generate_hosts_file.yml
    - name: Wait for cloud-init to Complete
      wait_for:
        path: /run/cloud-init/result.json
        state: present
    - name: Wait for ovs-vswitchd to be started
      wait_for:
        path: /var/run/openvswitch/ovs-vswitchd.pid
        state: present
    - name: Wait for ovs-vswitchd to config during start-up
      pause:
        minutes: 1
    # Everything guarded by `single_controller is not defined` applies to
    # multi-controller deployments only.
    - name: Bootstrap the MySQL cluster
      command: /etc/init.d/mysql bootstrap-pxc
      when: single_controller is not defined
    - include: start_mysql.yml
    - name: "Start keepalived if not in single_controller mode"
      service:
        name: keepalived
        state: started
        enabled: true
      when: single_controller is not defined
    - name: "Start haproxy if not in single_controller mode"
      service:
        name: haproxy
        state: started
        enabled: true
      when: single_controller is not defined
    - name: "Pause for 60 seconds if not in single_controller mode"
      pause:
        seconds: 60
        prompt: "Pausing for 60 seconds to allow keepalived/haproxy to enter operational states"
      when: single_controller is not defined
    - include: step_create_databases.yml
    - include: start_rabbitmq.yml
    - include: step_run_occ.yml
    - name: Wait for Rabbit to listen on its usual port
      wait_for:
        port: 5672
        state: started
        timeout: 90
        delay: 10
    - include: mysql_access_fix.yml
# Save state from the remaining controllers, stop their services, rebuild
# them from the controller image, and wait for SSH to return.
- name: Rebuild and Refresh Controller
  hosts: controller
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    # The same include is used with whichever service list matches `helion`.
    - include: step_stop_services.yml
      vars:
        services_to_stop: "{{ overcloud_controller_services }}"
      when: instance_status == "ACTIVE" and helion is not defined
    - include: step_stop_services.yml
      vars:
        services_to_stop: "{{ helion_overcloud_controller_services }}"
      when: instance_status == "ACTIVE" and helion is defined
    - include: cleanup_cinder_volumes.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # The rebuild is skipped for an instance already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ controller_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Poll port 22 from the Ansible host. wait_for_hostkey selects which
    # banner text is awaited; a failed wait does not stop the play.
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - pause:
        seconds: 30
        prompt: "Allowing controller node to settle."
# Prepare the rebuilt controllers and start MySQL on them.
- name: Stop and setup for controller refresh
  hosts: controller
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: activate_cinder_volumes.yml
    # Accept inbound tcp/4444, 4567 and 4568 ahead of the MySQL start.
    - name: "Inject Firewall rules for for MySQL to start - tcp/4444"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4444 -j ACCEPT
    - name: "Inject Firewall rules for for MySQL to start - tcp/4567"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4567 -j ACCEPT
    - name: "Inject Firewall rules for for MySQL to start - tcp/4568"
      sudo: true
      command: /sbin/iptables -I INPUT -p tcp --dport 4568 -j ACCEPT
    # MySQL is stopped before it is started so that, if it came up at boot,
    # it will hopefully pick up the new configuration that os-collect-config
    # and os-apply-config put in place.
    # NOTE(review): presumably the stop happens in mysql_init_fix.yml - confirm.
    - include: mysql_init_fix.yml
    - include: rabbitmq_occ_disable.yml
    - include: step_reset_mnt_state_permissions.yml
    - include: refresh_config.yml
    - name: "Work around apache2 starting up at boot w/o config..."
      service:
        name: apache2
        enabled: false
        state: stopped
    # os-apply-config is invoked directly in case os-collect-config failed to
    # get as far as writing the configuration files.
    - include: step_os-apply-config.yml
    - include: step_generate_hosts_file.yml
    - name: Wait for cloud-init to Complete
      wait_for:
        path: /run/cloud-init/result.json
        state: present
    - name: Wait for ovs-vswitchd to be started
      wait_for:
        path: /var/run/openvswitch/ovs-vswitchd.pid
        state: present
    - name: Wait for ovs-vswitchd to config during start-up
      pause:
        minutes: 1
    - include: start_mysql.yml
    - include: mysql_access_fix.yml
# Database creation is run on the controllers one host at a time (serial: 1).
- name: Initiate Database Creation
  hosts: controller
  max_fail_percentage: 0
  serial: 1
  sudo: true
  tasks:
    - include: step_create_databases.yml
# Start RabbitMQ on the controllers, rejoin the cluster and run the
# step_run_occ.yml tasks.
- name: Complete Controller Refresh
  hosts: controller
  max_fail_percentage: 0
  sudo: true
  tasks:
    - include: start_rabbitmq.yml
    - include: rabbitmq_rejoin_cluster.yml
    - include: step_run_occ.yml
    - name: Wait for Rabbit to listen on its usual port
      wait_for:
        port: 5672
        state: started
        timeout: 120
        delay: 10
# After a short pause, query the RabbitMQ cluster status on every controller,
# the bootstrap node included.
- name: Check RabbitMQ
  hosts: controller:controller-bootstrap
  max_fail_percentage: 0
  tasks:
    - pause:
        seconds: 30
        prompt: "Giving RabbitMQ time to start-up."
    # The status query is skipped when single_controller is set.
    - name: Checking rabbitmq cluster status
      sudo: true
      command: rabbitmqctl cluster_status
      when: single_controller is not defined
    - include: cleanup_rabbitmq_start.yml
# Re-enable and start the services on the bootstrap controller.
- name: Enable Overcloud controller-bootstrap
  hosts: controller-bootstrap
  sudo: true
  max_fail_percentage: 0
  tasks:
    # The service task below tests `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Iterate the same list the "Disable Overcloud Controller Bootstrap node"
    # play stops (..._services, plural). The singular spelling previously
    # used here does not name that list, so no service was started again.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_bootstrap_controller_services
      when: helion is defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Re-enable and start the services on the controllers.
- name: Enable Overcloud Controller
  hosts: controller
  sudo: true
  max_fail_percentage: 0
  tasks:
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_controller_services
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: overcloud_controller_services
      when: helion is not defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Rebuild the swift storage nodes from their image and wait for SSH to
# return.
- name: Rebuild and Refresh swift-storage
  hosts: swift-storage
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # The rebuild is skipped for an instance already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ swift_storage_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Poll port 22 from the Ansible host. wait_for_hostkey selects which
    # banner text is awaited; a failed wait does not stop the play.
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Re-enable and start the services on the swift storage nodes.
- name: Enable Swift Storage
  hosts: swift-storage
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: step_run_occ.yml
      sudo: true
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_swift_services
      sudo: true
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: overcloud_swift_services
      sudo: true
      when: helion is not defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Rebuild the VSA nodes from their image and wait for SSH to return.
- name: Rebuild and Refresh vsa
  hosts: vsa
  gather_facts: false
  max_fail_percentage: 0
  tasks:
    - include: preserve_ssh_host_keys.yml
      when: instance_status == "ACTIVE"
    - include: step_unmount.yml
      when: instance_status == "ACTIVE"
    # The rebuild is skipped for an instance already in REBUILD state.
    - include: rebuild.yml
      instance_id: "{{ instance_id }}"
      rebuild_image_id: "{{ vsa_rebuild_image_id }}"
      when: instance_status != "REBUILD"
    # Poll port 22 from the Ansible host. wait_for_hostkey selects which
    # banner text is awaited; a failed wait does not stop the play.
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=TRIPLEO_HK_RESTORED delay=10
      when: wait_for_hostkey is defined
      ignore_errors: true
    - local_action: >-
        wait_for port=22 timeout="{{ ssh_timeout }}"
        host="{{ inventory_hostname }}"
        search_regex=OpenSSH delay=10
      when: wait_for_hostkey is not defined
      ignore_errors: true
    - include: refresh_config.yml
# Re-enable and start the services on the VSA nodes. Only a helion service
# list is handled here.
- name: Enable VSA
  hosts: vsa
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: step_run_occ.yml
    - service_facts:
      when: instance_status == "ACTIVE"
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_vsa_services
      when: helion is defined and item in existing_services
    - include: enable_start_os_collect_config.yml
# Fetch the compute image for an online upgrade.
- name: "Download image from glance if online upgrade is being invoked"
  hosts: nova-compute
  gather_facts: false
  max_fail_percentage: 0
  # One host at a time: the download writes files on the local machine that
  # is running Ansible.
  serial: 1
  tasks:
    - include: step_update_online_download_image.yml
      vars:
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: online_upgrade is defined
# Update the compute nodes, either by a rebuild (offline) or in place
# (online), then stamp the image id.
- name: Rebuild and Refresh Nova Compute
  hosts: nova-compute
  gather_facts: true
  max_fail_percentage: 0
  tasks:
    - include: step_preserve_iscsi_initiator.yml
      when: instance_status == "ACTIVE"
    - include: step_preserve_password_file.yml
      when: instance_status == "ACTIVE"
    # online_upgrade picks exactly one of the two update paths.
    - include: step_update_rebuild_node.yml
      vars:
        instance_id: "{{ instance_id }}"
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: online_upgrade is not defined
    - include: step_update_online.yml
      vars:
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
      when: online_upgrade is defined
    # The image id is stamped on both paths.
    - include: step_stamp_image_id.yml
      vars:
        rebuild_image_id: "{{ nova_compute_rebuild_image_id }}"
    - include: step_cleanup_from_online_upgrade.yml
      when: online_upgrade is defined
# Bring the updated compute nodes back into service.
- name: Enable Overcloud Compute
  hosts: nova-compute
  sudo: true
  max_fail_percentage: 0
  tasks:
    - include: step_os-apply-config.yml
    - include: step_restore_iscsi_initiator.yml
    - pause:
        seconds: 45
        prompt: "Giving the compute node forty-five seconds to complete existing processes"
      when: online_upgrade is not defined
    # Original note: config files are written out because we may be getting
    # in while the system is still starting up.
    # NOTE(review): this may describe step_os-apply-config.yml above rather
    # than the include below - confirm.
    - include: step_cloud_init.yml
      when: online_upgrade is defined
    - name: Wait for cloud-init to Complete
      wait_for:
        path: /run/cloud-init/result.json
        state: present
    # The Open vSwitch waits apply to the offline (rebuild) path only.
    - name: Wait for ovs-vswitchd to be started
      wait_for:
        path: /var/run/openvswitch/ovs-vswitchd.pid
        state: present
      when: online_upgrade is not defined
    - name: Wait for ovs-vswitchd to config during start-up
      pause:
        minutes: 1
      when: online_upgrade is not defined
    - include: step_run_occ.yml
    # pause takes its text via prompt=, as every other pause in this playbook
    # does; msg= is not a pause parameter.
    - pause:
        seconds: 30
        prompt: "Pausing for 30 seconds to allow services to complete start-up."
    # The service tasks below test `item in existing_services`; that variable
    # is presumably provided by this module - confirm.
    - service_facts:
      when: instance_status == "ACTIVE"
    # Exactly one of the two service lists is used, depending on `helion`.
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: helion_overcloud_compute_services
      when: helion is defined and item in existing_services
    - service:
        name: "{{ item }}"
        enabled: true
        state: started
      with_items: overcloud_compute_services
      when: helion is not defined and item in existing_services
    - pause:
        seconds: 60
        prompt: "Giving Open vSwitch time to reconnect"
    # nova-compute should already be running by now; this task explicitly
    # sets it to start on boot.
    - service:
        name: nova-compute
        state: started
        enabled: true
    - include: enable_start_os_collect_config.yml
# Run the plays from step_post_hook.yml once every node has been processed
# (presumably a site-specific post-upgrade hook - confirm).
- include: step_post_hook.yml