From 96a96a26557092971604441a53f74a0cb98979a5 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 17 Jun 2019 13:48:13 +0100 Subject: [PATCH] Restart all nova services after upgrade During an upgrade, nova pins the version of RPC calls to the minimum seen across all services. This ensures that old services do not receive data they cannot handle. After the upgrade is complete, all nova services are supposed to be reloaded via SIGHUP to cause them to check again the RPC versions of services and use the new latest version which should now be supported by all running services. Due to a bug [1] in oslo.service, sending services SIGHUP is currently broken. We replaced the HUP with a restart for the nova_compute container for bug 1821362, but not other nova services. It seems we need to restart all nova services to allow the RPC version pin to be removed. Testing in a Queens to Rocky upgrade, we find the following in the logs: Automatically selected compute RPC version 5.0 from minimum service version 30 However, the service version in Rocky is 35. There is a second issue in that it takes some time for the upgraded services to update the nova services database table with their new version. We need to wait until all nova-compute services have done this before the restart is performed, otherwise the RPC version cap will remain in place. There is currently no interface in nova available for checking these versions [2], so as a workaround we use a configurable delay with a default duration of 30 seconds. Testing showed it takes about 10 seconds for the version to be updated, so this gives us some headroom. This change restarts all nova services after an upgrade, after a 30 second delay. 
[1] https://bugs.launchpad.net/oslo.service/+bug/1715374 [2] https://bugs.launchpad.net/nova/+bug/1833542 Change-Id: Ia6fc9011ee6f5461f40a1307b72709d769814a79 Closes-Bug: #1833069 Related-Bug: #1833542 (cherry picked from commit e6d2b92200d02715649d923b0ef2d6981905a6b9) --- ansible/roles/nova/defaults/main.yml | 9 ++++ ansible/roles/nova/handlers/main.yml | 52 ++++++++++++++++++++ ansible/roles/nova/tasks/legacy_upgrade.yml | 2 - ansible/roles/nova/tasks/reload.yml | 38 -------------- ansible/roles/nova/tasks/rolling_upgrade.yml | 2 - 5 files changed, 61 insertions(+), 42 deletions(-) delete mode 100644 ansible/roles/nova/tasks/reload.yml diff --git a/ansible/roles/nova/defaults/main.yml b/ansible/roles/nova/defaults/main.yml index 8081b7ad45..cd19331ee0 100644 --- a/ansible/roles/nova/defaults/main.yml +++ b/ansible/roles/nova/defaults/main.yml @@ -368,6 +368,15 @@ nova_services_require_nova_conf: - nova-scheduler - nova-spicehtml5proxy +# After upgrading nova-compute, services will have an RPC version cap in place. +# We need to restart all services that communicate with nova-compute in order +# to allow them to use the latest RPC version. Ideally, there would be a way to +# check whether all nova services are using the latest version, but currently +# there is not. Instead, wait a short time for all nova compute services to +# update the version of their service in the database. This seems to take +# around 10 seconds, but the default is 30 to allow room for slowness. 
+nova_compute_startup_delay: 30 + #################### # Notification #################### diff --git a/ansible/roles/nova/handlers/main.yml b/ansible/roles/nova/handlers/main.yml index ad73838e9a..faa2072fcf 100644 --- a/ansible/roles/nova/handlers/main.yml +++ b/ansible/roles/nova/handlers/main.yml @@ -319,3 +319,55 @@ - kolla_action != "config" - inventory_hostname in groups['compute'] - enable_nova_fake | bool + +# NOTE(mgoddard): After upgrading nova-compute, services will have an RPC +# version cap in place. We need to restart all services that communicate with +# nova-compute in order to allow them to use the latest RPC version. Ideally, +# there would be a way to check whether all nova services are using the latest +# version, but currently there is not. Instead, wait a short time for all nova +# compute services to update the version of their service in the database. +# This seems to take around 10 seconds, but the default is 30 to allow room +# for slowness. + +- name: Wait for nova-compute services to update service versions + pause: + seconds: "{{ nova_compute_startup_delay }}" + run_once: true + when: + - kolla_action == 'upgrade' + listen: + - Restart nova-compute container + - Restart nova-compute-ironic container + - Restart nova-compute-fake containers + +# NOTE(mgoddard): Currently (just prior to Stein release), sending SIGHUP to +# nova compute services leaves them in a broken state in which they cannot +# start new instances. The following error is seen in the logs: +# "In shutdown, no new events can be scheduled" +# To work around this we restart the nova-compute services. +# Speaking to the nova team, this seems to be an issue in oslo.service, +# with a fix proposed here: https://review.openstack.org/#/c/641907. +# This issue also seems to affect the proxy services, which exit non-zero in +# response to a SIGHUP, so restart those too. 
+# The issue actually affects all nova services, since they remain with RPC +# version pinned to the previous release: +# https://bugs.launchpad.net/kolla-ansible/+bug/1833069. +# TODO(mgoddard): Use SIGHUP when this bug has been fixed. + +- name: Restart nova services to remove RPC version cap + become: true + kolla_docker: + action: restart_container + common_options: "{{ docker_common_options }}" + name: "{{ item.value.container_name }}" + when: + - kolla_action == 'upgrade' + - inventory_hostname in groups[item.value.group] + - item.value.enabled | bool + - item.key in nova_services_require_nova_conf + - item.key != 'placement-api' + with_dict: "{{ nova_services }}" + listen: + - Restart nova-compute container + - Restart nova-compute-ironic container + - Restart nova-compute-fake containers diff --git a/ansible/roles/nova/tasks/legacy_upgrade.yml b/ansible/roles/nova/tasks/legacy_upgrade.yml index bd931d5282..04c879e3bc 100644 --- a/ansible/roles/nova/tasks/legacy_upgrade.yml +++ b/ansible/roles/nova/tasks/legacy_upgrade.yml @@ -26,5 +26,3 @@ - name: Flush handlers meta: flush_handlers - -- include_tasks: reload.yml diff --git a/ansible/roles/nova/tasks/reload.yml b/ansible/roles/nova/tasks/reload.yml deleted file mode 100644 index 96f57f9681..0000000000 --- a/ansible/roles/nova/tasks/reload.yml +++ /dev/null @@ -1,38 +0,0 @@ ---- -# This play calls sighup on every service to refresh upgrade levels - -# NOTE(mgoddard): Currently (just prior to Stein release), sending SIGHUP to -# nova compute services leaves them in a broken state in which they cannot -# start new instances. The following error is seen in the logs: -# "In shutdown, no new events can be scheduled" -# To work around this we restart the nova-compute services. -# Speaking to the nova team, this seems to be an issue in oslo.service, -# with a fix proposed here: https://review.openstack.org/#/c/641907. 
-# This issue also seems to affect the proxy services, which exit non-zero in -# reponse to a SIGHUP, so restart those too. -# TODO(mgoddard): Remove this workaround when this bug has been fixed. - -- name: Send SIGHUP to nova services - become: true - command: docker exec -t {{ item.value.container_name }} kill -1 1 - when: - - inventory_hostname in groups[item.value.group] - - item.value.enabled | bool - - item.key in nova_services_require_nova_conf - - not item.key.startswith('nova-compute') - - not item.key.endswith('proxy') - with_dict: "{{ nova_services }}" - -- name: Restart nova compute and proxy services - become: true - kolla_docker: - action: restart_container - common_options: "{{ docker_common_options }}" - name: "{{ item.value.container_name }}" - when: - - inventory_hostname in groups[item.value.group] - - item.value.enabled | bool - - item.key in nova_services_require_nova_conf - - item.key.startswith('nova-compute') - or item.key.endswith('proxy') - with_dict: "{{ nova_services }}" diff --git a/ansible/roles/nova/tasks/rolling_upgrade.yml b/ansible/roles/nova/tasks/rolling_upgrade.yml index 76ed9c56cd..962321975b 100644 --- a/ansible/roles/nova/tasks/rolling_upgrade.yml +++ b/ansible/roles/nova/tasks/rolling_upgrade.yml @@ -25,8 +25,6 @@ - name: Flush handlers meta: flush_handlers -- include_tasks: reload.yml - - name: Migrate Nova database vars: nova_api: "{{ nova_services['nova-api'] }}"