From cc61ff93ec41e149caba31cc21524f37def4d07e Mon Sep 17 00:00:00 2001 From: Martin Schuppert Date: Thu, 22 Nov 2018 15:08:11 +0100 Subject: [PATCH] Change step to start nova placement and make compute wait for it There is a deployment race where nova-placement fails to start if the nova api db migrations have not finished before starting it. We start nova placement early to make sure it is up before the nova-compute services get started. Since in an HA scenario there is no sync in between the nodes on the currently worked deployment step, we might have the situation that the placement service gets started on C1/2 when the nova api db sync is not yet finished on C0. We have two possibilities: 1) start placement later and verify that nova-computes recover correctly 2) verify that the db migration on the nova_api db finished before starting nova-placement on the controllers 2) which was addressed via https://review.openstack.org/610966 showed problems: a) the docker/podman container failed to start with some file not found error, therefore this was reverted in https://review.openstack.org/619607 b) when the scripts were running on different controllers at the same time, the way nova's db_version() is implemented has issues, which is being worked on in https://review.openstack.org/619622 This patch addresses 1) and moves the placement service start to step_4 and adds an additional task on the computes to wait until the placement service is up. 
Closes-Bug: #1784155 Change-Id: Ifb5ffc4b25f5ca266560bc0ac96c73071ebd1c9f --- docker/services/nova-compute-common.yaml | 3 + docker/services/nova-compute.yaml | 14 ++- docker/services/nova-placement.yaml | 2 +- .../nova_wait_for_placement_service.py | 110 ++++++++++++++++++ 4 files changed, 127 insertions(+), 2 deletions(-) create mode 100755 docker_config_scripts/nova_wait_for_placement_service.py diff --git a/docker/services/nova-compute-common.yaml b/docker/services/nova-compute-common.yaml index 65f95baf09..55ccbed696 100644 --- a/docker/services/nova-compute-common.yaml +++ b/docker/services/nova-compute-common.yaml @@ -40,3 +40,6 @@ outputs: nova_statedir_ownership.py: mode: "0700" content: { get_file: ../../docker_config_scripts/nova_statedir_ownership.py } + nova_wait_for_placement_service.py: + mode: "0700" + content: { get_file: ../../docker_config_scripts/nova_wait_for_placement_service.py } diff --git a/docker/services/nova-compute.yaml b/docker/services/nova-compute.yaml index 0c69810af6..ce8863c992 100644 --- a/docker/services/nova-compute.yaml +++ b/docker/services/nova-compute.yaml @@ -197,10 +197,22 @@ outputs: detach: false volumes: - /var/lib/nova:/var/lib/nova:shared,z - - /var/lib/docker-config-scripts/:/docker-config-scripts/ + - /var/lib/docker-config-scripts/:/docker-config-scripts/:z command: "/docker-config-scripts/pyshim.sh /docker-config-scripts/nova_statedir_ownership.py" step_4: + nova_wait_for_placement_service: + start_order: 2 + image: *nova_compute_image + user: root + net: host + privileged: false + detach: false + volumes: + - /var/lib/docker-config-scripts/:/docker-config-scripts/:z + - /var/lib/config-data/puppet-generated/nova_libvirt/etc/nova:/etc/nova:ro + command: "/docker-config-scripts/pyshim.sh /docker-config-scripts/nova_wait_for_placement_service.py" nova_compute: + start_order: 3 image: *nova_compute_image ulimit: {get_param: DockerNovaComputeUlimit} ipc: host diff --git a/docker/services/nova-placement.yaml 
b/docker/services/nova-placement.yaml index ee32b2e37b..c13b0649d0 100644 --- a/docker/services/nova-placement.yaml +++ b/docker/services/nova-placement.yaml @@ -118,7 +118,7 @@ outputs: step_2: get_attr: [NovaPlacementLogging, docker_config, step_2] # start this early so it is up before computes start reporting - step_3: + step_4: nova_placement: start_order: 1 image: {get_param: DockerNovaPlacementImage} diff --git a/docker_config_scripts/nova_wait_for_placement_service.py b/docker_config_scripts/nova_wait_for_placement_service.py new file mode 100755 index 0000000000..37126cc913 --- /dev/null +++ b/docker_config_scripts/nova_wait_for_placement_service.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# +# Copyright 2018 Red Hat Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# shell script to check if nova API DB migrations finished after X attempts. +# Default max is 60 iterations with 10s (default) timeout in between. 
+ +from __future__ import print_function + +import logging +import os +import re +import sys +import time + +from keystoneauth1.identity import v3 +from keystoneauth1 import session +from keystoneclient.v3 import client +import requests +from six.moves.configparser import SafeConfigParser + + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +LOG = logging.getLogger('nova_wait_for_placement_service') + +iterations = 60 +timeout = 10 +nova_cfg = '/etc/nova/nova.conf' + +if __name__ == '__main__': + if os.path.isfile(nova_cfg): + config = SafeConfigParser() + config.read(nova_cfg) + else: + LOG.error('Nova configuration file %s does not exist', nova_cfg) + sys.exit(1) + + # get keystone client with details from [placement] section + auth = v3.Password( + user_domain_name=config.get('placement', 'user_domain_name'), + username=config.get('placement', 'username'), + password=config.get('placement', 'password'), + project_name=config.get('placement', 'project_name'), + project_domain_name=config.get('placement', 'user_domain_name'), + auth_url=config.get('placement', 'auth_url')+'/v3') + sess = session.Session(auth=auth) + keystone = client.Client(session=sess) + + iterations_endpoint = iterations + placement_endpoint_url = None + while iterations_endpoint > 1: + iterations_endpoint -= 1 + try: + # get placement service id + placement_service_id = keystone.services.list( + name='placement')[0].id + + # get placement endpoint (valid_interfaces) + placement_endpoint_url = keystone.endpoints.list( + service=placement_service_id, + interface=config.get('placement', 'valid_interfaces'))[0].url + if not placement_endpoint_url: + LOG.error('Failed to get placement service endpoint!') + else: + break + except Exception as e: + LOG.exception('Retry - Failed to get placement service endpoint:') + time.sleep(timeout) + + if not placement_endpoint_url: + LOG.error('Failed to get placement service endpoint!') + sys.exit(1) + + # we should have CURRENT in the request 
response from placement: + # {"versions": [{"status": "CURRENT", "min_version": "1.0", "max_version": + # "1.29", "id": "v1.0", "links": [{"href": "", "rel": "self"}]}]} + response_reg = re.compile('.*CURRENT,*') + + while iterations > 1: + iterations -= 1 + try: + r = requests.get(placement_endpoint_url+'/', verify=False) + if r.status_code == 200 and response_reg.match(r.text): + LOG.info('Placement service up! - %s', r.text) + sys.exit(0) + break + else: + LOG.info('response - %r', r) + LOG.info('Placement service not up - %s, %s', + r.status_code, + r.text) + except Exception as e: + LOG.exception('Error query the placement endpoint:') + time.sleep(timeout) + + sys.exit(1) + +# vim: set et ts=4 sw=4 :