From 826f6850d06afbbad95727bdf6cf53fb986b7cf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Piliszek?= Date: Sun, 28 Jul 2019 11:41:46 +0200 Subject: [PATCH] ceph: fixes to deployment and upgrade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) ceph-nfs (ganesha-ceph) - use NFSv4 only This is recommended upstream. v3 and UDP require portmapper (aka rpcbind) which we do not want, except where Ubuntu ganesha version (2.6) forces it by requiring enabled UDP, see [1]. The issue has been fixed in 2.8, included in CentOS. Additionally, disable v3 helper protocols and kerberos to avoid meaningless warnings. 2) ceph-nfs (ganesha-ceph) - do not export host dbus It is not in use. This avoids the temptation to try handling it on the host. 3) Properly handle ceph services deploy and upgrade Upgrade runs deploy. The order has been corrected - nfs goes after mds. Additionally, upgrade takes care of rgw for keystone (for swift emulation). 4) Enhance ceph keyring module with error detection Now it does not blindly try to create a keyring after any failure. This used to hide the real issue. 5) Retry ceph admin keyring update until cluster works Reordering deployment caused an issue with the ceph cluster not being fully operational before actions were taken on it. 6) CI: Remove osd df from collected logs as it may hang CI Hangs are caused by a healthy MON with no healthy MGR. A descriptive note is left in its place. 7) CI: Add 5s timeout to ceph informational commands This decreases the timeout from the default 300s. 
[1] https://review.opendev.org/669315 Change-Id: I1cf0ad10b80552f503898e723f0c4bd00a38f143 Signed-off-by: Radosław Piliszek --- ansible/library/kolla_ceph_keyring.py | 11 ++++-- ansible/roles/ceph/tasks/deploy.yml | 35 ++++++++++++------- ansible/roles/ceph/tasks/start_nfss.yml | 1 - ansible/roles/ceph/tasks/upgrade.yml | 36 +------------------- ansible/roles/ceph/templates/ganesha.conf.j2 | 11 ++++++ tests/get_logs.sh | 8 +++-- 6 files changed, 48 insertions(+), 54 deletions(-) diff --git a/ansible/library/kolla_ceph_keyring.py b/ansible/library/kolla_ceph_keyring.py index 5041563ecd..ba61d36e98 100644 --- a/ansible/library/kolla_ceph_keyring.py +++ b/ansible/library/kolla_ceph_keyring.py @@ -15,6 +15,7 @@ # limitations under the License. import json +import re import subprocess # nosec @@ -51,13 +52,16 @@ EXAMPLES = ''' name: client.admin container_name: ceph_mon caps: - mds: 'allow' + mds: 'allow *' mon: 'allow *' osd: 'allow *' mgr: 'allow *' ''' +enoent_re = re.compile(r"\bENOENT\b") + + class CephKeyring(object): def __init__(self, name, caps, container_name='ceph_mon'): self.name = name @@ -93,7 +97,10 @@ class CephKeyring(object): def ensure_keyring(self): try: stdout = self.get_keyring() - except subprocess.CalledProcessError: + except subprocess.CalledProcessError as e: + if e.returncode != 2 or not enoent_re.search(e.output): + # this is not a missing keyring case + raise # keyring doesn't exsit, try to create it stdout = self.create_keyring() self.changed = True diff --git a/ansible/roles/ceph/tasks/deploy.yml b/ansible/roles/ceph/tasks/deploy.yml index c72eced8ef..869be9d419 100644 --- a/ansible/roles/ceph/tasks/deploy.yml +++ b/ansible/roles/ceph/tasks/deploy.yml @@ -1,4 +1,6 @@ --- +# NOTE(yoctozepto): this file is used during upgrade as well + - include_tasks: config.yml - include_tasks: bootstrap_mons.yml @@ -9,19 +11,8 @@ - include_tasks: start_mons.yml when: inventory_hostname in groups['ceph-mon'] -- include_tasks: start_mgrs.yml - when: 
inventory_hostname in groups['ceph-mgr'] - -- include_tasks: start_ceph_dashboard.yml - when: - - enable_ceph_dashboard | bool - - inventory_hostname in groups['ceph-mon'] - -- include_tasks: start_nfss.yml - when: - - enable_ceph_nfs | bool - - inventory_hostname in groups['ceph-nfs'] - +# NOTE(yoctozepto): this ensures caps for admin are always up-to-date (run as early as possible = after MONs start) +# this is retried because the cluster might not be fully operational yet (quorum gathering) - name: configuring client.admin caps become: true kolla_ceph_keyring: @@ -29,6 +20,18 @@ caps: "{{ ceph_client_admin_keyring_caps }}" run_once: True delegate_to: "{{ groups['ceph-mon'][0] }}" + register: result + until: result is success + retries: 3 + delay: 15 + +- include_tasks: start_mgrs.yml + when: inventory_hostname in groups['ceph-mgr'] + +- include_tasks: start_ceph_dashboard.yml + when: + - enable_ceph_dashboard | bool + - inventory_hostname in groups['ceph-mon'] - include_tasks: bootstrap_osds.yml when: inventory_hostname in groups['ceph-osd'] @@ -50,3 +53,9 @@ when: - enable_ceph_mds | bool - inventory_hostname in groups['ceph-mds'] + +# NOTE(yoctozepto): nfs (cephfs-based) depends on mds so start it after +- include_tasks: start_nfss.yml + when: + - enable_ceph_nfs | bool + - inventory_hostname in groups['ceph-nfs'] diff --git a/ansible/roles/ceph/tasks/start_nfss.yml b/ansible/roles/ceph/tasks/start_nfss.yml index 642d7e50f4..61eaf7ad95 100644 --- a/ansible/roles/ceph/tasks/start_nfss.yml +++ b/ansible/roles/ceph/tasks/start_nfss.yml @@ -9,5 +9,4 @@ volumes: - "{{ node_config_directory }}/ceph-nfs/:{{ container_config_directory }}/:ro" - "/etc/localtime:/etc/localtime:ro" - - "/var/run/dbus:/var/run/dbus" - "kolla_logs:/var/log/kolla/" diff --git a/ansible/roles/ceph/tasks/upgrade.yml b/ansible/roles/ceph/tasks/upgrade.yml index 0669732510..6de4f2c343 100644 --- a/ansible/roles/ceph/tasks/upgrade.yml +++ b/ansible/roles/ceph/tasks/upgrade.yml @@ -1,39 +1,5
@@ --- -- include_tasks: config.yml - -# NOTE(jeffrey4l): client.admin caps should be update when upgrade from Jewel -# to Luminous -- name: configuring client.admin caps - become: true - kolla_ceph_keyring: - name: client.admin - caps: "{{ ceph_client_admin_keyring_caps }}" - run_once: True - delegate_to: "{{ groups['ceph-mon'][0] }}" - -- include_tasks: start_mons.yml - when: inventory_hostname in groups['ceph-mon'] - -- include_tasks: start_mgrs.yml - when: inventory_hostname in groups['ceph-mgr'] - -- include_tasks: start_osds.yml - when: inventory_hostname in groups['ceph-osd'] - -- include_tasks: start_rgws.yml - when: - - enable_ceph_rgw | bool - - inventory_hostname in groups['ceph-rgw'] - -- include_tasks: start_mdss.yml - when: - - enable_ceph_mds | bool - - inventory_hostname in groups['ceph-mds'] - -- include_tasks: start_nfss.yml - when: - - enable_ceph_nfs | bool - - inventory_hostname in groups['ceph-nfs'] +- include_tasks: deploy.yml - name: Check final release (as running on MONs) become: true diff --git a/ansible/roles/ceph/templates/ganesha.conf.j2 b/ansible/roles/ceph/templates/ganesha.conf.j2 index 3860cdc796..2c51ae61d9 100644 --- a/ansible/roles/ceph/templates/ganesha.conf.j2 +++ b/ansible/roles/ceph/templates/ganesha.conf.j2 @@ -1,3 +1,14 @@ +NFS_CORE_PARAM { + Protocols = 4; + Enable_NLM = false; + Enable_RQUOTA = false; + Enable_UDP = false; +} + +NFS_KRB5 { + Active_krb5 = false; +} + EXPORT { Export_id=1; diff --git a/tests/get_logs.sh b/tests/get_logs.sh index 359f1f3f1c..fb2d0773f5 100644 --- a/tests/get_logs.sh +++ b/tests/get_logs.sh @@ -43,9 +43,11 @@ copy_logs() { # ceph related logs if [[ $(docker ps --filter name=ceph_mon --format "{{.Names}}") ]]; then - docker exec ceph_mon ceph -s > ${LOG_DIR}/kolla/ceph/ceph_s.txt - docker exec ceph_mon ceph osd df > ${LOG_DIR}/kolla/ceph/ceph_osd_df.txt - docker exec ceph_mon ceph osd tree > ${LOG_DIR}/kolla/ceph/ceph_osd_tree.txt + docker exec ceph_mon ceph --connect-timeout 5 -s > 
${LOG_DIR}/kolla/ceph/ceph_s.txt + # NOTE(yoctozepto): osd df removed on purpose to avoid CI POST_FAILURE due to a possible hang: + # as of ceph mimic it hangs when MON is operational but MGR not + # its usefulness is mediocre and having POST_FAILUREs is bad + docker exec ceph_mon ceph --connect-timeout 5 osd tree > ${LOG_DIR}/kolla/ceph/ceph_osd_tree.txt fi # bifrost related logs