Fix health-probe concurrency and timings

Changed Nova and Neutron health-probe script to exit if previous
probe process is still running.
The health-probe has an RPC call timeout of 60 seconds and 2
retries. In the worst-case scenario the probe process can run a little
over 180 seconds. Changing periodSeconds so that a probe starts only
after the previous one is complete. Also increasing the timeoutSeconds
value to give the probe a little extra time to finish.
Increasing the liveness probe periods, as they are not so critical,
which will reduce the resource usage of the probes.

Co-authored-by: Randeep Jalli <rj2083@att.com>

Change-Id: Ife1c381d663c1e271a5099bdc6d0dfefb00d8d73
This commit is contained in:
Sangeet Gupta 2020-02-07 17:22:06 +00:00 committed by Tin Lam
parent 5827236ad2
commit 414b10fab0
8 changed files with 97 additions and 36 deletions

View File

@ -39,8 +39,10 @@ Usage example for Neutron metadata agent:
import httplib2
from six.moves import http_client as httplib
import json
import os
import psutil
import signal
import socket
import sys
@ -292,8 +294,36 @@ def test_rpc_liveness():
check_agent_status(transport)
def check_pid_running(pid):
    """Return True if a process with the given PID currently exists.

    ``pid`` may be an int or a numeric string. A value that cannot be
    converted to an integer (``None``, an empty or non-numeric string) is
    reported as "not running" instead of raising, so a damaged pidfile
    entry does not crash the probe.
    """
    try:
        pid = int(pid)
    except (TypeError, ValueError, OverflowError):
        # No usable PID recorded, so there is nothing that could be running.
        return False
    return psutil.pid_exists(pid)
if __name__ == "__main__":
if "liveness-probe" in ','.join(sys.argv):
pidfile = "/tmp/liveness.pid" #nosec
else:
pidfile = "/tmp/readiness.pid" #nosec
data = {}
if os.path.isfile(pidfile):
with open(pidfile,'r') as f:
data = json.load(f)
if check_pid_running(data['pid']):
if data['exit_count'] > 1:
# Third time in, kill the previous process
os.kill(int(data['pid']), signal.SIGTERM)
else:
data['exit_count'] = data['exit_count'] + 1
with open(pidfile, 'w') as f:
json.dump(data, f)
sys.exit(0)
data['pid'] = os.getpid()
data['exit_count'] = 0
with open(pidfile, 'w') as f:
json.dump(data, f)
if "sriov_agent.ini" in ','.join(sys.argv):
sriov_readiness_check()
elif "metadata_agent.ini" not in ','.join(sys.argv):

View File

@ -81,8 +81,8 @@ spec:
- --use-fqdn
{{- end }}
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 65
periodSeconds: 190
timeoutSeconds: 185
livenessProbe:
exec:
command:
@ -99,8 +99,8 @@ spec:
- --use-fqdn
{{- end }}
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
command:
- /tmp/neutron-l2gw-agent.sh
volumeMounts:

View File

@ -341,28 +341,28 @@ pod:
enabled: true
params:
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 65
periodSeconds: 190
timeoutSeconds: 185
liveness:
enabled: true
params:
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
l3_agent:
l3_agent:
readiness:
enabled: true
params:
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 65
periodSeconds: 190
timeoutSeconds: 185
liveness:
enabled: true
params:
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
lb_agent:
lb_agent:
readiness:
@ -373,14 +373,14 @@ pod:
enabled: true
params:
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 65
periodSeconds: 190
timeoutSeconds: 185
liveness:
enabled: true
params:
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
ovs_agent:
ovs_agent:
readiness:
@ -390,16 +390,16 @@ pod:
enabled: true
params:
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
sriov_agent:
sriov_agent:
readiness:
enabled: true
params:
initialDelaySeconds: 30
periodSeconds: 15
timeoutSeconds: 65
periodSeconds: 190
timeoutSeconds: 185
server:
server:
readiness:

View File

@ -33,7 +33,10 @@ Usage example for Nova Compute:
"""
import json
import os
import psutil
import signal
import socket
import sys
@ -218,8 +221,36 @@ def test_rpc_liveness():
check_service_status(transport)
def check_pid_running(pid):
    """Return True if a process with the given PID currently exists.

    ``pid`` may be an int or a numeric string. A value that cannot be
    converted to an integer (``None``, an empty or non-numeric string) is
    reported as "not running" instead of raising, so a damaged pidfile
    entry does not crash the probe.
    """
    try:
        pid = int(pid)
    except (TypeError, ValueError, OverflowError):
        # No usable PID recorded, so there is nothing that could be running.
        return False
    return psutil.pid_exists(pid)
if __name__ == "__main__":
    # Liveness and readiness probes each track their own previous run, so
    # they use separate pidfiles.
    if "liveness-probe" in ','.join(sys.argv):
        pidfile = "/tmp/liveness.pid"  #nosec
    else:
        pidfile = "/tmp/readiness.pid"  #nosec

    data = {}
    if os.path.isfile(pidfile):
        # The pidfile is rewritten in place by other probe invocations, so it
        # can be observed empty, truncated or otherwise malformed. Treat any
        # unreadable state as "no previous probe" instead of failing the
        # probe with a traceback.
        try:
            with open(pidfile, 'r') as f:
                data = json.load(f)
            prev_pid = int(data['pid'])
            exit_count = int(data['exit_count'])
        except (IOError, OSError, ValueError, KeyError, TypeError,
                OverflowError):
            data = {}
            prev_pid = None

        if prev_pid is not None and check_pid_running(prev_pid):
            if exit_count > 1:
                # Third time in, kill the previous process
                os.kill(prev_pid, signal.SIGTERM)
            else:
                # The previous probe is still running: record that this
                # invocation backed off and report success without starting
                # a second concurrent probe.
                data['exit_count'] = exit_count + 1
                with open(pidfile, 'w') as f:
                    json.dump(data, f)
                sys.exit(0)

    # This invocation becomes the tracked probe process.
    data['pid'] = os.getpid()
    data['exit_count'] = 0
    with open(pidfile, 'w') as f:
        json.dump(data, f)

    test_rpc_liveness()

    sys.exit(0)  # return success

View File

@ -203,8 +203,8 @@ spec:
- --use-fqdn
{{- end }}
initialDelaySeconds: 80
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 190
timeoutSeconds: 185
livenessProbe:
exec:
command:
@ -219,8 +219,8 @@ spec:
- --use-fqdn
{{- end }}
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
command:
- /tmp/nova-compute.sh
volumeMounts:

View File

@ -69,8 +69,8 @@ spec:
- --service-queue-name
- conductor
initialDelaySeconds: 80
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 190
timeoutSeconds: 185
livenessProbe:
exec:
command:
@ -82,8 +82,8 @@ spec:
- conductor
- --liveness-probe
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
command:
- /tmp/nova-conductor.sh
volumeMounts:

View File

@ -69,8 +69,8 @@ spec:
- --service-queue-name
- consoleauth
initialDelaySeconds: 80
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 190
timeoutSeconds: 185
livenessProbe:
exec:
command:
@ -82,8 +82,8 @@ spec:
- consoleauth
- --liveness-probe
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
command:
- /tmp/nova-consoleauth.sh
volumeMounts:

View File

@ -69,8 +69,8 @@ spec:
- --service-queue-name
- scheduler
initialDelaySeconds: 80
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 190
timeoutSeconds: 185
livenessProbe:
exec:
command:
@ -82,8 +82,8 @@ spec:
- scheduler
- --liveness-probe
initialDelaySeconds: 120
periodSeconds: 90
timeoutSeconds: 70
periodSeconds: 600
timeoutSeconds: 580
command:
- /tmp/nova-scheduler.sh
volumeMounts: