Revise wait timeouts and dry-run handling.

- revise the final wait on namespace+label: only wait on the
  namespace+label pairs touched by charts in the current apply loop
- skip any actions that would change the system during dry-run
  (see the sketch below)
- skip 'test' and 'wait' during dry-run
- tweak some logs for insight and readability

Change-Id: I1223f01690832c26ce2faa96e7e64620cf413ac9
Marshall Margenau 2018-05-07 13:22:24 -05:00
parent 19efc1ad03
commit d770640b95
7 changed files with 139 additions and 88 deletions
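
A minimal standalone sketch of the dry-run guard pattern these changes apply to the purge, wait, test, and delete paths (the class and method names here are illustrative, not Armada's; the real logic lives on the Armada and Tiller handlers in the diffs below):

import logging

LOG = logging.getLogger(__name__)

class DryRunAwareHandler(object):
    """Illustrative only: skip side-effecting steps when dry_run is set."""

    def __init__(self, dry_run=False):
        self.dry_run = dry_run

    def wait_until_ready(self, namespace, labels, timeout):
        # During a dry run nothing is actually deployed, so log what would
        # have happened and return immediately instead of waiting.
        if self.dry_run:
            LOG.info('Skipping wait during `dry-run`, would have waited on '
                     'namespace=%s, labels=(%s) for %ss.',
                     namespace, labels, timeout)
            return
        # ... real call into the Kubernetes wait logic would go here ...

    def test_release(self, release_name, timeout):
        # Helm tests spawn pods, so they are skipped (and treated as passed)
        # during a dry run.
        if self.dry_run:
            LOG.info('Skipping test during `dry-run`, would have tested '
                     'release=%s with timeout %ss.', release_name, timeout)
            return True
        # ... real Tiller test call would go here ...
        return False

Install, upgrade, and delete requests follow the same shape; in addition, Tiller's own dry_run flag is set on the install/update protobuf requests so Tiller itself makes no changes.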

View File

@ -16,16 +16,17 @@
DOCUMENT_CHART = 'armada/Chart/v1'
DOCUMENT_GROUP = 'armada/ChartGroup/v1'
DOCUMENT_MANIFEST = 'armada/Manifest/v1'
# Armada Data
KEYWORD_ARMADA = 'armada'
KEYWORD_PREFIX = 'release_prefix'
KEYWORD_GROUPS = 'chart_groups'
KEYWORD_CHARTS = 'chart_group'
KEYWORD_RELEASE = 'release'
# Tiller
# Armada
DEFAULT_CHART_TIMEOUT = 3600
# Tiller
DEFAULT_TILLER_TIMEOUT = 300
STATUS_DEPLOYED = 'DEPLOYED'
STATUS_FAILED = 'FAILED'

View File

@ -95,7 +95,7 @@ class Armada(object):
self.timeout = timeout
self.tiller = Tiller(
tiller_host=tiller_host, tiller_port=tiller_port,
tiller_namespace=tiller_namespace)
tiller_namespace=tiller_namespace, dry_run=dry_run)
self.documents = Override(
documents, overrides=set_ovr,
values=values).update_manifests()
@ -151,9 +151,14 @@ class Armada(object):
ch_release_name = release_prefix(
prefix, ch.get('chart', {}).get('chart_name'))
if release[0] == ch_release_name:
LOG.info('Purging failed release %s '
'before deployment', release[0])
self.tiller.uninstall_release(release[0])
if self.dry_run:
LOG.info('Skipping purge during `dry-run`, would '
'have purged failed release %s before '
'deployment.', release[0])
else:
LOG.info('Purging failed release %s '
'before deployment', release[0])
self.tiller.uninstall_release(release[0])
# Clone the chart sources
#
@ -233,6 +238,8 @@ class Armada(object):
'''
Synchronize Helm with the Armada Config(s)
'''
if self.dry_run:
LOG.info('Armada is in DRY RUN mode, no changes being made.')
msg = {'install': [], 'upgrade': [], 'diff': []}
@ -248,12 +255,13 @@ class Armada(object):
for chartgroup in manifest_data.get(const.KEYWORD_GROUPS, []):
cg_name = chartgroup.get('name', '<missing name>')
cg_desc = chartgroup.get('description', '<missing description>')
LOG.info('Processing ChartGroup: %s (%s)', cg_name, cg_desc)
cg_sequenced = chartgroup.get('sequenced', False)
cg_test_all_charts = chartgroup.get('test_charts', False)
LOG.info('Processing ChartGroup: %s (%s), sequenced=%s, '
'test_charts=%s', cg_name, cg_desc, cg_sequenced,
cg_test_all_charts)
namespaces_seen = set()
ns_label_set = set()
tests_to_run = []
cg_charts = chartgroup.get(const.KEYWORD_CHARTS, [])
@ -275,7 +283,6 @@ class Armada(object):
release_name = release_prefix(prefix, release)
# Retrieve appropriate timeout value
if wait_timeout <= 0:
# TODO(MarshM): chart's `data.timeout` should be deprecated
chart_timeout = chart.get('timeout', 0)
@ -284,6 +291,7 @@ class Armada(object):
wait_timeout = wait_values.get('timeout', chart_timeout)
wait_labels = wait_values.get('labels', {})
# Determine wait logic
this_chart_should_wait = (
cg_sequenced or self.force_wait or
wait_timeout > 0 or len(wait_labels) > 0)
@ -293,9 +301,6 @@ class Armada(object):
const.DEFAULT_CHART_TIMEOUT)
wait_timeout = const.DEFAULT_CHART_TIMEOUT
# Track namespaces + labels touched
namespaces_seen.add((namespace, tuple(wait_labels.items())))
# Naively take largest timeout to apply at end
# TODO(MarshM) better handling of timeout/timer
cg_max_timeout = max(wait_timeout, cg_max_timeout)
@ -364,22 +369,19 @@ class Armada(object):
namespace,
pre_actions=pre_actions,
post_actions=post_actions,
dry_run=self.dry_run,
disable_hooks=disable_hooks,
values=yaml.safe_dump(values),
wait=this_chart_should_wait,
timeout=timer)
if this_chart_should_wait:
self.tiller.k8s.wait_until_ready(
release=release_name,
labels=wait_labels,
namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=timer
self._wait_until_ready(
release_name, wait_labels, namespace, timer
)
# Track namespace+labels touched by upgrade
ns_label_set.add((namespace, tuple(wait_labels.items())))
LOG.info('Upgrade completed with results from Tiller: %s',
tiller_result.__dict__)
msg['upgrade'].append(release_name)
@ -396,21 +398,18 @@ class Armada(object):
protoc_chart,
release_name,
namespace,
dry_run=self.dry_run,
values=yaml.safe_dump(values),
wait=this_chart_should_wait,
timeout=timer)
if this_chart_should_wait:
self.tiller.k8s.wait_until_ready(
release=release_name,
labels=wait_labels,
namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=timer
self._wait_until_ready(
release_name, wait_labels, namespace, timer
)
# Track namespace+labels touched by install
ns_label_set.add((namespace, tuple(wait_labels.items())))
LOG.info('Install completed with results from Tiller: %s',
tiller_result.__dict__)
msg['install'].append(release_name)
@ -426,6 +425,7 @@ class Armada(object):
LOG.error(reason)
raise ArmadaTimeoutException(reason)
self._test_chart(release_name, timer)
# TODO(MarshM): handle test failure or timeout
# Un-sequenced ChartGroup should run tests at the end
elif test_this_chart:
@ -433,39 +433,37 @@ class Armada(object):
tests_to_run.append((release_name, timer))
# End of Charts in ChartGroup
LOG.info('All Charts applied.')
LOG.info('All Charts applied in ChartGroup %s.', cg_name)
# After all Charts are applied, we should wait for the entire
# ChartGroup to become healthy by looking at the namespaces seen
# TODO(MarshM): Need to restrict to only releases we processed
# TODO(MarshM): Need to determine a better timeout
# (not cg_max_timeout)
if cg_max_timeout <= 0:
cg_max_timeout = const.DEFAULT_CHART_TIMEOUT
deadline = time.time() + cg_max_timeout
for (ns, labels) in namespaces_seen:
for (ns, labels) in ns_label_set:
labels_dict = dict(labels)
timer = int(round(deadline - time.time()))
LOG.info('Final wait for healthy namespace (%s), label=(%s), '
'timeout remaining: %ss.', ns, labels_dict, timer)
LOG.info('Final ChartGroup wait for healthy namespace (%s), '
'labels=(%s), timeout remaining: %ss.',
ns, labels_dict, timer)
if timer <= 0:
reason = ('Timeout expired waiting on namespace: %s, '
'label: %s' % (ns, labels_dict))
'labels: (%s)' % (ns, labels_dict))
LOG.error(reason)
raise ArmadaTimeoutException(reason)
self.tiller.k8s.wait_until_ready(
namespace=ns,
labels=labels_dict,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=timer)
self._wait_until_ready(
release_name=None, wait_labels=labels_dict,
namespace=ns, timeout=timer
)
# After entire ChartGroup is healthy, run any pending tests
for (test, test_timer) in tests_to_run:
self._test_chart(test, test_timer)
# TODO(MarshM): handle test failure or timeout
LOG.info("Performing Post-Flight Operations")
self.post_flight_ops()
if self.enable_chart_cleanup:
@ -473,12 +471,15 @@ class Armada(object):
prefix,
self.manifest[const.KEYWORD_ARMADA][const.KEYWORD_GROUPS])
LOG.info('Done applying manifest.')
return msg
def post_flight_ops(self):
'''
Operations to run after deployment process has terminated
'''
LOG.info("Performing post-flight operations.")
# Delete temp dirs used for deployment
for group in self.manifest.get(const.KEYWORD_ARMADA, {}).get(
const.KEYWORD_GROUPS, []):
@ -489,16 +490,37 @@ class Armada(object):
if isinstance(source_dir, tuple) and source_dir:
source.source_cleanup(source_dir[0])
def _wait_until_ready(self, release_name, wait_labels, namespace, timeout):
if self.dry_run:
LOG.info('Skipping wait during `dry-run`, would have waited on '
'namespace=%s, labels=(%s) for %ss.',
namespace, wait_labels, timeout)
return
self.tiller.k8s.wait_until_ready(
release=release_name,
labels=wait_labels,
namespace=namespace,
k8s_wait_attempts=self.k8s_wait_attempts,
k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep,
timeout=timeout
)
def _test_chart(self, release_name, timeout):
if self.dry_run:
LOG.info('Skipping test during `dry-run`, would have tested '
'release=%s with timeout %ss.', release_name, timeout)
return True
# TODO(MarshM): Fix testing, it's broken, and track timeout
resp = self.tiller.testing_release(release_name, timeout=timeout)
status = getattr(resp.info.status, 'last_test_suite_run', 'FAILED')
LOG.info("Test INFO: %s", status)
LOG.info("Test info.status: %s", status)
if resp:
LOG.info("PASSED: %s", release_name)
LOG.info("Test passed for release: %s", release_name)
return True
else:
LOG.info("FAILED: %s", release_name)
LOG.info("Test failed for release: %s", release_name)
return False
def show_diff(self, chart, installed_chart, installed_values, target_chart,
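
For clarity, a hedged sketch of the shared-deadline wait that the ChartGroup loop above performs over the namespace+label pairs collected during install/upgrade (wait_fn stands in for the Kubernetes wait call, and a plain RuntimeError stands in for ArmadaTimeoutException):

import time

def wait_for_chart_group(ns_label_set, cg_max_timeout, wait_fn):
    # One deadline covers the whole ChartGroup; each namespace wait gets
    # only the time remaining, so no single namespace can extend the total.
    deadline = time.time() + cg_max_timeout
    for namespace, labels in ns_label_set:
        remaining = int(round(deadline - time.time()))
        if remaining <= 0:
            raise RuntimeError('Timeout expired waiting on namespace: %s, '
                               'labels: (%s)' % (namespace, dict(labels)))
        wait_fn(namespace=namespace, labels=dict(labels), timeout=remaining)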

View File

@ -92,7 +92,9 @@ class K8s(object):
propagation_policy='Foreground',
timeout=DEFAULT_K8S_TIMEOUT):
try:
LOG.debug('Deleting %s %s, Wait timeout=%s',
timeout = self._check_timeout(timeout)
LOG.debug('Watching to delete %s %s, Wait timeout=%s',
job_type_description, name, timeout)
body = client.V1DeleteOptions()
w = watch.Watch()
@ -108,10 +110,11 @@ class K8s(object):
event_type = event['type'].upper()
job_name = event['object'].metadata.name
LOG.debug('Watch event %s on %s', event_type, job_name)
if event_type == 'DELETED' and job_name == name:
LOG.debug('Successfully deleted %s %s',
job_type_description, job_name)
LOG.info('Successfully deleted %s %s',
job_type_description, job_name)
return
err_msg = ('Reached timeout while waiting to delete %s: '
@ -268,6 +271,7 @@ class K8s(object):
:param release - part of namespace
:param timeout - time before disconnecting stream
'''
timeout = self._check_timeout(timeout)
w = watch.Watch()
for event in w.stream(self.client.list_pod_for_all_namespaces,
@ -300,6 +304,8 @@ class K8s(object):
:param k8s_wait_attempt_sleep: The time in seconds to sleep
between attempts (minimum 1).
'''
timeout = self._check_timeout(timeout)
# NOTE(MarshM) 'release' is currently unused
label_selector = label_selectors(labels) if labels else ''
@ -438,3 +444,10 @@ class K8s(object):
for pc in pod_conditions:
if pc.type == condition_type:
return pc.status
def _check_timeout(self, timeout):
if timeout <= 0:
LOG.warn('Kubernetes timeout is invalid or unspecified, '
'using default %ss.', DEFAULT_K8S_TIMEOUT)
timeout = DEFAULT_K8S_TIMEOUT
return timeout

View File

@ -28,17 +28,16 @@ from hapi.services.tiller_pb2 import UpdateReleaseRequest
from oslo_config import cfg
from oslo_log import log as logging
from armada.const import STATUS_DEPLOYED, STATUS_FAILED
from armada import const
from armada.exceptions import tiller_exceptions as ex
from armada.handlers.k8s import K8s
from armada.utils.release import release_prefix
from armada.utils.release import label_selectors
TILLER_VERSION = b'2.7.2'
TILLER_TIMEOUT = 300
GRPC_EPSILON = 60
RELEASE_LIMIT = 128 # TODO(mark-burnett): There may be a better page size.
RUNTEST_SUCCESS = 9
RELEASE_RUNTEST_SUCCESS = 9
# the standard gRPC max message size is 4MB
# this expansion comes at a performance penalty
@ -68,10 +67,13 @@ class Tiller(object):
'''
def __init__(self, tiller_host=None, tiller_port=None,
tiller_namespace=None):
tiller_namespace=None, dry_run=False):
self.tiller_host = tiller_host
self.tiller_port = tiller_port or CONF.tiller_port
self.tiller_namespace = tiller_namespace or CONF.tiller_namespace
self.dry_run = dry_run
# init k8s connectivity
self.k8s = K8s()
@ -81,7 +83,7 @@ class Tiller(object):
# init timeout for all requests
# and assume eventually this will
# be fed at runtime as an override
self.timeout = TILLER_TIMEOUT
self.timeout = const.DEFAULT_TILLER_TIMEOUT
LOG.debug('Armada is using Tiller at: %s:%s, namespace=%s, timeout=%s',
self.tiller_host, self.tiller_port, self.tiller_namespace,
@ -183,8 +185,8 @@ class Tiller(object):
# NOTE(MarshM): `Helm List` defaults to returning Deployed and Failed,
# but this might not be a desirable ListReleasesRequest default.
req = ListReleasesRequest(limit=RELEASE_LIMIT,
status_codes=[STATUS_DEPLOYED,
STATUS_FAILED],
status_codes=[const.STATUS_DEPLOYED,
const.STATUS_FAILED],
sort_by='LAST_RELEASED',
sort_order='DESC')
@ -310,7 +312,6 @@ class Tiller(object):
return charts
def update_release(self, chart, release, namespace,
dry_run=False,
pre_actions=None,
post_actions=None,
disable_hooks=False,
@ -320,12 +321,12 @@ class Tiller(object):
'''
Update a Helm Release
'''
timeout = self._check_timeout(wait, timeout)
rel_timeout = self.timeout if not timeout else timeout
LOG.debug('Helm update release%s: wait=%s, timeout=%s',
(' (dry run)' if dry_run else ''),
wait, timeout)
LOG.info('Helm update release%s: wait=%s, timeout=%s',
(' (dry run)' if self.dry_run else ''),
wait, timeout)
if values is None:
values = Config(raw='')
@ -340,7 +341,7 @@ class Tiller(object):
stub = ReleaseServiceStub(self.channel)
release_request = UpdateReleaseRequest(
chart=chart,
dry_run=dry_run,
dry_run=self.dry_run,
disable_hooks=disable_hooks,
values=values,
name=release,
@ -368,19 +369,18 @@ class Tiller(object):
self._post_update_actions(post_actions, namespace)
def install_release(self, chart, release, namespace,
dry_run=False,
values=None,
wait=False,
timeout=None):
'''
Create a Helm Release
'''
timeout = self._check_timeout(wait, timeout)
rel_timeout = self.timeout if not timeout else timeout
LOG.debug('Helm install release%s: wait=%s, timeout=%s',
(' (dry run)' if dry_run else ''),
wait, timeout)
LOG.info('Helm install release%s: wait=%s, timeout=%s',
(' (dry run)' if self.dry_run else ''),
wait, timeout)
if values is None:
values = Config(raw='')
@ -392,7 +392,7 @@ class Tiller(object):
stub = ReleaseServiceStub(self.channel)
release_request = InstallReleaseRequest(
chart=chart,
dry_run=dry_run,
dry_run=self.dry_run,
values=values,
name=release,
namespace=namespace,
@ -417,7 +417,8 @@ class Tiller(object):
status = self.get_release_status(release)
raise ex.ReleaseException(release, status, 'Install')
def testing_release(self, release, timeout=300, cleanup=True):
def testing_release(self, release, timeout=const.DEFAULT_TILLER_TIMEOUT,
cleanup=True):
'''
:param release - name of release to test
:param timeout - runtime before exiting
@ -426,7 +427,7 @@ class Tiller(object):
:returns - results of test pod
'''
LOG.debug("Helm test release %s, timeout=%s", release, timeout)
LOG.info("Running Helm test: release=%s, timeout=%s", release, timeout)
try:
@ -441,12 +442,12 @@ class Tiller(object):
LOG.info('No test found')
return False
if content.release.hooks[0].events[0] == RUNTEST_SUCCESS:
if content.release.hooks[0].events[0] == RELEASE_RUNTEST_SUCCESS:
test = stub.RunReleaseTest(
release_request, self.timeout, metadata=self.metadata)
release_request, timeout, metadata=self.metadata)
if test.running():
self.k8s.wait_get_completed_podphase(release)
self.k8s.wait_get_completed_podphase(release, timeout)
test.cancel()
@ -527,7 +528,15 @@ class Tiller(object):
deletes a Helm chart from Tiller
'''
# build release install request
# Helm client calls ReleaseContent in Delete dry-run scenario
if self.dry_run:
content = self.get_release_content(release)
LOG.info('Skipping delete during `dry-run`, would have deleted '
'release=%s from namespace=%s.',
content.release.name, content.release.namespace)
return
# build release uninstall request
try:
stub = ReleaseServiceStub(self.channel)
LOG.info("Uninstall %s release with disable_hooks=%s, "
@ -567,7 +576,7 @@ class Tiller(object):
def delete_resources(self, release_name, resource_name, resource_type,
resource_labels, namespace, wait=False,
timeout=TILLER_TIMEOUT):
timeout=const.DEFAULT_TILLER_TIMEOUT):
'''
:params release_name - release name the specified resource is under
:params resource_name - name of specific resource
@ -577,12 +586,13 @@ class Tiller(object):
Apply deletion logic based on type of resource
'''
timeout = self._check_timeout(wait, timeout)
label_selector = ''
if resource_labels is not None:
label_selector = label_selectors(resource_labels)
LOG.debug("Deleting resources in namespace %s matching "
"selectors %s.", namespace, label_selector)
"selectors (%s).", namespace, label_selector)
handled = False
if resource_type == 'job':
@ -629,7 +639,7 @@ class Tiller(object):
def rolling_upgrade_pod_deployment(self, name, release_name, namespace,
resource_labels, action_type, chart,
disable_hooks, values,
timeout=TILLER_TIMEOUT):
timeout=const.DEFAULT_TILLER_TIMEOUT):
'''
update statefulsets (daemon, stateful)
'''
@ -672,3 +682,10 @@ class Tiller(object):
else:
LOG.error("Unable to exectue name: % type: %s", name, action_type)
def _check_timeout(self, wait, timeout):
if wait and timeout <= 0:
LOG.warn('Tiller timeout is invalid or unspecified, '
'using default %ss.', const.DEFAULT_TILLER_TIMEOUT)
timeout = const.DEFAULT_TILLER_TIMEOUT
return timeout
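
As a rough usage note (values hypothetical), the Tiller fallback only applies when a wait was actually requested; a standalone equivalent, with an extra None guard that is this sketch's addition since callers may pass timeout=None:

DEFAULT_TILLER_TIMEOUT = 300

def check_timeout(wait, timeout):
    # Fall back only when the caller asked to wait but gave no usable timeout;
    # non-waiting calls keep whatever value they passed.
    if wait and (timeout is None or timeout <= 0):
        return DEFAULT_TILLER_TIMEOUT
    return timeout

assert check_timeout(True, 0) == 300     # waiting, invalid timeout -> default
assert check_timeout(True, 120) == 120   # waiting, valid timeout preserved
assert check_timeout(False, 0) == 0      # not waiting -> left unchanged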

View File

@ -159,7 +159,8 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
mock_tiller.assert_called_once_with(tiller_host=None,
tiller_namespace='kube-system',
tiller_port=44134)
tiller_port=44134,
dry_run=False)
mock_source.git_clone.assert_called_once_with(
'git://github.com/dummy/armada', 'master', auth_method=None,
proxy_server=None)
@ -216,7 +217,6 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
"{}-{}".format(armada_obj.manifest['armada']['release_prefix'],
chart_1['release']),
chart_1['namespace'],
dry_run=armada_obj.dry_run,
values=yaml.safe_dump(chart_1['values']),
timeout=10,
wait=True),
@ -225,7 +225,6 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase):
"{}-{}".format(armada_obj.manifest['armada']['release_prefix'],
chart_2['release']),
chart_2['namespace'],
dry_run=armada_obj.dry_run,
values=yaml.safe_dump(chart_2['values']),
timeout=10,
wait=True)

View File

@ -37,7 +37,6 @@ class TillerTestCase(base.ArmadaTestCase):
# set params
chart = mock.Mock()
dry_run = False
name = None
namespace = None
initial_values = None
@ -45,14 +44,13 @@ class TillerTestCase(base.ArmadaTestCase):
wait = False
timeout = 3600
tiller_obj.install_release(chart, name, namespace,
dry_run=dry_run, values=initial_values,
wait=wait, timeout=timeout)
tiller_obj.install_release(
chart, name, namespace, values=initial_values,
wait=wait, timeout=timeout)
mock_stub.assert_called_with(tiller_obj.channel)
release_request = mock_install_request(
chart=chart,
dry_run=dry_run,
values=updated_values,
release=name,
namespace=namespace,
@ -188,8 +186,8 @@ class TillerTestCase(base.ArmadaTestCase):
mock_list_releases_request.assert_called_once_with(
limit=tiller.RELEASE_LIMIT,
status_codes=[tiller.STATUS_DEPLOYED,
tiller.STATUS_FAILED],
status_codes=[tiller.const.STATUS_DEPLOYED,
tiller.const.STATUS_FAILED],
sort_by='LAST_RELEASED',
sort_order='DESC')

View File

@ -189,7 +189,8 @@ def validate_armada_document(document):
diagnostic='Please ensure document is one of '
'the following schema types: %s' %
list(SCHEMAS.keys()))
LOG.info('ValidationMessage: %s', vmsg.get_output_json())
LOG.info('Unsupported document type, ignoring %s.', schema)
LOG.debug('ValidationMessage: %s', vmsg.get_output_json())
# Validation API doesn't care about this type of message, don't send
if len([x for x in details if x.get('error', False)]) > 0: