diff --git a/armada/const.py b/armada/const.py index 63e8ca29..9e90b094 100644 --- a/armada/const.py +++ b/armada/const.py @@ -16,16 +16,17 @@ DOCUMENT_CHART = 'armada/Chart/v1' DOCUMENT_GROUP = 'armada/ChartGroup/v1' DOCUMENT_MANIFEST = 'armada/Manifest/v1' - -# Armada Data KEYWORD_ARMADA = 'armada' KEYWORD_PREFIX = 'release_prefix' KEYWORD_GROUPS = 'chart_groups' KEYWORD_CHARTS = 'chart_group' KEYWORD_RELEASE = 'release' -# Tiller +# Armada DEFAULT_CHART_TIMEOUT = 3600 + +# Tiller +DEFAULT_TILLER_TIMEOUT = 300 STATUS_DEPLOYED = 'DEPLOYED' STATUS_FAILED = 'FAILED' diff --git a/armada/handlers/armada.py b/armada/handlers/armada.py index aaa007a1..e3c0b084 100644 --- a/armada/handlers/armada.py +++ b/armada/handlers/armada.py @@ -95,7 +95,7 @@ class Armada(object): self.timeout = timeout self.tiller = Tiller( tiller_host=tiller_host, tiller_port=tiller_port, - tiller_namespace=tiller_namespace) + tiller_namespace=tiller_namespace, dry_run=dry_run) self.documents = Override( documents, overrides=set_ovr, values=values).update_manifests() @@ -151,9 +151,14 @@ class Armada(object): ch_release_name = release_prefix( prefix, ch.get('chart', {}).get('chart_name')) if release[0] == ch_release_name: - LOG.info('Purging failed release %s ' - 'before deployment', release[0]) - self.tiller.uninstall_release(release[0]) + if self.dry_run: + LOG.info('Skipping purge during `dry-run`, would ' + 'have purged failed release %s before ' + 'deployment,', release[0]) + else: + LOG.info('Purging failed release %s ' + 'before deployment', release[0]) + self.tiller.uninstall_release(release[0]) # Clone the chart sources # @@ -233,6 +238,8 @@ class Armada(object): ''' Synchronize Helm with the Armada Config(s) ''' + if self.dry_run: + LOG.info('Armada is in DRY RUN mode, no changes being made.') msg = {'install': [], 'upgrade': [], 'diff': []} @@ -248,12 +255,13 @@ class Armada(object): for chartgroup in manifest_data.get(const.KEYWORD_GROUPS, []): cg_name = chartgroup.get('name', '') 
cg_desc = chartgroup.get('description', '') - LOG.info('Processing ChartGroup: %s (%s)', cg_name, cg_desc) - cg_sequenced = chartgroup.get('sequenced', False) cg_test_all_charts = chartgroup.get('test_charts', False) + LOG.info('Processing ChartGroup: %s (%s), sequenced=%s, ' + 'test_charts=%s', cg_name, cg_desc, cg_sequenced, + cg_test_all_charts) - namespaces_seen = set() + ns_label_set = set() tests_to_run = [] cg_charts = chartgroup.get(const.KEYWORD_CHARTS, []) @@ -275,7 +283,6 @@ class Armada(object): release_name = release_prefix(prefix, release) # Retrieve appropriate timeout value - if wait_timeout <= 0: # TODO(MarshM): chart's `data.timeout` should be deprecated chart_timeout = chart.get('timeout', 0) @@ -284,6 +291,7 @@ class Armada(object): wait_timeout = wait_values.get('timeout', chart_timeout) wait_labels = wait_values.get('labels', {}) + # Determine wait logic this_chart_should_wait = ( cg_sequenced or self.force_wait or wait_timeout > 0 or len(wait_labels) > 0) @@ -293,9 +301,6 @@ class Armada(object): const.DEFAULT_CHART_TIMEOUT) wait_timeout = const.DEFAULT_CHART_TIMEOUT - # Track namespaces + labels touched - namespaces_seen.add((namespace, tuple(wait_labels.items()))) - # Naively take largest timeout to apply at end # TODO(MarshM) better handling of timeout/timer cg_max_timeout = max(wait_timeout, cg_max_timeout) @@ -364,22 +369,19 @@ class Armada(object): namespace, pre_actions=pre_actions, post_actions=post_actions, - dry_run=self.dry_run, disable_hooks=disable_hooks, values=yaml.safe_dump(values), wait=this_chart_should_wait, timeout=timer) if this_chart_should_wait: - self.tiller.k8s.wait_until_ready( - release=release_name, - labels=wait_labels, - namespace=namespace, - k8s_wait_attempts=self.k8s_wait_attempts, - k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, - timeout=timer + self._wait_until_ready( + release_name, wait_labels, namespace, timer ) + # Track namespace+labels touched by upgrade + ns_label_set.add((namespace, 
tuple(wait_labels.items()))) + LOG.info('Upgrade completed with results from Tiller: %s', tiller_result.__dict__) msg['upgrade'].append(release_name) @@ -396,21 +398,18 @@ class Armada(object): protoc_chart, release_name, namespace, - dry_run=self.dry_run, values=yaml.safe_dump(values), wait=this_chart_should_wait, timeout=timer) if this_chart_should_wait: - self.tiller.k8s.wait_until_ready( - release=release_name, - labels=wait_labels, - namespace=namespace, - k8s_wait_attempts=self.k8s_wait_attempts, - k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, - timeout=timer + self._wait_until_ready( + release_name, wait_labels, namespace, timer ) + # Track namespace+labels touched by install + ns_label_set.add((namespace, tuple(wait_labels.items()))) + LOG.info('Install completed with results from Tiller: %s', tiller_result.__dict__) msg['install'].append(release_name) @@ -426,6 +425,7 @@ class Armada(object): LOG.error(reason) raise ArmadaTimeoutException(reason) self._test_chart(release_name, timer) + # TODO(MarshM): handle test failure or timeout # Un-sequenced ChartGroup should run tests at the end elif test_this_chart: @@ -433,39 +433,37 @@ class Armada(object): tests_to_run.append((release_name, timer)) # End of Charts in ChartGroup - LOG.info('All Charts applied.') + LOG.info('All Charts applied in ChartGroup %s.', cg_name) # After all Charts are applied, we should wait for the entire # ChartGroup to become healthy by looking at the namespaces seen - # TODO(MarshM): Need to restrict to only releases we processed # TODO(MarshM): Need to determine a better timeout # (not cg_max_timeout) if cg_max_timeout <= 0: cg_max_timeout = const.DEFAULT_CHART_TIMEOUT deadline = time.time() + cg_max_timeout - for (ns, labels) in namespaces_seen: + for (ns, labels) in ns_label_set: labels_dict = dict(labels) timer = int(round(deadline - time.time())) - LOG.info('Final wait for healthy namespace (%s), label=(%s), ' - 'timeout remaining: %ss.', ns, labels_dict, timer) + 
LOG.info('Final ChartGroup wait for healthy namespace (%s), ' + 'labels=(%s), timeout remaining: %ss.', + ns, labels_dict, timer) if timer <= 0: reason = ('Timeout expired waiting on namespace: %s, ' - 'label: %s' % (ns, labels_dict)) + 'labels: (%s)' % (ns, labels_dict)) LOG.error(reason) raise ArmadaTimeoutException(reason) - self.tiller.k8s.wait_until_ready( - namespace=ns, - labels=labels_dict, - k8s_wait_attempts=self.k8s_wait_attempts, - k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, - timeout=timer) + self._wait_until_ready( + release_name=None, wait_labels=labels_dict, + namespace=ns, timeout=timer + ) # After entire ChartGroup is healthy, run any pending tests for (test, test_timer) in tests_to_run: self._test_chart(test, test_timer) + # TODO(MarshM): handle test failure or timeout - LOG.info("Performing Post-Flight Operations") self.post_flight_ops() if self.enable_chart_cleanup: @@ -473,12 +471,15 @@ class Armada(object): prefix, self.manifest[const.KEYWORD_ARMADA][const.KEYWORD_GROUPS]) + LOG.info('Done applying manifest.') return msg def post_flight_ops(self): ''' Operations to run after deployment process has terminated ''' + LOG.info("Performing post-flight operations.") + # Delete temp dirs used for deployment for group in self.manifest.get(const.KEYWORD_ARMADA, {}).get( const.KEYWORD_GROUPS, []): @@ -489,16 +490,37 @@ class Armada(object): if isinstance(source_dir, tuple) and source_dir: source.source_cleanup(source_dir[0]) + def _wait_until_ready(self, release_name, wait_labels, namespace, timeout): + if self.dry_run: + LOG.info('Skipping wait during `dry-run`, would have waited on ' + 'namespace=%s, labels=(%s) for %ss.', + namespace, wait_labels, timeout) + return + + self.tiller.k8s.wait_until_ready( + release=release_name, + labels=wait_labels, + namespace=namespace, + k8s_wait_attempts=self.k8s_wait_attempts, + k8s_wait_attempt_sleep=self.k8s_wait_attempt_sleep, + timeout=timeout + ) + def _test_chart(self, release_name, timeout): + if 
self.dry_run: + LOG.info('Skipping test during `dry-run`, would have tested ' + 'release=%s with timeout %ss.', release_name, timeout) + return True + # TODO(MarshM): Fix testing, it's broken, and track timeout resp = self.tiller.testing_release(release_name, timeout=timeout) status = getattr(resp.info.status, 'last_test_suite_run', 'FAILED') - LOG.info("Test INFO: %s", status) + LOG.info("Test info.status: %s", status) if resp: - LOG.info("PASSED: %s", release_name) + LOG.info("Test passed for release: %s", release_name) return True else: - LOG.info("FAILED: %s", release_name) + LOG.info("Test failed for release: %s", release_name) return False def show_diff(self, chart, installed_chart, installed_values, target_chart, diff --git a/armada/handlers/k8s.py b/armada/handlers/k8s.py index 5d48fcb5..dd6b0840 100644 --- a/armada/handlers/k8s.py +++ b/armada/handlers/k8s.py @@ -92,7 +92,9 @@ class K8s(object): propagation_policy='Foreground', timeout=DEFAULT_K8S_TIMEOUT): try: - LOG.debug('Deleting %s %s, Wait timeout=%s', + timeout = self._check_timeout(timeout) + + LOG.debug('Watching to delete %s %s, Wait timeout=%s', job_type_description, name, timeout) body = client.V1DeleteOptions() w = watch.Watch() @@ -108,10 +110,11 @@ class K8s(object): event_type = event['type'].upper() job_name = event['object'].metadata.name + LOG.debug('Watch event %s on %s', event_type, job_name) if event_type == 'DELETED' and job_name == name: - LOG.debug('Successfully deleted %s %s', - job_type_description, job_name) + LOG.info('Successfully deleted %s %s', + job_type_description, job_name) return err_msg = ('Reached timeout while waiting to delete %s: ' @@ -268,6 +271,7 @@ class K8s(object): :param release - part of namespace :param timeout - time before disconnecting stream ''' + timeout = self._check_timeout(timeout) w = watch.Watch() for event in w.stream(self.client.list_pod_for_all_namespaces, @@ -300,6 +304,8 @@ class K8s(object): :param k8s_wait_attempt_sleep: The time in seconds 
to sleep between attempts (minimum 1). ''' + timeout = self._check_timeout(timeout) + # NOTE(MarshM) 'release' is currently unused label_selector = label_selectors(labels) if labels else '' @@ -438,3 +444,11 @@ class K8s(object): for pc in pod_conditions: if pc.type == condition_type: return pc.status + + def _check_timeout(self, timeout): + '''Return timeout, or the default if it is unset or <= 0.''' + if timeout is None or timeout <= 0: + LOG.warning('Kubernetes timeout is invalid or unspecified, ' + 'using default %ss.', DEFAULT_K8S_TIMEOUT) + timeout = DEFAULT_K8S_TIMEOUT + return timeout diff --git a/armada/handlers/tiller.py b/armada/handlers/tiller.py index 873b2889..766e27c8 100644 --- a/armada/handlers/tiller.py +++ b/armada/handlers/tiller.py @@ -28,17 +28,16 @@ from hapi.services.tiller_pb2 import UpdateReleaseRequest from oslo_config import cfg from oslo_log import log as logging -from armada.const import STATUS_DEPLOYED, STATUS_FAILED +from armada import const from armada.exceptions import tiller_exceptions as ex from armada.handlers.k8s import K8s from armada.utils.release import release_prefix from armada.utils.release import label_selectors TILLER_VERSION = b'2.7.2' -TILLER_TIMEOUT = 300 GRPC_EPSILON = 60 RELEASE_LIMIT = 128 # TODO(mark-burnett): There may be a better page size. 
-RUNTEST_SUCCESS = 9 +RELEASE_RUNTEST_SUCCESS = 9 # the standard gRPC max message size is 4MB # this expansion comes at a performance penalty @@ -68,10 +67,13 @@ class Tiller(object): ''' def __init__(self, tiller_host=None, tiller_port=None, - tiller_namespace=None): + tiller_namespace=None, dry_run=False): self.tiller_host = tiller_host self.tiller_port = tiller_port or CONF.tiller_port self.tiller_namespace = tiller_namespace or CONF.tiller_namespace + + self.dry_run = dry_run + # init k8s connectivity self.k8s = K8s() @@ -81,7 +83,7 @@ class Tiller(object): # init timeout for all requests # and assume eventually this will # be fed at runtime as an override - self.timeout = TILLER_TIMEOUT + self.timeout = const.DEFAULT_TILLER_TIMEOUT LOG.debug('Armada is using Tiller at: %s:%s, namespace=%s, timeout=%s', self.tiller_host, self.tiller_port, self.tiller_namespace, @@ -183,8 +185,8 @@ class Tiller(object): # NOTE(MarshM): `Helm List` defaults to returning Deployed and Failed, # but this might not be a desireable ListReleasesRequest default. 
req = ListReleasesRequest(limit=RELEASE_LIMIT, - status_codes=[STATUS_DEPLOYED, - STATUS_FAILED], + status_codes=[const.STATUS_DEPLOYED, + const.STATUS_FAILED], sort_by='LAST_RELEASED', sort_order='DESC') @@ -310,7 +312,6 @@ class Tiller(object): return charts def update_release(self, chart, release, namespace, - dry_run=False, pre_actions=None, post_actions=None, disable_hooks=False, @@ -320,12 +321,12 @@ class Tiller(object): ''' Update a Helm Release ''' - + timeout = self._check_timeout(wait, timeout) rel_timeout = self.timeout if not timeout else timeout - LOG.debug('Helm update release%s: wait=%s, timeout=%s', - (' (dry run)' if dry_run else ''), - wait, timeout) + LOG.info('Helm update release%s: wait=%s, timeout=%s', + (' (dry run)' if self.dry_run else ''), + wait, timeout) if values is None: values = Config(raw='') @@ -340,7 +341,7 @@ class Tiller(object): stub = ReleaseServiceStub(self.channel) release_request = UpdateReleaseRequest( chart=chart, - dry_run=dry_run, + dry_run=self.dry_run, disable_hooks=disable_hooks, values=values, name=release, @@ -368,19 +369,18 @@ class Tiller(object): self._post_update_actions(post_actions, namespace) def install_release(self, chart, release, namespace, - dry_run=False, values=None, wait=False, timeout=None): ''' Create a Helm Release ''' - + timeout = self._check_timeout(wait, timeout) rel_timeout = self.timeout if not timeout else timeout - LOG.debug('Helm install release%s: wait=%s, timeout=%s', - (' (dry run)' if dry_run else ''), - wait, timeout) + LOG.info('Helm install release%s: wait=%s, timeout=%s', + (' (dry run)' if self.dry_run else ''), + wait, timeout) if values is None: values = Config(raw='') @@ -392,7 +392,7 @@ class Tiller(object): stub = ReleaseServiceStub(self.channel) release_request = InstallReleaseRequest( chart=chart, - dry_run=dry_run, + dry_run=self.dry_run, values=values, name=release, namespace=namespace, @@ -417,7 +417,8 @@ class Tiller(object): status = self.get_release_status(release) 
raise ex.ReleaseException(release, status, 'Install') - def testing_release(self, release, timeout=300, cleanup=True): + def testing_release(self, release, timeout=const.DEFAULT_TILLER_TIMEOUT, + cleanup=True): ''' :param release - name of release to test :param timeout - runtime before exiting @@ -426,7 +427,7 @@ class Tiller(object): :returns - results of test pod ''' - LOG.debug("Helm test release %s, timeout=%s", release, timeout) + LOG.info("Running Helm test: release=%s, timeout=%s", release, timeout) try: @@ -441,12 +442,12 @@ class Tiller(object): LOG.info('No test found') return False - if content.release.hooks[0].events[0] == RUNTEST_SUCCESS: + if content.release.hooks[0].events[0] == RELEASE_RUNTEST_SUCCESS: test = stub.RunReleaseTest( - release_request, self.timeout, metadata=self.metadata) + release_request, timeout, metadata=self.metadata) if test.running(): - self.k8s.wait_get_completed_podphase(release) + self.k8s.wait_get_completed_podphase(release, timeout) test.cancel() @@ -527,7 +528,15 @@ class Tiller(object): deletes a Helm chart from Tiller ''' - # build release install request + # Helm client calls ReleaseContent in Delete dry-run scenario + if self.dry_run: + content = self.get_release_content(release) + LOG.info('Skipping delete during `dry-run`, would have deleted ' + 'release=%s from namespace=%s.', + content.release.name, content.release.namespace) + return + + # build release uninstall request try: stub = ReleaseServiceStub(self.channel) LOG.info("Uninstall %s release with disable_hooks=%s, " @@ -567,7 +576,7 @@ class Tiller(object): def delete_resources(self, release_name, resource_name, resource_type, resource_labels, namespace, wait=False, - timeout=TILLER_TIMEOUT): + timeout=const.DEFAULT_TILLER_TIMEOUT): ''' :params release_name - release name the specified resource is under :params resource_name - name of specific resource @@ -577,12 +586,13 @@ class Tiller(object): Apply deletion logic based on type of resource ''' + timeout = 
self._check_timeout(wait, timeout) label_selector = '' if resource_labels is not None: label_selector = label_selectors(resource_labels) LOG.debug("Deleting resources in namespace %s matching " - "selectors %s.", namespace, label_selector) + "selectors (%s).", namespace, label_selector) handled = False if resource_type == 'job': @@ -629,7 +639,7 @@ class Tiller(object): def rolling_upgrade_pod_deployment(self, name, release_name, namespace, resource_labels, action_type, chart, disable_hooks, values, - timeout=TILLER_TIMEOUT): + timeout=const.DEFAULT_TILLER_TIMEOUT): ''' update statefullsets (daemon, stateful) ''' @@ -672,3 +682,11 @@ class Tiller(object): else: LOG.error("Unable to exectue name: % type: %s", name, action_type) + + def _check_timeout(self, wait, timeout): + '''Return timeout; use the default if waiting and unset or <= 0.''' + if wait and (timeout is None or timeout <= 0): + LOG.warning('Tiller timeout is invalid or unspecified, ' + 'using default %ss.', const.DEFAULT_TILLER_TIMEOUT) + timeout = const.DEFAULT_TILLER_TIMEOUT + return timeout diff --git a/armada/tests/unit/handlers/test_armada.py b/armada/tests/unit/handlers/test_armada.py index fb3fa4e8..e2117d70 100644 --- a/armada/tests/unit/handlers/test_armada.py +++ b/armada/tests/unit/handlers/test_armada.py @@ -159,7 +159,8 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase): mock_tiller.assert_called_once_with(tiller_host=None, tiller_namespace='kube-system', - tiller_port=44134) + tiller_port=44134, + dry_run=False) mock_source.git_clone.assert_called_once_with( 'git://github.com/dummy/armada', 'master', auth_method=None, proxy_server=None) @@ -216,7 +217,6 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase): "{}-{}".format(armada_obj.manifest['armada']['release_prefix'], chart_1['release']), chart_1['namespace'], - dry_run=armada_obj.dry_run, values=yaml.safe_dump(chart_1['values']), timeout=10, wait=True), @@ -225,7 +225,6 @@ class ArmadaHandlerTestCase(base.ArmadaTestCase): "{}-{}".format(armada_obj.manifest['armada']['release_prefix'], chart_2['release']), chart_2['namespace'], -
dry_run=armada_obj.dry_run, values=yaml.safe_dump(chart_2['values']), timeout=10, wait=True) diff --git a/armada/tests/unit/handlers/test_tiller.py b/armada/tests/unit/handlers/test_tiller.py index cac5d93b..824b8091 100644 --- a/armada/tests/unit/handlers/test_tiller.py +++ b/armada/tests/unit/handlers/test_tiller.py @@ -37,7 +37,6 @@ class TillerTestCase(base.ArmadaTestCase): # set params chart = mock.Mock() - dry_run = False name = None namespace = None initial_values = None @@ -45,14 +44,13 @@ class TillerTestCase(base.ArmadaTestCase): wait = False timeout = 3600 - tiller_obj.install_release(chart, name, namespace, - dry_run=dry_run, values=initial_values, - wait=wait, timeout=timeout) + tiller_obj.install_release( + chart, name, namespace, values=initial_values, + wait=wait, timeout=timeout) mock_stub.assert_called_with(tiller_obj.channel) release_request = mock_install_request( chart=chart, - dry_run=dry_run, values=updated_values, release=name, namespace=namespace, @@ -188,8 +186,8 @@ class TillerTestCase(base.ArmadaTestCase): mock_list_releases_request.assert_called_once_with( limit=tiller.RELEASE_LIMIT, - status_codes=[tiller.STATUS_DEPLOYED, - tiller.STATUS_FAILED], + status_codes=[tiller.const.STATUS_DEPLOYED, + tiller.const.STATUS_FAILED], sort_by='LAST_RELEASED', sort_order='DESC') diff --git a/armada/utils/validate.py b/armada/utils/validate.py index 9e38c95b..17bd99e3 100644 --- a/armada/utils/validate.py +++ b/armada/utils/validate.py @@ -189,7 +189,8 @@ def validate_armada_document(document): diagnostic='Please ensure document is one of ' 'the following schema types: %s' % list(SCHEMAS.keys())) - LOG.info('ValidationMessage: %s', vmsg.get_output_json()) + LOG.info('Unsupported document type, ignoring %s.', schema) + LOG.debug('ValidationMessage: %s', vmsg.get_output_json()) # Validation API doesn't care about this type of message, don't send if len([x for x in details if x.get('error', False)]) > 0: