From a37dfac06b5a6a474e8756c6567f832a38a233c6 Mon Sep 17 00:00:00 2001
From: Jeremy Freudberg
Date: Thu, 2 Aug 2018 15:04:49 -0400
Subject: [PATCH] S3 data source URL format change

The old way will still work, but prefer s3:// now.

Change-Id: Ia1f8eba22016044aa5ffe50b2ab898908aef1890
---
 doc/source/user/edp.rst                                | 10 +++++-----
 .../notes/s3-datasource-protocol-d3abd0b22f653b3b.yaml |  4 ++++
 sahara/service/edp/data_sources/s3/implementation.py   |  8 ++++++--
 .../unit/service/edp/data_sources/s3/test_s3_type.py   |  3 +++
 4 files changed, 18 insertions(+), 7 deletions(-)
 create mode 100644 releasenotes/notes/s3-datasource-protocol-d3abd0b22f653b3b.yaml

diff --git a/doc/source/user/edp.rst b/doc/source/user/edp.rst
index 1052654ecc..618d1d8a50 100644
--- a/doc/source/user/edp.rst
+++ b/doc/source/user/edp.rst
@@ -135,7 +135,7 @@ share will be automatically mounted to your cluster's nodes as needed to
 access the data source.
 
 Finally, Sahara supports data sources referring to S3-like object stores. The
-URL should be of the form ``s3a://{bucket}/{path}``. Also, the following
+URL should be of the form ``s3://{bucket}/{path}``. Also, the following
 credentials/configs are understood: ``accesskey``, ``secretkey``,
 ``endpoint``, ``bucket_in_path``, and ``ssl``. These credentials are specified
 through the ``credentials`` attribute of the body of the request when creating
@@ -632,13 +632,13 @@ Manila NFS filesystem reference URLS take the form:
 This format should be used when referring to a job binary or a data source
 stored in a manila NFS share.
 
-For job binaries only, S3 urls take the form:
+For both job binaries and data sources, S3 urls take the form:
 
 ``s3://bucket/path/to/object``
 
-For data sources, S3 urls take the standard Hadoop form:
-
-``s3a://bucket/path/to/object``
+Despite the above URL format, the current implementation of EDP will still
+use the Hadoop ``s3a`` driver to access data sources. Botocore is used to
+access job binaries.
 
 EDP Requirements
 ================
diff --git a/releasenotes/notes/s3-datasource-protocol-d3abd0b22f653b3b.yaml b/releasenotes/notes/s3-datasource-protocol-d3abd0b22f653b3b.yaml
new file mode 100644
index 0000000000..86baaae4fb
--- /dev/null
+++ b/releasenotes/notes/s3-datasource-protocol-d3abd0b22f653b3b.yaml
@@ -0,0 +1,4 @@
+---
+other:
+  - |
+    The URL of an S3 data source may have `s3://` or `s3a://`, equivalently.
diff --git a/sahara/service/edp/data_sources/s3/implementation.py b/sahara/service/edp/data_sources/s3/implementation.py
index 2a74000185..f65242a739 100644
--- a/sahara/service/edp/data_sources/s3/implementation.py
+++ b/sahara/service/edp/data_sources/s3/implementation.py
@@ -55,8 +55,9 @@ class S3Type(DataSourceType):
             raise ex.InvalidDataException(_("S3 url must not be empty"))
 
         url = urlparse.urlparse(url)
-        if url.scheme != "s3a":
-            raise ex.InvalidDataException(_("URL scheme must be 's3a'"))
+        if url.scheme not in ["s3", "s3a"]:
+            raise ex.InvalidDataException(
+                _("URL scheme must be 's3' or 's3a'"))
 
         if not url.hostname:
             raise ex.InvalidDataException(_("Bucket name must be present"))
@@ -80,3 +81,6 @@ class S3Type(DataSourceType):
                 if job_conf.get(s3a_cfg_name, None) is None:  # no overwrite
                     if creds.get(config_name, None) is not None:
                         job_conf[s3a_cfg_name] = creds[config_name]
+
+    def get_runtime_url(self, url, cluster):
+        return url.replace("s3://", "s3a://", 1)
diff --git a/sahara/tests/unit/service/edp/data_sources/s3/test_s3_type.py b/sahara/tests/unit/service/edp/data_sources/s3/test_s3_type.py
index 2da7a129c2..e396049023 100644
--- a/sahara/tests/unit/service/edp/data_sources/s3/test_s3_type.py
+++ b/sahara/tests/unit/service/edp/data_sources/s3/test_s3_type.py
@@ -35,6 +35,9 @@ class TestSwiftType(base.SaharaTestCase):
         }
         self.s_type.validate(data)
 
+        data["url"] = "s3://mybucket/myobject"
+        self.s_type.validate(data)
+
         creds = {}
         data["credentials"] = creds
         self.s_type.validate(data)