From 7910521a7ed7ed685a4bd7ec892f26c79557dc50 Mon Sep 17 00:00:00 2001
From: Jeremy Freudberg
Date: Tue, 26 Jun 2018 16:17:26 -0400
Subject: [PATCH] Totally rewrite s3_hadoop

Remedies patching problems, version conflicts, classpath issues, and more.

ALSO: Switch the Hadoop libraries used by the Spark standalone plugin to
Hadoop 2.7.3. The version was previously 2.6.5, chosen to match Cloudera's
so-called "Hadoop 2.6.0", but that match is not actually necessary.

Change-Id: Iafafb64fd60a1ae585375a68173c84fbb82c7e1f
---
 diskimage-create/diskimage-create.sh          |  6 ++
 .../post-install.d/89-add-amazon-jar          | 60 -------------------
 .../s3_hadoop/post-install.d/89-manipulate-s3 | 54 +++++++++++++++++
 elements/spark/root.d/50-download-spark       |  5 +-
 4 files changed, 63 insertions(+), 62 deletions(-)
 delete mode 100755 elements/s3_hadoop/post-install.d/89-add-amazon-jar
 create mode 100755 elements/s3_hadoop/post-install.d/89-manipulate-s3

diff --git a/diskimage-create/diskimage-create.sh b/diskimage-create/diskimage-create.sh
index 2d54d426..a35cd0ca 100755
--- a/diskimage-create/diskimage-create.sh
+++ b/diskimage-create/diskimage-create.sh
@@ -622,6 +622,11 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
         export DIB_RELEASE=${DIB_RELEASE:-trusty}
         export DIB_CDH_VERSION="5.5"
     fi
+    if [ "$DIB_SPARK_VERSION" = "1.6.0" ]; then
+        export SPARK_HADOOP_DL=hadoop2.6
+    else
+        export SPARK_HADOOP_DL=hadoop2.7
+    fi
 
     # Tell the cloudera element to install only hdfs
     export DIB_CDH_HDFS_ONLY=1
@@ -630,6 +635,7 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
 
     # Creating Ubuntu cloud image
     image_create ubuntu $ubuntu_image_name $ubuntu_elements_sequence
+    unset SPARK_HADOOP_DL
     unset DIB_CLOUD_INIT_DATASOURCES
     unset DIB_HDFS_LIB_DIR
     unset DIB_CDH_HDFS_ONLY
diff --git a/elements/s3_hadoop/post-install.d/89-add-amazon-jar b/elements/s3_hadoop/post-install.d/89-add-amazon-jar
deleted file mode 100755
index 70a10eac..00000000
--- a/elements/s3_hadoop/post-install.d/89-add-amazon-jar
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
-    set -x
-fi
-set -eu
-set -o pipefail
-
-
-case "$plugin_type" in
-    "vanilla" )
-        HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
-        HADOOP_ENV_SH_PATH="/opt/hadoop/etc/hadoop/hadoop-env.sh"
-        SPARK_JARS_DIR_PATH="/opt/spark/jars"
-    ;;
-    "spark" )
-        HADOOP_TOOLS_DIR_PATH="/usr/lib/hadoop/client"
-        SPARK_JARS_DIR_PATH="/opt/spark/jars"
-    ;;
-    "cloudera" )
-        echo -n "The s3_hadoop element is not supported on CDH,"
-        echo " because the relevant libraries are already in the right place."
-        exit 1
-    ;;
-    *)
-        echo "The s3_hadoop element is only supported on Vanilla and Spark."
-        exit 1
-esac
-
-# NOTE: By defintion, the Spark standalone plugin does not contain Hadoop in
-# its entirety. Therefore, there are no Hadoop-specific environment settings
-# available for modification.
-if [ "$plugin_type" != "spark" ]; then
-    if [ -f "$HADOOP_ENV_SH_PATH" ]; then
-        cat >> $HADOOP_ENV_SH_PATH <<EOF
[...]
diff --git a/elements/s3_hadoop/post-install.d/89-manipulate-s3 b/elements/s3_hadoop/post-install.d/89-manipulate-s3
new file mode 100755
--- /dev/null
+++ b/elements/s3_hadoop/post-install.d/89-manipulate-s3
@@ -0,0 +1,54 @@
+#!/bin/bash
+if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
+    set -x
+fi
+set -eu
+set -o pipefail
+
+
+case "$plugin_type" in
+    "vanilla" )
+    ;;
+    "spark" )
+    ;;
+    "cloudera" )
+        echo "The s3_hadoop element is not relevant to CDH>=5.9"
+        exit 1
+    ;;
+    *)
+        # TODO: Investigate if some changes are in fact needed for HDP, MapR
+        echo "The s3_hadoop element is only relevant to Vanilla and Spark."
+        exit 1
+esac
+
+SPARK_JARS_DIR_PATH="/opt/spark/jars"
+HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
+HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib"
+
+if [ "$plugin_type" = "vanilla" ]; then
+    if [ "$DIB_HADOOP_VERSION" = "2.7.1" -o "$DIB_HADOOP_VERSION" = "2.7.5" ]; then
+        # These versions need a patched hadoop-aws jar
+        wget https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-$DIB_HADOOP_VERSION.jar -O $HADOOP_TOOLS_DIR_PATH/hadoop-aws-$DIB_HADOOP_VERSION.jar
+    fi
+
+    # NOTE: It's easier just to copy than to mess with YARN
+    cp $HADOOP_TOOLS_DIR_PATH/*aws*.jar $HADOOP_COMMON_DIR_PATH
+    if [ "$DIB_HADOOP_VERSION" = "2.7.1" -o "$DIB_HADOOP_VERSION" = "2.7.5" -o "$DIB_HADOOP_VERSION" = "2.8.2" ]; then
+        # hadoop-aws older than 2.9.0 needs these too
+        cp $HADOOP_TOOLS_DIR_PATH/joda-time*.jar $HADOOP_COMMON_DIR_PATH
+        # The following jars are also on-disk, but under the wrong namespace
+        wget http://central.maven.org/maven2/com/fasterxml/jackson/core/jackson-core/2.5.3/jackson-core-2.5.3.jar -O $HADOOP_COMMON_DIR_PATH/jackson-core.jar
+        wget http://central.maven.org/maven2/com/fasterxml/jackson/core/jackson-databind/2.5.3/jackson-databind-2.5.3.jar -O $HADOOP_COMMON_DIR_PATH/jackson-databind.jar
+        wget http://central.maven.org/maven2/com/fasterxml/jackson/core/jackson-annotations/2.5.3/jackson-annotations-2.5.3.jar -O $HADOOP_COMMON_DIR_PATH/jackson-annotations.jar
+    fi
+fi
+
+# For both the Spark and Vanilla plugins:
+# (the s3a driver in hadoop-aws 2.6.5 is too buggy to be redeemed)
+if [ "$SPARK_HADOOP_DL" != "hadoop2.6" ]; then
+    # The hadoop-aws and aws-java-sdk libraries are missing here, but we
+    # cannot copy them from the Hadoop folder on-disk due to
+    # version/patching issues
+    wget https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-2.7.3.jar -O $SPARK_JARS_DIR_PATH/hadoop-aws.jar
+    wget http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar -O $SPARK_JARS_DIR_PATH/aws-java-sdk.jar
+fi
diff --git a/elements/spark/root.d/50-download-spark b/elements/spark/root.d/50-download-spark
index eac5b47d..f1a1fc1f 100755
--- a/elements/spark/root.d/50-download-spark
+++ b/elements/spark/root.d/50-download-spark
@@ -13,13 +13,14 @@ mkdir -p $tmp_dir
 if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
     # Check hadoop version
     # INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
+    # The case below is now just a sanity check
     if [ -z "${SPARK_HADOOP_DL:-}" ]; then
         case "${DIB_CDH_VERSION:-}" in
             5.5)
-                SPARK_HADOOP_DL=hadoop2.6
+                SPARK_HADOOP_DL=hadoop2.7
                 ;;
             5.11)
-                SPARK_HADOOP_DL=hadoop2.6
+                SPARK_HADOOP_DL=hadoop2.7
                 ;;
             *)
                 echo "WARNING: Cloudera CDH $DIB_CDH_VERSION not supported."
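
Note for reviewers: the snippet below is a minimal verification sketch, not part of the patch. It assumes an instance booted from a vanilla or spark image that was built with the s3_hadoop element, and that unzip is available on it; the paths are the ones populated by 89-manipulate-s3 above, everything else is illustrative.

#!/bin/bash
# Minimal verification sketch (illustrative only, not part of the patch).
set -eu

# Spark standalone and vanilla: the S3A connector and AWS SDK that Spark jobs load
ls -l /opt/spark/jars/hadoop-aws.jar /opt/spark/jars/aws-java-sdk.jar

# Vanilla only: hadoop-aws next to the Hadoop common libs, plus the extra
# helpers copied for hadoop-aws older than 2.9.0 (absent on newer Hadoop)
ls -l /opt/hadoop/share/hadoop/common/lib/*aws*.jar
ls -l /opt/hadoop/share/hadoop/common/lib/joda-time*.jar \
      /opt/hadoop/share/hadoop/common/lib/jackson-*.jar 2>/dev/null || true

# The class that an s3a:// URL resolves to should be inside the copied jar
unzip -l /opt/spark/jars/hadoop-aws.jar | grep org/apache/hadoop/fs/s3a/S3AFileSystem.class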