Totally rewrite s3_hadoop

Fixes patching problems, version conflicts, classpath issues, etc.

ALSO: Switch the Hadoop libraries used by the Spark standalone plugin to
Hadoop 2.7.3. The version was previously 2.6.5, chosen to match Cloudera's
so-called "Hadoop 2.6.0", but in fact that version match is not
necessary...

Change-Id: Iafafb64fd60a1ae585375a68173c84fbb82c7e1f
This commit is contained in:
Jeremy Freudberg 2018-06-26 16:17:26 -04:00
parent 83224a6c5e
commit 7910521a7e
4 changed files with 63 additions and 62 deletions

View File

@ -622,6 +622,11 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
export DIB_RELEASE=${DIB_RELEASE:-trusty}
export DIB_CDH_VERSION="5.5"
fi
if [ "$DIB_SPARK_VERSION" = "1.6.0" ]; then
export SPARK_HADOOP_DL=hadoop2.6
else
export SPARK_HADOOP_DL=hadoop2.7
fi
# Tell the cloudera element to install only hdfs
export DIB_CDH_HDFS_ONLY=1
@ -630,6 +635,7 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
# Creating Ubuntu cloud image
image_create ubuntu $ubuntu_image_name $ubuntu_elements_sequence
unset SPARK_HADOOP_DL
unset DIB_CLOUD_INIT_DATASOURCES
unset DIB_HDFS_LIB_DIR
unset DIB_CDH_HDFS_ONLY

View File

@ -1,60 +0,0 @@
#!/bin/bash
# s3_hadoop (original version): wire up Hadoop's S3 (hadoop-aws) support.
# For Vanilla: append a HADOOP_CLASSPATH extension to hadoop-env.sh so the
# jars under Hadoop's "tools" dir are on the classpath, then copy the
# hadoop-aws jar into Spark's jars dir (when Spark is installed).
# For Spark standalone: only the jar copy applies.
if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
fi
set -eu
set -o pipefail

case "$plugin_type" in
    "vanilla" )
        HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
        HADOOP_ENV_SH_PATH="/opt/hadoop/etc/hadoop/hadoop-env.sh"
        SPARK_JARS_DIR_PATH="/opt/spark/jars"
        ;;
    "spark" )
        # Spark standalone ships only the Hadoop client libraries
        HADOOP_TOOLS_DIR_PATH="/usr/lib/hadoop/client"
        SPARK_JARS_DIR_PATH="/opt/spark/jars"
        ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH,"
        echo " because the relevant libraries are already in the right place."
        exit 1
        ;;
    *)
        echo "The s3_hadoop element is only supported on Vanilla and Spark."
        exit 1
        ;;  # fix: terminate the final case arm explicitly
esac

# NOTE: By definition, the Spark standalone plugin does not contain Hadoop in
# its entirety. Therefore, there are no Hadoop-specific environment settings
# available for modification.
if [ "$plugin_type" != "spark" ]; then
    if [ -f "$HADOOP_ENV_SH_PATH" ]; then
        # Unquoted EOF delimiter: $HADOOP_TOOLS_DIR_PATH expands now, at
        # image-build time; the escaped \$HADOOP_CLASSPATH / \$f expand
        # later, each time hadoop-env.sh is sourced.
        cat >> "$HADOOP_ENV_SH_PATH" <<EOF
for f in $HADOOP_TOOLS_DIR_PATH/*.jar; do
if [ "\$HADOOP_CLASSPATH" ]; then
export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:\$f
else
export HADOOP_CLASSPATH=\$f
fi
done
EOF
    else
        echo "Something went wrong: couldn't find Hadoop env settings."
        exit 1
    fi
fi

if [ -d "$SPARK_JARS_DIR_PATH" ]; then
    # Glob stays outside the quotes so it still expands
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$SPARK_JARS_DIR_PATH"
    chmod 0644 "$SPARK_JARS_DIR_PATH"/*aws*jar
else
    # NOTE: In the case of Vanilla, the user may have disabled the Spark
    # element. So, check for the existence of the directory explicitly, but
    # crucially do not consider it an error if the folder does not exist.
    if [ "$plugin_type" != "vanilla" ]; then
        echo "Something went wrong: couldn't find Spark installation."
        exit 1
    fi
fi

View File

@ -0,0 +1,54 @@
#!/bin/bash
# s3_hadoop (rewritten version): ensure the hadoop-aws S3A connector and its
# dependencies are present, for the Vanilla and Spark plugins.
if [ "${DIB_DEBUG_TRACE:-0}" -gt 0 ]; then
    set -x
fi
set -eu
set -o pipefail

case "$plugin_type" in
    "vanilla" | "spark" )
        ;;
    "cloudera" )
        echo -n "The s3_hadoop element is not supported on CDH,"
        echo " because the relevant libraries need no manipulation."
        # NOTE: actually the above statement is only true on CDH>=5.9
        exit 1
        ;;
    *)
        # TODO: Investigate if some changes are in fact needed for HDP, MapR
        echo "The s3_hadoop element is only relevant to Vanilla and Spark."
        exit 1
        ;;  # fix: terminate the final case arm explicitly
esac

SPARK_JARS_DIR_PATH="/opt/spark/jars"
HADOOP_TOOLS_DIR_PATH="/opt/hadoop/share/hadoop/tools/lib"
HADOOP_COMMON_DIR_PATH="/opt/hadoop/share/hadoop/common/lib"

# fix: central.maven.org (plain-HTTP) has been retired by Sonatype; use the
# canonical repo1.maven.org endpoint over HTTPS instead.
MAVEN_CENTRAL="https://repo1.maven.org/maven2"

if [ "$plugin_type" = "vanilla" ]; then
    # fix: [[ ... || ... ]] instead of the deprecated, ambiguous [ ... -o ... ]
    if [[ "$DIB_HADOOP_VERSION" = "2.7.1" || "$DIB_HADOOP_VERSION" = "2.7.5" ]]; then
        # These versions need a patched hadoop-aws jar
        wget "https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-$DIB_HADOOP_VERSION.jar" -O "$HADOOP_TOOLS_DIR_PATH/hadoop-aws-$DIB_HADOOP_VERSION.jar"
    fi
    # NOTE: It's easier just to copy, than to mess with YARN
    cp "$HADOOP_TOOLS_DIR_PATH"/*aws*.jar "$HADOOP_COMMON_DIR_PATH"
    if [[ "$DIB_HADOOP_VERSION" = "2.7.1" || "$DIB_HADOOP_VERSION" = "2.7.5" || "$DIB_HADOOP_VERSION" = "2.8.2" ]]; then
        # Hadoop-aws older than 2.9.0 needs these too
        cp "$HADOOP_TOOLS_DIR_PATH"/joda-time*.jar "$HADOOP_COMMON_DIR_PATH"
        # The following jars are also on-disk, but under the wrong namespace
        wget "$MAVEN_CENTRAL/com/fasterxml/jackson/core/jackson-core/2.5.3/jackson-core-2.5.3.jar" -O "$HADOOP_COMMON_DIR_PATH/jackson-core.jar"
        wget "$MAVEN_CENTRAL/com/fasterxml/jackson/core/jackson-databind/2.5.3/jackson-databind-2.5.3.jar" -O "$HADOOP_COMMON_DIR_PATH/jackson-databind.jar"
        wget "$MAVEN_CENTRAL/com/fasterxml/jackson/core/jackson-annotations/2.5.3/jackson-annotations-2.5.3.jar" -O "$HADOOP_COMMON_DIR_PATH/jackson-annotations.jar"
    fi
fi

# For both Spark and Vanilla plugins:
# (The s3a driver in hadoop-aws 2.6.5 is too buggy to be redeemed)
# fix: SPARK_HADOOP_DL is only exported on the Spark build path; under
# "set -u" a bare $SPARK_HADOOP_DL would abort the Vanilla build, so default
# it to empty — which takes the download branch, as intended for Vanilla.
if [ "${SPARK_HADOOP_DL:-}" != "hadoop2.6" ]; then
    # The hadoop-aws and aws-java-sdk libraries are missing here, but we
    # cannot copy them from the Hadoop folder on-disk due to
    # version/patching issues
    wget "https://tarballs.openstack.org/sahara-extra/dist/common-artifacts/hadoop-aws-2.7.3.jar" -O "$SPARK_JARS_DIR_PATH/hadoop-aws.jar"
    wget "$MAVEN_CENTRAL/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar" -O "$SPARK_JARS_DIR_PATH/aws-java-sdk.jar"
fi

View File

@ -13,13 +13,14 @@ mkdir -p $tmp_dir
if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
# Check hadoop version
# INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
# Now the below is just a sanity check
if [ -z "${SPARK_HADOOP_DL:-}" ]; then
case "${DIB_CDH_VERSION:-}" in
5.5)
SPARK_HADOOP_DL=hadoop2.6
SPARK_HADOOP_DL=hadoop2.7
;;
5.11)
SPARK_HADOOP_DL=hadoop2.6
SPARK_HADOOP_DL=hadoop2.7
;;
*)
echo "WARNING: Cloudera CDH $DIB_CDH_VERSION not supported."