Use Cloudera element for Spark HDFS

Update the Spark element to use the existing hadoop-cloudera element for HDFS
for Spark versions > 1.0, instead of the ad-hoc cloudera-cdh one. For Spark 1.0.2,
CDH4 via the old hadoop-cdh element is used, since a precompiled binary for CDH5
is not available.

This change also makes it possible to specify an arbitrary Spark version via the
new -s commandline switch, reducing the amount of code for supporting future
versions of Spark. The defaults for Spark are 1.3.1 and CDH 5.3, a combination
that works well in our deployments.

A small change is needed in the cloudera element: when creating a Spark image,
only the HDFS packages have to be installed.

README files have been updated to clarify that default versions are tested, while
other combinations are not. A reference to the SparkPlugin wiki page was added
to point to a table of supported versions.

Change-Id: Ifc2a0c8729981e1e1df79b556a4c2e6bd1ba893a
Implements: blueprint support-spark-1-3
Depends-On: I8fa482b6d1d6abaa6633aec309a3ba826a8b7ebb
This commit is contained in:
Daniele Venzano 2015-06-24 12:19:35 +00:00
parent aa6b3c64e2
commit f376b0f480
12 changed files with 201 additions and 106 deletions

View File

@ -46,7 +46,7 @@ NOTE: Do not create all images for all plugins with the same mirrors. Different
NOTE for 4, 5, 6:
For Vanilla you can create ubuntu, fedora and centos cloud image with hadoop 1.x.x and 2.x.x versions. Use environment variables 'DIB_HADOOP_VERSION_1' and 'DIB_HADOOP_VERSION_2' to change defaults.
For Spark you can create only ubuntu image with one hadoop version. You shouldn't specify image type and hadoop version.
For Spark you can create only ubuntu images, so you shouldn't specify an image type. The default Spark and HDFS versions included in the build are tested and known to work together with the Sahara Spark plugin; other combinations should be used only for evaluation or testing purposes. You can select a different Spark version with the commandline option '-s' and a Hadoop HDFS version with '-v', but only Cloudera CDH versions are available for now.
For HDP you can create only centos image with hadoop 1.3.0 or 2.0 and without hadoop ('plain' image). You shouldn't specify image type.
For Cloudera you can create ubuntu and centos images with preinstalled cloudera hadoop. You shouldn't specify hadoop version.

View File

@ -12,6 +12,9 @@ DEBUG_MODE="false"
# The default version for a MapR plugin
DIB_DEFAULT_MAPR_VERSION="4.0.2"
# The default version for Spark plugin
DIB_DEFAULT_SPARK_VERSION="1.3.1"
# Default list of datasource modules for ubuntu. Workaround for bug #1375645
export CLOUD_INIT_DATASOURCES=${DIB_CLOUD_INIT_DATASOURCES:-"NoCloud, ConfigDrive, OVF, MAAS, Ec2"}
@ -23,8 +26,9 @@ usage() {
echo "Usage: $(basename $0)"
echo " [-p vanilla|spark|hdp|cloudera|storm|mapr|plain]"
echo " [-i ubuntu|fedora|centos|centos7]"
echo " [-v 1|2|2.6|5.0|5.3|5.4]"
echo " [-v 1|2|2.6|4|5.0|5.3|5.4]"
echo " [-r 3.1.1|4.0.1|4.0.2]"
echo " [-s <Spark version>]"
echo " [-d]"
echo " [-u]"
echo " [-j openjdk|oracle-java]"
@ -33,12 +37,13 @@ usage() {
echo " '-i' is operating system of the base image (default: all supported by plugin)"
echo " '-v' is hadoop version (default: all supported by plugin)"
echo " '-r' is MapR Version (default: ${DIB_DEFAULT_MAPR_VERSION})"
echo " '-s' is Spark version (default: ${DIB_DEFAULT_SPARK_VERSION})"
echo " '-d' enable debug mode, root account will have password 'hadoop'"
echo " '-u' install missing packages necessary for building"
echo " '-j' is java distribution (default: openjdk)"
echo " '-x' turns on tracing"
echo
echo "You shouldn't specify hadoop version and image type for spark plugin"
echo "You shouldn't specify image type for spark plugin"
echo "You shouldn't specify image type for hdp plugin"
echo "You shouldn't specify hadoop version for plain images"
echo "Debug mode should only be enabled for local debugging purposes, not for production systems"
@ -47,7 +52,7 @@ usage() {
exit 1
}
while getopts "p:i:v:dur:j:x" opt; do
while getopts "p:i:v:dur:s:j:x" opt; do
case $opt in
p)
PLUGIN=$OPTARG
@ -64,6 +69,9 @@ while getopts "p:i:v:dur:j:x" opt; do
r)
DIB_MAPR_VERSION=$OPTARG
;;
s)
DIB_SPARK_VERSION=$OPTARG
;;
u)
DIB_UPDATE_REQUESTED=true
;;
@ -150,7 +158,41 @@ case "$PLUGIN" in
;;
esac
;;
"spark" | "storm")
"spark")
case "$BASE_IMAGE_OS" in
"" | "ubuntu");;
*)
echo -e "'$BASE_IMAGE_OS' image type is not supported by '$PLUGIN'.\nAborting"
exit 1
;;
esac
case "$HADOOP_VERSION" in
"")
echo "CDH version not specified"
echo "CDH version 5.3 will be used"
HADOOP_VERSION="5.3"
;;
"4")
HADOOP_VERSION="CDH4"
;;
"5.0" | "5.3" | "5.4");;
*)
echo -e "Unknown hadoop version selected.\nAborting"
exit 1
;;
esac
case "$DIB_SPARK_VERSION" in
"")
echo "Spark version not specified"
echo "Spark ${DIB_DEFAULT_SPARK_VERSION} will be used"
DIB_SPARK_VERSION=${DIB_DEFAULT_SPARK_VERSION}
;;
esac
;;
"storm")
case "$BASE_IMAGE_OS" in
"" | "ubuntu");;
*)
@ -418,12 +460,23 @@ fi
if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
export DIB_HDFS_LIB_DIR="/usr/lib/hadoop"
export DIB_CLOUD_INIT_DATASOURCES=$CLOUD_INIT_DATASOURCES
export DIB_SPARK_VERSION
COMMON_ELEMENTS="vm ubuntu $JAVA_ELEMENT swift_hadoop spark"
if [ "$DIB_SPARK_VERSION" == "1.0.2" ]; then
echo "Overriding CDH version, CDH 4 is required for this Spark version"
export DIB_CDH_VERSION="CDH4"
ubuntu_elements_sequence="$COMMON_ELEMENTS hadoop-cdh"
else
export DIB_CDH_VERSION=$HADOOP_VERSION
ubuntu_elements_sequence="$COMMON_ELEMENTS hadoop-cloudera"
fi
# Tell the cloudera element to install only hdfs
export CDH_HDFS_ONLY=1
export DIB_HADOOP_VERSION="CDH4"
export ubuntu_image_name=${ubuntu_spark_image_name:-"ubuntu_sahara_spark_latest"}
ubuntu_elements_sequence="vm ubuntu $JAVA_ELEMENT hadoop-cdh swift_hadoop spark"
if [ -n "$USE_MIRRORS" ]; then
[ -n "$UBUNTU_MIRROR" ] && ubuntu_elements_sequence="$ubuntu_elements_sequence apt-mirror"
fi
@ -432,6 +485,10 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then
disk-image-create $TRACING $ubuntu_elements_sequence -o $ubuntu_image_name
unset DIB_CLOUD_INIT_DATASOURCES
unset DIB_HDFS_LIB_DIR
unset CDH_HDFS_ONLY
unset DIB_CDH_VERSION
unset DIB_SPARK_VERSION
unset DIB_HADOOP_VERSION
fi

View File

@ -4,3 +4,18 @@ hadoop-cdh
Installs Hadoop CDH 4 (the Cloudera distribution), configures SSH.
Only HDFS is installed at this time.
This element is used by Spark version 1.0.2.
This element is deprecated and will be deleted when support for Spark 1.0.2
is dropped from Sahara.
Environment Variables
---------------------
DIB_CDH_VERSION
:Required: Yes.
:Description: Version of the CDH platform to install.
:Example: ``DIB_CDH_VERSION=CDH4``

View File

@ -11,11 +11,11 @@ if [ -z "${JAVA_DOWNLOAD_URL:-}" ]; then
echo "JAVA_FILE and JAVA_DOWNLOAD_URL are not set. Proceeding with distro native Java."
fi
fi
if [ -z "$DIB_HADOOP_VERSION" ]; then
echo "DIB_HADOOP_VERSION is not set. Impossible to install hadoop. Exit"
if [ -z "$DIB_CDH_VERSION" ]; then
echo "DIB_CDH_VERSION is not set. Impossible to install hadoop. Exit"
exit 1
fi
if [ $DIB_HADOOP_VERSION != "CDH4" ]; then
echo "CDH version $DIB_HADOOP_VERSION not supported. Exiting."
if [ $DIB_CDH_VERSION != "CDH4" ]; then
echo "CDH version $DIB_CDH_VERSION not supported. Exiting."
exit 1
fi

View File

@ -16,3 +16,15 @@ following syntax to select the ``cloudera`` plugin:
.. sourcecode:: bash
diskimage-create.sh -p cloudera
Environment Variables
---------------------
The element can be configured by exporting variables using an
`environment.d` script.
CDH_HDFS_ONLY
:Required: No
:Description: If set will install only the namenode and datanode
packages with their dependencies.

View File

@ -11,48 +11,48 @@ if [ "$DISTRO_NAME" = "ubuntu" ]; then
export RUNLEVEL=1
fi
install-packages \
cloudera-manager-agent \
cloudera-manager-daemons \
cloudera-manager-server \
cloudera-manager-server-db-2 \
hadoop-hdfs-datanode \
hadoop-hdfs-namenode \
hadoop-hdfs-secondarynamenode \
hadoop-mapreduce \
hadoop-mapreduce-historyserver \
hadoop-yarn-nodemanager \
hadoop-yarn-resourcemanager \
hbase \
hive-hcatalog \
hive-metastore \
hive-server2 \
hive-webhcat-server \
hue \
ntp \
oozie \
oracle-j2sdk1.7 \
spark-core \
zookeeper
if [ $DIB_CDH_VERSION \> "5.0" ]; then
# CDH5.0 does not have below packages.
# Install the rest of CDH unless a limited HDFS install was requested
if [ -z "${CDH_HDFS_ONLY:-}" ]; then
install-packages \
flume-ng \
hadoop-kms \
hbase-solr \
impala \
impala-server \
impala-state-store \
impala-catalog \
impala-shell \
keytrustee-keyprovider \
sentry \
solr-server \
solr-doc \
search \
spark-history-server \
sqoop2
cloudera-manager-agent \
cloudera-manager-daemons \
cloudera-manager-server \
cloudera-manager-server-db-2 \
hadoop-hdfs-secondarynamenode \
hadoop-mapreduce \
hadoop-mapreduce-historyserver \
hadoop-yarn-nodemanager \
hadoop-yarn-resourcemanager \
hbase \
hive-hcatalog \
hive-metastore \
hive-server2 \
hive-webhcat-server \
hue \
oozie \
oracle-j2sdk1.7 \
spark-core \
zookeeper
if [ $DIB_CDH_VERSION \> "5.0" ]; then
# CDH5.0 does not have below packages.
install-packages \
flume-ng \
hadoop-kms \
hbase-solr \
impala \
impala-server \
impala-state-store \
impala-catalog \
impala-shell \
keytrustee-keyprovider \
sentry \
solr-server \
solr-doc \
search \
spark-history-server \
sqoop2
fi
fi
HADOOP_OPENSTACK_5_4_0_URL="https://repository.cloudera.com/artifactory/repo/org/apache/hadoop/hadoop-openstack/2.6.0-cdh5.4.0/hadoop-openstack-2.6.0-cdh5.4.0.jar"

View File

@ -1,2 +1,6 @@
wget:
phase: pre-install.d
ntp:
hadoop-hdfs-datanode:
hadoop-hdfs-namenode:
# other packages are installed conditionally in install.d/50-install-cloudera

View File

@ -2,27 +2,45 @@
spark
=====
Installs Spark on Ubuntu. Requires Hadoop CDH 4 (``hadoop-cdh`` element).
Installs Spark on Ubuntu. Requires Hadoop (currently from CDH distribution).
It will install a version of Spark known to be compatible with CDH 4;
this behaviour can be controlled also by using ``DIB_SPARK_VERSION`` or
directly with ``SPARK_DOWNLOAD_URL``.
This element will install Spark into an Ubuntu image. It tries to guess the
correct file to download based on the ``DIB_SPARK_VERSION`` and ``DIB_CDH_VERSION``
variables, but this behaviour can be overridden by using ``SPARK_DOWNLOAD_URL``
to specify a download URL for a pre-built Spark tar.gz file. See
http://spark.apache.org/downloads.html for more download options.
Versions
--------
This element is able to generate images containing any valid Spark version,
compiled against one version of Hadoop HDFS libraries.
Only some combinations of Spark and Hadoop versions are possible, depending on
the availability of a pre-compiled binary and only few of them are tested with
the Sahara Spark plugin.
The ``diskimage-create.sh`` script will use tested defaults. Those defaults
generate an image supported by the Sahara Spark plugin. Other combinations
should be used only for evaluation or testing purposes. Refer to the Sahara
Spark plugin wiki page (https://wiki.openstack.org/wiki/Sahara/SparkPlugin)
for more information about tested and supported versions.
Environment Variables
---------------------
DIB_HADOOP_VERSION
:Required: Yes, if ``SPARK_DOWNLOAD_URL`` is not set.
:Description: Version of the Hadoop platform. See also
http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html.
:Example: ``DIB_HADOOP_VERSION=CDH4``
DIB_SPARK_VERSION
:Required: No
:Default: Depends on ``DIB_HADOOP_VERSION``.
:Description: Version of Spark to download from apache.org.
:Required: Yes, if ``SPARK_DOWNLOAD_URL`` is not set.
:Description: Version of the Spark package to download.
:Example: ``DIB_SPARK_VERSION=1.3.1``
DIB_CDH_VERSION
:Required: Yes, if ``SPARK_DOWNLOAD_URL`` is not set.
:Description: Version of the CDH platform to use for Hadoop compatibility.
CDH version 5.3 is known to work well.
:Example: ``DIB_CDH_VERSION=5.3``
SPARK_DOWNLOAD_URL
:Required: Yes, if ``DIB_HADOOP_VERSION`` is not set.
:Default: ``http://archive.apache.org/dist/spark/spark-$DIB_SPARK_VERSION/spark-$DIB_SPARK_VERSION-bin-$SPARK_HADOOP_DL.tgz``
:Description: Download URL of the Spark package.
:Required: No, if set overrides ``DIB_CDH_VERSION`` and ``DIB_SPARK_VERSION``
:Description: Download URL of a tgz Spark package to override the automatic
selection from the Apache repositories.

View File

@ -1,3 +1,2 @@
cache-url
hadoop-cdh
package-installs

View File

@ -11,28 +11,10 @@ set -o pipefail
tmp_dir=/tmp/spark
pushd $tmp_dir
# The user is not providing his own Spark distribution package
if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
# Check hadoop version
# INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
if [ -z "${DIB_SPARK_VERSION:-}" ]; then
case "$DIB_HADOOP_VERSION" in
CDH4)
DIB_SPARK_VERSION=1.0.0
SPARK_HADOOP_DL=cdh4
;;
*)
echo -e "WARNING: Hadoop version $DIB_HADOOP_VERSION not supported."
echo -e "WARNING: make sure SPARK_DOWNLOAD_URL points to a compatible Spark version."
;;
esac
fi
SPARK_DOWNLOAD_URL="http://archive.apache.org/dist/spark/spark-$DIB_SPARK_VERSION/spark-$DIB_SPARK_VERSION-bin-$SPARK_HADOOP_DL.tgz"
fi
echo "Extracting SPARK"
spark_file=$(basename "$SPARK_DOWNLOAD_URL")
spark_url=$(cat spark_url.txt)
spark_file=$(basename $spark_url)
extract_folder=$(tar tzf $spark_file | sed -e 's@/.*@@' | uniq)
echo "Decompressing Spark..."
tar xzf $spark_file
@ -40,8 +22,8 @@ rm $spark_file
echo "Moving SPARK to /opt/"
# Placing spark in /opt/spark
mv $extract_folder /opt/spark
echo "$SPARK_DOWNLOAD_URL" > /opt/spark/spark_url.txt
mv $extract_folder /opt/spark/
mv spark_url.txt /opt/spark/
popd
rm -Rf $tmp_dir

View File

@ -6,7 +6,7 @@ fi
set -eu
set -o pipefail
if [ -z "${SPARK_DOWNLOAD_URL:-}" -a -z "${DIB_HADOOP_VERSION:-}" ]; then
echo -e "Neither DIB_HADOOP_VERSION nor SPARK_DOWNLOAD_URL are set. Impossible to install Spark.\nAborting"
if [ -z "${SPARK_DOWNLOAD_URL:-}" -a -z "${DIB_CDH_VERSION:-}" ]; then
echo -e "Neither DIB_CDH_VERSION nor SPARK_DOWNLOAD_URL are set. Impossible to install Spark.\nAborting"
exit 1
fi

View File

@ -13,18 +13,25 @@ mkdir -p $tmp_dir
if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then
# Check hadoop version
# INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html
if [ -z "${DIB_SPARK_VERSION:-}" ]; then
case "$DIB_HADOOP_VERSION" in
CDH4)
DIB_SPARK_VERSION=1.0.0
SPARK_HADOOP_DL=cdh4
;;
*)
echo -e "WARNING: Hadoop version $DIB_HADOOP_VERSION not supported."
echo -e "WARNING: make sure SPARK_DOWNLOAD_URL points to a compatible Spark version."
;;
esac
fi
case "$DIB_CDH_VERSION" in
5.0)
SPARK_HADOOP_DL=hadoop2.3
;;
5.3)
SPARK_HADOOP_DL=hadoop2.4
;;
5.4)
SPARK_HADOOP_DL=hadoop2.6
;;
CDH4)
SPARK_HADOOP_DL=cdh4
;;
*)
echo "WARNING: Cloudera CDH version $DIB_CDH_VERSION not supported."
echo "WARNING: use the SPARK_DOWNLOAD_URL variable to install a custom Spark version."
exit 1
;;
esac
SPARK_DOWNLOAD_URL="http://archive.apache.org/dist/spark/spark-$DIB_SPARK_VERSION/spark-$DIB_SPARK_VERSION-bin-$SPARK_HADOOP_DL.tgz"
fi
@ -34,3 +41,4 @@ spark_file=$(basename "$SPARK_DOWNLOAD_URL")
cached_tar="$DIB_IMAGE_CACHE/$spark_file"
$TMP_HOOKS_PATH/bin/cache-url $SPARK_DOWNLOAD_URL $cached_tar
sudo install -D -g root -o root -m 0755 $cached_tar $tmp_dir
echo "$SPARK_DOWNLOAD_URL" > $tmp_dir/spark_url.txt