You can select a different Spark version with commandline option '-s' and Hadoop HDFS version with '-v', but only Cloudera CDH versions are available for now.
Workaround for bug #1375645 export CLOUD_INIT_DATASOURCES=${DIB_CLOUD_INIT_DATASOURCES:-"NoCloud, ConfigDrive, OVF, MAAS, Ec2"} @@ -23,8 +26,9 @@ usage() { echo "Usage: $(basename $0)" echo " [-p vanilla|spark|hdp|cloudera|storm|mapr|plain]" echo " [-i ubuntu|fedora|centos|centos7]" - echo " [-v 1|2|2.6|5.0|5.3|5.4]" + echo " [-v 1|2|2.6|4|5.0|5.3|5.4]" echo " [-r 3.1.1|4.0.1|4.0.2]" + echo " [-s ]" echo " [-d]" echo " [-u]" echo " [-j openjdk|oracle-java]" @@ -33,12 +37,13 @@ usage() { echo " '-i' is operating system of the base image (default: all supported by plugin)" echo " '-v' is hadoop version (default: all supported by plugin)" echo " '-r' is MapR Version (default: ${DIB_DEFAULT_MAPR_VERSION})" + echo " '-s' is Spark version (default: ${DIB_DEFAULT_SPARK_VERSION})" echo " '-d' enable debug mode, root account will have password 'hadoop'" echo " '-u' install missing packages necessary for building" echo " '-j' is java distribution (default: openjdk)" echo " '-x' turns on tracing" echo - echo "You shouldn't specify hadoop version and image type for spark plugin" + echo "You shouldn't specify image type for spark plugin" echo "You shouldn't specify image type for hdp plugin" echo "You shouldn't specify hadoop version for plain images" echo "Debug mode should only be enabled for local debugging purposes, not for production systems" @@ -47,7 +52,7 @@ usage() { exit 1 } -while getopts "p:i:v:dur:j:x" opt; do +while getopts "p:i:v:dur:s:j:x" opt; do case $opt in p) PLUGIN=$OPTARG @@ -64,6 +69,9 @@ while getopts "p:i:v:dur:j:x" opt; do r) DIB_MAPR_VERSION=$OPTARG ;; + s) + DIB_SPARK_VERSION=$OPTARG + ;; u) DIB_UPDATE_REQUESTED=true ;; @@ -150,7 +158,41 @@ case "$PLUGIN" in ;; esac ;; - "spark" | "storm") + "spark") + case "$BASE_IMAGE_OS" in + "" | "ubuntu");; + *) + echo -e "'$BASE_IMAGE_OS' image type is not supported by '$PLUGIN'.\nAborting" + exit 1 + ;; + esac + + case "$HADOOP_VERSION" in + "") + echo "CDH version not specified" + echo "CDH version 5.3 will be 
used" + HADOOP_VERSION="5.3" + ;; + "4") + HADOOP_VERSION="CDH4" + ;; + "5.0" | "5.3" | "5.4");; + *) + echo -e "Unknown hadoop version selected.\nAborting" + exit 1 + ;; + esac + + case "$DIB_SPARK_VERSION" in + "") + echo "Spark version not specified" + echo "Spark ${DIB_DEFAULT_SPARK_VERSION} will be used" + DIB_SPARK_VERSION=${DIB_DEFAULT_SPARK_VERSION} + ;; + esac + + ;; + "storm") case "$BASE_IMAGE_OS" in "" | "ubuntu");; *) @@ -418,12 +460,23 @@ fi if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then export DIB_HDFS_LIB_DIR="/usr/lib/hadoop" export DIB_CLOUD_INIT_DATASOURCES=$CLOUD_INIT_DATASOURCES + export DIB_SPARK_VERSION + + COMMON_ELEMENTS="vm ubuntu $JAVA_ELEMENT swift_hadoop spark" + if [ "$DIB_SPARK_VERSION" == "1.0.2" ]; then + echo "Overriding CDH version, CDH 4 is required for this Spark version" + export DIB_CDH_VERSION="CDH4" + ubuntu_elements_sequence="$COMMON_ELEMENTS hadoop-cdh" + else + export DIB_CDH_VERSION=$HADOOP_VERSION + ubuntu_elements_sequence="$COMMON_ELEMENTS hadoop-cloudera" + fi + + # Tell the cloudera element to install only hdfs + export CDH_HDFS_ONLY=1 - export DIB_HADOOP_VERSION="CDH4" export ubuntu_image_name=${ubuntu_spark_image_name:-"ubuntu_sahara_spark_latest"} - ubuntu_elements_sequence="vm ubuntu $JAVA_ELEMENT hadoop-cdh swift_hadoop spark" - if [ -n "$USE_MIRRORS" ]; then [ -n "$UBUNTU_MIRROR" ] && ubuntu_elements_sequence="$ubuntu_elements_sequence apt-mirror" fi @@ -432,6 +485,10 @@ if [ -z "$PLUGIN" -o "$PLUGIN" = "spark" ]; then disk-image-create $TRACING $ubuntu_elements_sequence -o $ubuntu_image_name unset DIB_CLOUD_INIT_DATASOURCES unset DIB_HDFS_LIB_DIR + unset CDH_HDFS_ONLY + unset DIB_CDH_VERSION + unset DIB_SPARK_VERSION + unset DIB_HADOOP_VERSION fi diff --git a/elements/hadoop-cdh/README.rst b/elements/hadoop-cdh/README.rst index d068e26e..9b95d708 100644 --- a/elements/hadoop-cdh/README.rst +++ b/elements/hadoop-cdh/README.rst @@ -4,3 +4,18 @@ hadoop-cdh Installs Hadoop CDH 4 (the Cloudera distribution), 
This element is deprecated and will be deleted when support for Spark 1.0.2 +is dropped from Sahara.
+ diff --git a/elements/hadoop-cloudera/install.d/50-install-cloudera b/elements/hadoop-cloudera/install.d/50-install-cloudera index 8f3fd620..7dfac3e9 100755 --- a/elements/hadoop-cloudera/install.d/50-install-cloudera +++ b/elements/hadoop-cloudera/install.d/50-install-cloudera @@ -11,48 +11,48 @@ if [ "$DISTRO_NAME" = "ubuntu" ]; then export RUNLEVEL=1 fi -install-packages \ - cloudera-manager-agent \ - cloudera-manager-daemons \ - cloudera-manager-server \ - cloudera-manager-server-db-2 \ - hadoop-hdfs-datanode \ - hadoop-hdfs-namenode \ - hadoop-hdfs-secondarynamenode \ - hadoop-mapreduce \ - hadoop-mapreduce-historyserver \ - hadoop-yarn-nodemanager \ - hadoop-yarn-resourcemanager \ - hbase \ - hive-hcatalog \ - hive-metastore \ - hive-server2 \ - hive-webhcat-server \ - hue \ - ntp \ - oozie \ - oracle-j2sdk1.7 \ - spark-core \ - zookeeper - -if [ $DIB_CDH_VERSION \> "5.0" ]; then - # CDH5.0 does not have below packages. +# Install the rest of CDH unless a limited HDFS install was requested +if [ -z "${CDH_HDFS_ONLY:-}" ]; then install-packages \ - flume-ng \ - hadoop-kms \ - hbase-solr \ - impala \ - impala-server \ - impala-state-store \ - impala-catalog \ - impala-shell \ - keytrustee-keyprovider \ - sentry \ - solr-server \ - solr-doc \ - search \ - spark-history-server \ - sqoop2 + cloudera-manager-agent \ + cloudera-manager-daemons \ + cloudera-manager-server \ + cloudera-manager-server-db-2 \ + hadoop-hdfs-secondarynamenode \ + hadoop-mapreduce \ + hadoop-mapreduce-historyserver \ + hadoop-yarn-nodemanager \ + hadoop-yarn-resourcemanager \ + hbase \ + hive-hcatalog \ + hive-metastore \ + hive-server2 \ + hive-webhcat-server \ + hue \ + oozie \ + oracle-j2sdk1.7 \ + spark-core \ + zookeeper + + if [ $DIB_CDH_VERSION \> "5.0" ]; then + # CDH5.0 does not have below packages. 
+ install-packages \ + flume-ng \ + hadoop-kms \ + hbase-solr \ + impala \ + impala-server \ + impala-state-store \ + impala-catalog \ + impala-shell \ + keytrustee-keyprovider \ + sentry \ + solr-server \ + solr-doc \ + search \ + spark-history-server \ + sqoop2 + fi fi HADOOP_OPENSTACK_5_4_0_URL="https://repository.cloudera.com/artifactory/repo/org/apache/hadoop/hadoop-openstack/2.6.0-cdh5.4.0/hadoop-openstack-2.6.0-cdh5.4.0.jar" diff --git a/elements/hadoop-cloudera/package-installs.yaml b/elements/hadoop-cloudera/package-installs.yaml index 6d45f106..0a12a18f 100644 --- a/elements/hadoop-cloudera/package-installs.yaml +++ b/elements/hadoop-cloudera/package-installs.yaml @@ -1,2 +1,6 @@ wget: phase: pre-install.d +ntp: +hadoop-hdfs-datanode: +hadoop-hdfs-namenode: +# other packages are installed conditionally in install.d/50-install-cloudera diff --git a/elements/spark/README.rst b/elements/spark/README.rst index a59336a7..1c08495e 100644 --- a/elements/spark/README.rst +++ b/elements/spark/README.rst @@ -2,27 +2,45 @@ spark ===== -Installs Spark on Ubuntu. Requires Hadoop CDH 4 (``hadoop-cdh`` element). +Installs Spark on Ubuntu. Requires Hadoop (currently from CDH distribution). -It will install a version of Spark known to be compatible with CDH 4; -this behaviour can be controlled also by using ``DIB_SPARK_VERSION`` or -directly with ``SPARK_DOWNLOAD_URL``. +This element will install Spark into an Ubuntu image. It tries to guess the +correct file to download based on the ``DIB_SPARK_VERSION`` and ``DIB_CDH_VERSION`` +variables, but this behaviour can be overridden by using ``SPARK_DOWNLOAD_URL`` +to specify a download URL for a pre-built Spark tar.gz file. See +http://spark.apache.org/downloads.html for more download options. + +Versions +-------- + +This element is able to generate images containing any valid Spark version, +compiled against one version of Hadoop HDFS libraries. 
:Example: ``DIB_SPARK_VERSION=1.3.1``
diff --git a/elements/spark/element-deps b/elements/spark/element-deps index a548793d..0e0a242d 100644 --- a/elements/spark/element-deps +++ b/elements/spark/element-deps @@ -1,3 +1,2 @@ cache-url -hadoop-cdh package-installs diff --git a/elements/spark/install.d/60-spark b/elements/spark/install.d/60-spark index f93d9eb4..adfcb8ef 100755 --- a/elements/spark/install.d/60-spark +++ b/elements/spark/install.d/60-spark @@ -11,28 +11,10 @@ set -o pipefail tmp_dir=/tmp/spark pushd $tmp_dir -# The user is not providing his own Spark distribution package -if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then - # Check hadoop version - # INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html - if [ -z "${DIB_SPARK_VERSION:-}" ]; then - case "$DIB_HADOOP_VERSION" in - CDH4) - DIB_SPARK_VERSION=1.0.0 - SPARK_HADOOP_DL=cdh4 - ;; - *) - echo -e "WARNING: Hadoop version $DIB_HADOOP_VERSION not supported." - echo -e "WARNING: make sure SPARK_DOWNLOAD_URL points to a compatible Spark version." - ;; - esac - fi - - SPARK_DOWNLOAD_URL="http://archive.apache.org/dist/spark/spark-$DIB_SPARK_VERSION/spark-$DIB_SPARK_VERSION-bin-$SPARK_HADOOP_DL.tgz" -fi echo "Extracting SPARK" -spark_file=$(basename "$SPARK_DOWNLOAD_URL") +spark_url=$(cat spark_url.txt) +spark_file=$(basename $spark_url) extract_folder=$(tar tzf $spark_file | sed -e 's@/.*@@' | uniq) echo "Decompressing Spark..." 
tar xzf $spark_file @@ -40,8 +22,8 @@ rm $spark_file echo "Moving SPARK to /opt/" # Placing spark in /opt/spark -mv $extract_folder /opt/spark -echo "$SPARK_DOWNLOAD_URL" > /opt/spark/spark_url.txt +mv $extract_folder /opt/spark/ +mv spark_url.txt /opt/spark/ popd rm -Rf $tmp_dir diff --git a/elements/spark/root.d/0-check-spark b/elements/spark/root.d/0-check-spark index abc4a7d5..e07fff36 100755 --- a/elements/spark/root.d/0-check-spark +++ b/elements/spark/root.d/0-check-spark @@ -6,7 +6,7 @@ fi set -eu set -o pipefail -if [ -z "${SPARK_DOWNLOAD_URL:-}" -a -z "${DIB_HADOOP_VERSION:-}" ]; then - echo -e "Neither DIB_HADOOP_VERSION nor SPARK_DOWNLOAD_URL are set. Impossible to install Spark.\nAborting" +if [ -z "${SPARK_DOWNLOAD_URL:-}" -a -z "${DIB_CDH_VERSION:-}" ]; then + echo -e "Neither DIB_CDH_VERSION nor SPARK_DOWNLOAD_URL are set. Impossible to install Spark.\nAborting" exit 1 fi diff --git a/elements/spark/root.d/50-download-spark b/elements/spark/root.d/50-download-spark index d96b7ec8..4a56218d 100755 --- a/elements/spark/root.d/50-download-spark +++ b/elements/spark/root.d/50-download-spark @@ -13,18 +13,25 @@ mkdir -p $tmp_dir if [ -z "${SPARK_DOWNLOAD_URL:-}" ]; then # Check hadoop version # INFO on hadoop versions: http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html - if [ -z "${DIB_SPARK_VERSION:-}" ]; then - case "$DIB_HADOOP_VERSION" in - CDH4) - DIB_SPARK_VERSION=1.0.0 - SPARK_HADOOP_DL=cdh4 - ;; - *) - echo -e "WARNING: Hadoop version $DIB_HADOOP_VERSION not supported." - echo -e "WARNING: make sure SPARK_DOWNLOAD_URL points to a compatible Spark version." - ;; - esac - fi + case "$DIB_CDH_VERSION" in + 5.0) + SPARK_HADOOP_DL=hadoop2.3 + ;; + 5.3) + SPARK_HADOOP_DL=hadoop2.4 + ;; + 5.4) + SPARK_HADOOP_DL=hadoop2.6 + ;; + CDH4) + SPARK_HADOOP_DL=cdh4 + ;; + *) + echo "WARNING: Cloudera CDH version $DIB_CDH_VERSION not supported." + echo "WARNING: use the SPARK_DOWNLOAD_URL variable to install a custom Spark version." 
+ exit 1 + ;; + esac SPARK_DOWNLOAD_URL="http://archive.apache.org/dist/spark/spark-$DIB_SPARK_VERSION/spark-$DIB_SPARK_VERSION-bin-$SPARK_HADOOP_DL.tgz" fi @@ -34,3 +41,4 @@ spark_file=$(basename "$SPARK_DOWNLOAD_URL") cached_tar="$DIB_IMAGE_CACHE/$spark_file" $TMP_HOOKS_PATH/bin/cache-url $SPARK_DOWNLOAD_URL $cached_tar sudo install -D -g root -o root -m 0755 $cached_tar $tmp_dir +echo "$SPARK_DOWNLOAD_URL" > $tmp_dir/spark_url.txt