Add a swift-enabled version of WordCount to edp-examples

The edp-wordcount example includes source code and instructions on
how to build the jar and run it from the Oozie command line or the
Savanna UI. WordCount will work with hdfs paths, or with swift paths
as long as the proper configs are set.

Change-Id: I9ac728de505d874fd50a6baf75b062d5b622f3d0
This commit is contained in:
Trevor McKay 2014-01-14 14:31:30 -05:00
parent 0ea551e556
commit bcc350d87e
7 changed files with 245 additions and 0 deletions

View File

@ -2,3 +2,4 @@ EDP Examples
============
* Pig job example - trim spaces in input file
* EDP WordCount - a version of WordCount that works with swift input and output

View File

@ -0,0 +1,75 @@
=====================
EDP WordCount Example
=====================
Overview
========
``WordCount.java`` is a modified version of the WordCount example bundled with
version 1.2.1 of Apache Hadoop. It has been extended for use from a java action
in an Oozie workflow. The modification below allows any configuration values
from the ``<configuration>`` tag in an Oozie workflow to be set in the Configuration
object::
// This will add properties from the <configuration> tag specified
// in the Oozie workflow. For java actions, Oozie writes the
// configuration values to a file pointed to by oozie.action.conf.xml
conf.addResource(new Path("file:///",
System.getProperty("oozie.action.conf.xml")));
In the example workflow, we use the ``<configuration>`` tag to specify user and
password configuration values for accessing swift objects.
Compiling
=========
To build the jar, add ``hadoop-core`` and ``commons-cli`` to the classpath.
On a node running Ubuntu 13.04 with hadoop 1.2.1 the following commands
will compile ``WordCount.java`` from within the ``src`` directory::
$ mkdir wordcount_classes
$ javac -classpath /usr/share/hadoop/hadoop-core-1.2.1.jar:/usr/share/hadoop/lib/commons-cli-1.2.jar -d wordcount_classes WordCount.java
$ jar -cvf edp-wordcount.jar -C wordcount_classes/ .
(A compiled ``edp-wordcount.jar`` is included in ``wordcount/lib``. Replace it if you rebuild)
Running from the command line with Oozie
========================================
The ``wordcount`` subdirectory contains a ``job.properties`` file, a ``workflow.xml`` file,
and a ``lib`` directory with an ``edp-wordcount.jar`` compiled as above.
To run this example from Oozie, you will need to modify the ``job.properties`` file
to specify the correct ``jobTracker`` and ``nameNode`` addresses for your cluster.
You will also need to modify the ``workflow.xml`` file to contain the correct input
and output paths. These paths may be Savanna swift urls or hdfs paths. If swift
urls are used, set the ``fs.swift.service.savanna.username`` and ``fs.swift.service.savanna.password``
properties in the ``<configuration>`` section.
1) Upload the ``wordcount`` directory to hdfs
``$ hadoop fs -put wordcount wordcount``
2) Launch the job, specifying the correct oozie server and port
``$ oozie job -oozie http://oozie_server:port/oozie -config wordcount/job.properties -run``
3) Don't forget to create your swift input path! A Savanna swift url looks like *swift://container.savanna/object*
Running from the Savanna UI
===========================
Running the WordCount example from the Savanna UI is very similar to running a Pig, Hive,
or MapReduce job.
1) Create a job binary that points to the ``edp-wordcount.jar`` file
2) Create a ``Java`` job type and add the job binary to the ``libs`` value
3) Launch the job:
a) Add the input and output paths to ``args``
b) If swift input or output paths are used, set the ``fs.swift.service.savanna.username`` and ``fs.swift.service.savanna.password``
configuration values

View File

@ -0,0 +1,2 @@
This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).

View File

@ -0,0 +1,95 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * A modified version of the Apache Hadoop 1.2.1 WordCount example, extended
 * so it can run as a java action in an Oozie workflow: main() merges the
 * workflow's &lt;configuration&gt; values (e.g. swift credentials) into the job
 * Configuration before submitting. Counts whitespace-delimited tokens in the
 * input path and writes (word, count) pairs to the output path.
 */
public class WordCount {
// Mapper: tokenizes each input value on whitespace and emits (token, 1)
// for every token found.
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable>{
// Constant count of 1; a single reused writable instead of a new
// object per token.
private final static IntWritable one = new IntWritable(1);
// Reused writable holding the current token.
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
// Reducer (also installed as the combiner in main): sums all counts seen
// for a given word and emits the total.
public static class IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
// Reused writable for the summed count.
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
/**
 * Entry point. Expects exactly two remaining args after generic-option
 * parsing: the input path and the output path (hdfs or swift urls).
 * Exits with status 2 on bad usage, 0 on job success, 1 on job failure.
 */
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
// ---- Begin modifications for EDP ----
// This will add properties from the <configuration> tag specified
// in the Oozie workflow. For java actions, Oozie writes the
// configuration values to a file pointed to by oozie.action.conf.xml
conf.addResource(new Path("file:///",
System.getProperty("oozie.action.conf.xml")));
// ---- End modifications for EDP ----
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
// The reducer is associative/commutative, so it doubles as a combiner
// to cut shuffle traffic.
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

View File

@ -0,0 +1,23 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
nameNode=hdfs://1.2.3.4:8020
jobTracker=1.2.3.4:8021
queueName=default
oozie.wf.application.path=${nameNode}/user/${user.name}/wordcount

View File

@ -0,0 +1,49 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<workflow-app xmlns="uri:oozie:workflow:0.2" name="java-main-wf">
<start to="java-node"/>
<action name="java-node">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
<property>
<name>fs.swift.service.savanna.username</name>
<value>swiftuser</value>
</property>
<property>
<name>fs.swift.service.savanna.password</name>
<value>swiftpassword</value>
</property>
</configuration>
<main-class>org.apache.hadoop.examples.WordCount</main-class>
<arg>swift://user.savanna/input</arg>
<arg>swift://user.savanna/output</arg>
</java>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>Java failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end"/>
</workflow-app>