Skip to content

Commit

Permalink
Merge branch 'master' into groupby
Browse files Browse the repository at this point in the history
  • Loading branch information
davies committed Sep 14, 2014
2 parents 779ed03 + 2aea0da commit fbc504a
Show file tree
Hide file tree
Showing 478 changed files with 12,506 additions and 9,118 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ Spark is a fast and general cluster computing system for Big Data. It provides
high-level APIs in Scala, Java, and Python, and an optimized engine that
supports general computation graphs for data analysis. It also supports a
rich set of higher-level tools including Spark SQL for SQL and structured
data processing, MLLib for machine learning, GraphX for graph processing,
and Spark Streaming.
data processing, MLlib for machine learning, GraphX for graph processing,
and Spark Streaming for stream processing.

<http://spark.apache.org/>

Expand Down
2 changes: 1 addition & 1 deletion assembly/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.1.0-SNAPSHOT</version>
<version>1.2.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down
2 changes: 1 addition & 1 deletion bagel/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.1.0-SNAPSHOT</version>
<version>1.2.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down
2 changes: 0 additions & 2 deletions bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ import org.scalatest.time.SpanSugar._
import org.apache.spark._
import org.apache.spark.storage.StorageLevel

import scala.language.postfixOps

class TestVertex(val active: Boolean, val age: Int) extends Vertex with Serializable
class TestMessage(val targetId: String) extends Message[String] with Serializable

Expand Down
2 changes: 1 addition & 1 deletion bin/beeline
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
set -o posix

# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"

CLASS="org.apache.hive.beeline.BeeLine"
exec "$FWDIR/bin/spark-class" $CLASS "$@"
3 changes: 2 additions & 1 deletion bin/compute-classpath.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ rem Load environment variables from conf\spark-env.cmd, if it exists
if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"

rem Build up classpath
set CLASSPATH=%FWDIR%conf
set CLASSPATH=%SPARK_CLASSPATH%;%SPARK_SUBMIT_CLASSPATH%;%FWDIR%conf

if exist "%FWDIR%RELEASE" (
for %%d in ("%FWDIR%lib\spark-assembly*.jar") do (
set ASSEMBLY_JAR=%%d
Expand Down
13 changes: 7 additions & 6 deletions bin/compute-classpath.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
SCALA_VERSION=2.10

# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"

. $FWDIR/bin/load-spark-env.sh
. "$FWDIR"/bin/load-spark-env.sh

# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
Expand All @@ -43,6 +43,7 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then
echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
"classes ahead of assembly." >&2
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*"
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
Expand All @@ -63,7 +64,7 @@ else
assembly_folder="$ASSEMBLY_DIR"
fi

num_jars=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)
num_jars="$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)"
if [ "$num_jars" -eq "0" ]; then
echo "Failed to find Spark assembly in $assembly_folder"
echo "You need to build Spark before running this program."
Expand All @@ -77,7 +78,7 @@ if [ "$num_jars" -gt "1" ]; then
exit 1
fi

ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
ASSEMBLY_JAR="$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)"

# Verify that versions of java used to build the jars and run Spark are compatible
jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
Expand All @@ -103,8 +104,8 @@ else
datanucleus_dir="$FWDIR"/lib_managed/jars
fi

datanucleus_jars=$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")
datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")"
datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)"

if [ -n "$datanucleus_jars" ]; then
hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null)
Expand Down
4 changes: 2 additions & 2 deletions bin/load-spark-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ if [ -z "$SPARK_ENV_LOADED" ]; then
export SPARK_ENV_LOADED=1

# Returns the parent of the directory this script lives in.
parent_dir="$(cd `dirname $0`/..; pwd)"
parent_dir="$(cd "`dirname "$0"`"/..; pwd)"

user_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}
user_conf_dir="${SPARK_CONF_DIR:-"$parent_dir"/conf}"

if [ -f "${user_conf_dir}/spark-env.sh" ]; then
# Promote all variable declarations to environment (exported) variables
Expand Down
26 changes: 15 additions & 11 deletions bin/pyspark
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@
#

# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"

# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

source $FWDIR/bin/utils.sh
source "$FWDIR/bin/utils.sh"

SCALA_VERSION=2.10

function usage() {
echo "Usage: ./bin/pyspark [options]" 1>&2
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
"$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
}

Expand All @@ -48,7 +48,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
fi
fi

. $FWDIR/bin/load-spark-env.sh
. "$FWDIR"/bin/load-spark-env.sh

# Figure out which Python executable to use
if [[ -z "$PYSPARK_PYTHON" ]]; then
Expand All @@ -57,12 +57,12 @@ fi
export PYSPARK_PYTHON

# Add the PySpark classes to the Python path:
export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.8.2.1-src.zip:$PYTHONPATH
export PYTHONPATH="$SPARK_HOME/python/:$PYTHONPATH"
export PYTHONPATH="$SPARK_HOME/python/lib/py4j-0.8.2.1-src.zip:$PYTHONPATH"

# Load the PySpark shell.py script when ./pyspark is used interactively:
export OLD_PYTHONSTARTUP=$PYTHONSTARTUP
export PYTHONSTARTUP=$FWDIR/python/pyspark/shell.py
export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"
export PYTHONSTARTUP="$FWDIR/python/pyspark/shell.py"

# If IPython options are specified, assume user wants to run IPython
if [[ -n "$IPYTHON_OPTS" ]]; then
Expand All @@ -85,6 +85,8 @@ export PYSPARK_SUBMIT_ARGS

# For pyspark tests
if [[ -n "$SPARK_TESTING" ]]; then
unset YARN_CONF_DIR
unset HADOOP_CONF_DIR
if [[ -n "$PYSPARK_DOC_TEST" ]]; then
exec "$PYSPARK_PYTHON" -m doctest $1
else
Expand All @@ -97,14 +99,16 @@ fi
if [[ "$1" =~ \.py$ ]]; then
echo -e "\nWARNING: Running python applications through ./bin/pyspark is deprecated as of Spark 1.0." 1>&2
echo -e "Use ./bin/spark-submit <python file>\n" 1>&2
primary=$1
primary="$1"
shift
gatherSparkSubmitOpts "$@"
exec $FWDIR/bin/spark-submit "${SUBMISSION_OPTS[@]}" $primary "${APPLICATION_OPTS[@]}"
exec "$FWDIR"/bin/spark-submit "${SUBMISSION_OPTS[@]}" "$primary" "${APPLICATION_OPTS[@]}"
else
# PySpark shell requires special handling downstream
export PYSPARK_SHELL=1
# Only use ipython if no command line arguments were provided [SPARK-1134]
if [[ "$IPYTHON" = "1" ]]; then
exec ipython $IPYTHON_OPTS
exec ${PYSPARK_PYTHON:-ipython} $IPYTHON_OPTS
else
exec "$PYSPARK_PYTHON"
fi
Expand Down
8 changes: 4 additions & 4 deletions bin/run-example
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

SCALA_VERSION=2.10

FWDIR="$(cd `dirname $0`/..; pwd)"
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
export SPARK_HOME="$FWDIR"
EXAMPLES_DIR="$FWDIR"/examples

Expand All @@ -35,12 +35,12 @@ else
fi

if [ -f "$FWDIR/RELEASE" ]; then
export SPARK_EXAMPLES_JAR=`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar`
export SPARK_EXAMPLES_JAR="`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar`"
elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar ]; then
export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar`
export SPARK_EXAMPLES_JAR="`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar`"
fi

if [[ -z $SPARK_EXAMPLES_JAR ]]; then
if [[ -z "$SPARK_EXAMPLES_JAR" ]]; then
echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" 1>&2
echo "You need to build Spark before running this program" 1>&2
exit 1
Expand Down
20 changes: 10 additions & 10 deletions bin/spark-class
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ esac
SCALA_VERSION=2.10

# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"

# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

. $FWDIR/bin/load-spark-env.sh
. "$FWDIR"/bin/load-spark-env.sh

if [ -z "$1" ]; then
echo "Usage: spark-class <class> [<args>]" 1>&2
Expand Down Expand Up @@ -105,7 +105,7 @@ else
exit 1
fi
fi
JAVA_VERSION=$($RUNNER -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')
JAVA_VERSION=$("$RUNNER" -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')

# Set JAVA_OPTS to be able to load native libraries and to set heap size
if [ "$JAVA_VERSION" -ge 18 ]; then
Expand All @@ -117,7 +117,7 @@ JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"

# Load extra JAVA_OPTS from conf/java-opts, if it exists
if [ -e "$FWDIR/conf/java-opts" ] ; then
JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
JAVA_OPTS="$JAVA_OPTS `cat "$FWDIR"/conf/java-opts`"
fi

# Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
Expand All @@ -126,21 +126,21 @@ TOOLS_DIR="$FWDIR"/tools
SPARK_TOOLS_JAR=""
if [ -e "$TOOLS_DIR"/target/scala-$SCALA_VERSION/spark-tools*[0-9Tg].jar ]; then
# Use the JAR from the SBT build
export SPARK_TOOLS_JAR=`ls "$TOOLS_DIR"/target/scala-$SCALA_VERSION/spark-tools*[0-9Tg].jar`
export SPARK_TOOLS_JAR="`ls "$TOOLS_DIR"/target/scala-$SCALA_VERSION/spark-tools*[0-9Tg].jar`"
fi
if [ -e "$TOOLS_DIR"/target/spark-tools*[0-9Tg].jar ]; then
# Use the JAR from the Maven build
# TODO: this also needs to become an assembly!
export SPARK_TOOLS_JAR=`ls "$TOOLS_DIR"/target/spark-tools*[0-9Tg].jar`
export SPARK_TOOLS_JAR="`ls "$TOOLS_DIR"/target/spark-tools*[0-9Tg].jar`"
fi

# Compute classpath using external script
classpath_output=$($FWDIR/bin/compute-classpath.sh)
classpath_output=$("$FWDIR"/bin/compute-classpath.sh)
if [[ "$?" != "0" ]]; then
echo "$classpath_output"
exit 1
else
CLASSPATH=$classpath_output
CLASSPATH="$classpath_output"
fi

if [[ "$1" =~ org.apache.spark.tools.* ]]; then
Expand All @@ -153,9 +153,9 @@ if [[ "$1" =~ org.apache.spark.tools.* ]]; then
fi

if $cygwin; then
CLASSPATH=`cygpath -wp $CLASSPATH`
CLASSPATH="`cygpath -wp "$CLASSPATH"`"
if [ "$1" == "org.apache.spark.tools.JavaAPICompletenessChecker" ]; then
export SPARK_TOOLS_JAR=`cygpath -w $SPARK_TOOLS_JAR`
export SPARK_TOOLS_JAR="`cygpath -w "$SPARK_TOOLS_JAR"`"
fi
fi
export CLASSPATH
Expand Down
46 changes: 39 additions & 7 deletions bin/spark-class2.cmd
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ rem See the License for the specific language governing permissions and
rem limitations under the License.
rem

rem Any changes to this file must be reflected in SparkSubmitDriverBootstrapper.scala!

setlocal enabledelayedexpansion

set SCALA_VERSION=2.10
Expand All @@ -38,7 +40,7 @@ if not "x%1"=="x" goto arg_given

if not "x%SPARK_MEM%"=="x" (
echo Warning: SPARK_MEM is deprecated, please use a more specific config option
echo e.g., spark.executor.memory or SPARK_DRIVER_MEMORY.
echo e.g., spark.executor.memory or spark.driver.memory.
)

rem Use SPARK_MEM or 512m as the default memory, to be overridden by specific options
Expand Down Expand Up @@ -67,10 +69,18 @@ rem Executors use SPARK_JAVA_OPTS + SPARK_EXECUTOR_MEMORY.
set OUR_JAVA_OPTS=%SPARK_JAVA_OPTS% %SPARK_EXECUTOR_OPTS%
if not "x%SPARK_EXECUTOR_MEMORY%"=="x" set OUR_JAVA_MEM=%SPARK_EXECUTOR_MEMORY%

rem All drivers use SPARK_JAVA_OPTS + SPARK_DRIVER_MEMORY. The repl also uses SPARK_REPL_OPTS.
) else if "%1"=="org.apache.spark.repl.Main" (
set OUR_JAVA_OPTS=%SPARK_JAVA_OPTS% %SPARK_REPL_OPTS%
rem Spark submit uses SPARK_JAVA_OPTS + SPARK_SUBMIT_OPTS +
rem SPARK_DRIVER_MEMORY + SPARK_SUBMIT_DRIVER_MEMORY.
rem The repl also uses SPARK_REPL_OPTS.
) else if "%1"=="org.apache.spark.deploy.SparkSubmit" (
set OUR_JAVA_OPTS=%SPARK_JAVA_OPTS% %SPARK_SUBMIT_OPTS% %SPARK_REPL_OPTS%
if not "x%SPARK_SUBMIT_LIBRARY_PATH%"=="x" (
set OUR_JAVA_OPTS=!OUR_JAVA_OPTS! -Djava.library.path=%SPARK_SUBMIT_LIBRARY_PATH%
) else if not "x%SPARK_LIBRARY_PATH%"=="x" (
set OUR_JAVA_OPTS=!OUR_JAVA_OPTS! -Djava.library.path=%SPARK_LIBRARY_PATH%
)
if not "x%SPARK_DRIVER_MEMORY%"=="x" set OUR_JAVA_MEM=%SPARK_DRIVER_MEMORY%
if not "x%SPARK_SUBMIT_DRIVER_MEMORY%"=="x" set OUR_JAVA_MEM=%SPARK_SUBMIT_DRIVER_MEMORY%
) else (
set OUR_JAVA_OPTS=%SPARK_JAVA_OPTS%
if not "x%SPARK_DRIVER_MEMORY%"=="x" set OUR_JAVA_MEM=%SPARK_DRIVER_MEMORY%
Expand All @@ -80,9 +90,9 @@ rem Set JAVA_OPTS to be able to load native libraries and to set heap size
for /f "tokens=3" %%i in ('java -version 2^>^&1 ^| find "version"') do set jversion=%%i
for /f "tokens=1 delims=_" %%i in ("%jversion:~1,-1%") do set jversion=%%i
if "%jversion%" geq "1.8.0" (
set JAVA_OPTS=%OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
set JAVA_OPTS=%OUR_JAVA_OPTS% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
) else (
set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
)
rem Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!

Expand Down Expand Up @@ -115,5 +125,27 @@ rem Figure out where java is.
set RUNNER=java
if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java

"%RUNNER%" -cp "%CLASSPATH%" %JAVA_OPTS% %*
rem In Spark submit client mode, the driver is launched in the same JVM as Spark submit itself.
rem Here we must parse the properties file for relevant "spark.driver.*" configs before launching
rem the driver JVM itself. Instead of handling this complexity here, we launch a separate JVM
rem to prepare the launch environment of this driver JVM.

rem In this case, leave out the main class (org.apache.spark.deploy.SparkSubmit) and use our own.
rem Leaving out the first argument is surprisingly difficult to do in Windows. Note that this must
rem be done here because the Windows "shift" command does not work in a conditional block.
set BOOTSTRAP_ARGS=
shift
:start_parse
if "%~1" == "" goto end_parse
set BOOTSTRAP_ARGS=%BOOTSTRAP_ARGS% %~1
shift
goto start_parse
:end_parse

if not [%SPARK_SUBMIT_BOOTSTRAP_DRIVER%] == [] (
set SPARK_CLASS=1
"%RUNNER%" org.apache.spark.deploy.SparkSubmitDriverBootstrapper %BOOTSTRAP_ARGS%
) else (
"%RUNNER%" -cp "%CLASSPATH%" %JAVA_OPTS% %*
)
:exit
Loading

0 comments on commit fbc504a

Please sign in to comment.