Merge branch 'master' into groupby
Conflicts:
	python/pyspark/rdd.py
	python/pyspark/shuffle.py
	python/pyspark/tests.py
davies committed Aug 27, 2014
2 parents 85138e6 + 73b3089 commit b48cda5
Showing 139 changed files with 3,729 additions and 908 deletions.
9 changes: 7 additions & 2 deletions README.md
@@ -69,7 +69,7 @@ Many of the example programs print usage help if no params are given.
Testing first requires [building Spark](#building-spark). Once Spark is built, tests
can be run using:

-./sbt/sbt test
+./dev/run-tests

## A Note About Hadoop Versions

@@ -118,7 +118,10 @@ If your project is built with Maven, add this to your POM file's `<dependencies>
## A Note About Thrift JDBC server and CLI for Spark SQL

Spark SQL supports Thrift JDBC server and CLI.
-See sql-programming-guide.md for more information about using the JDBC server.
+See sql-programming-guide.md for more information about using the JDBC server and CLI.
+You can use those features by setting `-Phive` when building Spark as follows.
+
+$ sbt/sbt -Phive assembly

## Configuration

@@ -136,3 +139,5 @@ submitting any copyrighted material via pull request, email, or other means
you agree to license the material under the project's open source license and
warrant that you have the legal authority to do so.

+Please see [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark)
+for more information.
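
Taken together, the README changes above describe a simple build-and-test flow. A brief sketch using only commands that appear in this diff (paths relative to the Spark source root):

    # Build Spark with the Thrift JDBC server and SQL CLI enabled
    sbt/sbt -Phive assembly

    # Run the test suite through the new entry point
    ./dev/run-tests

    # Once built with -Phive, the SQL CLI script changed later in this commit is usable
    ./bin/spark-sql --help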
6 changes: 3 additions & 3 deletions bin/load-spark-env.sh
@@ -27,12 +27,12 @@ if [ -z "$SPARK_ENV_LOADED" ]; then
# Returns the parent of the directory this script lives in.
parent_dir="$(cd `dirname $0`/..; pwd)"

-use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}
+user_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}

-if [ -f "${use_conf_dir}/spark-env.sh" ]; then
+if [ -f "${user_conf_dir}/spark-env.sh" ]; then
# Promote all variable declarations to environment (exported) variables
set -a
. "${use_conf_dir}/spark-env.sh"
. "${user_conf_dir}/spark-env.sh"
set +a
fi
fi
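
The change above only renames the variable; the surrounding `set -a` / `set +a` idiom is what turns every variable defined in spark-env.sh into an exported environment variable. A minimal standalone sketch of that idiom (the file path and variable name are illustrative, not from the diff):

    # Hypothetical config file defining a plain shell variable, no `export` keyword
    echo 'GREETING="hello"' > /tmp/example-env.sh

    set -a                     # auto-export every variable assigned from here on
    . /tmp/example-env.sh      # GREETING is now an environment variable
    set +a                     # restore normal assignment semantics

    bash -c 'echo $GREETING'   # a child process sees it: prints "hello"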
7 changes: 6 additions & 1 deletion bin/spark-class
@@ -105,9 +105,14 @@ else
exit 1
fi
fi
+JAVA_VERSION=$($RUNNER -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')

# Set JAVA_OPTS to be able to load native libraries and to set heap size
-JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
+if [ "$JAVA_VERSION" -ge 18 ]; then
+JAVA_OPTS="$OUR_JAVA_OPTS"
+else
+JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
+fi
JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"

# Load extra JAVA_OPTS from conf/java-opts, if it exists
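
The sed expression added above collapses the first line of `java -version` output into a two-digit major+minor number, so the script can skip `-XX:MaxPermSize` on Java 8, where the permanent generation no longer exists. A standalone sketch of the same parsing, using an illustrative version string rather than a live JVM:

    # Simulate the first line of `java -version 2>&1`
    sample='java version "1.8.0_20"'

    # Same sed program as in spark-class: keep major+minor digits, quit after line 1
    version=$(echo "$sample" | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')
    echo "$version"            # prints 18

    if [ "$version" -ge 18 ]; then
      echo "Java 8+: PermGen is gone, no -XX:MaxPermSize needed"
    else
      echo "Java 7 or earlier: keep -XX:MaxPermSize=128m"
    fi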
8 changes: 7 additions & 1 deletion bin/spark-class2.cmd
@@ -77,7 +77,13 @@ rem All drivers use SPARK_JAVA_OPTS + SPARK_DRIVER_MEMORY. The repl also uses SP
)

rem Set JAVA_OPTS to be able to load native libraries and to set heap size
-set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+for /f "tokens=3" %%i in ('java -version 2^>^&1 ^| find "version"') do set jversion=%%i
+for /f "tokens=1 delims=_" %%i in ("%jversion:~1,-1%") do set jversion=%%i
+if "%jversion%" geq "1.8.0" (
+set JAVA_OPTS=%OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+) else (
+set JAVA_OPTS=-XX:MaxPermSize=128m %OUR_JAVA_OPTS% -Djava.library.path=%SPARK_LIBRARY_PATH% -Xms%OUR_JAVA_MEM% -Xmx%OUR_JAVA_MEM%
+)
rem Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!

rem Test whether the user has built Spark
36 changes: 18 additions & 18 deletions bin/spark-shell
@@ -22,7 +22,7 @@

cygwin=false
case "`uname`" in
CYGWIN*) cygwin=true;;
CYGWIN*) cygwin=true;;
esac

# Enter posix mode for bash
@@ -32,9 +32,9 @@ set -o posix
FWDIR="$(cd `dirname $0`/..; pwd)"

function usage() {
echo "Usage: ./bin/spark-shell [options]"
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
echo "Usage: ./bin/spark-shell [options]"
$FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
exit 0
}

if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
@@ -46,20 +46,20 @@ SUBMIT_USAGE_FUNCTION=usage
gatherSparkSubmitOpts "$@"

function main() {
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
# If you're using the Mintty terminal emulator in Cygwin, may need to set the
# "Backspace sends ^H" setting in "Keys" section of the Mintty options
# (see https://github.com/sbt/sbt/issues/562).
stty -icanon min 1 -echo > /dev/null 2>&1
export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
stty icanon echo > /dev/null 2>&1
else
export SPARK_SUBMIT_OPTS
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
fi
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
# If you're using the Mintty terminal emulator in Cygwin, may need to set the
# "Backspace sends ^H" setting in "Keys" section of the Mintty options
# (see https://github.com/sbt/sbt/issues/562).
stty -icanon min 1 -echo > /dev/null 2>&1
export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
stty icanon echo > /dev/null 2>&1
else
export SPARK_SUBMIT_OPTS
$FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
fi
}

# Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
55 changes: 13 additions & 42 deletions bin/spark-sql
@@ -24,6 +24,7 @@
set -o posix

CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver"
+CLASS_NOT_FOUND_EXIT_STATUS=1

# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"
@@ -43,52 +44,22 @@ function usage {
$FWDIR/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2
}

-function ensure_arg_number {
-arg_number=$1
-at_least=$2
-
-if [[ $arg_number -lt $at_least ]]; then
-usage
-exit 1
-fi
-}
-
-if [[ "$@" = --help ]] || [[ "$@" = -h ]]; then
+if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
usage
exit 0
fi

-CLI_ARGS=()
-SUBMISSION_ARGS=()
-
-while (($#)); do
-case $1 in
--d | --define | --database | -f | -h | --hiveconf | --hivevar | -i | -p)
-ensure_arg_number $# 2
-CLI_ARGS+=("$1"); shift
-CLI_ARGS+=("$1"); shift
-;;
+source $FWDIR/bin/utils.sh
+SUBMIT_USAGE_FUNCTION=usage
+gatherSparkSubmitOpts "$@"

--e)
-ensure_arg_number $# 2
-CLI_ARGS+=("$1"); shift
-CLI_ARGS+=("$1"); shift
-;;
+"$FWDIR"/bin/spark-submit --class $CLASS "${SUBMISSION_OPTS[@]}" spark-internal "${APPLICATION_OPTS[@]}"
+exit_status=$?

--s | --silent)
-CLI_ARGS+=("$1"); shift
-;;
-
--v | --verbose)
-# Both SparkSubmit and SparkSQLCLIDriver recognizes -v | --verbose
-CLI_ARGS+=("$1")
-SUBMISSION_ARGS+=("$1"); shift
-;;
-
-*)
-SUBMISSION_ARGS+=("$1"); shift
-;;
-esac
-done
+if [[ exit_status -eq CLASS_NOT_FOUND_EXIT_STATUS ]]; then
+echo
+echo "Failed to load Spark SQL CLI main class $CLASS."
+echo "You need to build Spark with -Phive."
+fi

exec "$FWDIR"/bin/spark-submit --class $CLASS "${SUBMISSION_ARGS[@]}" spark-internal "${CLI_ARGS[@]}"
exit $exit_status
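
The rewritten bin/spark-sql delegates option splitting to `gatherSparkSubmitOpts` from bin/utils.sh and, rather than exec'ing spark-submit, captures its exit status so it can print a build hint when the Hive CLI class is missing. A minimal sketch of that capture-and-hint pattern in isolation (the `false` command stands in for the spark-submit invocation and is purely illustrative):

    CLASS_NOT_FOUND_EXIT_STATUS=1

    false                      # placeholder for: spark-submit --class $CLASS ...
    exit_status=$?

    if [[ $exit_status -eq $CLASS_NOT_FOUND_EXIT_STATUS ]]; then
      echo "Failed to load Spark SQL CLI main class."
      echo "You need to build Spark with -Phive."
    fi

    exit $exit_status          # propagate the original status to the caller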