Merge branch 'master' into ansi_offset-clause
beliefer authored Sep 27, 2020
2 parents 2820b6c + 0c38765 commit bbe10d6
Showing 1,656 changed files with 174,991 additions and 17,577 deletions.
109 changes: 83 additions & 26 deletions .github/workflows/master.yml → .github/workflows/build_and_test.yml
@@ -1,4 +1,4 @@
name: master
name: Build and test

on:
push:
@@ -7,9 +7,13 @@ on:
pull_request:
branches:
- master
workflow_dispatch:
inputs:
target:
description: 'Target branch to run'
required: true

jobs:
# TODO(SPARK-32248): Recover JDK 11 builds
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
@@ -27,21 +31,21 @@ jobs:
# Kinesis tests depend on the external Amazon Kinesis service.
# Note that the modules below are from sparktestsupport/modules.py.
modules:
- |-
- >-
core, unsafe, kvstore, avro,
network-common, network-shuffle, repl, launcher,
examples, sketch, graphx
- |-
- >-
catalyst, hive-thriftserver
- |-
- >-
streaming, sql-kafka-0-10, streaming-kafka-0-10,
mllib-local, mllib,
yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
- |-
- >-
pyspark-sql, pyspark-mllib, pyspark-resource
- |-
- >-
pyspark-core, pyspark-streaming, pyspark-ml
- |-
- >-
sparkr
# Here, we split the Hive and SQL tests into the slow ones and the rest.
included-tags: [""]
@@ -83,18 +87,26 @@ jobs:
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
- name: Merge dispatched input branch
if: ${{ github.event.inputs.target != '' }}
run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
uses: actions/cache@v2
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
path: |
build/apache-maven-*
build/zinc-*
build/scala-*
build/*.jar
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
@@ -108,7 +120,7 @@ jobs:
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
@@ -144,36 +156,52 @@ jobs:
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
python3.6 -m pip install numpy pyarrow pandas scipy
python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.6 -m pip list
# PyPy does not have xmlrunner
pypy3 -m pip install numpy pandas
pypy3 -m pip list
- name: Install Python packages (Python 3.8)
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
python3.8 -m pip install numpy pyarrow pandas scipy
python3.8 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.8 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
- name: Install R 4.0
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
# qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
sudo apt-get install -y libcurl4-openssl-dev qpdf
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
- name: Run tests
run: |
# Hive tests become flaky when run in parallel, as they are too resource-intensive.
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"

# Static analysis, and documentation build
lint:
@@ -202,11 +230,13 @@ jobs:
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx
- name: Install R 4.0
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
@@ -219,10 +249,11 @@ jobs:
ruby-version: 2.7
- name: Install dependencies for documentation generation
run: |
# pandoc is also required by nbsphinx to generate the PySpark API docs.
sudo apt-get install -y libcurl4-openssl-dev pandoc
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
@@ -266,3 +297,29 @@ jobs:
mkdir -p ~/.m2
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
rm -rf ~/.m2/repository/org/apache/spark
scala-213:
name: Scala 2.13 build
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: scala-213-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
scala-213-maven-
- name: Install Java 11
uses: actions/setup-java@v1
with:
java-version: 11
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
mkdir -p ~/.m2
./dev/change-scala-version.sh 2.13
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 -Pscala-2.13 install
rm -rf ~/.m2/repository/org/apache/spark
24 changes: 24 additions & 0 deletions .github/workflows/test_report.yml
@@ -0,0 +1,24 @@
name: Report test results
on:
workflow_run:
workflows: ["Build and test"]
types:
- completed

jobs:
test_report:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
uses: dawidd6/action-download-artifact@v2
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
- name: Publish test report
uses: scacap/action-surefire-report@v1
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
report_paths: "**/target/test-reports/*.xml"
commit: ${{ github.event.workflow_run.head_commit.id }}
1 change: 1 addition & 0 deletions .gitignore
@@ -80,6 +80,7 @@ target/
unit-tests.log
work/
docs/.jekyll-metadata
docs/.jekyll-cache

# For Hive
TempStatsStore/
2 changes: 2 additions & 0 deletions R/pkg/NAMESPACE
@@ -405,6 +405,7 @@ exportMethods("%<=>%",
"sumDistinct",
"tan",
"tanh",
"timestamp_seconds",
"toDegrees",
"toRadians",
"to_csv",
@@ -428,6 +429,7 @@ exportMethods("%<=>%",
"weekofyear",
"when",
"window",
"withField",
"xxhash64",
"year")

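
The NAMESPACE change above exports two new column functions, timestamp_seconds and withField. As a hypothetical illustration (not part of this commit), assuming timestamp_seconds mirrors the Scala function of the same name and converts seconds since the Unix epoch into a timestamp column, usage would look roughly like this:

library(SparkR)
sparkR.session()

# Hypothetical sketch: convert epoch seconds into a timestamp column.
df <- createDataFrame(data.frame(unix_time = c(1460041200, 1609459200)))
head(select(df, timestamp_seconds(df$unix_time)))
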
24 changes: 15 additions & 9 deletions R/pkg/R/DataFrame.R
@@ -1234,13 +1234,9 @@ setMethod("collect",
output <- tryCatch({
doServerAuth(conn, authSecret)
arrowTable <- arrow::read_ipc_stream(readRaw(conn))
# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
if (exists("as_tibble", envir = asNamespace("arrow"))) {
as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)
} else {
as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
}
}, finally = {
as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
},
finally = {
close(conn)
})
return(output)
@@ -2867,11 +2863,18 @@ setMethod("unionAll",
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account. Input SparkDataFrames can have different data types in the schema.
#'
#' When the parameter allowMissingColumns is `TRUE`, the set of column names
#' in x and y can differ; missing columns will be filled with null.
#' Further, the missing columns of x will be added at the end of
#' the schema of the union result.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @param allowMissingColumns logical
#' @param ... further arguments to be passed to or from other methods.
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
@@ -2884,12 +2887,15 @@ setMethod("unionAll",
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#'
#' df3 <- select(createDataFrame(mtcars), "carb")
#' head(unionByName(df1, df3, allowMissingColumns = TRUE))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
function(x, y, allowMissingColumns=FALSE) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf, allowMissingColumns)
dataFrame(unioned)
})

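
The unionByName change above adds an allowMissingColumns parameter. A minimal sketch of the documented behaviour, reusing the mtcars-based frames from the roxygen example (illustrative only, not additional commit content): rows from the frame that lacks a column are filled with null, and the result keeps x's columns first.

library(SparkR)
sparkR.session()

df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
df3 <- select(createDataFrame(mtcars), "carb")

# Rows coming from df3 get null for "am" and "gear"; the union keeps
# df1's schema order, so the result columns are carb, am, gear.
result <- unionByName(df1, df3, allowMissingColumns = TRUE)
printSchema(result)
head(result)
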
31 changes: 31 additions & 0 deletions R/pkg/R/column.R
@@ -356,3 +356,34 @@ setMethod("%<=>%",
#' }
#' @note ! since 2.3.0
setMethod("!", signature(x = "Column"), function(x) not(x))

#' withField
#'
#' Adds/replaces field in a struct \code{Column} by name.
#'
#' @param x a Column
#' @param fieldName a character
#' @param col a Column expression
#'
#' @rdname withField
#' @aliases withField withField,Column-method
#' @examples
#' \dontrun{
#' df <- withColumn(
#' createDataFrame(iris),
#' "sepal",
#' struct(column("Sepal_Width"), column("Sepal_Length"))
#' )
#'
#' head(select(
#' df,
#' withField(df$sepal, "product", df$Sepal_Length * df$Sepal_Width)
#' ))
#' }
#' @note withField since 3.1.0
setMethod("withField",
signature(x = "Column", fieldName = "character", col = "Column"),
function(x, fieldName, col) {
jc <- callJMethod(x@jc, "withField", fieldName, col@jc)
column(jc)
})
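
The roxygen example above adds a new "product" field to the struct; replacing an existing field uses exactly the same call. A small sketch reusing the iris-based struct from that example (the rounding is illustrative only, not part of the commit):

library(SparkR)
sparkR.session()

df <- withColumn(
  createDataFrame(iris),
  "sepal",
  struct(column("Sepal_Width"), column("Sepal_Length"))
)

# Overwrite the existing Sepal_Width field inside the "sepal" struct;
# adding and replacing a field use the same withField call.
head(select(df, withField(df$sepal, "Sepal_Width", round(df$Sepal_Width))))
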
13 changes: 1 addition & 12 deletions R/pkg/R/deserialize.R
@@ -233,24 +233,13 @@ readMultipleObjectsWithKeys <- function(inputCon) {

readDeserializeInArrow <- function(inputCon) {
if (requireNamespace("arrow", quietly = TRUE)) {
# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
useAsTibble <- exists("as_tibble", envir = asNamespace("arrow"))


# Currently, there seems to be no way to read batch by batch over a socket connection
# on the R side; see ARROW-4512. Therefore, it reads the whole Arrow streaming-formatted
# binary at once for now.
dataLen <- readInt(inputCon)
arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big")
batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches()

if (useAsTibble) {
as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
# Read all groupped batches. Tibble -> data.frame is cheap.
lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
} else {
lapply(batches, function(batch) as.data.frame(batch))
}
lapply(batches, function(batch) as.data.frame(batch))
} else {
stop("'arrow' package should be installed.")
}
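
For context on the deserialize.R simplification above: since Arrow 0.14 a RecordBatch (and Table) can be passed to as.data.frame() directly, so the as_tibble fallback is no longer needed. A standalone sketch outside Spark (assuming a recent arrow release that provides write_ipc_stream):

library(arrow)

# Round-trip a data.frame through the Arrow IPC streaming format and
# convert each record batch back with as.data.frame(), mirroring the
# simplified readDeserializeInArrow() path.
tmp <- tempfile()
write_ipc_stream(data.frame(x = 1:3, y = letters[1:3]), tmp)

arrowData <- readBin(tmp, raw(), file.info(tmp)$size)
batches <- RecordBatchStreamReader$create(arrowData)$batches()
dfs <- lapply(batches, function(batch) as.data.frame(batch))
dfs[[1]]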