Merge remote-tracking branch 'upstream/master' into SPARK-32444

wangyum committed Aug 21, 2020
2 parents 98f7275 + e277ef1, commit baa7796

Showing 1,290 changed files with 151,361 additions and 6,488 deletions.
@@ -1,4 +1,4 @@
-name: master
+name: Build and test

on:
push:
@@ -9,7 +9,6 @@ on:
- master

jobs:
-# TODO(SPARK-32248): Recover JDK 11 builds
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
@@ -27,21 +26,21 @@ jobs:
# Kinesis tests depend on the external Amazon Kinesis service.
# Note that the modules below are from sparktestsupport/modules.py.
modules:
-- |-
+- >-
core, unsafe, kvstore, avro,
network-common, network-shuffle, repl, launcher,
examples, sketch, graphx
-- |-
+- >-
catalyst, hive-thriftserver
-- |-
+- >-
streaming, sql-kafka-0-10, streaming-kafka-0-10,
mllib-local, mllib,
yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
-- |-
+- >-
pyspark-sql, pyspark-mllib, pyspark-resource
-- |-
+- >-
pyspark-core, pyspark-streaming, pyspark-ml
-- |-
+- >-
sparkr
# Here, we split the Hive and SQL tests into a set of slow ones and the rest.
included-tags: [""]
@@ -144,36 +143,52 @@ jobs:
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
-python3.6 -m pip install numpy pyarrow pandas scipy
+python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.6 -m pip list
+# PyPy does not have xmlrunner
pypy3 -m pip install numpy pandas
pypy3 -m pip list
- name: Install Python packages (Python 3.8)
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
-python3.8 -m pip install numpy pyarrow pandas scipy
+python3.8 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.8 -m pip list
# SparkR
-- name: Install R 3.6
-uses: r-lib/actions/setup-r@v1
+- name: Install R 4.0
if: contains(matrix.modules, 'sparkr')
-with:
-r-version: 3.6
+run: |
+sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+sudo apt-get update
+sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
-sudo apt-get install -y libcurl4-openssl-dev
+# qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
+sudo apt-get install -y libcurl4-openssl-dev qpdf
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
- name: Run tests
run: |
# Hive tests become flaky when run in parallel, as they are too resource-intensive.
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark
+- name: Upload test results to report
+if: always()
+uses: actions/upload-artifact@v2
+with:
+name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+path: "**/target/test-reports/*.xml"
+- name: Upload unit tests log files
+if: failure()
+uses: actions/upload-artifact@v2
+with:
+name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+path: "**/target/unit-tests.log"

# Static analysis, and documentation build
lint:
@@ -200,11 +215,15 @@ jobs:
architecture: x64
- name: Install Python linter dependencies
run: |
-pip3 install flake8 sphinx numpy
-- name: Install R 3.6
-uses: r-lib/actions/setup-r@v1
-with:
-r-version: 3.6
+# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+# See also https://github.com/sphinx-doc/sphinx/issues/7551.
+pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
+- name: Install R 4.0
+run: |
+sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+sudo apt-get update
+sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
@@ -218,7 +237,9 @@ jobs:
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
-pip install sphinx mkdocs numpy
+# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+# See also https://github.com/sphinx-doc/sphinx/issues/7551.
+pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
@@ -237,3 +258,28 @@ jobs:
run: |
cd docs
jekyll build
+java11:
+name: Java 11 build
+runs-on: ubuntu-latest
+steps:
+- name: Checkout Spark repository
+uses: actions/checkout@v2
+- name: Cache Maven local repository
+uses: actions/cache@v2
+with:
+path: ~/.m2/repository
+key: java11-maven-${{ hashFiles('**/pom.xml') }}
+restore-keys: |
+java11-maven-
+- name: Install Java 11
+uses: actions/setup-java@v1
+with:
+java-version: 11
+- name: Build with Maven
+run: |
+export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+export MAVEN_CLI_OPTS="--no-transfer-progress"
+mkdir -p ~/.m2
+./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
+rm -rf ~/.m2/repository/org/apache/spark
24 changes: 24 additions & 0 deletions .github/workflows/test_report.yml
@@ -0,0 +1,24 @@
name: Report test results
on:
workflow_run:
workflows: ["Build and test"]
types:
- completed

jobs:
test_report:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
uses: dawidd6/action-download-artifact@v2
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
- name: Publish test report
uses: scacap/action-surefire-report@v1
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
report_paths: "**/target/test-reports/*.xml"
commit: ${{ github.event.workflow_run.head_commit.id }}
1 change: 1 addition & 0 deletions .gitignore
@@ -64,6 +64,7 @@ python/lib/pyspark.zip
python/.eggs/
python/deps
python/docs/_site/
+python/docs/source/reference/api/
python/test_coverage/coverage_data
python/test_coverage/htmlcov
python/pyspark/python
3 changes: 1 addition & 2 deletions LICENSE
@@ -222,8 +222,7 @@ external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaRe
Python Software Foundation License
----------------------------------

-pyspark/heapq3.py
-python/docs/_static/copybutton.js
+python/docs/source/_static/copybutton.js

BSD 3-Clause
------------
6 changes: 0 additions & 6 deletions LICENSE-binary
@@ -557,12 +557,6 @@ jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api
org.glassfish.hk2.external:jakarta.inject


-Python Software Foundation License
-----------------------------------
-
-pyspark/heapq3.py


Public Domain
-------------

2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -23,7 +23,7 @@ Suggests:
testthat,
e1071,
survival,
-arrow (>= 0.15.1)
+arrow (>= 1.0.0)
Collate:
'schema.R'
'generics.R'
9 changes: 2 additions & 7 deletions R/pkg/R/DataFrame.R
@@ -1233,13 +1233,8 @@ setMethod("collect",
port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout)
output <- tryCatch({
doServerAuth(conn, authSecret)
-arrowTable <- arrow::read_arrow(readRaw(conn))
-# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
-if (exists("as_tibble", envir = asNamespace("arrow"))) {
-as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)
-} else {
-as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
-}
+arrowTable <- arrow::read_ipc_stream(readRaw(conn))
+as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
}, finally = {
close(conn)
})
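Context for the change above: `arrow::read_arrow` was removed in favor of `read_ipc_stream` in Arrow 1.0.0, whose result converts directly with `as.data.frame`, so the old `as_tibble` fallback is no longer needed. A minimal sketch of the same round trip on an in-memory buffer, assuming the arrow R package (>= 1.0.0); the buffer name and the use of `mtcars` are illustrative:

library(arrow)

# Serialize a data.frame to Arrow IPC stream bytes, then read it back:
# the same shape of round trip that collect() performs on the socket payload.
buf <- write_to_raw(mtcars, format = "stream")            # raw vector of IPC bytes
arrowTable <- read_ipc_stream(buf, as_data_frame = FALSE) # arrow Table
df <- as.data.frame(arrowTable)
stopifnot(nrow(df) == nrow(mtcars))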
13 changes: 1 addition & 12 deletions R/pkg/R/deserialize.R
@@ -233,24 +233,13 @@ readMultipleObjectsWithKeys <- function(inputCon) {

readDeserializeInArrow <- function(inputCon) {
if (requireNamespace("arrow", quietly = TRUE)) {
-# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
-useAsTibble <- exists("as_tibble", envir = asNamespace("arrow"))
-
-
# Currently, there appears to be no way to read batch by batch over a socket
# connection on the R side, see ARROW-4512. Therefore, it reads the whole Arrow
# streaming-formatted binary at once for now.
dataLen <- readInt(inputCon)
arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big")
batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches()

-if (useAsTibble) {
-as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
-# Read all grouped batches. Tibble -> data.frame is cheap.
-lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
-} else {
-lapply(batches, function(batch) as.data.frame(batch))
-}
+lapply(batches, function(batch) as.data.frame(batch))
} else {
stop("'arrow' package should be installed.")
}
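As the comment in the hunk above notes, ARROW-4512 prevents batch-by-batch reads from a socket on the R side, so the whole stream is buffered and then split into record batches. A hedged sketch of that pattern against an in-memory stream, assuming the arrow R package; the sample data frame is illustrative:

library(arrow)

# Materialize a whole Arrow stream, then convert its record batches one by
# one, as readDeserializeInArrow() does after reading the full payload.
buf <- write_to_raw(data.frame(x = 1:6, y = letters[1:6]), format = "stream")
batches <- RecordBatchStreamReader$create(buf)$batches()
dfs <- lapply(batches, function(batch) as.data.frame(batch))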
2 changes: 1 addition & 1 deletion R/pkg/R/functions.R
@@ -2286,7 +2286,7 @@ setMethod("pmod", signature(y = "Column"),
column(jc)
})

-#' @param rsd maximum estimation error allowed (default = 0.05).
+#' @param rsd maximum relative standard deviation allowed (default = 0.05).
#'
#' @rdname column_aggregate_functions
#' @aliases approx_count_distinct,Column-method
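To make the corrected `rsd` doc concrete: `rsd` bounds the relative standard deviation of the HyperLogLog++ estimate behind `approx_count_distinct`, so a smaller value trades memory for accuracy. A hypothetical SparkR session; the data is illustrative:

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(x = c(1, 1, 2, 3, 3, 3)))
# rsd = 0.01 is tighter than the 0.05 default: more accurate, more memory.
head(select(df, approx_count_distinct(df$x, rsd = 0.01)))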
4 changes: 4 additions & 0 deletions R/pkg/R/utils.R
@@ -376,6 +376,7 @@ varargsToStrEnv <- function(...) {

getStorageLevel <- function(newLevel = c("DISK_ONLY",
"DISK_ONLY_2",
"DISK_ONLY_3",
"MEMORY_AND_DISK",
"MEMORY_AND_DISK_2",
"MEMORY_AND_DISK_SER",
@@ -390,6 +391,7 @@ getStorageLevel <- function(newLevel = c("DISK_ONLY",
storageLevel <- switch(newLevel,
"DISK_ONLY" = callJStatic(storageLevelClass, "DISK_ONLY"),
"DISK_ONLY_2" = callJStatic(storageLevelClass, "DISK_ONLY_2"),
"DISK_ONLY_3" = callJStatic(storageLevelClass, "DISK_ONLY_3"),
"MEMORY_AND_DISK" = callJStatic(storageLevelClass, "MEMORY_AND_DISK"),
"MEMORY_AND_DISK_2" = callJStatic(storageLevelClass, "MEMORY_AND_DISK_2"),
"MEMORY_AND_DISK_SER" = callJStatic(storageLevelClass,
@@ -415,6 +417,8 @@ storageLevelToString <- function(levelObj) {
"DISK_ONLY"
} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 2) {
"DISK_ONLY_2"
+} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 3) {
+"DISK_ONLY_3"
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 1) {
"MEMORY_ONLY"
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 2) {
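`DISK_ONLY_3` is the new disk-only storage level with three replicas, now mapped in both directions by `getStorageLevel()` and `storageLevelToString()`. A sketch of requesting it from SparkR, assuming an active session; the data frame is illustrative:

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(x = 1:10))
# The string is resolved by getStorageLevel() to StorageLevel.DISK_ONLY_3
# on the JVM side.
df <- persist(df, "DISK_ONLY_3")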
18 changes: 18 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -312,4 +312,22 @@ test_that("Arrow optimization - unsupported types", {
})
})

test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", {
skip_if_not_installed("arrow")
df <- createDataFrame(list(list(a = 1L, b = "a")))

conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]

callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
expect_error(
count(gapply(df, "a", function(key, group) { group }, structType("a int, b int"))),
"expected IntegerType, IntegerType, got IntegerType, StringType")
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})

sparkR.session.stop()
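For contrast with the mismatch the new test provokes, a sketch of a `gapply()` call whose declared schema matches what the function returns, assuming an active SparkR session with Arrow optimization enabled:

df <- createDataFrame(list(list(a = 1L, b = "a")))
# Declared output schema (a int, b string) matches the returned group as-is.
collected <- collect(gapply(df, "a",
                            function(key, group) { group },
                            structType("a int, b string")))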
9 changes: 6 additions & 3 deletions R/pkg/tests/run-all.R
@@ -61,15 +61,18 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
set.seed(42)

# TODO (SPARK-30663) To be removed once testthat 1.x is removed from all builds
if (grepl("^1\\..*", packageVersion("testthat"))) {
if (packageVersion("testthat")$major <= 1) {
# testthat 1.x
test_runner <- testthat:::run_tests
reporter <- "summary"

} else {
# testthat >= 2.0.0
test_runner <- testthat:::test_package_dir
-reporter <- testthat::default_reporter()
+dir.create("target/test-reports", showWarnings = FALSE)
+reporter <- MultiReporter$new(list(
+SummaryReporter$new(),
+JunitReporter$new(file = "target/test-reports/test-results.xml")
+))
}

test_runner("SparkR",
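The reporter change above is what produces the JUnit XML that the new `test_report.yml` workflow later publishes. A minimal sketch of the same combination on a single test file, assuming testthat >= 2.0.0 plus the xml2 package (required by `JunitReporter`); the test file path is hypothetical:

library(testthat)

dir.create("target/test-reports", showWarnings = FALSE, recursive = TRUE)
# Console summary plus JUnit XML: the combination run-all.R now builds.
reporter <- MultiReporter$new(list(
  SummaryReporter$new(),
  JunitReporter$new(file = "target/test-reports/demo-results.xml")
))
test_file("tests/testthat/test_basic.R", reporter = reporter)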
4 changes: 2 additions & 2 deletions appveyor.yml
@@ -41,8 +41,8 @@ cache:
install:
# Install maven and dependencies
- ps: .\dev\appveyor-install-dependencies.ps1
-# Required package for R unit tests
-- cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
+# Required package for R unit tests. xml2 is required to use the JUnit reporter in testthat.
+- cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"
- cmd: Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]"

build_script:
4 changes: 2 additions & 2 deletions bin/find-spark-home
@@ -33,9 +33,9 @@ elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then
export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)"
else
# We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
-# Default to standard python interpreter unless told otherwise
+# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
-PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
+PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python3"}"
fi
export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")
fi
4 changes: 2 additions & 2 deletions bin/find-spark-home.cmd
@@ -20,8 +20,8 @@ rem
rem Path to Python script finding SPARK_HOME
set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py

-rem Default to standard python interpreter unless told otherwise
-set PYTHON_RUNNER=python
+rem Default to standard python3 interpreter unless told otherwise
+set PYTHON_RUNNER=python3
rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version
if not "x%PYSPARK_DRIVER_PYTHON%"=="x" (
set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON%
