Merge remote-tracking branch 'upstream/master' into SPARK-32444

wangyum committed Aug 21, 2020
2 parents 98f7275 + e277ef1, commit baa7796

Showing 1,290 changed files with 151,361 additions and 6,488 deletions.
@@ -1,4 +1,4 @@
-name: master
+name: Build and test

on:
push:
@@ -9,7 +9,6 @@ on:
- master

jobs:
-# TODO(SPARK-32248): Recover JDK 11 builds
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
@@ -27,21 +26,21 @@ jobs:
# Kinesis tests depend on the external Amazon Kinesis service.
# Note that the modules below are from sparktestsupport/modules.py.
modules:
-- |-
+- >-
core, unsafe, kvstore, avro,
network-common, network-shuffle, repl, launcher,
examples, sketch, graphx
-- |-
+- >-
catalyst, hive-thriftserver
-- |-
+- >-
streaming, sql-kafka-0-10, streaming-kafka-0-10,
mllib-local, mllib,
yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
-- |-
+- >-
pyspark-sql, pyspark-mllib, pyspark-resource
-- |-
+- >-
pyspark-core, pyspark-streaming, pyspark-ml
-- |-
+- >-
sparkr
# Here, we split the Hive and SQL tests into a set of slow ones and the rest.
included-tags: [""]
@@ -144,36 +143,52 @@ jobs:
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
-python3.6 -m pip install numpy pyarrow pandas scipy
+python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.6 -m pip list
+# PyPy does not have xmlrunner
pypy3 -m pip install numpy pandas
pypy3 -m pip list
- name: Install Python packages (Python 3.8)
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
-python3.8 -m pip install numpy pyarrow pandas scipy
+python3.8 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.8 -m pip list
# SparkR
-- name: Install R 3.6
-uses: r-lib/actions/setup-r@v1
+- name: Install R 4.0
if: contains(matrix.modules, 'sparkr')
-with:
-r-version: 3.6
+run: |
+sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+sudo apt-get update
+sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
-sudo apt-get install -y libcurl4-openssl-dev
+# qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
+sudo apt-get install -y libcurl4-openssl-dev qpdf
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
- name: Run tests
run: |
# Hive tests become flaky when run in parallel, as they are too resource-intensive.
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark
+- name: Upload test results to report
+if: always()
+uses: actions/upload-artifact@v2
+with:
+name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+path: "**/target/test-reports/*.xml"
+- name: Upload unit tests log files
+if: failure()
+uses: actions/upload-artifact@v2
+with:
+name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
+path: "**/target/unit-tests.log"

# Static analysis, and documentation build
lint:
@@ -200,11 +215,15 @@ jobs:
architecture: x64
- name: Install Python linter dependencies
run: |
-pip3 install flake8 sphinx numpy
-- name: Install R 3.6
-uses: r-lib/actions/setup-r@v1
-with:
-r-version: 3.6
+# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+# See also https://github.com/sphinx-doc/sphinx/issues/7551.
+pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
+- name: Install R 4.0
+run: |
+sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
+curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
+sudo apt-get update
+sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
@@ -218,7 +237,9 @@ jobs:
- name: Install dependencies for documentation generation
run: |
sudo apt-get install -y libcurl4-openssl-dev pandoc
-pip install sphinx mkdocs numpy
+# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
+# See also https://github.com/sphinx-doc/sphinx/issues/7551.
+pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
@@ -237,3 +258,28 @@ jobs:
run: |
cd docs
jekyll build
+java11:
+name: Java 11 build
+runs-on: ubuntu-latest
+steps:
+- name: Checkout Spark repository
+uses: actions/checkout@v2
+- name: Cache Maven local repository
+uses: actions/cache@v2
+with:
+path: ~/.m2/repository
+key: java11-maven-${{ hashFiles('**/pom.xml') }}
+restore-keys: |
+java11-maven-
+- name: Install Java 11
+uses: actions/setup-java@v1
+with:
+java-version: 11
+- name: Build with Maven
+run: |
+export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+export MAVEN_CLI_OPTS="--no-transfer-progress"
+mkdir -p ~/.m2
+./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
+rm -rf ~/.m2/repository/org/apache/spark
24 changes: 24 additions & 0 deletions .github/workflows/test_report.yml
@@ -0,0 +1,24 @@
name: Report test results
on:
workflow_run:
workflows: ["Build and test"]
types:
- completed

jobs:
test_report:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
uses: dawidd6/action-download-artifact@v2
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
- name: Publish test report
uses: scacap/action-surefire-report@v1
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
report_paths: "**/target/test-reports/*.xml"
commit: ${{ github.event.workflow_run.head_commit.id }}
1 change: 1 addition & 0 deletions .gitignore
@@ -64,6 +64,7 @@ python/lib/pyspark.zip
python/.eggs/
python/deps
python/docs/_site/
+python/docs/source/reference/api/
python/test_coverage/coverage_data
python/test_coverage/htmlcov
python/pyspark/python
3 changes: 1 addition & 2 deletions LICENSE
@@ -222,8 +222,7 @@ external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaRe
Python Software Foundation License
----------------------------------

-pyspark/heapq3.py
-python/docs/_static/copybutton.js
+python/docs/source/_static/copybutton.js

BSD 3-Clause
------------
6 changes: 0 additions & 6 deletions LICENSE-binary
@@ -557,12 +557,6 @@ jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api
org.glassfish.hk2.external:jakarta.inject


-Python Software Foundation License
-----------------------------------
-
-pyspark/heapq3.py


Public Domain
-------------

2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -23,7 +23,7 @@ Suggests:
testthat,
e1071,
survival,
-arrow (>= 0.15.1)
+arrow (>= 1.0.0)
Collate:
'schema.R'
'generics.R'
9 changes: 2 additions & 7 deletions R/pkg/R/DataFrame.R
@@ -1233,13 +1233,8 @@ setMethod("collect",
port = port, blocking = TRUE, open = "wb", timeout = connectionTimeout)
output <- tryCatch({
doServerAuth(conn, authSecret)
-arrowTable <- arrow::read_arrow(readRaw(conn))
-# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
-if (exists("as_tibble", envir = asNamespace("arrow"))) {
-as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)
-} else {
-as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
-}
+arrowTable <- arrow::read_ipc_stream(readRaw(conn))
+as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
}, finally = {
close(conn)
})
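Context for the change above: `arrow::read_arrow` was removed in favor of `read_ipc_stream` in Arrow 1.0.0, whose result converts directly with `as.data.frame`, so the old `as_tibble` fallback is no longer needed. A minimal sketch of the same round trip on an in-memory buffer, assuming the arrow R package (>= 1.0.0); the buffer name and the use of `mtcars` are illustrative:

library(arrow)

# Serialize a data.frame to Arrow IPC stream bytes, then read it back:
# the same shape of round trip that collect() performs on the socket payload.
buf <- write_to_raw(mtcars, format = "stream")            # raw vector of IPC bytes
arrowTable <- read_ipc_stream(buf, as_data_frame = FALSE) # arrow Table
df <- as.data.frame(arrowTable)
stopifnot(nrow(df) == nrow(mtcars))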
13 changes: 1 addition & 12 deletions R/pkg/R/deserialize.R
@@ -233,24 +233,13 @@ readMultipleObjectsWithKeys <- function(inputCon) {

readDeserializeInArrow <- function(inputCon) {
if (requireNamespace("arrow", quietly = TRUE)) {
-# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
-useAsTibble <- exists("as_tibble", envir = asNamespace("arrow"))
-
-
# Currently, there appears to be no way to read batch by batch over a socket
# connection on the R side, see ARROW-4512. Therefore, it reads the whole Arrow
# streaming-formatted binary at once for now.
dataLen <- readInt(inputCon)
arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big")
batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches()

-if (useAsTibble) {
-as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
-# Read all grouped batches. Tibble -> data.frame is cheap.
-lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
-} else {
-lapply(batches, function(batch) as.data.frame(batch))
-}
+lapply(batches, function(batch) as.data.frame(batch))
} else {
stop("'arrow' package should be installed.")
}
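As the comment in the hunk above notes, ARROW-4512 prevents batch-by-batch reads from a socket on the R side, so the whole stream is buffered and then split into record batches. A hedged sketch of that pattern against an in-memory stream, assuming the arrow R package; the sample data frame is illustrative:

library(arrow)

# Materialize a whole Arrow stream, then convert its record batches one by
# one, as readDeserializeInArrow() does after reading the full payload.
buf <- write_to_raw(data.frame(x = 1:6, y = letters[1:6]), format = "stream")
batches <- RecordBatchStreamReader$create(buf)$batches()
dfs <- lapply(batches, function(batch) as.data.frame(batch))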
2 changes: 1 addition & 1 deletion R/pkg/R/functions.R
@@ -2286,7 +2286,7 @@ setMethod("pmod", signature(y = "Column"),
column(jc)
})

-#' @param rsd maximum estimation error allowed (default = 0.05).
+#' @param rsd maximum relative standard deviation allowed (default = 0.05).
#'
#' @rdname column_aggregate_functions
#' @aliases approx_count_distinct,Column-method
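To make the corrected `rsd` doc concrete: `rsd` bounds the relative standard deviation of the HyperLogLog++ estimate behind `approx_count_distinct`, so a smaller value trades memory for accuracy. A hypothetical SparkR session; the data is illustrative:

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(x = c(1, 1, 2, 3, 3, 3)))
# rsd = 0.01 is tighter than the 0.05 default: more accurate, more memory.
head(select(df, approx_count_distinct(df$x, rsd = 0.01)))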
4 changes: 4 additions & 0 deletions R/pkg/R/utils.R
@@ -376,6 +376,7 @@ varargsToStrEnv <- function(...) {

getStorageLevel <- function(newLevel = c("DISK_ONLY",
"DISK_ONLY_2",
"DISK_ONLY_3",
"MEMORY_AND_DISK",
"MEMORY_AND_DISK_2",
"MEMORY_AND_DISK_SER",
@@ -390,6 +391,7 @@ getStorageLevel <- function(newLevel = c("DISK_ONLY",
storageLevel <- switch(newLevel,
"DISK_ONLY" = callJStatic(storageLevelClass, "DISK_ONLY"),
"DISK_ONLY_2" = callJStatic(storageLevelClass, "DISK_ONLY_2"),
"DISK_ONLY_3" = callJStatic(storageLevelClass, "DISK_ONLY_3"),
"MEMORY_AND_DISK" = callJStatic(storageLevelClass, "MEMORY_AND_DISK"),
"MEMORY_AND_DISK_2" = callJStatic(storageLevelClass, "MEMORY_AND_DISK_2"),
"MEMORY_AND_DISK_SER" = callJStatic(storageLevelClass,
@@ -415,6 +417,8 @@ storageLevelToString <- function(levelObj) {
"DISK_ONLY"
} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 2) {
"DISK_ONLY_2"
+} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 3) {
+"DISK_ONLY_3"
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 1) {
"MEMORY_ONLY"
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 2) {
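`DISK_ONLY_3` is the new disk-only storage level with three replicas, now mapped in both directions by `getStorageLevel()` and `storageLevelToString()`. A sketch of requesting it from SparkR, assuming an active session; the data frame is illustrative:

library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(x = 1:10))
# The string is resolved by getStorageLevel() to StorageLevel.DISK_ONLY_3
# on the JVM side.
df <- persist(df, "DISK_ONLY_3")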
18 changes: 18 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL_arrow.R
@@ -312,4 +312,22 @@ test_that("Arrow optimization - unsupported types", {
})
})

test_that("SPARK-32478: gapply() Arrow optimization - error message for schema mismatch", {
skip_if_not_installed("arrow")
df <- createDataFrame(list(list(a = 1L, b = "a")))

conf <- callJMethod(sparkSession, "conf")
arrowEnabled <- sparkR.conf("spark.sql.execution.arrow.sparkr.enabled")[[1]]

callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", "true")
tryCatch({
expect_error(
count(gapply(df, "a", function(key, group) { group }, structType("a int, b int"))),
"expected IntegerType, IntegerType, got IntegerType, StringType")
},
finally = {
callJMethod(conf, "set", "spark.sql.execution.arrow.sparkr.enabled", arrowEnabled)
})
})

sparkR.session.stop()
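For contrast with the mismatch the new test provokes, a sketch of a `gapply()` call whose declared schema matches what the function returns, assuming an active SparkR session with Arrow optimization enabled:

df <- createDataFrame(list(list(a = 1L, b = "a")))
# Declared output schema (a int, b string) matches the returned group as-is.
collected <- collect(gapply(df, "a",
                            function(key, group) { group },
                            structType("a int, b string")))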
9 changes: 6 additions & 3 deletions R/pkg/tests/run-all.R
@@ -61,15 +61,18 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
set.seed(42)

# TODO (SPARK-30663) To be removed once testthat 1.x is removed from all builds
if (grepl("^1\\..*", packageVersion("testthat"))) {
if (packageVersion("testthat")$major <= 1) {
# testthat 1.x
test_runner <- testthat:::run_tests
reporter <- "summary"

} else {
# testthat >= 2.0.0
test_runner <- testthat:::test_package_dir
-reporter <- testthat::default_reporter()
+dir.create("target/test-reports", showWarnings = FALSE)
+reporter <- MultiReporter$new(list(
+SummaryReporter$new(),
+JunitReporter$new(file = "target/test-reports/test-results.xml")
+))
}

test_runner("SparkR",
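The reporter change above is what produces the JUnit XML that the new `test_report.yml` workflow later publishes. A minimal sketch of the same combination on a single test file, assuming testthat >= 2.0.0 plus the xml2 package (required by `JunitReporter`); the test file path is hypothetical:

library(testthat)

dir.create("target/test-reports", showWarnings = FALSE, recursive = TRUE)
# Console summary plus JUnit XML: the combination run-all.R now builds.
reporter <- MultiReporter$new(list(
  SummaryReporter$new(),
  JunitReporter$new(file = "target/test-reports/demo-results.xml")
))
test_file("tests/testthat/test_basic.R", reporter = reporter)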
4 changes: 2 additions & 2 deletions appveyor.yml
@@ -41,8 +41,8 @@ cache:
install:
# Install maven and dependencies
- ps: .\dev\appveyor-install-dependencies.ps1
-# Required package for R unit tests
-- cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')"
+# Required package for R unit tests. xml2 is required to use the JUnit reporter in testthat.
+- cmd: Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')"
- cmd: Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]"

build_script:
4 changes: 2 additions & 2 deletions bin/find-spark-home
@@ -33,9 +33,9 @@ elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then
export SPARK_HOME="$(cd "$(dirname "$0")"/..; pwd)"
else
# We are pip installed, use the Python script to resolve a reasonable SPARK_HOME
-# Default to standard python interpreter unless told otherwise
+# Default to standard python3 interpreter unless told otherwise
if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
-PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
+PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python3"}"
fi
export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")
fi
4 changes: 2 additions & 2 deletions bin/find-spark-home.cmd
@@ -20,8 +20,8 @@ rem
rem Path to Python script finding SPARK_HOME
set FIND_SPARK_HOME_PYTHON_SCRIPT=%~dp0find_spark_home.py

-rem Default to standard python interpreter unless told otherwise
-set PYTHON_RUNNER=python
+rem Default to standard python3 interpreter unless told otherwise
+set PYTHON_RUNNER=python3
rem If PYSPARK_DRIVER_PYTHON is set, it overwrites the python version
if not "x%PYSPARK_DRIVER_PYTHON%"=="x" (
set PYTHON_RUNNER=%PYSPARK_DRIVER_PYTHON%
