Merge branch 'master' into ansi_offset-clause
beliefer authored Sep 27, 2020
2 parents 2820b6c + 0c38765 commit bbe10d6
Showing 1,656 changed files with 174,991 additions and 17,577 deletions.
109 changes: 83 additions & 26 deletions .github/workflows/master.yml → .github/workflows/build_and_test.yml
@@ -1,4 +1,4 @@
name: master
name: Build and test

on:
push:
@@ -7,9 +7,13 @@ on:
pull_request:
branches:
- master
workflow_dispatch:
inputs:
target:
description: 'Target branch to run'
required: true

jobs:
# TODO(SPARK-32248): Recover JDK 11 builds
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
@@ -27,21 +31,21 @@ jobs:
# Kinesis tests depend on the external Amazon Kinesis service.
# Note that the modules below are from sparktestsupport/modules.py.
modules:
- |-
- >-
core, unsafe, kvstore, avro,
network-common, network-shuffle, repl, launcher,
examples, sketch, graphx
- |-
- >-
catalyst, hive-thriftserver
- |-
- >-
streaming, sql-kafka-0-10, streaming-kafka-0-10,
mllib-local, mllib,
yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
- |-
- >-
pyspark-sql, pyspark-mllib, pyspark-resource
- |-
- >-
pyspark-core, pyspark-streaming, pyspark-ml
- |-
- >-
sparkr
# Here, we split the Hive and SQL tests into the slow ones and the rest.
included-tags: [""]
@@ -83,18 +87,26 @@ jobs:
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
- name: Merge dispatched input branch
if: ${{ github.event.inputs.target != '' }}
run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
uses: actions/cache@v2
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
path: |
build/apache-maven-*
build/zinc-*
build/scala-*
build/*.jar
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
@@ -108,7 +120,7 @@ jobs:
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
@@ -144,36 +156,52 @@ jobs:
# PyArrow is not supported in PyPy yet, see ARROW-2651.
# TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
run: |
python3.6 -m pip install numpy pyarrow pandas scipy
python3.6 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.6 -m pip list
# PyPy does not have xmlrunner
pypy3 -m pip install numpy pandas
pypy3 -m pip list
- name: Install Python packages (Python 3.8)
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
python3.8 -m pip install numpy pyarrow pandas scipy
python3.8 -m pip install numpy pyarrow pandas scipy xmlrunner
python3.8 -m pip list
# SparkR
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
- name: Install R 4.0
if: contains(matrix.modules, 'sparkr')
with:
r-version: 3.6
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R packages
if: contains(matrix.modules, 'sparkr')
run: |
sudo apt-get install -y libcurl4-openssl-dev
# qpdf is required to reduce the size of PDFs to make CRAN check pass. See SPARK-32497.
sudo apt-get install -y libcurl4-openssl-dev qpdf
sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
# Show installed packages in R.
sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
# Run the tests.
- name: "Run tests: ${{ matrix.modules }}"
- name: Run tests
run: |
# Hive tests become flaky when run in parallel, as they are too resource-intensive.
if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
mkdir -p ~/.m2
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
rm -rf ~/.m2/repository/org/apache/spark
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"

# Static analysis, and documentation build
lint:
@@ -202,11 +230,13 @@ jobs:
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
- name: Install R 3.6
uses: r-lib/actions/setup-r@v1
with:
r-version: 3.6
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx
- name: Install R 4.0
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add
sudo apt-get update
sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev
- name: Install R linter dependencies and SparkR
run: |
sudo apt-get install -y libcurl4-openssl-dev
@@ -219,10 +249,11 @@ jobs:
ruby-version: 2.7
- name: Install dependencies for documentation generation
run: |
# pandoc is also required by nbsphinx to generate the PySpark API docs.
sudo apt-get install -y libcurl4-openssl-dev pandoc
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
@@ -266,3 +297,29 @@ jobs:
mkdir -p ~/.m2
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install
rm -rf ~/.m2/repository/org/apache/spark
scala-213:
name: Scala 2.13 build
runs-on: ubuntu-latest
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: scala-213-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
scala-213-maven-
- name: Install Java 11
uses: actions/setup-java@v1
with:
java-version: 11
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
mkdir -p ~/.m2
./dev/change-scala-version.sh 2.13
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 -Pscala-2.13 install
rm -rf ~/.m2/repository/org/apache/spark
24 changes: 24 additions & 0 deletions .github/workflows/test_report.yml
@@ -0,0 +1,24 @@
name: Report test results
on:
workflow_run:
workflows: ["Build and test"]
types:
- completed

jobs:
test_report:
runs-on: ubuntu-latest
steps:
- name: Download test results to report
uses: dawidd6/action-download-artifact@v2
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow: ${{ github.event.workflow_run.workflow_id }}
commit: ${{ github.event.workflow_run.head_commit.id }}
- name: Publish test report
uses: scacap/action-surefire-report@v1
with:
check_name: Report test results
github_token: ${{ secrets.GITHUB_TOKEN }}
report_paths: "**/target/test-reports/*.xml"
commit: ${{ github.event.workflow_run.head_commit.id }}
1 change: 1 addition & 0 deletions .gitignore
@@ -80,6 +80,7 @@ target/
unit-tests.log
work/
docs/.jekyll-metadata
docs/.jekyll-cache

# For Hive
TempStatsStore/
2 changes: 2 additions & 0 deletions R/pkg/NAMESPACE
@@ -405,6 +405,7 @@ exportMethods("%<=>%",
"sumDistinct",
"tan",
"tanh",
"timestamp_seconds",
"toDegrees",
"toRadians",
"to_csv",
@@ -428,6 +429,7 @@ exportMethods("%<=>%",
"weekofyear",
"when",
"window",
"withField",
"xxhash64",
"year")

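
The NAMESPACE change above exports two new column functions, timestamp_seconds and withField. As a hypothetical illustration (not part of this commit), assuming timestamp_seconds mirrors the Scala function of the same name and converts seconds since the Unix epoch into a timestamp column, usage would look roughly like this:

library(SparkR)
sparkR.session()

# Hypothetical sketch: convert epoch seconds into a timestamp column.
df <- createDataFrame(data.frame(unix_time = c(1460041200, 1609459200)))
head(select(df, timestamp_seconds(df$unix_time)))
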
24 changes: 15 additions & 9 deletions R/pkg/R/DataFrame.R
@@ -1234,13 +1234,9 @@ setMethod("collect",
output <- tryCatch({
doServerAuth(conn, authSecret)
arrowTable <- arrow::read_ipc_stream(readRaw(conn))
# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
if (exists("as_tibble", envir = asNamespace("arrow"))) {
as.data.frame(arrow::as_tibble(arrowTable), stringsAsFactors = stringsAsFactors)
} else {
as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
}
}, finally = {
as.data.frame(arrowTable, stringsAsFactors = stringsAsFactors)
},
finally = {
close(conn)
})
return(output)
@@ -2867,11 +2863,18 @@ setMethod("unionAll",
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account. Input SparkDataFrames can have different data types in the schema.
#'
#' When the parameter allowMissingColumns is `TRUE`, the set of column names
#' in x and y can differ; missing columns will be filled with null.
#' Further, the missing columns of x will be added at the end of
#' the schema of the union result.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @param allowMissingColumns logical
#' @param ... further arguments to be passed to or from other methods.
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
@@ -2884,12 +2887,15 @@ setMethod("unionAll",
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#'
#' df3 <- select(createDataFrame(mtcars), "carb")
#' head(unionByName(df1, df3, allowMissingColumns = TRUE))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
function(x, y, allowMissingColumns=FALSE) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf, allowMissingColumns)
dataFrame(unioned)
})

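
The unionByName change above adds an allowMissingColumns parameter. A minimal sketch of the documented behaviour, reusing the mtcars-based frames from the roxygen example (illustrative only, not additional commit content): rows from the frame that lacks a column are filled with null, and the result keeps x's columns first.

library(SparkR)
sparkR.session()

df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
df3 <- select(createDataFrame(mtcars), "carb")

# Rows coming from df3 get null for "am" and "gear"; the union keeps
# df1's schema order, so the result columns are carb, am, gear.
result <- unionByName(df1, df3, allowMissingColumns = TRUE)
printSchema(result)
head(result)
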
31 changes: 31 additions & 0 deletions R/pkg/R/column.R
@@ -356,3 +356,34 @@ setMethod("%<=>%",
#' }
#' @note ! since 2.3.0
setMethod("!", signature(x = "Column"), function(x) not(x))

#' withField
#'
#' Adds/replaces field in a struct \code{Column} by name.
#'
#' @param x a Column
#' @param fieldName a character
#' @param col a Column expression
#'
#' @rdname withField
#' @aliases withField withField,Column-method
#' @examples
#' \dontrun{
#' df <- withColumn(
#' createDataFrame(iris),
#' "sepal",
#' struct(column("Sepal_Width"), column("Sepal_Length"))
#' )
#'
#' head(select(
#' df,
#' withField(df$sepal, "product", df$Sepal_Length * df$Sepal_Width)
#' ))
#' }
#' @note withField since 3.1.0
setMethod("withField",
signature(x = "Column", fieldName = "character", col = "Column"),
function(x, fieldName, col) {
jc <- callJMethod(x@jc, "withField", fieldName, col@jc)
column(jc)
})
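
The roxygen example above adds a new "product" field to the struct; replacing an existing field uses exactly the same call. A small sketch reusing the iris-based struct from that example (the rounding is illustrative only, not part of the commit):

library(SparkR)
sparkR.session()

df <- withColumn(
  createDataFrame(iris),
  "sepal",
  struct(column("Sepal_Width"), column("Sepal_Length"))
)

# Overwrite the existing Sepal_Width field inside the "sepal" struct;
# adding and replacing a field use the same withField call.
head(select(df, withField(df$sepal, "Sepal_Width", round(df$Sepal_Width))))
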
13 changes: 1 addition & 12 deletions R/pkg/R/deserialize.R
@@ -233,24 +233,13 @@ readMultipleObjectsWithKeys <- function(inputCon) {

readDeserializeInArrow <- function(inputCon) {
if (requireNamespace("arrow", quietly = TRUE)) {
# Arrow drops `as_tibble` since 0.14.0, see ARROW-5190.
useAsTibble <- exists("as_tibble", envir = asNamespace("arrow"))


# Currently, there seems to be no way to read batch by batch over a socket connection
# on the R side; see ARROW-4512. Therefore, it reads the whole Arrow streaming-formatted
# binary at once for now.
dataLen <- readInt(inputCon)
arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big")
batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches()

if (useAsTibble) {
as_tibble <- get("as_tibble", envir = asNamespace("arrow"))
# Read all groupped batches. Tibble -> data.frame is cheap.
lapply(batches, function(batch) as.data.frame(as_tibble(batch)))
} else {
lapply(batches, function(batch) as.data.frame(batch))
}
lapply(batches, function(batch) as.data.frame(batch))
} else {
stop("'arrow' package should be installed.")
}
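
For context on the deserialize.R simplification above: since Arrow 0.14 a RecordBatch (and Table) can be passed to as.data.frame() directly, so the as_tibble fallback is no longer needed. A standalone sketch outside Spark (assuming a recent arrow release that provides write_ipc_stream):

library(arrow)

# Round-trip a data.frame through the Arrow IPC streaming format and
# convert each record batch back with as.data.frame(), mirroring the
# simplified readDeserializeInArrow() path.
tmp <- tempfile()
write_ipc_stream(data.frame(x = 1:3, y = letters[1:3]), tmp)

arrowData <- readBin(tmp, raw(), file.info(tmp)$size)
batches <- RecordBatchStreamReader$create(arrowData)$batches()
dfs <- lapply(batches, function(batch) as.data.frame(batch))
dfs[[1]]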