Commit 7825bc3 (merge of parents e7b4d43 and cc986a7)
beliefer committed Jan 28, 2020
Showing 417 changed files with 29,829 additions and 26,771 deletions.
5 changes: 5 additions & 0 deletions .gitattributes
@@ -1,2 +1,7 @@
*.bat text eol=crlf
*.cmd text eol=crlf
*.java text eol=lf
*.scala text eol=lf
*.xml text eol=lf
*.py text eol=lf
*.R text eol=lf
8 changes: 0 additions & 8 deletions .github/workflows/master.yml
@@ -66,14 +66,6 @@ jobs:
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
mkdir -p ~/.m2
# `Maven Central` is too flaky in terms of downloading artifacts in `GitHub Action` environment.
# `Google Maven Central Mirror` is too slow in terms of syncing upstream. To get the best combination,
# 1) we set `Google Maven Central` as a mirror of `central` in `GitHub Action` environment only.
# 2) we duplicate `Maven Central` in pom.xml with ID `central_without_mirror`.
# In other words, in GitHub Action environment, `central` is mirrored by `Google Maven Central` first.
# If `Google Maven Central` doesn't provide the artifact due to its slowness, `central_without_mirror` will be used.
# Note that we aim to achieve the above while keeping the existing behavior of non-`GitHub Action` environment unchanged.
echo "<settings><mirrors><mirror><id>google-maven-central</id><name>GCS Maven Central mirror</name><url>https://maven-central.storage-download.googleapis.com/repos/central/data/</url><mirrorOf>central</mirrorOf></mirror></mirrors></settings>" > ~/.m2/settings.xml
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install
rm -rf ~/.m2/repository/org/apache/spark
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -335,6 +335,7 @@ exportMethods("%<=>%",
"ntile",
"otherwise",
"over",
"overlay",
"percent_rank",
"pmod",
"posexplode",
41 changes: 37 additions & 4 deletions R/pkg/R/functions.R
@@ -136,6 +136,14 @@ NULL
#' format to. See 'Details'.
#' }
#' @param y Column to compute on.
#' @param pos In \itemize{
#' \item \code{locate}: a start position of search.
#' \item \code{overlay}: a start position for replacement.
#' }
#' @param len In \itemize{
#' \item \code{lpad} the maximum length of each output result.
#' \item \code{overlay} a number of bytes to replace.
#' }
#' @param ... additional Columns.
#' @name column_string_functions
#' @rdname column_string_functions
@@ -1319,6 +1327,35 @@ setMethod("negate",
column(jc)
})

#' @details
#' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace},
#' starting from byte position \code{pos} of \code{x} and proceeding for
#' \code{len} bytes.
#'
#' @param replace a Column with replacement.
#'
#' @rdname column_string_functions
#' @aliases overlay overlay,Column-method,numericOrColumn-method
#' @note overlay since 3.0.0
setMethod("overlay",
signature(x = "Column", replace = "Column", pos = "numericOrColumn"),
function(x, replace, pos, len = -1) {
if (is.numeric(pos)) {
pos <- lit(as.integer(pos))
}

if (is.numeric(len)) {
len <- lit(as.integer(len))
}

jc <- callJStatic(
"org.apache.spark.sql.functions", "overlay",
x@jc, replace@jc, pos@jc, len@jc
)

column(jc)
})

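For reference, a minimal SparkR usage sketch of the new overlay function (the data and column names are illustrative; an active SparkR session is assumed). It replaces a portion of one string column with another, starting at byte position pos and running for len bytes, or for the length of replace when len is omitted:

```r
library(SparkR)
sparkR.session()

# Hypothetical one-row example data
df <- createDataFrame(data.frame(src = "SPARK_SQL", repl = "CORE",
                                 stringsAsFactors = FALSE))

# Replace 4 bytes of `src` starting at position 7 with `repl` -> "SPARK_CORE"
head(select(df, overlay(df$src, df$repl, 7, 4)))

# With the default len = -1, the length of `repl` is used -> also "SPARK_CORE"
head(select(df, overlay(df$src, df$repl, 7)))
```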
#' @details
#' \code{quarter}: Extracts the quarter as an integer from a given date/timestamp/string.
#'
@@ -2459,7 +2496,6 @@ setMethod("schema_of_csv", signature(x = "characterOrColumn"),
#' @note from_utc_timestamp since 1.5.0
setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
function(y, x) {
.Deprecated(msg = "from_utc_timestamp is deprecated. See SPARK-25496.")
jc <- callJStatic("org.apache.spark.sql.functions", "from_utc_timestamp", y@jc, x)
column(jc)
})
@@ -2518,7 +2554,6 @@ setMethod("next_day", signature(y = "Column", x = "character"),
#' @note to_utc_timestamp since 1.5.0
setMethod("to_utc_timestamp", signature(y = "Column", x = "character"),
function(y, x) {
.Deprecated(msg = "to_utc_timestamp is deprecated. See SPARK-25496.")
jc <- callJStatic("org.apache.spark.sql.functions", "to_utc_timestamp", y@jc, x)
column(jc)
})
@@ -2819,7 +2854,6 @@ setMethod("window", signature(x = "Column"),
#'
#' @param substr a character string to be matched.
#' @param str a Column where matches are sought for each entry.
#' @param pos start position of search.
#' @rdname column_string_functions
#' @aliases locate locate,character,Column-method
#' @note locate since 1.5.0
@@ -2834,7 +2868,6 @@ setMethod("locate", signature(substr = "character", str = "Column"),
#' @details
#' \code{lpad}: Left-pads the string column with \code{pad} to a length of \code{len}.
#'
#' @param len maximum length of each output result.
#' @param pad a character string to be padded with.
#' @rdname column_string_functions
#' @aliases lpad lpad,Column,numeric,character-method
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
@@ -1149,6 +1149,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
#' @name NULL
setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })

#' @rdname column_string_functions
#' @name NULL
setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") })

#' @rdname column_window_functions
#' @name NULL
setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") })
9 changes: 6 additions & 3 deletions R/pkg/R/mllib_tree.R
@@ -393,6 +393,7 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara
#' "error" (throw an error), "keep" (put invalid data in
#' a special additional bucket, at index numLabels). Default
#' is "error".
#' @param bootstrap Whether bootstrap samples are used when building trees.
#' @param ... additional arguments passed to the method.
#' @aliases spark.randomForest,SparkDataFrame,formula-method
#' @return \code{spark.randomForest} returns a fitted Random Forest model.
@@ -428,7 +429,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo
featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0,
minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10,
maxMemoryInMB = 256, cacheNodeIds = FALSE,
handleInvalid = c("error", "keep", "skip")) {
handleInvalid = c("error", "keep", "skip"),
bootstrap = TRUE) {
type <- match.arg(type)
formula <- paste(deparse(formula), collapse = "")
if (!is.null(seed)) {
@@ -445,7 +447,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo
as.numeric(minInfoGain), as.integer(checkpointInterval),
as.character(featureSubsetStrategy), seed,
as.numeric(subsamplingRate),
as.integer(maxMemoryInMB), as.logical(cacheNodeIds))
as.integer(maxMemoryInMB), as.logical(cacheNodeIds),
as.logical(bootstrap))
new("RandomForestRegressionModel", jobj = jobj)
},
classification = {
Expand All @@ -460,7 +463,7 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo
as.character(featureSubsetStrategy), seed,
as.numeric(subsamplingRate),
as.integer(maxMemoryInMB), as.logical(cacheNodeIds),
handleInvalid)
handleInvalid, as.logical(bootstrap))
new("RandomForestClassificationModel", jobj = jobj)
}
)
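For context, a brief hedged sketch of how the new bootstrap argument to spark.randomForest can be used from SparkR (the dataset and tuning values are illustrative; per the signature above, bootstrap defaults to TRUE):

```r
library(SparkR)
sparkR.session()

# Train a random forest regressor on the built-in longley data,
# disabling bootstrap sampling so every tree sees the full data set.
df <- suppressWarnings(createDataFrame(longley))
model <- spark.randomForest(df, Employed ~ ., "regression",
                            numTrees = 5, maxDepth = 5,
                            bootstrap = FALSE)
summary(model)
```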
4 changes: 4 additions & 0 deletions R/pkg/tests/fulltests/data/test_utils_utf.json
@@ -0,0 +1,4 @@
{"name": "안녕하세요"}
{"name": "您好", "age": 30}
{"name": "こんにちは", "age": 19}
{"name": "Xin chào"}
2 changes: 1 addition & 1 deletion R/pkg/tests/fulltests/test_mllib_tree.R
@@ -130,7 +130,7 @@ test_that("spark.randomForest", {
# regression
data <- suppressWarnings(createDataFrame(longley))
model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
numTrees = 1, seed = 1)
numTrees = 1, seed = 1, bootstrap = FALSE)

predictions <- collect(predict(model, data))
expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
49 changes: 24 additions & 25 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -848,24 +848,31 @@ test_that("collect() and take() on a DataFrame return the same number of rows an
})

test_that("collect() support Unicode characters", {
lines <- c("{\"name\":\"안녕하세요\"}",
"{\"name\":\"您好\", \"age\":30}",
"{\"name\":\"こんにちは\", \"age\":19}",
"{\"name\":\"Xin chào\"}")
jsonPath <- file.path(
Sys.getenv("SPARK_HOME"),
"R", "pkg", "tests", "fulltests", "data",
"test_utils_utf.json"
)

lines <- readLines(jsonPath, encoding = "UTF-8")

jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(lines, jsonPath)
expected <- regmatches(lines, gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE))

df <- read.df(jsonPath, "json")
rdf <- collect(df)
expect_true(is.data.frame(rdf))
expect_equal(rdf$name[1], markUtf8("안녕하세요"))
expect_equal(rdf$name[2], markUtf8("您好"))
expect_equal(rdf$name[3], markUtf8("こんにちは"))
expect_equal(rdf$name[4], markUtf8("Xin chào"))
expect_equal(rdf$name[1], expected[[1]])
expect_equal(rdf$name[2], expected[[2]])
expect_equal(rdf$name[3], expected[[3]])
expect_equal(rdf$name[4], expected[[4]])

df1 <- createDataFrame(rdf)
expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
expect_equal(
collect(
where(df1, df1$name == expected[[2]])
)$name,
expected[[2]]
)
})

test_that("multiple pipeline transformations result in an RDD with the correct values", {
Expand Down Expand Up @@ -1405,6 +1412,8 @@ test_that("column functions", {
trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm")
c24 <- date_trunc("hour", c) + date_trunc("minute", c) + date_trunc("week", c) +
date_trunc("quarter", c) + current_date() + current_timestamp()
c25 <- overlay(c1, c2, c3, c3) + overlay(c1, c2, c3) + overlay(c1, c2, 1) +
overlay(c1, c2, 3, 4)

# Test if base::is.nan() is exposed
expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
@@ -1905,20 +1914,10 @@ test_that("date functions on a DataFrame", {
df2 <- createDataFrame(l2)
expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24))
expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34))
conf <- callJMethod(sparkSession, "conf")
isUtcTimestampFuncEnabled <- callJMethod(conf, "get", "spark.sql.legacy.utcTimestampFunc.enabled")
callJMethod(conf, "set", "spark.sql.legacy.utcTimestampFunc.enabled", "true")
tryCatch({
# Both from_utc_timestamp and to_utc_timestamp are deprecated as of SPARK-25496
expect_equal(suppressWarnings(collect(select(df2, from_utc_timestamp(df2$b, "JST"))))[, 1],
c(as.POSIXct("2012-12-13 21:34:00 UTC"), as.POSIXct("2014-12-15 10:24:34 UTC")))
expect_equal(suppressWarnings(collect(select(df2, to_utc_timestamp(df2$b, "JST"))))[, 1],
c(as.POSIXct("2012-12-13 03:34:00 UTC"), as.POSIXct("2014-12-14 16:24:34 UTC")))
},
finally = {
# Reverting the conf back
callJMethod(conf, "set", "spark.sql.legacy.utcTimestampFunc.enabled", isUtcTimestampFuncEnabled)
})
expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1],
c(as.POSIXct("2012-12-13 21:34:00 UTC"), as.POSIXct("2014-12-15 10:24:34 UTC")))
expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1],
c(as.POSIXct("2012-12-13 03:34:00 UTC"), as.POSIXct("2014-12-14 16:24:34 UTC")))
expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0)
expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0)
expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0)
10 changes: 8 additions & 2 deletions R/pkg/tests/fulltests/test_utils.R
@@ -89,7 +89,10 @@ test_that("cleanClosure on R functions", {
lapply(x, g) + 1 # Test for capturing function call "g"'s closure as an argument of lapply.
l$field[1, 1] <- 3 # Test for access operators `$`.
res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol.
f(res) # Test for recursive calls.
# Enable once SPARK-30629 is fixed
# nolint start
# f(res) # Test for recursive calls.
# nolint end
}
newF <- cleanClosure(f)
env <- environment(newF)
@@ -101,7 +104,10 @@
# nolint end
expect_true("g" %in% ls(env))
expect_true("l" %in% ls(env))
expect_true("f" %in% ls(env))
# Enable once SPARK-30629 is fixed
# nolint start
# expect_true("f" %in% ls(env))
# nolint end
expect_equal(get("l", envir = env, inherits = FALSE), l)
# "y" should be in the environment of g.
newG <- get("g", envir = env, inherits = FALSE)
2 changes: 1 addition & 1 deletion README.md
@@ -9,7 +9,7 @@ and Structured Streaming for stream processing.

<https://spark.apache.org/>

[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7)
[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3)
[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
[![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)

CalendarInterval.java
@@ -17,6 +17,8 @@

package org.apache.spark.unsafe.types;

import org.apache.spark.annotation.Unstable;

import java.io.Serializable;
import java.math.BigDecimal;
import java.time.Duration;
@@ -27,8 +29,21 @@
import static org.apache.spark.sql.catalyst.util.DateTimeConstants.*;

/**
* The internal representation of interval type.
* The class representing calendar intervals. The calendar interval is stored internally in
* three components:
* <ul>
* <li>an integer value representing the number of `months` in this interval,</li>
* <li>an integer value representing the number of `days` in this interval,</li>
* <li>a long value representing the number of `microseconds` in this interval.</li>
* </ul>
*
* The `months` and `days` are not units of time with a constant length (unlike hours, seconds), so
 * they are kept as two separate fields, distinct from microseconds. One month may be equal to 28, 29, 30 or 31 days
* and one day may be equal to 23, 24 or 25 hours (daylight saving).
*
* @since 3.0.0
*/
@Unstable
public final class CalendarInterval implements Serializable {
public final int months;
public final int days;
CalendarInterval test suite (Java)
@@ -46,6 +46,36 @@ public void equalsTest() {
assertEquals(i1, i6);
}

@Test
public void toStringTest() {
CalendarInterval i;

i = new CalendarInterval(0, 0, 0);
assertEquals("0 seconds", i.toString());

i = new CalendarInterval(34, 0, 0);
assertEquals("2 years 10 months", i.toString());

i = new CalendarInterval(-34, 0, 0);
assertEquals("-2 years -10 months", i.toString());

i = new CalendarInterval(0, 31, 0);
assertEquals("31 days", i.toString());

i = new CalendarInterval(0, -31, 0);
assertEquals("-31 days", i.toString());

i = new CalendarInterval(0, 0, 3 * MICROS_PER_HOUR + 13 * MICROS_PER_MINUTE + 123);
assertEquals("3 hours 13 minutes 0.000123 seconds", i.toString());

i = new CalendarInterval(0, 0, -3 * MICROS_PER_HOUR - 13 * MICROS_PER_MINUTE - 123);
assertEquals("-3 hours -13 minutes -0.000123 seconds", i.toString());

i = new CalendarInterval(34, 31, 3 * MICROS_PER_HOUR + 13 * MICROS_PER_MINUTE + 123);
assertEquals("2 years 10 months 31 days 3 hours 13 minutes 0.000123 seconds",
i.toString());
}

@Test
public void periodAndDurationTest() {
CalendarInterval interval = new CalendarInterval(120, -40, 123456);