Merge branch 'master' of https://github.com/apache/spark into sqlparser1

apache · May 1, 2015 · c2895cf · c2895cf
2 parents 493775c + beeafcf
commit c2895cf
Show file tree

Hide file tree

Showing 369 changed files with 17,905 additions and 2,501 deletions.
diff --git a/LICENSE b/LICENSE
@@ -814,6 +814,7 @@ BSD-style licenses
 The following components are provided under a BSD-style license. See project link for details.
 
      (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
+     (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
      (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/)
      (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
      (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -71,6 +71,7 @@ exportMethods(
               "unpersist",
               "value",
               "values",
+              "zipPartitions",
               "zipRDD",
               "zipWithIndex",
               "zipWithUniqueId"

diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
@@ -66,6 +66,11 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode,
   .Object
 })
 
+setMethod("show", "RDD",
+          function(.Object) {
+              cat(paste(callJMethod(.Object@jrdd, "toString"), "\n", sep=""))
+          })
+
 setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) {
   .Object@env <- new.env()
   .Object@env$isCached <- FALSE
@@ -91,8 +96,8 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
     # NOTE: We use prev_serializedMode to track the serialization mode of prev_JRDD
     # prev_serializedMode is used during the delayed computation of JRDD in getJRDD
   } else {
-    pipelinedFunc <- function(split, iterator) {
-      func(split, prev@func(split, iterator))
+    pipelinedFunc <- function(partIndex, part) {
+      func(partIndex, prev@func(partIndex, part))
     }
     .Object@func <- cleanClosure(pipelinedFunc)
     .Object@prev_jrdd <- prev@prev_jrdd # maintain the pipeline
@@ -306,7 +311,7 @@ setMethod("numPartitions",
           signature(x = "RDD"),
           function(x) {
             jrdd <- getJRDD(x)
-            partitions <- callJMethod(jrdd, "splits")
+            partitions <- callJMethod(jrdd, "partitions")
             callJMethod(partitions, "size")
           })
 
@@ -452,8 +457,8 @@ setMethod("countByValue",
 setMethod("lapply",
           signature(X = "RDD", FUN = "function"),
           function(X, FUN) {
-            func <- function(split, iterator) {
-              lapply(iterator, FUN)
+            func <- function(partIndex, part) {
+              lapply(part, FUN)
             }
             lapplyPartitionsWithIndex(X, func)
           })
@@ -538,8 +543,8 @@ setMethod("mapPartitions",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' rdd <- parallelize(sc, 1:10, 5L)
-#' prod <- lapplyPartitionsWithIndex(rdd, function(split, part) {
-#'                                          split * Reduce("+", part) })
+#' prod <- lapplyPartitionsWithIndex(rdd, function(partIndex, part) {
+#'                                          partIndex * Reduce("+", part) })
 #' collect(prod, flatten = FALSE) # 0, 7, 22, 45, 76
 #'}
 #' @rdname lapplyPartitionsWithIndex
@@ -813,7 +818,7 @@ setMethod("distinct",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:10) # ensure each num is in its own split
+#' rdd <- parallelize(sc, 1:10)
 #' collect(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements
 #' collect(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates
 #'}
@@ -825,14 +830,14 @@ setMethod("sampleRDD",
           function(x, withReplacement, fraction, seed) {
 
             # The sampler: takes a partition and returns its sampled version.
-            samplingFunc <- function(split, part) {
+            samplingFunc <- function(partIndex, part) {
               set.seed(seed)
               res <- vector("list", length(part))
               len <- 0
 
               # Discards some random values to ensure each partition has a
               # different random seed.
-              runif(split)
+              runif(partIndex)
 
               for (elem in part) {
                 if (withReplacement) {
@@ -967,7 +972,7 @@ setMethod("keyBy",
 setMethod("repartition",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions) {
-            coalesce(x, numToInt(numPartitions), TRUE)
+            coalesce(x, numPartitions, TRUE)
           })
 
 #' Return a new RDD that is reduced into numPartitions partitions.
@@ -989,8 +994,8 @@ setMethod("coalesce",
            function(x, numPartitions, shuffle = FALSE) {
              numPartitions <- numToInt(numPartitions)
              if (shuffle || numPartitions > SparkR::numPartitions(x)) {
-               func <- function(s, part) {
-                 set.seed(s)  # split as seed
+               func <- function(partIndex, part) {
+                 set.seed(partIndex)  # partIndex as seed
                  start <- as.integer(sample(numPartitions, 1) - 1)
                  lapply(seq_along(part),
                         function(i) {
@@ -1035,7 +1040,7 @@ setMethod("saveAsObjectFile",
 #' Save this RDD as a text file, using string representations of elements.
 #'
 #' @param x The RDD to save
-#' @param path The directory where the splits of the text file are saved
+#' @param path The directory where the partitions of the text file are saved
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
@@ -1335,10 +1340,10 @@ setMethod("zipWithUniqueId",
           function(x) {
             n <- numPartitions(x)
 
-            partitionFunc <- function(split, part) {
+            partitionFunc <- function(partIndex, part) {
               mapply(
                 function(item, index) {
-                  list(item, (index - 1) * n + split)
+                  list(item, (index - 1) * n + partIndex)
                 },
                 part,
                 seq_along(part),
@@ -1382,11 +1387,11 @@ setMethod("zipWithIndex",
               startIndices <- Reduce("+", nums, accumulate = TRUE)
             }
 
-            partitionFunc <- function(split, part) {
-              if (split == 0) {
+            partitionFunc <- function(partIndex, part) {
+              if (partIndex == 0) {
                 startIndex <- 0
               } else {
-                startIndex <- startIndices[[split]]
+                startIndex <- startIndices[[partIndex]]
               }
 
               mapply(
@@ -1590,3 +1595,49 @@ setMethod("intersection",
 
             keys(filterRDD(cogroup(rdd1, rdd2, numPartitions = numPartitions), filterFunction))
           })
+
+#' Zips an RDD's partitions with one (or more) RDD(s).
+#' Same as zipPartitions in Spark.
+#' 
+#' @param ... RDDs to be zipped.
+#' @param func A function to transform zipped partitions.
+#' @return A new RDD by applying a function to the zipped partitions. 
+#'         Assumes that all the RDDs have the *same number of partitions*, but 
+#'         does *not* require them to have the same number of elements in each partition.
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd1 <- parallelize(sc, 1:2, 2L)  # 1, 2
+#' rdd2 <- parallelize(sc, 1:4, 2L)  # 1:2, 3:4
+#' rdd3 <- parallelize(sc, 1:6, 2L)  # 1:3, 4:6
+#' collect(zipPartitions(rdd1, rdd2, rdd3, 
+#'                       func = function(x, y, z) { list(list(x, y, z))} ))
+#' # list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6)))
+#'}
+#' @rdname zipRDD
+#' @aliases zipPartitions,RDD
+setMethod("zipPartitions",
+          "RDD",
+          function(..., func) {
+            rrdds <- list(...)
+            if (length(rrdds) == 1) {
+              return(rrdds[[1]])
+            }
+            nPart <- sapply(rrdds, numPartitions)
+            if (length(unique(nPart)) != 1) {
+              stop("Can only zipPartitions RDDs which have the same number of partitions.")
+            }
+
+            rrdds <- lapply(rrdds, function(rdd) {
+              mapPartitionsWithIndex(rdd, function(partIndex, part) {
+                print(length(part))
+                list(list(partIndex, part))
+              })
+            })
+            union.rdd <- Reduce(unionRDD, rrdds)
+            zipped.rdd <- values(groupByKey(union.rdd, numPartitions = nPart[1]))
+            res <- mapPartitions(zipped.rdd, function(plist) {
+              do.call(func, plist[[1]])
+            })
+            res
+          })
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
@@ -17,12 +17,12 @@
 
 # context.R: SparkContext driven functions
 
-getMinSplits <- function(sc, minSplits) {
-  if (is.null(minSplits)) {
+getMinPartitions <- function(sc, minPartitions) {
+  if (is.null(minPartitions)) {
     defaultParallelism <- callJMethod(sc, "defaultParallelism")
-    minSplits <- min(defaultParallelism, 2)
+    minPartitions <- min(defaultParallelism, 2)
   }
-  as.integer(minSplits)
+  as.integer(minPartitions)
 }
 
 #' Create an RDD from a text file.
@@ -33,7 +33,7 @@ getMinSplits <- function(sc, minSplits) {
 #'
 #' @param sc SparkContext to use
 #' @param path Path of file to read. A vector of multiple paths is allowed.
-#' @param minSplits Minimum number of splits to be created. If NULL, the default
+#' @param minPartitions Minimum number of partitions to be created. If NULL, the default
 #'  value is chosen based on available parallelism.
 #' @return RDD where each item is of type \code{character}
 #' @export
@@ -42,13 +42,13 @@ getMinSplits <- function(sc, minSplits) {
 #'  sc <- sparkR.init()
 #'  lines <- textFile(sc, "myfile.txt")
 #'}
-textFile <- function(sc, path, minSplits = NULL) {
+textFile <- function(sc, path, minPartitions = NULL) {
   # Allow the user to have a more flexible definiton of the text file path
   path <- suppressWarnings(normalizePath(path))
   #' Convert a string vector of paths to a string containing comma separated paths
   path <- paste(path, collapse = ",")
 
-  jrdd <- callJMethod(sc, "textFile", path, getMinSplits(sc, minSplits))
+  jrdd <- callJMethod(sc, "textFile", path, getMinPartitions(sc, minPartitions))
   # jrdd is of type JavaRDD[String]
   RDD(jrdd, "string")
 }
@@ -60,7 +60,7 @@ textFile <- function(sc, path, minSplits = NULL) {
 #'
 #' @param sc SparkContext to use
 #' @param path Path of file to read. A vector of multiple paths is allowed.
-#' @param minSplits Minimum number of splits to be created. If NULL, the default
+#' @param minPartitions Minimum number of partitions to be created. If NULL, the default
 #'  value is chosen based on available parallelism.
 #' @return RDD containing serialized R objects.
 #' @seealso saveAsObjectFile
@@ -70,13 +70,13 @@ textFile <- function(sc, path, minSplits = NULL) {
 #'  sc <- sparkR.init()
 #'  rdd <- objectFile(sc, "myfile")
 #'}
-objectFile <- function(sc, path, minSplits = NULL) {
+objectFile <- function(sc, path, minPartitions = NULL) {
   # Allow the user to have a more flexible definiton of the text file path
   path <- suppressWarnings(normalizePath(path))
   #' Convert a string vector of paths to a string containing comma separated paths
   path <- paste(path, collapse = ",")
 
-  jrdd <- callJMethod(sc, "objectFile", path, getMinSplits(sc, minSplits))
+  jrdd <- callJMethod(sc, "objectFile", path, getMinPartitions(sc, minPartitions))
   # Assume the RDD contains serialized R objects.
   RDD(jrdd, "byte")
 }

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -60,7 +60,7 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 
 #' @rdname distinct
 #' @export
-setGeneric("distinct", function(x, numPartitions = 1L) { standardGeneric("distinct") })
+setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
 
 #' @rdname filterRDD
 #' @export
@@ -182,7 +182,7 @@ setGeneric("setName", function(x, name) { standardGeneric("setName") })
 #' @rdname sortBy
 #' @export
 setGeneric("sortBy",
-           function(x, func, ascending = TRUE, numPartitions = 1L) {
+           function(x, func, ascending = TRUE, numPartitions = 1) {
              standardGeneric("sortBy")
            })
 
@@ -217,6 +217,11 @@ setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
 #' @export
 setGeneric("zipRDD", function(x, other) { standardGeneric("zipRDD") })
 
+#' @rdname zipRDD
+#' @export
+setGeneric("zipPartitions", function(..., func) { standardGeneric("zipPartitions") }, 
+           signature = "...")
+
 #' @rdname zipWithIndex
 #' @seealso zipWithUniqueId
 #' @export
@@ -244,7 +249,7 @@ setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues")
 
 #' @rdname intersection
 #' @export
-setGeneric("intersection", function(x, other, numPartitions = 1L) {
+setGeneric("intersection", function(x, other, numPartitions = 1) {
   standardGeneric("intersection") })
 
 #' @rdname keys
@@ -346,21 +351,21 @@ setGeneric("rightOuterJoin", function(x, y, numPartitions) { standardGeneric("ri
 #' @rdname sortByKey
 #' @export
 setGeneric("sortByKey",
-           function(x, ascending = TRUE, numPartitions = 1L) {
+           function(x, ascending = TRUE, numPartitions = 1) {
              standardGeneric("sortByKey")
            })
 
 #' @rdname subtract
 #' @export
 setGeneric("subtract",
-           function(x, other, numPartitions = 1L) {
+           function(x, other, numPartitions = 1) {
              standardGeneric("subtract")
            })
 
 #' @rdname subtractByKey
 #' @export
 setGeneric("subtractByKey", 
-           function(x, other, numPartitions = 1L) {
+           function(x, other, numPartitions = 1) {
              standardGeneric("subtractByKey")
            })