diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE
index 63c05605f67bb..6b649bc90bccb 100644
--- a/pkg/NAMESPACE
+++ b/pkg/NAMESPACE
@@ -20,6 +20,7 @@ exportMethods(
               "foreachPartition",
               "groupByKey",
               "join",
+              "keyBy",
               "keys",
               "length",
               "lapply",
diff --git a/pkg/R/RDD.R b/pkg/R/RDD.R
index b641d43a0fc8d..2237f45dafd87 100644
--- a/pkg/R/RDD.R
+++ b/pkg/R/RDD.R
@@ -1036,6 +1036,31 @@ setMethod("takeSample", signature(rdd = "RDD", withReplacement = "logical",
             sample(samples)[1:total]
           })
 
+#' Creates tuples of the elements in this RDD by applying a function.
+#'
+#' @param rdd The RDD.
+#' @param func The function to be applied.
+#' @rdname keyBy
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(1, 2, 3))
+#' collect(keyBy(rdd, function(x) { x*x })) # list(list(1, 1), list(4, 2), list(9, 3))
+#'}
+setGeneric("keyBy", function(rdd, func) { standardGeneric("keyBy") })
+
+#' @rdname keyBy
+#' @aliases keyBy,RDD
+setMethod("keyBy",
+          signature(rdd = "RDD", func = "function"),
+          function(rdd, func) {
+            # Pair every element with its computed key: list(key, element).
+            lapply(rdd, function(elem) {
+              list(func(elem), elem)
+            })
+          })
+
 #' Return an RDD with the keys of each tuple.
 #'
 #' @param rdd The RDD from which the keys of each tuple is returned.
diff --git a/pkg/inst/tests/test_rdd.R b/pkg/inst/tests/test_rdd.R
index e6d6a63d214ad..dfd96f15981b4 100644
--- a/pkg/inst/tests/test_rdd.R
+++ b/pkg/inst/tests/test_rdd.R
@@ -246,6 +246,13 @@ test_that("minimum() on RDDs", {
   expect_equal(min, 1)
 })
 
+test_that("keyBy on RDDs", {
+  # Each element should come back paired with its square as the key.
+  square <- function(x) { x * x }
+  expected <- lapply(nums, function(n) { list(square(n), n) })
+  expect_equal(collect(keyBy(rdd, square)), expected)
+})
+
 test_that("keys() on RDDs", {
   keys <- keys(intRdd)
   actual <- collect(keys)
diff --git a/pkg/man/keyBy.Rd b/pkg/man/keyBy.Rd
new file mode 100644
index 0000000000000..d4fb45d4965df
--- /dev/null
+++ b/pkg/man/keyBy.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2 (4.0.2): do not edit by hand
+\docType{methods}
+\name{keyBy}
+\alias{keyBy}
+\alias{keyBy,RDD}
+\alias{keyBy,RDD,function-method}
+\title{Creates tuples of the elements in this RDD by applying a function.}
+\usage{
+keyBy(rdd, func)
+
+\S4method{keyBy}{RDD,`function`}(rdd, func)
+}
+\arguments{
+\item{rdd}{The RDD.}
+
+\item{func}{The function to be applied.}
+}
+\description{
+Creates tuples of the elements in this RDD by applying a function.
+}
+\examples{
+\dontrun{
+sc <- sparkR.init()
+rdd <- parallelize(sc, list(1, 2, 3))
+collect(keyBy(rdd, function(x) { x*x })) # list(list(1, 1), list(4, 2), list(9, 3))
+}
+}
+