[SPARK-11215][ML] Add multiple columns support to StringIndexer #20146
@@ -313,7 +313,7 @@ test_that("spark.mlp", {
   # Test predict method
   mlpTestDF <- df
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
+  expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
Review comment: This is due to the change of how we sort string labels with the same frequency under the frequencyDesc/Asc setting.

Review comment: I checked the predictions. All ...
 
   # Test model save/load
   if (windows_with_hadoop()) {
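Note: as background for the ordering comment above, here is a minimal base-R sketch. It is not the Spark implementation, and the label values and counts are made up only to create a frequency tie; it just illustrates why flipping the tie-breaking direction under frequencyDesc swaps which label receives index 0, and hence why the expected prediction strings changed.

# Toy label column; "0.0" and "1.0" tie on frequency, "2.0" appears once.
labels <- c("0.0", "0.0", "1.0", "1.0", "2.0")
tab  <- table(labels)
freq <- as.vector(tab)
lvls <- names(tab)

# Frequency-descending order, ties broken by ascending label value:
lvls[order(-freq, rank(lvls))]    # "0.0" "1.0" "2.0"  -> "0.0" maps to index 0

# Frequency-descending order, ties broken by descending label value:
lvls[order(-freq, -rank(lvls))]   # "1.0" "0.0" "2.0"  -> "1.0" maps to index 0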
@@ -348,12 +348,12 @@ test_that("spark.mlp", {
 
   # Test random seed
   # default seed
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100)
Review comment: Seems ...

Review comment: Can you check if the run time increases significantly? This was an issue before - see SPARK-21693.

Review comment:
> start.time <- Sys.time()
> model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
> end.time <- Sys.time()
> time.taken <- end.time - start.time
> time.taken
Time difference of 1.780564 secs

> start.time <- Sys.time()
> model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100)
> end.time <- Sys.time()
> time.taken <- end.time - start.time
> time.taken
Time difference of 5.728089 secs

Review comment: Ahh, @viirya, would you mind if I ask to check it after setting spark.sparkr.use.daemon?

Review comment: Ran it again with the config:
> sparkR.conf("spark.sparkr.use.daemon")
$spark.sparkr.use.daemon
[1] "false"

> start.time <- Sys.time()
> model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
> end.time <- Sys.time()
> time.taken <- end.time - start.time
> time.taken
Time difference of 1.704288 secs

> start.time <- Sys.time()
> model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100)
> end.time <- Sys.time()
> time.taken <- end.time - start.time
> time.taken
Time difference of 5.135418 secs
 
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
   # seed equals 10
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100, seed = 10)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
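Note: for anyone re-running the timing comparison above, this is a minimal sketch of supplying spark.sparkr.use.daemon when the SparkR session is created. The local[*] master is an assumption of this sketch; the test suite manages its own session.

library(SparkR)

# Start a session with the R worker daemon disabled (local master assumed for this sketch).
sparkR.session(master = "local[*]",
               sparkConfig = list(spark.sparkr.use.daemon = "false"))

# Confirm the setting, as in the review comment above.
sparkR.conf("spark.sparkr.use.daemon")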
@@ -102,10 +102,18 @@ test_that("spark.glm and predict", {
 })
 
 test_that("spark.glm summary", {
+  # prepare dataset
+  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
+  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
+  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
+  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
+               "versicolor", "virginica")
+  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
+
   # gaussian family
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
 
   # test summary coefficients return matrix type
   expect_true(class(stats$coefficients) == "matrix")
@@ -126,15 +134,15 @@ test_that("spark.glm summary", {
 
   out <- capture.output(print(stats))
   expect_match(out[2], "Deviance Residuals:")
-  expect_true(any(grepl("AIC: 59.22", out)))
+  expect_true(any(grepl("AIC: 35.84", out)))
Review comment: R glm's AIC is 35.839:
> out <- capture.output(print(rStats))
> out
 [1] ""
 [2] "Call:"
 [3] "glm(formula = Sepal.Width ~ Sepal.Length + Species, data = dataset)"
 [4] ""
 [5] "Deviance Residuals: "
 [6] "      1        2        3        4        5        6        7        8  "
 [7] " 0.0000  -1.4932   1.5491   0.5411  -0.8581  -1.2228  -0.5969   2.0809  "
 [8] ""
 [9] "Coefficients:"
[10] "                  Estimate Std. Error t value Pr(>|t|)"
[11] "(Intercept)         1.7150     2.0492   0.837    0.450"
[12] "Sepal.Length        0.1925     0.5566   0.346    0.747"
[13] "Speciesversicolor   1.7894     1.9240   0.930    0.405"
[14] "Speciesvirginica    1.2613     2.0735   0.608    0.576"
[15] ""
[16] "(Dispersion parameter for gaussian family taken to be 2.960032)"
[17] ""
[18] "    Null deviance: 14.719  on 7  degrees of freedom"
[19] "Residual deviance: 11.840  on 4  degrees of freedom"
[20] "AIC: 35.839"
[21] ""
[22] "Number of Fisher Scoring iterations: 2"
[23] ""
 
   # binomial family
-  df <- suppressWarnings(createDataFrame(iris))
+  df <- suppressWarnings(createDataFrame(dataset))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
                              family = binomial(link = "logit")))
 
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
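Note: as a hand check of the updated AIC expectation above (outside the test itself): for a gaussian-family GLM fitted to n = 8 rows with 4 coefficients plus the dispersion parameter, the AIC follows directly from the residual deviance reported in the summary.

n   <- 8        # rows in the fixed dataset defined above
rss <- 11.840   # residual deviance from the R glm summary above
k   <- 5        # 4 estimated coefficients + 1 dispersion parameter

logLik_hat <- -n / 2 * (log(2 * pi) + log(rss / n) + 1)
-2 * logLik_hat + 2 * k   # ~35.84, matching "AIC: 35.839" and the new expectation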
@@ -174,17 +182,17 @@ test_that("spark.glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # Test spark.glm works with offset
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                              family = poisson(), offsetCol = "Petal_Length"))
   rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
-    data = iris, family = poisson(), offset = iris$Petal.Length)))
+    data = dataset, family = poisson(), offset = dataset$Petal.Length)))
   expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))
 
   # Test summary works on base GLM models
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = dataset)
   baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+  expect_true(abs(baseSummary$deviance - 11.84013) < 1e-4)
Review comment: R glm:
> baseSummary <- summary(stats::glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
> baseSummary$deviance
[1] 11.84013

Spark glm:
> baseSummary <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
> baseSummary$deviance
[1] 11.84013
 
   # Test spark.glm works with regularization parameter
   data <- as.data.frame(cbind(a1, a2, b))
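Note: a short check of the deviance values quoted in the comment above, not an addition to the test. For the gaussian family, the residual deviance of a GLM is simply the residual sum of squares, so it can be recomputed directly from the base-R fit on the new dataset.

baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = dataset)

# For the gaussian family, deviance == residual sum of squares.
sum(residuals(baseModel)^2)   # ~11.84013
baseModel$deviance            # same value, used in the updated expectation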
@@ -300,11 +308,19 @@ test_that("glm and predict", {
 })
 
 test_that("glm summary", {
+  # prepare dataset
+  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
+  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
+  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
+  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
+               "versicolor", "virginica")
+  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
+
   # gaussian family
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
 
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
 
   coefs <- stats$coefficients
   rCoefs <- rStats$coefficients
@@ -320,12 +336,12 @@ test_that("glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # binomial family
-  df <- suppressWarnings(createDataFrame(iris))
+  df <- suppressWarnings(createDataFrame(dataset))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
                        family = binomial(link = "logit")))
 
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
Review comment: Actually, I think we may remove the "Test predict method" test. It seems to me that, with tol = 0.5, the prediction may not be very meaningful.