apache · viirya · Jan 3, 2018 · Jan 7, 2018 · Jan 16, 2018 · Jan 24, 2018
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -313,7 +313,7 @@ test_that("spark.mlp", {
   # Test predict method
   mlpTestDF <- df
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
+  expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
 
   # Test model save/load
   if (windows_with_hadoop()) {
@@ -348,12 +348,12 @@ test_that("spark.mlp", {
 
   # Test random seed
   # default seed
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
   # seed equals 10
-  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
+  model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 100, seed = 10)
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
   expect_equal(head(mlpPredictions$prediction, 10),
                c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))

diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -102,10 +102,18 @@ test_that("spark.glm and predict", {
 })
 
 test_that("spark.glm summary", {
+  # prepare dataset
+  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
+  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
+  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
+  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
+               "versicolor", "virginica")
+  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
+
   # gaussian family
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
 
   # test summary coefficients return matrix type
   expect_true(class(stats$coefficients) == "matrix")
@@ -126,15 +134,15 @@ test_that("spark.glm summary", {
 
   out <- capture.output(print(stats))
   expect_match(out[2], "Deviance Residuals:")
-  expect_true(any(grepl("AIC: 59.22", out)))
+  expect_true(any(grepl("AIC: 35.84", out)))
 
   # binomial family
-  df <- suppressWarnings(createDataFrame(iris))
+  df <- suppressWarnings(createDataFrame(dataset))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
                              family = binomial(link = "logit")))
 
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
 
@@ -174,17 +182,17 @@ test_that("spark.glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # Test spark.glm works with offset
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                              family = poisson(), offsetCol = "Petal_Length"))
   rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
-                        data = iris, family = poisson(), offset = iris$Petal.Length)))
+                        data = dataset, family = poisson(), offset = dataset$Petal.Length)))
   expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))
 
   # Test summary works on base GLM models
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = dataset)
   baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+  expect_true(abs(baseSummary$deviance - 11.84013) < 1e-4)
 
   # Test spark.glm works with regularization parameter
   data <- as.data.frame(cbind(a1, a2, b))
@@ -300,11 +308,19 @@ test_that("glm and predict", {
 })
 
 test_that("glm summary", {
+  # prepare dataset
+  Sepal.Length <- c(2.0, 1.5, 1.8, 3.4, 5.1, 1.8, 1.0, 2.3)
+  Sepal.Width <- c(2.1, 2.3, 5.4, 4.7, 3.1, 2.1, 3.1, 5.5)
+  Petal.Length <- c(1.8, 2.1, 7.1, 2.5, 3.7, 6.3, 2.2, 7.2)
+  Species <- c("setosa", "versicolor", "versicolor", "versicolor", "virginica", "virginica",
+               "versicolor", "virginica")
+  dataset <- data.frame(Sepal.Length, Sepal.Width, Petal.Length, Species, stringsAsFactors = TRUE)
+
   # gaussian family
-  training <- suppressWarnings(createDataFrame(iris))
+  training <- suppressWarnings(createDataFrame(dataset))
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
 
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = dataset))
 
   coefs <- stats$coefficients
   rCoefs <- rStats$coefficients
@@ -320,12 +336,12 @@ test_that("glm summary", {
   expect_equal(stats$aic, rStats$aic)
 
   # binomial family
-  df <- suppressWarnings(createDataFrame(iris))
+  df <- suppressWarnings(createDataFrame(dataset))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
                        family = binomial(link = "logit")))
 
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rTraining <- dataset[dataset$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))