apache · mgaido91 · Dec 15, 2017 · Dec 19, 2017 · Dec 19, 2017 · Dec 19, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -34,9 +34,9 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
 /**
  * `Bucketizer` maps a column of continuous features to a column of feature buckets. Since 2.3.0,
  * `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note that
- * when both the `inputCol` and `inputCols` parameters are set, a log warning will be printed and
- * only `inputCol` will take effect, while `inputCols` will be ignored. The `splits` parameter is
- * only used for single column usage, and `splitsArray` is for multiple columns.
+ * when both the `inputCol` and `inputCols` parameters are set, an `IllegalArgumentException` is
+ * thrown. The `splits` parameter is only used for single column usage, and `splitsArray` is for
+ * multiple columns.
  */
 @Since("1.4.0")
 final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)
@@ -137,18 +137,17 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
   /**
    * Determines whether this `Bucketizer` is going to map multiple columns. If and only if
    * `inputCols` is set, it will map multiple columns. Otherwise, it just maps a column specified
-   * by `inputCol`. A warning will be printed if both are set.
+   * by `inputCol`. Throws `IllegalArgumentException` on conflicting single/multi-column params.
    */
   private[feature] def isBucketizeMultipleColumns(): Boolean = {
-    if (isSet(inputCols) && isSet(inputCol)) {
-      logWarning("Both `inputCol` and `inputCols` are set, we ignore `inputCols` and this " +
-        "`Bucketizer` only map one column specified by `inputCol`")
-      false
-    } else if (isSet(inputCols)) {
-      true
-    } else {
-      false
+    ParamValidators.assertColOrCols(this)
+    if (isSet(inputCol) && isSet(splitsArray)) {
+      ParamValidators.raiseIncompatibleParamsException("inputCol", "splitsArray")
+    }
+    if (isSet(inputCols) && isSet(splits)) {
+      ParamValidators.raiseIncompatibleParamsException("inputCols", "splits")
     }
+    isSet(inputCols)
   }
 
   @Since("2.0.0")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -31,6 +31,7 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.ml.linalg.{JsonMatrixConverter, JsonVectorConverter, Matrix, Vector}
+import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.Identifiable
 
 /**
@@ -249,6 +250,29 @@ object ParamValidators {
   def arrayLengthGt[T](lowerBound: Double): Array[T] => Boolean = { (value: Array[T]) =>
     value.length > lowerBound
   }
+
+  /**
+   * Rejects a stage on which a single-column param (`inputCol`, `outputCol`) and a multi-column
+   * param (`inputCols`, `outputCols`) are both set. Only the first conflicting pair is reported.
+   * @param model the stage whose params are inspected
+   * @throws IllegalArgumentException naming the first conflicting pair of params
+   */
+  def assertColOrCols(model: Params): Unit = model match {
+    case m: HasInputCols with HasInputCol if m.isSet(m.inputCols) && m.isSet(m.inputCol) =>
+      raiseIncompatibleParamsException("inputCols", "inputCol")
+    case m: HasOutputCols with HasInputCol if m.isSet(m.outputCols) && m.isSet(m.inputCol) =>
+      raiseIncompatibleParamsException("outputCols", "inputCol")
+    case m: HasInputCols with HasOutputCol if m.isSet(m.inputCols) && m.isSet(m.outputCol) =>
+      raiseIncompatibleParamsException("inputCols", "outputCol")
+    case m: HasOutputCols with HasOutputCol if m.isSet(m.outputCols) && m.isSet(m.outputCol) =>
+      raiseIncompatibleParamsException("outputCols", "outputCol")
+    case _ => // no conflicting pair is set
+  }
+
+  /** Always throws an `IllegalArgumentException` naming two params that must not both be set. */
+  def raiseIncompatibleParamsException(paramName1: String, paramName2: String): Nothing = {
+    throw new IllegalArgumentException(s"Both `$paramName1` and `$paramName2` are set.")
+  }
 }
 
 // specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ...

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
@@ -402,14 +402,33 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
   }
 
   test("Both inputCol and inputCols are set") {
-    val bucket = new Bucketizer()
+    val feature1 = Array(-0.5, -0.3, 0.0, 0.2)
+    val feature2 = Array(-0.3, -0.2, 0.5, 0.0)
+    val df = feature1.zip(feature2).toSeq.toDF("feature1", "feature2")
+
+    val invalid1 = new Bucketizer()
       .setInputCol("feature1")
       .setOutputCol("result")
       .setSplits(Array(-0.5, 0.0, 0.5))
       .setInputCols(Array("feature1", "feature2"))
 
-    // When both are set, we ignore `inputCols` and just map the column specified by `inputCol`.
-    assert(bucket.isBucketizeMultipleColumns() == false)
+    val invalid2 = new Bucketizer()
+      .setOutputCol("result")
+      .setSplits(Array(-0.5, 0.0, 0.5))
+      .setInputCols(Array("feature1", "feature2"))
+
+    val invalid3 = new Bucketizer()
+      .setInputCol("feature1")
+      .setSplits(Array(-0.5, 0.0, 0.5))
+      .setOutputCols(Array("result1", "result2"))
+
+    Seq(invalid1, invalid2, invalid3).foreach { bucketizer =>
+      // Each stage mixes single- and multi-column params; the message names the offending pair.
+      val e = intercept[IllegalArgumentException] {
+        bucketizer.transform(df)
+      }
+      assert("Both `\\w+` and `\\w+` are set".r.findFirstIn(e.getMessage).isDefined)
+    }
   }
 }