Track mean & standard deviation of text length as a metric for text feature #354

Merged: 78 commits from tn/cardinality into master on Aug 2, 2019 (diff below shows changes from 9 commits)

Commits (78):
d504ace
starter code
TuanNguyen27 Jul 2, 2019
9cd2790
spaghetti code
TuanNguyen27 Jul 2, 2019
61c52c7
better place to put avgTextLen
TuanNguyen27 Jul 3, 2019
bf1c283
first fix of unit test
TuanNguyen27 Jul 3, 2019
d64d087
fix most tests
TuanNguyen27 Jul 3, 2019
3f5da2c
fix some styles
TuanNguyen27 Jul 3, 2019
6bb0256
fix more style
TuanNguyen27 Jul 3, 2019
0f91a40
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 3, 2019
cebf02d
handling division by zero
TuanNguyen27 Jul 3, 2019
c1e50ca
address comments
TuanNguyen27 Jul 5, 2019
75da25e
adding some doc on how to use text len cardinality
TuanNguyen27 Jul 8, 2019
2d7c233
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 8, 2019
9e0e2f9
add default value for avg text len
TuanNguyen27 Jul 8, 2019
47ad700
add docs
TuanNguyen27 Jul 8, 2019
42a47ba
fix scala style
TuanNguyen27 Jul 8, 2019
9082b86
delete extra line
TuanNguyen27 Jul 8, 2019
1c2235e
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 9, 2019
387f0ea
remove avgtextLength from doc
TuanNguyen27 Jul 9, 2019
fd64fd8
Merge branch 'tn/cardinality' of https://github.com/salesforce/Transm…
TuanNguyen27 Jul 9, 2019
0b27fed
starter code on moments & textstat
TuanNguyen27 Jul 10, 2019
3d1eea9
fix moments aggregation?
TuanNguyen27 Jul 10, 2019
af3d467
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 11, 2019
34da327
still broken
TuanNguyen27 Jul 11, 2019
37ff005
Merge branch 'tn/cardinality' of https://github.com/salesforce/Transm…
TuanNguyen27 Jul 11, 2019
7a69049
Merge branch 'master' into tn/cardinality
leahmcguire Jul 11, 2019
0a0ba98
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 12, 2019
35b6fe9
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 12, 2019
06ae7e3
finsh semi group adding logic
TuanNguyen27 Jul 15, 2019
88c7726
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 18, 2019
273fc8c
removing the old implementation
TuanNguyen27 Jul 19, 2019
6baf8a0
removing redundant code
TuanNguyen27 Jul 22, 2019
d2ad2ec
remove redundant changes to tests
TuanNguyen27 Jul 22, 2019
5c67bd2
remove more extra stuff
TuanNguyen27 Jul 22, 2019
4127a87
bump default value for maxCard here
TuanNguyen27 Jul 22, 2019
c6dff11
make cardinality and moments work across both text and numeric features
TuanNguyen27 Jul 23, 2019
0728855
rename variables
TuanNguyen27 Jul 23, 2019
797d5e6
wip
TuanNguyen27 Jul 24, 2019
3366362
wip
TuanNguyen27 Jul 24, 2019
2b09292
FeatureDistribution update
TuanNguyen27 Jul 24, 2019
f634049
wip
TuanNguyen27 Jul 25, 2019
f5c68a3
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 25, 2019
6e22db9
moving cardinality and moments calculation into feature distribution
TuanNguyen27 Jul 25, 2019
34af0eb
Merge branch 'tn/cardinality' of https://github.com/salesforce/Transm…
TuanNguyen27 Jul 25, 2019
0eb5f8d
wip
TuanNguyen27 Jul 25, 2019
10af178
update some compiler error
TuanNguyen27 Jul 25, 2019
0b902a0
update test
TuanNguyen27 Jul 26, 2019
520ac05
fix scala style
TuanNguyen27 Jul 26, 2019
1ea54e5
more fix
TuanNguyen27 Jul 26, 2019
c62d2db
wip
TuanNguyen27 Jul 26, 2019
52c0fc3
fix style error
TuanNguyen27 Jul 26, 2019
315a21a
update test to reflect new members of FeatureDistribution case class
TuanNguyen27 Jul 26, 2019
3c8f196
update printing
TuanNguyen27 Jul 26, 2019
be94fe3
added some tests
TuanNguyen27 Jul 27, 2019
bd4ef29
Update FeatureDistributionTest.scala
TuanNguyen27 Jul 28, 2019
d24d863
fix scala style
TuanNguyen27 Jul 28, 2019
9135987
add docs
TuanNguyen27 Jul 29, 2019
48a742f
move maxCard param to companion object
TuanNguyen27 Jul 29, 2019
3ab4f91
scala style fix
TuanNguyen27 Jul 29, 2019
2825443
fix string conversion
TuanNguyen27 Jul 29, 2019
3973b72
move MaxCardinality to RFF companion object
TuanNguyen27 Jul 29, 2019
5a99fc0
Try fixing test
TuanNguyen27 Jul 29, 2019
9f12d0b
Merge branch 'master' into tn/cardinality
TuanNguyen27 Jul 30, 2019
8f454d1
wip, need to make a bigger dummy dataset
TuanNguyen27 Jul 30, 2019
2fd2c68
Merge branch 'tn/cardinality' of https://github.com/salesforce/Transm…
TuanNguyen27 Jul 30, 2019
de826d5
more idiomatic scala
TuanNguyen27 Jul 30, 2019
8811be5
test Tuple2Semigroup
TuanNguyen27 Jul 30, 2019
6678746
wip
TuanNguyen27 Jul 30, 2019
4a569dc
wip
TuanNguyen27 Jul 30, 2019
d1d7dc0
wip
TuanNguyen27 Jul 30, 2019
29d1925
update test
TuanNguyen27 Jul 31, 2019
d72004f
fix scala style
TuanNguyen27 Jul 31, 2019
6802558
removing verbose lines
TuanNguyen27 Jul 31, 2019
88c082f
clean up test for cardinality and moments
TuanNguyen27 Aug 1, 2019
5b71fca
fix scala style
TuanNguyen27 Aug 1, 2019
85d207f
clean up summation of Option[Moments]
TuanNguyen27 Aug 1, 2019
c9bdc27
Changed the TextStats SemiGroup to a Monoid so that we can make an Op…
Jauntbox Aug 2, 2019
d3c36f2
Fix merge conflict
Jauntbox Aug 2, 2019
4736a9c
Merge branch 'master' into tn/cardinality
TuanNguyen27 Aug 2, 2019
6 changes: 5 additions & 1 deletion core/src/main/scala/com/salesforce/op/OpWorkflow.scala
@@ -524,12 +524,14 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
def withRawFeatureFilter[T](
trainingReader: Option[Reader[T]],
scoringReader: Option[Reader[T]],
bins: Int = 100,
bins: Int = 500,
minFillRate: Double = 0.001,
maxFillDifference: Double = 0.90,
maxFillRatioDiff: Double = 20.0,
maxJSDivergence: Double = 0.90,
maxCorrelation: Double = 0.95,
pvalCutoff: Double = 0.05,
minTextLen: Double = 100,
correlationType: CorrelationType = CorrelationType.Pearson,
protectedFeatures: Array[OPFeature] = Array.empty,
protectedJSFeatures: Array[OPFeature] = Array.empty,
@@ -552,6 +554,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
maxFillRatioDiff = maxFillRatioDiff,
maxJSDivergence = maxJSDivergence,
maxCorrelation = maxCorrelation,
pvalCutoff = pvalCutoff,
minTextLen = minTextLen,
correlationType = correlationType,
protectedFeatures = protectedRawFeatures,
jsDivergenceProtectedFeatures = protectedRawJSFeatures,
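A minimal usage sketch of the two new knobs (trainReader, scoreReader, and prediction are hypothetical names defined elsewhere in a user's workflow; all other thresholds keep their defaults):

// Hypothetical caller-side sketch, not part of this diff.
val workflow = new OpWorkflow()
  .setResultFeatures(prediction)
  .withRawFeatureFilter(
    trainingReader = Option(trainReader),
    scoringReader = Option(scoreReader),
    bins = 500,         // histogram resolution fed into the chi-squared uniformity test
    pvalCutoff = 0.05,  // p-value cutoff for the goodness-of-fit test
    minTextLen = 100    // text features averaging fewer characters become drop candidates
  )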
AllFeatureInformation.scala
@@ -42,6 +42,7 @@ package com.salesforce.op.filters
* 1st level keys correspond to response keys
* 2nd level keys correspond to predictor keys with values being
* null-label leakage corr. value
* @param avgTextLen average length of text features
*/
private[op] case class AllFeatureInformation
(
FeatureDistribution.scala
@@ -38,7 +38,9 @@ import com.salesforce.op.utils.json.EnumEntrySerializer
import com.twitter.algebird.Monoid._
import com.twitter.algebird.Operators._
import com.twitter.algebird.Semigroup
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.stat.Statistics
import org.json4s.jackson.Serialization
import org.json4s.{DefaultFormats, Formats}

@@ -51,6 +53,7 @@ import scala.util.Try
* @param key map key associated with distribution (when the feature is a map)
* @param count total count of feature seen
* @param nulls number of empties seen in feature
* @param avgTextLen average length of the text (only applicable to text features)
* @param distribution binned counts of feature values (hashed for strings, evenly spaced bins for numerics)
* @param summaryInfo either min and max number of tokens for text data, or splits used for bins for numeric data
* @param `type` feature distribution type: training or scoring
@@ -61,6 +64,7 @@ case class FeatureDistribution
key: Option[String],
count: Long,
nulls: Long,
avgTextLen: Double,
distribution: Array[Double],
summaryInfo: Array[Double],
`type`: FeatureDistributionType = FeatureDistributionType.Training
@@ -91,6 +95,23 @@ case class FeatureDistribution
*/
def fillRate(): Double = if (count == 0L) 0.0 else (count - nulls) / count.toDouble

/**
* Test whether the given distribution is uniform, for detecting uninformative text hashes
*
* @param cutoff p-value cutoff for the chi-squared goodness-of-fit test
* @return true means there is not enough evidence to reject the null hypothesis (the current
*         distribution is uniform), so the feature is likely to be dropped, pending the average
*         text length check. Note that if the hash space is too small relative to the number of
*         distinct values, a meaningful text feature could still appear uniformly distributed.
*         false means the null hypothesis is rejected: the hashed feature does not follow a
*         uniform distribution, but it could still be uninformative
*/
def chiSqUnifTest(cutoff: Double): Boolean = {
// Pearson's chi-squared goodness-of-fit test against the uniform distribution
val vectorizedDistr = Vectors.dense(distribution)
val goodnessOfFitTestResult = Statistics.chiSqTest(vectorizedDistr)
goodnessOfFitTestResult.pValue >= cutoff
}
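// Illustration (assumed toy values, not part of this class): a perfectly flat histogram
// fails to reject uniformity at cutoff 0.05, while a heavily skewed one rejects it:
//   val flat = FeatureDistribution("text", None, 100L, 0L, 12.0, Array(25.0, 25.0, 25.0, 25.0), Array.empty)
//   val skewed = flat.copy(distribution = Array(97.0, 1.0, 1.0, 1.0))
//   flat.chiSqUnifTest(cutoff = 0.05)   // true: indistinguishable from uniform, drop candidate
//   skewed.chiSqUnifTest(cutoff = 0.05) // false: clearly non-uniform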

/**
* Combine feature distributions
*
@@ -100,9 +121,11 @@ case class FeatureDistribution
def reduce(fd: FeatureDistribution): FeatureDistribution = {
checkMatch(fd)
val combinedDist = distribution + fd.distribution
// weighted average of per-row text lengths, guarding against division by zero
val combinedCount = count + fd.count
val combinedAvgTextLen = if (combinedCount > 0) (avgTextLen * count + fd.avgTextLen * fd.count) / combinedCount else 0.0
// summary info can be empty or min max if hist is empty but should otherwise match so take the longest info
val combinedSummary = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo
FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist, combinedSummary, `type`)
FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls,
combinedAvgTextLen, combinedDist, combinedSummary, `type`)
}
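// Worked example of the weighted combine (assumed toy values): 10 rows averaging 6 chars
// merged with 30 rows averaging 10 chars gives (6 * 10 + 10 * 30) / 40 = 9.0:
//   val a = FeatureDistribution("text", None, 10L, 0L, 6.0, Array(1.0, 2.0), Array.empty)
//   val b = FeatureDistribution("text", None, 30L, 0L, 10.0, Array(3.0, 4.0), Array.empty)
//   a.reduce(b) // count = 40, avgTextLen = 9.0, distribution = [4.0, 6.0]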

/**
@@ -154,6 +177,7 @@ case class FeatureDistribution
"key" -> key,
"count" -> count.toString,
"nulls" -> nulls.toString,
"avgTextLen" -> avgTextLen.toString,
"distribution" -> distribution.mkString("[", ",", "]"),
"summaryInfo" -> summaryInfo.mkString("[", ",", "]")
).map { case (n, v) => s"$n = $v" }.mkString(", ")
@@ -162,7 +186,7 @@ case class FeatureDistribution
}

override def equals(that: Any): Boolean = that match {
case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, `type`) =>
case FeatureDistribution(`name`, `key`, `count`, `nulls`, `avgTextLen`, d, s, `type`) =>
distribution.deep == d.deep && summaryInfo.deep == s.deep
case _ => false
}
@@ -224,12 +248,16 @@ object FeatureDistribution {
val (nullCount, (summaryInfo, distribution)) =
value.map(seq => 0L -> histValues(seq, summary, bins, textBinsFormula))
.getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins)))

val avgTextLen = value match {
// average character length across the text values (toDouble avoids integer division)
case Some(Left(v)) => if (v.nonEmpty) v.map(_.length).sum.toDouble / v.size else 0.0
case _ => 0.0
}
FeatureDistribution(
name = name,
key = key,
count = 1L,
nulls = nullCount,
avgTextLen = avgTextLen,
summaryInfo = summaryInfo,
distribution = distribution,
`type` = `type`
RawFeatureFilter.scala
@@ -97,6 +97,8 @@ class RawFeatureFilter[T]
val maxFillRatioDiff: Double,
val maxJSDivergence: Double,
val maxCorrelation: Double,
val pvalCutoff: Double,
val minTextLen: Double,
val correlationType: CorrelationType = CorrelationType.Pearson,
val jsDivergenceProtectedFeatures: Set[String] = Set.empty,
val protectedFeatures: Set[String] = Set.empty,
@@ -320,6 +322,8 @@ class RawFeatureFilter[T]
message = s"Features excluded because training fill rate did not meet min required ($minFill)"
)

val uniformFtDistribution: Seq[Boolean] = trainingDistribs.map(_.chiSqUnifTest(pvalCutoff))
val avgTextLenTest: Seq[Boolean] = trainingDistribs.map(_.avgTextLen < minTextLen)
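// Hypothetical combination of the two new checks (a sketch; the actual exclusion wiring
// continues below): a hashed text feature is a drop candidate only when it both looks
// uniform and is short on average.
val textFeatureDropCandidates: Seq[Boolean] =
uniformFtDistribution.zip(avgTextLenTest).map { case (isUniform, isShort) => isUniform && isShort }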
val trainingNullLabelLeakers: Seq[Boolean] = rawFeatureFilterMetrics.map(_.trainingNullLabelAbsoluteCorr).map {
case Some(corr) => corr > maxCorrelation
case None => false
OpWorkflowModelReaderWriterTest.scala
@@ -87,8 +87,8 @@ class OpWorkflowModelReaderWriterTest
aggregateParams = null
)

val distributions = Array(FeatureDistribution("a", None, 1L, 1L, Array(1.0), Array(1.0)),
FeatureDistribution("b", Option("b"), 2L, 2L, Array(2.0), Array(2.0)))
val distributions = Array(FeatureDistribution("a", None, 1L, 1L, 0, Array(1.0), Array(1.0)),
FeatureDistribution("b", Option("b"), 2L, 2L, 0, Array(2.0), Array(2.0)))

val rawFeatureFilterResults = RawFeatureFilterResults(rawFeatureDistributions = distributions)

FeatureDistributionTest.scala
@@ -134,42 +134,42 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
}

it should "correctly compare fill rates" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array.empty, Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, Array.empty, Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array.empty, Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, 0, Array.empty, Array.empty)
fd1.relativeFillRate(fd2) shouldBe 0.9
}

it should "correctly compare relative fill rates" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array.empty, Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 19, Array.empty, Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array.empty, Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 19, 0, Array.empty, Array.empty)
trainSummaries(0).relativeFillRatio(scoreSummaries(0)) shouldBe 4.5
trainSummaries(2).relativeFillRatio(scoreSummaries(2)) shouldBe 1.0
fd1.relativeFillRatio(fd2) shouldBe 18.0
}

it should "correctly compute the DS divergence" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty)
fd1.jsDivergence(fd2) should be < eps

val fd3 = FeatureDistribution("A", None, 10, 1, Array(0, 0, 1000, 1000, 0), Array.empty)
val fd3 = FeatureDistribution("A", None, 10, 1, 0, Array(0, 0, 1000, 1000, 0), Array.empty)
fd3.jsDivergence(fd3) should be < eps
val fd4 = FeatureDistribution("A", None, 20, 20, Array(200, 800, 0, 0, 1200), Array.empty)
val fd4 = FeatureDistribution("A", None, 20, 20, 0, Array(200, 800, 0, 0, 1200), Array.empty)
(fd3.jsDivergence(fd4) - 1.0) should be < eps
}

it should "reduce correctly" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
val res = FeatureDistribution("A", None, 30, 21, Array(3.0, 12.0, 0.0, 0.0, 18.0), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty)
val res = FeatureDistribution("A", None, 30, 21, 0, Array(3.0, 12.0, 0.0, 0.0, 18.0), Array.empty)

fd1.reduce(fd2) shouldBe res
FeatureDistribution.semigroup.plus(fd1, fd2) shouldBe res
}

it should "have equals" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty)
fd1 shouldBe fd1
fd1.equals("blarg") shouldBe false
fd1 shouldBe fd1.copy(summaryInfo = Array.empty)
@@ -178,23 +178,23 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
}

it should "have hashCode" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty)
fd1.hashCode() shouldBe fd1.hashCode()
fd1.hashCode() shouldBe fd1.copy(summaryInfo = fd1.summaryInfo).hashCode()
fd1.hashCode() should not be fd1.copy(summaryInfo = Array.empty).hashCode()
fd1.hashCode() should not be fd2.hashCode()
}

it should "have toString" in {
FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty).toString() shouldBe
FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty).toString() shouldBe
"FeatureDistribution(type = Training, name = A, key = None, count = 10, nulls = 1, " +
"distribution = [1.0,4.0,0.0,0.0,6.0], summaryInfo = [])"
"avgTextLen = 0.0, distribution = [1.0,4.0,0.0,0.0,6.0], summaryInfo = [])"
}

it should "marshall to/from json" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty)
val fd2 = FeatureDistribution("A", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty)
val json = FeatureDistribution.toJson(Array(fd1, fd2))
FeatureDistribution.fromJson(json) match {
case Success(r) => r shouldBe Seq(fd1, fd2)
@@ -203,11 +203,12 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
}

it should "marshall to/from json with default vector args" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty, FeatureDistributionType.Scoring)
val fd2 = FeatureDistribution("A", Some("X"), 20, 20, Array(2, 8, 0, 0, 12), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0.0, Array(1, 4, 0, 0, 6),
Array.empty, FeatureDistributionType.Scoring)
val fd2 = FeatureDistribution("A", Some("X"), 20, 20, 0.0, Array(2, 8, 0, 0, 12), Array.empty)
val json =
"""[{"name":"A","count":10,"nulls":1,"distribution":[1.0,4.0,0.0,0.0,6.0],"type":"Scoring"},
|{"name":"A","key":"X","count":20,"nulls":20,"distribution":[2.0,8.0,0.0,0.0,12.0]}]
"""[{"name":"A","count":10,"nulls":1,"avgTextLen":0.0,"distribution":[1.0,4.0,0.0,0.0,6.0],"type":"Scoring"},
|{"name":"A","key":"X","count":20,"nulls":20,"avgTextLen":0.0,"distribution":[2.0,8.0,0.0,0.0,12.0]}]
|""".stripMargin

FeatureDistribution.fromJson(json) match {
@@ -217,7 +218,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
}

it should "error on mismatching feature name, key or type" in {
val fd1 = FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty)
val fd1 = FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty)

intercept[IllegalArgumentException](fd1.reduce(fd1.copy(name = "boo"))) should have message
"requirement failed: Name must match to compare or combine feature distributions: A != boo"
FiltersTestData.scala
@@ -37,20 +37,20 @@ trait FiltersTestData {
protected val eps = 1E-2

protected val trainSummaries = Seq(
FeatureDistribution("A", None, 10, 1, Array(1, 4, 0, 0, 6), Array.empty),
FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty),
FeatureDistribution("C", Some("1"), 10, 1, Array(1, 4, 0, 0, 6), Array.empty),
FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty),
FeatureDistribution("D", Some("1"), 10, 9, Array(1, 4, 0, 0, 6), Array.empty),
FeatureDistribution("D", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty)
FeatureDistribution("A", None, 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty),
FeatureDistribution("B", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty),
FeatureDistribution("C", Some("1"), 10, 1, 0, Array(1, 4, 0, 0, 6), Array.empty),
FeatureDistribution("C", Some("2"), 20, 19, 0, Array(2, 8, 0, 0, 12), Array.empty),
FeatureDistribution("D", Some("1"), 10, 9, 0, Array(1, 4, 0, 0, 6), Array.empty),
FeatureDistribution("D", Some("2"), 20, 19, 0, Array(2, 8, 0, 0, 12), Array.empty)
)

protected val scoreSummaries = Seq(
FeatureDistribution("A", None, 10, 8, Array(1, 4, 0, 0, 6), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("B", None, 20, 20, Array(2, 8, 0, 0, 12), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("C", Some("1"), 10, 1, Array(0, 0, 10, 10, 0), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("C", Some("2"), 20, 19, Array(2, 8, 0, 0, 12), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("D", Some("1"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("D", Some("2"), 0, 0, Array(0, 0, 0, 0, 0), Array.empty, FeatureDistributionType.Scoring)
FeatureDistribution("A", None, 10, 8, 0, Array(1, 4, 0, 0, 6), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("B", None, 20, 20, 0, Array(2, 8, 0, 0, 12), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("C", Some("1"), 10, 1, 0, Array(0, 0, 10, 10, 0), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("C", Some("2"), 20, 19, 0, Array(2, 8, 0, 0, 12), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("D", Some("1"), 0, 0, 0, Array(0, 0, 0, 0, 0), Array.empty, FeatureDistributionType.Scoring),
FeatureDistribution("D", Some("2"), 0, 0, 0, Array(0, 0, 0, 0, 0), Array.empty, FeatureDistributionType.Scoring)
)
}