Change minimumOccurence to minDocFreq in code and docs
rnowling committed Sep 23, 2014
1 parent 47850ab commit 40fd70c
Showing 4 changed files with 15 additions and 15 deletions.
4 changes: 2 additions & 2 deletions docs/mllib-feature-extraction.md
@@ -85,14 +85,14 @@ val tfidf: RDD[Vector] = idf.transform(tf)

MLLib's IDF implementation provides an option for ignoring terms which occur in less than a
minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
- can be used by passing the `minimumOccurence` value to the IDF constructor.
+ can be used by passing the `minDocFreq` value to the IDF constructor.

{% highlight scala %}
import org.apache.spark.mllib.feature.IDF

// ... continue from the previous example
tf.cache()
- val idf = new IDF(minimumOccurence=2).fit(tf)
+ val idf = new IDF(minDocFreq=2).fit(tf)
val tfidf: RDD[Vector] = idf.transform(tf)
{% endhighlight %}

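For context, a minimal end-to-end sketch of the renamed parameter in use, in the spirit of the docs snippet above. It assumes an existing SparkContext `sc` (e.g. from spark-shell); the corpus here is an illustrative placeholder, not part of the commit.

{% highlight scala %}
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Illustrative toy corpus; `sc` is an existing SparkContext.
val documents: RDD[Seq[String]] = sc.parallelize(Seq(
  Seq("spark", "mllib", "tf", "idf"),
  Seq("spark", "streaming"),
  Seq("spark", "sql", "tf")))

// Hash each document into a term-frequency vector.
val tf: RDD[Vector] = new HashingTF().transform(documents)
tf.cache()

// With minDocFreq = 2, terms that appear in fewer than 2 documents
// get an IDF of 0 and therefore a TF-IDF of 0.
val idfModel = new IDF(minDocFreq = 2).fit(tf)
val tfidf: RDD[Vector] = idfModel.transform(tf)
{% endhighlight %}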
20 changes: 10 additions & 10 deletions mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -32,14 +32,14 @@ import org.apache.spark.rdd.RDD
* number of documents and `d(t)` is the number of documents that contain term `t`.
*
* This implementation supports filtering out terms which do not appear in a minimum number
- * of documents (controlled by the variable minimumOccurence). For terms that are not in
- * at least `minimumOccurence` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+ * of documents (controlled by the variable `minDocFreq`). For terms that are not in
+ * at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
*
- * @param minimumOccurence minimum of documents in which a term
- *                         should appear for filtering
+ * @param minDocFreq minimum of documents in which a term
+ *                   should appear for filtering
*/
@Experimental
- class IDF(val minimumOccurence: Int) {
+ class IDF(val minDocFreq: Int) {

def this() = this(0)

@@ -51,7 +51,7 @@ class IDF(val minimumOccurence: Int) {
*/
def fit(dataset: RDD[Vector]): IDFModel = {
val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
-     minimumOccurence=minimumOccurence))(
+     minDocFreq=minDocFreq))(
seqOp = (df, v) => df.add(v),
combOp = (df1, df2) => df1.merge(df2)
).idf()
@@ -70,7 +70,7 @@ class IDF(val minimumOccurence: Int) {
private object IDF {

/** Document frequency aggregator. */
- class DocumentFrequencyAggregator(val minimumOccurence: Int) extends Serializable {
+ class DocumentFrequencyAggregator(val minDocFreq: Int) extends Serializable {

/** number of documents */
private var m = 0L
@@ -145,7 +145,7 @@ private object IDF {
* Since arrays are initialized to 0 by default,
* we just omit changing those entries.
*/
-       if(df(j) >= minimumOccurence) {
+       if(df(j) >= minDocFreq) {
inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
}
j += 1
@@ -165,8 +165,8 @@ class IDFModel private[mllib] (val idf: Vector) extends Serializable {
/**
* Transforms term frequency (TF) vectors to TF-IDF vectors.
*
- * If minimumOccurence was set for the IDF calculation,
- * the terms which occur in fewer than minimumOccurence
+ * If `minDocFreq` was set for the IDF calculation,
+ * the terms which occur in fewer than `minDocFreq`
* documents will have an entry of 0.
*
* @param dataset an RDD of term frequency vectors
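The per-term rule implemented in DocumentFrequencyAggregator.idf() above reduces to the following sketch; `idfFor` is an illustrative helper for this commit page, not part of the MLlib API.

{% highlight scala %}
// Terms below the document-frequency threshold get an IDF of 0,
// mirroring the `df(j) >= minDocFreq` check above.
def idfFor(df: Long, numDocs: Long, minDocFreq: Int): Double =
  if (df >= minDocFreq) math.log((numDocs + 1.0) / (df + 1.0)) else 0.0

// With 3 documents and minDocFreq = 2:
idfFor(df = 3, numDocs = 3, minDocFreq = 2)  // log(4/4) = 0.0 (term appears everywhere)
idfFor(df = 2, numDocs = 3, minDocFreq = 2)  // log(4/3) ≈ 0.2877
idfFor(df = 1, numDocs = 3, minDocFreq = 2)  // below minDocFreq, filtered to 0.0
{% endhighlight %}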
@@ -65,7 +65,7 @@ public void tfIdf() {
}

@Test
-   public void tfIdfMinimumOccurence() {
+   public void tfIdfMinimumDocumentFrequency() {
// The tests are to check Java compatibility.
HashingTF tf = new HashingTF();
JavaRDD<ArrayList<String>> documents = sc.parallelize(Lists.newArrayList(
@@ -55,7 +55,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
}

test("idf minimum occurence filtering") {
test("idf minimum document frequency filtering") {
val n = 4
val localTermFrequencies = Seq(
Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
@@ -64,7 +64,7 @@
)
val m = localTermFrequencies.size
val termFrequencies = sc.parallelize(localTermFrequencies, 2)
-     val idf = new IDF(minimumOccurence=1)
+     val idf = new IDF(minDocFreq=1)
val model = idf.fit(termFrequencies)
val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
if(x > 0) {
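A hedged sketch of how the expected values in a test like the one above can be derived from per-term document frequencies, using the same rule as the helper shown earlier; the corpus size `m` is an assumption here, since the full test body is truncated above.

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors

// Per-term document frequencies, as in the `Array(0, 3, 1, 2)` above
// (term 0 never appears in any document).
val df = Array(0, 3, 1, 2)
val m = 3  // assumed number of documents; the Seq above is partially truncated

// With minDocFreq = 1, only terms that appear in no document at all are zeroed.
val expected = Vectors.dense(df.map { x =>
  if (x >= 1) math.log((m + 1.0) / (x + 1.0)) else 0.0
})
{% endhighlight %}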
