Change minimumOccurence to minDocFreq in code and docs
rnowling committed Sep 23, 2014
1 parent 47850ab commit 40fd70c
Showing 4 changed files with 15 additions and 15 deletions.
4 changes: 2 additions & 2 deletions docs/mllib-feature-extraction.md
@@ -85,14 +85,14 @@ val tfidf: RDD[Vector] = idf.transform(tf)

MLLib's IDF implementation provides an option for ignoring terms which occur in less than a
minimum number of documents. In such cases, the IDF for these terms is set to 0. This feature
- can be used by passing the `minimumOccurence` value to the IDF constructor.
+ can be used by passing the `minDocFreq` value to the IDF constructor.

{% highlight scala %}
import org.apache.spark.mllib.feature.IDF

// ... continue from the previous example
tf.cache()
- val idf = new IDF(minimumOccurence=2).fit(tf)
+ val idf = new IDF(minDocFreq=2).fit(tf)
val tfidf: RDD[Vector] = idf.transform(tf)
{% endhighlight %}

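For context, a minimal end-to-end sketch of the renamed parameter in use, in the spirit of the docs snippet above. It assumes an existing SparkContext `sc` (e.g. from spark-shell); the corpus here is an illustrative placeholder, not part of the commit.

{% highlight scala %}
import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Illustrative toy corpus; `sc` is an existing SparkContext.
val documents: RDD[Seq[String]] = sc.parallelize(Seq(
  Seq("spark", "mllib", "tf", "idf"),
  Seq("spark", "streaming"),
  Seq("spark", "sql", "tf")))

// Hash each document into a term-frequency vector.
val tf: RDD[Vector] = new HashingTF().transform(documents)
tf.cache()

// With minDocFreq = 2, terms that appear in fewer than 2 documents
// get an IDF of 0 and therefore a TF-IDF of 0.
val idfModel = new IDF(minDocFreq = 2).fit(tf)
val tfidf: RDD[Vector] = idfModel.transform(tf)
{% endhighlight %}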
20 changes: 10 additions & 10 deletions mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
@@ -32,14 +32,14 @@ import org.apache.spark.rdd.RDD
* number of documents and `d(t)` is the number of documents that contain term `t`.
*
* This implementation supports filtering out terms which do not appear in a minimum number
- * of documents (controlled by the variable minimumOccurence). For terms that are not in
- * at least `minimumOccurence` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
+ * of documents (controlled by the variable `minDocFreq`). For terms that are not in
+ * at least `minDocFreq` documents, the IDF is found as 0, resulting in TF-IDFs of 0.
*
- * @param minimumOccurence minimum of documents in which a term
- *                         should appear for filtering
+ * @param minDocFreq minimum of documents in which a term
+ *                   should appear for filtering
*/
@Experimental
- class IDF(val minimumOccurence: Int) {
+ class IDF(val minDocFreq: Int) {

def this() = this(0)

@@ -51,7 +51,7 @@ class IDF(val minimumOccurence: Int) {
*/
def fit(dataset: RDD[Vector]): IDFModel = {
val idf = dataset.treeAggregate(new IDF.DocumentFrequencyAggregator(
-     minimumOccurence=minimumOccurence))(
+     minDocFreq=minDocFreq))(
seqOp = (df, v) => df.add(v),
combOp = (df1, df2) => df1.merge(df2)
).idf()
@@ -70,7 +70,7 @@ class IDF(val minimumOccurence: Int) {
private object IDF {

/** Document frequency aggregator. */
- class DocumentFrequencyAggregator(val minimumOccurence: Int) extends Serializable {
+ class DocumentFrequencyAggregator(val minDocFreq: Int) extends Serializable {

/** number of documents */
private var m = 0L
@@ -145,7 +145,7 @@ private object IDF {
* Since arrays are initialized to 0 by default,
* we just omit changing those entries.
*/
-       if(df(j) >= minimumOccurence) {
+       if(df(j) >= minDocFreq) {
inv(j) = math.log((m + 1.0)/ (df(j) + 1.0))
}
j += 1
@@ -165,8 +165,8 @@ class IDFModel private[mllib] (val idf: Vector) extends Serializable {
/**
* Transforms term frequency (TF) vectors to TF-IDF vectors.
*
- * If minimumOccurence was set for the IDF calculation,
- * the terms which occur in fewer than minimumOccurence
+ * If `minDocFreq` was set for the IDF calculation,
+ * the terms which occur in fewer than `minDocFreq`
* documents will have an entry of 0.
*
* @param dataset an RDD of term frequency vectors
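The per-term rule implemented in DocumentFrequencyAggregator.idf() above reduces to the following sketch; `idfFor` is an illustrative helper for this commit page, not part of the MLlib API.

{% highlight scala %}
// Terms below the document-frequency threshold get an IDF of 0,
// mirroring the `df(j) >= minDocFreq` check above.
def idfFor(df: Long, numDocs: Long, minDocFreq: Int): Double =
  if (df >= minDocFreq) math.log((numDocs + 1.0) / (df + 1.0)) else 0.0

// With 3 documents and minDocFreq = 2:
idfFor(df = 3, numDocs = 3, minDocFreq = 2)  // log(4/4) = 0.0 (term appears everywhere)
idfFor(df = 2, numDocs = 3, minDocFreq = 2)  // log(4/3) ≈ 0.2877
idfFor(df = 1, numDocs = 3, minDocFreq = 2)  // below minDocFreq, filtered to 0.0
{% endhighlight %}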
@@ -65,7 +65,7 @@ public void tfIdf() {
}

@Test
-   public void tfIdfMinimumOccurence() {
+   public void tfIdfMinimumDocumentFrequency() {
// The tests are to check Java compatibility.
HashingTF tf = new HashingTF();
JavaRDD<ArrayList<String>> documents = sc.parallelize(Lists.newArrayList(
@@ -55,7 +55,7 @@ class IDFSuite extends FunSuite with LocalSparkContext {
assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12)
}

test("idf minimum occurence filtering") {
test("idf minimum document frequency filtering") {
val n = 4
val localTermFrequencies = Seq(
Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)),
@@ -64,7 +64,7 @@
)
val m = localTermFrequencies.size
val termFrequencies = sc.parallelize(localTermFrequencies, 2)
-     val idf = new IDF(minimumOccurence=1)
+     val idf = new IDF(minDocFreq=1)
val model = idf.fit(termFrequencies)
val expected = Vectors.dense(Array(0, 3, 1, 2).map { x =>
if(x > 0) {
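A hedged sketch of how the expected values in a test like the one above can be derived from per-term document frequencies, using the same rule as the helper shown earlier; the corpus size `m` is an assumption here, since the full test body is truncated above.

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors

// Per-term document frequencies, as in the `Array(0, 3, 1, 2)` above
// (term 0 never appears in any document).
val df = Array(0, 3, 1, 2)
val m = 3  // assumed number of documents; the Seq above is partially truncated

// With minDocFreq = 1, only terms that appear in no document at all are zeroed.
val expected = Vectors.dense(df.map { x =>
  if (x >= 1) math.log((m + 1.0) / (x + 1.0)) else 0.0
})
{% endhighlight %}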
