Skip to content

Commit

Permalink
Maintain TF-IDF weights for each document
Browse files Browse the repository at this point in the history
  • Loading branch information
hrs committed May 13, 2023
1 parent 471fcd1 commit 371c11d
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
5 changes: 5 additions & 0 deletions corpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,10 @@ func NewCorpus(documents []*Document) *Corpus {
invDocFreq[term] = math.Log(docCount / docFreq[term])
}

// Assign TF-IDF weights to every document in the corpus
for _, doc := range documents {
doc.NormalizeTfIdf(invDocFreq)
}

return &Corpus{documents, invDocFreq}
}
9 changes: 8 additions & 1 deletion document.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ type TermMap map[string]float64
// Document holds the per-term statistics computed for a single file.
type Document struct {
// Path is the path the document was created from (the argument given to
// NewDocument).
Path string
// TermFreq maps each term to its relative frequency in this document:
// the term's count divided by the document's total word count.
TermFreq TermMap
// TfIdf maps each term to its TF-IDF weight. NewDocument leaves it empty;
// NormalizeTfIdf fills it in once inverse document frequencies are known.
TfIdf TermMap
}

// nonAlphanumericRegex matches one or more consecutive characters that are
// NOT a lowercase ASCII letter, a digit, a space, or an apostrophe. Note that
// uppercase letters are matched too, so input is presumably lowercased before
// this is applied — confirm against the caller.
var nonAlphanumericRegex = regexp.MustCompile(`[^a-z0-9 ']+`)
Expand Down Expand Up @@ -66,5 +67,11 @@ func NewDocument(path string) (*Document, error) {
termFreq[term] = count / totalWordCount
}

return &Document{path, termFreq}, nil
return &Document{path, termFreq, make(TermMap)}, nil
}

// NormalizeTfIdf computes the TF-IDF weight of every term in the document and
// stores it in doc.TfIdf. The weight of a term is its frequency in
// doc.TermFreq multiplied by its inverse document frequency in invDocFreq.
//
// A term that is missing from invDocFreq receives a weight of 0, because
// reading an absent key from a map yields the zero value.
//
// If doc.TfIdf is nil (for example, the Document was built with a literal
// rather than NewDocument) it is allocated here; otherwise the existing map
// is updated in place so that any other holder of the map sees the new
// weights.
func (doc *Document) NormalizeTfIdf(invDocFreq TermMap) {
	// Writing to a nil map panics, so make sure there is somewhere to store
	// the weights before the loop runs.
	if doc.TfIdf == nil {
		doc.TfIdf = make(TermMap, len(doc.TermFreq))
	}

	for term, freq := range doc.TermFreq {
		doc.TfIdf[term] = freq * invDocFreq[term]
	}
}

0 comments on commit 371c11d

Please sign in to comment.