-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy patharticletext.go
103 lines (78 loc) · 2.29 KB
/
articletext.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
package articletext
/*
The package is used to extracts article text from a HTML page
It drops all additional elements from a html page (navigation, advertizing etc)
This file containes internal functions and a logic
Author: Roman Gelembjuk <roman@gelembjuk.com>
*/
import (
"sort"
"github.com/PuerkitoBio/goquery"
"github.com/jaytaylor/html2text"
)
// liist of tags to ignore, as they dones't contain useful data
var skiphtmltags []string = []string{"script", "style", "noscript", "head",
"header", "footer", "nav"}
func init() {
// to make lookup faster
sort.Strings(skiphtmltags)
}
// the function prepares a document for analysing
// cleans a DOM object and starts analysing
func processArticle(doc *goquery.Document, responsetype int) (string, error) {
if doc == nil {
return "", nil
}
// get clone of a selection. Clone is neede,d because we willdo some transformations
docselection := doc.Selection.Clone()
// preprocess. Remove all tags that are not useful and can make parsing wrong
cleanDocument(docselection)
// get a selection that contains a text of a page (only primary or article text)
selection := getPrimarySelection(docselection)
if responsetype == 2 {
// return parent node path and attributes
return getSelectionSignature(selection), nil
}
return getTextFromHtml(selection), nil
}
// clean HTML document. Removes all tags that are not useful
func cleanDocument(s *goquery.Selection) *goquery.Selection {
tagname := goquery.NodeName(s)
if checkTagsToSkip(tagname) {
s.Remove()
return nil
}
// for each child node check if to remove or not
s.Children().Each(func(i int, sec *goquery.Selection) {
tagname := goquery.NodeName(sec)
if checkTagsToSkip(tagname) {
sec.Remove()
return
}
// go deeper recursively
cleanDocument(sec)
})
return s
}
// convert HTML to text from a DOM node
// we ignore errors in this function
func getTextFromHtml(s *goquery.Selection) string {
// gethtml from a node
html, _ := s.Html()
// convert to text
text, err := html2text.FromString(html)
if err != nil {
return ""
}
return text
}
// check if aword (string) is in an array of tags
// we have list of tags to ignore some not useful tags
func checkTagsToSkip(tag string) bool {
for _, v := range skiphtmltags {
if v == tag {
return true
}
}
return false
}