-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathcrawlerdb.go
136 lines (115 loc) · 3.23 KB
/
crawlerdb.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
package gcse
import (
"log"
"strings"
"time"
"github.com/daviddengcn/gcse/configs"
"github.com/daviddengcn/gddo/doc"
)
// Kind names used by this package. KindPackage and KindPerson are the kinds
// passed to NewMemDB when the crawler databases are loaded.
const (
	KindIndex   = "index"
	KindDocDB   = "docdb"
	KindPackage = "package"
	KindPerson  = "person"
	KindToCheck = "tocheck"
)

// IndexFn is the KindIndex name with a ".gob" extension, i.e. "index.gob".
const IndexFn = KindIndex + ".gob"
// CrawlerDB bundles the databases holding all crawler entries: one for
// packages and one for persons.
type CrawlerDB struct {
// PackageDB stores a CrawlingEntry per package, keyed by the package path.
PackageDB *MemDB
// PersonDB stores a CrawlingEntry per person, keyed by the person id.
PersonDB *MemDB
}
// LoadCrawlerDB creates a *CrawlerDB whose PackageDB and PersonDB are both
// opened from the directory returned by configs.CrawlerDBPath. The directory
// is logged before the databases are opened.
func LoadCrawlerDB() *CrawlerDB {
	dir := configs.CrawlerDBPath()
	log.Printf("Loading CrawlerDB from %s", dir)

	db := new(CrawlerDB)
	db.PackageDB = NewMemDB(dir, KindPackage)
	db.PersonDB = NewMemDB(dir, KindPerson)
	return db
}
// Sync syncs PackageDB first and then PersonDB. The first failure is logged
// and returned immediately, so PersonDB is not synced when PackageDB fails.
// Returns nil when both syncs succeed.
func (cdb *CrawlerDB) Sync() error {
	err := cdb.PackageDB.Sync()
	if err != nil {
		log.Printf("cdb.PackageDB.Sync failed: %v", err)
		return err
	}

	err = cdb.PersonDB.Sync()
	if err != nil {
		log.Printf("cdb.PersonDB.Sync failed: %v", err)
		return err
	}

	return nil
}
// SchedulePackage stores a crawling entry for pkg in PackageDB, scheduled at
// sTime and tagged with the current CrawlerVersion and the given etag. Any
// existing entry under the same key is replaced by the Put call.
//
// The returned error is always nil.
func (cdb *CrawlerDB) SchedulePackage(pkg string, sTime time.Time, etag string) error {
	var entry CrawlingEntry
	entry.ScheduleTime = sTime
	entry.Version = CrawlerVersion
	entry.Etag = etag

	cdb.PackageDB.Put(pkg, entry)
	return nil
}
// PushToCrawlPackage schedules pkg to be crawled now, unless its stored entry
// is already scheduled at a time before now, in which case nothing changes.
//
// An existing entry scheduled at or after now keeps its other fields and only
// has its ScheduleTime moved to now. For a package without an entry, a new
// entry is stored with only ScheduleTime set.
func (cdb *CrawlerDB) PushToCrawlPackage(pkg string) {
	current := time.Now()

	var entry CrawlingEntry
	if cdb.PackageDB.Get(pkg, &entry) && entry.ScheduleTime.Before(current) {
		// Already due: the stored schedule time is earlier than now.
		return
	}

	entry.ScheduleTime = current
	cdb.PackageDB.Put(pkg, entry)
}
// TrimPackageName strips leading and trailing whitespace and runes above 128
// (non-ASCII) from pkg and returns the result. Characters in the interior of
// the name are never touched.
//
// Both classes are trimmed in a single pass so that they may be interleaved
// at the edges: for "a/b \u200b" the result is "a/b", not "a/b " as it would
// be if whitespace were trimmed only before the non-ASCII runes.
func TrimPackageName(pkg string) string {
	return strings.TrimFunc(pkg, func(r rune) bool {
		switch r {
		case ' ', '\t', '\n', '\v', '\f', '\r':
			// ASCII whitespace. Every non-ASCII Unicode space (U+0085,
			// U+00A0, ...) is above 128 and handled by the check below.
			return true
		}
		return r > rune(128)
	})
}
// AppendPackage schedules pkg for immediate crawling when appropriate.
//
// The name is first cleaned with TrimPackageName and dropped silently if
// doc.IsValidRemotePath rejects it. Then:
//   - a package with no entry in PackageDB is scheduled now;
//   - a package whose entry is scheduled before now, or for which inDocs
//     reports true, is left alone;
//   - otherwise (scheduled later and missing from the docs) it is
//     rescheduled to now.
//
// inDocs is only consulted for packages that already have an entry whose
// schedule time is not before now.
func (cdb *CrawlerDB) AppendPackage(pkg string, inDocs func(pkg string) bool) {
	name := TrimPackageName(pkg)
	if !doc.IsValidRemotePath(name) {
		return
	}

	var known CrawlingEntry
	switch {
	case !cdb.PackageDB.Get(name, &known):
		log.Printf("Scheduling new package: %v", name)
	case known.ScheduleTime.Before(time.Now()) || inDocs(name):
		return
	default:
		// The entry exists but its docs are missing: pull the crawl forward.
		log.Printf("Scheduling a package with missing docs: %v", name)
	}

	cdb.SchedulePackage(name, time.Now(), "")
}
// SchedulePerson stores a crawling entry for the person id in PersonDB,
// scheduled at sTime and tagged with the current CrawlerVersion, then logs
// the scheduling.
//
// The returned error is always nil.
func (cdb *CrawlerDB) SchedulePerson(id string, sTime time.Time) error {
	cdb.PersonDB.Put(id, CrawlingEntry{
		ScheduleTime: sTime,
		Version:      CrawlerVersion,
	})
	log.Printf("Schedule person %s to %v", id, sTime)
	return nil
}
// AppendPerson adds the person identified by site and username to PersonDB
// and schedules an immediate crawl. It returns false without changing
// anything when the person already has an entry, and true when a new entry
// was scheduled successfully.
func (cdb *CrawlerDB) AppendPerson(site, username string) bool {
	id := IdOfPerson(site, username)

	var existing CrawlingEntry
	if cdb.PersonDB.Get(id, &existing) {
		// Known person: an entry is already present, nothing to do.
		return false
	}

	err := cdb.SchedulePerson(id, time.Now())
	return err == nil
}