Skip to content

Commit

Permalink
fix: Fix Book Source 3 not available
Browse files Browse the repository at this point in the history
Signed-off-by: Fang Yuan <wojiushifangyuanlove@gmail.com>
  • Loading branch information
767829413 committed Jan 9, 2025
1 parent 511261b commit 24e3098
Show file tree
Hide file tree
Showing 14 changed files with 124 additions and 55 deletions.
2 changes: 1 addition & 1 deletion frontend/src/components/ViewConfig.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ const ViewConfig: React.FC = () => {
options={[
{ value: 1, label: '1' },
{ value: 2, label: '2' },
{ value: 3, label: '3' },
{ value: 3, label: '3', tooltip: t('viewConfig.slowSource') },
]}
style={{ width: '100%' }}
/>
Expand Down
3 changes: 2 additions & 1 deletion frontend/src/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"chatbotSettings": "Chatbot Settings",
"chatbotSupportInfo": "Note: The current configuration only supports Ollama, switching models may result in a download of the",
"model": "model",
"modelTooltip": "Ollama supports a list of models available on https://ollama.com/library"
"modelTooltip": "Ollama supports a list of models available on https://ollama.com/library",
"slowSource": "This book source is rate-limited, so downloads will be very slow"
},
"checkUpdate": {
"title": "Check Update",
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/locales/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"crawlSettings": "爬取设置",
"retrySettings": "重试设置",
"enableBookSource": "启用书源",
"bookSourceTooltip": "只能选一个, 当前可选值:1、2、3,建议使用默认书源 3",
"bookSourceTooltip": "只能选一个, 当前可选值:1、2、3,建议不使用书源 3,有限速和防爬",
"downloadPath": "下载路径",
"downloadPathTooltip": "绝对相对均可 (Windows 路径分隔符不要用 \\ , 用 / 或 \\\\)",
"fileExtension": "文件扩展名",
Expand All @@ -37,7 +37,8 @@
"chatbotSettings": "聊天机器人设置",
"chatbotSupportInfo": "注意:当前配置仅支持 Ollama,切换模型可能会导致下载",
"model": "模型",
"modelTooltip": "Ollama 支持 https://ollama.com/library 上可用的模型列表"
"modelTooltip": "Ollama 支持 https://ollama.com/library 上可用的模型列表",
"slowSource": "当前书源有限制,下载会很慢"
},
"checkUpdate": {
"title": "检查更新",
Expand Down
2 changes: 1 addition & 1 deletion internal/config/default_config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
base:
# 启用书源 (只能选一个, 当前可选值:1、2、3)
source-id: 3
source-id: 1
# 下载路径, 绝对相对均可 (Windows 路径分隔符不要用 \ , 用 / 或 \\)
download-path: "downloads"
# 文件扩展名, 支持 txt, epub, 推荐 epub
Expand Down
2 changes: 1 addition & 1 deletion internal/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func (nc *novelCrawler) Crawl(res *model.SearchResult, start, end int) (*model.C
// Parse and download content
// Limit concurrent processing
var wg sync.WaitGroup
threads := concurrencyTool.GetConcurrencyNum(conf.Crawl.Threads)
threads := concurrencyTool.GetConcurrencyNum(conf.Crawl.Threads, conf.Base.SourceID)
semaphore := make(chan struct{}, threads)
var nowCatalogsCount = int32(0)
// Total completed tasks = number of chapters fetched + 1 (merging task)
Expand Down
10 changes: 9 additions & 1 deletion internal/model/rule.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ type Rule struct {
Search search `json:"search"`
Book book `json:"book"`
Chapter chapter `json:"chapter"`
Catalog catalog `json:"catalog"`
}

// Search represents the search rules
Expand Down Expand Up @@ -39,10 +40,17 @@ type book struct {
LatestChapter string `json:"latestChapter"`
LatestUpdate string `json:"latestUpdate"`
IsEnd string `json:"isEnd"`
Catalog string `json:"catalog"`
CatalogOffset int `json:"catalogOffset"`
}

// catalog holds the rules for locating a book's chapter list; it is decoded
// from the "catalog" object of a source rule JSON file.
type catalog struct {
// URL is an optional fmt format string for a dedicated catalog page. When it
// is non-empty, the catalog parser fills it with the book ID extracted from
// the book URL and uses the result in place of the book URL.
URL string `json:"url"`
// Result is the selector handed to the collector's OnHTML callback; each
// matched element supplies a chapter title (its text) and link (its href).
Result string `json:"result"`
// NOTE(review): presumably marks a catalog split across several pages;
// its use is not visible in this view — confirm.
Pagination bool `json:"pagination"`
// NOTE(review): presumably the selector for the "next page" link when
// Pagination is true — confirm against the parser.
NextPage string `json:"nextPage"`
// NOTE(review): replaces the former book.catalogOffset; presumably the
// number of leading catalog entries to skip — confirm.
Offset int `json:"offset"`
}

// Chapter represents the chapter rules
type chapter struct {
URL string `json:"url"`
Expand Down
9 changes: 8 additions & 1 deletion internal/parse/catalogs.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package parse

import (
"fmt"
"sort"

"fy-novel/internal/model"
"fy-novel/internal/source"
"fy-novel/pkg/utils"

"github.com/gocolly/colly/v2"
// "github.com/gocolly/colly/v2/debug"
)
Expand All @@ -25,7 +27,12 @@ func (b *CatalogsParser) Parse(bookUrl string, start, end, retry int) ([]*model.

var chapters = make(map[string]*model.Chapter)

collector.OnHTML(b.rule.Book.Catalog, func(e *colly.HTMLElement) {
if len(b.rule.Catalog.URL) > 0 {
id := utils.GetGroup1(b.rule.Book.URL, bookUrl)
bookUrl = fmt.Sprintf(b.rule.Catalog.URL, id)
}

collector.OnHTML(b.rule.Catalog.Result, func(e *colly.HTMLElement) {
chapter := &model.Chapter{
Title: e.Text,
URL: utils.NormalizeURL(e.Attr("href"), b.rule.URL),
Expand Down
3 changes: 3 additions & 0 deletions internal/parse/chapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ func (b *ChapterParser) Parse(
downOk := false
attemptStart := 1
attempt := conf.Retry.MaxAttempts
if conf.Base.SourceID == 3 {
attempt = 0
}
// Fetch content
utils.SpinWaitMaxRetryAttempts(
func() bool {
Expand Down
4 changes: 3 additions & 1 deletion internal/parse/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ import (
)

const timeoutMillis = 25000
const retryDefault = 3
const retryDefault = 10
const sleepSecond = 1 * time.Second

var urlLock sync.Mutex

Expand Down Expand Up @@ -44,6 +45,7 @@ func getCollector(cookies map[string]string, retry int) *colly.Collector {
saveErrorUrl[link]++
r.Request.Retry()
} else if saveErrorUrl[link] < retry {
time.Sleep(sleepSecond * time.Duration(retry))
saveErrorUrl[link]++
r.Request.Retry()
} else {
Expand Down
8 changes: 5 additions & 3 deletions internal/source/rule/rule1.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@
"coverUrl": "#fmimg > img",
"latestChapter": "",
"latestUpdate": "",
"isEnd": "",
"catalog": "#list > dl > dd > a",
"catalogOffset": 12
"isEnd": ""
},
"catalog": {
"result": "#list > dl > dd > a",
"offset": 12
},
"chapter": {
"url": "http://www.mcmssc.la/%s/%s.html",
Expand Down
83 changes: 43 additions & 40 deletions internal/source/rule/rule2.json
Original file line number Diff line number Diff line change
@@ -1,42 +1,45 @@
{
"id": "2",
"url": "http://www.99xs.info/",
"name": "鸟书网",
"comment": "书很全,有15w+;无反爬",
"type": "html",
"search": {
"url": "http://www.99xs.info/read/search/",
"method": "post",
"body": {"kw": "searchkey"},
"cookies": {},
"pagination": false,
"result": "div.wrap > div > div > div",
"bookName": "div.bookinfo > h4 > a",
"author": "div.bookinfo > div.author",
"latestChapter": "div.bookinfo > div.update > a"
"id": "2",
"url": "http://www.99xs.info/",
"name": "鸟书网",
"comment": "书很全,有15w+;无反爬",
"type": "html",
"search": {
"url": "http://www.99xs.info/read/search/",
"method": "post",
"body": {
"kw": "searchkey"
},
"book": {
"url": "http://www.99xs.info/tag/%s/",
"bookName": "meta[property=\"og:novel:book_name\"]",
"author": "meta[property=\"og:novel:author\"]",
"intro": "meta[property=\"og:description\"]",
"category": "meta[property=\"og:novel:category\"]",
"coverUrl": "div.cover > img",
"latestChapter": "",
"latestUpdate": "",
"isEnd": "",
"catalog": "div.listmain > dl > dd > a",
"catalogOffset": 12
},
"chapter": {
"url": "http://www.99xs.info/tag/%s/%s.html",
"pagination": false,
"chapterNo": 0,
"title": "",
"content": "#content",
"paragraphTagClosed": false,
"paragraphTag": "<br><br>",
"filterTxt": "请记住本书首发域名:.+。鸟书网手机版阅读网址:.+|7017k",
"filterTag": ""
}
}
"cookies": {},
"pagination": false,
"result": "div.wrap > div > div > div",
"bookName": "div.bookinfo > h4 > a",
"author": "div.bookinfo > div.author",
"latestChapter": "div.bookinfo > div.update > a"
},
"book": {
"url": "http://www.99xs.info/tag/%s/",
"bookName": "meta[property=\"og:novel:book_name\"]",
"author": "meta[property=\"og:novel:author\"]",
"intro": "meta[property=\"og:description\"]",
"category": "meta[property=\"og:novel:category\"]",
"coverUrl": "div.cover > img",
"latestChapter": "",
"latestUpdate": "",
"isEnd": ""
},
"catalog": {
"result": "div.listmain > dl > dd > a",
"offset": 12
},
"chapter": {
"url": "http://www.99xs.info/tag/%s/%s.html",
"pagination": false,
"title": "",
"content": "#content",
"paragraphTagClosed": false,
"paragraphTag": "<br><br>",
"filterTxt": "请记住本书首发域名:.+。鸟书网手机版阅读网址:.+|7017k",
"filterTag": ""
}
}
3 changes: 1 addition & 2 deletions internal/source/rule/rule3.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
"id": "3",
"url": "https://69shux.co/",
"name": "69书吧",
"comment": "无五秒盾、爬取章节有限流、需要梯子,美国节点🆗、数量5w+",
"comment": "需要梯子、无五秒盾、章节有限流、数量5w+",
"type": "html",
"useProxy": true,
"language": "zh-Hant",
"search": {
"url": "https://69shux.co/search",
Expand Down
6 changes: 5 additions & 1 deletion internal/tools/concurrency/concurrency.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const (
var once sync.Once
var defaultConcurrency int

func GetConcurrencyNum(target int) int {
func GetConcurrencyNum(target, sourceID int) int {
once.Do(func() {
defaultConcurrency = runtime.NumCPU()
})
Expand All @@ -22,5 +22,9 @@ func GetConcurrencyNum(target int) int {
if target >= maxThreads {
target = maxThreads
}
// 书源3有防爬机制,不能并发太高
if sourceID == 3 {
return 1
}
return target
}
39 changes: 39 additions & 0 deletions pkg/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"hash/fnv"
"os"
"regexp"
"time"
"unicode"
)
Expand Down Expand Up @@ -133,3 +134,41 @@ func StringToUniqueHash(s string) uint64 {
h.Write([]byte(s))
return h.Sum64()
}

// GetGroup0 returns the full text of the first match of regex in content,
// i.e. capture group 0.
//
// regex:   the regular expression to apply
// content: the text to search
// returns: the matched text, or "" when Get reports no match
func GetGroup0(regex, content string) string {
	const wholeMatch = 0 // group 0 is the entire match
	matched := Get(regex, content, wholeMatch)
	return matched
}

// GetGroup1 returns the text captured by the first parenthesised group of
// the first match of regex in content.
//
// regex:   the regular expression to apply
// content: the text to search
// returns: the captured text, or "" when Get reports no match
func GetGroup1(regex, content string) string {
	const firstGroup = 1 // group 1 is the first capture group
	captured := Get(regex, content, firstGroup)
	return captured
}

// Get returns the text captured by group groupIndex of the first match of
// regex in content.
//
// regex:      the regular expression to apply (compiled on every call)
// content:    the text to search
// groupIndex: capture group to return; 0 is the whole match
// returns:    the captured text, or "" when the pattern is invalid, nothing
//
//	matches, or groupIndex does not name a group of the pattern
func Get(regex string, content string, groupIndex int) string {
	// A negative index can never name a capture group. Reject it up front so
	// the slice access below cannot panic.
	if groupIndex < 0 {
		return ""
	}

	re, err := regexp.Compile(regex)
	if err != nil {
		// An invalid pattern is reported the same way as "no match".
		return ""
	}

	// FindStringSubmatch yields nil when there is no match, so the length
	// check covers both "no match" and "group index too large".
	matches := re.FindStringSubmatch(content)
	if groupIndex >= len(matches) {
		return ""
	}
	return matches[groupIndex]
}

0 comments on commit 24e3098

Please sign in to comment.