fix: Fix Book Source 3 not available

Signed-off-by: Fang Yuan <wojiushifangyuanlove@gmail.com>
767829413 · Jan 9, 2025 · 24e3098 · 24e3098
1 parent 511261b
commit 24e3098
Show file tree

Hide file tree

Showing 14 changed files with 124 additions and 55 deletions.
diff --git a/frontend/src/components/ViewConfig.tsx b/frontend/src/components/ViewConfig.tsx
@@ -86,7 +86,7 @@ const ViewConfig: React.FC = () => {
                             options={[
                                 { value: 1, label: '1' },
                                 { value: 2, label: '2' },
-                                { value: 3, label: '3' },
+                                { value: 3, label: '3', tooltip: t('viewConfig.slowSource') },
                             ]}
                             style={{ width: '100%' }}
                         />

diff --git a/frontend/src/locales/en.json b/frontend/src/locales/en.json
@@ -37,7 +37,8 @@
     "chatbotSettings": "Chatbot Settings",
     "chatbotSupportInfo": "Note: The current configuration only supports Ollama, switching models may result in a download of the",
     "model": "model",
-    "modelTooltip": "Ollama supports a list of models available on https://ollama.com/library"
+    "modelTooltip": "Ollama supports a list of models available on https://ollama.com/library",
+    "slowSource": "This book source is rate-limited, so downloads will be very slow"
   },
   "checkUpdate": {
     "title": "Check Update",

diff --git a/frontend/src/locales/zh.json b/frontend/src/locales/zh.json
@@ -23,7 +23,7 @@
     "crawlSettings": "爬取设置",
     "retrySettings": "重试设置",
     "enableBookSource": "启用书源",
-    "bookSourceTooltip": "只能选一个, 当前可选值：1、2、3，建议使用默认书源 3",
+    "bookSourceTooltip": "只能选一个, 当前可选值：1、2、3，建议不使用书源 3，有限速和防爬",
     "downloadPath": "下载路径",
     "downloadPathTooltip": "绝对相对均可 (Windows 路径分隔符不要用 \\ , 用 / 或 \\)",
     "fileExtension": "文件扩展名",
@@ -37,7 +37,8 @@
     "chatbotSettings": "聊天机器人设置",
     "chatbotSupportInfo": "注意：当前配置仅支持 Ollama，切换模型可能会导致下载",
     "model": "模型",
-    "modelTooltip": "Ollama 支持 https://ollama.com/library 上可用的模型列表"
+    "modelTooltip": "Ollama 支持 https://ollama.com/library 上可用的模型列表",
+    "slowSource": "当前书源有限制，下载会很慢"
   },
   "checkUpdate": {
     "title": "检查更新",

diff --git a/internal/config/default_config.yaml b/internal/config/default_config.yaml
@@ -1,6 +1,6 @@
 base:
   # 启用书源 (只能选一个, 当前可选值：1、2、3)
-  source-id: 3
+  source-id: 1
   # 下载路径, 绝对相对均可 (Windows 路径分隔符不要用 \ , 用 / 或 \\)
   download-path: "downloads"
   # 文件扩展名, 支持 txt, epub, 推荐 epub

diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
@@ -67,7 +67,7 @@ func (nc *novelCrawler) Crawl(res *model.SearchResult, start, end int) (*model.C
 	// Parse and download content
 	// Limit concurrent processing
 	var wg sync.WaitGroup
-	threads := concurrencyTool.GetConcurrencyNum(conf.Crawl.Threads)
+	threads := concurrencyTool.GetConcurrencyNum(conf.Crawl.Threads, conf.Base.SourceID)
 	semaphore := make(chan struct{}, threads)
 	var nowCatalogsCount = int32(0)
 	// Total completed tasks = number of chapters fetched + 1 (merging task)

diff --git a/internal/model/rule.go b/internal/model/rule.go
@@ -10,6 +10,7 @@ type Rule struct {
 	Search  search  `json:"search"`
 	Book    book    `json:"book"`
 	Chapter chapter `json:"chapter"`
+	Catalog catalog `json:"catalog"`
 }
 
 // Search represents the search rules
@@ -39,10 +40,17 @@ type book struct {
 	LatestChapter string `json:"latestChapter"`
 	LatestUpdate  string `json:"latestUpdate"`
 	IsEnd         string `json:"isEnd"`
-	Catalog       string `json:"catalog"`
 	CatalogOffset int    `json:"catalogOffset"`
 }
 
+type catalog struct { // rules for locating and parsing a book's chapter catalog
+	URL        string `json:"url"`        // optional; when non-empty, CatalogsParser.Parse fills it with the book id via fmt.Sprintf and fetches that URL instead of the book URL
+	Result     string `json:"result"`     // selector handed to collector.OnHTML; each matched element becomes one chapter (text = title, href = URL)
+	Pagination bool   `json:"pagination"` // NOTE(review): presumably marks a multi-page catalog; no consumer is visible here, confirm
+	NextPage   string `json:"nextPage"`   // NOTE(review): presumably the selector for the next catalog page; no consumer is visible here, confirm
+	Offset     int    `json:"offset"`     // set to 12 by the rule JSON files under "catalog"; NOTE(review): consumer not visible here, confirm semantics
+}
+
 // Chapter represents the chapter rules
 type chapter struct {
 	URL                string `json:"url"`

diff --git a/internal/parse/catalogs.go b/internal/parse/catalogs.go
@@ -1,11 +1,13 @@
 package parse
 
 import (
+	"fmt"
 	"sort"
 
 	"fy-novel/internal/model"
 	"fy-novel/internal/source"
 	"fy-novel/pkg/utils"
+
 	"github.com/gocolly/colly/v2"
 	// "github.com/gocolly/colly/v2/debug"
 )
@@ -25,7 +27,12 @@ func (b *CatalogsParser) Parse(bookUrl string, start, end, retry int) ([]*model.
 
 	var chapters = make(map[string]*model.Chapter)
 
-	collector.OnHTML(b.rule.Book.Catalog, func(e *colly.HTMLElement) {
+	if len(b.rule.Catalog.URL) > 0 {
+		id := utils.GetGroup1(b.rule.Book.URL, bookUrl)
+		bookUrl = fmt.Sprintf(b.rule.Catalog.URL, id)
+	}
+
+	collector.OnHTML(b.rule.Catalog.Result, func(e *colly.HTMLElement) {
 		chapter := &model.Chapter{
 			Title: e.Text,
 			URL:   utils.NormalizeURL(e.Attr("href"), b.rule.URL),

diff --git a/internal/parse/chapter.go b/internal/parse/chapter.go
@@ -33,6 +33,9 @@ func (b *ChapterParser) Parse(
 	downOk := false
 	attemptStart := 1
 	attempt := conf.Retry.MaxAttempts
+	if conf.Base.SourceID == 3 {
+		attempt = 0
+	}
 	// Fetch content
 	utils.SpinWaitMaxRetryAttempts(
 		func() bool {

diff --git a/internal/parse/common.go b/internal/parse/common.go
@@ -11,7 +11,8 @@ import (
 )
 
 const timeoutMillis = 25000
-const retryDefault = 3
+const retryDefault = 10
+const sleepSecond = 1 * time.Second
 
 var urlLock sync.Mutex
 
@@ -44,6 +45,7 @@ func getCollector(cookies map[string]string, retry int) *colly.Collector {
 			saveErrorUrl[link]++
 			r.Request.Retry()
 		} else if saveErrorUrl[link] < retry {
+			time.Sleep(sleepSecond * time.Duration(retry))
 			saveErrorUrl[link]++
 			r.Request.Retry()
 		} else {

diff --git a/internal/source/rule/rule1.json b/internal/source/rule/rule1.json
@@ -28,9 +28,11 @@
     "coverUrl": "#fmimg > img",
     "latestChapter": "",
     "latestUpdate": "",
-    "isEnd": "",
-    "catalog": "#list > dl > dd > a",
-    "catalogOffset": 12
+    "isEnd": ""
+  },
+  "catalog": {
+    "result": "#list > dl > dd > a",
+    "offset": 12
   },
   "chapter": {
     "url": "http://www.mcmssc.la/%s/%s.html",

diff --git a/internal/source/rule/rule2.json b/internal/source/rule/rule2.json
@@ -1,42 +1,45 @@
 {
-    "id": "2",
-    "url": "http://www.99xs.info/",
-    "name": "鸟书网",
-    "comment": "书很全，有15w+；无反爬",
-    "type": "html",
-    "search": {
-      "url": "http://www.99xs.info/read/search/",
-      "method": "post",
-      "body": {"kw": "searchkey"},
-      "cookies": {},
-      "pagination": false,
-      "result": "div.wrap > div > div > div",
-      "bookName": "div.bookinfo > h4 > a",
-      "author": "div.bookinfo > div.author",
-      "latestChapter": "div.bookinfo > div.update > a"
+  "id": "2",
+  "url": "http://www.99xs.info/",
+  "name": "鸟书网",
+  "comment": "书很全，有15w+；无反爬",
+  "type": "html",
+  "search": {
+    "url": "http://www.99xs.info/read/search/",
+    "method": "post",
+    "body": {
+      "kw": "searchkey"
     },
-    "book": {
-      "url": "http://www.99xs.info/tag/%s/",
-      "bookName": "meta[property=\"og:novel:book_name\"]",
-      "author": "meta[property=\"og:novel:author\"]",
-      "intro": "meta[property=\"og:description\"]",
-      "category": "meta[property=\"og:novel:category\"]",
-      "coverUrl": "div.cover > img",
-      "latestChapter": "",
-      "latestUpdate": "",
-      "isEnd": "",
-      "catalog": "div.listmain > dl > dd > a",
-      "catalogOffset": 12
-    },
-    "chapter": {
-      "url": "http://www.99xs.info/tag/%s/%s.html",
-      "pagination": false,
-      "chapterNo": 0,
-      "title": "",
-      "content": "#content",
-      "paragraphTagClosed": false,
-      "paragraphTag": "<br><br>",
-      "filterTxt": "请记住本书首发域名：.+。鸟书网手机版阅读网址：.+|7017k",
-      "filterTag": ""
-    }
-  }
+    "cookies": {},
+    "pagination": false,
+    "result": "div.wrap > div > div > div",
+    "bookName": "div.bookinfo > h4 > a",
+    "author": "div.bookinfo > div.author",
+    "latestChapter": "div.bookinfo > div.update > a"
+  },
+  "book": {
+    "url": "http://www.99xs.info/tag/%s/",
+    "bookName": "meta[property=\"og:novel:book_name\"]",
+    "author": "meta[property=\"og:novel:author\"]",
+    "intro": "meta[property=\"og:description\"]",
+    "category": "meta[property=\"og:novel:category\"]",
+    "coverUrl": "div.cover > img",
+    "latestChapter": "",
+    "latestUpdate": "",
+    "isEnd": ""
+  },
+  "catalog": {
+    "result": "div.listmain > dl > dd > a",
+    "offset": 12
+  },
+  "chapter": {
+    "url": "http://www.99xs.info/tag/%s/%s.html",
+    "pagination": false,
+    "title": "",
+    "content": "#content",
+    "paragraphTagClosed": false,
+    "paragraphTag": "<br><br>",
+    "filterTxt": "请记住本书首发域名：.+。鸟书网手机版阅读网址：.+|7017k",
+    "filterTag": ""
+  }
+}
diff --git a/internal/source/rule/rule3.json b/internal/source/rule/rule3.json
@@ -2,9 +2,8 @@
     "id": "3",
     "url": "https://69shux.co/",
     "name": "69书吧",
-    "comment": "无五秒盾、爬取章节有限流、需要梯子，美国节点🆗、数量5w+",
+    "comment": "需要梯子、无五秒盾、章节有限流、数量5w+",
     "type": "html",
-    "useProxy": true,
     "language": "zh-Hant",
     "search": {
         "url": "https://69shux.co/search",

diff --git a/internal/tools/concurrency/concurrency.go b/internal/tools/concurrency/concurrency.go
@@ -12,7 +12,7 @@ const (
 var once sync.Once
 var defaultConcurrency int
 
-func GetConcurrencyNum(target int) int {
+func GetConcurrencyNum(target, sourceID int) int {
 	once.Do(func() {
 		defaultConcurrency = runtime.NumCPU()
 	})
@@ -22,5 +22,9 @@ func GetConcurrencyNum(target int) int {
 	if target >= maxThreads {
 		target = maxThreads
 	}
+	// 书源3有防爬机制，不能并发太高
+	if sourceID == 3 {
+		return 1
+	}
 	return target
 }
diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"hash/fnv"
 	"os"
+	"regexp"
 	"time"
 	"unicode"
 )
@@ -133,3 +134,41 @@ func StringToUniqueHash(s string) uint64 {
 	h.Write([]byte(s))
 	return h.Sum64()
 }
+
+// GetGroup0 returns the text matched by capture group 0 (the whole match) of regex in content.
+//
+// regex: the regular expression to match with
+// content: the text to be matched
+// Returns the matched string; empty if regex does not compile or nothing matches.
+func GetGroup0(regex string, content string) string {
+	return Get(regex, content, 0)
+}
+
+// GetGroup1 returns the text matched by capture group 1 of regex in content.
+//
+// regex: the regular expression to match with
+// content: the text to be matched
+// Returns the group's text; empty if regex does not compile, nothing matches, or group 1 does not exist.
+func GetGroup1(regex string, content string) string {
+	return Get(regex, content, 1)
+}
+
+// Get returns the text of one capture group from the leftmost match of regex in content.
+//
+// regex: the regular expression to match with
+// content: the text to be matched
+// groupIndex: index of the capture group to return (0 is the whole match)
+// Returns the group's text; empty if regex does not compile, nothing matches, or the group does not exist.
+func Get(regex string, content string, groupIndex int) string {
+	re, err := regexp.Compile(regex) // compiled on every call; the pattern is not cached
+	if err != nil { // an invalid pattern is silently treated the same as "no match"
+		return ""
+	}
+
+	matches := re.FindStringSubmatch(content)
+	if len(matches) > groupIndex { // NOTE(review): a negative groupIndex passes this check and would panic on the index below
+		return matches[groupIndex]
+	}
+
+	return ""
+}