From 238c2967b269ca0f66d8e759c6a0234107e1fd1e Mon Sep 17 00:00:00 2001
From: MohamedBassem
Date: Thu, 11 Apr 2024 15:03:31 +0300
Subject: [PATCH] fix: Increase default navigation timeout to 30s, make it configurable and add retries to crawling jobs

---
 apps/workers/crawlerWorker.ts |  2 +-
 apps/workers/utils.ts         |  1 -
 docs/docs/03-configuration.md |  7 ++++---
 packages/shared/config.ts     |  2 ++
 packages/shared/queues.ts     | 11 ++++++++++-
 5 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index c9a1189c..eec8cd98 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -170,7 +170,7 @@ async function crawlPage(jobId: string, url: string) {
   const page = await context.newPage();
 
   await page.goto(url, {
-    timeout: 10000, // 10 seconds
+    timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
   });
   logger.info(
     `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts
index f8c48408..8e69dcd2 100644
--- a/apps/workers/utils.ts
+++ b/apps/workers/utils.ts
@@ -26,7 +26,6 @@ export async function readPDFText(buffer: Buffer): Promise<{
     const pdfParser = new PDFParser(null, 1);
     pdfParser.on("pdfParser_dataError", reject);
     pdfParser.on("pdfParser_dataReady", (pdfData) => {
-      // eslint-disable-next-line
       resolve({
         // The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
         // eslint-disable-next-line
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 5bf1612c..28ead2f1 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -37,6 +37,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
 
 ## Crawler Configs
 
-| Name                    | Required | Default | Description                                                                                                                                                             |
-| ----------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| CRAWLER_JOB_TIMEOUT_SEC | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| Name                         | Required | Default | Description                                                                                                                                                             |
+| ---------------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_JOB_TIMEOUT_SEC      | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No       | 30      | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection                                              |
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 4e444908..41173433 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -21,6 +21,7 @@ const allEnv = z.object({
   CRAWLER_HEADLESS_BROWSER: stringBool("true"),
   BROWSER_WEB_URL: z.string().url().optional(),
   CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60),
+  CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30),
   MEILI_ADDR: z.string().optional(),
   MEILI_MASTER_KEY: z.string().default(""),
   LOG_LEVEL: z.string().default("debug"),
@@ -58,6 +59,7 @@ const serverConfigSchema = allEnv.transform((val) => {
       headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
       browserWebUrl: val.BROWSER_WEB_URL,
       jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
+      navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
     },
     meilisearch: val.MEILI_ADDR
       ? {
diff --git a/packages/shared/queues.ts b/packages/shared/queues.ts
index 146c19c6..6d5fdd5f 100644
--- a/packages/shared/queues.ts
+++ b/packages/shared/queues.ts
@@ -17,7 +17,16 @@ export type ZCrawlLinkRequest = z.infer<typeof zCrawlLinkRequestSchema>;
 
 export const LinkCrawlerQueue = new Queue(
   "link_crawler_queue",
-  { connection: queueConnectionDetails },
+  {
+    connection: queueConnectionDetails,
+    defaultJobOptions: {
+      attempts: 5,
+      backoff: {
+        type: "exponential",
+        delay: 1000,
+      },
+    },
+  },
 );
 
 // OpenAI Worker
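Note (not part of the patch): a minimal sketch of how the new CRAWLER_NAVIGATE_TIMEOUT_SEC knob behaves. The variable name and the 30-second default come from the patch above; the standalone parsing below is illustrative only and is not the project's actual config code.

// Illustrative sketch, not project code: mirrors what serverConfig.crawler.navigateTimeoutSec ends up holding.
// Assumes the worker reads the value from its environment, e.g. CRAWLER_NAVIGATE_TIMEOUT_SEC=60.
const navigateTimeoutSec = Number(process.env.CRAWLER_NAVIGATE_TIMEOUT_SEC ?? 30); // 30s default, as in the patch

// crawlPage() then hands the value to page.goto() in milliseconds:
const gotoOptions = { timeout: navigateTimeoutSec * 1000 }; // 60000 ms when the env var is set to 60
console.log(gotoOptions);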
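Note (not part of the patch): a rough sketch of how the retry options added to LinkCrawlerQueue space out re-runs, assuming BullMQ's built-in "exponential" backoff of delay * 2^(retry - 1) milliseconds.

// Illustrative sketch, not project code: with attempts: 5 and delay: 1000, a failed
// crawl job is retried up to 4 more times, waiting roughly 1s, 2s, 4s and 8s between tries.
const attempts = 5;
const baseDelayMs = 1000;

for (let retry = 1; retry < attempts; retry++) {
  const waitMs = baseDelayMs * 2 ** (retry - 1);
  console.log(`retry #${retry} after ~${waitMs} ms`);
}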