Skip to content

Commit

Permalink
fix: Increase default navigation timeout to 30s, make it configurable…
Browse files Browse the repository at this point in the history
… and add retries to crawling jobs
  • Loading branch information
MohamedBassem committed Apr 11, 2024
1 parent be622e5 commit 238c296
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 6 deletions.
2 changes: 1 addition & 1 deletion apps/workers/crawlerWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ async function crawlPage(jobId: string, url: string) {
const page = await context.newPage();

await page.goto(url, {
timeout: 10000, // 10 seconds
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
});
logger.info(
`[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
Expand Down
1 change: 0 additions & 1 deletion apps/workers/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ export async function readPDFText(buffer: Buffer): Promise<{
const pdfParser = new PDFParser(null, 1);
pdfParser.on("pdfParser_dataError", reject);
pdfParser.on("pdfParser_dataReady", (pdfData) => {
// eslint-disable-next-line
resolve({
// The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
// eslint-disable-next-line
Expand Down
7 changes: 4 additions & 3 deletions docs/docs/03-configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin

## Crawler Configs

| Name | Required | Default | Description |
| ----------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit |
| Name | Required | Default | Description |
| ---------------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit |
| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
2 changes: 2 additions & 0 deletions packages/shared/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ const allEnv = z.object({
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().url().optional(),
CRAWLER_JOB_TIMEOUT_SEC: z.number().default(60),
CRAWLER_NAVIGATE_TIMEOUT_SEC: z.number().default(30),
MEILI_ADDR: z.string().optional(),
MEILI_MASTER_KEY: z.string().default(""),
LOG_LEVEL: z.string().default("debug"),
Expand Down Expand Up @@ -58,6 +59,7 @@ const serverConfigSchema = allEnv.transform((val) => {
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
},
meilisearch: val.MEILI_ADDR
? {
Expand Down
11 changes: 10 additions & 1 deletion packages/shared/queues.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,16 @@ export type ZCrawlLinkRequest = z.infer<typeof zCrawlLinkRequestSchema>;

export const LinkCrawlerQueue = new Queue<ZCrawlLinkRequest, void>(
"link_crawler_queue",
{ connection: queueConnectionDetails },
{
connection: queueConnectionDetails,
defaultJobOptions: {
attempts: 5,
backoff: {
type: "exponential",
delay: 1000,
},
},
},
);

// OpenAI Worker
Expand Down

0 comments on commit 238c296

Please sign in to comment.