From ed298b2abbe9f7de8fe4ee7269512b2f2cee2916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20F=2E=20Romaniello?= Date: Tue, 11 Feb 2025 18:52:45 -0300 Subject: [PATCH] fix(community): hide console errors on RecursiveUrlLoader (#7679) Co-authored-by: jacoblee93 --- .../tests/recursive_url.int.test.ts | 22 ++++++++++++++++++- .../src/document_loaders/web/recursive_url.ts | 9 +++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/libs/langchain-community/src/document_loaders/tests/recursive_url.int.test.ts b/libs/langchain-community/src/document_loaders/tests/recursive_url.int.test.ts index b1bf2cbbe271..e7cbf283e455 100644 --- a/libs/langchain-community/src/document_loaders/tests/recursive_url.int.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/recursive_url.int.test.ts @@ -1,10 +1,18 @@ /* eslint-disable no-process-env */ /* eslint-disable @typescript-eslint/no-non-null-assertion */ -import { test } from "@jest/globals"; +import { test, jest } from "@jest/globals"; import { compile } from "html-to-text"; import { RecursiveUrlLoader } from "../web/recursive_url.js"; describe("RecursiveUrlLoader", () => { + beforeEach(() => { + jest.spyOn(console, "error"); + }); + + afterEach(() => { + jest.restoreAllMocks(); + }); + test("loading valid url", async () => { const url = "https://js.langchain.com/docs/introduction"; @@ -84,4 +92,16 @@ describe("RecursiveUrlLoader", () => { false ); }); + + test("load docs from langsmith without reporting errors", async () => { + const url = "https://docs.smith.langchain.com/"; + const loader = new RecursiveUrlLoader(url, { + maxDepth: 5, + timeout: 5000, + }); + + const docs = await loader.load(); + expect(docs.length).toBeGreaterThan(1); + expect(console.error).not.toHaveBeenCalled(); + }); }); diff --git a/libs/langchain-community/src/document_loaders/web/recursive_url.ts b/libs/langchain-community/src/document_loaders/web/recursive_url.ts index a3c1fd0ebb5d..ac6d004fc5e4 100644 --- a/libs/langchain-community/src/document_loaders/web/recursive_url.ts +++ b/libs/langchain-community/src/document_loaders/web/recursive_url.ts @@ -1,4 +1,4 @@ -import { JSDOM } from "jsdom"; +import { JSDOM, VirtualConsole } from "jsdom"; import { Document } from "@langchain/core/documents"; import { AsyncCaller } from "@langchain/core/utils/async_caller"; import { @@ -6,6 +6,9 @@ import { DocumentLoader, } from "@langchain/core/document_loaders/base"; +const virtualConsole = new VirtualConsole(); +virtualConsole.on("error", () => {}); + export interface RecursiveUrlLoaderOptions { excludeDirs?: string[]; extractor?: (text: string) => string; @@ -62,7 +65,7 @@ export class RecursiveUrlLoader private getChildLinks(html: string, baseUrl: string): Array { const allLinks = Array.from( - new JSDOM(html).window.document.querySelectorAll("a") + new JSDOM(html, { virtualConsole }).window.document.querySelectorAll("a") ).map((a) => a.href); const absolutePaths = []; // eslint-disable-next-line no-script-url @@ -117,7 +120,7 @@ export class RecursiveUrlLoader private extractMetadata(rawHtml: string, url: string) { // eslint-disable-next-line @typescript-eslint/no-explicit-any const metadata: Record = { source: url }; - const { document } = new JSDOM(rawHtml).window; + const { document } = new JSDOM(rawHtml, { virtualConsole }).window; const title = document.getElementsByTagName("title")[0]; if (title) {