From cff52638b21d83b95f1757c90ccacb2f77fc5e39 Mon Sep 17 00:00:00 2001
From: Oege Dijk
Date: Tue, 8 Aug 2023 00:35:41 +0200
Subject: [PATCH] when encountering error during fetch return "" in
 web_base.py (#8753)

When e.g. downloading a sitemap that contains a malformed url (e.g.
"ttp://example.com/index.html", with the h omitted at the beginning of
the url), this ensures that the sitemap download does not crash but
just emits a warning. (This could be made optional with e.g. a
`skip_faulty_urls: bool = True` parameter, but this was the most
straightforward fix.)

@rlancemartin, @eyurtsev

---------

Co-authored-by: Bagatur
---
 .../langchain/document_loaders/blackboard.py |  8 +++++++-
 .../langchain/document_loaders/gitbook.py    |  6 ++++++
 .../langchain/document_loaders/sitemap.py    |  6 ++++++
 .../langchain/document_loaders/web_base.py   | 17 ++++++++++++++++-
 4 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/blackboard.py b/libs/langchain/langchain/document_loaders/blackboard.py
index 0ab6ca11be885..b21db2dfd1b78 100644
--- a/libs/langchain/langchain/document_loaders/blackboard.py
+++ b/libs/langchain/langchain/document_loaders/blackboard.py
@@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader):
             )
             documents = loader.load()
 
-    """
+    """  # noqa: E501
 
     base_url: str
     """Base url of the blackboard course."""
@@ -47,6 +47,7 @@ def __init__(
         load_all_recursively: bool = True,
         basic_auth: Optional[Tuple[str, str]] = None,
         cookies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with blackboard course url.
 
@@ -58,6 +59,10 @@ def __init__(
             load_all_recursively: If True, load all documents recursively.
             basic_auth: Basic auth credentials.
             cookies: Cookies.
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
 
         Raises:
             ValueError: If blackboard course url is invalid.
@@ -80,6 +85,7 @@ def __init__(
             cookies.update({"BbRouter": bbrouter})
             self.session.cookies.update(cookies)
         self.load_all_recursively = load_all_recursively
+        self.continue_on_failure = continue_on_failure
         self.check_bs4()
 
     def check_bs4(self) -> None:
diff --git a/libs/langchain/langchain/document_loaders/gitbook.py b/libs/langchain/langchain/document_loaders/gitbook.py
index e293c70c0ad1a..1fcec22922d10 100644
--- a/libs/langchain/langchain/document_loaders/gitbook.py
+++ b/libs/langchain/langchain/document_loaders/gitbook.py
@@ -19,6 +19,7 @@ def __init__(
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
         content_selector: str = "main",
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -31,6 +32,10 @@ def __init__(
                 appended to this base url. Defaults to `web_page`.
             content_selector: The CSS selector for the content to load.
                 Defaults to "main".
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
@@ -43,6 +48,7 @@ def __init__(
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
         self.content_selector = content_selector
+        self.continue_on_failure = continue_on_failure
 
     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
diff --git a/libs/langchain/langchain/document_loaders/sitemap.py b/libs/langchain/langchain/document_loaders/sitemap.py
index 68fe88eefbcfc..158f19e0d2d94 100644
--- a/libs/langchain/langchain/document_loaders/sitemap.py
+++ b/libs/langchain/langchain/document_loaders/sitemap.py
@@ -33,6 +33,7 @@ def __init__(
         blocknum: int = 0,
         meta_function: Optional[Callable] = None,
         is_local: bool = False,
+        continue_on_failure: bool = False,
     ):
         """Initialize with webpage path and optional filter URLs.
 
@@ -48,6 +49,10 @@ def __init__(
                 remember when setting this method to also copy metadata["loc"]
                 to metadata["source"] if you are using this field
             is_local: whether the sitemap is a local file. Default: False
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
 
         if blocksize is not None and blocksize < 1:
@@ -71,6 +76,7 @@ def __init__(
         self.blocksize = blocksize
         self.blocknum = blocknum
         self.is_local = is_local
+        self.continue_on_failure = continue_on_failure
 
     def parse_sitemap(self, soup: Any) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
diff --git a/libs/langchain/langchain/document_loaders/web_base.py b/libs/langchain/langchain/document_loaders/web_base.py
index ed684cd9ee155..5ae9482bce996 100644
--- a/libs/langchain/langchain/document_loaders/web_base.py
+++ b/libs/langchain/langchain/document_loaders/web_base.py
@@ -62,6 +62,7 @@ def __init__(
         header_template: Optional[dict] = None,
         verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with webpage path."""
 
@@ -96,6 +97,7 @@ def __init__(
         self.session = requests.Session()
         self.session.headers = dict(headers)
         self.session.verify = verify_ssl
+        self.continue_on_failure = continue_on_failure
 
         if proxies:
             self.session.proxies.update(proxies)
@@ -133,7 +135,20 @@ async def _fetch_with_rate_limit(
         self, url: str, semaphore: asyncio.Semaphore
     ) -> str:
         async with semaphore:
-            return await self._fetch(url)
+            try:
+                return await self._fetch(url)
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.warning(
+                        f"Error fetching {url}, skipping due to"
+                        f" continue_on_failure=True"
+                    )
+                    return ""
+                logger.exception(
+                    f"Error fetching {url} and aborting, use continue_on_failure=True "
+                    "to continue loading urls after encountering an error."
+                )
+                raise e
 
     async def fetch_all(self, urls: List[str]) -> Any:
         """Fetch all urls concurrently with rate limiting."""
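
For anyone picking up the new flag, here is a minimal usage sketch (the sitemap URL below is a placeholder, not taken from this patch): with `continue_on_failure=True`, a faulty url is logged and skipped, and its fetch contributes "" instead of raising.

```python
# Minimal usage sketch of the new flag, assuming the module path used in
# this patch (langchain.document_loaders). The sitemap URL is a placeholder.
from langchain.document_loaders import SitemapLoader

loader = SitemapLoader(
    "https://example.com/sitemap.xml",  # placeholder url
    continue_on_failure=True,  # warn and skip faulty urls instead of raising
)
documents = loader.load()  # a failed fetch yields "" rather than crashing the load
```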