From cff52638b21d83b95f1757c90ccacb2f77fc5e39 Mon Sep 17 00:00:00 2001
From: Oege Dijk
Date: Tue, 8 Aug 2023 00:35:41 +0200
Subject: [PATCH] when encountering error during fetch return "" in
 web_base.py (#8753)

When e.g. downloading a sitemap that contains a malformed url (e.g.
"ttp://example.com/index.html", with the h omitted at the beginning of
the url), this ensures that the sitemap download does not crash but
just emits a warning. (This could be made optional with e.g. a
`skip_faulty_urls: bool = True` parameter, but this was the most
straightforward fix.)

@rlancemartin, @eyurtsev

---------

Co-authored-by: Bagatur
---
 .../langchain/document_loaders/blackboard.py |  8 +++++++-
 .../langchain/document_loaders/gitbook.py    |  6 ++++++
 .../langchain/document_loaders/sitemap.py    |  6 ++++++
 .../langchain/document_loaders/web_base.py   | 17 ++++++++++++++++-
 4 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/blackboard.py b/libs/langchain/langchain/document_loaders/blackboard.py
index 0ab6ca11be885..b21db2dfd1b78 100644
--- a/libs/langchain/langchain/document_loaders/blackboard.py
+++ b/libs/langchain/langchain/document_loaders/blackboard.py
@@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader):
             )
             documents = loader.load()
 
-    """
+    """  # noqa: E501
 
     base_url: str
     """Base url of the blackboard course."""
@@ -47,6 +47,7 @@ def __init__(
         load_all_recursively: bool = True,
         basic_auth: Optional[Tuple[str, str]] = None,
         cookies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with blackboard course url.
 
@@ -58,6 +59,10 @@ def __init__(
             load_all_recursively: If True, load all documents recursively.
             basic_auth: Basic auth credentials.
             cookies: Cookies.
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
 
         Raises:
             ValueError: If blackboard course url is invalid.
@@ -80,6 +85,7 @@ def __init__(
             cookies.update({"BbRouter": bbrouter})
             self.session.cookies.update(cookies)
         self.load_all_recursively = load_all_recursively
+        self.continue_on_failure = continue_on_failure
         self.check_bs4()
 
     def check_bs4(self) -> None:
diff --git a/libs/langchain/langchain/document_loaders/gitbook.py b/libs/langchain/langchain/document_loaders/gitbook.py
index e293c70c0ad1a..1fcec22922d10 100644
--- a/libs/langchain/langchain/document_loaders/gitbook.py
+++ b/libs/langchain/langchain/document_loaders/gitbook.py
@@ -19,6 +19,7 @@ def __init__(
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
         content_selector: str = "main",
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -31,6 +32,10 @@ def __init__(
                 appended to this base url. Defaults to `web_page`.
             content_selector: The CSS selector for the content to load.
                 Defaults to "main".
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
@@ -43,6 +48,7 @@ def __init__(
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
         self.content_selector = content_selector
+        self.continue_on_failure = continue_on_failure
 
     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
diff --git a/libs/langchain/langchain/document_loaders/sitemap.py b/libs/langchain/langchain/document_loaders/sitemap.py
index 68fe88eefbcfc..158f19e0d2d94 100644
--- a/libs/langchain/langchain/document_loaders/sitemap.py
+++ b/libs/langchain/langchain/document_loaders/sitemap.py
@@ -33,6 +33,7 @@ def __init__(
         blocknum: int = 0,
         meta_function: Optional[Callable] = None,
         is_local: bool = False,
+        continue_on_failure: bool = False,
     ):
         """Initialize with webpage path and optional filter URLs.
 
@@ -48,6 +49,10 @@ def __init__(
                 remember when setting this method to also copy metadata["loc"]
                 to metadata["source"] if you are using this field
             is_local: whether the sitemap is a local file. Default: False
+            continue_on_failure: whether to continue loading the sitemap if an error
+                occurs loading a url, emitting a warning instead of raising an
+                exception. Setting this to True makes the loader more robust, but also
+                may result in missing data. Default: False
         """
 
         if blocksize is not None and blocksize < 1:
@@ -71,6 +76,7 @@ def __init__(
         self.blocksize = blocksize
         self.blocknum = blocknum
         self.is_local = is_local
+        self.continue_on_failure = continue_on_failure
 
     def parse_sitemap(self, soup: Any) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
diff --git a/libs/langchain/langchain/document_loaders/web_base.py b/libs/langchain/langchain/document_loaders/web_base.py
index ed684cd9ee155..5ae9482bce996 100644
--- a/libs/langchain/langchain/document_loaders/web_base.py
+++ b/libs/langchain/langchain/document_loaders/web_base.py
@@ -62,6 +62,7 @@ def __init__(
         header_template: Optional[dict] = None,
         verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
+        continue_on_failure: Optional[bool] = False,
     ):
         """Initialize with webpage path."""
 
@@ -96,6 +97,7 @@ def __init__(
         self.session = requests.Session()
         self.session.headers = dict(headers)
         self.session.verify = verify_ssl
+        self.continue_on_failure = continue_on_failure
 
         if proxies:
             self.session.proxies.update(proxies)
@@ -133,7 +135,20 @@ async def _fetch_with_rate_limit(
         self, url: str, semaphore: asyncio.Semaphore
     ) -> str:
         async with semaphore:
-            return await self._fetch(url)
+            try:
+                return await self._fetch(url)
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.warning(
+                        f"Error fetching {url}, skipping due to"
+                        f" continue_on_failure=True"
+                    )
+                    return ""
+                logger.exception(
+                    f"Error fetching {url} and aborting, use continue_on_failure=True "
+                    "to continue loading urls after encountering an error."
+                )
+                raise e
 
     async def fetch_all(self, urls: List[str]) -> Any:
         """Fetch all urls concurrently with rate limiting."""
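
For anyone picking up the new flag, here is a minimal usage sketch (the sitemap URL below is a placeholder, not taken from this patch): with `continue_on_failure=True`, a faulty url is logged and skipped, and its fetch contributes "" instead of raising.

```python
# Minimal usage sketch of the new flag, assuming the module path used in
# this patch (langchain.document_loaders). The sitemap URL is a placeholder.
from langchain.document_loaders import SitemapLoader

loader = SitemapLoader(
    "https://example.com/sitemap.xml",  # placeholder url
    continue_on_failure=True,  # warn and skip faulty urls instead of raising
)
documents = loader.load()  # a failed fetch yields "" rather than crashing the load
```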