From 3256b5d6ae4ffb3118d2b0de0b102551eed3f42e Mon Sep 17 00:00:00 2001
From: Yuxin Chen <katy.chen@mail.utoronto.ca>
Date: Wed, 18 Dec 2024 15:27:59 -0500
Subject: [PATCH] text-splitters: fix state persistence issue in
 ExperimentalMarkdownSyntaxTextSplitter (#28373)

- **Description:**
This PR resolves an issue with the
`ExperimentalMarkdownSyntaxTextSplitter` class, which retained its
internal state across multiple calls to the `split_text` method. As a
result, chunks accumulated in instance attributes (`self.chunks`,
`self.current_chunk`, `self.current_header_stack`), leading to incorrect
output when the same splitter instance processed multiple Markdown files
sequentially.

- Modified `libs/text-splitters/langchain_text_splitters/markdown.py` to
reset the relevant internal attributes at the start of each `split_text`
invocation. This ensures each call processes its input independently.
- Added unit tests in
`libs/text-splitters/tests/unit_tests/test_text_splitters.py` to verify
the fix and ensure that state does not persist across calls.

- **Issue:**
Fixes [#26440](https://github.com/langchain-ai/langchain/issues/26440).

- **Dependencies:**
No additional dependencies are introduced with this change.


- [x] Unit tests were added to verify the changes.
- [x] Updated documentation where necessary.
- [x] Ran `make format`, `make lint`, and `make test` to ensure
compliance with project standards.

---------

Co-authored-by: Angel Chen <angelchen396@gmail.com>
Co-authored-by: Chester Curme <chester.curme@gmail.com>
---
 .../langchain_text_splitters/markdown.py      |   5 +
 .../tests/unit_tests/test_text_splitters.py   | 401 ++++++++++++++++++
 2 files changed, 406 insertions(+)

diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py
index 34c7d2197d238..f8d3807dc5011 100644
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -324,6 +324,11 @@ def split_text(self, text: str) -> List[Document]:
             chunks of the input text. If `return_each_line` is enabled, each line
             is returned as a separate `Document`.
         """
+        # Reset the state for each new file processed
+        self.chunks.clear()
+        self.current_chunk = Document(page_content="")
+        self.current_header_stack.clear()
+
         raw_lines = text.splitlines(keepends=True)
 
         while raw_lines:
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
index 95f170d52b7c4..ee86ae6a60abe 100644
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -1527,6 +1527,407 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
     assert output == expected_output
 
 
+EXPERIMENTAL_MARKDOWN_DOCUMENTS = [
+    (
+        "# My Header 1 From Document 1\n"
+        "Content for header 1 from Document 1\n"
+        "## Header 2 From Document 1\n"
+        "Content for header 2 from Document 1\n"
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+        "# Header 1 again From Document 1\n"
+        "We should also split on the horizontal line\n"
+        "----\n"
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    ),
+    (
+        "# My Header 1 From Document 2\n"
+        "Content for header 1 from Document 2\n"
+        "## Header 2 From Document 2\n"
+        "Content for header 2 from Document 2\n"
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+        "# Header 1 again From Document 2\n"
+        "We should also split on the horizontal line\n"
+        "----\n"
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    ),
+]
+
+
+def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None:
+    """Test experimental markdown syntax splitter split
+    on default called consecutively on two files."""
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="Content for header 1 from Document 1\n",
+            metadata={"Header 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 1\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 1 from Document 2\n",
+            metadata={"Header 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 2\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
+def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files() -> (
+    None
+):
+    """Test experimental markdown syntax splitter split
+    on each line called consecutively on two files."""
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+    expected_output = [
+        Document(
+            page_content="Content for header 1 from Document 1",
+            metadata={"Header 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 1",
+            metadata={
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="```python",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="def func_definition():",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="   print('Keep the whitespace consistent')",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="```",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="This will be a new doc but with the same header metadata",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="And it includes a new paragraph",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 1 from Document 2",
+            metadata={"Header 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content="Content for header 2 from Document 2",
+            metadata={
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="```python",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="def func_definition():",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="   print('Keep the whitespace consistent')",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="```",
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content="This will be a new doc but with the same header metadata",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content="And it includes a new paragraph",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
+def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files() -> (
+    None
+):
+    """Test experimental markdown splitter
+    by header called consecutively on two files"""
+
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="# My Header 1 From Document 1\n"
+            "Content for header 1 from Document 1\n",
+            metadata={"Header 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content="## Header 2 From Document 1\n"
+            "Content for header 2 from Document 1\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 1",
+                "Header 2": "Header 2 From Document 1",
+            },
+        ),
+        Document(
+            page_content="# Header 1 again From Document 1\n"
+            "We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="# My Header 1 From Document 2\n"
+            "Content for header 1 from Document 2\n",
+            metadata={"Header 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content="## Header 2 From Document 2\n"
+            "Content for header 2 from Document 2\n",
+            metadata={
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1 From Document 2",
+                "Header 2": "Header 2 From Document 2",
+            },
+        ),
+        Document(
+            page_content="# Header 1 again From Document 2\n"
+            "We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Header 1": "Header 1 again From Document 2"},
+        ),
+    ]
+    assert output == expected_output
+
+
+def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files() -> (
+    None
+):
+    """Test experimental markdown splitter
+    by header configuration called consecutively on two files"""
+
+    headers_to_split_on = [("#", "Encabezamiento 1")]
+    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(
+        headers_to_split_on=headers_to_split_on
+    )
+    output = []
+    for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS:
+        output += markdown_splitter.split_text(experimental_markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="Content for header 1 from Document 1\n"
+            "## Header 2 From Document 1\n"
+            "Content for header 2 from Document 1\n",
+            metadata={"Encabezamiento 1": "My Header 1 From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Encabezamiento 1": "My Header 1 From Document 1",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Encabezamiento 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Encabezamiento 1": "Header 1 again From Document 1"},
+        ),
+        Document(
+            page_content="Content for header 1 from Document 2\n"
+            "## Header 2 From Document 2\n"
+            "Content for header 2 from Document 2\n",
+            metadata={"Encabezamiento 1": "My Header 1 From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "```python\ndef func_definition():\n   "
+                "print('Keep the whitespace consistent')\n```\n"
+            ),
+            metadata={
+                "Code": "python",
+                "Encabezamiento 1": "My Header 1 From Document 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Encabezamiento 1": "Header 1 again From Document 2"},
+        ),
+        Document(
+            page_content=(
+                "This will be a new doc but with the same header metadata\n\n"
+                "And it includes a new paragraph"
+            ),
+            metadata={"Encabezamiento 1": "Header 1 again From Document 2"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
 def test_solidity_code_splitter() -> None:
     splitter = RecursiveCharacterTextSplitter.from_language(
         Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0