From 3256b5d6ae4ffb3118d2b0de0b102551eed3f42e Mon Sep 17 00:00:00 2001 From: Yuxin Chen Date: Wed, 18 Dec 2024 15:27:59 -0500 Subject: [PATCH] text-splitters: fix state persistence issue in ExperimentalMarkdownSyntaxTextSplitter (#28373) - **Description:** This PR resolves an issue with the `ExperimentalMarkdownSyntaxTextSplitter` class, which retains the internal state across multiple calls to the `split_text` method. This behaviour caused an unintended accumulation of chunks in `self` variables, leading to incorrect outputs when processing multiple Markdown files sequentially. - Modified `libs\text-splitters\langchain_text_splitters\markdown.py` to reset the relevant internal attributes at the start of each `split_text` invocation. This ensures each call processes the input independently. - Added unit tests in `libs\text-splitters\tests\unit_tests\test_text_splitters.py` to verify the fix and ensure the state does not persist across calls. - **Issue:** Fixes [#26440](https://github.com/langchain-ai/langchain/issues/26440). - **Dependencies:** No additional dependencies are introduced with this change. - [x] Unit tests were added to verify the changes. - [x] Updated documentation where necessary. - [x] Ran `make format`, `make lint`, and `make test` to ensure compliance with project standards. --------- Co-authored-by: Angel Chen Co-authored-by: Chester Curme --- .../langchain_text_splitters/markdown.py | 5 + .../tests/unit_tests/test_text_splitters.py | 401 ++++++++++++++++++ 2 files changed, 406 insertions(+) diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py index 34c7d2197d238..f8d3807dc5011 100644 --- a/libs/text-splitters/langchain_text_splitters/markdown.py +++ b/libs/text-splitters/langchain_text_splitters/markdown.py @@ -324,6 +324,11 @@ def split_text(self, text: str) -> List[Document]: chunks of the input text. If `return_each_line` is enabled, each line is returned as a separate `Document`. """ + # Reset the state for each new file processed + self.chunks.clear() + self.current_chunk = Document(page_content="") + self.current_header_stack.clear() + raw_lines = text.splitlines(keepends=True) while raw_lines: diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 95f170d52b7c4..ee86ae6a60abe 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1527,6 +1527,407 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None: assert output == expected_output +EXPERIMENTAL_MARKDOWN_DOCUMENTS = [ + ( + "# My Header 1 From Document 1\n" + "Content for header 1 from Document 1\n" + "## Header 2 From Document 1\n" + "Content for header 2 from Document 1\n" + "```python\n" + "def func_definition():\n" + " print('Keep the whitespace consistent')\n" + "```\n" + "# Header 1 again From Document 1\n" + "We should also split on the horizontal line\n" + "----\n" + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + ( + "# My Header 1 From Document 2\n" + "Content for header 1 from Document 2\n" + "## Header 2 From Document 2\n" + "Content for header 2 from Document 2\n" + "```python\n" + "def func_definition():\n" + " print('Keep the whitespace consistent')\n" + "```\n" + "# Header 1 again From Document 2\n" + "We should also split on the horizontal line\n" + "----\n" + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), +] + + +def test_experimental_markdown_syntax_text_splitter_on_multi_files() -> None: + """Test experimental markdown syntax splitter split + on default called consecutively on two files.""" + markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter() + output = [] + for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS: + output += markdown_splitter.split_text(experimental_markdown_document) + + expected_output = [ + Document( + page_content="Content for header 1 from Document 1\n", + metadata={"Header 1": "My Header 1 From Document 1"}, + ), + Document( + page_content="Content for header 2 from Document 1\n", + metadata={ + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content=( + "```python\ndef func_definition():\n " + "print('Keep the whitespace consistent')\n```\n" + ), + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content="We should also split on the horizontal line\n", + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content=( + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content="Content for header 1 from Document 2\n", + metadata={"Header 1": "My Header 1 From Document 2"}, + ), + Document( + page_content="Content for header 2 from Document 2\n", + metadata={ + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content=( + "```python\ndef func_definition():\n " + "print('Keep the whitespace consistent')\n```\n" + ), + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content="We should also split on the horizontal line\n", + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + Document( + page_content=( + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + ] + + assert output == expected_output + + +def test_experimental_markdown_syntax_text_splitter_split_lines_on_multi_files() -> ( + None +): + """Test experimental markdown syntax splitter split + on each line called consecutively on two files.""" + markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True) + output = [] + for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS: + output += markdown_splitter.split_text(experimental_markdown_document) + expected_output = [ + Document( + page_content="Content for header 1 from Document 1", + metadata={"Header 1": "My Header 1 From Document 1"}, + ), + Document( + page_content="Content for header 2 from Document 1", + metadata={ + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content="```python", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content="def func_definition():", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content=" print('Keep the whitespace consistent')", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content="```", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content="We should also split on the horizontal line", + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content="This will be a new doc but with the same header metadata", + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content="And it includes a new paragraph", + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content="Content for header 1 from Document 2", + metadata={"Header 1": "My Header 1 From Document 2"}, + ), + Document( + page_content="Content for header 2 from Document 2", + metadata={ + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content="```python", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content="def func_definition():", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content=" print('Keep the whitespace consistent')", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content="```", + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content="We should also split on the horizontal line", + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + Document( + page_content="This will be a new doc but with the same header metadata", + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + Document( + page_content="And it includes a new paragraph", + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + ] + + assert output == expected_output + + +def test_experimental_markdown_syntax_text_splitter_with_header_on_multi_files() -> ( + None +): + """Test experimental markdown splitter + by header called consecutively on two files""" + + markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False) + output = [] + for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS: + output += markdown_splitter.split_text(experimental_markdown_document) + + expected_output = [ + Document( + page_content="# My Header 1 From Document 1\n" + "Content for header 1 from Document 1\n", + metadata={"Header 1": "My Header 1 From Document 1"}, + ), + Document( + page_content="## Header 2 From Document 1\n" + "Content for header 2 from Document 1\n", + metadata={ + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content=( + "```python\ndef func_definition():\n " + "print('Keep the whitespace consistent')\n```\n" + ), + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 1", + "Header 2": "Header 2 From Document 1", + }, + ), + Document( + page_content="# Header 1 again From Document 1\n" + "We should also split on the horizontal line\n", + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content=( + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + metadata={"Header 1": "Header 1 again From Document 1"}, + ), + Document( + page_content="# My Header 1 From Document 2\n" + "Content for header 1 from Document 2\n", + metadata={"Header 1": "My Header 1 From Document 2"}, + ), + Document( + page_content="## Header 2 From Document 2\n" + "Content for header 2 from Document 2\n", + metadata={ + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content=( + "```python\ndef func_definition():\n " + "print('Keep the whitespace consistent')\n```\n" + ), + metadata={ + "Code": "python", + "Header 1": "My Header 1 From Document 2", + "Header 2": "Header 2 From Document 2", + }, + ), + Document( + page_content="# Header 1 again From Document 2\n" + "We should also split on the horizontal line\n", + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + Document( + page_content=( + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + metadata={"Header 1": "Header 1 again From Document 2"}, + ), + ] + assert output == expected_output + + +def test_experimental_markdown_syntax_text_splitter_header_config_on_multi_files() -> ( + None +): + """Test experimental markdown splitter + by header configuration called consecutively on two files""" + + headers_to_split_on = [("#", "Encabezamiento 1")] + markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter( + headers_to_split_on=headers_to_split_on + ) + output = [] + for experimental_markdown_document in EXPERIMENTAL_MARKDOWN_DOCUMENTS: + output += markdown_splitter.split_text(experimental_markdown_document) + + expected_output = [ + Document( + page_content="Content for header 1 from Document 1\n" + "## Header 2 From Document 1\n" + "Content for header 2 from Document 1\n", + metadata={"Encabezamiento 1": "My Header 1 From Document 1"}, + ), + Document( + page_content=( + "```python\ndef func_definition():\n " + "print('Keep the whitespace consistent')\n```\n" + ), + metadata={ + "Code": "python", + "Encabezamiento 1": "My Header 1 From Document 1", + }, + ), + Document( + page_content="We should also split on the horizontal line\n", + metadata={"Encabezamiento 1": "Header 1 again From Document 1"}, + ), + Document( + page_content=( + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + metadata={"Encabezamiento 1": "Header 1 again From Document 1"}, + ), + Document( + page_content="Content for header 1 from Document 2\n" + "## Header 2 From Document 2\n" + "Content for header 2 from Document 2\n", + metadata={"Encabezamiento 1": "My Header 1 From Document 2"}, + ), + Document( + page_content=( + "```python\ndef func_definition():\n " + "print('Keep the whitespace consistent')\n```\n" + ), + metadata={ + "Code": "python", + "Encabezamiento 1": "My Header 1 From Document 2", + }, + ), + Document( + page_content="We should also split on the horizontal line\n", + metadata={"Encabezamiento 1": "Header 1 again From Document 2"}, + ), + Document( + page_content=( + "This will be a new doc but with the same header metadata\n\n" + "And it includes a new paragraph" + ), + metadata={"Encabezamiento 1": "Header 1 again From Document 2"}, + ), + ] + + assert output == expected_output + + def test_solidity_code_splitter() -> None: splitter = RecursiveCharacterTextSplitter.from_language( Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0