diff --git a/libs/langchain/langchain/output_parsers/json.py b/libs/langchain/langchain/output_parsers/json.py
index 9e0c83e56b6dc..f0f653eca2101 100644
--- a/libs/langchain/langchain/output_parsers/json.py
+++ b/libs/langchain/langchain/output_parsers/json.py
@@ -8,6 +8,78 @@
 from langchain.schema import BaseOutputParser, OutputParserException
 
 
+# Raw characters that may not appear unescaped inside a JSON string literal,
+# mapped to the escape sequence that replaces them.
+_JSON_ESCAPES = {"\n": "\\n", "\r": "\\r", "\t": "\\t", '"': '\\"'}
+
+# Matches an existing backslash escape pair (left untouched) or a single raw
+# character that still has to be escaped.
+_ESCAPABLE = re.compile(r'\\[^\n\r\t]|[\n\r\t"]')
+
+# Ways of delimiting the `action_input` value, tried in order.
+_ACTION_INPUT_PATTERNS = (
+    # The value ends at the first quote followed by the next `"key":` or `}`,
+    # so keys that come after `action_input` are left alone.
+    re.compile(
+        r'("action_input"\s*:\s*")(.*?)("(?=\s*(?:,\s*"[^"\n]*"\s*:|\})))',
+        re.DOTALL,
+    ),
+    # The value runs to the last quote in the text; this covers a trailing
+    # `action_input` whose own text looks like JSON.
+    re.compile(r'("action_input"\s*:\s*")(.*)(")', re.DOTALL),
+)
+
+
+def _replace_new_line(match: re.Match[str]) -> str:
+    """
+    Escape the raw newlines, carriage returns, tabs and double quotes found
+    in group 2 of `match`, leaving existing backslash escapes (such as `\\"`)
+    untouched, and return the match with that group replaced.
+    """
+    value = _ESCAPABLE.sub(
+        lambda m: _JSON_ESCAPES.get(m.group(0), m.group(0)), match.group(2)
+    )
+
+    return match.group(1) + value + match.group(3)
+
+
+def _is_valid_json(text: str) -> bool:
+    """Return whether `text` parses as JSON."""
+    try:
+        json.loads(text)
+    except ValueError:
+        return False
+    return True
+
+
+def _custom_parser(multiline_string: str) -> str:
+    """
+    The LLM response for `action_input` may be a multiline
+    string containing unescaped newlines, tabs or quotes. This function
+    replaces those characters with their escaped counterparts.
+    (newlines in JSON must be double-escaped: `\\n`)
+
+    Input that already parses as JSON is returned unchanged. Otherwise each
+    pattern in `_ACTION_INPUT_PATTERNS` is tried and the first repair that
+    parses is returned; if none parses, the last attempt is returned so the
+    caller's `json.loads` reports the error.
+    """
+    if isinstance(multiline_string, (bytes, bytearray)):
+        multiline_string = multiline_string.decode()
+
+    # Rewriting valid JSON can only damage it (e.g. double-escaping `\"`).
+    if _is_valid_json(multiline_string):
+        return multiline_string
+
+    repaired = multiline_string
+    for pattern in _ACTION_INPUT_PATTERNS:
+        repaired = pattern.sub(_replace_new_line, multiline_string)
+        if _is_valid_json(repaired):
+            break
+
+    return repaired
+
+
 def parse_json_markdown(json_string: str) -> dict:
     """
     Parse a JSON string from a Markdown string.
@@ -31,6 +61,9 @@ def parse_json_markdown(json_string: str) -> dict: # Strip whitespace and newlines from the start and end json_str = json_str.strip() + # handle newlines and other special characters inside the returned value + json_str = _custom_parser(json_str) + # Parse the JSON string into a Python dictionary parsed = json.loads(json_str) diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_json.py b/libs/langchain/tests/unit_tests/output_parsers/test_json.py index 1762024ae1b75..afaa4bf64f308 100644 --- a/libs/langchain/tests/unit_tests/output_parsers/test_json.py +++ b/libs/langchain/tests/unit_tests/output_parsers/test_json.py @@ -60,6 +60,13 @@ } ```""" +JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json +{ + "action": "Final Answer", + "action_input": "```bar\n
\n\ttext\n
```" +} +```""" + NO_TICKS = """{ "foo": "bar" }""" @@ -114,6 +121,13 @@ def test_parse_json(json_string: str) -> None: assert parsed == {"foo": "bar"} -def test_parse_json_with_code_block() -> None: +def test_parse_json_with_code_blocks() -> None: parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK) assert parsed == {"foo": "```bar```"} + + parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES) + + assert parsed == { + "action": "Final Answer", + "action_input": '```bar\n
\n\ttext\n
```', + }