Skip to content

Commit

Permalink
Make json output parser handle newlines inside markdown code blocks (#…
Browse files Browse the repository at this point in the history
…8682)

Update to #8528

Newlines and other special characters within markdown code blocks
returned as `action_input` should be handled correctly (in particular,
unescaped `"` => `\"` and `\n` => `\\n`) so they don't break JSON
parsing.

@baskaryan
  • Loading branch information
bborn authored Aug 7, 2023
1 parent ce3666c commit d56eff0
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 1 deletion.
33 changes: 33 additions & 0 deletions libs/langchain/langchain/output_parsers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,36 @@
from langchain.schema import BaseOutputParser, OutputParserException


def _replace_new_line(match: re.Match[str]) -> str:
    """Escape raw control characters and quotes inside a matched string value.

    Meant to be used as the replacement callback of ``re.sub``. The match
    must expose three groups: everything up to and including the opening
    quote (group 1), the raw value (group 2) and the closing quote (group 3).
    Only group 2 is rewritten.

    Args:
        match: Regex match providing the three groups described above.

    Returns:
        Group 1, the escaped value and group 3 joined back together.
    """
    value = match.group(2)

    # Literal newline, carriage-return and tab characters are not allowed
    # inside a JSON string; swap each for its two-character escape sequence.
    value = value.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")

    def _escape_quote(quote: re.Match[str]) -> str:
        backslashes = quote.group(1)
        # An odd run of backslashes means the quote is already escaped;
        # adding another backslash would un-escape it and break the JSON.
        if len(backslashes) % 2:
            return quote.group(0)
        return backslashes + '\\"'

    # Escape only the quotes that are not escaped yet.
    value = re.sub(r'(\\*)"', _escape_quote, value)

    return match.group(1) + value + match.group(3)


def _custom_parser(multiline_string: str) -> str:
    """
    Escape special characters in the `action_input` value of an LLM response.

    The value may span several lines and contain raw newlines, tabs or
    double quotes, none of which are legal inside a JSON string. Every match
    of the `"action_input": "..."` pattern is handed to `_replace_new_line`,
    whose return value replaces the matched text.

    Bytes-like input is decoded to `str` before the substitution runs.
    """
    text = multiline_string
    if isinstance(text, (bytes, bytearray)):
        text = text.decode()

    # DOTALL lets `.` cross line breaks; the greedy group runs to the last
    # double quote in the text.
    action_input_pattern = re.compile(
        r'("action_input"\:\s*")(.*)(")',
        flags=re.DOTALL,
    )
    return action_input_pattern.sub(_replace_new_line, text)


def parse_json_markdown(json_string: str) -> dict:
"""
Parse a JSON string from a Markdown string.
Expand All @@ -31,6 +61,9 @@ def parse_json_markdown(json_string: str) -> dict:
# Strip whitespace and newlines from the start and end
json_str = json_str.strip()

# handle newlines and other special characters inside the returned value
json_str = _custom_parser(json_str)

# Parse the JSON string into a Python dictionary
parsed = json.loads(json_str)

Expand Down
16 changes: 15 additions & 1 deletion libs/langchain/tests/unit_tests/output_parsers/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@
}
```"""

# Fenced JSON whose "action_input" value itself contains a markdown code block.
# This is a regular (non-raw) string literal, so `\n` and `\t` below are real
# newline and tab characters and `\"` is a bare double quote: the JSON text
# handed to the parser therefore contains unescaped special characters.
JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json
{
"action": "Final Answer",
"action_input": "```bar\n<div id="1" class=\"value\">\n\ttext\n</div>```"
}
```"""

# Plain JSON object with no surrounding markdown code fence.
NO_TICKS = """{
"foo": "bar"
}"""
Expand Down Expand Up @@ -114,6 +121,13 @@ def test_parse_json(json_string: str) -> None:
assert parsed == {"foo": "bar"}


def test_parse_json_with_code_block() -> None:
def test_parse_json_with_code_blocks() -> None:
    """Markdown code blocks inside JSON values must survive parsing intact."""
    # A fenced block embedded in a plain string value.
    simple = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
    assert simple == {"foo": "```bar```"}

    # A fenced block with raw newlines, tabs and quotes in `action_input`.
    expected_action_input = '```bar\n<div id="1" class="value">\n\ttext\n</div>```'
    multiline = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES)
    assert multiline == {
        "action": "Final Answer",
        "action_input": expected_action_input,
    }

0 comments on commit d56eff0

Please sign in to comment.