From 32e62360acc5f0eb311ce0197ecfd57875f90066 Mon Sep 17 00:00:00 2001
From: Anthony Sottile <asottile@umich.edu>
Date: Mon, 21 Oct 2024 20:05:00 -0400
Subject: [PATCH] handle unicode named escapes in fstring components

---
 tests/tokenize_rt_test.py | 36 ++++++++++++++++++++++++++++++++++++
 tokenize_rt.py            | 16 ++++++++++++++--
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/tests/tokenize_rt_test.py b/tests/tokenize_rt_test.py
index b6f572a..258f399 100644
--- a/tests/tokenize_rt_test.py
+++ b/tests/tokenize_rt_test.py
@@ -6,6 +6,7 @@
 import pytest
 
 from tokenize_rt import _re_partition
+from tokenize_rt import curly_escape
 from tokenize_rt import ESCAPED_NL
 from tokenize_rt import main
 from tokenize_rt import Offset
@@ -201,6 +202,28 @@ def test_src_to_tokens_fstring_with_escapes():
     ]
 
 
+def test_src_to_tokens_fstring_with_named_escapes():
+    src = r'f" \N{SNOWMAN} "'
+    ret = src_to_tokens(src)
+    if sys.version_info >= (3, 12):  # pragma: >=3.12 cover
+        assert ret == [
+            Token(name='ENCODING', src='utf-8', line=None, utf8_byte_offset=None),  # noqa: E501
+            Token(name='FSTRING_START', src='f"', line=1, utf8_byte_offset=0),
+            Token(name='FSTRING_MIDDLE', src=' \\N{SNOWMAN}', line=1, utf8_byte_offset=2),  # noqa: E501
+            Token(name='FSTRING_MIDDLE', src=' ', line=1, utf8_byte_offset=14),
+            Token(name='FSTRING_END', src='"', line=1, utf8_byte_offset=15),
+            Token(name='NEWLINE', src='', line=1, utf8_byte_offset=16),
+            Token(name='ENDMARKER', src='', line=2, utf8_byte_offset=0),
+        ]
+    else:  # pragma: <3.12 cover
+        assert ret == [
+            Token(name='ENCODING', src='utf-8', line=None, utf8_byte_offset=None),  # noqa: E501
+            Token(name='STRING', src='f" \\N{SNOWMAN} "', line=1, utf8_byte_offset=0),  # noqa: E501
+            Token(name='NEWLINE', src='', line=1, utf8_byte_offset=16),
+            Token(name='ENDMARKER', src='', line=2, utf8_byte_offset=0),
+        ]
+
+
 @pytest.mark.parametrize(
     'filename',
     (
@@ -343,3 +366,16 @@ def test_main(capsys, tmp_path):
         "1:5 NEWLINE '\\n'\n"
         "2:0 ENDMARKER ''\n"
     )
+
+
+@pytest.mark.parametrize(
+    ('s', 'expected'),
+    (
+        ('', ''),
+        ('{foo}', '{{foo}}'),
+        (r'\N{SNOWMAN}', r'\N{SNOWMAN}'),
+        (r'\N{SNOWMAN} {bar}', r'\N{SNOWMAN} {{bar}}'),
+    ),
+)
+def test_curly_escape(s, expected):
+    assert curly_escape(s) == expected
diff --git a/tokenize_rt.py b/tokenize_rt.py
index 559b6d5..f909ead 100644
--- a/tokenize_rt.py
+++ b/tokenize_rt.py
@@ -47,6 +47,16 @@ def matches(self, *, name: str, src: str) -> bool:
 _string_re = re.compile('^([^\'"]*)(.*)$', re.DOTALL)
 _escaped_nl_re = re.compile(r'\\(\n|\r\n|\r)')
 
+NAMED_UNICODE_RE = re.compile(r'(?<!\\)((?:\\\\)*\\N\{[^}]+\})')
+
+
+def curly_escape(s: str) -> str:
+    parts = NAMED_UNICODE_RE.split(s)
+    return ''.join(
+        part.replace('{', '{{').replace('}', '}}') if i % 2 == 0 else part
+        for i, part in enumerate(parts)
+    )
+
 
 def _re_partition(regex: Pattern[str], s: str) -> tuple[str, str, str]:
     match = regex.search(s)
@@ -101,8 +111,10 @@ def src_to_tokens(src: str) -> list[Token]:
         tok_name = tokenize.tok_name[tok_type]
 
         if tok_name == 'FSTRING_MIDDLE':  # pragma: >=3.12 cover
-            ecol += tok_text.count('{') + tok_text.count('}')
-            tok_text = tok_text.replace('{', '{{').replace('}', '}}')
+            if '{' in tok_text or '}' in tok_text:
+                new_tok_text = curly_escape(tok_text)
+                ecol += len(new_tok_text) - len(tok_text)
+                tok_text = new_tok_text
         tokens.append(Token(tok_name, tok_text, sline, end_offset))
 
         last_line, last_col = eline, ecol
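
A quick sketch of the behavior this patch pins down (illustrative only: `demo.py` is a hypothetical file and assumes a tokenize-rt build with this patch applied). `curly_escape` doubles literal curly braces, which is how `src_to_tokens` reconstructs the original source of a 3.12+ `FSTRING_MIDDLE` token, while leaving the braces of `\N{...}` named escapes untouched:

    # demo.py -- hypothetical usage sketch; assumes this patch is applied
    from tokenize_rt import curly_escape
    from tokenize_rt import src_to_tokens
    from tokenize_rt import tokens_to_src

    # literal braces are doubled; the braces of a named escape are not
    assert curly_escape('{foo}') == '{{foo}}'
    assert curly_escape(r'\N{SNOWMAN} {bar}') == r'\N{SNOWMAN} {{bar}}'

    # on 3.12+ the tokenizer collapses doubled braces in FSTRING_MIDDLE
    # tokens, so src_to_tokens re-escapes them; named escapes must be
    # skipped or the round-trip below would gain spurious braces
    src = r'f" \N{SNOWMAN} "'
    assert tokens_to_src(src_to_tokens(src)) == src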