
handle unicode named escapes in fstring components
asottile committed Oct 22, 2024
1 parent 75de5c5 commit 32e6236
Showing 2 changed files with 48 additions and 2 deletions.
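
Before this change, src_to_tokens re-doubled every '{' and '}' it found in a 3.12+ FSTRING_MIDDLE token (undoing the tokenizer's collapsing of doubled braces). The braces of a \N{...} named unicode escape are not doubled in the source, though, so the blanket replace turned \N{SNOWMAN} into \N{{SNOWMAN}} and threw the column offsets off. The fix adds a curly_escape helper that splits out named escapes first and only doubles braces outside them. A minimal sketch of that approach, mirroring the helper added below, with demo inputs taken from the new tests:

    import re

    # regex from this commit: captures \N{...} escapes that are not themselves backslash-escaped
    NAMED_UNICODE_RE = re.compile(r'(?<!\\)(?:\\\\)*(\\N\{[^}]+\})')

    def curly_escape(s: str) -> str:
        # re.split with one capturing group alternates plain text (even indices)
        # and the captured \N{...} escapes (odd indices); only the plain text
        # gets its braces doubled
        parts = NAMED_UNICODE_RE.split(s)
        return ''.join(
            part.replace('{', '{{').replace('}', '}}') if i % 2 == 0 else part
            for i, part in enumerate(parts)
        )

    print(curly_escape('{foo}'))               # {{foo}}
    print(curly_escape(r'\N{SNOWMAN} {bar}'))  # \N{SNOWMAN} {{bar}}
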
34 changes: 34 additions & 0 deletions tests/tokenize_rt_test.py
@@ -6,6 +6,7 @@
import pytest

from tokenize_rt import _re_partition
from tokenize_rt import curly_escape
from tokenize_rt import ESCAPED_NL
from tokenize_rt import main
from tokenize_rt import Offset
@@ -201,6 +202,26 @@ def test_src_to_tokens_fstring_with_escapes():
]


def test_src_to_tokens_fstring_with_named_escapes():
src = r'f" \N{SNOWMAN} "'
ret = src_to_tokens(src)
if sys.version_info >= (3, 12): # pragma: >=3.12 cover
assert ret == [
Token(name='FSTRING_START', src='f"', line=1, utf8_byte_offset=0),
Token(name='FSTRING_MIDDLE', src=' \\N{SNOWMAN}', line=1, utf8_byte_offset=2), # noqa: E501
Token(name='FSTRING_MIDDLE', src=' ', line=1, utf8_byte_offset=14),
Token(name='FSTRING_END', src='"', line=1, utf8_byte_offset=15),
Token(name='NEWLINE', src='', line=1, utf8_byte_offset=16),
Token(name='ENDMARKER', src='', line=2, utf8_byte_offset=0),
]
else: # pragma: <3.12 cover
assert ret == [
Token(name='STRING', src='f" \\N{SNOWMAN} "', line=1, utf8_byte_offset=0), # noqa: E501
Token(name='NEWLINE', src='', line=1, utf8_byte_offset=16),
Token(name='ENDMARKER', src='', line=2, utf8_byte_offset=0),
]


@pytest.mark.parametrize(
'filename',
(
@@ -343,3 +364,16 @@ def test_main(capsys, tmp_path):
"1:5 NEWLINE '\\n'\n"
"2:0 ENDMARKER ''\n"
)


@pytest.mark.parametrize(
('s', 'expected'),
(
('', ''),
('{foo}', '{{foo}}'),
(r'\N{SNOWMAN}', r'\N{SNOWMAN}'),
(r'\N{SNOWMAN} {bar}', r'\N{SNOWMAN} {{bar}}'),
),
)
def test_curly_escape(s, expected):
assert curly_escape(s) == expected
16 changes: 14 additions & 2 deletions tokenize_rt.py
@@ -47,6 +47,16 @@ def matches(self, *, name: str, src: str) -> bool:
_string_re = re.compile('^([^\'"]*)(.*)$', re.DOTALL)
_escaped_nl_re = re.compile(r'\\(\n|\r\n|\r)')

NAMED_UNICODE_RE = re.compile(r'(?<!\\)(?:\\\\)*(\\N\{[^}]+\})')


def curly_escape(s: str) -> str:
parts = NAMED_UNICODE_RE.split(s)
return ''.join(
part.replace('{', '{{').replace('}', '}}') if i % 2 == 0 else part
for i, part in enumerate(parts)
)


def _re_partition(regex: Pattern[str], s: str) -> tuple[str, str, str]:
match = regex.search(s)
@@ -101,8 +111,10 @@ def src_to_tokens(src: str) -> list[Token]:
tok_name = tokenize.tok_name[tok_type]

if tok_name == 'FSTRING_MIDDLE': # pragma: >=3.12 cover
ecol += tok_text.count('{') + tok_text.count('}')
tok_text = tok_text.replace('{', '{{').replace('}', '}}')
if '{' in tok_text or '}' in tok_text:
new_tok_text = curly_escape(tok_text)
ecol += len(new_tok_text) - len(tok_text)
tok_text = new_tok_text

tokens.append(Token(tok_name, tok_text, sline, end_offset))
last_line, last_col = eline, ecol
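
The offset bookkeeping changes along the same lines: instead of assuming every brace in an FSTRING_MIDDLE token was doubled in the original source, ecol is now advanced by exactly the number of characters curly_escape added, which is zero when the braces belong to a named escape. A small illustration of the difference (assumes a tokenize_rt that includes this commit; the token text is the one from the new test):

    from tokenize_rt import curly_escape

    tok_text = ' \\N{SNOWMAN}'             # FSTRING_MIDDLE text as reported by the tokenizer
    new_tok_text = curly_escape(tok_text)  # unchanged: both braces belong to the named escape
    assert new_tok_text == tok_text

    # old adjustment: tok_text.count('{') + tok_text.count('}')  -> 2, overshooting the source span
    # new adjustment: len(new_tok_text) - len(tok_text)          -> 0, matching the source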
