From 10e5ccff940e09a8502d350950d5e09e4f2def26 Mon Sep 17 00:00:00 2001
From: Thomas Kowalski
Date: Thu, 30 Apr 2026 10:53:52 +0000
Subject: [PATCH 1/4] fix: avoid MemoryError in tokenize

---
 Lib/test/test_tokenize.py | 10 ++++++++++
 Parser/lexer/lexer.c      | 23 ++++++++++++++++-------
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ab53a20cff5539..8c91e271f4d5e8 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3188,6 +3188,16 @@ def get_tokens(string):
             with self.subTest(case=case):
                 self.assertRaises(tokenize.TokenError, get_tokens, case)
 
+    def test_tstring_multiline_bang_underflow(self):
+        # gh-149183: t-string with '!' across two lines used to raise
+        # MemoryError because last_expr_end > last_expr_size produced a
+        # negative length that was cast to a huge size_t.
+        self.assertRaises(
+            tokenize.TokenError,
+            list,
+            tokenize.tokenize(BytesIO(b't"{!\n!x').readline),
+        )
+
     @support.skip_wasi_stack_overflow()
     def test_max_indent(self):
         MAXINDENT = 100
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 7f25afec302c22..748df54338f6a3 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -121,12 +121,22 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     }
     PyObject *res = NULL;
 
+    Py_ssize_t expr_len = tok_mode->last_expr_size - tok_mode->last_expr_end;
+    if (expr_len < 0) {
+        /* last_expr_end > last_expr_size: happens when '{' and the closing
+           delimiter span different source lines, causing the strlen-based
+           size tracking to underflow. Treat as a tokenizer error rather
+           than passing a negative length (cast to huge size_t) to malloc or
+           PyUnicode_DecodeUTF8. */
+        return -1;
+    }
+
     // Look for a # character outside of string literals
     int hash_detected = 0;
     int in_string = 0;
     char quote_char = 0;
 
-    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
+    for (Py_ssize_t i = 0; i < expr_len; i++) {
         char ch = tok_mode->last_expr_buffer[i];
 
         // Skip escaped characters
@@ -163,7 +173,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     // If we found a # character in the expression, we need to handle comments
     if (hash_detected) {
         // Allocate buffer for processed result
-        char *result = (char *)PyMem_Malloc((tok_mode->last_expr_size - tok_mode->last_expr_end + 1) * sizeof(char));
+        char *result = (char *)PyMem_Malloc((expr_len + 1) * sizeof(char));
         if (!result) {
             return -1;
         }
@@ -174,7 +184,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
         quote_char = 0;             // Current string quote char
 
         // Process each character
-        while (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+        while (i < expr_len) {
             char ch = tok_mode->last_expr_buffer[i];
 
             // Handle string quotes
@@ -190,11 +200,10 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
             }
             // Skip comments
            else if (ch == '#' && !in_string) {
-                while (i < tok_mode->last_expr_size - tok_mode->last_expr_end &&
-                       tok_mode->last_expr_buffer[i] != '\n') {
+                while (i < expr_len && tok_mode->last_expr_buffer[i] != '\n') {
                     i++;
                 }
-                if (i < tok_mode->last_expr_size - tok_mode->last_expr_end) {
+                if (i < expr_len) {
                     result[j++] = '\n';
                 }
             }
@@ -211,7 +220,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     } else {
         res = PyUnicode_DecodeUTF8(
             tok_mode->last_expr_buffer,
-            tok_mode->last_expr_size - tok_mode->last_expr_end,
+            expr_len,
             NULL
         );
     }
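For reference, the regression test added above corresponds to the following standalone reproduction. It is a sketch based solely on the test case: the b't"{!\n!x' input splits a t-string expression across two source lines, which triggered the MemoryError on unpatched builds and raises tokenize.TokenError once this fix is applied.

# Reproduction of gh-149183 (mirrors the new test case).
import tokenize
from io import BytesIO

source = b't"{!\n!x'  # the '{' and the '!' delimiter span two source lines

try:
    list(tokenize.tokenize(BytesIO(source).readline))
except tokenize.TokenError as exc:
    # Expected with the fix; unpatched builds raised MemoryError instead.
    print("TokenError:", exc)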
From 1ca203e9eecfb1f2c4ba93aaebd221a999ac4fff Mon Sep 17 00:00:00 2001
From: Thomas Kowalski
Date: Mon, 11 May 2026 10:47:43 +0200
Subject: [PATCH 2/4] misc: add news entry

---
 .../2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst
new file mode 100644
index 00000000000000..14bb6ca4080bc3
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-05-11-10-46-00.gh-issue-149183.aB3xQr.rst
@@ -0,0 +1,2 @@
+Fix :exc:`MemoryError` in the t-string tokenizer when the opening ``{``
+and closing delimiter span different source lines.

From 876683f4f79102885e7bd8e976baf17d512681c9 Mon Sep 17 00:00:00 2001
From: Thomas Kowalski
Date: Mon, 11 May 2026 10:47:48 +0200
Subject: [PATCH 3/4] review: move check before declaration

---
 Parser/lexer/lexer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
index 748df54338f6a3..bf987cb85cf44e 100644
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -119,7 +119,6 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
     if (!(tok_mode->in_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
         return 0;
     }
-    PyObject *res = NULL;
 
     Py_ssize_t expr_len = tok_mode->last_expr_size - tok_mode->last_expr_end;
     if (expr_len < 0) {
@@ -131,6 +130,8 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
         return -1;
     }
 
+    PyObject *res = NULL;
+
     // Look for a # character outside of string literals
     int hash_detected = 0;
     int in_string = 0;

From 9203a8ec158cdf6cf1c51e2cfbee9aecede11ec9 Mon Sep 17 00:00:00 2001
From: Thomas Kowalski
Date: Mon, 11 May 2026 13:33:20 +0200
Subject: [PATCH 4/4] review: update Lib/test/test_tokenize.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
---
 Lib/test/test_tokenize.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 8c91e271f4d5e8..4ed94b649fc0f2 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3192,11 +3192,8 @@ def test_tstring_multiline_bang_underflow(self):
         # gh-149183: t-string with '!' across two lines used to raise
         # MemoryError because last_expr_end > last_expr_size produced a
         # negative length that was cast to a huge size_t.
-        self.assertRaises(
-            tokenize.TokenError,
-            list,
-            tokenize.tokenize(BytesIO(b't"{!\n!x').readline),
-        )
+        readline = BytesIO(b't"{!\n!x').readline
+        self.assertRaises(tokenize.TokenError, list, tokenize.tokenize(readline))
 
     @support.skip_wasi_stack_overflow()
     def test_max_indent(self):
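A closing note on the failure mode the fix describes: PyMem_Malloc takes an unsigned size_t, so a negative Py_ssize_t length wraps around to an enormous allocation request, which CPython reports as MemoryError. A small illustration with hypothetical values follows (the real last_expr_size/last_expr_end depend on the input; ctypes is used here only to show the unsigned wraparound):

# Illustration only: how a negative length turns into a huge size_t.
import ctypes

last_expr_size = 3   # hypothetical strlen-based size of the buffered line
last_expr_end = 5    # hypothetical end offset, computed past that line
expr_len = last_expr_size - last_expr_end   # -2

print(ctypes.c_size_t(expr_len).value)  # 18446744073709551614 on 64-bit builds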