From ef61984766d76f51fa65763d764b0b66f31856ca Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Sat, 1 Mar 2025 14:06:50 -0800 Subject: [PATCH 1/3] pythongh-128110: Fix rfc2047 handling in email parser address headers RFC 2047 Section 6.2 requires that "any 'linear-white-space' that separates a pair of adjacent 'encoded-word's is ignored." The modern header value parser correctly implements that for unstructured headers, but had missed a case in structured headers. This could cause a parsed address header to include extraneous spaces in a display-name. Fixed in get_atom() by converting a trailing CFWSList token after an encoded-word to an EWWhiteSpaceTerminal if another encoded-word follows. Deliberately left similar code in get_dotatom() unmodified. A dotatom can only appear within an addr-spec. RFC 2047 Section 5 prohibits use of an encoded-word in any portion of an addr-spec, so its appearance in a dotatom is invalid. Adding (and testing) special white-space handling in an invalid dotatom seems an unnecessary complication. --- Lib/email/_header_value_parser.py | 12 +++ .../test_email/test__header_value_parser.py | 89 +++++++++++++++++++ ...-03-01-13-36-02.gh-issue-128110.9wx_G0.rst | 5 ++ 3 files changed, 106 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 3d845c09d415f6..a92f4ab0327458 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1340,6 +1340,18 @@ def get_atom(value): atom.append(token) if value and value[0] in CFWS_LEADER: token, value = get_cfws(value) + # Peek ahead to ignore linear-white-space between adjacent encoded-words. 
+ if ( + atom[-1].token_type == 'encoded-word' + and value.startswith('=?') + and all(ws.token_type == 'fws' for ws in token) # not comments + ): + try: + get_encoded_word(value) + except errors.HeaderParseError: + pass + else: + token = EWWhiteSpaceTerminal(token, 'fws') atom.append(token) return atom, value diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index d60a7039f9d4c6..af1c2e975142e2 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1039,6 +1039,79 @@ def get_phrase_cfws_only_raises(self): with self.assertRaises(errors.HeaderParseError): parser.get_phrase(' (foo) ') + def test_get_phrase_adjacent_ew(self): + # In structured headers, the requirement to ignore linear-white-space + # between adjacent encoded-words is actually implemented by get_atom. + # But it's easier to see the results by testing get_phrase. + self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '') + + def test_get_phrase_adjacent_ew_different_encodings(self): + self._test_get_x( + parser.get_phrase, + '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], '' + ) + + def test_get_phrase_adjacent_ew_encoded_spaces(self): + self._test_get_x( + parser.get_phrase, + '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=', + 'Encoded spaces preserved', + 'Encoded spaces preserved', + [], + '' + ) + + def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self): + self._test_get_x( + parser.get_phrase, + '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=', + 'Comment (is not) linear-white-space', + 'Comment linear-white-space', + [], + '', + comments=['is not'], + ) + + def test_get_phrase_adjacent_ew_no_error_on_defects(self): + self._test_get_x( + parser.get_phrase, + '=?ascii?q?Def?= =?ascii?q?ect still joins?=', + 'Defect still joins', + 'Defect still joins', + 
+ [errors.InvalidHeaderDefect], # whitespace inside encoded word + '' + ) + + def test_get_phrase_adjacent_ew_ignore_non_ew(self): + self._test_get_x( + parser.get_phrase, + '=?ascii?q?No?= =?join?= for non-ew', + 'No =?join?= for non-ew', + 'No =?join?= for non-ew', + [], + '' + ) + + def test_get_phrase_adjacent_ew_ignore_invalid_ew(self): + self._test_get_x( + parser.get_phrase, + '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew', + 'No =?ascii?rot13?wbva= for invalid ew', + 'No =?ascii?rot13?wbva= for invalid ew', + [], + '' + ) + + def test_get_phrase_adjacent_ew_missing_space(self): + self._test_get_x( + parser.get_phrase, + '=?ascii?q?Joi?==?ascii?q?ned?=', + 'Joined', + 'Joined', + [errors.InvalidHeaderDefect], # missing trailing whitespace + '' + ) + # get_local_part def test_get_local_part_simple(self): @@ -2365,6 +2438,22 @@ def test_get_address_rfc2047_display_name(self): self.assertEqual(address[0].token_type, 'mailbox') + def test_get_address_rfc2047_display_name_adjacent_ews(self): + address = self._test_get_x(parser.get_address, + '=?utf-8?q?B=C3=A9r?= =?utf-8?q?=C3=A9nice?= <foo@example.com>', + 'Bérénice <foo@example.com>', + 'Bérénice <foo@example.com>', + [], + '') + self.assertEqual(address.token_type, 'address') + self.assertEqual(len(address.mailboxes), 1) + self.assertEqual(address.mailboxes, + address.all_mailboxes) + self.assertEqual(address.mailboxes[0].display_name, + 'Bérénice') + self.assertEqual(address[0].token_type, + 'mailbox') + def test_get_address_empty_group(self): address = self._test_get_x(parser.get_address, 'Monty Python:;', diff --git a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst new file mode 100644 index 00000000000000..6177db3f6cf96d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst @@ -0,0 +1,5 @@ +Fix bug in the parsing of email address headers that could result in +extraneous spaces in the decoded text when using a modern
email policy. +Space between pairs of adjacent rfc2047 encoded-words is now ignored, per +rfc2047 section 6.2 (and consistent with existing parsing of unstructured +headers like *Subject*). From 5a92b2ae1cb4ffbf218b9c0e3e2c40ef6a86b8ce Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Mon, 11 May 2026 13:13:22 -0700 Subject: [PATCH 2/3] Move adjacent ew detection to get_phrase Switch to @bitdancer's fix from review feedback. Recharacterize space between ews as fws after parsing in get_phrase (rather than peeking ahead after first ew in get_word). Co-authored-by: R David Murray --- Lib/email/_header_value_parser.py | 22 +++++++++---------- .../test_email/test__header_value_parser.py | 5 ++--- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 686d1c990b5b6c..792072ab9f6128 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -1352,18 +1352,6 @@ def get_atom(value): atom.append(token) if value and value[0] in CFWS_LEADER: token, value = get_cfws(value) - # Peek ahead to ignore linear-white-space between adjacent encoded-words. - if ( - atom[-1].token_type == 'encoded-word' - and value.startswith('=?') - and all(ws.token_type == 'fws' for ws in token) # not comments - ): - try: - get_encoded_word(value) - except errors.HeaderParseError: - pass - else: - token = EWWhiteSpaceTerminal(token, 'fws') atom.append(token) return atom, value @@ -1473,6 +1461,16 @@ def get_phrase(value): else: try: token, value = get_word(value) + if (token[0].token_type == 'encoded-word' + and phrase + and phrase[-1].token_type == 'atom' + and len(phrase[-1]) > 1 + and phrase[-1][-2].token_type == 'encoded-word' + and phrase[-1][-1].token_type == 'cfws' + and not phrase[-1][-1].comments + ): + # linear ws between ews needs special handling...
+ phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1][-1], 'fws') except errors.HeaderParseError: if value[0] in CFWS_LEADER: token, value = get_cfws(value) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 9de1294e6a0b4d..9d9fe418ee4d06 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -1061,9 +1061,8 @@ def get_phrase_cfws_only_raises(self): parser.get_phrase(' (foo) ') def test_get_phrase_adjacent_ew(self): - # In structured headers, the requirement to ignore linear-white-space - # between adjacent encoded-words is actually implemented by get_atom. - # But it's easier to see the results by testing get_phrase. + # "'linear-white-space' that separates a pair of adjacent + # 'encoded-word's is ignored" (rfc2047 section 6.2) self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '') def test_get_phrase_adjacent_ew_different_encodings(self): From 7c062e4c8bbeabe7e32e832cb0950ad891b0b10d Mon Sep 17 00:00:00 2001 From: "R. David Murray" Date: Mon, 11 May 2026 17:35:38 -0400 Subject: [PATCH 3/3] Update 2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst --- .../Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst index 6177db3f6cf96d..b08b1886cff9cf 100644 --- a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst +++ b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst @@ -1,5 +1,5 @@ -Fix bug in the parsing of email address headers that could result in +Fix bug in the parsing of :mod:`email` address headers that could result in extraneous spaces in the decoded text when using a modern email policy.
-Space between pairs of adjacent rfc2047 encoded-words is now ignored, per -rfc2047 section 6.2 (and consistent with existing parsing of unstructured +Space between pairs of adjacent :rfc:`2047` encoded-words is now ignored, per +section 6.2 (and consistent with existing parsing of unstructured headers like *Subject*).