From ef61984766d76f51fa65763d764b0b66f31856ca Mon Sep 17 00:00:00 2001
From: Mike Edmunds <medmunds@gmail.com>
Date: Sat, 1 Mar 2025 14:06:50 -0800
Subject: [PATCH 1/3] pythongh-128110: Fix rfc2047 handling in email parser
 address headers

RFC 2047 Section 6.2 requires that "any 'linear-white-space' that
separates a pair of adjacent 'encoded-word's is ignored." The modern
header value parser correctly implements that for unstructured headers,
but had missed a case in structured headers. This could cause a parsed
address header to include extraneous spaces in a display-name.

Fixed in get_atom() by converting a trailing CFWSList token after an
encoded-word to an EWWhiteSpaceTerminal if another encoded-word follows.

Deliberately left similar code in get_dotatom() unmodified. A dotatom
can only appear within an addr-spec. RFC 2047 Section 5 prohibits
use of an encoded-word in any portion of an addr-spec, so its appearance
in a dotatom is invalid. Adding (and testing) special white-space
handling in an invalid dotatom seems an unnecessary complication.
---
 Lib/email/_header_value_parser.py             | 12 +++
 .../test_email/test__header_value_parser.py   | 89 +++++++++++++++++++
 ...-03-01-13-36-02.gh-issue-128110.9wx_G0.rst |  5 ++
 3 files changed, 106 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 3d845c09d415f6..a92f4ab0327458 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1340,6 +1340,18 @@ def get_atom(value):
     atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
+        # Peek ahead to ignore linear-white-space between adjacent encoded-words.
+        if (
+            atom[-1].token_type == 'encoded-word'
+            and value.startswith('=?')
+            and all(ws.token_type == 'fws' for ws in token)  # not comments
+        ):
+            try:
+                get_encoded_word(value)
+            except errors.HeaderParseError:
+                pass
+            else:
+                token = EWWhiteSpaceTerminal(token, 'fws')
         atom.append(token)
     return atom, value
 
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index d60a7039f9d4c6..af1c2e975142e2 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -1039,6 +1039,79 @@ def get_phrase_cfws_only_raises(self):
         with self.assertRaises(errors.HeaderParseError):
             parser.get_phrase(' (foo) ')
 
+    def test_get_phrase_adjacent_ew(self):
+        # In structured headers, the requirement to ignore linear-white-space
+        # between adjacent encoded-words is actually implemented by get_atom.
+        # But it's easier to see the results by testing get_phrase.
+        self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '')
+
+    def test_get_phrase_adjacent_ew_different_encodings(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], ''
+        )
+
+    def test_get_phrase_adjacent_ew_encoded_spaces(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=',
+            'Encoded spaces preserved',
+            'Encoded spaces preserved',
+            [],
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=',
+            'Comment (is not) linear-white-space',
+            'Comment linear-white-space',
+            [],
+            '',
+            comments=['is not'],
+        )
+
+    def test_get_phrase_adjacent_ew_no_error_on_defects(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Def?= =?ascii?q?ect still joins?=',
+            'Defect still joins',
+            'Defect still joins',
+            [errors.InvalidHeaderDefect],  # whitespace inside encoded word
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_ignore_non_ew(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?No?= =?join?= for non-ew',
+            'No =?join?= for non-ew',
+            'No =?join?= for non-ew',
+            [],
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_ignore_invalid_ew(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew',
+            'No =?ascii?rot13?wbva= for invalid ew',
+            'No =?ascii?rot13?wbva= for invalid ew',
+            [],
+            ''
+        )
+
+    def test_get_phrase_adjacent_ew_missing_space(self):
+        self._test_get_x(
+            parser.get_phrase,
+            '=?ascii?q?Joi?==?ascii?q?ned?=',
+            'Joined',
+            'Joined',
+            [errors.InvalidHeaderDefect],  # missing trailing whitespace
+            ''
+        )
+
     # get_local_part
 
     def test_get_local_part_simple(self):
@@ -2365,6 +2438,22 @@ def test_get_address_rfc2047_display_name(self):
         self.assertEqual(address[0].token_type,
                          'mailbox')
 
+    def test_get_address_rfc2047_display_name_adjacent_ews(self):
+        address = self._test_get_x(parser.get_address,
+            '=?utf-8?q?B=C3=A9r?= =?utf-8?q?=C3=A9nice?= <foo@example.com>',
+            'Bérénice <foo@example.com>',
+            'Bérénice <foo@example.com>',
+            [],
+            '')
+        self.assertEqual(address.token_type, 'address')
+        self.assertEqual(len(address.mailboxes), 1)
+        self.assertEqual(address.mailboxes,
+                         address.all_mailboxes)
+        self.assertEqual(address.mailboxes[0].display_name,
+                         'Bérénice')
+        self.assertEqual(address[0].token_type,
+                         'mailbox')
+
     def test_get_address_empty_group(self):
         address = self._test_get_x(parser.get_address,
             'Monty Python:;',
diff --git a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst
new file mode 100644
index 00000000000000..6177db3f6cf96d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst
@@ -0,0 +1,5 @@
+Fix bug in the parsing of email address headers that could result in
+extraneous spaces in the decoded text when using a modern email policy.
+Space between pairs of adjacent rfc2047 encoded-words is now ignored, per
+rfc2047 section 6.2 (and consistent with existing parsing of unstructured
+headers like *Subject*).

From 5a92b2ae1cb4ffbf218b9c0e3e2c40ef6a86b8ce Mon Sep 17 00:00:00 2001
From: Mike Edmunds <medmunds@gmail.com>
Date: Mon, 11 May 2026 13:13:22 -0700
Subject: [PATCH 2/3] Move adjacent ew detection to get_phrase

Switch to @bitdancer's fix from review feedback. Recharacterize space
between ews as fws after parsing in get_phrase (rather than peeking
ahead after first ew in get_atom).

Co-authored-by: R David Murray <rdmurray@bitdance.com>
---
 Lib/email/_header_value_parser.py             | 22 +++++++++----------
 .../test_email/test__header_value_parser.py   |  5 ++---
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 686d1c990b5b6c..792072ab9f6128 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -1352,18 +1352,6 @@ def get_atom(value):
     atom.append(token)
     if value and value[0] in CFWS_LEADER:
         token, value = get_cfws(value)
-        # Peek ahead to ignore linear-white-space between adjacent encoded-words.
-        if (
-            atom[-1].token_type == 'encoded-word'
-            and value.startswith('=?')
-            and all(ws.token_type == 'fws' for ws in token)  # not comments
-        ):
-            try:
-                get_encoded_word(value)
-            except errors.HeaderParseError:
-                pass
-            else:
-                token = EWWhiteSpaceTerminal(token, 'fws')
         atom.append(token)
     return atom, value
 
@@ -1473,6 +1461,16 @@ def get_phrase(value):
         else:
             try:
                 token, value = get_word(value)
+                if (token[0].token_type == 'encoded-word'
+                    and phrase
+                    and phrase[-1].token_type == 'atom'
+                    and len(phrase[-1]) > 1
+                    and phrase[-1][-2].token_type == 'encoded-word'
+                    and phrase[-1][-1].token_type == 'cfws'
+                    and not phrase[-1][-1].comments
+                ):
+                    # rfc2047 6.2: pure ws between adjacent ews is ignored.
+                    phrase[-1][-1] = EWWhiteSpaceTerminal(phrase[-1][-1], 'fws')
             except errors.HeaderParseError:
                 if value[0] in CFWS_LEADER:
                     token, value = get_cfws(value)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 9de1294e6a0b4d..9d9fe418ee4d06 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -1061,9 +1061,8 @@ def get_phrase_cfws_only_raises(self):
             parser.get_phrase(' (foo) ')
 
     def test_get_phrase_adjacent_ew(self):
-        # In structured headers, the requirement to ignore linear-white-space
-        # between adjacent encoded-words is actually implemented by get_atom.
-        # But it's easier to see the results by testing get_phrase.
+        # "'linear-white-space' that separates a pair of adjacent
+        # 'encoded-word's is ignored" (rfc2047 section 6.2)
         self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '')
 
     def test_get_phrase_adjacent_ew_different_encodings(self):

From 7c062e4c8bbeabe7e32e832cb0950ad891b0b10d Mon Sep 17 00:00:00 2001
From: "R. David Murray" <rdmurray@bitdance.com>
Date: Mon, 11 May 2026 17:35:38 -0400
Subject: [PATCH 3/3] Update 2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst

---
 .../Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst
index 6177db3f6cf96d..b08b1886cff9cf 100644
--- a/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst
+++ b/Misc/NEWS.d/next/Library/2025-03-01-13-36-02.gh-issue-128110.9wx_G0.rst
@@ -1,5 +1,5 @@
-Fix bug in the parsing of email address headers that could result in
+Fix bug in the parsing of :mod:`email` address headers that could result in
 extraneous spaces in the decoded text when using a modern email policy.
-Space between pairs of adjacent rfc2047 encoded-words is now ignored, per
-rfc2047 section 6.2 (and consistent with existing parsing of unstructured
+Space between pairs of adjacent :rfc:`2047` encoded-words is now ignored, per
+section 6.2 (and consistent with existing parsing of unstructured
 headers like *Subject*).