Skip to content
Open
75 changes: 38 additions & 37 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@
# Useful constants and functions
#

WSP = set(' \t')
_WSP = ' \t'
WSP = set(_WSP)
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
Expand Down Expand Up @@ -2835,6 +2836,7 @@ def _steal_trailing_WSP_if_exists(lines):
lines.pop()
return wsp


def _refold_parse_tree(parse_tree, *, policy):
"""Return string of contents of parse_tree folded according to RFC rules.

Expand All @@ -2843,11 +2845,9 @@ def _refold_parse_tree(parse_tree, *, policy):
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = [''] # Folded lines to be output
leading_whitespace = '' # When we have whitespace between two encoded
# words, we may need to encode the whitespace
# at the beginning of the second word.
last_ew = None # Points to the last encoded character if there's an ew on
# the line
last_word_is_ew = False
last_ew = None # if there is an encoded word in the last line of lines,
# points to the encoded word's first character
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False # This is set to True if we need to encode this part
Expand Down Expand Up @@ -2882,6 +2882,7 @@ def _refold_parse_tree(parse_tree, *, policy):
if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
last_word_is_ew = False
continue

if want_encoding and not wrap_as_ew_blocked:
Expand All @@ -2898,6 +2899,7 @@ def _refold_parse_tree(parse_tree, *, policy):
# XXX what if encoded_part has no leading FWS?
lines.append(newline)
lines[-1] += encoded_part
last_word_is_ew = False
continue
# Either this is not a major syntactic break, so we don't
# want it on a line by itself even if it fits, or it
Expand All @@ -2916,11 +2918,16 @@ def _refold_parse_tree(parse_tree, *, policy):
(last_charset == 'unknown-8bit' or
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset, leading_whitespace)
# This whitespace has been added to the lines in _fold_as_ew()
# so clear it now.
leading_whitespace = ''
last_ew = _fold_as_ew(
tstr,
lines,
maxlen,
last_ew,
part.ew_combine_allowed,
charset,
last_word_is_ew,
)
last_word_is_ew = True
last_charset = charset
want_encoding = False
continue
Expand All @@ -2933,28 +2940,19 @@ def _refold_parse_tree(parse_tree, *, policy):

if len(tstr) <= maxlen - len(lines[-1]):
lines[-1] += tstr
last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))
continue

# This part is too long to fit. The RFC wants us to break at
# "major syntactic breaks", so unless we don't consider this
# to be one, check if it will fit on the next line by itself.
leading_whitespace = ''
if (part.syntactic_break and
len(tstr) + 1 <= maxlen):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
# We're going to fold the data onto a new line here. Due to
# the way encoded strings handle continuation lines, we need to
# be prepared to encode any whitespace if the next line turns
# out to start with an encoded word.
lines.append(newline + tstr)

whitespace_accumulator = []
for char in lines[-1]:
if char not in WSP:
break
whitespace_accumulator.append(char)
leading_whitespace = ''.join(whitespace_accumulator)
last_word_is_ew = (last_word_is_ew
and not bool(lines[-1].strip(_WSP)))
last_ew = None
continue
if not hasattr(part, 'encode'):
Expand Down Expand Up @@ -2994,10 +2992,11 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# We can't fold it onto the next line either...
lines[-1] += tstr
last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP))

return policy.linesep.join(lines) + policy.linesep

def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Return the new value for last_ew, or None if ew_combine_allowed is False.

Expand All @@ -3012,14 +3011,24 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
to_encode = str(
get_unstructured(lines[-1][last_ew:] + to_encode))
lines[-1] = lines[-1][:last_ew]
elif to_encode[0] in WSP:
elif to_encode[0] in WSP and not last_word_is_ew:
# We're joining this to non-encoded text, so don't encode
# the leading blank.
leading_wsp = to_encode[0]
to_encode = to_encode[1:]
if (len(lines[-1]) == maxlen):
lines.append(_steal_trailing_WSP_if_exists(lines))
lines[-1] += leading_wsp
elif last_word_is_ew:
# If we are following up an encoded word with another encoded word,
# any white space between the two will be ignored when decoded.
# Therefore, we encode all to-be-displayed whitespace in the second
# encoded word.
len_without_wsp = len(lines[-1].rstrip(_WSP))
leading_whitespace = lines[-1][len_without_wsp:]
lines[-1] = (lines[-1][:len_without_wsp]
+ (' ' if leading_whitespace else ''))
to_encode = leading_whitespace + to_encode
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
to_encode = leading_whitespace + to_encode
len_without_wsp = len(lines[-1].rstrip(_WSP))
leading_whitespace = lines[-1][len_without_wsp:]
lines[-1] = lines[-1][:len_without_wsp] + (
' ' if leading_whitespace else '')
to_encode = leading_whitespace + to_encode

This avoids that complicated single use function. We should never be producing lines that contain only blanks, they get encoded.

You can also move this up to be the first elif and drop the 'and not last_word_is_ew'.

Copy link
Contributor Author

@robsdedude robsdedude Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should never be producing lines that contain only blanks, they get encoded.

While testing, I noticed, too, that wrapped spaces are always encoded. TBH, I didn't like that and wanted to come up with a solution that's robust to changing this behavior in the future. I didn't like it because reading the RFC and the rest of this code it sure seems like not encoding words is generally preferable because of its human readability (and to a lesser extent mail clients that don't support encoded words). Further, I thought assuming an invariant that's enforced a few hundred lines away in a different function is a nice recipe for spooky action at a distance.

If you anyway think the fix should assume this invariant, I'll change this part.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you have to keep in mind that the RFC requires that there be no line that consists of only whitespace. In theory the folder is only supposed to encode whitespace that it has to, but a completely blank line is an example of "has to". If it is being overly aggressive and encoding whitespace it doesn't need to that would be a bug. Though as the need for this PR demonstrates, getting it right isn't easy ;)

As for the spooky action at a distance...yes, this code has grown into a spaghetti ball. Believe it or, the original (pre bug fixing) version of this code was like the fourth iteration of ending up with an incomprehensible mess and starting over. I have plans to redo the whole thing from scratch again, and do something like have an accumulator object that would own things like the last_word_is_ew flag, but I'm working on rewriting the parser first, and that is taking a while. And I want to add a lot more tests before I attempt a rewrite, to make sure I don't break anything.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you have to keep in mind that the RFC requires that there be no line that consists of only whitespace.

Thanks, I overlooked that part in the RFC. In this case I agree that there's much less risk I will change the PR accordingly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better to move this elif up above the one it is currently after. It only saves 23 characters, but since we can, why not :) More importantly, the logic is IMO slightly easier to follow: first we check if we are appending to an encoded word, and then if not we handle the remaining special case that only matters when we aren't appending to an encoded word.


trailing_wsp = ''
if to_encode[-1] in WSP:
Expand All @@ -3040,20 +3049,13 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,

while to_encode:
remaining_space = maxlen - len(lines[-1])
text_space = remaining_space - chrome_len - len(leading_whitespace)
text_space = remaining_space - chrome_len
if text_space <= 0:
lines.append(' ')
newline = _steal_trailing_WSP_if_exists(lines)
lines.append(newline or ' ')
new_last_ew = len(lines[-1])
continue

# If we are at the start of a continuation line, prepend whitespace
# (we only want to do this when the line starts with an encoded word
# but if we're folding in this helper function, then we know that we
# are going to be writing out an encoded word.)
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
lines[-1] += encoded_word
leading_whitespace = ''

to_encode_word = to_encode[:text_space]
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
excess = len(encoded_word) - remaining_space
Expand All @@ -3065,7 +3067,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset,
excess = len(encoded_word) - remaining_space
lines[-1] += encoded_word
to_encode = to_encode[len(to_encode_word):]
leading_whitespace = ''

if to_encode:
lines.append(' ')
Expand Down
44 changes: 44 additions & 0 deletions Lib/test/test_email/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,50 @@ def test_defaults_handle_spaces_at_start_of_continuation_line(self):
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

# gh-144156: fold between non-encoded and encoded words don't need to encoded
# the separating space
def test_defaults_handle_spaces_at_start_of_continuation_line_2(self):
source = ("Re: [SOS-1495488] Commande et livraison - Demande de retour - "
"bibijolie - 251210-AABBCC - Abo actualités digitales 20 semaines "
"d’abonnement à 24 heures, Bilan, Tribune de Genève et tous les titres Tamedia")
expected = (
b"Subject: "
b"Re: [SOS-1495488] Commande et livraison - Demande de retour -\n"
b" bibijolie - 251210-AABBCC - Abo =?utf-8?q?actualit=C3=A9s?= digitales 20\n"
b" semaines =?utf-8?q?d=E2=80=99abonnement_=C3=A0?= 24 heures, Bilan, Tribune de\n"
b" =?utf-8?q?Gen=C3=A8ve?= et tous les titres Tamedia\n\n"
)
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_ew_folding_round_trip_1(self):
print()
source = "aaaaaaaaa фффффффф "
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s, maxheaderlen=30)
g.flatten(msg)
flat = s.getvalue()
reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
self.assertMultiLineEqual(reparsed, source)

def test_ew_folding_round_trip_2(self):
print()
source = "aaa aaaaaaa aaa ффф фффф "
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s, maxheaderlen=30)
g.flatten(msg)
flat = s.getvalue()
reparsed = message_from_bytes(flat, policy=policy.default)['Subject']
self.assertMultiLineEqual(reparsed, source)

def test_cte_type_7bit_handles_unknown_8bit(self):
source = ("Subject: Maintenant je vous présente mon "
"collègue\n\n").encode('utf-8')
Expand Down
4 changes: 2 additions & 2 deletions Lib/test/test_email/test_headerregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1711,7 +1711,7 @@ def test_fold_unstructured_with_overlong_word(self):
'singlewordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
'Subject: \n'
'Subject:\n'
' =?utf-8?q?thisisa?=\n'
' =?utf-8?q?verylon?=\n'
' =?utf-8?q?glineco?=\n'
Expand All @@ -1727,7 +1727,7 @@ def test_fold_unstructured_with_two_overlong_words(self):
'singlewordthatwontfit plusanotherverylongwordthatwontfit')
self.assertEqual(
h.fold(policy=policy.default.clone(max_line_length=20)),
'Subject: \n'
'Subject:\n'
' =?utf-8?q?thisisa?=\n'
' =?utf-8?q?verylon?=\n'
' =?utf-8?q?glineco?=\n'
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_email/test_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_non_ascii_chars_do_not_cause_inf_loop(self):
actual = policy.fold('Subject', 'ą' * 12)
self.assertEqual(
actual,
'Subject: \n' +
'Subject:\n' +
12 * ' =?utf-8?q?=C4=85?=\n')

def test_short_maxlen_error(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix the folding of headers by the :mod:`email` library when :rfc:`2047` encoded words are used. Now whitespace is correctly preserved and also correctly added between adjacent encoded words. The latter property was broken by the fix for gh-92081, which mostly fixed previous failures to preserve whitespace.
Loading