diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ea7083fc88b8c0..b5003943ab0d97 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -96,6 +96,18 @@ def quote_string(value): return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' +# Match a RFC 2047 word, looks like =?utf-8?q?someword?= +rfc2047_matcher = re.compile(r''' + =\? # literal =? + [^?]* # charset + \? # literal ? + [qQbB] # literal 'q' or 'b', case insensitive + \? # literal ? + .*? # encoded word + \?= # literal ?= +''', re.VERBOSE | re.MULTILINE) + + # # TokenList and its subclasses # @@ -1054,6 +1066,10 @@ def get_encoded_word(value): _validate_xtext(vtext) ew.append(vtext) text = ''.join(remainder) + # Encoded words should be followed by a WS + if value and value[0] not in WSP: + ew.defects.append(errors.InvalidHeaderDefect( + "missing trailing whitespace after encoded-word")) return ew, value def get_unstructured(value): @@ -1106,6 +1122,11 @@ def get_unstructured(value): unstructured.append(token) continue tok, *remainder = _wsp_splitter(value, 1) + # Split in the middle of an atom if there is a rfc2047 encoded word + # which does not have WSP on both sides. The defect will be registered + # the next time through the loop. + if rfc2047_matcher.search(tok): + tok, *remainder = value.partition('=?') vtext = ValueTerminal(tok, 'vtext') _validate_xtext(vtext) unstructured.append(vtext) diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 198cf17bbf3ed8..bad4333dbc4351 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self): '=?us-ascii?q?first?==?utf-8?q?second?=', 'first', 'first', - [], + [errors.InvalidHeaderDefect], '=?utf-8?q?second?=') def test_get_encoded_word_sets_extra_attributes(self): @@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self): '=?utf-8?q?foo?==?utf-8?q?bar?=', 'foobar', 'foobar', + [errors.InvalidHeaderDefect, + errors.InvalidHeaderDefect], + '') + + def test_get_unstructured_ew_without_leading_whitespace(self): + self._test_get_x( + self._get_unst, + 'nowhitespace=?utf-8?q?somevalue?=', + 'nowhitespacesomevalue', + 'nowhitespacesomevalue', + [errors.InvalidHeaderDefect], + '') + + def test_get_unstructured_ew_without_trailing_whitespace(self): + self._test_get_x( + self._get_unst, + '=?utf-8?q?somevalue?=nowhitespace', + 'somevaluenowhitespace', + 'somevaluenowhitespace', [errors.InvalidHeaderDefect], '') @@ -550,7 +569,8 @@ def test_encoded_word_inside_quotes(self): '"=?utf-8?Q?not_really_valid?="', '"not really valid"', 'not really valid', - [errors.InvalidHeaderDefect], + [errors.InvalidHeaderDefect, + errors.InvalidHeaderDefect], '') # get_comment diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index a6b48385aeac2a..8d89c5dd58322e 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase): 'rfc2047_atom_in_quoted_string_is_decoded': ('"=?utf-8?q?=C3=89ric?=" ', - [errors.InvalidHeaderDefect], + [errors.InvalidHeaderDefect, + errors.InvalidHeaderDefect], 'Éric ', 'Éric', 'foo@example.com', diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst new file mode 100644 index 00000000000000..dd0dd7f72c0a3f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst @@ -0,0 +1,4 @@ +Email headers containing RFC2047 encoded words are parsed despite the missing +whitespace, and a defect registered. Also missing trailing whitespace after +encoded words is now registered as a defect. +