Skip to content

Commit 05cb728

Browse files
bpo-30349: Raise FutureWarning for nested sets and set operations (#1553)
in regular expressions.
1 parent 3daaafb commit 05cb728

File tree

8 files changed

+106
-9
lines changed

8 files changed

+106
-9
lines changed

Doc/library/re.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,20 @@ The special characters are:
200200
place it at the beginning of the set. For example, both ``[()[\]{}]`` and
201201
``[]()[{}]`` will match a parenthesis.
202202

203+
* Support of nested sets and set operations as in `Unicode Technical
204+
Standard #18`_ might be added in the future. This would change the
205+
syntax, so to facilitate this change a :exc:`FutureWarning` will be raised
206+
in ambiguous cases for the time being.
207+
That includes sets starting with a literal ``'['`` or containing literal
208+
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
209+
avoid a warning, escape them with a backslash.
210+
211+
.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
212+
213+
.. versionchanged:: 3.7
214+
:exc:`FutureWarning` is raised if a character set contains constructs
215+
that will change semantically in the future.
216+
203217
``|``
204218
``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that
205219
will match either *A* or *B*. An arbitrary number of REs can be separated by the
@@ -829,7 +843,7 @@ form.
829843

830844
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
831845
>>> print('[%s]+' % re.escape(legal_chars))
832-
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
846+
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%\&'\*\+\-\.\^_`\|\~:]+
833847

834848
>>> operators = ['+', '-', '*', '/', '**']
835849
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))

Doc/tools/susp-ignored.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
300300
whatsnew/3.2,,:location,zope9-location = ${zope9:location}
301301
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
302302
library/re,,`,!#$%&'*+-.^_`|~:
303-
library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
303+
library/re,,`,!\#\$%\&'\*\+\-\.\^_`\|\~:
304304
library/tarfile,,:xz,'x:xz'
305305
library/xml.etree.elementtree,,:sometag,prefix:sometag
306306
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""

Doc/whatsnew/3.7.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -700,6 +700,17 @@ Changes in the Python API
700700
argument ``os.scandir`` instead of ``os.listdir`` when listing the directory
701701
fails.
702702

703+
* Support of nested sets and set operations in regular expressions as in
704+
`Unicode Technical Standard #18`_ might be added in the future. This would
705+
change the syntax, so to facilitate this change a :exc:`FutureWarning` will
706+
be raised in ambiguous cases for the time being.
707+
That includes sets starting with a literal ``'['`` or containing literal
708+
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
709+
avoid a warning, escape them with a backslash.
710+
(Contributed by Serhiy Storchaka in :issue:`30349`.)
711+
712+
.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
713+
703714

704715
Changes in the C API
705716
--------------------

Lib/email/_header_value_parser.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,15 +1354,14 @@ def __str__(self):
13541354

13551355
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
13561356
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
1357-
''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
1357+
re.escape(''.join(ATOM_ENDS)))).match
13581358
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
13591359
_non_token_end_matcher = re.compile(r"[^{}]+".format(
1360-
''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
1360+
re.escape(''.join(TOKEN_ENDS)))).match
13611361
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
1362-
''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
1362+
re.escape(''.join(ATTRIBUTE_ENDS)))).match
13631363
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
1364-
''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
1365-
'\\','\\\\').replace(']',r'\]'))).match
1364+
re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
13661365

13671366
def _validate_xtext(xtext):
13681367
"""If input token contains ASCII non-printables, register a defect."""

Lib/re.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,9 @@ def template(pattern, flags=0):
251251
# SPECIAL_CHARS
252252
# closing ')', '}' and ']'
253253
# '-' (a range in character set)
254+
# '&', '~' (extended character set operations)
254255
# '#' (comment) and WHITESPACE (ignored) in verbose mode
255-
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
256+
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f'}
256257

257258
def escape(pattern):
258259
"""

Lib/sre_parse.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,12 @@ def _parse(source, state, verbose, nested, first=False):
517517
setappend = set.append
518518
## if sourcematch(":"):
519519
## pass # handle character classes
520+
if source.next == '[':
521+
import warnings
522+
warnings.warn(
523+
'Possible nested set at position %d' % source.tell(),
524+
FutureWarning, stacklevel=nested + 6
525+
)
520526
negate = sourcematch("^")
521527
# check remaining characters
522528
while True:
@@ -529,6 +535,17 @@ def _parse(source, state, verbose, nested, first=False):
529535
elif this[0] == "\\":
530536
code1 = _class_escape(source, this)
531537
else:
538+
if set and this in '-&~|' and source.next == this:
539+
import warnings
540+
warnings.warn(
541+
'Possible set %s at position %d' % (
542+
'difference' if this == '-' else
543+
'intersection' if this == '&' else
544+
'symmetric difference' if this == '~' else
545+
'union',
546+
source.tell() - 1),
547+
FutureWarning, stacklevel=nested + 6
548+
)
532549
code1 = LITERAL, _ord(this)
533550
if sourcematch("-"):
534551
# potential range
@@ -545,6 +562,13 @@ def _parse(source, state, verbose, nested, first=False):
545562
if that[0] == "\\":
546563
code2 = _class_escape(source, that)
547564
else:
565+
if that == '-':
566+
import warnings
567+
warnings.warn(
568+
'Possible set difference at position %d' % (
569+
source.tell() - 2),
570+
FutureWarning, stacklevel=nested + 6
571+
)
548572
code2 = LITERAL, _ord(that)
549573
if code1[0] != LITERAL or code2[0] != LITERAL:
550574
msg = "bad character range %s-%s" % (this, that)

Lib/test/test_re.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,51 @@ def test_not_literal(self):
914914
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
915915
self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
916916

917+
def test_possible_set_operations(self):
918+
s = bytes(range(128)).decode()
919+
with self.assertWarns(FutureWarning):
920+
p = re.compile(r'[0-9--1]')
921+
self.assertEqual(p.findall(s), list('-./0123456789'))
922+
self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
923+
with self.assertWarns(FutureWarning):
924+
p = re.compile(r'[%--1]')
925+
self.assertEqual(p.findall(s), list("%&'()*+,-1"))
926+
with self.assertWarns(FutureWarning):
927+
p = re.compile(r'[%--]')
928+
self.assertEqual(p.findall(s), list("%&'()*+,-"))
929+
930+
with self.assertWarns(FutureWarning):
931+
p = re.compile(r'[0-9&&1]')
932+
self.assertEqual(p.findall(s), list('&0123456789'))
933+
with self.assertWarns(FutureWarning):
934+
p = re.compile(r'[\d&&1]')
935+
self.assertEqual(p.findall(s), list('&0123456789'))
936+
self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
937+
938+
with self.assertWarns(FutureWarning):
939+
p = re.compile(r'[0-9||a]')
940+
self.assertEqual(p.findall(s), list('0123456789a|'))
941+
with self.assertWarns(FutureWarning):
942+
p = re.compile(r'[\d||a]')
943+
self.assertEqual(p.findall(s), list('0123456789a|'))
944+
self.assertEqual(re.findall(r'[||1]', s), list('1|'))
945+
946+
with self.assertWarns(FutureWarning):
947+
p = re.compile(r'[0-9~~1]')
948+
self.assertEqual(p.findall(s), list('0123456789~'))
949+
with self.assertWarns(FutureWarning):
950+
p = re.compile(r'[\d~~1]')
951+
self.assertEqual(p.findall(s), list('0123456789~'))
952+
self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
953+
954+
with self.assertWarns(FutureWarning):
955+
p = re.compile(r'[[0-9]|]')
956+
self.assertEqual(p.findall(s), list('0123456789[]'))
957+
958+
with self.assertWarns(FutureWarning):
959+
p = re.compile(r'[[:digit:]|]')
960+
self.assertEqual(p.findall(s), list(':[]dgit'))
961+
917962
def test_search_coverage(self):
918963
self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
919964
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
@@ -932,7 +977,7 @@ def assertMatch(self, pattern, text, match=None, span=None,
932977
self.assertEqual(m.group(), match)
933978
self.assertEqual(m.span(), span)
934979

935-
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
980+
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
936981

937982
def test_re_escape(self):
938983
p = ''.join(chr(i) for i in range(256))
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
FutureWarning is now emitted if a regular expression contains character set
2+
constructs that will change semantically in the future (nested sets and set
3+
operations).

0 commit comments

Comments
 (0)