python · AA-Turner · Mar 31, 2025 · Apr 1, 2025
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
@@ -8,9 +8,11 @@
 # $Id$
 #
 
+import re
 import unittest
 
 from textwrap import TextWrapper, wrap, fill, dedent, indent, shorten
+from textwrap import _cached_regex as cached_regex
 
 
 class BaseTestCase(unittest.TestCase):
@@ -712,6 +714,44 @@ def test_do_not_break_long_words_or_on_hyphens(self):
                     'ng_option_', 'indeed-', 'good-bye"']
         self.check_wrap(self.text2, 10, expected)
 
+
+# Tests for the lazily compiled regex attributes: the three pattern
+# attributes of TextWrapper, and the _cached_regex descriptor itself
+# (imported in this module as cached_regex).
+class TextWrapperCachedRegexTestCase(BaseTestCase):
+    def test_attr_access(self):
+        # Each regex attribute must read as a compiled pattern from both
+        # an instance and the class, and must stay overridable on a single
+        # instance without changing what the class exposes.
+        wrapper = TextWrapper()
+        # these names are not part of the public interface,
+        # but are not prefixed with an underscore.
+        for attr in 'wordsep_re', 'wordsep_simple_re', 'sentence_end_re':
+            self.assertTrue(hasattr(wrapper, attr))
+            self.assertIsInstance(getattr(wrapper, attr), re.Pattern)
+            self.assertIsInstance(getattr(TextWrapper, attr), re.Pattern)
+
+            # shadow the attribute on this instance only (the attribute
+            # name is used as a dummy value); the class-level attribute
+            # must still be a compiled pattern afterwards
+            setattr(wrapper, attr, attr)
+            self.assertEqual(getattr(wrapper, attr), attr)
+            self.assertIsInstance(getattr(TextWrapper, attr), re.Pattern)
+
+    def test_cached_regex(self):
+        # Exercise the descriptor directly on a throwaway class.
+        class Spam:
+            pat1 = cached_regex('pat1')
+            pat2 = cached_regex('pat2')
+
+        # both patterns are instances of cached_regex
+        # (read through __dict__, which does not invoke the descriptor)
+        self.assertIsInstance(Spam.__dict__['pat1'], cached_regex)
+        self.assertIsInstance(Spam.__dict__['pat2'], cached_regex)
+
+        # the attribute is replaced with a compiled pattern when accessed
+        self.assertEqual(Spam.pat1, re.compile('pat1'))
+        self.assertEqual(Spam.__dict__['pat1'], re.compile('pat1'))
+
+        # including when accessed from an instance
+        spam = Spam()
+        self.assertEqual(spam.__dict__, {})
+        # this check has to come before the first read of pat2 below:
+        # that read replaces the descriptor stored in the class __dict__
+        self.assertIsInstance(spam.__class__.__dict__['pat2'], cached_regex)
+        self.assertEqual(spam.pat2, re.compile('pat2'))
+        self.assertEqual(Spam.pat2, re.compile('pat2'))
+        self.assertEqual(spam.__class__.__dict__['pat2'], re.compile('pat2'))
+        # instance access and class access yield the very same object
+        self.assertIs(spam.pat2, Spam.pat2)
+
+
 class IndentTestCases(BaseTestCase):
 
     # called before each test method

diff --git a/Lib/textwrap.py b/Lib/textwrap.py
@@ -5,10 +5,26 @@
 # Copyright (C) 2002 Python Software Foundation.
 # Written by Greg Ward <[email protected]>
 
-import re
-
 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
 
+
+class _cached_regex:
+    """Class attribute that compiles a regular expression on first access.
+
+    Assign an instance in a class body, passing the pattern source as a
+    string.  Nothing is compiled, and the re module is not imported, until
+    the attribute is first read from the class or from one of its
+    instances.  At that point the descriptor replaces itself, in the class
+    that defines it, with the compiled pattern, so every later lookup is
+    an ordinary class attribute read.
+
+    Only __get__ is defined, so assigning to the attribute on an instance
+    shadows it for that instance in the usual way.
+    """
+
+    def __init__(self, pattern):
+        self.pattern = pattern
+        # Both are filled in by __set_name__ once the class body that
+        # contains this descriptor has been executed.
+        self.attr_name = None
+        self.defining_class = None
+
+    def __set_name__(self, owner, name):
+        self.defining_class = owner
+        self.attr_name = name
+
+    def __get__(self, instance, owner=None):
+        if owner is None:
+            return self
+        import re
+        pat = re.compile(self.pattern)
+        if self.attr_name is not None:
+            # Replace this descriptor with the compiled pattern.  Store it
+            # on the class that defines the attribute rather than on
+            # *owner*: when the first access comes through a subclass,
+            # *owner* is that subclass, and caching there would leave the
+            # descriptor in place on the base class while giving each
+            # subclass its own shadowing copy of the pattern.
+            setattr(self.defining_class, self.attr_name, pat)
+        # If __set_name__ never ran (the descriptor was not created in a
+        # class body) there is nowhere to cache; just return the pattern.
+        return pat
+
+
 # Hardcode the recognized whitespace characters to the US-ASCII
 # whitespace characters.  The main reason for doing this is that
 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
@@ -73,41 +89,39 @@ class TextWrapper:
     # (after stripping out empty strings).
     word_punct = r'[\w!"\'&.,?]'
     letter = r'[^\d\W]'
-    whitespace = r'[%s]' % re.escape(_whitespace)
-    nowhitespace = '[^' + whitespace[1:]
-    wordsep_re = re.compile(r'''
+    whitespace = fr'[{_whitespace}]'
+    no_whitespace = f'[^{_whitespace}]'
+    wordsep_re = _cached_regex(fr'''(?x)
         ( # any whitespace
-          %(ws)s+
+          {whitespace}+
         | # em-dash between words
-          (?<=%(wp)s) -{2,} (?=\w)
+          (?<={word_punct}) -{{2,}} (?=\w)
         | # word, possibly hyphenated
-          %(nws)s+? (?:
+          {no_whitespace}+? (?:
             # hyphenated word
-              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
-              (?= %(lt)s -? %(lt)s)
+              -(?: (?<={letter}{{2}}-) | (?<={letter}-{letter}-))
+              (?= {letter} -? {letter})
             | # end of word
-              (?=%(ws)s|\Z)
+              (?={whitespace}|\Z)
             | # em-dash
-              (?<=%(wp)s) (?=-{2,}\w)
+              (?<={word_punct}) (?=-{{2,}}\w)
             )
-        )''' % {'wp': word_punct, 'lt': letter,
-                'ws': whitespace, 'nws': nowhitespace},
-        re.VERBOSE)
-    del word_punct, letter, nowhitespace
+        )''')
+    del word_punct, letter, no_whitespace
 
     # This less funky little regex just split on recognized spaces. E.g.
     #   "Hello there -- you goof-ball, use the -b option!"
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
+    wordsep_simple_re = _cached_regex(fr'({whitespace}+)')
     del whitespace
 
     # XXX this is not locale- or charset-aware -- string.lowercase
     # is US-ASCII only (and therefore English-only)
-    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
-                                 r'[\.\!\?]'          # sentence-ending punct.
-                                 r'[\"\']?'           # optional end-of-quote
-                                 r'\Z')               # end of chunk
+    sentence_end_re = _cached_regex(r'[a-z]'          # lowercase letter
+                                    r'[\.\!\?]'       # sentence-ending punct.
+                                    r'[\"\']?'        # optional end-of-quote
+                                    r'\Z')            # end of chunk
 
     def __init__(self,
                  width=70,
@@ -250,7 +264,7 @@ def _wrap_chunks(self, chunks):
         """
         lines = []
         if self.width <= 0:
-            raise ValueError("invalid width %r (must be > 0)" % self.width)
+            raise ValueError(f"invalid width {self.width!r} (must be > 0)")
         if self.max_lines is not None:
             if self.max_lines > 1:
                 indent = self.subsequent_indent