From a759b016648aa3d01bd44b666775e58cb2c7ed05 Mon Sep 17 00:00:00 2001
From: Adam Turner <9087854+aa-turner@users.noreply.github.com>
Date: Mon, 31 Mar 2025 20:24:44 +0100
Subject: [PATCH 1/2] Optimise import time for textwrap

---
 Lib/textwrap.py | 68 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index bac98c99e41df8..729f6aead55bec 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -5,10 +5,36 @@
 # Copyright (C) 2002 Python Software Foundation.
 # Written by Greg Ward
 
-import re
-
 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
 
+
+class _cached_regex:
+    """Class attribute that compiles its regular expression on first use.
+
+    Deferring ``import re`` until a pattern is first needed keeps it out
+    of the import of this module.  On first access the descriptor replaces
+    itself, on the class that defines it, with the compiled pattern.
+    """
+
+    def __init__(self, pattern):
+        self.pattern = pattern
+
+    def __set_name__(self, owner, name):
+        # Remember the defining class: __get__() receives whichever class
+        # the lookup went through, which may be a subclass.
+        self.owner = owner
+        self.attr_name = name
+
+    def __get__(self, instance, owner=None):
+        if owner is None:
+            return self
+        import re
+        # replace this descriptor with the compiled pattern
+        pat = re.compile(self.pattern)
+        setattr(self.owner, self.attr_name, pat)
+        return pat
+
+
 # Hardcode the recognized whitespace characters to the US-ASCII
 # whitespace characters.  The main reason for doing this is that
 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
@@ -73,41 +99,39 @@ class TextWrapper:
     # (after stripping out empty strings).
     word_punct = r'[\w!"\'&.,?]'
     letter = r'[^\d\W]'
-    whitespace = r'[%s]' % re.escape(_whitespace)
-    nowhitespace = '[^' + whitespace[1:]
-    wordsep_re = re.compile(r'''
+    whitespace = fr'[{_whitespace}]'
+    no_whitespace = f'[^{_whitespace}]'
+    wordsep_re = _cached_regex(fr'''(?x)
         ( # any whitespace
-          %(ws)s+
+          {whitespace}+
         | # em-dash between words
-          (?<=%(wp)s) -{2,} (?=\w)
+          (?<={word_punct}) -{{2,}} (?=\w)
         | # word, possibly hyphenated
-          %(nws)s+? (?:
+          {no_whitespace}+? (?:
             # hyphenated word
-              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
-              (?= %(lt)s -? %(lt)s)
+              -(?: (?<={letter}{{2}}-) | (?<={letter}-{letter}-))
+              (?= {letter} -? {letter})
             | # end of word
-              (?=%(ws)s|\Z)
+              (?={whitespace}|\Z)
             | # em-dash
-              (?<=%(wp)s) (?=-{2,}\w)
+              (?<={word_punct}) (?=-{{2,}}\w)
             )
-        )''' % {'wp': word_punct, 'lt': letter,
-                'ws': whitespace, 'nws': nowhitespace},
-        re.VERBOSE)
-    del word_punct, letter, nowhitespace
+        )''')
+    del word_punct, letter, no_whitespace
 
     # This less funky little regex just split on recognized spaces. E.g.
     #   "Hello there -- you goof-ball, use the -b option!"
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
+    wordsep_simple_re = _cached_regex(fr'({whitespace}+)')
     del whitespace
 
     # XXX this is not locale- or charset-aware -- string.lowercase
     # is US-ASCII only (and therefore English-only)
-    sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
-                                 r'[\.\!\?]'          # sentence-ending punct.
-                                 r'[\"\']?'           # optional end-of-quote
-                                 r'\Z')               # end of chunk
+    sentence_end_re = _cached_regex(r'[a-z]'             # lowercase letter
+                                    r'[\.\!\?]'          # sentence-ending punct.
+                                    r'[\"\']?'           # optional end-of-quote
+                                    r'\Z')               # end of chunk
 
     def __init__(self,
                  width=70,
@@ -250,7 +274,7 @@ def _wrap_chunks(self, chunks):
         """
         lines = []
         if self.width <= 0:
-            raise ValueError("invalid width %r (must be > 0)" % self.width)
+            raise ValueError(f"invalid width {self.width!r} (must be > 0)")
         if self.max_lines is not None:
             if self.max_lines > 1:
                 indent = self.subsequent_indent

From b67e4aaec6aa3c2069e466737e89cd493fe400e8 Mon Sep 17 00:00:00 2001
From: Adam Turner <9087854+aa-turner@users.noreply.github.com>
Date: Tue, 1 Apr 2025 05:40:29 +0100
Subject: [PATCH 2/2] Add tests

---
 Lib/test/test_textwrap.py | 54 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
index 77366988b57fa7..c298d2f4fb414c 100644
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -8,9 +8,11 @@
 # $Id$
 #
 
+import re
 import unittest
 
 from textwrap import TextWrapper, wrap, fill, dedent, indent, shorten
+from textwrap import _cached_regex as cached_regex
 
 
 class BaseTestCase(unittest.TestCase):
@@ -712,6 +714,58 @@ def test_do_not_break_long_words_or_on_hyphens(self):
                     'ng_option_', 'indeed-', 'good-bye"']
         self.check_wrap(self.text2, 10, expected)
 
+
+class TextWrapperCachedRegexTestCase(BaseTestCase):
+    def test_attr_access(self):
+        wrapper = TextWrapper()
+        # these names are not part of the public interface,
+        # but are not prefixed with an underscore.
+        for attr in 'wordsep_re', 'wordsep_simple_re', 'sentence_end_re':
+            self.assertTrue(hasattr(wrapper, attr))
+            self.assertIsInstance(getattr(wrapper, attr), re.Pattern)
+            self.assertIsInstance(getattr(TextWrapper, attr), re.Pattern)
+
+            setattr(wrapper, attr, attr)
+            self.assertEqual(getattr(wrapper, attr), attr)
+            self.assertIsInstance(getattr(TextWrapper, attr), re.Pattern)
+
+    def test_cached_regex(self):
+        class Spam:
+            pat1 = cached_regex('pat1')
+            pat2 = cached_regex('pat2')
+
+        # both patterns are instances of cached_regex
+        self.assertIsInstance(Spam.__dict__['pat1'], cached_regex)
+        self.assertIsInstance(Spam.__dict__['pat2'], cached_regex)
+
+        # the attribute is replaced with a compiled pattern when accessed
+        self.assertEqual(Spam.pat1, re.compile('pat1'))
+        self.assertEqual(Spam.__dict__['pat1'], re.compile('pat1'))
+
+        # including when accessed from an instance
+        spam = Spam()
+        self.assertEqual(spam.__dict__, {})
+        self.assertIsInstance(spam.__class__.__dict__['pat2'], cached_regex)
+        self.assertEqual(spam.pat2, re.compile('pat2'))
+        self.assertEqual(Spam.pat2, re.compile('pat2'))
+        self.assertEqual(spam.__class__.__dict__['pat2'], re.compile('pat2'))
+        self.assertIs(spam.pat2, Spam.pat2)
+
+    def test_cached_regex_subclass(self):
+        class Spam:
+            pat = cached_regex('pat')
+
+        class Eggs(Spam):
+            pass
+
+        # access through a subclass caches the compiled pattern
+        # on the class that defines the attribute
+        self.assertEqual(Eggs().pat, re.compile('pat'))
+        self.assertNotIn('pat', Eggs.__dict__)
+        self.assertIsInstance(Spam.__dict__['pat'], re.Pattern)
+        self.assertIs(Eggs.pat, Spam.pat)
+
+
 class IndentTestCases(BaseTestCase):
 
     # called before each test method