Skip to content

gh-118761: Optimise import time for textwrap #131956

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions Lib/test/test_textwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
# $Id$
#

import re
import unittest

from textwrap import TextWrapper, wrap, fill, dedent, indent, shorten
from textwrap import _cached_regex as cached_regex


class BaseTestCase(unittest.TestCase):
Expand Down Expand Up @@ -712,6 +714,44 @@ def test_do_not_break_long_words_or_on_hyphens(self):
'ng_option_', 'indeed-', 'good-bye"']
self.check_wrap(self.text2, 10, expected)


class TextWrapperCachedRegexTestCase(BaseTestCase):
    """Tests for the lazily compiled regex attributes of TextWrapper."""

    def test_attr_access(self):
        # These attributes carry no leading underscore even though they
        # are not part of the public interface, so check that they can
        # still be read and overridden.
        regex_attrs = ('wordsep_re', 'wordsep_simple_re', 'sentence_end_re')
        wrapper = TextWrapper()
        for name in regex_attrs:
            self.assertTrue(hasattr(wrapper, name))
            self.assertIsInstance(getattr(wrapper, name), re.Pattern)
            self.assertIsInstance(getattr(TextWrapper, name), re.Pattern)

            # Overriding on the instance leaves the class attribute alone.
            setattr(wrapper, name, name)
            self.assertEqual(getattr(wrapper, name), name)
            self.assertIsInstance(getattr(TextWrapper, name), re.Pattern)

    def test_cached_regex(self):
        class Spam:
            pat1 = cached_regex('pat1')
            pat2 = cached_regex('pat2')

        first = re.compile('pat1')
        second = re.compile('pat2')

        # Before any access, the class dict still holds the descriptors.
        self.assertIsInstance(Spam.__dict__['pat1'], cached_regex)
        self.assertIsInstance(Spam.__dict__['pat2'], cached_regex)

        # Access through the class swaps in the compiled pattern.
        self.assertEqual(Spam.pat1, first)
        self.assertEqual(Spam.__dict__['pat1'], first)

        # Access through an instance does the same, and nothing is
        # stored in the instance dict.
        spam = Spam()
        self.assertEqual(spam.__dict__, {})
        self.assertIsInstance(spam.__class__.__dict__['pat2'], cached_regex)
        self.assertEqual(spam.pat2, second)
        self.assertEqual(Spam.pat2, second)
        self.assertEqual(spam.__class__.__dict__['pat2'], second)
        self.assertIs(spam.pat2, Spam.pat2)


class IndentTestCases(BaseTestCase):

# called before each test method
Expand Down
58 changes: 36 additions & 22 deletions Lib/textwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,26 @@
# Copyright (C) 2002 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

import re

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']


class _cached_regex:
def __init__(self, pattern):
self.pattern = pattern

def __set_name__(self, owner, name):
self.attr_name = name

def __get__(self, instance, owner=None):
if owner is None:
return self
import re
# replace this descriptor with the compiled pattern
pat = re.compile(self.pattern)
setattr(owner, self.attr_name, pat)
return pat


# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters. The main reason for doing this is that
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
Expand Down Expand Up @@ -73,41 +89,39 @@ class TextWrapper:
# (after stripping out empty strings).
word_punct = r'[\w!"\'&.,?]'
letter = r'[^\d\W]'
whitespace = r'[%s]' % re.escape(_whitespace)
nowhitespace = '[^' + whitespace[1:]
wordsep_re = re.compile(r'''
whitespace = fr'[{_whitespace}]'
no_whitespace = f'[^{_whitespace}]'
wordsep_re = _cached_regex(fr'''(?x)
( # any whitespace
%(ws)s+
{whitespace}+
| # em-dash between words
(?<=%(wp)s) -{2,} (?=\w)
(?<={word_punct}) -{{2,}} (?=\w)
| # word, possibly hyphenated
%(nws)s+? (?:
{no_whitespace}+? (?:
# hyphenated word
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
(?= %(lt)s -? %(lt)s)
-(?: (?<={letter}{{2}}-) | (?<={letter}-{letter}-))
(?= {letter} -? {letter})
| # end of word
(?=%(ws)s|\Z)
(?={whitespace}|\Z)
| # em-dash
(?<=%(wp)s) (?=-{2,}\w)
(?<={word_punct}) (?=-{{2,}}\w)
)
)''' % {'wp': word_punct, 'lt': letter,
'ws': whitespace, 'nws': nowhitespace},
re.VERBOSE)
del word_punct, letter, nowhitespace
)''')
del word_punct, letter, no_whitespace

# This less funky little regex just splits on recognized spaces. E.g.
# "Hello there -- you goof-ball, use the -b option!"
# splits into
# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
wordsep_simple_re = _cached_regex(fr'({whitespace}+)')
del whitespace

# XXX this is not locale- or charset-aware -- string.lowercase
# is US-ASCII only (and therefore English-only)
sentence_end_re = re.compile(r'[a-z]' # lowercase letter
r'[\.\!\?]' # sentence-ending punct.
r'[\"\']?' # optional end-of-quote
r'\Z') # end of chunk
sentence_end_re = _cached_regex(r'[a-z]' # lowercase letter
r'[\.\!\?]' # sentence-ending punct.
r'[\"\']?' # optional end-of-quote
r'\Z') # end of chunk

def __init__(self,
width=70,
Expand Down Expand Up @@ -250,7 +264,7 @@ def _wrap_chunks(self, chunks):
"""
lines = []
if self.width <= 0:
raise ValueError("invalid width %r (must be > 0)" % self.width)
raise ValueError(f"invalid width {self.width!r} (must be > 0)")
if self.max_lines is not None:
if self.max_lines > 1:
indent = self.subsequent_indent
Expand Down
Loading