From 72bd327db0b26e542a327449bef77bd2bc059da4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 24 Mar 2015 18:32:27 +0200 Subject: Issue #22687: Fixed some corner cases in breaking words in textwrap. Got rid of quadratic complexity in breaking long words. --- Lib/textwrap.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'Lib/textwrap.py') diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 24891804dd3..49ea9a655d9 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -79,10 +79,25 @@ class TextWrapper: # splits into # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! # (after stripping out empty strings). - wordsep_re = re.compile( - r'(\s+|' # any whitespace - r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words - r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash + word_punct = r'[\w!"\'&.,?]' + letter = r'[^\d\W]' + wordsep_re = re.compile(r''' + ( # any whitespace + \s+ + | # em-dash between words + (?<=%(wp)s) -{2,} (?=\w) + | # word, possibly hyphenated + \S+? (?: + # hyphenated word + -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-)) + (?= %(lt)s -? %(lt)s) + | # end of word + (?=\s|\Z) + | # em-dash + (?<=%(wp)s) (?=-{2,}\w) + ) + )''' % {'wp': word_punct, 'lt': letter}, re.VERBOSE) + del word_punct, letter # This less funky little regex just split on recognized spaces. E.g. # "Hello there -- you goof-ball, use the -b option!" -- cgit v1.2.3-65-gdbad