Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Lib/test/test_textwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,10 @@ def setUp(self):
self.text = '''\
Did you say "supercalifragilisticexpialidocious?"
How *do* you spell that odd word, anyways?
'''
self.text_cjk = '''\
Did you say "いろはにほへとちりぬるをいろはにほ?"
How りぬ るをいろはにほり ぬるは, anyways?
'''

def test_break_long(self):
Expand All @@ -579,6 +583,14 @@ def test_break_long(self):
self.check_wrap(self.text, 50,
['Did you say "supercalifragilisticexpialidocious?"',
'How *do* you spell that odd word, anyways?'])
self.check_wrap(self.text_cjk, 30,
['Did you say "いろはにほへとち',
'りぬるをいろはにほ?" How りぬ',
'るをいろはにほり ぬるは,',
'anyways?'], cjk=True)
self.check_wrap(self.text_cjk, 50,
['Did you say "いろはにほへとちりぬるをいろはにほ?"',
'How りぬ るをいろはにほり ぬるは, anyways?'], cjk=True)

# SF bug 797650. Prevent an infinite loop by making sure that at
# least one character gets split off on every pass.
Expand Down
69 changes: 57 additions & 12 deletions Lib/textwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <[email protected]>

import re
import re, unicodedata

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten',
'cjkwide', 'cjklen', 'cjkslices']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters. The main reason for doing this is that
Expand All @@ -26,6 +27,8 @@ class TextWrapper:
width (default: 70)
the maximum width of wrapped lines (unless break_long_words
is false)
cjk (default: False)
Handle double-width CJK chars.
initial_indent (default: "")
string that will be prepended to the first line of wrapped
output. Counts towards the line's width.
Expand Down Expand Up @@ -114,6 +117,7 @@ class TextWrapper:

def __init__(self,
width=70,
cjk=False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These arguments can be passed as positional argument.
So new argument should be added last.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree

initial_indent="",
subsequent_indent="",
expand_tabs=True,
Expand All @@ -127,6 +131,7 @@ def __init__(self,
max_lines=None,
placeholder=' [...]'):
self.width = width
self.cjk = cjk
self.initial_indent = initial_indent
self.subsequent_indent = subsequent_indent
self.expand_tabs = expand_tabs
Expand All @@ -139,6 +144,7 @@ def __init__(self,
self.max_lines = max_lines
self.placeholder = placeholder

self.len = cjklen if self.cjk else len
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest to make it private and use a different name than len: self._text_width() for example.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exposing functions mean that you must document them, write unit tests, and that someone (maybe not you) have to maintain them. I don't think that it's worth it. Let's try with something simple, make these functions private.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree


# -- Private methods -----------------------------------------------
# (possibly useful for subclasses to override)
Expand Down Expand Up @@ -215,8 +221,13 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
# If we're allowed to break long words, then do so: put as much
# of the next chunk onto the current line as will fit.
if self.break_long_words:
cur_line.append(reversed_chunks[-1][:space_left])
reversed_chunks[-1] = reversed_chunks[-1][space_left:]
if self.cjk:
chunk_start, chunk_end = cjkslices(reversed_chunks[-1], space_left)
cur_line.append(chunk_start)
reversed_chunks[-1] = chunk_end
else:
cur_line.append(reversed_chunks[-1][:space_left])
reversed_chunks[-1] = reversed_chunks[-1][space_left:]

# Otherwise, we have to preserve the long word intact. Only add
# it to the current line if there's nothing already there --
Expand Down Expand Up @@ -246,6 +257,9 @@ def _wrap_chunks(self, chunks):
lines = []
if self.width <= 0:
raise ValueError("invalid width %r (must be > 0)" % self.width)
elif self.width == 1 and (sum(self.len(chunk) for chunk in chunks) >
sum(len(chunk) for chunk in chunks)):
raise ValueError("invalid width 1 (must be > 1 when CJK chars)")
if self.max_lines is not None:
if self.max_lines > 1:
indent = self.subsequent_indent
Expand Down Expand Up @@ -280,7 +294,7 @@ def _wrap_chunks(self, chunks):
del chunks[-1]

while chunks:
l = len(chunks[-1])
l = self.len(chunks[-1])

# Can at least squeeze this chunk onto the current line.
if cur_len + l <= width:
Expand All @@ -293,7 +307,7 @@ def _wrap_chunks(self, chunks):

# The current line is full, and the next chunk is too big to
# fit on *any* line (not just this one).
if chunks and len(chunks[-1]) > width:
if chunks and self.len(chunks[-1]) > width:
self._handle_long_word(chunks, cur_line, cur_len, width)
cur_len = sum(map(len, cur_line))

Expand Down Expand Up @@ -365,7 +379,7 @@ def fill(self, text):

# -- Convenience interface ---------------------------------------------

def wrap(text, width=70, **kwargs):
def wrap(text, width=70, cjk=False, **kwargs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is not need to repeat cjk here, there is already a generic **kwargs. Same for other functions below.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same need as width: to be visible so easily usable.

"""Wrap a single paragraph of text, returning a list of wrapped lines.

Reformat the single paragraph in 'text' so it fits in lines of no
Expand All @@ -375,10 +389,10 @@ def wrap(text, width=70, **kwargs):
space. See TextWrapper class for available keyword args to customize
wrapping behaviour.
"""
w = TextWrapper(width=width, **kwargs)
w = TextWrapper(width=width, cjk=cjk, **kwargs)
return w.wrap(text)

def fill(text, width=70, **kwargs):
def fill(text, width=70, cjk=False, **kwargs):
"""Fill a single paragraph of text, returning a new string.

Reformat the single paragraph in 'text' to fit in lines of no more
Expand All @@ -387,10 +401,10 @@ def fill(text, width=70, **kwargs):
whitespace characters converted to space. See TextWrapper class for
available keyword args to customize wrapping behaviour.
"""
w = TextWrapper(width=width, **kwargs)
w = TextWrapper(width=width, cjk=cjk, **kwargs)
return w.fill(text)

def shorten(text, width, **kwargs):
def shorten(text, width, cjk=False, **kwargs):
"""Collapse and truncate the given text to fit in the given width.

The text first has its whitespace collapsed. If it then fits in
Expand All @@ -402,10 +416,41 @@ def shorten(text, width, **kwargs):
>>> textwrap.shorten("Hello world!", width=11)
'Hello [...]'
"""
w = TextWrapper(width=width, max_lines=1, **kwargs)
w = TextWrapper(width=width, cjk=cjk, max_lines=1, **kwargs)
return w.fill(' '.join(text.strip().split()))


# -- CJK support ------------------------------------------------------

def cjkwide(char):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add _ in the name: cjk_wide().

Note sure about the name: is_cjk_wide_char()?

Do we really need to make the function?

"""Return True if char is Fullwidth or Wide, False otherwise.
Fullwidth and Wide CJK chars are double-width.
"""
return unicodedata.east_asian_width(char) in ('F', 'W')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can write {'F', 'W'}, it's optimized as a constant frozenset.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's really faster than a tuple ?

This comment was marked as resolved.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please, @duboviy or @Haypo, could you explain to me why/how a tuple could be slower than a frozenset.



def cjklen(text):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to cfk_width()?

"""Return the real width of text (its len if not a string).
"""
if not isinstance(text, str):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strange case handling, maybe we should expect only string type text argument in this function...

Copy link
Author

@fgallaire fgallaire Mar 12, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again: it's for an handy replacement of the built-in len():
from textwrap import cjk_len as len

return len(text)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this case. Why do you pass a non-string to the function?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for people who want to do:
from textwrap import cjklen as len
and use cjklen transparently

return sum(2 if cjkwide(char) else 1 for char in text)


def cjkslices(text, index):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO it's better to make all these new functions private: add _ prefix. Rename to _cjk_slices().

"index": is it a number of character or a width? Maybe rename to width?

Copy link
Author

@fgallaire fgallaire Feb 14, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I really want this functions to be exported, they are useful

"""Return the two slices of text cut to the index.
"""
if not isinstance(text, str):
return text[:index], text[index:]
if cjklen(text) <= index:
return text, ''
i = 1
# <= and i-1 to catch the last double length char of odd line
while cjklen(text[:i]) <= index:
i = i + 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this O(n^2) algorithm.

w = 0
for i, c in enumerate(text):
    w += cjkwide(c) + 1
    if w > index:
        break

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very relevant point.

return text[:i-1], text[i-1:]


# -- Loosely related functionality -------------------------------------

_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
Expand Down
1 change: 1 addition & 0 deletions Misc/ACKS
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ Lele Gaifax
Santiago Gala
Yitzchak Gale
Matthew Gallagher
Florent Gallaire
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if that makes lots of sense for such a small change (even though such changes are very welcome of course!)

There are commits from loads of people in this repo, but way less entries in this file (adding every contributor would quickly make the file pretty chaotic).

IMO it'd be better to only add yourself to this file when you contribute e.g. a major fix, new feature, etc

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I strongly disagree, if persons are missing they should be added.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Florent definitely belongs in ACKS. We have been fairly liberal in adding people, some for as little as a well-crafted sentence or line of code.

Quentin Gallet-Gilles
Riccardo Attilio Galli
Raymund Galvin
Expand Down