Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode text wrapping in comments and string literals. #3275

Merged
merged 2 commits into from
Jan 14, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ rustc-ap-syntax = "306.0.0"
rustc-ap-syntax_pos = "306.0.0"
failure = "0.1.1"
bytecount = "0.4"
unicode-width = "0.1.5"
unicode_categories = "0.1.1"

# A noop dependency that changes in the Rust repository, it's a bit of a hack.
# See the `src/tools/rustc-workspace-hack/README.md` file in `rust-lang/rust`
Expand Down
40 changes: 23 additions & 17 deletions src/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ use config::Config;
use rewrite::RewriteContext;
use shape::{Indent, Shape};
use string::{rewrite_string, StringFormat};
use utils::{count_newlines, first_line_width, last_line_width, trim_left_preserve_layout};
use utils::{
count_newlines, first_line_width, last_line_width, trim_left_preserve_layout, unicode_str_width,
};
use {ErrorKind, FormattingError};

fn is_custom_comment(comment: &str) -> bool {
Expand Down Expand Up @@ -264,7 +266,8 @@ fn identify_comment(
) -> Option<String> {
let style = comment_style(orig, false);

// Computes the len of line taking into account a newline if the line is part of a paragraph.
// Computes the byte length of line taking into account a newline if the line is part of a
// paragraph.
fn compute_len(orig: &str, line: &str) -> usize {
if orig.len() > line.len() {
if orig.as_bytes()[line.len()] == b'\r' {
Expand Down Expand Up @@ -477,7 +480,7 @@ struct CommentRewrite<'a> {
item_block: Option<ItemizedBlock>,
comment_line_separator: String,
indent_str: String,
max_chars: usize,
max_width: usize,
fmt_indent: Indent,
fmt: StringFormat<'a>,

Expand All @@ -499,7 +502,7 @@ impl<'a> CommentRewrite<'a> {
comment_style(orig, config.normalize_comments()).to_str_tuplet()
};

let max_chars = shape
let max_width = shape
.width
.checked_sub(closer.len() + opener.len())
.unwrap_or(1);
Expand All @@ -514,7 +517,7 @@ impl<'a> CommentRewrite<'a> {
item_block_buffer: String::with_capacity(128),
item_block: None,
comment_line_separator: format!("{}{}", indent_str, line_start),
max_chars,
max_width,
indent_str,
fmt_indent,

Expand All @@ -523,7 +526,7 @@ impl<'a> CommentRewrite<'a> {
closer: "",
line_start,
line_end: "",
shape: Shape::legacy(max_chars, fmt_indent),
shape: Shape::legacy(max_width, fmt_indent),
trim_end: true,
config,
},
Expand Down Expand Up @@ -563,7 +566,7 @@ impl<'a> CommentRewrite<'a> {

if !self.item_block_buffer.is_empty() {
// the last few lines are part of an itemized block
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
let mut ib = None;
::std::mem::swap(&mut ib, &mut self.item_block);
let ib = ib.unwrap();
Expand All @@ -573,7 +576,7 @@ impl<'a> CommentRewrite<'a> {
match rewrite_string(
&self.item_block_buffer.replace("\n", " "),
&item_fmt,
self.max_chars.saturating_sub(ib.indent),
self.max_width.saturating_sub(ib.indent),
) {
Some(s) => self.result.push_str(&Self::join_block(
&s,
Expand Down Expand Up @@ -611,14 +614,14 @@ impl<'a> CommentRewrite<'a> {
return false;
}
self.is_prev_line_multi_line = false;
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
let item_fmt = ib.create_string_format(&self.fmt);
self.result.push_str(&self.comment_line_separator);
self.result.push_str(&ib.opener);
match rewrite_string(
&self.item_block_buffer.replace("\n", " "),
&item_fmt,
self.max_chars.saturating_sub(ib.indent),
self.max_width.saturating_sub(ib.indent),
) {
Some(s) => self.result.push_str(&Self::join_block(
&s,
Expand Down Expand Up @@ -698,8 +701,11 @@ impl<'a> CommentRewrite<'a> {
}
}

if self.fmt.config.wrap_comments() && line.len() > self.fmt.shape.width && !has_url(line) {
match rewrite_string(line, &self.fmt, self.max_chars) {
if self.fmt.config.wrap_comments()
&& unicode_str_width(line) > self.fmt.shape.width
&& !has_url(line)
{
match rewrite_string(line, &self.fmt, self.max_width) {
Some(ref s) => {
self.is_prev_line_multi_line = s.contains('\n');
self.result.push_str(s);
Expand All @@ -709,8 +715,8 @@ impl<'a> CommentRewrite<'a> {
// Remove the trailing space, then start rewrite on the next line.
self.result.pop();
self.result.push_str(&self.comment_line_separator);
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
match rewrite_string(line, &self.fmt, self.max_chars) {
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
match rewrite_string(line, &self.fmt, self.max_width) {
Some(ref s) => {
self.is_prev_line_multi_line = s.contains('\n');
self.result.push_str(s);
Expand All @@ -731,20 +737,20 @@ impl<'a> CommentRewrite<'a> {
// 1 = " "
let offset = 1 + last_line_width(&self.result) - self.line_start.len();
Shape {
width: self.max_chars.saturating_sub(offset),
width: self.max_width.saturating_sub(offset),
indent: self.fmt_indent,
offset: self.fmt.shape.offset + offset,
}
} else {
Shape::legacy(self.max_chars, self.fmt_indent)
Shape::legacy(self.max_width, self.fmt_indent)
};
} else {
if line.is_empty() && self.result.ends_with(' ') && !is_last {
// Remove space if this is an empty comment or a doc comment.
self.result.pop();
}
self.result.push_str(line);
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
self.is_prev_line_multi_line = false;
}

Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ extern crate serde_json;
extern crate syntax;
extern crate syntax_pos;
extern crate toml;
extern crate unicode_categories;
extern crate unicode_segmentation;
extern crate unicode_width;

use std::cell::RefCell;
use std::collections::HashMap;
Expand Down
3 changes: 2 additions & 1 deletion src/overflow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ impl<'a> Context<'a> {
};

if let Some(rewrite) = rewrite {
let rewrite_first_line = Some(rewrite[..first_line_width(&rewrite)].to_owned());
// splitn(2, *).next().unwrap() is always safe.
let rewrite_first_line = Some(rewrite.splitn(2, '\n').next().unwrap().to_owned());
last_list_item.item = rewrite_first_line;
Some(rewrite)
} else {
Expand Down
70 changes: 44 additions & 26 deletions src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@
// Format string literals.

use regex::Regex;
use unicode_categories::UnicodeCategories;
use unicode_segmentation::UnicodeSegmentation;

use config::Config;
use shape::Shape;
use utils::wrap_str;
use utils::{unicode_str_width, wrap_str};

const MIN_STRING: usize = 10;

Expand Down Expand Up @@ -53,7 +54,7 @@ impl<'a> StringFormat<'a> {
/// indentation into account.
///
/// If we cannot put at least a single character per line, the rewrite won't succeed.
fn max_chars_with_indent(&self) -> Option<usize> {
fn max_width_with_indent(&self) -> Option<usize> {
Some(
self.shape
.width
Expand All @@ -62,10 +63,10 @@ impl<'a> StringFormat<'a> {
)
}

/// Like max_chars_with_indent but the indentation is not subtracted.
/// Like max_width_with_indent but the indentation is not subtracted.
/// This allows to fit more graphemes from the string on a line when
/// SnippetState::EndWithLineFeed.
fn max_chars_without_indent(&self) -> Option<usize> {
fn max_width_without_indent(&self) -> Option<usize> {
Some(self.config.max_width().checked_sub(self.line_end.len())?)
}
}
Expand All @@ -75,8 +76,8 @@ pub fn rewrite_string<'a>(
fmt: &StringFormat<'a>,
newline_max_chars: usize,
) -> Option<String> {
let max_chars_with_indent = fmt.max_chars_with_indent()?;
let max_chars_without_indent = fmt.max_chars_without_indent()?;
let max_width_with_indent = fmt.max_width_with_indent()?;
let max_width_without_indent = fmt.max_width_without_indent()?;
let indent_with_newline = fmt.shape.indent.to_string_with_newline(fmt.config);
let indent_without_newline = fmt.shape.indent.to_string(fmt.config);

Expand All @@ -99,11 +100,11 @@ pub fn rewrite_string<'a>(

// Snip a line at a time from `stripped_str` until it is used up. Push the snippet
// onto result.
let mut cur_max_chars = max_chars_with_indent;
let mut cur_max_width = max_width_with_indent;
let is_bareline_ok = fmt.line_start.is_empty() || is_whitespace(fmt.line_start);
loop {
// All the input starting at cur_start fits on the current line
if graphemes.len() - cur_start <= cur_max_chars {
if graphemes_width(&graphemes[cur_start..]) <= cur_max_width {
for (i, grapheme) in graphemes[cur_start..].iter().enumerate() {
if is_new_line(grapheme) {
// take care of blank lines
Expand All @@ -123,7 +124,7 @@ pub fn rewrite_string<'a>(

// The input starting at cur_start needs to be broken
match break_string(
cur_max_chars,
cur_max_width,
fmt.trim_end,
fmt.line_end,
&graphemes[cur_start..],
Expand All @@ -133,7 +134,7 @@ pub fn rewrite_string<'a>(
result.push_str(fmt.line_end);
result.push_str(&indent_with_newline);
result.push_str(fmt.line_start);
cur_max_chars = newline_max_chars;
cur_max_width = newline_max_chars;
cur_start += len;
}
SnippetState::EndWithLineFeed(line, len) => {
Expand All @@ -143,11 +144,11 @@ pub fn rewrite_string<'a>(
result.push_str(&line);
if is_bareline_ok {
// the next line can benefit from the full width
cur_max_chars = max_chars_without_indent;
cur_max_width = max_width_without_indent;
} else {
result.push_str(&indent_without_newline);
result.push_str(fmt.line_start);
cur_max_chars = max_chars_with_indent;
cur_max_width = max_width_with_indent;
}
cur_start += len;
}
Expand Down Expand Up @@ -226,9 +227,10 @@ fn not_whitespace_except_line_feed(g: &str) -> bool {
is_new_line(g) || !is_whitespace(g)
}

/// Break the input string at a boundary character around the offset `max_chars`. A boundary
/// Break the input string at a boundary character around the offset `max_width`. A boundary
/// character is either a punctuation or a whitespace.
fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
/// FIXME(issue#3281): We must follow UAX#14 algorithm instead of this.
fn break_string(max_width: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
let break_at = |index /* grapheme at index is included */| {
// Take in any whitespaces to the left/right of `input[index]` while
// preserving line feeds
Expand Down Expand Up @@ -272,19 +274,33 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
}
};

// find a first index where the unicode width of input[0..x] become > max_width
let max_width_index_in_input = {
let mut cur_width = 0;
let mut cur_index = 0;
for (i, grapheme) in input.iter().enumerate() {
cur_width += unicode_str_width(grapheme);
cur_index = i;
if cur_width > max_width {
break;
}
}
cur_index
};

// Find the position in input for breaking the string
if line_end.is_empty()
&& trim_end
&& !is_whitespace(input[max_chars - 1])
&& is_whitespace(input[max_chars])
&& !is_whitespace(input[max_width_index_in_input - 1])
&& is_whitespace(input[max_width_index_in_input])
{
// At a breaking point already
// The line won't invalidate the rewriting because:
// - no extra space needed for the line_end character
// - extra whitespaces to the right can be trimmed
return break_at(max_chars - 1);
return break_at(max_width_index_in_input - 1);
}
if let Some(url_index_end) = detect_url(input, max_chars) {
if let Some(url_index_end) = detect_url(input, max_width_index_in_input) {
let index_plus_ws = url_index_end
+ input[url_index_end..]
.iter()
Expand All @@ -297,27 +313,28 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
return SnippetState::LineEnd(input[..=index_plus_ws].concat(), index_plus_ws + 1);
};
}
match input[0..max_chars]

match input[0..max_width_index_in_input]
.iter()
.rposition(|grapheme| is_whitespace(grapheme))
{
// Found a whitespace and what is on its left side is big enough.
Some(index) if index >= MIN_STRING => break_at(index),
// No whitespace found, try looking for a punctuation instead
_ => match input[0..max_chars]
_ => match input[0..max_width_index_in_input]
.iter()
.rposition(|grapheme| is_punctuation(grapheme))
{
// Found a punctuation and what is on its left side is big enough.
Some(index) if index >= MIN_STRING => break_at(index),
// Either no boundary character was found to the left of `input[max_chars]`, or the line
// got too small. We try searching for a boundary character to the right.
_ => match input[max_chars..]
_ => match input[max_width_index_in_input..]
.iter()
.position(|grapheme| is_whitespace(grapheme) || is_punctuation(grapheme))
{
// A boundary was found after the line limit
Some(index) => break_at(max_chars + index),
Some(index) => break_at(max_width_index_in_input + index),
// No boundary to the right, the input cannot be broken
None => SnippetState::EndOfInput(input.concat()),
},
Expand All @@ -335,10 +352,11 @@ fn is_whitespace(grapheme: &str) -> bool {
}

fn is_punctuation(grapheme: &str) -> bool {
match grapheme.as_bytes()[0] {
b':' | b',' | b';' | b'.' => true,
_ => false,
}
grapheme.chars().all(|c| c.is_punctuation_other())
}

fn graphemes_width(graphemes: &[&str]) -> usize {
graphemes.iter().map(|s| unicode_str_width(s)).sum()
}

#[cfg(test)]
Expand Down
Loading