diff --git a/Cargo.lock b/Cargo.lock index 6991e6e7b8f..ef4b6fd92a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -595,6 +595,8 @@ dependencies = [ "term 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", "toml 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -748,6 +750,11 @@ name = "unicode-xid" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "unreachable" version = "1.0.0" @@ -896,6 +903,7 @@ dependencies = [ "checksum unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "aa6024fc12ddfd1c6dbc14a80fa2324d4568849869b779f6bd37e5e4c03344d1" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" "checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" diff --git a/Cargo.toml b/Cargo.toml index 7281ce5efdc..9e0bce1dc08 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,8 @@ rustc-ap-syntax = "306.0.0" rustc-ap-syntax_pos = "306.0.0" failure = "0.1.1" bytecount = "0.4" +unicode-width = "0.1.5" +unicode_categories = "0.1.1" # A noop dependency that changes in the Rust repository, it's a bit of a hack. # See the `src/tools/rustc-workspace-hack/README.md` file in `rust-lang/rust` diff --git a/src/comment.rs b/src/comment.rs index 54c3ee1d9d7..3841424904f 100644 --- a/src/comment.rs +++ b/src/comment.rs @@ -19,7 +19,9 @@ use config::Config; use rewrite::RewriteContext; use shape::{Indent, Shape}; use string::{rewrite_string, StringFormat}; -use utils::{count_newlines, first_line_width, last_line_width, trim_left_preserve_layout}; +use utils::{ + count_newlines, first_line_width, last_line_width, trim_left_preserve_layout, unicode_str_width, +}; use {ErrorKind, FormattingError}; fn is_custom_comment(comment: &str) -> bool { @@ -264,7 +266,8 @@ fn identify_comment( ) -> Option { let style = comment_style(orig, false); - // Computes the len of line taking into account a newline if the line is part of a paragraph. + // Computes the byte length of line taking into account a newline if the line is part of a + // paragraph. fn compute_len(orig: &str, line: &str) -> usize { if orig.len() > line.len() { if orig.as_bytes()[line.len()] == b'\r' { @@ -498,7 +501,7 @@ struct CommentRewrite<'a> { item_block: Option, comment_line_separator: String, indent_str: String, - max_chars: usize, + max_width: usize, fmt_indent: Indent, fmt: StringFormat<'a>, @@ -520,7 +523,7 @@ impl<'a> CommentRewrite<'a> { comment_style(orig, config.normalize_comments()).to_str_tuplet() }; - let max_chars = shape + let max_width = shape .width .checked_sub(closer.len() + opener.len()) .unwrap_or(1); @@ -534,7 +537,7 @@ impl<'a> CommentRewrite<'a> { code_block_attr: None, item_block: None, comment_line_separator: format!("{}{}", indent_str, line_start), - max_chars, + max_width, indent_str, fmt_indent, @@ -543,7 +546,7 @@ impl<'a> CommentRewrite<'a> { closer: "", line_start, line_end: "", - shape: Shape::legacy(max_chars, fmt_indent), + shape: Shape::legacy(max_width, fmt_indent), trim_end: true, config, }, @@ -583,14 +586,14 @@ impl<'a> CommentRewrite<'a> { if let Some(ref ib) = self.item_block { // the last few lines are part of an itemized block - self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent); + self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent); let item_fmt = ib.create_string_format(&self.fmt); self.result.push_str(&self.comment_line_separator); self.result.push_str(&ib.opener); match rewrite_string( &ib.trimmed_block_as_string(), &item_fmt, - self.max_chars.saturating_sub(ib.indent), + self.max_width.saturating_sub(ib.indent), ) { Some(s) => self.result.push_str(&Self::join_block( &s, @@ -626,14 +629,14 @@ impl<'a> CommentRewrite<'a> { return false; } self.is_prev_line_multi_line = false; - self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent); + self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent); let item_fmt = ib.create_string_format(&self.fmt); self.result.push_str(&self.comment_line_separator); self.result.push_str(&ib.opener); match rewrite_string( &ib.trimmed_block_as_string(), &item_fmt, - self.max_chars.saturating_sub(ib.indent), + self.max_width.saturating_sub(ib.indent), ) { Some(s) => self.result.push_str(&Self::join_block( &s, @@ -710,8 +713,11 @@ impl<'a> CommentRewrite<'a> { } } - if self.fmt.config.wrap_comments() && line.len() > self.fmt.shape.width && !has_url(line) { - match rewrite_string(line, &self.fmt, self.max_chars) { + if self.fmt.config.wrap_comments() + && unicode_str_width(line) > self.fmt.shape.width + && !has_url(line) + { + match rewrite_string(line, &self.fmt, self.max_width) { Some(ref s) => { self.is_prev_line_multi_line = s.contains('\n'); self.result.push_str(s); @@ -721,8 +727,8 @@ impl<'a> CommentRewrite<'a> { // Remove the trailing space, then start rewrite on the next line. self.result.pop(); self.result.push_str(&self.comment_line_separator); - self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent); - match rewrite_string(line, &self.fmt, self.max_chars) { + self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent); + match rewrite_string(line, &self.fmt, self.max_width) { Some(ref s) => { self.is_prev_line_multi_line = s.contains('\n'); self.result.push_str(s); @@ -743,12 +749,12 @@ impl<'a> CommentRewrite<'a> { // 1 = " " let offset = 1 + last_line_width(&self.result) - self.line_start.len(); Shape { - width: self.max_chars.saturating_sub(offset), + width: self.max_width.saturating_sub(offset), indent: self.fmt_indent, offset: self.fmt.shape.offset + offset, } } else { - Shape::legacy(self.max_chars, self.fmt_indent) + Shape::legacy(self.max_width, self.fmt_indent) }; } else { if line.is_empty() && self.result.ends_with(' ') && !is_last { @@ -756,7 +762,7 @@ impl<'a> CommentRewrite<'a> { self.result.pop(); } self.result.push_str(line); - self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent); + self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent); self.is_prev_line_multi_line = false; } diff --git a/src/lib.rs b/src/lib.rs index 751f9acfd66..ce28fce3182 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,7 +29,9 @@ extern crate serde_json; extern crate syntax; extern crate syntax_pos; extern crate toml; +extern crate unicode_categories; extern crate unicode_segmentation; +extern crate unicode_width; use std::cell::RefCell; use std::collections::HashMap; diff --git a/src/overflow.rs b/src/overflow.rs index 4a583fc1de8..8c4bac2a8a5 100644 --- a/src/overflow.rs +++ b/src/overflow.rs @@ -431,7 +431,8 @@ impl<'a> Context<'a> { }; if let Some(rewrite) = rewrite { - let rewrite_first_line = Some(rewrite[..first_line_width(&rewrite)].to_owned()); + // splitn(2, *).next().unwrap() is always safe. + let rewrite_first_line = Some(rewrite.splitn(2, '\n').next().unwrap().to_owned()); last_list_item.item = rewrite_first_line; Some(rewrite) } else { diff --git a/src/string.rs b/src/string.rs index c2e136d9938..dc48a6b13fd 100644 --- a/src/string.rs +++ b/src/string.rs @@ -11,11 +11,12 @@ // Format string literals. use regex::Regex; +use unicode_categories::UnicodeCategories; use unicode_segmentation::UnicodeSegmentation; use config::Config; use shape::Shape; -use utils::wrap_str; +use utils::{unicode_str_width, wrap_str}; const MIN_STRING: usize = 10; @@ -53,7 +54,7 @@ impl<'a> StringFormat<'a> { /// indentation into account. /// /// If we cannot put at least a single character per line, the rewrite won't succeed. - fn max_chars_with_indent(&self) -> Option { + fn max_width_with_indent(&self) -> Option { Some( self.shape .width @@ -62,10 +63,10 @@ impl<'a> StringFormat<'a> { ) } - /// Like max_chars_with_indent but the indentation is not subtracted. + /// Like max_width_with_indent but the indentation is not subtracted. /// This allows to fit more graphemes from the string on a line when /// SnippetState::EndWithLineFeed. - fn max_chars_without_indent(&self) -> Option { + fn max_width_without_indent(&self) -> Option { Some(self.config.max_width().checked_sub(self.line_end.len())?) } } @@ -75,8 +76,8 @@ pub fn rewrite_string<'a>( fmt: &StringFormat<'a>, newline_max_chars: usize, ) -> Option { - let max_chars_with_indent = fmt.max_chars_with_indent()?; - let max_chars_without_indent = fmt.max_chars_without_indent()?; + let max_width_with_indent = fmt.max_width_with_indent()?; + let max_width_without_indent = fmt.max_width_without_indent()?; let indent_with_newline = fmt.shape.indent.to_string_with_newline(fmt.config); let indent_without_newline = fmt.shape.indent.to_string(fmt.config); @@ -99,11 +100,11 @@ pub fn rewrite_string<'a>( // Snip a line at a time from `stripped_str` until it is used up. Push the snippet // onto result. - let mut cur_max_chars = max_chars_with_indent; + let mut cur_max_width = max_width_with_indent; let is_bareline_ok = fmt.line_start.is_empty() || is_whitespace(fmt.line_start); loop { // All the input starting at cur_start fits on the current line - if graphemes.len() - cur_start <= cur_max_chars { + if graphemes_width(&graphemes[cur_start..]) <= cur_max_width { for (i, grapheme) in graphemes[cur_start..].iter().enumerate() { if is_new_line(grapheme) { // take care of blank lines @@ -123,7 +124,7 @@ pub fn rewrite_string<'a>( // The input starting at cur_start needs to be broken match break_string( - cur_max_chars, + cur_max_width, fmt.trim_end, fmt.line_end, &graphemes[cur_start..], @@ -133,7 +134,7 @@ pub fn rewrite_string<'a>( result.push_str(fmt.line_end); result.push_str(&indent_with_newline); result.push_str(fmt.line_start); - cur_max_chars = newline_max_chars; + cur_max_width = newline_max_chars; cur_start += len; } SnippetState::EndWithLineFeed(line, len) => { @@ -143,11 +144,11 @@ pub fn rewrite_string<'a>( result.push_str(&line); if is_bareline_ok { // the next line can benefit from the full width - cur_max_chars = max_chars_without_indent; + cur_max_width = max_width_without_indent; } else { result.push_str(&indent_without_newline); result.push_str(fmt.line_start); - cur_max_chars = max_chars_with_indent; + cur_max_width = max_width_with_indent; } cur_start += len; } @@ -226,9 +227,10 @@ fn not_whitespace_except_line_feed(g: &str) -> bool { is_new_line(g) || !is_whitespace(g) } -/// Break the input string at a boundary character around the offset `max_chars`. A boundary +/// Break the input string at a boundary character around the offset `max_width`. A boundary /// character is either a punctuation or a whitespace. -fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState { +/// FIXME(issue#3281): We must follow UAX#14 algorithm instead of this. +fn break_string(max_width: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState { let break_at = |index /* grapheme at index is included */| { // Take in any whitespaces to the left/right of `input[index]` while // preserving line feeds @@ -272,19 +274,33 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str] } }; + // find a first index where the unicode width of input[0..x] become > max_width + let max_width_index_in_input = { + let mut cur_width = 0; + let mut cur_index = 0; + for (i, grapheme) in input.iter().enumerate() { + cur_width += unicode_str_width(grapheme); + cur_index = i; + if cur_width > max_width { + break; + } + } + cur_index + }; + // Find the position in input for breaking the string if line_end.is_empty() && trim_end - && !is_whitespace(input[max_chars - 1]) - && is_whitespace(input[max_chars]) + && !is_whitespace(input[max_width_index_in_input - 1]) + && is_whitespace(input[max_width_index_in_input]) { // At a breaking point already // The line won't invalidate the rewriting because: // - no extra space needed for the line_end character // - extra whitespaces to the right can be trimmed - return break_at(max_chars - 1); + return break_at(max_width_index_in_input - 1); } - if let Some(url_index_end) = detect_url(input, max_chars) { + if let Some(url_index_end) = detect_url(input, max_width_index_in_input) { let index_plus_ws = url_index_end + input[url_index_end..] .iter() @@ -297,14 +313,15 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str] return SnippetState::LineEnd(input[..=index_plus_ws].concat(), index_plus_ws + 1); }; } - match input[0..max_chars] + + match input[0..max_width_index_in_input] .iter() .rposition(|grapheme| is_whitespace(grapheme)) { // Found a whitespace and what is on its left side is big enough. Some(index) if index >= MIN_STRING => break_at(index), // No whitespace found, try looking for a punctuation instead - _ => match input[0..max_chars] + _ => match input[0..max_width_index_in_input] .iter() .rposition(|grapheme| is_punctuation(grapheme)) { @@ -312,12 +329,12 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str] Some(index) if index >= MIN_STRING => break_at(index), // Either no boundary character was found to the left of `input[max_chars]`, or the line // got too small. We try searching for a boundary character to the right. - _ => match input[max_chars..] + _ => match input[max_width_index_in_input..] .iter() .position(|grapheme| is_whitespace(grapheme) || is_punctuation(grapheme)) { // A boundary was found after the line limit - Some(index) => break_at(max_chars + index), + Some(index) => break_at(max_width_index_in_input + index), // No boundary to the right, the input cannot be broken None => SnippetState::EndOfInput(input.concat()), }, @@ -335,10 +352,11 @@ fn is_whitespace(grapheme: &str) -> bool { } fn is_punctuation(grapheme: &str) -> bool { - match grapheme.as_bytes()[0] { - b':' | b',' | b';' | b'.' => true, - _ => false, - } + grapheme.chars().all(|c| c.is_punctuation_other()) +} + +fn graphemes_width(graphemes: &[&str]) -> usize { + graphemes.iter().map(|s| unicode_str_width(s)).sum() } #[cfg(test)] diff --git a/src/utils.rs b/src/utils.rs index a6a46df6ddc..9efe57e7277 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -26,6 +26,8 @@ use config::Config; use rewrite::RewriteContext; use shape::{Indent, Shape}; +use unicode_width::UnicodeWidthStr; + pub const DEPR_SKIP_ANNOTATION: &str = "rustfmt_skip"; pub const SKIP_ANNOTATION: &str = "rustfmt::skip"; @@ -193,19 +195,13 @@ pub fn is_attributes_extendable(attrs_str: &str) -> bool { // The width of the first line in s. #[inline] pub fn first_line_width(s: &str) -> usize { - match s.find('\n') { - Some(n) => n, - None => s.len(), - } + unicode_str_width(s.splitn(2, '\n').next().unwrap_or("")) } // The width of the last line in s. #[inline] pub fn last_line_width(s: &str) -> usize { - match s.rfind('\n') { - Some(n) => s.len() - n - 1, - None => s.len(), - } + unicode_str_width(s.rsplitn(2, '\n').next().unwrap_or("")) } // The total used width of the last line. @@ -214,16 +210,16 @@ pub fn last_line_used_width(s: &str, offset: usize) -> usize { if s.contains('\n') { last_line_width(s) } else { - offset + s.len() + offset + unicode_str_width(s) } } #[inline] pub fn trimmed_last_line_width(s: &str) -> usize { - match s.rfind('\n') { - Some(n) => s[(n + 1)..].trim().len(), - None => s.trim().len(), - } + unicode_str_width(match s.rfind('\n') { + Some(n) => s[(n + 1)..].trim(), + None => s.trim(), + }) } #[inline] @@ -370,11 +366,15 @@ fn is_valid_str(snippet: &str, max_width: usize, shape: Shape) -> bool { return false; } // If the snippet does not include newline, we are done. - if first_line_width(snippet) == snippet.len() { + if is_single_line(snippet) { return true; } // The other lines must fit within the maximum width. - if snippet.lines().skip(1).any(|line| line.len() > max_width) { + if snippet + .lines() + .skip(1) + .any(|line| unicode_str_width(line) > max_width) + { return false; } // A special check for the last line, since the caller may @@ -593,6 +593,10 @@ impl NodeIdExt for NodeId { } } +pub(crate) fn unicode_str_width(s: &str) -> usize { + s.width() +} + #[cfg(test)] mod test { use super::*; diff --git a/tests/source/comment6.rs b/tests/source/comment6.rs new file mode 100644 index 00000000000..e5d72113ce6 --- /dev/null +++ b/tests/source/comment6.rs @@ -0,0 +1,10 @@ +// rustfmt-wrap_comments: true + +// Pendant la nuit du 9 mars 1860, les nuages, se confondant avec la mer, limitaient à quelques brasses la portée de la vue. +// Sur cette mer démontée, dont les lames déferlaient en projetant des lueurs livides, un léger bâtiment fuyait presque à sec de toile. + +pub mod foo {} + +// ゆく河の流れは絶えずして、しかももとの水にあらず。淀みに浮かぶうたかたは、かつ消えかつ結びて、久しくとどまりたるためしなし。世の中にある人とすみかと、またかくのごとし。 + +pub mod bar {} diff --git a/tests/source/string_punctuation.rs b/tests/source/string_punctuation.rs index efbadebb083..552c461ed34 100644 --- a/tests/source/string_punctuation.rs +++ b/tests/source/string_punctuation.rs @@ -4,4 +4,6 @@ fn main() { println!("ThisIsAReallyLongStringWithNoSpaces.It_should_prefer_to_break_onpunctuation:Likethisssssssssssss"); format!("{}__{}__{}ItShouldOnlyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyNoticeSemicolonsPeriodsColonsAndCommasAndResortToMid-CharBreaksAfterPunctuation{}{}",x,y,z,a,b); println!("aaaaaaaaaaaaaaaaaaaaaaaaaaaaalhijalfhiigjapdighjapdigjapdighdapighapdighpaidhg;adopgihadoguaadbadgad,qeoihapethae8t0aet8haetadbjtaeg;ooeouthaoeutgadlgajduabgoiuadogabudogubaodugbadgadgadga;adoughaoeugbaouea"); + println!("sentuhaesnuthaesnutheasunteahusnaethuseantuihaesntdiastnidaetnuhaideuhsenathe。WeShouldSupportNonAsciiPunctuations§ensuhatheasunteahsuneathusneathuasnuhaesnuhaesnuaethusnaetuheasnuth"); + println!("ThisIsASampleOfCJKString.祇園精舍の鐘の声、諸行無常の響きあり。娑羅双樹の花の色、盛者必衰の理をあらはす。奢れる人も久しからず、ただ春の夜の夢のごとし。猛き者もつひにはほろびぬ、ひとへに風の前の塵に同じ。"); } diff --git a/tests/target/comment6.rs b/tests/target/comment6.rs new file mode 100644 index 00000000000..565fee632f5 --- /dev/null +++ b/tests/target/comment6.rs @@ -0,0 +1,14 @@ +// rustfmt-wrap_comments: true + +// Pendant la nuit du 9 mars 1860, les nuages, se confondant avec la mer, +// limitaient à quelques brasses la portée de la vue. Sur cette mer démontée, +// dont les lames déferlaient en projetant des lueurs livides, un léger bâtiment +// fuyait presque à sec de toile. + +pub mod foo {} + +// ゆく河の流れは絶えずして、しかももとの水にあらず。淀みに浮かぶうたかたは、 +// かつ消えかつ結びて、久しくとどまりたるためしなし。世の中にある人とすみかと、 +// またかくのごとし。 + +pub mod bar {} diff --git a/tests/target/string_punctuation.rs b/tests/target/string_punctuation.rs index 5e574a400cc..0b8ec1b7f39 100644 --- a/tests/target/string_punctuation.rs +++ b/tests/target/string_punctuation.rs @@ -11,4 +11,14 @@ fn main() { adopgihadoguaadbadgad,qeoihapethae8t0aet8haetadbjtaeg;\ ooeouthaoeutgadlgajduabgoiuadogabudogubaodugbadgadgadga;adoughaoeugbaouea" ); + println!( + "sentuhaesnuthaesnutheasunteahusnaethuseantuihaesntdiastnidaetnuhaideuhsenathe。\ + WeShouldSupportNonAsciiPunctuations§\ + ensuhatheasunteahsuneathusneathuasnuhaesnuhaesnuaethusnaetuheasnuth" + ); + println!( + "ThisIsASampleOfCJKString.祇園精舍の鐘の声、諸行無常の響きあり。娑羅双樹の花の色、\ + 盛者必衰の理をあらはす。奢れる人も久しからず、ただ春の夜の夢のごとし。\ + 猛き者もつひにはほろびぬ、ひとへに風の前の塵に同じ。" + ); }