Skip to content

Commit ca612da

Browse files
authored
Merge pull request #131 from jfkthame/isolates-opt
Use a simplified codepath if no bidi isolation controls are present.
2 parents ea27a79 + aa06343 commit ca612da

File tree

4 files changed

+295
-103
lines changed

4 files changed

+295
-103
lines changed

src/explicit.rs

+30-1
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,25 @@ use super::char_data::{
1919
BidiClass::{self, *},
2020
};
2121
use super::level::Level;
22+
use super::prepare::removed_by_x9;
23+
use super::LevelRunVec;
2224
use super::TextSource;
2325

24-
/// Compute explicit embedding levels for one paragraph of text (X1-X8).
26+
/// Compute explicit embedding levels for one paragraph of text (X1-X8), and identify
27+
/// level runs (BD7) for use when determining Isolating Run Sequences (X10).
2528
///
2629
/// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`,
2730
/// for each char in `text`.
31+
///
32+
/// On return, the level runs (BD7) found in the text have been appended to `runs`.
2833
#[cfg_attr(feature = "flame_it", flamer::flame)]
2934
pub fn compute<'a, T: TextSource<'a> + ?Sized>(
3035
text: &'a T,
3136
para_level: Level,
3237
original_classes: &[BidiClass],
3338
levels: &mut [Level],
3439
processing_classes: &mut [BidiClass],
40+
runs: &mut LevelRunVec,
3541
) {
3642
assert_eq!(text.len(), original_classes.len());
3743

@@ -51,6 +57,9 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>(
5157
let mut overflow_embedding_count = 0u32;
5258
let mut valid_isolate_count = 0u32;
5359

60+
let mut current_run_level = Level::ltr();
61+
let mut current_run_start = 0;
62+
5463
for (i, len) in text.indices_lengths() {
5564
let last = stack.last().unwrap();
5665

@@ -182,6 +191,26 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>(
182191
levels[i + j] = levels[i];
183192
processing_classes[i + j] = processing_classes[i];
184193
}
194+
195+
// Identify level runs to be passed to prepare::isolating_run_sequences().
196+
if i == 0 {
197+
// Initialize for the first (or only) run.
198+
current_run_level = levels[i];
199+
} else {
200+
// Check if we need to start a new level run.
201+
// <https://www.unicode.org/reports/tr9/#BD7>
202+
if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
203+
// End the last run and start a new one.
204+
runs.push(current_run_start..i);
205+
current_run_level = levels[i];
206+
current_run_start = i;
207+
}
208+
}
209+
}
210+
211+
// Append the trailing level run, if non-empty.
212+
if levels.len() > current_run_start {
213+
runs.push(current_run_start..levels.len());
185214
}
186215
}
187216

src/lib.rs

+83-21
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ mod prepare;
8888
pub use crate::char_data::{BidiClass, UNICODE_VERSION};
8989
pub use crate::data_source::BidiDataSource;
9090
pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL};
91-
pub use crate::prepare::LevelRun;
91+
pub use crate::prepare::{LevelRun, LevelRunVec};
9292

9393
#[cfg(feature = "hardcoded-data")]
9494
pub use crate::char_data::{bidi_class, HardcodedBidiData};
@@ -248,8 +248,14 @@ struct InitialInfoExt<'text> {
248248

249249
/// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
250250
/// requires no further bidi processing (i.e. there are no RTL characters or bidi
251-
/// control codes present).
252-
pure_ltr: Vec<bool>,
251+
/// control codes present), and whether any bidi isolation controls are present.
252+
flags: Vec<ParagraphInfoFlags>,
253+
}
254+
255+
#[derive(PartialEq, Debug)]
256+
struct ParagraphInfoFlags {
257+
is_pure_ltr: bool,
258+
has_isolate_controls: bool,
253259
}
254260

255261
impl<'text> InitialInfoExt<'text> {
@@ -269,12 +275,12 @@ impl<'text> InitialInfoExt<'text> {
269275
default_para_level: Option<Level>,
270276
) -> InitialInfoExt<'a> {
271277
let mut paragraphs = Vec::<ParagraphInfo>::new();
272-
let mut pure_ltr = Vec::<bool>::new();
273-
let (original_classes, _, _) = compute_initial_info(
278+
let mut flags = Vec::<ParagraphInfoFlags>::new();
279+
let (original_classes, _, _, _) = compute_initial_info(
274280
data_source,
275281
text,
276282
default_para_level,
277-
Some((&mut paragraphs, &mut pure_ltr)),
283+
Some((&mut paragraphs, &mut flags)),
278284
);
279285

280286
InitialInfoExt {
@@ -283,7 +289,7 @@ impl<'text> InitialInfoExt<'text> {
283289
original_classes,
284290
paragraphs,
285291
},
286-
pure_ltr,
292+
flags,
287293
}
288294
}
289295
}
@@ -299,8 +305,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
299305
data_source: &D,
300306
text: &'a T,
301307
default_para_level: Option<Level>,
302-
mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<bool>)>,
303-
) -> (Vec<BidiClass>, Level, bool) {
308+
mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<ParagraphInfoFlags>)>,
309+
) -> (Vec<BidiClass>, Level, bool, bool) {
304310
let mut original_classes = Vec::with_capacity(text.len());
305311

306312
// The stack contains the starting code unit index for each nested isolate we're inside.
@@ -310,8 +316,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
310316
let mut isolate_stack = Vec::new();
311317

312318
debug_assert!(
313-
if let Some((ref paragraphs, ref pure_ltr)) = split_paragraphs {
314-
paragraphs.is_empty() && pure_ltr.is_empty()
319+
if let Some((ref paragraphs, ref flags)) = split_paragraphs {
320+
paragraphs.is_empty() && flags.is_empty()
315321
} else {
316322
true
317323
}
@@ -323,6 +329,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
323329
// Per-paragraph flag: can subsequent processing be skipped? Set to false if any
324330
// RTL characters or bidi control characters are encountered in the paragraph.
325331
let mut is_pure_ltr = true;
332+
// Set to true if any bidi isolation controls are present in the paragraph.
333+
let mut has_isolate_controls = false;
326334

327335
#[cfg(feature = "flame_it")]
328336
flame::start("compute_initial_info(): iter text.char_indices()");
@@ -341,7 +349,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
341349

342350
match class {
343351
B => {
344-
if let Some((ref mut paragraphs, ref mut pure_ltr)) = split_paragraphs {
352+
if let Some((ref mut paragraphs, ref mut flags)) = split_paragraphs {
345353
// P1. Split the text into separate paragraphs. The paragraph separator is kept
346354
// with the previous paragraph.
347355
let para_end = i + len;
@@ -350,14 +358,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
350358
// P3. If no character is found in P2, set the paragraph level to zero.
351359
level: para_level.unwrap_or(LTR_LEVEL),
352360
});
353-
pure_ltr.push(is_pure_ltr);
361+
flags.push(ParagraphInfoFlags {
362+
is_pure_ltr,
363+
has_isolate_controls,
364+
});
354365
// Reset state for the start of the next paragraph.
355366
para_start = para_end;
356367
// TODO: Support defaulting to direction of previous paragraph
357368
//
358369
// <http://www.unicode.org/reports/tr9/#HL1>
359370
para_level = default_para_level;
360371
is_pure_ltr = true;
372+
has_isolate_controls = false;
361373
isolate_stack.clear();
362374
}
363375
}
@@ -394,6 +406,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
394406

395407
RLI | LRI | FSI => {
396408
is_pure_ltr = false;
409+
has_isolate_controls = true;
397410
isolate_stack.push(i);
398411
}
399412

@@ -405,15 +418,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
405418
}
406419
}
407420

408-
if let Some((paragraphs, pure_ltr)) = split_paragraphs {
421+
if let Some((paragraphs, flags)) = split_paragraphs {
409422
if para_start < text.len() {
410423
paragraphs.push(ParagraphInfo {
411424
range: para_start..text.len(),
412425
level: para_level.unwrap_or(LTR_LEVEL),
413426
});
414-
pure_ltr.push(is_pure_ltr);
427+
flags.push(ParagraphInfoFlags {
428+
is_pure_ltr,
429+
has_isolate_controls,
430+
});
415431
}
416-
debug_assert_eq!(paragraphs.len(), pure_ltr.len());
432+
debug_assert_eq!(paragraphs.len(), flags.len());
417433
}
418434
debug_assert_eq!(original_classes.len(), text.len());
419435

@@ -424,6 +440,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
424440
original_classes,
425441
para_level.unwrap_or(LTR_LEVEL),
426442
is_pure_ltr,
443+
has_isolate_controls,
427444
)
428445
}
429446

@@ -482,20 +499,21 @@ impl<'text> BidiInfo<'text> {
482499
text: &'a str,
483500
default_para_level: Option<Level>,
484501
) -> BidiInfo<'a> {
485-
let InitialInfoExt { base, pure_ltr, .. } =
502+
let InitialInfoExt { base, flags, .. } =
486503
InitialInfoExt::new_with_data_source(data_source, text, default_para_level);
487504

488505
let mut levels = Vec::<Level>::with_capacity(text.len());
489506
let mut processing_classes = base.original_classes.clone();
490507

491-
for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) {
508+
for (para, flags) in base.paragraphs.iter().zip(flags.iter()) {
492509
let text = &text[para.range.clone()];
493510
let original_classes = &base.original_classes[para.range.clone()];
494511

495512
compute_bidi_info_for_para(
496513
data_source,
497514
para,
498-
*is_pure_ltr,
515+
flags.is_pure_ltr,
516+
flags.has_isolate_controls,
499517
text,
500518
original_classes,
501519
&mut processing_classes,
@@ -720,7 +738,7 @@ impl<'text> ParagraphBidiInfo<'text> {
720738
) -> ParagraphBidiInfo<'a> {
721739
// Here we could create a ParagraphInitialInfo struct to parallel the one
722740
// used by BidiInfo, but there doesn't seem to be any compelling reason for it.
723-
let (original_classes, paragraph_level, is_pure_ltr) =
741+
let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) =
724742
compute_initial_info(data_source, text, default_para_level, None);
725743

726744
let mut levels = Vec::<Level>::with_capacity(text.len());
@@ -738,6 +756,7 @@ impl<'text> ParagraphBidiInfo<'text> {
738756
data_source,
739757
&para_info,
740758
is_pure_ltr,
759+
has_isolate_controls,
741760
text,
742761
&original_classes,
743762
&mut processing_classes,
@@ -1066,6 +1085,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
10661085
data_source: &D,
10671086
para: &ParagraphInfo,
10681087
is_pure_ltr: bool,
1088+
has_isolate_controls: bool,
10691089
text: &'a T,
10701090
original_classes: &[BidiClass],
10711091
processing_classes: &mut [BidiClass],
@@ -1079,16 +1099,26 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
10791099

10801100
let processing_classes = &mut processing_classes[para.range.clone()];
10811101
let levels = &mut levels[para.range.clone()];
1102+
let mut level_runs = LevelRunVec::new();
10821103

10831104
explicit::compute(
10841105
text,
10851106
para.level,
10861107
original_classes,
10871108
levels,
10881109
processing_classes,
1110+
&mut level_runs,
10891111
);
10901112

1091-
let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
1113+
let mut sequences = prepare::IsolatingRunSequenceVec::new();
1114+
prepare::isolating_run_sequences(
1115+
para.level,
1116+
original_classes,
1117+
levels,
1118+
level_runs,
1119+
has_isolate_controls,
1120+
&mut sequences,
1121+
);
10921122
for sequence in &sequences {
10931123
implicit::resolve_weak(text, sequence, processing_classes);
10941124
implicit::resolve_neutral(
@@ -1100,6 +1130,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
11001130
processing_classes,
11011131
);
11021132
}
1133+
11031134
implicit::resolve_levels(processing_classes, levels);
11041135

11051136
assign_levels_to_removed_chars(para.level, original_classes, levels);
@@ -1549,6 +1580,24 @@ mod tests {
15491580
#[cfg(feature = "hardcoded-data")]
15501581
fn test_process_text() {
15511582
let tests = vec![
1583+
(
1584+
// text
1585+
"",
1586+
// base level
1587+
Some(RTL_LEVEL),
1588+
// levels
1589+
Level::vec(&[]),
1590+
// original_classes
1591+
vec![],
1592+
// paragraphs
1593+
vec![],
1594+
// levels_u16
1595+
Level::vec(&[]),
1596+
// original_classes_u16
1597+
vec![],
1598+
// paragraphs_u16
1599+
vec![],
1600+
),
15521601
(
15531602
// text
15541603
"abc123",
@@ -1710,6 +1759,19 @@ mod tests {
17101759
paragraphs: t.4.clone(),
17111760
}
17121761
);
1762+
// If it was empty, also test that ParagraphBidiInfo handles it safely.
1763+
if t.4.len() == 0 {
1764+
assert_eq!(
1765+
ParagraphBidiInfo::new(t.0, t.1),
1766+
ParagraphBidiInfo {
1767+
text: t.0,
1768+
original_classes: t.3.clone(),
1769+
levels: t.2.clone(),
1770+
paragraph_level: RTL_LEVEL,
1771+
is_pure_ltr: true,
1772+
}
1773+
)
1774+
}
17131775
// If it was a single paragraph, also test ParagraphBidiInfo.
17141776
if t.4.len() == 1 {
17151777
assert_eq!(

0 commit comments

Comments
 (0)