@@ -19,6 +19,7 @@ use imara_diff::{
 };
 use pulldown_cmark_escape::FmtWriter;
 use regex::Regex;
+use unicode_segmentation::UnicodeSegmentation;
 
 use crate::github::GithubCompare;
 use crate::utils::is_repo_autorized;
@@ -249,18 +250,34 @@ fn process_old_new(
         background-color: rgba(150, 255, 150, 1);
         white-space: pre;
     }}
-    .removed-line-after {{
+    .line-removed-after {{
         color: rgb(220, 0, 0)
     }}
-    .added-line-after {{
+    .line-added-after {{
         color: rgb(0, 73, 0)
     }}
-    .removed-line-before {{
+    .line-removed-before {{
         color: rgb(192, 78, 76)
     }}
-    .added-line-before {{
+    .line-added-before {{
         color: rgb(63, 128, 94)
     }}
+    .word-removed-after {{
+        color: white;
+        background-color: rgb(220, 0, 0);
+    }}
+    .word-added-after {{
+        color: white;
+        background-color: rgb(0, 73, 0);
+    }}
+    .word-removed-before {{
+        color: white;
+        background-color: rgb(192, 78, 76);
+    }}
+    .word-added-before {{
+        color: white;
+        background-color: rgb(63, 128, 94);
+    }}
     @media (prefers-color-scheme: dark) {{
         body {{
             background: #0C0C0C;
@@ -277,18 +294,34 @@ fn process_old_new(
             background-color: rgba(70, 120, 70, 1);
             white-space: pre;
         }}
-        .removed-line-after {{
+        .line-removed-after {{
             color: rgba(255, 0, 0, 1);
         }}
-        .added-line-after {{
+        .line-added-after {{
             color: rgba(0, 255, 0, 1);
         }}
-        .removed-line-before {{
+        .line-removed-before {{
             color: rgba(100, 0, 0, 1);
         }}
-        .added-line-before {{
+        .line-added-before {{
             color: rgba(0, 100, 0, 1);
         }}
+        .word-removed-after {{
+            color: black;
+            background-color: rgba(255, 0, 0, 1);
+        }}
+        .word-added-after {{
+            color: black;
+            background-color: rgba(0, 255, 0, 1);
+        }}
+        .word-removed-before {{
+            color: black;
+            background-color: rgba(100, 0, 0, 1);
+        }}
+        .word-added-before {{
+            color: black;
+            background-color: rgba(0, 100, 0, 1);
+        }}
     }}
 </style>
 </head>
@@ -400,6 +433,7 @@ fn process_old_new(
 const REMOVED_BLOCK_SIGN: &str = r#"<span class="removed-block"> - </span>"#;
 const ADDED_BLOCK_SIGN: &str = r#"<span class="added-block"> + </span>"#;
 
+#[derive(Copy, Clone)]
 enum HunkTokenStatus {
     Added,
     Removed,
@@ -408,39 +442,56 @@ enum HunkTokenStatus {
 struct HtmlDiffPrinter<'a>(pub &'a Interner<&'a str>);
 
 impl HtmlDiffPrinter<'_> {
-    fn handle_hunk_token(
+    fn handle_hunk_line<'a>(
         &self,
         mut f: impl fmt::Write,
         hunk_token_status: HunkTokenStatus,
-        token: &str,
+        words: impl Iterator<Item = (&'a str, bool)>,
     ) -> fmt::Result {
         // Show the hunk status
         match hunk_token_status {
             HunkTokenStatus::Added => write!(f, "{ADDED_BLOCK_SIGN} ")?,
             HunkTokenStatus::Removed => write!(f, "{REMOVED_BLOCK_SIGN} ")?,
         };
 
-        let is_add = token.starts_with('+');
-        let is_remove = token.starts_with('-');
+        let mut words = words.peekable();
+
+        let first_word = words.peek();
+        let is_add = first_word.map(|w| w.0.starts_with('+')).unwrap_or_default();
+        let is_remove = first_word.map(|w| w.0.starts_with('-')).unwrap_or_default();
 
         // Highlight in the same way as `git range-diff` does for diff-lines
-        // that changed. (Contrary to `git range-diff` we don't color unchanged
+        // that changed. In addition, we also do word highlighting.
+        //
+        // (Contrary to `git range-diff` we don't color unchanged
         // diff lines though, since then the coloring distracts from what is
         // relevant.)
         if is_add || is_remove {
-            let class = match (hunk_token_status, is_add) {
-                (HunkTokenStatus::Removed, true) => "added-line-before",
-                (HunkTokenStatus::Removed, false) => "removed-line-before",
-                (HunkTokenStatus::Added, true) => "added-line-after",
-                (HunkTokenStatus::Added, false) => "removed-line-after",
+            let prefix_class = match (hunk_token_status, is_add) {
+                (HunkTokenStatus::Removed, true) => "added-before",
+                (HunkTokenStatus::Removed, false) => "removed-before",
+                (HunkTokenStatus::Added, true) => "added-after",
+                (HunkTokenStatus::Added, false) => "removed-after",
             };
+            write!(f, r#"<span class="line-{prefix_class}">"#)?;
+
+            for (word, changed) in words {
+                if changed {
+                    write!(f, r#"<span class="word-{prefix_class}">"#)?;
+                    pulldown_cmark_escape::escape_html(FmtWriter(&mut f), word)?;
+                    write!(f, "</span>")?;
+                } else {
+                    pulldown_cmark_escape::escape_html(FmtWriter(&mut f), word)?;
+                }
+            }
 
-            write!(f, r#"<span class="{class}">"#)?;
-            pulldown_cmark_escape::escape_html(FmtWriter(&mut f), token)?;
             write!(f, "</span>")?;
         } else {
-            pulldown_cmark_escape::escape_html(FmtWriter(&mut f), token)?;
+            for (word, _status) in words {
+                pulldown_cmark_escape::escape_html(FmtWriter(&mut f), word)?;
+            }
         }
+
         Ok(())
     }
 }
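
As an illustration of the markup this produces (not part of the patch): for an added diff line `+foo bar` whose only changed word is `bar`, `handle_hunk_line` would emit roughly

    <span class="added-block"> + </span> <span class="line-added-after">+foo <span class="word-added-after">bar</span></span>

so the changed word gets the `word-*` background while the rest of the line keeps only the `line-*` foreground color.
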
@@ -474,23 +525,82 @@ impl UnifiedDiffPrinter for HtmlDiffPrinter<'_> {
         before: &[Token],
         after: &[Token],
     ) -> fmt::Result {
-        if let Some(&last) = before.last() {
-            for &token in before {
-                let token = self.0[token];
-                self.handle_hunk_token(&mut f, HunkTokenStatus::Removed, token)?;
+        // To improve on the line-by-line diff we also want to do a sort of `git --word-diff`
+        // (aka word highlighting). To achieve word highlighting, we only consider hunks that
+        // have the same number of lines removed and added, otherwise it's much more complex
+        // to link the changes together.
+
+        if before.len() == after.len() {
+            // Same number of lines before and after, so we can do word-highlighting.
+
+            // Diff the individual lines together.
+            let diffs_and_inputs: Vec<_> = before
+                .into_iter()
+                .zip(after.into_iter())
+                .map(|(b_token, a_token)| {
+                    // Split both lines by words and intern them.
+                    let input: InternedInput<&str> = InternedInput::new(
+                        SplitWordBoundaries(self.0[*b_token]),
+                        SplitWordBoundaries(self.0[*a_token]),
+                    );
+
+                    // Compute the (word) diff
+                    let diff = Diff::compute(Algorithm::Histogram, &input);
+
+                    (diff, input)
+                })
+                .collect();
+
+            // Process all before lines first
+            for (diff, input) in diffs_and_inputs.iter() {
+                self.handle_hunk_line(
+                    &mut f,
+                    HunkTokenStatus::Removed,
+                    input.before.iter().enumerate().map(|(b_pos, b_token)| {
+                        (input.interner[*b_token], diff.is_removed(b_pos as u32))
+                    }),
+                )?;
             }
-            if !self.0[last].ends_with('\n') {
-                writeln!(f)?;
-            }
-        }
 
-        if let Some(&last) = after.last() {
-            for &token in after {
-                let token = self.0[token];
-                self.handle_hunk_token(&mut f, HunkTokenStatus::Added, token)?;
+            // Then process all after lines
+            for (diff, input) in diffs_and_inputs.iter() {
+                self.handle_hunk_line(
+                    &mut f,
+                    HunkTokenStatus::Added,
+                    input.after.iter().enumerate().map(|(a_pos, a_token)| {
+                        (input.interner[*a_token], diff.is_added(a_pos as u32))
+                    }),
+                )?;
+            }
+        } else {
+            // Can't do word-highlighting, simply print each line.
+
+            if let Some(&last) = before.last() {
+                for &token in before {
+                    let token = self.0[token];
+                    self.handle_hunk_line(
+                        &mut f,
+                        HunkTokenStatus::Removed,
+                        std::iter::once((token, false)),
+                    )?;
+                }
+                if !self.0[last].ends_with('\n') {
+                    writeln!(f)?;
+                }
             }
-            if !self.0[last].ends_with('\n') {
-                writeln!(f)?;
+
+            if let Some(&last) = after.last() {
+                for &token in after {
+                    let token = self.0[token];
+                    self.handle_hunk_line(
+                        &mut f,
+                        HunkTokenStatus::Added,
+                        std::iter::once((token, false)),
+                    )?;
+                }
+                if !self.0[last].ends_with('\n') {
+                    writeln!(f)?;
+                }
             }
         }
         Ok(())
@@ -514,3 +624,20 @@ fn bookmarklet(host: &str) -> String {
 }})();"
     )
 }
+
+// Simple abstraction over `unicode_segmentation::split_word_bounds` for `imara_diff::TokenSource`
+struct SplitWordBoundaries<'a>(&'a str);
+
+impl<'a> imara_diff::TokenSource for SplitWordBoundaries<'a> {
+    type Token = &'a str;
+    type Tokenizer = unicode_segmentation::UWordBounds<'a>;
+
+    fn tokenize(&self) -> Self::Tokenizer {
+        self.0.split_word_bounds()
+    }
+
+    fn estimate_tokens(&self) -> u32 {
+        // https://www.wyliecomm.com/2021/11/whats-the-best-length-of-a-word-online/
+        (self.0.len() as f32 / 4.7f32) as u32
+    }
+}
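
A rough usage sketch (not part of the patch; `word_changes` is a hypothetical name): `SplitWordBoundaries` plugs into imara_diff the same way the hunk-printing code above does, producing the `(word, changed)` pairs that `handle_hunk_line` consumes for the "after" side. It assumes the file's existing `use imara_diff::{...}` imports (`InternedInput`, `Diff`, `Algorithm`).

    // Sketch only: mirrors the per-line word diff computed in the hunk code above.
    fn word_changes<'a>(before: &'a str, after: &'a str) -> Vec<(&'a str, bool)> {
        // Intern the word-level tokens of both lines.
        let input: InternedInput<&str> =
            InternedInput::new(SplitWordBoundaries(before), SplitWordBoundaries(after));
        // Word-level diff with the same algorithm used for the line diff.
        let diff = Diff::compute(Algorithm::Histogram, &input);
        // Pair each "after" word with whether the diff marked it as added.
        input
            .after
            .iter()
            .enumerate()
            .map(|(pos, tok)| (input.interner[*tok], diff.is_added(pos as u32)))
            .collect()
    }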