From 2f519c3dd050bddd551e84bcd612ff12062f0cf5 Mon Sep 17 00:00:00 2001 From: tompng Date: Tue, 18 Jun 2024 02:19:50 +0900 Subject: [PATCH] Make Reline::Unicode's vi_ ed_ em_ method encoding safe --- lib/reline/unicode.rb | 92 ++++++++++++++++++++----------------- test/reline/test_unicode.rb | 45 ++++++++++++++---- 2 files changed, 87 insertions(+), 50 deletions(-) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index 87382ad86e..9e6ec04545 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -269,29 +269,29 @@ def self.get_prev_mbchar_size(line, byte_pointer) def self.em_forward_word(line, byte_pointer) gcs = line.byteslice(byte_pointer..).grapheme_clusters - nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) } - words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) } + nonwords = gcs.take_while { |c| !word_character?(c) } + words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) } nonwords.sum(&:bytesize) + words.sum(&:bytesize) end def self.em_forward_word_with_capitalization(line, byte_pointer) gcs = line.byteslice(byte_pointer..).grapheme_clusters - nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) } - words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) } + nonwords = gcs.take_while { |c| !word_character?(c) } + words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) } [nonwords.sum(&:bytesize) + words.sum(&:bytesize), nonwords.join + words.join.capitalize] end def self.em_backward_word(line, byte_pointer) gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse - nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) } - words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) } + nonwords = gcs.take_while { |c| !word_character?(c) } + words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) } nonwords.sum(&:bytesize) + words.sum(&:bytesize) end def self.em_big_backward_word(line, byte_pointer) gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse - spaces = gcs.take_while { |c| c.match?(/\s/) } - nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) } + spaces = gcs.take_while { |c| space_character?(c) } + nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) } spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize) end @@ -299,20 +299,19 @@ def self.ed_transpose_words(line, byte_pointer) gcs = line.byteslice(0, byte_pointer).grapheme_clusters pos = gcs.size gcs += line.byteslice(byte_pointer..).grapheme_clusters - gcs.map! { |c| c.encode(Encoding::UTF_8) } - pos += 1 while pos < gcs.size && gcs[pos].match?(/\P{Word}/) + pos += 1 while pos < gcs.size && !word_character?(gcs[pos]) if pos == gcs.size # 'aaa bbb [cursor] ' - pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/) + pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1]) second_word_end = gcs.size else # 'aaa [cursor]bbb' - pos += 1 while pos < gcs.size && gcs[pos].match?(/\p{Word}/) + pos += 1 while pos < gcs.size && word_character?(gcs[pos]) second_word_end = pos end - pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/) + pos -= 1 while pos > 0 && word_character?(gcs[pos - 1]) second_word_start = pos - pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/) + pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1]) first_word_end = pos - pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/) + pos -= 1 while pos > 0 && word_character?(gcs[pos - 1]) first_word_start = pos [first_word_start, first_word_end, second_word_start, second_word_end].map do |idx| @@ -322,16 +321,16 @@ def self.ed_transpose_words(line, byte_pointer) def self.vi_big_forward_word(line, byte_pointer) gcs = line.byteslice(byte_pointer..).grapheme_clusters - nonspaces = gcs.take_while { |c| c.match?(/\S/) } - spaces = gcs.drop(nonspaces.size).take_while { |c| c.match?(/\s/) } + nonspaces = gcs.take_while { |c| !space_character?(c) } + spaces = gcs.drop(nonspaces.size).take_while { |c| space_character?(c) } nonspaces.sum(&:bytesize) + spaces.sum(&:bytesize) end def self.vi_big_forward_end_word(line, byte_pointer) gcs = line.byteslice(byte_pointer..).grapheme_clusters first = gcs.shift(1) - spaces = gcs.take_while { |c| c.match?(/\s/) } - nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) } + spaces = gcs.take_while { |c| space_character?(c) } + nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) } matched = spaces + nonspaces matched.pop first.sum(&:bytesize) + matched.sum(&:bytesize) @@ -339,60 +338,71 @@ def self.vi_big_forward_end_word(line, byte_pointer) def self.vi_big_backward_word(line, byte_pointer) gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse - spaces = gcs.take_while { |c| c.match?(/\s/) } - nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) } + spaces = gcs.take_while { |c| space_character?(c) } + nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) } spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize) end def self.vi_forward_word(line, byte_pointer, drop_terminate_spaces = false) - gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) } + gcs = line.byteslice(byte_pointer..).grapheme_clusters return 0 if gcs.empty? - regexp = - case gcs.first - when /\p{Word}/ - /\p{Word}/ - when /\s/ - /\s/ + c = gcs.first + matched = + if word_character?(c) + gcs.take_while { |c| word_character?(c) } + elsif space_character?(c) + gcs.take_while { |c| space_character?(c) } else - /[^\p{Word}\s]/ + gcs.take_while { |c| !word_character?(c) && !space_character?(c) } end - matched = gcs.take_while { |c| c.match?(regexp) } + return matched.sum(&:bytesize) if drop_terminate_spaces - spaces = gcs.drop(matched.size).take_while { |c| c.match?(/\s/) } + spaces = gcs.drop(matched.size).take_while { |c| space_character?(c) } matched.sum(&:bytesize) + spaces.sum(&:bytesize) end def self.vi_forward_end_word(line, byte_pointer) - gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) } + gcs = line.byteslice(byte_pointer..).grapheme_clusters return 0 if gcs.empty? return gcs.first.bytesize if gcs.size == 1 start = gcs.shift skips = [start] - if start.match?(/\s/) || gcs.first.match?(/\s/) - spaces = gcs.take_while { |c| c.match?(/\s/) } + if space_character?(start) || space_character?(gcs.first) + spaces = gcs.take_while { |c| space_character?(c) } skips += spaces gcs.shift(spaces.size) end - regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/ - matched = gcs.take_while { |c| c.match?(regexp) } + start_with_word = word_character?(gcs.first) + matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) } matched.pop skips.sum(&:bytesize) + matched.sum(&:bytesize) end def self.vi_backward_word(line, byte_pointer) - gcs = line.byteslice(0, byte_pointer).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }.reverse - spaces = gcs.take_while { |c| c.match?(/\s/) } + gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse + spaces = gcs.take_while { |c| space_character?(c) } gcs.shift(spaces.size) - regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/ - spaces.sum(&:bytesize) + gcs.take_while { |c| c.match?(regexp) }.sum(&:bytesize) + start_with_word = word_character?(gcs.first) + matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) } + spaces.sum(&:bytesize) + matched.sum(&:bytesize) end def self.vi_first_print(line) gcs = line.grapheme_clusters - spaces = gcs.take_while { |c| c.match?(/\s/) } + spaces = gcs.take_while { |c| space_character?(c) } spaces.sum(&:bytesize) end + + def self.word_character?(s) + s.encode(Encoding::UTF_8).match?(/\p{Word}/) if s + rescue Encoding::UndefinedConversionError + false + end + + def self.space_character?(s) + s.match?(/\s/) if s + end end diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb index 24ea688404..d16575b05b 100644 --- a/test/reline/test_unicode.rb +++ b/test/reline/test_unicode.rb @@ -92,6 +92,7 @@ def test_take_mbchar_range def test_em_forward_word assert_equal(12, Reline::Unicode.em_forward_word('abc---fooあbar-baz', 3)) + assert_equal(11, Reline::Unicode.em_forward_word('abc---fooあbar-baz'.encode('sjis'), 3)) assert_equal(3, Reline::Unicode.em_forward_word('abcfoo', 3)) assert_equal(3, Reline::Unicode.em_forward_word('abc---', 3)) assert_equal(0, Reline::Unicode.em_forward_word('abc', 3)) @@ -99,6 +100,7 @@ def test_em_forward_word def test_em_forward_word_with_capitalization assert_equal([12, '---Fooあbar'], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz', 3)) + assert_equal([11, '---Fooあbar'.encode('sjis')], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz'.encode('sjis'), 3)) assert_equal([3, 'Foo'], Reline::Unicode.em_forward_word_with_capitalization('abcfOo', 3)) assert_equal([3, '---'], Reline::Unicode.em_forward_word_with_capitalization('abc---', 3)) assert_equal([0, ''], Reline::Unicode.em_forward_word_with_capitalization('abc', 3)) @@ -107,6 +109,7 @@ def test_em_forward_word_with_capitalization def test_em_backward_word assert_equal(12, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz', 20)) + assert_equal(11, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19)) assert_equal(2, Reline::Unicode.em_backward_word(' ', 2)) assert_equal(2, Reline::Unicode.em_backward_word('ab', 2)) assert_equal(0, Reline::Unicode.em_backward_word('ab', 0)) @@ -114,6 +117,7 @@ def test_em_backward_word def test_em_big_backward_word assert_equal(16, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz', 20)) + assert_equal(15, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19)) assert_equal(2, Reline::Unicode.em_big_backward_word(' ', 2)) assert_equal(2, Reline::Unicode.em_big_backward_word('ab', 2)) assert_equal(0, Reline::Unicode.em_big_backward_word('ab', 0)) @@ -129,20 +133,20 @@ def test_ed_transpose_words assert_equal([3, 5, 6, 8], Reline::Unicode.ed_transpose_words('aa bb cc ', 7)) assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 8)) assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 9)) - word1 = 'fooあ' - word2 = 'barあbaz' - left = 'aaa -' - middle = '- -' - right = '- bbb' - expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize] - assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize)) - assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize)) - assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1)) + ['sjis', 'utf-8'].each do |encoding| + texts = ['fooあ', 'barあbaz', 'aaa -', '- -', '- bbb'] + word1, word2, left, middle, right = texts.map { |text| text.encode(encoding) } + expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize] + assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize)) + assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize)) + assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1)) + end end def test_vi_big_forward_word assert_equal(18, Reline::Unicode.vi_big_forward_word('abc---fooあbar-baz xyz', 3)) assert_equal(8, Reline::Unicode.vi_big_forward_word('abcfooあ --', 3)) + assert_equal(7, Reline::Unicode.vi_big_forward_word('abcfooあ --'.encode('sjis'), 3)) assert_equal(6, Reline::Unicode.vi_big_forward_word('abcfooあ', 3)) assert_equal(3, Reline::Unicode.vi_big_forward_word('abc- ', 3)) assert_equal(0, Reline::Unicode.vi_big_forward_word('abc', 3)) @@ -156,6 +160,7 @@ def test_vi_big_forward_end_word assert_equal(1, Reline::Unicode.vi_big_forward_end_word('aa b', 0)) assert_equal(3, Reline::Unicode.vi_big_forward_end_word(' aa b', 0)) assert_equal(15, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz', 3)) + assert_equal(14, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz'.encode('sjis'), 3)) assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ --', 3)) assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ', 3)) assert_equal(2, Reline::Unicode.vi_big_forward_end_word('abc- ', 3)) @@ -164,6 +169,7 @@ def test_vi_big_forward_end_word def test_vi_big_backward_word assert_equal(16, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz', 20)) + assert_equal(15, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19)) assert_equal(2, Reline::Unicode.vi_big_backward_word(' ', 2)) assert_equal(2, Reline::Unicode.vi_big_backward_word('ab', 2)) assert_equal(0, Reline::Unicode.vi_big_backward_word('ab', 0)) @@ -172,6 +178,7 @@ def test_vi_big_backward_word def test_vi_forward_word assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 3)) assert_equal(9, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6)) + assert_equal(8, Reline::Unicode.vi_forward_word('abc---fooあbar-baz'.encode('sjis'), 6)) assert_equal(6, Reline::Unicode.vi_forward_word('abcfooあ', 3)) assert_equal(3, Reline::Unicode.vi_forward_word('abc---', 3)) assert_equal(0, Reline::Unicode.vi_forward_word('abc', 3)) @@ -180,6 +187,7 @@ def test_vi_forward_word def test_vi_forward_end_word assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 3)) assert_equal(8, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6)) + assert_equal(7, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz'.encode('sjis'), 6)) assert_equal(3, Reline::Unicode.vi_forward_end_word('abcfooあ', 3)) assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---', 3)) assert_equal(0, Reline::Unicode.vi_forward_end_word('abc', 3)) @@ -188,6 +196,7 @@ def test_vi_forward_end_word def test_vi_backward_word assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 20)) assert_equal(9, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17)) + assert_equal(8, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 16)) assert_equal(2, Reline::Unicode.vi_backward_word(' ', 2)) assert_equal(2, Reline::Unicode.vi_backward_word('ab', 2)) assert_equal(0, Reline::Unicode.vi_backward_word('ab', 0)) @@ -197,6 +206,24 @@ def test_vi_first_print assert_equal(3, Reline::Unicode.vi_first_print(' abcdefg')) assert_equal(3, Reline::Unicode.vi_first_print(' ')) assert_equal(0, Reline::Unicode.vi_first_print('abc')) + assert_equal(0, Reline::Unicode.vi_first_print('あ')) + assert_equal(0, Reline::Unicode.vi_first_print('あ'.encode('sjis'))) assert_equal(0, Reline::Unicode.vi_first_print('')) end + + def test_character_type + assert(Reline::Unicode.word_character?('a')) + assert(Reline::Unicode.word_character?('あ')) + assert(Reline::Unicode.word_character?('あ'.encode('sjis'))) + refute(Reline::Unicode.word_character?(33345.chr('sjis'))) + refute(Reline::Unicode.word_character?('-')) + refute(Reline::Unicode.word_character?(nil)) + + assert(Reline::Unicode.space_character?(' ')) + refute(Reline::Unicode.space_character?('あ')) + refute(Reline::Unicode.space_character?('あ'.encode('sjis'))) + refute(Reline::Unicode.space_character?(33345.chr('sjis'))) + refute(Reline::Unicode.space_character?('-')) + refute(Reline::Unicode.space_character?(nil)) + end end