Commit cf98f2b

Tokenizer for syntax highlighting using Prism

1 parent c55c1f5

File tree

6 files changed: +435 -40 lines

lib/rdoc/markup/to_html.rb

Lines changed: 24 additions & 14 deletions
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require 'cgi/escape'
 require 'cgi/util' unless defined?(CGI::EscapeExt)
+require 'rdoc/parser/ripper_state_lex'

 ##
 # Outputs RDoc markup as HTML.
@@ -216,6 +217,23 @@ def accept_paragraph(paragraph)
     @res << "</p>\n"
   end

+  # Generate syntax-highlighted HTML for Ruby-like text.
+
+  def parsable_text_to_html(text)
+    if defined?(RDoc::Parser::PrismRuby) && RDoc::Parser::Ruby == RDoc::Parser::PrismRuby
+      tokens = RDoc::Parser::Tokenizer.tokenize(text).map do |type, text|
+        RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+      end
+    else
+      # RipperStateLex.parse is assumed to fail in some cases;
+      # the exact failing inputs are unknown.
+      tokens = RDoc::Parser::RipperStateLex.parse(text) rescue return
+    end
+    result = RDoc::TokenStream.to_html tokens
+    result = result + "\n" unless "\n" == result[-1]
+    result
+  end
+
   ##
   # Adds +verbatim+ to the output

@@ -224,20 +242,12 @@ def accept_verbatim(verbatim)

     klass = nil

-    content = if verbatim.ruby? or parseable? text then
-                begin
-                  tokens = RDoc::Parser::RipperStateLex.parse text
-                  klass = ' class="ruby"'
-
-                  result = RDoc::TokenStream.to_html tokens
-                  result = result + "\n" unless "\n" == result[-1]
-                  result
-                rescue
-                  CGI.escapeHTML text
-                end
-              else
-                CGI.escapeHTML text
-              end
+    if verbatim.ruby? || parseable?(text)
+      content = parsable_text_to_html(text)
+      klass = ' class="ruby"' if content # nil means lexing failed
+    end
+
+    content ||= CGI.escapeHTML text

     if @options.pipe then
       @res << "\n<pre><code>#{CGI.escapeHTML text}\n</code></pre>\n"
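The net effect of this hunk: accept_verbatim now delegates highlighting to parsable_text_to_html and treats a nil return as "lexing failed". A minimal console sketch of that contract, assuming this commit is applied (the input string is made up, and the method may not be public API):

  require 'rdoc'

  to_html = RDoc::Markup::ToHtml.new(RDoc::Options.new)

  # Returns highlighted HTML, or nil if the fallback lexer raised.
  html = to_html.parsable_text_to_html("1 + 2  # => 3\n")

  # accept_verbatim escapes the raw text whenever highlighting failed:
  html ||= CGI.escapeHTML("1 + 2  # => 3\n")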

lib/rdoc/parser/prism_ruby.rb

Lines changed: 20 additions & 26 deletions
@@ -1,7 +1,7 @@
 # frozen_string_literal: true

 require 'prism'
-require_relative 'ripper_state_lex'
+require_relative 'tokenizer'

 # Unlike lib/rdoc/parser/ruby.rb, this file is not based on rtags and does not contain code from
 # rtags.rb -
@@ -89,10 +89,13 @@ def record_location(container) # :nodoc:
   # Scans this Ruby file for Ruby constructs

   def scan
-    @tokens = RDoc::Parser::RipperStateLex.parse(@content)
     @lines = @content.lines
-    result = Prism.parse(@content)
-    @program_node = result.value
+    result = Prism.parse_lex(@content)
+    @prism_comments = result.comments
+    @program_node, unordered_tokens = result.value
+    # Heredoc tokens are not in start_offset order.
+    # Need to sort them to use bsearch for finding tokens by location.
+    @prism_tokens = unordered_tokens.map(&:first).sort_by { |t| t.location.start_offset }
     @line_nodes = {}
     prepare_line_nodes(@program_node)
     prepare_comments(result.comments)
@@ -205,7 +208,7 @@ def parse_comment_tomdoc(container, comment, line_no, start_line)

     meth.start_collecting_tokens(:ruby)
     node = @line_nodes[line_no]
-    tokens = node ? visible_tokens_from_location(node.location) : [file_line_comment_token(start_line)]
+    tokens = node ? visible_tokens_from_node(node) : [file_line_comment_token(start_line)]
     tokens.each { |token| meth.token_stream << token }

     container.add_method meth
@@ -273,7 +276,7 @@ def handle_meta_method_comment(comment, directives, node)
     elsif line_no || node
       method_name ||= call_node_name_arguments(node).first if is_call_node
       if node
-        tokens = visible_tokens_from_location(node.location)
+        tokens = visible_tokens_from_node(node)
         line_no = node.location.start_line
       else
         tokens = [file_line_comment_token(line_no)]
@@ -368,30 +371,21 @@ def parse_comment_text_to_directives(comment_text, start_line) # :nodoc:
     [comment, directives]
   end

-  def slice_tokens(start_pos, end_pos) # :nodoc:
-    start_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> start_pos) >= 0 }
-    end_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> end_pos) >= 0 }
-    tokens = @tokens[start_index...end_index]
-    tokens.pop if tokens.last&.kind == :on_nl
-    tokens
-  end
-
   def file_line_comment_token(line_no) # :nodoc:
-    position_comment = RDoc::Parser::RipperStateLex::Token.new(line_no - 1, 0, :on_comment)
-    position_comment[:text] = "# File #{@top_level.relative_name}, line #{line_no}"
-    position_comment
+    text = "# File #{@top_level.relative_name}, line #{line_no}"
+    RDoc::TokenStream::RipperStateLexCompatToken.new(:on_comment, text)
   end

-  # Returns tokens from the given location
+  # Returns tokens of the given node's location for syntax highlighting

-  def visible_tokens_from_location(location)
+  def visible_tokens_from_node(node)
+    location = node.location
     position_comment = file_line_comment_token(location.start_line)
-    newline_token = RDoc::Parser::RipperStateLex::Token.new(0, 0, :on_nl, "\n")
-    indent_token = RDoc::Parser::RipperStateLex::Token.new(location.start_line, 0, :on_sp, ' ' * location.start_character_column)
-    tokens = slice_tokens(
-      [location.start_line, location.start_character_column],
-      [location.end_line, location.end_character_column]
-    )
+    newline_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_nl, "\n")
+    indent_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_sp, ' ' * location.start_character_column)
+    tokens = RDoc::Parser::Tokenizer.partial_tokenize(@content, node, @prism_tokens, @prism_comments).map do |type, text|
+      RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+    end
     [position_comment, newline_token, indent_token, *tokens]
   end

@@ -894,7 +888,7 @@ def visit_def_node(node)
       end
       name = node.name.to_s
       params, block_params, calls_super = MethodSignatureVisitor.scan_signature(node)
-      tokens = @scanner.visible_tokens_from_location(node.location)
+      tokens = @scanner.visible_tokens_from_node(node)

       @scanner.add_method(
         name,
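For context, Prism.parse_lex returns the AST and the lexer tokens from a single parse, which is what lets #scan feed both the node visitor and the tokenizer without parsing twice. A quick console sketch (values illustrative):

  require 'prism'

  result = Prism.parse_lex("x = <<~DOC\n  hi\nDOC\n")
  program_node, lexed = result.value  # lexed is an array of [Prism::Token, lexer-state] pairs

  # Heredoc body tokens are reported out of start_offset order, so #scan
  # sorts them once up front to keep bsearch usable later:
  tokens = lexed.map(&:first).sort_by { |t| t.location.start_offset }
  tokens.first.type  # => :IDENTIFIER (the "x")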

lib/rdoc/parser/tokenizer.rb

Lines changed: 244 additions & 0 deletions
@@ -0,0 +1,244 @@
+require 'prism'
+require 'set'
+
+# Tokenize Ruby code into RDoc::Parser::RipperStateLex-style token types, with token squashing.
+# Token squashing is required by RDoc::TokenStream's syntax highlighting.
+module RDoc::Parser::Tokenizer
+  # These constants and the token type map are for compatibility with RDoc::Parser::RipperStateLex.
+  OTHER = :other
+  SPACE = :on_sp
+  NEWLINE = :on_nl
+  KEYWORD = :on_kw
+  OP = :on_op
+  HEREDOC_BEG = :on_heredoc_beg
+  HEREDOC_CONTENT = :on_heredoc
+  HEREDOC_END = :on_heredoc_end
+  COMMENT = :on_comment
+  INTEGER = :on_int
+  FLOAT = :on_float
+  RATIONAL = :on_rational
+  IMAGINARY = :on_imaginary
+  SYMBOL = :on_symbol
+  REGEXP = :on_regexp
+  STRING = :on_tstring
+  WORDS = :on_dstring
+  DEF_METHOD_NAME = :on_ident
+  DSTRING = :on_dstring
+
+  OP_TOKENS = %i[
+    AMPERSAND AMPERSAND_AMPERSAND
+    BANG BANG_EQUAL BANG_TILDE CARET COLON COLON_COLON
+    EQUAL EQUAL_EQUAL EQUAL_GREATER EQUAL_TILDE
+    GREATER GREATER_GREATER
+    LESS LESS_EQUAL LESS_EQUAL_GREATER LESS_LESS
+    MINUS MINUS_GREATER PERCENT PIPE PIPE_PIPE PLUS
+    QUESTION_MARK SLASH STAR STAR_STAR TILDE
+    UAMPERSAND UMINUS UPLUS USTAR USTAR_STAR
+  ].to_set
+
+  TOKEN_TYPE_MAP = {
+    IDENTIFIER: :on_ident,
+    METHOD_NAME: :on_ident,
+    INSTANCE_VARIABLE: :on_ivar,
+    CLASS_VARIABLE: :on_cvar,
+    GLOBAL_VARIABLE: :on_gvar,
+    BACK_REFERENCE: :on_backref,
+    NUMBERED_REFERENCE: :on_backref,
+    CONSTANT: :on_const,
+    LABEL: :on_label,
+    INTEGER: :on_int,
+    FLOAT: :on_float,
+    RATIONAL: :on_rational,
+    IMAGINARY: :on_imaginary,
+  }
+
+  class << self
+    def tokenize(code)
+      result = Prism.parse_lex(code)
+      program_node, unordered_tokens = result.value
+      prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+      partial_tokenize(code, program_node, prism_tokens, result.comments, 0, code.bytesize)
+    end
+
+    def partial_tokenize(whole_code, node, prism_tokens, prism_comments, start_offset = nil, end_offset = nil)
+      start_offset ||= node.location.start_offset
+      end_offset ||= node.location.end_offset
+      visitor = SquashTokenVisitor.new
+      node.accept(visitor)
+      squashed_tokens = visitor.tokens
+      comment_tokens = comment_tokens(slice_by_location(prism_comments, start_offset, end_offset))
+      normal_tokens = normal_tokens(slice_by_location(prism_tokens, start_offset, end_offset))
+      prior_tokens = (squashed_tokens + comment_tokens).sort_by {|_, start_offset, _| start_offset }
+      unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset)
+    end
+
+    private
+
+    def slice_by_location(items, start_offset, end_offset)
+      start_index = items.bsearch_index { |item| item.location.end_offset > start_offset } || items.size
+      end_index = items.bsearch_index { |item| item.location.start_offset >= end_offset } || items.size
+      items[start_index...end_index]
+    end
+
+    # Unify prior tokens and normal tokens into a token stream.
+    # Prior tokens have higher priority than normal tokens.
+    # Also adds missing text (spaces, newlines, etc.) as separate tokens
+    # so that the entire code is covered.
+    def unify_tokens(code, prior_tokens, normal_tokens, start_offset, end_offset)
+      tokens = []
+      offset = start_offset
+
+      # Add missing text such as spaces and newlines as a separate token
+      flush = -> next_offset {
+        return if offset == next_offset
+
+        code[offset...next_offset].scan(/\n|\s+|[^\s]+/) do |text|
+          type =
+            if text == "\n"
+              NEWLINE
+            elsif /\A\s+\z/.match?(text)
+              SPACE
+            else
+              OTHER
+            end
+          tokens << [type, text]
+        end
+      }
+
+      until prior_tokens.empty? && normal_tokens.empty?
+        ptok = prior_tokens.first
+        ntok = normal_tokens.first
+        if ntok && (!ptok || ntok[2] <= ptok[1])
+          token = normal_tokens.shift
+        else
+          token = prior_tokens.shift
+        end
+        type, start_pos, end_pos = token
+        next if start_pos < offset
+
+        flush.call(start_pos)
+        tokens << [type, code.byteslice(start_pos...end_pos)]
+        offset = end_pos
+      end
+      flush.call(end_offset)
+      tokens
+    end
+
+    # Extract each normal comment and embdoc comment (which consists of multiple tokens) as a single token
+    def comment_tokens(comments)
+      comments.map do |comment|
+        [COMMENT, comment.location.start_offset, comment.location.end_offset]
+      end
+    end
+
+    # Convert normal Prism tokens to [type, start_offset, end_offset]
+    def normal_tokens(tokens)
+      tokens.map do |token,|
+        type =
+          if token.type.start_with?('KEYWORD_')
+            KEYWORD
+          elsif OP_TOKENS.include?(token.type.to_sym)
+            OP
+          else
+            TOKEN_TYPE_MAP[token.type] || OTHER
+          end
+        [type, token.location.start_offset, token.location.end_offset]
+      end
+    end
+  end
+
+  # Visitor to squash several tokens that constitute a single node into a single token
+  class SquashTokenVisitor < Prism::Visitor
+    attr_reader :tokens
+    def initialize
+      @tokens = []
+    end
+
+    def heredoc?(node)
+      /\A<</.match?(node.opening)
+    end
+
+    # Squash UMINUS and its operand (integer, float, rational, imaginary) token into a single token
+    def visit_integer_node(node)
+      push_location(node.location, INTEGER)
+    end
+
+    def visit_float_node(node)
+      push_location(node.location, FLOAT)
+    end
+
+    def visit_rational_node(node)
+      push_location(node.location, RATIONAL)
+    end
+
+    def visit_imaginary_node(node)
+      push_location(node.location, IMAGINARY)
+    end
+
+    def visit_symbol_node(node)
+      push_location(node.location, SYMBOL)
+    end
+    alias visit_interpolated_symbol_node visit_symbol_node
+
+    def visit_regular_expression_node(node)
+      push_location(node.location, REGEXP)
+    end
+    alias visit_match_last_line_node visit_regular_expression_node
+    alias visit_interpolated_regular_expression_node visit_regular_expression_node
+    alias visit_interpolated_match_last_line_node visit_regular_expression_node
+
+    def visit_string_node(node)
+      if heredoc?(node)
+        push_location(node.opening_loc, HEREDOC_BEG)
+        push_location(node.content_loc, HEREDOC_CONTENT)
+        push_location(node.closing_loc, HEREDOC_END)
+      else
+        push_location(node.location, STRING)
+      end
+    end
+    alias visit_x_string_node visit_string_node
+
+    def visit_array_node(node)
+      if /\A%/.match?(node.opening)
+        # Percent array: squash the entire node into a single token.
+        # We don't handle embedded expressions inside yet.
+        push_location(node.location, WORDS)
+      else
+        super
+      end
+    end
+
+    def push_location(location, type)
+      @tokens << [type, location.start_offset, location.end_offset]
+    end
+
+    def visit_def_node(node)
+      # For special colorizing of the method name in a def node
+      push_location(node.name_loc, DEF_METHOD_NAME)
+      super
+    end
+
+    def visit_interpolated_string_node(node)
+      if /\A<</.match?(node.opening)
+        # Heredocs. Squash the content into a single token.
+        # We don't tokenize embedded expressions inside, and don't handle nested heredocs yet.
+        push_location(node.opening_loc, HEREDOC_BEG)
+        unless node.parts.empty?
+          # Squash heredoc content into a single token
+          part_locations = node.parts.map(&:location)
+          @tokens << [
+            HEREDOC_CONTENT,
+            part_locations.map(&:start_offset).min,
+            part_locations.map(&:end_offset).max
+          ]
+        end
+        # An incomplete heredoc might not have a closing_loc
+        push_location(node.closing_loc, HEREDOC_END) if node.closing_loc
+      else
+        # Squash the entire node into a single token
+        push_location(node.location, DSTRING)
+      end
+    end
+    alias visit_interpolated_x_string_node visit_interpolated_string_node
+  end
+end
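Taken together, tokenize returns a flat [type, text] list that covers the input exactly, with squashed node tokens taking priority over raw lexer tokens and the gaps between them filled in by flush. A small usage sketch, assuming this commit is applied:

  require 'rdoc'
  require 'rdoc/parser/tokenizer'

  RDoc::Parser::Tokenizer.tokenize("1 + 2")
  # => [[:on_int, "1"], [:on_sp, " "], [:on_op, "+"],
  #     [:on_sp, " "], [:on_int, "2"]]

  # Heredocs come back squashed into :on_heredoc_beg / :on_heredoc /
  # :on_heredoc_end tokens rather than one token per lexed piece, which is
  # the granularity RDoc::TokenStream.to_html expects.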
