Skip to content

Commit 39940ad

Browse files
nigeltaogopherbot
authored andcommitted
html: parse comments per HTML spec
Updates golang/go#58246 Change-Id: Iaba5ed65f5d244fd47372ef0c08fc4cdb5ed90f9 Reviewed-on: https://go-review.googlesource.com/c/net/+/466776 TryBot-Result: Gopher Robot <[email protected]> Auto-Submit: Nigel Tao <[email protected]> Reviewed-by: Damien Neil <[email protected]> Run-TryBot: Nigel Tao <[email protected]> Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <[email protected]>
1 parent 87ce33e commit 39940ad

File tree

3 files changed

+347
-9
lines changed

3 files changed

+347
-9
lines changed

html/comment_test.go

+270
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
// Copyright 2023 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package html
6+
7+
import (
8+
"bytes"
9+
"testing"
10+
)
11+
12+
// TestComments exhaustively tests every 'interesting' N-byte string is
13+
// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
14+
// where 4 is the length of the "<!--" prefix that starts an HTML comment.
15+
//
16+
// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
17+
// sampled from the interestingCommentBytes const string, below. These cover
18+
// all of the possible state transitions from comment-related parser states, as
19+
// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
20+
// and subsequent sections).
21+
//
22+
// The spec is written as an explicit state machine that, as a side effect,
23+
// accumulates "the comment token's data" to a separate buffer.
24+
// Tokenizer.readComment in this package does not have an explicit state
25+
// machine and usually returns the comment text as a sub-slice of the input,
26+
// between the opening '<' and closing '>' or EOF. This test confirms that the
27+
// two algorithms match.
28+
func TestComments(t *testing.T) {
29+
const prefix = "<!--"
30+
const suffixLen = 6
31+
buffer := make([]byte, 0, len(prefix)+suffixLen)
32+
testAllComments(t, append(buffer, prefix...))
33+
}
34+
35+
// NUL isn't in this list, even though the HTML spec sections 13.2.5.43 -
36+
// 13.2.5.52 mentions it. It's not interesting in terms of state transitions.
37+
// It's equivalent to any other non-interesting byte (other than being replaced
38+
// by U+FFFD REPLACEMENT CHARACTER).
39+
//
40+
// EOF isn't in this list. The HTML spec treats EOF as "an input character" but
41+
// testOneComment below breaks the loop instead.
42+
//
43+
// 'x' represents all other "non-interesting" comment bytes.
44+
var interestingCommentBytes = [...]byte{
45+
'!', '-', '<', '>', 'x',
46+
}
47+
48+
// testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with
49+
// interesting bytes and then tests that this package's tokenization matches
50+
// the HTML spec.
51+
//
52+
// Precondition: len(buffer) < cap(buffer)
53+
// Precondition: string(buffer[:4]) == "<!--"
54+
func testAllComments(t *testing.T, buffer []byte) {
55+
for _, interesting := range interestingCommentBytes {
56+
b := append(buffer, interesting)
57+
testOneComment(t, b)
58+
if len(b) < cap(b) {
59+
testAllComments(t, b)
60+
}
61+
}
62+
}
63+
64+
func testOneComment(t *testing.T, b []byte) {
65+
z := NewTokenizer(bytes.NewReader(b))
66+
if next := z.Next(); next != CommentToken {
67+
t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
68+
}
69+
gotRemainder := string(b[len(z.Raw()):])
70+
gotComment := string(z.Text())
71+
72+
i := len("<!--")
73+
wantBuffer := []byte(nil)
74+
loop:
75+
for state := 43; ; {
76+
// Consume the next input character, handling EOF.
77+
if i >= len(b) {
78+
break
79+
}
80+
nextInputCharacter := b[i]
81+
i++
82+
83+
switch state {
84+
case 43: // 13.2.5.43 Comment start state.
85+
switch nextInputCharacter {
86+
case '-':
87+
state = 44
88+
case '>':
89+
break loop
90+
default:
91+
i-- // Reconsume.
92+
state = 45
93+
}
94+
95+
case 44: // 13.2.5.44 Comment start dash state.
96+
switch nextInputCharacter {
97+
case '-':
98+
state = 51
99+
case '>':
100+
break loop
101+
default:
102+
wantBuffer = append(wantBuffer, '-')
103+
i-- // Reconsume.
104+
state = 45
105+
}
106+
107+
case 45: // 13.2.5.45 Comment state.
108+
switch nextInputCharacter {
109+
case '-':
110+
state = 50
111+
case '<':
112+
wantBuffer = append(wantBuffer, '<')
113+
state = 46
114+
default:
115+
wantBuffer = append(wantBuffer, nextInputCharacter)
116+
}
117+
118+
case 46: // 13.2.5.46 Comment less-than sign state.
119+
switch nextInputCharacter {
120+
case '!':
121+
wantBuffer = append(wantBuffer, '!')
122+
state = 47
123+
case '<':
124+
wantBuffer = append(wantBuffer, '<')
125+
state = 46
126+
default:
127+
i-- // Reconsume.
128+
state = 45
129+
}
130+
131+
case 47: // 13.2.5.47 Comment less-than sign bang state.
132+
switch nextInputCharacter {
133+
case '-':
134+
state = 48
135+
default:
136+
i-- // Reconsume.
137+
state = 45
138+
}
139+
140+
case 48: // 13.2.5.48 Comment less-than sign bang dash state.
141+
switch nextInputCharacter {
142+
case '-':
143+
state = 49
144+
default:
145+
i-- // Reconsume.
146+
state = 50
147+
}
148+
149+
case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
150+
switch nextInputCharacter {
151+
case '>':
152+
break loop
153+
default:
154+
i-- // Reconsume.
155+
state = 51
156+
}
157+
158+
case 50: // 13.2.5.50 Comment end dash state.
159+
switch nextInputCharacter {
160+
case '-':
161+
state = 51
162+
default:
163+
wantBuffer = append(wantBuffer, '-')
164+
i-- // Reconsume.
165+
state = 45
166+
}
167+
168+
case 51: // 13.2.5.51 Comment end state.
169+
switch nextInputCharacter {
170+
case '!':
171+
state = 52
172+
case '-':
173+
wantBuffer = append(wantBuffer, '-')
174+
case '>':
175+
break loop
176+
default:
177+
wantBuffer = append(wantBuffer, "--"...)
178+
i-- // Reconsume.
179+
state = 45
180+
}
181+
182+
case 52: // 13.2.5.52 Comment end bang state.
183+
switch nextInputCharacter {
184+
case '-':
185+
wantBuffer = append(wantBuffer, "--!"...)
186+
state = 50
187+
case '>':
188+
break loop
189+
default:
190+
wantBuffer = append(wantBuffer, "--!"...)
191+
i-- // Reconsume.
192+
state = 45
193+
}
194+
195+
default:
196+
t.Fatalf("input=%q: unexpected state %d", b, state)
197+
}
198+
}
199+
200+
wantRemainder := ""
201+
if i < len(b) {
202+
wantRemainder = string(b[i:])
203+
}
204+
wantComment := string(wantBuffer)
205+
if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
206+
t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q",
207+
b, gotComment, gotRemainder, wantComment, wantRemainder)
208+
}
209+
}
210+
211+
// This table below summarizes the HTML-comment-related state machine from
212+
// 13.2.5.43 "Comment start state" and subsequent sections.
213+
// https://html.spec.whatwg.org/#comment-start-state
214+
//
215+
// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
216+
// initial 13.2.5.1 "Data state":
217+
// - "<" moves to 13.2.5.6 "Tag open state",
218+
// - "!" moves to 13.2.5.42 "Markup declaration open state",
219+
// - "--" moves to 13.2.5.43 "Comment start state".
220+
// Each of these transitions are the only way to get to the 6/42/43 states.
221+
//
222+
// State ! - < > NUL EOF default HTML spec section
223+
// 43 ... s44 ... s01.T.E0 ... ... r45 13.2.5.43 Comment start state
224+
// 44 ... s51 ... s01.T.E0 ... T.Z.E1 r45.A- 13.2.5.44 Comment start dash state
225+
// 45 ... s50 s46.A< ... t45.A?.E2 T.Z.E1 t45.Ax 13.2.5.45 Comment state
226+
// 46 s47.A! ... t46.A< ... ... ... r45 13.2.5.46 Comment less-than sign state
227+
// 47 ... s48 ... ... ... ... r45 13.2.5.47 Comment less-than sign bang state
228+
// 48 ... s49 ... ... ... ... r50 13.2.5.48 Comment less-than sign bang dash state
229+
// 49 ... ... ... s01.T ... T.Z.E1 r51.E3 13.2.5.49 Comment less-than sign bang dash dash state
230+
// 50 ... s51 ... ... ... T.Z.E1 r45.A- 13.2.5.50 Comment end dash state
231+
// 51 s52 t51.A- ... s01.T ... T.Z.E1 r45.A-- 13.2.5.51 Comment end state
232+
// 52 ... s50.A--! ... s01.T.E4 ... T.Z.E1 r45.A--! 13.2.5.52 Comment end bang state
233+
//
234+
// State 43 is the "Comment start state" meaning that we've only seen "<!--"
235+
// and nothing else. Similarly, state 44 means that we've only seen "<!---",
236+
// with three dashes, and nothing else. For the other states, we deduce
237+
// (working backwards) that the immediate prior input must be:
238+
// - 45 something that's not '-'
239+
// - 46 "<"
240+
// - 47 "<!"
241+
// - 48 "<!-"
242+
// - 49 "<!--" not including the opening "<!--"
243+
// - 50 "-" not including the opening "<!--" and also not "--"
244+
// - 51 "--" not including the opening "<!--"
245+
// - 52 "--!"
246+
//
247+
// The table cell actions:
248+
// - ... do the default action
249+
// - A! append "!" to the comment token's data.
250+
// - A- append "-" to the comment token's data.
251+
// - A-- append "--" to the comment token's data.
252+
// - A--! append "--!" to the comment token's data.
253+
// - A< append "<" to the comment token's data.
254+
// - A? append "\uFFFD" to the comment token's data.
255+
// - Ax append the current input character to the comment token's data.
256+
// - E0 parse error (abrupt-closing-of-empty-comment).
257+
// - E1 parse error (eof-in-comment).
258+
// - E2 parse error (unexpected-null-character).
259+
// - E3 parse error (nested-comment).
260+
// - E4 parse error (incorrectly-closed-comment).
261+
// - T emit the current comment token.
262+
// - Z emit an end-of-file token.
263+
// - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations).
264+
// - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations).
265+
// - sNN switch to the 13.2.5.NN state (after any A* or E* operations).
266+
// - tNN stay in the 13.2.5.NN state (after any A* or E* operations).
267+
//
268+
// The E* actions are called errors in the HTML spec but they are not fatal
269+
// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
270+
// the parser"). They are warnings that, in practice, browsers simply ignore.

html/token.go

+41-8
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,11 @@ scriptDataDoubleEscapeEnd:
598598
// readComment reads the next comment token starting with "<!--". The opening
599599
// "<!--" has already been consumed.
600600
func (z *Tokenizer) readComment() {
601+
// When modifying this function, consider manually increasing the suffixLen
602+
// constant in func TestComments, from 6 to e.g. 9 or more. That increase
603+
// should only be temporary, not committed, as it exponentially affects the
604+
// test running time.
605+
601606
z.data.start = z.raw.end
602607
defer func() {
603608
if z.data.end < z.data.start {
@@ -611,11 +616,7 @@ func (z *Tokenizer) readComment() {
611616
for {
612617
c := z.readByte()
613618
if z.err != nil {
614-
// Ignore up to two dashes at EOF.
615-
if dashCount > 2 {
616-
dashCount = 2
617-
}
618-
z.data.end = z.raw.end - dashCount
619+
z.data.end = z.calculateAbruptCommentDataEnd()
619620
return
620621
}
621622
switch c {
@@ -631,12 +632,15 @@ func (z *Tokenizer) readComment() {
631632
if dashCount >= 2 {
632633
c = z.readByte()
633634
if z.err != nil {
634-
z.data.end = z.raw.end
635+
z.data.end = z.calculateAbruptCommentDataEnd()
635636
return
636-
}
637-
if c == '>' {
637+
} else if c == '>' {
638638
z.data.end = z.raw.end - len("--!>")
639639
return
640+
} else if c == '-' {
641+
dashCount = 1
642+
beginning = false
643+
continue
640644
}
641645
}
642646
}
@@ -645,6 +649,35 @@ func (z *Tokenizer) readComment() {
645649
}
646650
}
647651

652+
func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
653+
raw := z.Raw()
654+
const prefixLen = len("<!--")
655+
if len(raw) >= prefixLen {
656+
raw = raw[prefixLen:]
657+
if hasSuffix(raw, "--!") {
658+
return z.raw.end - 3
659+
} else if hasSuffix(raw, "--") {
660+
return z.raw.end - 2
661+
} else if hasSuffix(raw, "-") {
662+
return z.raw.end - 1
663+
}
664+
}
665+
return z.raw.end
666+
}
667+
668+
func hasSuffix(b []byte, suffix string) bool {
669+
if len(b) < len(suffix) {
670+
return false
671+
}
672+
b = b[len(b)-len(suffix):]
673+
for i := range b {
674+
if b[i] != suffix[i] {
675+
return false
676+
}
677+
}
678+
return true
679+
}
680+
648681
// readUntilCloseAngle reads until the next ">".
649682
func (z *Tokenizer) readUntilCloseAngle() {
650683
z.data.start = z.raw.end

0 commit comments

Comments
 (0)