html: parse comments per HTML spec

nigeltao · gopherbot · commit 39940adcaaa7 · 2023-02-10T18:21:14.000Z
Updates golang/go#58246 Change-Id: Iaba5ed65f5d244fd47372ef0c08fc4cdb5ed90f9 Reviewed-on: https://go-review.googlesource.com/c/net/+/466776 TryBot-Result: Gopher Robot <gobot@golang.org> Auto-Submit: Nigel Tao <nigeltao@golang.org> Reviewed-by: Damien Neil <dneil@google.com> Run-TryBot: Nigel Tao <nigeltao@golang.org> Reviewed-by: Nigel Tao (INACTIVE; USE @golang.org INSTEAD) <nigeltao@google.com>
diff --git a/html/comment_test.go b/html/comment_test.go
@@ -0,0 +1,270 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+	"bytes"
+	"testing"
+)
+
+// TestComments exhaustively tests every 'interesting' N-byte string is
+// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive,
+// where 4 is the length of the "<!--" prefix that starts an HTML comment.
+//
+// 'Interesting' means that the N-4 byte suffix consists entirely of bytes
+// sampled from the interestingCommentBytes const string, below. These cover
+// all of the possible state transitions from comment-related parser states, as
+// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
+// and subsequent sections).
+//
+// The spec is written as an explicit state machine that, as a side effect,
+// accumulates "the comment token's data" to a separate buffer.
+// Tokenizer.readComment in this package does not have an explicit state
+// machine and usually returns the comment text as a sub-slice of the input,
+// between the opening '<' and closing '>' or EOF. This test confirms that the
+// two algorithms match.
+func TestComments(t *testing.T) {
+	const prefix = "<!--"
+	const suffixLen = 6
+	buffer := make([]byte, 0, len(prefix)+suffixLen)
+	testAllComments(t, append(buffer, prefix...))
+}
+
+// NUL isn't in this list, even though the HTML spec sections 13.2.5.43 -
+// 13.2.5.52 mentions it. It's not interesting in terms of state transitions.
+// It's equivalent to any other non-interesting byte (other than being replaced
+// by U+FFFD REPLACEMENT CHARACTER).
+//
+// EOF isn't in this list. The HTML spec treats EOF as "an input character" but
+// testOneComment below breaks the loop instead.
+//
+// 'x' represents all other "non-interesting" comment bytes.
+var interestingCommentBytes = [...]byte{
+	'!', '-', '<', '>', 'x',
+}
+
+// testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with
+// interesting bytes and then tests that this package's tokenization matches
+// the HTML spec.
+//
+// Precondition: len(buffer) < cap(buffer)
+// Precondition: string(buffer[:4]) == "<!--"
+func testAllComments(t *testing.T, buffer []byte) {
+	for _, interesting := range interestingCommentBytes {
+		b := append(buffer, interesting)
+		testOneComment(t, b)
+		if len(b) < cap(b) {
+			testAllComments(t, b)
+		}
+	}
+}
+
+func testOneComment(t *testing.T, b []byte) {
+	z := NewTokenizer(bytes.NewReader(b))
+	if next := z.Next(); next != CommentToken {
+		t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
+	}
+	gotRemainder := string(b[len(z.Raw()):])
+	gotComment := string(z.Text())
+
+	i := len("<!--")
+	wantBuffer := []byte(nil)
+loop:
+	for state := 43; ; {
+		// Consume the next input character, handling EOF.
+		if i >= len(b) {
+			break
+		}
+		nextInputCharacter := b[i]
+		i++
+
+		switch state {
+		case 43: // 13.2.5.43 Comment start state.
+			switch nextInputCharacter {
+			case '-':
+				state = 44
+			case '>':
+				break loop
+			default:
+				i-- // Reconsume.
+				state = 45
+			}
+
+		case 44: // 13.2.5.44 Comment start dash state.
+			switch nextInputCharacter {
+			case '-':
+				state = 51
+			case '>':
+				break loop
+			default:
+				wantBuffer = append(wantBuffer, '-')
+				i-- // Reconsume.
+				state = 45
+			}
+
+		case 45: // 13.2.5.45 Comment state.
+			switch nextInputCharacter {
+			case '-':
+				state = 50
+			case '<':
+				wantBuffer = append(wantBuffer, '<')
+				state = 46
+			default:
+				wantBuffer = append(wantBuffer, nextInputCharacter)
+			}
+
+		case 46: // 13.2.5.46 Comment less-than sign state.
+			switch nextInputCharacter {
+			case '!':
+				wantBuffer = append(wantBuffer, '!')
+				state = 47
+			case '<':
+				wantBuffer = append(wantBuffer, '<')
+				state = 46
+			default:
+				i-- // Reconsume.
+				state = 45
+			}
+
+		case 47: // 13.2.5.47 Comment less-than sign bang state.
+			switch nextInputCharacter {
+			case '-':
+				state = 48
+			default:
+				i-- // Reconsume.
+				state = 45
+			}
+
+		case 48: // 13.2.5.48 Comment less-than sign bang dash state.
+			switch nextInputCharacter {
+			case '-':
+				state = 49
+			default:
+				i-- // Reconsume.
+				state = 50
+			}
+
+		case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
+			switch nextInputCharacter {
+			case '>':
+				break loop
+			default:
+				i-- // Reconsume.
+				state = 51
+			}
+
+		case 50: // 13.2.5.50 Comment end dash state.
+			switch nextInputCharacter {
+			case '-':
+				state = 51
+			default:
+				wantBuffer = append(wantBuffer, '-')
+				i-- // Reconsume.
+				state = 45
+			}
+
+		case 51: // 13.2.5.51 Comment end state.
+			switch nextInputCharacter {
+			case '!':
+				state = 52
+			case '-':
+				wantBuffer = append(wantBuffer, '-')
+			case '>':
+				break loop
+			default:
+				wantBuffer = append(wantBuffer, "--"...)
+				i-- // Reconsume.
+				state = 45
+			}
+
+		case 52: // 13.2.5.52 Comment end bang state.
+			switch nextInputCharacter {
+			case '-':
+				wantBuffer = append(wantBuffer, "--!"...)
+				state = 50
+			case '>':
+				break loop
+			default:
+				wantBuffer = append(wantBuffer, "--!"...)
+				i-- // Reconsume.
+				state = 45
+			}
+
+		default:
+			t.Fatalf("input=%q: unexpected state %d", b, state)
+		}
+	}
+
+	wantRemainder := ""
+	if i < len(b) {
+		wantRemainder = string(b[i:])
+	}
+	wantComment := string(wantBuffer)
+	if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
+		t.Errorf("input=%q\ngot:  %q + %q\nwant: %q + %q",
+			b, gotComment, gotRemainder, wantComment, wantRemainder)
+	}
+}
+
+// This table below summarizes the HTML-comment-related state machine from
+// 13.2.5.43 "Comment start state" and subsequent sections.
+// https://html.spec.whatwg.org/#comment-start-state
+//
+// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
+// initial 13.2.5.1 "Data state":
+//   - "<"  moves to 13.2.5.6  "Tag open state",
+//   - "!"  moves to 13.2.5.42 "Markup declaration open state",
+//   - "--" moves to 13.2.5.43 "Comment start state".
+// Each of these transitions are the only way to get to the 6/42/43 states.
+//
+// State   !         -         <         >         NUL       EOF       default   HTML spec section
+// 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
+// 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
+// 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
+// 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
+// 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
+// 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
+// 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
+// 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
+// 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
+// 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
+//
+// State 43 is the "Comment start state" meaning that we've only seen "<!--"
+// and nothing else. Similarly, state 44 means that we've only seen "<!---",
+// with three dashes, and nothing else. For the other states, we deduce
+// (working backwards) that the immediate prior input must be:
+//   - 45  something that's not '-'
+//   - 46  "<"
+//   - 47  "<!"
+//   - 48  "<!-"
+//   - 49  "<!--"  not including the opening "<!--"
+//   - 50  "-"     not including the opening "<!--" and also not "--"
+//   - 51  "--"    not including the opening "<!--"
+//   - 52  "--!"
+//
+// The table cell actions:
+//   - ...   do the default action
+//   - A!    append "!"      to the comment token's data.
+//   - A-    append "-"      to the comment token's data.
+//   - A--   append "--"     to the comment token's data.
+//   - A--!  append "--!"    to the comment token's data.
+//   - A<    append "<"      to the comment token's data.
+//   - A?    append "\uFFFD" to the comment token's data.
+//   - Ax    append the current input character to the comment token's data.
+//   - E0    parse error (abrupt-closing-of-empty-comment).
+//   - E1    parse error (eof-in-comment).
+//   - E2    parse error (unexpected-null-character).
+//   - E3    parse error (nested-comment).
+//   - E4    parse error (incorrectly-closed-comment).
+//   - T     emit the current comment token.
+//   - Z     emit an end-of-file token.
+//   - rNN   reconsume in the 13.2.5.NN     state (after any A* or E* operations).
+//   - s01   switch to the    13.2.5.1 Data state (after any A* or E* operations).
+//   - sNN   switch to the    13.2.5.NN     state (after any A* or E* operations).
+//   - tNN   stay in the      13.2.5.NN     state (after any A* or E* operations).
+//
+// The E* actions are called errors in the HTML spec but they are not fatal
+// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
+// the parser"). They are warnings that, in practice, browsers simply ignore.
diff --git a/html/token.go b/html/token.go
@@ -598,6 +598,11 @@ scriptDataDoubleEscapeEnd:
 // readComment reads the next comment token starting with "<!--". The opening
 // "<!--" has already been consumed.
 func (z *Tokenizer) readComment() {
+	// When modifying this function, consider manually increasing the suffixLen
+	// constant in func TestComments, from 6 to e.g. 9 or more. That increase
+	// should only be temporary, not committed, as it exponentially affects the
+	// test running time.
+
 	z.data.start = z.raw.end
 	defer func() {
 		if z.data.end < z.data.start {
@@ -611,11 +616,7 @@ func (z *Tokenizer) readComment() {
 	for {
 		c := z.readByte()
 		if z.err != nil {
-			// Ignore up to two dashes at EOF.
-			if dashCount > 2 {
-				dashCount = 2
-			}
-			z.data.end = z.raw.end - dashCount
+			z.data.end = z.calculateAbruptCommentDataEnd()
 			return
 		}
 		switch c {
@@ -631,12 +632,15 @@ func (z *Tokenizer) readComment() {
 			if dashCount >= 2 {
 				c = z.readByte()
 				if z.err != nil {
-					z.data.end = z.raw.end
+					z.data.end = z.calculateAbruptCommentDataEnd()
 					return
-				}
-				if c == '>' {
+				} else if c == '>' {
 					z.data.end = z.raw.end - len("--!>")
 					return
+				} else if c == '-' {
+					dashCount = 1
+					beginning = false
+					continue
 				}
 			}
 		}
@@ -645,6 +649,35 @@ func (z *Tokenizer) readComment() {
 	}
 }
 
+func (z *Tokenizer) calculateAbruptCommentDataEnd() int {
+	raw := z.Raw()
+	const prefixLen = len("<!--")
+	if len(raw) >= prefixLen {
+		raw = raw[prefixLen:]
+		if hasSuffix(raw, "--!") {
+			return z.raw.end - 3
+		} else if hasSuffix(raw, "--") {
+			return z.raw.end - 2
+		} else if hasSuffix(raw, "-") {
+			return z.raw.end - 1
+		}
+	}
+	return z.raw.end
+}
+
+func hasSuffix(b []byte, suffix string) bool {
+	if len(b) < len(suffix) {
+		return false
+	}
+	b = b[len(b)-len(suffix):]
+	for i := range b {
+		if b[i] != suffix[i] {
+			return false
+		}
+	}
+	return true
+}
+
 // readUntilCloseAngle reads until the next ">".
 func (z *Tokenizer) readUntilCloseAngle() {
 	z.data.start = z.raw.end
diff --git a/html/token_test.go b/html/token_test.go