|
| 1 | +// Copyright 2023 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +package html |
| 6 | + |
| 7 | +import ( |
| 8 | + "bytes" |
| 9 | + "testing" |
| 10 | +) |
| 11 | + |
| 12 | +// TestComments exhaustively tests every 'interesting' N-byte string is |
| 13 | +// correctly parsed as a comment. N ranges from 4+1 to 4+suffixLen inclusive, |
| 14 | +// where 4 is the length of the "<!--" prefix that starts an HTML comment. |
| 15 | +// |
| 16 | +// 'Interesting' means that the N-4 byte suffix consists entirely of bytes |
| 17 | +// sampled from the interestingCommentBytes const string, below. These cover |
| 18 | +// all of the possible state transitions from comment-related parser states, as |
| 19 | +// listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state |
| 20 | +// and subsequent sections). |
| 21 | +// |
| 22 | +// The spec is written as an explicit state machine that, as a side effect, |
| 23 | +// accumulates "the comment token's data" to a separate buffer. |
| 24 | +// Tokenizer.readComment in this package does not have an explicit state |
| 25 | +// machine and usually returns the comment text as a sub-slice of the input, |
| 26 | +// between the opening '<' and closing '>' or EOF. This test confirms that the |
| 27 | +// two algorithms match. |
| 28 | +func TestComments(t *testing.T) { |
| 29 | + const prefix = "<!--" |
| 30 | + const suffixLen = 6 |
| 31 | + buffer := make([]byte, 0, len(prefix)+suffixLen) |
| 32 | + testAllComments(t, append(buffer, prefix...)) |
| 33 | +} |
| 34 | + |
| 35 | +// NUL isn't in this list, even though the HTML spec sections 13.2.5.43 - |
| 36 | +// 13.2.5.52 mentions it. It's not interesting in terms of state transitions. |
| 37 | +// It's equivalent to any other non-interesting byte (other than being replaced |
| 38 | +// by U+FFFD REPLACEMENT CHARACTER). |
| 39 | +// |
| 40 | +// EOF isn't in this list. The HTML spec treats EOF as "an input character" but |
| 41 | +// testOneComment below breaks the loop instead. |
| 42 | +// |
| 43 | +// 'x' represents all other "non-interesting" comment bytes. |
| 44 | +var interestingCommentBytes = [...]byte{ |
| 45 | + '!', '-', '<', '>', 'x', |
| 46 | +} |
| 47 | + |
| 48 | +// testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with |
| 49 | +// interesting bytes and then tests that this package's tokenization matches |
| 50 | +// the HTML spec. |
| 51 | +// |
| 52 | +// Precondition: len(buffer) < cap(buffer) |
| 53 | +// Precondition: string(buffer[:4]) == "<!--" |
| 54 | +func testAllComments(t *testing.T, buffer []byte) { |
| 55 | + for _, interesting := range interestingCommentBytes { |
| 56 | + b := append(buffer, interesting) |
| 57 | + testOneComment(t, b) |
| 58 | + if len(b) < cap(b) { |
| 59 | + testAllComments(t, b) |
| 60 | + } |
| 61 | + } |
| 62 | +} |
| 63 | + |
| 64 | +func testOneComment(t *testing.T, b []byte) { |
| 65 | + z := NewTokenizer(bytes.NewReader(b)) |
| 66 | + if next := z.Next(); next != CommentToken { |
| 67 | + t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken) |
| 68 | + } |
| 69 | + gotRemainder := string(b[len(z.Raw()):]) |
| 70 | + gotComment := string(z.Text()) |
| 71 | + |
| 72 | + i := len("<!--") |
| 73 | + wantBuffer := []byte(nil) |
| 74 | +loop: |
| 75 | + for state := 43; ; { |
| 76 | + // Consume the next input character, handling EOF. |
| 77 | + if i >= len(b) { |
| 78 | + break |
| 79 | + } |
| 80 | + nextInputCharacter := b[i] |
| 81 | + i++ |
| 82 | + |
| 83 | + switch state { |
| 84 | + case 43: // 13.2.5.43 Comment start state. |
| 85 | + switch nextInputCharacter { |
| 86 | + case '-': |
| 87 | + state = 44 |
| 88 | + case '>': |
| 89 | + break loop |
| 90 | + default: |
| 91 | + i-- // Reconsume. |
| 92 | + state = 45 |
| 93 | + } |
| 94 | + |
| 95 | + case 44: // 13.2.5.44 Comment start dash state. |
| 96 | + switch nextInputCharacter { |
| 97 | + case '-': |
| 98 | + state = 51 |
| 99 | + case '>': |
| 100 | + break loop |
| 101 | + default: |
| 102 | + wantBuffer = append(wantBuffer, '-') |
| 103 | + i-- // Reconsume. |
| 104 | + state = 45 |
| 105 | + } |
| 106 | + |
| 107 | + case 45: // 13.2.5.45 Comment state. |
| 108 | + switch nextInputCharacter { |
| 109 | + case '-': |
| 110 | + state = 50 |
| 111 | + case '<': |
| 112 | + wantBuffer = append(wantBuffer, '<') |
| 113 | + state = 46 |
| 114 | + default: |
| 115 | + wantBuffer = append(wantBuffer, nextInputCharacter) |
| 116 | + } |
| 117 | + |
| 118 | + case 46: // 13.2.5.46 Comment less-than sign state. |
| 119 | + switch nextInputCharacter { |
| 120 | + case '!': |
| 121 | + wantBuffer = append(wantBuffer, '!') |
| 122 | + state = 47 |
| 123 | + case '<': |
| 124 | + wantBuffer = append(wantBuffer, '<') |
| 125 | + state = 46 |
| 126 | + default: |
| 127 | + i-- // Reconsume. |
| 128 | + state = 45 |
| 129 | + } |
| 130 | + |
| 131 | + case 47: // 13.2.5.47 Comment less-than sign bang state. |
| 132 | + switch nextInputCharacter { |
| 133 | + case '-': |
| 134 | + state = 48 |
| 135 | + default: |
| 136 | + i-- // Reconsume. |
| 137 | + state = 45 |
| 138 | + } |
| 139 | + |
| 140 | + case 48: // 13.2.5.48 Comment less-than sign bang dash state. |
| 141 | + switch nextInputCharacter { |
| 142 | + case '-': |
| 143 | + state = 49 |
| 144 | + default: |
| 145 | + i-- // Reconsume. |
| 146 | + state = 50 |
| 147 | + } |
| 148 | + |
| 149 | + case 49: // 13.2.5.49 Comment less-than sign bang dash dash state. |
| 150 | + switch nextInputCharacter { |
| 151 | + case '>': |
| 152 | + break loop |
| 153 | + default: |
| 154 | + i-- // Reconsume. |
| 155 | + state = 51 |
| 156 | + } |
| 157 | + |
| 158 | + case 50: // 13.2.5.50 Comment end dash state. |
| 159 | + switch nextInputCharacter { |
| 160 | + case '-': |
| 161 | + state = 51 |
| 162 | + default: |
| 163 | + wantBuffer = append(wantBuffer, '-') |
| 164 | + i-- // Reconsume. |
| 165 | + state = 45 |
| 166 | + } |
| 167 | + |
| 168 | + case 51: // 13.2.5.51 Comment end state. |
| 169 | + switch nextInputCharacter { |
| 170 | + case '!': |
| 171 | + state = 52 |
| 172 | + case '-': |
| 173 | + wantBuffer = append(wantBuffer, '-') |
| 174 | + case '>': |
| 175 | + break loop |
| 176 | + default: |
| 177 | + wantBuffer = append(wantBuffer, "--"...) |
| 178 | + i-- // Reconsume. |
| 179 | + state = 45 |
| 180 | + } |
| 181 | + |
| 182 | + case 52: // 13.2.5.52 Comment end bang state. |
| 183 | + switch nextInputCharacter { |
| 184 | + case '-': |
| 185 | + wantBuffer = append(wantBuffer, "--!"...) |
| 186 | + state = 50 |
| 187 | + case '>': |
| 188 | + break loop |
| 189 | + default: |
| 190 | + wantBuffer = append(wantBuffer, "--!"...) |
| 191 | + i-- // Reconsume. |
| 192 | + state = 45 |
| 193 | + } |
| 194 | + |
| 195 | + default: |
| 196 | + t.Fatalf("input=%q: unexpected state %d", b, state) |
| 197 | + } |
| 198 | + } |
| 199 | + |
| 200 | + wantRemainder := "" |
| 201 | + if i < len(b) { |
| 202 | + wantRemainder = string(b[i:]) |
| 203 | + } |
| 204 | + wantComment := string(wantBuffer) |
| 205 | + if (gotComment != wantComment) || (gotRemainder != wantRemainder) { |
| 206 | + t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q", |
| 207 | + b, gotComment, gotRemainder, wantComment, wantRemainder) |
| 208 | + } |
| 209 | +} |
| 210 | + |
| 211 | +// This table below summarizes the HTML-comment-related state machine from |
| 212 | +// 13.2.5.43 "Comment start state" and subsequent sections. |
| 213 | +// https://html.spec.whatwg.org/#comment-start-state |
| 214 | +// |
| 215 | +// Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the |
| 216 | +// initial 13.2.5.1 "Data state": |
| 217 | +// - "<" moves to 13.2.5.6 "Tag open state", |
| 218 | +// - "!" moves to 13.2.5.42 "Markup declaration open state", |
| 219 | +// - "--" moves to 13.2.5.43 "Comment start state". |
| 220 | +// Each of these transitions are the only way to get to the 6/42/43 states. |
| 221 | +// |
| 222 | +// State ! - < > NUL EOF default HTML spec section |
| 223 | +// 43 ... s44 ... s01.T.E0 ... ... r45 13.2.5.43 Comment start state |
| 224 | +// 44 ... s51 ... s01.T.E0 ... T.Z.E1 r45.A- 13.2.5.44 Comment start dash state |
| 225 | +// 45 ... s50 s46.A< ... t45.A?.E2 T.Z.E1 t45.Ax 13.2.5.45 Comment state |
| 226 | +// 46 s47.A! ... t46.A< ... ... ... r45 13.2.5.46 Comment less-than sign state |
| 227 | +// 47 ... s48 ... ... ... ... r45 13.2.5.47 Comment less-than sign bang state |
| 228 | +// 48 ... s49 ... ... ... ... r50 13.2.5.48 Comment less-than sign bang dash state |
| 229 | +// 49 ... ... ... s01.T ... T.Z.E1 r51.E3 13.2.5.49 Comment less-than sign bang dash dash state |
| 230 | +// 50 ... s51 ... ... ... T.Z.E1 r45.A- 13.2.5.50 Comment end dash state |
| 231 | +// 51 s52 t51.A- ... s01.T ... T.Z.E1 r45.A-- 13.2.5.51 Comment end state |
| 232 | +// 52 ... s50.A--! ... s01.T.E4 ... T.Z.E1 r45.A--! 13.2.5.52 Comment end bang state |
| 233 | +// |
| 234 | +// State 43 is the "Comment start state" meaning that we've only seen "<!--" |
| 235 | +// and nothing else. Similarly, state 44 means that we've only seen "<!---", |
| 236 | +// with three dashes, and nothing else. For the other states, we deduce |
| 237 | +// (working backwards) that the immediate prior input must be: |
| 238 | +// - 45 something that's not '-' |
| 239 | +// - 46 "<" |
| 240 | +// - 47 "<!" |
| 241 | +// - 48 "<!-" |
| 242 | +// - 49 "<!--" not including the opening "<!--" |
| 243 | +// - 50 "-" not including the opening "<!--" and also not "--" |
| 244 | +// - 51 "--" not including the opening "<!--" |
| 245 | +// - 52 "--!" |
| 246 | +// |
| 247 | +// The table cell actions: |
| 248 | +// - ... do the default action |
| 249 | +// - A! append "!" to the comment token's data. |
| 250 | +// - A- append "-" to the comment token's data. |
| 251 | +// - A-- append "--" to the comment token's data. |
| 252 | +// - A--! append "--!" to the comment token's data. |
| 253 | +// - A< append "<" to the comment token's data. |
| 254 | +// - A? append "\uFFFD" to the comment token's data. |
| 255 | +// - Ax append the current input character to the comment token's data. |
| 256 | +// - E0 parse error (abrupt-closing-of-empty-comment). |
| 257 | +// - E1 parse error (eof-in-comment). |
| 258 | +// - E2 parse error (unexpected-null-character). |
| 259 | +// - E3 parse error (nested-comment). |
| 260 | +// - E4 parse error (incorrectly-closed-comment). |
| 261 | +// - T emit the current comment token. |
| 262 | +// - Z emit an end-of-file token. |
| 263 | +// - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations). |
| 264 | +// - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations). |
| 265 | +// - sNN switch to the 13.2.5.NN state (after any A* or E* operations). |
| 266 | +// - tNN stay in the 13.2.5.NN state (after any A* or E* operations). |
| 267 | +// |
| 268 | +// The E* actions are called errors in the HTML spec but they are not fatal |
| 269 | +// (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort |
| 270 | +// the parser"). They are warnings that, in practice, browsers simply ignore. |
0 commit comments