forked from jbendotnet/html-strip-tags-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
strip.go
3712 lines (3426 loc) · 111 KB
/
strip.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strip
import (
"bytes"
"encoding/json"
"fmt"
"html"
"io"
"io/ioutil"
"path/filepath"
"reflect"
"strings"
"sync"
"text/template"
"text/template/parse"
"unicode"
"unicode/utf8"
)
// htmlNospaceEscaper escapes for inclusion in unquoted attribute values.
func htmlNospaceEscaper(args ...interface{}) string {
s, t := stringify(args...)
if t == contentTypeHTML {
return htmlReplacer(StripTags(s), htmlNospaceNormReplacementTable, false)
}
return htmlReplacer(s, htmlNospaceReplacementTable, false)
}
// attrEscaper escapes for inclusion in quoted attribute values.
func attrEscaper(args ...interface{}) string {
s, t := stringify(args...)
if t == contentTypeHTML {
return htmlReplacer(StripTags(s), htmlNormReplacementTable, true)
}
return htmlReplacer(s, htmlReplacementTable, true)
}
// rcdataEscaper escapes for inclusion in an RCDATA element body.
func rcdataEscaper(args ...interface{}) string {
s, t := stringify(args...)
if t == contentTypeHTML {
return htmlReplacer(s, htmlNormReplacementTable, true)
}
return htmlReplacer(s, htmlReplacementTable, true)
}
// htmlEscaper escapes for inclusion in HTML text.
func htmlEscaper(args ...interface{}) string {
s, t := stringify(args...)
if t == contentTypeHTML {
return s
}
return htmlReplacer(s, htmlReplacementTable, true)
}
// htmlReplacementTable contains the runes that need to be escaped
// inside a quoted attribute value or in a text node.
var htmlReplacementTable = []string{
// http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
// U+0000 NULL Parse error. Append a U+FFFD REPLACEMENT
// CHARACTER character to the current attribute's value.
// "
// and similarly
// http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
0: "\uFFFD",
'"': """,
'&': "&",
'\'': "'",
'+': "+",
'<': "<",
'>': ">",
}
// htmlNormReplacementTable is like htmlReplacementTable but without '&' to
// avoid over-encoding existing entities.
var htmlNormReplacementTable = []string{
0: "\uFFFD",
'"': """,
'\'': "'",
'+': "+",
'<': "<",
'>': ">",
}
// htmlNospaceReplacementTable contains the runes that need to be escaped
// inside an unquoted attribute value.
// The set of runes escaped is the union of the HTML specials and
// those determined by running the JS below in browsers:
// <div id=d></div>
// <script>(function () {
// var a = [], d = document.getElementById("d"), i, c, s;
// for (i = 0; i < 0x10000; ++i) {
// c = String.fromCharCode(i);
// d.innerHTML = "<span title=" + c + "lt" + c + "></span>"
// s = d.getElementsByTagName("SPAN")[0];
// if (!s || s.title !== c + "lt" + c) { a.push(i.toString(16)); }
// }
// document.write(a.join(", "));
// })()</script>
var htmlNospaceReplacementTable = []string{
0: "�",
'\t': "	",
'\n': " ",
'\v': "",
'\f': "",
'\r': " ",
' ': " ",
'"': """,
'&': "&",
'\'': "'",
'+': "+",
'<': "<",
'=': "=",
'>': ">",
// A parse error in the attribute value (unquoted) and
// before attribute value states.
// Treated as a quoting character by IE.
'`': "`",
}
// htmlNospaceNormReplacementTable is like htmlNospaceReplacementTable but
// without '&' to avoid over-encoding existing entities.
var htmlNospaceNormReplacementTable = []string{
0: "�",
'\t': "	",
'\n': " ",
'\v': "",
'\f': "",
'\r': " ",
' ': " ",
'"': """,
'\'': "'",
'+': "+",
'<': "<",
'=': "=",
'>': ">",
// A parse error in the attribute value (unquoted) and
// before attribute value states.
// Treated as a quoting character by IE.
'`': "`",
}
// htmlReplacer returns s with runes replaced according to replacementTable
// and when badRunes is true, certain bad runes are allowed through unescaped.
func htmlReplacer(s string, replacementTable []string, badRunes bool) string {
written, b := 0, new(bytes.Buffer)
for i, r := range s {
if int(r) < len(replacementTable) {
if repl := replacementTable[r]; len(repl) != 0 {
b.WriteString(s[written:i])
b.WriteString(repl)
// Valid as long as replacementTable doesn't
// include anything above 0x7f.
written = i + utf8.RuneLen(r)
}
} else if badRunes {
// No-op.
// IE does not allow these ranges in unquoted attrs.
} else if 0xfdd0 <= r && r <= 0xfdef || 0xfff0 <= r && r <= 0xffff {
fmt.Fprintf(b, "%s&#x%x;", s[written:i], r)
written = i + utf8.RuneLen(r)
}
}
if written == 0 {
return s
}
b.WriteString(s[written:])
return b.String()
}
// stripTags takes a snippet of HTML and returns only the text content.
// For example, `<b>¡Hi!</b> <script>...</script>` -> `¡Hi! `.
func StripTags(html string) string {
var b bytes.Buffer
s, c, i, allText := []byte(html), context{}, 0, true
// Using the transition funcs helps us avoid mangling
// `<div title="1>2">` or `I <3 Ponies!`.
for i != len(s) {
if c.delim == delimNone {
st := c.state
// Use RCDATA instead of parsing into JS or CSS styles.
if c.element != elementNone && !isInTag(st) {
st = stateRCDATA
}
d, nread := transitionFunc[st](c, s[i:])
i1 := i + nread
if c.state == stateText || c.state == stateRCDATA {
// Emit text up to the start of the tag or comment.
j := i1
if d.state != c.state {
for j1 := j - 1; j1 >= i; j1-- {
if s[j1] == '<' {
j = j1
break
}
}
}
b.Write(s[i:j])
} else {
allText = false
}
c, i = d, i1
continue
}
i1 := i + bytes.IndexAny(s[i:], delimEnds[c.delim])
if i1 < i {
break
}
if c.delim != delimSpaceOrTagEnd {
// Consume any quote.
i1++
}
c, i = context{state: stateTag, element: c.element}, i1
}
if allText {
return html
} else if c.state == stateText || c.state == stateRCDATA {
b.Write(s[i:])
}
return b.String()
}
// htmlNameFilter accepts valid parts of an HTML attribute or tag name or
// a known-safe HTML attribute.
func htmlNameFilter(args ...interface{}) string {
s, t := stringify(args...)
if t == contentTypeHTMLAttr {
return s
}
if len(s) == 0 {
// Avoid violation of structure preservation.
// <input checked {{.K}}={{.V}}>.
// Without this, if .K is empty then .V is the value of
// checked, but otherwise .V is the value of the attribute
// named .K.
return filterFailsafe
}
s = strings.ToLower(s)
if t := attrType(s); t != contentTypePlain {
// TODO: Split attr and element name part filters so we can whitelist
// attributes.
return filterFailsafe
}
for _, r := range s {
switch {
case '0' <= r && r <= '9':
case 'a' <= r && r <= 'z':
default:
return filterFailsafe
}
}
return s
}
// commentEscaper returns the empty string regardless of input.
// Comment content does not correspond to any parsed structure or
// human-readable content, so the simplest and most secure policy is to drop
// content interpolated into comments.
// This approach is equally valid whether or not static comment content is
// removed from the template.
func commentEscaper(args ...interface{}) string {
return ""
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// context describes the state an HTML parser must be in when it reaches the
// portion of HTML produced by evaluating a particular template node.
//
// The zero value of type context is the start context for a template that
// produces an HTML fragment as defined at
// http://www.w3.org/TR/html5/syntax.html#the-end
// where the context element is null.
type context struct {
state state
delim delim
urlPart urlPart
jsCtx jsCtx
attr attr
element element
err *Error
}
func (c context) String() string {
return fmt.Sprintf("{%v %v %v %v %v %v %v}", c.state, c.delim, c.urlPart, c.jsCtx, c.attr, c.element, c.err)
}
// eq reports whether two contexts are equal.
func (c context) eq(d context) bool {
return c.state == d.state &&
c.delim == d.delim &&
c.urlPart == d.urlPart &&
c.jsCtx == d.jsCtx &&
c.attr == d.attr &&
c.element == d.element &&
c.err == d.err
}
// mangle produces an identifier that includes a suffix that distinguishes it
// from template names mangled with different contexts.
func (c context) mangle(templateName string) string {
// The mangled name for the default context is the input templateName.
if c.state == stateText {
return templateName
}
s := templateName + "$htmltemplate_" + c.state.String()
if c.delim != 0 {
s += "_" + c.delim.String()
}
if c.urlPart != 0 {
s += "_" + c.urlPart.String()
}
if c.jsCtx != 0 {
s += "_" + c.jsCtx.String()
}
if c.attr != 0 {
s += "_" + c.attr.String()
}
if c.element != 0 {
s += "_" + c.element.String()
}
return s
}
// state describes a high-level HTML parser state.
//
// It bounds the top of the element stack, and by extension the HTML insertion
// mode, but also contains state that does not correspond to anything in the
// HTML5 parsing algorithm because a single token production in the HTML
// grammar may contain embedded actions in a template. For instance, the quoted
// HTML attribute produced by
// <div title="Hello {{.World}}">
// is a single token in HTML's grammar but in a template spans several nodes.
type state uint8
const (
// stateText is parsed character data. An HTML parser is in
// this state when its parse position is outside an HTML tag,
// directive, comment, and special element body.
stateText state = iota
// stateTag occurs before an HTML attribute or the end of a tag.
stateTag
// stateAttrName occurs inside an attribute name.
// It occurs between the ^'s in ` ^name^ = value`.
stateAttrName
// stateAfterName occurs after an attr name has ended but before any
// equals sign. It occurs between the ^'s in ` name^ ^= value`.
stateAfterName
// stateBeforeValue occurs after the equals sign but before the value.
// It occurs between the ^'s in ` name =^ ^value`.
stateBeforeValue
// stateHTMLCmt occurs inside an <!-- HTML comment -->.
stateHTMLCmt
// stateRCDATA occurs inside an RCDATA element (<textarea> or <title>)
// as described at http://www.w3.org/TR/html5/syntax.html#elements-0
stateRCDATA
// stateAttr occurs inside an HTML attribute whose content is text.
stateAttr
// stateURL occurs inside an HTML attribute whose content is a URL.
stateURL
// stateJS occurs inside an event handler or script element.
stateJS
// stateJSDqStr occurs inside a JavaScript double quoted string.
stateJSDqStr
// stateJSSqStr occurs inside a JavaScript single quoted string.
stateJSSqStr
// stateJSRegexp occurs inside a JavaScript regexp literal.
stateJSRegexp
// stateJSBlockCmt occurs inside a JavaScript /* block comment */.
stateJSBlockCmt
// stateJSLineCmt occurs inside a JavaScript // line comment.
stateJSLineCmt
// stateCSS occurs inside a <style> element or style attribute.
stateCSS
// stateCSSDqStr occurs inside a CSS double quoted string.
stateCSSDqStr
// stateCSSSqStr occurs inside a CSS single quoted string.
stateCSSSqStr
// stateCSSDqURL occurs inside a CSS double quoted url("...").
stateCSSDqURL
// stateCSSSqURL occurs inside a CSS single quoted url('...').
stateCSSSqURL
// stateCSSURL occurs inside a CSS unquoted url(...).
stateCSSURL
// stateCSSBlockCmt occurs inside a CSS /* block comment */.
stateCSSBlockCmt
// stateCSSLineCmt occurs inside a CSS // line comment.
stateCSSLineCmt
// stateError is an infectious error state outside any valid
// HTML/CSS/JS construct.
stateError
)
var stateNames = [...]string{
stateText: "stateText",
stateTag: "stateTag",
stateAttrName: "stateAttrName",
stateAfterName: "stateAfterName",
stateBeforeValue: "stateBeforeValue",
stateHTMLCmt: "stateHTMLCmt",
stateRCDATA: "stateRCDATA",
stateAttr: "stateAttr",
stateURL: "stateURL",
stateJS: "stateJS",
stateJSDqStr: "stateJSDqStr",
stateJSSqStr: "stateJSSqStr",
stateJSRegexp: "stateJSRegexp",
stateJSBlockCmt: "stateJSBlockCmt",
stateJSLineCmt: "stateJSLineCmt",
stateCSS: "stateCSS",
stateCSSDqStr: "stateCSSDqStr",
stateCSSSqStr: "stateCSSSqStr",
stateCSSDqURL: "stateCSSDqURL",
stateCSSSqURL: "stateCSSSqURL",
stateCSSURL: "stateCSSURL",
stateCSSBlockCmt: "stateCSSBlockCmt",
stateCSSLineCmt: "stateCSSLineCmt",
stateError: "stateError",
}
func (s state) String() string {
if int(s) < len(stateNames) {
return stateNames[s]
}
return fmt.Sprintf("illegal state %d", int(s))
}
// isComment is true for any state that contains content meant for template
// authors & maintainers, not for end-users or machines.
func isComment(s state) bool {
switch s {
case stateHTMLCmt, stateJSBlockCmt, stateJSLineCmt, stateCSSBlockCmt, stateCSSLineCmt:
return true
}
return false
}
// isInTag return whether s occurs solely inside an HTML tag.
func isInTag(s state) bool {
switch s {
case stateTag, stateAttrName, stateAfterName, stateBeforeValue, stateAttr:
return true
}
return false
}
// delim is the delimiter that will end the current HTML attribute.
type delim uint8
const (
// delimNone occurs outside any attribute.
delimNone delim = iota
// delimDoubleQuote occurs when a double quote (") closes the attribute.
delimDoubleQuote
// delimSingleQuote occurs when a single quote (') closes the attribute.
delimSingleQuote
// delimSpaceOrTagEnd occurs when a space or right angle bracket (>)
// closes the attribute.
delimSpaceOrTagEnd
)
var delimNames = [...]string{
delimNone: "delimNone",
delimDoubleQuote: "delimDoubleQuote",
delimSingleQuote: "delimSingleQuote",
delimSpaceOrTagEnd: "delimSpaceOrTagEnd",
}
func (d delim) String() string {
if int(d) < len(delimNames) {
return delimNames[d]
}
return fmt.Sprintf("illegal delim %d", int(d))
}
// urlPart identifies a part in an RFC 3986 hierarchical URL to allow different
// encoding strategies.
type urlPart uint8
const (
// urlPartNone occurs when not in a URL, or possibly at the start:
// ^ in "^http://auth/path?k=v#frag".
urlPartNone urlPart = iota
// urlPartPreQuery occurs in the scheme, authority, or path; between the
// ^s in "h^ttp://auth/path^?k=v#frag".
urlPartPreQuery
// urlPartQueryOrFrag occurs in the query portion between the ^s in
// "http://auth/path?^k=v#frag^".
urlPartQueryOrFrag
// urlPartUnknown occurs due to joining of contexts both before and
// after the query separator.
urlPartUnknown
)
var urlPartNames = [...]string{
urlPartNone: "urlPartNone",
urlPartPreQuery: "urlPartPreQuery",
urlPartQueryOrFrag: "urlPartQueryOrFrag",
urlPartUnknown: "urlPartUnknown",
}
func (u urlPart) String() string {
if int(u) < len(urlPartNames) {
return urlPartNames[u]
}
return fmt.Sprintf("illegal urlPart %d", int(u))
}
// jsCtx determines whether a '/' starts a regular expression literal or a
// division operator.
type jsCtx uint8
const (
// jsCtxRegexp occurs where a '/' would start a regexp literal.
jsCtxRegexp jsCtx = iota
// jsCtxDivOp occurs where a '/' would start a division operator.
jsCtxDivOp
// jsCtxUnknown occurs where a '/' is ambiguous due to context joining.
jsCtxUnknown
)
func (c jsCtx) String() string {
switch c {
case jsCtxRegexp:
return "jsCtxRegexp"
case jsCtxDivOp:
return "jsCtxDivOp"
case jsCtxUnknown:
return "jsCtxUnknown"
}
return fmt.Sprintf("illegal jsCtx %d", int(c))
}
// element identifies the HTML element when inside a start tag or special body.
// Certain HTML element (for example <script> and <style>) have bodies that are
// treated differently from stateText so the element type is necessary to
// transition into the correct context at the end of a tag and to identify the
// end delimiter for the body.
type element uint8
const (
// elementNone occurs outside a special tag or special element body.
elementNone element = iota
// elementScript corresponds to the raw text <script> element.
elementScript
// elementStyle corresponds to the raw text <style> element.
elementStyle
// elementTextarea corresponds to the RCDATA <textarea> element.
elementTextarea
// elementTitle corresponds to the RCDATA <title> element.
elementTitle
)
var elementNames = [...]string{
elementNone: "elementNone",
elementScript: "elementScript",
elementStyle: "elementStyle",
elementTextarea: "elementTextarea",
elementTitle: "elementTitle",
}
func (e element) String() string {
if int(e) < len(elementNames) {
return elementNames[e]
}
return fmt.Sprintf("illegal element %d", int(e))
}
// attr identifies the most recent HTML attribute when inside a start tag.
type attr uint8
const (
// attrNone corresponds to a normal attribute or no attribute.
attrNone attr = iota
// attrScript corresponds to an event handler attribute.
attrScript
// attrStyle corresponds to the style attribute whose value is CSS.
attrStyle
// attrURL corresponds to an attribute whose value is a URL.
attrURL
)
var attrNames = [...]string{
attrNone: "attrNone",
attrScript: "attrScript",
attrStyle: "attrStyle",
attrURL: "attrURL",
}
func (a attr) String() string {
if int(a) < len(attrNames) {
return attrNames[a]
}
return fmt.Sprintf("illegal attr %d", int(a))
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// escapeTemplates rewrites the named templates, which must be
// associated with t, to guarantee that the output of any of the named
// templates is properly escaped. Names should include the names of
// all templates that might be Executed but need not include helper
// templates. If no error is returned, then the named templates have
// been modified. Otherwise the named templates have been rendered
// unusable.
func escapeTemplates(tmpl *Template, names ...string) error {
e := newEscaper(tmpl)
for _, name := range names {
c, _ := e.escapeTree(context{}, name, 0)
var err error
if c.err != nil {
err, c.err.Name = c.err, name
} else if c.state != stateText {
err = &Error{ErrEndContext, name, 0, fmt.Sprintf("ends in a non-text context: %v", c)}
}
if err != nil {
// Prevent execution of unsafe templates.
for _, name := range names {
if t := tmpl.set[name]; t != nil {
t.text.Tree = nil
t.Tree = nil
}
}
return err
}
}
e.commit()
for _, name := range names {
if t := tmpl.set[name]; t != nil {
t.escaped = true
t.Tree = t.text.Tree
}
}
return nil
}
// funcMap maps command names to functions that render their inputs safe.
var funcMap = template.FuncMap{
"html_template_attrescaper": attrEscaper,
"html_template_commentescaper": commentEscaper,
"html_template_cssescaper": cssEscaper,
"html_template_cssvaluefilter": cssValueFilter,
"html_template_htmlnamefilter": htmlNameFilter,
"html_template_htmlescaper": htmlEscaper,
"html_template_jsregexpescaper": jsRegexpEscaper,
"html_template_jsstrescaper": jsStrEscaper,
"html_template_jsvalescaper": jsValEscaper,
"html_template_nospaceescaper": htmlNospaceEscaper,
"html_template_rcdataescaper": rcdataEscaper,
"html_template_urlescaper": urlEscaper,
"html_template_urlfilter": urlFilter,
"html_template_urlnormalizer": urlNormalizer,
}
// equivEscapers matches contextual escapers to equivalent template builtins.
var equivEscapers = map[string]string{
"html_template_attrescaper": "html",
"html_template_htmlescaper": "html",
"html_template_nospaceescaper": "html",
"html_template_rcdataescaper": "html",
"html_template_urlescaper": "urlquery",
"html_template_urlnormalizer": "urlquery",
}
// escaper collects type inferences about templates and changes needed to make
// templates injection safe.
type escaper struct {
tmpl *Template
// output[templateName] is the output context for a templateName that
// has been mangled to include its input context.
output map[string]context
// derived[c.mangle(name)] maps to a template derived from the template
// named name templateName for the start context c.
derived map[string]*template.Template
// called[templateName] is a set of called mangled template names.
called map[string]bool
// xxxNodeEdits are the accumulated edits to apply during commit.
// Such edits are not applied immediately in case a template set
// executes a given template in different escaping contexts.
actionNodeEdits map[*parse.ActionNode][]string
templateNodeEdits map[*parse.TemplateNode]string
textNodeEdits map[*parse.TextNode][]byte
}
// newEscaper creates a blank escaper for the given set.
func newEscaper(t *Template) *escaper {
return &escaper{
t,
map[string]context{},
map[string]*template.Template{},
map[string]bool{},
map[*parse.ActionNode][]string{},
map[*parse.TemplateNode]string{},
map[*parse.TextNode][]byte{},
}
}
// filterFailsafe is an innocuous word that is emitted in place of unsafe values
// by sanitizer functions. It is not a keyword in any programming language,
// contains no special characters, is not empty, and when it appears in output
// it is distinct enough that a developer can find the source of the problem
// via a search engine.
const filterFailsafe = "ZgotmplZ"
// escape escapes a template node.
func (e *escaper) escape(c context, n parse.Node) context {
switch n := n.(type) {
case *parse.ActionNode:
return e.escapeAction(c, n)
case *parse.IfNode:
return e.escapeBranch(c, &n.BranchNode, "if")
case *parse.ListNode:
return e.escapeList(c, n)
case *parse.RangeNode:
return e.escapeBranch(c, &n.BranchNode, "range")
case *parse.TemplateNode:
return e.escapeTemplate(c, n)
case *parse.TextNode:
return e.escapeText(c, n)
case *parse.WithNode:
return e.escapeBranch(c, &n.BranchNode, "with")
}
panic("escaping " + n.String() + " is unimplemented")
}
// escapeAction escapes an action template node.
func (e *escaper) escapeAction(c context, n *parse.ActionNode) context {
if len(n.Pipe.Decl) != 0 {
// A local variable assignment, not an interpolation.
return c
}
c = nudge(c)
s := make([]string, 0, 3)
switch c.state {
case stateError:
return c
case stateURL, stateCSSDqStr, stateCSSSqStr, stateCSSDqURL, stateCSSSqURL, stateCSSURL:
switch c.urlPart {
case urlPartNone:
s = append(s, "html_template_urlfilter")
fallthrough
case urlPartPreQuery:
switch c.state {
case stateCSSDqStr, stateCSSSqStr:
s = append(s, "html_template_cssescaper")
default:
s = append(s, "html_template_urlnormalizer")
}
case urlPartQueryOrFrag:
s = append(s, "html_template_urlescaper")
case urlPartUnknown:
return context{
state: stateError,
err: errorf(ErrAmbigContext, n.Line, "%s appears in an ambiguous URL context", n),
}
default:
panic(c.urlPart.String())
}
case stateJS:
s = append(s, "html_template_jsvalescaper")
// A slash after a value starts a div operator.
c.jsCtx = jsCtxDivOp
case stateJSDqStr, stateJSSqStr:
s = append(s, "html_template_jsstrescaper")
case stateJSRegexp:
s = append(s, "html_template_jsregexpescaper")
case stateCSS:
s = append(s, "html_template_cssvaluefilter")
case stateText:
s = append(s, "html_template_htmlescaper")
case stateRCDATA:
s = append(s, "html_template_rcdataescaper")
case stateAttr:
// Handled below in delim check.
case stateAttrName, stateTag:
c.state = stateAttrName
s = append(s, "html_template_htmlnamefilter")
default:
if isComment(c.state) {
s = append(s, "html_template_commentescaper")
} else {
panic("unexpected state " + c.state.String())
}
}
switch c.delim {
case delimNone:
// No extra-escaping needed for raw text content.
case delimSpaceOrTagEnd:
s = append(s, "html_template_nospaceescaper")
default:
s = append(s, "html_template_attrescaper")
}
e.editActionNode(n, s)
return c
}
// allIdents returns the names of the identifiers under the Ident field of the node,
// which might be a singleton (Identifier) or a slice (Field).
func allIdents(node parse.Node) []string {
switch node := node.(type) {
case *parse.IdentifierNode:
return []string{node.Ident}
case *parse.FieldNode:
return node.Ident
}
panic("unidentified node type in allIdents")
}
// ensurePipelineContains ensures that the pipeline has commands with
// the identifiers in s in order.
// If the pipeline already has some of the sanitizers, do not interfere.
// For example, if p is (.X | html) and s is ["escapeJSVal", "html"] then it
// has one matching, "html", and one to insert, "escapeJSVal", to produce
// (.X | escapeJSVal | html).
func ensurePipelineContains(p *parse.PipeNode, s []string) {
if len(s) == 0 {
return
}
n := len(p.Cmds)
// Find the identifiers at the end of the command chain.
idents := p.Cmds
for i := n - 1; i >= 0; i-- {
if cmd := p.Cmds[i]; len(cmd.Args) != 0 {
if _, ok := cmd.Args[0].(*parse.IdentifierNode); ok {
continue
}
}
idents = p.Cmds[i+1:]
}
dups := 0
for _, idNode := range idents {
for _, ident := range allIdents(idNode.Args[0]) {
if escFnsEq(s[dups], ident) {
dups++
if dups == len(s) {
return
}
}
}
}
newCmds := make([]*parse.CommandNode, n-len(idents), n+len(s)-dups)
copy(newCmds, p.Cmds)
// Merge existing identifier commands with the sanitizers needed.
for _, idNode := range idents {
pos := idNode.Args[0].Position()
for _, ident := range allIdents(idNode.Args[0]) {
i := indexOfStr(ident, s, escFnsEq)
if i != -1 {
for _, name := range s[:i] {
newCmds = appendCmd(newCmds, newIdentCmd(name, pos))
}
s = s[i+1:]
}
}
newCmds = appendCmd(newCmds, idNode)
}
// Create any remaining sanitizers.
for _, name := range s {
newCmds = appendCmd(newCmds, newIdentCmd(name, p.Position()))
}
p.Cmds = newCmds
}
// redundantFuncs[a][b] implies that funcMap[b](funcMap[a](x)) == funcMap[a](x)
// for all x.
var redundantFuncs = map[string]map[string]bool{
"html_template_commentescaper": {
"html_template_attrescaper": true,
"html_template_nospaceescaper": true,
"html_template_htmlescaper": true,
},
"html_template_cssescaper": {
"html_template_attrescaper": true,
},
"html_template_jsregexpescaper": {
"html_template_attrescaper": true,
},
"html_template_jsstrescaper": {
"html_template_attrescaper": true,
},
"html_template_urlescaper": {
"html_template_urlnormalizer": true,
},
}
// appendCmd appends the given command to the end of the command pipeline
// unless it is redundant with the last command.
func appendCmd(cmds []*parse.CommandNode, cmd *parse.CommandNode) []*parse.CommandNode {
if n := len(cmds); n != 0 {
last, ok := cmds[n-1].Args[0].(*parse.IdentifierNode)
next, _ := cmd.Args[0].(*parse.IdentifierNode)
if ok && redundantFuncs[last.Ident][next.Ident] {
return cmds
}
}
return append(cmds, cmd)
}
// indexOfStr is the first i such that eq(s, strs[i]) or -1 if s was not found.
func indexOfStr(s string, strs []string, eq func(a, b string) bool) int {
for i, t := range strs {
if eq(s, t) {
return i
}
}
return -1
}
// escFnsEq reports whether the two escaping functions are equivalent.
func escFnsEq(a, b string) bool {
if e := equivEscapers[a]; e != "" {
a = e
}
if e := equivEscapers[b]; e != "" {
b = e
}
return a == b
}
// newIdentCmd produces a command containing a single identifier node.
func newIdentCmd(identifier string, pos parse.Pos) *parse.CommandNode {
return &parse.CommandNode{
NodeType: parse.NodeCommand,
Args: []parse.Node{parse.NewIdentifier(identifier).SetPos(pos)},
}
}
// nudge returns the context that would result from following empty string
// transitions from the input context.
// For example, parsing:
// `<a href=`
// will end in context{stateBeforeValue, attrURL}, but parsing one extra rune:
// `<a href=x`
// will end in context{stateURL, delimSpaceOrTagEnd, ...}.
// There are two transitions that happen when the 'x' is seen:
// (1) Transition from a before-value state to a start-of-value state without
// consuming any character.
// (2) Consume 'x' and transition past the first value character.
// In this case, nudging produces the context after (1) happens.
func nudge(c context) context {
switch c.state {
case stateTag:
// In `<foo {{.}}`, the action should emit an attribute.
c.state = stateAttrName
case stateBeforeValue:
// In `<foo bar={{.}}`, the action is an undelimited value.
c.state, c.delim, c.attr = attrStartStates[c.attr], delimSpaceOrTagEnd, attrNone
case stateAfterName:
// In `<foo bar {{.}}`, the action is an attribute name.
c.state, c.attr = stateAttrName, attrNone
}
return c
}
// join joins the two contexts of a branch template node. The result is an
// error context if either of the input contexts are error contexts, or if the
// the input contexts differ.
func join(a, b context, line int, nodeName string) context {
if a.state == stateError {
return a
}
if b.state == stateError {
return b
}
if a.eq(b) {
return a
}
c := a
c.urlPart = b.urlPart
if c.eq(b) {
// The contexts differ only by urlPart.
c.urlPart = urlPartUnknown
return c
}
c = a
c.jsCtx = b.jsCtx
if c.eq(b) {
// The contexts differ only by jsCtx.
c.jsCtx = jsCtxUnknown
return c
}
// Allow a nudged context to join with an unnudged one.
// This means that
// <p title={{if .C}}{{.}}{{end}}
// ends in an unquoted value state even though the else branch
// ends in stateBeforeValue.
if c, d := nudge(a), nudge(b); !(c.eq(a) && d.eq(b)) {
if e := join(c, d, line, nodeName); e.state != stateError {
return e