From f285923a0c8d5b2ecce67d8eaf9ed26cce16e5b0 Mon Sep 17 00:00:00 2001 From: Justin Taylor <738231+justin-taylor@users.noreply.github.com> Date: Tue, 17 Dec 2024 00:08:53 -0800 Subject: [PATCH] SRT HTML Entities (#122) * Updated escaping to be general for ampersand and escape for srt * Moved html escaping to subtitles.go and updated tests * Move back to using acceptable list of escapable html entities --- srt.go | 4 +-- subtitles.go | 14 +++++++++ subtitles_test.go | 39 ++++++++++++++++++++++++++ testdata/example-in-html-entities.srt | 12 ++++++++ testdata/example-in-html-entities.vtt | 14 +++++++++ testdata/example-out-html-entities.srt | 12 ++++++++ testdata/example-out-html-entities.vtt | 14 +++++++++ webvtt.go | 18 +++--------- webvtt_test.go | 31 -------------------- 9 files changed, 111 insertions(+), 47 deletions(-) create mode 100644 testdata/example-in-html-entities.srt create mode 100644 testdata/example-in-html-entities.vtt create mode 100644 testdata/example-out-html-entities.srt create mode 100644 testdata/example-out-html-entities.vtt diff --git a/srt.go b/srt.go index 7b8d0c9..e9eac43 100644 --- a/srt.go +++ b/srt.go @@ -204,7 +204,7 @@ func parseTextSrt(i string) (o Line) { // Append item o.Items = append(o.Items, LineItem{ InlineStyle: sa, - Text: s, + Text: unescapeHTML(s), }) } } @@ -305,7 +305,7 @@ func (li LineItem) srtBytes() (c []byte) { if pos != 0 { c = append(c, []byte(fmt.Sprintf(`{\an%d}`, pos))...) } - c = append(c, []byte(li.Text)...) + c = append(c, []byte(escapeHTML(li.Text))...) if u { c = append(c, []byte("")...) } diff --git a/subtitles.go b/subtitles.go index ff4811f..ad7844a 100644 --- a/subtitles.go +++ b/subtitles.go @@ -48,6 +48,12 @@ var ( ErrNoSubtitlesToWrite = errors.New("astisub: no subtitles to write") ) +// HTML Escape +var ( + htmlEscaper = strings.NewReplacer("&", "&", "<", "<", "\u00A0", " ") + htmlUnescaper = strings.NewReplacer("&", "&", "<", "<", " ", "\u00A0") +) + // Now allows testing functions using it var Now = func() time.Time { return time.Now() @@ -913,3 +919,11 @@ func htmlTokenAttribute(t *html.Token, key string) *string { return nil } + +func escapeHTML(i string) string { + return htmlEscaper.Replace(i) +} + +func unescapeHTML(i string) string { + return htmlUnescaper.Replace(i) +} diff --git a/subtitles_test.go b/subtitles_test.go index a8012ba..3df1cf6 100644 --- a/subtitles_test.go +++ b/subtitles_test.go @@ -1,6 +1,8 @@ package astisub_test import ( + "bytes" + "os" "testing" "time" @@ -301,3 +303,40 @@ func TestSubtitles_ApplyLinearCorrection(t *testing.T) { require.Equal(t, 11*time.Second, s.Items[2].StartAt) require.Equal(t, 15500*time.Millisecond, s.Items[2].EndAt) } + +func TestHTMLEntity(t *testing.T) { + exts := []string{"srt", "vtt"} + for _, ext := range exts { + // Read input with entities + s, err := astisub.OpenFile("./testdata/example-in-html-entities." + ext) + assert.NoError(t, err) + + assert.Len(t, s.Items, 3) + assert.Equal(t, 331*time.Millisecond, s.Items[0].StartAt) + assert.Equal(t, 3*time.Second+750*time.Millisecond, s.Items[0].EndAt) + assert.Equal(t, "The man in black fled across the desert, \u00A0", s.Items[0].Lines[0].String()) + assert.Equal(t, "& the gunslinger followed.", s.Items[0].Lines[1].String()) + assert.Equal(t, 4*time.Second+101*time.Millisecond, s.Items[1].StartAt) + assert.Equal(t, 5*time.Second+430*time.Millisecond, s.Items[1].EndAt) + assert.Equal(t, "Go,\u00A0then,", s.Items[1].Lines[0].String()) + assert.Equal(t, 6*time.Second+331*time.Millisecond, s.Items[2].StartAt) + assert.Equal(t, 9*time.Second+675*time.Millisecond, s.Items[2].EndAt) + assert.Equal(t, "there are other < worlds than these.", s.Items[2].Lines[0].String()) + + //Write to srt + w := &bytes.Buffer{} + c, err := os.ReadFile("./testdata/example-out-html-entities.srt") + assert.NoError(t, err) + err = s.WriteToSRT(w) + assert.NoError(t, err) + assert.Equal(t, string(c), w.String()) + + //Write WebVTT + w = &bytes.Buffer{} + c, err = os.ReadFile("./testdata/example-out-html-entities.vtt") + assert.NoError(t, err) + err = s.WriteToWebVTT(w) + assert.NoError(t, err) + assert.Equal(t, string(c), w.String()) + } +} diff --git a/testdata/example-in-html-entities.srt b/testdata/example-in-html-entities.srt new file mode 100644 index 0000000..9ff93e3 --- /dev/null +++ b/testdata/example-in-html-entities.srt @@ -0,0 +1,12 @@ +1 +00:00:00,331 --> 00:00:03,750 +The man in black fled across the desert,   +& the gunslinger followed. + +2 +00:00:04,101 --> 00:00:05,430 +Go, then, + +3 +00:00:06,331 --> 00:00:09,675 +there are other < worlds than these. diff --git a/testdata/example-in-html-entities.vtt b/testdata/example-in-html-entities.vtt new file mode 100644 index 0000000..e980105 --- /dev/null +++ b/testdata/example-in-html-entities.vtt @@ -0,0 +1,14 @@ +WEBVTT + +1 +00:00:00.331 --> 00:00:03.750 +The man in black fled across the desert,   +& the gunslinger followed. + +2 +00:00:04.101 --> 00:00:05.430 +Go, then, + +3 +00:00:06.331 --> 00:00:09.675 +there are other < worlds than these. diff --git a/testdata/example-out-html-entities.srt b/testdata/example-out-html-entities.srt new file mode 100644 index 0000000..9ff93e3 --- /dev/null +++ b/testdata/example-out-html-entities.srt @@ -0,0 +1,12 @@ +1 +00:00:00,331 --> 00:00:03,750 +The man in black fled across the desert,   +& the gunslinger followed. + +2 +00:00:04,101 --> 00:00:05,430 +Go, then, + +3 +00:00:06,331 --> 00:00:09,675 +there are other < worlds than these. diff --git a/testdata/example-out-html-entities.vtt b/testdata/example-out-html-entities.vtt new file mode 100644 index 0000000..e980105 --- /dev/null +++ b/testdata/example-out-html-entities.vtt @@ -0,0 +1,14 @@ +WEBVTT + +1 +00:00:00.331 --> 00:00:03.750 +The man in black fled across the desert,   +& the gunslinger followed. + +2 +00:00:04.101 --> 00:00:05.430 +Go, then, + +3 +00:00:06.331 --> 00:00:09.675 +there are other < worlds than these. diff --git a/webvtt.go b/webvtt.go index 89672fe..5883d3d 100644 --- a/webvtt.go +++ b/webvtt.go @@ -36,8 +36,6 @@ var ( bytesWebVTTTimeBoundariesSeparator = []byte(webvttTimeBoundariesSeparator) webVTTRegexpInlineTimestamp = regexp.MustCompile(`<((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})>`) webVTTRegexpTag = regexp.MustCompile(`()`) - webVTTEscaper = strings.NewReplacer("&", "&", "<", "<") - webVTTUnescaper = strings.NewReplacer("&", "&", "<", "<") ) // parseDurationWebVTT parses a .vtt duration @@ -331,14 +329,6 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { return } -func escapeWebVTT(i string) string { - return webVTTEscaper.Replace(i) -} - -func unescapeWebVTT(i string) string { - return webVTTUnescaper.Replace(i) -} - // parseTextWebVTT parses the input line to fill the Line func parseTextWebVTT(i string) (o Line) { // Create tokenizer @@ -421,7 +411,7 @@ func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem) if s := strings.TrimSpace(line); s != "" { return []LineItem{{ InlineStyle: sa, - Text: unescapeWebVTT(s), + Text: unescapeHTML(s), }} } return @@ -431,7 +421,7 @@ func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem) if s := strings.TrimSpace(line[:indexes[0][0]]); s != "" { ret = append(ret, LineItem{ InlineStyle: sa, - Text: unescapeWebVTT(s), + Text: unescapeHTML(s), }) } @@ -455,7 +445,7 @@ func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem) ret = append(ret, LineItem{ InlineStyle: sa, StartAt: t, - Text: unescapeWebVTT(s), + Text: unescapeHTML(s), }) } @@ -671,7 +661,7 @@ func (li LineItem) webVTTBytes() (c []byte) { c = append(c, []byte(tag.startTag())...) } } - c = append(c, []byte(escapeWebVTT(li.Text))...) + c = append(c, []byte(escapeHTML(li.Text))...) if li.InlineStyle != nil { noTags := len(li.InlineStyle.WebVTTTags) for i := noTags - 1; i >= 0; i-- { diff --git a/webvtt_test.go b/webvtt_test.go index 8ab1fd9..a06d783 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -145,37 +145,6 @@ Evening. `, b.String()) } -func TestWebVTTEscape(t *testing.T) { - testData := `WEBVTT - - 00:01:00.000 --> 00:02:00.000 - Sentence with an & in the middle - - 00:02:00.000 --> 00:03:00.000 - Sentence with an < in the middle` - - s, err := astisub.ReadFromWebVTT(strings.NewReader(testData)) - require.NoError(t, err) - - require.Len(t, s.Items, 2) - require.Equal(t, "Sentence with an & in the middle", s.Items[0].String()) - require.Equal(t, "Sentence with an < in the middle", s.Items[1].String()) - - b := &bytes.Buffer{} - err = s.WriteToWebVTT(b) - require.NoError(t, err) - require.Equal(t, `WEBVTT - -1 -00:01:00.000 --> 00:02:00.000 -Sentence with an & in the middle - -2 -00:02:00.000 --> 00:03:00.000 -Sentence with an < in the middle -`, b.String()) -} - func TestWebVTTTags(t *testing.T) { testData := `WEBVTT