Skip to content

Commit

Permalink
SRT HTML Entities (#122)
Browse files Browse the repository at this point in the history
* Updated escaping to be general for ampersand and escape for srt

* Moved html escaping to subtitles.go and updated tests

* Move back to using acceptable list of escapable html entities
  • Loading branch information
justin-taylor authored Dec 17, 2024
1 parent cba5e0f commit f285923
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 47 deletions.
4 changes: 2 additions & 2 deletions srt.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func parseTextSrt(i string) (o Line) {
// Append item
o.Items = append(o.Items, LineItem{
InlineStyle: sa,
Text: s,
Text: unescapeHTML(s),
})
}
}
Expand Down Expand Up @@ -305,7 +305,7 @@ func (li LineItem) srtBytes() (c []byte) {
if pos != 0 {
c = append(c, []byte(fmt.Sprintf(`{\an%d}`, pos))...)
}
c = append(c, []byte(li.Text)...)
c = append(c, []byte(escapeHTML(li.Text))...)
if u {
c = append(c, []byte("</u>")...)
}
Expand Down
14 changes: 14 additions & 0 deletions subtitles.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ var (
ErrNoSubtitlesToWrite = errors.New("astisub: no subtitles to write")
)

// HTML entity replacers backing escapeHTML and unescapeHTML. Only "&", "<"
// and the non-breaking space (U+00A0) are translated; any other character or
// entity is passed through unchanged.
var (
	// htmlEscaper rewrites the raw characters into their entity form
	htmlEscaper = strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		"\u00A0", "&nbsp;",
	)
	// htmlUnescaper rewrites the entities back into the raw characters
	htmlUnescaper = strings.NewReplacer(
		"&amp;", "&",
		"&lt;", "<",
		"&nbsp;", "\u00A0",
	)
)

// Now allows testing functions using it
var Now = func() time.Time {
return time.Now()
Expand Down Expand Up @@ -913,3 +919,11 @@ func htmlTokenAttribute(t *html.Token, key string) *string {

return nil
}

// escapeHTML returns i with every substring known to htmlEscaper replaced by
// its HTML entity.
func escapeHTML(i string) (o string) {
	o = htmlEscaper.Replace(i)
	return
}

// unescapeHTML returns i with every HTML entity known to htmlUnescaper
// replaced by the character it stands for.
func unescapeHTML(i string) (o string) {
	o = htmlUnescaper.Replace(i)
	return
}
39 changes: 39 additions & 0 deletions subtitles_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package astisub_test

import (
"bytes"
"os"
"testing"
"time"

Expand Down Expand Up @@ -301,3 +303,40 @@ func TestSubtitles_ApplyLinearCorrection(t *testing.T) {
require.Equal(t, 11*time.Second, s.Items[2].StartAt)
require.Equal(t, 15500*time.Millisecond, s.Items[2].EndAt)
}

// TestHTMLEntity checks that the HTML entities found in the SRT and WebVTT
// input fixtures are unescaped when the files are read, and escaped again when
// the subtitles are written back out as SRT and as WebVTT.
func TestHTMLEntity(t *testing.T) {
	for _, ext := range []string{"srt", "vtt"} {
		// One subtest per input format, so a failure names the fixture it came from
		t.Run(ext, func(t *testing.T) {
			// Read input with entities
			s, err := astisub.OpenFile("./testdata/example-in-html-entities." + ext)
			// Bail out on failure: s must not be used when the file failed to open
			if !assert.NoError(t, err) {
				return
			}

			// Bail out on failure: the assertions below index the first three
			// items and would panic on a shorter slice
			if !assert.Len(t, s.Items, 3) {
				return
			}
			assert.Equal(t, 331*time.Millisecond, s.Items[0].StartAt)
			assert.Equal(t, 3*time.Second+750*time.Millisecond, s.Items[0].EndAt)
			assert.Equal(t, "The man in black fled across the desert, \u00A0", s.Items[0].Lines[0].String())
			assert.Equal(t, "& the gunslinger followed.", s.Items[0].Lines[1].String())
			assert.Equal(t, 4*time.Second+101*time.Millisecond, s.Items[1].StartAt)
			assert.Equal(t, 5*time.Second+430*time.Millisecond, s.Items[1].EndAt)
			assert.Equal(t, "Go,\u00A0then,", s.Items[1].Lines[0].String())
			assert.Equal(t, 6*time.Second+331*time.Millisecond, s.Items[2].StartAt)
			assert.Equal(t, 9*time.Second+675*time.Millisecond, s.Items[2].EndAt)
			assert.Equal(t, "there are other < worlds than these.", s.Items[2].Lines[0].String())

			// Write to SRT and compare against the expected fixture
			c, err := os.ReadFile("./testdata/example-out-html-entities.srt")
			if assert.NoError(t, err) {
				w := &bytes.Buffer{}
				assert.NoError(t, s.WriteToSRT(w))
				assert.Equal(t, string(c), w.String())
			}

			// Write to WebVTT and compare against the expected fixture
			c, err = os.ReadFile("./testdata/example-out-html-entities.vtt")
			if assert.NoError(t, err) {
				w := &bytes.Buffer{}
				assert.NoError(t, s.WriteToWebVTT(w))
				assert.Equal(t, string(c), w.String())
			}
		})
	}
}
12 changes: 12 additions & 0 deletions testdata/example-in-html-entities.srt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1
00:00:00,331 --> 00:00:03,750
The man in black fled across the desert, &nbsp;
&amp; the gunslinger followed.

2
00:00:04,101 --> 00:00:05,430
Go,&nbsp;then,

3
00:00:06,331 --> 00:00:09,675
there are other &lt; worlds than these.
14 changes: 14 additions & 0 deletions testdata/example-in-html-entities.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
WEBVTT
1
00:00:00.331 --> 00:00:03.750
The man in black fled across the desert, &nbsp;
&amp; the gunslinger followed.

2
00:00:04.101 --> 00:00:05.430
Go,&nbsp;then,

3
00:00:06.331 --> 00:00:09.675
there are other &lt; worlds than these.
12 changes: 12 additions & 0 deletions testdata/example-out-html-entities.srt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
1
00:00:00,331 --> 00:00:03,750
The man in black fled across the desert, &nbsp;
&amp; the gunslinger followed.

2
00:00:04,101 --> 00:00:05,430
Go,&nbsp;then,

3
00:00:06,331 --> 00:00:09,675
there are other &lt; worlds than these.
14 changes: 14 additions & 0 deletions testdata/example-out-html-entities.vtt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
WEBVTT
1
00:00:00.331 --> 00:00:03.750
The man in black fled across the desert, &nbsp;
&amp; the gunslinger followed.

2
00:00:04.101 --> 00:00:05.430
Go,&nbsp;then,

3
00:00:06.331 --> 00:00:09.675
there are other &lt; worlds than these.
18 changes: 4 additions & 14 deletions webvtt.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ var (
bytesWebVTTTimeBoundariesSeparator = []byte(webvttTimeBoundariesSeparator)
webVTTRegexpInlineTimestamp = regexp.MustCompile(`<((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})>`)
webVTTRegexpTag = regexp.MustCompile(`(</*\s*([^\.\s]+)(\.[^\s/]*)*\s*([^/]*)\s*/*>)`)
webVTTEscaper = strings.NewReplacer("&", "&amp;", "<", "&lt;")
webVTTUnescaper = strings.NewReplacer("&amp;", "&", "&lt;", "<")
)

// parseDurationWebVTT parses a .vtt duration
Expand Down Expand Up @@ -331,14 +329,6 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) {
return
}

// escapeWebVTT returns i with every substring known to webVTTEscaper replaced
// by its escaped form.
func escapeWebVTT(i string) (o string) {
	o = webVTTEscaper.Replace(i)
	return
}

// unescapeWebVTT returns i with every escaped sequence known to
// webVTTUnescaper replaced by the character it stands for.
func unescapeWebVTT(i string) (o string) {
	o = webVTTUnescaper.Replace(i)
	return
}

// parseTextWebVTT parses the input line to fill the Line
func parseTextWebVTT(i string) (o Line) {
// Create tokenizer
Expand Down Expand Up @@ -421,7 +411,7 @@ func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem)
if s := strings.TrimSpace(line); s != "" {
return []LineItem{{
InlineStyle: sa,
Text: unescapeWebVTT(s),
Text: unescapeHTML(s),
}}
}
return
Expand All @@ -431,7 +421,7 @@ func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem)
if s := strings.TrimSpace(line[:indexes[0][0]]); s != "" {
ret = append(ret, LineItem{
InlineStyle: sa,
Text: unescapeWebVTT(s),
Text: unescapeHTML(s),
})
}

Expand All @@ -455,7 +445,7 @@ func parseTextWebVTTTextToken(sa *StyleAttributes, line string) (ret []LineItem)
ret = append(ret, LineItem{
InlineStyle: sa,
StartAt: t,
Text: unescapeWebVTT(s),
Text: unescapeHTML(s),
})
}

Expand Down Expand Up @@ -671,7 +661,7 @@ func (li LineItem) webVTTBytes() (c []byte) {
c = append(c, []byte(tag.startTag())...)
}
}
c = append(c, []byte(escapeWebVTT(li.Text))...)
c = append(c, []byte(escapeHTML(li.Text))...)
if li.InlineStyle != nil {
noTags := len(li.InlineStyle.WebVTTTags)
for i := noTags - 1; i >= 0; i-- {
Expand Down
31 changes: 0 additions & 31 deletions webvtt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,37 +145,6 @@ Evening.
`, b.String())
}

// TestWebVTTEscape checks that "&amp;" and "&lt;" in WebVTT cue text are read
// back as "&" and "<", and that writing the subtitles escapes them again.
func TestWebVTTEscape(t *testing.T) {
// Input cues carry the escaped forms
testData := `WEBVTT
00:01:00.000 --> 00:02:00.000
Sentence with an &amp; in the middle
00:02:00.000 --> 00:03:00.000
Sentence with an &lt; in the middle`

// Reading must yield the literal characters
s, err := astisub.ReadFromWebVTT(strings.NewReader(testData))
require.NoError(t, err)

require.Len(t, s.Items, 2)
require.Equal(t, "Sentence with an & in the middle", s.Items[0].String())
require.Equal(t, "Sentence with an < in the middle", s.Items[1].String())

// Writing must produce the escaped forms again
b := &bytes.Buffer{}
err = s.WriteToWebVTT(b)
require.NoError(t, err)
require.Equal(t, `WEBVTT
1
00:01:00.000 --> 00:02:00.000
Sentence with an &amp; in the middle
2
00:02:00.000 --> 00:03:00.000
Sentence with an &lt; in the middle
`, b.String())
}

func TestWebVTTTags(t *testing.T) {
testData := `WEBVTT
Expand Down

0 comments on commit f285923

Please sign in to comment.