Skip to content

Commit

Permalink
ccdb: handle obscure text-based format
Browse files Browse the repository at this point in the history
  • Loading branch information
martinlindhe committed Apr 12, 2018
1 parent 497af97 commit 724b4b5
Show file tree
Hide file tree
Showing 14 changed files with 164 additions and 61 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
install:
go install -v ./...
65 changes: 65 additions & 0 deletions ccdb.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package subtitles

// An obscure (to me) text-based subtitle format; files can have the extension .cc or .txt.
// A sample was found in "The Way We Live Now (2001) BBC TV 1.txt".
// STATUS: incomplete detection and support.

import (
"log"
"strings"
"time"
)

// looksLikeCCDBCapture reports whether s contains the "[SUBTITLE]" marker,
// which is the only signal used here to recognize a ccdb capture.
func looksLikeCCDBCapture(s string) bool {
	const marker = "[SUBTITLE]"
	found := strings.Contains(s, marker)
	return found
}

// NewFromCCDBCapture parses a ccdb capture text into []Caption, assumes s is a clean utf8 string.
//
// Rows are separated by "\n". A row longer than one byte that starts with
// '[' (for example "[SUBTITLE]") is skipped. The remaining rows are read as
// records: one "start,end" timing row, followed by text rows up to the next
// blank row ("" or "\r"). All text rows of a record are collected into a
// single Caption, and records without any non-blank text are dropped.
// Captions are numbered from 1 in the order they are kept.
//
// Malformed timing rows are logged and parsing continues, so the returned
// error is currently always nil.
func NewFromCCDBCapture(s string) (res Subtitle, err error) {
	rows := strings.Split(s, "\n")
	seq := 1
	caption := Caption{Seq: seq}
	parseText := false

	// flush stores the pending caption if it collected any text and then
	// starts the next one. Emitting once per record (instead of once per
	// text row) keeps multi-line captions together with their timing.
	flush := func() {
		if len(caption.Text) == 0 {
			return
		}
		res.Captions = append(res.Captions, caption)
		seq++
		caption = Caption{Seq: seq}
	}

	for rowNum, row := range rows {
		if len(row) > 1 && row[0] == '[' {
			continue
		}

		if parseText {
			if row == "\r" || row == "" {
				// A blank row terminates the text of the current record.
				parseText = false
				flush()
				continue
			}
			if text := strings.TrimSpace(row); text != "" {
				caption.Text = append(caption.Text, text)
			}
			continue
		}

		if row == "" {
			if rowNum != len(rows)-1 {
				log.Println("NOTICE: ccdb seem to have reached end of valid stream at row", rowNum, "of", len(rows))
			}
			break
		}

		parts := strings.SplitN(row, ",", 2)
		if len(parts) == 2 {
			// Parsing stays best-effort, but failures are reported
			// instead of being silently discarded.
			var startErr, endErr error
			caption.Start, startErr = parseCCDBTime(parts[0])
			if startErr != nil {
				log.Println("TIME seq", seq, ", input row", (rowNum + 1), "start error:", startErr)
			}
			caption.End, endErr = parseCCDBTime(parts[1])
			if endErr != nil {
				log.Println("TIME seq", seq, ", input row", (rowNum + 1), "end error:", endErr)
			}
		} else {
			log.Println("TIME seq", seq, ", input row", (rowNum + 1), "error:", row)
		}
		parseText = true
	}

	// Keep a final record that is not terminated by a blank row.
	flush()
	return res, nil
}

// parseCCDBTime converts one ccdb timestamp (such as "00:00:16.24") into a
// time.Time by delegating to the package's parseTime helper.
func parseCCDBTime(s string) (time.Time, error) {
	parsed, err := parseTime(s)
	return parsed, err
}
51 changes: 51 additions & 0 deletions ccdb_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package subtitles

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestNewFromCCDBCapture parses a small CRLF-terminated capture and checks
// that the whitespace-only record is dropped and the rest are numbered 1..3.
func TestNewFromCCDBCapture(t *testing.T) {
	input := "[SUBTITLE]\r\n" +
		"[COLF]&HFFFFFF,[STYLE]no,[SIZE]10,[FONT]Arial\r\n" +
		"00:00:16.24,00:01:25.82\r\n" +
		"Whoa. \r\n" +
		"\r\n" +
		"00:01:31.45,00:01:33.62\r\n" +
		"Go on. Get out. \r\n" +
		"\r\n" +
		"00:01:33.62,00:01:33.65\r\n" +
		" \r\n" + // should disappear in the parsed captions
		"\r\n" +
		"00:01:33.65,00:01:34.81\r\n" +
		"Out! \r\n" +
		"\r\n"

	want := Subtitle{Captions: []Caption{
		{
			Seq:   1,
			Start: makeTime(0, 0, 16, 24),
			End:   makeTime(0, 1, 25, 82),
			Text:  []string{"Whoa."},
		},
		{
			Seq:   2,
			Start: makeTime(0, 1, 31, 45),
			End:   makeTime(0, 1, 33, 62),
			Text:  []string{"Go on. Get out."},
		},
		{
			Seq:   3,
			Start: makeTime(0, 1, 33, 65),
			End:   makeTime(0, 1, 34, 81),
			Text:  []string{"Out!"},
		},
	}}

	got, err := NewFromCCDBCapture(input)
	assert.Equal(t, nil, err)
	assert.Equal(t, want, got)
}

func TestParseCCDBTime(t *testing.T) {
t1, _ := parseCCDBTime("00:00:16.24")
assert.Equal(t, makeTime(0, 0, 16, 24), t1)
}
2 changes: 0 additions & 2 deletions cleaner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
)

func TestRemoveAds(t *testing.T) {

in := Subtitle{[]Caption{{
1,
makeTime(0, 0, 4, 630),
Expand Down Expand Up @@ -36,6 +35,5 @@ func TestRemoveAds(t *testing.T) {
makeTime(0, 1, 11, 005),
[]string{"No ninja!"},
}}}

assert.Equal(t, &expected, in.RemoveAds())
}
3 changes: 1 addition & 2 deletions cmd/subber/subber.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func cleanupSub(data []byte, filterName string, keepAds bool, sync int) (string,
func action(inFileName string) error {

ext := path.Ext(inFileName)
if ext == ".srt" {
if subtitles.LooksLikeTextSubtitle(inFileName) {
if !*dontTouch {
parseAndWriteSubFile(inFileName, *filterName, *keepAds, *sync)
}
Expand All @@ -103,7 +103,6 @@ func action(inFileName string) error {
subFileName := inFileName[:len(inFileName)-len(ext)] + ".srt"

if fileExists(subFileName) {

verboseMessage("Subs found locally in", subFileName, ", skipping download")

if !*dontTouch {
Expand Down
2 changes: 0 additions & 2 deletions encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,11 @@ import (
)

// TestLooksLikeLatin1 checks that a raw 0xE5 byte is detected as latin1
// while the same word written in the source file's encoding is not.
func TestLooksLikeLatin1(t *testing.T) {
	latin1Bytes := []byte("hall\xe5")
	utf8Bytes := []byte("hallå")

	assert.Equal(t, true, looksLikeLatin1(latin1Bytes))
	assert.Equal(t, false, looksLikeLatin1(utf8Bytes))
}

func TestReadFileAsUTF8(t *testing.T) {

f, err := os.Open("README.md")
assert.Equal(t, nil, err)

Expand Down
3 changes: 0 additions & 3 deletions filter_caps_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,17 @@ import (
)

// TestFilterCapitalization checks that all-caps caption lines come back
// from filterCapitalization with only their first letter capitalized.
func TestFilterCapitalization(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"GO NINJA!", "NINJA GO!"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"Go ninja!", "Ninja go!"},
	}}}

	got := input.filterCapitalization()
	assert.Equal(t, &want, got)
}
3 changes: 0 additions & 3 deletions filter_html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,17 @@ import (
)

// TestFilterHTML checks that filterHTML removes the <b> tags and turns the
// &nbsp; entity into a plain space.
func TestFilterHTML(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"<b>GO NINJA!</b>", "NINJA&nbsp;GO!"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"GO NINJA!", "NINJA GO!"},
	}}}

	got := input.filterHTML()
	assert.Equal(t, &want, got)
}
15 changes: 0 additions & 15 deletions filter_ocr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,96 +7,81 @@ import (
)

// TestFilterOCRLower checks that a zero inside a lowercase word is turned
// into the letter "o" by filterOCR.
func TestFilterOCRLower(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"s0mething good"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"something good"},
	}}}

	got := input.filterOCR()
	assert.Equal(t, &want, got)
}

// TestFilterOCRUpper checks that a zero inside an uppercase word is turned
// into the letter "O" by filterOCR.
func TestFilterOCRUpper(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"S0METHING GOOD"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"SOMETHING GOOD"},
	}}}

	got := input.filterOCR()
	assert.Equal(t, &want, got)
}

// TestFilterOCRUcFirst checks that a zero following a capitalized first
// letter is turned into a lowercase "o" by filterOCR.
func TestFilterOCRUcFirst(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"S0mething good"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"Something good"},
	}}}

	got := input.filterOCR()
	assert.Equal(t, &want, got)
}

// TestFilterOCREnglish checks the English-specific correction where a
// leading lowercase "l" in "l've" is rewritten by filterOCR to "i've".
func TestFilterOCREnglish(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"l've got a feeling"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"i've got a feeling"},
	}}}

	got := input.filterOCR()
	assert.Equal(t, &want, got)
}

// TestFilterOCRCapitalization checks that a stray lowercase letter inside
// an otherwise uppercase word is uppercased by filterOCR.
func TestFilterOCRCapitalization(t *testing.T) {
	input := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"GAsPs slowly"},
	}}}
	want := Subtitle{Captions: []Caption{{
		Seq:   1,
		Start: makeTime(0, 0, 4, 630),
		End:   makeTime(0, 0, 6, 18),
		Text:  []string{"GASPS slowly"},
	}}}

	got := input.filterOCR()
	assert.Equal(t, &want, got)
}
2 changes: 0 additions & 2 deletions finder_thesubdb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
)

func TestDownloadFromTheSubDb(t *testing.T) {

fileName := createZeroedTempFile(1024 * 1024 * 4)
defer os.Remove(fileName)

Expand Down Expand Up @@ -44,7 +43,6 @@ func subDbConformTest(t *testing.T, fileName string, expectedHash string) {
}

func TestSubDbHashFromFile(t *testing.T) {

// NOTE for this to work, run "./hash-conformance-deps" to fetch needed files

// http://thesubdb.com/api/samples/dexter.mp4
Expand Down
20 changes: 18 additions & 2 deletions parser.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
package subtitles

import "fmt"
import (
"fmt"
"io/ioutil"
"log"
)

// Parse tries to parse a subtitle
func Parse(b []byte) (Subtitle, error) {
s := ConvertToUTF8(b)
if looksLikeSSA(s) {
if looksLikeCCDBCapture(s) {
return NewFromCCDBCapture(s)
} else if looksLikeSSA(s) {
return NewFromSSA(s)
} else if looksLikeDCSub(s) {
return NewFromDCSub(s)
Expand All @@ -14,3 +20,13 @@ func Parse(b []byte) (Subtitle, error) {
}
return Subtitle{}, fmt.Errorf("parse: unrecognized subtitle type")
}

// LooksLikeTextSubtitle returns true if the content of the file named
// filename seems to be of a recognized text subtitle format (ccdb capture,
// SSA, DCSub or SRT).
//
// A file that cannot be read is logged and reported as not being a
// subtitle, so a read error never terminates the calling program.
func LooksLikeTextSubtitle(filename string) bool {
	// NOTE(review): the whole file is read into memory; callers that pass
	// large non-subtitle files pay for that — confirm this is acceptable.
	data, err := ioutil.ReadFile(filename)
	if err != nil {
		// Library code must not exit the process (log.Fatal calls
		// os.Exit); an unreadable file is simply not a subtitle.
		log.Println("LooksLikeTextSubtitle:", err)
		return false
	}
	s := ConvertToUTF8(data)
	return looksLikeCCDBCapture(s) || looksLikeSSA(s) || looksLikeDCSub(s) || looksLikeSRT(s)
}
Loading

0 comments on commit 724b4b5

Please sign in to comment.