Skip to content

Commit

Permalink
add ocr filter that detects and fixes some common ocr errors, activat…
Browse files Browse the repository at this point in the history
…e using the 'ocr' filter, or the new 'all' filter
  • Loading branch information
martinlindhe committed Oct 14, 2017
1 parent b96fd96 commit 578c6a2
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cmd/subber/subber.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ var (
keepAds = kingpin.Flag("keep-ads", "Do not strip advertisement captions.").Bool()
skipBackups = kingpin.Flag("skip-backups", "Do not make backup (.srt.org) of original .srt").Bool()
language = kingpin.Flag("language", "Language.").Default("en").String()
filterName = kingpin.Flag("filter", "Filter (none, caps, html).").Default("none").String()
filterName = kingpin.Flag("filter", "Filter (none, caps, html, ocr, all).").Default("none").String()
sync = kingpin.Flag("sync", "Synchronize captions (milliseconds).").Int()
)

Expand Down
6 changes: 6 additions & 0 deletions filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,16 @@ import (
// FilterCaptions pass the captions through a filter function
func (subtitle *Subtitle) FilterCaptions(filter string) {
switch filter {
case "all":
subtitle.filterCapitalization()
subtitle.filterHTML()
subtitle.filterOCR()
case "caps":
subtitle.filterCapitalization()
case "html":
subtitle.filterHTML()
case "ocr":
subtitle.filterOCR()
case "none":
default:
fmt.Printf("Unrecognized filter name: %s\n", filter)
Expand Down
41 changes: 41 additions & 0 deletions filter_ocr.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package subtitles

import (
"strings"

log "github.com/Sirupsen/logrus"
)

// ocrErrors maps a character sequence commonly produced by faulty OCR to the
// text that should replace it. Keys and values are written in lower case.
// Several entries keep a trailing space so that only the end of a word is
// matched.
//
// NOTE(review): "l've" is corrected to lower-case "i've" although the English
// pronoun is normally capitalized; confirm whether "I've" was intended.
var ocrErrors = map[string]string{
	"0n ":       "on ",
	"c0uld":     "could",
	"g0 ":       "go ",
	"l've":      "i've",
	"s0 ":       "so ",
	"s0mething": "something",
}

// filterOCR corrects some common OCR mistakes in every caption line.
//
// Each entry of ocrErrors is tried in three spellings, in this order: as
// written (lower case), fully upper-cased, and with only its first letter
// upper-cased. Replacements are plain substring matches and are written back
// into the caption text in place. Every line that changed is logged together
// with its original content. The receiver is returned to allow call chaining.
//
// For patterns such as "s0 " the upper-case and first-letter spellings
// coincide ("S0 "); the upper-case pass runs first, so those are corrected to
// "SO ".
func (subtitle *Subtitle) filterOCR() *Subtitle {
	// Spell out every variant once up front instead of re-deriving it for
	// each caption line.
	type fix struct{ bad, good string }
	fixes := make([]fix, 0, 3*len(ocrErrors))
	for bad, good := range ocrErrors {
		fixes = append(fixes,
			fix{bad, good},
			fix{strings.ToUpper(bad), strings.ToUpper(good)},
			fix{ocrUcFirst(bad), ocrUcFirst(good)},
		)
	}

	for _, caption := range subtitle.Captions {
		for i, org := range caption.Text {
			fixed := org
			for _, f := range fixes {
				fixed = strings.Replace(fixed, f.bad, f.good, -1)
			}

			if fixed != org {
				caption.Text[i] = fixed
				log.Println("[ocr]", org, "->", fixed)
			}
		}
	}
	return subtitle
}

// ocrUcFirst returns s with only its first rune upper-cased. Unlike
// strings.Title it leaves the rest of the string untouched, so "l've" becomes
// "L've" rather than "L'Ve".
func ocrUcFirst(s string) string {
	for i := range s {
		if i > 0 {
			// i is the byte offset of the second rune.
			return strings.ToUpper(s[:i]) + s[i:]
		}
	}
	// Empty or single-rune string.
	return strings.ToUpper(s)
}
83 changes: 83 additions & 0 deletions filter_ocr_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package subtitles

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestFilterOCRLower checks that a lower-case OCR error ("s0mething") is
// corrected by filterOCR.
func TestFilterOCRLower(t *testing.T) {
	// build wraps a single line of text in a one-caption subtitle.
	build := func(line string) Subtitle {
		return Subtitle{[]Caption{{
			1,
			makeTime(0, 0, 4, 630),
			makeTime(0, 0, 6, 18),
			[]string{line},
		}}}
	}

	input := build("s0mething good")
	want := build("something good")

	assert.Equal(t, &want, input.filterOCR())
}

// TestFilterOCRUpper checks that an all-caps OCR error ("S0METHING") is
// corrected by filterOCR.
func TestFilterOCRUpper(t *testing.T) {
	// build wraps a single line of text in a one-caption subtitle.
	build := func(line string) Subtitle {
		return Subtitle{[]Caption{{
			1,
			makeTime(0, 0, 4, 630),
			makeTime(0, 0, 6, 18),
			[]string{line},
		}}}
	}

	input := build("S0METHING GOOD")
	want := build("SOMETHING GOOD")

	assert.Equal(t, &want, input.filterOCR())
}

// TestFilterOCRUcFirst checks that an OCR error whose first letter is
// capitalized ("S0mething") is corrected by filterOCR.
func TestFilterOCRUcFirst(t *testing.T) {
	// build wraps a single line of text in a one-caption subtitle.
	build := func(line string) Subtitle {
		return Subtitle{[]Caption{{
			1,
			makeTime(0, 0, 4, 630),
			makeTime(0, 0, 6, 18),
			[]string{line},
		}}}
	}

	input := build("S0mething good")
	want := build("Something good")

	assert.Equal(t, &want, input.filterOCR())
}

// TestFilterOCREnglish checks that the English-specific OCR error "l've" is
// rewritten to "i've" by filterOCR.
func TestFilterOCREnglish(t *testing.T) {
	// build wraps a single line of text in a one-caption subtitle.
	build := func(line string) Subtitle {
		return Subtitle{[]Caption{{
			1,
			makeTime(0, 0, 4, 630),
			makeTime(0, 0, 6, 18),
			[]string{line},
		}}}
	}

	input := build("l've got a feeling")
	want := build("i've got a feeling")

	assert.Equal(t, &want, input.filterOCR())
}

0 comments on commit 578c6a2

Please sign in to comment.