Skip to content

Commit

Permalink
fix: abbreviations and apostrophes
Browse files Browse the repository at this point in the history
  • Loading branch information
tympanix committed Mar 16, 2019
1 parent 1b81dcb commit f54c0ac
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
8 changes: 6 additions & 2 deletions parse/capitalize.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ func isUpper(str string) bool {
return upper > 0
}

// omissionRegex matches an apostrophe followed by one or more ASCII letters,
// e.g. the "'t" in "Don't" or the "'ve" in "I've". Capitalize lower-cases
// every match.
var omissionRegex = regexp.MustCompile(`'[a-zA-Z]+`)

// Capitalize returns the string with proper English capitalization.
func Capitalize(str string) string {
if isUpper(str) && len(str) > 3 {
Expand All @@ -109,9 +111,11 @@ func Capitalize(str string) string {
str = breakRegex.ReplaceAllStringFunc(str, func(word string) string {
return strings.Title(word)
})
str = abbreviationRegexp.ReplaceAllStringFunc(str, func(abbr string) string {
str = abbrevRegex.ReplaceAllStringFunc(str, func(abbr string) string {
return strings.ToUpper(abbr)
})
str = strings.Replace(str, "'S", "'s", -1)
str = omissionRegex.ReplaceAllStringFunc(str, func(omm string) string {
return strings.ToLower(omm)
})
return str
}
27 changes: 21 additions & 6 deletions parse/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ func isAbbreviation(str string) bool {
return false
}

var abbreviationRegexp = regexp.MustCompile(`\b[A-Za-z]([\s\.][A-Za-z])+\b`)
// illegalcharsRegexp matches any single character that is not a Unicode
// letter, an ASCII digit, whitespace, or one of & ' _ ( ) - , :
// CleanName deletes every match.
var illegalcharsRegexp = regexp.MustCompile(`[^\p{L}0-9\s&'_\(\)\-,:]`)

// spaceReplaceRegexp matches a run of one or more dots, whitespace characters
// or underscores. CleanName collapses each run into a single space.
var spaceReplaceRegexp = regexp.MustCompile(`[\.\s_]+`)
// websiteRegexp matches a website address that starts with "www.", with an
// optional http/https/ftp/smtp scheme in front and optional path segments
// after the domain. The dot after "www" is escaped so that it only matches a
// literal '.', not an arbitrary character (e.g. "wwwxexample.com").
var websiteRegexp = regexp.MustCompile(`((https?|ftp|smtp):\/\/)?(www\.)[a-z0-9]+\.[a-z]+(\/[a-zA-Z0-9#]+\/?)*`)
Expand All @@ -93,9 +92,7 @@ func CleanName(name string) string {
name = spaceReplaceRegexp.ReplaceAllString(name, " ")
name = illegalcharsRegexp.ReplaceAllString(name, "")

name = abbreviationRegexp.ReplaceAllStringFunc(name, func(match string) string {
return strings.Replace(match, " ", ".", -1) + "."
})
name = cleanAbbreviations(name)

name = wordRegex.ReplaceAllStringFunc(name, func(match string) string {
if isAbbreviation(match) {
Expand All @@ -110,9 +107,27 @@ func CleanName(name string) string {

if len(match) > 1 {
return match[1]
} else {
return ""
}
return ""
}

// abbrevRegex matches an abbreviation written as single letters separated by
// single dots or whitespace characters (e.g. "s h i e l d" or "u.s.a"). The
// abbreviation must be delimited by the start/end of the string or by a dot or
// whitespace character on each side. Only the letters and their inner
// separators are captured (submatch 1); the delimiters are not.
var abbrevRegex = regexp.MustCompile(`(?:^|[\.\s])((?:\p{L})(?:[\.\s]\p{L})+)(?:[\.\s]|$)`)

// cleanAbbreviations rewrites every abbreviation found by abbrevRegex into
// dotted form with a trailing dot, e.g. "agents of s h i e l d" becomes
// "agents of s.h.i.e.l.d.". Text outside the matches is copied unchanged and
// a string without abbreviations is returned as is.
//
// The delimiter that follows a match is not part of the rewritten span, so it
// is kept in the output after the added trailing dot.
func cleanAbbreviations(s string) string {
	matches := abbrevRegex.FindAllStringSubmatchIndex(s, -1)
	if matches == nil {
		return s
	}

	// The capture group only ever contains letters separated by a single '.'
	// or a single character of the regexp class \s, so splitting on exactly
	// those characters yields the individual letters.
	isSeparator := func(r rune) bool {
		switch r {
		case '.', ' ', '\t', '\n', '\f', '\r':
			return true
		}
		return false
	}

	var out strings.Builder
	last := 0
	for _, m := range matches {
		start, end := m[2], m[3]
		// Append (rather than overwrite) so that the text and abbreviations
		// before the current match are preserved when there are several.
		out.WriteString(s[last:start])
		out.WriteString(strings.Join(strings.FieldsFunc(s[start:end], isSeparator), "."))
		out.WriteByte('.')
		last = end
	}
	out.WriteString(s[last:])
	return out.String()
}

var illegalIdentity = regexp.MustCompile(`[^\p{L}0-9]`)
Expand Down
4 changes: 4 additions & 0 deletions parse/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ var testMovieTitles = []string{
"Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb",
"X-Men Origins: Wolverine",
"Mr. & Mrs. Smith",
"Don't Think Twice",
"10 Things I Hate About You",
"Berlin, I Love You",
"To All the Boys I've Loved Before",
}

func TestCleanNameMovieTitles(t *testing.T) {
Expand Down

0 comments on commit f54c0ac

Please sign in to comment.