Skip to content

Commit

Permalink
Fix Unicode handling (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
sindresorhus authored May 17, 2021
1 parent d2551de commit bbaeb15
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 9 deletions.
21 changes: 16 additions & 5 deletions Sources/Regex/Regex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ extension Regex {
public let range: Range<String.Index>

/// Creates a group from the `NSRange` of a capture inside `originalString`.
///
/// The range is converted with `range(fromNSRange:)`, and `value` is the text that the converted range covers in `originalString`.
fileprivate init(originalString: String, range: NSRange) {
    // `range` is assigned exactly once. The previous force-unwrapped `Range(range, in:)` conversion is gone.
    self.range = originalString.range(fromNSRange: range)
    self.value = String(originalString[self.range])
}
}
Expand Down Expand Up @@ -193,7 +193,10 @@ extension Regex {
public func group(named name: String) -> Group? {
let range = checkingResult.range(withName: name)

guard range.length > 0 else {
guard
range.location != NSNotFound,
range.length > 0
else {
return nil
}

Expand All @@ -203,12 +206,20 @@ extension Regex {
/// Creates a match from a checking result that was produced for `string`.
///
/// `range` and `value` describe the whole match; `groups` holds one `Group` per capture range that has a location and a non-zero length.
fileprivate init(checkingResult: NSTextCheckingResult, string: String) {
    self.checkingResult = checkingResult
    self.originalString = string
    self.range = string.range(fromNSRange: checkingResult.range)
    self.value = String(string[self.range])

    // The first range is the full range, so we ignore that.
    self.groups = (1..<checkingResult.numberOfRanges).compactMap {
        let range = checkingResult.range(at: $0)

        // Drop capture ranges that have no location (`NSNotFound`) or are empty,
        // instead of trying to convert them into string indices.
        guard
            range.location != NSNotFound,
            range.length > 0
        else {
            return nil
        }

        return Group(originalString: string, range: range)
    }
}
Expand Down
14 changes: 14 additions & 0 deletions Sources/Regex/Utilities.swift
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,20 @@ extension String {
}


extension String {
    /**
    Convert an `NSRange` into a range of string indices.

    Prefer this over the built-in `Range(nsRange, in: string)`, which doesn't correctly handle some Unicode compositions.
    */
    func range(fromNSRange nsRange: NSRange) -> Range<Index> {
        // `NSRange` counts UTF-16 code units, so the offsets are applied on the UTF-16 view.
        let codeUnits = utf16
        let lowerBound = codeUnits.index(codeUnits.startIndex, offsetBy: nsRange.location)
        let upperBound = codeUnits.index(lowerBound, offsetBy: nsRange.length)

        // Hand back the range of the composed character sequences that cover those bounds.
        return rangeOfComposedCharacterSequences(for: lowerBound..<upperBound)
    }
}


/// Convenience wrappers that make the `range` parameter optional and type-safe.
extension NSRegularExpression {
/// Returns an array containing all the matches of the regular expression in the string.
Expand Down
120 changes: 116 additions & 4 deletions Tests/RegexTests/RegexTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,47 @@ final class RegexTests: XCTestCase {
}

/// Checks that each capture group's `range` slices the expected text out of the input and agrees with the group's `value`.
func testMatchGroupRange() {
    let fixture = "foo-456"
    let groups = Regex(#"([a-z]+)-(\d+)"#).firstMatch(in: fixture)!.groups

    XCTAssertEqual(
        fixture[groups[0].range],
        "foo"
    )

    XCTAssertEqual(
        fixture[groups[1].range],
        "456"
    )

    // `range` and `value` must describe the same text.
    XCTAssertEqual(
        String(fixture[groups[0].range]),
        groups[0].value
    )

    XCTAssertEqual(
        String(fixture[groups[1].range]),
        groups[1].value
    )
}

/// Checks a capture group over Malayalam text that is followed by a `ZERO WIDTH NON-JOINER`.
func testMatchGroupUnicode() {
    // U+0D2B PHA, U+0D46 VOWEL SIGN E, U+0D2F YA, U+0D4D VIRAMA, then U+200C ZERO WIDTH NON-JOINER.
    // Written as escapes so the invisible code point survives editors and re-encoding.
    let fixture = "foo \u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}\u{200C} bar"

    // The Malayalam run of `fixture`, without the trailing `ZERO WIDTH NON-JOINER`.
    let expected = "\u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}"

    let groups = Regex(#"foo (\p{malayalam}+)"#).firstMatch(in: fixture)!.groups

    XCTAssertEqual(
        groups[0].value,
        expected
    )

    // `range` and `value` must describe the same text.
    XCTAssertEqual(
        String(fixture[groups[0].range]),
        groups[0].value
    )
}

func testPatternMatching() {
Expand All @@ -90,4 +119,87 @@ final class RegexTests: XCTestCase {

XCTAssertTrue(regex.isMatched(by: "foo123"))
}

/// Checks a whole-string Malayalam match when the input ends in a `ZERO WIDTH NON-JOINER`.
func testUnicode() {
    /*
    UTF16 representation:
    0d2b MALAYALAM LETTER PHA (U+0D2B)
    0d46 MALAYALAM VOWEL SIGN E (U+0D46)
    0d2f MALAYALAM LETTER YA (U+0D2F)
    0d4d MALAYALAM SIGN VIRAMA (U+0D4D)
    200c ZERO WIDTH NON-JOINER (U+200C)

    Written as escapes so the invisible code point survives editors and re-encoding.
    */
    let fixture = "\u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}\u{200C}"

    /*
    UTF16 representation:
    0d2b MALAYALAM LETTER PHA (U+0D2B)
    0d46 MALAYALAM VOWEL SIGN E (U+0D46)
    0d2f MALAYALAM LETTER YA (U+0D2F)
    0d4d MALAYALAM SIGN VIRAMA (U+0D4D)
    */
    let expected = "\u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}"

    let match = Regex(#"\p{malayalam}+"#).firstMatch(in: fixture)!

    XCTAssertEqual(
        match.value,
        expected
    )

    // `range` and `value` must describe the same text.
    XCTAssertEqual(
        String(fixture[match.range]),
        match.value
    )
}

/// Checks a Malayalam match that sits between ASCII words and is followed by a `ZERO WIDTH NON-JOINER`.
func testUnicode2() {
    // U+0D2B PHA, U+0D46 VOWEL SIGN E, U+0D2F YA, U+0D4D VIRAMA, then U+200C ZERO WIDTH NON-JOINER.
    // Written as escapes so the invisible code point survives editors and re-encoding.
    let fixture = "foo \u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}\u{200C} bar"

    // The Malayalam run of `fixture`, without the trailing `ZERO WIDTH NON-JOINER`.
    let expected = "\u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}"

    let match = Regex(#"\p{malayalam}+"#).firstMatch(in: fixture)!

    XCTAssertEqual(
        match.value,
        expected
    )

    // `range` and `value` must describe the same text.
    XCTAssertEqual(
        String(fixture[match.range]),
        match.value
    )
}

/// Checks that a single-code-point pattern yields the letter together with its vowel sign.
func testUnicode3() {
    // U+0D2B PHA, U+0D46 VOWEL SIGN E, U+0D2F YA, U+0D4D VIRAMA, then U+200C ZERO WIDTH NON-JOINER.
    // Written as escapes so the invisible code point survives editors and re-encoding.
    let fixture = "foo \u{0D2B}\u{0D46}\u{0D2F}\u{0D4D}\u{200C} bar"
    let match = Regex(#"\p{malayalam}"#).firstMatch(in: fixture)!

    // MALAYALAM LETTER PHA plus MALAYALAM VOWEL SIGN E.
    XCTAssertEqual(
        match.value,
        "\u{0D2B}\u{0D46}"
    )

    // `range` and `value` must describe the same text.
    XCTAssertEqual(
        String(fixture[match.range]),
        match.value
    )
}

/// Checks a match that spans a ZWJ emoji sequence, a zero-width space and a flag.
func testUnicode4() {
    // WOMAN, ZWJ, WOMAN, ZWJ, GIRL, ZWJ, BOY (family sequence), then ZERO WIDTH SPACE,
    // then REGIONAL INDICATOR N + O. Written as escapes so the invisible joiners survive re-encoding.
    let expected = "\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}\u{200B}\u{1F1F3}\u{1F1F4}"
    let fixture = "foo \(expected) bar"
    let match = Regex(#"[^foo ]+"#).firstMatch(in: fixture)!

    XCTAssertEqual(
        match.value,
        expected
    )

    // `range` and `value` must describe the same text.
    XCTAssertEqual(
        String(fixture[match.range]),
        match.value
    )
}
}

0 comments on commit bbaeb15

Please sign in to comment.