diff --git a/stdlib/public/core/StringIndexValidation.swift b/stdlib/public/core/StringIndexValidation.swift index 93e363ff0589c..f1f84ee50f955 100644 --- a/stdlib/public/core/StringIndexValidation.swift +++ b/stdlib/public/core/StringIndexValidation.swift @@ -400,20 +400,3 @@ extension _StringGuts { scalarAlign(validateInclusiveSubscalarIndex_5_7(i))) } } - -// Word index validation (String) -extension _StringGuts { - internal func validateWordIndex( - _ i: String.Index - ) -> String.Index { - return roundDownToNearestWord(scalarAlign(validateSubscalarIndex(i))) - } - - internal func validateInclusiveWordIndex( - _ i: String.Index - ) -> String.Index { - return roundDownToNearestWord( - scalarAlign(validateInclusiveSubscalarIndex(i)) - ) - } -} diff --git a/stdlib/public/core/StringWordBreaking.swift b/stdlib/public/core/StringWordBreaking.swift index bc98e590e5d76..75564907315d2 100644 --- a/stdlib/public/core/StringWordBreaking.swift +++ b/stdlib/public/core/StringWordBreaking.swift @@ -1,683 +1,773 @@ //===----------------------------------------------------------------------===// // +// // This source file is part of the Swift.org open source project // -// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Copyright (c) 2022 - 2025 Apple Inc. 
and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// -extension _StringGuts { - internal func roundDownToNearestWord( - _ i: String.Index - ) -> String.Index { - _internalInvariant(i._encodedOffset <= count) - - let offset = i._encodedOffset - - if offset == 0 || offset == count { - return i +extension Unicode { + /// A state machine for recognizing word boundaries in an arbitrary series of + /// Unicode scalars, based on the specification in [Unicode Annex + /// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules). + /// + /// The text segmentation algorithm is not stable, and it allows implementers + /// to tailor it to their needs. Accordingly, reported word boundaries may + /// vary in arbitrary ways between Unicode implementations and system + /// configurations, including between versions of the Swift Standard Library. + /// + /// To implement the rules as specified, this low-level construct has built-in + /// support to defer making a decision on whether there is a word boundary + /// between two Unicode scalars until more scalars are fed to the state + /// machine; that is to say, it implements limited lookahead. The API surface + /// only allows one such candidate position to exist at any given time -- this + /// corresponds to allowing looking ahead up to the next word boundary. (In + /// the unlikely case the rules evolve to require looking ahead even further, + /// then this interface will need to be modified or replaced accordingly.) + /// + /// To detect word breaks in a sequence of Unicode scalars, feed each of them + /// to the recognizer by calling its `hasBreak(before:)` method. 
The method + /// indicates if there is a word break preceding the given scalar, or at a + /// previously reported candidate position. When every scalar in the text has + /// been fed to the recognizer, the `hasCandidateBreakAtEnd()` method should + /// be called to determine if there is a break at the last reported candidate + /// position. There is also an (implicit) word break at the end of text + /// position. + /// + /// Note that `_WordRecognizer` does not take or return actual text positions + /// (such as a string index); it is entirely independent of the underlying + /// text representation, and it is able to work with any container model. (For + /// example, it can be used to incrementally recognize word breaks in UTF-16 + /// data streamed from a network connection, or iterate over word boundaries + /// in piecewise contiguous UTF-8 buffers stored in a rope data structure.) Of + /// course, it is also possible to use it to detect word breaks in a standard + /// `String` value, such as done by this example function: + /// + /// func collectWordBreaks(in string: String) -> [String.Index] { + /// var result: [String.Index] = [] + /// var recognizer = Unicode._WordRecognizer() + /// var candidate = string.startIndex + /// for i in string.unicodeScalars.indices { + /// let r = recognizer.hasBreak(before: string.unicodeScalars[i]) + /// if r.setCandidate { candidate = i } + /// if r.breakAtCandidate { result.append(candidate) } + /// if r.breakHere { result.append(i) } + /// } + /// if recognizer.hasCandidateBreakAtEnd() { + /// result.append(candidate) + /// } + /// result.append(string.endIndex) + /// return result + /// } + /// + /// When used this way, the state machine is able to efficiently iterate over + /// all breaks within the string by visiting each scalar exactly once, without + /// any backtracking. 
+ /// + /// It is also possible to discard the recognizer after each detected + /// boundary, reinitializing it from scratch for each iteration step: + /// + /// func wordBreak( + /// after knownBreak: String.Index, in string: String + /// ) -> String.Index { + /// var recognizer = Unicode._WordRecognizer(after: string.unicodeScalars[knownBreak]) + /// var i = string.unicodeScalars.index(after: knownBreak) + /// var candidate = i + /// while i < string.endIndex { + /// let r = recognizer.hasBreak(before: string.unicodeScalars[i]) + /// if r.setCandidate { candidate = i } + /// if r.breakAtCandidate { return candidate } + /// if r.breakHere { return i } + /// string.unicodeScalars.formIndex(after: &i) + /// } + /// if recognizer.hasCandidateBreakAtEnd() { + /// return candidate + /// } + /// return i + /// } + /// + /// However, note that iterating this way is less efficient, because it + /// discards lookahead information -- some scalars will be processed multiple + /// times. The rules are carefully constructed so that the algorithm reports + /// the same word boundaries whether or not recognizer state is preserved. + @available(StdlibDeploymentTarget 6.3, *) + public // Core primitive + struct _WordRecognizer: Sendable { + // FIXME: We also need proper public API for this + + /// The last scalar that was fed to `hasBreak(before:)`. + var _prevScalar: Unicode.Scalar + /// The cached word break property of `_prevScalar`. + var _prevCategory: _WordBreakProperty + /// The word break property of the last preceding scalar that wasn't ignored by rule WB4. + var _baseCategory: _WordBreakProperty + /// The current state of the recognizer. + var _state: _State + + /// Initialize a new word recognizer at the _start of text_ (sot) + /// position. + /// + /// The resulting state machine will report a word break before the first + /// scalar that is fed to it. 
+ public init() { + // To avoid having to handle the empty case specially, we use LF as the + // placeholder before the first scalar. Per WB3a, we always produce a break + // following a line feed. + _baseCategory = .newlineCRLF + _prevScalar = Unicode.Scalar(0x0A as UInt8) + _prevCategory = .newlineCRLF + _state = .ordinary } - let start = previousWordIndex(endingAt: offset) - let end = nextWordIndex(startingAt: start) - _internalInvariant(offset <= end, "Word breaking inconsistency") - - if offset == end { - return i + /// Initialize a new word recognizer with a state after a previously + /// recognized word boundary. + /// + /// This enables clients to iterate over word boundaries without maintaining + /// a persistent recognizer state. However, iterating this way may result in + /// an arbitrarily large amount of duplicate work. (This is because the word + /// segmentation algorithm requires looking ahead by as much as a full + /// word's worth of Unicode scalars, and when the old recognizer is + /// discarded, the information thus collected has to be recreated from + /// scratch.) Whenever possible, it is therefore preferable to iterate over + /// multiple word boundaries using a single recognizer instance. + /// + /// - Parameter scalar: The Unicode scalar immediately following a known + /// word boundary position. + public init(after scalar: Unicode.Scalar) { + // We assume that the state machine provides stable results even if the + // start position was a retroactive candidate. 
+ _prevScalar = scalar + _prevCategory = Unicode._WordBreakProperty(from: scalar) + _baseCategory = _prevCategory + _state = .ordinary } - - return String.Index(_encodedOffset: start) } +} - @inline(never) - @_effects(releasenone) - internal func nextWordIndex(startingAt i: Int) -> Int { - if _slowPath(isForeign) { - return _foreignNextWordIndex(startingAt: i) - } - - return unsafe withFastUTF8 { utf8 in - nextWordBoundary(startingAt: i) { - _internalInvariant($0 >= 0) - - guard $0 < utf8.count else { - return nil - } - - let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: $0) - return (scalar, $0 &+ len) +@available(StdlibDeploymentTarget 6.3, *) +extension Unicode._WordRecognizer { + /// The parts of the word recognizer state that are in addition to saved + /// information about previous scalars. + /// + /// This is used to implement stateful lookahead when a break at a particular + /// candidate position may need to be suppressed or activated based on a + /// subsequent scalar value. It is also used to keep track of the number of + /// contiguous regional indicator scalars we have seen so far. 
+ internal enum _State: Int, Sendable { + case ordinary + case afterWB6 // AHLetter × (MidLetter | MidNumLetQ) AHLetter + case afterWB7b // Hebrew_Letter × Double_Quote Hebrew_Letter + case afterWB12 // Numeric × (MidNum | MidNumLetQ) Numeric + case afterMidFlag // [^RI] (RI RI)* RI × RI + + var hasPendingCandidate: Bool { + switch self { + case .afterWB6, .afterWB7b, .afterWB12: return true + default: return false } } } - internal func _foreignNextWordIndex(startingAt i: Int) -> Int { -#if _runtime(_ObjC) - return nextWordBoundary(startingAt: i) { - _internalInvariant($0 >= 0) - - guard $0 < count else { - return nil - } - - let scalars = String.UnicodeScalarView(self) - let idx = String.Index(_encodedOffset: $0) - - let scalar = scalars[idx] - let nextIndex = scalars.index(after: idx) - - return (scalar, nextIndex._encodedOffset) - } -#else - fatalError("No foreign strings on this platform in this version of Swift.") -#endif + /// Reject a break at the current position, triggering a break at the current + /// candidate (if any). + internal mutating func _reject( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + let breakAtCandidate = _state.hasPendingCandidate + _state = .ordinary + return ( + setCandidate: false, + breakAtCandidate: breakAtCandidate, + breakHere: false) } - internal func previousWordIndex(endingAt i: Int) -> Int { - if _slowPath(isForeign) { - return _foreignPreviousWordIndex(endingAt: i) - } - - return unsafe withFastUTF8 { utf8 in - previousWordBoundary(endingAt: i) { - _internalInvariant($0 <= count) - - guard $0 > 0 else { - return nil - } - - let (scalar, len) = unsafe _decodeScalar(utf8, endingAt: $0) - return (scalar, $0 &- len) - } - } + /// Skip this position, ignoring the current Unicode scalar. 
+ internal mutating func _ignore( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + // Note: not updating _baseCategory + return (setCandidate: false, breakAtCandidate: false, breakHere: false) } - @inline(never) - internal func _foreignPreviousWordIndex(endingAt i: Int) -> Int { -#if _runtime(_ObjC) - return previousWordBoundary(endingAt: i) { - _internalInvariant($0 <= count) - - guard $0 > 0 else { - return nil - } - - let scalars = String.UnicodeScalarView(self) - let idx = String.Index(_encodedOffset: $0) - - let previousIndex = scalars.index(before: idx) - let scalar = scalars[previousIndex] - - return (scalar, previousIndex._encodedOffset) - } -#else - fatalError("No foreign strings on this platform in this version of Swift.") -#endif + /// Signal a break at the current position, also triggering a break at the + /// current candidate (if any). + internal mutating func _accept( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + // If we have a pending candidate, put a break at it + let breakAtCandidate = _state.hasPendingCandidate + _state = .ordinary + return ( + setCandidate: false, + breakAtCandidate: breakAtCandidate, + breakHere: true) } -} - -internal enum _WordQuestion { - case checkingRegionalIndicator(count: Int, previousRIIndex: Int) - case requireAHLetter - case requireNumeric - case requireHebrewLetter -} - -extension _WordQuestion: Equatable {} - -internal struct _WordBreakingState { - var constraint: (question: _WordQuestion, index: Int)? = nil - - var index: Int - - var previousIndex: Int? = nil - var previousProperty: Unicode._WordBreakProperty? = nil - - // When walking forward in a string, we need to not break on emoji flag - // sequences. Emoji flag sequences are composed of 2 regional indicators, so - // when we see our first (.regionalIndicator, .regionalIndicator) decision, - // we need to know to return false in this case. 
However, if the next scalar - // is another regional indicator, we reach the same decision rule, but in this - // case we actually need to break there's a boundary between emoji flag - // sequences. - var shouldBreakRI = false -} - -extension _StringGuts { - // Returns the stride of the next word at the previous boundary offset. - internal func nextWordBoundary( - startingAt index: Int, - nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)? - ) -> Int { - _precondition(index < endIndex._encodedOffset) - - var (scalar, index) = nextScalar(index)! - var state = _WordBreakingState(index: index) - - while let (scalar2, nextIndex) = nextScalar(state.index) { - if shouldBreak(between: scalar, and: scalar2, with: &state) { - break - } - - scalar = scalar2 - state.index = nextIndex - } - - // If we have a leftover constraint, return the index - if let constraint = state.constraint { - return constraint.index - } - return state.index + /// Set the current position as the active break candidate, and transition into the + /// specified state. + internal mutating func _transition( + into state: _State + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + _internalInvariant(_state == .ordinary) + _state = state + return (setCandidate: true, breakAtCandidate: false, breakHere: false) } - // Returns the stride of the previous word at the current boundary offset. - internal func previousWordBoundary( - endingAt index: Int, - previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? - ) -> Int { - var (scalar2, index) = previousScalar(index)! 
- var state = _WordBreakingState(index: index) - - while let (scalar, previousIndex) = previousScalar(state.index) { - if shouldBreakBackward(between: scalar, and: scalar2, with: &state) { - break - } - - scalar2 = scalar - state.index = previousIndex - } - - if let previousIndex = state.previousIndex { - return previousIndex - } - - if let constraint = state.constraint { - if let riIndex = handleRIConstraint(constraint, with: state) { - return riIndex - } - - return constraint.index - } - - return state.index + /// If the current state matches the given expectation, suppress a break at + /// this position and discard the active candidate; otherwise, report a break + /// at both positions. + internal mutating func _expect( + _ expectedState: _State + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + let breakHere = (_state != expectedState) + let breakAtCandidate = breakHere && _state.hasPendingCandidate + _state = .ordinary + return ( + setCandidate: false, + breakAtCandidate: breakAtCandidate, + breakHere: breakHere) } -} -extension _StringGuts { - // The "algorithm" that determines whether or not we should break between - // certain word break properties. - // - // This is based off of the Unicode Annex #29 for [Word Boundary - // Rules](https://unicode.org/reports/tr29/#Word_Boundary_Rules). - internal func shouldBreak( - between scalar1: Unicode.Scalar, - and scalar2: Unicode.Scalar, - with state: inout _WordBreakingState - ) -> Bool { - // WB3 - if scalar1.value == 0xD, scalar2.value == 0xA { - return false + /// Feeds the next scalar to the state machine, reporting if there is a word + /// boundary at the current position or a previously reported candidate. + /// + /// To decide whether there is a word break at the current position, the + /// segmentation algorithm sometimes needs to look ahead by visiting + /// additional scalars following the break, up to the next word boundary. 
To + /// allow this, the state machine can report that the current position is a + /// provisional break "candidate". Clients are expected to remember the + /// position of the last reported candidate, so that it can be retroactively + /// promoted to a full break as needed. + /// + /// - Parameter nextScalar: The scalar at the current position in the text. + /// - Returns: A triple of Boolean values `setCandidate`, `breakAtCandidate`, + /// `breakHere`. If `setCandidate` is true, then the caller is expected to + /// save the current text position as a potential word boundary. If + /// `breakAtCandidate` is true, then there is a word boundary at the last + /// candidate position. If `breakHere` is true, then there is a word + /// boundary at the current position. The caller is expected to process + /// these three components in this specific order. + public mutating func hasBreak( + before nextScalar: Unicode.Scalar + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + let nextCategory = Unicode._WordBreakProperty(from: nextScalar) + var nextBase = nextCategory + + // FIXME: Implement a proper state machine here, dispatching on + // (state, nextProperty), ideally through a static look-up table + + defer { + _prevScalar = nextScalar + _prevCategory = nextCategory + _baseCategory = nextBase } - let x = Unicode._WordBreakProperty(from: scalar1) - - // WB3a, handled here since we don't need to look up `y` for this - if x == .newlineCRLF { - return true - } - - let y = Unicode._WordBreakProperty(from: scalar2) - - switch (x, y) { - - // Fast path: If we know our scalars have no properties the decision is - // trivial and we don't need to crawl to the default statement. 
- case (.any, .any): - return true - - // WB3b - case (_, .newlineCRLF): - return true - - // WB3c - case (.zwj, .extendedPictographic): - return false + switch (_prevCategory, nextCategory) { + case (.any, .any): // WB999 + // Fast path: If we know our scalars have no properties then the decision + // is trivial and we don't need to crawl to the default statement. + return _accept() + + case (.newlineCRLF, _), // WB3a + (_, .newlineCRLF): // WB3b + if _prevScalar.value == 0xD, nextScalar.value == 0xA { // WB3 + _internalInvariant(_prevCategory == .newlineCRLF) + return _reject() + } + return _accept() - // WB3d - case (.wSegSpace, .wSegSpace): - return false + case (.zwj, .extendedPictographic), // WB3c + (.wSegSpace, .wSegSpace): // WB3d + return _reject() - // WB4 - case (_, .format), + case (_, .format), // WB4 (_, .extend), (_, .zwj): - if x != .format && x != .extend && x != .zwj { - state.previousProperty = x - } - - return false + nextBase = _baseCategory // Cancel _baseCategory update + return _ignore() default: - let newX = state.previousProperty ?? 
x - - return decidePostFormat(between: newX, and: y, with: &state) + break } - } - - internal func decidePostFormat( - between x: Unicode._WordBreakProperty, - and y: Unicode._WordBreakProperty, - with state: inout _WordBreakingState - ) -> Bool { - state.previousProperty = nil - switch (x, y) { - // WB5 - case (.aLetter, .aLetter), + switch (_baseCategory, nextCategory) { + case (.aLetter, .aLetter), // WB5 (.aLetter, .hebrewLetter), (.hebrewLetter, .aLetter), (.hebrewLetter, .hebrewLetter): - return false + return _reject() - // WB6 - case (.aLetter, .midLetter), + case (.aLetter, .midLetter), // WB6 (.hebrewLetter, .midLetter), (.aLetter, .midNumLet), (.hebrewLetter, .midNumLet), (.aLetter, .singleQuote): - state.constraint = (question: .requireAHLetter, index: state.index) + return _transition(into: .afterWB6) - return false - - // WB7 - case (.midLetter, .aLetter), + case (.midLetter, .aLetter), // WB7 (.midLetter, .hebrewLetter), (.midNumLet, .aLetter), (.midNumLet, .hebrewLetter), (.singleQuote, .aLetter), (.singleQuote, .hebrewLetter): - if let constraint = state.constraint { - if constraint.question == .requireAHLetter { - state.constraint = nil - return false - } - - state.index = constraint.index - return true - } + return _expect(.afterWB6) - return true + case (.hebrewLetter, .singleQuote): // WB7a + return _reject() - // WB7a - case (.hebrewLetter, .singleQuote): - return false + case (.hebrewLetter, .doubleQuote): // WB7b + return _transition(into: .afterWB7b) - // WB7b - case (.hebrewLetter, .doubleQuote): - state.constraint = (question: .requireHebrewLetter, index: state.index) + case (.doubleQuote, .hebrewLetter): // WB7c + return _expect(.afterWB7b) - return false + case (.numeric, .numeric), // WB8 + (.aLetter, .numeric), // WB9 + (.hebrewLetter, .numeric), // WB9 + (.numeric, .aLetter), // WB10 + (.numeric, .hebrewLetter): // WB10 + return _reject() - // WB7c - case (.doubleQuote, .hebrewLetter): - if let constraint = state.constraint { - if 
constraint.question == .requireHebrewLetter { - state.constraint = nil - return false - } - - state.index = constraint.index - return true - } - - return true - - // WB8 - case (.numeric, .numeric): - return false - - // WB9 - case (.aLetter, .numeric), - (.hebrewLetter, .numeric): - return false - - // WB10 - case (.numeric, .aLetter), - (.numeric, .hebrewLetter): - return false - - // WB11 - case (.midNum, .numeric), + case (.midNum, .numeric), // WB11 (.midNumLet, .numeric), (.singleQuote, .numeric): - if let constraint = state.constraint { - if constraint.question == .requireNumeric { - state.constraint = nil - return false - } - - state.index = constraint.index - return true - } - - return true + return _expect(.afterWB12) - // WB12 - case (.numeric, .midNum), + case (.numeric, .midNum), // WB12 (.numeric, .midNumLet), (.numeric, .singleQuote): - state.constraint = (question: .requireNumeric, index: state.index) + return _transition(into: .afterWB12) - return false - - // WB13 - case (.katakana, .katakana): - return false - - // WB13a - case (.aLetter, .extendNumLet), + case (.katakana, .katakana), // WB13 + (.aLetter, .extendNumLet), // WB13a (.hebrewLetter, .extendNumLet), (.numeric, .extendNumLet), (.katakana, .extendNumLet), - (.extendNumLet, .extendNumLet): - return false - - // WB13b - case (.extendNumLet, .aLetter), + (.extendNumLet, .extendNumLet), + (.extendNumLet, .aLetter), // WB13b (.extendNumLet, .hebrewLetter), (.extendNumLet, .numeric), (.extendNumLet, .katakana): - return false + return _reject() - // WB15 - case (.regionalIndicator, .regionalIndicator): - defer { - state.shouldBreakRI.toggle() + case (.regionalIndicator, .regionalIndicator): // WB15/WB16 + let breakHere: Bool + if _state == .afterMidFlag { + _state = .ordinary + breakHere = true + } else { + _state = .afterMidFlag + breakHere = false } + return (setCandidate: false, breakAtCandidate: false, breakHere: breakHere) - return state.shouldBreakRI - - default: - return true + 
default: // WB999 + return _accept() } } + + /// Returns true if the previously reported word boundary candidate needs to + /// be promoted to a full break if there are no more scalars in the input text. + /// + /// There is always an (implicit) word boundary at the end of + /// text, following the last scalar; however, the end may also trigger a + /// pending unreported break at the last candidate previously set by + /// `hasBreak`. This method returns true in that case, allowing clients to + /// reliably detect such boundaries. + public func hasCandidateBreakAtEnd() -> Bool { + _state.hasPendingCandidate + } } -extension _StringGuts { - // The "algorithm" that determines whether or not we should break between - // certain word break properties. - // - // This is based off of the Unicode Annex #29 for [Word Boundary - // Rules](https://unicode.org/reports/tr29/#Word_Boundary_Rules). - internal func shouldBreakBackward( - between scalar1: Unicode.Scalar, - and scalar2: Unicode.Scalar, - with state: inout _WordBreakingState - ) -> Bool { - // WB3 - if scalar1.value == 0xD, scalar2.value == 0xA { - return false +extension Unicode { + /// A state machine for recognizing safe word boundaries in a backward + /// sequence of Unicode scalars, based on the specification in [Unicode Annex + /// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules). + /// + /// The text segmentation algorithm is not stable, and it allows implementers + /// to tailor it to their needs. Accordingly, reported word boundaries may + /// vary in arbitrary ways between Unicode implementations and system + /// configurations, including between versions of the Swift Standard Library. + /// + /// This is intended to help implement searching for word boundaries near an + /// arbitrary position in the middle of a larger text, as described in [section + /// 6.4 of Annex #29](https://unicode.org/reports/tr29/#Random_Access). 
The + /// start position may be at an arbitrary scalar anywhere in the input text + /// -- there is no expectation that the first scalar addresses a known word + /// boundary. The state machine scans backwards from that position until it + /// detects a reliable word boundary. + /// + /// To detect a word break near a particular position in a series of Unicode + /// scalars, start iterating scalars backward from the start position, feeding + /// each of them to the `hasGuaranteedBreak(after:)` method. The method + /// indicates if there is a word break following the given scalar, or at a + /// previously reported candidate position. There is always a word break at + /// the start of the text; so if we run out of scalars, the start position is + /// going to be a suitable safe word boundary. + /// + /// Note that this construct may skip over an arbitrary number of word + /// boundaries while it is searching for a safe break position. Once a safe + /// boundary is found, callers are usually expected to use it as the start + /// position to iterate forward using the standard segmentation algorithm + /// (implemented by `Unicode._WordRecognizer`), for example until they find + /// the break nearest to their original start position. In such cases, it is + /// usually a good idea to incrementally memoize word boundaries as they are + /// detected, to avoid repeating this process on the same positions later. + @available(StdlibDeploymentTarget 6.3, *) + public // Core primitive + struct _RandomAccessWordRecognizer: Sendable { + // FIXME: We also need proper public API for this + + /// The last scalar that was fed to `hasGuaranteedBreak`; i.e., the scalar + /// immediately following the current one in the text. + var _nextScalar: Unicode.Scalar + /// The cached word break property of `_nextScalar`. + var _nextCategory: _WordBreakProperty + /// The word break property of the most recently seen scalar that wasn't + /// ignored by rule WB4. 
+ var _baseCategory: _WordBreakProperty + /// Additional recognizer state. + var _state: _State + var _hasPendingCandidate: Bool + + /// Initialize a new word recognizer at an arbitrary text position preceding + /// the given scalar. + public init(before scalar: Unicode.Scalar) { + _nextScalar = scalar + _nextCategory = Unicode._WordBreakProperty(from: scalar) + _baseCategory = _nextCategory + _state = .initial + _hasPendingCandidate = false } + } +} - let x = Unicode._WordBreakProperty(from: scalar1) - let y = Unicode._WordBreakProperty(from: scalar2) - - switch (x, y) { - // Fast path: If we know our scalars have no properties the decision is - // trivial and we don't need to crawl to the default statement. - case (.any, .any): - return true +@available(StdlibDeploymentTarget 6.3, *) +extension Unicode._RandomAccessWordRecognizer { + internal enum _State: Int, Sendable { + case initial + case ordinary + case beforeWB7 + case beforeWB7c + case beforeWB11 - // WB3a and WB3b - case (.newlineCRLF, _), - (_, .newlineCRLF): - return true + var hasPendingCandidate: Bool { + switch self { + case .beforeWB7, .beforeWB7c, .beforeWB11: return true + default: return false + } + } + } - // WB3c - case (.zwj, .extendedPictographic): - return false + internal mutating func _reject( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + _hasPendingCandidate = false + return (setCandidate: false, breakAtCandidate: false, breakHere: false) + } - // WB3d - case (.wSegSpace, .wSegSpace): - return false + internal mutating func _ignore( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + return (setCandidate: false, breakAtCandidate: false, breakHere: false) + } - // WB4 - case (.format, _), - (.extend, _), - (.zwj, _): - if y != .format && y != .extend && y != .zwj { - state.previousProperty = y - - // If we already have a constraint in flight, then use that as our base - // previous index. Otherwise, use where we're at right now. 
- if let constraint = state.constraint { - state.previousIndex = constraint.index - } else { - state.previousIndex = state.index - } - } + internal mutating func _accept( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + if _hasPendingCandidate { + _hasPendingCandidate = false + return (setCandidate: false, breakAtCandidate: true, breakHere: false) + } + return (setCandidate: false, breakAtCandidate: false, breakHere: true) + } - return false + internal mutating func _placeCandidate( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + if _state == .initial { + return _reject() + } + _hasPendingCandidate = true + return (setCandidate: true, breakAtCandidate: false, breakHere: false) + } - // WB4 - case (_, .format), - (_, .extend), - (_, .zwj): - if state.previousProperty != nil { - fallthrough - } + internal mutating func _placeSoftCandidate( + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + if _hasPendingCandidate || _state == .initial { + return (false, false, false) + } + return _placeCandidate() + } - return false + public mutating func hasGuaranteedBreak( + after previousScalar: Unicode.Scalar + ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { + let prevCategory = Unicode._WordBreakProperty(from: previousScalar) + var newState: _State = .ordinary + var newBase = prevCategory + defer { + _nextCategory = prevCategory + _nextScalar = previousScalar + _baseCategory = newBase + _state = newState + } - default: - var newY = y + switch (prevCategory, _nextCategory) { + case (.any, .any): // WB999 shortcut + return _accept() - if let previousProperty = state.previousProperty { - newY = previousProperty + case (.newlineCRLF, _), // WB3a + (_, .newlineCRLF): // WB3b + if previousScalar.value == 0xD, _nextScalar.value == 0xA { // WB3 + return _reject() } + return _accept() - return decidePostFormatBackward(between: x, and: newY, with: &state) - } - } + case (.zwj, 
.extendedPictographic), // WB3c + (.wSegSpace, .wSegSpace): // WB3d + newBase = _baseCategory + newState = _state + return _reject() - internal func decidePostFormatBackward( - between x: Unicode._WordBreakProperty, - and y: Unicode._WordBreakProperty, - with state: inout _WordBreakingState - ) -> Bool { - state.previousProperty = nil + case (.format, _), // WB4 + (.extend, _), + (.zwj, _): + newBase = _baseCategory + newState = _state + if _state == .initial || _nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj { + return _ignore() + } + return _placeSoftCandidate() - switch (x, y) { - case (.any, .any): - return true + default: + break + } - // WB5 - case (.aLetter, .aLetter), + switch (prevCategory, _baseCategory) { + case (.aLetter, .aLetter), // WB5 (.aLetter, .hebrewLetter), (.hebrewLetter, .aLetter), (.hebrewLetter, .hebrewLetter): - state.previousIndex = nil - return false - - // WB6 - case (.aLetter, .midLetter), - (.hebrewLetter, .midLetter), - (.aLetter, .midNumLet), - (.hebrewLetter, .midNumLet), - (.aLetter, .singleQuote): - if let constraint = state.constraint { - if constraint.question == .requireAHLetter { - state.constraint = nil - state.previousIndex = nil - return false - } - - state.index = constraint.index - return true - } + return _reject() - return true - - // WB7 - case (.midLetter, .aLetter), + case (.midLetter, .aLetter), // WB7 (.midLetter, .hebrewLetter), (.midNumLet, .aLetter), (.midNumLet, .hebrewLetter), (.singleQuote, .aLetter), (.singleQuote, .hebrewLetter): - state.constraint = (question: .requireAHLetter, index: state.index) - - return false - - // WB7a - case (.hebrewLetter, .singleQuote): - state.previousIndex = nil - return false - - // WB7b - case (.hebrewLetter, .doubleQuote): - if let constraint = state.constraint { - if constraint.question == .requireHebrewLetter { - state.constraint = nil - state.previousIndex = nil - return false - } - - state.index = constraint.index - return true - } - - 
return true + newState = .beforeWB7 + return _placeSoftCandidate() - // WB7c - case (.doubleQuote, .hebrewLetter): - state.constraint = (question: .requireHebrewLetter, index: state.index) + case (.aLetter, .midLetter), // WB6 + (.hebrewLetter, .midLetter), + (.aLetter, .midNumLet), + (.hebrewLetter, .midNumLet), + (.aLetter, .singleQuote): + if _state == .beforeWB7 || _state == .initial { + return _reject() + } + return _accept() - return false + case (.hebrewLetter, .singleQuote): // WB7a + return _reject() - // WB8 - case (.numeric, .numeric): - state.previousIndex = nil - return false + case (.doubleQuote, .hebrewLetter): // WB7c + newState = .beforeWB7c + return _placeSoftCandidate() - // WB9 - case (.aLetter, .numeric), - (.hebrewLetter, .numeric): - state.previousIndex = nil - return false + case (.hebrewLetter, .doubleQuote): // WB7b + if _state == .beforeWB7c || _state == .initial { + return _reject() + } + return _accept() - // WB10 - case (.numeric, .aLetter), + case (.numeric, .numeric), // WB8 + (.aLetter, .numeric), // WB9 + (.hebrewLetter, .numeric), + (.numeric, .aLetter), // WB10 (.numeric, .hebrewLetter): - state.previousIndex = nil - return false + return _reject() - // WB11 - case (.midNum, .numeric), + case (.midNum, .numeric), // WB11 (.midNumLet, .numeric), (.singleQuote, .numeric): - state.constraint = (question: .requireNumeric, index: state.index) - - return false + newState = .beforeWB11 + return _placeSoftCandidate() - // WB12 - case (.numeric, .midNum), + case (.numeric, .midNum), // WB12 (.numeric, .midNumLet), (.numeric, .singleQuote): - if let constraint = state.constraint { - if constraint.question == .requireNumeric { - state.constraint = nil - state.previousIndex = nil - return false - } - - state.index = constraint.index - return true + if _state == .beforeWB11 || _state == .initial { + return _reject() } + return _accept() - return true - - // WB13 - case (.katakana, .katakana): - state.previousIndex = nil - return false - - // 
WB13a - case (.aLetter, .extendNumLet), + case (.katakana, .katakana), // WB13 + (.aLetter, .extendNumLet), // WB13a (.hebrewLetter, .extendNumLet), (.numeric, .extendNumLet), (.katakana, .extendNumLet), - (.extendNumLet, .extendNumLet): - state.previousIndex = nil - return false - - // WB13b - case (.extendNumLet, .aLetter), + (.extendNumLet, .extendNumLet), + (.extendNumLet, .aLetter), // WB13b (.extendNumLet, .hebrewLetter), (.extendNumLet, .numeric), (.extendNumLet, .katakana): - state.previousIndex = nil - return false - - // WB15 - case (.regionalIndicator, .regionalIndicator): - var riCount = 0 - var previousRIIndex = state.index - var constraintIndex = state.index - - if let constraint = state.constraint { - if case let .checkingRegionalIndicator(count, riIndex) = - constraint.question { - riCount = count + 1 - previousRIIndex = count == 0 ? state.index : riIndex - constraintIndex = constraint.index - } - } else { - if let previousIndex = state.previousIndex { - constraintIndex = previousIndex - } - } - - state.constraint = ( - question: .checkingRegionalIndicator( - count: riCount, - previousRIIndex: previousRIIndex - ), - index: constraintIndex - ) + return _reject() - state.previousIndex = nil + case (.regionalIndicator, .regionalIndicator): // WB15/WB16 + return _reject() - return false + case (_, .format), + (_, .extend), + (_, .zwj): + _internalInvariant(!_hasPendingCandidate) + newState = .initial + return _reject() default: - return true + if + !_hasPendingCandidate, + _nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj + { + return _ignore() + } + return _accept() } } +} - internal func handleRIConstraint( - _ constraint: (question: _WordQuestion, index: Int), - with state: _WordBreakingState - ) -> Int? { - if case let .checkingRegionalIndicator(count, previousRIIndex) = - constraint.question { - // If our count is 0, then we were unable to update previousRIIndex. - // However, that index is now equal to state.index. 
- if count == 0 { - return state.index - } +extension String { + /// Find and return a word boundary position at or before an arbitrary index + /// within this string. The result may not be the closest word break to the + /// start position. + /// + /// This implements the core algorithm for finding a "safe" starting point for + /// random access to word breaks, following [section 6.4 of Unicode Annex + /// #29](https://unicode.org/reports/tr29/#Random_Access). Unicode defines + /// word boundaries using a forward-only state machine; this algorithm does + /// its best to run the state machine backwards until it finds a guaranteed + /// break position. + /// + /// This process is inherently an approximation: the algorithm may need to + /// skip over an arbitrary number of actual word boundaries before finding one + /// that it can judge with confidence. This makes it relatively expensive to + /// iterate over word boundaries backwards; in the worst case, a naive + /// implementation may have quadratic complexity. The recommended way to + /// mitigate this is to maintain a cache of word breaks already traversed, + /// only calling this method to extend the range of known breaks backwards as + /// needed. + /// + /// - Parameter i: An arbitrary index within the string, not necessarily + /// addressing a word boundary. + /// - Returns: A valid index less than or equal to the input that is guaranteed to + /// identify some word boundary at or before `i`.
+ @available(SwiftStdlib 6.3, *) + public func _wordIndex(somewhereAtOrBefore i: Index) -> Index { + var j = _guts.validateInclusiveScalarIndex(i) + if j == endIndex { + return j + } + var recognizer = Unicode._RandomAccessWordRecognizer( + before: self.unicodeScalars[j]) + var candidate = j + while j > self.startIndex { + let p = self.unicodeScalars.index(before: j) + let b = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[p]) + if b.setCandidate { candidate = j } + if b.breakAtCandidate { return candidate } + if b.breakHere { return j } + j = p + } + return j + } - // We were able to update previousRIIndex! - if count.isMultiple(of: 2) { - return previousRIIndex + /// Return the word boundary position following a known word boundary within + /// this string. + /// + /// This implements the word boundary specification of [Unicode Annex + /// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The + /// algorithm is not stable, and it allows implementers to tailor it to their + /// needs; accordingly, the result of this operation may vary between Unicode + /// implementations and system configurations, including versions of the Swift + /// Standard Library. + /// + /// Note: The input index must be on a known word boundary, otherwise the + /// result of this operation is unspecified. The start and end indices are + /// always known word boundaries, in every string. + /// + /// - Parameter i: A valid index addressing a word boundary within this + /// string. + /// - Returns: The first word break strictly following `i` in the string. 
+ @available(StdlibDeploymentTarget 5.7, *) + public func _wordIndex(after i: String.Index) -> String.Index { + guard #available(StdlibDeploymentTarget 6.3, *) else { + fatalError("Unreachable") + } + let i = _guts.validateScalarIndex(i) + if _slowPath(_guts.isForeign) { + return _guts._nextForeignWordIndex(after: i) + } + return _guts._nextUTF8WordIndex(after: i) + } +} + +extension _StringGuts { + @available(StdlibDeploymentTarget 6.3, *) + @inline(never) + @_effects(releasenone) + internal func _nextUTF8WordIndex(after index: Index) -> Index { + _internalInvariant(self.isFastUTF8) + let result = unsafe self.withFastUTF8 { utf8 in + var offset = index._encodedOffset + let first = unsafe _decodeScalar(utf8, startingAt: offset) + offset &+= first.scalarLength + var recognizer = Unicode._WordRecognizer(after: first.0) + var candidate = offset + while offset < utf8.count { + let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: offset) + let r = recognizer.hasBreak(before: scalar) + if r.setCandidate { candidate = offset } + if r.breakAtCandidate { return candidate } + if r.breakHere { return offset } + offset &+= len + } + if recognizer.hasCandidateBreakAtEnd() { + return candidate } + return offset } + // Note: We only signal that the result is scalar aligned, not + // character-aligned. Unicode does attempt to ensure that word breaks are + // always character-aligned, but this is not a strict guarantee, especially + // not if either segmentation algorithm has a tailored implementation. (As + // of 6.3, we do not tailor our implementations, but we used to do so and we + // may choose to do it again in the future.) 
+ return Index(_encodedOffset: result)._scalarAligned._knownUTF8 + } - return nil + @available(StdlibDeploymentTarget 6.3, *) + @inline(never) + internal func _nextForeignWordIndex(after index: Index) -> Index { + #if _runtime(_ObjC) + _internalInvariant(self.isForeign) + let scalars = String.UnicodeScalarView(self) + var recognizer = Unicode._WordRecognizer(after: scalars[index]) + var i = scalars.index(after: index) + var candidate = i + while i < scalars.endIndex { + let r = recognizer.hasBreak(before: scalars[i]) + if r.setCandidate { candidate = i } + if r.breakAtCandidate { return candidate } + if r.breakHere { return i } + scalars.formIndex(after: &i) + } + if recognizer.hasCandidateBreakAtEnd() { + return candidate + } + return i + #else + fatalError("Foreign strings are unsupported on this platform") + #endif } } - diff --git a/stdlib/public/core/UnicodeBreakProperty.swift b/stdlib/public/core/UnicodeBreakProperty.swift index c0f97694d6c51..d4e197e87f316 100644 --- a/stdlib/public/core/UnicodeBreakProperty.swift +++ b/stdlib/public/core/UnicodeBreakProperty.swift @@ -13,7 +13,7 @@ import SwiftShims extension Unicode { - internal enum _GraphemeBreakProperty { + internal enum _GraphemeBreakProperty: Sendable { case any case control case extend @@ -86,7 +86,7 @@ extension Unicode { } extension Unicode { - internal enum _WordBreakProperty { + internal enum _WordBreakProperty: UInt8, Sendable { case aLetter case any case doubleQuote @@ -105,8 +105,8 @@ extension Unicode { case singleQuote case wSegSpace case zwj - - init(from scalar: Unicode.Scalar) { + + internal init(from scalar: Unicode.Scalar) { switch scalar.value { case 0xA ... 
0xD, 0x85, @@ -122,7 +122,7 @@ extension Unicode { self = .regionalIndicator default: let rawValue = _swift_stdlib_getWordBreakProperty(scalar.value) - + switch rawValue { case 0: self = .extend diff --git a/stdlib/public/core/UnicodeSPI.swift b/stdlib/public/core/UnicodeSPI.swift index 0b4b2f40e29f9..7939f878fd337 100644 --- a/stdlib/public/core/UnicodeSPI.swift +++ b/stdlib/public/core/UnicodeSPI.swift @@ -207,35 +207,3 @@ extension Unicode.Scalar.Properties { return result } } - -//===----------------------------------------------------------------------===// -// String Word Breaking -//===----------------------------------------------------------------------===// - -extension String { - @_spi(_Unicode) - @available(SwiftStdlib 5.7, *) - public func _wordIndex(after i: String.Index) -> String.Index { - let i = _guts.validateWordIndex(i) - - let next = _guts.nextWordIndex(startingAt: i._encodedOffset) - return String.Index(_encodedOffset: next) - } - - @_spi(_Unicode) - @available(SwiftStdlib 5.7, *) - public func _wordIndex(before i: String.Index) -> String.Index { - let i = _guts.validateInclusiveWordIndex(i) - - _precondition(i > startIndex, "String index is out of bounds") - - let previous = _guts.previousWordIndex(endingAt: i._encodedOffset) - return String.Index(_encodedOffset: previous) - } - - @_spi(_Unicode) - @available(SwiftStdlib 5.7, *) - public func _nearestWordIndex(atOrBelow i: String.Index) -> String.Index { - _guts.validateInclusiveWordIndex(i) - } -} diff --git a/test/abi/macOS/arm64/stdlib.swift b/test/abi/macOS/arm64/stdlib.swift index a1776ce3fcc34..f88a7efc3fa70 100644 --- a/test/abi/macOS/arm64/stdlib.swift +++ b/test/abi/macOS/arm64/stdlib.swift @@ -1121,3 +1121,21 @@ Added: _$ss11InlineArrayVsRi__rlE17_protectedAddressSPyq_GvpMV // lengthOfBytes(using:) Added: __swift_stdlib_NSStringLengthOfBytesInEncodingTrampoline + +// Word breaking symbols exposed in 6.3 +Added: _$sSS10_wordIndex19somewhereAtOrBeforeSS0B0VAD_tF +Added: 
_$ss7UnicodeO15_WordRecognizerV22hasCandidateBreakAtEndSbyF +Added: _$ss7UnicodeO15_WordRecognizerV5afterAdB6ScalarV_tcfC +Added: _$ss7UnicodeO15_WordRecognizerV8hasBreak6beforeSb12setCandidate_Sb07breakAtH0Sb0I4HeretAB6ScalarV_tF +Added: _$ss7UnicodeO15_WordRecognizerVADycfC +Added: _$ss7UnicodeO15_WordRecognizerVMa +Added: _$ss7UnicodeO15_WordRecognizerVMn +Added: _$ss7UnicodeO15_WordRecognizerVN +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerV18hasGuaranteedBreak5afterSb12setCandidate_Sb07breakAtK0Sb0L4HeretAB6ScalarV_tF +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerV6beforeAdB6ScalarV_tcfC +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerVMa +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerVMn +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerVN +// Obsolete/broken SPIs removed in 6.3 +Removed: _$sSS17_nearestWordIndex9atOrBelowSS0C0VAD_tF +Removed: _$sSS10_wordIndex6beforeSS0B0VAD_tF diff --git a/test/abi/macOS/x86_64/stdlib.swift b/test/abi/macOS/x86_64/stdlib.swift index c23a5913c6eeb..fc0656ba4f460 100644 --- a/test/abi/macOS/x86_64/stdlib.swift +++ b/test/abi/macOS/x86_64/stdlib.swift @@ -1121,3 +1121,21 @@ Added: _$ss11InlineArrayVsRi__rlE17_protectedAddressSPyq_GvpMV // lengthOfBytes(using:) Added: __swift_stdlib_NSStringLengthOfBytesInEncodingTrampoline + +// Word breaking symbols exposed in 6.3 +Added: _$sSS10_wordIndex19somewhereAtOrBeforeSS0B0VAD_tF +Added: _$ss7UnicodeO15_WordRecognizerV22hasCandidateBreakAtEndSbyF +Added: _$ss7UnicodeO15_WordRecognizerV5afterAdB6ScalarV_tcfC +Added: _$ss7UnicodeO15_WordRecognizerV8hasBreak6beforeSb12setCandidate_Sb07breakAtH0Sb0I4HeretAB6ScalarV_tF +Added: _$ss7UnicodeO15_WordRecognizerVADycfC +Added: _$ss7UnicodeO15_WordRecognizerVMa +Added: _$ss7UnicodeO15_WordRecognizerVMn +Added: _$ss7UnicodeO15_WordRecognizerVN +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerV18hasGuaranteedBreak5afterSb12setCandidate_Sb07breakAtK0Sb0L4HeretAB6ScalarV_tF +Added:
_$ss7UnicodeO27_RandomAccessWordRecognizerV6beforeAdB6ScalarV_tcfC +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerVMa +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerVMn +Added: _$ss7UnicodeO27_RandomAccessWordRecognizerVN +// Obsolete/broken SPIs removed in 6.3 +Removed: _$sSS17_nearestWordIndex9atOrBelowSS0C0VAD_tF +Removed: _$sSS10_wordIndex6beforeSS0B0VAD_tF diff --git a/validation-test/stdlib/StringWordBreaking.swift b/validation-test/stdlib/StringWordBreaking.swift index 6597c38898763..d3a0ca0a4268a 100644 --- a/validation-test/stdlib/StringWordBreaking.swift +++ b/validation-test/stdlib/StringWordBreaking.swift @@ -2,35 +2,119 @@ // RUN: %target-run-stdlib-swift %S/Inputs/ // REQUIRES: executable_test -// REQUIRES: objc_interop // REQUIRES: optimized_stdlib +// REQUIRES: objc_interop -@_spi(_Unicode) -import Swift +// FIXME: Text segmentation test cases are only available when we have Foundation import StdlibUnittest import StdlibUnicodeUnittest import Foundation let StringWordBreaking = TestSuite("StringWordBreaking") +defer { runAllTests() } -// FIXME: Reenable once we figure out what to do with WordView -// @available(SwiftStdlib 5.7, *) -// extension String._WordView { -// var backwardsCount: Int { -// var c = 0 -// var index = endIndex -// while index != startIndex { -// c += 1 -// formIndex(before: &index) -// } -// return c -// } -// } +extension String { + /// Returns all word boundaries within the string, using a single word + /// recognizer instance. This is the most efficient way to find word + /// boundaries, as it processes each scalar exactly once. 
+ @available(StdlibDeploymentTarget 6.3, *) + func fastWordBreaks() -> [String.Index] { + var result: [String.Index] = [] + var i = self.startIndex + var recognizer = Unicode._WordRecognizer() + var candidate = i + while i < self.endIndex { + let (setCandidate, breakAtCandidate, breakHere) = + recognizer.hasBreak(before: self.unicodeScalars[i]) + if setCandidate { + candidate = i + } + if breakAtCandidate { + result.append(candidate) + } + if breakHere { + result.append(i) + } + self.unicodeScalars.formIndex(after: &i) + } + if recognizer.hasCandidateBreakAtEnd() { + result.append(candidate) + } + result.append(i) + return result + } + + /// Return the word boundary position preceding a known boundary within this + /// string. + /// + /// This implements the word boundary specification of [Unicode Annex + /// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The + /// algorithm is not stable, and it allows implementers to tailor it to their + /// needs; accordingly, the result of this operation may vary between Unicode + /// implementations and system configurations, including versions of the Swift + /// Standard Library. + /// + /// - Note: If the input index is not on a word boundary, then it is first + /// rounded down to the nearest boundary before starting this operation. + /// + /// - Warning: Using this method to iterate over the word breaks in a string + /// backward has worst-case complexity that is proportional to the _square_ + /// of the length of the string. It is usually a better idea to keep a + /// cache of known word boundaries, calculated by iterating _forwards_ from + /// the start index, or a position returned by + /// `_wordIndex(somewhereAtOrBefore:)`. + /// + /// - Parameter i: A valid index addressing a word boundary within this + /// string. + /// - Returns: The nearest word break strictly preceding `i` in the string.
+ @available(StdlibDeploymentTarget 6.3, *) + public func _wordIndex(before i: String.Index) -> String.Index { + let i = self.unicodeScalars._index(roundingDown: i) + var j = _wordIndex(somewhereAtOrBefore: unicodeScalars.index(before: i)) + + // We know there is a stable break at `j`, however, the backward search may + // have skipped over some conditional breaks that it could not fully + // evaluate. Find the closest actual break that precedes `i` by iterating + // forward until we reach or jump over it. + precondition(j < i) + var recognizer = Unicode._WordRecognizer() + var bestBreak = j + var candidate = j + while j < self.endIndex { + let r = recognizer.hasBreak(before: self.unicodeScalars[j]) + if r.setCandidate { candidate = j } + if r.breakAtCandidate { + guard candidate < i else { break } + bestBreak = candidate + } + if r.breakHere { + guard j < i else { break } + bestBreak = j + } + self.unicodeScalars.formIndex(after: &j) + } + if j == self.endIndex, candidate < i, recognizer.hasCandidateBreakAtEnd() { + bestBreak = candidate + } + precondition(bestBreak < i) + return bestBreak + } +} extension String { + @available(SwiftStdlib 6.3, *) + var statefulWords: [String] { + let breaks = fastWordBreaks() + var prev = breaks[0] + return breaks.dropFirst().map { next in + defer { prev = next } + return String(self[prev ..< next]) + } + } + @available(SwiftStdlib 5.9, *) - var _words: [String] { + var statelessWords: [String] { var result: [String] = [] var i = startIndex @@ -48,8 +132,8 @@ extension String { return result } - @available(SwiftStdlib 5.9, *) - var _wordsBackwards: [String] { + @available(SwiftStdlib 6.3, *) + var backwardWords: [String] { var result: [String] = [] var i = endIndex @@ -68,54 +152,22 @@ extension String { } } -if #available(SwiftStdlib 6.1, *) { - StringWordBreaking.test("word breaking") { - for wordBreakTest in wordBreakTests { - expectEqual( - wordBreakTest.1, - wordBreakTest.0._words, - "string: \(String(reflecting: 
wordBreakTest.0))") - expectEqual( - wordBreakTest.1.reversed(), - wordBreakTest.0._wordsBackwards, - "string: \(String(reflecting: wordBreakTest.0))") - } +extension Unicode.Scalar { + var unicodeNotation: String { + let v = String(self.value, radix: 16, uppercase: true) + return "U+\(String(repeating: "0", count: max(0, 4 - v.count)))\(v)" } } -// rdar://116652595 -// -// We were accidentally hanging when rounding word indices for some concoctions of -// strings. In particular, where we had a pair of scalars create a constraint -// for the preceding pair, but the preceding extend rules were not taking the -// constraint into consideration. -if #available(SwiftStdlib 5.10, *) { - StringWordBreaking.test("word breaking backward extend constraints") { - let strs = ["日\u{FE0F}:X ", "👨‍👨‍👧‍👦\u{FE0F}:X ", "⛔️:X ", "⛔️·X ", "⛔️:X "] - let strWords = [ - ["日\u{FE0F}", ":", "X", " "], - ["👨‍👨‍👧‍👦\u{FE0F}", ":", "X", " "], - ["⛔️", ":", "X", " "], - ["⛔️", "·", "X", " "], - ["⛔️", ":", "X", " "] - ] - - for (str, words) in zip(strs, strWords) { - expectEqual( - words, - str._words, - "string: \(String(reflecting: str))" - ) - - expectEqual( - words.reversed(), - str._wordsBackwards, - "string: \(String(reflecting: str))" - ) - } +extension String { + var scalarDescriptions: String { + return self.unicodeScalars + .lazy.map { $0.unicodeNotation } + .joined(separator: " ") } } +#if _runtime(_ObjC) // The most simple subclass of NSString that CoreFoundation does not know // about. 
class NonContiguousNSString : NSString { @@ -123,16 +175,17 @@ class NonContiguousNSString : NSString { fatalError("don't call this initializer") } required init(itemProviderData data: Data, typeIdentifier: String) throws { - fatalError("don't call this initializer") + fatalError("don't call this initializer") } - override init() { + override init() { _value = [] - super.init() + super.init() } - init(_ value: [UInt16]) { - _value = value + @inline(never) + init(_ value: some Sequence) { + _value = Array(value) super.init() } @@ -157,36 +210,77 @@ extension _StringGuts { @_silgen_name("$ss11_StringGutsV9isForeignSbvg") func _isForeign() -> Bool } - -func getUTF16Array(from string: String) -> [UInt16] { - var result: [UInt16] = [] - - for cp in string.utf16 { - result.append(cp) +#endif + +func testCases() -> [(String, [String])] { + var tests = StdlibUnicodeUnittest.wordBreakTests + if #available(SwiftStdlib 5.10, *) { + // rdar://116652595 + // + // We were accidentally hanging when rounding word indices for some + // concoctions of strings. In particular, where we had a pair of scalars + // create a constraint for the preceding pair, but the preceding extend + // rules were not taking the constraint into consideration. 
+ tests += [ + ("日\u{FE0F}:X ", ["日\u{FE0F}", ":", "X", " "]), + ("👨‍👨‍👧‍👦\u{FE0F}:X ", ["👨‍👨‍👧‍👦\u{FE0F}", ":", "X", " "]), + ("⛔️:X ", ["⛔️", ":", "X", " "]), + ("⛔️·X ", ["⛔️", "·", "X", " "]), + ("⛔️:X ", ["⛔️", ":", "X", " "]), + ] } + if #available(SwiftStdlib 6.3, *) { + tests += [ + // https://github.com/swiftlang/swift-experimental-string-processing/issues/818 + // rdar://154902007 + ("\u{2060}\u{2018}\u{2060}\u{2060}example.com\u{2060}\u{2060}\u{2019}", + ["\u{2060}", "\u{2018}\u{2060}\u{2060}", "example.com\u{2060}\u{2060}", "\u{2019}"]), + ] + } + return tests +} - return result +if #available(SwiftStdlib 6.1, *) { + StringWordBreaking.test("word breaking") { + for (input, expectedWords) in testCases() { + expectEqual( + input.statelessWords, + expectedWords, + "input: \(input.debugDescription) \(input.scalarDescriptions)") + if #available(SwiftStdlib 6.3, *) { + expectEqual( + input.statefulWords, + expectedWords, + "input: \(input.debugDescription) \(input.scalarDescriptions)") + expectEqual( + input.backwardWords, + expectedWords.reversed(), + "input: \(input.debugDescription) \(input.scalarDescriptions)") + } + } + } } if #available(SwiftStdlib 6.1, *) { StringWordBreaking.test("word breaking foreign") { - for wordBreakTest in wordBreakTests { - let foreignTest = NonContiguousNSString( - getUTF16Array(from: wordBreakTest.0) - ) - let test = foreignTest as String + for (nativeString, expectedWords) in testCases() { + let input = NonContiguousNSString(nativeString.utf16) as String - expectTrue(test._guts._isForeign()) + expectTrue(input._guts._isForeign()) expectEqual( - wordBreakTest.1, - test._words, - "string: \(String(reflecting: wordBreakTest.0))") - expectEqual( - wordBreakTest.1.reversed(), - test._wordsBackwards, - "string: \(String(reflecting: wordBreakTest.0))") + input.statelessWords, + expectedWords, + "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)") + if #available(SwiftStdlib 6.3, *) { + expectEqual( + 
input.statefulWords, + expectedWords, + "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)") + expectEqual( + input.backwardWords, + expectedWords.reversed(), + "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)") + } } } } - -runAllTests() diff --git a/validation-test/stdlib/UnicodeWordRecognizer.swift b/validation-test/stdlib/UnicodeWordRecognizer.swift new file mode 100644 index 0000000000000..352199139786e --- /dev/null +++ b/validation-test/stdlib/UnicodeWordRecognizer.swift @@ -0,0 +1,311 @@ +// RUN: %empty-directory(%t) +// RUN: %target-run-stdlib-swift + +// REQUIRES: executable_test +// REQUIRES: objc_interop +// REQUIRES: optimized_stdlib + +// Validate that the various forms of word breaking all lead to consistent +// results by exhaustively enumerating all possible state machine inputs up to +// an adequately high length. +// +// The word breaking algorithm only cares about word break properties, not +// specific scalar values. This lets us only use a single representative sample +// in each class, drastically cutting down the input space to iterate through. +// This makes it practical to do this up to limits that give us practically +// useful results. 
+ +import StdlibUnittest + +let suite = TestSuite("UnicodeWordRecognizer") +defer { runAllTests() } + +// One representative sample from each character class that's relevant to word breaking +let samples: [Unicode.Scalar] = [ + "\u{000D}", // CR + "\u{000A}", // LF + "\u{2028}", // Newline (LINE SEPARATOR) + "\u{0041}", // ALetter (LATIN CAPITAL LETTER A) + "\u{0022}", // Double_Quote (QUOTATION MARK) + "\u{0027}", // Single_Quote (APOSTROPHE) + "\u{200D}", // ZWJ (ZERO WIDTH JOINER) + "\u{1F1E6}", // RI (REGIONAL INDICATOR SYMBOL LETTER A) + "\u{05D0}", // Hebrew_Letter (HEBREW LETTER ALEF) + "\u{0300}", // Extend (COMBINING GRAVE ACCENT) + "\u{00AD}", // Format (SOFT HYPHEN) + "\u{3031}", // Katakana (VERTICAL KANA REPEAT MARK) + "\u{003A}", // MidLetter (COLON) + "\u{002C}", // MidNum (COMMA) + "\u{002E}", // MidNumLet (FULL STOP) + "\u{0030}", // Numeric (DIGIT ZERO) + "\u{005F}", // ExtendNumLet (LOW LINE) + "\u{0020}", // WSegSpace (SPACE) + "\u{00A9}", // \p{Extended_Pictographic} (COPYRIGHT) + "\u{0021}", // Any (EXCLAMATION MARK) +] + +/// Call `body` with every array of the specified count consisting of +/// integer elements in the given range. This is calculating the +/// Cartesian `n`-ary power of the `range` argument. 
+/// +/// withEveryArray(of: 1 ..< 3, count: 3) { print($0) } +/// // [1, 1, 1] +/// // [2, 1, 1] +/// // [1, 2, 1] +/// // [2, 2, 1] +/// // [1, 1, 2] +/// // [2, 1, 2] +/// // [1, 2, 2] +/// // [2, 2, 2] +func withEveryArray( + of range: Range, + count n: Int, + _ body: ([Int]) throws(E) -> Void +) throws(E) { + var vector: [Int] = .init(repeating: range.lowerBound, count: n) + guard n > 0 else { + try body(vector) + return + } + while true { + try body(vector) + var i = 0 + while true { + vector[i] += 1 + if vector[i] < range.upperBound { + break + } + vector[i] = range.lowerBound + i += 1 + if i == n { + return // done + } + } + } +} + +func string(for vector: [Int]) -> String { + var s = "" + for digit in vector { + s.unicodeScalars.append(samples[digit]) + } + return s +} + +extension Unicode.Scalar { + var unicodeNotation: String { + let v = String(self.value, radix: 16, uppercase: true) + return "U+\(String(repeating: "0", count: max(0, 4 - v.count)))\(v)" + } +} + +extension String { + var scalarDescriptions: String { + return self.unicodeScalars + .lazy.map { $0.unicodeNotation } + .joined(separator: " ") + } +} + +extension Collection { + /// Return a sorted array of all valid indices in the collection, including + /// the end index. + func allIndices() -> [Index] { + var result: [Index] = [] + result.reserveCapacity(count + 1) + result.append(contentsOf: indices) + result.append(endIndex) + return result + } +} + +extension Sequence where Element: Equatable { + /// Returns true if the elements of `self` form a sub-sequence of `other`, + /// where both inputs are monotonic. `self` is not allowed to contain + /// more than a single copy of any item in `other`. 
+ func isMonotonicSubsequence(of other: some Sequence) -> Bool { + var i = makeIterator() + var j = other.makeIterator() + var b = j.next() + while let a = i.next() { + while true { + if b == nil { return false } + if a == b { + b = j.next() + break + } + b = j.next() + } + } + return true + } + + /// Returns true if the elements of `self` form a sub-sequence of `other`, + /// where both inputs are monotonic. `self` is allowed to contain duplicate + /// elements. + func isMonotonicRepeatingSubsequence(of other: some Sequence) -> Bool { + var i = makeIterator() + var j = other.makeIterator() + var b = j.next() + while let a = i.next() { + while true { + if b == nil { return false } + if a == b { break } + b = j.next() + } + } + return true + } + +} + +extension String { + /// Returns all word boundaries within the string, using a single word + /// recognizer instance. This is the most efficient way to find word + /// boundaries, as it processes each scalar exactly once. + @available(StdlibDeploymentTarget 6.3, *) + func fastWordBreaks() -> [String.Index] { + var result: [String.Index] = [] + var i = self.startIndex + var recognizer = Unicode._WordRecognizer() + var candidate = i + while i < self.endIndex { + let (setCandidate, breakAtCandidate, breakHere) = + recognizer.hasBreak(before: self.unicodeScalars[i]) + if setCandidate { + candidate = i + } + if breakAtCandidate { + result.append(candidate) + } + if breakHere { + result.append(i) + } + self.unicodeScalars.formIndex(after: &i) + } + if recognizer.hasCandidateBreakAtEnd() { + result.append(candidate) + } + result.append(i) + return result + } + + /// Returns all word breaks without keeping persistent state, using + /// `_wordIndex(after:)`. This forgets lookahead information after each word + /// boundary, so it needs to process some scalars twice, resulting in a + /// performance regression vs `fastWordBreaks()`. However, both variants are + /// supposed to have the same results. 
+ @available(StdlibDeploymentTarget 5.7, *) + func slowWordBreaks() -> [String.Index] { + var result: [String.Index] = [] + var i = self.startIndex + while i < self.endIndex { + result.append(i) + i = self._wordIndex(after: i) + } + result.append(i) + return result + } + + /// Return all "safe" word breaks in this string by using the backwards word + /// recognizer state machine, starting from the end, feeding it every Unicode + /// scalar in the string, and collecting all word boundaries detected. + /// + /// This is expected to sometimes skip over word boundaries that are detected + /// when going forward. However, it must never report a word boundary at a + /// position that isn't also detected by the forward recognizer. + @available(StdlibDeploymentTarget 6.3, *) + func safeWordBreaks() -> [String.Index] { + var result: [String.Index] = [] + guard !self.isEmpty else { return result } + result.append(self.endIndex) // There is always an implicit wordbreak at the end. + var i = self.unicodeScalars.index(before: self.endIndex) + var recognizer = Unicode._RandomAccessWordRecognizer(before: self.unicodeScalars[i]) + var candidate = i + while i > self.startIndex { + let j = self.unicodeScalars.index(before: i) + let r = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[j]) + if r.setCandidate { + candidate = i + } + if r.breakAtCandidate { + result.append(candidate) + } + if r.breakHere { + result.append(i) + } + i = j + } + result.reverse() + return result + } + + /// Return an array of "safe" word boundaries detected by the backwards word + /// recognizer state machine, invoked through + /// `_wordIndex(somewhereAtOrBefore:)`, one result per each scalar position in + /// the string (including its end index). + /// + /// This is expected to be some monotonically increasing subsequence of word + /// boundaries detected in the forward direction, allowing some repeated + /// items. 
+ @available(StdlibDeploymentTarget 6.3, *) + func randomAccessWordBreaks() -> [String.Index] { + unicodeScalars.allIndices().map { self._wordIndex(somewhereAtOrBefore: $0) } + } +} + +@available(StdlibDeploymentTarget 6.3, *) +func check(length: Int) { + withEveryArray(of: 0 ..< samples.count, count: length) { vector in + let str = string(for: vector) + + let fastBreaks = str.fastWordBreaks() + let slowBreaks = str.slowWordBreaks() + expectEqual( + fastBreaks, slowBreaks, + """ + Inconsistent word boundaries in stateful vs stateless iteration: + input: \(str.debugDescription) (\(str.scalarDescriptions)) + """) + + let safeBreaks = str.safeWordBreaks() + expectTrue( + safeBreaks.isMonotonicSubsequence(of: fastBreaks), + """ + Inconsistent safe word boundaries: + input: \(str.debugDescription) (\(str.scalarDescriptions))") + """) + + let randomAccessBreaks = str.randomAccessWordBreaks() + expectTrue( + randomAccessBreaks.isMonotonicRepeatingSubsequence(of: fastBreaks), + """ + Inconsistent random-access word boundaries: + input: \(str.debugDescription) (\(str.scalarDescriptions)) + breaks: \(fastBreaks) + random-access breaks: \(randomAccessBreaks) + """) + } +} + +if #available(StdlibDeploymentTarget 6.3, *) { + suite.test("Exhaustive consistency checks, length 1") { + check(length: 1) + } + + suite.test("Exhaustive consistency checks, length 2") { + check(length: 2) + } + + suite.test("Exhaustive consistency checks, length 3") { + check(length: 3) + } + + suite.test("Exhaustive consistency checks, length 4") { + check(length: 4) + } + + suite.test("Exhaustive consistency checks, length 5") { + check(length: 5) + } +}