Skip to content

Commit

Permalink
First pass at optimizing SplitOnWordBoundaries (#3140)
Browse files Browse the repository at this point in the history
## Motivation and Context
@Velfi observed this function consuming a significant amount of codegen
time.

## Description
Optimize `splitOnWordBoundaries`, primarily by avoiding recomputation
of the complete-word-in-progress check (`completeWordInProgress`) when we
already know that no complete word is in progress for the current word.

## Testing
- Original: 3.4s. Updated: 688ms
- Existing tests are exhaustive

For a `:aws:sdk:assemble` run of the smoke test services, build time improved
from 53 seconds to 42 seconds.
----

_By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice._
  • Loading branch information
rcoh authored Nov 2, 2023
1 parent c8edefe commit 59a4783
Showing 1 changed file with 17 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,16 @@ fun String.doubleQuote(): String =
/** Shorthand alias that delegates to [doubleQuote] on the receiver string. */
fun String.dq(): String {
    return doubleQuote()
}

// Lowercase forms of words that count as whole words even when cased differently
// (e.g. `IPv4`, `MiB`, `GiB`, `TtL`). `splitOnWordBoundaries` compares against these
// entries with `ignoreCase = true`.
private val completeWords: List<String> = listOf("ipv4", "ipv6", "sigv4", "mib", "gib", "kib", "ttl")

private fun String.splitOnWordBoundaries(): List<String> {
val out = mutableListOf<String>()
// These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL`
var currentWord = ""

var completeWordInProgress = true
// emit the current word and update from the next character
val emit = { next: Char ->
completeWordInProgress = true
if (currentWord.isNotEmpty()) {
out += currentWord.lowercase()
}
Expand All @@ -37,13 +40,17 @@ private fun String.splitOnWordBoundaries(): List<String> {
}
val allLowerCase = this.lowercase() == this
this.forEachIndexed { index, nextCharacter ->
val peek = this.getOrNull(index + 1)
val doublePeek = this.getOrNull(index + 2)
val completeWordInProgress = completeWords.any {
(currentWord + this.substring(index)).lowercase().startsWith(
it,
)
} && !completeWords.contains(currentWord.lowercase())
val computeWordInProgress = {
val result = completeWordInProgress && currentWord.isNotEmpty() && completeWords.any {
it.startsWith(currentWord, ignoreCase = true) && (currentWord + this.substring(index)).startsWith(
it,
ignoreCase = true,
) && !it.equals(currentWord, ignoreCase = true)
}

completeWordInProgress = result
result
}
when {
// [C] in these docs indicates the value of nextCharacter
// A[_]B
Expand All @@ -53,15 +60,15 @@ private fun String.splitOnWordBoundaries(): List<String> {
currentWord.isEmpty() -> currentWord += nextCharacter.toString()

// Abc[D]ef or Ab2[D]ef
!completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
!computeWordInProgress() && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)

// s3[k]ey
!completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
!computeWordInProgress() && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
nextCharacter,
)

// DB[P]roxy, or `IAM[U]ser` but not AC[L]s
endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter)
endOfAcronym(currentWord, nextCharacter, this.getOrNull(index + 1), this.getOrNull(index + 2)) -> emit(nextCharacter)

// If we haven't found a word boundary, push it and keep going
else -> currentWord += nextCharacter.toString()
Expand Down

0 comments on commit 59a4783

Please sign in to comment.