First pass at optimizing SplitOnWordBoundaries (#3140)

## Motivation and Context

@Velfi observed this function consuming a significant amount of codegen time.

## Description

Optimize splitting on word boundaries, primarily by avoiding recomputation of `isStartOfWord` when we already know it isn't the start of a word.

## Testing

- Original: 3.4s. Updated: 688ms.
- Existing tests are exhaustive.

On a `:aws:sdk:assemble` for the smoke test services, the build time improved from 53 seconds to 42 seconds.

----

_By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice._
smithy-lang · Nov 2, 2023 · 59a4783 · 59a4783
1 parent c8edefe
commit 59a4783
Showing 1 changed file with 17 additions and 10 deletions.
diff --git a/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt b/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt
@@ -19,13 +19,16 @@ fun String.doubleQuote(): String =
 fun String.dq(): String = this.doubleQuote()
 
 private val completeWords: List<String> = listOf("ipv4", "ipv6", "sigv4", "mib", "gib", "kib", "ttl")
+
 private fun String.splitOnWordBoundaries(): List<String> {
     val out = mutableListOf<String>()
     // These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL`
     var currentWord = ""
 
+    var completeWordInProgress = true
     // emit the current word and update from the next character
     val emit = { next: Char ->
+        completeWordInProgress = true
         if (currentWord.isNotEmpty()) {
             out += currentWord.lowercase()
         }
@@ -37,13 +40,17 @@ private fun String.splitOnWordBoundaries(): List<String> {
     }
     val allLowerCase = this.lowercase() == this
     this.forEachIndexed { index, nextCharacter ->
-        val peek = this.getOrNull(index + 1)
-        val doublePeek = this.getOrNull(index + 2)
-        val completeWordInProgress = completeWords.any {
-            (currentWord + this.substring(index)).lowercase().startsWith(
-                it,
-            )
-        } && !completeWords.contains(currentWord.lowercase())
+        val computeWordInProgress = {
+            val result = completeWordInProgress && currentWord.isNotEmpty() && completeWords.any {
+                it.startsWith(currentWord, ignoreCase = true) && (currentWord + this.substring(index)).startsWith(
+                    it,
+                    ignoreCase = true,
+                ) && !it.equals(currentWord, ignoreCase = true)
+            }
+
+            completeWordInProgress = result
+            result
+        }
         when {
             // [C] in these docs indicates the value of nextCharacter
             // A[_]B
@@ -53,15 +60,15 @@ private fun String.splitOnWordBoundaries(): List<String> {
             currentWord.isEmpty() -> currentWord += nextCharacter.toString()
 
             // Abc[D]ef or Ab2[D]ef
-            !completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
+            !computeWordInProgress() && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
 
             // s3[k]ey
-            !completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
+            !computeWordInProgress() && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
                 nextCharacter,
             )
 
             // DB[P]roxy, or `IAM[U]ser` but not AC[L]s
-            endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter)
+            endOfAcronym(currentWord, nextCharacter, this.getOrNull(index + 1), this.getOrNull(index + 2)) -> emit(nextCharacter)
 
             // If we haven't found a word boundary, push it and keep going
             else -> currentWord += nextCharacter.toString()