From 59a4783330e8f76279f202a8ccef86b2e944f736 Mon Sep 17 00:00:00 2001
From: Russell Cohen <rcoh@amazon.com>
Date: Thu, 2 Nov 2023 12:10:28 -0400
Subject: [PATCH] First pass at optimizing SplitOnWordBoundaries (#3140)

## Motivation and Context
@Velfi observed this function consuming a significant amount of codegen
time.

## Description
Optimize `splitOnWordBoundaries`, primarily by avoiding recomputation
of the complete-word check (`completeWordInProgress`) once we already
know the current word cannot be one of the `completeWords`.

## Testing
- Original: 3.4s. Updated: 688ms
- Existing tests are exhaustive

On a `:aws:sdk:assemble` run for the smoke test services, the run time
improved from 53 seconds to 42 seconds.
----

_By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice._
---
 .../smithy/rust/codegen/core/util/Strings.kt  | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)
diff --git a/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt b/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt
index 244eace361..3ef2731293 100644
--- a/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt
+++ b/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt
@@ -19,13 +19,16 @@ fun String.doubleQuote(): String =
 fun String.dq(): String = this.doubleQuote()
 
 private val completeWords: List<String> = listOf("ipv4", "ipv6", "sigv4", "mib", "gib", "kib", "ttl")
+
 private fun String.splitOnWordBoundaries(): List<String> {
     val out = mutableListOf<String>()
     // These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL`
     var currentWord = ""
 
+    var completeWordInProgress = true
     // emit the current word and update from the next character
     val emit = { next: Char ->
+        completeWordInProgress = true
         if (currentWord.isNotEmpty()) {
             out += currentWord.lowercase()
         }
@@ -37,13 +40,17 @@ private fun String.splitOnWordBoundaries(): List<String> {
     }
     val allLowerCase = this.lowercase() == this
     this.forEachIndexed { index, nextCharacter ->
-        val peek = this.getOrNull(index + 1)
-        val doublePeek = this.getOrNull(index + 2)
-        val completeWordInProgress = completeWords.any {
-            (currentWord + this.substring(index)).lowercase().startsWith(
-                it,
-            )
-        } && !completeWords.contains(currentWord.lowercase())
+        val computeWordInProgress = {
+            val result = completeWordInProgress && currentWord.isNotEmpty() && completeWords.any {
+                it.startsWith(currentWord, ignoreCase = true) && (currentWord + this.substring(index)).startsWith(
+                    it,
+                    ignoreCase = true,
+                ) && !it.equals(currentWord, ignoreCase = true)
+            }
+
+            completeWordInProgress = result
+            result
+        }
         when {
             // [C] in these docs indicates the value of nextCharacter
             // A[_]B
@@ -53,15 +60,15 @@ private fun String.splitOnWordBoundaries(): List<String> {
             currentWord.isEmpty() -> currentWord += nextCharacter.toString()
 
             // Abc[D]ef or Ab2[D]ef
-            !completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
+            !computeWordInProgress() && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
 
             // s3[k]ey
-            !completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
+            !computeWordInProgress() && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
                 nextCharacter,
             )
 
             // DB[P]roxy, or `IAM[U]ser` but not AC[L]s
-            endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter)
+            endOfAcronym(currentWord, nextCharacter, this.getOrNull(index + 1), this.getOrNull(index + 2)) -> emit(nextCharacter)
 
             // If we haven't found a word boundary, push it and keep going
             else -> currentWord += nextCharacter.toString()