From 59a4783330e8f76279f202a8ccef86b2e944f736 Mon Sep 17 00:00:00 2001 From: Russell Cohen Date: Thu, 2 Nov 2023 12:10:28 -0400 Subject: [PATCH] First pass at optimizing SplitOnWordBoundaries (#3140) ## Motivation and Context @Velfi observed this function consuming a significant amount of codegen time. ## Description Optimize split on word boundaries, primarily by avoiding recomputation of `isStartOfWord` when we already know it isn't the start of a word. ## Testing - Original: 3.4s. Updated: 688ms - Existing tests are exhaustive On a `:aws:sdk:assemble` for the smoke test services, this improved from 53 seconds to 42 seconds ---- _By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice._ --- .../smithy/rust/codegen/core/util/Strings.kt | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt b/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt index 244eace361..3ef2731293 100644 --- a/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt +++ b/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt @@ -19,13 +19,16 @@ fun String.doubleQuote(): String = fun String.dq(): String = this.doubleQuote() private val completeWords: List = listOf("ipv4", "ipv6", "sigv4", "mib", "gib", "kib", "ttl") + private fun String.splitOnWordBoundaries(): List { val out = mutableListOf() // These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL` var currentWord = "" + var completeWordInProgress = true // emit the current word and update from the next character val emit = { next: Char -> + completeWordInProgress = true if (currentWord.isNotEmpty()) { out += currentWord.lowercase() } @@ -37,13 +40,17 @@ private fun String.splitOnWordBoundaries(): List { } val allLowerCase = this.lowercase() == this this.forEachIndexed { index, nextCharacter -> - val peek = this.getOrNull(index + 1) - val doublePeek = this.getOrNull(index + 2) - val completeWordInProgress = completeWords.any { - (currentWord + this.substring(index)).lowercase().startsWith( - it, - ) - } && !completeWords.contains(currentWord.lowercase()) + val computeWordInProgress = { + val result = completeWordInProgress && currentWord.isNotEmpty() && completeWords.any { + it.startsWith(currentWord, ignoreCase = true) && (currentWord + this.substring(index)).startsWith( + it, + ignoreCase = true, + ) && !it.equals(currentWord, ignoreCase = true) + } + + completeWordInProgress = result + result + } when { // [C] in these docs indicates the value of nextCharacter // A[_]B @@ -53,15 +60,15 @@ private fun String.splitOnWordBoundaries(): List { currentWord.isEmpty() -> currentWord += nextCharacter.toString() // Abc[D]ef or Ab2[D]ef - !completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter) + !computeWordInProgress() && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter) // s3[k]ey - !completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit( + !computeWordInProgress() && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit( nextCharacter, ) // DB[P]roxy, or `IAM[U]ser` but not AC[L]s - endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter) + endOfAcronym(currentWord, nextCharacter, this.getOrNull(index + 1), this.getOrNull(index + 2)) -> emit(nextCharacter) // If we haven't found a word boundary, push it and keep going else -> currentWord += nextCharacter.toString()