diff --git a/src/MarkdownSnippets/ContentValidation.cs b/src/MarkdownSnippets/ContentValidation.cs index 3718b34b..1cc70514 100644 --- a/src/MarkdownSnippets/ContentValidation.cs +++ b/src/MarkdownSnippets/ContentValidation.cs @@ -33,10 +33,8 @@ static class ContentValidation new("kind of", "similar or approximately ") ]); - static List invalidStrings; - - static List invalidWords = - [ + static FrozenSet invalidWordSet = new[] + { "you", "we", "our", @@ -64,13 +62,16 @@ static class ContentValidation "whereat", "wherein", "whereof" - ]; - - static ContentValidation() => - invalidStrings = BuildInvalidStrings().ToList(); + }.ToFrozenSet(); - static IEnumerable BuildInvalidStrings() => - invalidWords.Select(word => $" {word} "); + static FrozenDictionary[]> phrasesByFirstWord = + phrases + .GroupBy(p => + { + var spaceIndex = p.Key.IndexOf(' '); + return spaceIndex == -1 ? p.Key : p.Key[..spaceIndex]; + }) + .ToFrozenDictionary(g => g.Key, g => g.ToArray()); public static IEnumerable<(string error, int column)> Verify(string line) { @@ -88,29 +89,65 @@ static IEnumerable BuildInvalidStrings() => yield return (message, exclamationIndex1); } - foreach (var invalidString in invalidStrings) + // Tokenize words with positions + var words = Tokenize(cleanedLine); + + // Check invalid words via set lookup (report first occurrence only) + var seenWords = new HashSet(); + foreach (var (word, start) in words) { - var indexOf = cleanedLine.IndexOf(invalidString); - if (indexOf == -1) + if (invalidWordSet.Contains(word) && seenWords.Add(word)) { - continue; + yield return ($"Invalid word detected: '{word}'", start - 1); } + } - var error = $"Invalid word detected: '{invalidString.Trim()}'"; - yield return (error, indexOf); + // Check phrases via first-word lookup (report first occurrence only) + var seenPhrases = new HashSet(); + foreach (var (word, start) in words) + { + if (phrasesByFirstWord.TryGetValue(word, out var candidates)) + { + foreach (var candidate in candidates) + { + if (seenPhrases.Contains(candidate.Key)) + { + continue; + } + + if (cleanedLine.AsSpan(start).StartsWith(candidate.Key.AsSpan(), StringComparison.Ordinal)) + { + seenPhrases.Add(candidate.Key); + yield return ($"Invalid phrase detected: '{candidate.Key}'. Instead consider '{candidate.Value}'", start); + } + } + } } + } - foreach (var phrase in phrases) + static List<(string word, int start)> Tokenize(string cleanedLine) + { + var words = new List<(string word, int start)>(); + var span = cleanedLine.AsSpan(); + var pos = 0; + while (pos < span.Length) { - var indexOf = cleanedLine.IndexOf(phrase.Key); - if (indexOf == -1) + if (span[pos] == ' ') { + pos++; continue; } - var error = $"Invalid phrase detected: '{phrase.Key}'. Instead consider '{phrase.Value}'"; - yield return (error, indexOf); + var wordStart = pos; + while (pos < span.Length && span[pos] != ' ') + { + pos++; + } + + words.Add((span[wordStart..pos].ToString(), wordStart)); } + + return words; } static string Clean(string input) diff --git a/src/Tests/ContentValidationTest.CheckInvalidWordIndicatesAllViolationsInTheExceptionMessageIgnoringCase.verified.txt b/src/Tests/ContentValidationTest.CheckInvalidWordIndicatesAllViolationsInTheExceptionMessageIgnoringCase.verified.txt index 5df3a9de..5f751764 100644 --- a/src/Tests/ContentValidationTest.CheckInvalidWordIndicatesAllViolationsInTheExceptionMessageIgnoringCase.verified.txt +++ b/src/Tests/ContentValidationTest.CheckInvalidWordIndicatesAllViolationsInTheExceptionMessageIgnoringCase.verified.txt @@ -7,12 +7,12 @@ Item1: Invalid word detected: 'you', Item2: 1 }, - { - Item1: Invalid word detected: 'us', - Item2: 37 - }, { Item1: Invalid word detected: 'yourself', Item2: 27 + }, + { + Item1: Invalid word detected: 'us', + Item2: 37 } ] \ No newline at end of file