Skip to content

Commit e77c8fb

Browse files
uros-dbcloud-fan
authored andcommitted
[SPARK-49204][SQL] Fix surrogate pair handling in StringReplace
### What changes were proposed in this pull request? Fix the following string expression to handle surrogate pairs properly: - StringRepeat The issue has to do with counting surrogate pairs, which are single Unicode code points (and single UTF-8 characters), but are represented using 2 characters in UTF-16 (Java String). Example of incorrect result (under `UNICODE` collation, but similar issues are noted for all ICU collations): ``` StringReplace("😄a", "a", "b") // returns: "😄ab" (incorrect), instead of: "😄b" (correct) ``` ### Why are the changes needed? Currently, some string expressions are giving wrong results when working with surrogate pairs. ### Does this PR introduce _any_ user-facing change? Yes, this expression will now work properly with surrogate pairs: `repeat`. ### How was this patch tested? New tests in `CollationSupportSuite`. ### Was this patch authored or co-authored using generative AI tooling? Yes. Closes #47710 from uros-db/surrogate-replace. Authored-by: Uros Bojanic <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent e38e135 commit e77c8fb

File tree

2 files changed

+181
-147
lines changed

2 files changed

+181
-147
lines changed

common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java

Lines changed: 34 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -300,114 +300,82 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
300300
return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right));
301301
}
302302

303-
/*
303+
/**
304304
* Performs string replacement for ICU collations by searching for instances of the search
305-
* string in the `src` string, with respect to the specified collation, and then replacing
305+
* string in the `target` string, with respect to the specified collation, and then replacing
306306
* them with the replace string. The method returns a new UTF8String with all instances of the
307307
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
308-
* used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
308+
* used for UTF8_BINARY, the method returns the `target` string if the `search` string is empty.
309309
*
310-
* @param src the string to be searched in
310+
* @param target the string to be searched in
311311
* @param search the string to be searched for
312312
* @param replace the string to be used as replacement
313313
* @param collationId the collation ID to use for string search
314314
* @return the position of the first occurrence of `match` in `set`
315315
*/
316-
public static UTF8String replace(final UTF8String src, final UTF8String search,
316+
public static UTF8String replace(final UTF8String target, final UTF8String search,
317317
final UTF8String replace, final int collationId) {
318318
// This collation aware implementation is based on existing implementation on UTF8String
319-
if (src.numBytes() == 0 || search.numBytes() == 0) {
320-
return src;
321-
}
322-
323-
StringSearch stringSearch = CollationFactory.getStringSearch(src, search, collationId);
324-
325-
// Find the first occurrence of the search string.
326-
int end = stringSearch.next();
327-
if (end == StringSearch.DONE) {
328-
// Search string was not found, so string is unchanged.
329-
return src;
330-
}
331-
332-
// Initialize byte positions
333-
int c = 0;
334-
int byteStart = 0; // position in byte
335-
int byteEnd = 0; // position in byte
336-
while (byteEnd < src.numBytes() && c < end) {
337-
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
338-
c += 1;
319+
if (target.numBytes() == 0 || search.numBytes() == 0) {
320+
return target;
339321
}
340322

341-
// At least one match was found. Estimate space needed for result.
342-
// The 16x multiplier here is chosen to match commons-lang3's implementation.
343-
int increase = Math.max(0, Math.abs(replace.numBytes() - search.numBytes())) * 16;
344-
final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
345-
while (end != StringSearch.DONE) {
346-
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
347-
buf.append(replace);
323+
String targetStr = target.toValidString();
324+
String searchStr = search.toValidString();
325+
StringSearch stringSearch = CollationFactory.getStringSearch(targetStr, searchStr, collationId);
348326

349-
// Move byteStart to the beginning of the current match
350-
byteStart = byteEnd;
351-
int cs = c;
352-
// Move cs to the end of the current match
353-
// This is necessary because the search string may contain 'multi-character' characters
354-
while (byteStart < src.numBytes() && cs < c + stringSearch.getMatchLength()) {
355-
byteStart += UTF8String.numBytesForFirstByte(src.getByte(byteStart));
356-
cs += 1;
357-
}
358-
// Go to next match
359-
end = stringSearch.next();
360-
// Update byte positions
361-
while (byteEnd < src.numBytes() && c < end) {
362-
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
363-
c += 1;
364-
}
327+
StringBuilder sb = new StringBuilder();
328+
int start = 0;
329+
int matchStart = stringSearch.first();
330+
while (matchStart != StringSearch.DONE) {
331+
sb.append(targetStr, start, matchStart);
332+
sb.append(replace.toValidString());
333+
start = matchStart + stringSearch.getMatchLength();
334+
matchStart = stringSearch.next();
365335
}
366-
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
367-
src.numBytes() - byteStart);
368-
return buf.build();
336+
sb.append(targetStr, start, targetStr.length());
337+
return UTF8String.fromString(sb.toString());
369338
}
370339

371-
/*
340+
/**
372341
* Performs string replacement for UTF8_LCASE collation by searching for instances of the search
373-
* string in the src string, with respect to lowercased string versions, and then replacing
342+
* string in the target string, with respect to lowercased string versions, and then replacing
374343
* them with the replace string. The method returns a new UTF8String with all instances of the
375344
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
376-
* used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
345+
* used for UTF8_BINARY, the method returns the `target` string if the `search` string is empty.
377346
*
378-
* @param src the string to be searched in
347+
* @param target the string to be searched in
379348
* @param search the string to be searched for
380349
* @param replace the string to be used as replacement
381-
* @param collationId the collation ID to use for string search
382350
* @return the position of the first occurrence of `match` in `set`
383351
*/
384-
public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search,
352+
public static UTF8String lowercaseReplace(final UTF8String target, final UTF8String search,
385353
final UTF8String replace) {
386-
if (src.numBytes() == 0 || search.numBytes() == 0) {
387-
return src;
354+
if (target.numBytes() == 0 || search.numBytes() == 0) {
355+
return target;
388356
}
389357

390358
UTF8String lowercaseSearch = lowerCaseCodePoints(search);
391359

392360
int start = 0;
393-
int end = lowercaseFind(src, lowercaseSearch, start);
361+
int end = lowercaseFind(target, lowercaseSearch, start);
394362
if (end == -1) {
395363
// Search string was not found, so string is unchanged.
396-
return src;
364+
return target;
397365
}
398366

399367
// At least one match was found. Estimate space needed for result.
400368
// The 16x multiplier here is chosen to match commons-lang3's implementation.
401369
int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
402-
final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
370+
final UTF8StringBuilder buf = new UTF8StringBuilder(target.numBytes() + increase);
403371
while (end != -1) {
404-
buf.append(src.substring(start, end));
372+
buf.append(target.substring(start, end));
405373
buf.append(replace);
406374
// Update character positions
407-
start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
408-
end = lowercaseFind(src, lowercaseSearch, start);
375+
start = end + lowercaseMatchLengthFrom(target, lowercaseSearch, end);
376+
end = lowercaseFind(target, lowercaseSearch, start);
409377
}
410-
buf.append(src.substring(start, src.numChars()));
378+
buf.append(target.substring(start, target.numChars()));
411379
return buf.build();
412380
}
413381

0 commit comments

Comments
 (0)