@@ -300,114 +300,82 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
300300 return lowerCaseCodePoints (left ).binaryCompare (lowerCaseCodePoints (right ));
301301 }
302302
303- /*
303+ /**
304304 * Performs string replacement for ICU collations by searching for instances of the search
305- * string in the `src ` string, with respect to the specified collation, and then replacing
305+ * string in the `target ` string, with respect to the specified collation, and then replacing
306306 * them with the replace string. The method returns a new UTF8String with all instances of the
307307 * search string replaced using the replace string. Similar to UTF8String.findInSet behavior
308- * used for UTF8_BINARY, the method returns the `src ` string if the `search` string is empty.
308+ * used for UTF8_BINARY, the method returns the `target ` string if the `search` string is empty.
309309 *
310- * @param src the string to be searched in
310+ * @param target the string to be searched in
311311 * @param search the string to be searched for
312312 * @param replace the string to be used as replacement
313313 * @param collationId the collation ID to use for string search
314314 * @return the position of the first occurrence of `match` in `set`
315315 */
316- public static UTF8String replace (final UTF8String src , final UTF8String search ,
316+ public static UTF8String replace (final UTF8String target , final UTF8String search ,
317317 final UTF8String replace , final int collationId ) {
318318 // This collation aware implementation is based on existing implementation on UTF8String
319- if (src .numBytes () == 0 || search .numBytes () == 0 ) {
320- return src ;
321- }
322-
323- StringSearch stringSearch = CollationFactory .getStringSearch (src , search , collationId );
324-
325- // Find the first occurrence of the search string.
326- int end = stringSearch .next ();
327- if (end == StringSearch .DONE ) {
328- // Search string was not found, so string is unchanged.
329- return src ;
330- }
331-
332- // Initialize byte positions
333- int c = 0 ;
334- int byteStart = 0 ; // position in byte
335- int byteEnd = 0 ; // position in byte
336- while (byteEnd < src .numBytes () && c < end ) {
337- byteEnd += UTF8String .numBytesForFirstByte (src .getByte (byteEnd ));
338- c += 1 ;
319+ if (target .numBytes () == 0 || search .numBytes () == 0 ) {
320+ return target ;
339321 }
340322
341- // At least one match was found. Estimate space needed for result.
342- // The 16x multiplier here is chosen to match commons-lang3's implementation.
343- int increase = Math .max (0 , Math .abs (replace .numBytes () - search .numBytes ())) * 16 ;
344- final UTF8StringBuilder buf = new UTF8StringBuilder (src .numBytes () + increase );
345- while (end != StringSearch .DONE ) {
346- buf .appendBytes (src .getBaseObject (), src .getBaseOffset () + byteStart , byteEnd - byteStart );
347- buf .append (replace );
323+ String targetStr = target .toValidString ();
324+ String searchStr = search .toValidString ();
325+ StringSearch stringSearch = CollationFactory .getStringSearch (targetStr , searchStr , collationId );
348326
349- // Move byteStart to the beginning of the current match
350- byteStart = byteEnd ;
351- int cs = c ;
352- // Move cs to the end of the current match
353- // This is necessary because the search string may contain 'multi-character' characters
354- while (byteStart < src .numBytes () && cs < c + stringSearch .getMatchLength ()) {
355- byteStart += UTF8String .numBytesForFirstByte (src .getByte (byteStart ));
356- cs += 1 ;
357- }
358- // Go to next match
359- end = stringSearch .next ();
360- // Update byte positions
361- while (byteEnd < src .numBytes () && c < end ) {
362- byteEnd += UTF8String .numBytesForFirstByte (src .getByte (byteEnd ));
363- c += 1 ;
364- }
327+ StringBuilder sb = new StringBuilder ();
328+ int start = 0 ;
329+ int matchStart = stringSearch .first ();
330+ while (matchStart != StringSearch .DONE ) {
331+ sb .append (targetStr , start , matchStart );
332+ sb .append (replace .toValidString ());
333+ start = matchStart + stringSearch .getMatchLength ();
334+ matchStart = stringSearch .next ();
365335 }
366- buf .appendBytes (src .getBaseObject (), src .getBaseOffset () + byteStart ,
367- src .numBytes () - byteStart );
368- return buf .build ();
336+ sb .append (targetStr , start , targetStr .length ());
337+ return UTF8String .fromString (sb .toString ());
369338 }
370339
371- /*
340+ /**
372341 * Performs string replacement for UTF8_LCASE collation by searching for instances of the search
373- * string in the src string, with respect to lowercased string versions, and then replacing
342+ * string in the target string, with respect to lowercased string versions, and then replacing
374343 * them with the replace string. The method returns a new UTF8String with all instances of the
375344 * search string replaced using the replace string. Similar to UTF8String.findInSet behavior
376- * used for UTF8_BINARY, the method returns the `src ` string if the `search` string is empty.
345+ * used for UTF8_BINARY, the method returns the `target ` string if the `search` string is empty.
377346 *
378- * @param src the string to be searched in
347+ * @param target the string to be searched in
379348 * @param search the string to be searched for
380349 * @param replace the string to be used as replacement
381- * @param collationId the collation ID to use for string search
382350 * @return the position of the first occurrence of `match` in `set`
383351 */
384- public static UTF8String lowercaseReplace (final UTF8String src , final UTF8String search ,
352+ public static UTF8String lowercaseReplace (final UTF8String target , final UTF8String search ,
385353 final UTF8String replace ) {
386- if (src .numBytes () == 0 || search .numBytes () == 0 ) {
387- return src ;
354+ if (target .numBytes () == 0 || search .numBytes () == 0 ) {
355+ return target ;
388356 }
389357
390358 UTF8String lowercaseSearch = lowerCaseCodePoints (search );
391359
392360 int start = 0 ;
393- int end = lowercaseFind (src , lowercaseSearch , start );
361+ int end = lowercaseFind (target , lowercaseSearch , start );
394362 if (end == -1 ) {
395363 // Search string was not found, so string is unchanged.
396- return src ;
364+ return target ;
397365 }
398366
399367 // At least one match was found. Estimate space needed for result.
400368 // The 16x multiplier here is chosen to match commons-lang3's implementation.
401369 int increase = Math .max (0 , replace .numBytes () - search .numBytes ()) * 16 ;
402- final UTF8StringBuilder buf = new UTF8StringBuilder (src .numBytes () + increase );
370+ final UTF8StringBuilder buf = new UTF8StringBuilder (target .numBytes () + increase );
403371 while (end != -1 ) {
404- buf .append (src .substring (start , end ));
372+ buf .append (target .substring (start , end ));
405373 buf .append (replace );
406374 // Update character positions
407- start = end + lowercaseMatchLengthFrom (src , lowercaseSearch , end );
408- end = lowercaseFind (src , lowercaseSearch , start );
375+ start = end + lowercaseMatchLengthFrom (target , lowercaseSearch , end );
376+ end = lowercaseFind (target , lowercaseSearch , start );
409377 }
410- buf .append (src .substring (start , src .numChars ()));
378+ buf .append (target .substring (start , target .numChars ()));
411379 return buf .build ();
412380 }
413381
0 commit comments