Skip to content

Commit 8b5f11d

Browse files
authored
Add support for multibyte characters to $IFS (#92)
Add support for multibyte characters to $IFS

This commit fixes BUG_MULTIBIFS, which had two bug reports in the ksh2020 branch.

src/cmd/ksh93/sh/macro.c:
- Backport Eric Scrivner's fix for multibyte IFS characters (slightly modified for compatibility with C89).
  Explanation from att#737:
    Previously, the varsub method used for the macro expansion of $param, ${param}, and ${param op word} would incorrectly expand the internal field separator (IFS) if it was a multibyte character. This was due to truncation based on the incorrect assumption that the IFS would never be larger than a single byte. This change fixes this issue by carefully tracking the number of bytes that should be persisted in the IFS case and ensuring that all bytes are written during expansion and substitution.
  Bug report: att#13
- Fixed another bug that caused multibyte characters with the same initial byte to be treated as the same character by the IFS. This bug was occurring because the first byte of a multibyte character wasn't being written to the stack when the IFS delimiter had the same initial byte:
    $ IFS=£
    $ v='§'
    $ set -- $v
    $ v="${1-}"
    $ echo "$v" | hd
    # The first byte should be c2, but it isn't due to the bug
    00000000 a7 0a |..|
    00000002
  Bug report: att#1372

src/cmd/ksh93/tests/variables.sh:
- Add (reworked) regression tests from ksh2020 for the multibyte IFS bugs.
- Add a regression test for att#1372 based on the reproducer.
1 parent 8c16f38 commit 8b5f11d

File tree

5 files changed

+75
-7
lines changed

5 files changed

+75
-7
lines changed

NEWS

+8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,14 @@ For full details, see the git log at: https://github.com/ksh93/ksh
33

44
Any uppercase BUG_* names are modernish shell bug IDs.
55

6+
2020-07-25:
7+
8+
- Fixed BUG_MULTIBIFS: Multibyte characters can now be used as IFS
9+
delimiters. "$*" was incorrectly joining positional parameters on
10+
the first byte of a multibyte character. This was due to truncation
11+
based on the incorrect assumption that the IFS would never be larger
12+
than a single byte.
13+
614
2020-07-23:
715

816
- Fixed an infinite loop that could occur when ksh is the system's /bin/sh.

TODO

-6
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,3 @@ https://github.com/modernish/modernish/tree/0.16/lib/modernish/cap/
5454
between 'while'/'until' and 'do'), the exit status passed down from the
5555
previous command is ignored and the function returns with status 0
5656
instead.
57-
58-
- BUG_MULTIBIFS: We're on a UTF-8 locale and the shell supports UTF-8
59-
characters in general (i.e. we don't have WRN_MULTIBYTE) – however, using
60-
multi-byte characters as IFS field delimiters still doesn't work. For
61-
example, "$*" joins positional parameters on the first byte of IFS instead
62-
of the first character.

src/cmd/ksh93/include/version.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@
1717
* David Korn <[email protected]> *
1818
* *
1919
***********************************************************************/
20-
#define SH_RELEASE "93u+m 2020-07-23"
20+
#define SH_RELEASE "93u+m 2020-07-25"

src/cmd/ksh93/sh/macro.c

+33
Original file line numberDiff line numberDiff line change
@@ -1954,10 +1954,34 @@ static int varsub(Mac_t *mp)
19541954
}
19551955
else if(d)
19561956
{
1957+
#if SHOPT_MULTIBYTE
1958+
Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
1959+
1960+
/*
1961+
* We know from above that if we are not performing @-expansion
1962+
* then we assigned `d` the value of `mp->ifs`, here we check
1963+
* whether or not we have a valid string of IFS characters to
1964+
* write as it is possible for `d` to be set to `mp->ifs` and
1965+
* yet `mp->ifsp` to be NULL.
1966+
*/
1967+
if(mode != '@' && mp->ifsp)
1968+
{
1969+
/*
1970+
* Handle multi-byte characters being used for the internal
1971+
* field separator (IFS).
1972+
*/
1973+
int i;
1974+
for(i = 0; i < mbsize(mp->ifsp); i++)
1975+
sfputc(sfio_ptr,mp->ifsp[i]);
1976+
}
1977+
else
1978+
sfputc(sfio_ptr,d);
1979+
#else
19571980
if(mp->sp)
19581981
sfputc(mp->sp,d);
19591982
else
19601983
sfputc(stkp,d);
1984+
#endif
19611985
}
19621986
}
19631987
if(arrmax)
@@ -2403,7 +2427,16 @@ static void mac_copy(register Mac_t *mp,register const char *str, register int s
24032427
if(n==S_MBYTE)
24042428
{
24052429
if(sh_strchr(mp->ifsp,cp-1)<0)
2430+
{
2431+
/*
2432+
* The multi-byte character that was found has the same initial
2433+
* byte as the IFS delimiter, but it's a different character. Put
2434+
* the first byte onto the stack and continue; multi-byte characters
2435+
* otherwise lose their initial byte.
2436+
*/
2437+
sfputc(stkp,c);
24062438
continue;
2439+
}
24072440
n = mbsize(cp-1) - 1;
24082441
if(n==-2)
24092442
n = 0;

src/cmd/ksh93/tests/variables.sh

+33
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,39 @@ case $(unset IFS; set -- $v; print $#) in
435435
*) err_exit 'BUG_KUNSETIFS detection failed'
436436
esac
437437

438+
# Multi-byte characters should work with $IFS
439+
(
440+
LC_ALL=C.UTF-8 # The multi-byte tests are pointless without UTF-8
441+
442+
# Test the following characters:
443+
# Lowercase accented e (two bytes)
444+
# Roman sestertius sign (four bytes)
445+
for delim in é 𐆘; do
446+
IFS="$delim"
447+
set : :
448+
[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)"
449+
450+
read -r first second third <<< "one${delim}two${delim}three"
451+
[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
452+
[[ $second == two ]] || err_exit "IFS failed with multi-byte character $delim (expected two, got $second)"
453+
[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $third)"
454+
455+
# Ensure subshells don't get corrupted when IFS becomes a multi-byte character
456+
expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")"
457+
output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)"
458+
[[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)"
459+
done
460+
461+
# Multibyte characters with the same initial byte shouldn't be parsed as the same
462+
# character if they are different. The regression test below tests two characters
463+
# with the same initial byte (0xC2).
464+
IFS='£' # £ = C2 A3
465+
v='abc§def ghi§jkl' # § = C2 A7 (same initial byte)
466+
set -- $v
467+
v="${#},${1-},${2-},${3-}"
468+
[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
469+
)
470+
438471
# ^^^ end: IFS tests ^^^
439472
# restore default split:
440473
unset IFS

0 commit comments

Comments
 (0)