[lex] Better specify whitespace characters

AlisdairM · AlisdairM · commit 35ec5fe248f0 · 2024-11-01T10:13:39.000-04:00
This commit defines a grammar term for _whitespace-character_ and
uses it consistently where the plain text term whitespace character
is used.  A whitespace character is defined as one of the five
characters that are mentioned in the text closest to providing a
definition.  The Unicode character name is (mostly) consistently
used to name these characters, and for consistency, similar changes
were made to name Unicode characters rather than render specified
characters in code font throughout [lex].  The one exception is
backslash, which is retained as-is to avoid making more issues for
P2348.  Note that this commit is not a replacement for P2348,
merely a clearer statement of the existing specification without
any normative changes.
diff --git a/source/lex.tex b/source/lex.tex
@@ -110,9 +110,9 @@
 \indextext{line splicing}%
 If the first translation character is \unicode{feff}{byte order mark},
 it is deleted.
-Each sequence of a backslash character (\textbackslash)
+Each sequence of a backslash character (\unicode{005c}{reverse solidus})
 immediately followed by
-zero or more whitespace characters other than new-line followed by
+zero or more \grammarterm{whitespace-character}s other than new-line followed by
 a new-line character is deleted, splicing
 physical source lines to form \defnx{logical source lines}{source line!logical}. Only the last
 backslash on any physical source line shall be eligible for being part
@@ -126,9 +126,13 @@
 shall be processed as if an additional new-line character were appended
 to the file.
 
-\item The source file is decomposed into preprocessing
-tokens\iref{lex.pptoken} and sequences of whitespace characters
-(including comments). A source file shall not end in a partial
+\item
+\indextext{whitespace}%
+\indextext{comment}%
+\indextext{token!preprocessing}%
+The source file is decomposed into preprocessing
+tokens\iref{lex.pptoken} and whitespace\iref{lex.whitespace} (sequences of \grammarterm{whitespace-character}s
+and comments). A source file shall not end in a partial
 preprocessing token or in a partial comment.
 \begin{footnote}
 A partial preprocessing
@@ -140,9 +144,9 @@
 would arise from a source file ending with an unclosed \tcode{/*}
 comment.
 \end{footnote}
-Each comment\iref{lex.comment} is replaced by one space character. New-line characters are
-retained. Whether each nonempty sequence of whitespace characters other
-than new-line is retained or replaced by one space character is
+Each comment\iref{lex.comment} is replaced by one \unicode{0020}{space} character. New-line characters are
+retained. Whether each nonempty sequence of \grammarterm{whitespace-character}s other
+than new-line is retained or replaced by one \unicode{0020}{space} character is
 unspecified.
 As characters from the source file are consumed
 to form the next preprocessing token
@@ -178,10 +182,10 @@
 \item
 Adjacent \grammarterm{string-literal} tokens are concatenated\iref{lex.string}.
 
-\item Whitespace characters separating tokens are no longer
-significant. Each preprocessing token is converted into a
-token\iref{lex.token}. The resulting tokens
-constitute a \defn{translation unit} and
+\item
+Each preprocessing token is converted into a token\iref{lex.token}.
+Any \grammarterm{whitespace-character}s separating tokens are no longer significant.
+The resulting tokens constitute a \defn{translation unit} and
 are syntactically and
 semantically analyzed and translated.
 \begin{note}
@@ -467,7 +471,28 @@
 None of these names or aliases have leading or trailing spaces.
 \end{note}
 
-\rSec1[lex.comment]{Comments}
+\rSec1[lex.whitespace]{Whitespace}
+\indextext{whitespace|(}%
+
+\rSec2[lex.whitechar]{Whitespace characters}
+
+\indextext{character!whitespace|(}%
+\begin{bnf}
+\nontermdef{whitespace-character}\br
+    \unicode{0009}{character tabulation}\br
+    \textnormal{new-line}\br
+    \unicode{000b}{line tabulation}\br
+    \unicode{000c}{form feed}\br
+    \unicode{0020}{space}
+\end{bnf}
+
+\pnum
+\begin{note}
+Whitespace characters are used to separate elements of the \Cpp grammar.
+\end{note}
+\indextext{character!whitespace|)}
+
+\rSec2[lex.comment]{Comments}
 
 \pnum
 \indextext{comment|(}%
@@ -477,8 +502,8 @@
 characters \tcode{*/}. These comments do not nest.
 \indextext{comment!\tcode{//}}%
 The characters \tcode{//} start a comment, which terminates immediately before the
-next new-line character. If there is a form-feed or a vertical-tab
-character in such a comment, only whitespace characters shall appear
+next new-line character. If there is a \unicode{000c}{form feed} or a \unicode{000b}{line tabulation}
+character in such a comment, only \grammarterm{whitespace-character}s shall appear
 between it and the new-line that terminates the comment; no diagnostic
 is required.
 \begin{note}
@@ -489,6 +514,7 @@
 \tcode{/*} comment.
 \end{note}
 \indextext{comment|)}
+\indextext{whitespace|)}%
 
 \rSec1[lex.pptoken]{Preprocessing tokens}
 
@@ -506,7 +532,7 @@
     string-literal\br
     user-defined-string-literal\br
     preprocessing-op-or-punc\br
-    \textnormal{each non-whitespace character that cannot be one of the above}
+    \textnormal{each non-\grammarterm{whitespace-character} that cannot be one of the above}
 \end{bnf}
 
 \pnum
@@ -520,22 +546,17 @@
 (\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}),
 identifiers, preprocessing numbers, character literals (including user-defined character
 literals), string literals (including user-defined string literals), preprocessing
-operators and punctuators, and single non-whitespace characters that do not lexically
+operators and punctuators, and single non-\grammarterm{whitespace-character}s that do not lexically
 match the other preprocessing token categories.
 If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character
 matches the last category, the program is ill-formed.
 If any character not in the basic character set matches the last category,
 the program is ill-formed.
 Preprocessing tokens can be separated by
 \indextext{whitespace}%
-whitespace;
+whitespace\iref{lex.whitespace};
 \indextext{comment}%
-this consists of comments\iref{lex.comment}, or whitespace characters
-(\unicode{0020}{space},
-\unicode{0009}{character tabulation},
-new-line,
-\unicode{000b}{line tabulation}, and
-\unicode{000c}{form feed}), or both.
+this consists of comments, \grammarterm{whitespace-character}s, or both.
 As described in \ref{cpp}, in certain
 circumstances during translation phase 4, whitespace (or the absence
 thereof) serves as more than preprocessing token separation. Whitespace
@@ -826,9 +847,7 @@
 \end{footnote}
 operators, and other separators.
 \indextext{whitespace}%
-Blanks, horizontal and vertical tabs, newlines, formfeeds, and comments
-(collectively, ``whitespace''), as described below, are ignored except
-as they serve to separate tokens.
+Whitespace\iref{lex.whitespace} is ignored except to separate tokens.
 \begin{note}
 Whitespace can separate otherwise adjacent identifiers, keywords, numeric
 literals, and alternative tokens containing alphabetic characters.
@@ -1790,8 +1809,8 @@
 \begin{bnf}
 \nontermdef{d-char}\br
     \textnormal{any member of the basic character set except:}\br
-    \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br
-    \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line}
+    \bnfindent\textnormal{a \grammarterm{whitespace-character}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis},}\br
+    \bnfindent\textnormal{and \unicode{005c}{reverse solidus}}
 \end{bnf}
 
 \pnum