Skip to content

Commit 5c8926c

Browse files
committed
Merge branch 'any-code-point'
2 parents c28cd81 + b328e96 commit 5c8926c

File tree

6 files changed

+48
-5
lines changed

6 files changed

+48
-5
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
### Breaking changes:
66

7+
- [#424]: `any` now consumes an entire code point (i.e., a full Unicode character), not just a single, 16-bit code unit.
78
- [55c787b]: The namespace helpers (`namespace`, `extendNamespace`) have been removed. (These were always optional.)
89
- [bea0be9]: When used as an ES module, the main 'ohm-js' module now has _only_ named exports (i.e., no default export). The same is true for `ohm-js/extras`.
910
- [#395]: In generated type definitions, action dictionary types now inherit from `BaseActionDict<T>`, a new supertype of `ActionDict<T>`.

doc/releases/ohm-js-17.0.md

+18
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,24 @@ This version also has experimental support for indentation-sensitive grammars.
66

77
## Upgrading
88

9+
### `any` now consumes a full code point
10+
11+
In JavaScript, a string is a sequence of 16-bit code units. Some Unicode characters, such as emoji, are encoded as pairs of 16-bit values (known as surrogate pairs). For example, the string `'😆'` has length 2, but contains a single Unicode code point. Previously, `any` matched a single 16-bit code unit — even if that unit was part of a surrogate pair. In v17, `any` matches a full Unicode code point.
12+
13+
Old behaviour (before Ohm v17):
14+
15+
```js
16+
const g = ohm.grammar('OneChar { start = any }');
17+
g.match('😆').succeeded(); // false
18+
```
19+
20+
New behaviour (Ohm v17+):
21+
22+
```js
23+
const g = ohm.grammar('OneChar { start = any }');
24+
g.match('😆').succeeded(); // true
25+
```
26+
927
### Namespace helpers removed
1028

1129
The top-level `namespace` and `extendNamespace` functions have been removed. They were never required — it was always possible to use a plain old object in any API that asked for a namespace.

doc/syntax-reference.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,9 @@ as well as multiline (`/* */`) comments like:
146146

147147
(See [src/built-in-rules.ohm](https://github.com/harc/ohm/blob/main/packages/ohm-js/src/built-in-rules.ohm).)
148148

149-
`any`: Matches the next character in the input stream, if one exists.
149+
`any`: Matches the next Unicode character — i.e., a single code point — in the input stream, if one exists.
150+
151+
**NOTE:** A JavaScript string is a sequence of 16-bit _code units_. Some Unicode characters, such as emoji, are encoded as pairs of 16-bit values. For example, the string `'😆'` has length 2, but contains a single Unicode code point. Prior to Ohm v17, `any` always consumed a single 16-bit code unit, rather than a full Unicode character.
150152

151153
`letter`: Matches a single character which is a letter (either uppercase or lowercase).
152154

packages/ohm-js/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "ohm-js",
3-
"version": "17.0.0",
3+
"version": "17.0.1",
44
"description": "An object-oriented language for parsing and pattern matching",
55
"repository": "https://github.com/harc/ohm",
66
"keywords": [

packages/ohm-js/src/pexprs-eval.js

+3-3
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ pexprs.PExpr.prototype.eval = common.abstract('eval'); // function(state) { ...
2929
pexprs.any.eval = function(state) {
3030
const {inputStream} = state;
3131
const origPos = inputStream.pos;
32-
const ch = inputStream.next();
33-
if (ch) {
34-
state.pushBinding(new TerminalNode(ch.length), origPos);
32+
const cp = inputStream.nextCodePoint();
33+
if (cp !== undefined) {
34+
state.pushBinding(new TerminalNode(String.fromCodePoint(cp).length), origPos);
3535
return true;
3636
} else {
3737
state.processFailure(origPos, this);

packages/ohm-js/test/test-ohm-syntax.js

+22
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,28 @@ test('ranges w/ code points > 0xFFFF, special cases', t => {
256256
assertSucceeds(t, g2.match('\u{D83D}x'));
257257
});
258258

259+
test('any consumes an entire code point', t => {
260+
const g = ohm.grammar('G { start = any any }');
261+
const re = /../u; // The regex equivalent of `any any`.
262+
263+
t.is('😇'.length, 2);
264+
t.is('😇!'.length, 3);
265+
t.is('😇😇'.length, 4);
266+
267+
t.is(g.match('😇😇').succeeded(), true);
268+
t.truthy(re.exec('😇😇'));
269+
270+
t.is(g.match('😇!').succeeded(), true);
271+
t.truthy(re.exec('😇!'));
272+
273+
t.is(g.match('!😇').succeeded(), true);
274+
t.truthy(re.exec('!😇'));
275+
276+
t.is('👋🏿'.length, 4); // Skin color modifier is a separate code point.
277+
t.is(g.match('👋🏿').succeeded(), true);
278+
t.truthy(re.exec('👋🏿'));
279+
});
280+
259281
describe('alt', test => {
260282
const m = ohm.grammar('M { altTest = "a" | "b" }');
261283
const s = m.createSemantics().addAttribute('v', {

0 commit comments

Comments
 (0)