Skip to content

Commit

Permalink
Block API: Parse entity only when valid character reference (#13512)
Browse files Browse the repository at this point in the history
* Block API: Rename IdentityEntityParser as DecodeEntityParser

* Block API: Parse entity only when valid character reference
  • Loading branch information
aduth authored and youknowriad committed Mar 6, 2019
1 parent 9eed8db commit defe27b
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 11 deletions.
4 changes: 4 additions & 0 deletions packages/blocks/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

- Blocks' `transforms` will receive `innerBlocks` as the second argument (or an array of each block's respective `innerBlocks` for a multi-transform).

### Bug Fixes

- Block validation will now correctly validate character references, resolving some issues where a standalone ampersand `&` followed later in markup by a character reference (e.g. `&`) could wrongly mark a block as being invalid. ([#13512](https://github.com/WordPress/gutenberg/pull/13512))

## 6.0.5 (2019-01-03)

## 6.0.4 (2018-12-12)
Expand Down
49 changes: 45 additions & 4 deletions packages/blocks/src/api/test/validation.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
* Internal dependencies
*/
import {
IdentityEntityParser,
isValidCharacterReference,
DecodeEntityParser,
getTextPiecesSplitOnWhitespace,
getTextWithCollapsedWhitespace,
getMeaningfulAttributePairs,
Expand Down Expand Up @@ -41,13 +42,39 @@ describe( 'validation', () => {
} );
} );

describe( 'IdentityEntityParser', () => {
describe( 'isValidCharacterReference', () => {
it( 'returns true for a named character reference', () => {
const result = isValidCharacterReference( 'blk12' );

expect( result ).toBe( true );
} );

it( 'returns true for a decimal character reference', () => {
const result = isValidCharacterReference( '#33' );

expect( result ).toBe( true );
} );

it( 'returns true for a hexadecimal character reference', () => {
const result = isValidCharacterReference( '#xC6' );

expect( result ).toBe( true );
} );

it( 'returns false for an invalid character reference', () => {
const result = isValidCharacterReference( ' Test</h2><h2>Test &amp' );

expect( result ).toBe( false );
} );
} );

describe( 'DecodeEntityParser', () => {
it( 'can be constructed', () => {
expect( new IdentityEntityParser() instanceof IdentityEntityParser ).toBe( true );
expect( new DecodeEntityParser() instanceof DecodeEntityParser ).toBe( true );
} );

it( 'returns parse as decoded value', () => {
expect( new IdentityEntityParser().parse( 'quot' ) ).toBe( '"' );
expect( new DecodeEntityParser().parse( 'quot' ) ).toBe( '"' );
} );
} );

Expand Down Expand Up @@ -397,6 +424,20 @@ describe( 'validation', () => {
expect( isEquivalent ).toBe( true );
} );

it( 'should account for character reference validity', () => {
// Regression: Previously the validator would wrongly evaluate the
// segment of text ` Test</h2><h2>Test &amp` as a character
// reference, as it's between an opening `&` and terminating `;`.
//
// See: https://github.com/WordPress/gutenberg/issues/12448
const isEquivalent = isEquivalentHTML(
'<h2>Test &amp; Test</h2><h2>Test &amp; Test</h2>',
'<h2>Test & Test</h2><h2>Test &amp; Test</h2>'
);

expect( isEquivalent ).toBe( true );
} );

it( 'should return false when more tokens in first', () => {
const isEquivalent = isEquivalentHTML(
'<div>Hello</div>',
Expand Down
81 changes: 74 additions & 7 deletions packages/blocks/src/api/validation.js
Original file line number Diff line number Diff line change
Expand Up @@ -154,24 +154,91 @@ const TEXT_NORMALIZATIONS = [
];

/**
* Subsitute EntityParser class for `simple-html-tokenizer` which bypasses
* entity substitution in favor of validator's internal normalization.
* Regular expression matching a named character reference. In lieu of bundling
* a full set of references, the pattern covers the minimal necessary to test
* positively against the full set.
*
* "The ampersand must be followed by one of the names given in the named
* character references section, using the same case."
*
* Tested aginst "12.5 Named character references":
*
* ```
* const references = [ ...document.querySelectorAll(
* '#named-character-references-table tr[id^=entity-] td:first-child'
* ) ].map( ( code ) => code.textContent )
* references.every( ( reference ) => /^[\da-z]+$/i.test( reference ) )
* ```
*
* @link https://html.spec.whatwg.org/multipage/syntax.html#character-references
* @link https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
*
* @type {RegExp}
*/
const REGEXP_NAMED_CHARACTER_REFERENCE = /^[\da-z]+$/i;

/**
* Regular expression matching a decimal character reference.
*
* "The ampersand must be followed by a U+0023 NUMBER SIGN character (#),
* followed by one or more ASCII digits, representing a base-ten integer"
*
* @link https://html.spec.whatwg.org/multipage/syntax.html#character-references
*
* @type {RegExp}
*/
const REGEXP_DECIMAL_CHARACTER_REFERENCE = /^#\d+$/;

/**
* Regular expression matching a hexadecimal character reference.
*
* "The ampersand must be followed by a U+0023 NUMBER SIGN character (#), which
* must be followed by either a U+0078 LATIN SMALL LETTER X character (x) or a
* U+0058 LATIN CAPITAL LETTER X character (X), which must then be followed by
* one or more ASCII hex digits, representing a hexadecimal integer"
*
* @link https://html.spec.whatwg.org/multipage/syntax.html#character-references
*
* @type {RegExp}
*/
const REGEXP_HEXADECIMAL_CHARACTER_REFERENCE = /^#x[\da-f]+$/i;

/**
* Returns true if the given string is a valid character reference segment, or
* false otherwise. The text should be stripped of `&` and `;` demarcations.
*
* @param {string} text Text to test.
*
* @return {boolean} Whether text is valid character reference.
*/
export function isValidCharacterReference( text ) {
return (
REGEXP_NAMED_CHARACTER_REFERENCE.test( text ) ||
REGEXP_DECIMAL_CHARACTER_REFERENCE.test( text ) ||
REGEXP_HEXADECIMAL_CHARACTER_REFERENCE.test( text )
);
}

/**
* Subsitute EntityParser class for `simple-html-tokenizer` which uses the
* implementation of `decodeEntities` from `html-entities`, in order to avoid
* bundling a massive named character reference.
*
* @see https://github.com/tildeio/simple-html-tokenizer/tree/master/src/entity-parser.ts
*/
export class IdentityEntityParser {
export class DecodeEntityParser {
/**
* Returns a substitute string for an entity string sequence between `&`
* and `;`, or undefined if no substitution should occur.
*
* In this implementation, undefined is always returned.
*
* @param {string} entity Entity fragment discovered in HTML.
*
* @return {?string} Entity substitute value.
*/
parse( entity ) {
return decodeEntities( '&' + entity + ';' );
if ( isValidCharacterReference( entity ) ) {
return decodeEntities( '&' + entity + ';' );
}
}
}

Expand Down Expand Up @@ -451,7 +518,7 @@ export function getNextNonWhitespaceToken( tokens ) {
*/
function getHTMLTokens( html ) {
try {
return new Tokenizer( new IdentityEntityParser() ).tokenize( html );
return new Tokenizer( new DecodeEntityParser() ).tokenize( html );
} catch ( e ) {
log.warning( 'Malformed HTML detected: %s', html );
}
Expand Down

0 comments on commit defe27b

Please sign in to comment.