Skip to content

Commit

Permalink
Handling behavior of ambiguous ampersands, fixes #50
Browse files Browse the repository at this point in the history
  • Loading branch information
mdevils committed Jan 25, 2021
1 parent 7b3337a commit 30b504f
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 6 deletions.
32 changes: 26 additions & 6 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {namedReferences} from './named-references';
import {bodyRegExps, namedReferences} from './named-references';
import {numericUnicodeMap} from './numeric-unicode-map';
import {fromCodePoint, getCodePoint} from './surrogate-pairs';

Expand Down Expand Up @@ -63,10 +63,30 @@ const defaultDecodeOptions: DecodeOptions = {
level: 'all'
};

// Entity-matching patterns keyed by decode scope only. Every pattern accepts a decimal
// reference (`&#123`), a lowercase-`x` hex reference (`&#x7b`) or an alphanumeric name:
//  - strict:    the reference must be terminated by ';'
//  - body:      the terminating ';' is optional
//  - attribute: the reference may be followed by an optional ';' or '='
// NOTE(review): `decodeRegExps` is declared a second time further down in this diff view
// (keyed by level, then scope); this scope-only table appears to be the pre-change side of
// the diff — confirm which declaration is live before relying on it.
const decodeRegExps: Record<DecodeScope, RegExp> = {
strict: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);/g,
body: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g,
attribute: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+)[;=]?/g
// Strict scope: a decimal, lowercase-`x` hex, or alphanumeric named reference that MUST end with ';'.
const strict = /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);/g;
// Attribute scope: the same reference forms, optionally followed by ';' or '='.
// The trailing '=' is captured so the caller can recognise `&name=` and decide how to treat it.
const attribute = /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+)[;=]?/g;

/**
 * Assembles the scope -> regexp table for a single level. The `strict` and
 * `attribute` patterns are the same objects for every level; only the `body`
 * pattern is level-specific.
 */
const scopeRegExpsFor = (body: RegExp): Record<DecodeScope, RegExp> => ({strict, attribute, body});

/** Regexp tables for every concrete level (everything except the `all` alias). */
const baseDecodeRegExps: Record<Exclude<Level, 'all'>, Record<DecodeScope, RegExp>> = {
    xml: scopeRegExpsFor(bodyRegExps.xml),
    html4: scopeRegExpsFor(bodyRegExps.html4),
    html5: scopeRegExpsFor(bodyRegExps.html5)
};

/**
 * Regexp tables indexed by level, then by scope. The `all` level has no table
 * of its own: it points at the very same object as `html5`.
 */
const decodeRegExps: Record<Level, Record<DecodeScope, RegExp>> = {
    xml: baseDecodeRegExps.xml,
    html4: baseDecodeRegExps.html4,
    html5: baseDecodeRegExps.html5,
    all: baseDecodeRegExps.html5
};

const fromCharCode = String.fromCharCode;
Expand All @@ -81,7 +101,7 @@ export function decode(
const references = allNamedReferences[level].entities;
const isAttribute = scope === 'attribute';

return text.replace(decodeRegExps[scope], function (entity) {
return text.replace(decodeRegExps[level][scope], function (entity) {
if (isAttribute && entity[entity.length - 1] === '=') {
return entity;
}
Expand Down
5 changes: 5 additions & 0 deletions src/named-references.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@ export type NamedReferences = {
characters: Record<string, string>;
}
};
/**
 * Patterns used to find character references in body text, one per level.
 *
 * Inside the `&(?:...)` group the names that are valid WITHOUT a terminating ';'
 * are listed explicitly so that `&uumlber` is matched as `&uuml` followed by `ber`.
 * The generic `[0-9a-zA-Z]+` alternative comes last and catches every other name.
 *
 * Regexp alternation is ordered, so a short semicolon-less name would otherwise win
 * over a longer, properly terminated entity that merely starts with it (`&notin;`
 * would be matched as `&not`, `&parallel;` as `&para`, ...). Those terminated
 * entities are therefore listed first, as complete top-level alternatives that
 * require their ';'.
 *
 * NOTE(review): the surrounding file is generated by tools/process-named-references.ts;
 * the generator has to emit the same leading alternatives or this fix is lost on the
 * next regeneration. The list of shadowed html5 names should be confirmed by
 * regenerating from the entity data.
 */
export const bodyRegExps = {
    xml: /&(?:#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g,
    html4: /&notin;|&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g,
    html5: /&centerdot;|&copysr;|&divideontimes;|&gtcc;|&gtcir;|&gtdot;|&gtlPar;|&gtquest;|&gtrapprox;|&gtrarr;|&gtrdot;|&gtreqless;|&gtreqqless;|&gtrless;|&gtrsim;|&ltcc;|&ltcir;|&ltdot;|&lthree;|&ltimes;|&ltlarr;|&ltquest;|&ltrPar;|&ltri;|&ltrie;|&ltrif;|&notin;|&notinE;|&notindot;|&notinva;|&notinvb;|&notinvc;|&notni;|&notniva;|&notnivb;|&notnivc;|&parallel;|&timesb;|&timesbar;|&timesd;|&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#x[\da-fA-F]+|[0-9a-zA-Z]+);?/g
};
export const namedReferences: NamedReferences = {
"xml": {
"entities": {
Expand Down
3 changes: 3 additions & 0 deletions test/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ describe('decode()', () => {
// Decimal references are decoded even when the terminating ';' is missing.
it('should decode numeric entities without semicolon', () => {
    const decoded = decode('&#34C&#34');
    expect(decoded).to.equal('"C"');
});
// A semicolon-less named reference is decoded even when letters follow it directly.
it('should decode incomplete named entities followed by alphanumeric characters', () => {
    const decoded = decode('&uumlber');
    expect(decoded).to.equal('über');
});
describe('level', () => {
it('should decode according to the level', () => {
expect(decode('a\n&lt;&gt;&quot;&apos;&amp;&copy;&#8710;&rx;&#128514;&#0;&#1;', {level: 'all'})).to.equal(
Expand Down
15 changes: 15 additions & 0 deletions tools/process-named-references.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,25 @@ interface LevelData {

const result: {[key in Level]?: LevelData} = {};

// Source-text fragments of the generated `body` regexp literals.
const regExpOpen = '/';
const regExpGroupStart = '&(?:';
const regExpCommon = '#\\d+|#x[\\da-fA-F]+|[0-9a-zA-Z]+)';
const regExpEndBody = ';?/g';

// [level, regexp literal source] pairs, in the iteration order of `namedReferences`.
const bodyRegExps: [string, string][] = [];

for (const [level, entityInfos] of getObjectEntries(namedReferences)) {
    // Names that are valid without a terminating ';' (e.g. `uuml` from `&uuml`).
    const legacyNames: string[] = [];
    // Names that were listed with their terminating ';' (e.g. `notin` from `&notin;`).
    const terminatedNames: string[] = [];
    const levelData: LevelData = {entities: {}, characters: {}};
    for (const [entity, {characters}] of getObjectEntries(entityInfos)) {
        levelData.entities[entity] = characters;
        levelData.characters[characters] = entity;
        if (entity.endsWith(';')) {
            terminatedNames.push(entity.slice(1, -1));
        } else {
            legacyNames.push(entity.slice(1));
        }
    }
    result[level] = levelData;

    // Regexp alternation is ordered: inside `&(?:not|...);?` the legacy name `not` would win
    // over the longer `&notin;` and leave `in;` behind. Terminated entities that merely start
    // with a legacy name are therefore emitted first, as complete alternatives that require
    // their ';'.
    const shadowedAlternatives = terminatedNames
        .filter((name) => legacyNames.some((legacy) => name.length > legacy.length && name.startsWith(legacy)))
        .map((name) => `&${name};|`)
        .join('');
    const groupAlternatives = legacyNames.concat(regExpCommon).join('|');
    bodyRegExps.push([
        level,
        regExpOpen + shadowedAlternatives + regExpGroupStart + groupAlternatives + regExpEndBody
    ]);
}

const processedNamedReferences = `// This file is autogenerated by tools/process-named-references.ts
Expand All @@ -31,6 +43,9 @@ export type NamedReferences = {
characters: Record<string, string>;
}
};
export const bodyRegExps = {
${bodyRegExps.map(([level, regExpStart]) => `${level}: ${regExpStart}`).join(',\n ')}
};
export const namedReferences: NamedReferences = ${JSON.stringify(result, null, 4)};\n`;

// Overwrite src/named-references.ts (relative to this script's directory) with the generated source.
const generatedModulePath = path.join(__dirname, '..', 'src', 'named-references.ts');
fs.writeFileSync(generatedModulePath, processedNamedReferences);

0 comments on commit 30b504f

Please sign in to comment.