Skip to content

Commit

Permalink
fix: properly decode html5 entities those names are part of html4 ent…
Browse files Browse the repository at this point in the history
…ities
  • Loading branch information
mdevils committed Jun 5, 2023
1 parent 9b970c0 commit c809228
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
2.3.4
-----

* Fix decoding of HTML5 entities whose names begin with the name of an HTML4 entity.

2.3.3
-----

Expand Down
4 changes: 2 additions & 2 deletions src/named-references.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ export type NamedReferences = {
};
/**
 * Global regular expressions, keyed by level name (`xml`, `html4`, `html5`),
 * used to find entity references in text.
 *
 * Every pattern ends with an `&(?:…);?` group: an ampersand, then either one of
 * the listed names, a decimal reference (`#\d+`), a hexadecimal reference
 * (`#[xX][\da-fA-F]+`) or any run of ASCII letters/digits, then an optional
 * trailing semicolon.
 *
 * NOTE(review): the `html4` and `html5` keys each appear twice in this literal.
 * This looks like the before/after lines of a change shown together — confirm
 * which pair is the intended one.
 */
export const bodyRegExps = {
// Generic form only: numeric references or any alphanumeric name.
xml: /&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g,
// First html4/html5 pair: a single `&(?:…);?` group listing names explicitly before the generic fallbacks.
html4: /&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g,
html5: /&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g
// Second html4/html5 pair: extra alternatives are tried before the `&(?:…);?` group.
// NOTE(review): those leading alternatives show up here as single literal characters (including a bare `&`);
// presumably they were `&name;` references that got decoded when this page was captured — verify against the
// repository copy of the file before relying on them.
html4: / |¡|¢|£|¤|¥|¦|§|¨|©|ª|«|¬|­|®|¯|°|±|²|³|´|µ|¶|·|¸|¹|º|»|¼|½|¾|¿|À|Á|Â|Ã|Ä|Å|Æ|Ç|È|É|Ê|Ë|Ì|Í|Î|Ï|Ð|Ñ|Ò|Ó|Ô|Õ|Ö|×|Ø|Ù|Ú|Û|Ü|Ý|Þ|ß|à|á|â|ã|ä|å|æ|ç|è|é|ê|ë|ì|í|î|ï|ð|ñ|ò|ó|ô|õ|ö|÷|ø|ù|ú|û|ü|ý|þ|ÿ|"|&|<|>|∉|&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g,
html5: /Æ|&|Á|Â|À|Å|Ã|Ä|©|Ç|Ð|É|Ê|È|Ë|>|Í|Î|Ì|Ï|<|Ñ|Ó|Ô|Ò|Ø|Õ|Ö|"|®|Þ|Ú|Û|Ù|Ü|Ý|á|â|´|æ|à|&|å|ã|ä|¦|ç|¸|¢|·|©|℗|¤|°|÷|⋇|é|ê|è|ð|ë|½|¼|¾|>|⪧|⩺|⋗|⦕|⩼|⪆|⥸|⋗|⋛|⪌|≷|≳|í|î|¡|ì|¿|ï|«|<|⪦|⩹|⋖|⋋|⋉|⥶|⩻|⦖|◃|⊴|◂|¯|µ|·| |¬|∉|⋹̸|⋵̸|∉|⋷|⋶|∌|∌|⋾|⋽|ñ|ó|ô|ò|ª|º|ø|õ|ö|¶|∥|±|£|"|»|®|§|­|¹|²|³|ß|þ|×|⊠|⨱|⨰|ú|û|ù|¨|ü|ý|¥|ÿ|&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g
};
export const namedReferences: NamedReferences = {
"xml": {
Expand Down
8 changes: 8 additions & 0 deletions test/index.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import {expect} from 'chai';
import * as HtmlEntities from '../src';
import {namedReferences} from '../src/named-references';

// eslint-disable-next-line @typescript-eslint/no-var-requires
const {encode, decode, decodeEntity} = require(process.env.TEST_LIB ? '../lib' : '../src') as typeof HtmlEntities;
Expand Down Expand Up @@ -110,6 +111,13 @@ describe('decode()', () => {
expect(decode('&amp=123', {scope: 'attribute'})).to.equal('&amp=123');
});
});
describe('bugs', () => {
    // Regression test for issue #77: every key of the html5 entity table must decode
    // (with default options) to exactly the value the table maps it to.
    it('should properly process html5 entitites those names start with html4 entity name - #77', () => {
        for (const [entity, value] of Object.entries(namedReferences.html5.entities)) {
            // The entity is passed as the assertion message so that a failure names the
            // offending reference instead of only showing two decoded characters.
            expect(decode(entity), entity).to.equal(value);
        }
    });
});
});

describe('decodeEntity()', () => {
Expand Down
23 changes: 20 additions & 3 deletions tools/process-named-references.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,42 @@ interface LevelData {

/** Per-level lookup tables (entity -> characters and characters -> entity), filled in by the loop below. */
const result: {[key in Level]?: LevelData} = {};

// Building blocks for the source text of each generated regular-expression literal:
//   regExpStart + [conflicting entities '|'] + regExpStartBody + names + regExpCommon + regExpEndBody + regExpEnd
const regExpStart = '/';
const regExpEnd = '/g';
const regExpStartBody = '&(?:';
// Generic fallbacks (decimal reference, hexadecimal reference, any alphanumeric name); also closes the group.
const regExpCommon = '#\\d+|#[xX][\\da-fA-F]+|[0-9a-zA-Z]+)';
// Optional trailing semicolon after the group.
const regExpEndBody = ';?';

/** `[level, regular-expression literal source]` pairs, one per level. */
const bodyRegExps: [string, string][] = [];

for (const [level, entityInfos] of getObjectEntries(namedReferences)) {
    // Semicolon-terminated entities that start with another entity of the same level.
    // They are placed in front of the `&(?:…)` group so they are tried before the shorter name can match.
    const conflictingBodyRegExpNamedReferences: string[] = [];
    // Names (first character dropped) of entities that are valid without a trailing semicolon.
    const bodyRegExpNamedReferences: string[] = [];
    const levelData: LevelData = {entities: {}, characters: {}};
    const entities: string[] = Object.keys(entityInfos);
    for (const [entity, {characters}] of getObjectEntries(entityInfos)) {
        levelData.entities[entity] = characters;
        levelData.characters[characters] = entity;
        if (!entity.endsWith(';')) {
            bodyRegExpNamedReferences.push(entity.slice(1));
        } else if (entities.some((otherEntity) => otherEntity !== entity && entity.startsWith(otherEntity))) {
            // `some` records the entity once, even when several other entities are prefixes of it.
            conflictingBodyRegExpNamedReferences.push(entity);
        }
    }
    result[level] = levelData;
    bodyRegExpNamedReferences.push(regExpCommon);
    // Exactly one entry per level: conflicting entities first, then the `&(?:…);?` group.
    bodyRegExps.push([
        level,
        regExpStart +
            conflictingBodyRegExpNamedReferences
                .concat(regExpStartBody + bodyRegExpNamedReferences.join('|') + regExpEndBody)
                .join('|') +
            regExpEnd
    ]);
}

const processedNamedReferences = `// This file is autogenerated by tools/process-named-references.ts
Expand Down

0 comments on commit c809228

Please sign in to comment.