diff --git a/CHANGELOG.md b/CHANGELOG.md index 45b1b7a..2f5a077 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +2.3.4 +----- + + * Fix the problem decoding HTML5 entities whose names start with an HTML4 entity name. + 2.3.3 ----- diff --git a/src/named-references.ts b/src/named-references.ts index 59e4e53..559f27a 100644 --- a/src/named-references.ts +++ b/src/named-references.ts @@ -9,8 +9,8 @@ export type NamedReferences = { }; export const bodyRegExps = { xml: /&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g, - html4: /&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g, - html5: /&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g + html4: / 
|¡|¢|£|¤|¥|¦|§|¨|©|ª|«|¬|­|®|¯|°|±|²|³|´|µ|¶|·|¸|¹|º|»|¼|½|¾|¿|À|Á|Â|Ã|Ä|Å|Æ|Ç|È|É|Ê|Ë|Ì|Í|Î|Ï|Ð|Ñ|Ò|Ó|Ô|Õ|Ö|×|Ø|Ù|Ú|Û|Ü|Ý|Þ|ß|à|á|â|ã|ä|å|æ|ç|è|é|ê|ë|ì|í|î|ï|ð|ñ|ò|ó|ô|õ|ö|÷|ø|ù|ú|û|ü|ý|þ|ÿ|"|&|<|>|∉|&(?:nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|uacute|ucirc|uuml|yacute|thorn|yuml|quot|amp|lt|gt|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g, + html5: /Æ|&|Á|Â|À|Å|Ã|Ä|©|Ç|Ð|É|Ê|È|Ë|>|Í|Î|Ì|Ï|<|Ñ|Ó|Ô|Ò|Ø|Õ|Ö|"|®|Þ|Ú|Û|Ù|Ü|Ý|á|â|´|æ|à|&|å|ã|ä|¦|ç|¸|¢|·|©|℗|¤|°|÷|⋇|é|ê|è|ð|ë|½|¼|¾|>|⪧|⩺|⋗|⦕|⩼|⪆|⥸|⋗|⋛|⪌|≷|≳|í|î|¡|ì|¿|ï|«|<|⪦|⩹|⋖|⋋|⋉|⥶|⩻|⦖|◃|⊴|◂|¯|µ|·| |¬|∉|⋹̸|⋵̸|∉|⋷|⋶|∌|∌|⋾|⋽|ñ|ó|ô|ò|ª|º|ø|õ|ö|¶|∥|±|£|"|»|®|§|­|¹|²|³|ß|þ|×|⊠|⨱|⨰|ú|û|ù|¨|ü|ý|¥|ÿ|&(?:AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml|#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);?/g }; export const namedReferences: NamedReferences = { "xml": { diff --git a/test/index.test.ts b/test/index.test.ts index 1393375..76cab0c 100644 --- a/test/index.test.ts +++ 
b/test/index.test.ts @@ -1,5 +1,6 @@ import {expect} from 'chai'; import * as HtmlEntities from '../src'; +import {namedReferences} from '../src/named-references'; // eslint-disable-next-line @typescript-eslint/no-var-requires const {encode, decode, decodeEntity} = require(process.env.TEST_LIB ? '../lib' : '../src') as typeof HtmlEntities; @@ -110,6 +111,13 @@ describe('decode()', () => { expect(decode('&=123', {scope: 'attribute'})).to.equal('&=123'); }); }); + describe('bugs', () => { + it('should properly process html5 entitites those names start with html4 entity name - #77', () => { + for (const [entity, value] of Object.entries(namedReferences.html5.entities)) { + expect(decode(entity)).to.equal(value); + } + }); + }); }); describe('decodeEntity()', () => { diff --git a/tools/process-named-references.ts b/tools/process-named-references.ts index 22d46b8..6419c15 100644 --- a/tools/process-named-references.ts +++ b/tools/process-named-references.ts @@ -12,25 +12,42 @@ interface LevelData { const result: {[key in Level]?: LevelData} = {}; -const regExpStart = '/&(?:'; +const regExpStart = '/'; +const regExpEnd = '/g'; +const regExpStartBody = '&(?:'; const regExpCommon = '#\\d+|#[xX][\\da-fA-F]+|[0-9a-zA-Z]+)'; -const regExpEndBody = ';?/g'; +const regExpEndBody = ';?'; const bodyRegExps: [string, string][] = []; for (const [level, entityInfos] of getObjectEntries(namedReferences)) { + const conflictingBodyRegExpNamedReferences: string[] = []; const bodyRegExpNamedReferences: string[] = []; const levelData: LevelData = {entities: {}, characters: {}}; + const entities: string[] = Object.keys(entityInfos); for (const [entity, {characters}] of getObjectEntries(entityInfos)) { levelData.entities[entity] = characters; levelData.characters[characters] = entity; if (!entity.endsWith(';')) { bodyRegExpNamedReferences.push(entity.slice(1)); + } else { + for (const otherEntity of entities) { + if (otherEntity !== entity && entity.startsWith(otherEntity)) { + 
conflictingBodyRegExpNamedReferences.push(entity); + } + } } } result[level] = levelData; bodyRegExpNamedReferences.push(regExpCommon); - bodyRegExps.push([level, regExpStart + bodyRegExpNamedReferences.join('|') + regExpEndBody]); + bodyRegExps.push([ + level, + regExpStart + + conflictingBodyRegExpNamedReferences + .concat(regExpStartBody + bodyRegExpNamedReferences.join('|') + regExpEndBody) + .join('|') + + regExpEnd + ]); } const processedNamedReferences = `// This file is autogenerated by tools/process-named-references.ts