From 391f24fd954aee9452e3228b87362a3424e7b624 Mon Sep 17 00:00:00 2001 From: Jonas Schade Date: Sat, 16 Mar 2024 04:42:55 +0100 Subject: [PATCH] Add support for parsing HTML numeric entities (#645) --- docs/v4/5.Entities.md | 2 ++ spec/entities_spec.js | 35 +++++++++++++++++++++++++++ src/v5/EntitiesParser.js | 2 ++ src/v5/valueParsers/EntitiesParser.js | 2 ++ src/xmlparser/OrderedObjParser.js | 2 ++ 5 files changed, 43 insertions(+) diff --git a/docs/v4/5.Entities.md b/docs/v4/5.Entities.md index fd3788e6..f6718f07 100644 --- a/docs/v4/5.Entities.md +++ b/docs/v4/5.Entities.md @@ -132,6 +132,8 @@ Following HTML entities are supported by the parser by default when `htmlEntitie | ₹ | Indian Rupee | `&inr;` | `₹` | --- +In addition, [numeric character references](https://html.spec.whatwg.org/multipage/syntax.html#syntax-charref) are also supported. Both decimal (`num_dec`) and hexadecimal(`num_hex`). + In future version of FXP, we'll be supporting more features of DOCTYPE such as `ELEMENT`, reading content for an entity from a file etc. ## External Entities diff --git a/spec/entities_spec.js b/spec/entities_spec.js index c87ced65..38b8be0f 100644 --- a/spec/entities_spec.js +++ b/spec/entities_spec.js @@ -377,6 +377,41 @@ describe("XMLParser Entities", function() { expect(result).toEqual(expected); }); + + it("should parse HTML numeric entities when htmlEntities:true", function() { + const xmlData = ` + + + Bear + Bears are called Bären in German! + `; + + const expected = { + "?xml": { + "version": "1.0", + "encoding": "UTF-8" + }, + "note": { + "heading": "Bear", + "body": { + "#text": "Bears are called Bären in German!", + "face": "ʕ•ᴥ•ʔ" + } + } + }; + + const options = { + attributeNamePrefix: "", + ignoreAttributes: false, + processEntities: true, + htmlEntities: true, + }; + const parser = new XMLParser(options); + let result = parser.parse(xmlData); + + expect(result).toEqual(expected); + }); + it("should throw error if an entity name contains special char", function() { const xmlData = ` diff --git a/src/v5/EntitiesParser.js b/src/v5/EntitiesParser.js index d7dc400c..62cc02ff 100644 --- a/src/v5/EntitiesParser.js +++ b/src/v5/EntitiesParser.js @@ -13,6 +13,8 @@ const htmlEntities = { "copyright" : { regex: /&(copy|#169);/g, val: "©" }, "reg" : { regex: /&(reg|#174);/g, val: "®" }, "inr" : { regex: /&(inr|#8377);/g, val: "₹" }, + "num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) }, + "num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) }, }; class EntitiesParser{ diff --git a/src/v5/valueParsers/EntitiesParser.js b/src/v5/valueParsers/EntitiesParser.js index d7dc400c..62cc02ff 100644 --- a/src/v5/valueParsers/EntitiesParser.js +++ b/src/v5/valueParsers/EntitiesParser.js @@ -13,6 +13,8 @@ const htmlEntities = { "copyright" : { regex: /&(copy|#169);/g, val: "©" }, "reg" : { regex: /&(reg|#174);/g, val: "®" }, "inr" : { regex: /&(inr|#8377);/g, val: "₹" }, + "num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) }, + "num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) }, }; class EntitiesParser{ diff --git a/src/xmlparser/OrderedObjParser.js b/src/xmlparser/OrderedObjParser.js index 75d97184..7d1177f9 100644 --- a/src/xmlparser/OrderedObjParser.js +++ b/src/xmlparser/OrderedObjParser.js @@ -40,6 +40,8 @@ class OrderedObjParser{ "copyright" : { regex: /&(copy|#169);/g, val: "©" }, "reg" : { regex: /&(reg|#174);/g, val: "®" }, "inr" : { regex: /&(inr|#8377);/g, val: "₹" }, + "num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) }, + "num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) }, }; this.addExternalEntities = addExternalEntities; this.parseXml = parseXml;