Skip to content
This repository has been archived by the owner on Jul 22, 2022. It is now read-only.

Commit

Permalink
Merge pull request #239 from mschnee/fix-xml-1.0-unicode-escapes
Browse files Browse the repository at this point in the history
Fix for #238 - bug in new emoji support allowed some illegal characters.
  • Loading branch information
natergj authored Oct 5, 2018
2 parents 6d78ec1 + c4e0272 commit 1a54652
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 29 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ tmp
references
coverage
.nyc_output
package-lock.json
60 changes: 60 additions & 0 deletions sample.js
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,66 @@ function generateWorkbook() {
* END date check sheet
*****************************************/

/*****************************************
* START valid characters
*****************************************/
// Sample sheet for unicode handling. Each data row holds, left to right:
// the raw string, a bracketed textual description of it, and the text the
// sample labels as its encoded representation.
var unicodeSheet = wb.addWorksheet('unicode');

unicodeSheet.column(1).setWidth(40);
unicodeSheet.column(2).setWidth(55);
unicodeSheet.column(3).setWidth(65);

// Header row
unicodeSheet.cell(1, 1).string('Unicode String');
unicodeSheet.cell(1, 2).string('Text Representation');
unicodeSheet.cell(1, 3).string('Encoded Representation');

unicodeSheet.cell(2, 1).string('Hi <>');
unicodeSheet.cell(2, 2).string('Hi [less than][greater than]');
unicodeSheet.cell(2, 3).string('Hi <>');

unicodeSheet.cell(3, 1).string('😂');
unicodeSheet.cell(3, 2).string('[face with tears of joy]');
unicodeSheet.cell(3, 3).string('&#x1f602;');

unicodeSheet.cell(4, 1).string('hello! 😂');
unicodeSheet.cell(4, 2).string('hello! [face with tears of joy]');
unicodeSheet.cell(4, 3).string('hello! &#x1f602;');

unicodeSheet.cell(5, 1).string('☕️');
unicodeSheet.cell(5, 2).string('[hot beverage]');
unicodeSheet.cell(5, 3).string('☕️ (not escaped)');

unicodeSheet.cell(6, 1).string('😂☕️');
unicodeSheet.cell(6, 2).string('[face with tears of joy][hot beverage]');
unicodeSheet.cell(6, 3).string('&#x1f602;☕️ (not escaped)');

// U+1F3FC is the medium-light skin tone modifier.
unicodeSheet.cell(7, 1).string('Good 🤞🏼 Luck');
unicodeSheet.cell(7, 2).string('Good [crossed fingers: medium-light skin tone] luck');
unicodeSheet.cell(7, 3).string('Good &#x1f91e;&#x1f3fc; Luck');

unicodeSheet.cell(8, 1).string('Fist 🤜🏻🤛🏿 bump');
unicodeSheet.cell(8, 2).string('Fist [right-facing fist: light skin tone][left-facing fist: dark skin tone] bump');
unicodeSheet.cell(8, 3).string('Fist &#x1f91c;&#x1f3fb;&#x1f91b;&#x1f3ff; bump');

unicodeSheet.cell(9, 1).string('I am the Α and the Ω');
unicodeSheet.cell(9, 2).string('I am the [greek capital letter alpha] and the [greek capital letter omega]');
unicodeSheet.cell(9, 3).string('I am the Α and the Ω (not escaped)');

// Lydian letter En is U+10936.
unicodeSheet.cell(10, 1).string('𐤶');
unicodeSheet.cell(10, 2).string('[lydian letter En]');
unicodeSheet.cell(10, 3).string('&#x10936;');

// CJK ideograph "bik6" is U+20046.
unicodeSheet.cell(11, 1).string('𠁆');
unicodeSheet.cell(11, 2).string('[ideograph bik6]');
unicodeSheet.cell(11, 3).string('&#x20046;');

// Vertical tab (U+000B) goes on its own row so it does not overwrite the
// ideograph example above.
unicodeSheet.cell(12, 1).string(String.fromCodePoint(0x000b));
unicodeSheet.cell(12, 2).string('[vertical tab]');
unicodeSheet.cell(12, 3).string('');
/*****************************************
* END valid characters
*****************************************/

/*****************************************
* START final sheet
*****************************************/
Expand Down
32 changes: 22 additions & 10 deletions source/lib/cell/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,27 @@ const Style = require('../style/style.js');
const utils = require('../utils.js');
const util = require('util');

// Matches a single character from the Basic Multilingual Plane portion of the
// XML 1.0 character set: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD].
const validXmlRegex = /[\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD]/u;

/**
 * Returns a copy of `str` with every character outside the XML 1.0 character
 * set removed.
 *
 * The allowed set is
 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * The input is walked code point by code point (so a surrogate pair is handled
 * as one character). Supplementary-plane code points (0x10000 and above) are
 * accepted by numeric comparison; everything else is checked against
 * `validXmlRegex`.
 *
 * @param {string} str - text to sanitise
 * @returns {string} the text with disallowed characters dropped
 */
function removeInvalidXml(str) {
    const isAllowed = (ch) => {
        const codePoint = ch.codePointAt(0);
        const isSupplementary = codePoint >= 0x10000 && codePoint <= 0x10FFFF;
        return isSupplementary || validXmlRegex.test(ch);
    };

    return Array.from(str).filter(isAllowed).join('');
}

function stringSetter(val) {
let logger = this.ws.wb.logger;

Expand All @@ -15,16 +36,7 @@ function stringSetter(val) {
typeof (val));
val = '';
}

let invalidXml11Chars, chr;
invalidXml11Chars = /[^\u0001-\uD7FF\uE000-\uFFFD\uD800\uDC00-\uDBFF\uDFFF]/u;
chr = val.match(invalidXml11Chars);
if (chr) {
logger.warn('Invalid Character for XML "' + chr + '" in string "' + val + '"');
val = val.replace(chr, '');
}
// Remove Control characters, they aren't understood by xmlbuilder
val = val.replace(invalidXml11Chars, '');
val = removeInvalidXml(val);

if (!this.merged) {
this.cells.forEach((c) => {
Expand Down
19 changes: 0 additions & 19 deletions tests/emoji.test.js

This file was deleted.

39 changes: 39 additions & 0 deletions tests/unicodestring.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
let test = require('tape');
let xl = require('../source');

test('Escape Unicode Cell Values', (t) => {
    const wb = new xl.Workbook();
    const ws = wb.addWorksheet('test');

    /**
     * Table of [source string, expected stored string] pairs. Each pair is
     * written to its own column of row 1 and the value held in the workbook's
     * sharedStrings for that cell is compared against the expectation.
     *
     * See the following literature:
     * https://stackoverflow.com/questions/43094662/excel-accepts-some-characters-whereas-openxml-has-error/43141040#43141040
     * https://stackoverflow.com/questions/43094662/excel-accepts-some-characters-whereas-openxml-has-error
     * https://www.ecma-international.org/publications/standards/Ecma-376.htm
     */
    const cases = [
        ['Hi <>', 'Hi <>'],
        ['😂', '😂'],
        ['hello! 😂', 'hello! 😂'],
        ['☕️', '☕️'], // ☕️ is U+2615, which is within the valid range.
        ['😂☕️', '😂☕️'],
        ['Good 🤞🏼 Luck', 'Good 🤞🏼 Luck'],
        ['Fist 🤜🏻🤛🏿 bump', 'Fist 🤜🏻🤛🏿 bump'],
        ['㭩', '㭩'],
        ['I am the Α and the Ω', 'I am the Α and the Ω'],
        ['𐤶', '𐤶'], // Lydian Letter En U+10936
        ['𠁆', '𠁆'], // Ideograph bik6
        ['\u000b', ''], // vertical tab (U+000B) should be removed
    ];

    cases.forEach(([strVal, testVal], position) => {
        // Columns are 1-based: the first case lands in column 1, and so on.
        const cell = ws.cell(1, position + 1).string(strVal);
        const thisCell = ws.cells[cell.excelRefs[0]];
        const stored = wb.sharedStrings[thisCell.v];
        t.ok(stored === testVal, 'Unicode "' + strVal + '" correctly escaped in cell');
    });

    t.end();
});

0 comments on commit 1a54652

Please sign in to comment.