diff --git a/.gitignore b/.gitignore index a919860..a366b4f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ tmp references coverage .nyc_output +package-lock.json \ No newline at end of file diff --git a/sample.js b/sample.js index ee3a5be..a5f91e6 100644 --- a/sample.js +++ b/sample.js @@ -533,6 +533,66 @@ function generateWorkbook() { * END date check sheet *****************************************/ + /***************************************** + * START valid characters + *****************************************/ + var unicodeSheet = wb.addWorksheet('unicode'); + + unicodeSheet.column(1).setWidth(40); + unicodeSheet.column(2).setWidth(55); + unicodeSheet.column(3).setWidth(65); + + unicodeSheet.cell(1, 1).string('Unicode String'); + unicodeSheet.cell(1, 2).string('Text Representation'); + unicodeSheet.cell(1, 3).string('Encoded Representation'); + + unicodeSheet.cell(2, 1).string('Hi <>'); + unicodeSheet.cell(2, 2).string('Hi [less than][greater than]'); + unicodeSheet.cell(2, 3).string('Hi <>'); + + unicodeSheet.cell(3, 1).string('😂'); + unicodeSheet.cell(3, 2).string('[face with tears of joy]'); + unicodeSheet.cell(3, 3).string('😂'); + + unicodeSheet.cell(4, 1).string('hello! 😂'); + unicodeSheet.cell(4, 2).string('hello! [face with tears of joy]'); + unicodeSheet.cell(4, 3).string('hello! 
😂'); + + unicodeSheet.cell(5, 1).string('☕️'); + unicodeSheet.cell(5, 2).string('[hot beverage]'); + unicodeSheet.cell(5, 3).string('☕️ (not escaped)'); + + unicodeSheet.cell(6, 1).string('😂☕️'); + unicodeSheet.cell(6, 2).string('[face with tears of joy][hot beverage]'); + unicodeSheet.cell(6, 3).string('😂☕️ (not escaped)'); + + unicodeSheet.cell(7, 1).string('Good 🤞🏼 Luck'); + unicodeSheet.cell(7, 2).string('Good [crossed fingers: light skin tone] luck'); + unicodeSheet.cell(7, 3).string('Good 🤞🏼 Luck'); + + unicodeSheet.cell(8, 1).string('Fist 🤜🏻🤛🏿 bump'); + unicodeSheet.cell(8, 2).string('Fist [right-facing fist: light skin tone][left-facing fist: dark skin tone] bump'); + unicodeSheet.cell(8, 3).string('Fist 🤜🏻🤛🏿 bump'); + + unicodeSheet.cell(9, 1).string('I am the Α and the Ω'); + unicodeSheet.cell(9, 2).string('I am the [greek capital letter alpha] and the [greek capital letter omega]'); + unicodeSheet.cell(9, 3).string('I am the Α and the Ω (not escaped)'); + + unicodeSheet.cell(10, 1).string('𐤶'); + unicodeSheet.cell(10, 2).string('[lydian letter En]'); + unicodeSheet.cell(10, 3).string('𐤶'); + + unicodeSheet.cell(11, 1).string('𠁆'); + unicodeSheet.cell(11, 2).string('[ideograph bik6]'); + unicodeSheet.cell(11, 3).string('𠁆'); + + unicodeSheet.cell(12, 1).string(String.fromCodePoint(0x000b)); + unicodeSheet.cell(12, 2).string('[vertical tab]'); + unicodeSheet.cell(12, 3).string(''); + /***************************************** + * END valid characters + *****************************************/ + /***************************************** * START final sheet *****************************************/ diff --git a/source/lib/cell/index.js b/source/lib/cell/index.js index 6cba5f0..073938d 100644 --- a/source/lib/cell/index.js +++ b/source/lib/cell/index.js @@ -6,6 +6,27 @@ const Style = require('../style/style.js'); const utils = require('../utils.js'); const util = require('util'); +const validXmlRegex = /[\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD]/u; + 
+/** + * Strips characters that are not valid in XML 1.0. The list of valid characters is + * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + * + * Code points at or above 0x10000 are tested numerically because validXmlRegex only covers the BMP ranges. + */ +function removeInvalidXml(str) { + return Array.from(str).map(c => { + const cp = c.codePointAt(0); + if (cp >= 0x10000 && cp <= 0x10FFFF) { + return c; + } else if (c.match(validXmlRegex)) { + return c; + } else { + return ''; + } + }).join(''); +} + function stringSetter(val) { let logger = this.ws.wb.logger; @@ -15,16 +36,7 @@ function stringSetter(val) { typeof (val)); val = ''; } - - let invalidXml11Chars, chr; - invalidXml11Chars = /[^\u0001-\uD7FF\uE000-\uFFFD\uD800\uDC00-\uDBFF\uDFFF]/u; - chr = val.match(invalidXml11Chars); - if (chr) { - logger.warn('Invalid Character for XML "' + chr + '" in string "' + val + '"'); - val = val.replace(chr, ''); - } - // Remove Control characters, they aren't understood by xmlbuilder - val = val.replace(invalidXml11Chars, ''); + val = removeInvalidXml(val); if (!this.merged) { this.cells.forEach((c) => { diff --git a/tests/emoji.test.js b/tests/emoji.test.js deleted file mode 100644 index e7513ec..0000000 --- a/tests/emoji.test.js +++ /dev/null @@ -1,19 +0,0 @@ -let test = require('tape'); -let xl = require('../source/index'); - -function testEmoji(t, wb, ws, cellIndex, strVal) { - let cellAccessor = ws.cell(1, cellIndex); - let cell = cellAccessor.string(strVal); - let thisCell = ws.cells[cell.excelRefs[0]]; - t.ok(wb.sharedStrings[thisCell.v] === strVal, 'Emoji exists in cell'); -} -test('Cell coverage', (t) => { - let wb = new xl.Workbook(); - let ws = wb.addWorksheet('test'); - - testEmoji(t, wb, ws, 1, '😂'); - testEmoji(t, wb, ws, 2, 'hello! 
😂'); - testEmoji(t, wb, ws, 3, '😂☕️'); - - t.end(); -}); \ No newline at end of file diff --git a/tests/unicodestring.test.js b/tests/unicodestring.test.js new file mode 100644 index 0000000..881d627 --- /dev/null +++ b/tests/unicodestring.test.js @@ -0,0 +1,39 @@ +let test = require('tape'); +let xl = require('../source'); + +test('Escape Unicode Cell Values', (t) => { + let wb = new xl.Workbook(); + let ws = wb.addWorksheet('test'); + let cellIndex = 1; + /** + * To test that unicode is escaped properly, provide an unescaped source string, and then our + * expected escaped string. + * + * See the following literature: + * https://stackoverflow.com/questions/43094662/excel-accepts-some-characters-whereas-openxml-has-error/43141040#43141040 + * https://stackoverflow.com/questions/43094662/excel-accepts-some-characters-whereas-openxml-has-error + * https://www.ecma-international.org/publications/standards/Ecma-376.htm + */ + function testUnicode(strVal, testVal) { + let cellAccessor = ws.cell(1, cellIndex); + let cell = cellAccessor.string(strVal); + let thisCell = ws.cells[cell.excelRefs[0]]; + cellIndex++; + t.ok(wb.sharedStrings[thisCell.v] === testVal, 'Unicode "' + strVal + '" correctly escaped in cell'); + } + + testUnicode('Hi <>', 'Hi <>'); + testUnicode('😂', '😂'); + testUnicode('hello! 😂', 'hello! 😂'); + testUnicode('☕️', '☕️'); // ☕️ is U+2615 which is within the valid range. + testUnicode('😂☕️', '😂☕️'); + testUnicode('Good 🤞🏼 Luck', 'Good 🤞🏼 Luck'); + testUnicode('Fist 🤜🏻🤛🏿 bump', 'Fist 🤜🏻🤛🏿 bump'); + testUnicode('㭩', '㭩'); + testUnicode('I am the Α and the Ω', 'I am the Α and the Ω'); + testUnicode('𐤶', '𐤶'); // Lydian Letter En U+10936 + testUnicode('𠁆', '𠁆'); // Ideograph bik6 + testUnicode('\u000b', ''); // vertical tab (U+000B) is not a valid XML character and should be removed + + t.end(); +}); \ No newline at end of file