diff --git a/lib/encoding.js b/lib/encoding.js index c8389cc..0082d8f 100644 --- a/lib/encoding.js +++ b/lib/encoding.js @@ -582,6 +582,7 @@ * @param {{fatal: boolean}} options */ function UTF8Encoder(options) { + var fatal = options.fatal; /** * @param {Stream} stream Input stream. * @param {number} code_point Next code point read from the stream. diff --git a/lib/encoding.cjs.js b/lib/encoding.lib.js similarity index 100% rename from lib/encoding.cjs.js rename to lib/encoding.lib.js diff --git a/lib/encoding.lib.mjs b/lib/encoding.lib.mjs new file mode 100644 index 0000000..79e8ddd --- /dev/null +++ b/lib/encoding.lib.mjs @@ -0,0 +1,665 @@ +'use strict'; + +// This is free and unencumbered software released into the public domain. +// See LICENSE.md for more information. + +// +// Utilities +// + +/** + * @param {number} a The number to test. + * @param {number} min The minimum value in the range, inclusive. + * @param {number} max The maximum value in the range, inclusive. + * @return {boolean} True if a >= min and a <= max. + */ +function inRange(a, min, max) { + return min <= a && a <= max; +} + +/** + * @param {*} o + * @return {Object} + */ +function ToDictionary(o) { + if (o === undefined) return {}; + if (o === Object(o)) return o; + throw TypeError('Could not convert argument to dictionary'); +} + +/** + * @param {string} string Input string of UTF-16 code units. + * @return {!Array.} Code points. + */ +function stringToCodePoints(string) { + // https://heycam.github.io/webidl/#dfn-obtain-unicode + + // 1. Let S be the DOMString value. + var s = String(string); + + // 2. Let n be the length of S. + var n = s.length; + + // 3. Initialize i to 0. + var i = 0; + + // 4. Initialize U to be an empty sequence of Unicode characters. + var u = []; + + // 5. While i < n: + while (i < n) { + + // 1. Let c be the code unit in S at index i. + var c = s.charCodeAt(i); + + // 2. Depending on the value of c: + + // c < 0xD800 or c > 0xDFFF + if (c < 0xD800 || c > 0xDFFF) { + // Append to U the Unicode character with code point c. + u.push(c); + } + + // 0xDC00 ≤ c ≤ 0xDFFF + else if (0xDC00 <= c && c <= 0xDFFF) { + // Append to U a U+FFFD REPLACEMENT CHARACTER. + u.push(0xFFFD); + } + + // 0xD800 ≤ c ≤ 0xDBFF + else if (0xD800 <= c && c <= 0xDBFF) { + // 1. If i = n−1, then append to U a U+FFFD REPLACEMENT + // CHARACTER. + if (i === n - 1) { + u.push(0xFFFD); + } + // 2. Otherwise, i < n−1: + else { + // 1. Let d be the code unit in S at index i+1. + var d = string.charCodeAt(i + 1); + + // 2. If 0xDC00 ≤ d ≤ 0xDFFF, then: + if (0xDC00 <= d && d <= 0xDFFF) { + // 1. Let a be c & 0x3FF. + var a = c & 0x3FF; + + // 2. Let b be d & 0x3FF. + var b = d & 0x3FF; + + // 3. Append to U the Unicode character with code point + // 2^16+2^10*a+b. + u.push(0x10000 + (a << 10) + b); + + // 4. Set i to i+1. + i += 1; + } + + // 3. Otherwise, d < 0xDC00 or d > 0xDFFF. Append to U a + // U+FFFD REPLACEMENT CHARACTER. + else { + u.push(0xFFFD); + } + } + } + + // 3. Set i to i+1. + i += 1; + } + + // 6. Return U. + return u; +} + +/** + * @param {!Array.} code_points Array of code points. + * @return {string} string String of UTF-16 code units. + */ +function codePointsToString(code_points) { + var s = ''; + for (var i = 0; i < code_points.length; ++i) { + var cp = code_points[i]; + if (cp <= 0xFFFF) { + s += String.fromCharCode(cp); + } else { + cp -= 0x10000; + s += String.fromCharCode((cp >> 10) + 0xD800, + (cp & 0x3FF) + 0xDC00); + } + } + return s; +} + + +// +// Implementation of Encoding specification +// https://encoding.spec.whatwg.org/ +// + +// +// 3. Terminology +// + +/** + * End-of-stream is a special token that signifies no more tokens + * are in the stream. + * @const + */ var end_of_stream = -1; + +/** + * A stream represents an ordered sequence of tokens. + * + * @constructor + * @param {!(Array.|Uint8Array)} tokens Array of tokens that provide the + * stream. + */ +function Stream(tokens) { + /** @type {!Array.} */ + this.tokens = [].slice.call(tokens); +} + +Stream.prototype = { + /** + * @return {boolean} True if end-of-stream has been hit. + */ + endOfStream: function() { + return !this.tokens.length; + }, + + /** + * When a token is read from a stream, the first token in the + * stream must be returned and subsequently removed, and + * end-of-stream must be returned otherwise. + * + * @return {number} Get the next token from the stream, or + * end_of_stream. + */ + read: function() { + if (!this.tokens.length) + return end_of_stream; + return this.tokens.shift(); + }, + + /** + * When one or more tokens are prepended to a stream, those tokens + * must be inserted, in given order, before the first token in the + * stream. + * + * @param {(number|!Array.)} token The token(s) to prepend to the stream. + */ + prepend: function(token) { + if (Array.isArray(token)) { + var tokens = /**@type {!Array.}*/(token); + while (tokens.length) + this.tokens.unshift(tokens.pop()); + } else { + this.tokens.unshift(token); + } + }, + + /** + * When one or more tokens are pushed to a stream, those tokens + * must be inserted, in given order, after the last token in the + * stream. + * + * @param {(number|!Array.)} token The tokens(s) to prepend to the stream. + */ + push: function(token) { + if (Array.isArray(token)) { + var tokens = /**@type {!Array.}*/(token); + while (tokens.length) + this.tokens.push(tokens.shift()); + } else { + this.tokens.push(token); + } + } +}; + +// +// 4. Encodings +// + +// 4.1 Encoders and decoders + +/** @const */ +var finished = -1; + +/** + * @param {boolean} fatal If true, decoding errors raise an exception. + * @param {number=} opt_code_point Override the standard fallback code point. + * @return {number} The code point to insert on a decoding error. + */ +function decoderError(fatal, opt_code_point) { + if (fatal) + throw TypeError('Decoder error'); + return opt_code_point || 0xFFFD; +} + +/** @interface */ +function Decoder() {} +Decoder.prototype = { + /** + * @param {Stream} stream The stream of bytes being decoded. + * @param {number} bite The next byte read from the stream. + * @return {?(number|!Array.)} The next code point(s) + * decoded, or null if not enough data exists in the input + * stream to decode a complete code point, or |finished|. + */ + handler: function(stream, bite) {} +}; + +/** @interface */ +function Encoder() {} +Encoder.prototype = { + /** + * @param {Stream} stream The stream of code points being encoded. + * @param {number} code_point Next code point read from the stream. + * @return {(number|!Array.)} Byte(s) to emit, or |finished|. + */ + handler: function(stream, code_point) {} +}; + +// +// 7. API +// + +/** @const */ var DEFAULT_ENCODING = 'utf-8'; + +// 7.1 Interface TextDecoder + +/** + * @constructor + * @param {string=} encoding The label of the encoding; + * defaults to 'utf-8'. + * @param {Object=} options + */ +function TextDecoder(encoding, options) { + if (!(this instanceof TextDecoder)) { + return new TextDecoder(encoding, options); + } + encoding = encoding !== undefined ? String(encoding).toLowerCase() : DEFAULT_ENCODING; + if (encoding !== DEFAULT_ENCODING) { + throw new Error('Encoding not supported. Only utf-8 is supported'); + } + options = ToDictionary(options); + + /** @private @type {boolean} */ + this._streaming = false; + /** @private @type {boolean} */ + this._BOMseen = false; + /** @private @type {?Decoder} */ + this._decoder = null; + /** @private @type {boolean} */ + this._fatal = Boolean(options['fatal']); + /** @private @type {boolean} */ + this._ignoreBOM = Boolean(options['ignoreBOM']); + + Object.defineProperty(this, 'encoding', {value: 'utf-8'}); + Object.defineProperty(this, 'fatal', {value: this._fatal}); + Object.defineProperty(this, 'ignoreBOM', {value: this._ignoreBOM}); +} + +TextDecoder.prototype = { + /** + * @param {ArrayBufferView=} input The buffer of bytes to decode. + * @param {Object=} options + * @return {string} The decoded string. + */ + decode: function decode(input, options) { + var bytes; + if (typeof input === 'object' && input instanceof ArrayBuffer) { + bytes = new Uint8Array(input); + } else if (typeof input === 'object' && 'buffer' in input && + input.buffer instanceof ArrayBuffer) { + bytes = new Uint8Array(input.buffer, + input.byteOffset, + input.byteLength); + } else { + bytes = new Uint8Array(0); + } + + options = ToDictionary(options); + + if (!this._streaming) { + this._decoder = new UTF8Decoder({fatal: this._fatal}); + this._BOMseen = false; + } + this._streaming = Boolean(options['stream']); + + var input_stream = new Stream(bytes); + + var code_points = []; + + /** @type {?(number|!Array.)} */ + var result; + + while (!input_stream.endOfStream()) { + result = this._decoder.handler(input_stream, input_stream.read()); + if (result === finished) + break; + if (result === null) + continue; + if (Array.isArray(result)) + code_points.push.apply(code_points, /**@type {!Array.}*/(result)); + else + code_points.push(result); + } + if (!this._streaming) { + do { + result = this._decoder.handler(input_stream, input_stream.read()); + if (result === finished) + break; + if (result === null) + continue; + if (Array.isArray(result)) + code_points.push.apply(code_points, /**@type {!Array.}*/(result)); + else + code_points.push(result); + } while (!input_stream.endOfStream()); + this._decoder = null; + } + + if (code_points.length) { + // If encoding is one of utf-8, utf-16be, and utf-16le, and + // ignore BOM flag and BOM seen flag are unset, run these + // subsubsteps: + if (['utf-8'].indexOf(this.encoding) !== -1 && + !this._ignoreBOM && !this._BOMseen) { + // If token is U+FEFF, set BOM seen flag. + if (code_points[0] === 0xFEFF) { + this._BOMseen = true; + code_points.shift(); + } else { + // Otherwise, if token is not end-of-stream, set BOM seen + // flag and append token to output. + this._BOMseen = true; + } + } + } + + return codePointsToString(code_points); + } +}; + +// 7.2 Interface TextEncoder + +/** + * @constructor + * @param {string=} encoding The label of the encoding; + * defaults to 'utf-8'. + * @param {Object=} options + */ +function TextEncoder(encoding, options) { + if (!(this instanceof TextEncoder)) + return new TextEncoder(encoding, options); + encoding = encoding !== undefined ? String(encoding).toLowerCase() : DEFAULT_ENCODING; + if (encoding !== DEFAULT_ENCODING) { + throw new Error('Encoding not supported. Only utf-8 is supported'); + } + options = ToDictionary(options); + + /** @private @type {boolean} */ + this._streaming = false; + /** @private @type {?Encoder} */ + this._encoder = null; + /** @private @type {{fatal: boolean}} */ + this._options = {fatal: Boolean(options['fatal'])}; + + Object.defineProperty(this, 'encoding', {value: 'utf-8'}); +} + +TextEncoder.prototype = { + /** + * @param {string=} opt_string The string to encode. + * @param {Object=} options + * @return {Uint8Array} Encoded bytes, as a Uint8Array. + */ + encode: function encode(opt_string, options) { + opt_string = opt_string ? String(opt_string) : ''; + options = ToDictionary(options); + + // NOTE: This option is nonstandard. None of the encodings + // permitted for encoding (i.e. UTF-8, UTF-16) are stateful, + // so streaming is not necessary. + if (!this._streaming) + this._encoder = new UTF8Encoder(this._options); + this._streaming = Boolean(options['stream']); + + var bytes = []; + var input_stream = new Stream(stringToCodePoints(opt_string)); + /** @type {?(number|!Array.)} */ + var result; + while (!input_stream.endOfStream()) { + result = this._encoder.handler(input_stream, input_stream.read()); + if (result === finished) + break; + if (Array.isArray(result)) + bytes.push.apply(bytes, /**@type {!Array.}*/(result)); + else + bytes.push(result); + } + if (!this._streaming) { + while (true) { + result = this._encoder.handler(input_stream, input_stream.read()); + if (result === finished) + break; + if (Array.isArray(result)) + bytes.push.apply(bytes, /**@type {!Array.}*/(result)); + else + bytes.push(result); + } + this._encoder = null; + } + return new Uint8Array(bytes); + } +}; + +// +// 8. The encoding +// + +// 8.1 utf-8 + +/** + * @constructor + * @implements {Decoder} + * @param {{fatal: boolean}} options + */ +function UTF8Decoder(options) { + var fatal = options.fatal; + + // utf-8's decoder's has an associated utf-8 code point, utf-8 + // bytes seen, and utf-8 bytes needed (all initially 0), a utf-8 + // lower boundary (initially 0x80), and a utf-8 upper boundary + // (initially 0xBF). + var /** @type {number} */ utf8_code_point = 0, + /** @type {number} */ utf8_bytes_seen = 0, + /** @type {number} */ utf8_bytes_needed = 0, + /** @type {number} */ utf8_lower_boundary = 0x80, + /** @type {number} */ utf8_upper_boundary = 0xBF; + + /** + * @param {Stream} stream The stream of bytes being decoded. + * @param {number} bite The next byte read from the stream. + * @return {?(number|!Array.)} The next code point(s) + * decoded, or null if not enough data exists in the input + * stream to decode a complete code point. + */ + this.handler = function(stream, bite) { + // 1. If byte is end-of-stream and utf-8 bytes needed is not 0, + // set utf-8 bytes needed to 0 and return error. + if (bite === end_of_stream && utf8_bytes_needed !== 0) { + utf8_bytes_needed = 0; + return decoderError(fatal); + } + + // 2. If byte is end-of-stream, return finished. + if (bite === end_of_stream) + return finished; + + // 3. If utf-8 bytes needed is 0, based on byte: + if (utf8_bytes_needed === 0) { + + // 0x00 to 0x7F + if (inRange(bite, 0x00, 0x7F)) { + // Return a code point whose value is byte. + return bite; + } + + // 0xC2 to 0xDF + if (inRange(bite, 0xC2, 0xDF)) { + // Set utf-8 bytes needed to 1 and utf-8 code point to byte + // − 0xC0. + utf8_bytes_needed = 1; + utf8_code_point = bite - 0xC0; + } + + // 0xE0 to 0xEF + else if (inRange(bite, 0xE0, 0xEF)) { + // 1. If byte is 0xE0, set utf-8 lower boundary to 0xA0. + if (bite === 0xE0) + utf8_lower_boundary = 0xA0; + // 2. If byte is 0xED, set utf-8 upper boundary to 0x9F. + if (bite === 0xED) + utf8_upper_boundary = 0x9F; + // 3. Set utf-8 bytes needed to 2 and utf-8 code point to + // byte − 0xE0. + utf8_bytes_needed = 2; + utf8_code_point = bite - 0xE0; + } + + // 0xF0 to 0xF4 + else if (inRange(bite, 0xF0, 0xF4)) { + // 1. If byte is 0xF0, set utf-8 lower boundary to 0x90. + if (bite === 0xF0) + utf8_lower_boundary = 0x90; + // 2. If byte is 0xF4, set utf-8 upper boundary to 0x8F. + if (bite === 0xF4) + utf8_upper_boundary = 0x8F; + // 3. Set utf-8 bytes needed to 3 and utf-8 code point to + // byte − 0xF0. + utf8_bytes_needed = 3; + utf8_code_point = bite - 0xF0; + } + + // Otherwise + else { + // Return error. + return decoderError(fatal); + } + + // Then (byte is in the range 0xC2 to 0xF4) set utf-8 code + // point to utf-8 code point << (6 × utf-8 bytes needed) and + // return continue. + utf8_code_point = utf8_code_point << (6 * utf8_bytes_needed); + return null; + } + + // 4. If byte is not in the range utf-8 lower boundary to utf-8 + // upper boundary, run these substeps: + if (!inRange(bite, utf8_lower_boundary, utf8_upper_boundary)) { + + // 1. Set utf-8 code point, utf-8 bytes needed, and utf-8 + // bytes seen to 0, set utf-8 lower boundary to 0x80, and set + // utf-8 upper boundary to 0xBF. + utf8_code_point = utf8_bytes_needed = utf8_bytes_seen = 0; + utf8_lower_boundary = 0x80; + utf8_upper_boundary = 0xBF; + + // 2. Prepend byte to stream. + stream.prepend(bite); + + // 3. Return error. + return decoderError(fatal); + } + + // 5. Set utf-8 lower boundary to 0x80 and utf-8 upper boundary + // to 0xBF. + utf8_lower_boundary = 0x80; + utf8_upper_boundary = 0xBF; + + // 6. Increase utf-8 bytes seen by one and set utf-8 code point + // to utf-8 code point + (byte − 0x80) << (6 × (utf-8 bytes + // needed − utf-8 bytes seen)). + utf8_bytes_seen += 1; + utf8_code_point += (bite - 0x80) << (6 * (utf8_bytes_needed - utf8_bytes_seen)); + + // 7. If utf-8 bytes seen is not equal to utf-8 bytes needed, + // continue. + if (utf8_bytes_seen !== utf8_bytes_needed) + return null; + + // 8. Let code point be utf-8 code point. + var code_point = utf8_code_point; + + // 9. Set utf-8 code point, utf-8 bytes needed, and utf-8 bytes + // seen to 0. + utf8_code_point = utf8_bytes_needed = utf8_bytes_seen = 0; + + // 10. Return a code point whose value is code point. + return code_point; + }; +} + +/** + * @constructor + * @implements {Encoder} + * @param {{fatal: boolean}} options + */ +function UTF8Encoder(options) { + var fatal = options.fatal; + /** + * @param {Stream} stream Input stream. + * @param {number} code_point Next code point read from the stream. + * @return {(number|!Array.)} Byte(s) to emit. + */ + this.handler = function(stream, code_point) { + // 1. If code point is end-of-stream, return finished. + if (code_point === end_of_stream) + return finished; + + // 2. If code point is in the range U+0000 to U+007F, return a + // byte whose value is code point. + if (inRange(code_point, 0x0000, 0x007f)) + return code_point; + + // 3. Set count and offset based on the range code point is in: + var count, offset; + // U+0080 to U+07FF: 1 and 0xC0 + if (inRange(code_point, 0x0080, 0x07FF)) { + count = 1; + offset = 0xC0; + } + // U+0800 to U+FFFF: 2 and 0xE0 + else if (inRange(code_point, 0x0800, 0xFFFF)) { + count = 2; + offset = 0xE0; + } + // U+10000 to U+10FFFF: 3 and 0xF0 + else if (inRange(code_point, 0x10000, 0x10FFFF)) { + count = 3; + offset = 0xF0; + } + + // 4.Let bytes be a byte sequence whose first byte is (code + // point >> (6 × count)) + offset. + var bytes = [(code_point >> (6 * count)) + offset]; + + // 5. Run these substeps while count is greater than 0: + while (count > 0) { + + // 1. Set temp to code point >> (6 × (count − 1)). + var temp = code_point >> (6 * (count - 1)); + + // 2. Append to bytes 0x80 | (temp & 0x3F). + bytes.push(0x80 | (temp & 0x3F)); + + // 3. Decrease count by one. + count -= 1; + } + + // 6. Return bytes bytes, in order. + return bytes; + }; +} + +export {TextEncoder, TextDecoder}; diff --git a/package.json b/package.json index 06bab4b..c7de3fd 100644 --- a/package.json +++ b/package.json @@ -5,13 +5,13 @@ "Rick Eyre ", "Joshua Bell " ], - "version": "1.0.0", + "version": "1.0.1", "description": "UTF-8 only polyfill for the Encoding Living Standard's API.", - "main": "lib/encoding.cjs.js", + "main": "lib/encoding.lib", "jsnext:main": "src/encoding.js", "files": [ "src", - "dist" + "lib" ], "repository": { "type": "git", @@ -27,7 +27,7 @@ }, "homepage": "https://github.com/arv/text-encoding-utf-8", "scripts": { - "prepublish": "rollup -f iife -o lib/encoding.js -- src/polyfill.js && rollup -f cjs -o lib/encoding.cjs.js -- src/encoding.js" + "prepublish": "rollup -f iife -o lib/encoding.js -- src/polyfill.js && rollup -f cjs -o lib/encoding.lib.js -- src/encoding.js && cp src/encoding.js lib/encoding.lib.mjs" }, "devDependencies": { "rollup": "^0.21.0"