From 17f7bcda45c563da3edf6c23ea0de53199b12e4d Mon Sep 17 00:00:00 2001 From: Luc Patiny Date: Thu, 5 Dec 2024 15:07:17 +0100 Subject: [PATCH] feat!: make parser async and use native zlib decompression --- README.md | 4 +--- src/Options.js | 6 ++++++ src/index.js | 11 ++++++----- src/mzdata/parseMzData.js | 22 ++++++++++++++++++---- src/mzml/parseMzML.js | 17 +++++++++++++---- src/mzml/utils.js | 7 ++++--- src/mzxml/parseMzXML.js | 22 +++++++++++++++++++--- src/mzxml/utils.js | 6 +++--- src/util/decodeBase64.js | 7 ++++--- src/util/inflate.js | 37 +++++++++++++++++++++++++++++++++++++ 10 files changed, 111 insertions(+), 28 deletions(-) create mode 100644 src/Options.js create mode 100644 src/util/inflate.js diff --git a/README.md b/README.md index 5fe268a..a90faf5 100755 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ import { parseMZ } from 'mzdata'; // mzData files const mzDataFile = readFileSync(__dirname + '/tiny.mzData.xml'); -var response = parseMZ(mzDataFile); +var response = await parseMZ(mzDataFile); ``` ## Ontology @@ -58,8 +58,6 @@ You can find various examples files at: http://www.psidev.info/mzML -## [API Documentation](https://cheminfo-js.github.io/mzData/) - ## License [MIT](./LICENSE) diff --git a/src/Options.js b/src/Options.js new file mode 100644 index 0000000..1f1220d --- /dev/null +++ b/src/Options.js @@ -0,0 +1,6 @@ +/** + * @typedef {Object} Options + * @property {import('cheminfo-types').Logger} [logger] - A potential logger + */ + +export {}; diff --git a/src/index.js b/src/index.js index 35d9b43..cf78020 100644 --- a/src/index.js +++ b/src/index.js @@ -7,9 +7,10 @@ const decoder = new TextDecoder(); /** * Reads a mzData v1.05 file * @param {ArrayBuffer|string} xml - ArrayBuffer or String or any Typed Array (including Node.js' Buffer from v4) with the data - * @return {{times: Array<number>, series: { ms: { data:Array<Array<number>>}}}} + * @param {import('./Options.js').Options} [options={}] + * @return Promise<{{times: Array<number>, series: { ms: { data:Array<Array<number>>}}}}> 
*/ -export function parseMZ(xml) { +export async function parseMZ(xml, options = {}) { if (typeof xml === 'string') { const encoder = new TextEncoder(); xml = encoder.encode(xml); @@ -24,11 +25,11 @@ export function parseMZ(xml) { : xml.substring(0, 200); if (header.includes('mzData')) { - return parseMzData(xml); + return parseMzData(xml, options); } else if (header.includes('mzML')) { - return parseMzML(xml); + return parseMzML(xml, options); } else if (header.includes('mzXML')) { - return parseMzXML(xml); + return parseMzXML(xml, options); } else { throw new Error(`MZ parser: unknown format`); } diff --git a/src/mzdata/parseMzData.js b/src/mzdata/parseMzData.js index c7f0081..4362920 100644 --- a/src/mzdata/parseMzData.js +++ b/src/mzdata/parseMzData.js @@ -1,4 +1,5 @@ import { parse } from 'arraybuffer-xml-parser'; +import { recursiveResolve } from 'ml-spectra-processing'; import { decodeBase64 } from '../util/decodeBase64'; @@ -7,7 +8,14 @@ import { processSpectrumList } from './processSpectrumList'; const decoder = new TextDecoder(); -export function parseMzData(arrayBuffer) { +/** + * + * @param {*} arrayBuffer + * @param {import('../Options.js').Options} [options={}] + * @returns + */ +export async function parseMzData(arrayBuffer, options = {}) { + const { logger = console } = options; const result = { metadata: {}, times: [], @@ -19,14 +27,20 @@ export function parseMzData(arrayBuffer) { }; let parsed = parse(arrayBuffer, { - attributeNamePrefix: '', attributesNodeName: 'attributes', + attributeNameProcessor: (attributeName) => attributeName, tagValueProcessor: (value, node) => { if (node.tagName !== 'data') return decoder.decode(value); - return decodeBase64(node.value, node.attributes); + const promise = decodeBase64(node.bytes, node.attributes); + // avoid unhandled promise rejection and swallow the error + promise.catch((error) => { + logger.error('error decoding base64', error); + return []; + }); + return promise; }, }); - + await 
recursiveResolve(parsed); processMetadata(parsed.mzData, result.metadata); processSpectrumList(parsed.mzData, result.times, result.series.ms.data); diff --git a/src/mzml/parseMzML.js b/src/mzml/parseMzML.js index 22b3447..9190e89 100644 --- a/src/mzml/parseMzML.js +++ b/src/mzml/parseMzML.js @@ -1,4 +1,5 @@ import { parse } from 'arraybuffer-xml-parser'; +import { recursiveResolve } from 'ml-spectra-processing'; import { decodeBase64 } from '../util/decodeBase64'; @@ -8,7 +9,8 @@ const decoder = new TextDecoder(); // https://www.psidev.info/mzml // CV = Controlled vocabulary -export function parseMzML(arrayBuffer) { +export async function parseMzML(arrayBuffer, options = {}) { + const { logger = console } = options; const result = { metadata: {}, times: [], @@ -20,17 +22,24 @@ export function parseMzML(arrayBuffer) { }; let parsed = parse(arrayBuffer, { - attributeNamePrefix: '', attributesNodeName: 'attributes', + attributeNameProcessor: (attributeName) => attributeName, tagValueProcessor: (value, node) => { if (node.tagName !== 'binary') return decoder.decode(value); const ontologies = node.parent.children.cvParam.map( (entry) => entry.attributes.accession, ); - - return decodeBase64(node.value, { ontologies }); + const promise = decodeBase64(node.bytes, { ontologies }); + // avoid unhandled promise rejection and swallow the error + promise.catch((error) => { + logger.error('error decoding base64', error); + return []; + }); + return promise; }, }); + // parsed file still contains promises + await recursiveResolve(parsed); const mzML = parsed.mzML || parsed.indexedmzML.mzML; diff --git a/src/mzml/utils.js b/src/mzml/utils.js index c980b67..ea06214 100644 --- a/src/mzml/utils.js +++ b/src/mzml/utils.js @@ -1,12 +1,13 @@ import { decode } from 'uint8-base64'; -import { inflate } from 'pako'; -export function decoder(base64Encoded, options = {}) { +import { inflate } from '../util/inflate.js'; + +export async function decoder(base64Encoded, options = {}) { const { 
compressionAlgorithm } = options; let decoded; switch (compressionAlgorithm) { case 'zlib': - decoded = inflate(decode(base64Encoded)); + decoded = await inflate(decode(base64Encoded)); break; case undefined: case '': diff --git a/src/mzxml/parseMzXML.js b/src/mzxml/parseMzXML.js index fd519ca..0e9725d 100644 --- a/src/mzxml/parseMzXML.js +++ b/src/mzxml/parseMzXML.js @@ -1,4 +1,5 @@ import { parse } from 'arraybuffer-xml-parser'; +import { recursiveResolve } from 'ml-spectra-processing'; import { decodeBase64 } from '../util/decodeBase64'; @@ -6,7 +7,14 @@ import { processSpectrumList } from './processSpectrumList'; const decoder = new TextDecoder(); -export function parseMzXML(arrayBuffer) { +/** + * + * @param {*} arrayBuffer + * @param {import('../Options.js').Options} [options] + * @returns + */ +export async function parseMzXML(arrayBuffer, options = {}) { + const { logger = console } = options; const result = { metadata: {}, times: [], @@ -17,17 +25,25 @@ export function parseMzXML(arrayBuffer) { }, }; let parsed = parse(arrayBuffer, { - attributeNamePrefix: '', attributesNodeName: 'attributes', + attributeNameProcessor: (attributeName) => attributeName, tagValueProcessor: (value, node) => { if (node.tagName !== 'peaks') return decoder.decode(value); - return decodeBase64(node.value, { + + const promise = decodeBase64(node.bytes, { precision: node.attributes.precision, endian: node.attributes.byteOrder, compression: node.attributes.compressionType, }); + // avoid unhandled promise rejection and swallow the error + promise.catch((error) => { + logger.error('error decoding base64', error); + return []; + }); + return promise; }, }); + await recursiveResolve(parsed); processSpectrumList(parsed.mzXML, result.times, result.series.ms); diff --git a/src/mzxml/utils.js b/src/mzxml/utils.js index d7193b3..87d23fe 100644 --- a/src/mzxml/utils.js +++ b/src/mzxml/utils.js @@ -1,13 +1,13 @@ import { decode } from 'uint8-base64'; -import { inflate } from 'pako'; +import { 
inflate } from '../util/inflate.js'; -export function decoder(base64Encoded, options = {}) { +export async function decoder(base64Encoded, options = {}) { const { compressionAlgorithm } = options; let decoded; switch (compressionAlgorithm) { case 'zlib': - decoded = inflate(decode(base64Encoded)); + decoded = await inflate(decode(base64Encoded)); break; case undefined: case '': diff --git a/src/util/decodeBase64.js b/src/util/decodeBase64.js index 26d8cc1..75769c4 100644 --- a/src/util/decodeBase64.js +++ b/src/util/decodeBase64.js @@ -1,7 +1,8 @@ -import { inflate } from 'pako'; import { decode } from 'uint8-base64'; -export function decodeBase64(base64, options = {}) { +import { inflate } from './inflate.js'; + +export async function decodeBase64(base64, options = {}) { let { endian = 'little', precision, @@ -28,7 +29,7 @@ export function decodeBase64(base64, options = {}) { let uint8Array = decode(base64); switch (compression.toLowerCase()) { case 'zlib': - uint8Array = inflate(uint8Array); + uint8Array = await inflate(uint8Array); break; case '': case 'none': diff --git a/src/util/inflate.js b/src/util/inflate.js new file mode 100644 index 0000000..0775da9 --- /dev/null +++ b/src/util/inflate.js @@ -0,0 +1,37 @@ +export async function inflate(zlibCompressedData) { + // Strip the zlib header and footer + const strippedData = zlibCompressedData.subarray(2, -4); // Remove 2-byte header and 4-byte Adler-32 footer + + const inputStream = new ReadableStream({ + start(controller) { + controller.enqueue(strippedData); + controller.close(); + }, + }); + + const decompressedStream = inputStream.pipeThrough( + new DecompressionStream('deflate-raw'), + ); + + const reader = decompressedStream.getReader(); + const chunks = []; + let totalLength = 0; + + while (true) { + // eslint-disable-next-line no-await-in-loop + const { value, done } = await reader.read(); + if (done) break; + chunks.push(value); + totalLength += value.length; + } + + // Combine chunks into a single 
Uint8Array + const decompressedData = new Uint8Array(totalLength); + let offset = 0; + for (const chunk of chunks) { + decompressedData.set(chunk, offset); + offset += chunk.length; + } + + return decompressedData; +}