From 17f7bcda45c563da3edf6c23ea0de53199b12e4d Mon Sep 17 00:00:00 2001
From: Luc Patiny <luc@patiny.com>
Date: Thu, 5 Dec 2024 15:07:17 +0100
Subject: [PATCH] feat!: make parser async and use native zlib decompression

---
 README.md                 |  4 +---
 src/Options.js            |  6 ++++++
 src/index.js              | 11 ++++++-----
 src/mzdata/parseMzData.js | 22 ++++++++++++++++++----
 src/mzml/parseMzML.js     | 17 +++++++++++++----
 src/mzml/utils.js         |  7 ++++---
 src/mzxml/parseMzXML.js   | 22 +++++++++++++++++++---
 src/mzxml/utils.js        |  6 +++---
 src/util/decodeBase64.js  |  7 ++++---
 src/util/inflate.js       | 37 +++++++++++++++++++++++++++++++++++++
 10 files changed, 111 insertions(+), 28 deletions(-)
 create mode 100644 src/Options.js
 create mode 100644 src/util/inflate.js
diff --git a/README.md b/README.md
index 5fe268a..a90faf5 100755
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ import { parseMZ } from 'mzdata';
 
 // mzData files
 const mzDataFile = readFileSync(__dirname + '/tiny.mzData.xml');
-var response = parseMZ(mzDataFile);
+var response = await parseMZ(mzDataFile);
 ```
 
 ## Ontology
@@ -58,8 +58,6 @@ You can find various examples files at:
 
 http://www.psidev.info/mzML
 
-## [API Documentation](https://cheminfo-js.github.io/mzData/)
-
 ## License
 
 [MIT](./LICENSE)
diff --git a/src/Options.js b/src/Options.js
new file mode 100644
index 0000000..1f1220d
--- /dev/null
+++ b/src/Options.js
@@ -0,0 +1,6 @@
+/**
+ * @typedef {Object} Options
+ * @property {import('cheminfo-types').Logger} [logger] - Optional logger used to report base64 decoding errors; the parsers default to `console`
+ */
+
+export {};
diff --git a/src/index.js b/src/index.js
index 35d9b43..cf78020 100644
--- a/src/index.js
+++ b/src/index.js
@@ -7,9 +7,10 @@ const decoder = new TextDecoder();
 /**
  * Reads a mzData v1.05 file
  * @param {ArrayBuffer|string} xml - ArrayBuffer or String or any Typed Array (including Node.js' Buffer from v4) with the data
- * @return {{times: Array<number>, series: { ms: { data:Array<Array<number>>}}}}
+ * @param {import('./Options.js').Options} [options={}]
+ * @return {Promise<{times: Array<number>, series: { ms: { data:Array<Array<number>>}}}>}
  */
-export function parseMZ(xml) {
+export async function parseMZ(xml, options = {}) {
   if (typeof xml === 'string') {
     const encoder = new TextEncoder();
     xml = encoder.encode(xml);
@@ -24,11 +25,11 @@ export function parseMZ(xml) {
     : xml.substring(0, 200);
 
   if (header.includes('mzData')) {
-    return parseMzData(xml);
+    return parseMzData(xml, options);
   } else if (header.includes('mzML')) {
-    return parseMzML(xml);
+    return parseMzML(xml, options);
   } else if (header.includes('mzXML')) {
-    return parseMzXML(xml);
+    return parseMzXML(xml, options);
   } else {
     throw new Error(`MZ parser: unknown format`);
   }
diff --git a/src/mzdata/parseMzData.js b/src/mzdata/parseMzData.js
index c7f0081..4362920 100644
--- a/src/mzdata/parseMzData.js
+++ b/src/mzdata/parseMzData.js
@@ -1,4 +1,5 @@
 import { parse } from 'arraybuffer-xml-parser';
+import { recursiveResolve } from 'ml-spectra-processing';
 
 import { decodeBase64 } from '../util/decodeBase64';
 
@@ -7,7 +8,14 @@ import { processSpectrumList } from './processSpectrumList';
 
 const decoder = new TextDecoder();
 
-export function parseMzData(arrayBuffer) {
+/**
+ *
+ * @param {*} arrayBuffer
+ * @param {import('../Options.js').Options} [options={}]
+ * @returns
+ */
+export async function parseMzData(arrayBuffer, options = {}) {
+  const { logger = console } = options;
   const result = {
     metadata: {},
     times: [],
@@ -19,14 +27,20 @@ export function parseMzData(arrayBuffer) {
   };
 
   let parsed = parse(arrayBuffer, {
-    attributeNamePrefix: '',
     attributesNodeName: 'attributes',
+    attributeNameProcessor: (attributeName) => attributeName,
     tagValueProcessor: (value, node) => {
       if (node.tagName !== 'data') return decoder.decode(value);
-      return decodeBase64(node.value, node.attributes);
+      const promise = decodeBase64(node.bytes, node.attributes);
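+      // log the failure and mark the rejection as handled. NOTE(review): the original promise is returned, so the error is not swallowed and the `[]` fallback is discarded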
+      promise.catch((error) => {
+        logger.error('error decoding base64', error);
+        return [];
+      });
+      return promise;
     },
   });
-
+  await recursiveResolve(parsed);
   processMetadata(parsed.mzData, result.metadata);
   processSpectrumList(parsed.mzData, result.times, result.series.ms.data);
 
diff --git a/src/mzml/parseMzML.js b/src/mzml/parseMzML.js
index 22b3447..9190e89 100644
--- a/src/mzml/parseMzML.js
+++ b/src/mzml/parseMzML.js
@@ -1,4 +1,5 @@
 import { parse } from 'arraybuffer-xml-parser';
+import { recursiveResolve } from 'ml-spectra-processing';
 
 import { decodeBase64 } from '../util/decodeBase64';
 
@@ -8,7 +9,8 @@ const decoder = new TextDecoder();
 
 // https://www.psidev.info/mzml
 // CV = Controlled vocabulary
-export function parseMzML(arrayBuffer) {
+export async function parseMzML(arrayBuffer, options = {}) {
+  const { logger = console } = options;
   const result = {
     metadata: {},
     times: [],
@@ -20,17 +22,24 @@ export function parseMzML(arrayBuffer) {
   };
 
   let parsed = parse(arrayBuffer, {
-    attributeNamePrefix: '',
     attributesNodeName: 'attributes',
+    attributeNameProcessor: (attributeName) => attributeName,
     tagValueProcessor: (value, node) => {
       if (node.tagName !== 'binary') return decoder.decode(value);
       const ontologies = node.parent.children.cvParam.map(
         (entry) => entry.attributes.accession,
       );
-
-      return decodeBase64(node.value, { ontologies });
+      const promise = decodeBase64(node.bytes, { ontologies });
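+      // log the failure and mark the rejection as handled. NOTE(review): the original promise is returned, so the error is not swallowed and the `[]` fallback is discarded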
+      promise.catch((error) => {
+        logger.error('error decoding base64', error);
+        return [];
+      });
+      return promise;
     },
   });
+  // parsed file still contains promises
+  await recursiveResolve(parsed);
 
   const mzML = parsed.mzML || parsed.indexedmzML.mzML;
 
diff --git a/src/mzml/utils.js b/src/mzml/utils.js
index c980b67..ea06214 100644
--- a/src/mzml/utils.js
+++ b/src/mzml/utils.js
@@ -1,12 +1,13 @@
 import { decode } from 'uint8-base64';
-import { inflate } from 'pako';
 
-export function decoder(base64Encoded, options = {}) {
+import { inflate } from '../util/inflate.js';
+
+export async function decoder(base64Encoded, options = {}) {
   const { compressionAlgorithm } = options;
   let decoded;
   switch (compressionAlgorithm) {
     case 'zlib':
-      decoded = inflate(decode(base64Encoded));
+      decoded = await inflate(decode(base64Encoded));
       break;
     case undefined:
     case '':
diff --git a/src/mzxml/parseMzXML.js b/src/mzxml/parseMzXML.js
index fd519ca..0e9725d 100644
--- a/src/mzxml/parseMzXML.js
+++ b/src/mzxml/parseMzXML.js
@@ -1,4 +1,5 @@
 import { parse } from 'arraybuffer-xml-parser';
+import { recursiveResolve } from 'ml-spectra-processing';
 
 import { decodeBase64 } from '../util/decodeBase64';
 
@@ -6,7 +7,14 @@ import { processSpectrumList } from './processSpectrumList';
 
 const decoder = new TextDecoder();
 
-export function parseMzXML(arrayBuffer) {
+/**
+ *
+ * @param {*} arrayBuffer
+ * @param {import('../Options.js').Options} [options]
+ * @returns
+ */
+export async function parseMzXML(arrayBuffer, options = {}) {
+  const { logger = console } = options;
   const result = {
     metadata: {},
     times: [],
@@ -17,17 +25,25 @@ export function parseMzXML(arrayBuffer) {
     },
   };
   let parsed = parse(arrayBuffer, {
-    attributeNamePrefix: '',
     attributesNodeName: 'attributes',
+    attributeNameProcessor: (attributeName) => attributeName,
     tagValueProcessor: (value, node) => {
       if (node.tagName !== 'peaks') return decoder.decode(value);
-      return decodeBase64(node.value, {
+
+      const promise = decodeBase64(node.bytes, {
         precision: node.attributes.precision,
         endian: node.attributes.byteOrder,
         compression: node.attributes.compressionType,
       });
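+      // log the failure and mark the rejection as handled. NOTE(review): the original promise is returned, so the error is not swallowed and the `[]` fallback is discarded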
+      promise.catch((error) => {
+        logger.error('error decoding base64', error);
+        return [];
+      });
+      return promise;
     },
   });
+  await recursiveResolve(parsed);
 
   processSpectrumList(parsed.mzXML, result.times, result.series.ms);
 
diff --git a/src/mzxml/utils.js b/src/mzxml/utils.js
index d7193b3..87d23fe 100644
--- a/src/mzxml/utils.js
+++ b/src/mzxml/utils.js
@@ -1,13 +1,13 @@
 import { decode } from 'uint8-base64';
 
-import { inflate } from 'pako';
+import { inflate } from '../util/inflate.js';
 
-export function decoder(base64Encoded, options = {}) {
+export async function decoder(base64Encoded, options = {}) {
   const { compressionAlgorithm } = options;
   let decoded;
   switch (compressionAlgorithm) {
     case 'zlib':
-      decoded = inflate(decode(base64Encoded));
+      decoded = await inflate(decode(base64Encoded));
       break;
     case undefined:
     case '':
diff --git a/src/util/decodeBase64.js b/src/util/decodeBase64.js
index 26d8cc1..75769c4 100644
--- a/src/util/decodeBase64.js
+++ b/src/util/decodeBase64.js
@@ -1,7 +1,8 @@
-import { inflate } from 'pako';
 import { decode } from 'uint8-base64';
 
-export function decodeBase64(base64, options = {}) {
+import { inflate } from './inflate.js';
+
+export async function decodeBase64(base64, options = {}) {
   let {
     endian = 'little',
     precision,
@@ -28,7 +29,7 @@ export function decodeBase64(base64, options = {}) {
   let uint8Array = decode(base64);
   switch (compression.toLowerCase()) {
     case 'zlib':
-      uint8Array = inflate(uint8Array);
+      uint8Array = await inflate(uint8Array);
       break;
     case '':
     case 'none':
diff --git a/src/util/inflate.js b/src/util/inflate.js
new file mode 100644
index 0000000..0775da9
--- /dev/null
+++ b/src/util/inflate.js
@@ -0,0 +1,37 @@
+export async function inflate(zlibCompressedData) {
+  // Decompress zlib-wrapped data: drop the wrapper, then inflate the remaining raw deflate payload
+  const strippedData = zlibCompressedData.subarray(2, -4); // Remove 2-byte header and 4-byte Adler-32 footer (so the checksum is never verified here)
+  // NOTE(review): assumes a typed array with a plain 2-byte zlib header (no preset dictionary) — confirm
+  const inputStream = new ReadableStream({
+    start(controller) {
+      controller.enqueue(strippedData);
+      controller.close();
+    },
+  });
+  // 'deflate-raw' is used because the zlib header and footer were removed above
+  const decompressedStream = inputStream.pipeThrough(
+    new DecompressionStream('deflate-raw'),
+  );
+
+  const reader = decompressedStream.getReader();
+  const chunks = [];
+  let totalLength = 0;
+  // Read until the stream reports done, tracking the total size needed for the final buffer
+  while (true) {
+    // eslint-disable-next-line no-await-in-loop
+    const { value, done } = await reader.read();
+    if (done) break;
+    chunks.push(value);
+    totalLength += value.length;
+  }
+
+  // Combine chunks into a single Uint8Array
+  const decompressedData = new Uint8Array(totalLength);
+  let offset = 0;
+  for (const chunk of chunks) {
+    decompressedData.set(chunk, offset);
+    offset += chunk.length;
+  }
+
+  return decompressedData;
+}