From 36f4ace5b698ecd4890925e7bab35796a7ed719f Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Sat, 5 Nov 2022 15:57:02 -0700 Subject: [PATCH] Apache Arrow for table cells --- .eslintrc.json | 2 +- bin/resolve-dependencies | 6 +++++- src/dependencies.mjs | 3 ++- src/fileAttachment.mjs | 19 ++++++++++++++----- src/index.mjs | 2 +- src/library.mjs | 6 +++--- src/require.mjs | 3 +++ src/table.mjs | 27 ++++++++++++++++++++++++++- 8 files changed, 55 insertions(+), 13 deletions(-) diff --git a/.eslintrc.json b/.eslintrc.json index 0cc3e774..48aa23da 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -2,7 +2,7 @@ "extends": "eslint:recommended", "parserOptions": { "sourceType": "module", - "ecmaVersion": 2018 + "ecmaVersion": 2020 }, "env": { "es6": true, diff --git a/bin/resolve-dependencies b/bin/resolve-dependencies index dcf2af1d..fefe6db8 100755 --- a/bin/resolve-dependencies +++ b/bin/resolve-dependencies @@ -64,7 +64,11 @@ const mains = ["unpkg", "jsdelivr", "browser", "main"]; } { const package = await resolve("apache-arrow@4"); - console.log(`export const arrow = dependency("${package.name}", "${package.version}", "${package.export}");`); + console.log(`export const arrow4 = dependency("${package.name}", "${package.version}", "${package.export}");`); + } + { + const package = await resolve("apache-arrow@9"); + console.log(`export const arrow9 = dependency("${package.name}", "${package.version}", "+esm");`); } { const package = await resolve("arquero"); diff --git a/src/dependencies.mjs b/src/dependencies.mjs index 9dc6652b..51ea8d78 100644 --- a/src/dependencies.mjs +++ b/src/dependencies.mjs @@ -13,7 +13,8 @@ export const sql = dependency("sql.js", "1.7.0", "dist/sql-wasm.js"); export const vega = dependency("vega", "5.22.1", "build/vega.min.js"); export const vegalite = dependency("vega-lite", "5.5.0", "build/vega-lite.min.js"); export const vegaliteApi = dependency("vega-lite-api", "5.0.0", "build/vega-lite-api.min.js"); -export const arrow = 
dependency("apache-arrow", "4.0.1", "Arrow.es2015.min.js"); +export const arrow4 = dependency("apache-arrow", "4.0.1", "Arrow.es2015.min.js"); +export const arrow9 = dependency("apache-arrow", "9.0.0", "+esm"); export const arquero = dependency("arquero", "4.8.8", "dist/arquero.min.js"); export const topojson = dependency("topojson-client", "3.1.0", "dist/topojson-client.min.js"); export const exceljs = dependency("exceljs", "4.3.0", "dist/exceljs.min.js"); diff --git a/src/fileAttachment.mjs b/src/fileAttachment.mjs index 43ef6a79..fbef1cf2 100644 --- a/src/fileAttachment.mjs +++ b/src/fileAttachment.mjs @@ -1,6 +1,6 @@ import {autoType, csvParse, csvParseRows, tsvParse, tsvParseRows} from "d3-dsv"; -import {arrow, jszip, exceljs} from "./dependencies.mjs"; -import {requireDefault} from "./require.mjs"; +import {arrow4, arrow9, jszip, exceljs} from "./dependencies.mjs"; +import {cdn, requireDefault} from "./require.mjs"; import {SQLiteDatabaseClient} from "./sqlite.mjs"; import {Workbook} from "./xlsx.mjs"; @@ -56,9 +56,18 @@ export class AbstractFile { i.src = url; }); } - async arrow() { - const [Arrow, response] = await Promise.all([requireDefault(arrow.resolve()), remote_fetch(this)]); - return Arrow.Table.from(response); + async arrow({version = 4} = {}) { + switch (version) { + case 4: { + const [Arrow, response] = await Promise.all([requireDefault(arrow4.resolve()), remote_fetch(this)]); + return Arrow.Table.from(response); + } + case 9: { + const [Arrow, response] = await Promise.all([import(`${cdn}${arrow9.resolve()}`), remote_fetch(this)]); + return Arrow.tableFromIPC(response); + } + default: throw new Error(`unsupported arrow version: ${version}`); + } } async sqlite() { return SQLiteDatabaseClient.open(remote_fetch(this)); diff --git a/src/index.mjs b/src/index.mjs index cd2d65ff..0ac6c62a 100644 --- a/src/index.mjs +++ b/src/index.mjs @@ -1,3 +1,3 @@ export {default as FileAttachments, AbstractFile} from "./fileAttachment.mjs"; export {default as 
Library} from "./library.mjs"; -export {makeQueryTemplate, loadDataSource, arrayIsPrimitive, isDataArray, isDatabaseClient} from "./table.mjs"; +export {makeQueryTemplate, loadDataSource, arrayIsPrimitive, isArrowTable, isDataArray, isDatabaseClient} from "./table.mjs"; diff --git a/src/library.mjs b/src/library.mjs index a8de4208..b01aa89e 100644 --- a/src/library.mjs +++ b/src/library.mjs @@ -17,7 +17,7 @@ import svg from "./svg.mjs"; import tex from "./tex.mjs"; import vegalite from "./vegalite.mjs"; import width from "./width.mjs"; -import {arquero, arrow, d3, graphviz, htl, inputs, lodash, plot, topojson} from "./dependencies.mjs"; +import {arquero, arrow4, d3, graphviz, htl, inputs, lodash, plot, topojson} from "./dependencies.mjs"; import {__query} from "./table.mjs"; export default Object.assign(Object.defineProperties(function Library(resolver) { @@ -39,8 +39,8 @@ export default Object.assign(Object.defineProperties(function Library(resolver) // Recommended libraries // https://observablehq.com/@observablehq/recommended-libraries _: () => require(lodash.resolve()), - aq: () => require.alias({"apache-arrow": arrow.resolve()})(arquero.resolve()), - Arrow: () => require(arrow.resolve()), + aq: () => require.alias({"apache-arrow": arrow4.resolve()})(arquero.resolve()), + Arrow: () => require(arrow4.resolve()), d3: () => require(d3.resolve()), Inputs: () => require(inputs.resolve()).then(Inputs => ({...Inputs, file: Inputs.fileOf(AbstractFile)})), L: () => leaflet(require), diff --git a/src/require.mjs b/src/require.mjs index d75635a8..a826ef1e 100644 --- a/src/require.mjs +++ b/src/require.mjs @@ -1,5 +1,8 @@ import {require as initialRequire, requireFrom} from "d3-require"; +// TODO Allow this to be overridden using the Library’s resolver. 
+export const cdn = "https://cdn.observableusercontent.com/npm/"; + export let requireDefault = initialRequire; export function setDefaultRequire(require) { diff --git a/src/table.mjs b/src/table.mjs index b2fe1072..d9a5fc1c 100644 --- a/src/table.mjs +++ b/src/table.mjs @@ -23,6 +23,20 @@ export function isDatabaseClient(value, mode) { ); } +// Returns true if the value is an Apache Arrow table. This uses a “duck” test +// (instead of strict instanceof) because we want it to work with a range of +// Apache Arrow versions, 7.0.0 or above. +// https://arrow.apache.org/docs/7.0/js/classes/Arrow_dom.Table.html +export function isArrowTable(value) { + return ( + value && + typeof value.getChild === "function" && + typeof value.toArray === "function" && + value.schema && + Array.isArray(value.schema.fields) + ); +} + // Returns true if the value is a typed array (for a single-column table), or if // it’s an array. In the latter case, the elements of the array must be // consistently typed: either plain objects or primitives or dates. 
@@ -145,6 +159,7 @@ export const __query = Object.assign( source = await loadDataSource(await source, "table"); if (isDatabaseClient(source)) return evaluateQuery(source, makeQueryTemplate(operations, source), invalidation); if (isDataArray(source)) return __table(source, operations); + if (isArrowTable(source)) return __arrow(source, operations); if (!source) throw new Error("missing data source"); throw new Error("invalid data source"); }, @@ -164,6 +179,7 @@ export async function loadDataSource(source, mode) { case "text/csv": return source.csv({typed: true}); case "text/tab-separated-values": return source.tsv({typed: true}); case "application/json": return source.json(); + default: if (/\.arrow$/i.test(source.name)) return source.arrow({version: 9}); } } if (mode === "table" || mode === "sql") { @@ -390,8 +406,17 @@ function likeOperand(operand) { return {...operand, value: `%${operand.value}%`}; } +// This function applies table cell operations to an in-memory Apache Arrow +// table; it should be equivalent to the corresponding SQL query. +function __arrow(source, operations) { + operations; + return source; // TODO +} + // This function applies table cell operations to an in-memory table (array of -// objects); it should be equivalent to the corresponding SQL query. +// objects); it should be equivalent to the corresponding SQL query. TODO This +// is only exported for testing, but we should be testing the public __query +// instead of this internal method. export function __table(source, operations) { const input = source; let {schema, columns} = source;