diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am index 16d07031427..9148f8a583a 100644 --- a/c_glib/arrow-glib/Makefile.am +++ b/c_glib/arrow-glib/Makefile.am @@ -16,6 +16,7 @@ # under the License. CLEANFILES = +DISTCLEANFILES = EXTRA_DIST = \ meson.build @@ -169,6 +170,10 @@ BUILT_SOURCES = \ stamp-enums.c \ stamp-enums.h +DISTCLEANFILES += \ + stamp-enums.c \ + stamp-enums.h + EXTRA_DIST += \ enums.c.template \ enums.h.template @@ -214,7 +219,7 @@ INTROSPECTION_SCANNER_ARGS = INTROSPECTION_SCANNER_ENV = if USE_ARROW_BUILD_DIR INTROSPECTION_SCANNER_ENV += \ - LD_LIBRARY_PATH=$(ARROW_LIB_DIR):$${PKG_CONFIG_PATH} + LD_LIBRARY_PATH=$(ARROW_LIB_DIR):$${LD_LIBRARY_PATH} endif if OS_MACOS INTROSPECTION_SCANNER_ENV += \ diff --git a/cpp/apidoc/HDFS.md b/cpp/apidoc/HDFS.md index d54ad270c05..d3671fb7691 100644 --- a/cpp/apidoc/HDFS.md +++ b/cpp/apidoc/HDFS.md @@ -50,6 +50,10 @@ export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` * `ARROW_LIBHDFS_DIR` (optional): explicit location of `libhdfs.so` if it is installed somewhere other than `$HADOOP_HOME/lib/native`. +To accommodate distribution-specific nuances, the `JAVA_HOME` variable may be +set to the root path for the Java SDK, the JRE path itself, or to the directory +containing the `libjvm` library. + ### Mac Specifics The installed location of Java on OS X can vary, however the following snippet diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index 0339ec56ae2..8bbe05f127f 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -60,8 +60,22 @@ if(PARQUET_HOME) PATHS ${PARQUET_HOME} NO_DEFAULT_PATH PATH_SUFFIXES "lib") get_filename_component(PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) - set(PARQUET_ABI_VERSION "1.0.0") - set(PARQUET_SO_VERSION "1") + + # Try to autodiscover the Parquet ABI version + get_filename_component(PARQUET_LIB_REALPATH ${PARQUET_LIBRARIES} REALPATH) + get_filename_component(PARQUET_EXT_REALPATH ${PARQUET_LIB_REALPATH} EXT) + string(REGEX MATCH ".([0-9]+.[0-9]+.[0-9]+)" HAS_ABI_VERSION ${PARQUET_EXT_REALPATH}) + if (HAS_ABI_VERSION) + if (APPLE) + string(REGEX REPLACE ".([0-9]+.[0-9]+.[0-9]+).dylib" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH}) + else() + string(REGEX REPLACE ".so.([0-9]+.[0-9]+.[0-9]+)" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH}) + endif() + string(REGEX REPLACE "([0-9]+).[0-9]+.[0-9]+" "\\1" PARQUET_SO_VERSION ${PARQUET_ABI_VERSION}) + else() + set(PARQUET_ABI_VERSION "1.0.0") + set(PARQUET_SO_VERSION "1") + endif() else() pkg_check_modules(PARQUET parquet) if (PARQUET_FOUND) diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc index 5fd2706f046..398cc06363a 100644 --- a/cpp/src/arrow/buffer-test.cc +++ b/cpp/src/arrow/buffer-test.cc @@ -194,4 +194,29 @@ TEST(TestBuffer, SliceMutableBuffer) { ASSERT_TRUE(slice->Equals(expected)); } +TEST(TestBufferBuilder, ResizeReserve) { + const std::string data = "some data"; + auto data_ptr = data.c_str(); + + BufferBuilder builder; + + ASSERT_OK(builder.Append(data_ptr, 9)); + ASSERT_EQ(9, builder.length()); + + ASSERT_OK(builder.Resize(128)); + ASSERT_EQ(128, builder.capacity()); + + // Do not shrink to fit + ASSERT_OK(builder.Resize(64, false)); + ASSERT_EQ(128, builder.capacity()); + + // Shrink to fit + ASSERT_OK(builder.Resize(64)); + ASSERT_EQ(64, builder.capacity()); + + // Reserve elements + ASSERT_OK(builder.Reserve(60)); + ASSERT_EQ(128, builder.capacity()); +} + } // namespace arrow diff --git 
a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 450a4c78b5b..b50b1a1aa04 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -25,6 +25,7 @@ #include #include +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" @@ -32,13 +33,12 @@ namespace arrow { -class MemoryPool; - // ---------------------------------------------------------------------- // Buffer classes -/// Immutable API for a chunk of bytes which may or may not be owned by the -/// class instance. +/// \class Buffer +/// \brief Object containing a pointer to a piece of contiguous memory with a +/// particular size. Base class does not own its memory /// /// Buffers have two related notions of length: size and capacity. Size is /// the number of bytes that might have valid data. Capacity is the number @@ -133,7 +133,8 @@ ARROW_EXPORT std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer, const int64_t offset, const int64_t length); -/// A Buffer whose contents can be mutated. May or may not own its data. +/// \class MutableBuffer +/// \brief A Buffer whose contents can be mutated. May or may not own its data. class ARROW_EXPORT MutableBuffer : public Buffer { public: MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { @@ -148,6 +149,8 @@ class ARROW_EXPORT MutableBuffer : public Buffer { MutableBuffer() : Buffer(NULLPTR, 0) {} }; +/// \class ResizableBuffer +/// \brief A mutable buffer that can be resized class ARROW_EXPORT ResizableBuffer : public MutableBuffer { public: /// Change buffer reported size to indicated size, allocating memory if @@ -190,13 +193,22 @@ class ARROW_EXPORT PoolBuffer : public ResizableBuffer { MemoryPool* pool_; }; +/// \class BufferBuilder +/// \brief A class for incrementally building a contiguous chunk of in-memory data class ARROW_EXPORT BufferBuilder { public: - explicit BufferBuilder(MemoryPool* pool) + explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} - /// Resizes the buffer to the nearest multiple of 64 bytes per Layout.md - Status Resize(const int64_t elements) { + /// \brief Resizes the buffer to the nearest multiple of 64 bytes + /// + /// \param elements the new capacity of the builder. Will be rounded + /// up to a multiple of 64 bytes for padding + /// \param shrink_to_fit if the new capacity is smaller than the existing size, + /// reallocate the internal buffer. 
Set to false to avoid reallocations when + /// shrinking the builder + /// \return Status + Status Resize(const int64_t elements, bool shrink_to_fit = true) { // Resize(0) is a no-op if (elements == 0) { return Status::OK(); } @@ -205,7 +217,7 @@ class ARROW_EXPORT BufferBuilder { buffer_ = std::make_shared<PoolBuffer>(pool_); } int64_t old_capacity = capacity_; - RETURN_NOT_OK(buffer_->Resize(elements)); + RETURN_NOT_OK(buffer_->Resize(elements, shrink_to_fit)); capacity_ = buffer_->capacity(); data_ = buffer_->mutable_data(); if (capacity_ > old_capacity) { @@ -214,7 +226,14 @@ return Status::OK(); } - Status Append(const uint8_t* data, int64_t length) { + /// \brief Ensure that the builder can accommodate the additional number of bytes + /// without the need to perform allocations + /// + /// \param size number of additional bytes to make space for + /// \return Status + Status Reserve(const int64_t size) { return Resize(size_ + size, false); } + + Status Append(const void* data, int64_t length) { if (capacity_ < length + size_) { int64_t new_capacity = BitUtil::NextPower2(length + size_); RETURN_NOT_OK(Resize(new_capacity)); @@ -248,7 +267,7 @@ } // Unsafe methods don't check existing size - void UnsafeAppend(const uint8_t* data, int64_t length) { + void UnsafeAppend(const void* data, int64_t length) { memcpy(data_ + size_, data, static_cast<size_t>(length)); size_ += length; } diff --git a/cpp/src/arrow/io/hdfs-internal.cc b/cpp/src/arrow/io/hdfs-internal.cc index 9cd1c5052fe..545b2d17d2e 100644 --- a/cpp/src/arrow/io/hdfs-internal.cc +++ b/cpp/src/arrow/io/hdfs-internal.cc @@ -147,7 +147,7 @@ static std::vector get_potential_libjvm_paths() { file_name = "jvm.dll"; #elif __APPLE__ search_prefixes = {""}; - search_suffixes = {"", "/jre/lib/server"}; + search_suffixes = {"", "/jre/lib/server", "/lib/server"}; file_name = "libjvm.dylib"; // SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are @@ -175,7 +175,7 @@ "/usr/lib/jvm/default", // alt centos "/usr/java/latest", // alt centos }; - search_suffixes = {"/jre/lib/amd64/server"}; + search_suffixes = {"", "/jre/lib/amd64/server", "/lib/amd64/server"}; file_name = "libjvm.so"; #endif // From direct environment variable diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml index 4b901489400..a73fd1bfbba 100644 --- a/dev/docker-compose.yml +++ b/dev/docker-compose.yml @@ -17,7 +17,7 @@ version: '3' services: gen_apidocs: - build: + build: context: gen_apidocs volumes: - ../..:/apache-arrow @@ -29,7 +29,7 @@ services: volumes: - ../..:/apache-arrow dask_integration: - build: + build: context: dask_integration volumes: - ../..:/apache-arrow diff --git a/dev/gen_apidocs/Dockerfile b/dev/gen_apidocs/Dockerfile index ca4718e6378..da740ee0773 100644 --- a/dev/gen_apidocs/Dockerfile +++ b/dev/gen_apidocs/Dockerfile @@ -14,19 +14,24 @@ # See the License for the specific language governing permissions and # limitations under the License.
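+# Illustrative JAVA_HOME values (example paths only) matching the expanded
+# libjvm search above and the HDFS.md note: a JDK root, the JRE itself, or
+# the directory that contains libjvm.so all resolve:
+#   export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
+#   export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64/jre
+#   export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64/jre/lib/amd64/server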
# -FROM ubuntu:14.04 -# Prerequsites for apt-add-repository -RUN apt-get update && apt-get install -y \ - software-properties-common python-software-properties +FROM ubuntu:16.04 + # Basic OS dependencies -RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ - apt-get update && apt-get install -y \ +RUN apt-get update && apt-get install -y \ wget \ rsync \ git \ gcc-4.9 \ g++-4.9 \ - build-essential + build-essential \ + software-properties-common + +# Java build fails with default JDK8 +RUN add-apt-repository ppa:openjdk-r/ppa &&\ + apt-get update &&\ + apt-get install -y openjdk-7-jdk &&\ + update-java-alternatives -s java-1.7.0-openjdk-amd64 + # This will install conda in /home/ubuntu/miniconda RUN wget -O /tmp/miniconda.sh \ https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ @@ -73,6 +78,7 @@ RUN /home/ubuntu/miniconda/bin/conda create -y -q -n pyarrow-dev \ doxygen \ maven \ -c conda-forge + ADD . /apache-arrow WORKDIR /apache-arrow CMD arrow/dev/gen_apidocs/create_documents.sh diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh index 566d9cee79c..54031262b3a 100755 --- a/dev/gen_apidocs/create_documents.sh +++ b/dev/gen_apidocs/create_documents.sh @@ -27,6 +27,7 @@ export ARROW_HOME=$(pwd)/dist export PARQUET_HOME=$(pwd)/dist CONDA_BASE=/home/ubuntu/miniconda export LD_LIBRARY_PATH=$(pwd)/dist/lib:${CONDA_BASE}/lib:${LD_LIBRARY_PATH} +export PKG_CONFIG_PATH=$(pwd)/dist/lib/pkgconfig:${PKG_CONFIG_PATH} export PATH=${CONDA_BASE}/bin:${PATH} # Prepare the asf-site before copying api docs @@ -38,16 +39,38 @@ git clone --branch=asf-site \ https://git-wip-us.apache.org/repos/asf/arrow-site.git asf-site popd +# Make Java documentation +export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64 +wget http://mirrors.gigenet.com/apache/maven/maven-3/3.5.2/binaries/apache-maven-3.5.2-bin.tar.gz +tar xvf apache-maven-3.5.2-bin.tar.gz +export PATH=$(pwd)/apache-maven-3.5.2/bin:$PATH + +pushd arrow/java +rm -rf target/site/apidocs/* +mvn -Drat.skip=true install +mvn -Drat.skip=true site +mkdir -p ../site/asf-site/docs/java/ +rsync -r target/site/apidocs/ ../site/asf-site/docs/java/ +popd + # Make Python documentation (Depends on C++ ) # Build Arrow C++ source activate pyarrow-dev export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX +export BOOST_ROOT=$CONDA_PREFIX export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX +export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:${LD_LIBRARY_PATH} +export PKG_CONFIG_PATH=$CONDA_PREFIX/lib/pkgconfig:${PKG_CONFIG_PATH} + +export CC=gcc-4.9 +export CXX=g++-4.9 -rm -rf arrow/cpp/build_docs -mkdir arrow/cpp/build_docs -pushd arrow/cpp/build_docs +CPP_BUILD_DIR=$(pwd)/arrow/cpp/build_docs + +rm -rf $CPP_BUILD_DIR +mkdir $CPP_BUILD_DIR +pushd $CPP_BUILD_DIR cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DARROW_PYTHON=on \ @@ -58,6 +81,28 @@ make -j4 make install popd +# Build c_glib documentation +pushd arrow/c_glib +if [ -f Makefile ]; then + # Update timestamps to prevent an automatic re-configure + touch configure **/Makefile + make distclean + # Work around 'make distclean' removing doc/reference/xml/ + git checkout doc/reference/xml +fi +./autogen.sh +rm -rf build_docs +mkdir build_docs +pushd build_docs +../configure \ + --prefix=${ARROW_HOME} \ + --enable-gtk-doc +make -j4 GTK_DOC_V_XREF=": " +mkdir -p ../../site/asf-site/docs/c_glib +rsync -r doc/reference/html/ ../../site/asf-site/docs/c_glib +popd +popd + # Build Parquet C++ rm -rf parquet-cpp/build_docs mkdir parquet-cpp/build_docs @@ -83,19 +128,6 
@@ mkdir -p ../site/asf-site/docs/python rsync -r doc/_build/html/ ../site/asf-site/docs/python popd -# Build c_glib documentation -pushd arrow/c_glib -rm -rf doc/reference/html/* -./autogen.sh -./configure \ - --with-arrow-cpp-build-dir=$(pwd)/../cpp/build \ - --with-arrow-cpp-build-type=$ARROW_BUILD_TYPE \ - --enable-gtk-doc -LD_LIBRARY_PATH=$(pwd)/../cpp/build/$ARROW_BUILD_TYPE make GTK_DOC_V_XREF=": " -mkdir -p ../site/asf-site/docs/c_glib -rsync -r doc/reference/html/ ../site/asf-site/docs/c_glib -popd - # Make C++ documentation pushd arrow/cpp/apidoc rm -rf html/* @@ -103,12 +135,3 @@ doxygen Doxyfile mkdir -p ../../site/asf-site/docs/cpp rsync -r html/ ../../site/asf-site/docs/cpp popd - -# Make Java documentation -pushd arrow/java -rm -rf target/site/apidocs/* -mvn -Drat.skip=true install -mvn -Drat.skip=true site -mkdir -p ../site/asf-site/docs/java/ -rsync -r target/site/apidocs/ ../site/asf-site/docs/java/ -popd diff --git a/dev/release/RELEASE_MANAGEMENT.md b/dev/release/RELEASE_MANAGEMENT.md index 73eaf5f95b3..0f8c2202fef 100644 --- a/dev/release/RELEASE_MANAGEMENT.md +++ b/dev/release/RELEASE_MANAGEMENT.md @@ -112,6 +112,15 @@ software must be built in order to create the documentation, so this step may take some time to run, especially the first time around as the Docker container will also have to be built. +To upload the updated documentation to the website, navigate to `site/asf-site` +and commit all changes: + +``` +pushd site/asf-site +git add . +git commit -m "Updated API documentation for version X.Y.Z" +``` + After successfully creating the API documentation the website can be run locally to browse the API documentation from the top level `Documentation` menu. To run the website issue the command: diff --git a/js/bin/integration.js b/js/bin/integration.js index fe32433d384..2aeb14d0e34 100755 --- a/js/bin/integration.js +++ b/js/bin/integration.js @@ -17,6 +17,8 @@ // specific language governing permissions and limitations // under the License. 
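+// Example invocations for the multi-file flags defined below (file names are
+// placeholders): JSON and Arrow paths pair up by position, and validate mode
+// with no flags discovers pairs under test/data:
+//   node bin/integration.js --mode validate -j one.json -a one.arrow -j two.json -a two.arrow
+//   node bin/integration.js --mode validate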
+var fs = require('fs'); +var glob = require('glob'); var path = require('path'); var gulp = require.resolve(path.join(`..`, `node_modules/gulp/bin/gulp.js`)); var child_process = require(`child_process`); @@ -29,12 +31,14 @@ var optionList = [ { type: String, name: 'arrow', alias: 'a', - description: 'The Arrow file to read/write' + multiple: true, defaultValue: [], + description: 'The Arrow file[s] to read/write' }, { type: String, name: 'json', alias: 'j', - description: 'The JSON file to read/write' + multiple: true, defaultValue: [], + description: 'The JSON file[s] to read/write' } ]; @@ -66,20 +70,60 @@ function print_usage() { process.exit(1); } -if (!argv.arrow || !argv.json || !argv.mode) { +let jsonPaths = argv.json; +let arrowPaths = argv.arrow; + +if (!argv.mode) { + return print_usage(); +} + +let mode = argv.mode.toUpperCase(); +if (mode === 'VALIDATE' && !jsonPaths.length) { + jsonPaths = glob.sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)); + if (!arrowPaths.length) { + [jsonPaths, arrowPaths] = jsonPaths.reduce(([jsonPaths, arrowPaths], jsonPath) => { + const { name } = path.parse(jsonPath); + for (const source of ['cpp', 'java']) { + for (const format of ['file', 'stream']) { + const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); + if (fs.existsSync(arrowPath)) { + jsonPaths.push(jsonPath); + arrowPaths.push(arrowPath); + console.log('-j', jsonPath, '-a', arrowPath, '\\'); + } + } + } + return [jsonPaths, arrowPaths]; + }, [[], []]); + } +} else if (!jsonPaths.length) { return print_usage(); } -switch (argv.mode.toUpperCase()) { +switch (mode) { case 'VALIDATE': + const args = [`test`, `-i`].concat(argv._unknown || []); + jsonPaths.forEach((p, i) => { + args.push('-j', p, '-a', arrowPaths[i]); + }); child_process.spawnSync( - gulp, - [`test`, `-i`].concat(process.argv.slice(2)), + gulp, args, { cwd: path.resolve(__dirname, '..'), stdio: ['ignore', 'inherit', 'inherit'] } ); + // for (let i = -1, n = jsonPaths.length; ++i < n;) { + // const jsonPath = jsonPaths[i]; + // const arrowPath = arrowPaths[i]; + // child_process.spawnSync( + // gulp, args.concat(['-j', jsonPath, '-a', arrowPath]), + // { + // cwd: path.resolve(__dirname, '..'), + // stdio: ['ignore', 'inherit', 'inherit'] + // } + // ); + // } break; default: print_usage(); diff --git a/js/gulp/argv.js b/js/gulp/argv.js index 6f80912e97e..8a83820c1fe 100644 --- a/js/gulp/argv.js +++ b/js/gulp/argv.js @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. 
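+// Note on the block below: when `--coverage` is passed without explicit
+// -j/--json_files, JSON files are globbed from test/data/json/ and paired
+// with every matching .arrow file under test/data/{cpp,java}/{file,stream}.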
+const fs = require('fs'); +const glob = require('glob'); +const path = require('path'); + const argv = require(`command-line-args`)([ { name: `all`, type: Boolean }, { name: 'update', alias: 'u', type: Boolean }, @@ -22,13 +26,11 @@ const argv = require(`command-line-args`)([ { name: `target`, type: String, defaultValue: `` }, { name: `module`, type: String, defaultValue: `` }, { name: `coverage`, type: Boolean, defaultValue: false }, - { name: `json_file`, alias: `j`, type: String, defaultValue: null }, - { name: `arrow_file`, alias: `a`, type: String, defaultValue: null }, { name: `integration`, alias: `i`, type: Boolean, defaultValue: false }, { name: `targets`, alias: `t`, type: String, multiple: true, defaultValue: [] }, { name: `modules`, alias: `m`, type: String, multiple: true, defaultValue: [] }, - { name: `sources`, alias: `s`, type: String, multiple: true, defaultValue: [`cpp`, `java`] }, - { name: `formats`, alias: `f`, type: String, multiple: true, defaultValue: [`file`, `stream`] }, + { name: `json_files`, alias: `j`, type: String, multiple: true, defaultValue: [] }, + { name: `arrow_files`, alias: `a`, type: String, multiple: true, defaultValue: [] }, ], { partial: true }); const { targets, modules } = argv; @@ -38,4 +40,25 @@ argv.module && !modules.length && modules.push(argv.module); (argv.all || !targets.length) && targets.push(`all`); (argv.all || !modules.length) && modules.push(`all`); +if (argv.coverage && (!argv.json_files || !argv.json_files.length)) { + + let [jsonPaths, arrowPaths] = glob + .sync(path.resolve(__dirname, `../test/data/json/`, `*.json`)) + .reduce((paths, jsonPath) => { + const { name } = path.parse(jsonPath); + const [jsonPaths, arrowPaths] = paths; + ['cpp', 'java'].forEach((source) => ['file', 'stream'].forEach((format) => { + const arrowPath = path.resolve(__dirname, `../test/data/${source}/${format}/${name}.arrow`); + if (fs.existsSync(arrowPath)) { + jsonPaths.push(jsonPath); + arrowPaths.push(arrowPath); + } + })); + return paths; + }, [[], []]); + + argv.json_files = jsonPaths; + argv.arrow_files = arrowPaths; +} + module.exports = { argv, targets, modules }; diff --git a/js/gulp/closure-task.js b/js/gulp/closure-task.js index 1bd872fd304..9633d7199cf 100644 --- a/js/gulp/closure-task.js +++ b/js/gulp/closure-task.js @@ -36,7 +36,7 @@ const closureTask = ((cache) => memoizeTask(cache, function closure(target, form const src = targetDir(target, `cls`); const out = targetDir(target, format); const entry = path.join(src, mainExport); - const externs = path.join(src, `${mainExport}.externs`); + const externs = path.join(`src/Arrow.externs.js`); return observableFromStreams( gulp.src([ /* external libs first --> */ `node_modules/tslib/package.json`, @@ -46,7 +46,6 @@ const closureTask = ((cache) => memoizeTask(cache, function closure(target, form `node_modules/text-encoding-utf-8/package.json`, `node_modules/text-encoding-utf-8/src/encoding.js`, /* then sources globs --> */ `${src}/**/*.js`, -/* and exclusions last --> */ `!${src}/Arrow.externs.js`, ], { base: `./` }), sourcemaps.init(), closureCompiler(createClosureArgs(entry, externs)), @@ -60,11 +59,11 @@ const closureTask = ((cache) => memoizeTask(cache, function closure(target, form }))({}); const createClosureArgs = (entry, externs) => ({ + externs, third_party: true, warning_level: `QUIET`, dependency_mode: `STRICT`, rewrite_polyfills: false, - externs: `${externs}.js`, entry_point: `${entry}.js`, module_resolution: `NODE`, // formatting: `PRETTY_PRINT`, debug: true, diff --git 
a/js/gulp/package-task.js b/js/gulp/package-task.js index 2976d0ad45d..c42b3fc3233 100644 --- a/js/gulp/package-task.js +++ b/js/gulp/package-task.js @@ -45,10 +45,11 @@ const createMainPackageJson = (target, format) => (orig) => ({ ...createTypeScriptPackageJson(target, format)(orig), name: npmPkgName, main: mainExport, + types: `${mainExport}.d.ts`, module: `${mainExport}.mjs`, dist: `${mainExport}.es5.min.js`, [`dist:es2015`]: `${mainExport}.es2015.min.js`, - [`@std/esm`]: { esm: `mjs` } + [`@std/esm`]: { esm: `mjs`, warnings: false, sourceMap: true } }); const createTypeScriptPackageJson = (target, format) => (orig) => ({ @@ -63,18 +64,20 @@ const createTypeScriptPackageJson = (target, format) => (orig) => ({ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) => conditionallyAddStandardESMEntry(target, format)( - packageJSONFields.reduce( - (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), - { name: `${npmOrgName}/${packageName(target, format)}`, - version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, - dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined } - ) + packageJSONFields.reduce( + (xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }), + { + name: `${npmOrgName}/${packageName(target, format)}`, + version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`, + dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined + } + ) ) ); const conditionallyAddStandardESMEntry = (target, format) => (packageJSON) => ( - format !== `esm` - ? packageJSON - : { ...packageJSON, [`@std/esm`]: { esm: `js` } } + format !== `esm` && format !== `cls` + ? packageJSON + : { ...packageJSON, [`@std/esm`]: { esm: `js`, warnings: false, sourceMap: true } } ); \ No newline at end of file diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js index ab280b09263..7f655548eb8 100644 --- a/js/gulp/test-task.js +++ b/js/gulp/test-task.js @@ -44,15 +44,15 @@ const testOptions = { const testTask = ((cache, execArgv, testOptions) => memoizeTask(cache, function test(target, format, debug = false) { const opts = { ...testOptions }; const args = !debug ? [...execArgv] : [...debugArgv, ...execArgv]; - args.push(`test/${argv.integration ? `integration/*` : `unit/*`}`); + if (!argv.coverage) { + args.push(`test/${argv.integration ? `integration/*` : `unit/*`}`); + } opts.env = { ...opts.env, TEST_TARGET: target, TEST_MODULE: format, - JSON_PATH: argv.json_file, - ARROW_PATH: argv.arrow_file, TEST_TS_SOURCE: !!argv.coverage, - TEST_SOURCES: JSON.stringify(Array.isArray(argv.sources) ? argv.sources : [argv.sources]), - TEST_FORMATS: JSON.stringify(Array.isArray(argv.formats) ? argv.formats : [argv.formats]), + JSON_PATHS: JSON.stringify(Array.isArray(argv.json_files) ? argv.json_files : [argv.json_files]), + ARROW_PATHS: JSON.stringify(Array.isArray(argv.arrow_files) ? argv.arrow_files : [argv.arrow_files]), }; return !debug ? 
child_process.spawn(jest, args, opts) : diff --git a/js/gulp/typescript-task.js b/js/gulp/typescript-task.js index 8b755cf7f16..c42357adb2f 100644 --- a/js/gulp/typescript-task.js +++ b/js/gulp/typescript-task.js @@ -34,7 +34,7 @@ const typescriptTask = ((cache) => memoizeTask(cache, function typescript(target const tsProject = ts.createProject(path.join(`tsconfig`, tsconfigFile), { typescript: require(`typescript`) }); const { stream: { js, dts } } = observableFromStreams( tsProject.src(), sourcemaps.init(), - tsProject(ts.reporter.fullReporter(true)) + tsProject(ts.reporter.defaultReporter()) ); const writeDTypes = observableFromStreams(dts, gulp.dest(out)); const writeJS = observableFromStreams(js, sourcemaps.write(), gulp.dest(out)); @@ -52,12 +52,12 @@ function maybeCopyRawJSArrowFormatFiles(target, format) { return Observable.empty(); } return Observable.defer(async () => { - const outFormatDir = path.join(targetDir(target, format), `format`, `fb`); + const outFormatDir = path.join(targetDir(target, format), `fb`); await del(path.join(outFormatDir, '*.js')); await observableFromStreams( - gulp.src(path.join(`src`, `format`, `fb`, `*_generated.js`)), + gulp.src(path.join(`src`, `fb`, `*_generated.js`)), gulpRename((p) => { p.basename = p.basename.replace(`_generated`, ``); }), gulp.dest(outFormatDir) ).toPromise(); }); -} \ No newline at end of file +} diff --git a/js/gulp/uglify-task.js b/js/gulp/uglify-task.js index 5c605cb7882..988830f31bd 100644 --- a/js/gulp/uglify-task.js +++ b/js/gulp/uglify-task.js @@ -29,7 +29,7 @@ const webpack = require(`webpack`); const { memoizeTask } = require('./memoize-task'); const { Observable, ReplaySubject } = require('rxjs'); const UglifyJSPlugin = require(`uglifyjs-webpack-plugin`); -const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js` }); +const esmRequire = require(`@std/esm`)(module, { cjs: true, esm: `js`, warnings: false }); const uglifyTask = ((cache, commonConfig) => memoizeTask(cache, function uglifyJS(target, format) { @@ -84,11 +84,19 @@ module.exports = uglifyTask; module.exports.uglifyTask = uglifyTask; const reservePublicNames = ((ESKeywords) => function reservePublicNames(target, format) { - const publicModulePath = `../${targetDir(target, format)}/${mainExport}.js`; - return [ - ...ESKeywords, - ...reserveExportedNames(esmRequire(publicModulePath)) + const src = targetDir(target, format); + const publicModulePaths = [ + `../${src}/data.js`, + `../${src}/type.js`, + `../${src}/table.js`, + `../${src}/vector.js`, + `../${src}/util/int.js`, + `../${src}/recordbatch.js`, + `../${src}/${mainExport}.js`, ]; + return publicModulePaths.reduce((keywords, publicModulePath) => [ + ...keywords, ...reserveExportedNames(esmRequire(publicModulePath, { warnings: false })) + ], [...ESKeywords]); })(ESKeywords); // Reflect on the Arrow modules to come up with a list of keys to save from Uglify's @@ -104,8 +112,8 @@ const reserveExportedNames = (entryModule) => ( .map((name) => [name, entryModule[name]]) .reduce((reserved, [name, value]) => { const fn = function() {}; - const ownKeys = value && Object.getOwnPropertyNames(value) || []; - const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype) || []; + const ownKeys = value && typeof value === 'object' && Object.getOwnPropertyNames(value) || []; + const protoKeys = typeof value === `function` && Object.getOwnPropertyNames(value.prototype || {}) || []; const publicNames = [...ownKeys, ...protoKeys].filter((x) => x !== `default` && x !== `undefined` 
&& !(x in fn)); return [...reserved, name, ...publicNames]; }, [] diff --git a/js/gulp/util.js b/js/gulp/util.js index ba6ebece51b..f35a447e708 100644 --- a/js/gulp/util.js +++ b/js/gulp/util.js @@ -87,7 +87,7 @@ const ESKeywords = [ // EventTarget `addListener`, `removeListener`, `addEventListener`, `removeEventListener`, // Arrow properties - `low`, `high`, `data`, `index`, `field`, `validity`, `columns`, `fieldNode`, `subarray`, + `low`, `high`, `data`, `index`, `field`, `columns`, 'numCols', 'numRows', `values`, `valueOffsets`, `nullBitmap`, `subarray` ]; function taskName(target, format) { @@ -108,14 +108,13 @@ function targetDir(target, format) { function logAndDie(e) { if (e) { - console.error(e); process.exit(1); } } function observableFromStreams(...streams) { - const pumped = streams.length <= 1 ? streams[0] - : pump(...streams, logAndDie); + if (streams.length <= 0) { return Observable.empty(); } + const pumped = streams.length <= 1 ? streams[0] : pump(...streams, logAndDie); const fromEvent = Observable.fromEvent.bind(null, pumped); const streamObs = fromEvent(`data`) .merge(fromEvent(`error`).flatMap((e) => Observable.throw(e))) diff --git a/js/package.json b/js/package.json index 1f59ac1ef98..1c8b23604ab 100644 --- a/js/package.json +++ b/js/package.json @@ -3,7 +3,7 @@ "name": "apache-arrow", "description": "Apache Arrow columnar in-memory format", "bin": { - "arrow2csv": "bin/arrow2csv" + "arrow2csv": "bin/arrow2csv.js" }, "scripts": { "lerna": "lerna", @@ -12,6 +12,7 @@ "clean": "gulp clean", "debug": "gulp debug", "perf": "node ./perf/index.js", + "test:integration": "node ./bin/integration.js --mode validate", "create:perfdata": "python ./test/data/tables/generate.py ./test/data/tables/tracks.arrow", "release": "./npm-release.sh", "clean:all": "run-p clean clean:testdata", @@ -52,18 +53,18 @@ ], "dependencies": { "@types/text-encoding-utf-8": "1.0.1", - "command-line-args": "4.0.7", - "command-line-usage": "4.0.2", + "command-line-args": "5.0.1", + "command-line-usage": "4.1.0", "flatbuffers": "trxcllnt/flatbuffers-esm", "json-bignum": "0.0.3", "text-encoding-utf-8": "^1.0.2", - "tslib": "1.8.1" + "tslib": "1.9.0" }, "devDependencies": { - "@std/esm": "0.19.1", + "@std/esm": "0.19.7", "@types/flatbuffers": "1.6.5", - "@types/glob": "5.0.34", - "@types/jest": "22.0.1", + "@types/glob": "5.0.35", + "@types/jest": "22.1.0", "@types/node": "9.3.0", "ast-types": "0.10.1", "benchmark": "2.1.4", @@ -78,13 +79,13 @@ "gulp-rename": "1.2.2", "gulp-sourcemaps": "2.6.3", "gulp-transform-js-ast": "1.0.2", - "gulp-typescript": "3.2.3", + "gulp-typescript": "3.2.4", "ix": "2.3.4", - "jest": "22.0.5", + "jest": "22.1.4", "jest-environment-node-debug": "2.0.0", "json": "9.0.6", - "lerna": "2.6.0", - "lint-staged": "6.0.0", + "lerna": "2.7.1", + "lint-staged": "6.0.1", "merge2": "1.2.1", "mkdirp": "0.5.1", "npm-run-all": "4.1.2", @@ -101,6 +102,9 @@ "webpack": "3.10.0", "xml2js": "0.4.19" }, + "@std/esm": { + "warnings": false + }, "lint-staged": { "*.@(ts)": [ "tslint --fix", @@ -127,7 +131,7 @@ "lcov" ], "coveragePathIgnorePatterns": [ - "format\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", + "fb\\/(File|Message|Schema|Tensor)_generated\\.(js|ts)$", "test\\/.*\\.(ts|tsx|js)$", "/node_modules/" ], @@ -136,7 +140,8 @@ ".(js|jsx)": "./node_modules/babel-jest/build/index.js" }, "transformIgnorePatterns": [ - "/node_modules/", "/(es2015|esnext)\/umd/" + "/node_modules/", + "/(es2015|esnext)/umd/" ], "testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$" } diff --git 
a/js/perf/index.js b/js/perf/index.js index 29d5edf56de..98f6dfb4000 100644 --- a/js/perf/index.js +++ b/js/perf/index.js @@ -16,48 +16,40 @@ // under the License. // Use the ES5 UMD target as perf baseline -// const { col, Table, readVectors } = require('../targets/es5/umd'); -// const { col, Table, readVectors } = require('../targets/es5/cjs'); -// const { col, Table, readVectors } = require('../targets/es2015/umd'); -const { col, Table, readVectors } = require('../targets/es2015/cjs'); +// const { col, Table, read: readBatches } = require('../targets/es5/umd'); +// const { col, Table, read: readBatches } = require('../targets/es5/cjs'); +// const { col, Table, read: readBatches } = require('../targets/es2015/umd'); +const { col, Table, read: readBatches } = require('../targets/es2015/cjs'); -const config = require('./config'); const Benchmark = require('benchmark'); const suites = []; -for (let { name, buffers} of config) { - const parseSuite = new Benchmark.Suite(`Parse "${name}"`, { async: true }); - const sliceSuite = new Benchmark.Suite(`Slice "${name}" vectors`, { async: true }); - const iterateSuite = new Benchmark.Suite(`Iterate "${name}" vectors`, { async: true }); - const getByIndexSuite = new Benchmark.Suite(`Get "${name}" values by index`, { async: true }); - parseSuite.add(createFromTableTest(name, buffers)); - parseSuite.add(createReadVectorsTest(name, buffers)); - for (const vector of Table.from(buffers).columns) { - sliceSuite.add(createSliceTest(vector)); - iterateSuite.add(createIterateTest(vector)); - getByIndexSuite.add(createGetByIndexTest(vector)); - } - suites.push(getByIndexSuite, iterateSuite, sliceSuite, parseSuite); +for (let { name, buffers } of require('./table_config')) { + const parseSuiteName = `Parse "${name}"`; + const sliceSuiteName = `Slice "${name}" vectors`; + const iterateSuiteName = `Iterate "${name}" vectors`; + const getByIndexSuiteName = `Get "${name}" values by index`; + const sliceToArraySuiteName = `Slice toArray "${name}" vectors`; + suites.push(createTestSuite(parseSuiteName, createFromTableTest(name, buffers))); + suites.push(createTestSuite(parseSuiteName, createReadBatchesTest(name, buffers))); + const table = Table.from(buffers); + suites.push(...table.columns.map((vector, i) => createTestSuite(getByIndexSuiteName, createGetByIndexTest(vector, table.schema.fields[i].name)))); + suites.push(...table.columns.map((vector, i) => createTestSuite(iterateSuiteName, createIterateTest(vector, table.schema.fields[i].name)))); + suites.push(...table.columns.map((vector, i) => createTestSuite(sliceToArraySuiteName, createSliceToArrayTest(vector, table.schema.fields[i].name)))); + suites.push(...table.columns.map((vector, i) => createTestSuite(sliceSuiteName, createSliceTest(vector, table.schema.fields[i].name)))); } for (let {name, buffers, countBys, counts} of require('./table_config')) { const table = Table.from(buffers); - const dfCountBySuite = new Benchmark.Suite(`DataFrame Count By "${name}"`, { async: true }); - for (countBy of countBys) { - dfCountBySuite.add(createDataFrameCountByTest(table, countBy)); - } + const dfCountBySuiteName = `DataFrame Count By "${name}"`; + const dfFilterCountSuiteName = `DataFrame Filter-Scan Count "${name}"`; + const dfDirectCountSuiteName = `DataFrame Direct Count "${name}"`; - const dfFilterCountSuite = new Benchmark.Suite(`DataFrame Filter-Scan Count "${name}"`, { async: true }); - const dfDirectCountSuite = new Benchmark.Suite(`DataFrame Direct Count "${name}"`, { async: true }); - - for (test of counts) { 
- dfFilterCountSuite.add(createDataFrameFilterCountTest(table, test.col, test.test, test.value)) - dfDirectCountSuite.add(createDataFrameDirectCountTest(table, test.col, test.test, test.value)) - } - - suites.push(dfCountBySuite, dfFilterCountSuite, dfDirectCountSuite) + suites.push(...countBys.map((countBy) => createTestSuite(dfCountBySuiteName, createDataFrameCountByTest(table, countBy)))); + suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfFilterCountSuiteName, createDataFrameFilterCountTest(table, col, test, value)))); + suites.push(...counts.map(({ col, test, value }) => createTestSuite(dfDirectCountSuiteName, createDataFrameDirectCountTest(table, col, test, value)))); } console.log('Running apache-arrow performance tests...\n'); @@ -71,7 +63,7 @@ function run() { var str = x.toString(); var meanMsPerOp = Math.round(x.stats.mean * 100000)/100; var sliceOf60FPS = Math.round((meanMsPerOp / (1000/60)) * 100000)/1000; - return `${str} (avg: ${meanMsPerOp}ms, or ${sliceOf60FPS}% of a frame @ 60FPS) ${x.suffix || ''}`; + return `${str}\n avg: ${meanMsPerOp}ms\n ${sliceOf60FPS}% of a frame @ 60FPS ${x.suffix || ''}`; }).join('\n') + '\n'); if (suites.length > 0) { setTimeout(run, 1000); @@ -80,47 +72,60 @@ function run() { .run({ async: true }); } +function createTestSuite(name, test) { + return new Benchmark.Suite(name, { async: true }).add(test); +} + function createFromTableTest(name, buffers) { let table; return { async: true, - name: `Table.from`, + name: `Table.from\n`, fn() { table = Table.from(buffers); } }; } -function createReadVectorsTest(name, buffers) { - let vectors; +function createReadBatchesTest(name, buffers) { + let recordBatch; return { async: true, - name: `readVectors`, - fn() { for (vectors of readVectors(buffers)) {} } + name: `readBatches\n`, + fn() { for (recordBatch of readBatches(buffers)) {} } }; } -function createSliceTest(vector) { +function createSliceTest(vector, name) { let xs; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { xs = vector.slice(); } }; } -function createIterateTest(vector) { +function createSliceToArrayTest(vector, name) { + let xs; + return { + async: true, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, + fn() { xs = vector.slice().toArray(); } + }; +} + +function createIterateTest(vector, name) { let value; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { for (value of vector) {} } }; } -function createGetByIndexTest(vector) { +function createGetByIndexTest(vector, name) { let value; return { async: true, - name: `name: '${vector.name}', length: ${vector.length}, type: ${vector.type}`, + name: `name: '${name}', length: ${vector.length}, type: ${vector.type}\n`, fn() { for (let i = -1, n = vector.length; ++i < n;) { value = vector.get(i); @@ -130,35 +135,35 @@ function createGetByIndexTest(vector) { } function createDataFrameDirectCountTest(table, column, test, value) { - let colidx = table.columns.findIndex((c)=>c.name === column); + let sum, colidx = table.schema.fields.findIndex((c)=>c.name === column); if (test == 'gteq') { op = function () { sum = 0; - for (let batch = -1; ++batch < table.lengths.length;) { - const length = table.lengths[batch]; - + let batches = table.batches; + let numBatches = batches.length; + 
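+      // Scan each record batch: fetch the target column's child vector once
+      // per batch, then compare row-by-row and accumulate the count.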
for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = table.batches[batch]; - + const batch = batches[batchIndex]; + const vector = batch.getChildAt(colidx); // yield all indices - for (let idx = -1; ++idx < length;) { - sum += (columns[colidx].get(idx) >= value); + for (let index = -1; ++index < batch.length;) { + sum += (vector.get(index) >= value); } } } } else if (test == 'eq') { op = function() { sum = 0; - for (let batch = -1; ++batch < table.lengths.length;) { - const length = table.lengths[batch]; - + let batches = table.batches; + let numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = table.batches[batch] - + const batch = batches[batchIndex]; + const vector = batch.getChildAt(colidx); // yield all indices - for (let idx = -1; ++idx < length;) { - sum += (columns[colidx].get(idx) == value); + for (let index = -1; ++index < batch.length;) { + sum += (vector.get(index) === value); } } } @@ -168,17 +173,17 @@ function createDataFrameDirectCountTest(table, column, test, value) { return { async: true, - name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}`, + name: `name: '${column}', length: ${table.numRows}, type: ${table.getColumnAt(colidx).type}, test: ${test}, value: ${value}\n`, fn: op }; } function createDataFrameCountByTest(table, column) { - let colidx = table.columns.findIndex((c)=>c.name === column); + let colidx = table.schema.fields.findIndex((c)=> c.name === column); return { async: true, - name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}`, + name: `name: '${column}', length: ${table.numRows}, type: ${table.getColumnAt(colidx).type}\n`, fn() { table.countBy(column); } @@ -186,7 +191,7 @@ function createDataFrameCountByTest(table, column) { } function createDataFrameFilterCountTest(table, column, test, value) { - let colidx = table.columns.findIndex((c)=>c.name === column); + let colidx = table.schema.fields.findIndex((c)=> c.name === column); let df; if (test == 'gteq') { @@ -199,7 +204,7 @@ function createDataFrameFilterCountTest(table, column, test, value) { return { async: true, - name: `name: '${column}', length: ${table.length}, type: ${table.columns[colidx].type}, test: ${test}, value: ${value}`, + name: `name: '${column}', length: ${table.numRows}, type: ${table.getColumnAt(colidx).type}, test: ${test}, value: ${value}\n`, fn() { df.count(); } diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js new file mode 100644 index 00000000000..a0aff002fdb --- /dev/null +++ b/js/src/Arrow.externs.js @@ -0,0 +1,622 @@ +// @ts-nocheck +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
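+// Why a hand-written externs file: Closure Compiler renames any property it
+// does not see declared in externs, so listing the public Arrow API surface
+// here keeps these names stable in the minified UMD bundles (closure-task.js
+// passes this file to the compiler via `externs`).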
+ +/* tslint:disable */ + +/** + * @fileoverview Closure Compiler externs for Arrow + * @externs + * @suppress {duplicate,checkTypes} + */ +/** @type {symbol} */ +Symbol.iterator; +/** @type {symbol} */ +Symbol.asyncIterator; + +var Table = function() {}; +/** @type {?} */ +Table.from = function() {}; +/** @type {?} */ +Table.fromAsync = function() {}; +/** @type {?} */ +Table.empty = function() {}; +/** @type {?} */ +Table.prototype.schema; +/** @type {?} */ +Table.prototype.columns; +/** @type {?} */ +Table.prototype.length; +/** @type {?} */ +Table.prototype.numCols; +/** @type {?} */ +Table.prototype.get; +/** @type {?} */ +Table.prototype.getColumn; +/** @type {?} */ +Table.prototype.getColumnAt; +/** @type {?} */ +Table.prototype.getColumnIndex; +/** @type {?} */ +Table.prototype.toArray; +/** @type {?} */ +Table.prototype.select; +/** @type {?} */ +Table.prototype.rowsToString; +/** @type {?} */ +Table.prototype.lengths; +/** @type {?} */ +Table.prototype.batches; +/** @type {?} */ +Table.prototype.countBy; +/** @type {?} */ +Table.prototype.scan; +/** @type {?} */ +Table.prototype.get; + +var CountByResult = function() {}; +/** @type {?} */ +CountByResult.prototype.asJSON; + +let Col = function() {}; +/** @type {?} */ +Col.prototype.gteq; +/** @type {?} */ +Col.prototype.lteq; +/** @type {?} */ +Col.prototype.eq; + +var TableToStringIterator = function() {}; +/** @type {?} */ +TableToStringIterator.prototype.pipe; + +var RecordBatch = function() {}; +/** @type {?} */ +RecordBatch.from = function() {}; +/** @type {?} */ +RecordBatch.prototype.numCols; +/** @type {?} */ +RecordBatch.prototype.numRows; +/** @type {?} */ +RecordBatch.prototype.schema; +/** @type {?} */ +RecordBatch.prototype.columns; +/** @type {?} */ +RecordBatch.prototype.select; + +var Vector = function() {}; +/** @type {?} */ +Vector.create = function() {}; +/** @type {?} */ +Vector.prototype.data; +/** @type {?} */ +Vector.prototype.type; +/** @type {?} */ +Vector.prototype.length; +/** @type {?} */ +Vector.prototype.nullCount; +/** @type {?} */ +Vector.prototype.nullBitmap; +/** @type {?} */ +Vector.prototype.isValid; +/** @type {?} */ +Vector.prototype.get; +/** @type {?} */ +Vector.prototype.set; +/** @type {?} */ +Vector.prototype.setData; +/** @type {?} */ +Vector.prototype.toArray; +/** @type {?} */ +Vector.prototype.concat; +/** @type {?} */ +Vector.prototype.slice; +/** @type {?} */ +Vector.prototype.acceptTypeVisitor; + +var BaseInt64 = function() {}; +/** @type {?} */ +BaseInt64.prototype.lessThan; +/** @type {?} */ +BaseInt64.prototype.equals; +/** @type {?} */ +BaseInt64.prototype.greaterThan; +/** @type {?} */ +BaseInt64.prototype.hex; + +var Uint64 = function() {}; +/** @type {?} */ +Uint64.add = function() {}; +/** @type {?} */ +Uint64.multiply = function() {}; +/** @type {?} */ +Uint64.prototype.times; +/** @type {?} */ +Uint64.prototype.plus + +var Int64 = function() {}; +/** @type {?} */ +Int64.add = function() {}; +/** @type {?} */ +Int64.multiply = function() {}; +/** @type {?} */ +Int64.fromString = function() {}; +/** @type {?} */ +Int64.prototype.negate +/** @type {?} */ +Int64.prototype.times +/** @type {?} */ +Int64.prototype.plus +/** @type {?} */ +Int64.prototype.lessThan + +var Int128 = function() {}; +/** @type {?} */ +Int128.add = function() {}; +/** @type {?} */ +Int128.multiply = function() {}; +/** @type {?} */ +Int128.fromString = function() {}; +/** @type {?} */ +Int128.prototype.negate +/** @type {?} */ +Int128.prototype.times +/** @type {?} */ +Int128.prototype.plus +/** @type 
{?} */ +Int128.prototype.hex + +var Type = function() {}; +/** @type {?} */ +Type.NONE = function() {}; +/** @type {?} */ +Type.Null = function() {}; +/** @type {?} */ +Type.Int = function() {}; +/** @type {?} */ +Type.Float = function() {}; +/** @type {?} */ +Type.Binary = function() {}; +/** @type {?} */ +Type.Utf8 = function() {}; +/** @type {?} */ +Type.Bool = function() {}; +/** @type {?} */ +Type.Decimal = function() {}; +/** @type {?} */ +Type.Date = function() {}; +/** @type {?} */ +Type.Time = function() {}; +/** @type {?} */ +Type.Timestamp = function() {}; +/** @type {?} */ +Type.Interval = function() {}; +/** @type {?} */ +Type.List = function() {}; +/** @type {?} */ +Type.Struct = function() {}; +/** @type {?} */ +Type.Union = function() {}; +/** @type {?} */ +Type.FixedSizeBinary = function() {}; +/** @type {?} */ +Type.FixedSizeList = function() {}; +/** @type {?} */ +Type.Map = function() {}; +/** @type {?} */ +Type.Dictionary = function() {}; +/** @type {?} */ +Type.DenseUnion = function() {}; +/** @type {?} */ +Type.SparseUnion = function() {}; + +var DataType = function() {}; +/** @type {?} */ +DataType.isNull = function() {}; +/** @type {?} */ +DataType.isInt = function() {}; +/** @type {?} */ +DataType.isFloat = function() {}; +/** @type {?} */ +DataType.isBinary = function() {}; +/** @type {?} */ +DataType.isUtf8 = function() {}; +/** @type {?} */ +DataType.isBool = function() {}; +/** @type {?} */ +DataType.isDecimal = function() {}; +/** @type {?} */ +DataType.isDate = function() {}; +/** @type {?} */ +DataType.isTime = function() {}; +/** @type {?} */ +DataType.isTimestamp = function() {}; +/** @type {?} */ +DataType.isInterval = function() {}; +/** @type {?} */ +DataType.isList = function() {}; +/** @type {?} */ +DataType.isStruct = function() {}; +/** @type {?} */ +DataType.isUnion = function() {}; +/** @type {?} */ +DataType.isDenseUnion = function() {}; +/** @type {?} */ +DataType.isSparseUnion = function() {}; +/** @type {?} */ +DataType.isFixedSizeBinary = function() {}; +/** @type {?} */ +DataType.isFixedSizeList = function() {}; +/** @type {?} */ +DataType.isMap = function() {}; +/** @type {?} */ +DataType.isDictionary = function() {}; +/** @type {?} */ +DataType.prototype.ArrayType; + +var Schema = function() {}; +/** @type {?} */ +Schema.from = function() {}; +/** @type {?} */ +Schema.prototype.fields; +/** @type {?} */ +Schema.prototype.version; +/** @type {?} */ +Schema.prototype.metadata; +/** @type {?} */ +Schema.prototype.dictionaries; +var Field = function() {}; +/** @type {?} */ +Field.prototype.name; +/** @type {?} */ +Field.prototype.type; +/** @type {?} */ +Field.prototype.nullable; +/** @type {?} */ +Field.prototype.metadata; +var Null = function() {}; +var Int8 = function() {}; +var Int16 = function() {}; +var Int32 = function() {}; +var Int64 = function() {}; +var Uint8 = function() {}; +var Uint16 = function() {}; +var Uint32 = function() {}; +var Uint64 = function() {}; +var Float16 = function() {}; +var Float32 = function() {}; +var Float64 = function() {}; +var Binary = function() {}; +var Utf8 = function() {}; +var Bool = function() {}; +var Decimal = function() {}; +var Date_ = function() {}; +var Time = function() {}; +var Timestamp = function() {}; +var Interval = function() {}; +var List = function() {}; +var Struct = function() {}; +var Union = function() {}; +var DenseUnion = function() {}; +var SparseUnion = function() {}; +var FixedSizeBinary = function() {}; +var FixedSizeList = function() {}; +var Map_ = function() {}; +var 
Dictionary = function() {}; + +var BaseData = function() {}; +/** @type {?} */ +BaseData.prototype.type; +/** @type {?} */ +BaseData.prototype.clone; +/** @type {?} */ +BaseData.prototype.slice; +/** @type {?} */ +BaseData.prototype.length; +/** @type {?} */ +BaseData.prototype.offset; +/** @type {?} */ +BaseData.prototype.typeId; +/** @type {?} */ +BaseData.prototype.childData; +/** @type {?} */ +BaseData.prototype.nullBitmap; +/** @type {?} */ +BaseData.prototype.nullCount; + +var BoolData = function() {}; +var NestedData = function() {}; +var SparseUnionData = function() {}; +var ChunkedData = function() {}; + +var FlatData = function() {}; +/** @type {?} */ +FlatData.prototype.values; + +var FlatListData = function() {}; +/** @type {?} */ +FlatListData.prototype.values; +/** @type {?} */ +FlatListData.prototype.valueOffsets; + +var DictionaryData = function() {}; +/** @type {?} */ +DictionaryData.prototype.indicies; +/** @type {?} */ +DictionaryData.prototype.dictionary; + +var ListData = function() {}; +/** @type {?} */ +ListData.prototype.values; +/** @type {?} */ +ListData.prototype.valueOffsets; + +var UnionData = function() {}; +/** @type {?} */ +UnionData.prototype.typeIds; + +var DenseUnionData = function() {}; +/** @type {?} */ +DenseUnionData.prototype.valueOffsets; + +var ChunkedData = function() {}; +/** @type {?} */ +ChunkedData.computeOffsets = function() {}; + +var FlatVector = function() {}; +/** @type {?} */ +FlatVector.prototype.values; +/** @type {?} */ +FlatVector.prototype.lows; +/** @type {?} */ +FlatVector.prototype.highs; +/** @type {?} */ +FlatVector.prototype.asInt32; + +var ListVectorBase = function() {}; +/** @type {?} */ +ListVectorBase.prototype.values; +/** @type {?} */ +ListVectorBase.prototype.valueOffsets; +/** @type {?} */ +ListVectorBase.prototype.getValueOffset; +/** @type {?} */ +ListVectorBase.prototype.getValueLength; + +var NestedVector = function() {}; +/** @type {?} */ +NestedVector.prototype.childData; +/** @type {?} */ +NestedVector.prototype.getChildAt; + +var NullVector = function() {}; +var BoolVector = function() {}; +/** @type {?} */ +BoolVector.from = function() {}; +/** @type {?} */ +BoolVector.prototype.values; +var IntVector = function() {}; +/** @type {?} */ +IntVector.from = function() {}; + +var FloatVector = function() {}; +/** @type {?} */ +FloatVector.from = function() {}; + +var DateVector = function() {}; +var DecimalVector = function() {}; +var TimeVector = function() {}; +var TimestampVector = function() {}; +var IntervalVector = function() {}; +var BinaryVector = function() {}; +var FixedSizeBinaryVector = function() {}; +var Utf8Vector = function() {}; +var ListVector = function() {}; +var FixedSizeListVector = function() {}; +var MapVector = function() {}; +var StructVector = function() {}; +var UnionVector = function() {}; + +var DictionaryVector = function() {}; +/** @type {?} */ +DictionaryVector.prototype.getKey; +/** @type {?} */ +DictionaryVector.prototype.getValue; + +var FlatView = function() {}; +/** @type {?} */ +FlatView.prototype.get; +/** @type {?} */ +FlatView.prototype.isValid; +/** @type {?} */ +FlatView.prototype.toArray; +/** @type {?} */ +FlatView.prototype.set; +/** @type {?} */ +FlatView.prototype.setData; + +var NullView = function() {}; +/** @type {?} */ +NullView.prototype.get; +/** @type {?} */ +NullView.prototype.isValid; +/** @type {?} */ +NullView.prototype.toArray; +/** @type {?} */ +NullView.prototype.set; +/** @type {?} */ +NullView.prototype.setData; + +var BoolView = function() {}; +/** 
@type {?} */ +BoolView.prototype.get; +/** @type {?} */ +BoolView.prototype.isValid; +/** @type {?} */ +BoolView.prototype.toArray; +/** @type {?} */ +BoolView.prototype.set; +/** @type {?} */ +BoolView.prototype.setData; + +var ValidityView = function() {}; +/** @type {?} */ +ValidityView.prototype.get; +/** @type {?} */ +ValidityView.prototype.isValid; +/** @type {?} */ +ValidityView.prototype.toArray; +/** @type {?} */ +ValidityView.prototype.set; +/** @type {?} */ +ValidityView.prototype.setData; + +var DictionaryView = function() {}; +/** @type {?} */ +DictionaryView.prototype.get; +/** @type {?} */ +DictionaryView.prototype.isValid; +/** @type {?} */ +DictionaryView.prototype.toArray; +/** @type {?} */ +DictionaryView.prototype.set; +/** @type {?} */ +DictionaryView.prototype.setData; + +var ListViewBase = function() {}; +/** @type {?} */ +ListViewBase.prototype.get; +/** @type {?} */ +ListViewBase.prototype.isValid; +/** @type {?} */ +ListViewBase.prototype.toArray; +/** @type {?} */ +ListViewBase.prototype.set; +/** @type {?} */ +ListViewBase.prototype.setData; + +var NestedView = function() {}; +/** @type {?} */ +NestedView.prototype.get; +/** @type {?} */ +NestedView.prototype.isValid; +/** @type {?} */ +NestedView.prototype.toArray; +/** @type {?} */ +NestedView.prototype.set; +/** @type {?} */ +NestedView.prototype.setData; + +var ChunkedView = function() {}; +/** @type {?} */ +ChunkedView.prototype.get; +/** @type {?} */ +ChunkedView.prototype.isValid; +/** @type {?} */ +ChunkedView.prototype.toArray; +/** @type {?} */ +ChunkedView.prototype.set; +/** @type {?} */ +ChunkedView.prototype.setData; + +var TypeVisitor = function() {}; +/** @type {?} */ +TypeVisitor.visitTypeInline = function() {}; +/** @type {?} */ +TypeVisitor.prototype.visit; +/** @type {?} */ +TypeVisitor.prototype.visitMany; +/** @type {?} */ +TypeVisitor.prototype.visitNull; +/** @type {?} */ +TypeVisitor.prototype.visitBool; +/** @type {?} */ +TypeVisitor.prototype.visitInt; +/** @type {?} */ +TypeVisitor.prototype.visitFloat; +/** @type {?} */ +TypeVisitor.prototype.visitUtf8; +/** @type {?} */ +TypeVisitor.prototype.visitBinary; +/** @type {?} */ +TypeVisitor.prototype.visitFixedSizeBinary; +/** @type {?} */ +TypeVisitor.prototype.visitDate; +/** @type {?} */ +TypeVisitor.prototype.visitTimestamp; +/** @type {?} */ +TypeVisitor.prototype.visitTime; +/** @type {?} */ +TypeVisitor.prototype.visitDecimal; +/** @type {?} */ +TypeVisitor.prototype.visitList; +/** @type {?} */ +TypeVisitor.prototype.visitStruct; +/** @type {?} */ +TypeVisitor.prototype.visitUnion; +/** @type {?} */ +TypeVisitor.prototype.visitDictionary; +/** @type {?} */ +TypeVisitor.prototype.visitInterval; +/** @type {?} */ +TypeVisitor.prototype.visitFixedSizeList; +/** @type {?} */ +TypeVisitor.prototype.visitMap; + +var VectorVisitor = function() {}; +/** @type {?} */ +VectorVisitor.visitTypeInline = function() {}; +/** @type {?} */ +VectorVisitor.prototype.visit; +/** @type {?} */ +VectorVisitor.prototype.visitMany; +/** @type {?} */ +VectorVisitor.prototype.visitNullVector; +/** @type {?} */ +VectorVisitor.prototype.visitBoolVector; +/** @type {?} */ +VectorVisitor.prototype.visitIntVector; +/** @type {?} */ +VectorVisitor.prototype.visitFloatVector; +/** @type {?} */ +VectorVisitor.prototype.visitUtf8Vector; +/** @type {?} */ +VectorVisitor.prototype.visitBinaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitFixedSizeBinaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitDateVector; +/** @type {?} */ 
+VectorVisitor.prototype.visitTimestampVector; +/** @type {?} */ +VectorVisitor.prototype.visitTimeVector; +/** @type {?} */ +VectorVisitor.prototype.visitDecimalVector; +/** @type {?} */ +VectorVisitor.prototype.visitListVector; +/** @type {?} */ +VectorVisitor.prototype.visitStructVector; +/** @type {?} */ +VectorVisitor.prototype.visitUnionVector; +/** @type {?} */ +VectorVisitor.prototype.visitDictionaryVector; +/** @type {?} */ +VectorVisitor.prototype.visitIntervalVector; +/** @type {?} */ +VectorVisitor.prototype.visitFixedSizeListVector; +/** @type {?} */ +VectorVisitor.prototype.visitMapVector; \ No newline at end of file diff --git a/js/src/Arrow.externs.ts b/js/src/Arrow.externs.ts deleted file mode 100644 index abc11eff509..00000000000 --- a/js/src/Arrow.externs.ts +++ /dev/null @@ -1,106 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/** - * @fileoverview Closure Compiler externs for Arrow - * @externs - * @suppress {duplicate,checkTypes} - */ -/** @type {symbol} */ -Symbol.iterator; -/** @type {symbol} */ -Symbol.asyncIterator; - -let RowVector = function() {}; -/** @type {?} */ -RowVector.prototype.toJSON; -/** @type {?} */ -RowVector.prototype.toArray; -/** @type {?} */ -RowVector.prototype.toObject; -/** @type {?} */ -RowVector.prototype.toString; - -let Table = function() {}; -/** @type {?} */ -( Table).from; -/** @type {?} */ -Table.prototype.columns; -/** @type {?} */ -Table.prototype.length; -/** @type {?} */ -Table.prototype.col; -/** @type {?} */ -Table.prototype.key; -/** @type {?} */ -Table.prototype.select; -/** @type {?} */ -Table.prototype.toString; -/** @type {?} */ -Table.prototype.lengths; -/** @type {?} */ -Table.prototype.batches; -/** @type {?} */ -Table.prototype.countBy; -/** @type {?} */ -Table.prototype.scan; -/** @type {?} */ -Table.prototype.get; - -let CountByResult = function() {}; -/** @type {?} */ -CountByResult.prototype.asJSON; - -let Vector = function() {}; -/** @type {?} */ -Vector.prototype.length; -/** @type {?} */ -Vector.prototype.name; -/** @type {?} */ -Vector.prototype.type; -/** @type {?} */ -Vector.prototype.get; -/** @type {?} */ -Vector.prototype.concat; -/** @type {?} */ -Vector.prototype.slice; -/** @type {?} */ -Vector.prototype.metadata; -/** @type {?} */ -Vector.prototype.nullable; -/** @type {?} */ -Vector.prototype.nullCount; - -let BoolVector = function() {}; -/** @type {?} */ -( BoolVector).pack; -/** @type {?} */ -BoolVector.prototype.set; - -let DictionaryVector = function() {}; -/** @type {?} */ -DictionaryVector.prototype.getKey; -/** @type {?} */ -DictionaryVector.prototype.getValue; - -let Col = function() {}; -/** @type {?} */ -Col.prototype.gteq; -/** @type {?} */ -Col.prototype.lteq; -/** @type {?} */ -Col.prototype.eq; diff --git a/js/src/Arrow.ts 
b/js/src/Arrow.ts index 21eb2976d44..1cbc6c36aa3 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -15,117 +15,245 @@ // specific language governing permissions and limitations // under the License. -import { Table, TableRow, CountByResult } from './table'; +import * as type_ from './type'; +import * as data_ from './data'; +import * as vector_ from './vector'; +import * as util_ from './util/int'; +import * as visitor_ from './visitor'; +import * as view_ from './vector/view'; +import { Vector } from './vector'; +import { RecordBatch } from './recordbatch'; +import { Schema, Field, Type } from './type'; +import { Table, CountByResult } from './table'; import { lit, col, Col, Value } from './predicate'; -import { Vector } from './vector/vector'; -import { Utf8Vector } from './vector/utf8'; -import { DictionaryVector } from './vector/dictionary'; -import { StructVector, StructRow } from './vector/struct'; -import { read, readAsync } from './reader/arrow'; -import { Uint64, Int64, Int128 } from './util/int'; -import { ListVector, BinaryVector, FixedSizeListVector } from './vector/list'; - -import { - BoolVector, - Int8Vector, - Int16Vector, - Int32Vector, - Int64Vector, - Uint8Vector, - Uint16Vector, - Uint32Vector, - Uint64Vector, - Float16Vector, - Float32Vector, - Float64Vector, - Date32Vector, - Date64Vector, - Time32Vector, - Time64Vector, - DecimalVector, - TimestampVector, -} from './vector/numeric'; - -// closure compiler always erases static method names: -// https://github.com/google/closure-compiler/issues/1776 -// set them via string indexers to save them from the mangler -Table['from'] = Table.from; -Table['fromAsync'] = Table.fromAsync; -BoolVector['pack'] = BoolVector.pack; +import { read, readAsync } from './ipc/reader/arrow'; + +export import View = vector_.View; +export import VectorLike = vector_.VectorLike; +export import TypedArray = type_.TypedArray; +export import IntBitWidth = type_.IntBitWidth; +export import TimeBitWidth = type_.TimeBitWidth; +export import TypedArrayConstructor = type_.TypedArrayConstructor; export { read, readAsync }; -export { Table, TableRow, CountByResult }; +export { Table, CountByResult }; export { lit, col, Col, Value }; -export { Vector, StructRow }; -export { Uint64, Int64, Int128 }; -export { NumericVectorConstructor } from './vector/numeric'; -export { List, TypedArray, TypedArrayConstructor } from './vector/types'; -export { - BoolVector, - ListVector, - Utf8Vector, - Int8Vector, - Int16Vector, - Int32Vector, - Int64Vector, - Uint8Vector, - Uint16Vector, - Uint32Vector, - Uint64Vector, - Date32Vector, - Date64Vector, - Time32Vector, - Time64Vector, - BinaryVector, - StructVector, - Float16Vector, - Float32Vector, - Float64Vector, - DecimalVector, - TimestampVector, - DictionaryVector, - FixedSizeListVector, -}; - -/* These exports are needed for the closure umd targets */ +export { Field, Schema, RecordBatch, Vector, Type }; + +export namespace util { + export import Uint64 = util_.Uint64; + export import Int64 = util_.Int64; + export import Int128 = util_.Int128; +} + +export namespace data { + export import BaseData = data_.BaseData; + export import FlatData = data_.FlatData; + export import BoolData = data_.BoolData; + export import FlatListData = data_.FlatListData; + export import DictionaryData = data_.DictionaryData; + export import NestedData = data_.NestedData; + export import ListData = data_.ListData; + export import UnionData = data_.UnionData; + export import SparseUnionData = data_.SparseUnionData; + export import 
DenseUnionData = data_.DenseUnionData; + export import ChunkedData = data_.ChunkedData; +} + +export namespace type { + export import Schema = type_.Schema; + export import Field = type_.Field; + export import Null = type_.Null; + export import Int = type_.Int; + export import Int8 = type_.Int8; + export import Int16 = type_.Int16; + export import Int32 = type_.Int32; + export import Int64 = type_.Int64; + export import Uint8 = type_.Uint8; + export import Uint16 = type_.Uint16; + export import Uint32 = type_.Uint32; + export import Uint64 = type_.Uint64; + export import Float = type_.Float; + export import Float16 = type_.Float16; + export import Float32 = type_.Float32; + export import Float64 = type_.Float64; + export import Binary = type_.Binary; + export import Utf8 = type_.Utf8; + export import Bool = type_.Bool; + export import Decimal = type_.Decimal; + export import Date_ = type_.Date_; + export import Time = type_.Time; + export import Timestamp = type_.Timestamp; + export import Interval = type_.Interval; + export import List = type_.List; + export import Struct = type_.Struct; + export import Union = type_.Union; + export import DenseUnion = type_.DenseUnion; + export import SparseUnion = type_.SparseUnion; + export import FixedSizeBinary = type_.FixedSizeBinary; + export import FixedSizeList = type_.FixedSizeList; + export import Map_ = type_.Map_; + export import Dictionary = type_.Dictionary; +} + +export namespace vector { + export import Vector = vector_.Vector; + export import NullVector = vector_.NullVector; + export import BoolVector = vector_.BoolVector; + export import IntVector = vector_.IntVector; + export import FloatVector = vector_.FloatVector; + export import DateVector = vector_.DateVector; + export import DecimalVector = vector_.DecimalVector; + export import TimeVector = vector_.TimeVector; + export import TimestampVector = vector_.TimestampVector; + export import IntervalVector = vector_.IntervalVector; + export import BinaryVector = vector_.BinaryVector; + export import FixedSizeBinaryVector = vector_.FixedSizeBinaryVector; + export import Utf8Vector = vector_.Utf8Vector; + export import ListVector = vector_.ListVector; + export import FixedSizeListVector = vector_.FixedSizeListVector; + export import MapVector = vector_.MapVector; + export import StructVector = vector_.StructVector; + export import UnionVector = vector_.UnionVector; + export import DictionaryVector = vector_.DictionaryVector; +} + +export namespace visitor { + export import TypeVisitor = visitor_.TypeVisitor; + export import VectorVisitor = visitor_.VectorVisitor; +} + +export namespace view { + export import ChunkedView = view_.ChunkedView; + export import DictionaryView = view_.DictionaryView; + export import ListView = view_.ListView; + export import FixedSizeListView = view_.FixedSizeListView; + export import BinaryView = view_.BinaryView; + export import Utf8View = view_.Utf8View; + export import UnionView = view_.UnionView; + export import DenseUnionView = view_.DenseUnionView; + export import NestedView = view_.NestedView; + export import StructView = view_.StructView; + export import MapView = view_.MapView; + export import FlatView = view_.FlatView; + export import NullView = view_.NullView; + export import BoolView = view_.BoolView; + export import ValidityView = view_.ValidityView; + export import PrimitiveView = view_.PrimitiveView; + export import FixedSizeView = view_.FixedSizeView; + export import Float16View = view_.Float16View; + export import DateDayView = 
view_.DateDayView; + export import DateMillisecondView = view_.DateMillisecondView; + export import TimestampDayView = view_.TimestampDayView; + export import TimestampSecondView = view_.TimestampSecondView; + export import TimestampMillisecondView = view_.TimestampMillisecondView; + export import TimestampMicrosecondView = view_.TimestampMicrosecondView; + export import TimestampNanosecondView = view_.TimestampNanosecondView; + export import IntervalYearMonthView = view_.IntervalYearMonthView; + export import IntervalYearView = view_.IntervalYearView; + export import IntervalMonthView = view_.IntervalMonthView; +} + +/* These exports are needed for the closure and uglify umd targets */ try { - const Arrow = eval('exports'); - if (typeof Arrow === 'object') { - // string indexers tell closure compiler not to rename these properties - Arrow['lit'] = lit; - Arrow['col'] = col; - Arrow['Col'] = Col; + let Arrow: any = eval('exports'); + if (Arrow && typeof Arrow === 'object') { + // string indexers tell closure and uglify not to rename these properties + Arrow['data'] = data; + Arrow['type'] = type; + Arrow['util'] = util; + Arrow['view'] = view; + Arrow['vector'] = vector; + Arrow['visitor'] = visitor; + Arrow['read'] = read; - Arrow['Value'] = Value; - Arrow['Table'] = Table; Arrow['readAsync'] = readAsync; + + Arrow['Type'] = Type; + Arrow['Field'] = Field; + Arrow['Schema'] = Schema; Arrow['Vector'] = Vector; - Arrow['StructRow'] = StructRow; - Arrow['BoolVector'] = BoolVector; - Arrow['ListVector'] = ListVector; - Arrow['Utf8Vector'] = Utf8Vector; - Arrow['Int8Vector'] = Int8Vector; - Arrow['Int16Vector'] = Int16Vector; - Arrow['Int32Vector'] = Int32Vector; - Arrow['Int64Vector'] = Int64Vector; - Arrow['Uint8Vector'] = Uint8Vector; - Arrow['Uint16Vector'] = Uint16Vector; - Arrow['Uint32Vector'] = Uint32Vector; - Arrow['Uint64Vector'] = Uint64Vector; - Arrow['Date32Vector'] = Date32Vector; - Arrow['Date64Vector'] = Date64Vector; - Arrow['Time32Vector'] = Time32Vector; - Arrow['Time64Vector'] = Time64Vector; - Arrow['BinaryVector'] = BinaryVector; - Arrow['StructVector'] = StructVector; - Arrow['Float16Vector'] = Float16Vector; - Arrow['Float32Vector'] = Float32Vector; - Arrow['Float64Vector'] = Float64Vector; - Arrow['DecimalVector'] = DecimalVector; + Arrow['RecordBatch'] = RecordBatch; + + Arrow['Table'] = Table; Arrow['CountByResult'] = CountByResult; - Arrow['TimestampVector'] = TimestampVector; - Arrow['DictionaryVector'] = DictionaryVector; - Arrow['FixedSizeListVector'] = FixedSizeListVector; + Arrow['Value'] = Value; + Arrow['lit'] = lit; + Arrow['col'] = col; + Arrow['Col'] = Col; } } catch (e) { /* not the UMD bundle */ } -/* end closure exports */ +/* end umd exports */ + +// closure compiler erases static properties/methods: +// https://github.com/google/closure-compiler/issues/1776 +// set them via string indexers to save them from the mangler +Schema['from'] = Schema.from; +Table['from'] = Table.from; +Table['fromAsync'] = Table.fromAsync; +Table['empty'] = Table.empty; +Vector['create'] = Vector.create; +RecordBatch['from'] = RecordBatch.from; + +util_.Uint64['add'] = util_.Uint64.add; +util_.Uint64['multiply'] = util_.Uint64.multiply; + +util_.Int64['add'] = util_.Int64.add; +util_.Int64['multiply'] = util_.Int64.multiply; +util_.Int64['fromString'] = util_.Int64.fromString; + +util_.Int128['add'] = util_.Int128.add; +util_.Int128['multiply'] = util_.Int128.multiply; +util_.Int128['fromString'] = util_.Int128.fromString; + +data_.ChunkedData['computeOffsets'] = 
data_.ChunkedData.computeOffsets; + +(type_.Type as any)['NONE'] = type_.Type.NONE; +(type_.Type as any)['Null'] = type_.Type.Null; +(type_.Type as any)['Int'] = type_.Type.Int; +(type_.Type as any)['Float'] = type_.Type.Float; +(type_.Type as any)['Binary'] = type_.Type.Binary; +(type_.Type as any)['Utf8'] = type_.Type.Utf8; +(type_.Type as any)['Bool'] = type_.Type.Bool; +(type_.Type as any)['Decimal'] = type_.Type.Decimal; +(type_.Type as any)['Date'] = type_.Type.Date; +(type_.Type as any)['Time'] = type_.Type.Time; +(type_.Type as any)['Timestamp'] = type_.Type.Timestamp; +(type_.Type as any)['Interval'] = type_.Type.Interval; +(type_.Type as any)['List'] = type_.Type.List; +(type_.Type as any)['Struct'] = type_.Type.Struct; +(type_.Type as any)['Union'] = type_.Type.Union; +(type_.Type as any)['FixedSizeBinary'] = type_.Type.FixedSizeBinary; +(type_.Type as any)['FixedSizeList'] = type_.Type.FixedSizeList; +(type_.Type as any)['Map'] = type_.Type.Map; +(type_.Type as any)['Dictionary'] = type_.Type.Dictionary; +(type_.Type as any)['DenseUnion'] = type_.Type.DenseUnion; +(type_.Type as any)['SparseUnion'] = type_.Type.SparseUnion; + +type_.DataType['isNull'] = type_.DataType.isNull; +type_.DataType['isInt'] = type_.DataType.isInt; +type_.DataType['isFloat'] = type_.DataType.isFloat; +type_.DataType['isBinary'] = type_.DataType.isBinary; +type_.DataType['isUtf8'] = type_.DataType.isUtf8; +type_.DataType['isBool'] = type_.DataType.isBool; +type_.DataType['isDecimal'] = type_.DataType.isDecimal; +type_.DataType['isDate'] = type_.DataType.isDate; +type_.DataType['isTime'] = type_.DataType.isTime; +type_.DataType['isTimestamp'] = type_.DataType.isTimestamp; +type_.DataType['isInterval'] = type_.DataType.isInterval; +type_.DataType['isList'] = type_.DataType.isList; +type_.DataType['isStruct'] = type_.DataType.isStruct; +type_.DataType['isUnion'] = type_.DataType.isUnion; +type_.DataType['isDenseUnion'] = type_.DataType.isDenseUnion; +type_.DataType['isSparseUnion'] = type_.DataType.isSparseUnion; +type_.DataType['isFixedSizeBinary'] = type_.DataType.isFixedSizeBinary; +type_.DataType['isFixedSizeList'] = type_.DataType.isFixedSizeList; +type_.DataType['isMap'] = type_.DataType.isMap; +type_.DataType['isDictionary'] = type_.DataType.isDictionary; + +vector_.BoolVector['from'] = vector_.BoolVector.from; +vector_.IntVector['from'] = vector_.IntVector.from; +vector_.FloatVector['from'] = vector_.FloatVector.from; + +visitor_.TypeVisitor['visitTypeInline'] = visitor_.TypeVisitor.visitTypeInline; +visitor_.VectorVisitor['visitTypeInline'] = visitor_.VectorVisitor.visitTypeInline; \ No newline at end of file diff --git a/js/src/bin/arrow2csv.ts b/js/src/bin/arrow2csv.ts index 2bc1600a840..ee956132378 100644 --- a/js/src/bin/arrow2csv.ts +++ b/js/src/bin/arrow2csv.ts @@ -1,4 +1,4 @@ -// #! /usr/bin/env node +#! /usr/bin/env node // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file @@ -19,11 +19,9 @@ /* tslint:disable */ +import * as fs from 'fs'; import * as Arrow from '../Arrow'; -(function() { - -const fs = require('fs'); const { parse } = require('json-bignum'); const optionList = [ { @@ -36,12 +34,13 @@ const optionList = [ { type: String, name: 'file', alias: 'f', + optional: false, multiple: true, description: 'The Arrow file to read' } ]; const argv = require(`command-line-args`)(optionList, { partial: true }); -const files = [argv.file, ...(argv._unknown || [])].filter(Boolean); +const files = [...argv.file, ...(argv._unknown || [])].filter(Boolean); if (!files.length) { console.log(require('command-line-usage')([ @@ -85,51 +84,16 @@ if (!files.length) { } files.forEach((source) => { - let table: any, input = fs.readFileSync(source); + debugger; + let table: Arrow.Table, input = fs.readFileSync(source); try { - table = Arrow.Table.from([input]); + table = Arrow.Table.from(input); } catch (e) { + debugger; table = Arrow.Table.from(parse(input + '')); } if (argv.schema && argv.schema.length) { table = table.select(...argv.schema); } - printTable(table); + table.rowsToString().pipe(process.stdout); }); - -function printTable(table: Arrow.Table) { - let header = [...table.columns.map((c) => c.name)].map(stringify); - let maxColumnWidths = header.map(x => x.length); - // Pass one to convert to strings and count max column widths - for (let i = -1, n = table.length - 1; ++i < n;) { - let val, - row = [i, ...table.get(i)]; - for (let j = -1, k = row.length; ++j < k; ) { - val = stringify(row[j]); - maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); - } - } - console.log(header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(' | ')); - // Pass two to pad each one to max column width - for (let i = -1, n = table.length; ++i < n; ) { - console.log( - [...table.get(i)] - .map(stringify) - .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) - .join(' | ') - ); - } -} - -function leftPad(str: string, fill: string, n: number) { - return (new Array(n + 1).join(fill) + str).slice(-1 * n); -} - -function stringify(x: any) { - return typeof x === 'string' ? `"${x}"` - : Array.isArray(x) ? JSON.stringify(x) - : ArrayBuffer.isView(x) ? `[${x}]` - : `${x}`; -} - -})(); diff --git a/js/src/data.ts b/js/src/data.ts new file mode 100644 index 00000000000..81d19a3cf63 --- /dev/null +++ b/js/src/data.ts @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { popcnt_bit_range } from './util/bit'; +import { VectorLike, Vector } from './vector'; +import { VectorType, TypedArray, TypedArrayConstructor, Dictionary } from './type'; +import { Int, Bool, FlatListType, List, FixedSizeList, Struct, Map_ } from './type'; +import { DataType, FlatType, ListType, NestedType, SingleNestedType, DenseUnion, SparseUnion } from './type'; + +export function toTypedArray(ArrayType: TypedArrayConstructor, values?: T | ArrayLike | Iterable | null): T { + if (!ArrayType && ArrayBuffer.isView(values)) { return values; } + return values instanceof ArrayType ? values + : !values || !ArrayBuffer.isView(values) ? ArrayType.from(values || []) + : new ArrayType(values.buffer, values.byteOffset, values.byteLength / ArrayType.BYTES_PER_ELEMENT); +} + +export type Data = DataTypes[T['TType']] & BaseData; +export interface DataTypes { +/* [Type.NONE]*/ 0: BaseData; +/* [Type.Null]*/ 1: FlatData; +/* [Type.Int]*/ 2: FlatData; +/* [Type.Float]*/ 3: FlatData; +/* [Type.Binary]*/ 4: FlatListData; +/* [Type.Utf8]*/ 5: FlatListData; +/* [Type.Bool]*/ 6: BoolData; +/* [Type.Decimal]*/ 7: FlatData; +/* [Type.Date]*/ 8: FlatData; +/* [Type.Time]*/ 9: FlatData; +/* [Type.Timestamp]*/ 10: FlatData; +/* [Type.Interval]*/ 11: FlatData; +/* [Type.List]*/ 12: ListData>; +/* [Type.Struct]*/ 13: NestedData; +/* [Type.Union]*/ 14: UnionData; +/* [Type.FixedSizeBinary]*/ 15: FlatData; +/* [Type.FixedSizeList]*/ 16: SingleNestedData>; +/* [Type.Map]*/ 17: NestedData; +/* [Type.DenseUnion]*/ DenseUnion: DenseUnionData; +/*[Type.SparseUnion]*/ SparseUnion: SparseUnionData; +/*[ Type.Dictionary]*/ Dictionary: DictionaryData; +} +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. See ARROW-33 +export type kUnknownNullCount = -1; +export const kUnknownNullCount = -1; + +export class BaseData implements VectorLike { + public type: T; + public length: number; + public offset: number; + // @ts-ignore + public childData: Data[]; + protected _nullCount: number | kUnknownNullCount; + protected /* [VectorType.OFFSET]:*/ 0?: Int32Array; + protected /* [VectorType.DATA]:*/ 1?: T['TArray']; + protected /*[VectorType.VALIDITY]:*/ 2?: Uint8Array; + protected /* [VectorType.TYPE]:*/ 3?: Int8Array; + constructor(type: T, length: number, offset?: number, nullCount?: number) { + this.type = type; + this.length = Math.floor(Math.max(length || 0, 0)); + this.offset = Math.floor(Math.max(offset || 0, 0)); + this._nullCount = Math.floor(Math.max(nullCount || 0, -1)); + } + public get typeId() { return this.type.TType; } + public get nullBitmap() { return this[VectorType.VALIDITY]; } + public get nullCount() { + let nullCount = this._nullCount; + let nullBitmap: Uint8Array | undefined; + if (nullCount === -1 && (nullBitmap = this[VectorType.VALIDITY])) { + this._nullCount = nullCount = this.length - popcnt_bit_range(nullBitmap, this.offset, this.offset + this.length); + } + return nullCount; + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new BaseData(type, length, offset, nullCount); + } + public slice(offset: number, length: number) { + return length <= 0 ? 
this : this.sliceInternal(this.clone( + this.type, length, this.offset + offset, +(this._nullCount === 0) - 1 + ) as any, offset, length); + } + protected sliceInternal(clone: this, offset: number, length: number) { + let arr: any; + // If typeIds exist, slice the typeIds buffer + (arr = this[VectorType.TYPE]) && (clone[VectorType.TYPE] = this.sliceData(arr, offset, length)); + // If offsets exist, only slice the offsets buffer + (arr = this[VectorType.OFFSET]) && (clone[VectorType.OFFSET] = this.sliceOffsets(arr, offset, length)) || + // Otherwise if no offsets, slice the data buffer + (arr = this[VectorType.DATA]) && (clone[VectorType.DATA] = this.sliceData(arr, offset, length)); + return clone; + } + protected sliceData(data: T['TArray'] & TypedArray, offset: number, length: number) { + return data.subarray(offset, offset + length); + } + protected sliceOffsets(valueOffsets: Int32Array, offset: number, length: number) { + return valueOffsets.subarray(offset, offset + length + 1); + } +} + +export class FlatData extends BaseData { + public /* [VectorType.DATA]:*/ 1: T['TArray']; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + public get values() { return this[VectorType.DATA]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, data: Iterable, offset?: number, nullCount?: number) { + super(type, length, offset, nullCount); + this[VectorType.DATA] = toTypedArray(this.ArrayType, data); + this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); + } + public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new (this.constructor as any)(type, length, this[VectorType.VALIDITY], this[VectorType.DATA], offset, nullCount) as FlatData; + } +} + +export class BoolData extends FlatData { + protected sliceData(data: Uint8Array) { return data; } +} + +export class FlatListData extends FlatData { + public /* [VectorType.OFFSET]:*/ 0: Int32Array; + public /* [VectorType.DATA]:*/ 1: T['TArray']; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + public get values() { return this[VectorType.DATA]; } + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, data: T['TArray'], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, data, offset, nullCount); + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new FlatListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this[VectorType.DATA], offset, nullCount); + } +} + +export class DictionaryData extends BaseData> { + protected _dictionary: Vector; + protected _indicies: Data>; + public get indicies() { return this._indicies; } + public get dictionary() { return this._dictionary; } + constructor(type: Dictionary, dictionary: Vector, indicies: Data>) { + super(type, indicies.length, (indicies as any)._nullCount); + this._indicies = indicies; + this._dictionary = dictionary; + this.length = this._indicies.length; + } + public get nullCount() { return this._indicies.nullCount; } + public clone>(type: R, length = this.length, offset = this.offset) { + const data = this._dictionary.data.clone(type.dictionary as any); + return new DictionaryData( + this.type as any, + this._dictionary.clone(data) as any, + 
this._indicies.slice(offset - this.offset, length) + ) as any; + } + protected sliceInternal(clone: this, _offset: number, _length: number) { + clone.length = clone._indicies.length; + clone._nullCount = (clone._indicies as any)._nullCount; + return clone; + } +} + +export class NestedData extends BaseData { + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, offset, nullCount); + this.childData = childData; + this[VectorType.VALIDITY] = toTypedArray(Uint8Array, nullBitmap); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new NestedData(type, length, this[VectorType.VALIDITY], this.childData, offset, nullCount); + } + protected sliceInternal(clone: this, offset: number, length: number) { + if (!this[VectorType.OFFSET]) { + clone.childData = this.childData.map((child) => child.slice(offset, length)); + } + return super.sliceInternal(clone, offset, length); + } +} + +export class SingleNestedData extends NestedData { + protected _valuesData: Data; + public get values() { return this._valuesData; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueChildData: Data, offset?: number, nullCount?: number) { + super(type, length, nullBitmap, [valueChildData], offset, nullCount); + this._valuesData = valueChildData; + } +} + +export class ListData extends SingleNestedData { + public /* [VectorType.OFFSET]:*/ 0: Int32Array; + public /*[VectorType.VALIDITY]:*/ 2: Uint8Array; + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, valueOffsets: Iterable, valueChildData: Data, offset?: number, nullCount?: number) { + super(type, length, nullBitmap, valueChildData, offset, nullCount); + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new ListData(type, length, this[VectorType.VALIDITY], this[VectorType.OFFSET], this._valuesData as any, offset, nullCount); + } +} + +export class UnionData extends NestedData { + public /* [VectorType.TYPE]:*/ 3: T['TArray']; + public get typeIds() { return this[VectorType.TYPE]; } + constructor(type: T, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, childData, offset, nullCount); + this[VectorType.TYPE] = toTypedArray(Int8Array, typeIds); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new UnionData(type, length, this[VectorType.VALIDITY], this[VectorType.TYPE], this.childData, offset, nullCount); + } +} + +export class SparseUnionData extends UnionData { + constructor(type: SparseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, typeIds, childData, offset, nullCount); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new SparseUnionData( + type, + length, + this[VectorType.VALIDITY], + this[VectorType.TYPE], + this.childData, + offset, nullCount + ) as any as UnionData; + } +} + +export class DenseUnionData extends UnionData { + 
public /* [VectorType.OFFSET]:*/ 0: Int32Array; + public get valueOffsets() { return this[VectorType.OFFSET]; } + constructor(type: DenseUnion, length: number, nullBitmap: Uint8Array | null | undefined, typeIds: Iterable, valueOffsets: Iterable, childData: Data[], offset?: number, nullCount?: number) { + super(type, length, nullBitmap, typeIds, childData, offset, nullCount); + this[VectorType.OFFSET] = toTypedArray(Int32Array, valueOffsets); + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new DenseUnionData( + type, + length, + this[VectorType.VALIDITY], + this[VectorType.TYPE], + this[VectorType.OFFSET], + this.childData, + offset, nullCount + ) as any as UnionData; + } +} + +export class ChunkedData extends BaseData { + // @ts-ignore + protected _chunkData: Data[]; + protected _chunkVectors: Vector[]; + protected _chunkOffsets: Uint32Array; + public get chunkVectors() { return this._chunkVectors; } + public get chunkOffsets() { return this._chunkOffsets; } + public get chunkData() { + return this._chunkData || ( + this._chunkData = this._chunkVectors.map(({ data }) => data)); + } + constructor(type: T, length: number, chunkVectors: Vector[], offset?: number, nullCount?: number, chunkOffsets?: Uint32Array) { + super(type, length, offset, nullCount); + this._chunkVectors = chunkVectors; + this._chunkOffsets = chunkOffsets || ChunkedData.computeOffsets(chunkVectors); + } + public get nullCount() { + let nullCount = this._nullCount; + if (nullCount === -1) { + this._nullCount = nullCount = this._chunkVectors.reduce((x, c) => x + c.nullCount, 0); + } + return nullCount; + } + public clone(type: R, length = this.length, offset = this.offset, nullCount = this._nullCount) { + return new ChunkedData( + type, length, + this._chunkVectors.map((vec) => vec.clone(vec.data.clone(type))) as any, + offset, nullCount, this._chunkOffsets + ); + } + protected sliceInternal(clone: this, offset: number, length: number) { + const chunks = this._chunkVectors; + const offsets = this._chunkOffsets; + const chunkSlices: Vector[] = []; + for (let childIndex = -1, numChildren = chunks.length; ++childIndex < numChildren;) { + const child = chunks[childIndex]; + const childLength = child.length; + const childOffset = offsets[childIndex]; + // If the child is to the right of the slice boundary, exclude + if (childOffset >= offset + length) { continue; } + // If the child is to the left of the slice boundary, exclude + if (offset >= childOffset + childLength) { continue; } + // If the child is between both left and right boundaries, include w/o slicing + if (childOffset >= offset && (childOffset + childLength) <= offset + length) { + chunkSlices.push(child); + continue; + } + // If the child overlaps one of the slice boundaries, include that slice + const begin = Math.max(0, offset - childOffset); + const end = begin + Math.min(childLength - begin, (offset + length) - childOffset); + chunkSlices.push(child.slice(begin, end)); + } + clone._chunkVectors = chunkSlices; + clone._chunkOffsets = ChunkedData.computeOffsets(chunkSlices); + return clone; + } + static computeOffsets(childVectors: Vector[]) { + const childOffsets = new Uint32Array(childVectors.length + 1); + for (let index = 0, length = childOffsets.length, childOffset = childOffsets[0] = 0; ++index < length;) { + childOffsets[index] = (childOffset += childVectors[index - 1].length); + } + return childOffsets; + } +} diff --git a/js/src/format/fb/File.ts b/js/src/fb/File.ts similarity index 99%
rename from js/src/format/fb/File.ts rename to js/src/fb/File.ts index 56f50ed20e9..f4ba865ff04 100644 --- a/js/src/format/fb/File.ts +++ b/js/src/fb/File.ts @@ -14,6 +14,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -175,6 +176,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/format/fb/File_generated.js b/js/src/fb/File_generated.js similarity index 100% rename from js/src/format/fb/File_generated.js rename to js/src/fb/File_generated.js diff --git a/js/src/format/fb/Message.ts b/js/src/fb/Message.ts similarity index 99% rename from js/src/format/fb/Message.ts rename to js/src/fb/Message.ts index 4610fbef2e1..537c65d1f8c 100644 --- a/js/src/format/fb/Message.ts +++ b/js/src/fb/Message.ts @@ -45,6 +45,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -110,6 +111,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -265,6 +267,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -369,6 +372,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/format/fb/Message_generated.js b/js/src/fb/Message_generated.js similarity index 100% rename from js/src/format/fb/Message_generated.js rename to js/src/fb/Message_generated.js diff --git a/js/src/format/fb/Schema.ts b/js/src/fb/Schema.ts similarity index 99% rename from js/src/format/fb/Schema.ts rename to js/src/fb/Schema.ts index d9b45ed2008..4a4aeb65599 100644 --- a/js/src/format/fb/Schema.ts +++ b/js/src/fb/Schema.ts @@ -165,6 +165,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -221,6 +222,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -273,6 +275,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -325,6 +328,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -420,6 +424,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -495,6 +500,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -617,6 +623,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -701,6 +708,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -771,6 +779,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -823,6 +832,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -875,6 +885,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type 
{flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -945,6 +956,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -997,6 +1009,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1092,6 +1105,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1164,6 +1178,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1255,6 +1270,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1363,6 +1379,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1435,6 +1452,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1527,6 +1545,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1620,6 +1639,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -1741,6 +1761,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -2026,6 +2047,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** @@ -2089,6 +2111,7 @@ export namespace org.apache.arrow.flatbuf { /** * @type {flatbuffers.ByteBuffer} */ + // @ts-ignore bb: flatbuffers.ByteBuffer; /** diff --git a/js/src/format/fb/Schema_generated.js b/js/src/fb/Schema_generated.js similarity index 100% rename from js/src/format/fb/Schema_generated.js rename to js/src/fb/Schema_generated.js diff --git a/js/src/format/arrow.ts b/js/src/format/arrow.ts deleted file mode 100644 index 14adf9040a4..00000000000 --- a/js/src/format/arrow.ts +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
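The `// @ts-ignore` lines added throughout the renamed fb/*.ts files above address TypeScript's strictPropertyInitialization check: the generated flatbuffers table accessors declare `bb` but only assign it in `__init()`, never in a constructor. A minimal sketch of the shape involved, where ExampleTable is a hypothetical stand-in for the generated classes:

```ts
import { flatbuffers } from 'flatbuffers';

// Hypothetical stand-in for a generated flatbuffers table accessor.
// Under --strict, a field with no initializer and no constructor assignment
// is an error, hence the per-declaration suppression in the generated files.
export class ExampleTable {
  // @ts-ignore -- assigned via __init(), not in a constructor
  bb: flatbuffers.ByteBuffer;
  bb_pos: number = 0; // initialized inline, so no suppression needed
  __init(i: number, bb: flatbuffers.ByteBuffer): ExampleTable {
    this.bb_pos = i;
    this.bb = bb;
    return this;
  }
}
```

Since the files are machine-generated and re-emitted on schema changes, suppressing the check is less invasive than restructuring the generator's output.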
- -import { footerFromByteBuffer, messageFromByteBuffer } from './fb'; -import { schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from './json'; -import { - IntBitWidth, TimeBitWidth, - VisitorNode, Visitor, Footer, Block, Message, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from './types'; - -export { - IntBitWidth, TimeBitWidth, - footerFromByteBuffer, messageFromByteBuffer, - schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON, - VisitorNode, Visitor, Footer, Block, Message, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_ as Map, -}; diff --git a/js/src/format/fb.ts b/js/src/format/fb.ts deleted file mode 100644 index fdf7f7b0ed9..00000000000 --- a/js/src/format/fb.ts +++ /dev/null @@ -1,234 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
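For readers tracking the new Arrow.ts earlier in this diff: its grouped exports rely on TypeScript's `export import` alias syntax, which re-exports a name in both the type and value spaces. A self-contained sketch of the idea, using a local namespace in place of the real `import * as vector_ from './vector'`:

```ts
// Stand-in for `import * as vector_ from './vector'` in the real Arrow.ts.
namespace vector_ {
  export class Vector {}
  export class BoolVector extends Vector {}
}

// `export import` creates an alias usable as both a type and a value, so
// downstream code can write `new vector.BoolVector()` and also use
// `vector.BoolVector` in type positions.
export namespace vector {
  export import Vector = vector_.Vector;
  export import BoolVector = vector_.BoolVector;
}

// Usage: the alias works on both sides of the type annotation.
const bools: vector.BoolVector = new vector.BoolVector();
```

A plain `export { Vector }` from a type-only import would lose the value side, which is why the grouped namespaces use the alias form.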
- -import * as File_ from './fb/File'; -import * as Schema_ from './fb/Schema'; -import * as Message_ from './fb/Message'; -import { flatbuffers } from 'flatbuffers'; -import ByteBuffer = flatbuffers.ByteBuffer; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import _Footer = File_.org.apache.arrow.flatbuf.Footer; -import _Block = File_.org.apache.arrow.flatbuf.Block; -import _Message = Message_.org.apache.arrow.flatbuf.Message; -import _Schema = Schema_.org.apache.arrow.flatbuf.Schema; -import _Field = Schema_.org.apache.arrow.flatbuf.Field; -import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch; -import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch; -import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode; -import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer; -import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding; -import _Null = Schema_.org.apache.arrow.flatbuf.Null; -import _Int = Schema_.org.apache.arrow.flatbuf.Int; -import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint; -import _Binary = Schema_.org.apache.arrow.flatbuf.Binary; -import _Bool = Schema_.org.apache.arrow.flatbuf.Bool; -import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8; -import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal; -import _Date = Schema_.org.apache.arrow.flatbuf.Date; -import _Time = Schema_.org.apache.arrow.flatbuf.Time; -import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp; -import _Interval = Schema_.org.apache.arrow.flatbuf.Interval; -import _List = Schema_.org.apache.arrow.flatbuf.List; -import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_; -import _Union = Schema_.org.apache.arrow.flatbuf.Union; -import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary; -import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList; -import _Map = Schema_.org.apache.arrow.flatbuf.Map; - -import { - IntBitWidth, TimeBitWidth, - Footer, Block, Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from './types'; - -export function footerFromByteBuffer(bb: ByteBuffer) { - const f = _Footer.getRootAsFooter(bb), s = f.schema()!; - return new Footer( - dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f), - new Schema(f.version(), fieldsFromSchema(s), customMetadata(s), s.endianness()) - ); -} - -export function messageFromByteBuffer(bb: ByteBuffer) { - const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version(); - switch (type) { - case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!); - case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m.header(new _RecordBatch())!); - case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m.header(new _DictionaryBatch())!); - } - return null; - // throw new Error(`Unrecognized Message type '${type}'`); -} - -function schemaFromMessage(version: MetadataVersion, s: _Schema) { - return new Schema(version, fieldsFromSchema(s), customMetadata(s), s.endianness()); -} - -function recordBatchFromMessage(version: MetadataVersion, b: _RecordBatch) { - return new RecordBatch(version, b.length(), 
fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version)); -} - -function dictionaryBatchFromMessage(version: MetadataVersion, d: _DictionaryBatch) { - return new DictionaryBatch(version, recordBatchFromMessage(version, d.data()!), d.id(), d.isDelta()); -} - -function dictionaryBatchesFromFooter(f: _Footer) { - const blocks = [] as Block[]; - for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) { - if (b = f.dictionaries(i)!) { - blocks.push(new Block(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function recordBatchesFromFooter(f: _Footer) { - const blocks = [] as Block[]; - for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) { - if (b = f.recordBatches(i)!) { - blocks.push(new Block(b.metaDataLength(), b.bodyLength(), b.offset())); - } - } - return blocks; -} - -function fieldsFromSchema(s: _Schema) { - const fields = [] as Field[]; - for (let i = -1, n = s && s.fieldsLength(); ++i < n;) { - fields.push(field(s.fields(i)!)); - } - return fields; -} - -function fieldsFromField(f: _Field) { - const fields = [] as Field[]; - for (let i = -1, n = f && f.childrenLength(); ++i < n;) { - fields.push(field(f.children(i)!)); - } - return fields; -} - -function fieldNodesFromRecordBatch(b: _RecordBatch) { - const fieldNodes = [] as FieldNode[]; - for (let i = -1, n = b.nodesLength(); ++i < n;) { - fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!)); - } - return fieldNodes; -} - -function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) { - const buffers = [] as Buffer[]; - for (let i = -1, n = b.buffersLength(); ++i < n;) { - let buffer = b.buffers(i)!; - // If this Arrow buffer was written before version 4, - // advance the buffer's bb_pos 8 bytes to skip past - // the now-removed page id field. 
- if (version < MetadataVersion.V4) { - buffer.bb_pos += (8 * (i + 1)); - } - buffers.push(bufferFromRecordBatch(buffer)); - } - return buffers; -} - -function field(f: _Field) { - return new Field( - f.name()!, - typeFromField(f), - f.typeType(), - f.nullable(), - fieldsFromField(f), - customMetadata(f), - dictionaryEncodingFromField(f) - ); -} - -function dictionaryEncodingFromField(f: _Field) { - let t: _Int | null; - let e: _DictionaryEncoding | null; - if (e = f.dictionary()) { - if (t = e.indexType()) { - return new DictionaryEncoding(new Int(t.isSigned(), t.bitWidth() as IntBitWidth), e.id(), e.isOrdered()); - } - return new DictionaryEncoding(null, e.id(), e.isOrdered()); - } - return undefined; -} - -function customMetadata(parent?: _Schema | _Field | null) { - const data = new Map(); - if (parent) { - for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { - if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { - data.set(key, entry.value()!); - } - } - } - return data; -} - -function fieldNodeFromRecordBatch(f: _FieldNode) { - return new FieldNode(f.length(), f.nullCount()); -} - -function bufferFromRecordBatch(b: _Buffer) { - return new Buffer(b.offset(), b.length()); -} - -function typeFromField(f: _Field) { - switch (f.typeType()) { - case Type.NONE: return nullFromField(f.type(new _Null())!); - case Type.Null: return nullFromField(f.type(new _Null())!); - case Type.Int: return intFromField(f.type(new _Int())!); - case Type.FloatingPoint: return floatingPointFromField(f.type(new _FloatingPoint())!); - case Type.Binary: return binaryFromField(f.type(new _Binary())!); - case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); - case Type.Bool: return boolFromField(f.type(new _Bool())!); - case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); - case Type.Date: return dateFromField(f.type(new _Date())!); - case Type.Time: return timeFromField(f.type(new _Time())!); - case Type.Timestamp: return timestampFromField(f.type(new _Timestamp())!); - case Type.Interval: return intervalFromField(f.type(new _Interval())!); - case Type.List: return listFromField(f.type(new _List())!); - case Type.Struct_: return structFromField(f.type(new _Struct())!); - case Type.Union: return unionFromField(f.type(new _Union())!); - case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); - case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!); - case Type.Map: return mapFromField(f.type(new _Map())!); - } - throw new Error(`Unrecognized type ${f.typeType()}`); -} - -function nullFromField(_type: _Null) { return new Null(); } -function intFromField(_type: _Int) { return new Int(_type.isSigned(), _type.bitWidth() as IntBitWidth); } -function floatingPointFromField(_type: _FloatingPoint) { return new FloatingPoint(_type.precision()); } -function binaryFromField(_type: _Binary) { return new Binary(); } -function utf8FromField(_type: _Utf8) { return new Utf8(); } -function boolFromField(_type: _Bool) { return new Bool(); } -function decimalFromField(_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } -function dateFromField(_type: _Date) { return new Date(_type.unit()); } -function timeFromField(_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } -function timestampFromField(_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } -function intervalFromField(_type: _Interval) { return new 
Interval(_type.unit()); } -function listFromField(_type: _List) { return new List(); } -function structFromField(_type: _Struct) { return new Struct(); } -function unionFromField(_type: _Union) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[]); } -function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } -function fixedSizeListFromField(_type: _FixedSizeList) { return new FixedSizeList(_type.listSize()); } -function mapFromField(_type: _Map) { return new Map_(_type.keysSorted()); } diff --git a/js/src/format/json.ts b/js/src/format/json.ts deleted file mode 100644 index 3da3db6d5fe..00000000000 --- a/js/src/format/json.ts +++ /dev/null @@ -1,173 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -import * as Schema_ from './fb/Schema'; -import { flatbuffers } from 'flatbuffers'; -import Long = flatbuffers.Long; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -import { - IntBitWidth, TimeBitWidth, - Schema, RecordBatch, DictionaryBatch, Field, DictionaryEncoding, Buffer, FieldNode, - Null, Int, FloatingPoint, Binary, Bool, Utf8, Decimal, Date, Time, Timestamp, Interval, List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_, -} from './types'; - -export function schemaFromJSON(s: any): Schema { - // todo: metadataFromJSON - return new Schema( - MetadataVersion.V4, - fieldsFromJSON(s['fields']), - customMetadata(s['customMetadata']) - ); -} - -export function recordBatchFromJSON(b: any): RecordBatch { - return new RecordBatch( - MetadataVersion.V4, - new Long(b['count'], 0), - fieldNodesFromJSON(b['columns']), - buffersFromJSON(b['columns']) - ); -} - -export function dictionaryBatchFromJSON(b: any): DictionaryBatch { - return new DictionaryBatch( - MetadataVersion.V4, - recordBatchFromJSON(b['data']), - new Long(b['id'], 0), b['isDelta'] - ); -} - -function fieldsFromJSON(fs: any[]): Field[] { - return (fs || []).map(fieldFromJSON); -} - -function fieldNodesFromJSON(xs: any[]): FieldNode[] { - return (xs || []).reduce((fieldNodes, column: any) => [ - ...fieldNodes, - new FieldNode( - new Long(column['count'], 0), - new Long(nullCountFromJSON(column['VALIDITY']), 0) - ), - ...fieldNodesFromJSON(column['children']) - ], [] as FieldNode[]); -} - -function buffersFromJSON(xs: any[], buffers: Buffer[] = []): Buffer[] { - for (let i = -1, n = (xs || []).length; ++i < n;) { - const column = xs[i]; - column['VALIDITY'] && 
buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0))); - column['OFFSET'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0))); - column['DATA'] && buffers.push(new Buffer(new Long(buffers.length, 0), new Long(column['DATA'].length, 0))); - buffers = buffersFromJSON(column['children'], buffers); - } - return buffers; -} - -function nullCountFromJSON(validity: number[]) { - return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); -} - -function fieldFromJSON(f: any) { - return new Field( - f['name'], - typeFromJSON(f['type']), - namesToTypeMap[f['type']['name']], - f.nullable, - fieldsFromJSON(f['children']), - customMetadata(f['customMetadata']), - dictionaryEncodingFromJSON(f['dictionary']) - ); -} - -function dictionaryEncodingFromJSON(d: any) { - return !d ? null : new DictionaryEncoding( - d.indexType ? intFromJSON(d.indexType) : null, - new Long(d.id, 0), d.isOrdered - ); -} - -function customMetadata(metadata?: any) { - return new Map(Object.entries(metadata || {})); -} - -const namesToTypeMap: { [n: string]: Type } = { - 'NONE': Type.NONE, - 'null': Type.Null, - 'int': Type.Int, - 'floatingpoint': Type.FloatingPoint, - 'binary': Type.Binary, - 'bool': Type.Bool, - 'utf8': Type.Utf8, - 'decimal': Type.Decimal, - 'date': Type.Date, - 'time': Type.Time, - 'timestamp': Type.Timestamp, - 'interval': Type.Interval, - 'list': Type.List, - 'struct': Type.Struct_, - 'union': Type.Union, - 'fixedsizebinary': Type.FixedSizeBinary, - 'fixedsizelist': Type.FixedSizeList, - 'map': Type.Map, -}; - -function typeFromJSON(t: any) { - switch (namesToTypeMap[t['name']]) { - case Type.NONE: return nullFromJSON(t); - case Type.Null: return nullFromJSON(t); - case Type.Int: return intFromJSON(t); - case Type.FloatingPoint: return floatingPointFromJSON(t); - case Type.Binary: return binaryFromJSON(t); - case Type.Utf8: return utf8FromJSON(t); - case Type.Bool: return boolFromJSON(t); - case Type.Decimal: return decimalFromJSON(t); - case Type.Date: return dateFromJSON(t); - case Type.Time: return timeFromJSON(t); - case Type.Timestamp: return timestampFromJSON(t); - case Type.Interval: return intervalFromJSON(t); - case Type.List: return listFromJSON(t); - case Type.Struct_: return structFromJSON(t); - case Type.Union: return unionFromJSON(t); - case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); - case Type.FixedSizeList: return fixedSizeListFromJSON(t); - case Type.Map: return mapFromJSON(t); - } - throw new Error(`Unrecognized type ${t['name']}`); -} - -function nullFromJSON(_type: any) { return new Null(); } -function intFromJSON(_type: any) { return new Int(_type['isSigned'], _type['bitWidth'] as IntBitWidth); } -function floatingPointFromJSON(_type: any) { return new FloatingPoint(Precision[_type['precision']] as any); } -function binaryFromJSON(_type: any) { return new Binary(); } -function utf8FromJSON(_type: any) { return new Utf8(); } -function boolFromJSON(_type: any) { return new Bool(); } -function decimalFromJSON(_type: any) { return new Decimal(_type['scale'], _type['precision']); } -function dateFromJSON(_type: any) { return new Date(DateUnit[_type['unit']] as any); } -function timeFromJSON(_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } -function timestampFromJSON(_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } -function intervalFromJSON(_type: any) { return new 
Interval(IntervalUnit[_type['unit']] as any); } -function listFromJSON(_type: any) { return new List(); } -function structFromJSON(_type: any) { return new Struct(); } -function unionFromJSON(_type: any) { return new Union(_type['mode'], (_type['typeIdsArray'] || []) as Type[]); } -function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } -function fixedSizeListFromJSON(_type: any) { return new FixedSizeList(_type['listSize']); } -function mapFromJSON(_type: any) { return new Map_(_type['keysSorted']); } diff --git a/js/src/format/types.ts b/js/src/format/types.ts deleted file mode 100644 index 09df8ccbbdf..00000000000 --- a/js/src/format/types.ts +++ /dev/null @@ -1,393 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/* tslint:disable:class-name */ - -import { align } from '../util/layout'; -import * as Schema_ from './fb/Schema'; -import * as Message_ from './fb/Message'; -import { flatbuffers } from 'flatbuffers'; -import Long = flatbuffers.Long; -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; -import Endianness = Schema_.org.apache.arrow.flatbuf.Endianness; -import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; -import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; -import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; - -export type IntBitWidth = 8 | 16 | 32 | 64; -export type TimeBitWidth = IntBitWidth | 128; - -export interface VisitorNode { - accept(visitor: Visitor): any; -} - -export abstract class Visitor { - visit(node: VisitorNode): T { - return node.accept(this); - } - visitMany(nodes: VisitorNode[]): T[] { - return nodes.map((node) => this.visit(node)); - } - abstract visitFooter(node: Footer): any; - abstract visitBlock(node: Block): any; - abstract visitMessage(node: Message): any; - abstract visitSchema(node: Schema): any; - abstract visitField(node: Field): any; - abstract visitBuffer(node: Buffer): any; - abstract visitFieldNode(node: FieldNode): any; - abstract visitRecordBatch(node: RecordBatch): any; - abstract visitDictionaryBatch(node: DictionaryBatch): any; - abstract visitDictionaryEncoding(node: DictionaryEncoding): any; - abstract visitNullFieldType(node: Null): any; - abstract visitIntFieldType(node: Int): any; - abstract visitFloatingPointFieldType(node: FloatingPoint): any; - abstract visitBinaryFieldType(node: Binary): any; - abstract visitBoolFieldType(node: Bool): any; - abstract visitUtf8FieldType(node: Utf8): any; - abstract 
visitDecimalFieldType(node: Decimal): any; - abstract visitDateFieldType(node: Date): any; - abstract visitTimeFieldType(node: Time): any; - abstract visitTimestampFieldType(node: Timestamp): any; - abstract visitIntervalFieldType(node: Interval): any; - abstract visitListFieldType(node: List): any; - abstract visitStructFieldType(node: Struct): any; - abstract visitUnionFieldType(node: Union): any; - abstract visitFixedSizeBinaryFieldType(node: FixedSizeBinary): any; - abstract visitFixedSizeListFieldType(node: FixedSizeList): any; - abstract visitMapFieldType(node: Map_): any; -} - -export class Footer implements VisitorNode { - constructor(public dictionaryBatches: Block[], public recordBatches: Block[], public schema: Schema) {} - accept(visitor: Visitor): any { - return visitor.visitFooter(this); - } -} - -export class Block implements VisitorNode { - constructor(public metaDataLength: number, public bodyLength: Long, public offset: Long) {} - accept(visitor: Visitor): any { - return visitor.visitBlock(this); - } -} - -export class Message implements VisitorNode { - constructor(public version: MetadataVersion, public bodyLength: Long, public headerType: MessageHeader) {} - isSchema(): this is Schema { return this.headerType === MessageHeader.Schema; } - isRecordBatch(): this is RecordBatch { return this.headerType === MessageHeader.RecordBatch; } - isDictionaryBatch(): this is DictionaryBatch { return this.headerType === MessageHeader.DictionaryBatch; } - accept(visitor: Visitor): any { - visitor.visitMessage(this); - } -} - -export class Schema extends Message { - public dictionaries: Map; - constructor(version: MetadataVersion, public fields: Field[], public customMetadata?: Map, public endianness = Endianness.Little) { - super(version, Long.ZERO, MessageHeader.Schema); - const dictionaries = [] as Field[]; - for (let f: Field, i = -1, n = fields.length; ++i < n;) { - if ((f = fields[i])) { - f.dictionary && dictionaries.push(f); - dictionaries.push(...f.dictionaries); - } - } - this.dictionaries = new Map(dictionaries.map<[string, Field]>((f) => [ - f.dictionary!.dictionaryId.toFloat64().toString(), f - ])); - } - accept(visitor: Visitor): any { - return visitor.visitSchema(this); - } -} - -export class RecordBatch extends Message { - constructor(version: MetadataVersion, public length: Long, public fieldNodes: FieldNode[], public buffers: Buffer[]) { - super(version, new Long(buffers.reduce((s, b) => align(s + b.length.low + (b.offset.low - s), 8), 0), 0), MessageHeader.RecordBatch); - } - accept(visitor: Visitor) { - return visitor.visitRecordBatch(this); - } -} - -export class DictionaryBatch extends Message { - constructor(version: MetadataVersion, public dictionary: RecordBatch, public dictionaryId: Long, public isDelta: boolean) { - super(version, dictionary.bodyLength, MessageHeader.DictionaryBatch); - } - get fieldNodes(): FieldNode[] { return this.dictionary.fieldNodes; } - get buffers(): Buffer[] { return this.dictionary.buffers; } - accept(visitor: Visitor) { - return visitor.visitDictionaryBatch(this); - } - static atomicDictionaryId = 0; -} - -export class Field implements VisitorNode { - public dictionaries: Field[]; - constructor(public name: string, - public type: FieldType, - public typeType: Type, - public nullable = false, - public children: Field[] = [], - public metadata?: Map | null, - public dictionary?: DictionaryEncoding | null) { - const dictionaries = [] as Field[]; - for (let f: Field, i = -1, n = children.length; ++i < n;) { - if ((f = children[i])) { - 
f.dictionary && dictionaries.push(f); - dictionaries.push(...f.dictionaries); - } - } - this.dictionaries = dictionaries; - } - accept(visitor: Visitor): any { - return visitor.visitField(this); - } - indexField() { - return !this.dictionary ? this : new Field( - this.name, - this.dictionary.indexType, this.dictionary.indexType.type, - this.nullable, this.children, this.metadata, this.dictionary - ); - } - toString() { return `Field name[${this.name}], nullable[${this.nullable}], type[${this.type.toString()}]`; } -} - -export class Buffer implements VisitorNode { - constructor(public offset: Long, public length: Long) {} - accept(visitor: Visitor) { - return visitor.visitBuffer(this); - } -} - -export class FieldNode implements VisitorNode { - constructor(public length: Long, public nullCount: Long) {} - accept(visitor: Visitor) { - return visitor.visitFieldNode(this); - } -} - -export class DictionaryEncoding implements VisitorNode { - public isOrdered: boolean; - public dictionaryId: Long; - public indexType: Int; - constructor(indexType?: Int | null, dictionaryId?: Long | null, isOrdered?: boolean | null) { - this.isOrdered = isOrdered || false; - /* a dictionary index defaults to signed 32 bit int if unspecified */ - this.indexType = indexType || new Int(true, 32); - this.dictionaryId = dictionaryId || new Long(DictionaryBatch.atomicDictionaryId++, 0); - } - accept(visitor: Visitor): any { - return visitor.visitDictionaryEncoding(this); - } -} - -export abstract class FieldType implements VisitorNode { - constructor(public type: Type) {} - abstract accept(visitor: Visitor): any; - isNull(): this is Null { return this.type === Type.Null; } - isInt(): this is Int { return this.type === Type.Int; } - isFloatingPoint(): this is FloatingPoint { return this.type === Type.FloatingPoint; } - isBinary(): this is Binary { return this.type === Type.Binary; } - isUtf8(): this is Utf8 { return this.type === Type.Utf8; } - isBool(): this is Bool { return this.type === Type.Bool; } - isDecimal(): this is Decimal { return this.type === Type.Decimal; } - isDate(): this is Date { return this.type === Type.Date; } - isTime(): this is Time { return this.type === Type.Time; } - isTimestamp(): this is Timestamp { return this.type === Type.Timestamp; } - isInterval(): this is Interval { return this.type === Type.Interval; } - isList(): this is List { return this.type === Type.List; } - isStruct(): this is Struct { return this.type === Type.Struct_; } - isUnion(): this is Union { return this.type === Type.Union; } - isFixedSizeBinary(): this is FixedSizeBinary { return this.type === Type.FixedSizeBinary; } - isFixedSizeList(): this is FixedSizeList { return this.type === Type.FixedSizeList; } - isMap(): this is Map_ { return this.type === Type.Map; } -} - -export class Null extends FieldType { - toString() { return `Null`; } - constructor() { - super(Type.Null); - } - accept(visitor: Visitor) { - return visitor.visitNullFieldType(this); - } -} - -export class Int extends FieldType { - toString() { return `Int isSigned[${this.isSigned}], bitWidth[${this.bitWidth}]`; } - constructor(public isSigned: boolean, public bitWidth: IntBitWidth) { - super(Type.Int); - } - accept(visitor: Visitor) { - return visitor.visitIntFieldType(this); - } -} - -export class FloatingPoint extends FieldType { - toString() { return `FloatingPoint precision`; } - constructor(public precision: Precision) { - super(Type.FloatingPoint); - } - accept(visitor: Visitor) { - return visitor.visitFloatingPointFieldType(this); - } -} - -export 
class Binary extends FieldType { - toString() { return `Binary`; } - constructor() { - super(Type.Binary); - } - accept(visitor: Visitor) { - return visitor.visitBinaryFieldType(this); - } -} - -export class Utf8 extends FieldType { - toString() { return `Utf8`; } - constructor() { - super(Type.Utf8); - } - accept(visitor: Visitor) { - return visitor.visitUtf8FieldType(this); - } -} - -export class Bool extends FieldType { - toString() { return `Bool`; } - constructor() { - super(Type.Bool); - } - accept(visitor: Visitor) { - return visitor.visitBoolFieldType(this); - } -} - -export class Decimal extends FieldType { - toString() { return `Decimal scale[${this.scale}], precision[${this.precision}]`; } - constructor(public scale: number, public precision: number) { - super(Type.Decimal); - } - accept(visitor: Visitor) { - return visitor.visitDecimalFieldType(this); - } -} - -export class Date extends FieldType { - toString() { return `Date unit[${this.unit}]`; } - constructor(public unit: DateUnit) { - super(Type.Date); - } - accept(visitor: Visitor) { - return visitor.visitDateFieldType(this); - } -} - -export class Time extends FieldType { - toString() { return `Time unit[${this.unit}], bitWidth[${this.bitWidth}]`; } - constructor(public unit: TimeUnit, public bitWidth: TimeBitWidth) { - super(Type.Time); - } - accept(visitor: Visitor) { - return visitor.visitTimeFieldType(this); - } -} - -export class Timestamp extends FieldType { - toString() { return `Timestamp unit[${this.unit}], timezone[${this.timezone}]`; } - constructor(public unit: TimeUnit, public timezone?: string | null) { - super(Type.Timestamp); - } - accept(visitor: Visitor) { - return visitor.visitTimestampFieldType(this); - } -} - -export class Interval extends FieldType { - toString() { return `Interval unit[${this.unit}]`; } - constructor(public unit: IntervalUnit) { - super(Type.Interval); - } - accept(visitor: Visitor) { - return visitor.visitIntervalFieldType(this); - } -} - -export class List extends FieldType { - toString() { return `List`; } - constructor() { - super(Type.List); - } - accept(visitor: Visitor) { - return visitor.visitListFieldType(this); - } -} - -export class Struct extends FieldType { - toString() { return `Struct`; } - constructor() { - super(Type.Struct_); - } - accept(visitor: Visitor) { - return visitor.visitStructFieldType(this); - } -} - -export class Union extends FieldType { - toString() { return `Union mode[${this.mode}], typeIds[${this.typeIds}]`; } - constructor(public mode: UnionMode, public typeIds: Type[]) { - super(Type.Union); - } - accept(visitor: Visitor) { - return visitor.visitUnionFieldType(this); - } -} - -export class FixedSizeBinary extends FieldType { - toString() { return `FixedSizeBinary byteWidth[${this.byteWidth}]`; } - constructor(public byteWidth: number) { - super(Type.FixedSizeBinary); - } - accept(visitor: Visitor) { - return visitor.visitFixedSizeBinaryFieldType(this); - } -} - -export class FixedSizeList extends FieldType { - toString() { return `FixedSizeList listSize[${this.listSize}]`; } - constructor(public listSize: number) { - super(Type.FixedSizeList); - } - accept(visitor: Visitor) { - return visitor.visitFixedSizeListFieldType(this); - } -} - -export class Map_ extends FieldType { - toString() { return `Map keysSorted[${this.keysSorted}]`; } - constructor(public keysSorted: boolean) { - super(Type.Map); - } - accept(visitor: Visitor) { - return visitor.visitMapFieldType(this); - } -} diff --git a/js/src/ipc/metadata.ts b/js/src/ipc/metadata.ts new file 
mode 100644 index 00000000000..88b7e52983b --- /dev/null +++ b/js/src/ipc/metadata.ts @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* tslint:disable:class-name */ + +import { align } from '../util/bit'; +import { Schema, Long, MessageHeader, MetadataVersion } from '../type'; + +export class Footer { + constructor(public dictionaryBatches: FileBlock[], public recordBatches: FileBlock[], public schema: Schema) {} +} + +export class FileBlock { + constructor(public metaDataLength: number, public bodyLength: Long, public offset: Long) {} +} + +export class Message { + public bodyLength: number; + public version: MetadataVersion; + public headerType: MessageHeader; + constructor(version: MetadataVersion, bodyLength: Long | number, headerType: MessageHeader) { + this.version = version; + this.headerType = headerType; + this.bodyLength = typeof bodyLength === 'number' ? bodyLength : bodyLength.low; + } + static isSchema(m: Message): m is Schema { return m.headerType === MessageHeader.Schema; } + static isRecordBatch(m: Message): m is RecordBatchMetadata { return m.headerType === MessageHeader.RecordBatch; } + static isDictionaryBatch(m: Message): m is DictionaryBatch { return m.headerType === MessageHeader.DictionaryBatch; } +} + +export class RecordBatchMetadata extends Message { + public length: number; + public nodes: FieldMetadata[]; + public buffers: BufferMetadata[]; + constructor(version: MetadataVersion, length: Long | number, nodes: FieldMetadata[], buffers: BufferMetadata[]) { + super(version, buffers.reduce((s, b) => align(s + b.length + (b.offset - s), 8), 0), MessageHeader.RecordBatch); + this.nodes = nodes; + this.buffers = buffers; + this.length = typeof length === 'number' ? length : length.low; + } +} + +export class DictionaryBatch extends Message { + public id: number; + public isDelta: boolean; + public data: RecordBatchMetadata; + constructor(version: MetadataVersion, data: RecordBatchMetadata, id: Long | number, isDelta: boolean = false) { + super(version, data.bodyLength, MessageHeader.DictionaryBatch); + this.isDelta = isDelta; + this.data = data; + this.id = typeof id === 'number' ? id : id.low; + } + private static atomicDictionaryId = 0; + public static getId() { return DictionaryBatch.atomicDictionaryId++; } + public get nodes(): FieldMetadata[] { return this.data.nodes; } + public get buffers(): BufferMetadata[] { return this.data.buffers; } +} + +export class BufferMetadata { + public offset: number; + public length: number; + constructor(offset: Long | number, length: Long | number) { + this.offset = typeof offset === 'number' ? offset : offset.low; + this.length = typeof length === 'number' ? 
length : length.low; + } +} + +export class FieldMetadata { + public length: number; + public nullCount: number; + constructor(length: Long | number, nullCount: Long | number) { + this.length = typeof length === 'number' ? length : length.low; + this.nullCount = typeof nullCount === 'number' ? nullCount : nullCount.low; + } +} diff --git a/js/src/reader/arrow.ts b/js/src/ipc/reader/arrow.ts similarity index 62% rename from js/src/reader/arrow.ts rename to js/src/ipc/reader/arrow.ts index cf8a3d6a281..af535900cbf 100644 --- a/js/src/reader/arrow.ts +++ b/js/src/ipc/reader/arrow.ts @@ -16,33 +16,33 @@ // under the License. import { readJSON } from './json'; -import { readBuffers, readBuffersAsync } from './buffer'; -import { readVectors, readVectorsAsync } from './vector'; -import { Vector } from '../vector/vector'; +import { RecordBatch } from '../../recordbatch'; +import { readBuffers, readBuffersAsync } from './binary'; +import { readRecordBatches, readRecordBatchesAsync, TypeDataLoader } from './vector'; +import { Schema } from '../../type'; +import { Message } from '../metadata'; -export { readJSON }; +export { readJSON, RecordBatch }; export { readBuffers, readBuffersAsync }; -export { readVectors, readVectorsAsync }; +export { readRecordBatches, readRecordBatchesAsync }; export function* read(sources: Iterable | object | string) { let input: any = sources; - let batches: Iterable; + let messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>; if (typeof input === 'string') { try { input = JSON.parse(input); } catch (e) { input = sources; } } if (!input || typeof input !== 'object') { - batches = (typeof input === 'string') ? readVectors(readBuffers([input])) : []; + messages = (typeof input === 'string') ? readBuffers([input]) : []; } else { - batches = (typeof input[Symbol.iterator] === 'function') - ? readVectors(readBuffers(input)) - : readVectors(readJSON(input)); + messages = (typeof input[Symbol.iterator] === 'function') ? readBuffers(input) : readJSON(input); } - yield* batches; + yield* readRecordBatches(messages); } export async function* readAsync(sources: AsyncIterable) { - for await (let vectors of readVectorsAsync(readBuffersAsync(sources))) { - yield vectors; + for await (let recordBatch of readRecordBatchesAsync(readBuffersAsync(sources))) { + yield recordBatch; } } diff --git a/js/src/ipc/reader/binary.ts b/js/src/ipc/reader/binary.ts new file mode 100644 index 00000000000..6e3c7fc5cf0 --- /dev/null +++ b/js/src/ipc/reader/binary.ts @@ -0,0 +1,449 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
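A minimal sketch of driving the reworked `read` generator above; it assumes the `read` export from ipc/reader/arrow.ts shown here, and the file name is a hypothetical stand-in:

import * as fs from 'fs';
import { read } from './arrow';

// `read` accepts Arrow JSON (object or string) as well as binary buffers,
// and now yields RecordBatch instances directly ('example.arrow' is hypothetical).
for (const batch of read(fs.readFileSync('example.arrow'))) {
    console.log(batch.length); // row count of this RecordBatch
}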
+
+import { Vector } from '../../vector';
+import { flatbuffers } from 'flatbuffers';
+import { TypeDataLoader } from './vector';
+import { Message, Footer, FileBlock, RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata, } from '../metadata';
+import {
+    Schema, Field,
+    DataType, Dictionary,
+    Null, TimeBitWidth,
+    Binary, Bool, Utf8, Decimal,
+    Date_, Time, Timestamp, Interval,
+    List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_,
+} from '../../type';
+
+import {
+    Int8, Uint8,
+    Int16, Uint16,
+    Int32, Uint32,
+    Int64, Uint64,
+    Float16, Float64, Float32,
+} from '../../type';
+
+import ByteBuffer = flatbuffers.ByteBuffer;
+
+type MessageReader = (bb: ByteBuffer) => IterableIterator<Message>;
+
+export function* readBuffers<T extends Uint8Array | Buffer | string>(sources: Iterable<T> | T) {
+    let schema: Schema | null = null;
+    let dictionaries = new Map<number, Vector>();
+    let readMessages: MessageReader | null = null;
+    if (ArrayBuffer.isView(sources) || typeof sources === 'string') {
+        sources = [sources as T];
+    }
+    for (const source of sources) {
+        const bb = toByteBuffer(source);
+        if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) {
+            for (const message of readMessages(bb)) {
+                yield {
+                    schema, message,
+                    loader: new BinaryDataLoader(
+                        bb,
+                        arrayIterator(message.nodes),
+                        arrayIterator(message.buffers),
+                        dictionaries
+                    )
+                };
+            }
+        }
+    }
+}
+
+export async function* readBuffersAsync<T extends Uint8Array | Buffer | string>(sources: AsyncIterable<T>) {
+    let schema: Schema | null = null;
+    let dictionaries = new Map<number, Vector>();
+    let readMessages: MessageReader | null = null;
+    for await (const source of sources) {
+        const bb = toByteBuffer(source);
+        if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) {
+            for (const message of readMessages(bb)) {
+                yield {
+                    schema, message,
+                    loader: new BinaryDataLoader(
+                        bb,
+                        arrayIterator(message.nodes),
+                        arrayIterator(message.buffers),
+                        dictionaries
+                    )
+                };
+            }
+        }
+    }
+}
+
+export class BinaryDataLoader extends TypeDataLoader {
+    private bytes: Uint8Array;
+    private messageOffset: number;
+    constructor(bb: ByteBuffer, nodes: Iterator<FieldMetadata>, buffers: Iterator<BufferMetadata>, dictionaries: Map<number, Vector>) {
+        super(nodes, buffers, dictionaries);
+        this.bytes = bb.bytes();
+        this.messageOffset = bb.position();
+    }
+    protected readOffsets<T extends DataType>(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); }
+    protected readTypeIds<T extends DataType>(type: T, buffer?: BufferMetadata) { return this.readData(type, buffer); }
+    protected readData<T extends DataType>(_type: T, { length, offset }: BufferMetadata = this.getBufferMetadata()) {
+        return new Uint8Array(this.bytes.buffer, this.bytes.byteOffset + this.messageOffset + offset, length);
+    }
+}
+
+function* arrayIterator(arr: Array<any>) { yield* arr; }
+
+function toByteBuffer(bytes?: Uint8Array | Buffer | string) {
+    let arr: Uint8Array = bytes as any || new Uint8Array(0);
+    if (typeof bytes === 'string') {
+        arr = new Uint8Array(bytes.length);
+        for (let i = -1, n = bytes.length; ++i < n;) {
+            arr[i] = bytes.charCodeAt(i);
+        }
+        return new ByteBuffer(arr);
+    }
+    return new ByteBuffer(arr);
+}
+
+function readSchema(bb: ByteBuffer) {
+    let schema: Schema, readMessages, footer: Footer | null;
+    if (footer = readFileSchema(bb)) {
+        schema = footer.schema;
+        readMessages = readFileMessages(footer);
+    } else if (schema = readStreamSchema(bb)!)
{ + readMessages = readStreamMessages; + } else { + throw new Error('Invalid Arrow buffer'); + } + return { schema, readMessages }; +} + +const PADDING = 4; +const MAGIC_STR = 'ARROW1'; +const MAGIC = new Uint8Array(MAGIC_STR.length); +for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { + MAGIC[i] = MAGIC_STR.charCodeAt(i); +} + +function checkForMagicArrowString(buffer: Uint8Array, index = 0) { + for (let i = -1, n = MAGIC.length; ++i < n;) { + if (MAGIC[i] !== buffer[index + i]) { + return false; + } + } + return true; +} + +const magicLength = MAGIC.length; +const magicAndPadding = magicLength + PADDING; +const magicX2AndPadding = magicLength * 2 + PADDING; + +function readStreamSchema(bb: ByteBuffer) { + if (!checkForMagicArrowString(bb.bytes(), 0)) { + for (const message of readMessages(bb)) { + if (Message.isSchema(message)) { + return message as Schema; + } + } + } + return null; +} + +function* readStreamMessages(bb: ByteBuffer) { + for (const message of readMessages(bb)) { + if (Message.isRecordBatch(message)) { + yield message; + } else if (Message.isDictionaryBatch(message)) { + yield message; + } else { + continue; + } + // position the buffer after the body to read the next message + bb.setPosition(bb.position() + message.bodyLength); + } +} + +function readFileSchema(bb: ByteBuffer) { + let fileLength = bb.capacity(), footerLength: number, footerOffset: number; + if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || + (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || + (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || + (/* Invalid footer length */ + (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 && + (footerLength + magicX2AndPadding > fileLength))) { + return null; + } + bb.setPosition(footerOffset - footerLength); + return footerFromByteBuffer(bb); +} + +function readFileMessages(footer: Footer) { + return function* (bb: ByteBuffer) { + for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { + bb.setPosition(batches[i].offset.low); + yield readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch; + } + for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { + bb.setPosition(batches[i].offset.low); + yield readMessage(bb, bb.readInt32(bb.position())) as RecordBatchMetadata; + } + }; +} + +function* readMessages(bb: ByteBuffer) { + let length: number, message: Schema | RecordBatchMetadata | DictionaryBatch; + while (bb.position() < bb.capacity() && + (length = bb.readInt32(bb.position())) > 0) { + if (message = readMessage(bb, length)!) 
{
+            yield message;
+        }
+    }
+}
+
+function readMessage(bb: ByteBuffer, length: number) {
+    bb.setPosition(bb.position() + PADDING);
+    const message = messageFromByteBuffer(bb);
+    bb.setPosition(bb.position() + length);
+    return message;
+}
+
+import * as File_ from '../../fb/File';
+import * as Schema_ from '../../fb/Schema';
+import * as Message_ from '../../fb/Message';
+
+import Type = Schema_.org.apache.arrow.flatbuf.Type;
+import Precision = Schema_.org.apache.arrow.flatbuf.Precision;
+import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader;
+import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion;
+import _Footer = File_.org.apache.arrow.flatbuf.Footer;
+import _Block = File_.org.apache.arrow.flatbuf.Block;
+import _Message = Message_.org.apache.arrow.flatbuf.Message;
+import _Schema = Schema_.org.apache.arrow.flatbuf.Schema;
+import _Field = Schema_.org.apache.arrow.flatbuf.Field;
+import _RecordBatch = Message_.org.apache.arrow.flatbuf.RecordBatch;
+import _DictionaryBatch = Message_.org.apache.arrow.flatbuf.DictionaryBatch;
+import _FieldNode = Message_.org.apache.arrow.flatbuf.FieldNode;
+import _Buffer = Schema_.org.apache.arrow.flatbuf.Buffer;
+import _DictionaryEncoding = Schema_.org.apache.arrow.flatbuf.DictionaryEncoding;
+import _Null = Schema_.org.apache.arrow.flatbuf.Null;
+import _Int = Schema_.org.apache.arrow.flatbuf.Int;
+import _FloatingPoint = Schema_.org.apache.arrow.flatbuf.FloatingPoint;
+import _Binary = Schema_.org.apache.arrow.flatbuf.Binary;
+import _Bool = Schema_.org.apache.arrow.flatbuf.Bool;
+import _Utf8 = Schema_.org.apache.arrow.flatbuf.Utf8;
+import _Decimal = Schema_.org.apache.arrow.flatbuf.Decimal;
+import _Date = Schema_.org.apache.arrow.flatbuf.Date;
+import _Time = Schema_.org.apache.arrow.flatbuf.Time;
+import _Timestamp = Schema_.org.apache.arrow.flatbuf.Timestamp;
+import _Interval = Schema_.org.apache.arrow.flatbuf.Interval;
+import _List = Schema_.org.apache.arrow.flatbuf.List;
+import _Struct = Schema_.org.apache.arrow.flatbuf.Struct_;
+import _Union = Schema_.org.apache.arrow.flatbuf.Union;
+import _FixedSizeBinary = Schema_.org.apache.arrow.flatbuf.FixedSizeBinary;
+import _FixedSizeList = Schema_.org.apache.arrow.flatbuf.FixedSizeList;
+import _Map = Schema_.org.apache.arrow.flatbuf.Map;
+
+function footerFromByteBuffer(bb: ByteBuffer) {
+    const dictionaryFields = new Map<number, Field<Dictionary>>();
+    const f = _Footer.getRootAsFooter(bb), s = f.schema()!;
+    return new Footer(
+        dictionaryBatchesFromFooter(f), recordBatchesFromFooter(f),
+        new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), f.version(), dictionaryFields)
+    );
+}
+
+function messageFromByteBuffer(bb: ByteBuffer) {
+    const m = _Message.getRootAsMessage(bb)!, type = m.headerType(), version = m.version();
+    switch (type) {
+        case MessageHeader.Schema: return schemaFromMessage(version, m.header(new _Schema())!, new Map<number, Field<Dictionary>>());
+        case MessageHeader.RecordBatch: return recordBatchFromMessage(version, m.header(new _RecordBatch())!);
+        case MessageHeader.DictionaryBatch: return dictionaryBatchFromMessage(version, m.header(new _DictionaryBatch())!);
+    }
+    return null;
+    // throw new Error(`Unrecognized Message type '${type}'`);
+}
+
+function schemaFromMessage(version: MetadataVersion, s: _Schema, dictionaryFields: Map<number, Field<Dictionary>>) {
+    return new Schema(fieldsFromSchema(s, dictionaryFields), customMetadata(s), version, dictionaryFields);
+}
+
+function recordBatchFromMessage(version: MetadataVersion, b: _RecordBatch) {
+    return new RecordBatchMetadata(version, b.length(), fieldNodesFromRecordBatch(b), buffersFromRecordBatch(b, version));
+}
+
+function dictionaryBatchFromMessage(version: MetadataVersion, d: _DictionaryBatch) {
+    return new DictionaryBatch(version, recordBatchFromMessage(version, d.data()!), d.id(), d.isDelta());
+}
+
+function dictionaryBatchesFromFooter(f: _Footer) {
+    const blocks = [] as FileBlock[];
+    for (let b: _Block, i = -1, n = f && f.dictionariesLength(); ++i < n;) {
+        if (b = f.dictionaries(i)!) {
+            blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset()));
+        }
+    }
+    return blocks;
+}
+
+function recordBatchesFromFooter(f: _Footer) {
+    const blocks = [] as FileBlock[];
+    for (let b: _Block, i = -1, n = f && f.recordBatchesLength(); ++i < n;) {
+        if (b = f.recordBatches(i)!) {
+            blocks.push(new FileBlock(b.metaDataLength(), b.bodyLength(), b.offset()));
+        }
+    }
+    return blocks;
+}
+
+function fieldsFromSchema(s: _Schema, dictionaryFields: Map<number, Field<Dictionary>> | null) {
+    const fields = [] as Field[];
+    for (let i = -1, c: Field | null, n = s && s.fieldsLength(); ++i < n;) {
+        if (c = field(s.fields(i)!, dictionaryFields)) {
+            fields.push(c);
+        }
+    }
+    return fields;
+}
+
+function fieldsFromField(f: _Field, dictionaryFields: Map<number, Field<Dictionary>> | null) {
+    const fields = [] as Field[];
+    for (let i = -1, c: Field | null, n = f && f.childrenLength(); ++i < n;) {
+        if (c = field(f.children(i)!, dictionaryFields)) {
+            fields.push(c);
+        }
+    }
+    return fields;
+}
+
+function fieldNodesFromRecordBatch(b: _RecordBatch) {
+    const fieldNodes = [] as FieldMetadata[];
+    for (let i = -1, n = b.nodesLength(); ++i < n;) {
+        fieldNodes.push(fieldNodeFromRecordBatch(b.nodes(i)!));
+    }
+    return fieldNodes;
+}
+
+function buffersFromRecordBatch(b: _RecordBatch, version: MetadataVersion) {
+    const buffers = [] as BufferMetadata[];
+    for (let i = -1, n = b.buffersLength(); ++i < n;) {
+        let buffer = b.buffers(i)!;
+        // If this Arrow buffer was written before version 4,
+        // advance the buffer's bb_pos 8 bytes to skip past
+        // the now-removed page id field.
+        if (version < MetadataVersion.V4) {
+            buffer.bb_pos += (8 * (i + 1));
+        }
+        buffers.push(bufferFromRecordBatch(buffer));
+    }
+    return buffers;
+}
+
+function field(f: _Field, dictionaryFields: Map<number, Field<Dictionary>> | null) {
+    let name = f.name()!;
+    let field: Field | void;
+    let nullable = f.nullable();
+    let metadata = customMetadata(f);
+    let dataType: DataType | null;
+    let keysMeta: _Int | null, id: number;
+    let dictMeta: _DictionaryEncoding | null;
+    if (!dictionaryFields || !(dictMeta = f.dictionary())) {
+        if (dataType = typeFromField(f, fieldsFromField(f, dictionaryFields))) {
+            field = new Field(name, dataType, nullable, metadata);
+        }
+    } else if (dataType = dictionaryFields.has(id = dictMeta.id().low)
+                   ? dictionaryFields.get(id)!.type.dictionary
+                   : typeFromField(f, fieldsFromField(f, null))) {
+        dataType = new Dictionary(dataType,
+            // a dictionary index defaults to signed 32 bit int if unspecified
+            (keysMeta = dictMeta.indexType()) ? intFromField(keysMeta)!
: new Int32(), + id, dictMeta.isOrdered() + ); + field = new Field(name, dataType, nullable, metadata); + dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); + } + return field || null; +} + +function customMetadata(parent?: _Schema | _Field | null) { + const data = new Map(); + if (parent) { + for (let entry, key, i = -1, n = parent.customMetadataLength() | 0; ++i < n;) { + if ((entry = parent.customMetadata(i)) && (key = entry.key()) != null) { + data.set(key, entry.value()!); + } + } + } + return data; +} + +function fieldNodeFromRecordBatch(f: _FieldNode) { + return new FieldMetadata(f.length(), f.nullCount()); +} + +function bufferFromRecordBatch(b: _Buffer) { + return new BufferMetadata(b.offset(), b.length()); +} + +function typeFromField(f: _Field, children?: Field[]): DataType | null { + switch (f.typeType()) { + case Type.NONE: return null; + case Type.Null: return nullFromField(f.type(new _Null())!); + case Type.Int: return intFromField(f.type(new _Int())!); + case Type.FloatingPoint: return floatFromField(f.type(new _FloatingPoint())!); + case Type.Binary: return binaryFromField(f.type(new _Binary())!); + case Type.Utf8: return utf8FromField(f.type(new _Utf8())!); + case Type.Bool: return boolFromField(f.type(new _Bool())!); + case Type.Decimal: return decimalFromField(f.type(new _Decimal())!); + case Type.Date: return dateFromField(f.type(new _Date())!); + case Type.Time: return timeFromField(f.type(new _Time())!); + case Type.Timestamp: return timestampFromField(f.type(new _Timestamp())!); + case Type.Interval: return intervalFromField(f.type(new _Interval())!); + case Type.List: return listFromField(f.type(new _List())!, children || []); + case Type.Struct_: return structFromField(f.type(new _Struct())!, children || []); + case Type.Union: return unionFromField(f.type(new _Union())!, children || []); + case Type.FixedSizeBinary: return fixedSizeBinaryFromField(f.type(new _FixedSizeBinary())!); + case Type.FixedSizeList: return fixedSizeListFromField(f.type(new _FixedSizeList())!, children || []); + case Type.Map: return mapFromField(f.type(new _Map())!, children || []); + } + throw new Error(`Unrecognized type ${f.typeType()}`); +} + +function nullFromField (_type: _Null) { return new Null(); } +function intFromField (_type: _Int) { switch (_type.bitWidth()) { + case 8: return _type.isSigned() ? new Int8() : new Uint8(); + case 16: return _type.isSigned() ? new Int16() : new Uint16(); + case 32: return _type.isSigned() ? new Int32() : new Uint32(); + case 64: return _type.isSigned() ? 
new Int64() : new Uint64(); + } + return null; } +function floatFromField (_type: _FloatingPoint) { switch (_type.precision()) { + case Precision.HALF: return new Float16(); + case Precision.SINGLE: return new Float32(); + case Precision.DOUBLE: return new Float64(); + } + return null; } +function binaryFromField (_type: _Binary) { return new Binary(); } +function utf8FromField (_type: _Utf8) { return new Utf8(); } +function boolFromField (_type: _Bool) { return new Bool(); } +function decimalFromField (_type: _Decimal) { return new Decimal(_type.scale(), _type.precision()); } +function dateFromField (_type: _Date) { return new Date_(_type.unit()); } +function timeFromField (_type: _Time) { return new Time(_type.unit(), _type.bitWidth() as TimeBitWidth); } +function timestampFromField (_type: _Timestamp) { return new Timestamp(_type.unit(), _type.timezone()); } +function intervalFromField (_type: _Interval) { return new Interval(_type.unit()); } +function listFromField (_type: _List, children: Field[]) { return new List(children); } +function structFromField (_type: _Struct, children: Field[]) { return new Struct(children); } +function unionFromField (_type: _Union, children: Field[]) { return new Union(_type.mode(), (_type.typeIdsArray() || []) as Type[], children); } +function fixedSizeBinaryFromField(_type: _FixedSizeBinary) { return new FixedSizeBinary(_type.byteWidth()); } +function fixedSizeListFromField (_type: _FixedSizeList, children: Field[]) { return new FixedSizeList(_type.listSize(), children); } +function mapFromField (_type: _Map, children: Field[]) { return new Map_(_type.keysSorted(), children); } diff --git a/js/src/ipc/reader/json.ts b/js/src/ipc/reader/json.ts new file mode 100644 index 00000000000..10819986f6d --- /dev/null +++ b/js/src/ipc/reader/json.ts @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
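An illustrative sketch of the buffer layout the JSON reader below depends on: `flattenDataSources` flattens each column's VALIDITY/OFFSET/DATA arrays depth-first, and `buffersFromJSON` assigns BufferMetadata offsets in the same order, so a buffer's `offset` indexes straight into the flattened sources. The column JSON here is hypothetical:

// Hypothetical integration-format columns: an int column and a utf8 column.
const columns = [
    { 'VALIDITY': [1, 0, 1], 'DATA': [1, 2, 3], 'children': [] },
    { 'VALIDITY': [1, 1, 1], 'OFFSET': [0, 1, 3, 6], 'DATA': ['a', 'bc', 'def'], 'children': [] },
];
// flattenDataSources(columns) =>
//   [[1, 0, 1], [1, 2, 3], [1, 1, 1], [0, 1, 3, 6], ['a', 'bc', 'def']]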
+
+import { Vector } from '../../vector';
+import { flatbuffers } from 'flatbuffers';
+import { TypeDataLoader } from './vector';
+import { packBools } from '../../util/bit';
+import * as IntUtil from '../../util/int';
+import { TextEncoder } from 'text-encoding-utf-8';
+import { RecordBatchMetadata, DictionaryBatch, BufferMetadata, FieldMetadata } from '../metadata';
+import {
+    Schema, Field,
+    DataType, Dictionary,
+    Null, TimeBitWidth,
+    Binary, Bool, Utf8, Decimal,
+    Date_, Time, Timestamp, Interval,
+    List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_,
+} from '../../type';
+
+import {
+    Int8, Uint8,
+    Int16, Uint16,
+    Int32, Uint32,
+    Int64, Uint64,
+    Float16, Float64, Float32,
+} from '../../type';
+
+import Long = flatbuffers.Long;
+
+export function* readJSON(json: any) {
+    const schema = schemaFromJSON(json['schema']);
+    const dictionaries = new Map<number, Vector>();
+    for (const batch of (json['dictionaries'] || [])) {
+        const message = dictionaryBatchFromJSON(batch);
+        yield {
+            schema, message,
+            loader: new JSONDataLoader(
+                flattenDataSources(batch['data']['columns']),
+                arrayIterator(message.nodes),
+                arrayIterator(message.buffers),
+                dictionaries
+            )
+        };
+    }
+    for (const batch of (json['batches'] || [])) {
+        const message = recordBatchFromJSON(batch);
+        yield {
+            schema, message,
+            loader: new JSONDataLoader(
+                flattenDataSources(batch['columns']),
+                arrayIterator(message.nodes),
+                arrayIterator(message.buffers),
+                dictionaries
+            )
+        };
+    }
+}
+
+function* arrayIterator(arr: Array<any>) { yield* arr; }
+function flattenDataSources(xs: any[]): any[][] {
+    return (xs || []).reduce((buffers, column: any) => [
+        ...buffers,
+        ...(column['VALIDITY'] && [column['VALIDITY']] || []),
+        ...(column['OFFSET'] && [column['OFFSET']] || []),
+        ...(column['DATA'] && [column['DATA']] || []),
+        ...flattenDataSources(column['children'])
+    ], [] as any[][]);
+}
+
+const utf8Encoder = new TextEncoder('utf-8');
+
+export class JSONDataLoader extends TypeDataLoader {
+    constructor(private sources: any[][], nodes: Iterator<FieldMetadata>, buffers: Iterator<BufferMetadata>, dictionaries: Map<number, Vector>) {
+        super(nodes, buffers, dictionaries);
+    }
+    protected readNullBitmap<T extends DataType>(_type: T, nullCount: number, { offset } = this.getBufferMetadata()) {
+        return nullCount <= 0 ? new Uint8Array(0) : packBools(this.sources[offset]);
+    }
+    protected readOffsets<T extends DataType>(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) {
+        return new Int32Array(this.sources[offset]);
+    }
+    protected readTypeIds<T extends DataType>(_type: T, { offset }: BufferMetadata = this.getBufferMetadata()) {
+        return new Int8Array(this.sources[offset]);
+    }
+    protected readData<T extends DataType>(type: T, { offset }: BufferMetadata = this.getBufferMetadata()) {
+        const { sources } = this;
+        if (DataType.isTimestamp(type) === true) {
+            return new Uint8Array(int64DataFromJSON(sources[offset] as string[]));
+        } else if ((DataType.isInt(type) || DataType.isTime(type)) && type.bitWidth === 64) {
+            return new Uint8Array(int64DataFromJSON(sources[offset] as string[]));
+        } else if (DataType.isDate(type) && type.unit === DateUnit.MILLISECOND) {
+            return new Uint8Array(int64DataFromJSON(sources[offset] as string[]));
+        } else if (DataType.isDecimal(type) === true) {
+            return new Uint8Array(decimalDataFromJSON(sources[offset] as string[]));
+        } else if (DataType.isBinary(type) === true) {
+            return new Uint8Array(binaryDataFromJSON(sources[offset] as string[]));
+        } else if (DataType.isBool(type) === true) {
+            return new Uint8Array(packBools(sources[offset] as number[]).buffer);
+        } else if (DataType.isUtf8(type) === true) {
+            return utf8Encoder.encode((sources[offset] as string[]).join(''));
+        } else {
+            return toTypedArray(type.ArrayType, sources[offset].map((x) => +x)) as any;
+        }
+    }
+}
+
+function int64DataFromJSON(values: string[]) {
+    const data = new Uint32Array(values.length * 2);
+    for (let i = -1, n = values.length; ++i < n;) {
+        // Force all values (even numbers) to be parsed as strings since
+        // pulling out high and low bits seems to lose precision sometimes
+        // For example:
+        //     > -4613034156400212000 >>> 0
+        //     721782784
+        // The correct lower 32-bits are 721782752
+        IntUtil.Int64.fromString(values[i].toString(), new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2));
+    }
+    return data.buffer;
+}
+
+function decimalDataFromJSON(values: string[]) {
+    const data = new Uint32Array(values.length * 4);
+    for (let i = -1, n = values.length; ++i < n;) {
+        IntUtil.Int128.fromString(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4));
+    }
+    return data.buffer;
+}
+
+function binaryDataFromJSON(values: string[]) {
+    // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"]
+    // There are definitely more efficient ways to do this... but it gets the
+    // job done.
+    const joined = values.join('');
+    const data = new Uint8Array(joined.length / 2);
+    for (let i = 0; i < joined.length; i += 2) {
+        data[i >> 1] = parseInt(joined.substr(i, 2), 16);
+    }
+    return data.buffer;
+}
+
+import * as Schema_ from '../../fb/Schema';
+import Type = Schema_.org.apache.arrow.flatbuf.Type;
+import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit;
+import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit;
+import Precision = Schema_.org.apache.arrow.flatbuf.Precision;
+import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit;
+import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion;
+import { toTypedArray } from '../../data';
+
+function schemaFromJSON(s: any): Schema {
+    const dictionaryFields = new Map<number, Field<Dictionary>>();
+    return new Schema(
+        fieldsFromJSON(s['fields'], dictionaryFields),
+        customMetadata(s['customMetadata']),
+        MetadataVersion.V4, dictionaryFields
+    );
+}
+
+function recordBatchFromJSON(b: any): RecordBatchMetadata {
+    return new RecordBatchMetadata(
+        MetadataVersion.V4,
+        b['count'],
+        fieldNodesFromJSON(b['columns']),
+        buffersFromJSON(b['columns'])
+    );
+}
+
+function dictionaryBatchFromJSON(b: any): DictionaryBatch {
+    return new DictionaryBatch(
+        MetadataVersion.V4,
+        recordBatchFromJSON(b['data']),
+        b['id'], b['isDelta']
+    );
+}
+
+function fieldsFromJSON(fs: any[], dictionaryFields: Map<number, Field<Dictionary>> | null): Field[] {
+    return (fs || [])
+        .map((f) => fieldFromJSON(f, dictionaryFields))
+        .filter((f) => f != null) as Field[];
+}
+
+function fieldNodesFromJSON(xs: any[]): FieldMetadata[] {
+    return (xs || []).reduce((fieldNodes, column: any) => [
+        ...fieldNodes,
+        new FieldMetadata(
+            new Long(column['count'], 0),
+            new Long(nullCountFromJSON(column['VALIDITY']), 0)
+        ),
+        ...fieldNodesFromJSON(column['children'])
+    ], [] as FieldMetadata[]);
+}
+
+function buffersFromJSON(xs: any[], buffers: BufferMetadata[] = []): BufferMetadata[] {
+    for (let i = -1, n = (xs || []).length; ++i < n;) {
+        const column = xs[i];
+        column['VALIDITY'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['VALIDITY'].length, 0)));
+        column['OFFSET'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['OFFSET'].length, 0)));
+        column['DATA'] && buffers.push(new BufferMetadata(new Long(buffers.length, 0), new Long(column['DATA'].length, 0)));
+        buffers = buffersFromJSON(column['children'], buffers);
+    }
+    return buffers;
+}
+
+function nullCountFromJSON(validity: number[]) {
+    return (validity || []).reduce((sum, val) => sum + +(val === 0), 0);
+}
+
+function fieldFromJSON(f: any, dictionaryFields: Map<number, Field<Dictionary>> | null) {
+    let name = f['name'];
+    let field: Field | void;
+    let nullable = f['nullable'];
+    let dataType: DataType | null;
+    let id: number, keysMeta: any, dictMeta: any;
+    let metadata = customMetadata(f['customMetadata']);
+    if (!dictionaryFields || !(dictMeta = f['dictionary'])) {
+        if (dataType = typeFromJSON(f['type'], fieldsFromJSON(f['children'], dictionaryFields))) {
+            field = new Field(name, dataType, nullable, metadata);
+        }
+    } else if (dataType = dictionaryFields.has(id = dictMeta['id'])
+                   ? dictionaryFields.get(id)!.type.dictionary
+                   : typeFromJSON(f['type'], fieldsFromJSON(f['children'], null))) {
+        dataType = new Dictionary(dataType,
+            // a dictionary index defaults to signed 32 bit int if unspecified
+            (keysMeta = dictMeta['indexType']) ? intFromJSON(keysMeta)!
: new Int32(), + id, dictMeta['isOrdered'] + ); + field = new Field(name, dataType, nullable, metadata); + dictionaryFields.has(id) || dictionaryFields.set(id, field as Field); + } + return field || null; +} + +function customMetadata(metadata?: any) { + return new Map(Object.entries(metadata || {})); +} + +const namesToTypeMap: { [n: string]: Type } = { + 'NONE': Type.NONE, + 'null': Type.Null, + 'int': Type.Int, + 'floatingpoint': Type.FloatingPoint, + 'binary': Type.Binary, + 'bool': Type.Bool, + 'utf8': Type.Utf8, + 'decimal': Type.Decimal, + 'date': Type.Date, + 'time': Type.Time, + 'timestamp': Type.Timestamp, + 'interval': Type.Interval, + 'list': Type.List, + 'struct': Type.Struct_, + 'union': Type.Union, + 'fixedsizebinary': Type.FixedSizeBinary, + 'fixedsizelist': Type.FixedSizeList, + 'map': Type.Map, +}; + +function typeFromJSON(t: any, children?: Field[]) { + switch (namesToTypeMap[t['name']]) { + case Type.NONE: return null; + case Type.Null: return nullFromJSON(t); + case Type.Int: return intFromJSON(t); + case Type.FloatingPoint: return floatingPointFromJSON(t); + case Type.Binary: return binaryFromJSON(t); + case Type.Utf8: return utf8FromJSON(t); + case Type.Bool: return boolFromJSON(t); + case Type.Decimal: return decimalFromJSON(t); + case Type.Date: return dateFromJSON(t); + case Type.Time: return timeFromJSON(t); + case Type.Timestamp: return timestampFromJSON(t); + case Type.Interval: return intervalFromJSON(t); + case Type.List: return listFromJSON(t, children || []); + case Type.Struct_: return structFromJSON(t, children || []); + case Type.Union: return unionFromJSON(t, children || []); + case Type.FixedSizeBinary: return fixedSizeBinaryFromJSON(t); + case Type.FixedSizeList: return fixedSizeListFromJSON(t, children || []); + case Type.Map: return mapFromJSON(t, children || []); + } + throw new Error(`Unrecognized type ${t['name']}`); +} + +function nullFromJSON (_type: any) { return new Null(); } +function intFromJSON (_type: any) { switch (_type['bitWidth']) { + case 8: return _type['isSigned'] ? new Int8() : new Uint8(); + case 16: return _type['isSigned'] ? new Int16() : new Uint16(); + case 32: return _type['isSigned'] ? new Int32() : new Uint32(); + case 64: return _type['isSigned'] ? 
new Int64() : new Uint64(); + } + return null; } +function floatingPointFromJSON (_type: any) { switch (Precision[_type['precision']] as any) { + case Precision.HALF: return new Float16(); + case Precision.SINGLE: return new Float32(); + case Precision.DOUBLE: return new Float64(); + } + return null; } +function binaryFromJSON (_type: any) { return new Binary(); } +function utf8FromJSON (_type: any) { return new Utf8(); } +function boolFromJSON (_type: any) { return new Bool(); } +function decimalFromJSON (_type: any) { return new Decimal(_type['scale'], _type['precision']); } +function dateFromJSON (_type: any) { return new Date_(DateUnit[_type['unit']] as any); } +function timeFromJSON (_type: any) { return new Time(TimeUnit[_type['unit']] as any, _type['bitWidth'] as TimeBitWidth); } +function timestampFromJSON (_type: any) { return new Timestamp(TimeUnit[_type['unit']] as any, _type['timezone']); } +function intervalFromJSON (_type: any) { return new Interval(IntervalUnit[_type['unit']] as any); } +function listFromJSON (_type: any, children: Field[]) { return new List(children); } +function structFromJSON (_type: any, children: Field[]) { return new Struct(children); } +function unionFromJSON (_type: any, children: Field[]) { return new Union(_type['mode'], (_type['typeIdsArray'] || []) as Type[], children); } +function fixedSizeBinaryFromJSON(_type: any) { return new FixedSizeBinary(_type['byteWidth']); } +function fixedSizeListFromJSON (_type: any, children: Field[]) { return new FixedSizeList(_type['listSize'], children); } +function mapFromJSON (_type: any, children: Field[]) { return new Map_(_type['keysSorted'], children); } diff --git a/js/src/ipc/reader/vector.ts b/js/src/ipc/reader/vector.ts new file mode 100644 index 00000000000..809069c6d98 --- /dev/null +++ b/js/src/ipc/reader/vector.ts @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
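A short sketch of why `int64DataFromJSON` above routes every value through `IntUtil.Int64.fromString` instead of shifting numbers: JavaScript numbers are IEEE doubles, so numerically extracting the low 32 bits of a large int64 can be wrong. The values below come from the comment in the reader itself:

import * as IntUtil from '../../util/int';

const lossy = -4613034156400212000 >>> 0; // 721782784 -- wrong low word
const exact = new Uint32Array(2);
IntUtil.Int64.fromString('-4613034156400212000', exact);
// exact[0] === 721782752 -- the correct lower 32 bits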
+
+import { Vector } from '../../vector';
+import { RecordBatch } from '../../recordbatch';
+import { TypeVisitor } from '../../visitor';
+import { FlatType, NestedType, ListType } from '../../type';
+import { Message, FieldMetadata, BufferMetadata } from '../metadata';
+import { FlatData, ListData, NestedData, SingleNestedData, DenseUnionData, SparseUnionData, BoolData, FlatListData, DictionaryData } from '../../data';
+import {
+    Schema, Field,
+    Dictionary,
+    Null, Int, Float,
+    Binary, Bool, Utf8, Decimal,
+    Date_, Time, Timestamp, Interval,
+    List, Struct, Union, FixedSizeBinary, FixedSizeList, Map_,
+    UnionMode, SparseUnion, DenseUnion, FlatListType, DataType,
+} from '../../type';
+
+export function* readRecordBatches(messages: Iterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) {
+    for (const { schema, message, loader } of messages) {
+        yield* readRecordBatch(schema, message, loader);
+    }
+}
+
+export async function* readRecordBatchesAsync(messages: AsyncIterable<{ schema: Schema, message: Message, loader: TypeDataLoader }>) {
+    for await (const { schema, message, loader } of messages) {
+        yield* readRecordBatch(schema, message, loader);
+    }
+}
+
+export function* readRecordBatch(schema: Schema, message: Message, loader: TypeDataLoader) {
+    if (Message.isRecordBatch(message)) {
+        yield new RecordBatch(schema, message.length, loader.visitFields(schema.fields));
+    } else if (Message.isDictionaryBatch(message)) {
+        const dictionaryId = message.id;
+        const dictionaries = loader.dictionaries;
+        const dictionaryField = schema.dictionaries.get(dictionaryId)!;
+        const dictionaryDataType = (dictionaryField.type as Dictionary).dictionary;
+        let dictionaryVector = Vector.create(loader.visit(dictionaryDataType));
+        if (message.isDelta && dictionaries.has(dictionaryId)) {
+            dictionaryVector = dictionaries.get(dictionaryId)!.concat(dictionaryVector);
+        }
+        dictionaries.set(dictionaryId, dictionaryVector);
+    }
+}
+
+export abstract class TypeDataLoader extends TypeVisitor {
+
+    public dictionaries: Map<number, Vector>;
+    protected nodes: Iterator<FieldMetadata>;
+    protected buffers: Iterator<BufferMetadata>;
+
+    constructor(nodes: Iterator<FieldMetadata>, buffers: Iterator<BufferMetadata>, dictionaries: Map<number, Vector>) {
+        super();
+        this.nodes = nodes;
+        this.buffers = buffers;
+        this.dictionaries = dictionaries;
+    }
+
+    public visitFields(fields: Field[]) { return fields.map((field) => this.visit(field.type)); }
+
+    public visitNull           (type: Null)            { return this.visitNullType(type); }
+    public visitInt            (type: Int)             { return this.visitFlatType(type); }
+    public visitFloat          (type: Float)           { return this.visitFlatType(type); }
+    public visitBinary         (type: Binary)          { return this.visitFlatList(type); }
+    public visitUtf8           (type: Utf8)            { return this.visitFlatList(type); }
+    public visitBool           (type: Bool)            { return this.visitBoolType(type); }
+    public visitDecimal        (type: Decimal)         { return this.visitFlatType(type); }
+    public visitDate           (type: Date_)           { return this.visitFlatType(type); }
+    public visitTime           (type: Time)            { return this.visitFlatType(type); }
+    public visitTimestamp      (type: Timestamp)       { return this.visitFlatType(type); }
+    public visitInterval       (type: Interval)        { return this.visitFlatType(type); }
+    public visitList           (type: List)            { return this.visitListType(type); }
+    public visitStruct         (type: Struct)          { return this.visitNestedType(type); }
+    public visitUnion          (type: Union)           { return this.visitUnionType(type); }
+    public visitFixedSizeBinary(type: FixedSizeBinary) { return this.visitFlatType(type); }
+    public visitFixedSizeList  (type: FixedSizeList)   { return this.visitFixedSizeListType(type); }
+    public visitMap            (type: Map_)            { return this.visitNestedType(type); }
+    public visitDictionary     (type: Dictionary)      {
+        return new DictionaryData(type, this.dictionaries.get(type.id)!, this.visit(type.indicies));
+    }
+    protected getFieldMetadata() { return this.nodes.next().value; }
+    protected getBufferMetadata() { return this.buffers.next().value; }
+    protected readNullBitmap<T extends DataType>(type: T, nullCount: number, buffer = this.getBufferMetadata()) {
+        return nullCount > 0 && this.readData(type, buffer) || new Uint8Array(0);
+    }
+    protected abstract readData<T extends DataType>(type: T, buffer?: BufferMetadata): any;
+    protected abstract readOffsets<T extends DataType>(type: T, buffer?: BufferMetadata): any;
+    protected abstract readTypeIds<T extends DataType>(type: T, buffer?: BufferMetadata): any;
+    protected visitNullType(type: Null, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return new FlatData(type, length, this.readNullBitmap(type, nullCount), new Uint8Array(0), 0, nullCount);
+    }
+    protected visitFlatType<T extends FlatType>(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return new FlatData(type, length, this.readNullBitmap(type, nullCount), this.readData(type), 0, nullCount);
+    }
+    protected visitBoolType(type: Bool, { length, nullCount }: FieldMetadata = this.getFieldMetadata(), data?: Uint8Array) {
+        return new BoolData(type, length, this.readNullBitmap(type, nullCount), data || this.readData(type), 0, nullCount);
+    }
+    protected visitFlatList<T extends FlatListType>(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return new FlatListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readData(type), 0, nullCount);
+    }
+    protected visitListType<T extends ListType>(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return new ListData(type, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.visit(type.children![0].type), 0, nullCount);
+    }
+    protected visitFixedSizeListType<T extends FixedSizeList>(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return new SingleNestedData(type, length, this.readNullBitmap(type, nullCount), this.visit(type.children![0].type), 0, nullCount);
+    }
+    protected visitNestedType<T extends NestedType>(type: T, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return new NestedData(type, length, this.readNullBitmap(type, nullCount), this.visitFields(type.children), 0, nullCount);
+    }
+    protected visitUnionType(type: DenseUnion | SparseUnion, { length, nullCount }: FieldMetadata = this.getFieldMetadata()) {
+        return type.mode === UnionMode.Sparse ?
+            new SparseUnionData(type as SparseUnion, length, this.readNullBitmap(type, nullCount), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount) :
+            new DenseUnionData(type as DenseUnion, length, this.readNullBitmap(type, nullCount), this.readOffsets(type), this.readTypeIds(type), this.visitFields(type.children), 0, nullCount);
+    }
+}
diff --git a/js/src/predicate.ts b/js/src/predicate.ts
index a80e56ee599..ab327ea9d72 100644
--- a/js/src/predicate.ts
+++ b/js/src/predicate.ts
@@ -15,22 +15,22 @@
 // specific language governing permissions and limitations
 // under the License.
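A hedged sketch of the delta-dictionary behavior in `readRecordBatch` above: a DictionaryBatch with `isDelta` set is concatenated onto the dictionary already registered under the same id rather than replacing it. The id and vectors below are hypothetical stand-ins:

import { Vector } from './vector';

declare const dictionaries: Map<number, Vector>;
declare const id: number;
declare const delta: Vector; // hypothetical vector loaded from a delta batch

dictionaries.set(id, dictionaries.has(id)
    ? dictionaries.get(id)!.concat(delta) // append the delta's entries
    : delta);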
-import { Vector } from './vector/vector'; -import { DictionaryVector } from './vector/dictionary'; +import { RecordBatch } from './recordbatch'; +import { Vector, DictionaryVector } from './vector'; -export type ValueFunc = (idx: number, cols: Vector[]) => T|null; -export type PredicateFunc = (idx: number, cols: Vector[]) => boolean; +export type ValueFunc = (idx: number, cols: RecordBatch) => T | null; +export type PredicateFunc = (idx: number, cols: RecordBatch) => boolean; export abstract class Value { - eq(other: Value|T): Predicate { + eq(other: Value | T): Predicate { if (!(other instanceof Value)) { other = new Literal(other); } return new Equals(this, other); } - lteq(other: Value|T): Predicate { + lteq(other: Value | T): Predicate { if (!(other instanceof Value)) { other = new Literal(other); } return new LTeq(this, other); } - gteq(other: Value|T): Predicate { + gteq(other: Value | T): Predicate { if (!(other instanceof Value)) { other = new Literal(other); } return new GTeq(this, other); } @@ -41,24 +41,27 @@ export class Literal extends Value { } export class Col extends Value { - vector: Vector; - colidx: number; + // @ts-ignore + public vector: Vector; + // @ts-ignore + public colidx: number; constructor(public name: string) { super(); } - bind(cols: Vector[]) { + bind(batch: RecordBatch) { if (!this.colidx) { // Assume column index doesn't change between calls to bind //this.colidx = cols.findIndex(v => v.name.indexOf(this.name) != -1); this.colidx = -1; - for (let idx = -1; ++idx < cols.length;) { - if (cols[idx].name === this.name) { + const fields = batch.schema.fields; + for (let idx = -1; ++idx < fields.length;) { + if (fields[idx].name === this.name) { this.colidx = idx; break; } } if (this.colidx < 0) { throw new Error(`Failed to bind Col "${this.name}"`); } } - this.vector = cols[this.colidx]; + this.vector = batch.getChildAt(this.colidx)!; return this.vector.get.bind(this.vector); } @@ -66,7 +69,7 @@ export class Col extends Value { } export abstract class Predicate { - abstract bind(cols: Vector[]): PredicateFunc; + abstract bind(batch: RecordBatch): PredicateFunc; and(expr: Predicate): Predicate { return new And(this, expr); } or(expr: Predicate): Predicate { return new Or(this, expr); } ands(): Predicate[] { return [this]; } @@ -77,26 +80,26 @@ export abstract class ComparisonPredicate extends Predicate { super(); } - bind(cols: Vector[]) { + bind(batch: RecordBatch) { if (this.left instanceof Literal) { if (this.right instanceof Literal) { - return this._bindLitLit(cols, this.left, this.right); + return this._bindLitLit(batch, this.left, this.right); } else { // right is a Col - return this._bindColLit(cols, this.right as Col, this.left); + return this._bindColLit(batch, this.right as Col, this.left); } } else { // left is a Col if (this.right instanceof Literal) { - return this._bindColLit(cols, this.left as Col, this.right); + return this._bindColLit(batch, this.left as Col, this.right); } else { // right is a Col - return this._bindColCol(cols, this.left as Col, this.right as Col); + return this._bindColCol(batch, this.left as Col, this.right as Col); } } } - protected abstract _bindLitLit(cols: Vector[], left: Literal, right: Literal): PredicateFunc; - protected abstract _bindColCol(cols: Vector[], left: Col , right: Col ): PredicateFunc; - protected abstract _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc; + protected abstract _bindLitLit(batch: RecordBatch, left: Literal, right: Literal): PredicateFunc; + protected abstract 
_bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc; + protected abstract _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc; } abstract class CombinationPredicate extends Predicate { @@ -106,49 +109,51 @@ abstract class CombinationPredicate extends Predicate { } class And extends CombinationPredicate { - bind(cols: Vector[]) { - const left = this.left.bind(cols); - const right = this.right.bind(cols); - return (idx: number, cols: Vector[]) => left(idx, cols) && right(idx, cols); + bind(batch: RecordBatch) { + const left = this.left.bind(batch); + const right = this.right.bind(batch); + return (idx: number, batch: RecordBatch) => left(idx, batch) && right(idx, batch); } ands(): Predicate[] { return this.left.ands().concat(this.right.ands()); } } class Or extends CombinationPredicate { - bind(cols: Vector[]) { - const left = this.left.bind(cols); - const right = this.right.bind(cols); - return (idx: number, cols: Vector[]) => left(idx, cols) || right(idx, cols); + bind(batch: RecordBatch) { + const left = this.left.bind(batch); + const right = this.right.bind(batch); + return (idx: number, batch: RecordBatch) => left(idx, batch) || right(idx, batch); } } export class Equals extends ComparisonPredicate { - protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v == right.v; return () => rtrn; } - protected _bindColCol(cols: Vector[], left: Col , right: Col ): PredicateFunc { - const left_func = left.bind(cols); - const right_func = right.bind(cols); - return (idx: number, cols: Vector[]) => left_func(idx, cols) == right_func(idx, cols); + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, batch: RecordBatch) => left_func(idx, batch) == right_func(idx, batch); } - protected _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc { - const col_func = col.bind(cols); + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); if (col.vector instanceof DictionaryVector) { // Assume that there is only one key with the value `lit.v` // TODO: add lazily-computed reverse dictionary lookups, associated // with col.vector.data so that we only have to do this once per // dictionary let key = -1; - for (; ++key < col.vector.data.length;) { - if (col.vector.data.get(key) === lit.v) { + let dict = col.vector; + let data = dict.dictionary!; + for (let len = data.length; ++key < len;) { + if (data.get(key) === lit.v) { break; } } - if (key == col.vector.data.length) { + if (key == data.length) { // the value doesn't exist in the dictionary - always return // false // TODO: special-case of PredicateFunc that encapsulates this @@ -157,48 +162,48 @@ export class Equals extends ComparisonPredicate { return () => false; } else { return (idx: number) => { - return (col.vector as DictionaryVector).getKey(idx) === key; + return dict.getKey(idx) === key; }; } } else { - return (idx: number, cols: Vector[]) => col_func(idx, cols) == lit.v; + return (idx: number, cols: RecordBatch) => col_func(idx, cols) == lit.v; } } } export class LTeq extends ComparisonPredicate { - protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): 
PredicateFunc { const rtrn: boolean = left.v <= right.v; return () => rtrn; } - protected _bindColCol(cols: Vector[], left: Col , right: Col ): PredicateFunc { - const left_func = left.bind(cols); - const right_func = right.bind(cols); - return (idx: number, cols: Vector[]) => left_func(idx, cols) <= right_func(idx, cols); + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, cols: RecordBatch) => left_func(idx, cols) <= right_func(idx, cols); } - protected _bindColLit(cols: Vector[], col: Col , lit: Literal ): PredicateFunc { - const col_func = col.bind(cols); - return (idx: number, cols: Vector[]) => col_func(idx, cols) <= lit.v; + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => col_func(idx, cols) <= lit.v; } } export class GTeq extends ComparisonPredicate { - protected _bindLitLit(_: Vector[], left: Literal, right: Literal): PredicateFunc { + protected _bindLitLit(_batch: RecordBatch, left: Literal, right: Literal): PredicateFunc { const rtrn: boolean = left.v >= right.v; return () => rtrn; } - protected _bindColCol(cols: Vector[], left: Col, right: Col): PredicateFunc { - const left_func = left.bind(cols); - const right_func = right.bind(cols); - return (idx: number, cols: Vector[]) => left_func(idx, cols) >= right_func(idx, cols); + protected _bindColCol(batch: RecordBatch, left: Col, right: Col): PredicateFunc { + const left_func = left.bind(batch); + const right_func = right.bind(batch); + return (idx: number, cols: RecordBatch) => left_func(idx, cols) >= right_func(idx, cols); } - protected _bindColLit(cols: Vector[], col: Col, lit: Literal): PredicateFunc { - const col_func = col.bind(cols); - return (idx: number, cols: Vector[]) => col_func(idx, cols) >= lit.v; + protected _bindColLit(batch: RecordBatch, col: Col, lit: Literal): PredicateFunc { + const col_func = col.bind(batch); + return (idx: number, cols: RecordBatch) => col_func(idx, cols) >= lit.v; } } diff --git a/js/src/reader/buffer.ts b/js/src/reader/buffer.ts deleted file mode 100644 index c7b90507e39..00000000000 --- a/js/src/reader/buffer.ts +++ /dev/null @@ -1,229 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
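The deleted js/src/reader/buffer.ts below carried the Arrow file/stream detection (the 'ARROW1' magic, footer length checks, message iteration); per the import changes in table.ts, that machinery now lives under js/src/ipc/reader, and the reader yields RecordBatch values directly instead of { schema, message, reader } triples. A sketch of the consumer-side API after the move, assuming `read`/`readAsync` accept raw bytes the same way Table.from does:

import { read, readAsync } from './ipc/reader/arrow';

declare const bytes: Uint8Array;                     // hypothetical file or stream contents
for (const recordBatch of read(bytes)) {
    console.log(recordBatch.numCols, recordBatch.length);
}
declare const byteChunks: AsyncIterable<Uint8Array>; // hypothetical async byte source
(async () => {
    for await (const recordBatch of readAsync(byteChunks)) {
        console.log(recordBatch.length);
    }
})();
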
- -import { flatbuffers } from 'flatbuffers'; -import { VectorLayoutReader } from './vector'; -import { TypedArray, TypedArrayConstructor } from '../vector/types'; -import { footerFromByteBuffer, messageFromByteBuffer } from '../format/fb'; -import { Footer, Schema, RecordBatch, DictionaryBatch, Field, Buffer, FieldNode } from '../format/arrow'; -import ByteBuffer = flatbuffers.ByteBuffer; - -export function* readBuffers(sources: Iterable) { - let schema: Schema | null = null; - let readMessages: ((bb: ByteBuffer) => IterableIterator) | null = null; - for (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, reader: new BufferVectorLayoutReader( - bb, - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } - } - } -} - -export async function* readBuffersAsync(sources: AsyncIterable) { - let schema: Schema | null = null; - let readMessages: ((bb: ByteBuffer) => IterableIterator) | null = null; - for await (const source of sources) { - const bb = toByteBuffer(source); - if ((!schema && ({ schema, readMessages } = readSchema(bb))) && schema && readMessages) { - for (const message of readMessages(bb)) { - yield { - schema, message, reader: new BufferVectorLayoutReader( - bb, - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } - } - } -} - -function toByteBuffer(bytes?: Uint8Array | NodeBuffer | string) { - let arr: Uint8Array = bytes as any || new Uint8Array(0); - if (typeof bytes === 'string') { - arr = new Uint8Array(bytes.length); - for (let i = -1, n = bytes.length; ++i < n;) { - arr[i] = bytes.charCodeAt(i); - } - return new ByteBuffer(arr); - } - return new ByteBuffer(arr); -} - -function readSchema(bb: ByteBuffer) { - let schema: Schema, readMessages, footer: Footer | null; - if (footer = readFileSchema(bb)) { - schema = footer.schema!; - readMessages = readFileMessages(footer); - } else if (schema = readStreamSchema(bb)!) 
{ - readMessages = readStreamMessages; - } else { - throw new Error('Invalid Arrow buffer'); - } - return { schema, readMessages }; -} - -const PADDING = 4; -const MAGIC_STR = 'ARROW1'; -const MAGIC = new Uint8Array(MAGIC_STR.length); -for (let i = 0; i < MAGIC_STR.length; i += 1 | 0) { - MAGIC[i] = MAGIC_STR.charCodeAt(i); -} - -function checkForMagicArrowString(buffer: Uint8Array, index = 0) { - for (let i = -1, n = MAGIC.length; ++i < n;) { - if (MAGIC[i] !== buffer[index + i]) { - return false; - } - } - return true; -} - -const magicLength = MAGIC.length; -const magicAndPadding = magicLength + PADDING; -const magicX2AndPadding = magicLength * 2 + PADDING; - -function readStreamSchema(bb: ByteBuffer) { - if (!checkForMagicArrowString(bb.bytes(), 0)) { - for (const message of readMessages(bb)) { - if (message.isSchema()) { - return message as Schema; - } - } - } - return null; -} - -function* readStreamMessages(bb: ByteBuffer) { - for (const message of readMessages(bb)) { - if (message.isRecordBatch()) { - yield message; - } else if (message.isDictionaryBatch()) { - yield message; - } else { - continue; - } - // position the buffer after the body to read the next message - bb.setPosition(bb.position() + message.bodyLength.low); - } -} - -function readFileSchema(bb: ByteBuffer) { - let fileLength = bb.capacity(), footerLength: number, footerOffset: number; - if ((fileLength < magicX2AndPadding /* Arrow buffer too small */) || - (!checkForMagicArrowString(bb.bytes(), 0) /* Missing magic start */) || - (!checkForMagicArrowString(bb.bytes(), fileLength - magicLength) /* Missing magic end */) || - (/* Invalid footer length */ - (footerLength = bb.readInt32(footerOffset = fileLength - magicAndPadding)) < 1 && - (footerLength + magicX2AndPadding > fileLength))) { - return null; - } - bb.setPosition(footerOffset - footerLength); - return footerFromByteBuffer(bb); -} - -function readFileMessages(footer: Footer) { - return function* (bb: ByteBuffer) { - for (let i = -1, batches = footer.dictionaryBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset.low); - yield readMessage(bb, bb.readInt32(bb.position())) as DictionaryBatch; - } - for (let i = -1, batches = footer.recordBatches, n = batches.length; ++i < n;) { - bb.setPosition(batches[i].offset.low); - yield readMessage(bb, bb.readInt32(bb.position())) as RecordBatch; - } - }; -} - -function* readMessages(bb: ByteBuffer) { - let length: number, message: Schema | RecordBatch | DictionaryBatch; - while (bb.position() < bb.capacity() && - (length = bb.readInt32(bb.position())) > 0) { - if (message = readMessage(bb, length)!) 
{ - yield message; - } - } -} - -function readMessage(bb: ByteBuffer, length: number) { - bb.setPosition(bb.position() + PADDING); - const message = messageFromByteBuffer(bb); - bb.setPosition(bb.position() + length); - return message; -} - -class BufferVectorLayoutReader implements VectorLayoutReader { - private offset: number; - private bytes: Uint8Array; - constructor(bb: ByteBuffer, private fieldNodes: Iterator, private buffers: Iterator) { - this.bytes = bb.bytes(); - this.offset = bb.position(); - } - readContainerLayout(field: Field) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value) - }; - } - readFixedWidthLayout(field: Field, dataType: TypedArrayConstructor) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), - data: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, dataType) - }; - } - readBinaryLayout(field: Field) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), - offsets: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Int32Array), - data: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Uint8Array) - }; - } - readVariableWidthLayout(field: Field) { - const { bytes, offset, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(bytes, field, fieldNode, offset, buffers.next().value), - offsets: createTypedArray(bytes, field, fieldNode, offset, buffers.next().value, Int32Array) - }; - } -} - -function createValidityArray(bytes: Uint8Array, field: Field, fieldNode: FieldNode, offset: number, buffer: Buffer) { - return field.nullable && fieldNode.nullCount.low > 0 && - createTypedArray(bytes, field, fieldNode, offset, buffer, Uint8Array) || null; -} - -function createTypedArray(bytes: Uint8Array, _field: Field, _fieldNode: FieldNode, offset: number, buffer: Buffer, ArrayConstructor: TypedArrayConstructor): T { - return new ArrayConstructor( - bytes.buffer, - bytes.byteOffset + offset + buffer.offset.low, - buffer.length.low / ArrayConstructor.BYTES_PER_ELEMENT - ); -} diff --git a/js/src/reader/json.ts b/js/src/reader/json.ts deleted file mode 100644 index 49431496354..00000000000 --- a/js/src/reader/json.ts +++ /dev/null @@ -1,176 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
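The deleted js/src/reader/json.ts below materialized the integration-test JSON format; its int64sFromJSON/decimalFromJSON helpers parse 64- and 128-bit values from strings because a float64 round-trip loses low-order bits. A small sketch of that string-based path, using the same Int64 helper from util/int that the deleted code calls:

import { Int64 } from './util/int';

// Parse the exact 64-bit value into a pair of 32-bit words
const words = new Uint32Array(2);
Int64.fromString('-4613034156400212000', words);
// A plain numeric round-trip is lossy here:
//   -4613034156400212000 >>> 0  ===  721782784
// while the correct low 32 bits are 721782752 (per the comment in int64sFromJSON)
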
- -import * as Schema_ from '../format/fb/Schema'; -import { Int64, Int128 } from '../util/int'; -import { VectorLayoutReader } from './vector'; -import { TextEncoder } from 'text-encoding-utf-8'; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import { TypedArray, TypedArrayConstructor } from '../vector/types'; -import { schemaFromJSON, recordBatchFromJSON, dictionaryBatchFromJSON } from '../format/json'; -import { Schema, RecordBatch, DictionaryBatch, Field, Buffer, FieldNode } from '../format/arrow'; -export { Schema, RecordBatch, DictionaryBatch }; - -export function* readJSON(json: any) { - const schema = schemaFromJSON(json['schema']); - for (const batch of (json['dictionaries'] || [])) { - const message = dictionaryBatchFromJSON(batch); - yield { - schema, message, reader: new JSONVectorLayoutReader( - flattenDataSources(batch['data']['columns']), - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } - for (const batch of (json['batches'] || [])) { - const message = recordBatchFromJSON(batch); - yield { - schema, message, reader: new JSONVectorLayoutReader( - flattenDataSources(batch['columns']), - (function* (fieldNodes) { yield* fieldNodes; })(message.fieldNodes), - (function* (buffers) { yield* buffers; })(message.buffers) - ) as VectorLayoutReader - }; - } -} - -function flattenDataSources(xs: any[]): any[][] { - return (xs || []).reduce((buffers, column: any) => [ - ...buffers, - ...(column['VALIDITY'] && [column['VALIDITY']] || []), - ...(column['OFFSET'] && [column['OFFSET']] || []), - ...(column['DATA'] && [column['DATA']] || []), - ...flattenDataSources(column['children']) - ], [] as any[][]); -} - -class JSONVectorLayoutReader implements VectorLayoutReader { - constructor(private sources: any[][], private fieldNodes: Iterator, private buffers: Iterator) {} - readContainerLayout(field: Field) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value) - }; - } - readFixedWidthLayout(field: Field, dataType: TypedArrayConstructor) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value), - data: createDataArray(sources, field, fieldNode, buffers.next().value, dataType) - }; - } - readBinaryLayout(field: Field) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value), - offsets: new Int32Array(sources[buffers.next().value.offset.low]), - data: createDataArray(sources, field, fieldNode, buffers.next().value, Uint8Array) - }; - } - readVariableWidthLayout(field: Field) { - const { sources, buffers } = this, fieldNode = this.fieldNodes.next().value; - return { - field, fieldNode, - validity: createValidityArray(sources, field, fieldNode, buffers.next().value), - offsets: new Int32Array(sources[buffers.next().value.offset.low]), - }; - } -} - -function createValidityArray(sources: any[][], field: Field, fieldNode: FieldNode, buffer: Buffer) { - return field.nullable && fieldNode.nullCount.low > 0 && - booleanFromJSON(sources[buffer.offset.low]) || null; -} - -const encoder = new TextEncoder('utf-8'); - -function createDataArray(sources: any[][], field: 
Field, _fieldNode: FieldNode, buffer: Buffer, ArrayConstructor: TypedArrayConstructor): T { - let type = field.type, data: ArrayLike | ArrayBufferLike; - if (type.isTimestamp() === true) { - data = int64sFromJSON(sources[buffer.offset.low] as string[]); - } else if ((type.isInt() || type.isTime()) && type.bitWidth === 64) { - data = int64sFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isDate() && type.unit === DateUnit.MILLISECOND) { - data = int64sFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isDecimal() === true) { - data = decimalFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isBinary() === true) { - data = binaryFromJSON(sources[buffer.offset.low] as string[]); - } else if (type.isBool() === true) { - data = booleanFromJSON(sources[buffer.offset.low] as number[]).buffer; - } else if (type.isUtf8() === true) { - data = encoder.encode((sources[buffer.offset.low] as string[]).join('')); - } else { - data = (sources[buffer.offset.low]).map((x) => +x); - } - return new ArrayConstructor(data); -} - -function int64sFromJSON(values: string[]) { - const data = new Uint32Array(values.length * 2); - for (let i = -1, n = values.length; ++i < n;) { - // Force all values (even numbers) to be parsed as strings since - // pulling out high and low bits seems to lose precision sometimes - // For example: - // > -4613034156400212000 >>> 0 - // 721782784 - // The correct lower 32-bits are 721782752 - Int64.fromString(values[i].toString(), new Uint32Array(data.buffer, data.byteOffset + 2 * i * 4, 2)); - } - return data.buffer; -} - -function decimalFromJSON(values: string[]) { - const data = new Uint32Array(values.length * 4); - for (let i = -1, n = values.length; ++i < n;) { - Int128.fromString(values[i], new Uint32Array(data.buffer, data.byteOffset + 4 * 4 * i, 4)); - } - return data.buffer; -} - -function binaryFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. - const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = parseInt(joined.substr(i, 2), 16); - } - return data.buffer; -} - -function booleanFromJSON(arr: number[]) { - let xs = [], n, i = 0; - let bit = 0, byte = 0; - for (const value of arr) { - value && (byte |= 1 << bit); - if (++bit === 8) { - xs[i++] = byte; - byte = bit = 0; - } - } - if (i === 0 || bit > 0) { xs[i++] = byte; } - if (i % 8 && (n = i + 8 - i % 8)) { - do { xs[i] = 0; } while (++i < n); - } - return new Uint8Array(xs); -} diff --git a/js/src/reader/vector.ts b/js/src/reader/vector.ts deleted file mode 100644 index 3bd6d2bb676..00000000000 --- a/js/src/reader/vector.ts +++ /dev/null @@ -1,255 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -import * as Schema_ from '../format/fb/Schema'; -import { TypedArray, TypedArrayConstructor } from '../vector/types'; -import { Schema, RecordBatch, DictionaryBatch, Field, FieldNode } from '../format/arrow'; -import { Int, Date, Time, Timestamp, Decimal, FixedSizeList, FixedSizeBinary, FloatingPoint } from '../format/arrow'; -import { - Vector, BoolVector, BinaryVector, DictionaryVector, - Int8Vector, Int16Vector, Int32Vector, Int64Vector, - Uint8Vector, Uint16Vector, Uint32Vector, Uint64Vector, - Utf8Vector, ListVector, FixedSizeListVector, StructVector, - Float16Vector, Float32Vector, Float64Vector, DecimalVector, - Date32Vector, Date64Vector, Time32Vector, Time64Vector, TimestampVector, -} from '../vector/arrow'; - -import Type = Schema_.org.apache.arrow.flatbuf.Type; -import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; -import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; -import Precision = Schema_.org.apache.arrow.flatbuf.Precision; -// import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; - -export interface ContainerLayout { - fieldNode: FieldNode; - validity: Uint8Array | null | void; -} - -export interface VariableWidthLayout { - fieldNode: FieldNode; - offsets: Int32Array; - validity: Uint8Array | null | void; -} - -export interface BinaryLayout extends FixedWidthLayout { - offsets: Int32Array; -} - -export interface FixedWidthLayout { - fieldNode: FieldNode; - data: T; - validity: Uint8Array | null | void; -} - -export function* readVectors(messages: Iterable<{ schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorLayoutReader }>) { - const dictionaries = new Map(); - for (const { schema, message, reader } of messages) { - yield* readMessageVectors(schema, message, new VectorReader(dictionaries, reader)); - } -} - -export async function* readVectorsAsync(messages: AsyncIterable<{ schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorLayoutReader }>) { - const dictionaries = new Map(); - for await (const { schema, message, reader } of messages) { - yield* readMessageVectors(schema, message, new VectorReader(dictionaries, reader)); - } -} - -function* readMessageVectors(schema: Schema, message: RecordBatch | DictionaryBatch, reader: VectorReader) { - if (message.isRecordBatch() === true) { - yield schema.fields.map((field) => reader.readVector(field)); - } else if (message.isDictionaryBatch()) { - let id = message.dictionaryId.toFloat64().toString(); - let vector = reader.readValueVector(schema.dictionaries.get(id)!); - if (message.isDelta) { - vector = reader.dictionaries.get(id)!.concat(vector); - } - reader.dictionaries.set(id, vector); - } -} - -export interface VectorLayoutReader { - readBinaryLayout(field: Field): BinaryLayout; - readContainerLayout(field: Field): ContainerLayout; - readVariableWidthLayout(field: Field): VariableWidthLayout; - readFixedWidthLayout(field: Field, TypedArrayConstructor: TypedArrayConstructor): FixedWidthLayout; -} - -export class VectorReader implements VectorLayoutReader { - constructor(public dictionaries: Map, protected layout: VectorLayoutReader) {} - readVector(field: Field): Vector { - return this.readDictionaryVector(field) || this.readValueVector(field); - } - readDictionaryVector(field: Field) { - const encoding = field.dictionary; - if (encoding) { - const keys = this.readIntVector(field.indexField()); - const data = 
this.dictionaries.get(encoding.dictionaryId.toFloat64().toString())!; - return new DictionaryVector({ - field, data, keys, - validity: (keys as any).validity, - fieldNode: (keys as any).fieldNode, - }); - } - return null; - } - readValueVector(field: Field) { - switch (field.typeType) { - case Type.NONE: return this.readNullVector(); - case Type.Null: return this.readNullVector(); - // case Type.Map: return this.readMapVector(field); - case Type.Int: return this.readIntVector(field); - case Type.Bool: return this.readBoolVector(field); - case Type.Date: return this.readDateVector(field); - case Type.List: return this.readListVector(field); - case Type.Utf8: return this.readUtf8Vector(field); - case Type.Time: return this.readTimeVector(field); - // case Type.Union: return this.readUnionVector(field); - case Type.Binary: return this.readBinaryVector(field); - case Type.Decimal: return this.readDecimalVector(field); - case Type.Struct_: return this.readStructVector(field); - case Type.FloatingPoint: return this.readFloatVector(field); - case Type.Timestamp: return this.readTimestampVector(field); - case Type.FixedSizeList: return this.readFixedSizeListVector(field); - case Type.FixedSizeBinary: return this.readFixedSizeBinaryVector(field); - } - throw new Error(`Unrecognized ${field.toString()}`); - } - readNullVector() { - return new Vector(); - } - readBoolVector(field: Field) { - return new BoolVector(this.readFixedWidthLayout(field, Uint8Array)); - } - readDateVector(field: Field) { - const type = field.type as Date; - switch (type.unit) { - case DateUnit.DAY: return new Date32Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: DateUnit[type.unit] }); - case DateUnit.MILLISECOND: return new Date64Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: DateUnit[type.unit] }); - } - throw new Error(`Unrecognized ${type.toString()}`); - } - readTimeVector(field: Field) { - const type = field.type as Time; - switch (type.bitWidth) { - case 32: return new Time32Vector({ ...this.readFixedWidthLayout(field, Int32Array), unit: TimeUnit[type.unit] }); - case 64: return new Time64Vector({ ...this.readFixedWidthLayout(field, Uint32Array), unit: TimeUnit[type.unit] }); - } - throw new Error(`Unrecognized ${type.toString()}`); - } - readTimestampVector(field: Field) { - const type = field.type as Timestamp; - const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint32Array); - return new TimestampVector({ - field, fieldNode, validity, data, - timezone: type.timezone!, - unit: TimeUnit[type.unit], - }); - } - readListVector(field: Field) { - const { fieldNode, validity, offsets } = this.readVariableWidthLayout(field); - return new ListVector({ - field, fieldNode, validity, offsets, - values: this.readVector(field.children[0]) - }); - } - readStructVector(field: Field) { - const { fieldNode, validity } = this.readContainerLayout(field); - return new StructVector({ - field, fieldNode, validity, - columns: field.children.map((field) => this.readVector(field)) - }); - } - readBinaryVector(field: Field) { - return new BinaryVector(this.readBinaryLayout(field)); - } - readDecimalVector(field: Field) { - const type = field.type as Decimal; - const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint32Array); - return new DecimalVector({ - scale: type.scale, - precision: type.precision, - field, fieldNode, validity, data - }); - } - readUtf8Vector(field: Field) { - const { fieldNode, validity, offsets, data } = this.readBinaryLayout(field); - return new 
Utf8Vector({ - field, fieldNode, - values: new BinaryVector({ - validity, offsets, data - }) - }); - } - readFixedSizeListVector(field: Field) { - const type = field.type as FixedSizeList; - const { fieldNode, validity } = this.readContainerLayout(field); - return new FixedSizeListVector({ - field, fieldNode, validity, - size: type.listSize, - values: this.readVector(field.children[0]) - }); - } - readFixedSizeBinaryVector(field: Field) { - const type = field.type as FixedSizeBinary; - const { fieldNode, validity, data } = this.readFixedWidthLayout(field, Uint8Array); - return new FixedSizeListVector({ - size: type.byteWidth, - field, fieldNode, validity, - values: new Uint8Vector({ data }) - }); - } - readFloatVector(field: Field) { - const type = field.type as FloatingPoint; - switch (type.precision) { - case Precision.HALF: return new Float16Vector(this.readFixedWidthLayout(field, Uint16Array)); - case Precision.SINGLE: return new Float32Vector(this.readFixedWidthLayout(field, Float32Array)); - case Precision.DOUBLE: return new Float64Vector(this.readFixedWidthLayout(field, Float64Array)); - } - throw new Error(`Unrecognized FloatingPoint { precision: ${type.precision} }`); - } - readIntVector(field: Field) { - const type = field.type as Int; - if (type.isSigned) { - switch (type.bitWidth) { - case 8: return new Int8Vector(this.readFixedWidthLayout(field, Int8Array)); - case 16: return new Int16Vector(this.readFixedWidthLayout(field, Int16Array)); - case 32: return new Int32Vector(this.readFixedWidthLayout(field, Int32Array)); - case 64: return new Int64Vector(this.readFixedWidthLayout(field, Int32Array)); - } - } - switch (type.bitWidth) { - case 8: return new Uint8Vector(this.readFixedWidthLayout(field, Uint8Array)); - case 16: return new Uint16Vector(this.readFixedWidthLayout(field, Uint16Array)); - case 32: return new Uint32Vector(this.readFixedWidthLayout(field, Uint32Array)); - case 64: return new Uint64Vector(this.readFixedWidthLayout(field, Uint32Array)); - } - throw new Error(`Unrecognized Int { isSigned: ${type.isSigned}, bitWidth: ${type.bitWidth} }`); - } - readContainerLayout(field: Field) { - return this.layout.readContainerLayout(field); - } - readBinaryLayout(field: Field) { - return this.layout.readBinaryLayout(field); - } - readVariableWidthLayout(field: Field) { - return this.layout.readVariableWidthLayout(field); - } - readFixedWidthLayout(field: Field, TypedArrayConstructor: TypedArrayConstructor) { - return this.layout.readFixedWidthLayout(field, TypedArrayConstructor); - } -} diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts new file mode 100644 index 00000000000..07d94a9d496 --- /dev/null +++ b/js/src/recordbatch.ts @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
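The new js/src/recordbatch.ts below models a RecordBatch as a StructVector over its column data, so row access, iteration, and slicing are inherited from the vector machinery. A short usage sketch; note that Schema.from names fields by column index, and getChildAt returns null when out of range ('xs' and 'ys' are hypothetical vectors):

import { RecordBatch } from './recordbatch';
import { Vector } from './vector';

declare const xs: Vector, ys: Vector;     // hypothetical column vectors
const batch = RecordBatch.from([xs, ys]); // Schema.from names the fields '0' and '1'
console.log(batch.numCols, batch.length); // 2, and the longest input length
const first = batch.select('0');          // a RecordBatch keeping only the first column
console.log(batch.getChildAt(5));         // null: out of range rather than a throw
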
+ +import { Schema, Struct, DataType } from './type'; +import { flatbuffers } from 'flatbuffers'; +import { View, Vector, StructVector } from './vector'; +import { Data, NestedData } from './data'; + +import Long = flatbuffers.Long; + +export class RecordBatch extends StructVector { + public static from(vectors: Vector[]) { + return new RecordBatch(Schema.from(vectors), + Math.max(...vectors.map((v) => v.length)), + vectors + ); + } + public readonly schema: Schema; + public readonly length: number; + public readonly numCols: number; + constructor(schema: Schema, data: Data, view: View); + constructor(schema: Schema, numRows: Long | number, cols: Data | Vector[]); + constructor(...args: any[]) { + if (typeof args[1] !== 'number') { + const data = args[1] as Data; + super(data, args[2]); + this.schema = args[0]; + this.length = data.length; + } else { + const [schema, numRows, cols] = args; + const childData: Data[] = new Array(cols.length); + for (let index = -1, length = cols.length; ++index < length;) { + const col: Data | Vector = cols[index]; + childData[index] = col instanceof Vector ? col.data : col; + } + super(new NestedData(new Struct(schema.fields), numRows, null, childData)); + this.schema = schema; + this.length = numRows; + } + this.numCols = this.schema.fields.length; + } + public clone(data: Data, view: View = this.view.clone(data)): this { + return new RecordBatch(this.schema, data as any, view) as any; + } + public getChildAt(index: number): Vector | null { + return index < 0 || index >= this.numCols ? null : super.getChildAt(index); + } + public select(...columnNames: string[]) { + const fields = this.schema.fields; + const namesToKeep = columnNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + return new RecordBatch( + this.schema.select(...columnNames), this.length, + this.childData.filter((_, i) => namesToKeep[fields[i].name]) + ); + } +} diff --git a/js/src/table.ts b/js/src/table.ts index 42073fb9e0b..193c947a22d 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -15,28 +15,15 @@ // specific language governing permissions and limitations // under the License. 
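The js/src/table.ts rewrite below stores the list of inner RecordBatches plus a batchesUnion that chunks them into one logical RecordBatch, so get/iterate/slice delegate to the chunked view and columns are materialized lazily in getColumnAt. A usage sketch with hypothetical inputs ('name' and 'kind' columns, `bytes`):

import { Table } from './table';

declare const bytes: Uint8Array;       // hypothetical Arrow IPC data
const table = Table.from(bytes);
const names = table.getColumn('name'); // one Vector, possibly chunked across batches
for (const row of table) { /* rows are delegated to batchesUnion */ }
// countBy supports only dictionary-encoded columns; its result is itself a Table
console.log(table.countBy('kind').toJSON());
console.log(table.select('name', 'kind').toString());
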
-import { Vector } from './vector/vector'; -import { DictionaryVector } from './vector/dictionary'; -import { Uint32Vector } from './vector/numeric'; -import { read, readAsync } from './reader/arrow'; +import { RecordBatch } from './recordbatch'; import { Col, Predicate } from './predicate'; +import { Schema, Field, Struct } from './type'; +import { read, readAsync } from './ipc/reader/arrow'; +import { isPromise, isAsyncIterable } from './util/compat'; +import { Vector, DictionaryVector, IntVector, StructVector } from './vector'; +import { ChunkedView } from './vector/chunked'; -export type NextFunc = (idx: number, cols: Vector[]) => void; - -export class TableRow { - constructor (readonly batch: Vector[], readonly idx: number) {} - toArray() { - return this.batch.map((vec) => vec.get(this.idx)); - } - toString() { - return this.toArray().map((x) => JSON.stringify(x)).join(', '); - } - *[Symbol.iterator]() { - for (const vec of this.batch) { - yield vec.get(this.idx); - } - } -} +export type NextFunc = (idx: number, cols: RecordBatch) => void; export interface DataFrame { filter(predicate: Predicate): DataFrame; @@ -45,150 +32,181 @@ export interface DataFrame { countBy(col: (Col|string)): CountByResult; } -function columnsFromBatches(batches: Vector[][]) { - const remaining = batches.slice(1); - return batches[0].map((vec, colidx) => - vec.concat(...remaining.map((batch) => batch[colidx])) - ); -} - export class Table implements DataFrame { + static empty() { return new Table(new Schema([]), []); } static from(sources?: Iterable | object | string) { - let batches: Vector[][] = []; if (sources) { - batches = []; - for (let batch of read(sources)) { - batches.push(batch); + let schema: Schema | undefined; + let recordBatches: RecordBatch[] = []; + for (let recordBatch of read(sources)) { + schema = schema || recordBatch.schema; + recordBatches.push(recordBatch); } + return new Table(schema || new Schema([]), recordBatches); } - return new Table({ batches }); + return Table.empty(); } static async fromAsync(sources?: AsyncIterable) { - let batches: Vector[][] = []; - if (sources) { - batches = []; - for await (let batch of readAsync(sources)) { - batches.push(batch); + if (isAsyncIterable(sources)) { + let schema: Schema | undefined; + let recordBatches: RecordBatch[] = []; + for await (let recordBatch of readAsync(sources)) { + schema = schema || recordBatch.schema; + recordBatches.push(recordBatch); } + return new Table(schema || new Schema([]), recordBatches); + } else if (isPromise(sources)) { + return Table.from(await sources); + } else if (sources) { + return Table.from(sources); } - return new Table({ batches }); + return Table.empty(); } - - // VirtualVector of each column, spanning batches - readonly columns: Vector[]; - - // List of batches, where each batch is a list of Vectors - readonly batches: Vector[][]; - readonly lengths: Uint32Array; - readonly length: number; - constructor(argv: { batches: Vector[][] }) { - this.batches = argv.batches; - this.columns = columnsFromBatches(this.batches); - this.lengths = new Uint32Array(this.batches.map((batch) => batch[0].length)); - - this.length = this.lengths.reduce((acc, length) => acc + length); + static fromStruct(struct: StructVector) { + const schema = new Schema(struct.type.children); + const chunks = struct.view instanceof ChunkedView ? 
+ (struct.view.chunkVectors as StructVector[]) : + [struct]; + return new Table(chunks.map((chunk) => new RecordBatch(schema, chunk.length, chunk.view.childData))); } - get(idx: number): TableRow { - let batch = 0; - while (idx >= this.lengths[batch] && batch < this.lengths.length) { - idx -= this.lengths[batch++]; - } - - if (batch === this.lengths.length) { throw new Error('Overflow'); } - return new TableRow(this.batches[batch], idx); + public readonly schema: Schema; + public readonly length: number; + public readonly numCols: number; + // List of inner RecordBatches + public readonly batches: RecordBatch[]; + // List of inner Vectors, possibly spanning batches + protected readonly _columns: Vector[] = []; + // Union of all inner RecordBatches into one RecordBatch, possibly chunked. + // If the Table has just one inner RecordBatch, this points to that. + // If the Table has multiple inner RecordBatches, then this is a Chunked view + // over the list of RecordBatches. This allows us to delegate the responsibility + // of indexing, iterating, slicing, and visiting to the Nested/Chunked Data/Views. + public readonly batchesUnion: RecordBatch; + + constructor(batches: RecordBatch[]); + constructor(...batches: RecordBatch[]); + constructor(schema: Schema, batches: RecordBatch[]); + constructor(schema: Schema, ...batches: RecordBatch[]); + constructor(...args: any[]) { + let schema: Schema; + let batches: RecordBatch[]; + if (args[0] instanceof Schema) { + schema = args[0]; + batches = Array.isArray(args[1][0]) ? args[1][0] : args[1]; + } else if (args[0] instanceof RecordBatch) { + schema = (batches = args)[0].schema; + } else { + schema = (batches = args[0])[0].schema; + } + this.schema = schema; + this.batches = batches; + this.batchesUnion = batches.length == 0 ? + new RecordBatch(schema, 0, []) : + batches.reduce((union, batch) => union.concat(batch)); + this.length = this.batchesUnion.length; + this.numCols = this.batchesUnion.numCols; } - filter(predicate: Predicate): DataFrame { - return new FilteredDataFrame(this, predicate); + public get(index: number): Struct['TValue'] { + return this.batchesUnion.get(index)!; } - scan(next: NextFunc) { - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - + public getColumn(name: string) { + return this.getColumnAt(this.getColumnIndex(name)); + } + public getColumnAt(index: number) { + return index < 0 || index >= this.numCols + ? 
null + : this._columns[index] || ( + this._columns[index] = this.batchesUnion.getChildAt(index)!); + } + public getColumnIndex(name: string) { + return this.schema.fields.findIndex((f) => f.name === name); + } + public [Symbol.iterator](): IterableIterator { + return this.batchesUnion[Symbol.iterator]() as any; + } + public filter(predicate: Predicate): DataFrame { + return new FilteredDataFrame(this.batches, predicate); + } + public scan(next: NextFunc) { + const batches = this.batches, numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = this.batches[batch]; - + const batch = batches[batchIndex]; // yield all indices - for (let idx = -1; ++idx < length;) { - next(idx, columns); + for (let index = -1, numRows = batch.length; ++index < numRows;) { + next(index, batch); } } } - count(): number { - return this.length; - } - countBy(count_by: (Col|string)): CountByResult { - if (!(count_by instanceof Col)) { - count_by = new Col(count_by); - } - + public count(): number { return this.length; } + public countBy(name: Col | string): CountByResult { + const batches = this.batches, numBatches = batches.length; + const count_by = typeof name === 'string' ? new Col(name) : name; // Assume that all dictionary batches are deltas, which means that the // last record batch has the most complete dictionary - count_by.bind(this.batches[this.batches.length - 1]); - if (!(count_by.vector instanceof DictionaryVector)) { + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as DictionaryVector; + if (!(vector instanceof DictionaryVector)) { throw new Error('countBy currently only supports dictionary-encoded columns'); } - - let data: Vector = (count_by.vector as DictionaryVector).data; // TODO: Adjust array byte width based on overall length // (e.g. if this.length <= 255 use Uint8Array, etc...) 
- let counts: Uint32Array = new Uint32Array(data.length); - - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - + const counts: Uint32Array = new Uint32Array(vector.dictionary.length); + for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = this.batches[batch]; - count_by.bind(columns); - const keys: Vector = (count_by.vector as DictionaryVector).keys; - + const batch = batches[batchIndex]; + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as DictionaryVector).indicies; // yield all indices - for (let idx = -1; ++idx < length;) { - let key = keys.get(idx); + for (let index = -1, numRows = batch.length; ++index < numRows;) { + let key = keys.get(index); if (key !== null) { counts[key]++; } } } - - return new CountByResult(data, new Uint32Vector({data: counts})); + return new CountByResult(vector.dictionary, IntVector.from(counts)); } - *[Symbol.iterator]() { - for (let batch = -1; ++batch < this.lengths.length;) { - const length = this.lengths[batch]; - - // load batches - const columns = this.batches[batch]; - - // yield all indices - for (let idx = -1; ++idx < length;) { - yield new TableRow(columns, idx); - } + public select(...columnNames: string[]) { + return new Table(this.batches.map((batch) => batch.select(...columnNames))); + } + public toString(separator?: string) { + let str = ''; + for (const row of this.rowsToString(separator)) { + str += row + '\n'; } + return str; + } + public rowsToString(separator = ' | '): TableToStringIterator { + return new TableToStringIterator(tableRowsToString(this, separator)); } } class FilteredDataFrame implements DataFrame { - constructor (readonly parent: Table, private predicate: Predicate) {} - - scan(next: NextFunc) { + private predicate: Predicate; + private batches: RecordBatch[]; + constructor (batches: RecordBatch[], predicate: Predicate) { + this.batches = batches; + this.predicate = predicate; + } + public scan(next: NextFunc) { // inlined version of this: // this.parent.scan((idx, columns) => { // if (this.predicate(idx, columns)) next(idx, columns); // }); - for (let batch = -1; ++batch < this.parent.lengths.length;) { - const length = this.parent.lengths[batch]; - + const batches = this.batches; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = this.parent.batches[batch]; - const predicate = this.predicate.bind(columns); - + const batch = batches[batchIndex]; + const predicate = this.predicate.bind(batch); // yield all indices - for (let idx = -1; ++idx < length;) { - if (predicate(idx, columns)) { next(idx, columns); } + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { next(index, batch); } } } } - - count(): number { + public count(): number { // inlined version of this: // let sum = 0; // this.parent.scan((idx, columns) => { @@ -196,77 +214,125 @@ class FilteredDataFrame implements DataFrame { // }); // return sum; let sum = 0; - for (let batch = -1; ++batch < this.parent.lengths.length;) { - const length = this.parent.lengths[batch]; - + const batches = this.batches; + const numBatches = batches.length; + for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = this.parent.batches[batch]; - const predicate = this.predicate.bind(columns); - + const batch = batches[batchIndex]; + const predicate = this.predicate.bind(batch); // yield all indices - for (let idx 
= -1; ++idx < length;) { - if (predicate(idx, columns)) { ++sum; } + for (let index = -1, numRows = batch.length; ++index < numRows;) { + if (predicate(index, batch)) { ++sum; } } } return sum; } - - filter(predicate: Predicate): DataFrame { + public filter(predicate: Predicate): DataFrame { return new FilteredDataFrame( - this.parent, + this.batches, this.predicate.and(predicate) ); } - - countBy(count_by: (Col|string)): CountByResult { - if (!(count_by instanceof Col)) { - count_by = new Col(count_by); - } - + public countBy(name: Col | string): CountByResult { + const batches = this.batches, numBatches = batches.length; + const count_by = typeof name === 'string' ? new Col(name) : name; // Assume that all dictionary batches are deltas, which means that the // last record batch has the most complete dictionary - count_by.bind(this.parent.batches[this.parent.batches.length - 1]); - if (!(count_by.vector instanceof DictionaryVector)) { + count_by.bind(batches[numBatches - 1]); + const vector = count_by.vector as DictionaryVector; + if (!(vector instanceof DictionaryVector)) { throw new Error('countBy currently only supports dictionary-encoded columns'); } - - const data: Vector = (count_by.vector as DictionaryVector).data; // TODO: Adjust array byte width based on overall length // (e.g. if this.length <= 255 use Uint8Array, etc...) - const counts: Uint32Array = new Uint32Array(data.length); - - for (let batch = -1; ++batch < this.parent.lengths.length;) { - const length = this.parent.lengths[batch]; - + const counts: Uint32Array = new Uint32Array(vector.dictionary.length); + for (let batchIndex = -1; ++batchIndex < numBatches;) { // load batches - const columns = this.parent.batches[batch]; - const predicate = this.predicate.bind(columns); - count_by.bind(columns); - const keys: Vector = (count_by.vector as DictionaryVector).keys; - + const batch = batches[batchIndex]; + const predicate = this.predicate.bind(batch); + // rebind the countBy Col + count_by.bind(batch); + const keys = (count_by.vector as DictionaryVector).indicies; // yield all indices - for (let idx = -1; ++idx < length;) { - let key = keys.get(idx); - if (key !== null && predicate(idx, columns)) { counts[key]++; } + for (let index = -1, numRows = batch.length; ++index < numRows;) { + let key = keys.get(index); + if (key !== null && predicate(index, batch)) { counts[key]++; } } } - - return new CountByResult(data, new Uint32Vector({data: counts})); + return new CountByResult(vector.dictionary, IntVector.from(counts)); } } export class CountByResult extends Table implements DataFrame { - constructor(readonly values: Vector, readonly counts: Vector) { - super({batches: [[values, counts]]}); + constructor(values: Vector, counts: IntVector) { + super( + new RecordBatch(new Schema([ + new Field('values', values.type), + new Field('counts', counts.type) + ]), + counts.length, [values, counts] + )); } - - toJSON(): Object { - let result: {[key: string]: number|null} = {}; - + public toJSON(): Object { + const values = this.getColumnAt(0)!; + const counts = this.getColumnAt(1)!; + const result = {} as { [k: string]: number | null }; for (let i = -1; ++i < this.length;) { - result[this.values.get(i)] = this.counts.get(i); + result[values.get(i)] = counts.get(i); } - return result; } } + +export class TableToStringIterator implements IterableIterator { + constructor(private iterator: IterableIterator) {} + [Symbol.iterator]() { return this.iterator; } + next(value?: any) { return this.iterator.next(value); } + throw(error?: any) { 
return this.iterator.throw && this.iterator.throw(error) || { done: true, value: '' }; } + return(value?: any) { return this.iterator.return && this.iterator.return(value) || { done: true, value: '' }; } + pipe(stream: NodeJS.WritableStream) { + let res: IteratorResult; + let write = () => { + if (stream.writable) { + do { + if ((res = this.next()).done) { break; } + } while (stream.write(res.value + '\n', 'utf8')); + } + if (!res || !res.done) { + stream.once('drain', write); + } else if (!(stream as any).isTTY) { + stream.end('\n'); + } + }; + write(); + } +} + +function* tableRowsToString(table: Table, separator = ' | ') { + const fields = table.schema.fields; + const header = ['row_id', ...fields.map((f) => `${f}`)].map(stringify); + const maxColumnWidths = header.map(x => x.length); + // Pass one to convert to strings and count max column widths + for (let i = -1, n = table.length - 1; ++i < n;) { + let val, row = [i, ...table.get(i)]; + for (let j = -1, k = row.length; ++j < k; ) { + val = stringify(row[j]); + maxColumnWidths[j] = Math.max(maxColumnWidths[j], val.length); + } + } + yield header.map((x, j) => leftPad(x, ' ', maxColumnWidths[j])).join(separator); + for (let i = -1, n = table.length; ++i < n;) { + yield [i, ...table.get(i)] + .map((x) => stringify(x)) + .map((x, j) => leftPad(x, ' ', maxColumnWidths[j])) + .join(separator); + } +} + +function leftPad(str: string, fill: string, n: number) { + return (new Array(n + 1).join(fill) + str).slice(-1 * n); +} + +function stringify(x: any) { + return typeof x === 'string' ? `"${x}"` : ArrayBuffer.isView(x) ? `[${x}]` : JSON.stringify(x); +} diff --git a/js/src/type.ts b/js/src/type.ts new file mode 100644 index 00000000000..6f382bd5b2b --- /dev/null +++ b/js/src/type.ts @@ -0,0 +1,578 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
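The new js/src/type.ts below introduces the logical type hierarchy: each DataType carries its Type enum tag in TType, static guards such as DataType.isDictionary narrow by that tag, and Schema/Field describe columns. A minimal construction sketch (Utf8 is one of the concrete DataType subclasses defined further down in the same file):

import { Schema, Field, Int32, Utf8, DataType } from './type';

const schema = new Schema([
    new Field('id', new Int32()),        // nullable defaults to false
    new Field('name', new Utf8(), true),
]);
console.log(`${schema.fields[0]}`);      // 'id: Int32', via Field#toString and Int#toString
console.log(DataType.isInt(schema.fields[0].type)); // true: guards narrow by TType
const onlyId = schema.select('id');      // a Schema containing just the 'id' field
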
+ +import * as Schema_ from './fb/Schema'; +import * as Message_ from './fb/Message'; +import { Vector, View } from './vector'; +import { flatbuffers } from 'flatbuffers'; +import { DictionaryBatch } from './ipc/metadata'; +import { TypeVisitor, VisitorNode } from './visitor'; + +export import Long = flatbuffers.Long; +export import ArrowType = Schema_.org.apache.arrow.flatbuf.Type; +export import DateUnit = Schema_.org.apache.arrow.flatbuf.DateUnit; +export import TimeUnit = Schema_.org.apache.arrow.flatbuf.TimeUnit; +export import Precision = Schema_.org.apache.arrow.flatbuf.Precision; +export import UnionMode = Schema_.org.apache.arrow.flatbuf.UnionMode; +export import VectorType = Schema_.org.apache.arrow.flatbuf.VectorType; +export import IntervalUnit = Schema_.org.apache.arrow.flatbuf.IntervalUnit; +export import MessageHeader = Message_.org.apache.arrow.flatbuf.MessageHeader; +export import MetadataVersion = Schema_.org.apache.arrow.flatbuf.MetadataVersion; + +export class Schema { + public static from(vectors: Vector[]) { + return new Schema(vectors.map((v, i) => new Field('' + i, v.type))); + } + // @ts-ignore + protected _bodyLength: number; + // @ts-ignore + protected _headerType: MessageHeader; + public readonly fields: Field[]; + public readonly version: MetadataVersion; + public readonly metadata?: Map; + public readonly dictionaries: Map>; + constructor(fields: Field[], + metadata?: Map, + version: MetadataVersion = MetadataVersion.V4, + dictionaries: Map> = new Map()) { + this.fields = fields; + this.version = version; + this.metadata = metadata; + this.dictionaries = dictionaries; + } + public get bodyLength() { return this._bodyLength; } + public get headerType() { return this._headerType; } + public select(...fieldNames: string[]): Schema { + const namesToKeep = fieldNames.reduce((xs, x) => (xs[x] = true) && xs, Object.create(null)); + const newDictFields = new Map(), newFields = this.fields.filter((f) => namesToKeep[f.name]); + this.dictionaries.forEach((f, dictId) => (namesToKeep[f.name]) && newDictFields.set(dictId, f)); + return new Schema(newFields, this.metadata, this.version, newDictFields); + } + public static [Symbol.toStringTag] = ((prototype: Schema) => { + prototype._bodyLength = 0; + prototype._headerType = MessageHeader.Schema; + return 'Schema'; + })(Schema.prototype); +} + +export class Field { + public readonly type: T; + public readonly name: string; + public readonly nullable: boolean; + public readonly metadata?: Map | null; + constructor(name: string, type: T, nullable = false, metadata?: Map | null) { + this.name = name; + this.type = type; + this.nullable = nullable; + this.metadata = metadata; + } + public toString() { return `${this.name}: ${this.type}`; } + public get typeId(): T['TType'] { return this.type.TType; } + public get [Symbol.toStringTag](): string { return 'Field'; } + public get indicies(): T | Int { + return DataType.isDictionary(this.type) ? 
this.type.indicies : this.type; + } +} + +export type TimeBitWidth = 32 | 64; +export type IntBitWidth = 8 | 16 | 32 | 64; + +export type NumericType = Int | Float | Date_ | Time | Interval | Timestamp; +export type FixedSizeType = Int64 | Uint64 | Decimal | FixedSizeBinary; +export type PrimitiveType = NumericType | FixedSizeType; + +export type FlatListType = Utf8 | Binary; // <-- these types have `offset`, `data`, and `validity` buffers +export type FlatType = Bool | PrimitiveType | FlatListType; // <-- these types have `data` and `validity` buffers +export type ListType = List; // <-- these types have `offset` and `validity` buffers +export type NestedType = Map_ | Struct | List | FixedSizeList | Union; // <-- these types have `validity` buffer and nested childData +export type SingleNestedType = List | FixedSizeList; // <-- these are nested types that can only have a single child + +/** + * * + * Main data type enumeration: + * * + * Data types in this library are all *logical*. They can be expressed as + * either a primitive physical type (bytes or bits of some fixed size), a + * nested type consisting of other data types, or another data type (e.g. a + * timestamp encoded as an int64) + */ + export enum Type { + NONE = 0, // The default placeholder type + Null = 1, // A NULL type having no physical storage + Int = 2, // Signed or unsigned 8, 16, 32, or 64-bit little-endian integer + Float = 3, // 2, 4, or 8-byte floating point value + Binary = 4, // Variable-length bytes (no guarantee of UTF8-ness) + Utf8 = 5, // UTF8 variable-length string as List + Bool = 6, // Boolean as 1 bit, LSB bit-packed ordering + Decimal = 7, // Precision-and-scale-based decimal type. Storage type depends on the parameters. + Date = 8, // int32_t days or int64_t milliseconds since the UNIX epoch + Time = 9, // Time as signed 32 or 64-bit integer, representing either seconds, milliseconds, microseconds, or nanoseconds since midnight + Timestamp = 10, // Exact timestamp encoded with int64 since UNIX epoch (Default unit millisecond) + Interval = 11, // YEAR_MONTH or DAY_TIME interval in SQL style + List = 12, // A list of some logical data type + Struct = 13, // Struct of logical types + Union = 14, // Union of logical types + FixedSizeBinary = 15, // Fixed-size binary. Each value occupies the same number of bytes + FixedSizeList = 16, // Fixed-size list.
Each value occupies the same number of bytes + Map = 17, // Map of named logical types + Dictionary = 'Dictionary', // Dictionary aka Category type + DenseUnion = 'DenseUnion', // Dense Union of logical types + SparseUnion = 'SparseUnion', // Sparse Union of logical types +} + +export interface DataType { + readonly TType: TType; + readonly TArray: any; + readonly TValue: any; + readonly ArrayType: any; +} + +export abstract class DataType implements Partial { + + // @ts-ignore + public [Symbol.toStringTag]: string; + + static isNull (x: DataType): x is Null { return x.TType === Type.Null; } + static isInt (x: DataType): x is Int { return x.TType === Type.Int; } + static isFloat (x: DataType): x is Float { return x.TType === Type.Float; } + static isBinary (x: DataType): x is Binary { return x.TType === Type.Binary; } + static isUtf8 (x: DataType): x is Utf8 { return x.TType === Type.Utf8; } + static isBool (x: DataType): x is Bool { return x.TType === Type.Bool; } + static isDecimal (x: DataType): x is Decimal { return x.TType === Type.Decimal; } + static isDate (x: DataType): x is Date_ { return x.TType === Type.Date; } + static isTime (x: DataType): x is Time { return x.TType === Type.Time; } + static isTimestamp (x: DataType): x is Timestamp { return x.TType === Type.Timestamp; } + static isInterval (x: DataType): x is Interval { return x.TType === Type.Interval; } + static isList (x: DataType): x is List { return x.TType === Type.List; } + static isStruct (x: DataType): x is Struct { return x.TType === Type.Struct; } + static isUnion (x: DataType): x is Union { return x.TType === Type.Union; } + static isDenseUnion (x: DataType): x is DenseUnion { return x.TType === Type.DenseUnion; } + static isSparseUnion (x: DataType): x is SparseUnion { return x.TType === Type.SparseUnion; } + static isFixedSizeBinary (x: DataType): x is FixedSizeBinary { return x.TType === Type.FixedSizeBinary; } + static isFixedSizeList (x: DataType): x is FixedSizeList { return x.TType === Type.FixedSizeList; } + static isMap (x: DataType): x is Map_ { return x.TType === Type.Map; } + static isDictionary (x: DataType): x is Dictionary { return x.TType === Type.Dictionary; } + + constructor(public readonly TType: TType, + public readonly children?: Field[]) {} + + acceptTypeVisitor(visitor: TypeVisitor): any { + switch (this.TType) { + case Type.Null: return DataType.isNull(this) && visitor.visitNull(this) || null; + case Type.Int: return DataType.isInt(this) && visitor.visitInt(this) || null; + case Type.Float: return DataType.isFloat(this) && visitor.visitFloat(this) || null; + case Type.Binary: return DataType.isBinary(this) && visitor.visitBinary(this) || null; + case Type.Utf8: return DataType.isUtf8(this) && visitor.visitUtf8(this) || null; + case Type.Bool: return DataType.isBool(this) && visitor.visitBool(this) || null; + case Type.Decimal: return DataType.isDecimal(this) && visitor.visitDecimal(this) || null; + case Type.Date: return DataType.isDate(this) && visitor.visitDate(this) || null; + case Type.Time: return DataType.isTime(this) && visitor.visitTime(this) || null; + case Type.Timestamp: return DataType.isTimestamp(this) && visitor.visitTimestamp(this) || null; + case Type.Interval: return DataType.isInterval(this) && visitor.visitInterval(this) || null; + case Type.List: return DataType.isList(this) && visitor.visitList(this) || null; + case Type.Struct: return DataType.isStruct(this) && visitor.visitStruct(this) || null; + case Type.Union: return DataType.isUnion(this) && 
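
The static is* methods are TypeScript type predicates, so a passing check also narrows the static type of its argument. A small sketch of dispatching on a DataType without the visitor machinery:

import { DataType } from './type';

function describe(type: DataType): string {
    if (DataType.isInt(type)) {
        // Narrowed to Int here: bitWidth and isSigned are visible.
        return `${type.isSigned ? 'signed' : 'unsigned'} ${type.bitWidth}-bit integer`;
    }
    if (DataType.isList(type)) {
        return `list of ${describe(type.valueType)}`;
    }
    return String(type); // falls back to each type's toString()
}
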
visitor.visitUnion(this) || null; + case Type.FixedSizeBinary: return DataType.isFixedSizeBinary(this) && visitor.visitFixedSizeBinary(this) || null; + case Type.FixedSizeList: return DataType.isFixedSizeList(this) && visitor.visitFixedSizeList(this) || null; + case Type.Map: return DataType.isMap(this) && visitor.visitMap(this) || null; + case Type.Dictionary: return DataType.isDictionary(this) && visitor.visitDictionary(this) || null; + default: return null; + } + } + protected static [Symbol.toStringTag] = ((proto: DataType) => { + ( proto).ArrayType = Array; + return proto[Symbol.toStringTag] = 'DataType'; + })(DataType.prototype); +} + +export interface Null extends DataType { TArray: void; TValue: null; } +export class Null extends DataType { + constructor() { + super(Type.Null); + } + public toString() { return `Null`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitNull(this); + } + protected static [Symbol.toStringTag] = ((proto: Null) => { + return proto[Symbol.toStringTag] = 'Null'; + })(Null.prototype); +} + +export interface Int extends DataType { TArray: TArrayType; TValue: TValueType; } +export class Int extends DataType { + constructor(public readonly isSigned: boolean, + public readonly bitWidth: IntBitWidth) { + super(Type.Int); + } + public get ArrayType(): TypedArrayConstructor { + switch (this.bitWidth) { + case 8: return (this.isSigned ? Int8Array : Uint8Array) as any; + case 16: return (this.isSigned ? Int16Array : Uint16Array) as any; + case 32: return (this.isSigned ? Int32Array : Uint32Array) as any; + case 64: return (this.isSigned ? Int32Array : Uint32Array) as any; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } + public toString() { return `${this.isSigned ? `I` : `Ui`}nt${this.bitWidth}`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitInt(this); } + protected static [Symbol.toStringTag] = ((proto: Int) => { + return proto[Symbol.toStringTag] = 'Int'; + })(Int.prototype); +} + +export class Int8 extends Int { constructor() { super(true, 8); } } +export class Int16 extends Int { constructor() { super(true, 16); } } +export class Int32 extends Int { constructor() { super(true, 32); } } +export class Int64 extends Int { constructor() { super(true, 64); } } +export class Uint8 extends Int { constructor() { super(false, 8); } } +export class Uint16 extends Int { constructor() { super(false, 16); } } +export class Uint32 extends Int { constructor() { super(false, 32); } } +export class Uint64 extends Int { constructor() { super(false, 64); } } + +export interface Float extends DataType { TArray: TArrayType; TValue: number; } +export class Float extends DataType { + constructor(public readonly precision: Precision) { + super(Type.Float); + } + // @ts-ignore + public get ArrayType(): TypedArrayConstructor { + switch (this.precision) { + case Precision.HALF: return Uint16Array as any; + case Precision.SINGLE: return Float32Array as any; + case Precision.DOUBLE: return Float64Array as any; + } + throw new Error(`Unrecognized ${this[Symbol.toStringTag]} type`); + } + public toString() { return `Float${(this.precision << 5) || 16}`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFloat(this); } + protected static [Symbol.toStringTag] = ((proto: Float) => { + return proto[Symbol.toStringTag] = 'Float'; + })(Float.prototype); +} + +export class Float16 extends Float { constructor() { super(Precision.HALF); } } +export class Float32 extends Float { 
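
One subtlety worth calling out: JavaScript has no 64-bit typed array here, so Int64 and Uint64 reuse the 32-bit constructors and store each value as a (low, high) pair. A sketch:

import { Int32, Int64 } from './type';

console.log(new Int32().ArrayType === Int32Array); // true
console.log(new Int64().ArrayType === Int32Array); // true: two Int32 lanes per value
console.log(new Int64().toString());               // 'Int64'
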
constructor() { super(Precision.SINGLE); } } +export class Float64 extends Float { constructor() { super(Precision.DOUBLE); } } + +export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } +export class Binary extends DataType { + constructor() { + super(Type.Binary); + } + public toString() { return `Binary`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitBinary(this); + } + protected static [Symbol.toStringTag] = ((proto: Binary) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Binary'; + })(Binary.prototype); +} + +export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; } +export class Utf8 extends DataType { + constructor() { + super(Type.Utf8); + } + public toString() { return `Utf8`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitUtf8(this); + } + protected static [Symbol.toStringTag] = ((proto: Utf8) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8'; + })(Utf8.prototype); +} + +export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; } +export class Bool extends DataType { + constructor() { + super(Type.Bool); + } + public toString() { return `Bool`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitBool(this); + } + protected static [Symbol.toStringTag] = ((proto: Bool) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Bool'; + })(Bool.prototype); +} + +export interface Decimal extends DataType { TArray: Uint32Array; TValue: Uint32Array; } +export class Decimal extends DataType { + constructor(public readonly scale: number, + public readonly precision: number) { + super(Type.Decimal); + } + public toString() { return `Decimal[${this.precision}e${this.scale > 0 ? 
`+` : ``}${this.scale}]`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDecimal(this); + } + protected static [Symbol.toStringTag] = ((proto: Decimal) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Decimal'; + })(Decimal.prototype); +} + +/* tslint:disable:class-name */ +export interface Date_ extends DataType { TArray: Int32Array; TValue: Date; } +export class Date_ extends DataType { + constructor(public readonly unit: DateUnit) { + super(Type.Date); + } + public toString() { return `Date${(this.unit + 1) * 32}<${DateUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDate(this); + } + protected static [Symbol.toStringTag] = ((proto: Date_) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Date'; + })(Date_.prototype); +} + +export interface Time extends DataType { TArray: Uint32Array; TValue: number; } +export class Time extends DataType { + constructor(public readonly unit: TimeUnit, + public readonly bitWidth: TimeBitWidth) { + super(Type.Time); + } + public toString() { return `Time${this.bitWidth}<${TimeUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitTime(this); + } + protected static [Symbol.toStringTag] = ((proto: Time) => { + ( proto).ArrayType = Uint32Array; + return proto[Symbol.toStringTag] = 'Time'; + })(Time.prototype); +} + +export interface Timestamp extends DataType { TArray: Int32Array; TValue: number; } +export class Timestamp extends DataType { + constructor(public unit: TimeUnit, public timezone?: string | null) { + super(Type.Timestamp); + } + public toString() { return `Timestamp<${TimeUnit[this.unit]}${this.timezone ? `, ${this.timezone}` : ``}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitTimestamp(this); + } + protected static [Symbol.toStringTag] = ((proto: Timestamp) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Timestamp'; + })(Timestamp.prototype); +} + +export interface Interval extends DataType { TArray: Int32Array; TValue: Int32Array; } +export class Interval extends DataType { + constructor(public unit: IntervalUnit) { + super(Type.Interval); + } + public toString() { return `Interval<${IntervalUnit[this.unit]}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitInterval(this); + } + protected static [Symbol.toStringTag] = ((proto: Interval) => { + ( proto).ArrayType = Int32Array; + return proto[Symbol.toStringTag] = 'Interval'; + })(Interval.prototype); +} + +export interface List extends DataType { TArray: any; TValue: Vector; } +export class List extends DataType { + constructor(public children: Field[]) { + super(Type.List, children); + } + public toString() { return `List<${this.valueType}>`; } + public get ArrayType() { return this.valueType.ArrayType; } + public get valueType() { return this.children[0].type as T; } + public get valueField() { return this.children[0] as Field; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitList(this); + } + protected static [Symbol.toStringTag] = ((proto: List) => { + return proto[Symbol.toStringTag] = 'List'; + })(List.prototype); +} + +export interface Struct extends DataType { TArray: any; TValue: View; } +export class Struct extends DataType { + constructor(public children: Field[]) { + super(Type.Struct, children); + } + public toString() { return `Struct<${this.children.map((f) => 
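
Nested types take their element types from child Fields; a sketch of composing a List and a Struct, and what their toString() renders:

import { Field, List, Struct, Float32 } from './type';

const list = new List([new Field('item', new Float32())]);
console.log(list.toString());  // 'List<Float32>'

const point = new Struct([
    new Field('x', new Float32()),
    new Field('y', new Float32()),
]);
console.log(point.toString()); // 'Struct<Float32, Float32>'
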
f.type).join(`, `)}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitStruct(this); + } + protected static [Symbol.toStringTag] = ((proto: Struct) => { + return proto[Symbol.toStringTag] = 'Struct'; + })(Struct.prototype); +} + +export interface Union extends DataType { TArray: Int8Array; TValue: any; } +export class Union extends DataType { + constructor(public readonly mode: UnionMode, + public readonly typeIds: ArrowType[], + public readonly children: Field[]) { + super( (mode === UnionMode.Sparse ? Type.SparseUnion : Type.DenseUnion), children); + } + public toString() { return `${this[Symbol.toStringTag]}<${this.typeIds.map((x) => Type[x]).join(` | `)}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitUnion(this); } + protected static [Symbol.toStringTag] = ((proto: Union) => { + ( proto).ArrayType = Int8Array; + return proto[Symbol.toStringTag] = 'Union'; + })(Union.prototype); +} + +export class DenseUnion extends Union { + constructor(typeIds: ArrowType[], children: Field[]) { + super(UnionMode.Dense, typeIds, children); + } + protected static [Symbol.toStringTag] = ((proto: DenseUnion) => { + return proto[Symbol.toStringTag] = 'DenseUnion'; + })(DenseUnion.prototype); +} + +export class SparseUnion extends Union { + constructor(typeIds: ArrowType[], children: Field[]) { + super(UnionMode.Sparse, typeIds, children); + } + protected static [Symbol.toStringTag] = ((proto: SparseUnion) => { + return proto[Symbol.toStringTag] = 'SparseUnion'; + })(SparseUnion.prototype); +} + +export interface FixedSizeBinary extends DataType { TArray: Uint8Array; TValue: Uint8Array; } +export class FixedSizeBinary extends DataType { + constructor(public readonly byteWidth: number) { + super(Type.FixedSizeBinary); + } + public toString() { return `FixedSizeBinary[${this.byteWidth}]`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFixedSizeBinary(this); } + protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { + ( proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'FixedSizeBinary'; + })(FixedSizeBinary.prototype); +} + +export interface FixedSizeList extends DataType { TArray: any; TValue: Vector; } +export class FixedSizeList extends DataType { + constructor(public readonly listSize: number, + public readonly children: Field[]) { + super(Type.FixedSizeList, children); + } + public get ArrayType() { return this.valueType.ArrayType; } + public get valueType() { return this.children[0].type as T; } + public get valueField() { return this.children[0] as Field; } + public toString() { return `FixedSizeList[${this.listSize}]<${this.valueType}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitFixedSizeList(this); } + protected static [Symbol.toStringTag] = ((proto: FixedSizeList) => { + return proto[Symbol.toStringTag] = 'FixedSizeList'; + })(FixedSizeList.prototype); +} + +/* tslint:disable:class-name */ +export interface Map_ extends DataType { TArray: Uint8Array; TValue: View; } +export class Map_ extends DataType { + constructor(public readonly keysSorted: boolean, + public readonly children: Field[]) { + super(Type.Map, children); + } + public toString() { return `Map<${this.children.join(`, `)}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { return visitor.visitMap(this); } + protected static [Symbol.toStringTag] = ((proto: Map_) => { + return proto[Symbol.toStringTag] = 'Map_'; + })(Map_.prototype); +} + +export interface 
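
FixedSizeList pins the element count per slot, which suits fixed-width values such as coordinates; a sketch:

import { Field, FixedSizeList, Float32 } from './type';

// Three floats per row, e.g. an (x, y, z) coordinate.
const xyz = new FixedSizeList(3, [new Field('item', new Float32())]);
console.log(xyz.toString()); // 'FixedSizeList[3]<Float32>'
console.log(xyz.listSize);   // 3
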
Dictionary extends DataType { TArray: T['TArray']; TValue: T['TValue']; } +export class Dictionary extends DataType { + public readonly id: number; + public readonly dictionary: T; + public readonly indicies: Int; + public readonly isOrdered: boolean; + constructor(dictionary: T, indicies: Int, id?: Long | number | null, isOrdered?: boolean | null) { + super(Type.Dictionary); + this.indicies = indicies; + this.dictionary = dictionary; + this.isOrdered = isOrdered || false; + this.id = id == null ? DictionaryBatch.getId() : typeof id === 'number' ? id : id.low; + } + public get ArrayType() { return this.dictionary.ArrayType; } + public toString() { return `Dictionary<${this.dictionary}, ${this.indicies}>`; } + public acceptTypeVisitor(visitor: TypeVisitor): any { + return visitor.visitDictionary(this); + } + protected static [Symbol.toStringTag] = ((proto: Dictionary) => { + return proto[Symbol.toStringTag] = 'Dictionary'; + })(Dictionary.prototype); +} +export interface IterableArrayLike extends ArrayLike, Iterable {} + +export interface TypedArrayConstructor { + readonly prototype: T; + readonly BYTES_PER_ELEMENT: number; + new (length: number): T; + new (elements: Iterable): T; + new (arrayOrArrayBuffer: ArrayLike | ArrayBufferLike): T; + new (buffer: ArrayBufferLike, byteOffset: number, length?: number): T; + of(...items: number[]): T; + from(arrayLike: ArrayLike | Iterable, mapfn?: (v: number, k: number) => number, thisArg?: any): T; +} + +export type FloatArray = Uint16Array | Float32Array | Float64Array; +export type IntArray = Int8Array | Int16Array | Int32Array | Uint8Array | Uint16Array | Uint32Array; + +export interface TypedArray extends Iterable { + [index: number]: number; + readonly length: number; + readonly byteLength: number; + readonly byteOffset: number; + readonly buffer: ArrayBufferLike; + readonly BYTES_PER_ELEMENT: number; + [Symbol.toStringTag]: any; + [Symbol.iterator](): IterableIterator; + entries(): IterableIterator<[number, number]>; + keys(): IterableIterator; + values(): IterableIterator; + copyWithin(target: number, start: number, end?: number): this; + every(callbackfn: (value: number, index: number, array: TypedArray) => boolean, thisArg?: any): boolean; + fill(value: number, start?: number, end?: number): this; + filter(callbackfn: (value: number, index: number, array: TypedArray) => any, thisArg?: any): TypedArray; + find(predicate: (value: number, index: number, obj: TypedArray) => boolean, thisArg?: any): number | undefined; + findIndex(predicate: (value: number, index: number, obj: TypedArray) => boolean, thisArg?: any): number; + forEach(callbackfn: (value: number, index: number, array: TypedArray) => void, thisArg?: any): void; + includes(searchElement: number, fromIndex?: number): boolean; + indexOf(searchElement: number, fromIndex?: number): number; + join(separator?: string): string; + lastIndexOf(searchElement: number, fromIndex?: number): number; + map(callbackfn: (value: number, index: number, array: TypedArray) => number, thisArg?: any): TypedArray; + reduce(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number): number; + reduce(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number, initialValue: number): number; + reduce(callbackfn: (previousValue: U, currentValue: number, currentIndex: number, array: TypedArray) => U, initialValue: U): U; + reduceRight(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, 
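
A sketch of a dictionary-encoded column type: the values live in the dictionary type, the integer codes are described by the second argument (spelled `indicies` throughout this codebase), and ArrayType delegates to the dictionary's:

import { Dictionary, Utf8, Int8 } from './type';

// An explicit id avoids consuming DictionaryBatch.getId().
const dict = new Dictionary(new Utf8(), new Int8(), 0);
console.log(dict.toString());               // 'Dictionary<Utf8, Int8>'
console.log(dict.ArrayType === Uint8Array); // true (Utf8's ArrayType)
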
array: TypedArray) => number): number; + reduceRight(callbackfn: (previousValue: number, currentValue: number, currentIndex: number, array: TypedArray) => number, initialValue: number): number; + reduceRight(callbackfn: (previousValue: U, currentValue: number, currentIndex: number, array: TypedArray) => U, initialValue: U): U; + reverse(): TypedArray; + set(array: ArrayLike, offset?: number): void; + slice(start?: number, end?: number): TypedArray; + some(callbackfn: (value: number, index: number, array: TypedArray) => boolean, thisArg?: any): boolean; + sort(compareFn?: (a: number, b: number) => number): this; + subarray(begin: number, end?: number): TypedArray; + toLocaleString(): string; + toString(): string; +} diff --git a/js/src/util/bit.ts b/js/src/util/bit.ts new file mode 100644 index 00000000000..2308bf6a2e0 --- /dev/null +++ b/js/src/util/bit.ts @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { TypedArray } from '../type'; + +export function align(value: number, alignment: number) { + return value + padding(value, alignment); +} + +export function padding(value: number, alignment: number) { + return (value % alignment === 0 ? 0 : alignment - value % alignment); +} + +export function getBool(_data: any, _index: number, byte: number, bit: number) { + return (byte & 1 << bit) !== 0; +} + +export function getBit(_data: any, _index: number, byte: number, bit: number): 0 | 1 { + return (byte & 1 << bit) >> bit as (0 | 1); +} + +export function setBool(bytes: Uint8Array, index: number, value: any) { + return value ? + !!(bytes[index >> 3] |= (1 << (index % 8))) || true : + !(bytes[index >> 3] &= ~(1 << (index % 8))) && false ; +} + +export function packBools(values: Iterable) { + let n = 0, i = 0; + let xs: number[] = []; + let bit = 0, byte = 0; + for (const value of values) { + value && (byte |= 1 << bit); + if (++bit === 8) { + xs[i++] = byte; + byte = bit = 0; + } + } + if (i === 0 || bit > 0) { xs[i++] = byte; } + if (i % 8 && (n = i + 8 - i % 8)) { + do { xs[i] = 0; } while (++i < n); + } + return new Uint8Array(xs); +} + +export function* iterateBits(bytes: Uint8Array, begin: number, length: number, context: any, + get: (context: any, index: number, byte: number, bit: number) => T) { + let bit = begin % 8; + let byteIndex = begin >> 3; + let index = 0, remaining = length; + for (; remaining > 0; bit = 0) { + let byte = bytes[byteIndex++]; + do { + yield get(context, index++, byte, bit); + } while (--remaining > 0 && ++bit < 8); + } +} + +/** + * Compute the population count (the number of bits set to 1) for a range of bits in a Uint8Array. + * @param vector The Uint8Array of bits for which to compute the population count. 
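
A round-trip sketch of the bit utilities above: packBools packs an iterable of booleans LSB-first into a zero-padded byte array, and iterateBits walks them back out through a getter such as getBool:

import { packBools, iterateBits, getBool } from './util/bit';

const bytes = packBools([true, false, true, true]);
console.log(bytes[0]);     // 13 (0b00001101)
console.log(bytes.length); // 8  (padded out to a multiple of 8 bytes)

console.log([...iterateBits(bytes, 0, 4, null, getBool)]);
// [ true, false, true, true ]
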
+ * @param lhs The range's left-hand side (or start) bit + * @param rhs The range's right-hand side (or end) bit + */ +export function popcnt_bit_range(data: Uint8Array, lhs: number, rhs: number): number { + if (rhs - lhs <= 0) { return 0; } + // If the bit range is less than one byte, sum the 1 bits in the bit range + if (rhs - lhs < 8) { + let sum = 0; + for (const bit of iterateBits(data, lhs, rhs - lhs, data, getBit)) { + sum += bit; + } + return sum; + } + // Get the next lowest multiple of 8 from the right hand side + const rhsInside = rhs >> 3 << 3; + // Get the next highest multiple of 8 from the left hand side + const lhsInside = lhs + (lhs % 8 === 0 ? 0 : 8 - lhs % 8); + return ( + // Get the popcnt of bits between the left hand side, and the next highest multiple of 8 + popcnt_bit_range(data, lhs, lhsInside) + + // Get the popcnt of bits between the right hand side, and the next lowest multiple of 8 + popcnt_bit_range(data, rhsInside, rhs) + + // Get the popcnt of all bits between the left and right hand sides' multiples of 8 + popcnt_array(data, lhsInside >> 3, (rhsInside - lhsInside) >> 3) + ); +} + +export function popcnt_array(arr: TypedArray, byteOffset?: number, byteLength?: number) { + let cnt = 0, pos = byteOffset! | 0; + const view = new DataView(arr.buffer, arr.byteOffset, arr.byteLength); + const len = byteLength === void 0 ? arr.byteLength : pos + byteLength; + while (len - pos >= 4) { + cnt += popcnt_uint32(view.getUint32(pos)); + pos += 4; + } + while (len - pos >= 2) { + cnt += popcnt_uint32(view.getUint16(pos)); + pos += 2; + } + while (len - pos >= 1) { + cnt += popcnt_uint32(view.getUint8(pos)); + pos += 1; + } + return cnt; +} + +export function popcnt_uint32(uint32: number): number { + let i = uint32 | 0; + i = i - ((i >>> 1) & 0x55555555); + i = (i & 0x33333333) + ((i >>> 2) & 0x33333333); + return (((i + (i >>> 4)) & 0x0F0F0F0F) * 0x01010101) >>> 24; +} diff --git a/js/src/util/compat.ts b/js/src/util/compat.ts new file mode 100644 index 00000000000..7a4232ee8c3 --- /dev/null +++ b/js/src/util/compat.ts @@ -0,0 +1,49 @@ +export interface Subscription { + unsubscribe: () => void; +} + +export interface Observer { + closed?: boolean; + next: (value: T) => void; + error: (err: any) => void; + complete: () => void; +} + +export interface Observable { + subscribe: (observer: Observer) => Subscription; +} + +/** + * @ignore + */ +export function isPromise(x: any): x is PromiseLike { + return x != null && Object(x) === x && typeof x['then'] === 'function'; +} + +/** + * @ignore + */ +export function isObservable(x: any): x is Observable { + return x != null && Object(x) === x && typeof x['subscribe'] === 'function'; +} + +/** + * @ignore + */ +export function isArrayLike(x: any): x is ArrayLike { + return x != null && Object(x) === x && typeof x['length'] === 'number'; +} + +/** + * @ignore + */ +export function isIterable(x: any): x is Iterable { + return x != null && Object(x) === x && typeof x[Symbol.iterator] !== 'undefined'; +} + +/** + * @ignore + */ +export function isAsyncIterable(x: any): x is AsyncIterable { + return x != null && Object(x) === x && typeof x[Symbol.asyncIterator] !== 'undefined'; +} diff --git a/js/src/util/layout.ts b/js/src/util/layout.ts index c064ee9d7d0..29698fb3d2b 100644 --- a/js/src/util/layout.ts +++ b/js/src/util/layout.ts @@ -15,16 +15,9 @@ // specific language governing permissions and limitations // under the License. 
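
popcnt_uint32 is the classic SWAR popcount; popcnt_bit_range sums single bits at the ragged edges and defers to byte-aligned counting in between. A quick sketch:

import { popcnt_uint32, popcnt_bit_range } from './util/bit';

console.log(popcnt_uint32(0b1011));     // 3
console.log(popcnt_uint32(0xFFFFFFFF)); // 32

// Count the set bits in positions [1, 7) of a validity bitmap:
const bitmap = new Uint8Array([0b01010110]);
console.log(popcnt_bit_range(bitmap, 1, 7)); // 4
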
+import { align } from './bit'; import { TextEncoder } from 'text-encoding-utf-8'; -import { TypedArrayConstructor, TypedArray } from '../vector/types'; - -export function align(value: number, alignment: number) { - return value + padding(value, alignment); -} - -export function padding(value: number, alignment: number) { - return (value % alignment === 0 ? 0 : alignment - value % alignment); -} +import { TypedArrayConstructor, TypedArray } from '../type'; export type NullableLayout = { nullCount: number, validity: Uint8Array }; export type BufferLayout> = { data: TArray }; diff --git a/js/src/vector.ts b/js/src/vector.ts new file mode 100644 index 00000000000..d9ca97b5fd1 --- /dev/null +++ b/js/src/vector.ts @@ -0,0 +1,441 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { Data, ChunkedData, FlatData, BoolData, FlatListData, NestedData, DictionaryData } from './data'; +import { VisitorNode, TypeVisitor, VectorVisitor } from './visitor'; +import { DataType, ListType, FlatType, NestedType, FlatListType, TimeUnit } from './type'; +import { IterableArrayLike, Precision, DateUnit, IntervalUnit, UnionMode } from './type'; + +export interface VectorLike { length: number; nullCount: number; } + +export interface View { + clone(data: Data): this; + isValid(index: number): boolean; + get(index: number): T['TValue'] | null; + set(index: number, value: T['TValue']): void; + toArray(): IterableArrayLike; + [Symbol.iterator](): IterableIterator; +} + +export class Vector implements VectorLike, View, VisitorNode { + public static create(data: Data): Vector { + return createVector(data); + } + public static concat(source?: Vector | null, ...others: Vector[]): Vector { + return others.reduce((a, b) => a ? a.concat(b) : b, source!); + } + public type: T; + public length: number; + public readonly data: Data; + public readonly view: View; + constructor(data: Data, view: View) { + this.data = data; + this.type = data.type; + this.length = data.length; + let nulls: Uint8Array; + if (( data instanceof ChunkedData) && !(view instanceof ChunkedView)) { + this.view = new ChunkedView(data); + } else if (!(view instanceof ValidityView) && (nulls = data.nullBitmap!) 
&& nulls.length > 0 && data.nullCount > 0) { + this.view = new ValidityView(data, view); + } else { + this.view = view; + } + } + + public get nullCount() { return this.data.nullCount; } + public get nullBitmap() { return this.data.nullBitmap; } + public get [Symbol.toStringTag]() { + return `Vector<${this.type[Symbol.toStringTag]}>`; + } + public toJSON(): any { return this.toArray(); } + public clone(data: Data, view: View = this.view.clone(data) as any): this { + return new (this.constructor as any)(data, view); + } + public isValid(index: number): boolean { + return this.view.isValid(index); + } + public get(index: number): T['TValue'] | null { + return this.view.get(index); + } + public set(index: number, value: T['TValue']): void { + return this.view.set(index, value); + } + public toArray(): IterableArrayLike { + return this.view.toArray(); + } + public [Symbol.iterator](): IterableIterator { + return this.view[Symbol.iterator](); + } + public concat(...others: Vector[]): this { + if ((others = others.filter(Boolean)).length === 0) { + return this; + } + const { view } = this; + const vecs = !(view instanceof ChunkedView) + ? [this, ...others] + : [...view.chunkVectors, ...others]; + const offsets = ChunkedData.computeOffsets(vecs); + const chunksLength = offsets[offsets.length - 1]; + const chunkedData = new ChunkedData(this.type, chunksLength, vecs, 0, -1, offsets); + return this.clone(chunkedData, new ChunkedView(chunkedData)) as this; + } + public slice(begin?: number, end?: number): this { + let { length } = this; + let size = (this.view as any).size || 1; + let total = length, from = (begin || 0) * size; + let to = (typeof end === 'number' ? end : total) * size; + if (to < 0) { to = total - (to * -1) % total; } + if (from < 0) { from = total - (from * -1) % total; } + if (to < from) { [from, to] = [to, from]; } + total = !isFinite(total = (to - from)) || total < 0 ? 
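
concat() does not copy: it wraps this vector and the others in a ChunkedData/ChunkedView pair. A sketch using the IntVector.from helper defined later in this file:

import { IntVector } from './vector';

const a = IntVector.from(new Int32Array([1, 2]));
const b = IntVector.from(new Int32Array([3, 4]));

const ab = a.concat(b); // zero-copy chunked view over both inputs
console.log(ab.length); // 4
console.log(ab.get(3)); // 4
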
0 : total; + const slicedData = this.data.slice(from, Math.min(total, length)); + return this.clone(slicedData, this.view.clone(slicedData)) as this; + } + + public acceptTypeVisitor(visitor: TypeVisitor): any { + return TypeVisitor.visitTypeInline(visitor, this.type); + } + public acceptVectorVisitor(visitor: VectorVisitor): any { + return VectorVisitor.visitTypeInline(visitor, this.type, this); + } +} + +export abstract class FlatVector extends Vector { + public get values() { return this.data.values; } + public lows(): IntVector { return this.asInt32(0, 2); } + public highs(): IntVector { return this.asInt32(1, 2); } + public asInt32(offset: number = 0, stride: number = 2): IntVector { + let data = (this.data as FlatData).clone(new Int32()); + if (offset > 0) { + data = data.slice(offset, this.length - offset); + } + const int32s = new IntVector(data, new PrimitiveView(data, stride)); + int32s.length = this.length / stride | 0; + return int32s; + } +} + +export abstract class ListVectorBase extends Vector { + public get values() { return this.data.values; } + public get valueOffsets() { return this.data.valueOffsets; } + public getValueOffset(index: number) { + return this.valueOffsets[index]; + } + public getValueLength(index: number) { + return this.valueOffsets[index + 1] - this.valueOffsets[index]; + } +} + +export abstract class NestedVector extends Vector { + // @ts-ignore + public readonly view: NestedView; + // @ts-ignore + protected _childData: Data[]; + public getChildAt(index: number): Vector | null { + return this.view.getChildAt(index); + } + public get childData(): Data[] { + let data: Data | Data[]; + if ((data = this._childData)) { + // Return the cached childData reference first + return data as Data[]; + } else if (!( (data = this.data) instanceof ChunkedData)) { + // If data isn't chunked, cache and return NestedData's childData + return this._childData = (data as NestedData).childData; + } + // Otherwise if the data is chunked, concatenate the childVectors from each chunk + // to construct a single chunked Vector for each column. 
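
The valueOffsets arithmetic deserves spelling out: variable-width value i spans [valueOffsets[i], valueOffsets[i + 1]), so getValueLength is just the difference. A standalone sketch:

// Offsets for a hypothetical Utf8 column encoding ['a', 'bc', '']:
const valueOffsets = new Int32Array([0, 1, 3, 3]);
const valueLength = (i: number) => valueOffsets[i + 1] - valueOffsets[i];

console.log(valueLength(0)); // 1 ('a')
console.log(valueLength(1)); // 2 ('bc')
console.log(valueLength(2)); // 0 (the empty string)
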
Then return the ChunkedData + // instance from each unified chunked column as the childData of a chunked NestedVector + const chunks = ((data as ChunkedData).chunkVectors as NestedVector[]); + return this._childData = chunks + .reduce<(Vector | null)[][]>((cols, chunk) => chunk.childData + .reduce<(Vector | null)[][]>((cols, _, i) => ( + (cols[i] || (cols[i] = [])).push(chunk.getChildAt(i)) + ) && cols || cols, cols), [] as Vector[][]) + .map((vecs) => Vector.concat(...vecs).data); + } +} + +import { List, Binary, Utf8, Bool, } from './type'; +import { Null, Int, Float, Decimal, Date_, Time, Timestamp, Interval } from './type'; +import { Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Float16, Float32, Float64 } from './type'; +import { Struct, Union, SparseUnion, DenseUnion, FixedSizeBinary, FixedSizeList, Map_, Dictionary } from './type'; + +import { ChunkedView } from './vector/chunked'; +import { DictionaryView } from './vector/dictionary'; +import { ListView, FixedSizeListView, BinaryView, Utf8View } from './vector/list'; +import { UnionView, DenseUnionView, NestedView, StructView, MapView } from './vector/nested'; +import { FlatView, NullView, BoolView, ValidityView, PrimitiveView, FixedSizeView, Float16View } from './vector/flat'; +import { DateDayView, DateMillisecondView, IntervalYearMonthView } from './vector/flat'; +import { TimestampDayView, TimestampSecondView, TimestampMillisecondView, TimestampMicrosecondView, TimestampNanosecondView } from './vector/flat'; +import { packBools } from './util/bit'; + +export class NullVector extends Vector { + constructor(data: Data, view: View = new NullView(data)) { + super(data, view); + } +} + +export class BoolVector extends Vector { + public static from(data: IterableArrayLike) { + return new BoolVector(new BoolData(new Bool(), data.length, null, packBools(data))); + } + public get values() { return this.data.values; } + constructor(data: Data, view: View = new BoolView(data)) { + super(data, view); + } +} + +export class IntVector> extends FlatVector { + public static from(data: Int8Array): IntVector; + public static from(data: Int16Array): IntVector; + public static from(data: Int32Array): IntVector; + public static from(data: Uint8Array): IntVector; + public static from(data: Uint16Array): IntVector; + public static from(data: Uint32Array): IntVector; + public static from(data: Int32Array, is64: true): IntVector; + public static from(data: Uint32Array, is64: true): IntVector; + public static from(data: any, is64?: boolean) { + if (is64 === true) { + return data instanceof Int32Array + ? new IntVector(new FlatData(new Int64(), data.length, null, data)) + : new IntVector(new FlatData(new Uint64(), data.length, null, data)); + } + switch (data.constructor) { + case Int8Array: return new IntVector(new FlatData(new Int8(), data.length, null, data)); + case Int16Array: return new IntVector(new FlatData(new Int16(), data.length, null, data)); + case Int32Array: return new IntVector(new FlatData(new Int32(), data.length, null, data)); + case Uint8Array: return new IntVector(new FlatData(new Uint8(), data.length, null, data)); + case Uint16Array: return new IntVector(new FlatData(new Uint16(), data.length, null, data)); + case Uint32Array: return new IntVector(new FlatData(new Uint32(), data.length, null, data)); + } + throw new TypeError('Unrecognized Int data'); + } + static defaultView(data: Data) { + return data.type.bitWidth <= 32 ? 
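
The from() overloads infer the logical type from the typed array's constructor; passing is64 reinterprets a 32-bit array as packed 64-bit values. A sketch:

import { BoolVector, IntVector } from './vector';

const bools = BoolVector.from([true, false, true]);
console.log(bools.get(2)); // true

const i32 = IntVector.from(new Int32Array([1, 2, 3]));
console.log(i32.get(1));   // 2

// (low, high) Int32 pairs reinterpreted as Int64 values:
const i64 = IntVector.from(new Int32Array([1, 0, 2, 0]), true);
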
new FlatView(data) : new FixedSizeView(data, (data.type.bitWidth / 32) | 0); + } + constructor(data: Data, view: View = IntVector.defaultView(data)) { + super(data, view); + } +} + +export class FloatVector> extends FlatVector { + public static from(data: Uint16Array): FloatVector; + public static from(data: Float32Array): FloatVector; + public static from(data: Float64Array): FloatVector; + public static from(data: any) { + switch (data.constructor) { + case Uint16Array: return new FloatVector(new FlatData(new Float16(), data.length, null, data)); + case Float32Array: return new FloatVector(new FlatData(new Float32(), data.length, null, data)); + case Float64Array: return new FloatVector(new FlatData(new Float64(), data.length, null, data)); + } + throw new TypeError('Unrecognized Float data'); + } + static defaultView(data: Data): FlatView { + return data.type.precision !== Precision.HALF ? new FlatView(data) : new Float16View(data as Data); + } + constructor(data: Data, view: View = FloatVector.defaultView(data)) { + super(data, view); + } +} + +export class DateVector extends FlatVector { + static defaultView(data: Data) { + return data.type.unit === DateUnit.DAY ? new DateDayView(data) : new DateMillisecondView(data, 2); + } + constructor(data: Data, view: View = DateVector.defaultView(data)) { + super(data, view); + } + public lows(): IntVector { + return this.type.unit === DateUnit.DAY ? this.asInt32(0, 1) : this.asInt32(0, 2); + } + public highs(): IntVector { + return this.type.unit === DateUnit.DAY ? this.asInt32(0, 1) : this.asInt32(1, 2); + } + public asEpochMilliseconds(): IntVector { + let data = (this.data as FlatData).clone(new Int32()); + switch (this.type.unit) { + case DateUnit.DAY: return new IntVector(data, new TimestampDayView(data as any, 1) as any); + case DateUnit.MILLISECOND: return new IntVector(data, new TimestampMillisecondView(data as any, 2) as any); + } + throw new TypeError(`Unrecognized date unit "${DateUnit[this.type.unit]}"`); + } +} + +export class DecimalVector extends FlatVector { + constructor(data: Data, view: View = new FixedSizeView(data, 4)) { + super(data, view); + } +} + +export class TimeVector extends FlatVector
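
Half-precision data has no native typed array, so FloatVector accepts the raw Uint16 bit patterns and, per defaultView above, reads them through a Float16View; the exact decoding lives in vector/flat.ts, which is not shown in this diff. A sketch:

import { FloatVector } from './vector';

const f64 = FloatVector.from(new Float64Array([1.5, 2.5]));
console.log(f64.get(1)); // 2.5

// Uint16Array input selects the Float16 logical type:
const f16 = FloatVector.from(new Uint16Array([0, 32767, 65535]));
console.log(f16.get(1)); // a number decoded by Float16View
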