diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp
index a5df681421c..7625bcd51ee 100644
--- a/c_glib/arrow-glib/array-builder.cpp
+++ b/c_glib/arrow-glib/array-builder.cpp
@@ -24,6 +24,7 @@
#include <arrow-glib/array-builder.hpp>
#include <arrow-glib/data-type.hpp>
#include <arrow-glib/error.hpp>
+#include <arrow-glib/type.hpp>
template
gboolean
@@ -327,6 +328,39 @@ garrow_array_builder_release_ownership(GArrowArrayBuilder *builder)
priv->have_ownership = FALSE;
}
+/**
+ * garrow_array_builder_get_value_data_type:
+ * @builder: A #GArrowArrayBuilder.
+ *
+ * Returns: (transfer full): The #GArrowDataType of the value of
+ * the array builder.
+ *
+ * Since: 0.9.0
+ */
+GArrowDataType *
+garrow_array_builder_get_value_data_type(GArrowArrayBuilder *builder)
+{
+ auto arrow_builder = garrow_array_builder_get_raw(builder);
+ auto arrow_type = arrow_builder->type();
+ return garrow_data_type_new_raw(&arrow_type);
+}
+
+/**
+ * garrow_array_builder_get_value_type:
+ * @builder: A #GArrowArrayBuilder.
+ *
+ * Returns: The #GArrowType of the value of the array builder.
+ *
+ * Since: 0.9.0
+ */
+GArrowType
+garrow_array_builder_get_value_type(GArrowArrayBuilder *builder)
+{
+ auto arrow_builder = garrow_array_builder_get_raw(builder);
+ auto arrow_type = arrow_builder->type();
+ return garrow_type_from_raw(arrow_type->id());
+}
+
/**
* garrow_array_builder_finish:
* @builder: A #GArrowArrayBuilder.
diff --git a/c_glib/arrow-glib/array-builder.h b/c_glib/arrow-glib/array-builder.h
index 19dadb30999..ea95f31e8fa 100644
--- a/c_glib/arrow-glib/array-builder.h
+++ b/c_glib/arrow-glib/array-builder.h
@@ -37,6 +37,10 @@ struct _GArrowArrayBuilderClass
void garrow_array_builder_release_ownership(GArrowArrayBuilder *builder);
+GArrowDataType *
+garrow_array_builder_get_value_data_type(GArrowArrayBuilder *builder);
+GArrowType garrow_array_builder_get_value_type(GArrowArrayBuilder *builder);
+
GArrowArray *garrow_array_builder_finish (GArrowArrayBuilder *builder,
GError **error);
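
As a usage sketch (not part of the patch — the builder construction and the
`GARROW_TYPE_INT8` result are illustrative assumptions), the two new getters
pair like this; per the `(transfer full)` annotation, the caller must unref
the returned data type:

```cpp
GArrowArrayBuilder *builder =
    GARROW_ARRAY_BUILDER(garrow_int8_array_builder_new());
GArrowDataType *data_type =
    garrow_array_builder_get_value_data_type(builder);  // caller owns this ref
GArrowType type = garrow_array_builder_get_value_type(builder);
// type == GARROW_TYPE_INT8 for an Int8 builder (assumed enum value name)
g_object_unref(data_type);
g_object_unref(builder);
```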
diff --git a/c_glib/test/test-array-builder.rb b/c_glib/test/test-array-builder.rb
index 92976a424cc..a773131e15b 100644
--- a/c_glib/test/test-array-builder.rb
+++ b/c_glib/test/test-array-builder.rb
@@ -76,6 +76,18 @@ def test_negative
end
end
+module ArrayBuilderValueTypeTests
+ def test_value_data_type
+ assert_equal(value_data_type,
+ build_array(sample_values).value_data_type)
+ end
+
+ def test_value_type
+ assert_equal(value_data_type.id,
+ build_array(sample_values).value_type)
+ end
+end
+
class TestArrayBuilder < Test::Unit::TestCase
include Helper::Buildable
include Helper::Omittable
@@ -93,6 +105,10 @@ def create_builder
Arrow::BooleanArrayBuilder.new
end
+ def value_data_type
+ Arrow::BooleanDataType.new
+ end
+
def builder_class_name
"boolean-array-builder"
end
@@ -101,6 +117,10 @@ def sample_values
[true, false, true]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -115,6 +135,10 @@ def create_builder
Arrow::IntArrayBuilder.new
end
+ def value_data_type
+ Arrow::Int8DataType.new
+ end
+
def builder_class_name
"int-array-builder"
end
@@ -123,6 +147,10 @@ def sample_values
[1, -2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -137,6 +165,10 @@ def create_builder
Arrow::UIntArrayBuilder.new
end
+ def value_data_type
+ Arrow::UInt8DataType.new
+ end
+
def builder_class_name
"uint-array-builder"
end
@@ -145,6 +177,10 @@ def sample_values
[1, 2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -159,6 +195,10 @@ def create_builder
Arrow::Int8ArrayBuilder.new
end
+ def value_data_type
+ Arrow::Int8DataType.new
+ end
+
def builder_class_name
"int8-array-builder"
end
@@ -167,6 +207,10 @@ def sample_values
[1, -2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -181,6 +225,10 @@ def create_builder
Arrow::UInt8ArrayBuilder.new
end
+ def value_data_type
+ Arrow::UInt8DataType.new
+ end
+
def builder_class_name
"uint8-array-builder"
end
@@ -189,6 +237,10 @@ def sample_values
[1, 2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -203,6 +255,10 @@ def create_builder
Arrow::Int16ArrayBuilder.new
end
+ def value_data_type
+ Arrow::Int16DataType.new
+ end
+
def builder_class_name
"int16-array-builder"
end
@@ -211,6 +267,10 @@ def sample_values
[1, -2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -225,6 +285,10 @@ def create_builder
Arrow::UInt16ArrayBuilder.new
end
+ def value_data_type
+ Arrow::UInt16DataType.new
+ end
+
def builder_class_name
"uint16-array-builder"
end
@@ -233,6 +297,10 @@ def sample_values
[1, 2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -247,6 +315,10 @@ def create_builder
Arrow::Int32ArrayBuilder.new
end
+ def value_data_type
+ Arrow::Int32DataType.new
+ end
+
def builder_class_name
"int32-array-builder"
end
@@ -255,6 +327,10 @@ def sample_values
[1, -2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -269,6 +345,10 @@ def create_builder
Arrow::UInt32ArrayBuilder.new
end
+ def value_data_type
+ Arrow::UInt32DataType.new
+ end
+
def builder_class_name
"uint32-array-builder"
end
@@ -277,6 +357,10 @@ def sample_values
[1, 2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -291,6 +375,10 @@ def create_builder
Arrow::Int64ArrayBuilder.new
end
+ def value_data_type
+ Arrow::Int64DataType.new
+ end
+
def builder_class_name
"int64-array-builder"
end
@@ -299,6 +387,10 @@ def sample_values
[1, -2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -313,6 +405,10 @@ def create_builder
Arrow::UInt64ArrayBuilder.new
end
+ def value_data_type
+ Arrow::UInt64DataType.new
+ end
+
def builder_class_name
"uint64-array-builder"
end
@@ -321,6 +417,10 @@ def sample_values
[1, 2, 3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -335,6 +435,10 @@ def create_builder
Arrow::FloatArrayBuilder.new
end
+ def value_data_type
+ Arrow::FloatDataType.new
+ end
+
def builder_class_name
"float-array-builder"
end
@@ -343,6 +447,10 @@ def sample_values
[1.1, -2.2, 3.3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -357,6 +465,10 @@ def create_builder
Arrow::DoubleArrayBuilder.new
end
+ def value_data_type
+ Arrow::DoubleDataType.new
+ end
+
def builder_class_name
"double-array-builder"
end
@@ -365,6 +477,10 @@ def sample_values
[1.1, -2.2, 3.3]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -379,6 +495,10 @@ def create_builder
Arrow::Date32ArrayBuilder.new
end
+ def value_data_type
+ Arrow::Date32DataType.new
+ end
+
def builder_class_name
"date32-array-builder"
end
@@ -391,6 +511,10 @@ def sample_values
]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -405,6 +529,10 @@ def create_builder
Arrow::Date64ArrayBuilder.new
end
+ def value_data_type
+ Arrow::Date64DataType.new
+ end
+
def builder_class_name
"date64-array-builder"
end
@@ -417,6 +545,10 @@ def sample_values
]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -432,6 +564,10 @@ def create_builder
Arrow::TimestampArrayBuilder.new(data_type)
end
+ def value_data_type
+ Arrow::TimestampDataType.new(:milli)
+ end
+
def builder_class_name
"timestamp-array-builder"
end
@@ -444,6 +580,10 @@ def sample_values
]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -459,6 +599,10 @@ def create_builder
Arrow::Time32ArrayBuilder.new(data_type)
end
+ def value_data_type
+ Arrow::Time32DataType.new(:second)
+ end
+
def builder_class_name
"time32-array-builder"
end
@@ -471,6 +615,10 @@ def sample_values
]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
@@ -486,6 +634,10 @@ def create_builder
Arrow::Time64ArrayBuilder.new(data_type)
end
+ def value_data_type
+ Arrow::Time64DataType.new(:micro)
+ end
+
def builder_class_name
"time64-array-builder"
end
@@ -498,6 +650,10 @@ def sample_values
]
end
+ sub_test_case("value type") do
+ include ArrayBuilderValueTypeTests
+ end
+
sub_test_case("#append_values") do
include ArrayBuilderAppendValuesTests
end
diff --git a/cpp/apidoc/Windows.md b/cpp/apidoc/Windows.md
index e7e83f1946b..aa3d31f1f7b 100644
--- a/cpp/apidoc/Windows.md
+++ b/cpp/apidoc/Windows.md
@@ -55,20 +55,16 @@ previous step:
activate arrow-dev
```
-We are using [cmake][4] tool to support Windows builds.
+We are using the [cmake][4] tool to support Windows builds.
To allow cmake to pick up 3rd party dependencies, set the
`ARROW_BUILD_TOOLCHAIN` environment variable to the `Library` folder path of
the `arrow-dev` conda environment created in the previous step.
-For instance, if `Miniconda` was installed to default destination, `Library`
-folder path for `arrow-dev` conda environment will be as following:
+To set the `ARROW_BUILD_TOOLCHAIN` environment variable for the current terminal
+session only, you can run the following. `%CONDA_PREFIX%` is set by conda to the
+root of the current environment by the `activate` script.
```shell
-C:\Users\YOUR_USER_NAME\Miniconda3\envs\arrow-dev\Library
-```
-
-To set `ARROW_BUILD_TOOLCHAIN` environment variable visible only for current terminal session you can run following:
-```shell
-set ARROW_BUILD_TOOLCHAIN=C:\Users\YOUR_USER_NAME\Miniconda3\envs\arrow-dev\Library
+set ARROW_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library
```
To validate the value of the `ARROW_BUILD_TOOLCHAIN` environment variable, you can run the following terminal command:
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index a36dffb52e2..8b61a3acfe7 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -63,10 +63,10 @@ if (ARROW_BUILD_BENCHMARKS)
Shlwapi.lib
)
else()
- target_link_libraries(arrow_benchmark_main
+ target_link_libraries(arrow_benchmark_main
benchmark
pthread
- )
+ )
endif()
# TODO(wesm): Some benchmarks include gtest.h
@@ -80,4 +80,6 @@ ADD_ARROW_TEST(key-value-metadata-test)
ADD_ARROW_TEST(rle-encoding-test)
ADD_ARROW_TEST(stl-util-test)
+ADD_ARROW_BENCHMARK(bit-util-benchmark)
+
add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc
new file mode 100644
index 00000000000..8969dd80b15
--- /dev/null
+++ b/cpp/src/arrow/util/bit-util-benchmark.cc
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <cstring>
+
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/memory_pool.h"
+#include "arrow/test-util.h"
+#include "arrow/util/bit-util.h"
+
+namespace arrow {
+namespace BitUtil {
+
+static void BM_CopyBitmap(benchmark::State& state) { // NOLINT non-const reference
+ const int kBufferSize = state.range(0);
+
+ std::shared_ptr<Buffer> buffer;
+ ASSERT_OK(AllocateBuffer(default_memory_pool(), kBufferSize, &buffer));
+ memset(buffer->mutable_data(), 0, kBufferSize);
+ test::random_bytes(kBufferSize, 0, buffer->mutable_data());
+
+ const int num_bits = kBufferSize * 8;
+ const uint8_t* src = buffer->data();
+
+ std::shared_ptr<Buffer> copy;
+ while (state.KeepRunning()) {
+ ABORT_NOT_OK(CopyBitmap(default_memory_pool(), src, state.range(1), num_bits, &copy));
+ }
+ state.SetBytesProcessed(state.iterations() * kBufferSize * sizeof(int8_t));
+}
+
+BENCHMARK(BM_CopyBitmap)
+ ->Args({100000, 0})
+ ->Args({1000000, 0})
+ ->Args({100000, 4})
+ ->Args({1000000, 4})
+ ->MinTime(1.0)
+ ->Unit(benchmark::kMicrosecond);
+
+} // namespace BitUtil
+} // namespace arrow
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index 92bdcb5fc08..4c64dea374d 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -165,19 +165,20 @@ TEST(BitUtilTests, TestCopyBitmap) {
memset(buffer->mutable_data(), 0, kBufferSize);
test::random_bytes(kBufferSize, 0, buffer->mutable_data());
- const int num_bits = kBufferSize * 8;
-
const uint8_t* src = buffer->data();
+ std::vector<int64_t> lengths = {kBufferSize * 8 - 4, kBufferSize * 8};
std::vector<int64_t> offsets = {0, 12, 16, 32, 37, 63, 64, 128};
- for (int64_t offset : offsets) {
- const int64_t copy_length = num_bits - offset;
+ for (int64_t num_bits : lengths) {
+ for (int64_t offset : offsets) {
+ const int64_t copy_length = num_bits - offset;
- std::shared_ptr<Buffer> copy;
- ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, &copy));
+ std::shared_ptr<Buffer> copy;
+ ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, &copy));
- for (int64_t i = 0; i < copy_length; ++i) {
- ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i));
+ for (int64_t i = 0; i < copy_length; ++i) {
+ ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i));
+ }
}
}
}
diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc
index 4dd91e99ad9..c77f0d008b5 100644
--- a/cpp/src/arrow/util/bit-util.cc
+++ b/cpp/src/arrow/util/bit-util.cc
@@ -109,9 +109,37 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, int64_t
std::shared_ptr buffer;
RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer));
uint8_t* dest = buffer->mutable_data();
- for (int64_t i = 0; i < length; ++i) {
- BitUtil::SetBitTo(dest, i, BitUtil::GetBit(data, i + offset));
+
+ int64_t byte_offset = offset / 8;
+ int64_t bit_offset = offset % 8;
+ int64_t num_bytes = BitUtil::BytesForBits(length);
+ int64_t bits_to_zero = num_bytes * 8 - length;
+
+ if (bit_offset > 0) {
+ uint32_t carry_mask = BitUtil::kBitmask[bit_offset] - 1U;
+ uint32_t carry_shift = 8U - static_cast<uint32_t>(bit_offset);
+
+ uint32_t carry = 0U;
+ if (BitUtil::BytesForBits(length + bit_offset) > num_bytes) {
+ carry = (data[byte_offset + num_bytes] & carry_mask) << carry_shift;
+ }
+
+ int64_t i = num_bytes - 1;
+ while (i + 1 > 0) {
+ uint8_t cur_byte = data[byte_offset + i];
+ dest[i] = static_cast<uint8_t>((cur_byte >> bit_offset) | carry);
+ carry = (cur_byte & carry_mask) << carry_shift;
+ --i;
+ }
+ } else {
+ std::memcpy(dest, data + byte_offset, static_cast(num_bytes));
+ }
+
+ for (int64_t i = length; i < length + bits_to_zero; ++i) {
+ // Both branches may copy extra bits - unsetting to match specification.
+ BitUtil::SetBitTo(dest, i, false);
}
+
*out = buffer;
return Status::OK();
}
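
The unaligned branch above assembles each destination byte from two adjacent
source bytes: the current byte shifted down by `bit_offset`, plus the low
`bit_offset` bits of the next byte carried into the top. A minimal standalone
sketch of that shift-with-carry step (hypothetical values, not from the patch):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t src[2] = {0b10110100, 0b00000111};
  const int bit_offset = 3;  // start copying at bit 3 (LSB-first numbering)
  const uint8_t carry_mask = static_cast<uint8_t>((1U << bit_offset) - 1U);
  const uint8_t carry_shift = static_cast<uint8_t>(8 - bit_offset);
  // Low 5 bits come from src[0] shifted down; the top 3 bits are carried
  // in from the low bits of src[1].
  const uint8_t dest = static_cast<uint8_t>(
      (src[0] >> bit_offset) | ((src[1] & carry_mask) << carry_shift));
  std::printf("0x%02x\n", dest);  // prints 0xf6 == 0b11110110
  return 0;
}
```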
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index cab3c9ee703..86c17d16801 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -139,13 +139,10 @@ static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) {
}
static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
- // TODO: speed up. See https://graphics.stanford.edu/~seander/bithacks.html
+ // https://graphics.stanford.edu/~seander/bithacks.html
// "Conditionally set or clear bits without branching"
- if (bit_is_set) {
- SetBit(bits, i);
- } else {
- ClearBit(bits, i);
- }
+ bits[i / 8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) &
+ kBitmask[i % 8];
}
// Returns the minimum number of bits needed to represent the value of 'x'
diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc
index 0dd1c44d71c..d74c0f412d9 100644
--- a/cpp/src/plasma/client.cc
+++ b/cpp/src/plasma/client.cc
@@ -513,9 +513,20 @@ Status PlasmaClient::Abort(const ObjectID& object_id) {
}
Status PlasmaClient::Delete(const ObjectID& object_id) {
- // TODO(rkn): In the future, we can use this method to give hints to the
- // eviction policy about when an object will no longer be needed.
- return Status::NotImplemented("PlasmaClient::Delete is not implemented.");
+ RETURN_NOT_OK(FlushReleaseHistory());
+ // If the object is in use, the client can't send the delete message.
+ if (objects_in_use_.count(object_id) > 0) {
+ return Status::UnknownError("PlasmaClient::Object is in use.");
+ } else {
+ // If we don't already have a reference to the object, we can try to remove it.
+ RETURN_NOT_OK(SendDeleteRequest(store_conn_, object_id));
+ std::vector<uint8_t> buffer;
+ RETURN_NOT_OK(PlasmaReceive(store_conn_, MessageType_PlasmaDeleteReply, &buffer));
+ ObjectID object_id2;
+ DCHECK_GT(buffer.size(), 0);
+ RETURN_NOT_OK(ReadDeleteReply(buffer.data(), buffer.size(), &object_id2));
+ return Status::OK();
+ }
}
Status PlasmaClient::Evict(int64_t num_bytes, int64_t& num_bytes_evicted) {
diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h
index 78793f1a73a..35182f84032 100644
--- a/cpp/src/plasma/client.h
+++ b/cpp/src/plasma/client.h
@@ -174,7 +174,8 @@ class ARROW_EXPORT PlasmaClient {
Status Seal(const ObjectID& object_id);
/// Delete an object from the object store. This currently assumes that the
- /// object is present and has been sealed.
+ /// object is present, has been sealed, and is not in use by another client.
+ /// Otherwise, it is a no-op.
///
/// @todo We may want to allow the deletion of objects that are not present or
/// haven't been sealed.
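
Given these semantics, the expected call sequence on the client side is
release-then-delete, mirroring the `DeleteTest` added to `client_tests.cc`
below. A hedged sketch (assumes `client` is a connected `PlasmaClient` that
has already `Create()`d and `Seal()`ed `object_id`):

```cpp
ARROW_CHECK_OK(client.Release(object_id));  // drop our own reference first
ARROW_CHECK_OK(client.Delete(object_id));   // the store can now remove it
```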
diff --git a/cpp/src/plasma/eviction_policy.cc b/cpp/src/plasma/eviction_policy.cc
index a7758fd2c0e..66a3b2ea298 100644
--- a/cpp/src/plasma/eviction_policy.cc
+++ b/cpp/src/plasma/eviction_policy.cc
@@ -102,4 +102,14 @@ void EvictionPolicy::end_object_access(const ObjectID& object_id,
cache_.add(object_id, entry->info.data_size + entry->info.metadata_size);
}
+void EvictionPolicy::remove_object(const ObjectID& object_id) {
+ /* If the object is in the LRU cache, remove it. */
+ cache_.remove(object_id);
+
+ auto entry = store_info_->objects[object_id].get();
+ int64_t size = entry->info.data_size + entry->info.metadata_size;
+ ARROW_CHECK(memory_used_ >= size);
+ memory_used_ -= size;
+}
+
} // namespace plasma
diff --git a/cpp/src/plasma/eviction_policy.h b/cpp/src/plasma/eviction_policy.h
index cebf35b1c1b..b0763095529 100644
--- a/cpp/src/plasma/eviction_policy.h
+++ b/cpp/src/plasma/eviction_policy.h
@@ -120,6 +120,11 @@ class EvictionPolicy {
int64_t choose_objects_to_evict(int64_t num_bytes_required,
std::vector<ObjectID>* objects_to_evict);
+ /// This method will be called when an object is going to be removed.
+ ///
+ /// @param object_id The ID of the object that is being removed.
+ void remove_object(const ObjectID& object_id);
+
private:
/// The amount of memory (in bytes) currently being used.
int64_t memory_used_;
diff --git a/cpp/src/plasma/format/plasma.fbs b/cpp/src/plasma/format/plasma.fbs
index b6d03b8a3c1..ea6dc8bb98d 100644
--- a/cpp/src/plasma/format/plasma.fbs
+++ b/cpp/src/plasma/format/plasma.fbs
@@ -76,7 +76,11 @@ enum PlasmaError:int {
// Trying to access an object that doesn't exist.
ObjectNonexistent,
// Trying to create an object but there isn't enough space in the store.
- OutOfMemory
+ OutOfMemory,
+ // Trying to delete an object but it's not sealed.
+ ObjectNotSealed,
+ // Trying to delete an object but it's in use.
+ ObjectInUse
}
// Plasma store messages
diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc
index c6a19a54718..dde7f9cdfa8 100644
--- a/cpp/src/plasma/store.cc
+++ b/cpp/src/plasma/store.cc
@@ -411,6 +411,39 @@ int PlasmaStore::abort_object(const ObjectID& object_id, Client* client) {
}
}
+int PlasmaStore::delete_object(ObjectID& object_id) {
+ auto entry = get_object_table_entry(&store_info_, object_id);
+ // TODO(rkn): This should probably not fail, but should instead throw an
+ // error. Maybe we should also support deleting objects that have been
+ // created but not sealed.
+ if (entry == NULL) {
+ // To delete an object it must be in the object table.
+ return PlasmaError_ObjectNonexistent;
+ }
+
+ if (entry->state != PLASMA_SEALED) {
+ // To delete an object it must have been sealed.
+ return PlasmaError_ObjectNotSealed;
+ }
+
+ if (entry->clients.size() != 0) {
+ // To delete an object, there must be no clients currently using it.
+ return PlasmaError_ObjectInUse;
+ }
+
+ eviction_policy_.remove_object(object_id);
+
+ dlfree(entry->pointer);
+ store_info_.objects.erase(object_id);
+ // Inform all subscribers that the object has been deleted.
+ ObjectInfoT notification;
+ notification.object_id = object_id.binary();
+ notification.is_deletion = true;
+ push_notification(&notification);
+
+ return PlasmaError_OK;
+}
+
void PlasmaStore::delete_objects(const std::vector<ObjectID>& object_ids) {
for (const auto& object_id : object_ids) {
ARROW_LOG(DEBUG) << "deleting object " << object_id.hex();
@@ -626,18 +659,23 @@ Status PlasmaStore::process_message(Client* client) {
RETURN_NOT_OK(ReadGetRequest(input, input_size, object_ids_to_get, &timeout_ms));
process_get_request(client, object_ids_to_get, timeout_ms);
} break;
- case MessageType_PlasmaReleaseRequest:
+ case MessageType_PlasmaReleaseRequest: {
RETURN_NOT_OK(ReadReleaseRequest(input, input_size, &object_id));
release_object(object_id, client);
- break;
- case MessageType_PlasmaContainsRequest:
+ } break;
+ case MessageType_PlasmaDeleteRequest: {
+ RETURN_NOT_OK(ReadDeleteRequest(input, input_size, &object_id));
+ int error_code = delete_object(object_id);
+ HANDLE_SIGPIPE(SendDeleteReply(client->fd, object_id, error_code), client->fd);
+ } break;
+ case MessageType_PlasmaContainsRequest: {
RETURN_NOT_OK(ReadContainsRequest(input, input_size, &object_id));
if (contains_object(object_id) == OBJECT_FOUND) {
HANDLE_SIGPIPE(SendContainsReply(client->fd, object_id, 1), client->fd);
} else {
HANDLE_SIGPIPE(SendContainsReply(client->fd, object_id, 0), client->fd);
}
- break;
+ } break;
case MessageType_PlasmaSealRequest: {
unsigned char digest[kDigestSize];
RETURN_NOT_OK(ReadSealRequest(input, input_size, &object_id, &digest[0]));
diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h
index a72c6259a9c..7eada5a1269 100644
--- a/cpp/src/plasma/store.h
+++ b/cpp/src/plasma/store.h
@@ -83,6 +83,15 @@ class PlasmaStore {
/// @return 1 if the abort succeeds, else 0.
int abort_object(const ObjectID& object_id, Client* client);
+ /// Delete a specific object by object_id that has been created in the hash table.
+ ///
+ /// @param object_id Object ID of the object to be deleted.
+ /// @return One of the following error codes:
+ /// - PlasmaError_OK, if the object was deleted successfully.
+ /// - PlasmaError_ObjectNonexistent, if the object doesn't exist.
+ /// - PlasmaError_ObjectNotSealed, if the object hasn't been sealed.
+ /// - PlasmaError_ObjectInUse, if the object is in use.
+ int delete_object(ObjectID& object_id);
+
/// Delete objects that have been created in the hash table. This should only
/// be called on objects that are returned by the eviction policy to evict.
///
diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc
index 5cd3063bb43..f19c2bfbdb3 100644
--- a/cpp/src/plasma/test/client_tests.cc
+++ b/cpp/src/plasma/test/client_tests.cc
@@ -58,6 +58,31 @@ class TestPlasmaStore : public ::testing::Test {
PlasmaClient client2_;
};
+TEST_F(TestPlasmaStore, DeleteTest) {
+ ObjectID object_id = ObjectID::from_random();
+
+ // Test for deleting a nonexistent object.
+ Status result = client_.Delete(object_id);
+ ASSERT_EQ(result.IsPlasmaObjectNonexistent(), true);
+
+ // Test for the object being in local Plasma store.
+ // First create object.
+ int64_t data_size = 100;
+ uint8_t metadata[] = {5};
+ int64_t metadata_size = sizeof(metadata);
+ std::shared_ptr<Buffer> data;
+ ARROW_CHECK_OK(client_.Create(object_id, data_size, metadata, metadata_size, &data));
+ ARROW_CHECK_OK(client_.Seal(object_id));
+
+ // Object is in use, can't be deleted.
+ result = client_.Delete(object_id);
+ ASSERT_EQ(result.IsUnknownError(), true);
+
+ // Avoid race condition of Plasma Manager waiting for notification.
+ ARROW_CHECK_OK(client_.Release(object_id));
+ ARROW_CHECK_OK(client_.Delete(object_id));
+}
+
TEST_F(TestPlasmaStore, ContainsTest) {
ObjectID object_id = ObjectID::from_random();
diff --git a/js/README.md b/js/README.md
index e58d335cd0d..b427923e37e 100644
--- a/js/README.md
+++ b/js/README.md
@@ -178,7 +178,7 @@ The base `apache-arrow` package includes all the compilation targets for conveni
The targets are also published under the `@apache-arrow` namespace:
```sh
-npm install apache-arrow # <-- combined es5/CommonJS + UMD, es2015/ESModules + UMD, and TypeScript package
+npm install apache-arrow # <-- combined es5/UMD, es2015/CommonJS/ESModules/UMD, and TypeScript package
npm install @apache-arrow/ts # standalone TypeScript package
npm install @apache-arrow/es5-cjs # standalone es5/CommonJS package
npm install @apache-arrow/es5-esm # standalone es5/ESModules package
diff --git a/js/gulp/arrow-task.js b/js/gulp/arrow-task.js
index cc33ee14497..d1e8046e67a 100644
--- a/js/gulp/arrow-task.js
+++ b/js/gulp/arrow-task.js
@@ -28,8 +28,8 @@ const { Observable, ReplaySubject } = require('rxjs');
const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, format) {
const out = targetDir(target);
- const srcGlob = `src/**/*.ts`;
- const es5Glob = `${targetDir(`es5`, `cjs`)}/**/*.js`;
+ const dtsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.ts`;
+ const cjsGlob = `${targetDir(`es2015`, `cjs`)}/**/*.js`;
const esmGlob = `${targetDir(`es2015`, `esm`)}/**/*.js`;
const es5UmdGlob = `${targetDir(`es5`, `umd`)}/**/*.js`;
const es5UmdMaps = `${targetDir(`es5`, `umd`)}/**/*.map`;
@@ -38,8 +38,8 @@ const arrowTask = ((cache) => memoizeTask(cache, function copyMain(target, forma
const ch_ext = (ext) => gulpRename((p) => { p.extname = ext; });
const append = (ap) => gulpRename((p) => { p.basename += ap; });
return Observable.forkJoin(
- observableFromStreams(gulp.src(srcGlob), gulp.dest(out)), // copy src ts files
- observableFromStreams(gulp.src(es5Glob), gulp.dest(out)), // copy es5 cjs files
+ observableFromStreams(gulp.src(dtsGlob), gulp.dest(out)), // copy d.ts files
+ observableFromStreams(gulp.src(cjsGlob), gulp.dest(out)), // copy es2015 cjs files
observableFromStreams(gulp.src(esmGlob), ch_ext(`.mjs`), gulp.dest(out)), // copy es2015 esm files and rename to `.mjs`
observableFromStreams(gulp.src(es5UmdGlob), append(`.es5.min`), gulp.dest(out)), // copy es5 umd files and add `.min`
observableFromStreams(gulp.src(es5UmdMaps), gulp.dest(out)), // copy es5 umd sourcemap files, but don't rename
diff --git a/js/gulp/package-task.js b/js/gulp/package-task.js
index fc959643503..2976d0ad45d 100644
--- a/js/gulp/package-task.js
+++ b/js/gulp/package-task.js
@@ -46,8 +46,8 @@ const createMainPackageJson = (target, format) => (orig) => ({
name: npmPkgName,
main: mainExport,
module: `${mainExport}.mjs`,
- browser: `${mainExport}.es5.min.js`,
- [`browser:es2015`]: `${mainExport}.es2015.min.js`,
+ dist: `${mainExport}.es5.min.js`,
+ [`dist:es2015`]: `${mainExport}.es2015.min.js`,
[`@std/esm`]: { esm: `mjs` }
});
@@ -67,7 +67,7 @@ const createScopedPackageJSON = (target, format) => (({ name, ...orig }) =>
(xs, key) => ({ ...xs, [key]: xs[key] || orig[key] }),
{ name: `${npmOrgName}/${packageName(target, format)}`,
version: undefined, main: `${mainExport}.js`, types: `${mainExport}.d.ts`,
- browser: undefined, [`browser:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined }
+ dist: undefined, [`dist:es2015`]: undefined, module: undefined, [`@std/esm`]: undefined }
)
)
);
diff --git a/js/gulp/test-task.js b/js/gulp/test-task.js
index f21aaf2364d..ab280b09263 100644
--- a/js/gulp/test-task.js
+++ b/js/gulp/test-task.js
@@ -34,7 +34,7 @@ argv.update && jestArgv.push(`-u`);
argv.verbose && jestArgv.push(`--verbose`);
argv.coverage && jestArgv.push(`--coverage`);
-const debugArgv = [`--runInBand`, `--env`, `jest-environment-node-debug`];
+const debugArgv = [`--runInBand`, `--env`, `node-debug`];
const jest = require.resolve(path.join(`..`, `node_modules`, `.bin`, `jest`));
const testOptions = {
env: { ...process.env },
diff --git a/js/package.json b/js/package.json
index 3903d1eedc4..d68e7a6279e 100644
--- a/js/package.json
+++ b/js/package.json
@@ -49,10 +49,8 @@
"gulpfile.js",
"npm-release.sh"
],
- "peerDependencies": {
- "command-line-usage": "4.0.1"
- },
"dependencies": {
+ "@types/text-encoding-utf-8": "1.0.1",
"command-line-args": "4.0.7",
"command-line-usage": "4.0.2",
"flatbuffers": "trxcllnt/flatbuffers-esm",
@@ -61,45 +59,44 @@
"tslib": "1.8.1"
},
"devDependencies": {
- "@std/esm": "0.18.0",
+ "@std/esm": "0.19.1",
"@types/flatbuffers": "1.6.5",
"@types/glob": "5.0.34",
- "@types/jest": "21.1.8",
- "@types/node": "8.5.0",
- "@types/text-encoding": "0.0.32",
+ "@types/jest": "22.0.1",
+ "@types/node": "9.3.0",
"ast-types": "0.10.1",
"benchmark": "2.1.4",
"coveralls": "3.0.0",
"del": "3.0.0",
- "esdoc": "1.0.3",
+ "esdoc": "1.0.4",
"esdoc-standard-plugin": "1.0.0",
"glob": "7.1.2",
- "google-closure-compiler": "20171203.0.0",
+ "google-closure-compiler": "20180101.0.0",
"gulp": "github:gulpjs/gulp#6d71a658c61edb3090221579d8f97dbe086ba2ed",
"gulp-json-transform": "0.4.5",
"gulp-rename": "1.2.2",
- "gulp-sourcemaps": "2.6.1",
+ "gulp-sourcemaps": "2.6.3",
"gulp-transform-js-ast": "1.0.2",
"gulp-typescript": "3.2.3",
"ix": "2.3.4",
- "jest": "21.2.1",
+ "jest": "22.0.5",
"jest-environment-node-debug": "2.0.0",
"json": "9.0.6",
- "lerna": "2.5.1",
+ "lerna": "2.6.0",
"lint-staged": "6.0.0",
- "merge2": "1.2.0",
+ "merge2": "1.2.1",
"mkdirp": "0.5.1",
"npm-run-all": "4.1.2",
"pump": "1.0.2",
"rimraf": "2.6.2",
- "rxjs": "5.5.5",
+ "rxjs": "5.5.6",
"shx": "0.2.2",
"source-map-loader": "0.2.3",
"trash": "4.2.1",
- "ts-jest": "21.2.4",
- "tslint": "5.8.0",
+ "ts-jest": "22.0.1",
+ "tslint": "5.9.1",
"typescript": "2.6.2",
- "uglifyjs-webpack-plugin": "1.1.2",
+ "uglifyjs-webpack-plugin": "1.1.6",
"webpack": "3.10.0",
"xml2js": "0.4.19"
},
@@ -134,9 +131,12 @@
"/node_modules/"
],
"transform": {
- ".(ts|tsx)": "/node_modules/ts-jest/preprocessor.js",
- ".(js|jsx)": "/node_modules/babel-jest/build/index.js"
+ ".(ts|tsx)": "./node_modules/ts-jest/preprocessor.js",
+ ".(js|jsx)": "./node_modules/babel-jest/build/index.js"
},
+ "transformIgnorePatterns": [
+ "/node_modules/", "/(es2015|esnext)\/umd/"
+ ],
"testRegex": "(.*(-|\\.)(test|spec)s?)\\.(ts|tsx|js)$"
}
}
diff --git a/js/src/text-encoding-utf-8.d.ts b/js/src/text-encoding-utf-8.d.ts
deleted file mode 100644
index 68ba4dfd9a3..00000000000
--- a/js/src/text-encoding-utf-8.d.ts
+++ /dev/null
@@ -1,4 +0,0 @@
-declare module 'text-encoding-utf-8' {
- import * as TextEncoding from 'text-encoding';
- export = TextEncoding;
-}
diff --git a/js/src/vector/numeric.ts b/js/src/vector/numeric.ts
index fe4767809f4..830d6082bcc 100644
--- a/js/src/vector/numeric.ts
+++ b/js/src/vector/numeric.ts
@@ -34,10 +34,10 @@ export class NumericVector<T, R extends TypedArray> extends Vector<T> {
concat(...vectors: Vector<T>[]): Vector<T> {
return new VirtualVector(this.data.constructor as TypedArrayConstructor, this, ...vectors);
}
- slice(start?: number, end?: number) {
+ slice(start?: number, end?: number): R {
const { data, stride } = this, from = start! | 0;
const to = end === undefined ? data.length : Math.max(end | 0, from);
- return data.subarray(Math.min(from, to) * stride | 0, to * stride | 0);
+ return data.subarray(Math.min(from, to) * stride | 0, to * stride | 0) as any as R;
}
}
@@ -49,7 +49,8 @@ export class FixedWidthNumericVector extends Numer
export class BoolVector extends NumericVector<boolean, Uint8Array> {
static pack(values: Iterable<any>) {
- let xs = [], n, i = 0;
+ let n = 0, i = 0;
+ let xs: number[] = [];
let bit = 0, byte = 0;
for (const value of values) {
value && (byte |= 1 << bit);
diff --git a/js/src/vector/virtual.ts b/js/src/vector/virtual.ts
index 6ec3a8eef9f..42db78706db 100644
--- a/js/src/vector/virtual.ts
+++ b/js/src/vector/virtual.ts
@@ -93,7 +93,7 @@ export class VirtualVector<T> implements Vector<T> {
// this is a significant improvement as we avoid the memcpy 🎉
if ((source.length / vector.stride | 0) < total) {
let vectorsLength = vectors.length;
- let count = 0, length = 0, sources = [];
+ let count = 0, length = 0, sources = [] as any[];
do {
sources.push(source);
length += source.length;
diff --git a/js/test/Arrow.ts b/js/test/Arrow.ts
index 87641e52bf3..f2c4e930f92 100644
--- a/js/test/Arrow.ts
+++ b/js/test/Arrow.ts
@@ -16,7 +16,7 @@
// under the License.
/* tslint:disable */
-// Dynamically load an Ix target build based on command line arguments
+// Dynamically load an Arrow target build based on command line arguments
const path = require('path');
const target = process.env.TEST_TARGET!;
diff --git a/js/test/integration/validate-tests.ts b/js/test/integration/validate-tests.ts
index c8778ba2b33..c612d62ad0c 100644
--- a/js/test/integration/validate-tests.ts
+++ b/js/test/integration/validate-tests.ts
@@ -37,7 +37,7 @@ const arrowBuffers: Uint8Array[] = [fs.readFileSync(arrowPath)];
import Arrow from '../Arrow';
import { zip } from 'ix/iterable/zip';
-import { toArray } from 'ix/iterable/toArray';
+import { toArray } from 'ix/iterable/toarray';
const { Table, read } = Arrow;
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index d9f1bd2c364..151e0df8a22 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -966,6 +966,14 @@ def write_table(table, where, row_group_size=None, version='1.0',
""".format(_parquet_writer_arg_docs)
+def _mkdir_if_not_exists(fs, path):
+ if fs._isfilestore() and not fs.exists(path):
+ try:
+ fs.mkdir(path)
+ except OSError:
+ assert fs.exists(path)
+
+
def write_to_dataset(table, root_path, partition_cols=None,
filesystem=None, preserve_index=True, **kwargs):
"""
@@ -1012,11 +1020,7 @@ def write_to_dataset(table, root_path, partition_cols=None,
else:
fs = _ensure_filesystem(filesystem)
- if fs._isfilestore() and not fs.exists(root_path):
- try:
- fs.mkdir(root_path)
- except OSError:
- assert fs.exists(root_path)
+ _mkdir_if_not_exists(fs, root_path)
if partition_cols is not None and len(partition_cols) > 0:
df = table.to_pandas()
@@ -1034,8 +1038,7 @@ def write_to_dataset(table, root_path, partition_cols=None,
subtable = Table.from_pandas(subgroup,
preserve_index=preserve_index)
prefix = "/".join([root_path, subdir])
- if fs._isfilestore() and not fs.exists(prefix):
- fs.mkdir(prefix)
+ _mkdir_if_not_exists(fs, prefix)
outfile = compat.guid() + ".parquet"
full_path = "/".join([prefix, outfile])
with fs.open(full_path, 'wb') as f:
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index 689ec15d329..61f2e83f319 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -21,7 +21,6 @@
import numpy as np
-from pyarrow import serialize_pandas, deserialize_pandas
from pyarrow.compat import builtin_pickle
from pyarrow.lib import _default_serialization_context, frombuffer
@@ -61,6 +60,48 @@ def _load_pickle_from_buffer(data):
_deserialize_numpy_array_pickle = _load_pickle_from_buffer
+# ----------------------------------------------------------------------
+# pandas-specific serialization matters
+
+def _register_custom_pandas_handlers(context):
+ # ARROW-1784, faster path for pandas-only visibility
+
+ try:
+ import pandas as pd
+ except ImportError:
+ return
+
+ import pyarrow.pandas_compat as pdcompat
+
+ def _serialize_pandas_dataframe(obj):
+ return pdcompat.dataframe_to_serialized_dict(obj)
+
+ def _deserialize_pandas_dataframe(data):
+ return pdcompat.serialized_dict_to_dataframe(data)
+
+ def _serialize_pandas_series(obj):
+ return _serialize_pandas_dataframe(pd.DataFrame({obj.name: obj}))
+
+ def _deserialize_pandas_series(data):
+ deserialized = _deserialize_pandas_dataframe(data)
+ return deserialized[deserialized.columns[0]]
+
+ context.register_type(
+ pd.Series, 'pd.Series',
+ custom_serializer=_serialize_pandas_series,
+ custom_deserializer=_deserialize_pandas_series)
+
+ context.register_type(
+ pd.Index, 'pd.Index',
+ custom_serializer=_pickle_to_buffer,
+ custom_deserializer=_load_pickle_from_buffer)
+
+ context.register_type(
+ pd.DataFrame, 'pd.DataFrame',
+ custom_serializer=_serialize_pandas_dataframe,
+ custom_deserializer=_deserialize_pandas_dataframe)
+
+
def register_default_serialization_handlers(serialization_context):
# ----------------------------------------------------------------------
@@ -136,90 +177,13 @@ def _deserialize_torch_tensor(data):
# no torch
pass
-
-register_default_serialization_handlers(_default_serialization_context)
+ _register_custom_pandas_handlers(serialization_context)
-# ----------------------------------------------------------------------
-# pandas-specific serialization matters
-
+register_default_serialization_handlers(_default_serialization_context)
pandas_serialization_context = _default_serialization_context.clone()
-
-def _register_pandas_arrow_handlers(context):
- try:
- import pandas as pd
- except ImportError:
- return
-
- def _serialize_pandas_series(obj):
- return serialize_pandas(pd.DataFrame({obj.name: obj}))
-
- def _deserialize_pandas_series(data):
- deserialized = deserialize_pandas(data)
- return deserialized[deserialized.columns[0]]
-
- def _serialize_pandas_dataframe(obj):
- return serialize_pandas(obj)
-
- def _deserialize_pandas_dataframe(data):
- return deserialize_pandas(data)
-
- context.register_type(
- pd.Series, 'pd.Series',
- custom_serializer=_serialize_pandas_series,
- custom_deserializer=_deserialize_pandas_series)
-
- context.register_type(
- pd.DataFrame, 'pd.DataFrame',
- custom_serializer=_serialize_pandas_dataframe,
- custom_deserializer=_deserialize_pandas_dataframe)
-
-
-def _register_custom_pandas_handlers(context):
- # ARROW-1784, faster path for pandas-only visibility
-
- try:
- import pandas as pd
- except ImportError:
- return
-
- import pyarrow.pandas_compat as pdcompat
-
- def _serialize_pandas_dataframe(obj):
- return pdcompat.dataframe_to_serialized_dict(obj)
-
- def _deserialize_pandas_dataframe(data):
- return pdcompat.serialized_dict_to_dataframe(data)
-
- def _serialize_pandas_series(obj):
- return _serialize_pandas_dataframe(pd.DataFrame({obj.name: obj}))
-
- def _deserialize_pandas_series(data):
- deserialized = _deserialize_pandas_dataframe(data)
- return deserialized[deserialized.columns[0]]
-
- context.register_type(
- pd.Series, 'pd.Series',
- custom_serializer=_serialize_pandas_series,
- custom_deserializer=_deserialize_pandas_series)
-
- context.register_type(
- pd.Index, 'pd.Index',
- custom_serializer=_pickle_to_buffer,
- custom_deserializer=_load_pickle_from_buffer)
-
- context.register_type(
- pd.DataFrame, 'pd.DataFrame',
- custom_serializer=_serialize_pandas_dataframe,
- custom_deserializer=_deserialize_pandas_dataframe)
-
-
-_register_pandas_arrow_handlers(_default_serialization_context)
-_register_custom_pandas_handlers(pandas_serialization_context)
-
-
pandas_serialization_context.register_type(
np.ndarray, 'np.array',
custom_serializer=_serialize_numpy_array_pickle,
diff --git a/site/index.html b/site/index.html
index ffa8d54af2f..87995cbabed 100644
--- a/site/index.html
+++ b/site/index.html
@@ -38,6 +38,10 @@ Standard
projects, including Calcite, Cassandra, Drill, Hadoop, HBase, Ibis,
Impala, Kudu, Pandas, Parquet, Phoenix, Spark, and Storm making it
the de-facto standard for columnar in-memory analytics.
+ <p>
+ Learn more about projects that are <a href="{{ site.baseurl }}/powered_by/">Powered By Apache Arrow</a>
+ </p>
diff --git a/site/powered_by.md b/site/powered_by.md
index 56f6e2bcee4..40332026798 100644
--- a/site/powered_by.md
+++ b/site/powered_by.md
@@ -43,11 +43,9 @@ names, etc.) like "arrow-foo". These are permitted. Nominative use of trademarks
in descriptions is also always allowed, as in "BigCoProduct is a widget for
Apache Arrow".
-### Open Source Projects
-
-To add yourself to the list, please email dev@arrow.apache.org with your
+To add yourself to the list, please open a pull request adding your
organization name, URL, a list of which Arrow components you are using, and a
-short description of your use case.
+short description of your use case. See the following for some examples.
* **[Apache Parquet][3]:** A columnar storage format available to any project
in the Hadoop ecosystem, regardless of the choice of data processing
@@ -61,10 +59,23 @@ short description of your use case.
* **[Dask][15]:** Python library for parallel and distributed execution of
dynamic task graphs. Dask supports using pyarrow for accessing Parquet
files
+* **[Dremio][9]:** A self-service data platform. Dremio makes it easy for
+ users to discover, curate, accelerate, and share data from any source.
+ It includes a distributed SQL execution engine based on Apache Arrow.
+ Dremio reads data from any source (RDBMS, HDFS, S3, NoSQL) into Arrow
+ buffers, and provides fast SQL access via ODBC, JDBC, and REST for BI,
+ Python, R, and more (all backed by Apache Arrow).
* **[GeoMesa][8]:** A suite of tools that enables large-scale geospatial query
and analytics on distributed computing systems. GeoMesa supports query
results in the Arrow IPC format, which can then be used for in-browser
visualizations and/or further analytics.
+* **[GOAI][19]:** Open GPU-Accelerated Analytics Initiative for Arrow-powered
+ analytics across GPU tools and vendors
+* **[Graphistry][18]:** Supercharged Visual Investigation Platform used by
+ teams for security, anti-fraud, and related investigations. The Graphistry
+ team uses Arrow in its NodeJS GPU backend and client libraries, and is an
+ early contributing member to GOAI and Arrow\[JS\] focused on bringing these
+ technologies to the enterprise.
* **[libgdf][14]:** A C library of CUDA-based analytics functions and GPU IPC
support for structured data. Uses the Arrow IPC format and targets the Arrow
memory layout in its analytic functions. This work is part of the [GPU Open
@@ -75,6 +86,9 @@ short description of your use case.
* **[pandas][12]:** data analysis toolkit for Python programmers. pandas
supports reading and writing Parquet files using pyarrow. Several pandas
core developers are also contributors to Apache Arrow.
+* **[Quilt Data][13]:** Quilt is a data package manager, designed to make
+ managing data as easy as managing code. It supports Parquet format via
+ pyarrow for data access.
* **[Ray][5]:** A flexible, high-performance distributed execution framework
with a focus on machine learning and AI applications. Uses Arrow to
efficiently store Python data structures containing large arrays of numerical
@@ -91,29 +105,6 @@ short description of your use case.
Arrow Tables and RecordBatches in addition to the Python Database API
Specification 2.0.
-### Companies and Organizations
-
-To add yourself to the list, please email dev@arrow.apache.org with your
-organization name, URL, a list of which Arrow components you are using, and a
-short description of your use case.
-
-* **[Dremio][9]:** A self-service data platform. Dremio makes it easy for
- users to discover, curate, accelerate, and share data from any source.
- It includes a distributed SQL execution engine based on Apache Arrow.
- Dremio reads data from any source (RDBMS, HDFS, S3, NoSQL) into Arrow
- buffers, and provides fast SQL access via ODBC, JDBC, and REST for BI,
- Python, R, and more (all backed by Apache Arrow).
-* **[GOAI][19]:** Open GPU-Accelerated Analytics Initiative for Arrow-powered
- analytics across GPU tools and vendors
-* **[Graphistry][18]:** Supercharged Visual Investigation Platform used by
- teams for security, anti-fraud, and related investigations. The Graphistry
- team uses Arrow in its NodeJS GPU backend and client libraries, and is an
- early contributing member to GOAI and Arrow\[JS\] focused on bringing these
- technologies to the enterprise.
-* **[Quilt Data][13]:** Quilt is a data package manager, designed to make
- managing data as easy as managing code. It supports Parquet format via
- pyarrow for data access.
-
[1]: https://www.apache.org/foundation/marks/
[2]: https://www.apache.org/foundation/marks/faq/
[3]: https://parquet.apache.org/