From f16c37d34ecf676207c5ae09d96c151fba3bc96d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 13 Jun 2025 11:12:33 -0400 Subject: [PATCH 01/14] Use KvikIO's implementation of file-backed memory mapping --- cpp/cmake/thirdparty/get_kvikio.cmake | 6 +- cpp/src/io/utilities/datasource.cpp | 142 +++++--------------------- 2 files changed, 27 insertions(+), 121 deletions(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 73f875b46c2..243fa946950 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -19,8 +19,8 @@ function(find_and_configure_kvikio VERSION) kvikio ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/kvikio.git - GIT_TAG branch-${VERSION} + GIT_REPOSITORY https://github.com/kingcrimsontianyu/kvikio.git + GIT_TAG python-host-mmap-read GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4b2ff497f33..e51decfd77e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -25,6 +25,7 @@ #include #include +#include #include @@ -109,20 +110,8 @@ class kvikio_source : public datasource { rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - auto const read_size = std::min(size, this->size() - offset); - - if constexpr (std::is_same_v) { - return _kvikio_handle.pread(dst, - read_size, - offset, - kvikio::defaults::task_size(), - kvikio::defaults::gds_threshold(), - false /* not to sync_default_stream */); - } else { - // HandleT is kvikio::RemoteHandle - return _kvikio_handle.pread(dst, read_size, offset); - } + return _kvikio_handle.pread(dst, read_size, offset); } size_t device_read(size_t offset, @@ -167,6 +156,21 @@ class file_source : public kvikio_source { "Reading a file using kvikIO, with compatibility mode %s.", _kvikio_handle.get_compat_mode_manager().is_compat_mode_preferred() ? "on" : "off"); } + + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + auto const read_size = std::min(size, this->size() - offset); + return _kvikio_handle.pread(dst, + read_size, + offset, + kvikio::defaults::task_size(), + kvikio::defaults::gds_threshold(), + false /* not to sync_default_stream */); + } }; /** @@ -175,117 +179,19 @@ class file_source : public kvikio_source { * Unlike Arrow's memory mapped IO class, this implementation allows memory mapping a subset of the * file where the starting offset may not be zero. */ -class memory_mapped_source : public file_source { +class memory_mapped_source : public kvikio_source { public: explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) - : file_source(filepath) + : kvikio_source{kvikio::MmapHandle()} { if (this->size() != 0) { - // Memory mapping is not exclusive, so we can include the whole region we expect to read - map(_kvikio_handle.fd(), offset, max_size_estimate); + CUDF_EXPECTS(offset < this->size(), "Offset is past end of file", std::overflow_error); + if (max_size_estimate == 0 || (offset + max_size_estimate) > this->size()) { + max_size_estimate = this->size() - offset; + } + _kvikio_handle = kvikio::MmapHandle(filepath, "r", offset, max_size_estimate); } } - - ~memory_mapped_source() override - { - if (_map_addr != nullptr) { unmap(); } - } - - std::unique_ptr host_read(size_t offset, size_t size) override - { - // Clamp length to available data - auto const read_size = std::min(size, this->size() - offset); - - // If the requested range is outside of the mapped region, read from the file - if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { - return file_source::host_read(offset, read_size); - } - - // If the requested range is only partially within the registered region, copy to a new - // host buffer to make the data safe to copy to the device - if (_reg_addr != nullptr and - (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { - auto const src = static_cast(_map_addr) + (offset - _map_offset); - - return std::make_unique>>( - std::vector(src, src + read_size)); - } - - return std::make_unique( - static_cast(_map_addr) + offset - _map_offset, read_size); - } - - std::future> host_read_async(size_t offset, - size_t size) override - { - // Use the default implementation instead of the file_source's implementation - return datasource::host_read_async(offset, size); - } - - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - // Clamp length to available data - auto const read_size = std::min(size, this->size() - offset); - - // If the requested range is outside of the mapped region, read from the file - if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { - return file_source::host_read(offset, read_size, dst); - } - - auto const src = static_cast(_map_addr) + (offset - _map_offset); - std::memcpy(dst, src, read_size); - return read_size; - } - - std::future host_read_async(size_t offset, size_t size, uint8_t* dst) override - { - // Use the default implementation instead of the file_source's implementation - return datasource::host_read_async(offset, size, dst); - } - - [[nodiscard]] bool supports_device_read() const override { return false; } - - [[nodiscard]] bool is_device_read_preferred(size_t size) const override - { - return supports_device_read(); - } - - private: - void map(int fd, size_t offset, size_t size) - { - CUDF_EXPECTS(offset < this->size(), "Offset is past end of file", std::overflow_error); - - // Offset for `mmap()` must be page aligned - _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); - - if (size == 0 || (offset + size) > this->size()) { size = this->size() - offset; } - - // Size for `mmap()` needs to include the page padding - _map_size = size + (offset - _map_offset); - if (_map_size == 0) { return; } - - // Check if accessing a region within already mapped area - _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); - CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); - } - - void unmap() - { - if (_map_addr != nullptr) { - auto const result = munmap(_map_addr, _map_size); - if (result != 0) { CUDF_LOG_WARN("munmap failed with %d", result); } - _map_addr = nullptr; - } - } - - private: - size_t _map_offset = 0; - size_t _map_size = 0; - void* _map_addr = nullptr; - - size_t _reg_offset = 0; - size_t _reg_size = 0; - void* _reg_addr = nullptr; }; /** From a09c563811e634494d53d44a65857539dfd2c089 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 13 Jun 2025 16:36:43 -0400 Subject: [PATCH 02/14] Update --- cpp/src/io/utilities/datasource.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e51decfd77e..40b75ced983 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,6 +15,7 @@ */ #include "getenv_or.hpp" +#include "kvikio/file_utils.hpp" #include #include @@ -184,12 +185,15 @@ class memory_mapped_source : public kvikio_source { explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) : kvikio_source{kvikio::MmapHandle()} { - if (this->size() != 0) { - CUDF_EXPECTS(offset < this->size(), "Offset is past end of file", std::overflow_error); - if (max_size_estimate == 0 || (offset + max_size_estimate) > this->size()) { - max_size_estimate = this->size() - offset; + // Since the superclass kvikio_source is initialized with an empty mmap handle, `this->size()` + // returns 0 at this point. Use `kvikio::get_file_size()` instead. + auto const file_size = kvikio::get_file_size(filepath); + if (file_size != 0) { + CUDF_EXPECTS(offset < file_size, "Offset is past end of file", std::overflow_error); + if (max_size_estimate == 0 || (offset + max_size_estimate) > file_size) { + max_size_estimate = file_size - offset; } - _kvikio_handle = kvikio::MmapHandle(filepath, "r", offset, max_size_estimate); + _kvikio_handle = kvikio::MmapHandle(filepath, "r", max_size_estimate, offset); } } }; From 51ee3b2ca08abcbcc69dac31a8048a52854b4eb2 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 13 Jun 2025 16:38:51 -0400 Subject: [PATCH 03/14] Update --- cpp/src/io/utilities/datasource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 40b75ced983..4f9658038a3 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,7 +15,6 @@ */ #include "getenv_or.hpp" -#include "kvikio/file_utils.hpp" #include #include @@ -26,6 +25,7 @@ #include #include +#include #include #include From bae6b6d28795e927fa615eb58ff7c6a64685bd15 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 23 Jul 2025 16:07:14 -0400 Subject: [PATCH 04/14] Update --- cpp/cmake/thirdparty/get_kvikio.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 243fa946950..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -19,8 +19,8 @@ function(find_and_configure_kvikio VERSION) kvikio ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS - GIT_REPOSITORY https://github.com/kingcrimsontianyu/kvikio.git - GIT_TAG python-host-mmap-read + GIT_REPOSITORY https://github.com/rapidsai/kvikio.git + GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) From 83504562ba794de402c957c005a1966790ec17cc Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 23 Jul 2025 23:07:02 -0400 Subject: [PATCH 05/14] Fix unit test error --- cpp/src/io/utilities/datasource.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4f9658038a3..4edf0ac7f6d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -193,7 +193,8 @@ class memory_mapped_source : public kvikio_source { if (max_size_estimate == 0 || (offset + max_size_estimate) > file_size) { max_size_estimate = file_size - offset; } - _kvikio_handle = kvikio::MmapHandle(filepath, "r", max_size_estimate, offset); + _kvikio_handle = + kvikio::MmapHandle(filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, MAP_SHARED); } } }; From d8869aa4890781f665f4500249ea3077874d1044 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Sun, 24 Aug 2025 00:55:10 -0400 Subject: [PATCH 06/14] Use KvikIO's versatile remote file interface to infer the endpoint type --- cpp/cmake/thirdparty/get_kvikio.cmake | 5 +++-- cpp/src/io/utilities/datasource.cpp | 30 ++++++++++++++------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 73f875b46c2..54947f96c85 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -19,8 +19,9 @@ function(find_and_configure_kvikio VERSION) kvikio ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/kvikio.git - GIT_TAG branch-${VERSION} + GIT_REPOSITORY https://github.com/kingcrimsontianyu/kvikio.git + # GIT_TAG branch-${VERSION} + GIT_TAG remote-io-easy-interface GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4edf0ac7f6d..d7d1126b7a8 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -369,27 +369,29 @@ class user_datasource_wrapper : public datasource { * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. */ class remote_file_source : public kvikio_source { - static auto create_s3_handle(char const* filepath) + public: + explicit remote_file_source(char const* filepath) + : kvikio_source{kvikio::RemoteHandle::open(filepath)} { - return kvikio::RemoteHandle{ - std::make_unique(kvikio::S3Endpoint::parse_s3_url(filepath))}; } - public: - explicit remote_file_source(char const* filepath) : kvikio_source{create_s3_handle(filepath)} {} - ~remote_file_source() override = default; /** - * @brief Is `url` referring to a remote file supported by KvikIO? + * @brief Checks if a path has a URL scheme format that could indicate a remote resource * - * For now, only S3 urls (urls starting with "s3://") are supported. + * @note Strictly speaking, there is no definitive way to tell if a given file path refers to a + * remote or local file. For instance, it is legal to have a local directory named `s3:` and its + * file accessed by `s3:///` (the double slash is collapsed into a single + * slash), coincidentally taking on the remote S3 format. Here we ignore this special case and use + * a more practical approach: a file path is considered remote simply if it has a RFC + * 3986-conformant URL scheme. */ - static bool is_supported_remote_url(std::string const& url) + static bool could_be_remote_url(std::string const& filepath) { - // Regular expression to match "s3://" - static std::regex const pattern{R"(^s3://)", std::regex_constants::icase}; - return std::regex_search(url, pattern); + // Regular expression to match the URL scheme conforming to RFC 3986 + static std::regex const pattern{R"(^[a-zA-Z][a-zA-Z0-9+.-]*://)", std::regex_constants::icase}; + return std::regex_search(filepath, pattern); } }; #else @@ -399,7 +401,7 @@ class remote_file_source : public kvikio_source { class remote_file_source : public file_source { public: explicit remote_file_source(char const* filepath) : file_source(filepath) {} - static constexpr bool is_supported_remote_url(std::string const&) { return false; } + static constexpr bool could_be_remote_url(std::string const&) { return false; } }; #endif } // namespace @@ -416,7 +418,7 @@ std::unique_ptr datasource::create(std::string const& filepath, CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); }(); - if (remote_file_source::is_supported_remote_url(filepath)) { + if (remote_file_source::could_be_remote_url(filepath)) { return std::make_unique(filepath.c_str()); } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); From cd15bc6d44d9c8e021a8a498d6371aa767ace004 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 25 Aug 2025 10:18:15 -0400 Subject: [PATCH 07/14] Cherry-pick build fix --- cpp/cmake/thirdparty/get_jitify.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index b6f11e30d28..5db4e3e907f 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -19,8 +19,8 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 GIT_REPOSITORY https://github.com/NVIDIA/jitify.git - GIT_TAG 70783a3ad7b0cad2992a26a1ebf8fbe3d6b44e25 # jitify2 branch as of 5th Aug 2025 - GIT_SHALLOW TRUE + GIT_TAG 44e978b21fc8bdb6b2d7d8d179523c8350db72e5 # jitify2 branch as of 23rd Aug 2025 + GIT_SHALLOW FALSE DOWNLOAD_ONLY TRUE ) set(JITIFY_INCLUDE_DIR From 10bf781cca65108e46a079f91c98a0efa74df0a6 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 25 Aug 2025 19:42:33 -0400 Subject: [PATCH 08/14] Revert temp changes to jitify and kvikio cmake files --- cpp/cmake/thirdparty/get_jitify.cmake | 4 ++-- cpp/cmake/thirdparty/get_kvikio.cmake | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index 5db4e3e907f..b6f11e30d28 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -19,8 +19,8 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 GIT_REPOSITORY https://github.com/NVIDIA/jitify.git - GIT_TAG 44e978b21fc8bdb6b2d7d8d179523c8350db72e5 # jitify2 branch as of 23rd Aug 2025 - GIT_SHALLOW FALSE + GIT_TAG 70783a3ad7b0cad2992a26a1ebf8fbe3d6b44e25 # jitify2 branch as of 5th Aug 2025 + GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE ) set(JITIFY_INCLUDE_DIR diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 54947f96c85..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -19,9 +19,8 @@ function(find_and_configure_kvikio VERSION) kvikio ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS - GIT_REPOSITORY https://github.com/kingcrimsontianyu/kvikio.git - # GIT_TAG branch-${VERSION} - GIT_TAG remote-io-easy-interface + GIT_REPOSITORY https://github.com/rapidsai/kvikio.git + GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) From 67ef92b62099380497afb6b19f13029d485b1536 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 28 Aug 2025 14:04:52 +0000 Subject: [PATCH 09/14] Update pylibcudf --- python/pylibcudf/pylibcudf/io/types.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 844f21e56ba..f37982a47b4 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -468,7 +468,7 @@ cdef class SourceInfo: different types of sources will raise a `ValueError`. """ # Regular expression that match remote file paths supported by libcudf - _is_remote_file_pattern = re.compile(r"^s3://", re.IGNORECASE) + _is_remote_file_pattern = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", re.IGNORECASE) def __init__(self, list sources): if not sources: From a4c33211e819a7129f64006733a3f9a219863ed7 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 3 Sep 2025 23:34:42 -0400 Subject: [PATCH 10/14] Prepend additional message for remote file exception --- cpp/src/io/utilities/datasource.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ffe4bfe4e3e..b13a9f908fc 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -34,7 +34,9 @@ #include #include +#include #include +#include #include #ifdef CUDF_KVIKIO_REMOTE_IO @@ -417,8 +419,14 @@ std::unique_ptr datasource::create(std::string const& filepath, CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); }(); + if (remote_file_source::could_be_remote_url(filepath)) { - return std::make_unique(filepath.c_str()); + try { + return std::make_unique(filepath.c_str()); + } catch (std::exception const& ex) { + CUDF_FAIL("Error accessing the remote file \"" + filepath + "\". Reason: " + ex.what(), + std::runtime_error); + } } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); } else { From 09378bf029727911a7b3513a42f51d99c800afc1 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 4 Sep 2025 17:55:42 -0400 Subject: [PATCH 11/14] Remove filepath from error message --- cpp/src/io/utilities/datasource.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index b13a9f908fc..3c36c31daad 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -424,8 +424,7 @@ std::unique_ptr datasource::create(std::string const& filepath, try { return std::make_unique(filepath.c_str()); } catch (std::exception const& ex) { - CUDF_FAIL("Error accessing the remote file \"" + filepath + "\". Reason: " + ex.what(), - std::runtime_error); + CUDF_FAIL("Error accessing the remote file. Reason: " + ex.what(), std::runtime_error); } } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); From 2e8403898ef6907e7daf2cb84c8f1c68e54a155f Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 5 Sep 2025 10:15:46 -0400 Subject: [PATCH 12/14] Redact remote file path --- cpp/src/io/utilities/datasource.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 3c36c31daad..6f0b1432293 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -424,7 +424,15 @@ std::unique_ptr datasource::create(std::string const& filepath, try { return std::make_unique(filepath.c_str()); } catch (std::exception const& ex) { - CUDF_FAIL("Error accessing the remote file. Reason: " + ex.what(), std::runtime_error); + std::string redacted_msg; + try { + // For security reasons, redact the file path if any from KvikIO's exception message + redacted_msg = + std::regex_replace(ex.what(), std::regex{filepath}, ""); + } catch (std::exception const& ex) { + redacted_msg = " unknown due to additional process error"; + } + CUDF_FAIL("Error accessing the remote file. Reason: " + redacted_msg, std::runtime_error); } } else if (use_memory_mapping) { return std::make_unique(filepath.c_str(), offset, max_size_estimate); From 2f1b12edd8e2cb838927bb36a0fd1f076cec44a3 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 5 Sep 2025 10:26:40 -0400 Subject: [PATCH 13/14] Remove an unused header --- cpp/src/io/utilities/datasource.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6f0b1432293..29accb99f3e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -34,7 +34,6 @@ #include #include -#include #include #include #include From c1c2d799cdcd512412ebfc2be23f9dc96cb8ff74 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 5 Sep 2025 10:31:15 -0400 Subject: [PATCH 14/14] Remove another unused header that is automatically added by reformatter --- cpp/src/io/utilities/datasource.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 29accb99f3e..67b0d23e29f 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -35,7 +35,6 @@ #include #include -#include #include #ifdef CUDF_KVIKIO_REMOTE_IO