-
Notifications
You must be signed in to change notification settings - Fork 1k
Use KvikIO's unified interface to create remote I/O endpoints #19788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f16c37d
a09c563
51ee3b2
f75e3dc
63807a0
8f015f4
f315954
bae6b6d
bccb387
8350456
5ea88c7
ab0d28c
513cfa0
9a9a423
d8869aa
cd15bc6
10bf781
3d9dc0d
0a908f4
67ef92b
0047a6d
0bcee77
a4c3321
c338a74
09378bf
2e84038
2f1b12e
c1c2d79
4fe15e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -368,27 +368,29 @@ class user_datasource_wrapper : public datasource { | |
| * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. | ||
| */ | ||
| class remote_file_source : public kvikio_source<kvikio::RemoteHandle> { | ||
| static auto create_s3_handle(char const* filepath) | ||
| public: | ||
| explicit remote_file_source(char const* filepath) | ||
| : kvikio_source{kvikio::RemoteHandle::open(filepath)} | ||
| { | ||
| return kvikio::RemoteHandle{ | ||
| std::make_unique<kvikio::S3Endpoint>(kvikio::S3Endpoint::parse_s3_url(filepath))}; | ||
| } | ||
|
|
||
| public: | ||
| explicit remote_file_source(char const* filepath) : kvikio_source{create_s3_handle(filepath)} {} | ||
|
|
||
| ~remote_file_source() override = default; | ||
|
|
||
| /** | ||
| * @brief Is `url` referring to a remote file supported by KvikIO? | ||
| * @brief Checks if a path has a URL scheme format that could indicate a remote resource | ||
| * | ||
| * For now, only S3 urls (urls starting with "s3://") are supported. | ||
| * @note Strictly speaking, there is no definitive way to tell if a given file path refers to a | ||
| * remote or local file. For instance, it is legal to have a local directory named `s3:` and its | ||
| * file accessed by `s3://<sub-dir>/<file-name>` (the double slash is collapsed into a single | ||
| * slash), coincidentally taking on the remote S3 format. Here we ignore this special case and use | ||
| * a more practical approach: a file path is considered remote simply if it has a RFC | ||
| * 3986-conformant URL scheme. | ||
| */ | ||
| static bool is_supported_remote_url(std::string const& url) | ||
| static bool could_be_remote_url(std::string const& filepath) | ||
vuule marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| { | ||
| // Regular expression to match "s3://" | ||
| static std::regex const pattern{R"(^s3://)", std::regex_constants::icase}; | ||
| return std::regex_search(url, pattern); | ||
| // Regular expression to match the URL scheme conforming to RFC 3986 | ||
| static std::regex const pattern{R"(^[a-zA-Z][a-zA-Z0-9+.-]*://)", std::regex_constants::icase}; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the behavior today with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's a good question to consider. I checked the relevant information. The file URI is specified by RFC 8089, and is indeed supported by |
||
| return std::regex_search(filepath, pattern); | ||
| } | ||
| }; | ||
| #else | ||
|
|
@@ -398,7 +400,7 @@ class remote_file_source : public kvikio_source<kvikio::RemoteHandle> { | |
| class remote_file_source : public file_source { | ||
| public: | ||
| explicit remote_file_source(char const* filepath) : file_source(filepath) {} | ||
| static constexpr bool is_supported_remote_url(std::string const&) { return false; } | ||
| static constexpr bool could_be_remote_url(std::string const&) { return false; } | ||
| }; | ||
| #endif | ||
| } // namespace | ||
|
|
@@ -415,8 +417,21 @@ std::unique_ptr<datasource> datasource::create(std::string const& filepath, | |
|
|
||
| CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); | ||
| }(); | ||
| if (remote_file_source::is_supported_remote_url(filepath)) { | ||
| return std::make_unique<remote_file_source>(filepath.c_str()); | ||
|
|
||
| if (remote_file_source::could_be_remote_url(filepath)) { | ||
| try { | ||
| return std::make_unique<remote_file_source>(filepath.c_str()); | ||
| } catch (std::exception const& ex) { | ||
| std::string redacted_msg; | ||
| try { | ||
| // For security reasons, redact the file path if any from KvikIO's exception message | ||
| redacted_msg = | ||
| std::regex_replace(ex.what(), std::regex{filepath}, "<redacted-remote-file-path>"); | ||
|
Comment on lines
+428
to
+429
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wow :D |
||
| } catch (std::exception const& ex) { | ||
| redacted_msg = " unknown due to additional process error"; | ||
| } | ||
| CUDF_FAIL("Error accessing the remote file. Reason: " + redacted_msg, std::runtime_error); | ||
| } | ||
| } else if (use_memory_mapping) { | ||
| return std::make_unique<memory_mapped_source>(filepath.c_str(), offset, max_size_estimate); | ||
| } else { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `open` utility function is a recent addition in KvikIO: https://github.com/rapidsai/kvikio/blob/6efd22dc6ae3389caea7d3e736c7f954b9db0619/cpp/include/kvikio/remote_handle.hpp#L370