diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 6b94d6118c2..5a297742474 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -245,6 +245,17 @@ Result> FileSystem::OpenAppendStream( ////////////////////////////////////////////////////////////////////////// // SubTreeFileSystem implementation +namespace { + +Status ValidateSubPath(util::string_view s) { + if (internal::IsLikelyUri(s)) { + return Status::Invalid("Expected a filesystem path, got a URI: '", s, "'"); + } + return Status::OK(); +} + +} // namespace + SubTreeFileSystem::SubTreeFileSystem(const std::string& base_path, std::shared_ptr base_fs) : FileSystem(base_fs->io_context()), @@ -270,7 +281,8 @@ bool SubTreeFileSystem::Equals(const FileSystem& other) const { return base_path_ == subfs.base_path_ && base_fs_->Equals(subfs.base_fs_); } -std::string SubTreeFileSystem::PrependBase(const std::string& s) const { +Result SubTreeFileSystem::PrependBase(const std::string& s) const { + RETURN_NOT_OK(ValidateSubPath(s)); if (s.empty()) { return base_path_; } else { @@ -278,12 +290,12 @@ std::string SubTreeFileSystem::PrependBase(const std::string& s) const { } } -Status SubTreeFileSystem::PrependBaseNonEmpty(std::string* s) const { - if (s->empty()) { +Result SubTreeFileSystem::PrependBaseNonEmpty(const std::string& s) const { + RETURN_NOT_OK(ValidateSubPath(s)); + if (s.empty()) { return Status::IOError("Empty path"); } else { - *s = ConcatAbstractPath(base_path_, *s); - return Status::OK(); + return ConcatAbstractPath(base_path_, s); } } @@ -305,19 +317,21 @@ Status SubTreeFileSystem::FixInfo(FileInfo* info) const { } Result SubTreeFileSystem::NormalizePath(std::string path) { - ARROW_ASSIGN_OR_RAISE(auto normalized, base_fs_->NormalizePath(PrependBase(path))); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBase(path)); + ARROW_ASSIGN_OR_RAISE(auto normalized, base_fs_->NormalizePath(real_path)); return StripBase(std::move(normalized)); } Result SubTreeFileSystem::GetFileInfo(const std::string& path) { - ARROW_ASSIGN_OR_RAISE(FileInfo info, base_fs_->GetFileInfo(PrependBase(path))); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBase(path)); + ARROW_ASSIGN_OR_RAISE(FileInfo info, base_fs_->GetFileInfo(real_path)); RETURN_NOT_OK(FixInfo(&info)); return info; } Result> SubTreeFileSystem::GetFileInfo(const FileSelector& select) { auto selector = select; - selector.base_dir = PrependBase(selector.base_dir); + ARROW_ASSIGN_OR_RAISE(selector.base_dir, PrependBase(selector.base_dir)); ARROW_ASSIGN_OR_RAISE(auto infos, base_fs_->GetFileInfo(selector)); for (auto& info : infos) { RETURN_NOT_OK(FixInfo(&info)); @@ -327,7 +341,11 @@ Result> SubTreeFileSystem::GetFileInfo(const FileSelector& FileInfoGenerator SubTreeFileSystem::GetFileInfoGenerator(const FileSelector& select) { auto selector = select; - selector.base_dir = PrependBase(selector.base_dir); + auto maybe_base_dir = PrependBase(selector.base_dir); + if (!maybe_base_dir.ok()) { + return MakeFailingGenerator>(maybe_base_dir.status()); + } + selector.base_dir = *std::move(maybe_base_dir); auto gen = base_fs_->GetFileInfoGenerator(selector); auto self = checked_pointer_cast(shared_from_this()); @@ -343,23 +361,21 @@ FileInfoGenerator SubTreeFileSystem::GetFileInfoGenerator(const FileSelector& se } Status SubTreeFileSystem::CreateDir(const std::string& path, bool recursive) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->CreateDir(s, recursive); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->CreateDir(real_path, recursive); } Status SubTreeFileSystem::DeleteDir(const std::string& path) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->DeleteDir(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->DeleteDir(real_path); } Status SubTreeFileSystem::DeleteDirContents(const std::string& path) { if (internal::IsEmptyPath(path)) { return internal::InvalidDeleteDirContents(path); } - auto s = PrependBase(path); - return base_fs_->DeleteDirContents(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBase(path)); + return base_fs_->DeleteDirContents(real_path); } Status SubTreeFileSystem::DeleteRootDirContents() { @@ -371,103 +387,88 @@ Status SubTreeFileSystem::DeleteRootDirContents() { } Status SubTreeFileSystem::DeleteFile(const std::string& path) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->DeleteFile(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->DeleteFile(real_path); } Status SubTreeFileSystem::Move(const std::string& src, const std::string& dest) { - auto s = src; - auto d = dest; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - RETURN_NOT_OK(PrependBaseNonEmpty(&d)); - return base_fs_->Move(s, d); + ARROW_ASSIGN_OR_RAISE(auto real_src, PrependBaseNonEmpty(src)); + ARROW_ASSIGN_OR_RAISE(auto real_dest, PrependBaseNonEmpty(dest)); + return base_fs_->Move(real_src, real_dest); } Status SubTreeFileSystem::CopyFile(const std::string& src, const std::string& dest) { - auto s = src; - auto d = dest; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - RETURN_NOT_OK(PrependBaseNonEmpty(&d)); - return base_fs_->CopyFile(s, d); + ARROW_ASSIGN_OR_RAISE(auto real_src, PrependBaseNonEmpty(src)); + ARROW_ASSIGN_OR_RAISE(auto real_dest, PrependBaseNonEmpty(dest)); + return base_fs_->CopyFile(real_src, real_dest); } Result> SubTreeFileSystem::OpenInputStream( const std::string& path) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->OpenInputStream(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->OpenInputStream(real_path); } Result> SubTreeFileSystem::OpenInputStream( const FileInfo& info) { - auto s = info.path(); - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(info.path())); FileInfo new_info(info); - new_info.set_path(std::move(s)); + new_info.set_path(std::move(real_path)); return base_fs_->OpenInputStream(new_info); } Future> SubTreeFileSystem::OpenInputStreamAsync( const std::string& path) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->OpenInputStreamAsync(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->OpenInputStreamAsync(real_path); } Future> SubTreeFileSystem::OpenInputStreamAsync( const FileInfo& info) { - auto s = info.path(); - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(info.path())); FileInfo new_info(info); - new_info.set_path(std::move(s)); + new_info.set_path(std::move(real_path)); return base_fs_->OpenInputStreamAsync(new_info); } Result> SubTreeFileSystem::OpenInputFile( const std::string& path) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->OpenInputFile(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->OpenInputFile(real_path); } Result> SubTreeFileSystem::OpenInputFile( const FileInfo& info) { - auto s = info.path(); - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(info.path())); FileInfo new_info(info); - new_info.set_path(std::move(s)); + new_info.set_path(std::move(real_path)); return base_fs_->OpenInputFile(new_info); } Future> SubTreeFileSystem::OpenInputFileAsync( const std::string& path) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->OpenInputFileAsync(s); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->OpenInputFileAsync(real_path); } Future> SubTreeFileSystem::OpenInputFileAsync( const FileInfo& info) { - auto s = info.path(); - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(info.path())); FileInfo new_info(info); - new_info.set_path(std::move(s)); + new_info.set_path(std::move(real_path)); return base_fs_->OpenInputFileAsync(new_info); } Result> SubTreeFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->OpenOutputStream(s, metadata); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->OpenOutputStream(real_path, metadata); } Result> SubTreeFileSystem::OpenAppendStream( const std::string& path, const std::shared_ptr& metadata) { - auto s = path; - RETURN_NOT_OK(PrependBaseNonEmpty(&s)); - return base_fs_->OpenAppendStream(s, metadata); + ARROW_ASSIGN_OR_RAISE(auto real_path, PrependBaseNonEmpty(path)); + return base_fs_->OpenAppendStream(real_path, metadata); } ////////////////////////////////////////////////////////////////////////// diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index c760d36dd52..a7d7b7514e4 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -384,8 +384,8 @@ class ARROW_EXPORT SubTreeFileSystem : public FileSystem { const std::string base_path_; std::shared_ptr base_fs_; - std::string PrependBase(const std::string& s) const; - Status PrependBaseNonEmpty(std::string* s) const; + Result PrependBase(const std::string& s) const; + Result PrependBaseNonEmpty(const std::string& s) const; Result StripBase(const std::string& s) const; Status FixInfo(FileInfo* info) const; diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 5fb128524e3..abc24c0a1ad 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -57,6 +57,10 @@ struct GcsPath { std::string object; static Result FromString(const std::string& s) { + if (internal::IsLikelyUri(s)) { + return Status::Invalid( + "Expected a GCS object path of the form 'bucket/key...', got a URI: '", s, "'"); + } auto const first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index d799713ca4b..1190969bba8 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -478,6 +478,9 @@ TEST(GcsFileSystem, ObjectMetadataRoundtrip) { TEST_F(GcsIntegrationTest, GetFileInfoBucket) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); arrow::fs::AssertFileInfo(fs.get(), PreexistingBucketPath(), FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketPath())); } TEST_F(GcsIntegrationTest, GetFileInfoObject) { @@ -487,6 +490,9 @@ TEST_F(GcsIntegrationTest, GetFileInfoObject) { ASSERT_TRUE(object.ok()) << "status=" << object.status(); arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File, object->time_created(), static_cast(object->size())); + + // URI + ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingObjectName())); } TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) { @@ -508,6 +514,10 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) { selector.max_recursion = 16; ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector)); EXPECT_THAT(results, UnorderedElementsAreArray(expected.begin(), expected.end())); + + // URI + selector.base_dir = "gs://" + selector.base_dir; + ASSERT_RAISES(Invalid, fs->GetFileInfo(selector)); } TEST_F(GcsIntegrationTest, GetFileInfoSelectorNonRecursive) { @@ -626,6 +636,11 @@ TEST_F(GcsIntegrationTest, CreateDirRecursiveBucketAndFolder) { arrow::fs::AssertFileInfo(fs.get(), bucket_name + "/", FileType::Directory); } +TEST_F(GcsIntegrationTest, CreateDirUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_RAISES(Invalid, fs->CreateDir("gs://" + RandomBucketName(), true)); +} + TEST_F(GcsIntegrationTest, DeleteDirSuccess) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); @@ -641,6 +656,11 @@ TEST_F(GcsIntegrationTest, DeleteDirSuccess) { } } +TEST_F(GcsIntegrationTest, DeleteDirUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_RAISES(Invalid, fs->DeleteDir("gs://" + PreexistingBucketPath())); +} + TEST_F(GcsIntegrationTest, DeleteDirContentsSuccess) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); @@ -682,6 +702,11 @@ TEST_F(GcsIntegrationTest, DeleteFileDirectoryFails) { ASSERT_RAISES(IOError, fs->DeleteFile(path)); } +TEST_F(GcsIntegrationTest, DeleteFileUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_RAISES(Invalid, fs->DeleteFile("gs://" + PreexistingObjectPath())); +} + TEST_F(GcsIntegrationTest, MoveFileSuccess) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); const auto destination_path = PreexistingBucketPath() + "move-destination"; @@ -708,6 +733,13 @@ TEST_F(GcsIntegrationTest, MoveFileCannotRenameToDirectory) { PreexistingBucketPath() + "destination/")); } +TEST_F(GcsIntegrationTest, MoveFileUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + const auto destination_path = PreexistingBucketPath() + "move-destination"; + ASSERT_RAISES(Invalid, fs->Move("gs://" + PreexistingObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, fs->Move(PreexistingObjectPath(), "gs://" + destination_path)); +} + TEST_F(GcsIntegrationTest, CopyFileSuccess) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); const auto destination_path = PreexistingBucketPath() + "copy-destination"; @@ -721,6 +753,15 @@ TEST_F(GcsIntegrationTest, CopyFileNotFound) { ASSERT_RAISES(IOError, fs->CopyFile(NotFoundObjectPath(), destination_path)); } +TEST_F(GcsIntegrationTest, CopyFileUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + const auto destination_path = PreexistingBucketPath() + "copy-destination"; + ASSERT_RAISES(Invalid, + fs->CopyFile("gs://" + PreexistingObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, + fs->CopyFile(PreexistingObjectPath(), "gs://" + destination_path)); +} + TEST_F(GcsIntegrationTest, OpenInputStreamString) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); @@ -797,6 +838,11 @@ TEST_F(GcsIntegrationTest, OpenInputStreamInfoInvalid) { ASSERT_RAISES(IOError, fs->OpenInputStream(info)); } +TEST_F(GcsIntegrationTest, OpenInputStreamUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + ASSERT_RAISES(Invalid, fs->OpenInputStream("gs://" + PreexistingObjectPath())); +} + TEST_F(GcsIntegrationTest, OpenInputStreamReadMetadata) { auto client = GcsClient(); const auto custom_time = std::chrono::system_clock::now() + std::chrono::hours(1); @@ -940,6 +986,14 @@ TEST_F(GcsIntegrationTest, OpenOutputStreamClosed) { ASSERT_RAISES(Invalid, output->Tell()); } +TEST_F(GcsIntegrationTest, OpenOutputStreamUri) { + auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); + + const auto path = + internal::ConcatAbstractPath(PreexistingBucketName(), "open-output-stream-uri.txt"); + ASSERT_RAISES(Invalid, fs->OpenInputStream("gs://" + path)); +} + TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions()); diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index 775fd746aa6..15d46788746 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -81,6 +81,13 @@ bool DetectAbsolutePath(const std::string& s) { namespace { +Status ValidatePath(util::string_view s) { + if (internal::IsLikelyUri(s)) { + return Status::Invalid("Expected a local filesystem path, got a URI: '", s, "'"); + } + return Status::OK(); +} + #ifdef _WIN32 std::string NativeToString(const NativePathString& ns) { @@ -274,6 +281,7 @@ LocalFileSystem::LocalFileSystem(const LocalFileSystemOptions& options, LocalFileSystem::~LocalFileSystem() {} Result LocalFileSystem::NormalizePath(std::string path) { + RETURN_NOT_OK(ValidatePath(path)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); return fn.ToString(); } @@ -288,11 +296,13 @@ bool LocalFileSystem::Equals(const FileSystem& other) const { } Result LocalFileSystem::GetFileInfo(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); return StatFile(fn.ToNative()); } Result> LocalFileSystem::GetFileInfo(const FileSelector& select) { + RETURN_NOT_OK(ValidatePath(select.base_dir)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(select.base_dir)); std::vector results; RETURN_NOT_OK(StatSelector(fn, select, 0, &results)); @@ -300,6 +310,7 @@ Result> LocalFileSystem::GetFileInfo(const FileSelector& s } Status LocalFileSystem::CreateDir(const std::string& path, bool recursive) { + RETURN_NOT_OK(ValidatePath(path)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); if (recursive) { return ::arrow::internal::CreateDirTree(fn).status(); @@ -309,6 +320,7 @@ Status LocalFileSystem::CreateDir(const std::string& path, bool recursive) { } Status LocalFileSystem::DeleteDir(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); auto st = ::arrow::internal::DeleteDirTree(fn, /*allow_not_found=*/false).status(); if (!st.ok()) { @@ -321,6 +333,7 @@ Status LocalFileSystem::DeleteDir(const std::string& path) { } Status LocalFileSystem::DeleteDirContents(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); if (internal::IsEmptyPath(path)) { return internal::InvalidDeleteDirContents(path); } @@ -339,11 +352,14 @@ Status LocalFileSystem::DeleteRootDirContents() { } Status LocalFileSystem::DeleteFile(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); return ::arrow::internal::DeleteFile(fn, /*allow_not_found=*/false).status(); } Status LocalFileSystem::Move(const std::string& src, const std::string& dest) { + RETURN_NOT_OK(ValidatePath(src)); + RETURN_NOT_OK(ValidatePath(dest)); ARROW_ASSIGN_OR_RAISE(auto sfn, PlatformFilename::FromString(src)); ARROW_ASSIGN_OR_RAISE(auto dfn, PlatformFilename::FromString(dest)); @@ -363,6 +379,8 @@ Status LocalFileSystem::Move(const std::string& src, const std::string& dest) { } Status LocalFileSystem::CopyFile(const std::string& src, const std::string& dest) { + RETURN_NOT_OK(ValidatePath(src)); + RETURN_NOT_OK(ValidatePath(dest)); ARROW_ASSIGN_OR_RAISE(auto sfn, PlatformFilename::FromString(src)); ARROW_ASSIGN_OR_RAISE(auto dfn, PlatformFilename::FromString(dest)); // XXX should we use fstat() to compare inodes? @@ -392,6 +410,7 @@ template Result> OpenInputStreamGeneric( const std::string& path, const LocalFileSystemOptions& options, const io::IOContext& io_context) { + RETURN_NOT_OK(ValidatePath(path)); if (options.use_mmap) { return io::MemoryMappedFile::Open(path, io::FileMode::READ); } else { @@ -416,11 +435,11 @@ namespace { Result> OpenOutputStreamGeneric(const std::string& path, bool truncate, bool append) { - int fd; - bool write_only = true; + RETURN_NOT_OK(ValidatePath(path)); ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + const bool write_only = true; ARROW_ASSIGN_OR_RAISE( - fd, ::arrow::internal::FileOpenWritable(fn, write_only, truncate, append)); + int fd, ::arrow::internal::FileOpenWritable(fn, write_only, truncate, append)); auto maybe_stream = io::FileOutputStream::Open(fd); if (!maybe_stream.ok()) { ARROW_UNUSED(::arrow::internal::FileClose(fd)); diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index e338160951a..795c476c3db 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -272,6 +272,8 @@ TYPED_TEST(TestLocalFS, NormalizePath) { #ifdef _WIN32 ASSERT_OK_AND_EQ("AB/CD", this->local_fs_->NormalizePath("AB\\CD")); ASSERT_OK_AND_EQ("/AB/CD", this->local_fs_->NormalizePath("\\AB\\CD")); + ASSERT_OK_AND_EQ("c:DE/fgh", this->local_fs_->NormalizePath("c:DE\\fgh")); + ASSERT_OK_AND_EQ("c:/DE/fgh", this->local_fs_->NormalizePath("c:\\DE\\fgh")); ASSERT_OK_AND_EQ("C:DE/fgh", this->local_fs_->NormalizePath("C:DE\\fgh")); ASSERT_OK_AND_EQ("C:/DE/fgh", this->local_fs_->NormalizePath("C:\\DE\\fgh")); ASSERT_OK_AND_EQ("//some/share/AB", @@ -279,6 +281,10 @@ TYPED_TEST(TestLocalFS, NormalizePath) { #else ASSERT_OK_AND_EQ("AB\\CD", this->local_fs_->NormalizePath("AB\\CD")); #endif + + // URIs + ASSERT_RAISES(Invalid, this->local_fs_->NormalizePath("file:AB/CD")); + ASSERT_RAISES(Invalid, this->local_fs_->NormalizePath("http:AB/CD")); } TYPED_TEST(TestLocalFS, NormalizePathThroughSubtreeFS) { @@ -287,6 +293,10 @@ TYPED_TEST(TestLocalFS, NormalizePathThroughSubtreeFS) { #else ASSERT_OK_AND_EQ("AB\\CD", this->fs_->NormalizePath("AB\\CD")); #endif + + // URIs + ASSERT_RAISES(Invalid, this->fs_->NormalizePath("file:AB/CD")); + ASSERT_RAISES(Invalid, this->fs_->NormalizePath("http:AB/CD")); } TYPED_TEST(TestLocalFS, FileSystemFromUriFile) { diff --git a/cpp/src/arrow/filesystem/mockfs.cc b/cpp/src/arrow/filesystem/mockfs.cc index f2d2f87263e..7306f3574b9 100644 --- a/cpp/src/arrow/filesystem/mockfs.cc +++ b/cpp/src/arrow/filesystem/mockfs.cc @@ -44,6 +44,13 @@ namespace internal { namespace { +Status ValidatePath(util::string_view s) { + if (internal::IsLikelyUri(s)) { + return Status::Invalid("Expected a filesystem path, got a URI: '", s, "'"); + } + return Status::OK(); +} + //////////////////////////////////////////////////////////////////////////// // Filesystem structure @@ -428,6 +435,7 @@ MockFileSystem::MockFileSystem(TimePoint current_time, const io::IOContext& io_c bool MockFileSystem::Equals(const FileSystem& other) const { return this == &other; } Status MockFileSystem::CreateDir(const std::string& path, bool recursive) { + RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -457,6 +465,7 @@ Status MockFileSystem::CreateDir(const std::string& path, bool recursive) { } Status MockFileSystem::DeleteDir(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -481,6 +490,7 @@ Status MockFileSystem::DeleteDir(const std::string& path) { } Status MockFileSystem::DeleteDirContents(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -510,6 +520,7 @@ Status MockFileSystem::DeleteRootDirContents() { } Status MockFileSystem::DeleteFile(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -533,6 +544,7 @@ Status MockFileSystem::DeleteFile(const std::string& path) { } Result MockFileSystem::GetFileInfo(const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); auto parts = SplitAbstractPath(path); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -550,6 +562,7 @@ Result MockFileSystem::GetFileInfo(const std::string& path) { } Result MockFileSystem::GetFileInfo(const FileSelector& selector) { + RETURN_NOT_OK(ValidatePath(selector.base_dir)); auto parts = SplitAbstractPath(selector.base_dir); RETURN_NOT_OK(ValidateAbstractPathParts(parts)); @@ -590,6 +603,8 @@ struct BinaryOp { template static Status Run(MockFileSystem::Impl* impl, const std::string& src, const std::string& dest, OpFunc&& op_func) { + RETURN_NOT_OK(ValidatePath(src)); + RETURN_NOT_OK(ValidatePath(dest)); auto src_parts = SplitAbstractPath(src); auto dest_parts = SplitAbstractPath(dest); RETURN_NOT_OK(ValidateAbstractPathParts(src_parts)); @@ -685,6 +700,7 @@ Status MockFileSystem::CopyFile(const std::string& src, const std::string& dest) Result> MockFileSystem::OpenInputStream( const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); return impl_->OpenInputReader(path); @@ -692,6 +708,7 @@ Result> MockFileSystem::OpenInputStream( Result> MockFileSystem::OpenInputFile( const std::string& path) { + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); return impl_->OpenInputReader(path); @@ -699,6 +716,7 @@ Result> MockFileSystem::OpenInputFile( Result> MockFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); return impl_->OpenOutputStream(path, /*append=*/false, metadata); @@ -706,6 +724,7 @@ Result> MockFileSystem::OpenOutputStream( Result> MockFileSystem::OpenAppendStream( const std::string& path, const std::shared_ptr& metadata) { + RETURN_NOT_OK(ValidatePath(path)); auto guard = impl_->lock_guard(); return impl_->OpenOutputStream(path, /*append=*/true, metadata); @@ -729,6 +748,7 @@ std::vector MockFileSystem::AllFiles() { Status MockFileSystem::CreateFile(const std::string& path, util::string_view contents, bool recursive) { + RETURN_NOT_OK(ValidatePath(path)); auto parent = fs::internal::GetAbstractPathParent(path).first; if (parent != "") { diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index f1bd5c087bf..0f62274b684 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -22,6 +22,7 @@ #include "arrow/status.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" +#include "arrow/util/uri.h" namespace arrow { namespace fs { @@ -266,6 +267,26 @@ bool IsEmptyPath(util::string_view v) { return true; } +bool IsLikelyUri(util::string_view v) { + if (v.empty() || v[0] == '/') { + return false; + } + const auto pos = v.find_first_of(':'); + if (pos == v.npos) { + return false; + } + if (pos < 2) { + // One-letter URI schemes don't officially exist, perhaps a Windows drive letter? + return false; + } + if (pos > 36) { + // The largest IANA-registered URI scheme is "microsoft.windows.camera.multipicker" + // with 36 characters. + return false; + } + return ::arrow::internal::IsValidUriScheme(v.substr(0, pos)); +} + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 5701c11b5d8..24d624b102b 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -125,6 +125,9 @@ std::string ToSlashes(util::string_view s); ARROW_EXPORT bool IsEmptyPath(util::string_view s); +ARROW_EXPORT +bool IsLikelyUri(util::string_view s); + } // namespace internal } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 1856adef707..1199f39997f 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -402,6 +402,10 @@ struct S3Path { std::vector key_parts; static Result FromString(const std::string& s) { + if (internal::IsLikelyUri(s)) { + return Status::Invalid( + "Expected an S3 object path of the form 'bucket/key...', got a URI: '", s, "'"); + } const auto src = internal::RemoveTrailingSlash(s); auto first_sep = src.find_first_of(kSep); if (first_sep == 0) { @@ -415,14 +419,14 @@ struct S3Path { path.bucket = std::string(src.substr(0, first_sep)); path.key = std::string(src.substr(first_sep + 1)); path.key_parts = internal::SplitAbstractPath(path.key); - RETURN_NOT_OK(Validate(&path)); + RETURN_NOT_OK(Validate(path)); return path; } - static Status Validate(const S3Path* path) { - auto result = internal::ValidateAbstractPathParts(path->key_parts); + static Status Validate(const S3Path& path) { + auto result = internal::ValidateAbstractPathParts(path.key_parts); if (!result.ok()) { - return Status::Invalid(result.message(), " in path ", path->full_path); + return Status::Invalid(result.message(), " in path ", path.full_path); } else { return result; } diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 089d44a0571..fba431b8fd0 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -448,6 +448,9 @@ class TestS3FS : public S3TestMixin { // Nonexistent ASSERT_RAISES(IOError, fs_->OpenOutputStream("nonexistent-bucket/somefile")); + // URI + ASSERT_RAISES(Invalid, fs_->OpenOutputStream("s3:bucket/newfile1")); + // Create new empty file ASSERT_OK_AND_ASSIGN(stream, fs_->OpenOutputStream("bucket/newfile1")); ASSERT_OK(stream->Close()); @@ -541,6 +544,11 @@ TEST_F(TestS3FS, GetFileInfoBucket) { AssertFileInfo(fs_.get(), "bucket/", FileType::Directory); AssertFileInfo(fs_.get(), "empty-bucket/", FileType::Directory); AssertFileInfo(fs_.get(), "nonexistent-bucket/", FileType::NotFound); + + // URIs + ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:bucket")); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:empty-bucket")); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:nonexistent-bucket")); } TEST_F(TestS3FS, GetFileInfoObject) { @@ -568,6 +576,11 @@ TEST_F(TestS3FS, GetFileInfoObject) { AssertFileInfo(fs_.get(), "bucket/somefile/", FileType::File, 9); AssertFileInfo(fs_.get(), "bucket/emptyd/", FileType::NotFound); AssertFileInfo(fs_.get(), "non-existent-bucket/somed/", FileType::NotFound); + + // URIs + ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:bucket/emptydir")); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:bucket/otherdir")); + ASSERT_RAISES(Invalid, fs_->GetFileInfo("s3:bucket/somefile")); } TEST_F(TestS3FS, GetFileInfoSelector) { @@ -634,6 +647,12 @@ TEST_F(TestS3FS, GetFileInfoSelector) { ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); SortInfos(&infos); ASSERT_EQ(infos.size(), 4); + + // URIs + select.base_dir = "s3:bucket"; + ASSERT_RAISES(Invalid, fs_->GetFileInfo(select)); + select.base_dir = "s3:bucket/somedir"; + ASSERT_RAISES(Invalid, fs_->GetFileInfo(select)); } TEST_F(TestS3FS, GetFileInfoSelectorRecursive) { @@ -747,6 +766,9 @@ TEST_F(TestS3FS, CreateDir) { // Existing "file", should fail ASSERT_RAISES(IOError, fs_->CreateDir("bucket/somefile")); + + // URI + ASSERT_RAISES(Invalid, fs_->CreateDir("s3:bucket/newdir2")); } TEST_F(TestS3FS, DeleteFile) { @@ -764,6 +786,9 @@ TEST_F(TestS3FS, DeleteFile) { // "Directory" ASSERT_RAISES(IOError, fs_->DeleteFile("bucket/somedir")); AssertFileInfo(fs_.get(), "bucket/somedir", FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs_->DeleteFile("s3:bucket/somefile")); } TEST_F(TestS3FS, DeleteDir) { @@ -800,6 +825,9 @@ TEST_F(TestS3FS, DeleteDir) { // Bucket ASSERT_OK(fs_->DeleteDir("bucket")); AssertFileInfo(fs_.get(), "bucket", FileType::NotFound); + + // URI + ASSERT_RAISES(Invalid, fs_->DeleteDir("s3:empty-bucket")); } TEST_F(TestS3FS, CopyFile) { @@ -856,6 +884,9 @@ TEST_F(TestS3FS, OpenInputStream) { ASSERT_RAISES(IOError, fs_->OpenInputStream("nonexistent-bucket/somefile")); ASSERT_RAISES(IOError, fs_->OpenInputStream("bucket/zzzt")); + // URI + ASSERT_RAISES(Invalid, fs_->OpenInputStream("s3:bucket/somefile")); + // "Files" ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream("bucket/somefile")); ASSERT_OK_AND_ASSIGN(buf, stream->Read(2)); @@ -911,6 +942,9 @@ TEST_F(TestS3FS, OpenInputFile) { ASSERT_RAISES(IOError, fs_->OpenInputFile("nonexistent-bucket/somefile")); ASSERT_RAISES(IOError, fs_->OpenInputFile("bucket/zzzt")); + // URI + ASSERT_RAISES(Invalid, fs_->OpenInputStream("s3:bucket/somefile")); + // "Files" ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile("bucket/somefile")); ASSERT_OK_AND_EQ(9, file->GetSize()); diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc index 35c6b898177..7a8484ce51a 100644 --- a/cpp/src/arrow/util/uri.cc +++ b/cpp/src/arrow/util/uri.cc @@ -17,6 +17,7 @@ #include "arrow/util/uri.h" +#include #include #include #include @@ -93,6 +94,21 @@ std::string UriEncodeHost(const std::string& host) { } } +bool IsValidUriScheme(const arrow::util::string_view s) { + auto is_alpha = [](char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); }; + auto is_scheme_char = [&](char c) { + return is_alpha(c) || (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.'; + }; + + if (s.empty()) { + return false; + } + if (!is_alpha(s[0])) { + return false; + } + return std::all_of(s.begin() + 1, s.end(), is_scheme_char); +} + struct Uri::Impl { Impl() : string_rep_(""), port_(-1) { memset(&uri_, 0, sizeof(uri_)); } diff --git a/cpp/src/arrow/util/uri.h b/cpp/src/arrow/util/uri.h index b4ffbb04dec..eae1956eafc 100644 --- a/cpp/src/arrow/util/uri.h +++ b/cpp/src/arrow/util/uri.h @@ -100,5 +100,9 @@ std::string UriUnescape(const arrow::util::string_view s); ARROW_EXPORT std::string UriEncodeHost(const std::string& host); +/// Whether the string is a syntactically valid URI scheme according to RFC 3986. +ARROW_EXPORT +bool IsValidUriScheme(const arrow::util::string_view s); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/uri_test.cc b/cpp/src/arrow/util/uri_test.cc index 169e9c81b36..8cf93b2331d 100644 --- a/cpp/src/arrow/util/uri_test.cc +++ b/cpp/src/arrow/util/uri_test.cc @@ -41,6 +41,22 @@ TEST(UriEncodeHost, Basics) { ASSERT_EQ(UriEscape("192.168.1.1"), "192.168.1.1"); } +TEST(IsValidUriScheme, Basics) { + ASSERT_FALSE(IsValidUriScheme("")); + ASSERT_FALSE(IsValidUriScheme(":")); + ASSERT_FALSE(IsValidUriScheme(".")); + ASSERT_TRUE(IsValidUriScheme("a")); + ASSERT_TRUE(IsValidUriScheme("file")); + ASSERT_TRUE(IsValidUriScheme("local-file")); + ASSERT_TRUE(IsValidUriScheme("s3")); + ASSERT_TRUE(IsValidUriScheme("grpc+https")); + ASSERT_TRUE(IsValidUriScheme("file.local")); + ASSERT_FALSE(IsValidUriScheme("3s")); + ASSERT_FALSE(IsValidUriScheme("-file")); + ASSERT_FALSE(IsValidUriScheme("local/file")); + ASSERT_FALSE(IsValidUriScheme("filé")); +} + TEST(Uri, Empty) { Uri uri; ASSERT_EQ(uri.scheme(), ""); diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 5d33268618f..b3b27f24eac 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -165,7 +165,7 @@ def _resolve_filesystem_and_path( filesystem = LocalFileSystem() try: file_info = filesystem.get_file_info(path) - except OSError: + except ValueError: # ValueError means path is likely an URI file_info = None exists_locally = False else: