diff --git a/cpp/src/arrow/dataset/discovery.cc b/cpp/src/arrow/dataset/discovery.cc index da1f8c592ce..12de3cbd3ea 100644 --- a/cpp/src/arrow/dataset/discovery.cc +++ b/cpp/src/arrow/dataset/discovery.cc @@ -34,25 +34,37 @@ namespace arrow { namespace dataset { FileSystemDataSourceDiscovery::FileSystemDataSourceDiscovery( - fs::FileSystem* filesystem, std::vector files, + fs::FileSystem* filesystem, std::string base_dir, std::vector files, std::shared_ptr format) - : fs_(filesystem), files_(std::move(files)), format_(std::move(format)) {} + : fs_(filesystem), + base_dir_(std::move(base_dir)), + files_(std::move(files)), + format_(std::move(format)) {} Status FileSystemDataSourceDiscovery::Make(fs::FileSystem* filesystem, + std::string base_dir, std::vector files, std::shared_ptr format, std::shared_ptr* out) { - out->reset(new FileSystemDataSourceDiscovery(filesystem, files, format)); + out->reset( + new FileSystemDataSourceDiscovery(filesystem, std::move(base_dir), files, format)); return Status::OK(); } +Status FileSystemDataSourceDiscovery::Make(fs::FileSystem* filesystem, + std::vector files, + std::shared_ptr format, + std::shared_ptr* out) { + return Make(filesystem, "", std::move(files), std::move(format), out); +} + Status FileSystemDataSourceDiscovery::Make(fs::FileSystem* filesystem, fs::Selector selector, std::shared_ptr format, std::shared_ptr* out) { std::vector files; RETURN_NOT_OK(filesystem->GetTargetStats(selector, &files)); - return Make(filesystem, files, format, out); + return Make(filesystem, std::move(selector.base_dir), std::move(files), format, out); } static inline Status InspectSchema(fs::FileSystem* fs, @@ -85,7 +97,8 @@ Status FileSystemDataSourceDiscovery::Finish(std::shared_ptr* out) { PathPartitions partitions; if (partition_scheme_ != nullptr) { - RETURN_NOT_OK(ApplyPartitionScheme(*partition_scheme_, files_, &partitions)); + RETURN_NOT_OK( + ApplyPartitionScheme(*partition_scheme_, base_dir_, files_, &partitions)); } return FileSystemBasedDataSource::Make(fs_, files_, root_partition(), diff --git a/cpp/src/arrow/dataset/discovery.h b/cpp/src/arrow/dataset/discovery.h index cbc1258e547..dbb596e98ad 100644 --- a/cpp/src/arrow/dataset/discovery.h +++ b/cpp/src/arrow/dataset/discovery.h @@ -89,6 +89,10 @@ class ARROW_DS_EXPORT DataSourceDiscovery { /// of fs::FileStats or a fs::Selector. class ARROW_DS_EXPORT FileSystemDataSourceDiscovery : public DataSourceDiscovery { public: + static Status Make(fs::FileSystem* filesystem, std::string base_dir, + std::vector files, std::shared_ptr format, + std::shared_ptr* out); + static Status Make(fs::FileSystem* filesystem, std::vector files, std::shared_ptr format, std::shared_ptr* out); @@ -102,11 +106,12 @@ class ARROW_DS_EXPORT FileSystemDataSourceDiscovery : public DataSourceDiscovery Status Finish(std::shared_ptr* out) override; protected: - FileSystemDataSourceDiscovery(fs::FileSystem* filesystem, + FileSystemDataSourceDiscovery(fs::FileSystem* filesystem, std::string base_dir, std::vector files, std::shared_ptr format); fs::FileSystem* fs_; + std::string base_dir_; std::vector files_; std::shared_ptr format_; }; diff --git a/cpp/src/arrow/dataset/discovery_test.cc b/cpp/src/arrow/dataset/discovery_test.cc index 936d995c1ba..92cf701aaac 100644 --- a/cpp/src/arrow/dataset/discovery_test.cc +++ b/cpp/src/arrow/dataset/discovery_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/dataset/partition.h" #include "arrow/dataset/test_util.h" #include "arrow/filesystem/test_util.h" @@ -31,16 +32,11 @@ class FileSystemDataSourceDiscoveryTest : public TestFileSystemBasedDataSource { void MakeDiscovery(const std::vector& files) { MakeFileSystem(files); ASSERT_OK( - FileSystemDataSourceDiscovery::Make(fs_.get(), files, format_, &discovery_)); - } - - void MakeDiscovery(const std::vector& files, fs::Selector selector) { - MakeFileSystem(files); - ASSERT_OK( - FileSystemDataSourceDiscovery::Make(fs_.get(), selector, format_, &discovery_)); + FileSystemDataSourceDiscovery::Make(fs_.get(), selector_, format_, &discovery_)); } protected: + fs::Selector selector_; std::shared_ptr discovery_; std::shared_ptr format_ = std::make_shared(); }; @@ -53,15 +49,29 @@ TEST_F(FileSystemDataSourceDiscoveryTest, Basic) { } TEST_F(FileSystemDataSourceDiscoveryTest, Selector) { - // This test ensure that the Selector is enforced. - fs::Selector selector; - selector.base_dir = "A"; - MakeDiscovery({fs::File("0"), fs::File("A/a")}, selector); + selector_.base_dir = "A"; + MakeDiscovery({fs::File("0"), fs::File("A/a")}); ASSERT_OK(discovery_->Finish(&source_)); + // "0" doesn't match selector, so it has been dropped: AssertFragmentsAreFromPath(source_->GetFragments(options_), {"A/a"}); } +TEST_F(FileSystemDataSourceDiscoveryTest, Partition) { + selector_.base_dir = "a=ignored/base"; + MakeDiscovery( + {fs::File(selector_.base_dir + "/a=1"), fs::File(selector_.base_dir + "/a=2")}); + + auto partition_scheme = + std::make_shared(schema({field("a", int32())})); + + ASSERT_OK(discovery_->SetPartitionScheme(partition_scheme)); + ASSERT_OK(discovery_->Finish(&source_)); + + AssertFragmentsAreFromPath(source_->GetFragments(options_), + {selector_.base_dir + "/a=1", selector_.base_dir + "/a=2"}); +} + TEST_F(FileSystemDataSourceDiscoveryTest, Inspect) { auto s = schema({field("f64", float64())}); format_ = std::make_shared(s); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index 9141bb64b99..0fa940c68b1 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -90,11 +90,18 @@ Result> HivePartitionScheme::Parse( Status ApplyPartitionScheme(const PartitionScheme& scheme, std::vector files, PathPartitions* out) { + return ApplyPartitionScheme(scheme, "", std::move(files), out); +} + +Status ApplyPartitionScheme(const PartitionScheme& scheme, const std::string& base_dir, + std::vector files, PathPartitions* out) { for (const auto& file : files) { - const auto& path = file.path(); + if (file.path().substr(0, base_dir.size()) != base_dir) continue; + auto path = file.path().substr(base_dir.size()); + std::shared_ptr partition; RETURN_NOT_OK(scheme.Parse(path, &partition)); - out->emplace(path, std::move(partition)); + out->emplace(std::move(path), std::move(partition)); } return Status::OK(); diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h index 83ac5c862db..012ec32914c 100644 --- a/cpp/src/arrow/dataset/partition.h +++ b/cpp/src/arrow/dataset/partition.h @@ -34,7 +34,8 @@ namespace arrow { namespace fs { struct FileStats; -} +struct Selector; +} // namespace fs namespace dataset { @@ -171,6 +172,9 @@ using PathPartitions = std::unordered_map files, PathPartitions* out); +Status ApplyPartitionScheme(const PartitionScheme& scheme, const std::string& base_dir, + std::vector files, PathPartitions* out); + // TODO(bkietz) use RE2 and named groups to provide RegexpPartitionScheme } // namespace dataset diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index bc17346894e..7084f839a69 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -91,7 +91,7 @@ struct ARROW_EXPORT FileStats { void set_type(FileType type) { type_ = type; } /// The full file path in the filesystem - std::string path() const { return path_; } + const std::string& path() const { return path_; } void set_path(const std::string& path) { path_ = path; } /// The file base name (component after the last directory separator)