Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ set(ARROW_SRCS
filesystem/filesystem.cc
filesystem/localfs.cc
filesystem/mockfs.cc
filesystem/path_tree.cc
filesystem/path_util.cc
filesystem/util_internal.cc
io/buffered.cc
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/filesystem")

add_arrow_test(filesystem_test)
add_arrow_test(localfs_test)
add_arrow_test(path_tree_test)

if(ARROW_S3)
add_arrow_test(s3fs_test)
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ std::string FileStats::base_name() const {
return internal::GetAbstractPathParent(path_).second;
}

// Counterpart of base_name(): yields the first element of the pair produced
// by internal::GetAbstractPathParent() for this entry's path.
std::string FileStats::dir_name() const {
  std::string parent = internal::GetAbstractPathParent(path_).first;
  return parent;
}

// Debug helper
std::ostream& operator<<(std::ostream& os, const FileStats& stats) {
return os << "FileStats(" << stats.type() << ", " << stats.path() << ")";
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ struct ARROW_EXPORT FileStats {
/// The file base name (component after the last directory separator)
std::string base_name() const;

/// The directory name (component before the file base name)
std::string dir_name() const;

/// The size in bytes, if available
///
/// Only regular files are guaranteed to have a size.
Expand All @@ -110,6 +113,9 @@ struct ARROW_EXPORT FileStats {
TimePoint mtime() const { return mtime_; }
void set_mtime(TimePoint mtime) { mtime_ = mtime; }

/// True when the entry's type is FileType::File.
bool IsFile() const {
  return FileType::File == type_;
}
/// True when the entry's type is FileType::Directory.
bool IsDirectory() const {
  return FileType::Directory == type_;
}

bool operator==(const FileStats& other) const {
return type() == other.type() && path() == other.path() && size() == other.size() &&
mtime() == other.mtime();
Expand Down
137 changes: 137 additions & 0 deletions cpp/src/arrow/filesystem/path_tree.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//

#include "arrow/filesystem/path_tree.h"

#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "arrow/filesystem/path_util.h"

namespace arrow {
namespace fs {

// Index from a path string to the PathTree node associated with that path.
using PathTreeByPathMap = std::unordered_map<std::string, std::shared_ptr<PathTree>>;

// Looks for the closest ancestor of `path` that is registered in
// `directories`.
//
// The parent of `path` is tried first, then the parent of that parent, and so
// on, each parent being obtained from internal::GetAbstractPathParent(). The
// walk ends once the path being examined is empty.
//
// Returns the node of the first registered ancestor, or nullptr if none of
// the ancestors is present in `directories`.
std::shared_ptr<PathTree> FindAncestor(const PathTreeByPathMap& directories,
                                       std::string path) {
  while (!path.empty()) {
    auto candidate = internal::GetAbstractPathParent(path).first;

    const auto entry = directories.find(candidate);
    if (entry != directories.end()) {
      return entry->second;
    }

    // Not registered: continue one level further up.
    path = std::move(candidate);
  }

  return nullptr;
}

// Builds a forest out of `stats` and stores it in `out`.
//
// Every entry becomes a node. A node is attached to the closest ancestor
// directory that is part of the input; when there is no such ancestor the node
// becomes a root of the forest. Entries with an empty path are ignored.
Status PathTree::Make(std::vector<FileStats> stats, PathForest* out) {
  // Process entries from the shortest path to the longest one, so that a node
  // is always handled after all of its ancestors. This ordering knows nothing
  // about special components such as '..'; paths are expected to be absolute.
  std::stable_sort(stats.begin(), stats.end(),
                   [](const FileStats& lhs, const FileStats& rhs) {
                     return lhs.path().size() < rhs.path().size();
                   });

  // Directory nodes created so far, indexed by their path.
  PathTreeByPathMap known_directories;
  PathForest roots;

  for (const FileStats& entry : stats) {
    if (entry.path() == "") {
      continue;
    }

    auto parent_node = FindAncestor(known_directories, entry.path());
    auto node = std::make_shared<PathTree>(entry);
    if (parent_node) {
      parent_node->AddChild(node);
    } else {
      roots.push_back(node);
    }

    // Only directories can later act as the ancestor of another entry.
    if (entry.type() == FileType::Directory) {
      known_directories[entry.path()] = node;
    }
  }

  *out = std::move(roots);
  return Status::OK();
}

// Builds a single tree out of `stats`.
//
// Delegates to the PathForest overload and then checks the number of roots:
//  - more than one root: returns Status::Invalid and leaves `out` untouched;
//  - exactly one root: stores it in `out`;
//  - no root (e.g. empty input): resets `out` to null, so that the caller
//    never observes a stale tree after a successful call.
Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out) {
  PathForest forest;
  // `stats` is a by-value parameter that is not used afterwards: hand it over
  // instead of deep-copying the whole vector.
  RETURN_NOT_OK(Make(std::move(stats), &forest));

  const auto size = forest.size();
  if (size > 1) {
    return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1.");
  }

  if (size == 1) {
    *out = std::move(forest[0]);
  } else {
    out->reset();
  }

  return Status::OK();
}

// Debug helper: prints a tree as "PathTree(<stats>)" for a node without
// subtrees, and as "PathTree(<stats>, [<subtree>, <subtree>, ...])" otherwise,
// each subtree being printed recursively with the same format.
std::ostream& operator<<(std::ostream& os, const PathTree& tree) {
  os << "PathTree(" << tree.stats();

  const auto& children = tree.subtrees();
  if (!children.empty()) {
    os << ", [";
    bool is_first = true;
    for (const auto& child : children) {
      if (!is_first) os << ", ";
      is_first = false;
      os << *child;
    }
    os << "]";
  }

  os << ")";
  return os;
}

std::ostream& operator<<(std::ostream& os, const std::shared_ptr<PathTree>& tree) {
if (tree != nullptr) {
return os << *tree.get();
}

return os;
}

// Compares the pointed-to trees rather than the pointers themselves: two null
// pointers are equal, a null pointer never equals a non-null one, and two
// non-null pointers are equal when PathTree::operator== says their trees are.
bool operator==(const std::shared_ptr<PathTree>& lhs,
                const std::shared_ptr<PathTree>& rhs) {
  const bool lhs_is_null = !lhs;
  const bool rhs_is_null = !rhs;
  if (lhs_is_null || rhs_is_null) {
    return lhs_is_null && rhs_is_null;
  }

  return *lhs == *rhs;
}

} // namespace fs
} // namespace arrow
110 changes: 110 additions & 0 deletions cpp/src/arrow/filesystem/path_tree.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/filesystem/filesystem.h"

#include <algorithm>
#include <iosfwd>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/status.h"

namespace arrow {
namespace fs {

// Forward declaration, required by the PathForest alias below.
class ARROW_EXPORT PathTree;

/// \brief A PathForest consists of multiple PathTrees.
using PathForest = std::vector<std::shared_ptr<PathTree>>;

/// \brief A PathTree is a utility to transform a vector of FileStats into a
/// forest representation for tree traversal purposes. Each node in the graph
/// wraps a FileStats. Files are expected to be found only at leaves of the tree.
class ARROW_EXPORT PathTree {
 public:
  /// \brief Creates a node without subtrees wrapping the given FileStats.
  explicit PathTree(FileStats stats) : stats_(std::move(stats)) {}

  /// \brief Creates a node wrapping the given FileStats, with the given
  /// subtrees as its children.
  PathTree(FileStats stats, std::vector<std::shared_ptr<PathTree>> subtrees)
      : stats_(std::move(stats)), subtrees_(std::move(subtrees)) {}

  /// \brief Transforms a FileStats vector into a forest of trees. Since there
  /// is no guarantee of complete trees, it is possible to have a forest
  /// (multiple roots). The caller should ensure that stats have unique paths.
  ///
  /// \param[in] stats the entries to organize
  /// \param[out] out the resulting forest
  static Status Make(std::vector<FileStats> stats, PathForest* out);

  /// \brief Like the PathForest overload of Make, but fails if there's more
  /// than one root.
  ///
  /// \param[in] stats the entries to organize
  /// \param[out] out the single resulting tree
  static Status Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out);

  /// \brief Returns the FileStats of this node.
  FileStats stats() const { return stats_; }
  /// \brief Returns the subtrees under this node.
  std::vector<std::shared_ptr<PathTree>> subtrees() const { return subtrees_; }

  /// \brief Visit with eager pruning.
  ///
  /// The matcher `m` is called as `m(stats, &match)` on this node first. If it
  /// fails, its Status is returned. If it leaves `match` false, this node and
  /// all of its subtrees are skipped. Otherwise the visitor `v` is called as
  /// `v(stats)` on this node, and the subtrees are then visited recursively,
  /// in order. The first non-OK Status stops the traversal and is returned.
  template <typename Visitor, typename Matcher>
  Status Visit(Visitor&& v, Matcher&& m) const {
    bool match = false;
    ARROW_RETURN_NOT_OK(m(stats_, &match));
    if (!match) {
      return Status::OK();
    }

    ARROW_RETURN_NOT_OK(v(stats_));

    for (const auto& t : subtrees_) {
      ARROW_RETURN_NOT_OK(t->Visit(v, m));
    }

    return Status::OK();
  }

  /// \brief Visit every node of the tree (no pruning), this node first and
  /// then its subtrees recursively.
  template <typename Visitor>
  Status Visit(Visitor&& v) const {
    // The FileStats parameter is deliberately left unnamed: it is not used,
    // and naming it would trigger unused-parameter warnings.
    auto always_match = [](const FileStats&, bool* match) {
      *match = true;
      return Status::OK();
    };
    return Visit(v, always_match);
  }

  /// \brief Two trees are equal when their FileStats are equal and their
  /// subtree vectors compare equal.
  bool operator==(const PathTree& other) const {
    return stats_ == other.stats_ && subtrees_ == other.subtrees_;
  }

 protected:
  FileStats stats_;
  std::vector<std::shared_ptr<PathTree>> subtrees_;

  // The AddChild method is convenient to create trees in a top-down fashion,
  // e.g. the Make factory constructor.
  void AddChild(std::shared_ptr<PathTree> child) {
    subtrees_.push_back(std::move(child));
  }
};

/// \brief Debug helper, prints the pointed-to tree (nothing if the pointer is
/// null).
ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
const std::shared_ptr<PathTree>& tree);
/// \brief Debug helper, prints the node followed by its subtrees.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree);

/// \brief Compares the pointed-to trees instead of the pointers. Two null
/// pointers are equal; a null pointer is never equal to a non-null one.
ARROW_EXPORT bool operator==(const std::shared_ptr<PathTree>& lhs,
const std::shared_ptr<PathTree>& rhs);

} // namespace fs
} // namespace arrow
Loading