Skip to content

Commit 1c5c592

Browse files
committed
Ignore nested files when calculating the total
For the nested files: - folder/ (5 MB) - folder/big_file (15 MB) The --total now outputs 15 MB instead of the previous 20 MB, because the inner file is inside of the folder that was also passed as an argument. Implemented with the Trie data structure, made of HashMap and PathBufs that represent each path components of the canonicalized file paths. Fixes #12.
1 parent 964ddfd commit 1c5c592

File tree

2 files changed

+106
-2
lines changed

2 files changed

+106
-2
lines changed

src/lib.rs

+104
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use number_prefix::NumberPrefix;
33
use walkdir::WalkDir;
44

55
use std::{
6+
collections::BTreeMap,
67
fmt::Display,
78
path::{Path, PathBuf},
89
};
@@ -31,6 +32,7 @@ impl<T, E: Display> ResultExt<T, E> for Result<T, E> {
3132
}
3233
}
3334

35+
#[derive(PartialEq, Eq, PartialOrd, Ord)]
3436
pub struct Entry {
3537
pub path: PathBuf,
3638
pub size: u64,
@@ -91,3 +93,105 @@ pub fn format_size(size: u64, binary: bool) -> String {
9193
NumberPrefix::Prefixed(prefix, number) => format!("{:.2} {}B", number, prefix),
9294
}
9395
}
96+
97+
/// Calculate the sum of sizes of all entries
98+
///
99+
/// Ignore nested files when calculating the total
100+
///
101+
/// For the nested files:
102+
/// - `folder/ (5 MB)`
103+
/// - `folder/big_file (15 MB)`
104+
///
105+
/// The is 15 MB instead of 20 MB because the inner file is inside of the
106+
/// folder that was also received as an argument
107+
///
108+
/// Implemented with the Trie data structure, made of HashMap and PathBufs
109+
/// that represent each path components of the canonicalized file paths
110+
pub fn calculate_unique_total_size(entries: &[Entry]) -> u64 {
111+
// Sorted to guarantee that files can only come after their parent directories
112+
let sorted_entries = {
113+
let mut entries_and_paths: Vec<(PathBuf, &Entry)> = vec![];
114+
115+
// Canonicalize each path, reporting and skipping errors
116+
for entry in entries {
117+
let canonical_path = entry.path.canonicalize().log_err(Some(&entry.path));
118+
if let Ok(path) = canonical_path {
119+
entries_and_paths.push((path, entry));
120+
}
121+
}
122+
entries_and_paths.sort_unstable();
123+
entries_and_paths
124+
};
125+
126+
#[derive(PartialEq, Eq, PartialOrd, Ord)]
127+
struct TriePathNode {
128+
// Children nodes of this current path, accessed by path
129+
children: BTreeMap<PathBuf, TriePathNode>,
130+
// Size of the file that ends at this node
131+
node_size: u64,
132+
}
133+
134+
let mut trie_root = TriePathNode {
135+
children: BTreeMap::new(),
136+
node_size: 0,
137+
};
138+
139+
// For each entry/path, add it to the Trie if it wasn't already inserted
140+
//
141+
// If the Trie receives a folder that is parent of a previously added file, then just consider
142+
// the parent folder, removing the childs, this way, we do not count them twice towards the
143+
// final total
144+
for (path, entry) in sorted_entries {
145+
// Necessary because we need to check when it's the last path piece
146+
let mut path_iter = path.iter().peekable();
147+
// Pointer to traverse the tree
148+
let mut current_trie_node = &mut trie_root;
149+
// Size to be added at the endif the current entry isn't children of any other
150+
let size_of_current_file = entry.size;
151+
152+
while let Some(piece) = path_iter.next() {
153+
// Query for the node in the Trie which matches the current path piece
154+
let entry = current_trie_node.children.entry(PathBuf::from(piece));
155+
156+
// Keeps track if the current entry is child of another previously found
157+
let mut already_considered = false;
158+
let next_trie_node = entry
159+
.and_modify(|_| {
160+
// If we are in this block, it means that the node size was already considered
161+
// because a parent of it was inserted. So we will skip this file
162+
already_considered = true;
163+
})
164+
// Add a node with 0 size, which may be changed after if it is the last piece
165+
.or_insert(TriePathNode {
166+
children: BTreeMap::new(),
167+
node_size: 0,
168+
});
169+
170+
// Skipping already accounted file, because it is nested inside of another one
171+
if already_considered {
172+
break;
173+
}
174+
175+
// If we are at the last piece of the current entry path, it means that this is the tip
176+
// that finally represents the file, and which path is the full file path
177+
let is_the_last_piece = path_iter.peek().is_none();
178+
if is_the_last_piece {
179+
// Update the size of this piece
180+
next_trie_node.node_size = size_of_current_file;
181+
// Drop all the childrens so that their sizes won't be added
182+
next_trie_node.children.clear();
183+
}
184+
185+
// Update the pointer to keep traversing the trie
186+
current_trie_node = next_trie_node;
187+
}
188+
}
189+
190+
fn trie_recursive_sum(node: &TriePathNode) -> u64 {
191+
let children_sum: u64 = node.children.values().map(trie_recursive_sum).sum();
192+
node.node_size + children_sum
193+
}
194+
195+
// Traverse the trie tree to calculate the sum
196+
trie_recursive_sum(&trie_root)
197+
}

src/main.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use wild;
44

55
mod cli;
66
use cli::Cli;
7-
use durt::{format_size, Entry};
7+
use durt::{calculate_unique_total_size, format_size, Entry};
88

99
fn main() {
1010
#[cfg(windows)]
@@ -60,7 +60,7 @@ fn main() {
6060
Table::new(" {:>} {:<}")
6161
};
6262

63-
let total_size = entries.iter().map(|e| e.size).sum();
63+
let total_size = calculate_unique_total_size(&entries);
6464
let mut omitted_entries = 0;
6565

6666
for entry in entries {

0 commit comments

Comments
 (0)