Skip to content

Commit

Permalink
Ignore nested files when calculating the total
Browse files Browse the repository at this point in the history
For the nested files:
 - folder/         (5 MB)
 - folder/big_file (15 MB)

The --total now outputs 15 MB instead of the previous 20 MB, because the
inner file is inside of the folder that was also passed as an argument.

Implemented with a trie (prefix tree) built from BTreeMaps keyed by PathBufs,
where each key is one component of a canonicalized file path.

Fixes cauebs#12.
  • Loading branch information
marcospb19 committed Dec 5, 2021
1 parent 964ddfd commit 1c5c592
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 2 deletions.
104 changes: 104 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use number_prefix::NumberPrefix;
use walkdir::WalkDir;

use std::{
collections::BTreeMap,
fmt::Display,
path::{Path, PathBuf},
};
Expand Down Expand Up @@ -31,6 +32,7 @@ impl<T, E: Display> ResultExt<T, E> for Result<T, E> {
}
}

#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub struct Entry {
pub path: PathBuf,
pub size: u64,
Expand Down Expand Up @@ -91,3 +93,105 @@ pub fn format_size(size: u64, binary: bool) -> String {
NumberPrefix::Prefixed(prefix, number) => format!("{:.2} {}B", number, prefix),
}
}

/// Calculate the sum of sizes of all entries
///
/// Ignore nested files when calculating the total
///
/// For the nested files:
/// - `folder/ (5 MB)`
/// - `folder/big_file (15 MB)`
///
/// The is 15 MB instead of 20 MB because the inner file is inside of the
/// folder that was also received as an argument
///
/// Implemented with the Trie data structure, made of HashMap and PathBufs
/// that represent each path components of the canonicalized file paths
pub fn calculate_unique_total_size(entries: &[Entry]) -> u64 {
// Sorted to guarantee that files can only come after their parent directories
let sorted_entries = {
let mut entries_and_paths: Vec<(PathBuf, &Entry)> = vec![];

// Canonicalize each path, reporting and skipping errors
for entry in entries {
let canonical_path = entry.path.canonicalize().log_err(Some(&entry.path));
if let Ok(path) = canonical_path {
entries_and_paths.push((path, entry));
}
}
entries_and_paths.sort_unstable();
entries_and_paths
};

#[derive(PartialEq, Eq, PartialOrd, Ord)]
struct TriePathNode {
// Children nodes of this current path, accessed by path
children: BTreeMap<PathBuf, TriePathNode>,
// Size of the file that ends at this node
node_size: u64,
}

let mut trie_root = TriePathNode {
children: BTreeMap::new(),
node_size: 0,
};

// For each entry/path, add it to the Trie if it wasn't already inserted
//
// If the Trie receives a folder that is parent of a previously added file, then just consider
// the parent folder, removing the childs, this way, we do not count them twice towards the
// final total
for (path, entry) in sorted_entries {
// Necessary because we need to check when it's the last path piece
let mut path_iter = path.iter().peekable();
// Pointer to traverse the tree
let mut current_trie_node = &mut trie_root;
// Size to be added at the endif the current entry isn't children of any other
let size_of_current_file = entry.size;

while let Some(piece) = path_iter.next() {
// Query for the node in the Trie which matches the current path piece
let entry = current_trie_node.children.entry(PathBuf::from(piece));

// Keeps track if the current entry is child of another previously found
let mut already_considered = false;
let next_trie_node = entry
.and_modify(|_| {
// If we are in this block, it means that the node size was already considered
// because a parent of it was inserted. So we will skip this file
already_considered = true;
})
// Add a node with 0 size, which may be changed after if it is the last piece
.or_insert(TriePathNode {
children: BTreeMap::new(),
node_size: 0,
});

// Skipping already accounted file, because it is nested inside of another one
if already_considered {
break;
}

// If we are at the last piece of the current entry path, it means that this is the tip
// that finally represents the file, and which path is the full file path
let is_the_last_piece = path_iter.peek().is_none();
if is_the_last_piece {
// Update the size of this piece
next_trie_node.node_size = size_of_current_file;
// Drop all the childrens so that their sizes won't be added
next_trie_node.children.clear();
}

// Update the pointer to keep traversing the trie
current_trie_node = next_trie_node;
}
}

fn trie_recursive_sum(node: &TriePathNode) -> u64 {
let children_sum: u64 = node.children.values().map(trie_recursive_sum).sum();
node.node_size + children_sum
}

// Traverse the trie tree to calculate the sum
trie_recursive_sum(&trie_root)
}
4 changes: 2 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use wild;

mod cli;
use cli::Cli;
use durt::{format_size, Entry};
use durt::{calculate_unique_total_size, format_size, Entry};

fn main() {
#[cfg(windows)]
Expand Down Expand Up @@ -60,7 +60,7 @@ fn main() {
Table::new(" {:>} {:<}")
};

let total_size = entries.iter().map(|e| e.size).sum();
let total_size = calculate_unique_total_size(&entries);
let mut omitted_entries = 0;

for entry in entries {
Expand Down

0 comments on commit 1c5c592

Please sign in to comment.