Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feature] Full comment deduplication for referential objects, based on name only for similarity #685

Merged
merged 5 commits into from
Aug 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,77 @@ impl Collections {
self.vehicle_journeys = CollectionWithId::new(vehicle_journeys).unwrap();
}

/// Rewrites comment links so that comments considered similar all point to
/// a single referent comment.
///
/// The duplicate-to-referent mapping comes from
/// `get_comment_map_duplicate_to_referent`; when it is empty, nothing is
/// touched. Otherwise the comment links of `lines`, `routes`, `stop_areas`,
/// `stop_points` and `stop_locations` are updated. The duplicate comments
/// themselves are not removed from `self.comments` by this method.
pub fn comment_deduplication(&mut self) {
    /// Replaces, in every object of `collection`, each comment link that is
    /// a key of `duplicate2ref` by the referent id it maps to.
    fn replace_comment_duplicates_by_ref<T>(
        collection: &mut CollectionWithId<T>,
        duplicate2ref: &BTreeMap<String, String>,
    ) where
        T: Id<T> + CommentLinks,
    {
        // Read-only pass: list every (object, duplicate link, referent)
        // triple to patch. `get_key_value` is used so that the ids are
        // borrowed from `duplicate2ref` rather than from `collection`,
        // which leaves `collection` free to be mutably borrowed below.
        let replacements: Vec<(Idx<T>, &str, &String)> = collection
            .iter()
            .flat_map(|(idx, pt_object)| {
                pt_object
                    .comment_links()
                    .iter()
                    .filter_map(move |comment_id| {
                        duplicate2ref
                            .get_key_value(comment_id)
                            .map(|(duplicate_id, referent_id)| {
                                (idx, duplicate_id.as_str(), referent_id)
                            })
                    })
            })
            .collect();

        // Mutating pass: swap each duplicate link for its referent.
        for (idx, duplicate_id, referent_id) in replacements {
            let mut pt_object = collection.index_mut(idx);
            let links = pt_object.comment_links_mut();
            links.remove(duplicate_id);
            links.insert(referent_id.clone());
        }
    }

    let duplicate2ref = self.get_comment_map_duplicate_to_referent();
    if duplicate2ref.is_empty() {
        // No duplicate comment: every link is already a referent.
        return;
    }

    replace_comment_duplicates_by_ref(&mut self.lines, &duplicate2ref);
    replace_comment_duplicates_by_ref(&mut self.routes, &duplicate2ref);
    replace_comment_duplicates_by_ref(&mut self.stop_areas, &duplicate2ref);
    replace_comment_duplicates_by_ref(&mut self.stop_points, &duplicate2ref);
    replace_comment_duplicates_by_ref(&mut self.stop_locations, &duplicate2ref);
}

/// Builds the map of duplicate comments, looking only at the comment
/// collection.
///
/// Two comments are considered similar when they have the same `name`. The
/// first comment met with a given name (in the iteration order of
/// `self.comments.values()`) becomes the referent and is kept; every later
/// comment with that name is a duplicate.
///
/// Result: each duplicate id (comment to be removed) is mapped to the id of
/// its referent (the unique one to be kept). Referents never appear as keys.
fn get_comment_map_duplicate_to_referent(&self) -> BTreeMap<String, String> {
    // Referent comment id for each similarity key met so far.
    let mut referent_by_key: HashMap<&str, &str> = HashMap::new();
    let mut duplicate_to_referent: BTreeMap<String, String> = BTreeMap::new();

    for comment in self.comments.values() {
        // Only the name is taken into account for similarity.
        let key = comment.name.as_str();
        match referent_by_key.get(key) {
            Some(referent_id) => {
                duplicate_to_referent.insert(comment.id.to_string(), referent_id.to_string());
            }
            None => {
                referent_by_key.insert(key, &comment.id);
            }
        }
    }

    duplicate_to_referent
}

/// If the route name is empty, it is derived from the most frequent
/// `stop_area` origin and `stop_area` destination of all the associated
/// trips. The `stop_area` name is used to create the following `String`:
Expand Down Expand Up @@ -1100,6 +1171,7 @@ impl Model {
/// assert!(Model::new(collections).is_ok());
/// ```
pub fn new(mut c: Collections) -> Result<Self> {
c.comment_deduplication();
c.sanitize()?;

let forward_vj_to_sp = c
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
object_id,object_type,comment_id
GDL,stop_area,comment:kept:1
CHAM,stop_point,comment:kept:2
CHAM,stop_point,comment:kept:1
B42,line,comment:kept:1
M1F,route,comment:kept:1
M1B1_R,trip,comment:kept:2
Expand Down