From fb8020217296d846b79740abfc84c42736100fbb Mon Sep 17 00:00:00 2001 From: Ruslan Popov Date: Sun, 2 Nov 2025 19:30:50 +0100 Subject: [PATCH 1/4] feat(slr): run duplicate check after finding papers --- .../crawler/AutomaticDuplicateRemover.java | 53 +++++++++++++++++++ .../jabref/logic/crawler/StudyRepository.java | 8 +++ 2 files changed, 61 insertions(+) create mode 100644 jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java diff --git a/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java b/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java new file mode 100644 index 00000000000..ba3455e77d2 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java @@ -0,0 +1,53 @@ +package org.jabref.logic.crawler; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.jabref.logic.database.DuplicateCheck; +import org.jabref.model.database.BibDatabase; +import org.jabref.model.database.BibDatabaseContext; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.BibEntryTypesManager; + +public class AutomaticDuplicateRemover { + private final BibEntryTypesManager bibEntryTypesManager; + + public AutomaticDuplicateRemover(BibEntryTypesManager bibEntryTypesManager) { + this.bibEntryTypesManager = bibEntryTypesManager; + } + + public void removeDuplicates(BibDatabaseContext databaseContext) { + DuplicateCheck duplicateCheck = new DuplicateCheck(bibEntryTypesManager); + BibDatabase database = databaseContext.getDatabase(); + List entries = database.getEntries(); + List entriesToRemove = new ArrayList<>(); + Set handledEntries = new HashSet<>(); + + for (int i = 0; i < entries.size(); i++) { + BibEntry entry1 = entries.get(i); + if (handledEntries.contains(entry1)) { + continue; + } + + for (int j = i + 1; j < entries.size(); j++) { + BibEntry entry2 = entries.get(j); + if 
(handledEntries.contains(entry2)) { + continue; + } + + if (duplicateCheck.isDuplicate(entry1, entry2, databaseContext.getMode())) { + entry1.mergeWith(entry2); + entriesToRemove.add(entry2); + handledEntries.add(entry2); + } + } + handledEntries.add(entry1); + } + + for (BibEntry entry : entriesToRemove) { + database.removeEntry(entry); + } + } +} diff --git a/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java b/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java index 6f6c4db5b10..59fcd3740e1 100644 --- a/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java +++ b/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java @@ -7,13 +7,17 @@ import java.nio.file.Path; import java.time.LocalDateTime; import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.jabref.logic.JabRefException; import org.jabref.logic.citationkeypattern.CitationKeyGenerator; import org.jabref.logic.database.DatabaseMerger; +import org.jabref.logic.database.DuplicateCheck; import org.jabref.logic.exporter.AtomicFileWriter; import org.jabref.logic.exporter.BibDatabaseWriter; import org.jabref.logic.exporter.BibWriter; @@ -419,6 +423,9 @@ private void persistResults(List crawlResults) throws IOException, // Merge new entries into study result file merger.merge(existingStudyResultEntries.getDatabase(), newStudyResultEntries); + LOGGER.info("Removing duplicates..."); + new AutomaticDuplicateRemover(bibEntryTypesManager).removeDuplicates(existingStudyResultEntries); + writeResultToFile(getPathToStudyResultFile(), existingStudyResultEntries); } @@ -463,4 +470,5 @@ private Path getPathToStudyResultFile() { private Path getPathToQueryDirectory(String query) { return repositoryPath.resolve(trimNameAndAddID(query)); } + } From 2d70b7fd4c3f100a295a8cde9312a207abf0afb1 Mon Sep 17 00:00:00 2001 
From: Ruslan Popov Date: Thu, 6 Nov 2025 15:01:15 +0100 Subject: [PATCH 2/4] fix(duplicates): remove unnecessary local var --- .../logic/crawler/AutomaticDuplicateRemover.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java b/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java index ba3455e77d2..49959bb78d4 100644 --- a/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java +++ b/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java @@ -1,6 +1,5 @@ package org.jabref.logic.crawler; -import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -22,28 +21,25 @@ public void removeDuplicates(BibDatabaseContext databaseContext) { DuplicateCheck duplicateCheck = new DuplicateCheck(bibEntryTypesManager); BibDatabase database = databaseContext.getDatabase(); List entries = database.getEntries(); - List entriesToRemove = new ArrayList<>(); - Set handledEntries = new HashSet<>(); + Set entriesToRemove = new HashSet<>(); for (int i = 0; i < entries.size(); i++) { BibEntry entry1 = entries.get(i); - if (handledEntries.contains(entry1)) { + if (entriesToRemove.contains(entry1)) { continue; } for (int j = i + 1; j < entries.size(); j++) { BibEntry entry2 = entries.get(j); - if (handledEntries.contains(entry2)) { + if (entriesToRemove.contains(entry2)) { continue; } if (duplicateCheck.isDuplicate(entry1, entry2, databaseContext.getMode())) { entry1.mergeWith(entry2); entriesToRemove.add(entry2); - handledEntries.add(entry2); } } - handledEntries.add(entry1); } for (BibEntry entry : entriesToRemove) { From 978b8a99f4c5b4f7811f08e02fdac09827b34e9d Mon Sep 17 00:00:00 2001 From: Ruslan Popov Date: Thu, 6 Nov 2025 15:04:04 +0100 Subject: [PATCH 3/4] chore(changelog): add changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 30842dfaa9d..0b97501a990 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv ### Added - We added automatic date-based groups that create year/month/day subgroups from an entry’s date fields. [#10822](https://github.com/JabRef/jabref/issues/10822) +- We added automatic removal of duplicate entries in SLR study results. [#14226](https://github.com/JabRef/jabref/pull/14226) ### Changed From 636e4135005479325a92cc6eedbccbccf6e11a42 Mon Sep 17 00:00:00 2001 From: Ruslan Popov Date: Thu, 6 Nov 2025 15:08:20 +0100 Subject: [PATCH 4/4] refactor(slr): add more meaningful log --- .../java/org/jabref/logic/crawler/StudyRepository.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java b/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java index 59fcd3740e1..f71d83cbe45 100644 --- a/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java +++ b/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java @@ -7,17 +7,13 @@ import java.nio.file.Path; import java.time.LocalDateTime; import java.time.temporal.ChronoUnit; -import java.util.ArrayList; -import java.util.HashSet; import java.util.List; -import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.jabref.logic.JabRefException; import org.jabref.logic.citationkeypattern.CitationKeyGenerator; import org.jabref.logic.database.DatabaseMerger; -import org.jabref.logic.database.DuplicateCheck; import org.jabref.logic.exporter.AtomicFileWriter; import org.jabref.logic.exporter.BibDatabaseWriter; import org.jabref.logic.exporter.BibWriter; @@ -423,8 +419,9 @@ private void persistResults(List crawlResults) throws IOException, // Merge new entries into study result file merger.merge(existingStudyResultEntries.getDatabase(), newStudyResultEntries); - 
LOGGER.info("Removing duplicates..."); + LOGGER.info("Removing duplicates from study results (initially {} entries)", existingStudyResultEntries.getEntries().size()); new AutomaticDuplicateRemover(bibEntryTypesManager).removeDuplicates(existingStudyResultEntries); + LOGGER.info("Duplicate removal finished; {} entries remain", existingStudyResultEntries.getEntries().size()); writeResultToFile(getPathToStudyResultFile(), existingStudyResultEntries); } @@ -470,5 +467,4 @@ private Path getPathToStudyResultFile() { private Path getPathToQueryDirectory(String query) { return repositoryPath.resolve(trimNameAndAddID(query)); } - }