From de790f92227efe01a80350638b78a5a3460e71ab Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Sun, 15 Mar 2020 15:06:04 +0100 Subject: [PATCH 1/3] Improve ACS fetcher --- .../jabref/logic/importer/fetcher/ACS.java | 25 ++++++++++--------- .../logic/importer/fetcher/ACSTest.java | 11 +++++++- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index 2fa068e1eb4..b2fff10f38f 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -26,7 +26,7 @@ public class ACS implements FulltextFetcher { /** * Tries to find a fulltext URL for a given BibTex entry. - * + *

* Currently only uses the DOI if found. * * @param entry The Bibtex entry @@ -37,23 +37,24 @@ public class ACS implements FulltextFetcher { @Override public Optional findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); - Optional pdfLink = Optional.empty(); // DOI search Optional doi = entry.getField(StandardField.DOI).flatMap(DOI::parse); - if (doi.isPresent()) { - String source = String.format(SOURCE, doi.get().getDOI()); - // Retrieve PDF link - Document html = Jsoup.connect(source).ignoreHttpErrors(true).get(); - Element link = html.select("a.button_primary").first(); + if (!doi.isPresent()) { + return Optional.empty(); + } + + String source = String.format(SOURCE, doi.get().getDOI()); + // Retrieve PDF link + Document html = Jsoup.connect(source).ignoreHttpErrors(true).get(); + Element link = html.select("a.button_primary").first(); - if (link != null) { - LOGGER.info("Fulltext PDF found @ ACS."); - pdfLink = Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/"))); - } + if (link != null) { + LOGGER.info("Fulltext PDF found @ ACS."); + return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/"))); } - return pdfLink; + return Optional.empty(); } @Override diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ACSTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ACSTest.java index 8a58fd47819..2552b5cc735 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ACSTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ACSTest.java @@ -16,7 +16,6 @@ @FetcherTest class ACSTest { - private ACS finder; private BibEntry entry; @@ -44,4 +43,14 @@ void notFoundByDOI() throws IOException { assertEquals(Optional.empty(), finder.findFullText(entry)); } + + @Test + void entityWithoutDoi() throws IOException { + assertEquals(Optional.empty(), finder.findFullText(entry)); + } + + @Test + void trustLevel() { + assertEquals(TrustLevel.PUBLISHER, finder.getTrustLevel()); + } } From 
8f68de937e53213bb3e778230c9b91a515f29ac9 Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Sun, 15 Mar 2020 15:13:05 +0100 Subject: [PATCH 2/3] Improve arXiv fetcher --- .../java/org/jabref/logic/importer/fetcher/ArXiv.java | 4 +--- .../org/jabref/logic/importer/fetcher/ArXivTest.java | 11 ++++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java index de3539d8184..28a521a8022 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java @@ -74,10 +74,8 @@ public Optional findFullText(BibEntry entry) throws IOException { .filter(Optional::isPresent) .map(Optional::get) .findFirst(); + pdfUrl.ifPresent(url -> LOGGER.info("Fulltext PDF found @ arXiv.")); - if (pdfUrl.isPresent()) { - LOGGER.info("Fulltext PDF found @ arXiv."); - } return pdfUrl; } catch (FetcherException e) { LOGGER.warn("arXiv API request failed", e); diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java index 523cf9f651a..8bee0365c5b 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java @@ -23,7 +23,6 @@ @FetcherTest class ArXivTest { - private ArXiv finder; private BibEntry entry; private BibEntry sliceTheoremPaper; @@ -121,6 +120,16 @@ void findFullTextByDOINotAvailableInCatalog() throws IOException { assertEquals(Optional.empty(), finder.findFullText(entry)); } + @Test + void findFullTextEntityWithoutDoi() throws IOException { + assertEquals(Optional.empty(), finder.findFullText(entry)); + } + + @Test + void findFullTextTrustLevel() { + assertEquals(TrustLevel.PREPRINT, finder.getTrustLevel()); + } + @Test void searchEntryByPartOfTitle() throws Exception { assertEquals(Collections.singletonList(sliceTheoremPaper), 
From 0f93e2244d6500f1a6154c8433f2e2ad605b7939 Mon Sep 17 00:00:00 2001 From: Stefan Kolb Date: Sun, 15 Mar 2020 18:15:44 +0100 Subject: [PATCH 3/3] Minor improvements for Google Scholar fetcher Detect Google captcha div. Follow-up PRs might show or give more info to the user. --- .../logic/importer/fetcher/GoogleScholar.java | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java index 4e28444ff28..a8850bd9822 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java @@ -35,7 +35,7 @@ /** * FulltextFetcher implementation that attempts to find a PDF URL at GoogleScholar. - * + *

* Search String infos: https://scholar.google.com/intl/en/scholar/help.html#searching */ public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher { @@ -58,11 +58,10 @@ public GoogleScholar(ImportFormatPreferences importFormatPreferences) { @Override public Optional findFullText(BibEntry entry) throws IOException, FetcherException { Objects.requireNonNull(entry); - Optional pdfLink = Optional.empty(); // Search in title if (!entry.hasField(StandardField.TITLE)) { - return pdfLink; + return Optional.empty(); } try { @@ -74,12 +73,10 @@ public Optional findFullText(BibEntry entry) throws IOException, FetcherExc // as_occt field to search in uriBuilder.addParameter("as_occt", "title"); - pdfLink = search(uriBuilder.toString()); + return search(uriBuilder.toString()); } catch (URISyntaxException e) { throw new FetcherException("Building URI failed.", e); } - - return pdfLink; } @Override @@ -91,6 +88,11 @@ private Optional search(String url) throws IOException { Optional pdfLink = Optional.empty(); Document doc = Jsoup.connect(url).userAgent(URLDownload.USER_AGENT).get(); + + if (needsCaptcha(doc.body().html())) { + LOGGER.warn("Hit Google traffic limitation. Captcha prevents automatic fetching."); + return Optional.empty(); + } // Check results for PDF link // TODO: link always on first result or none? 
for (int i = 0; i < NUM_RESULTS; i++) { @@ -111,6 +113,10 @@ private Optional search(String url) throws IOException { return pdfLink; } + private boolean needsCaptcha(String body) { + return body.contains("id=\"gs_captcha_ccl\""); + } + @Override public String getName() { return "Google Scholar"; @@ -158,6 +164,11 @@ public List performSearch(String query) throws FetcherException { private void addHitsFromQuery(List entryList, String queryURL) throws IOException, FetcherException { String content = new URLDownload(queryURL).asString(); + if (needsCaptcha(content)) { + throw new FetcherException("Fetching from Google Scholar failed.", + Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null); + } + Matcher matcher = LINK_TO_BIB_PATTERN.matcher(content); while (matcher.find()) { String citationsPageURL = matcher.group().replace("&amp;", "&");