From b256ba712a1482692a8d06731e5e03a242e942d5 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 7 Sep 2024 19:54:18 +0200 Subject: [PATCH] Try JCEF --- .gitignore | 3 + build.gradle | 2 +- src/main/java/module-info.java | 3 +- .../jabref/logic/importer/fetcher/ACS.java | 75 +++++++++++++------ 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 2e33739ec64..e3abb530fc8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,11 @@ src/main/gen/ src/main/generated/ src-gen/ + .lycheecache +jcef-bundle/ + javafx/javafx-sdk-* javafx/javafx-jmods-* javafx/javafx.html diff --git a/build.gradle b/build.gradle index 87bed09a28c..912ec1edcb3 100644 --- a/build.gradle +++ b/build.gradle @@ -256,7 +256,7 @@ dependencies { implementation 'org.controlsfx:controlsfx:11.2.1' // region HTTP clients - implementation 'org.htmlunit:htmlunit:4.4.0' // used for web scraping + implementation 'me.friwi:jcefmaven:126.2.0' // used for web scraping implementation 'org.jsoup:jsoup:1.18.1' implementation 'com.konghq:unirest-java-core:4.4.4' implementation 'com.konghq:unirest-modules-gson:4.4.4' diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index dc18d9dd3f2..34224f33dc2 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -90,7 +90,7 @@ requires org.glassfish.hk2.api; // region: http clients - requires htmlunit; + requires jcefmaven; requires org.apache.httpcomponents.core5.httpcore5; requires org.jsoup; requires unirest.java.core; @@ -184,5 +184,6 @@ requires mslinks; requires org.antlr.antlr4.runtime; requires org.libreoffice.uno; + requires jcef; // endregion } diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java index 581810d0479..1f6d8d5be33 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ACS.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ACS.java @@ -10,17 +10,21 @@ import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.identifier.DOI; -import org.htmlunit.BrowserVersion; -import org.htmlunit.WebClient; -import org.htmlunit.html.HtmlPage; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; +import me.friwi.jcefmaven.CefAppBuilder; +import me.friwi.jcefmaven.MavenCefAppHandlerAdapter; +import org.cef.CefApp; +import org.cef.CefClient; +import org.cef.CefSettings; +import org.cef.browser.CefBrowser; +import org.cef.browser.CefFrame; +import org.cef.callback.CefStringVisitor; +import org.cef.handler.CefDisplayHandlerAdapter; +import org.cef.handler.CefLoadHandlerAdapter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * FulltextFetcher implementation that attempts to find a PDF URL at ACS. + * FulltextFetcher implementation that attempts to find a PDF URL at ACS. */ public class ACS implements FulltextFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(ACS.class); @@ -42,24 +46,51 @@ public Optional findFullText(BibEntry entry) throws IOException { String source = SOURCE.formatted(doi.get().getDOI()); - try (final WebClient webClient = new WebClient(BrowserVersion.CHROME)) { - webClient.getOptions().setSSLClientProtocols("TLSv1.3", "TLSv1.2"); - // inspired by https://www.innoq.com/en/blog/2016/01/webscraping/ - webClient.getCookieManager().setCookiesEnabled(true); - webClient.getOptions().setJavaScriptEnabled(true); - webClient.getOptions().setTimeout(10_000); - webClient.waitForBackgroundJavaScript(5000); - webClient.getOptions().setThrowExceptionOnScriptError(false); - webClient.getOptions().setPrintContentOnFailingStatusCode(true); + CefAppBuilder builder = new CefAppBuilder(); + builder.setAppHandler(new MavenCefAppHandlerAdapter(){}); + CefApp cefApp; + try { + cefApp = builder.build(); + } catch (Exception e) { + LOGGER.error("Could not initialize CEF", e); + throw new IOException(e); + } + + CefClient client = cefApp.createClient(); + CefBrowser browser = client.createBrowser(source, false, false); + + client.addLoadHandler(new CefLoadHandlerAdapter() { + @Override + public void onLoadEnd(CefBrowser browser, CefFrame frame, int httpStatusCode) { + System.out.println("lalala"); + if (frame.isMain()) { + frame.executeJavaScript( + "document.documentElement.outerHTML;", + frame.getURL(), + 0 + ); + } + } + }); - HtmlPage page = webClient.getPage(source); - boolean pdfButtonExists = page.querySelectorAll("a[title=\"PDF\"].article__btn__secondary").isEmpty(); - if (pdfButtonExists) { - LOGGER.info("Fulltext PDF found at ACS."); - // We "guess" the URL instead of parsing the HTML for the actual link - return Optional.of(new URL(source.replaceFirst("/abs/", "/pdf/"))); + client.addDisplayHandler(new CefDisplayHandlerAdapter() { + @Override + public boolean onConsoleMessage(CefBrowser browser, CefSettings.LogSeverity level, String message, String source, int line) { + // Capture the result of the JavaScript execution in the console message + System.out.println("Page HTML content:\n" + message); + return true; } + }); + + browser.loadURL(source); + + try { + Thread.sleep(5000); + } catch ( + InterruptedException e) { + throw new RuntimeException(e); } + return Optional.empty(); }