From e2f55071be001e6f69afd45ea551fdc34cb2d3e8 Mon Sep 17 00:00:00 2001 From: Julian Valentin Date: Tue, 6 Jul 2021 20:52:56 +0200 Subject: [PATCH] Add support for XHTML See valentjn/vscode-ltex#342. --- CHANGELOG.md | 3 +- README.md | 3 +- ltexls/pom.xml | 5 + .../parsing/CodeAnnotatedTextBuilder.java | 3 + .../ltexls/parsing/CodeFragmentizer.java | 3 + .../html/HtmlAnnotatedTextBuilder.java | 123 ++++++++++++++++++ .../ltexls/parsing/html/HtmlFragmentizer.java | 21 +++ .../ltexls/server/LtexWorkspaceService.java | 11 +- .../bsplines/ltexls/settings/Settings.java | 2 +- .../html/HtmlAnnotatedTextBuilderTest.java | 49 +++++++ .../parsing/html/HtmlFragmentizerTest.java | 56 ++++++++ 11 files changed, 271 insertions(+), 8 deletions(-) create mode 100644 ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java create mode 100644 ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java create mode 100644 ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilderTest.java create mode 100644 ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizerTest.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a67b67b..5b470d83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,9 @@ # Changelog -## 12.2.1 (upcoming) +## 12.3.0 (upcoming) +- Add support for XHTML (fixes [vscode-ltex#342](https://github.com/valentjn/vscode-ltex/issues/342)) - Fix error when checking LATEX documents ending with specific commands (fixes [vscode-ltex#341](https://github.com/valentjn/vscode-ltex/issues/341)) - Fix name of Portuguese babel language names, add support for Brazilian Portuguese babel language names (fixes [#72](https://github.com/valentjn/ltex-ls/issues/72)) diff --git a/README.md b/README.md index 0d8cd670..d72e636b 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,10 @@ Find more information about LTEX at the [website of vscode-ltex](http ## Features -- **Supported markup languages:** BibTEX, LATEX, Markdown, Org, reStructuredText, R Sweave +- **Supported markup languages:** BibTEX, LATEX, Markdown, Org, reStructuredText, R Sweave, XHTML - Comes with **everything included,** no need to install Java or LanguageTool - **Offline checking:** Does not upload anything to the internet - Supports **over 20 languages:** English, French, German, Dutch, Chinese, Russian, etc. -- **Issue highlighting** with hover description - **Replacement suggestions** via quick fixes - **User dictionaries** - **Multilingual support** with babel commands or magic comments diff --git a/ltexls/pom.xml b/ltexls/pom.xml index 166ae7ba..402287b1 100644 --- a/ltexls/pom.xml +++ b/ltexls/pom.xml @@ -75,6 +75,11 @@ flexmark-test-util 0.62.2 + + com.fasterxml.woodstox + woodstox-core + 6.2.6 + com.google.code.gson gson diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java index 3212506c..8bf78728 100644 --- a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java +++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java @@ -10,6 +10,7 @@ import java.util.HashMap; import java.util.Map; import java.util.function.Function; +import org.bsplines.ltexls.parsing.html.HtmlAnnotatedTextBuilder; import org.bsplines.ltexls.parsing.latex.LatexAnnotatedTextBuilder; import org.bsplines.ltexls.parsing.markdown.MarkdownAnnotatedTextBuilder; import org.bsplines.ltexls.parsing.org.OrgAnnotatedTextBuilder; @@ -27,6 +28,8 @@ public abstract class CodeAnnotatedTextBuilder extends AnnotatedTextBuilder { static { constructorMap.put("bibtex", (String codeLanguageId) -> new LatexAnnotatedTextBuilder(codeLanguageId)); + constructorMap.put("html", (String codeLanguageId) -> + new HtmlAnnotatedTextBuilder(codeLanguageId)); constructorMap.put("latex", (String codeLanguageId) -> new LatexAnnotatedTextBuilder(codeLanguageId)); constructorMap.put("markdown", (String codeLanguageId) -> diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java index 7a3054f1..a738b8f1 100644 --- a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java +++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java @@ -13,6 +13,7 @@ import java.util.Map; import java.util.function.Function; import org.bsplines.ltexls.parsing.bibtex.BibtexFragmentizer; +import org.bsplines.ltexls.parsing.html.HtmlFragmentizer; import org.bsplines.ltexls.parsing.latex.LatexFragmentizer; import org.bsplines.ltexls.parsing.markdown.MarkdownFragmentizer; import org.bsplines.ltexls.parsing.org.OrgFragmentizer; @@ -29,6 +30,8 @@ public abstract class CodeFragmentizer { static { constructorMap.put("bibtex", (String codeLanguageId) -> new BibtexFragmentizer(codeLanguageId)); + constructorMap.put("html", (String codeLanguageId) -> + new HtmlFragmentizer(codeLanguageId)); constructorMap.put("latex", (String codeLanguageId) -> new LatexFragmentizer(codeLanguageId)); constructorMap.put("markdown", (String codeLanguageId) -> diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java new file mode 100644 index 00000000..39ec8693 --- /dev/null +++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java @@ -0,0 +1,123 @@ +/* Copyright (C) 2020 Julian Valentin, LTeX Development Community + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +package org.bsplines.ltexls.parsing.html; + +import com.ctc.wstx.api.WstxInputProperties; +import java.io.StringReader; +import java.util.Stack; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; +import org.bsplines.ltexls.parsing.CodeAnnotatedTextBuilder; +import org.bsplines.ltexls.tools.Tools; + +public class HtmlAnnotatedTextBuilder extends CodeAnnotatedTextBuilder { + private static final Pattern whitespacePattern = Pattern.compile(" *\r?\n *"); + + private XMLInputFactory xmlInputFactory; + + public HtmlAnnotatedTextBuilder(String codeLanguageId) { + super(codeLanguageId); + + this.xmlInputFactory = XMLInputFactory.newInstance(); + this.xmlInputFactory.setProperty(WstxInputProperties.P_MIN_TEXT_SEGMENT, 1); + this.xmlInputFactory.setProperty(WstxInputProperties.P_TREAT_CHAR_REFS_AS_ENTS, true); + this.xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true); + this.xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); + this.xmlInputFactory.setProperty(XMLInputFactory.IS_VALIDATING, false); + this.xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false); + } + + @Override + public CodeAnnotatedTextBuilder addCode(String code) { + int pos = 0; + Stack elementNameStack = new Stack<>(); + elementNameStack.push("html"); + String nextText = ""; + + try { + XMLStreamReader xmlStreamReader = + this.xmlInputFactory.createXMLStreamReader(new StringReader(code)); + + while (xmlStreamReader.hasNext()) { + int eventType = xmlStreamReader.next(); + int oldPos = pos; + pos = xmlStreamReader.getLocation().getCharacterOffset(); + String skippedCode = code.substring(oldPos, pos); + String interpretAs = ""; + + Tools.logger.finest("Position " + pos + " (" + xmlStreamReader.getLocation().getLineNumber() + + "," + xmlStreamReader.getLocation().getColumnNumber() + "): Event type = " + + eventType + ", skippedCode = '" + skippedCode + "'"); + + if (!nextText.isEmpty()) { + if (nextText.equals(skippedCode)) { + addTextWithWhitespace(nextText); + } else { + addMarkup(skippedCode, nextText); + } + + skippedCode = ""; + nextText = ""; + } + + if (eventType == XMLStreamReader.START_ELEMENT) { + String elementName = xmlStreamReader.getLocalName(); + elementNameStack.push(elementName); + Tools.logger.finest("START_ELEMENT: elementName = '" + xmlStreamReader.getLocalName() + + "'"); + + if ((elementName == "body") || (elementName == "div") + || (elementName == "h1") || (elementName == "h2") || (elementName == "h3") + || (elementName == "h4") || (elementName == "h5") || (elementName == "h6") + || (elementName == "p") || (elementName == "table") || (elementName == "tr")) { + interpretAs += "\n\n"; + } else if ((elementName == "br") || (elementName == "li")) { + interpretAs += "\n"; + } + } else if (eventType == XMLStreamReader.END_ELEMENT) { + Tools.logger.finest("END_ELEMENT"); + if (!elementNameStack.isEmpty()) elementNameStack.pop(); + } else if (eventType == XMLStreamReader.CHARACTERS) { + String elementName = (elementNameStack.isEmpty() ? "" : elementNameStack.peek()); + String text = xmlStreamReader.getText(); + Tools.logger.finest("CHARACTERS: text = '" + text + "'"); + if ((elementName != "script") && (elementName != "style")) nextText = text; + } else if (eventType == XMLStreamReader.ENTITY_REFERENCE) { + nextText = xmlStreamReader.getText(); + Tools.logger.finest("ENTITY_REFERENCE: text = '" + nextText + "'"); + } + + addMarkup(skippedCode, interpretAs); + } + } catch (XMLStreamException e) { + // ignore parser errors + } + + if (pos < code.length()) addTextWithWhitespace(code.substring(pos)); + + return this; + } + + protected CodeAnnotatedTextBuilder addTextWithWhitespace(String text) { + Matcher matcher = whitespacePattern.matcher(text); + int pos = 0; + + while (matcher.find()) { + if (matcher.start() > 0) addText(text.substring(pos, matcher.start())); + addMarkup(matcher.group()); + pos = matcher.end(); + } + + if (pos < text.length()) addText(text.substring(pos)); + + return this; + } +} diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java new file mode 100644 index 00000000..d09e2607 --- /dev/null +++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java @@ -0,0 +1,21 @@ +/* Copyright (C) 2020 Julian Valentin, LTeX Development Community + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +package org.bsplines.ltexls.parsing.html; + +import java.util.regex.Pattern; +import org.bsplines.ltexls.parsing.RegexCodeFragmentizer; + +public class HtmlFragmentizer extends RegexCodeFragmentizer { + private static final Pattern pattern = Pattern.compile( + "^[ \t]*[ \t]*$", + Pattern.MULTILINE); + + public HtmlFragmentizer(String codeLanguageId) { + super(codeLanguageId, pattern); + } +} diff --git a/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java b/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java index bdc33130..24bbc9e7 100644 --- a/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java +++ b/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java @@ -116,16 +116,19 @@ public CompletableFuture executeCheckDocumentCommand(JsonObject argument if (fileNameStr.endsWith(".bib")) { codeLanguageId = "bibtex"; + } else if (fileNameStr.endsWith(".tex")) { + codeLanguageId = "latex"; + } else if (fileNameStr.endsWith(".htm") || fileNameStr.endsWith(".html") + || fileNameStr.endsWith(".xht") || fileNameStr.endsWith(".xhtml")) { + codeLanguageId = "html"; } else if (fileNameStr.endsWith(".md")) { codeLanguageId = "markdown"; } else if (fileNameStr.endsWith(".org")) { codeLanguageId = "org"; - } else if (fileNameStr.endsWith(".Rnw") || fileNameStr.endsWith(".rnw")) { - codeLanguageId = "rsweave"; } else if (fileNameStr.endsWith(".rst")) { codeLanguageId = "restructuredtext"; - } else if (fileNameStr.endsWith(".tex")) { - codeLanguageId = "latex"; + } else if (fileNameStr.endsWith(".Rnw") || fileNameStr.endsWith(".rnw")) { + codeLanguageId = "rsweave"; } } } diff --git a/ltexls/src/main/java/org/bsplines/ltexls/settings/Settings.java b/ltexls/src/main/java/org/bsplines/ltexls/settings/Settings.java index 237d133a..7d66a26e 100644 --- a/ltexls/src/main/java/org/bsplines/ltexls/settings/Settings.java +++ b/ltexls/src/main/java/org/bsplines/ltexls/settings/Settings.java @@ -26,7 +26,7 @@ public class Settings { private static final Set defaultEnabled = new HashSet<>(Arrays.asList( - "bibtex", "latex", "markdown", "org", "restructuredtext", "rsweave")); + "bibtex", "latex", "html", "markdown", "org", "restructuredtext", "rsweave")); private @Nullable Set enabled; private @Nullable String languageShortCode; diff --git a/ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilderTest.java b/ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilderTest.java new file mode 100644 index 00000000..fae653da --- /dev/null +++ b/ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilderTest.java @@ -0,0 +1,49 @@ +/* Copyright (C) 2020 Julian Valentin, LTeX Development Community + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +package org.bsplines.ltexls.parsing.html; + +import org.bsplines.ltexls.parsing.CodeAnnotatedTextBuilder; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.languagetool.markup.AnnotatedText; + +public class HtmlAnnotatedTextBuilderTest { + private static void assertPlainText(String code, String expectedPlainText) { + AnnotatedText annotatedText = buildAnnotatedText(code); + Assertions.assertEquals(expectedPlainText, annotatedText.getPlainText()); + } + + private static AnnotatedText buildAnnotatedText(String code) { + CodeAnnotatedTextBuilder builder = CodeAnnotatedTextBuilder.create("html"); + return builder.addCode(code).build(); + } + + @Test + public void test() { + assertPlainText( + "\n" + + " \n" + + " Title\n" + + " \n" + + " \n" + + " This is a test.\n" + + " \n" + + " \n" + + "\n", + "Title\n\nThis is a test."); + assertPlainText( + "This is a test.\n", + "\n\nThis is a test."); + assertPlainText( + "This is a te
st.\n", + "\n\nThis is a te\nst."); + assertPlainText( + "This is a test & another test.\n", + "\n\nThis is a test & another test."); + } +} diff --git a/ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizerTest.java b/ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizerTest.java new file mode 100644 index 00000000..42a0576a --- /dev/null +++ b/ltexls/src/test/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizerTest.java @@ -0,0 +1,56 @@ +/* Copyright (C) 2020 Julian Valentin, LTeX Development Community + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +package org.bsplines.ltexls.parsing.html; + +import java.util.List; +import org.bsplines.ltexls.parsing.CodeFragment; +import org.bsplines.ltexls.parsing.CodeFragmentizer; +import org.bsplines.ltexls.settings.Settings; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class HtmlFragmentizerTest { + private static void testFragmentizer(CodeFragmentizer fragmentizer, String code) { + List codeFragments = fragmentizer.fragmentize(code, new Settings()); + Assertions.assertEquals(3, codeFragments.size()); + + Assertions.assertEquals("html", codeFragments.get(0).getCodeLanguageId()); + Assertions.assertEquals(0, codeFragments.get(0).getFromPos()); + Assertions.assertEquals(12, codeFragments.get(0).getCode().length()); + Assertions.assertEquals("en-US", codeFragments.get(0).getSettings().getLanguageShortCode()); + + Assertions.assertEquals("html", codeFragments.get(1).getCodeLanguageId()); + Assertions.assertEquals(12, codeFragments.get(1).getFromPos()); + Assertions.assertEquals(50, codeFragments.get(1).getCode().length()); + Assertions.assertEquals("de-DE", codeFragments.get(1).getSettings().getLanguageShortCode()); + + Assertions.assertEquals("html", codeFragments.get(2).getCodeLanguageId()); + Assertions.assertEquals(62, codeFragments.get(2).getFromPos()); + Assertions.assertEquals(48, codeFragments.get(2).getCode().length()); + Assertions.assertEquals("en-US", codeFragments.get(2).getSettings().getLanguageShortCode()); + } + + @Test + public void test() { + CodeFragmentizer fragmentizer = CodeFragmentizer.create("html"); + + testFragmentizer(fragmentizer, + "Sentence 1\n" + + "\n \n\nSentence 2\n" + + "\n\n\nSentence 3\n"); + } + + @Test + public void testWrongSettings() { + CodeFragmentizer fragmentizer = CodeFragmentizer.create("html"); + Assertions.assertDoesNotThrow(() -> fragmentizer.fragmentize( + "Sentence 1\n\n\nSentence 2\n", new Settings())); + Assertions.assertDoesNotThrow(() -> fragmentizer.fragmentize( + "Sentence 1\n\n\nSentence 2\n", new Settings())); + } +}