diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a67b67b..5b470d83 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,8 +8,9 @@
# Changelog
-## 12.2.1 (upcoming)
+## 12.3.0 (upcoming)
+- Add support for XHTML (fixes [vscode-ltex#342](https://github.com/valentjn/vscode-ltex/issues/342))
- Fix error when checking LATEX documents ending with specific commands (fixes [vscode-ltex#341](https://github.com/valentjn/vscode-ltex/issues/341))
- Fix name of Portuguese babel language names, add support for Brazilian Portuguese babel language names (fixes [#72](https://github.com/valentjn/ltex-ls/issues/72))
diff --git a/README.md b/README.md
index 0d8cd670..d72e636b 100644
--- a/README.md
+++ b/README.md
@@ -30,11 +30,10 @@ Find more information about LTEX at the [website of vscode-ltex](http
## Features
-- **Supported markup languages:** BibTEX, LATEX, Markdown, Org, reStructuredText, R Sweave
+- **Supported markup languages:** BibTEX, LATEX, Markdown, Org, reStructuredText, R Sweave, XHTML
- Comes with **everything included,** no need to install Java or LanguageTool
- **Offline checking:** Does not upload anything to the internet
- Supports **over 20 languages:** English, French, German, Dutch, Chinese, Russian, etc.
-- **Issue highlighting** with hover description
- **Replacement suggestions** via quick fixes
- **User dictionaries**
- **Multilingual support** with babel commands or magic comments
diff --git a/ltexls/pom.xml b/ltexls/pom.xml
index 166ae7ba..402287b1 100644
--- a/ltexls/pom.xml
+++ b/ltexls/pom.xml
@@ -75,6 +75,11 @@
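       <artifactId>flexmark-test-util</artifactId>
       <version>0.62.2</version>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.woodstox</groupId>
+      <artifactId>woodstox-core</artifactId>
+      <version>6.2.6</version>
+    </dependency>
     <dependency>
       <groupId>com.google.code.gson</groupId>
       <artifactId>gson</artifactId>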
diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java
index 3212506c..8bf78728 100644
--- a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java
+++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeAnnotatedTextBuilder.java
@@ -10,6 +10,7 @@
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;
+import org.bsplines.ltexls.parsing.html.HtmlAnnotatedTextBuilder;
import org.bsplines.ltexls.parsing.latex.LatexAnnotatedTextBuilder;
import org.bsplines.ltexls.parsing.markdown.MarkdownAnnotatedTextBuilder;
import org.bsplines.ltexls.parsing.org.OrgAnnotatedTextBuilder;
@@ -27,6 +28,8 @@ public abstract class CodeAnnotatedTextBuilder extends AnnotatedTextBuilder {
static {
constructorMap.put("bibtex", (String codeLanguageId) ->
new LatexAnnotatedTextBuilder(codeLanguageId));
+ constructorMap.put("html", (String codeLanguageId) ->
+ new HtmlAnnotatedTextBuilder(codeLanguageId));
constructorMap.put("latex", (String codeLanguageId) ->
new LatexAnnotatedTextBuilder(codeLanguageId));
constructorMap.put("markdown", (String codeLanguageId) ->
diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java
index 7a3054f1..a738b8f1 100644
--- a/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java
+++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/CodeFragmentizer.java
@@ -13,6 +13,7 @@
import java.util.Map;
import java.util.function.Function;
import org.bsplines.ltexls.parsing.bibtex.BibtexFragmentizer;
+import org.bsplines.ltexls.parsing.html.HtmlFragmentizer;
import org.bsplines.ltexls.parsing.latex.LatexFragmentizer;
import org.bsplines.ltexls.parsing.markdown.MarkdownFragmentizer;
import org.bsplines.ltexls.parsing.org.OrgFragmentizer;
@@ -29,6 +30,8 @@ public abstract class CodeFragmentizer {
static {
constructorMap.put("bibtex", (String codeLanguageId) ->
new BibtexFragmentizer(codeLanguageId));
+ constructorMap.put("html", (String codeLanguageId) ->
+ new HtmlFragmentizer(codeLanguageId));
constructorMap.put("latex", (String codeLanguageId) ->
new LatexFragmentizer(codeLanguageId));
constructorMap.put("markdown", (String codeLanguageId) ->
diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java
new file mode 100644
index 00000000..39ec8693
--- /dev/null
+++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlAnnotatedTextBuilder.java
@@ -0,0 +1,182 @@
+/* Copyright (C) 2020 Julian Valentin, LTeX Development Community
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+package org.bsplines.ltexls.parsing.html;
+
+import com.ctc.wstx.api.WstxInputProperties;
+import java.io.StringReader;
+import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import org.bsplines.ltexls.parsing.CodeAnnotatedTextBuilder;
+import org.bsplines.ltexls.tools.Tools;
+
+/**
+ * Annotated text builder for (X)HTML code. The code is read with a StAX
+ * {@link XMLStreamReader}; text reported by the parser (outside of script and style elements)
+ * is added as text, the remaining code as markup.
+ */
+public class HtmlAnnotatedTextBuilder extends CodeAnnotatedTextBuilder {
+  // a line break together with the spaces directly before and after it
+  private static final Pattern whitespacePattern = Pattern.compile(" *\r?\n *");
+
+  private final XMLInputFactory xmlInputFactory;
+
+  /**
+   * Creates the builder and configures the XML input factory used by {@link #addCode(String)}.
+   *
+   * @param codeLanguageId code language ID, passed on to the superclass constructor
+   */
+  public HtmlAnnotatedTextBuilder(String codeLanguageId) {
+    super(codeLanguageId);
+
+    this.xmlInputFactory = XMLInputFactory.newInstance();
+    // Woodstox-specific input properties
+    this.xmlInputFactory.setProperty(WstxInputProperties.P_MIN_TEXT_SEGMENT, 1);
+    this.xmlInputFactory.setProperty(WstxInputProperties.P_TREAT_CHAR_REFS_AS_ENTS, true);
+    // standard StAX properties: entity references are replaced, while external entities,
+    // validation, and DTDs are switched off
+    this.xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true);
+    this.xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
+    this.xmlInputFactory.setProperty(XMLInputFactory.IS_VALIDATING, false);
+    this.xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+  }
+
+  /**
+   * Parses the given (X)HTML code and adds it to this builder as text and markup.
+   *
+   * XML parser errors are logged and otherwise ignored. Code after the position of the last
+   * event that was read is added as text.
+   *
+   * @param code (X)HTML code to add
+   * @return this builder
+   */
+  @Override
+  public CodeAnnotatedTextBuilder addCode(String code) {
+    int pos = 0;
+    Stack<String> elementNameStack = new Stack<>();
+    elementNameStack.push("html");
+    String nextText = "";
+
+    try {
+      XMLStreamReader xmlStreamReader =
+          this.xmlInputFactory.createXMLStreamReader(new StringReader(code));
+
+      while (xmlStreamReader.hasNext()) {
+        int eventType = xmlStreamReader.next();
+        int oldPos = pos;
+        pos = xmlStreamReader.getLocation().getCharacterOffset();
+        // code between the previously and the currently reported position
+        String skippedCode = code.substring(oldPos, pos);
+        String interpretAs = "";
+
+        Tools.logger.finest("Position " + pos + " (" + xmlStreamReader.getLocation().getLineNumber()
+            + "," + xmlStreamReader.getLocation().getColumnNumber() + "): Event type = "
+            + eventType + ", skippedCode = '" + skippedCode + "'");
+
+        if (!nextText.isEmpty()) {
+          // text of the previous event: plain text if the code matches it literally, otherwise
+          // markup that is interpreted as the parsed text
+          if (nextText.equals(skippedCode)) {
+            addTextWithWhitespace(nextText);
+          } else {
+            addMarkup(skippedCode, nextText);
+          }
+
+          skippedCode = "";
+          nextText = "";
+        }
+
+        if (eventType == XMLStreamReader.START_ELEMENT) {
+          String elementName = xmlStreamReader.getLocalName();
+          elementNameStack.push(elementName);
+          Tools.logger.finest("START_ELEMENT: elementName = '" + elementName + "'");
+          interpretAs += getLineBreaksForElement(elementName);
+        } else if (eventType == XMLStreamReader.END_ELEMENT) {
+          Tools.logger.finest("END_ELEMENT");
+          if (!elementNameStack.isEmpty()) elementNameStack.pop();
+        } else if (eventType == XMLStreamReader.CHARACTERS) {
+          String elementName = (elementNameStack.isEmpty() ? "" : elementNameStack.peek());
+          String text = xmlStreamReader.getText();
+          Tools.logger.finest("CHARACTERS: text = '" + text + "'");
+          // contents of script and style elements are never added as text
+          if (!elementName.equals("script") && !elementName.equals("style")) nextText = text;
+        } else if (eventType == XMLStreamReader.ENTITY_REFERENCE) {
+          String text = xmlStreamReader.getText();
+          Tools.logger.finest("ENTITY_REFERENCE: text = '" + text + "'");
+          // getText() is documented to possibly return null; treat that like no text
+          nextText = ((text != null) ? text : "");
+        }
+
+        addMarkup(skippedCode, interpretAs);
+      }
+    } catch (XMLStreamException e) {
+      // best effort: keep what has been added so far, the rest is added as text below
+      Tools.logger.finest("Ignoring XML parser error: " + e.getMessage());
+    }
+
+    if (pos < code.length()) addTextWithWhitespace(code.substring(pos));
+
+    return this;
+  }
+
+  /**
+   * Returns the line breaks that are inserted when the given element starts.
+   *
+   * @param elementName local name of the element
+   * @return two line breaks for body, div, h1 to h6, p, table, and tr; one line break for br
+   *     and li; an empty string for all other elements
+   */
+  private static String getLineBreaksForElement(String elementName) {
+    switch (elementName) {
+      case "body":
+      case "div":
+      case "h1":
+      case "h2":
+      case "h3":
+      case "h4":
+      case "h5":
+      case "h6":
+      case "p":
+      case "table":
+      case "tr":
+        return "\n\n";
+      case "br":
+      case "li":
+        return "\n";
+      default:
+        return "";
+    }
+  }
+
+  /**
+   * Adds the given string as text, except for line breaks (including the spaces directly
+   * before and after them), which are added as markup.
+   *
+   * @param text string to add
+   * @return this builder
+   */
+  protected CodeAnnotatedTextBuilder addTextWithWhitespace(String text) {
+    Matcher matcher = whitespacePattern.matcher(text);
+    int pos = 0;
+
+    while (matcher.find()) {
+      // compare with pos instead of 0 so that directly adjacent line breaks add no empty text
+      if (matcher.start() > pos) addText(text.substring(pos, matcher.start()));
+      // NOTE(review): no replacement is given for the line break; confirm that words on
+      // consecutive lines are not merged in the resulting plain text
+      addMarkup(matcher.group());
+      pos = matcher.end();
+    }
+
+    if (pos < text.length()) addText(text.substring(pos));
+
+    return this;
+  }
+}
diff --git a/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java
new file mode 100644
index 00000000..d09e2607
--- /dev/null
+++ b/ltexls/src/main/java/org/bsplines/ltexls/parsing/html/HtmlFragmentizer.java
@@ -0,0 +1,33 @@
+/* Copyright (C) 2020 Julian Valentin, LTeX Development Community
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+
+package org.bsplines.ltexls.parsing.html;
+
+import java.util.regex.Pattern;
+import org.bsplines.ltexls.parsing.RegexCodeFragmentizer;
+
+/**
+ * Code fragmentizer for HTML. The pattern handed to {@link RegexCodeFragmentizer} matches
+ * lines that consist of a single magic comment of the form {@code <!-- ltex: SETTINGS -->}
+ * (the keyword is matched case-insensitively); the settings are captured in group 1.
+ */
+public class HtmlFragmentizer extends RegexCodeFragmentizer {
+  // NOTE(review): how the superclass uses the match and its group is not visible here; confirm
+  // that it reads the settings from the first capturing group
+  private static final Pattern pattern = Pattern.compile(
+      "^[ \t]*<!--[ \t]*(?i)ltex(?-i):(.*?)[ \t]*-->[ \t]*$",
+      Pattern.MULTILINE);
+
+  /**
+   * Creates a fragmentizer for the given code language.
+   *
+   * @param codeLanguageId code language ID, passed on to the superclass constructor
+   */
+  public HtmlFragmentizer(String codeLanguageId) {
+    super(codeLanguageId, pattern);
+  }
+}
diff --git a/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java b/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java
index bdc33130..24bbc9e7 100644
--- a/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java
+++ b/ltexls/src/main/java/org/bsplines/ltexls/server/LtexWorkspaceService.java
@@ -116,16 +116,19 @@ public CompletableFuture