Skip to content

Commit

Permalink
Add support for XHTML
Browse files Browse the repository at this point in the history
  • Loading branch information
valentjn committed Jul 6, 2021
1 parent 20ea024 commit e2f5507
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 8 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

# Changelog

## 12.2.1 (upcoming)
## 12.3.0 (upcoming)

- Add support for XHTML (fixes [vscode-ltex#342](https://github.com/valentjn/vscode-ltex/issues/342))
- Fix error when checking L<sup>A</sup>T<sub>E</sub>X documents ending with specific commands (fixes [vscode-ltex#341](https://github.com/valentjn/vscode-ltex/issues/341))
- Fix names of Portuguese babel languages, add support for Brazilian Portuguese babel language names (fixes [#72](https://github.com/valentjn/ltex-ls/issues/72))

Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ Find more information about LT<sub>E</sub>X at the [website of vscode-ltex](http

## Features

- **Supported markup languages:** BibT<sub>E</sub>X, L<sup>A</sup>T<sub>E</sub>X, Markdown, Org, reStructuredText, R Sweave
- **Supported markup languages:** BibT<sub>E</sub>X, L<sup>A</sup>T<sub>E</sub>X, Markdown, Org, reStructuredText, R Sweave, XHTML
- Comes with **everything included,** no need to install Java or LanguageTool
- **Offline checking:** Does not upload anything to the internet
- Supports **over 20 languages:** English, French, German, Dutch, Chinese, Russian, etc.
- **Issue highlighting** with hover description
- **Replacement suggestions** via quick fixes
- **User dictionaries**
- **Multilingual support** with babel commands or magic comments
Expand Down
5 changes: 5 additions & 0 deletions ltexls/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@
<artifactId>flexmark-test-util</artifactId>
<version>0.62.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.woodstox</groupId>
<artifactId>woodstox-core</artifactId>
<version>6.2.6</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;
import org.bsplines.ltexls.parsing.html.HtmlAnnotatedTextBuilder;
import org.bsplines.ltexls.parsing.latex.LatexAnnotatedTextBuilder;
import org.bsplines.ltexls.parsing.markdown.MarkdownAnnotatedTextBuilder;
import org.bsplines.ltexls.parsing.org.OrgAnnotatedTextBuilder;
Expand All @@ -27,6 +28,8 @@ public abstract class CodeAnnotatedTextBuilder extends AnnotatedTextBuilder {
static {
constructorMap.put("bibtex", (String codeLanguageId) ->
new LatexAnnotatedTextBuilder(codeLanguageId));
constructorMap.put("html", (String codeLanguageId) ->
new HtmlAnnotatedTextBuilder(codeLanguageId));
constructorMap.put("latex", (String codeLanguageId) ->
new LatexAnnotatedTextBuilder(codeLanguageId));
constructorMap.put("markdown", (String codeLanguageId) ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.util.Map;
import java.util.function.Function;
import org.bsplines.ltexls.parsing.bibtex.BibtexFragmentizer;
import org.bsplines.ltexls.parsing.html.HtmlFragmentizer;
import org.bsplines.ltexls.parsing.latex.LatexFragmentizer;
import org.bsplines.ltexls.parsing.markdown.MarkdownFragmentizer;
import org.bsplines.ltexls.parsing.org.OrgFragmentizer;
Expand All @@ -29,6 +30,8 @@ public abstract class CodeFragmentizer {
static {
constructorMap.put("bibtex", (String codeLanguageId) ->
new BibtexFragmentizer(codeLanguageId));
constructorMap.put("html", (String codeLanguageId) ->
new HtmlFragmentizer(codeLanguageId));
constructorMap.put("latex", (String codeLanguageId) ->
new LatexFragmentizer(codeLanguageId));
constructorMap.put("markdown", (String codeLanguageId) ->
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/* Copyright (C) 2020 Julian Valentin, LTeX Development Community
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

package org.bsplines.ltexls.parsing.html;

import com.ctc.wstx.api.WstxInputProperties;
import java.io.StringReader;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.bsplines.ltexls.parsing.CodeAnnotatedTextBuilder;
import org.bsplines.ltexls.tools.Tools;

public class HtmlAnnotatedTextBuilder extends CodeAnnotatedTextBuilder {
private static final Pattern whitespacePattern = Pattern.compile(" *\r?\n *");

private XMLInputFactory xmlInputFactory;

public HtmlAnnotatedTextBuilder(String codeLanguageId) {
super(codeLanguageId);

this.xmlInputFactory = XMLInputFactory.newInstance();
this.xmlInputFactory.setProperty(WstxInputProperties.P_MIN_TEXT_SEGMENT, 1);
this.xmlInputFactory.setProperty(WstxInputProperties.P_TREAT_CHAR_REFS_AS_ENTS, true);
this.xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true);
this.xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
this.xmlInputFactory.setProperty(XMLInputFactory.IS_VALIDATING, false);
this.xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
}

@Override
public CodeAnnotatedTextBuilder addCode(String code) {
int pos = 0;
Stack<String> elementNameStack = new Stack<>();
elementNameStack.push("html");
String nextText = "";

try {
XMLStreamReader xmlStreamReader =
this.xmlInputFactory.createXMLStreamReader(new StringReader(code));

while (xmlStreamReader.hasNext()) {
int eventType = xmlStreamReader.next();
int oldPos = pos;
pos = xmlStreamReader.getLocation().getCharacterOffset();
String skippedCode = code.substring(oldPos, pos);
String interpretAs = "";

Tools.logger.finest("Position " + pos + " (" + xmlStreamReader.getLocation().getLineNumber()
+ "," + xmlStreamReader.getLocation().getColumnNumber() + "): Event type = "
+ eventType + ", skippedCode = '" + skippedCode + "'");

if (!nextText.isEmpty()) {
if (nextText.equals(skippedCode)) {
addTextWithWhitespace(nextText);
} else {
addMarkup(skippedCode, nextText);
}

skippedCode = "";
nextText = "";
}

if (eventType == XMLStreamReader.START_ELEMENT) {
String elementName = xmlStreamReader.getLocalName();
elementNameStack.push(elementName);
Tools.logger.finest("START_ELEMENT: elementName = '" + xmlStreamReader.getLocalName()
+ "'");

if ((elementName == "body") || (elementName == "div")
|| (elementName == "h1") || (elementName == "h2") || (elementName == "h3")
|| (elementName == "h4") || (elementName == "h5") || (elementName == "h6")
|| (elementName == "p") || (elementName == "table") || (elementName == "tr")) {
interpretAs += "\n\n";
} else if ((elementName == "br") || (elementName == "li")) {
interpretAs += "\n";
}
} else if (eventType == XMLStreamReader.END_ELEMENT) {
Tools.logger.finest("END_ELEMENT");
if (!elementNameStack.isEmpty()) elementNameStack.pop();
} else if (eventType == XMLStreamReader.CHARACTERS) {
String elementName = (elementNameStack.isEmpty() ? "" : elementNameStack.peek());
String text = xmlStreamReader.getText();
Tools.logger.finest("CHARACTERS: text = '" + text + "'");
if ((elementName != "script") && (elementName != "style")) nextText = text;
} else if (eventType == XMLStreamReader.ENTITY_REFERENCE) {
nextText = xmlStreamReader.getText();
Tools.logger.finest("ENTITY_REFERENCE: text = '" + nextText + "'");
}

addMarkup(skippedCode, interpretAs);
}
} catch (XMLStreamException e) {
// ignore parser errors
}

if (pos < code.length()) addTextWithWhitespace(code.substring(pos));

return this;
}

protected CodeAnnotatedTextBuilder addTextWithWhitespace(String text) {
Matcher matcher = whitespacePattern.matcher(text);
int pos = 0;

while (matcher.find()) {
if (matcher.start() > 0) addText(text.substring(pos, matcher.start()));
addMarkup(matcher.group());
pos = matcher.end();
}

if (pos < text.length()) addText(text.substring(pos));

return this;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/* Copyright (C) 2020 Julian Valentin, LTeX Development Community
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

package org.bsplines.ltexls.parsing.html;

import java.util.regex.Pattern;
import org.bsplines.ltexls.parsing.RegexCodeFragmentizer;

/**
 * Code fragmentizer for HTML, based on a regular expression for magic comments.
 *
 * <p>The regular expression matches whole lines that consist of an HTML comment of the form
 * {@code <!-- ltex: ... -->} ({@code ltex} is matched case-insensitively, spaces and tabs
 * around the comment are allowed). The part after the colon is captured as group 1. How matches
 * are turned into fragments is up to {@link RegexCodeFragmentizer}.
 */
public class HtmlFragmentizer extends RegexCodeFragmentizer {
  private static final Pattern magicCommentPattern;

  static {
    magicCommentPattern = Pattern.compile(
        "^[ \t]*<!--[ \t]*(?i)ltex(?-i):(.*?)[ \t]*-->[ \t]*$",
        Pattern.MULTILINE);
  }

  /**
   * Creates a fragmentizer for the given code language ID.
   *
   * @param codeLanguageId code language ID, passed on to the super class
   */
  public HtmlFragmentizer(String codeLanguageId) {
    super(codeLanguageId, magicCommentPattern);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,19 @@ public CompletableFuture<Object> executeCheckDocumentCommand(JsonObject argument

if (fileNameStr.endsWith(".bib")) {
codeLanguageId = "bibtex";
} else if (fileNameStr.endsWith(".tex")) {
codeLanguageId = "latex";
} else if (fileNameStr.endsWith(".htm") || fileNameStr.endsWith(".html")
|| fileNameStr.endsWith(".xht") || fileNameStr.endsWith(".xhtml")) {
codeLanguageId = "html";
} else if (fileNameStr.endsWith(".md")) {
codeLanguageId = "markdown";
} else if (fileNameStr.endsWith(".org")) {
codeLanguageId = "org";
} else if (fileNameStr.endsWith(".Rnw") || fileNameStr.endsWith(".rnw")) {
codeLanguageId = "rsweave";
} else if (fileNameStr.endsWith(".rst")) {
codeLanguageId = "restructuredtext";
} else if (fileNameStr.endsWith(".tex")) {
codeLanguageId = "latex";
} else if (fileNameStr.endsWith(".Rnw") || fileNameStr.endsWith(".rnw")) {
codeLanguageId = "rsweave";
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

public class Settings {
private static final Set<String> defaultEnabled = new HashSet<>(Arrays.asList(
"bibtex", "latex", "markdown", "org", "restructuredtext", "rsweave"));
"bibtex", "latex", "html", "markdown", "org", "restructuredtext", "rsweave"));

private @Nullable Set<String> enabled;
private @Nullable String languageShortCode;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/* Copyright (C) 2020 Julian Valentin, LTeX Development Community
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

package org.bsplines.ltexls.parsing.html;

import org.bsplines.ltexls.parsing.CodeAnnotatedTextBuilder;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.languagetool.markup.AnnotatedText;

/**
 * Tests for the annotated text builder that is registered for the code language ID
 * {@code html}.
 */
public class HtmlAnnotatedTextBuilderTest {
  /**
   * Builds annotated text from the given HTML code and checks its plain text.
   *
   * @param code HTML code to parse
   * @param expectedPlainText plain text the annotated text is expected to have
   */
  private static void assertPlainText(String code, String expectedPlainText) {
    CodeAnnotatedTextBuilder builder = CodeAnnotatedTextBuilder.create("html");
    builder.addCode(code);
    AnnotatedText annotatedText = builder.build();
    String actualPlainText = annotatedText.getPlainText();
    Assertions.assertEquals(expectedPlainText, actualPlainText);
  }

  @Test
  public void test() {
    // complete document with head, attributes, inline element, and comment
    String document =
        "<html>\n"
        + "  <head>\n"
        + "    <title>Title</title>\n"
        + "  </head>\n"
        + "  <body style=\"color:red;\">\n"
        + "    This is a <b>test</b>.\n"
        + "    <!-- This is a comment. -->\n"
        + "  </body>\n"
        + "</html>\n";
    assertPlainText(document, "Title\n\nThis is a test.");

    // contents of script elements are not part of the plain text
    assertPlainText(
        "<html><body>This is a te<script>abc</script>st.</body></html>\n",
        "\n\nThis is a test.");

    // br elements are line breaks
    assertPlainText(
        "<html><body>This is a te<br/>st.</body></html>\n",
        "\n\nThis is a te\nst.");

    // entities are replaced
    assertPlainText(
        "<html><body>This is a test &amp; another test.</body></html>\n",
        "\n\nThis is a test & another test.");
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/* Copyright (C) 2020 Julian Valentin, LTeX Development Community
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/

package org.bsplines.ltexls.parsing.html;

import java.util.List;
import org.bsplines.ltexls.parsing.CodeFragment;
import org.bsplines.ltexls.parsing.CodeFragmentizer;
import org.bsplines.ltexls.settings.Settings;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/**
 * Tests for the code fragmentizer that is registered for the code language ID {@code html}.
 */
public class HtmlFragmentizerTest {
  /**
   * Checks code language ID, start position, code length, and language short code of a single
   * code fragment (in this order).
   */
  private static void assertFragment(CodeFragment codeFragment, int expectedFromPos,
        int expectedCodeLength, String expectedLanguageShortCode) {
    Assertions.assertEquals("html", codeFragment.getCodeLanguageId());
    Assertions.assertEquals(expectedFromPos, codeFragment.getFromPos());
    Assertions.assertEquals(expectedCodeLength, codeFragment.getCode().length());
    Assertions.assertEquals(expectedLanguageShortCode,
        codeFragment.getSettings().getLanguageShortCode());
  }

  /**
   * Fragmentizes the given code with default settings and checks that the result consists of
   * the three expected fragments.
   */
  private static void testFragmentizer(CodeFragmentizer fragmentizer, String code) {
    Settings settings = new Settings();
    List<CodeFragment> codeFragments = fragmentizer.fragmentize(code, settings);
    Assertions.assertEquals(3, codeFragments.size());

    assertFragment(codeFragments.get(0), 0, 12, "en-US");
    assertFragment(codeFragments.get(1), 12, 50, "de-DE");
    assertFragment(codeFragments.get(2), 62, 48, "en-US");
  }

  @Test
  public void test() {
    String code = "Sentence 1\n"
        + "\n  <!-- ltex: language=de-DE-->  \n\nSentence 2\n"
        + "\n<!--\t\t\tltex:\t\t\t\tlanguage=en-US\t\t-->\n\nSentence 3\n";
    testFragmentizer(CodeFragmentizer.create("html"), code);
  }

  @Test
  public void testWrongSettings() {
    CodeFragmentizer fragmentizer = CodeFragmentizer.create("html");

    // malformed setting (no equals sign) must not throw
    Assertions.assertDoesNotThrow(() -> fragmentizer.fragmentize(
        "Sentence 1\n<!-- ltex: languagede-DE -->\n\nSentence 2\n", new Settings()));

    // unknown setting key must not throw
    Assertions.assertDoesNotThrow(() -> fragmentizer.fragmentize(
        "Sentence 1\n<!-- ltex: unknownKey=abc -->\n\nSentence 2\n", new Settings()));
  }
}

0 comments on commit e2f5507

Please sign in to comment.