Skip to content

Commit

Permalink
vulc: fix mangled DOM due to a bug in the printer
Browse files Browse the repository at this point in the history
The issue is better described in tensorflow#3557. Please refer to it.

Our HTML module traversal relies on the `document` nodes, so we cannot
create a document free of nested documents in one go. Instead, we
post-process the document to prune away all the nested `document` nodes so
that jsoup's default printer can print it without awkward `<#root>`
elements.

index.html size difference:
```
555323 before
554749 after
```
  • Loading branch information
stephanwlee committed May 6, 2020
1 parent 79182fa commit 1a426c1
Showing 1 changed file with 52 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Html5Printer;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -182,10 +181,11 @@ public static void main(String[] args)
}

boolean shouldExtractJs = !jsPath.isEmpty();
// Write an empty file for shasum when all scripts are extracted out.
createFile(
jsOutput, shouldExtractJs ? extractAndTransformJavaScript(document, jsPath) : "");
// Write an empty file for shasum when all scripts are extracted out.
createFile(output, Html5Printer.stringify(document));
Document normalizedDocument = getFlattenedDocument(document);
createFile(output, normalizedDocument.toString());
}

private static void createFile(Path filePath, String content) throws IOException {
Expand Down Expand Up @@ -778,6 +778,55 @@ private static String extractAndTransformJavaScript(Document document, Webpath j
return scriptContent;
}

/**
 * Appends a clone of each child node of {@code src} to {@code dest}, leaving out
 * any {@link TextNode} child that is blank.
 *
 * @param src element whose children are copied; its own child list is not modified
 * @param dest element that receives the cloned children, in their original order
 */
private static void cloneChildrenWithoutWhitespace(Element src, Element dest) {
  // Gather the children to keep before appending anything, so the list being
  // iterated is never the one being appended to.
  List<Node> retained = new ArrayList<Node>();
  for (Node child : src.childNodes()) {
    boolean blankText = child instanceof TextNode && ((TextNode) child).isBlank();
    if (!blankText) {
      retained.add(child);
    }
  }

  for (Node child : retained) {
    dest.appendChild(child.clone());
  }
}

/**
 * Builds a fresh document and fills it with content gathered from {@code document}.
 *
 * <p>Every node of {@code document} is visited parent-first. A node whose parent is
 * named {@code head} is cloned into the new document's head unless the node is itself
 * a {@link Document}; for every node named {@code body}, its non-blank children are
 * cloned into the new document's body.
 *
 * <p>Refer to https://github.com/tensorflow/tensorboard/issues/3557.
 *
 * @param document the source document to read from; clones are appended only to the result
 * @return the newly created document holding the collected head and body content
 */
private static Document getFlattenedDocument(Document document) {
  Document flatDoc = new Document("/");
  flatDoc.normalise();
  Element flatHead = flatDoc.head();
  Element flatBody = flatDoc.body();

  Node cursor = document;
  while (cursor != null) {
    Node parent = cursor.parentNode();
    boolean parentIsHead = parent != null && parent.nodeName().equals("head");
    // A `document` sitting inside `<head>` is not copied as-is; the walk descends
    // into it so that its own `<head>` and `<body>` content gets collected instead.
    if (parentIsHead && !(cursor instanceof Document)) {
      flatHead.appendChild(cursor.clone());
    }

    if (cursor.nodeName().equals("body")) {
      cloneChildrenWithoutWhitespace((Element) cursor, flatBody);
    }

    // Descend to the first child when there is one.
    if (cursor.childNodeSize() > 0) {
      cursor = cursor.childNode(0);
      continue;
    }

    // Otherwise climb until some node has a following sibling and move there;
    // running off the top of the tree ends the walk.
    Node ancestor = cursor;
    while (ancestor != null && ancestor.nextSibling() == null) {
      ancestor = ancestor.parentNode();
    }
    cursor = (ancestor == null) ? null : ancestor.nextSibling();
  }
  return flatDoc;
}

private static final class JsPrintlessErrorManager extends BasicErrorManager {

@Override
Expand Down

0 comments on commit 1a426c1

Please sign in to comment.