Skip to content

Commit

Permalink
vulc: fix mangled DOM due to a bug in the printer (tensorflow#3582)
Browse files Browse the repository at this point in the history
The issue is better described in tensorflow#3557. Please refer to it.

Our HTML module traversal relies on the document, so we cannot create
a document without nested documents in one go. Instead, we post-process
the document to prune away all the nested documents so that jsoup's
default printer can print it without awkward <#root>
elements.
  • Loading branch information
stephanwlee authored and caisq committed May 19, 2020
1 parent d135467 commit 4db0e43
Showing 1 changed file with 109 additions and 3 deletions.
112 changes: 109 additions & 3 deletions tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,14 @@
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Html5Printer;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;

/** Simple one-off solution for TensorBoard vulcanization. */
public final class Vulcanize {
Expand Down Expand Up @@ -182,10 +183,11 @@ public static void main(String[] args)
}

boolean shouldExtractJs = !jsPath.isEmpty();
// Write an empty file for shasum when all scripts are extracted out.
createFile(
jsOutput, shouldExtractJs ? extractAndTransformJavaScript(document, jsPath) : "");
// Write an empty file for shasum when all scripts are extracted out.
createFile(output, Html5Printer.stringify(document));
Document normalizedDocument = getFlattenedHTML5Document(document);
createFile(output, normalizedDocument.toString());
}

private static void createFile(Path filePath, String content) throws IOException {
Expand Down Expand Up @@ -778,6 +780,110 @@ private static String extractAndTransformJavaScript(Document document, Webpath j
return scriptContent;
}

/**
 * Appends a clone of each child node of {@code src} to {@code dest}, skipping
 * text nodes that report themselves as blank. {@code src} keeps its original
 * children; only clones are attached to {@code dest}.
 *
 * @param src element whose children are copied
 * @param dest element that receives the cloned children, in source order
 */
private static void cloneChildrenWithoutWhitespace(Element src, Element dest) {
  // Pick the children to copy first, then append, so that the child list of
  // `src` is never being iterated while `dest` is modified.
  List<Node> keptChildren = new ArrayList<>();
  for (Node child : src.childNodes()) {
    boolean isBlankText = child instanceof TextNode && ((TextNode) child).isBlank();
    if (!isBlankText) {
      keptChildren.add(child);
    }
  }
  for (Node child : keptChildren) {
    dest.appendChild(child.clone());
  }
}

/**
 * Builds a new, flat HTML5 document out of a document that contains nested
 * documents.
 *
 * When `transform` inlines HTML for `<link rel="import">`, the link element is
 * replaced with the parsed document. The result is a tree of nested documents,
 * which jsoup's Node.outerHtml (or Node.toString) cannot print properly. This
 * method flattens that tree by gathering the contents of every nested `<head>`
 * and `<body>` into a single `<head>` and a single `<body>`.
 *
 * The returned document starts with <!doctype html>, since TensorBoard
 * requires the document to be HTML, followed by clones of the comment nodes
 * found directly under the input `document`.
 *
 * NOTE: this has a side effect on the input `document`: the traversal empties
 * every `<body>` element it copies from.
 *
 * Example:
 * // Input
 * <#root> <!-- document -->
 *   <html>
 *     <head>
 *       <#root>
 *         <html>
 *           <head>
 *             <script></script>
 *             <#root><html><body>welcome </body></html></#root>
 *           </head>
 *           <body>foo</body></html>
 *       </#root></head>
 *     <body><span>bar</span></body>
 *   </html>
 * </#root>
 * // Output
 * <#root> <!-- document -->
 *   <!doctype html>
 *   <html>
 *     <head><script></script></head>
 *     <body>welcome foo<span>bar</span></body>
 *   </html>
 * </#root>
 *
 * @param document the (possibly nested) document to flatten; it is modified
 * @return a newly created document holding the flattened content
 */
private static Document getFlattenedHTML5Document(Document document) {
  Document flattened = new Document("/");

  DocumentType doctype = new DocumentType("html", "", "", "");
  flattened.appendChild(doctype);

  // Comment nodes sitting directly under `document` are important license
  // comments, so carry clones of them over; every other top-level node is
  // handled by the traversal below.
  for (Node topLevelNode : document.childNodes()) {
    if (!(topLevelNode instanceof Comment)) {
      continue;
    }
    flattened.appendChild(topLevelNode.clone());
  }

  // Let jsoup create `<html>`, `<head>` and `<body>` before the copier looks
  // them up in its constructor.
  flattened.normalise();

  NodeVisitor copier = new FlatDocumentCopier(flattened);
  document.traverse(copier);

  return flattened;
}

private static class FlatDocumentCopier implements NodeVisitor {
private Element destHead;
private Element destBody;

public FlatDocumentCopier(Document dest) {
destHead = dest.head();
destBody = dest.body();
}

public void head(Node node, int depth) {
// Copy childNodes from `head` into the dest doc's head without
// modification if the node is not a `document` (or a `<#root>` element)
// in which case we want to traverse further and only copy the childNodes
// in its `body` and `head` elements.
if (node.parentNode() != null && node.parentNode().nodeName().equals("head")
&& !(node instanceof Document)) {
destHead.appendChild(node.clone());
}

if (node.nodeName().equals("body")) {
cloneChildrenWithoutWhitespace((Element) node, destBody);
// No need to further traverse the `body`. Skip by removing the nodes.
((Element) node).empty();
}
}

public void tail(Node node, int depth) {
// Copying is done during the `head`. No need to do any work.
}
}

private static final class JsPrintlessErrorManager extends BasicErrorManager {

@Override
Expand Down

0 comments on commit 4db0e43

Please sign in to comment.