vulc: fix mangled DOM due to a bug in the printer

The issue is described in more detail in tensorflow#3557; please refer to it. Our HTML module traversal relies on the `document` nodes, so we cannot create a document without nested documents in one go. Instead, we post-process the document to prune away all the nested `document` nodes so that jsoup's default printer can print the result without awkward `<#root>` elements. index.html size difference: 555323 before, 554749 after.
stephanwlee · May 6, 2020 · 1a426c1 · 1a426c1
1 parent 79182fa
commit 1a426c1
Showing 1 changed file with 52 additions and 3 deletions.
diff --git a/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java b/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
@@ -75,7 +75,6 @@
 import org.jsoup.nodes.DataNode;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Html5Printer;
 import org.jsoup.nodes.Node;
 import org.jsoup.nodes.TextNode;
 import org.jsoup.parser.Parser;
@@ -182,10 +181,11 @@ public static void main(String[] args)
     }
 
     boolean shouldExtractJs = !jsPath.isEmpty();
+    // Write an empty file for shasum when all scripts are extracted out.
     createFile(
         jsOutput, shouldExtractJs ? extractAndTransformJavaScript(document, jsPath) : "");
-    // Write an empty file for shasum when all scripts are extracted out.
-    createFile(output, Html5Printer.stringify(document));
+    Document normalizedDocument = getFlattenedDocument(document);
+    createFile(output, normalizedDocument.toString());
   }
 
   private static void createFile(Path filePath, String content) throws IOException {
@@ -778,6 +778,55 @@ private static String extractAndTransformJavaScript(Document document, Webpath j
     return scriptContent;
   }
 
+  /**
+   * Appends a copy of every child node of {@code src} to {@code dest}, leaving out text nodes
+   * that are blank.
+   *
+   * <p>Only clones are appended; the children themselves are not re-parented.
+   *
+   * @param src element whose children are copied
+   * @param dest element that receives the copies, in the same relative order
+   */
+  private static void cloneChildrenWithoutWhitespace(Element src, Element dest) {
+    // Gather the copies first, then attach them, so the source child list is fully read
+    // before the destination is touched.
+    List<Node> copies = new ArrayList<Node>();
+    for (Node child : src.childNodes()) {
+      boolean isBlankText = child instanceof TextNode && ((TextNode) child).isBlank();
+      if (!isBlankText) {
+        copies.add(child.clone());
+      }
+    }
+    for (Node copy : copies) {
+      dest.appendChild(copy);
+    }
+  }
+
+  /**
+   * Builds a fresh document whose {@code <head>} and {@code <body>} gather content found while
+   * walking every node reachable from {@code document}.
+   *
+   * <p>Each non-{@code Document} node whose parent is named {@code head} is cloned into the new
+   * head, and the non-blank children of each node named {@code body} are cloned into the new
+   * body.
+   *
+   * <p>Refer to https://github.com/tensorflow/tensorboard/issues/3557.
+   *
+   * @param document root of the tree to flatten
+   * @return the newly created document; the input tree is only read and cloned from
+   */
+  private static Document getFlattenedDocument(Document document) {
+    Document flattened = new Document("/");
+    // NOTE(review): presumably this sets up the html/head/body skeleton that head() and body()
+    // below rely on -- confirm against the jsoup version in use.
+    flattened.normalise();
+    Element headSink = flattened.head();
+    Element bodySink = flattened.body();
+
+    for (Node node = document; node != null; node = nextInDocumentOrder(node)) {
+      Node parent = node.parentNode();
+      boolean isHeadChild = parent != null && parent.nodeName().equals("head");
+      // A `document` sitting inside `<head>` is not copied as-is. The walk descends into it
+      // instead, so the contents of its own `<head>` and `<body>` are gathered.
+      if (isHeadChild && !(node instanceof Document)) {
+        headSink.appendChild(node.clone());
+      }
+
+      if (node.nodeName().equals("body")) {
+        cloneChildrenWithoutWhitespace((Element) node, bodySink);
+      }
+    }
+    return flattened;
+  }
+
+  /**
+   * Returns the node visited after {@code node} in a pre-order walk: its first child if it has
+   * one, otherwise the next sibling of the nearest node on the parent chain (starting with
+   * {@code node} itself) that has a next sibling. Returns {@code null} when no such node exists.
+   */
+  private static Node nextInDocumentOrder(Node node) {
+    if (node.childNodeSize() > 0) {
+      return node.childNode(0);
+    }
+    Node cursor = node;
+    // Climb until some node on the parent chain has a sibling left to visit.
+    while (cursor != null && cursor.nextSibling() == null) {
+      cursor = cursor.parentNode();
+    }
+    return cursor == null ? null : cursor.nextSibling();
+  }
+
   private static final class JsPrintlessErrorManager extends BasicErrorManager {
 
     @Override