diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index 0602e92e20f..4a08dce4eb1 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -90,13 +90,15 @@ public static function get_dom( $document ) { $document ); - /* - * Replace noscript elements with placeholders since libxml<2.8 can parse them incorrectly. - * When appearing in the head element, a noscript can cause the head to close prematurely - * and the noscript gets moved to the body and anything after it which was in the head. - * See . - */ + // Deal with bugs in older versions of libxml. + $added_back_compat_meta_content_type = false; if ( version_compare( LIBXML_DOTTED_VERSION, '2.8', '<' ) ) { + /* + * Replace noscript elements with placeholders since libxml<2.8 can parse them incorrectly. + * When appearing in the head element, a noscript can cause the head to close prematurely + * and the noscript gets moved to the body and anything after it which was in the head. + * See . + */ $document = preg_replace_callback( '#]*>.*?#si', function( $matches ) { @@ -106,6 +108,21 @@ function( $matches ) { }, $document ); + + /* + * Add a pre-HTML5-style declaration of the encoding since libxml<2.8 doesn't recognize + * HTML5's meta charset. See . + */ + $document = preg_replace( + '#(?=', + $document, + 1, + $count + ); + if ( 1 === $count ) { + $added_back_compat_meta_content_type = true; + } } /* @@ -123,6 +140,14 @@ function( $matches ) { return false; } + // Remove pre-HTML5-style encoding declaration if added above. 
+ if ( $added_back_compat_meta_content_type ) { + $meta_http_equiv_element = $dom->getElementById( 'meta-http-equiv-content-type' ); + if ( $meta_http_equiv_element ) { + $meta_http_equiv_element->parentNode->removeChild( $meta_http_equiv_element ); + } + } + return $dom; } diff --git a/tests/test-class-amp-dom-utils.php b/tests/test-class-amp-dom-utils.php index f64f9e7e81f..7fe8106fd07 100644 --- a/tests/test-class-amp-dom-utils.php +++ b/tests/test-class-amp-dom-utils.php @@ -146,6 +146,26 @@ public function test_html5_empty_elements() { $this->assertEquals( 'span', $video->childNodes->item( 5 )->nodeName ); } + /** + * Test that parsed content reports UTF-8 encoding and that all three test paragraphs yield identical text content. + * + * @covers \AMP_DOM_Utils::get_dom() + */ + public function test_get_dom_encoding() { + $html = ''; + $html .= '

Check out ‘this’ and “that” and—other things.

'; + $html .= '

Check out ‘this’ and “that” and—other things.

'; + $html .= '

Check out ‘this’ and “that” and—other things.

'; + $html .= ''; + + $document = AMP_DOM_Utils::get_dom_from_content( $html ); + $this->assertEquals( 'UTF-8', $document->encoding ); + $paragraphs = $document->getElementsByTagName( 'p' ); + $this->assertSame( 3, $paragraphs->length ); + $this->assertSame( $paragraphs->item( 0 )->textContent, $paragraphs->item( 1 )->textContent ); + $this->assertSame( $paragraphs->item( 1 )->textContent, $paragraphs->item( 2 )->textContent ); + } + /** * Get Table Row Iterations *