Skip to content

Commit

Permalink
Merge pull request #888 from Automattic/add/output-buffer-sanitization
Browse files Browse the repository at this point in the history
Sanitize entire HTML output when theme support is present
  • Loading branch information
Thierry Muller committed Jan 23, 2018
2 parents 4e0f5ce + ef69f2d commit b8af86a
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 97 deletions.
53 changes: 28 additions & 25 deletions includes/class-amp-theme-support.php
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,6 @@ public static function register_hooks() {
*/
add_action( 'template_redirect', array( __CLASS__, 'start_output_buffering' ), 0 );

add_filter( 'the_content', array( __CLASS__, 'filter_the_content' ), PHP_INT_MAX );

// @todo Add character conversion.
}

Expand Down Expand Up @@ -442,32 +440,12 @@ public static function get_amp_custom_styles() {
return $css;
}

/**
* Filter the content to be valid AMP.
*
* @param string $content Content.
* @return string Amplified content.
*/
public static function filter_the_content( $content ) {
$args = array(
'content_max_width' => ! empty( $content_width ) ? $content_width : AMP_Post_Template::CONTENT_MAX_WIDTH, // Back-compat.
);

list( $sanitized_content, $scripts, $styles ) = AMP_Content_Sanitizer::sanitize( $content, self::$sanitizer_classes, $args );

self::$amp_scripts = array_merge( self::$amp_scripts, $scripts );
self::$amp_styles = array_merge( self::$amp_styles, $styles );

return $sanitized_content;
}

/**
* Determine required AMP scripts.
*
* @param string $html Output HTML.
* @return string Scripts to inject into the HEAD.
*/
public static function get_amp_component_scripts( $html ) {
public static function get_amp_component_scripts() {
$amp_scripts = self::$amp_scripts;

foreach ( self::$embed_handlers as $embed_handler ) {
Expand Down Expand Up @@ -512,15 +490,41 @@ public static function start_output_buffering() {
* Finish output buffering.
*
* @todo Do this in shutdown instead of output buffering callback?
* @global int $content_width
* @param string $output Buffered output.
* @return string Finalized output.
*/
public static function finish_output_buffering( $output ) {
global $content_width;

$dom = AMP_DOM_Utils::get_dom( $output );
$args = array(
'content_max_width' => ! empty( $content_width ) ? $content_width : AMP_Post_Template::CONTENT_MAX_WIDTH, // Back-compat.
);

$assets = AMP_Content_Sanitizer::sanitize_document( $dom, self::$sanitizer_classes, $args );

self::$amp_scripts = array_merge( self::$amp_scripts, $assets['scripts'] );
self::$amp_styles = array_merge( self::$amp_styles, $assets['styles'] );

/*
* @todo The sanitize method needs to be updated to sanitize the entire HTML element and not just the BODY.
* This will require updating mandatory_parent_blacklist in amphtml-update.py to include elements that appear in the HEAD.
* This will ensure that the scripts and styles that plugins output via wp_head() will be sanitized as well. However,
* since the the old paired mode is sending content from the *body* we'll need to be able to filter out the elements
* from outside the body from being part of the whitelist sanitizer when it runs when theme support is not present,
* as otherwise elements from the HEAD could get added to the BODY.
*/
$output = preg_replace(
'#(<body.*?>)(.+)(</body>)#si',
'$1' . AMP_DOM_Utils::get_content_from_dom( $dom ) . '$3',
$output
);

// Inject required scripts.
$output = preg_replace(
'#' . preg_quote( self::COMPONENT_SCRIPTS_PLACEHOLDER, '#' ) . '#',
self::get_amp_component_scripts( $output ),
self::get_amp_component_scripts(),
$output,
1
);
Expand All @@ -533,7 +537,6 @@ public static function finish_output_buffering( $output ) {
1
);

// @todo Add more validation checking and potentially the whitelist sanitizer.
return $output;
}
}
10 changes: 3 additions & 7 deletions includes/sanitizers/class-amp-rule-spec.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,22 +87,18 @@ abstract class AMP_Rule_Spec {
*/
public static $additional_allowed_tags = array(

/**
* An experimental tag with no protoascii
*/
// An experimental tag with no protoascii.
'amp-share-tracking' => array(
'attr_spec_list' => array(),
'tag_spec' => array(),
),

/**
* Needed for some tags such as analytics
*/
// Needed for some tags such as analytics.
'script' => array(
'attr_spec_list' => array(
'type' => array(
'mandatory' => true,
'value_casei' => 'text/javascript',
'value_casei' => 'application/json',
),
),
'tag_spec' => array(),
Expand Down
48 changes: 39 additions & 9 deletions includes/templates/class-amp-content-sanitizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,30 +7,62 @@

/**
* Class AMP_Content_Sanitizer
*
* @since 0.4.1
*/
class AMP_Content_Sanitizer {

/**
* Sanitize.
* Sanitize _content_.
*
* @since 0.4.1
*
* @param string $content Content.
* @param string $content HTML content string or DOM document.
* @param string[] $sanitizer_classes Sanitizer classes.
* @param array $global_args Global args.
*
* @return array
* @return array Tuple containing sanitized HTML, scripts array, and styles array.
*/
public static function sanitize( $content, array $sanitizer_classes, $global_args = array() ) {
$dom = AMP_DOM_Utils::get_dom_from_content( $content );

$results = self::sanitize_document( $dom, $sanitizer_classes, $global_args );
return array(
AMP_DOM_Utils::get_content_from_dom( $dom ),
$results['scripts'],
$results['styles'],
);
}

/**
* Sanitize document.
*
* @since 0.7
*
* @param DOMDocument $dom HTML document.
* @param string[] $sanitizer_classes Sanitizer classes.
* @param array $global_args Global args passed into .
* @return array {
* Scripts and styles needed by sanitizers.
*
* @type array $scripts Scripts.
* @type array $styles Styles.
* }
*/
public static function sanitize_document( &$dom, $sanitizer_classes, $global_args ) {
$scripts = array();
$styles = array();
$dom = AMP_DOM_Utils::get_dom_from_content( $content );

foreach ( $sanitizer_classes as $sanitizer_class => $args ) {
if ( ! class_exists( $sanitizer_class ) ) {
/* translators: %s is sanitizer class */
_doing_it_wrong( __METHOD__, sprintf( esc_html__( 'Sanitizer (%s) class does not exist', 'amp' ), esc_html( $sanitizer_class ) ), '0.4.1' );
continue;
}

/**
* Sanitizer.
*
* @type AMP_Base_Sanitizer $sanitizer
*/
$sanitizer = new $sanitizer_class( $dom, array_merge( $global_args, $args ) );

if ( ! is_subclass_of( $sanitizer, 'AMP_Base_Sanitizer' ) ) {
Expand All @@ -45,9 +77,7 @@ public static function sanitize( $content, array $sanitizer_classes, $global_arg
$styles = array_merge( $styles, $sanitizer->get_styles() );
}

$sanitized_content = AMP_DOM_Utils::get_content_from_dom( $dom );

return array( $sanitized_content, $scripts, $styles );
return compact( 'scripts', 'styles' );
}
}

123 changes: 67 additions & 56 deletions includes/utils/class-amp-dom-utils.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,45 @@
class AMP_DOM_Utils {

/**
* Return a valid DOMDocument representing arbitrary HTML content passed as a parameter.
* HTML elements that are self-closing.
*
* @see Reciprocal function get_content_from_dom()
* Not all are valid AMP, but we include them for completeness.
*
* @since 0.2
* @since 0.7
* @link https://www.w3.org/TR/html5/syntax.html#serializing-html-fragments
* @var array
*/
private static $self_closing_tags = array(
'area',
'base',
'basefont',
'bgsound',
'br',
'col',
'embed',
'frame',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr',
);

/**
* Return a valid DOMDocument representing HTML document passed as a parameter.
*
* @param string $content Valid HTML content to be represented by a DOMDocument.
* @since 0.7
*
* @param string $document Valid HTML document to be represented by a DOMDocument.
*
* @return DOMDocument|false Returns DOMDocument, or false if conversion failed.
*/
public static function get_dom_from_content( $content ) {
public static function get_dom( $document ) {
$libxml_previous_state = libxml_use_internal_errors( true );

$dom = new DOMDocument();
Expand All @@ -34,13 +62,7 @@ public static function get_dom_from_content( $content ) {
* We can later use this to extract our nodes.
* Add charset so loadHTML does not have problems parsing it.
*/
$result = $dom->loadHTML(
sprintf(
'<html><head><meta http-equiv="content-type" content="text/html; charset=%s"></head><body>%s</body></html>',
get_bloginfo( 'charset' ),
$content
)
);
$result = $dom->loadHTML( $document );

libxml_clear_errors();
libxml_use_internal_errors( $libxml_previous_state );
Expand All @@ -52,6 +74,35 @@ public static function get_dom_from_content( $content ) {
return $dom;
}

/**
* Return a valid DOMDocument representing arbitrary HTML content passed as a parameter.
*
* @see Reciprocal function get_content_from_dom()
*
* @since 0.2
*
* @param string $content Valid HTML content to be represented by a DOMDocument.
*
* @return DOMDocument|false Returns DOMDocument, or false if conversion failed.
*/
public static function get_dom_from_content( $content ) {
/*
* Wrap in dummy tags, since XML needs one parent node.
* It also makes it easier to loop through nodes.
* We can later use this to extract our nodes.
* Add utf-8 charset so loadHTML does not have problems parsing it.
* See: http://php.net/manual/en/domdocument.loadhtml.php#78243
*/
$document = sprintf(
'<html><head><meta http-equiv="content-type" content="text/html; charset=%s"></head><body>%s</body></html>',
get_bloginfo( 'charset' ),
$content
);

return self::get_dom( $document );

}

/**
* Return valid HTML content extracted from the DOMDocument passed as a parameter.
*
Expand All @@ -67,6 +118,8 @@ public static function get_content_from_dom( $dom ) {

/**
* We only want children of the body tag, since we have a subset of HTML.
*
* @todo We will want to get the full HTML eventually.
*/
$body = $dom->getElementsByTagName( 'body' )->item( 0 );

Expand Down Expand Up @@ -119,7 +172,7 @@ public static function get_content_from_dom_node( $dom, $node ) {
* Cache this regex so we don't have to recreate it every call.
*/
if ( ! isset( $self_closing_tags_regex ) ) {
$self_closing_tags = implode( '|', self::get_self_closing_tags() );
$self_closing_tags = implode( '|', self::$self_closing_tags );
$self_closing_tags_regex = "#></({$self_closing_tags})>#i";
}

Expand Down Expand Up @@ -262,48 +315,6 @@ public static function recursive_force_closing_tags( $dom, $node = null ) {
* @return bool Returns true if a valid self-closing tag, false if not.
*/
private static function is_self_closing_tag( $tag ) {
return in_array( $tag, self::get_self_closing_tags(), true );
}

/**
* Returns array of self closing tags
*
* @since 0.6
*
* @return string[]
*/
private static function get_self_closing_tags() {
/*
* As this function is called a lot the static var
* prevents having to re-create the array every time.
*/
static $self_closing_tags;
if ( ! isset( $self_closing_tags ) ) {
/*
* https://www.w3.org/TR/html5/syntax.html#serializing-html-fragments
* Not all are valid AMP, but we include them for completeness.
*/
$self_closing_tags = array(
'area',
'base',
'basefont',
'bgsound',
'br',
'col',
'embed',
'frame',
'hr',
'img',
'input',
'keygen',
'link',
'meta',
'param',
'source',
'track',
'wbr',
);
}
return $self_closing_tags;
return in_array( strtolower( $tag ), self::$self_closing_tags, true );
}
}
Loading

0 comments on commit b8af86a

Please sign in to comment.