Skip to content

Commit

Permalink
[html] Use find('*') over custom solution
Browse files Browse the repository at this point in the history
find('*') wasn't supported in older versions of simplehtmldom, but it
is supported now. Thus, all custom implementations can be replaced
by the correct solution.
  • Loading branch information
logmanoriginal committed Jun 1, 2019
1 parent 5656792 commit 014b698
Showing 1 changed file with 2 additions and 24 deletions.
26 changes: 2 additions & 24 deletions lib/html.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,7 @@ function sanitize($html,

$htmlContent = str_get_html($html);

/*
* Notice: simple_html_dom currently doesn't support "->find(*)", which is a
* known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
*
* A solution to this is to find all nodes WITHOUT a specific attribute. If
* the attribute is very unlikely to appear in the DOM, this is essentially
* returning all nodes.
*
* "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
* "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
*/
foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
foreach($htmlContent->find('*') as $element) {
if(in_array($element->tag, $text_to_keep)) {
$element->outertext = $element->plaintext;
} elseif(in_array($element->tag, $tags_to_remove)) {
Expand Down Expand Up @@ -90,18 +79,7 @@ function backgroundToImg($htmlContent) {
$regex = '/background-image[ ]{0,}:[ ]{0,}url\([\'"]{0,}(.*?)[\'"]{0,}\)/';
$htmlContent = str_get_html($htmlContent);

/*
* Notice: simple_html_dom currently doesn't support "->find(*)", which is a
* known issue: https://sourceforge.net/p/simplehtmldom/bugs/157/
*
* A solution to this is to find all nodes WITHOUT a specific attribute. If
* the attribute is very unlikely to appear in the DOM, this is essentially
* returning all nodes.
*
* "*[!b38fd2b1fe7f4747d6b1c1254ccd055e]" is doing exactly that. The attrib
* "b38fd2b1fe7f4747d6b1c1254ccd055e" is very unlikely to appear in any DOM.
*/
foreach($htmlContent->find('*[!b38fd2b1fe7f4747d6b1c1254ccd055e]') as $element) {
foreach($htmlContent->find('*') as $element) {

if(preg_match($regex, $element->style, $matches) > 0) {

Expand Down

0 comments on commit 014b698

Please sign in to comment.