From e444525ef1634b675cd1cf52d39f4320ef0aecfd Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 10 Apr 2022 14:42:04 -0400 Subject: [PATCH] fix(perf): HTML4::EncodingReader detection --- lib/nokogiri/html4/document.rb | 2 +- test/html4/test_document_encoding.rb | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/nokogiri/html4/document.rb b/lib/nokogiri/html4/document.rb index 177efc04f7..fbc22d2072 100644 --- a/lib/nokogiri/html4/document.rb +++ b/lib/nokogiri/html4/document.rb @@ -268,7 +268,7 @@ def start_element(name, attrs = []) end def self.detect_encoding(chunk) - (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)) && + (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) && (return Nokogiri.XML(m[1]).encoding) if Nokogiri.jruby? diff --git a/test/html4/test_document_encoding.rb b/test/html4/test_document_encoding.rb index 6115301764..ecb4aa9a12 100644 --- a/test/html4/test_document_encoding.rb +++ b/test/html4/test_document_encoding.rb @@ -155,6 +155,18 @@ def binopen(file) end end end + + it "does not start backtracking during detection of XHTML encoding" do + # this test is a quick and dirty version + # of the more complete perf test that is on main. + n = 40_000 + redos_string = "