diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index bc37672b1..c1c301259 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -47,7 +47,7 @@ runs: --create-env-dump --create-plain-dump plain --install-composer-deps - --run-unit-tests --run-migration-tests + --run-unit-tests --run-integration-tests --run-migration-tests --extract-code-coverage --shutdown-helpers || { failed=$?; } ; if [ -n "$failed" ]; then diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e4d97757..364495b1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ [#1007](https://github.com/nextcloud/cookbook/pull/1007) @christianlupus - Switched to cURL for downloading of external files [#1055](https://github.com/nextcloud/cookbook/pull/1055) @christianlupus +- Rewrite encoding of imported recipes + [#1057](https://github.com/nextcloud/cookbook/pull/1057) @christianlupus ### Fixed - Fix visual regression in edit mode to prevent overflow of breadcrumbs diff --git a/lib/Exception/CouldNotGuessEncodingException.php b/lib/Exception/CouldNotGuessEncodingException.php new file mode 100644 index 000000000..3235c1a6b --- /dev/null +++ b/lib/Exception/CouldNotGuessEncodingException.php @@ -0,0 +1,9 @@ +headers as $s) { $parts = explode(':', $s, 2); - if (trim($parts[0]) === 'Content-Type') { + if (strtolower(trim($parts[0])) === 'content-type') { return trim($parts[1]); } } diff --git a/lib/Helper/EncodingGuessingHelper.php b/lib/Helper/EncodingGuessingHelper.php new file mode 100644 index 000000000..df59998e0 --- /dev/null +++ b/lib/Helper/EncodingGuessingHelper.php @@ -0,0 +1,70 @@ +l = $l; + } + /** + * Extract the text encoding from a HTML file + * + * @param string $content The content of the file + * @param ?string $contentType The ContentType header if present or null to look at the contents + * @return string The guessed content encoding + * @throws CouldNotGuessEncodingException if no encoding could be guessed + */ + public function guessEncoding(string $content, ?string $contentType): string { + if ($contentType !== null) { + $guess = $this->guessFromContentType($contentType); + + if ($guess !== null) { + return $guess; + } + } + + $guess = $this->guessFromMainContent($content); + if ($guess === null) { + throw new CouldNotGuessEncodingException($this->l->t('No content encoding was detected in the content.')); + } else { + return $guess; + } + } + + private function guessFromContentType(string $contentType): ?string { + $parts = explode(';', $contentType); + $parts = array_map(function ($x) { + return trim($x); + }, $parts); + + foreach ($parts as $part) { + $subparts = explode('=', $part, 2); + if (strtolower($subparts[0]) === 'charset' && count($subparts) === 2) { + return $subparts[1]; + } + } + // Fallback: We did not find anything in the Content-Type + return null; + } + + private function guessFromMainContent(string $content): ?string { + $regex = "/]* charset=['\"]?([^'\">]*)['\"]?[^>]*>/"; + $ret = preg_match($regex, $content, $matches); + if ($ret === 1) { + return $matches[1]; + } + return null; + } +} diff --git a/lib/Helper/HTMLFilter/HtmlEncodingFilter.php b/lib/Helper/HTMLFilter/HtmlEncodingFilter.php new file mode 100644 index 000000000..2a74a9747 --- /dev/null +++ b/lib/Helper/HTMLFilter/HtmlEncodingFilter.php @@ -0,0 +1,11 @@ +' . $html; + } + } +} diff --git a/lib/Service/HtmlDownloadService.php b/lib/Service/HtmlDownloadService.php index aa3b5d908..27e7e1fb5 100644 --- a/lib/Service/HtmlDownloadService.php +++ b/lib/Service/HtmlDownloadService.php @@ -3,10 +3,14 @@ namespace OCA\Cookbook\Service; use DOMDocument; +use OCA\Cookbook\Exception\CouldNotGuessEncodingException; use OCA\Cookbook\Exception\ImportException; use OCA\Cookbook\Exception\NoDownloadWasCarriedOutException; +use OCA\Cookbook\Helper\DownloadEncodingHelper; use OCA\Cookbook\Helper\DownloadHelper; +use OCA\Cookbook\Helper\EncodingGuessingHelper; use OCA\Cookbook\Helper\HTMLFilter\AbstractHtmlFilter; +use OCA\Cookbook\Helper\HTMLFilter\HtmlEncodingFilter; use OCA\Cookbook\Helper\HTMLFilter\HtmlEntityDecodeFilter; use OCA\Cookbook\Helper\HtmlToDomParser; use OCP\IL10N; @@ -18,16 +22,14 @@ class HtmlDownloadService { */ private $htmlFilters; - /** - * @var ILogger - */ - private $logger; - /** * @var IL10N */ private $l; + /** @var ILogger */ + private $logger; + /** * @var HtmlToDomParser */ @@ -36,6 +38,12 @@ class HtmlDownloadService { /** @var DownloadHelper */ private $downloadHelper; + /** @var EncodingGuessingHelper */ + private $encodingGuesser; + + /** @var DownloadEncodingHelper */ + private $downloadEncodingHelper; + /** * @var DOMDocument */ @@ -43,16 +51,24 @@ class HtmlDownloadService { public function __construct( HtmlEntityDecodeFilter $htmlEntityDecodeFilter, - ILogger $logger, + HtmlEncodingFilter $htmlEncodingFilter, IL10N $l10n, + ILogger $logger, HtmlToDomParser $htmlParser, - DownloadHelper $downloadHelper + DownloadHelper $downloadHelper, + EncodingGuessingHelper $encodingGuesser, + DownloadEncodingHelper $downloadEncodingHelper ) { - $this->htmlFilters = [ $htmlEntityDecodeFilter ]; - $this->logger = $logger; + $this->htmlFilters = [ + $htmlEntityDecodeFilter, + $htmlEncodingFilter, + ]; $this->l = $l10n; + $this->logger = $logger; $this->htmlParser = $htmlParser; $this->downloadHelper = $downloadHelper; + $this->encodingGuesser = $encodingGuesser; + $this->downloadEncodingHelper = $downloadEncodingHelper; } /** @@ -118,6 +134,13 @@ private function fetchHtmlPage(string $url): string { $html = $this->downloadHelper->getContent(); + try { + $enc = $this->encodingGuesser->guessEncoding($html, $this->downloadHelper->getContentType()); + $html = $this->downloadEncodingHelper->encodeToUTF8($html, $enc); + } catch (CouldNotGuessEncodingException $ex) { + $this->logger->notice($this->l->t('Could not find a valid encoding when parsing %s.', [$url])); + } + return $html; } } diff --git a/tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.orig b/tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.orig new file mode 100644 index 000000000..5088bf560 --- /dev/null +++ b/tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.orig @@ -0,0 +1 @@ +abcäöüßÄÖÜ diff --git a/tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.utf8 b/tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.utf8 new file mode 100644 index 000000000..e9ec69866 --- /dev/null +++ b/tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.utf8 @@ -0,0 +1 @@ +abcäöüßÄÖÜ diff --git a/tests/Unit/Helper/DownloadEncodingHelperTest.php b/tests/Unit/Helper/DownloadEncodingHelperTest.php new file mode 100644 index 000000000..a60b52c0a --- /dev/null +++ b/tests/Unit/Helper/DownloadEncodingHelperTest.php @@ -0,0 +1,48 @@ +dut = new DownloadEncodingHelper(); + } + + public function dpEncodings() { + return [ + ['iso-8859-1'] + ]; + } + + /** + * @dataProvider dpEncodings + * @param mixed $encoding + */ + public function testEncodeToUTF8($encoding) { + $unencoded = file_get_contents(__DIR__ . "/DownloadEncodingHelper/$encoding.orig"); + $encoded = file_get_contents(__DIR__ . "/DownloadEncodingHelper/$encoding.utf8"); + + $testEncoded = $this->dut->encodeToUTF8($unencoded, $encoding); + $this->assertEquals($encoded, $testEncoded); + } + + public function testEncodeUTF8ToUTF8() { + $encodings = $this->dpEncodings(); + + $testString = ''; + foreach ($encodings as $enc) { + $fileContent = file_get_contents(__DIR__ . "/DownloadEncodingHelper/{$enc[0]}.utf8"); + $testString .= $fileContent; + } + + $this->assertEquals($testString, $this->dut->encodeToUTF8($testString, 'utf-8')); + } +} diff --git a/tests/Unit/Helper/EncodeingGuessingHelperTest.php b/tests/Unit/Helper/EncodeingGuessingHelperTest.php new file mode 100644 index 000000000..cf84b2d8d --- /dev/null +++ b/tests/Unit/Helper/EncodeingGuessingHelperTest.php @@ -0,0 +1,75 @@ +createStub(IL10N::class); + $l->method('t')->willReturnArgument(0); + + $this->dut = new EncodingGuessingHelper($l); + } + + public function dpPureContentType() { + return [ + ['text/text; charset=utf-8', 'utf-8'], + ['text/text; boundary=foo ; charset=UTF-16', 'UTF-16'], + ['text/text;boundary=foo;charset=iso-8859-1;param=value', 'iso-8859-1'], + ]; + } + + /** + * @dataProvider dpPureContentType + * @param mixed $ct + * @param mixed $enc + */ + public function testGuessEncodingFromContentType($ct, $enc) { + $this->assertEquals($enc, $this->dut->guessEncoding('', $ct)); + } + + public function dpPureContent() { + return [ + ['contentA.html', 'iso-8859-1'], + ['contentB.html', 'UTF-16'], + ]; + } + + /** + * @dataProvider dpPureContent + * @param mixed $filename + * @param mixed $enc + */ + public function testGuessEncodingFromContentNoContentType($filename, $enc) { + $content = file_get_contents(__DIR__ . "/EncodingGuessingHelper/$filename"); + $this->assertEquals($enc, $this->dut->guessEncoding($content, null)); + } + + public function testGuessEncodingNoEncoding() { + $this->expectException(CouldNotGuessEncodingException::class); + $this->dut->guessEncoding('Some text', 'text/text;boundary=foo'); + } + + /** + * @dataProvider dpPureContentType + * @param mixed $ct + * @param mixed $enc + */ + public function testGuessEncodingBothPresent($ct, $enc) { + $content = file_get_contents(__DIR__ . '/EncodingGuessingHelper/contentA.html'); + $this->assertEquals($enc, $this->dut->guessEncoding($content, $ct)); + } +} diff --git a/tests/Unit/Helper/EncodingGuessingHelper/contentA.html b/tests/Unit/Helper/EncodingGuessingHelper/contentA.html new file mode 100644 index 000000000..a7b1d752c --- /dev/null +++ b/tests/Unit/Helper/EncodingGuessingHelper/contentA.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/Unit/Helper/EncodingGuessingHelper/contentB.html b/tests/Unit/Helper/EncodingGuessingHelper/contentB.html new file mode 100644 index 000000000..3fd211f70 --- /dev/null +++ b/tests/Unit/Helper/EncodingGuessingHelper/contentB.html @@ -0,0 +1,3 @@ + + + diff --git a/tests/Unit/Helper/HTMLFilter/HtmlEncodingFilterTest.php b/tests/Unit/Helper/HTMLFilter/HtmlEncodingFilterTest.php new file mode 100644 index 000000000..df8815395 --- /dev/null +++ b/tests/Unit/Helper/HTMLFilter/HtmlEncodingFilterTest.php @@ -0,0 +1,33 @@ +dut = new HtmlEncodingFilter(); + } + + public function dp() { + return [ + ['', ''], + ['', ''], + ["", ""], + ]; + } + + /** + * @dataProvider dp + * @param mixed $input + * @param mixed $desiredOutput + */ + public function testApply($input, $desiredOutput) { + $this->dut->apply($input); + $this->assertEquals($desiredOutput, $input); + } +} diff --git a/tests/Unit/Service/HtmlDownloadServiceTest.php b/tests/Unit/Service/HtmlDownloadServiceTest.php index b34f0d84f..6d9c6c333 100644 --- a/tests/Unit/Service/HtmlDownloadServiceTest.php +++ b/tests/Unit/Service/HtmlDownloadServiceTest.php @@ -3,16 +3,20 @@ namespace OCA\Cookbook\tests\Unit\Service; use DOMDocument; +use OCA\Cookbook\Exception\CouldNotGuessEncodingException; use OCP\IL10N; -use OCP\ILogger; use PHPUnit\Framework\TestCase; use OCA\Cookbook\Helper\HtmlToDomParser; use OCA\Cookbook\Exception\ImportException; use OCA\Cookbook\Exception\NoDownloadWasCarriedOutException; +use OCA\Cookbook\Helper\DownloadEncodingHelper; use OCA\Cookbook\Helper\DownloadHelper; +use OCA\Cookbook\Helper\EncodingGuessingHelper; +use OCA\Cookbook\Helper\HTMLFilter\HtmlEncodingFilter; use PHPUnit\Framework\MockObject\MockObject; use OCA\Cookbook\Service\HtmlDownloadService; use OCA\Cookbook\Helper\HTMLFilter\HtmlEntityDecodeFilter; +use OCP\ILogger; /** * @covers \OCA\Cookbook\Service\HtmlDownloadService @@ -22,10 +26,8 @@ class HtmlDownloadServiceTest extends TestCase { * @var HtmlEntityDecodeFilter|MockObject */ private $htmlEntityDecodeFilter; - /** - * @var ILogger - */ - private $ilogger; + /** @var HtmlEncodingFilter */ + private $htmlEncodingFilter; /** * @var IL10N */ @@ -37,6 +39,12 @@ class HtmlDownloadServiceTest extends TestCase { /** @var DownloadHelper|MockObject */ private $downloadHelper; + /** @var EncodingGuessingHelper|MockObject */ + private $encodingGuesser; + + /** @var DownloadEncodingHelper|MockObject */ + private $downloadEncodingHelper; + /** * @var HtmlDownloadService */ @@ -54,12 +62,19 @@ public function setUp(): void { $this->htmlEntityDecodeFilter = $this->createMock(HtmlEntityDecodeFilter::class); $this->htmlEntityDecodeFilter->method('apply')->willReturnArgument(0); - $this->ilogger = $this->createStub(ILogger::class); + $this->htmlEncodingFilter = $this->createStub(HtmlEncodingFilter::class); + $this->htmlEncodingFilter->method('apply')->willReturnArgument(0); + $this->il10n = $this->createStub(IL10N::class); + $logger = $this->createStub(ILogger::class); $this->htmlParser = $this->createMock(HtmlToDomParser::class); $this->downloadHelper = $this->createMock(DownloadHelper::class); + $this->encodingGuesser = $this->createMock(EncodingGuessingHelper::class); + $this->downloadEncodingHelper = $this->createMock(DownloadEncodingHelper::class); - $this->sut = new HtmlDownloadService($this->htmlEntityDecodeFilter, $this->ilogger, $this->il10n, $this->htmlParser, $this->downloadHelper); + $this->sut = new HtmlDownloadService( + $this->htmlEntityDecodeFilter, $this->htmlEncodingFilter, $this->il10n, $logger, $this->htmlParser, + $this->downloadHelper, $this->encodingGuesser, $this->downloadEncodingHelper); } public function testDownloadInvalidUrl() { @@ -101,11 +116,53 @@ public function testDownload() { $content = 'The content of the html file'; $dom = $this->createStub(DOMDocument::class); $state = 12345; + $contentType = 'The content type'; + $encoding = 'utf-8'; + + $this->downloadHelper->expects($this->once()) + ->method('downloadFile'); + $this->downloadHelper->method('getStatus')->willReturn(200); + $this->downloadHelper->method('getContent')->willReturn($content); + $this->downloadHelper->method('getContentType')->willReturn($contentType); + + $this->encodingGuesser->method('guessEncoding') + ->with($content, $contentType) + ->willReturn($encoding); + $this->downloadEncodingHelper->method('encodeToUTF8') + ->with($content, $encoding)->willReturnArgument(0); + + $this->htmlParser->expects($this->once())->method('loadHtmlString') + ->with( + $this->anything(), + $this->equalTo($url), + $this->equalTo($content) + )->willReturn($dom); + $this->htmlParser->method('getState')->willReturn($state); + + $ret = $this->sut->downloadRecipe($url); + $this->assertEquals($state, $ret); + + $this->assertSame($dom, $this->sut->getDom()); + } + + public function testDownloadWithoutEncoding() { + $url = 'http://example.com'; + $content = 'The content of the html file'; + $dom = $this->createStub(DOMDocument::class); + $state = 12345; + $contentType = 'The content type'; + $encoding = 'utf-8'; $this->downloadHelper->expects($this->once()) ->method('downloadFile'); $this->downloadHelper->method('getStatus')->willReturn(200); $this->downloadHelper->method('getContent')->willReturn($content); + $this->downloadHelper->method('getContentType')->willReturn($contentType); + + $this->encodingGuesser->method('guessEncoding') + ->with($content, $contentType) + ->willThrowException(new CouldNotGuessEncodingException()); + $this->htmlParser->expects($this->once())->method('loadHtmlString') ->with( $this->anything(),