Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/run-tests/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ runs:
--create-env-dump
--create-plain-dump plain
--install-composer-deps
--run-unit-tests --run-migration-tests
--run-unit-tests --run-integration-tests --run-migration-tests
--extract-code-coverage
--shutdown-helpers || { failed=$?; } ;
if [ -n "$failed" ]; then
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
[#1007](https://github.com/nextcloud/cookbook/pull/1007) @christianlupus
- Switched to cURL for downloading of external files
[#1055](https://github.com/nextcloud/cookbook/pull/1055) @christianlupus
- Rewrite encoding of imported recipes
[#1057](https://github.com/nextcloud/cookbook/pull/1057) @christianlupus

### Fixed
- Fix visual regression in edit mode to prevent overflow of breadcrumbs
Expand Down
9 changes: 9 additions & 0 deletions lib/Exception/CouldNotGuessEncodingException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?php

namespace OCA\Cookbook\Exception;

/**
 * Exception signalling that no text encoding could be determined for a document.
 */
class CouldNotGuessEncodingException extends \Exception {
    /**
     * @param string|null $message Description of the failure; null is treated as an empty message
     * @param int|null $code Numeric error code; null is treated as 0
     * @param \Throwable|null $previous The exception that led to this one, if any
     */
    public function __construct($message = null, $code = null, $previous = null) {
        // The parent constructor declares non-nullable string/int parameters.
        // Forwarding null to them is deprecated since PHP 8.1, so the defaults
        // of the parent are substituted explicitly.
        parent::__construct($message ?? '', $code ?? 0, $previous);
    }
}
20 changes: 20 additions & 0 deletions lib/Helper/DownloadEncodingHelper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?php

namespace OCA\Cookbook\Helper;

/**
 * This class handles the encoding of downloads in order to only work with UTF-8 strings.
 */
class DownloadEncodingHelper {
    /**
     * Encode a string to UTF-8
     *
     * The conversion is a best effort: if iconv cannot convert the data (for
     * example because the charset name is unknown or the data contains byte
     * sequences that are illegal in the given charset), the input is returned
     * unchanged instead of being discarded.
     *
     * @param string $data The data to be converted
     * @param string $encoding The encoding of the input string
     * @return string The string encoded in UTF-8, or the unmodified input if the conversion failed
     */
    public function encodeToUTF8(string $data, string $encoding): string {
        // iconv() reports failures through a PHP warning/notice and a false
        // return value. The diagnostics are captured locally so that the
        // failure can be handled here via the return value.
        set_error_handler(static function (): bool {
            return true;
        });
        try {
            $converted = iconv($encoding, 'UTF-8', $data);
        } finally {
            restore_error_handler();
        }

        if ($converted === false) {
            // Without this check the false would be coerced to an empty string
            // by the return type and the complete download would be lost.
            return $data;
        }

        return $converted;
    }
}
2 changes: 1 addition & 1 deletion lib/Helper/DownloadHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ public function getContentType(): ?string {

foreach ($this->headers as $s) {
$parts = explode(':', $s, 2);
if (trim($parts[0]) === 'Content-Type') {
if (strtolower(trim($parts[0])) === 'content-type') {
return trim($parts[1]);
}
}
Expand Down
70 changes: 70 additions & 0 deletions lib/Helper/EncodingGuessingHelper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
<?php

namespace OCA\Cookbook\Helper;

use OCA\Cookbook\Exception\CouldNotGuessEncodingException;
use OCP\IL10N;

/**
 * Helper that determines the text encoding of a downloaded HTML file so that
 * the file can be re-encoded and parsed afterwards.
 */
class EncodingGuessingHelper {
    /** @var IL10N */
    private $l;

    public function __construct(
        IL10N $l
    ) {
        $this->l = $l;
    }

    /**
     * Extract the text encoding from a HTML file
     *
     * The charset parameter of the Content-Type header takes precedence. Only if
     * the header is absent or does not name a charset, the content itself is
     * searched for a charset declaration in a meta tag.
     *
     * @param string $content The content of the file
     * @param ?string $contentType The ContentType header if present or null to look at the contents
     * @return string The guessed content encoding, spelled as found in the input
     * @throws CouldNotGuessEncodingException if no encoding could be guessed
     */
    public function guessEncoding(string $content, ?string $contentType): string {
        if ($contentType !== null) {
            $guess = $this->guessFromContentType($contentType);

            if ($guess !== null) {
                return $guess;
            }
        }

        $guess = $this->guessFromMainContent($content);
        if ($guess === null) {
            throw new CouldNotGuessEncodingException($this->l->t('No content encoding was detected in the content.'));
        }

        return $guess;
    }

    /**
     * Look for a charset parameter in the value of a Content-Type header
     *
     * The parameter name is matched case-insensitively. Surrounding whitespace
     * and quotes around the value are removed, as the value may be given as a
     * quoted string. An empty value is treated as not present.
     *
     * @param string $contentType The value of the Content-Type header
     * @return ?string The charset or null if the header names none
     */
    private function guessFromContentType(string $contentType): ?string {
        foreach (explode(';', $contentType) as $part) {
            $subparts = explode('=', $part, 2);
            if (count($subparts) !== 2 || strtolower(trim($subparts[0])) !== 'charset') {
                continue;
            }

            // Strip whitespace as well as single and double quotes from both ends
            $charset = trim($subparts[1], " \t\n\r\0\x0B\"'");
            if ($charset !== '') {
                return $charset;
            }
        }

        // Fallback: We did not find anything in the Content-Type
        return null;
    }

    /**
     * Look for a charset declaration inside a meta tag of the content
     *
     * Both the direct form (a charset attribute on the meta tag) and the
     * http-equiv form (charset parameter inside the content attribute) are
     * recognized. Tag and attribute names are matched case-insensitively.
     *
     * @param string $content The content of the file
     * @return ?string The charset or null if no declaration was found
     */
    private function guessFromMainContent(string $content): ?string {
        // "charset" must be preceded by whitespace, a semicolon or a quote so that
        // it is an attribute/parameter of its own and not the tail of another name.
        // The captured value is required to be non-empty.
        $regex = '/<meta\b[^>]*[\s;"\']charset\s*=\s*["\']?([^"\'\s;>\/]+)/i';
        if (preg_match($regex, $content, $matches) === 1) {
            return $matches[1];
        }

        return null;
    }
}
11 changes: 11 additions & 0 deletions lib/Helper/HTMLFilter/HtmlEncodingFilter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

namespace OCA\Cookbook\Helper\HTMLFilter;

/**
 * HTML filter that puts an XML declaration naming UTF-8 as encoding in front
 * of the markup, unless the markup already starts with an XML declaration.
 */
class HtmlEncodingFilter extends AbstractHtmlFilter {
    /**
     * Prefix the markup with the UTF-8 declaration if it has no leading XML declaration
     *
     * @param string $html The markup, which is modified in place
     */
    public function apply(string &$html): void {
        $startsWithXmlDeclaration = strncmp($html, '<?xml', 5) === 0;
        if ($startsWithXmlDeclaration) {
            // Nothing to do, the existing declaration is kept untouched
            return;
        }

        $html = '<?xml encoding="UTF-8">' . $html;
    }
}
41 changes: 32 additions & 9 deletions lib/Service/HtmlDownloadService.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
namespace OCA\Cookbook\Service;

use DOMDocument;
use OCA\Cookbook\Exception\CouldNotGuessEncodingException;
use OCA\Cookbook\Exception\ImportException;
use OCA\Cookbook\Exception\NoDownloadWasCarriedOutException;
use OCA\Cookbook\Helper\DownloadEncodingHelper;
use OCA\Cookbook\Helper\DownloadHelper;
use OCA\Cookbook\Helper\EncodingGuessingHelper;
use OCA\Cookbook\Helper\HTMLFilter\AbstractHtmlFilter;
use OCA\Cookbook\Helper\HTMLFilter\HtmlEncodingFilter;
use OCA\Cookbook\Helper\HTMLFilter\HtmlEntityDecodeFilter;
use OCA\Cookbook\Helper\HtmlToDomParser;
use OCP\IL10N;
Expand All @@ -18,16 +22,14 @@ class HtmlDownloadService {
*/
private $htmlFilters;

/**
* @var ILogger
*/
private $logger;

/**
* @var IL10N
*/
private $l;

/** @var ILogger */
private $logger;

/**
* @var HtmlToDomParser
*/
Expand All @@ -36,23 +38,37 @@ class HtmlDownloadService {
/** @var DownloadHelper */
private $downloadHelper;

/** @var EncodingGuessingHelper */
private $encodingGuesser;

/** @var DownloadEncodingHelper */
private $downloadEncodingHelper;

/**
* @var DOMDocument
*/
private $dom;

public function __construct(
HtmlEntityDecodeFilter $htmlEntityDecodeFilter,
ILogger $logger,
HtmlEncodingFilter $htmlEncodingFilter,
IL10N $l10n,
ILogger $logger,
HtmlToDomParser $htmlParser,
DownloadHelper $downloadHelper
DownloadHelper $downloadHelper,
EncodingGuessingHelper $encodingGuesser,
DownloadEncodingHelper $downloadEncodingHelper
) {
$this->htmlFilters = [ $htmlEntityDecodeFilter ];
$this->logger = $logger;
$this->htmlFilters = [
$htmlEntityDecodeFilter,
$htmlEncodingFilter,
];
$this->l = $l10n;
$this->logger = $logger;
$this->htmlParser = $htmlParser;
$this->downloadHelper = $downloadHelper;
$this->encodingGuesser = $encodingGuesser;
$this->downloadEncodingHelper = $downloadEncodingHelper;
}

/**
Expand Down Expand Up @@ -118,6 +134,13 @@ private function fetchHtmlPage(string $url): string {

$html = $this->downloadHelper->getContent();

try {
$enc = $this->encodingGuesser->guessEncoding($html, $this->downloadHelper->getContentType());
$html = $this->downloadEncodingHelper->encodeToUTF8($html, $enc);
} catch (CouldNotGuessEncodingException $ex) {
$this->logger->notice($this->l->t('Could not find a valid encoding when parsing %s.', [$url]));
}

return $html;
}
}
1 change: 1 addition & 0 deletions tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.orig
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
abc�������
1 change: 1 addition & 0 deletions tests/Unit/Helper/DownloadEncodingHelper/iso-8859-1.utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
abcäöüßÄÖÜ
48 changes: 48 additions & 0 deletions tests/Unit/Helper/DownloadEncodingHelperTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<?php

namespace OCA\Cookbook\tests\Unit\Helper;

use OCA\Cookbook\Helper\DownloadEncodingHelper;
use PHPUnit\Framework\TestCase;

/**
 * Unit tests for the conversion of downloaded data to UTF-8.
 *
 * For every encoding listed in the data provider there is a pair of fixture
 * files in the folder DownloadEncodingHelper next to this file: one with the
 * extension .orig in the named encoding and one with the extension .utf8
 * holding the expected UTF-8 text.
 *
 * @covers OCA\Cookbook\Helper\DownloadEncodingHelper
 */
class DownloadEncodingHelperTest extends TestCase {
    /** @var DownloadEncodingHelper */
    private $dut;

    protected function setUp(): void {
        $this->dut = new DownloadEncodingHelper();
    }

    /**
     * @return array<string, array{0: string}> The encodings for which fixture files exist
     */
    public static function dpEncodings(): array {
        return [
            'iso-8859-1' => ['iso-8859-1'],
        ];
    }

    /**
     * @dataProvider dpEncodings
     * @param mixed $encoding
     */
    public function testEncodeToUTF8($encoding): void {
        $unencoded = $this->readFixture("$encoding.orig");
        $encoded = $this->readFixture("$encoding.utf8");

        $testEncoded = $this->dut->encodeToUTF8($unencoded, $encoding);
        // Byte-exact comparison: the result must match the fixture without any loose type juggling
        $this->assertSame($encoded, $testEncoded);
    }

    /**
     * Converting text that is already UTF-8 must not alter it.
     */
    public function testEncodeUTF8ToUTF8(): void {
        $testString = '';
        foreach (self::dpEncodings() as $enc) {
            $testString .= $this->readFixture("{$enc[0]}.utf8");
        }

        $this->assertSame($testString, $this->dut->encodeToUTF8($testString, 'utf-8'));
    }

    /**
     * Read a fixture file and fail the test early if it cannot be read
     *
     * @param string $name The file name inside the fixture folder
     * @return string The raw content of the fixture
     */
    private function readFixture(string $name): string {
        $content = file_get_contents(__DIR__ . "/DownloadEncodingHelper/$name");
        $this->assertNotFalse($content, "The fixture $name could not be read.");

        return $content;
    }
}
75 changes: 75 additions & 0 deletions tests/Unit/Helper/EncodeingGuessingHelperTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<?php

namespace OCA\Cookbook\tests\Unit\Helper;

use OCA\Cookbook\Exception\CouldNotGuessEncodingException;
use OCA\Cookbook\Helper\EncodingGuessingHelper;
use OCP\IL10N;
use PHPUnit\Framework\MockObject\Stub;
use PHPUnit\Framework\TestCase;

/**
 * Unit tests for guessing the encoding of a download from its Content-Type
 * header and from its content.
 *
 * @covers OCA\Cookbook\Helper\EncodingGuessingHelper
 * @covers OCA\Cookbook\Exception\CouldNotGuessEncodingException
 */
class EncodingGuessingHelperTest extends TestCase {
    /** @var EncodingGuessingHelper */
    private $dut;

    protected function setUp(): void {
        /** @var IL10N|Stub $l */
        $l = $this->createStub(IL10N::class);
        // The translation stub hands back the untranslated message
        $l->method('t')->willReturnArgument(0);

        $this->dut = new EncodingGuessingHelper($l);
    }

    /**
     * @return array<string, array{0: string, 1: string}> Pairs of Content-Type header and expected encoding
     */
    public static function dpPureContentType(): array {
        return [
            'charset only' => ['text/text; charset=utf-8', 'utf-8'],
            'charset last with spaces' => ['text/text; boundary=foo ; charset=UTF-16', 'UTF-16'],
            'charset in the middle' => ['text/text;boundary=foo;charset=iso-8859-1;param=value', 'iso-8859-1'],
        ];
    }

    /**
     * @dataProvider dpPureContentType
     * @param mixed $ct
     * @param mixed $enc
     */
    public function testGuessEncodingFromContentType($ct, $enc): void {
        $this->assertSame($enc, $this->dut->guessEncoding('', $ct));
    }

    /**
     * @return array<string, array{0: string, 1: string}> Pairs of fixture file name and expected encoding
     */
    public static function dpPureContent(): array {
        return [
            'contentA.html' => ['contentA.html', 'iso-8859-1'],
            'contentB.html' => ['contentB.html', 'UTF-16'],
        ];
    }

    /**
     * @dataProvider dpPureContent
     * @param mixed $filename
     * @param mixed $enc
     */
    public function testGuessEncodingFromContentNoContentType($filename, $enc): void {
        $content = $this->readFixture($filename);
        $this->assertSame($enc, $this->dut->guessEncoding($content, null));
    }

    public function testGuessEncodingNoEncoding(): void {
        $this->expectException(CouldNotGuessEncodingException::class);
        $this->dut->guessEncoding('Some text', 'text/text;boundary=foo');
    }

    /**
     * The header must win over the encoding declared in the content.
     *
     * @dataProvider dpPureContentType
     * @param mixed $ct
     * @param mixed $enc
     */
    public function testGuessEncodingBothPresent($ct, $enc): void {
        $content = $this->readFixture('contentA.html');
        $this->assertSame($enc, $this->dut->guessEncoding($content, $ct));
    }

    /**
     * Read a fixture file and fail the test early if it cannot be read
     *
     * @param string $name The file name inside the fixture folder
     * @return string The raw content of the fixture
     */
    private function readFixture(string $name): string {
        $content = file_get_contents(__DIR__ . "/EncodingGuessingHelper/$name");
        $this->assertNotFalse($content, "The fixture $name could not be read.");

        return $content;
    }
}
3 changes: 3 additions & 0 deletions tests/Unit/Helper/EncodingGuessingHelper/contentA.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html>
<head><meta charset="iso-8859-1"></head><body></body>
</html>
3 changes: 3 additions & 0 deletions tests/Unit/Helper/EncodingGuessingHelper/contentB.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html>
<head><meta charset="UTF-16"></head><body></body>
</html>
Loading