Skip to content

Commit

Permalink
[Multi] Minor improvements for my bridges (#1507)
Browse files Browse the repository at this point in the history
* [DarkReading] Hide dummy articles

* [FuturaSciences] Strip inline scripts from content

* [FeedExpander] Fix PHP notice on missing uri field

Condition changed from (guid is a valid URI AND item uri is not valid)
 to (guid is a valid URI AND item uri is empty or not valid)

* [NextInpact] Fix subtitle extraction

* [Markdown] Fix images with empty replacement text

* [TheHackerNews] Fix Author name cleanup

* [LeMondeInformatique] Remove encoding conversion

Was previously needed because the actual encoding of the page
was inconsistent with the encoding specified in its <meta> tag

* [AnimeUltime] Remove encoding conversion

Was previously needed because the encoding on the page was incorrect

* [FuturaSciences] Fix content extraction

* [FuturaSciences] Fix unneeded unset()

* [GBAtemp] Fix tutorial mode URL extraction

* [GBAtemp] Fix tutorial mode Title extraction
  • Loading branch information
em92 authored Aug 14, 2020
2 parents dc36b42 + c642652 commit 4b8c3b9
Show file tree
Hide file tree
Showing 8 changed files with 12 additions and 9 deletions.
1 change: 0 additions & 1 deletion bridges/AnimeUltimeBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ public function collectData(){
$item_description = defaultLinkTo($item_description, self::URI);
$item_description = str_replace("\r", '', $item_description);
$item_description = str_replace("\n", '', $item_description);
$item_description = utf8_encode($item_description);

//Build and add final item
$item = array();
Expand Down
2 changes: 2 additions & 0 deletions bridges/DarkReadingBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ public function collectData(){

protected function parseItem($newsItem){
$item = parent::parseItem($newsItem);
if (empty($item['content']))
return null; //ignore dummy articles
$article = getSimpleHTMLDOMCached($item['uri'])
or returnServerError('Could not request Dark Reading: ' . $item['uri']);
$item['content'] = $this->extractArticleContent($article);
Expand Down
3 changes: 2 additions & 1 deletion bridges/FuturaSciencesBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ protected function parseItem($newsItem){
}

private function extractArticleContent($article){
$contents = $article->find('section.article-text-classic', 0)->innertext;
$contents = $article->find('section.article-text', 1)->innertext;
$headline = trim($article->find('p.description', 0)->plaintext);
if(!empty($headline))
$headline = '<p><b>' . $headline . '</b></p>';
Expand Down Expand Up @@ -129,6 +129,7 @@ private function extractArticleContent($article){
$contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"');
$contents = StripWithDelimiters($contents, '<section class="module-toretain module-propal-nl', '</section>');
$contents = stripWithDelimiters($contents, '<script ', '</script>');
$contents = stripWithDelimiters($contents, '<script>', '</script>');

return $headline . trim($contents);
}
Expand Down
4 changes: 2 additions & 2 deletions bridges/GBAtempBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ public function collectData(){
break;
case 'T':
foreach($html->find('li.portal-tutorial') as $tutorialItem) {
$url = self::URI . $tutorialItem->find('a', 0)->href;
$title = $tutorialItem->find('a', 0)->plaintext;
$url = self::URI . $tutorialItem->find('a', 1)->href;
$title = $tutorialItem->find('a', 1)->plaintext;
$time = $this->findItemDate($tutorialItem);
$author = $tutorialItem->find('a.username', 0)->plaintext;
$content = $this->fetchPostContent($url, self::URI);
Expand Down
4 changes: 2 additions & 2 deletions bridges/LeMondeInformatiqueBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ protected function parseItem($newsItem){

//No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail
$content_node = $article_html->find('div.col-primary, div.col-sm-9', 0);
$item['content'] = utf8_encode($this->cleanArticle($content_node->innertext));
$item['author'] = utf8_encode($article_html->find('div.author-infos', 0)->find('b', 0)->plaintext);
$item['content'] = $this->cleanArticle($content_node->innertext);
$item['author'] = $article_html->find('div.author-infos', 0)->find('b', 0)->plaintext;

return $item;
}
Expand Down
1 change: 1 addition & 0 deletions bridges/TheHackerNewsBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ public function collectData(){

$article_url = $element->find('a.story-link', 0)->href;
$article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext);
$article_author = str_replace('&#59396;', '', $article_author);
$article_title = $element->find('h2.home-title', 0)->plaintext;

//Date without time
Expand Down
2 changes: 1 addition & 1 deletion lib/FeedExpander.php
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ protected function parseRSS_2_0_Item($feedItem){
if($attribute === 'isPermaLink'
&& ($value === 'true' || (
filter_var($feedItem->guid, FILTER_VALIDATE_URL)
&& !filter_var($item['uri'], FILTER_VALIDATE_URL)
&& (empty($item['uri']) || !filter_var($item['uri'], FILTER_VALIDATE_URL))
)
)
) {
Expand Down
4 changes: 2 additions & 2 deletions lib/html.php
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,15 @@ function markdownToHtml($string) {

//For more details about how these regex work:
// https://github.com/RSS-Bridge/rss-bridge/pull/802#discussion_r216138702
// Images: https://regex101.com/r/JW9Evr/1
// Images: https://regex101.com/r/JW9Evr/2
// Links: https://regex101.com/r/eRGVe7/1
// Bold: https://regex101.com/r/2p40Y0/1
// Italic: https://regex101.com/r/xJkET9/1
// Separator: https://regex101.com/r/ZBEqFP/1
// Plain URL: https://regex101.com/r/2JHYwb/1
// Site name: https://regex101.com/r/qIuKYE/1

$string = preg_replace('/\!\[([^\]]+)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
$string = preg_replace('/\!\[([^\]]*)\]\(([^\) ]+)(?: [^\)]+)?\)/', '<img src="$2" alt="$1" />', $string);
$string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '<a href="$2">$1</a>', $string);
$string = preg_replace('/\*\*(.*)\*\*/U', '<b>$1</b>', $string);
$string = preg_replace('/\*(.*)\*/U', '<i>$1</i>', $string);
Expand Down

0 comments on commit 4b8c3b9

Please sign in to comment.