Skip to content

Commit

Permalink
Return longest text after failing to detect text longer than the configured value (#423)
Browse files Browse the repository at this point in the history

Save extracted text across attempts and return the longest one when all attempts fail, and add a test case from hukumusume
  • Loading branch information
andreskrey authored and gijsk committed Feb 27, 2018
1 parent 264b8e8 commit 834672e
Show file tree
Hide file tree
Showing 4 changed files with 362 additions and 3 deletions.
28 changes: 25 additions & 3 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ function Readability(uri, doc, options) {
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
this._attempts = [];

// Configurable options
this._debug = !!options.debug;
Expand Down Expand Up @@ -1083,24 +1084,45 @@ Readability.prototype = {
if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML);

var parseSuccessful = true;

// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
var textLength = this._getInnerText(articleContent, true).length;
if (textLength < this._wordThreshold) {
parseSuccessful = false;
page.innerHTML = pageCacheHtml;

if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
this._attempts.push({articleContent: articleContent, textLength: textLength});
} else {
return null;
this._attempts.push({articleContent: articleContent, textLength: textLength});
// No luck after removing flags, just return the longest text we found during the different loops
this._attempts.sort(function (a, b) {
return a.textLength < b.textLength;
});

// But first check if we actually have something
if (!this._attempts[0].textLength) {
return null;
}

articleContent = this._attempts[0].articleContent;
parseSuccessful = true;
}
} else {
}

if (parseSuccessful) {
// Find out text direction from ancestors of final top candidate.
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
this._someNode(ancestors, function(ancestor) {
Expand Down
7 changes: 7 additions & 0 deletions test/test-pages/hukumusume/expected-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"title": "欲張りなイヌ <福娘童話集 きょうのイソップ童話>",
"byline": null,
"dir": null,
"excerpt": "福娘童話集 > きょうのイソップ童話 > 1月のイソップ童話 > 欲張りなイヌ",
"readerable": true
}
57 changes: 57 additions & 0 deletions test/test-pages/hukumusume/expected.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<div id="readability-page-1" class="page"><div width="619">
<p> <a href="http://fakehost/test/../../../index.html">福娘童話集</a> > <a href="http://fakehost/test/../index.html">きょうのイソップ童話</a> &gt; <a href="http://fakehost/test/../itiran/01gatu.htm">1月のイソップ童話</a> &gt; 欲張りなイヌ</p>
<p>
<span color="#FF0000" size="+2">元旦のイソップ童話</span></p><p> <img src="http://fakehost/test/../../../gazou/pc_gazou/aesop/aesop052.jpg" alt="よくばりなイヌ" width="480" height="360"></img></p><p> 欲張りなイヌ</p><p> <a href="http://hukumusume.com/douwa/English/aesop/01/01_j.html">ひらがな</a> ←→ <a href="http://hukumusume.com/douwa/English/aesop/01/01_j&amp;E.html">日本語・英語</a> ←→ <a href="http://hukumusume.com/douwa/English/aesop/01/01_E.html">English</a></p>
<table>
<tbody>
<tr>
<td>
<table>
<tbody>
<tr>
<td><img src="http://fakehost/test/../../../../366/logo_bana/corner_1.gif" width="7" height="7"></img></td>
<td>
<span color="#FF0000"><b>おりがみをつくろう</b></span>
</td>
<td>
<span size="-1">(<a href="http://www.origami-club.com/index.html"> おりがみくらぶ</a> より)</span>
</td>
<td><img src="http://fakehost/test/../../../../366/logo_bana/corner_2.gif" width="7" height="7"></img></td>
</tr>
<tr>
<td colspan="4">
<table>
<tbody>
<tr>
<td>
<span size="+2"><a href="http://www.origami-club.com/easy/dogfase/index.html"><img src="http://fakehost/test/../../../gazou/origami_gazou/kantan/dogface.gif" alt="犬の顔の折り紙" width="73" height="51"></img>いぬのかお</a></span>   <a href="http://www.origami-club.com/easy/dog/index.html"><img src="http://fakehost/test/../../../gazou/origami_gazou/kantan/dog.gif" alt="犬の顔の紙" width="62" height="43"></img><span size="+2">いぬ</span></a></td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
<table>
<tbody>
<tr>
<td>♪音声配信(html5)</td>
</tr>
<tr>
<td><audio src="http://ohanashi2.up.seesaa.net/mp3/ae_0101.mp3" controls=""></audio></td>
</tr>
<tr>
<td>
<span size="-1"><a href="http://www.voiceblog.jp/onokuboaki/">亜姫の朗読☆ イソップ童話より</a></span>
</td>
</tr>
</tbody>
</table>
<p>  肉をくわえたイヌが、橋を渡っていました。 ふと下を見ると、川の中にも肉をくわえたイヌがいます。 イヌはそれを見て、思いました。(あいつの肉の方が、大きそうだ)  イヌは、くやしくてたまりません。 (そうだ、あいつをおどかして、あの肉を取ってやろう)  そこでイヌは、川の中のイヌに向かって思いっきり吠えました。 「ウゥー、ワン!!」  そのとたん、くわえていた肉はポチャンと川の中に落ちてしまいました。 「ああー、ぁぁー」  川の中には、がっかりしたイヌの顔がうつっています。  さっきの川の中のイヌは、水にうつった自分の顔だったのです。  同じ物を持っていても、人が持っている物の方が良く見え、また、欲張るとけっきょく損をするというお話しです。</p>
<p>おしまい</p>
<p> </p>
</div></div>
Loading

0 comments on commit 834672e

Please sign in to comment.