Skip to content

Commit

Permalink
Improve URL handling in JSDOMParser and Readability.js
Browse files Browse the repository at this point in the history
This change ups the required node version to 7.0 because it relies on the builtin url module.

We now pass a url when constructing a jsdom document or JSDOMParser document.
Because this is an API change, I'm increasing the package version.

Ultimately, I would like to remove the  argument from the readability constructor. It should
use the documentURI from the document it is passed.
  • Loading branch information
gijsk committed Feb 28, 2018
1 parent 834672e commit d598baf
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
language: node_js
sudo: false
node_js:
- '6.5'
- '7.0'
script:
- npm run lint
- npm run test
21 changes: 18 additions & 3 deletions JSDOMParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,8 @@
},
};

var Document = function () {
var Document = function (url) {
this.documentURI = url;
this.styleSheets = [];
this.childNodes = [];
this.children = [];
Expand Down Expand Up @@ -593,6 +594,20 @@
node.textContent = text;
return node;
},

get baseURI() {
if (!this.hasOwnProperty("_baseURI")) {
this._baseURI = this.documentURI;
var baseElements = this.getElementsByTagName("base");
var href = baseElements[0] && baseElements[0].getAttribute("href");
if (href) {
try {
this._baseURI = (new URL(href, this._baseURI)).href;
} catch (ex) {/* Just fall back to documentURI */}
}
}
return this._baseURI;
},
};

var Element = function (tag) {
Expand Down Expand Up @@ -1111,9 +1126,9 @@
/**
* Parses an HTML string and returns a JS implementation of the Document.
*/
parse: function (html) {
parse: function (html, url) {
this.html = html;
var doc = this.doc = new Document();
var doc = this.doc = new Document(url);
this.readChildren(doc);

// If this is an HTML document, remove root-level children except for the
Expand Down
2 changes: 2 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
var path = require("path");
var fs = require("fs");
var url = require("url");

// We want to load Readability and JSDOMParser, which aren't set up as commonjs libraries,
// and so we need to do some hocus-pocus with 'vm' to import them on a separate scope
Expand All @@ -14,6 +15,7 @@ var scopeContext = {};
// in the scope we're using:
scopeContext.dump = console.log;
scopeContext.console = console;
scopeContext.URL = url.URL;

// Actually load files. NB: if either of the files has parse errors,
// node is dumb and shows you a syntax error *at this callsite* . Don't try to find
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "readability",
"version": "0.0.1",
"version": "0.1.0",
"description": "A standalone version of the readability library used for Firefox Reader View.",
"main": "Readability.js",
"scripts": {
Expand All @@ -20,7 +20,7 @@
"url": "https://github.com/mozilla/readability/issues"
},
"engines": {
"node" : ">=6.5"
"node" : ">=7.0"
},
"homepage": "https://github.com/mozilla/readability",
"devDependencies": {
Expand Down
22 changes: 21 additions & 1 deletion test/test-jsdomparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ var BASETESTCASE = '<html><body><p>Some text and <a class="someclass" href="#">a
'<div id="foo">With a <script>With &lt; fancy " characters in it because' +
'</script> that is fun.<span>And another node to make it harder</span></div><form><input type="text"/><input type="number"/>Here\'s a form</form></body></html>';

var baseDoc = new JSDOMParser().parse(BASETESTCASE);
var baseDoc = new JSDOMParser().parse(BASETESTCASE, "http://fakehost/");

describe("Test JSDOM functionality", function() {
function nodeExpect(actual, expected) {
Expand Down Expand Up @@ -37,6 +37,11 @@ describe("Test JSDOM functionality", function() {

});

it("should have basic URI information", function() {
expect(baseDoc.documentURI, "http://fakehost/");
expect(baseDoc.baseURI, "http://fakehost/");
});

it("should deal with script tags", function() {
// Check our script parsing worked:
var scripts = baseDoc.getElementsByTagName("script");
Expand Down Expand Up @@ -300,3 +305,18 @@ describe("Recovery from self-closing tags that have close tags", function() {
expect(doc.firstChild.firstChild.firstChild.localName).eql("p");
});
});

describe("baseURI parsing", function() {
it("should handle various types of relative and absolute base URIs", function() {
function checkBase(base, expectedResult) {
var html = "<html><head><base href='" + base + "'></base></head><body/></html>";
var doc = new JSDOMParser().parse(html, "http://fakehost/some/dir/");
expect(doc.baseURI).eql(expectedResult);
}

checkBase("relative/path", "http://fakehost/some/dir/relative/path");
checkBase("/path", "http://fakehost/path");
checkBase("http://absolute/", "http://absolute/");
checkBase("//absolute/path", "http://absolute/path");
});
});
3 changes: 2 additions & 1 deletion test/test-readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ describe("Test pages", function() {

runTestsWithItems("jsdom", function(source) {
var doc = jsdom(source, {
url: uri.spec,
features: {
FetchExternalResources: false,
ProcessExternalResources: false
Expand All @@ -240,7 +241,7 @@ describe("Test pages", function() {

runTestsWithItems("JSDOMParser", function(source) {
var parser = new JSDOMParser();
var doc = parser.parse(source);
var doc = parser.parse(source, uri.spec);
if (parser.errorState) {
console.error("Parsing this DOM caused errors:", parser.errorState);
return null;
Expand Down

0 comments on commit d598baf

Please sign in to comment.