Skip to content

Commit

Permalink
Allow node.js URL object to be passed to fromUrl (#102)
Browse files Browse the repository at this point in the history
* updating fromURL to allow node URL object which allows raw access to URL being used, fixes #97
  • Loading branch information
dbashford authored Nov 1, 2016
1 parent bbd278d commit ac4f799
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 147 deletions.
7 changes: 6 additions & 1 deletion .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
"prefer-template": 0,
"prefer-arrow-callback": 0,
"prefer-rest-params": 0
},
"env": {
"mocha": true
},
"globals": {
"expect": true
}

}
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ textract.fromBufferWithName(name, buffer, config, function( error, text ) {})

##### URL

When passing a URL, the URL can be either a string or a [node.js URL object](https://nodejs.org/api/url.html). Using the URL object allows fine-grained control over the URL being used.

```javascript
textract.fromUrl(url, function( error, text ) {})
```
Expand All @@ -160,7 +162,7 @@ textract.fromUrl(url, config, function( error, text ) {})
* [#93](https://github.com/dbashford/textract/pull/93). PR added better error handling for `fromUrl` requests.
* [#95](https://github.com/dbashford/textract/pull/95). PR added support for monetary symbols.
* [#98](https://github.com/dbashford/textract/pull/98). PR shortened needlessly long file paths for temp files.
* [#101](https://github.com/dbashford/textract/pull/95). PR added UTF-8 support for antiword requests.
* [#101](https://github.com/dbashford/textract/pull/101). PR added UTF-8 support for antiword requests.

### 2.0.0
* Codebase is now properly eslinted.
Expand Down
12 changes: 8 additions & 4 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -116,19 +116,23 @@ function fromBufferWithName( filePath, bufferContent, options, cb ) {
}

function fromUrl( url, options, cb ) {
var urlNoQueryParams, extname, filePath, fullFilePath, file;
var urlNoQueryParams, extname, filePath, fullFilePath, file, href;

if ( typeof url === 'string' ) {
// allow url to be either a string or to be a
// Node URL Object: https://nodejs.org/api/url.html
href = ( typeof url === 'string' ) ? url : url.href;

if ( href ) {
options = options || {};
urlNoQueryParams = url.split( '?' )[0];
urlNoQueryParams = href.split( '?' )[0];
extname = path.extname( urlNoQueryParams );
filePath = _genRandom() + extname;
fullFilePath = path.join( tmpDir, filePath );
file = fs.createWriteStream( fullFilePath );
file.on( 'finish', function() {
fromFileWithPath( fullFilePath, options, cb );
});

got( url )
.on( 'response', function( response ) {
// allows for overriding by the developer or automatically
Expand Down
300 changes: 159 additions & 141 deletions test/url_test.js
Original file line number Diff line number Diff line change
@@ -1,142 +1,160 @@
var path = require("path");

describe("fromUrl tests", function() {
this.timeout(3000);

it("will properly extract files from sites with extensions that are misleading", function(done) {
var url = "http://apps.leg.wa.gov/billinfo/summary.aspx?bill=1276";
fromUrl(url, function(error, text) {
expect(error).to.be.null;
expect(text).to.be.an('string');
expect(text.substring(0, 100)).to.eql(" HB 1276 - 2015-16 Test 1 \" Test 2 \" Test 3 Legislature Home \" Senate \" House of Representatives \" C");
done();
});
});
/* eslint-disable max-len, no-unused-expressions */
/* global fromUrl */

var nodeUrl = require( 'url' );

describe( 'fromUrl tests', function() {
var test;

this.timeout( 3000 );

var test = function(ext, name, _text) {
it('will ' + ext + ' files', function(done) {
var url = "https://cdn.rawgit.com/dbashford/textract/master/test/files/" + name + "?raw=true";
fromUrl(url, function(error, text) {
expect(error).to.be.null;
expect(text).to.be.an('string');
expect(text.substring(0, 100)).to.eql(_text);
done();
});
});
};

test(
"doc",
"doc.doc",
" Word Specification Sample Working Draft 04, 16 August 2002 Document identifier: wd-spectools-word-s"
);

test(
"xls",
"test.xls",
"This,is,a,spreadsheet,yay! "
);

test(
"xlsx",
"pi.xlsx",
'This is the value of PI:,3.141592 '
);

test(
"pdf",
"pdf.pdf",
"This is a test. Please ignore."
);

test(
"docx",
"docx.docx",
"This is a test Just so you know: Lorem ipsum dolor sit amet, consecutuer adipiscing elit, sed diam n"
);

test(
"text/*",
"txt.txt",
"This is a plain old text file."
);

test(
"pptx",
"ppt.pptx",
"This is some title Text And a sub-title Text in Lists Bullet 1 Bullet 2 Bullet 3 Number 1 Number 2 N"
);

test(
"markdown",
"test.md",
" This is an h1 This is an h2 This text has been bolded and italicized "
);

test(
"ods",
"ods.ods",
"This,is,a,ods Really,it,is, I,promise,, "
);

test(
"xml",
"xml.xml",
" Empire Burlesque Bob Dylan USA Columbia 10.90 1985 Hide your heart Bonnie Tyler UK CBS Records 9.90"
);

test(
"odt",
"odt.odt",
"This is an ODT THIS IS A HEADING More ODT"
);

test(
"potx",
"potx.potx",
"This is a potx template Yep, a potx I had no idea These were even a thing "
);

test(
"xltx",
"xltx.xltx",
",,,,,, Packing Slip ,Your Company Name,,,,\"July 24, 2015\", , Your Company Slogan,,,,, ,,,,,, ,Addres"
);

test(
"ott",
"ott.ott",
"This is a document template, yay templates! Woo templates get me so excited! Woo templates get me so"
);

test(
"ots",
"ots.ots",
"This,is , template, an,open,office,template isn't,it,awesome?, you,know,it,is "
);

test(
"odg",
"odg.odg",
"This is a drawing? A drawing, a drawing! This is a drawing, Aren't you mad envious?"
);

test(
"otg",
"otg.otg",
"This is a drawing template A drawing template. Who would really ever need to extract from one of the"
);

test(
"odp",
"odp.odp",
"This is a title This is a slide's text This is a 2nd page And a 2nd page's content"
);

test(
"otp",
"otp.otp",
"This is a template title Template page text 2nd prezo text"
);

});
// The URL path ends in ".aspx" and carries a query string, so the extension
// is not one of the document types exercised elsewhere in this suite; the
// test expects extraction to succeed regardless.
// NOTE(review): depends on a live third-party site returning this exact text.
it( 'will properly extract files from sites with extensions that are misleading', function( done ) {
var url = 'http://apps.leg.wa.gov/billinfo/summary.aspx?bill=1276';
fromUrl( url, function( error, text ) {
expect( error ).to.be.null;
expect( text ).to.be.an( 'string' );
// only the first 100 characters of the extracted text are compared
expect( text.substring( 0, 100 ) ).to.eql(
' HB 1276 - 2015-16 Test 1 " Test 2 " Test 3 Legislature Home " Senate " House of Representatives " C' );
done();
});
});

// Passes fromUrl the object returned by nodeUrl.parse() instead of a plain
// string, and expects the same extracted text as the string-based 'doc' case
// that targets the same doc.doc fixture.
it( 'take object URL', function( done ) {
var url = 'https://cdn.rawgit.com/dbashford/textract/master/test/files/doc.doc?raw=true'
, urlObj = nodeUrl.parse( url )
;

fromUrl( urlObj, function( error, text ) {
expect( error ).to.be.null;
expect( text ).to.be.an( 'string' );
// only the first 100 characters of the extracted text are compared
expect( text.substring( 0, 100 ) ).to.eql( 'Word Specification Sample Working Draft 04, 16 August 2002 Document identifier: wd-spectools-word-sa' );
done();
});
});

/**
 * Registers one mocha case that fetches a fixture file by URL (passed to
 * fromUrl as a string) and checks the start of the extracted text.
 *
 * @param {string} ext - label used only to build the test title
 * @param {string} name - fixture file name appended to the rawgit base URL
 * @param {string} _text - expected value of the first 100 characters (or
 *   fewer, if the extracted text is shorter) of the result
 */
test = function( ext, name, _text ) {
it( 'will ' + ext + ' files', function( done ) {
// '?raw=true' is appended to every fixture URL, so each request carries a query string
var url = 'https://cdn.rawgit.com/dbashford/textract/master/test/files/' + name + '?raw=true';
fromUrl( url, function( error, text ) {
expect( error ).to.be.null;
expect( text ).to.be.an( 'string' );
expect( text.substring( 0, 100 ) ).to.eql( _text );
done();
});
});
};

test(
'doc',
'doc.doc',
'Word Specification Sample Working Draft 04, 16 August 2002 Document identifier: wd-spectools-word-sa'
);

test(
'xls',
'test.xls',
'This,is,a,spreadsheet,yay! '
);

test(
'xlsx',
'pi.xlsx',
'This is the value of PI:,3.141592 '
);

test(
'pdf',
'pdf.pdf',
'This is a test. Please ignore.'
);

test(
'docx',
'docx.docx',
'This is a test Just so you know: Lorem ipsum dolor sit amet, consecutuer adipiscing elit, sed diam n'
);

test(
'text/*',
'txt.txt',
'This is a plain old text file.'
);

test(
'pptx',
'ppt.pptx',
'This is some title Text And a sub-title Text in Lists Bullet 1 Bullet 2 Bullet 3 Number 1 Number 2 N'
);

test(
'markdown',
'test.md',
' This is an h1 This is an h2 This text has been bolded and italicized '
);

test(
'ods',
'ods.ods',
'This,is,a,ods Really,it,is, I,promise,, '
);

test(
'xml',
'xml.xml',
' Empire Burlesque Bob Dylan USA Columbia 10.90 1985 Hide your heart Bonnie Tyler UK CBS Records 9.90'
);

test(
'odt',
'odt.odt',
'This is an ODT THIS IS A HEADING More ODT'
);

test(
'potx',
'potx.potx',
'This is a potx template Yep, a potx I had no idea These were even a thing '
);

test(
'xltx',
'xltx.xltx',
',,,,,, Packing Slip ,Your Company Name,,,,"July 24, 2015", , Your Company Slogan,,,,, ,,,,,, ,Addres'
);

test(
'ott',
'ott.ott',
'This is a document template, yay templates! Woo templates get me so excited! Woo templates get me so'
);

test(
'ots',
'ots.ots',
"This,is , template, an,open,office,template isn't,it,awesome?, you,know,it,is "
);

test(
'odg',
'odg.odg',
"This is a drawing? A drawing, a drawing! This is a drawing, Aren't you mad envious?"
);

test(
'otg',
'otg.otg',
'This is a drawing template A drawing template. Who would really ever need to extract from one of the'
);

test(
'odp',
'odp.odp',
"This is a title This is a slide's text This is a 2nd page And a 2nd page's content"
);

test(
'otp',
'otp.otp',
'This is a template title Template page text 2nd prezo text'
);
});

0 comments on commit ac4f799

Please sign in to comment.