Skip to content

Commit

Permalink
fixes #106, adding epub extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
David Bashford committed Aug 3, 2018
1 parent b27de26 commit 8951230
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 4 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ A text extraction node module.
* HTML, HTM
* ATOM, RSS
* Markdown
* EPUB
* XML, XSL
* PDF
* DOC, DOCX
Expand Down Expand Up @@ -170,11 +171,12 @@ textract.fromUrl(url, config, function( error, text ) {})

## Release Notes

### 2.3.1 (pending)
### 2.4.0 (pending)
* [#164](https://github.com/dbashford/textract/issues/164). Fixed issue with extra text nodes in odt/ott extraction.
* [#156](https://github.com/dbashford/textract/issues/156). Introduced `preserveOnlyMultipleLineBreaks` feature.
* [#149](https://github.com/dbashford/textract/issues/149). RTF extraction error fixed by [#166](https://github.com/dbashford/textract/pull/166).
* [#145](https://github.com/dbashford/textract/issues/145). Handling Japanese full-width characters.
* [#106](https://github.com/dbashford/textract/issues/106). Now extracting `.epub`

156

Expand Down
48 changes: 48 additions & 0 deletions lib/extractors/epub.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
var EPub = require( 'epub2/node' )
, htmlExtract = require( './html' )
;

/**
 * Extracts the text of an EPUB file.
 *
 * Parses the file with EPub, fetches the raw markup of every entry in
 * `epub.flow`, runs each one through the html extractor, and hands the
 * concatenated result (no separator between chapters) to `cb`.
 *
 * `cb` is invoked exactly once: with the first error encountered, or with the
 * combined text once every chapter has been processed.
 *
 * @param {string} filePath - Path of the epub file to read.
 * @param {Object} options - Passed through unchanged to the html extractor.
 * @param {function(?Error, ?string)} cb - Receives `( error, null )` on
 *   failure or `( null, text )` on success.
 */
function extractText( filePath, options, cb ) {
  var epub = new EPub( filePath )
    , finished = false
    ;

  // Single exit point so the caller's callback can never fire twice, even
  // when several in-flight chapters fail or a late success follows an error.
  function finish( error, text ) {
    if ( finished ) {
      return;
    }
    finished = true;
    cb( error, text );
  }

  // An emitter without an 'error' listener throws the emitted error as an
  // uncaught exception; route parse failures to the callback instead.
  epub.on( 'error', function( parseError ) {
    finish( parseError, null );
  });

  epub.on( 'end', function() {
    var chapters = epub.flow || []
      , chapterTexts = new Array( chapters.length )
      , remaining = chapters.length
      ;

    // Nothing to read: report an empty result rather than never calling back.
    if ( remaining === 0 ) {
      finish( null, '' );
      return;
    }

    chapters.forEach( function( chapter, index ) {
      // if already finished (i.e. an error was reported), don't start more work
      if ( finished ) {
        return;
      }

      // Get the chapter text
      epub.getChapterRaw( chapter.id, function( rawChapterError, text ) {
        if ( finished ) {
          return;
        }
        if ( rawChapterError ) {
          finish( rawChapterError, null );
          return;
        }

        // Extract the raw text from the chapter text (it's html)
        htmlExtract.extractFromText( text, options, function( htmlExtractError, outText ) {
          if ( finished ) {
            return;
          }
          if ( htmlExtractError ) {
            finish( htmlExtractError, null );
            return;
          }

          // Store by position so the output follows the order of epub.flow
          // regardless of the order in which the callbacks complete.
          chapterTexts[index] = outText;
          remaining--;
          if ( remaining === 0 ) {
            finish( null, chapterTexts.join( '' ) );
          }
        });
      });
    });
  });

  epub.parse();
}

// Public interface of this extractor module.
module.exports = {
// MIME types this extractor declares it can handle.
// NOTE(review): presumably matched against the detected file type by the
// extractor loader, which is not visible here -- confirm.
types: ['application/epub+zip'],
// Entry point with the signature ( filePath, options, cb ).
extract: extractText
};
8 changes: 5 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "textract",
"version": "2.3.1",
"version": "2.4.0",
"homepage": "https://github.com/dbashford/textract",
"author": "David Bashford",
"description": "Extracting text from files of various type including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various open office.",
Expand Down Expand Up @@ -43,7 +43,8 @@
"ots",
"potx",
"odg",
"otg"
"otg",
"epub"
],
"dependencies": {
"mime": "2.2.0",
Expand All @@ -58,7 +59,8 @@
"html-entities": "1.2.0",
"iconv-lite": "0.4.15",
"jschardet": "1.4.1",
"yauzl": "2.7.0"
"yauzl": "2.7.0",
"epub2": "1.3.4"
},
"devDependencies": {
"chai": "1.5.0",
Expand Down
26 changes: 26 additions & 0 deletions test/extract_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,32 @@ describe( 'textract', function() {
});
});

describe( 'for .epub files', function() {
  // Both cases read the same fixture; only the extraction options differ.
  var epubFixture = path.join( __dirname, 'files', 'Metamorphosis-jackson.epub' );

  // Default options: the whole book comes back as one whitespace-collapsed string.
  it( 'will extract text from epub files', function( done ) {
    this.timeout( 5000 );
    fromFileWithPath( epubFixture, function( err, extracted ) {
      var excerpt;
      expect( err ).to.be.null;
      expect( extracted ).to.be.an( 'string' );
      expect( extracted.length ).to.eql( 119329 );
      excerpt = extracted.substring( 3000, 3500 );
      expect( excerpt ).to.eql( "dboard so that he could lift his head better; found where the itch was, and saw that it was covered with lots of little white spots which he didn't know what to make of; and when he tried to feel the place with one of his legs he drew it quickly back because as soon as he touched it he was overcome by a cold shudder. He slid back into his former position. \"Getting up early all the time\", he thought, \"it makes you stupid. You've got to get enough sleep. Other travelling salesmen live a life of lu" );
      done();
    });
  });

  // With preserveLineBreaks the output keeps newlines, so both the total
  // length and the sampled window differ from the default case.
  it( 'will extract text from epub files and preserve line breaks', function( done ) {
    var extractionOptions = { preserveLineBreaks: true };
    this.timeout( 5000 );
    fromFileWithPath( epubFixture, extractionOptions, function( err, extracted ) {
      var excerpt;
      expect( err ).to.be.null;
      expect( extracted ).to.be.an( 'string' );
      expect( extracted.length ).to.eql( 119342 );
      excerpt = extracted.substring( 3000, 3500 );
      expect( excerpt ).to.eql( "rds the headboard so that he could lift his head better; found where the itch was, and saw that it was covered with lots of little white spots which he didn't know what to make of; and when he tried to feel the place with one of his legs he drew it quickly back because as soon as he touched it he was overcome by a cold shudder.\nHe slid back into his former position. \"Getting up early all the time\", he thought, \"it makes you stupid. You've got to get enough sleep. Other travelling salesmen live a" );
      done();
    });
  });
});

describe( 'for .atom files', function() {
it( 'will extract text from atom files', function( done ) {
var docPath = path.join( __dirname, 'files', 'atom.atom' );
Expand Down
Binary file added test/files/Metamorphosis-jackson.epub
Binary file not shown.
31 changes: 31 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ adler-32@:
exit-on-epipe "~1.0.1"
printj "~1.1.0"

adm-zip@^0.4.4:
version "0.4.11"
resolved "https://registry.npmjs.org/adm-zip/-/adm-zip-0.4.11.tgz#2aa54c84c4b01a9d0fb89bb11982a51f13e3d62a"

ajv-keywords@^1.0.0:
version "1.5.1"
resolved "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-1.5.1.tgz#314dd0a4b3368fad3dfcdc54ede6171b886daf3c"
Expand Down Expand Up @@ -82,6 +86,10 @@ balanced-match@^1.0.0:
version "1.0.0"
resolved "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz#89b4d199ab2bee49de164ea02b89ce462d71b767"

bluebird@^3.5.1:
version "3.5.1"
resolved "https://registry.npmjs.org/bluebird/-/bluebird-3.5.1.tgz#d9551f9de98f1fcda1e683d17ee91a0602ee2eb9"

boolbase@~1.0.0:
version "1.0.0"
resolved "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
Expand Down Expand Up @@ -374,6 +382,14 @@ entities@^1.1.1, entities@~1.1.1:
version "1.1.1"
resolved "https://registry.npmjs.org/entities/-/entities-1.1.1.tgz#6e5c2d0a5621b5dadaecef80b90edfb5cd7772f0"

[email protected]:
version "1.3.4"
resolved "https://registry.npmjs.org/epub2/-/epub2-1.3.4.tgz#711fa98f07a99e3dbaba6878cc53b182ca42d436"
dependencies:
adm-zip "^0.4.4"
bluebird "^3.5.1"
xml2js "^0.4.4"

error-ex@^1.2.0:
version "1.3.1"
resolved "https://registry.npmjs.org/error-ex/-/error-ex-1.3.1.tgz#f855a86ce61adc4e8621c3cda21e7a7612c3a8dc"
Expand Down Expand Up @@ -1309,6 +1325,10 @@ safe-buffer@~5.1.0, safe-buffer@~5.1.1:
version "5.1.1"
resolved "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.1.tgz#893312af69b2123def71f57889001671eeb2c853"

sax@>=0.6.0:
version "1.2.4"
resolved "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"

"semver@2 || 3 || 4 || 5":
version "5.5.0"
resolved "https://registry.npmjs.org/semver/-/semver-5.5.0.tgz#dc4bbc7a6ca9d916dee5d43516f0092b58f7b8ab"
Expand Down Expand Up @@ -1516,6 +1536,17 @@ xlsx@~0.7.11:
jszip "2.4.0"
ssf "~0.8.1"

xml2js@^0.4.4:
version "0.4.19"
resolved "https://registry.npmjs.org/xml2js/-/xml2js-0.4.19.tgz#686c20f213209e94abf0d1bcf1efaa291c7827a7"
dependencies:
sax ">=0.6.0"
xmlbuilder "~9.0.1"

xmlbuilder@~9.0.1:
version "9.0.7"
resolved "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-9.0.7.tgz#132ee63d2ec5565c557e20f4c22df9aca686b10d"

[email protected]:
version "0.1.27"
resolved "https://registry.npmjs.org/xmldom/-/xmldom-0.1.27.tgz#d501f97b3bdb403af8ef9ecc20573187aadac0e9"
Expand Down

0 comments on commit 8951230

Please sign in to comment.