Skip to content

Commit

Permalink
Merge pull request #17 from yobix-ai/16-typeerror-parseerrorparse-err…
Browse files Browse the repository at this point in the history
…or-occurred-tika-198-illegal-ioexception-from-orgapachetikaparsermicrosoftooxmlooxmlparser281b1a01

fix: fixed issue 16 and added test case
  • Loading branch information
nmammeri authored Oct 30, 2024
2 parents 79a5d0c + 598a1b1 commit 1c42e0b
Show file tree
Hide file tree
Showing 9 changed files with 24 additions and 298 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/release_python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ jobs:
set -e
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --force-reinstall
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
cd bindings/extractous-python
pytest -s
Expand All @@ -87,7 +87,7 @@ jobs:
pip3 install -U pip pytest scikit-learn
run: |
set -e
pip3 install extractous --find-links bindings/extractous-python/dist --force-reinstall
pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
cd bindings/extractous-python
pytest -s
Expand Down Expand Up @@ -128,7 +128,7 @@ jobs:
run: |
python -m venv .venv
.venv\Scripts\activate.bat
pip install extractous --find-links bindings/extractous-python/dist --force-reinstall
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
cd bindings\extractous-python
pytest -s
Expand Down Expand Up @@ -178,7 +178,7 @@ jobs:
set -e
python3 -m venv .venv
source .venv/bin/activate
pip install extractous --find-links bindings/extractous-python/dist --force-reinstall
pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
pip install pytest scikit-learn
cd bindings/extractous-python
pytest -s
Expand Down
2 changes: 1 addition & 1 deletion bindings/extractous-python/build-inside-docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

if [[ $PWD =~ extractous/bindings/extractous-python$ ]]; then
ROOT_DIR=$(realpath "$PWD/../../")
docker build $PWD
docker build -t manylinux_2_28_graalvm $PWD
docker run --rm --mount type=bind,source=$ROOT_DIR,target=/workspace manylinux_2_28_graalvm bash /workspace/bindings/extractous-python/build-wheels.sh

# reset permissions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
]

@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
Expand Down
4 changes: 2 additions & 2 deletions extractous-core/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ fn gradle_build(
}

let gradlew = if cfg!(target_os = "windows") {
&tika_native_dir.join("gradlew.bat")
tika_native_dir.join("gradlew.bat")
} else {
&tika_native_dir.join("gradlew")
tika_native_dir.join("gradlew")
};

// Launch the gradle build
Expand Down
1 change: 1 addition & 0 deletions extractous-core/tests/extractor_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use textdistance::nstr::cosine;
#[test_case("simple.pptx", 0.9; "Test another PPTX file")]
#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
let extractor = Extractor::new().set_extract_string_max_length(1000000);
// extract file with extractor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,264 +20,6 @@
{
"name": "[Lsun.security.pkcs.SignerInfo;"
},
{
"allDeclaredClasses": true,
"name": "ai.yobix.TestFiles",
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"allDeclaredClasses": true,
"name": "ai.yobix.TestUtils",
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"allDeclaredClasses": true,
"allDeclaredFields": true,
"methods": [
{
"name": "<init>",
"parameterTypes": []
},
{
"name": "test_parseAllTestDocs",
"parameterTypes": []
}
],
"name": "ai.yobix.TikaAllDocsTest",
"queryAllDeclaredConstructors": true,
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"allDeclaredClasses": true,
"allDeclaredFields": true,
"methods": [
{
"name": "<init>",
"parameterTypes": []
},
{
"name": "test_detectDocxFile",
"parameterTypes": []
},
{
"name": "test_detectOdtFile",
"parameterTypes": []
},
{
"name": "test_detectPdfFile",
"parameterTypes": []
},
{
"name": "test_detectTxtFile",
"parameterTypes": []
},
{
"name": "test_detectXlsFile",
"parameterTypes": []
}
],
"name": "ai.yobix.TikaDetectTest",
"queryAllDeclaredConstructors": true,
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"allDeclaredClasses": true,
"allDeclaredFields": true,
"methods": [
{
"name": "<init>",
"parameterTypes": []
},
{
"name": "testAppleFilingPdf",
"parameterTypes": []
},
{
"name": "testGetPlainTextFromInvoice",
"parameterTypes": []
},
{
"name": "testGetTextFromPdfFormatWithFonts",
"parameterTypes": []
},
{
"name": "testGetTextFromXlsFile",
"parameterTypes": []
},
{
"name": "testHandbook1pDocxFile",
"parameterTypes": []
},
{
"name": "testInvalidPath",
"parameterTypes": []
},
{
"name": "testNoNewRomanPdfFile",
"parameterTypes": []
},
{
"name": "testParseBytes",
"parameterTypes": []
},
{
"name": "testParseHttpUrl",
"parameterTypes": []
},
{
"name": "testParseHttpsUrl",
"parameterTypes": []
},
{
"name": "testParseUrl",
"parameterTypes": []
},
{
"name": "testQuarkusOdtFile",
"parameterTypes": []
},
{
"name": "testQuarkusPdfFile",
"parameterTypes": []
},
{
"name": "testQuarkusTxtFile",
"parameterTypes": []
},
{
"name": "testWordDocxFile",
"parameterTypes": []
}
],
"name": "ai.yobix.TikaParseReaderTest",
"queryAllDeclaredConstructors": true,
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"allDeclaredClasses": true,
"allDeclaredFields": true,
"methods": [
{
"name": "<init>",
"parameterTypes": []
},
{
"name": "testAppleFilingPdf",
"parameterTypes": []
},
{
"name": "testGetPlainTextFromInvoice",
"parameterTypes": []
},
{
"name": "testGetTextFromPdfFormatWithFonts",
"parameterTypes": []
},
{
"name": "testGetTextFromXlsFile",
"parameterTypes": []
},
{
"name": "testHandbook1pDocxFile",
"parameterTypes": []
},
{
"name": "testImageFile",
"parameterTypes": []
},
{
"name": "testNoNewRomanPdfFile",
"parameterTypes": []
},
{
"name": "testWordDocxFile",
"parameterTypes": []
},
{
"name": "test_invalidPath",
"parameterTypes": []
},
{
"name": "test_quarkusOdtFile",
"parameterTypes": []
},
{
"name": "test_quarkusPdfFile",
"parameterTypes": []
},
{
"name": "test_quarkusTxtFile",
"parameterTypes": []
}
],
"name": "ai.yobix.TikaParseStringTest",
"queryAllDeclaredConstructors": true,
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"allDeclaredClasses": true,
"allDeclaredFields": true,
"methods": [
{
"name": "<init>",
"parameterTypes": []
},
{
"name": "testAppleFilingPdf",
"parameterTypes": []
},
{
"name": "testGetPlainTextFromInvoice",
"parameterTypes": []
},
{
"name": "testGetTextFromPdfFormatWithFonts",
"parameterTypes": []
},
{
"name": "testGetTextFromXlsFile",
"parameterTypes": []
},
{
"name": "testHandbook1pDocxFile",
"parameterTypes": []
},
{
"name": "testNoNewRomanPdfFile",
"parameterTypes": []
},
{
"name": "testWordDocxFile",
"parameterTypes": []
},
{
"name": "test_invalidPath",
"parameterTypes": []
},
{
"name": "test_quarkusOdtFile",
"parameterTypes": []
},
{
"name": "test_quarkusPdfFile",
"parameterTypes": []
},
{
"name": "test_quarkusTxtFile",
"parameterTypes": []
}
],
"name": "ai.yobix.TikaParseTest",
"queryAllDeclaredConstructors": true,
"queryAllDeclaredMethods": true,
"queryAllPublicMethods": true
},
{
"methods": [
{
Expand Down Expand Up @@ -3668,6 +3410,17 @@
],
"name": "org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTPPrImpl"
},
{
"methods": [
{
"name": "<init>",
"parameterTypes": [
"org.apache.xmlbeans.SchemaType"
]
}
],
"name": "org.openxmlformats.schemas.wordprocessingml.x2006.main.impl.CTPictureImpl"
},
{
"methods": [
{
Expand Down
Loading

0 comments on commit 1c42e0b

Please sign in to comment.