diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index 8a41640..9da3e91 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -1,6 +1,6 @@ # This file was autogenerated by maturin v1.6.0 using: # maturin generate-ci github -o ../../.github/workflows/release_pyton_pytest.yml --pytest -# +# # Then adapted to the project # name: CI @@ -38,7 +38,7 @@ jobs: python-version: '3.8' # On linux we don't use graalvm/setup-graalvm@v1.2.5 action to install graalvm because it will install it - # on the runner machine and on linux the build will happen inside a manylinux docker. + # on the runner machine and on linux the build will happen inside a manylinux docker. # Instead, we use a script to install graalvm inside the docker container # the script is launched by setting the before-script-linux config option of the maturin action - name: Build wheels @@ -60,7 +60,7 @@ jobs: with: name: wheels-linux-${{ matrix.platform.target }} path: bindings/extractous-python/dist - + - name: pytest if: ${{ startsWith(matrix.platform.target, 'x86_64') }} shell: bash @@ -70,7 +70,7 @@ jobs: python3 -m venv .venv source .venv/bin/activate pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings/extractous-python pytest -s @@ -85,7 +85,7 @@ jobs: apt-get update apt-get install tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara apt-get install -y --no-install-recommends python3 python3-pip - pip3 install -U pip pytest scikit-learn + pip3 install -U pip pytest scikit-learn lxml run: | set -e pip3 install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall @@ -129,7 +129,7 @@ jobs: python -m venv .venv .venv\Scripts\activate.bat pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip 
install pytest scikit-learn lxml cd bindings\extractous-python pytest -s . @@ -186,7 +186,7 @@ jobs: python3 -m venv .venv source .venv/bin/activate pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall - pip install pytest scikit-learn + pip install pytest scikit-learn lxml cd bindings/extractous-python pytest -s @@ -206,7 +206,7 @@ jobs: name: wheels-sdist path: bindings/extractous-python/dist - # Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + # Follows the guide on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ # We use 2 actions one to publish on PyPi on tag pushes to main brnach and the other to publish on TestPyPi on any push publish-to-testpypi: name: Publish to TestPyPI diff --git a/README.md b/README.md index f715d02..da0f893 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,9 @@ from extractous import Extractor # Create a new extractor extractor = Extractor() -extractor.set_extract_string_max_length(1000) +extractor = extractor.set_extract_string_max_length(1000) +# if you need an xml +# extractor = extractor.set_xml_output(True) # Extract text from a file result, metadata = extractor.extract_file_to_string("README.md") @@ -125,6 +127,8 @@ use extractous::Extractor; fn main() { // Create a new extractor. 
Note it uses a consuming builder pattern let mut extractor = Extractor::new().set_extract_string_max_length(1000); + // if you need XML output + // extractor = extractor.set_xml_output(true); // Extract text from a file let (text, metadata) = extractor.extract_file_to_string("README.md").unwrap(); diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md index 2f3d558..fab132d 100644 --- a/bindings/extractous-python/README.md +++ b/bindings/extractous-python/README.md @@ -20,7 +20,9 @@ from extractous import Extractor # Create a new extractor extractor = Extractor() -extractor.set_extract_string_max_length(1000) +extractor = extractor.set_extract_string_max_length(1000) +# if you need an xml +# extractor = extractor.set_xml_output(True) # Extract text from a file result, metadata = extractor.extract_file_to_string("README.md") diff --git a/bindings/extractous-python/pyproject.toml b/bindings/extractous-python/pyproject.toml index 3ce2b8f..16f1805 100644 --- a/bindings/extractous-python/pyproject.toml +++ b/bindings/extractous-python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "extractous" -version = '0.2.0' +version = '0.2.1' classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", @@ -50,11 +50,11 @@ module-name = "extractous._extractous" python-source = "python" # Setting skip-auditwheel=true is very important to instruct maturin to not run its auditwheel flow -# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs +# maturin auditwheel flow changes any top level shared lib names which causes problems with our graalvm libs # By skipping the wheel, we just get a plain _extracts_rs* lib, and we have to: # * bundle our graalvm libs using the below include [] directive # * change the RPATH of _extracts_rs* lib to be able to properly find the bundled graalvm libs -skip-auditwheel=true
+skip-auditwheel=true # This tells cargo to set the RPATH for the private module built lib _extractous.abi3.so # Set the RPATH to $ORIGIN because the graalvm libs will be bundled in the same dir as the _extractous.abi3.so @@ -62,7 +62,7 @@ rustc-args = ["-C", "link-arg=-Wl,-rpath,$ORIGIN"] # Maturin include command will start looking from the python/extractous folder # so to include the graalvm libs the rust build script must copy them to python/extractous folder -include = [ +include = [ {path = "**/*.so", format = ["wheel"]}, {path = "**/*.dylib", format = ["wheel"]}, {path = "**/*.dll", format = ["wheel"]} diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs index 3ccfdd6..0a9f121 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -136,6 +136,12 @@ impl Extractor { Ok(Self(inner)) } + /// Set the configuration for the parse as xml + pub fn set_xml_output(&self, xml_output: bool) -> PyResult { + let inner = self.0.clone().set_xml_output(xml_output); + Ok(Self(inner)) + } + /// Extracts text from a file path. Returns a tuple with stream of the extracted text /// the stream is decoded using the extractor's `encoding` and tika metadata. 
pub fn extract_file<'py>( diff --git a/bindings/extractous-python/tests/test_extract_bytes.py b/bindings/extractous-python/tests/test_extract_bytes.py index df0ca07..a40a77c 100644 --- a/bindings/extractous-python/tests/test_extract_bytes.py +++ b/bindings/extractous-python/tests/test_extract_bytes.py @@ -3,7 +3,7 @@ from extractous import Extractor from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray, \ - is_expected_metadata_contained + is_expected_metadata_contained, extract_body_text TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9, 0.8), @@ -49,6 +49,36 @@ def test_extract_bytes_to_string(file_name, target_dist, metadata_dist): assert percent_similarity >= metadata_dist, \ f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_bytes_to_string_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes to string as XML of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + extractor = extractor.set_xml_output(True) + result_xml, metadata = extractor.extract_bytes_to_string(file_bytes) + result_text = extract_body_text(result_xml) + + # Check Expected + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata +
percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" + @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist): """Test the extraction from bytes of various file types.""" @@ -76,4 +106,36 @@ def test_extract_bytes_to_stream(file_name, target_dist, metadata_dist): # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) assert percent_similarity >= metadata_dist, \ - f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" + + +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_bytes_to_stream_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes to stream as xml of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + extractor = extractor.set_xml_output(True) + reader, metadata = extractor.extract_bytes(file_bytes) + result_xml = read_to_string(reader) + result_text = extract_body_text(result_xml) + + # Check Expected + assert cosine_similarity(result_text, expected) 
>= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" diff --git a/bindings/extractous-python/tests/test_extract_file.py b/bindings/extractous-python/tests/test_extract_file.py index 15b97cc..6b0b8c9 100644 --- a/bindings/extractous-python/tests/test_extract_file.py +++ b/bindings/extractous-python/tests/test_extract_file.py @@ -2,7 +2,7 @@ import pytest from extractous import Extractor -from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string +from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained, read_to_string, extract_body_text TEST_CASES = [ ("2022_Q3_AAPL.pdf", 0.9, 0.8), @@ -21,7 +21,7 @@ @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_file_to_string(file_name, target_dist, metadata_dist): - """Test the extraction and comparison of various file types.""" + """Test the extraction to string as plain text of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" @@ -49,10 +49,39 @@ def test_extract_file_to_string(file_name, target_dist, metadata_dist): assert percent_similarity >= metadata_dist, \ f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_file_to_string_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction to string as XML of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() + extractor = extractor.set_xml_output(True) + result_xml, metadata = extractor.extract_file_to_string(original_filepath) + result_text = extract_body_text(result_xml) + + # Check extracted + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" + + @pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) def test_extract_file_to_stream(file_name, target_dist, metadata_dist): - """Test the extraction from bytes of various file types.""" + """Test the extraction from bytes to stream of various file types.""" original_filepath = f"../../test_files/documents/{file_name}" expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" @@ -75,4 +104,34 @@ def test_extract_file_to_stream(file_name, target_dist, metadata_dist): # Check metadata percent_similarity = calculate_similarity_percent(metadata, expected_metadata) assert percent_similarity >= metadata_dist, \ - f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}" \ No newline at end of file + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" + + +@pytest.mark.parametrize("file_name, target_dist, metadata_dist", TEST_CASES) +def test_extract_file_to_stream_as_xml(file_name, target_dist, metadata_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json" + + # Read expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file: + expected_metadata = json.load(file) + + # Extract + extractor = Extractor() + extractor = extractor.set_xml_output(True) + reader, metadata = extractor.extract_file(original_filepath) + result_xml = read_to_string(reader) + result_text = extract_body_text(result_xml) + + # Check extracted + assert cosine_similarity(result_text, expected) >= target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + + # Check metadata + percent_similarity = calculate_similarity_percent(metadata, expected_metadata) + assert percent_similarity >= metadata_dist, \ + f"The metadata similarity is lower than expected. 
Current {percent_similarity}% | filename: {file_name}" diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py index 34e4acf..598771a 100644 --- a/bindings/extractous-python/tests/test_extract_url.py +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -16,4 +16,13 @@ def test_extract_url_to_string(): content, metadata = extractor.extract_url_to_string("https://www.google.com") assert "Google" in content - assert len(metadata.keys()) > 0 \ No newline at end of file + assert len(metadata.keys()) > 0 + +def test_extract_url_to_string_as_xml(): + extractor = Extractor() + extractor = extractor.set_xml_output(True) + + content, metadata = extractor.extract_url_to_string("https://www.google.com") + + assert "Google" in content + assert len(metadata.keys()) > 0 diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py index a04163d..7c96d84 100644 --- a/bindings/extractous-python/tests/test_pdf.py +++ b/bindings/extractous-python/tests/test_pdf.py @@ -1,5 +1,5 @@ from extractous import Extractor -from utils import read_to_string +from utils import read_to_string, extract_body_text def expected_result(): @@ -22,6 +22,17 @@ def test_extract_file(): print(f"test_pdf:test_extract_file result = {result}") assert result == expected_result() +def test_extract_file_as_xml(): + extractor = Extractor() + extractor = extractor.set_xml_output(True) + reader, metadata = extractor.extract_file("tests/quarkus.pdf") + + result_xml = read_to_string(reader) + + print(f"test_pdf:test_extract_file_as_xml result = {result_xml}") + result_text = extract_body_text(result_xml) + assert result_text.strip() == expected_result().strip() + def test_extract_bytes(): extractor = Extractor() @@ -33,3 +44,17 @@ def test_extract_bytes(): print(f"test_pdf:test_extract_bytes result = {result}") assert result == expected_result() + +def test_extract_bytes_as_xml(): + extractor = 
Extractor() + extractor = extractor.set_xml_output(True) + + with open("tests/quarkus.pdf", "rb") as file: + buffer = bytearray(file.read()) + reader, metadata = extractor.extract_bytes(buffer) + + result_xml = read_to_string(reader) + + print(f"test_pdf:test_extract_bytes_as_xml result = {result_xml}") + result_text = extract_body_text(result_xml) + assert result_text.strip() == expected_result().strip() diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py index 8368db9..c72f825 100644 --- a/bindings/extractous-python/tests/utils.py +++ b/bindings/extractous-python/tests/utils.py @@ -1,6 +1,6 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity as cosine_sim - +from lxml import etree def cosine_similarity(text1, text2): """Calculate the cosine similarity between two texts.""" @@ -78,3 +78,20 @@ def calculate_similarity_percent(expected, current): # Return the similarity percentage return matches / total + + +def extract_body_text(xml: str) -> str: + """ + Extracts and returns plain text content from the body element of an XML + string. Returns "" when no body is found; raises ValueError on a parse error.
+ """ + try: + parser = etree.XMLParser(recover=True) + root = etree.fromstring(xml.encode(), parser=parser) + ns= {"ns": "http://www.w3.org/1999/xhtml"} + body = root.find(".//ns:body", namespaces=ns) + if body is None: + return "" + return "\n".join(body.itertext()).strip() + except etree.ParseError as e: + raise ValueError(f"Invalid XML input: {e}") from e diff --git a/extractous-core/Cargo.lock b/extractous-core/Cargo.lock index e5e4fe4..8c081e6 100644 --- a/extractous-core/Cargo.lock +++ b/extractous-core/Cargo.lock @@ -465,7 +465,7 @@ dependencies = [ [[package]] name = "extractous" -version = "0.2.0" +version = "0.2.1" dependencies = [ "bytemuck", "criterion", @@ -473,6 +473,7 @@ dependencies = [ "fs_extra", "jni", "libc", + "quick-xml", "reqwest", "serde", "serde_json", @@ -482,6 +483,7 @@ dependencies = [ "test-case", "textdistance", "thiserror", + "walkdir", "zip", ] @@ -1324,6 +1326,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml index 01251b6..8472540 100644 --- a/extractous-core/Cargo.toml +++ b/extractous-core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "extractous" -version = "0.2.0" +version = "0.2.1" edition = "2021" description = """ @@ -36,6 +36,7 @@ test-case = "3.0" criterion = "0.5.1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +quick-xml = "0.37.1" [build-dependencies] fs_extra = { version = "1.3.0" } @@ -43,6 +44,7 @@ reqwest = { version = "0.12.7", features = ["blocking", "json"] } zip = "2.2.0" flate2 = "1.0.33" tar = "0.4.41" +walkdir = "2.5.0" [profile.release] opt-level = 3 diff --git a/extractous-core/README.md b/extractous-core/README.md index 0c87db6..83e6e30 100644 ---
a/extractous-core/README.md +++ b/extractous-core/README.md @@ -43,7 +43,10 @@ fn main() { let file_path = &args[1]; // Extract the provided file content to a string - let extractor = Extractor::new(); + let mut extractor = Extractor::new(); + // if you need XML output + // extractor = extractor.set_xml_output(true); + // Extract text from a file let (content, metadata) = extractor.extract_file_to_string(file_path).unwrap(); println!("{}", content); println!("{:?}", metadata); @@ -128,7 +131,7 @@ installed on your system because some of the OCR tests will fail if no tesseract * `sudo apt install tesseract-ocr` * Install any language extensions you want. for example to install German and Arabic: * `sudo apt install tesseract-ocr-deu tesseract-ocr-ara` -* On Mac +* On Mac * `brew install tesseract tesseract-lang` ### Building Extractous diff --git a/extractous-core/build.rs b/extractous-core/build.rs index 1aeeb17..55046a4 100644 --- a/extractous-core/build.rs +++ b/extractous-core/build.rs @@ -3,6 +3,7 @@ use std::fs; use std::io; use std::path::{Path, PathBuf}; use std::process::Command; +use walkdir::WalkDir; fn main() { // Exit early when building docs or when running clippy @@ -29,6 +30,17 @@ fn main() { //println!("cargo:warning=dist_dir: {}", dist_dir.display()); //println!("cargo:warning=out_dir: {}", out_dir.display()); //println!("cargo:warning=tika_native_dir: {:?}", tika_native_dir); + let tika_native_dir = out_dir.join("tika-native"); + let mut need_build = false; + if is_dir_updated(&tika_native_source_dir, &tika_native_dir) { + println!("Lib tika_native files were updated"); + fs_extra::dir::remove(&libs_out_dir).ok(); + fs_extra::dir::remove(&tika_native_dir).ok(); + need_build = true; + // Launch the gradle build + } else { + println!("Lib tika_native files were not updated"); + } // Try to find already built libs match find_already_built_libs(&out_dir) { @@ -39,15 +51,17 @@ fn main() { copy_build_artifacts(&libs_dir, vec![&libs_out_dir], false); } } -
None => { - // Launch the gradle build - gradle_build( - &tika_native_source_dir, - &out_dir, - &libs_out_dir, - &python_bind_dir, - ); - } + None => { need_build = true; } + } + + // Launch the gradle build + if need_build { + gradle_build( + &tika_native_source_dir, + &out_dir, + &libs_out_dir, + &python_bind_dir, + ); } // Tell cargo to look for shared libraries in the specified directory @@ -93,17 +107,35 @@ fn find_already_built_libs(out_dir: &Path) -> Option { } fn is_dir_updated(src: &Path, dest: &Path) -> bool { - let src_modified = fs::metadata(src) - .and_then(|meta| meta.modified()) - .ok(); - let dest_modified = fs::metadata(dest) - .and_then(|meta| meta.modified()) - .ok(); - - match (src_modified, dest_modified) { - (Some(src_time), Some(dest_time)) => src_time > dest_time, - _ => true, // If either timestamp is unavailable, consider the source as updated + for entry in WalkDir::new(src).into_iter().filter_map(|e| e.ok()) { + if entry.file_type().is_file() { + let src_file = entry.path(); + let relative_path = src_file.strip_prefix(src).unwrap(); + let dest_file = dest.join(relative_path); + + if !dest_file.exists() { + // File does not exist in the destination directory + return true; + } + + let src_modified = match fs::metadata(src_file).and_then(|meta| meta.modified()) { + Ok(time) => time, + Err(_) => continue, // Skip unreadable files + }; + + let dest_modified = match fs::metadata(&dest_file).and_then(|meta| meta.modified()) { + Ok(time) => time, + Err(_) => return true, // File in dest is inaccessible + }; + + if src_modified > dest_modified { + // Source file is newer than the destination file + return true; + } + } } + // All checks passed + false } // Run the gradle build command to build tika-native @@ -122,7 +154,7 @@ fn gradle_build( println!("Using GraalVM JDK found at {}", graalvm_home.display()); println!("Building tika_native libs this might take a while ... 
Please be patient!!"); - if is_dir_updated(&tika_native_dir, &out_dir) { + if is_dir_updated(&tika_native_source_dir, &tika_native_dir) { println!("Lib tika_native files were updated"); fs_extra::dir::remove(&tika_native_dir).ok(); } diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs index 08d4386..fca3e02 100644 --- a/extractous-core/examples/extract_to_stream.rs +++ b/extractous-core/examples/extract_to_stream.rs @@ -8,7 +8,7 @@ fn main() { let file_path = &args[1]; // Extract the provided file content to a string - let extractor = Extractor::new(); + let extractor = Extractor::new().set_xml_output(true); let (stream, _metadata) = extractor.extract_file(file_path).unwrap(); // Extract url // let stream = extractor.extract_url("https://www.google.com/").unwrap(); diff --git a/extractous-core/examples/extract_to_string.rs b/extractous-core/examples/extract_to_string.rs index 36b2916..65db3f9 100644 --- a/extractous-core/examples/extract_to_string.rs +++ b/extractous-core/examples/extract_to_string.rs @@ -6,7 +6,7 @@ fn main() { let file_path = &args[1]; // Extract the provided file content to a string - let extractor = Extractor::new(); + let extractor = Extractor::new().set_xml_output(true); let (content, _metadata) = extractor.extract_file_to_string(file_path).unwrap(); println!("{}", content); } diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index b8f220d..5509de0 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -65,6 +65,7 @@ pub struct Extractor { pdf_config: PdfParserConfig, office_config: OfficeParserConfig, ocr_config: TesseractOcrConfig, + xml_output: bool, } impl Default for Extractor { @@ -75,6 +76,7 @@ impl Default for Extractor { pdf_config: PdfParserConfig::default(), office_config: OfficeParserConfig::default(), ocr_config: TesseractOcrConfig::default(), + xml_output: false, } } } @@ -117,6 +119,12 @@ impl Extractor { self 
} + /// Enable (`true`) or disable (`false`, the default) XML output for extracted content + pub fn set_xml_output(mut self, xml_output: bool) -> Self { + self.xml_output = xml_output; + self + } + /// Extracts text from a file path. Returns a tuple with stream of the extracted text and metadata. /// the stream is decoded using the extractor's `encoding` pub fn extract_file(&self, file_path: &str) -> ExtractResult<(StreamReader, Metadata)> { @@ -126,6 +134,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -138,6 +147,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -150,6 +160,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -162,6 +173,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -174,6 +186,7 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } @@ -186,8 +199,10 @@ impl Extractor { &self.pdf_config, &self.office_config, &self.ocr_config, + self.xml_output, ) } + } #[cfg(test)] @@ -197,6 +212,7 @@ mod tests { use std::fs::File; use std::io::BufReader; use std::io::{self, Read}; + use std::str; const TEST_FILE: &str = "README.md"; @@ -292,4 +308,20 @@ mod tests { "Metadata should contain at least one entry" ); } + + #[test] + fn extract_file_to_xml_test() { + // Extract the test file as XML using extractous + let extractor = Extractor::new().set_xml_output(true); + let result = extractor.extract_file_to_string(TEST_FILE); + let (content, metadata) = result.unwrap(); + assert!( + content.len() > 0, + "Extracted XML content should not be empty" + ); + assert!( + metadata.len() > 0, + "Metadata should contain at least one entry" + ); + } } diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs index 4f941af..fe1f78b 100644 --- a/extractous-core/src/tika/parse.rs +++ b/extractous-core/src/tika/parse.rs @@ -32,6 +32,7 @@ fn
parse_to_stream( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, method_name: &str, signature: &str, ) -> ExtractResult<(StreamReader, Metadata)> { @@ -52,6 +53,7 @@ fn parse_to_stream( (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), (&j_ocr_conf.internal).into(), + JValue::Bool(if as_xml { 1 } else { 0 }), ], ); let call_result_obj = call_result?.l()?; @@ -69,6 +71,7 @@ pub fn parse_file( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool ) -> ExtractResult<(StreamReader, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -80,12 +83,14 @@ pub fn parse_file( pdf_conf, office_conf, ocr_conf, + as_xml, "parseFile", "(Ljava/lang/String;\ Ljava/lang/String;\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/ReaderResult;", ) } @@ -96,6 +101,7 @@ pub fn parse_bytes( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(StreamReader, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -112,12 +118,14 @@ pub fn parse_bytes( pdf_conf, office_conf, ocr_conf, + as_xml, "parseBytes", "(Ljava/nio/ByteBuffer;\ Ljava/lang/String;\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/ReaderResult;", ) } @@ -128,6 +136,7 @@ pub fn parse_url( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(StreamReader, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -139,12 +148,14 @@ pub fn parse_url( pdf_conf, office_conf, ocr_conf, + as_xml, "parseUrl", "(Ljava/lang/String;\ Ljava/lang/String;\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ 
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/ReaderResult;", ) } @@ -157,6 +168,7 @@ pub fn parse_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, method_name: &str, signature: &str, ) -> ExtractResult<(String, Metadata)> { @@ -175,6 +187,7 @@ pub fn parse_to_string( (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), (&j_ocr_conf.internal).into(), + JValue::Bool(if as_xml { 1 } else { 0 }), ], ); let call_result_obj = call_result?.l()?; @@ -191,6 +204,7 @@ pub fn parse_file_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -202,12 +216,14 @@ pub fn parse_file_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseFileToString", "(Ljava/lang/String;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } @@ -219,6 +235,7 @@ pub fn parse_bytes_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ -235,12 +252,14 @@ pub fn parse_bytes_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseBytesToString", "(Ljava/nio/ByteBuffer;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } @@ -252,6 +271,7 @@ pub fn parse_url_to_string( pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + as_xml: bool, ) -> ExtractResult<(String, Metadata)> { let mut env = get_vm_attach_current_thread()?; @@ 
-263,12 +283,14 @@ pub fn parse_url_to_string( pdf_conf, office_conf, ocr_conf, + as_xml, "parseUrlToString", "(Ljava/lang/String;\ I\ Lorg/apache/tika/parser/pdf/PDFParserConfig;\ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + Z\ )Lai/yobix/StringResult;", ) } diff --git a/extractous-core/tests/extract_to_xml_tests.rs b/extractous-core/tests/extract_to_xml_tests.rs new file mode 100644 index 0000000..a645053 --- /dev/null +++ b/extractous-core/tests/extract_to_xml_tests.rs @@ -0,0 +1,90 @@ +use extractous::Extractor; +use std::fs; +use test_case::test_case; +use textdistance::nstr::cosine; +use quick_xml::reader::Reader; +use quick_xml::events::Event; + + +// Declares the shared test_utils code as a module in this integration test +mod test_utils; + +fn extract_p_tag_content(xml: &str) -> String { /* Returns the text nodes found inside <body>, newline-separated; despite the name, it is not restricted to <p> elements */ + let mut reader = Reader::from_str(xml); + reader.config_mut().trim_text(true); // Trim surrounding whitespace + let mut buf = Vec::new(); + let mut collected_content = String::new(); + let mut inside_body = false; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) if e.name().as_ref() == b"body" => { + inside_body = true; + } + Ok(Event::End(ref e)) if e.name().as_ref() == b"body" => { + inside_body = false; + } + Ok(Event::Text(e)) if inside_body => { + collected_content.push_str(&e.unescape().unwrap().into_owned()); + collected_content.push('\n'); // Terminate each collected text node with a newline + } + Ok(Event::Eof) => break, + Err(e) => { /* Log the error and stop reading; whatever was collected so far is still returned */ + eprintln!("Error reading XML: {}", e); + break; + } + _ => (), + } + buf.clear(); + } + + collected_content.trim_end().to_string() +} + +#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] +#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] +#[test_case("simple.odt", 0.8; "Test ODT file")] +#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")] +#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
+#[test_case("category-level.docx", 0.8; "Test DOCX file")] +#[test_case("simple.doc", 0.8; "Test DOC file")] +#[test_case("simple.pptx", 0.9; "Test another PPTX file")] +#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")] /* NOTE(review): a -1.0 threshold effectively disables the similarity check, assuming cosine() never returns a negative value; confirm this is intended for OCR output */ +#[test_case("winter-sports.epub", 0.8; "Test EPUB file")] +#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] +//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")] +fn test_extract_file_to_xml(file_name: &str, target_dist: f64) { /* Extracts the document as XML, then checks the <body> text similarity and the metadata against the expected fixtures */ + let extractor = Extractor::new().set_extract_string_max_length(1000000) + .set_xml_output(true); + // Run the extraction with XML output enabled + let (extracted_xml, extracted_metadata) = extractor + .extract_file_to_string(&format!("../test_files/documents/{}", file_name)) + .unwrap(); + println!("{}: {}", file_name, extracted_xml); + let extracted = extract_p_tag_content(&extracted_xml); + + // Load the expected plain-text result for this document + let expected = + fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap(); + + let dist = cosine(&expected.trim(), &extracted.trim()); + assert!( + dist > target_dist, + "Cosine similarity is less than {} for file: {}, dist: {}", + target_dist, + file_name, + dist + ); + println!("{}: {}", file_name, dist); + + // Load the expected metadata and check it is contained in the extracted metadata + let expected_metadata = test_utils::parse_metadata_file(&format!( + "../test_files/expected_result/{}.metadata.json", + file_name + )); + + assert!(test_utils::is_expected_metadata_contained( + &expected_metadata, + &extracted_metadata + )); +} diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/ParsingReader.java b/extractous-core/tika-native/src/main/java/ai/yobix/ParsingReader.java new file mode 100644 index 0000000..8a7d621 --- /dev/null +++ b/extractous-core/tika-native/src/main/java/ai/yobix/ParsingReader.java @@ -0,0 +1,104 @@ +package ai.yobix; + +import java.io.*; +import java.util.concurrent.Executor; + +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import
org.xml.sax.ContentHandler; +import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; + +/** Reader over the output of a parse that runs on a background daemon thread: the parser writes into a pipe (XML via ToXMLContentHandler when outputXml is true, otherwise via BodyContentHandler) and this reader decodes the other end of that pipe. */ public class ParsingReader extends Reader { + + private final Parser parser; + private final Reader reader; + private final PipedOutputStream pipedOutputStream; + private final InputStream stream; + private final Metadata metadata; + private final ParseContext context; + private final boolean outputXml; + private final String encoding; + private transient Throwable throwable; /* first failure captured on the parsing thread; rethrown from read() */ + + /** Starts the parsing thread, then blocks until the first character (or end of stream) is available; mark/reset keeps that character unread. The encoding argument is only used when outputXml is true. */ public ParsingReader(Parser parser, InputStream stream, Metadata metadata, + ParseContext context, boolean outputXml, String encoding) throws IOException { + this.parser = parser; + this.stream = stream; + this.metadata = metadata; + this.context = context; + this.outputXml = outputXml; + this.encoding = encoding; + + PipedInputStream pipedInputStream = new PipedInputStream(); + this.pipedOutputStream = new PipedOutputStream(pipedInputStream); + this.reader = new BufferedReader((outputXml && encoding != null) ? new InputStreamReader(pipedInputStream, encoding) : new InputStreamReader(pipedInputStream)); /* In XML mode the handler encodes the pipe with the given encoding, so decode with the same charset; the text handler is given no encoding, so that path keeps the default decoder */ + + Executor executor = command -> { + String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (name != null) { + name = "Apache Tika: " + name; + } else { + name = "Apache Tika"; + } + Thread thread = new Thread(command, name); + thread.setDaemon(true); + thread.start(); + }; + + executor.execute(new ParsingTask()); + + reader.mark(1); + reader.read(); + reader.reset(); + } + + /** Reads decoded parser output. A failure captured on the parsing thread is surfaced first: ZeroByteFileException reads as end of stream, an IOException is rethrown as-is, anything else is wrapped in an IOException. */ @Override + public int read(char[] cbuf, int off, int len) throws IOException { + if (throwable instanceof ZeroByteFileException) { + return -1; + } else if (throwable instanceof IOException) { + throw (IOException) throwable; + } else if (throwable != null) { + throw new IOException("", throwable); + } + return reader.read(cbuf, off, len); + } + + /** Closes the reading end of the pipe. */ @Override + public void close() throws IOException { +
reader.close(); + } + + /** Background task: parses the input into the pipe, then closes the input stream and the pipe; only the first failure is kept in throwable. */ private class ParsingTask implements Runnable { + + public void run() { + try { + ContentHandler handler = outputXml ? new ToXMLContentHandler(pipedOutputStream, encoding) : new BodyContentHandler(pipedOutputStream); + parser.parse(stream, handler, metadata, context); + } catch (Throwable t) { + throwable = t; + } + + try { + stream.close(); + } catch (Throwable t) { + if (throwable == null) { + throwable = t; + } + } + + try { + pipedOutputStream.close(); + } catch (Throwable t) { + if (throwable == null) { + throwable = t; + } + } + } + + } +} diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java index f9f8e4a..699a47f 100644 --- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java +++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java @@ -11,17 +11,18 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParsingReader; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.graalvm.nativeimage.IsolateThread; import org.graalvm.nativeimage.c.function.CEntryPoint; import org.graalvm.nativeimage.c.type.CCharPointer; import org.graalvm.nativeimage.c.type.CConst; import org.graalvm.nativeimage.c.type.CTypeConversion; +import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import java.io.IOException; @@ -74,7 +75,9 @@ public static StringResult parseFileToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean
asXML + // maybe replace with a single config class ) { try { final Path path = Paths.get(filePath); @@ -82,10 +85,9 @@ public static StringResult parseFileToString( final InputStream stream = TikaInputStream.get(path, metadata); String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); - } catch (java.io.IOException e) { return new StringResult((byte) 1, "Could not open file: " + e.getMessage()); } catch (TikaException e) { @@ -104,7 +106,8 @@ public static StringResult parseUrlToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final URL url = new URI(urlString).toURL(); @@ -112,7 +115,7 @@ public static StringResult parseUrlToString( final TikaInputStream stream = TikaInputStream.get(url, metadata); String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); @@ -138,7 +141,8 @@ public static StringResult parseBytesToString( int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { final Metadata metadata = new Metadata(); final ByteBufferInputStream inStream = new ByteBufferInputStream(data); @@ -146,7 +150,7 @@ public static StringResult parseBytesToString( try { String result = parseToStringWithConfig( - stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig); + stream, metadata, maxLength, pdfConfig, officeConfig, tesseractConfig, 
asXML); // No need to close the stream because parseToString does so return new StringResult(result, metadata); } catch (java.io.IOException e) { @@ -156,16 +160,24 @@ public static StringResult parseBytesToString( } } - private static String parseToStringWithConfig( InputStream stream, Metadata metadata, int maxLength, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) throws IOException, TikaException { - final WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); + ContentHandler handler; + ContentHandler handlerForParser; + if (asXML) { + handler = new WriteOutContentHandler(new ToXMLContentHandler(), maxLength); + handlerForParser = handler; + } else { + handler = new WriteOutContentHandler(maxLength); + handlerForParser = new BodyContentHandler(handler); + } try { final TikaConfig config = TikaConfig.getDefaultConfig(); @@ -177,8 +189,7 @@ private static String parseToStringWithConfig( parsecontext.set(OfficeParserConfig.class, officeConfig); parsecontext.set(TesseractOCRConfig.class, tesseractConfig); - parser.parse(stream, new BodyContentHandler(handler), metadata, parsecontext); - + parser.parse(stream, handlerForParser, metadata, parsecontext); } catch (SAXException e) { if (!WriteLimitReachedException.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... 
@@ -203,7 +214,8 @@ public static ReaderResult parseFile( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { // System.out.println("pdfConfig.isExtractInlineImages = " + pdfConfig.isExtractInlineImages()); @@ -218,7 +230,7 @@ public static ReaderResult parseFile( final Metadata metadata = new Metadata(); final TikaInputStream stream = TikaInputStream.get(path, metadata); - return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig); + return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig, asXML); } catch (java.io.IOException e) { return new ReaderResult((byte) 1, "Could not open file: " + e.getMessage()); @@ -237,14 +249,15 @@ public static ReaderResult parseUrl( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final URL url = new URI(urlString).toURL(); final Metadata metadata = new Metadata(); final TikaInputStream stream = TikaInputStream.get(url, metadata); - return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig); + return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig, asXML); } catch (MalformedURLException e) { return new ReaderResult((byte) 2, "Malformed URL error occurred " + e.getMessage()); @@ -267,7 +280,8 @@ public static ReaderResult parseBytes( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { @@ -275,7 +289,7 @@ public static ReaderResult parseBytes( final ByteBufferInputStream inStream = new ByteBufferInputStream(data); final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata); - return parse(stream, metadata, 
charsetName, pdfConfig, officeConfig, tesseractConfig); + return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig, asXML); } private static ReaderResult parse( @@ -284,25 +298,28 @@ private static ReaderResult parse( String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, - TesseractOCRConfig tesseractConfig + TesseractOCRConfig tesseractConfig, + boolean asXML ) { try { final TikaConfig config = TikaConfig.getDefaultConfig(); final ParseContext parsecontext = new ParseContext(); final Parser parser = new AutoDetectParser(config); + final Charset charset = Charset.forName(charsetName, StandardCharsets.UTF_8); parsecontext.set(Parser.class, parser); parsecontext.set(PDFParserConfig.class, pdfConfig); parsecontext.set(OfficeParserConfig.class, officeConfig); parsecontext.set(TesseractOCRConfig.class, tesseractConfig); - final Reader reader = new ParsingReader(parser, inputStream, metadata, parsecontext); + //final Reader reader = new org.apache.tika.parser.ParsingReader(parser, inputStream, metadata, parsecontext); + final Reader reader = new ParsingReader(parser, inputStream, metadata, parsecontext, asXML, charset.name()); // Convert Reader which works with chars to ReaderInputStream which works with bytes ReaderInputStream readerInputStream = ReaderInputStream.builder() .setReader(reader) - .setCharset(Charset.forName(charsetName, StandardCharsets.UTF_8)) + .setCharset(charset) .get(); return new ReaderResult(readerInputStream, metadata); @@ -336,4 +353,4 @@ private static CCharPointer cParseToString(IsolateThread thread, @CConst CCharPo } } -} \ No newline at end of file +} diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json index 7239ef1..20a1990 100644 --- 
a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-linux/reachability-metadata.json @@ -103,7 +103,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -113,7 +114,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -123,7 +125,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -133,7 +136,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -143,7 +147,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -153,7 +158,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json index 
e2c289f..13d80d6 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-macos/reachability-metadata.json @@ -36,19 +36,19 @@ "parameterTypes": [] }, { - "name": "getReader", + "name": "getMetadata", "parameterTypes": [] }, { - "name": "getStatus", + "name": "getReader", "parameterTypes": [] }, { - "name": "isError", + "name": "getStatus", "parameterTypes": [] }, { - "name": "getMetadata", + "name": "isError", "parameterTypes": [] } ], @@ -65,35 +65,20 @@ "parameterTypes": [] }, { - "name": "getStatus", + "name": "getMetadata", "parameterTypes": [] }, { - "name": "isError", + "name": "getStatus", "parameterTypes": [] }, { - "name": "getMetadata", + "name": "isError", "parameterTypes": [] } ], "type": "ai.yobix.StringResult" }, - { - "methods": [ - { - "name": "getValues", - "parameterTypes": [ - "java.lang.String" - ] - }, - { - "name": "names", - "parameterTypes": [] - } - ], - "type": "org.apache.tika.metadata.Metadata" - }, { "methods": [ { @@ -109,7 +94,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -119,7 +105,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -129,7 +116,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -139,7 +127,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", 
"org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -149,7 +138,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -159,7 +149,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ], @@ -741,6 +732,21 @@ ], "type": "org.apache.commons.io.input.ReaderInputStream" }, + { + "methods": [ + { + "name": "getValues", + "parameterTypes": [ + "java.lang.String" + ] + }, + { + "name": "names", + "parameterTypes": [] + } + ], + "type": "org.apache.tika.metadata.Metadata" + }, { "methods": [ { diff --git a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json index 81ccb22..8c07dc0 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json +++ b/extractous-core/tika-native/src/main/resources/META-INF/ai.yobix/tika-2.9.2-windows/reachability-metadata.json @@ -103,7 +103,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -113,7 +114,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { 
@@ -123,7 +125,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -133,7 +136,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -143,7 +147,8 @@ "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] }, { @@ -153,7 +158,8 @@ "int", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig", - "org.apache.tika.parser.ocr.TesseractOCRConfig" + "org.apache.tika.parser.ocr.TesseractOCRConfig", + "boolean" ] } ],