Merge pull request #19 from yobix-ai/18-ocr-examples-and-docs

18 ocr examples and docs
yobix-ai · Nov 4, 2024 · 2db7f6e · 2db7f6e
2 parents ab3fb0e + 205864e
commit 2db7f6e
Show file tree

Hide file tree

Showing 24 changed files with 464 additions and 55 deletions.
diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml
@@ -67,6 +67,7 @@ jobs:
         shell: bash
         run: |
           set -e
+          sudo apt-get update
+          sudo apt-get install -y tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
           python3 -m venv .venv
           source .venv/bin/activate
           pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall
@@ -83,6 +84,7 @@ jobs:
           githubToken: ${{ github.token }}
           install: |
             apt-get update
+            apt-get install -y tesseract-ocr tesseract-ocr-deu tesseract-ocr-ara
             apt-get install -y --no-install-recommends python3 python3-pip
             pip3 install -U pip pytest scikit-learn
           run: |
@@ -138,7 +140,7 @@ jobs:
     strategy:
       matrix:
         platform:
-          - runner: macos-12
+          - runner: macos-13
             target: x86_64
           - runner: macos-14
             target: aarch64
@@ -176,6 +178,7 @@ jobs:
       - name: pytest
         run: |
           set -e
+          brew install tesseract tesseract-lang
           python3 -m venv .venv
           source .venv/bin/activate
           pip install extractous --find-links bindings/extractous-python/dist --no-index --force-reinstall

diff --git a/README.md b/README.md
@@ -75,23 +75,96 @@ extractor.set_extract_string_max_length(1000)
 
 # Extract text from a file
 result = extractor.extract_file_to_string("README.md")
+print(result)
+```
+* Extracting a file to a buffered stream:
+
+```python
+from extractous import Extractor
+
+extractor = Extractor()
+reader = extractor.extract_file("tests/quarkus.pdf")
+
+result = ""
+buffer = reader.read(4096)
+while len(buffer) > 0:
+    result += buffer.decode("utf-8")
+    buffer = reader.read(4096)
+
+print(result)
+```
+
+* Extracting a file with OCR:
+
+You need to have Tesseract installed together with the language pack for the language you want to recognize. For example, on Debian: `sudo apt install tesseract-ocr tesseract-ocr-deu`
+
+```python
+from extractous import Extractor, TesseractOcrConfig
+
+# Use the German ("deu") language pack to match the German sample document
+extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
+result = extractor.extract_file_to_string("../../test_files/documents/deu-ocr.pdf")
+
 print(result)
 ```
 
 #### Rust
 * Extract a file content to a string:
 ```rust
 use extractous::Extractor;
-use extractous::PdfParserConfig;
 
-// Create a new extractor. Note it uses a consuming builder pattern
-let mut extractor = Extractor::new().set_extract_string_max_length(1000);
+fn main() {
+    // Create a new extractor. Note it uses a consuming builder pattern
+    let mut extractor = Extractor::new().set_extract_string_max_length(1000);
 
-// Extract text from a file
-let text = extractor.extract_file_to_string("README.md").unwrap();
-println!("{}", text);
+    // Extract text from a file
+    let text = extractor.extract_file_to_string("README.md").unwrap();
+    println!("{}", text);
+}
 ```
 
+* Extract the content of a file to a `StreamReader` and perform buffered reading:
+```rust
+use std::io::Read;
+use extractous::Extractor;
+
+fn main() {
+    // Get the command-line arguments
+    let args: Vec<String> = std::env::args().collect();
+    let file_path = &args[1];
+
+    // Extract the provided file content to a stream
+    let extractor = Extractor::new();
+    // `read_to_end` borrows the reader mutably, so the binding must be `mut`
+    let mut stream = extractor.extract_file(file_path).unwrap();
+
+    // Because stream implements std::io::Read trait we can perform buffered reading
+    // For example we can use it to create a BufReader
+    let mut buffer = Vec::new();
+    stream.read_to_end(&mut buffer).unwrap();
+
+    println!("{}", String::from_utf8(buffer).unwrap())
+}
+```
+
+* Extract the content of a PDF with OCR:
+
+You need to have Tesseract installed together with the language pack for the language you want to recognize. For example, on Debian: `sudo apt install tesseract-ocr tesseract-ocr-deu`
+
+```rust
+use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
+
+fn main() {
+    let file_path = "../test_files/documents/deu-ocr.pdf";
+
+    // Create a new extractor. Note it uses a consuming builder pattern
+    let extractor = Extractor::new()
+        .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
+        .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
+
+    // Extract the file content with the configured extractor
+    let content = extractor.extract_file_to_string(file_path).unwrap();
+    println!("{}", content);
+}
+```
+
+
 ## 🔥 Performance
 * **Extractous** is fast, please don't take our word for it, you can run the [benchmarks](https://github.com/yobix-ai/extractous-benchmarks) yourself. For example extracting content out of [sec10 filings pdf forms](https://github.com/yobix-ai/extractous-benchmarks/raw/main/dataset/sec10-filings), Extractous is on average **~18x faster** than unstructured-io:
 

diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md
@@ -39,5 +39,16 @@ while len(buffer) > 0:
     result += buffer.decode("utf-8")
     buffer = reader.read(4096)
 
+print(result)
+```
+
+Extracting a file with OCR:
+
+```python
+from extractous import Extractor, TesseractOcrConfig
+
+# Use the German ("deu") language pack to match the German sample document
+extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
+result = extractor.extract_file_to_string("../../test_files/documents/deu-ocr.pdf")
+
 print(result)
 ```
diff --git a/bindings/extractous-python/pyproject.toml b/bindings/extractous-python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "extractous"
-version = '0.1.6'
+version = '0.1.7'
 classifiers = [
   "Programming Language :: Rust",
   "Programming Language :: Python :: Implementation :: CPython",

diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -1,8 +1,7 @@
 import pytest
 
 from extractous import Extractor
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.metrics.pairwise import cosine_similarity as cosine_sim
+from utils import cosine_similarity
 
 TEST_CASES = [
     ("2022_Q3_AAPL.pdf", 0.9),
@@ -16,6 +15,7 @@
     ("table-multi-row-column-cells.png", -1.0),
     ("winter-sports.epub", 0.9),
     ("bug_16.docx", 0.9),
+    ("deu-ocr.pdf", 0.9),
 ]
 
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
@@ -31,13 +31,3 @@ def test_extract_file_to_string(file_name, target_dist):
     assert cosine_similarity(result, expected) > target_dist, \
         f"Cosine similarity is less than {target_dist} for file: {file_name}"
 
-def cosine_similarity(text1, text2):
-    """Calculate the cosine similarity between two texts."""
-
-    # Create the CountVectorizer and transform the texts into vectors
-    vectorizer = CountVectorizer().fit_transform([text1, text2])
-    vectors = vectorizer.toarray()
-
-    # Calculate cosine similarity between the two vectors
-    cos_sim = cosine_sim(vectors)
-    return cos_sim[0][1]
diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py
@@ -0,0 +1,46 @@
+from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
+from utils import cosine_similarity
+
+def test_ara_ocr_png():
+    """OCR an Arabic PNG with the "ara" language pack and compare it to the expected text."""
+    ocr_config = TesseractOcrConfig().set_language("ara")
+    extractor = Extractor().set_ocr_config(ocr_config)
+    result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")
+
+    with open("../../test_files/expected_result/ara-ocr.png.txt", "r",  encoding="utf8") as file:
+        expected = file.read()
+
+    # A bare float is truthy for any non-zero similarity, so compare against a threshold
+    assert cosine_similarity(result, expected) > 0.9
+
+
+def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
+    """Extract the German sample PDF with the OCR_ONLY strategy and compare it to the expected text."""
+    # The input document must be the one the expected-result file was produced from
+    test_file = "../../test_files/documents/deu-ocr.pdf"
+    expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"
+
+    pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
+    ocr_config = TesseractOcrConfig().set_language("deu")
+
+    # Note builder pattern is used
+    extractor = Extractor()
+    extractor = extractor.set_ocr_config(ocr_config)
+    extractor = extractor.set_pdf_config(pdf_config)
+
+    result = extractor.extract_file_to_string(test_file)
+
+    with open(expected_result_file, "r",  encoding="utf8") as file:
+        expected = file.read()
+
+    # A bare float is truthy for any non-zero similarity, so compare against a threshold
+    assert cosine_similarity(result, expected) > 0.9
+
+def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
+    """With the NO_OCR strategy the German sample PDF must yield only whitespace."""
+    test_file = "../../test_files/documents/deu-ocr.pdf"
+
+    pdf_config = PdfParserConfig()
+    pdf_config = pdf_config.set_ocr_strategy(PdfOcrStrategy.NO_OCR)
+    ocr_config = TesseractOcrConfig()
+    ocr_config = ocr_config.set_language("deu")
+
+    # Reuse the configs built above instead of constructing a second PdfParserConfig inline
+    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)
+
+    result = extractor.extract_file_to_string(test_file)
+
+    assert result.strip() == ""
diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py
@@ -9,7 +9,7 @@ def test_extract_file_to_string():
     extractor = Extractor()
     result = extractor.extract_file_to_string("tests/quarkus.pdf")
 
-    print(result)
+    #print(result)
     assert result == expected_result()
 
 
@@ -23,5 +23,5 @@ def test_extract_file():
         result += b.decode("utf-8")
         b = reader.read(4096)
 
-    print(result)
+    #print(result)
     assert result == expected_result()
diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py
@@ -0,0 +1,13 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity as cosine_sim
+
+def cosine_similarity(text1, text2):
+    """Return the cosine similarity between the word-count vectors of two texts."""
+
+    # Fit a single CountVectorizer on both texts so they share one vocabulary,
+    # then materialize the resulting count matrix as a dense array
+    counts = CountVectorizer().fit_transform([text1, text2]).toarray()
+
+    # cosine_sim returns the pairwise matrix; entry [0][1] compares text1 with text2
+    pairwise = cosine_sim(counts)
+    return pairwise[0][1]
diff --git a/extractous-core/Cargo.lock b/extractous-core/Cargo.lock
diff --git a/extractous-core/Cargo.toml b/extractous-core/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "extractous"
-version = "0.1.6"
+version = "0.1.7"
 edition = "2021"
 
 description = """

diff --git a/extractous-core/README.md b/extractous-core/README.md
@@ -73,6 +73,24 @@ fn main() {
 }
 ```
 
+* Extract the content of a PDF with OCR. You need to have Tesseract installed together with the language pack for the language you want to recognize. For example, on Debian: `sudo apt install tesseract-ocr tesseract-ocr-deu`
+* If you get `Parse error occurred : Unable to extract PDF content`, the most likely cause is that the OCR language pack is not installed.
+```rust
+use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
+
+fn main() {
+    let file_path = "../test_files/documents/deu-ocr.pdf";
+
+    // Create a new extractor. Note it uses a consuming builder pattern
+    let extractor = Extractor::new()
+        .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
+        .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
+
+    // Extract the file content with the configured extractor
+    let content = extractor.extract_file_to_string(file_path).unwrap();
+    println!("{}", content);
+}
+```
+
+
 ## Building
 
 ### Requirements
@@ -84,6 +102,7 @@ fn main() {
   specific local version, you can do so by setting the GRAALVM_HOME environment variable
 * We recommend using [sdkman](https://sdkman.io/install) to install GraalVM JDKs
 * `sdk install java 22.0.1-graalce`
+* To be able to use it from IDEA (on Ubuntu, for example), add `GRAALVM_HOME=$HOME/.sdkman/candidates/java/22.0.1-graalce` to `/etc/environment`
 * Confirm that GraalVM is installed correctly by running `java -version`. You should see something like:
 ```text
 openjdk 22.0.1 2024-04-16

diff --git a/extractous-core/src/config.rs b/extractous-core/src/config.rs
@@ -28,7 +28,7 @@ impl Default for PdfParserConfig {
         Self {
             ocr_strategy: PdfOcrStrategy::AUTO,
             extract_inline_images: false,
-            extract_unique_inline_images_only: true,
+            extract_unique_inline_images_only: false,
             extract_marked_content: false,
             extract_annotation_text: true,
         }
@@ -71,7 +71,7 @@ impl PdfParserConfig {
     /// or similar equality metric. If the PDF actually contains multiple copies of the same
     /// image -- all with different object ids -- then all images will be extracted.
     /// For this parameter to have any effect, extractInlineImages must be set to true.
-    /// Default: true.
+    /// Default: false.
     pub fn set_extract_unique_inline_images_only(mut self, val: bool) -> Self {
         self.extract_unique_inline_images_only = val;
         self

diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs
@@ -127,7 +127,13 @@ impl Extractor {
     /// Extracts text from a file path. Returns a string that is of maximum length
     /// of the extractor's `extract_string_max_length`
     pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult<String> {
-        tika::parse_file_to_string(file_path, self.extract_string_max_length)
+        // Pass the extractor's PDF, Office and OCR configs along with the length limit
+        tika::parse_file_to_string(
+            file_path,
+            self.extract_string_max_length,
+            &self.pdf_config,
+            &self.office_config,
+            &self.ocr_config,
+        )
     }
 }
 

diff --git a/extractous-core/src/lib.rs b/extractous-core/src/lib.rs
@@ -9,6 +9,7 @@
 //! To use an extractor, you need to:
 //! - [create and configure new the extractor](#create-and-config-an-extractor)
 //! - [use the extractor to extract text](#extract-text)
+//! - [enable OCR for the extractor](#extract-text-with-ocr)
 //!
 //! ## Create and config an extractor
 //!
@@ -44,6 +45,26 @@
 //! println!("{}", text);
 //!
 //! ```
+//!
+//! ## Extract text with OCR
+//! * Make sure Tesseract is installed with the corresponding language packs. For example, on Debian run `sudo apt install tesseract-ocr tesseract-ocr-deu` to install Tesseract with the German language pack.
+//! * If you get `Parse error occurred : Unable to extract PDF content`, it is most likely that the OCR language pack is not installed
+//!
+//! ```no_run
+//! use extractous::{Extractor, TesseractOcrConfig, PdfParserConfig, PdfOcrStrategy};
+//!
+//! let file_path = "../test_files/documents/deu-ocr.pdf";
+//!
+//! // Create a new extractor. Note it uses a consuming builder pattern
+//! let extractor = Extractor::new()
+//!  .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
+//!  .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
+//!
+//! // extract file with extractor
+//! let content = extractor.extract_file_to_string(file_path).unwrap();
+//! println!("{}", content);
+//!
+//! ```
 
 /// Default buffer size
 pub const DEFAULT_BUF_SIZE: usize = 32768;