Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ jobs:
if: needs.setup.outputs.qvac_needed == 'true'
working-directory: ${{ env.PKG_DIR }}
run: |
mkdir -p models/ocr/rec_512
aws s3 cp s3://tether-ai-dev/qvac_models_compiled/ocr/rec_512/ models/ocr/rec_512/ --recursive --exclude "*" \
mkdir -p models/ocr/rec_dyn
aws s3 cp s3://tether-ai-dev/qvac_models_compiled/ocr/rec_dyn/ models/ocr/rec_dyn/ --recursive --exclude "*" \
--include "detector_craft.onnx" \
--include "recognizer_latin.onnx"
echo "Downloaded QVAC OCR models:"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
if: steps.version.outputs.bumped == 'true'
uses: softprops/action-gh-release@v2
with:
tag_name: v${{ steps.version.outputs.current }}
tag_name: ocr-onnx-v${{ steps.version.outputs.current }}
name: QVAC OCR Addon v${{ steps.version.outputs.current }}
body_path: ${{ inputs.workdir }}/release-notes/v${{ steps.version.outputs.current }}.md
env:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ jobs:
working-directory: ${{ inputs.workdir || env.PKG_DIR }}
shell: bash
run: |
mkdir -p models/ocr/rec_512
aws s3 cp s3://tether-ai-dev/qvac_models_compiled/ocr/rec_512/ models/ocr/rec_512/ --recursive
mkdir -p models/ocr/rec_dyn
aws s3 cp s3://tether-ai-dev/qvac_models_compiled/ocr/rec_dyn/ models/ocr/rec_dyn/ --recursive

- name: Run integration test
working-directory: ${{ inputs.workdir || env.PKG_DIR }}
Expand Down
68 changes: 16 additions & 52 deletions .github/workflows/prebuilds-qvac-lib-infer-nmtcpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,14 @@ jobs:
fail-fast: false
matrix:
include:
- os: ubuntu-24.04
platform: linux
arch: x64
- os: ubuntu-24.04-arm64-private
platform: linux
arch: arm64
# Linux builds on Ubuntu 22.04 for glibc compatibility
- os: ubuntu-22.04
platform: linux
arch: x64
- os: ubuntu-22.04-arm
platform: linux
arch: arm64
# Android build requires Ubuntu 24.04 for NDK tooling
- os: ubuntu-24.04
platform: android
arch: arm64
Expand Down Expand Up @@ -116,7 +111,7 @@ jobs:
echo "Matrix tags: ${{ matrix.tags }}"
echo "PKG_DIR: ${{ env.PKG_DIR }}"

- if: ${{ matrix.os == 'ubuntu-24.04' || matrix.os == 'ubuntu-24.04-arm64-private' || matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' }}
- if: ${{ matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' || matrix.os == 'ubuntu-24.04' }}
name: Update c++ tools
run: |
wget https://apt.llvm.org/llvm.sh
Expand Down Expand Up @@ -236,7 +231,7 @@ jobs:
echo "VCPKG_ROOT=$VCPKG_ROOT" >> $GITHUB_ENV
echo "$VCPKG_ROOT" >> $GITHUB_PATH

- if: ${{ matrix.os == 'ubuntu-24.04' || matrix.os == 'ubuntu-24.04-arm64-private' || matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' }}
- if: ${{ matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' || matrix.os == 'ubuntu-24.04' }}
name: Configure vcpkg in linux
run: echo "VCPKG_ROOT=$VCPKG_INSTALLATION_ROOT" >> $GITHUB_ENV

Expand Down Expand Up @@ -268,15 +263,10 @@ jobs:
env | sort
continue-on-error: true

- if: ${{ matrix.os == 'ubuntu-24.04' || matrix.os == 'ubuntu-24.04-arm64-private' || matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' }}
- if: ${{ matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' || matrix.os == 'ubuntu-24.04' }}
name: Update apt sources
run: sudo apt-get update

- if: ${{ matrix.os == 'ubuntu-24.04' && matrix.arch == 'arm64' }}
name: Install tooling for cross compilation - ubuntu arm64
run: |
sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libc6-dev-arm64-cross lld

- if: ${{ matrix.os == 'windows-2022' }}
name: Configure cmake generator in windows
run: echo "CMAKE_GENERATOR=Visual Studio 17 2022" >> $env:GITHUB_ENV
Expand All @@ -290,20 +280,8 @@ jobs:
- if: ${{ matrix.os == 'windows-2022' && matrix.arch == 'arm64' }}
run: echo "CMAKE_GENERATOR_PLATFORM=ARM64" >> $env:GITHUB_ENV

- if: ${{ matrix.os == 'ubuntu-24.04' }}
name: Install vulkan in linux x64
run: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list
sudo apt update
sudo apt install -y vulkan-sdk

- if: ${{ matrix.os == 'ubuntu-24.04-arm64-private' }}
name: Build Vulkan SDK for linux arm64 (with S3 cache)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_DEFAULT_REGION: us-east-1
- if: ${{ matrix.os == 'ubuntu-22.04' || matrix.os == 'ubuntu-22.04-arm' }}
name: Download Vulkan SDK for ubuntu-22.04
run: |
sudo apt install -y xz-utils
wget -q -O /tmp/vulkansdk.tar.xz https://sdk.lunarg.com/sdk/download/latest/linux/vulkan_sdk.tar.xz
Expand All @@ -312,34 +290,20 @@ jobs:
cd vulkan
tar xf /tmp/vulkansdk.tar.xz --strip-components=1

# Extract SDK major.minor version from README.txt (e.g., "1.4" from "1.4.341.0")
SDK_VERSION=$(grep -o 'sdk/[0-9]*\.[0-9]*' README.txt | head -1 | sed 's|sdk/||')
S3_BUCKET="tether-ai-dev"
S3_KEY="vulkan-sdk-cache/linux-arm64-${SDK_VERSION}.tar.gz"

echo "Vulkan SDK version: ${SDK_VERSION}"

# Try to download cached build from S3
if aws s3 cp "s3://${S3_BUCKET}/${S3_KEY}" /tmp/vulkan-arm64-cache.tar.gz 2>/dev/null; then
echo "Found cached Vulkan SDK, extracting..."
tar xzf /tmp/vulkan-arm64-cache.tar.gz -C ~/vulkan
rm /tmp/vulkan-arm64-cache.tar.gz
else
echo "No cache found, building Vulkan SDK for ARM64..."
./vulkansdk --maxjobs

# Upload the compiled SDK to S3 for future runs
echo "Uploading compiled SDK to S3..."
tar czf /tmp/vulkan-arm64-cache.tar.gz aarch64
aws s3 cp /tmp/vulkan-arm64-cache.tar.gz "s3://${S3_BUCKET}/${S3_KEY}"
rm /tmp/vulkan-arm64-cache.tar.gz
fi
- if: ${{ matrix.os == 'ubuntu-22.04' }}
name: Setup Vulkan SDK path for ubuntu-22.04 x64
run: |
VULKAN_SDK=~/vulkan/x86_64
echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV

- if: ${{ matrix.os == 'ubuntu-24.04-arm64-private' }}
name: Setup Vulkan SDK environment for linux arm64
- if: ${{ matrix.os == 'ubuntu-22.04-arm' }}
name: Build Vulkan SDK for ubuntu-22.04 arm64
run: |
VULKAN_SDK=~/vulkan/aarch64
echo "VULKAN_SDK=$VULKAN_SDK" >> $GITHUB_ENV
cd ~/vulkan
./vulkansdk --maxjobs

echo "PATH=$VULKAN_SDK/bin:$PATH" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=$VULKAN_SDK/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> $GITHUB_ENV
echo "VK_ADD_LAYER_PATH=$VULKAN_SDK/share/vulkan/explicit_layer.d" >> $GITHUB_ENV
Expand Down
14 changes: 14 additions & 0 deletions packages/qvac-lib-inference-addon-onnx-ocr-fasttext/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

# [0.1.6] - 2026-02-09

### Changed

- Replaced fixed-width recognizer preprocessing with EasyOCR-style dynamic-width resizing for improved OCR accuracy. Images are now resized to the model height with proportionally scaled width using LANCZOS4 interpolation, instead of being scaled to fit within a fixed 512px-wide input.
- Switched to dynamic-width recognizer models (`rec_dyn`). Batch inference now uses per-batch proportional width instead of fixed `RECOGNIZER_MODEL_WIDTH`.
- Updated default model path from `rec_512` to `rec_dyn` across tests, benchmarks, and scripts.
- Replaced English recognizer with Latin recognizer in unit tests (`recognizer_english.onnx` → `recognizer_latin.onnx`).
- Added `--model-dir` CLI option to batch OCR CLI, evaluate script, and QVAC OCR backend for configurable model directory.

### Fixed

- Improved Portuguese OCR accuracy (minor punctuation corrections in test expected outputs).

# [0.1.2] - 2026-01-16

### Changed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,18 +262,35 @@ cv::Mat normalizeAndPad(const cv::Mat &img, int channels, int height, int maxWid
}

/**
* @brief resizes the image to fit recognizer input sizes
* @brief calculates the proportional width for EasyOCR-style resizing
*
* The image is not simply resized to recognizer input format. After height is adjusted, the portion corresponding to [new image width,
* recognizerImageWidth] is padded with the last column of the image
* Always scales height to RECOGNIZER_MODEL_HEIGHT, width is proportional to aspect ratio.
* This matches EasyOCR's preprocessing approach.
*
* @param width : original image width
* @param height : original image height
* @return int : the proportional width after resizing to model height
*/
int calculateProportionalWidth(int width, int height) {
  // A degenerate (empty) image has no usable aspect ratio: dividing by a zero
  // height yields inf/NaN, and converting a non-finite float to int is undefined
  // behaviour. Return the same 1-pixel minimum the normal path guarantees.
  if (width <= 0 || height <= 0) {
    return 1;
  }

  // Scale the width by the same factor that maps `height` to the model height,
  // rounding up so that no partial column is lost.
  const float ratio = static_cast<float>(width) / static_cast<float>(height);
  const int newWidth = static_cast<int>(std::ceil(RECOGNIZER_MODEL_HEIGHT * ratio));
  return std::max(1, newWidth); // Ensure at least 1 pixel width
}

/**
* @brief resizes the image to fit recognizer input sizes (EasyOCR-style)
*
* Always scales height to RECOGNIZER_MODEL_HEIGHT (64), width is proportional.
* The image is then padded to targetWidth for batching.
*
* It also receives contrast treatment according to adjustContrast
*
* @param subImage : image to be treated
* @param targetWidth : target width for padding (typically max width in batch)
* @param adjustContrast : target contrast
* @return adjusted image
*/
cv::Mat alignAndCollate(const SubImage &subImage, double adjustContrast = 0.0) {
cv::Mat alignAndCollate(const SubImage &subImage, int targetWidth, double adjustContrast = 0.0) {
cv::Mat image = subImage.image;
int width = image.cols;
int height = image.rows;
Expand All @@ -285,25 +302,21 @@ cv::Mat alignAndCollate(const SubImage &subImage, double adjustContrast = 0.0) {
image = adjustContrastGrey(image, adjustContrast);
}

// Aspect-preserving resize (ExecutorTorch approach)
// Scale both dimensions by the same ratio to fit within RECOGNIZER_MODEL_WIDTH x RECOGNIZER_MODEL_HEIGHT
float heightRatio = static_cast<float>(RECOGNIZER_MODEL_HEIGHT) / static_cast<float>(height);
float widthRatio = static_cast<float>(RECOGNIZER_MODEL_WIDTH) / static_cast<float>(width);
float resizeRatio = std::min(heightRatio, widthRatio);
// EasyOCR-style resize: always scale height to model height, width proportional
int proportionalWidth = calculateProportionalWidth(width, height);

int resizedW = static_cast<int>(std::round(static_cast<float>(width) * resizeRatio));
int resizedH = static_cast<int>(std::round(static_cast<float>(height) * resizeRatio));

// Clamp to model dimensions
resizedW = std::min(resizedW, RECOGNIZER_MODEL_WIDTH);
resizedH = std::min(resizedH, RECOGNIZER_MODEL_HEIGHT);
// Use LANCZOS interpolation like EasyOCR
cv::Mat resizedImage;
cv::resize(image, resizedImage, cv::Size(proportionalWidth, RECOGNIZER_MODEL_HEIGHT), 0, 0, cv::INTER_LANCZOS4);

// Use INTER_AREA for downscaling, INTER_CUBIC for upscaling
int interpolation = (resizeRatio < 1.0F) ? cv::INTER_AREA : cv::INTER_CUBIC;
return normalizeAndPad(resizedImage, 1 /*grayscale*/, RECOGNIZER_MODEL_HEIGHT, targetWidth);
}

cv::Mat resizedImage;
cv::resize(image, resizedImage, cv::Size(resizedW, resizedH), 0, 0, interpolation);
return normalizeAndPad(resizedImage, 1 /*grayscale*/, RECOGNIZER_MODEL_HEIGHT, RECOGNIZER_MODEL_WIDTH);
/**
 * @brief Legacy version for backward compatibility - uses fixed RECOGNIZER_MODEL_WIDTH
 *
 * Forwards to the targetWidth overload, passing RECOGNIZER_MODEL_WIDTH as the padding width.
 *
 * NOTE(review): because the other overload takes an int as its second parameter, a call whose
 * second argument is an integer would resolve to that overload rather than this one - confirm
 * callers of this version pass the contrast as a double.
 *
 * @param subImage : image to be treated
 * @param adjustContrast : target contrast
 * @return adjusted image
 */
cv::Mat alignAndCollate(const SubImage &subImage, double adjustContrast = 0.0) {
return alignAndCollate(subImage, RECOGNIZER_MODEL_WIDTH, adjustContrast);
}

/**
Expand Down Expand Up @@ -799,19 +812,21 @@ cv::Mat StepRecognizeText::runInferenceOnImg(const cv::Mat &img) {
return preds.clone();
}

cv::Mat StepRecognizeText::runBatchInference(const std::vector<cv::Mat> &images) {
cv::Mat StepRecognizeText::runBatchInference(const std::vector<cv::Mat> &images, int dynamicWidth) {
auto t0 = std::chrono::high_resolution_clock::now();
if (images.empty()) {
return cv::Mat();
}

const int batchSize = static_cast<int>(images.size());
QLOG(qvac_lib_inference_addon_cpp::logger::Priority::DEBUG,
"[Recognition] runBatchInference called with batch_size=" + std::to_string(batchSize));
const int height = RECOGNIZER_MODEL_HEIGHT;
const int width = RECOGNIZER_MODEL_WIDTH;
const int width = dynamicWidth;
const int numChannels = 1;

QLOG(qvac_lib_inference_addon_cpp::logger::Priority::DEBUG,
"[Recognition] runBatchInference called with batch_size=" + std::to_string(batchSize) +
", dynamic_width=" + std::to_string(width));

// Create batch tensor: [batch, channels, height, width]
std::vector<float> batchData(batchSize * numChannels * height * width);

Expand Down Expand Up @@ -918,17 +933,28 @@ std::vector<InferredText> StepRecognizeText::processImgList() {
size_t batchEnd = std::min(batchStart + static_cast<size_t>(batchSize), allIndices.size());
size_t currentBatchSize = batchEnd - batchStart;

// Prepare images ONLY for this batch
// Calculate max proportional width for this batch (EasyOCR-style dynamic batching)
int maxProportionalWidth = 0;
for (size_t i = batchStart; i < batchEnd; i++) {
auto &idx = allIndices[i];
auto &subImage = imgListOfLists_[idx.listIdx][idx.imgIdx];
int propWidth = calculateProportionalWidth(subImage.image.cols, subImage.image.rows);
maxProportionalWidth = std::max(maxProportionalWidth, propWidth);
}
// Ensure minimum width for model stability
maxProportionalWidth = std::max(maxProportionalWidth, RECOGNIZER_MODEL_HEIGHT);

// Prepare images ONLY for this batch, using dynamic max width
std::vector<cv::Mat> preparedImages;
preparedImages.reserve(currentBatchSize);
for (size_t i = batchStart; i < batchEnd; i++) {
auto &idx = allIndices[i];
auto &subImage = imgListOfLists_[idx.listIdx][idx.imgIdx];
cv::Mat preparedImg = alignAndCollate(subImage, 0.0);
cv::Mat preparedImg = alignAndCollate(subImage, maxProportionalWidth, 0.0);
preparedImages.push_back(preparedImg);
}

cv::Mat batchPreds = runBatchInference(preparedImages);
cv::Mat batchPreds = runBatchInference(preparedImages, maxProportionalWidth);

// Decode results and populate SubImages for this batch
for (size_t i = 0; i < currentBatchSize; i++) {
Expand Down Expand Up @@ -966,16 +992,26 @@ std::vector<InferredText> StepRecognizeText::processImgList() {
for (size_t batchStart = 0; batchStart < lowConfidenceIndices.size(); batchStart += batchSize) {
size_t batchEnd = std::min(batchStart + static_cast<size_t>(batchSize), lowConfidenceIndices.size());

// Calculate max proportional width for contrast batch
int maxProportionalWidth = 0;
for (size_t j = batchStart; j < batchEnd; j++) {
auto &idx = lowConfidenceIndices[j];
auto &subImage = imgListOfLists_[idx.listIdx][idx.imgIdx];
int propWidth = calculateProportionalWidth(subImage.image.cols, subImage.image.rows);
maxProportionalWidth = std::max(maxProportionalWidth, propWidth);
}
maxProportionalWidth = std::max(maxProportionalWidth, RECOGNIZER_MODEL_HEIGHT);

std::vector<cv::Mat> contrastImages;
contrastImages.reserve(batchEnd - batchStart);
for (size_t j = batchStart; j < batchEnd; j++) {
auto &idx = lowConfidenceIndices[j];
auto &subImage = imgListOfLists_[idx.listIdx][idx.imgIdx];
cv::Mat contrastImg = alignAndCollate(subImage, TARGET_ADJUSTED_CONTRAST);
cv::Mat contrastImg = alignAndCollate(subImage, maxProportionalWidth, TARGET_ADJUSTED_CONTRAST);
contrastImages.push_back(contrastImg);
}

cv::Mat contrastPreds = runBatchInference(contrastImages);
cv::Mat contrastPreds = runBatchInference(contrastImages, maxProportionalWidth);

for (size_t j = 0; j < contrastImages.size(); j++) {
auto &idx = lowConfidenceIndices[batchStart + j];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,13 @@ struct StepRecognizeText {
cv::Mat runInferenceOnImg(const cv::Mat &img);

/**
* @brief runs ONNX batch inference on multiple images
* @brief runs ONNX batch inference on multiple images with dynamic width
*
* @param images : vector of prepared recognizer inputs
* @param dynamicWidth : the width of input images (for dynamic-width models)
* @return cv::Mat : the recognizer predictions with shape [batch, seq_len, num_chars]
*/
cv::Mat runBatchInference(const std::vector<cv::Mat> &images);
cv::Mat runBatchInference(const std::vector<cv::Mat> &images, int dynamicWidth);

/**
* @brief processes the sub image to run recognizer inference and populate text and confidence score
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
language: str = "en",
timeout: int = 600,
batch_size: int = 50,
model_dir: str = "rec_dyn",
**kwargs
):
"""Initialize QVAC OCR backend.
Expand All @@ -35,13 +36,15 @@ def __init__(
language: Language code for OCR (e.g., 'en')
timeout: Timeout in seconds for batch operations
batch_size: Number of images to process in one batch
model_dir: Model directory name (e.g., 'rec_dyn' or 'rec_512')
**kwargs: Additional arguments passed to parent
"""
super().__init__(name="qvac", **kwargs)
self.bare_path = bare_path
self.language = language
self.timeout = timeout
self.batch_size = batch_size
self.model_dir = model_dir

# Determine addon path
if addon_path:
Expand Down Expand Up @@ -110,7 +113,8 @@ def _run_batch(self, image_paths: List[str]) -> dict:
str(self.batch_cli_script),
"--input", input_file,
"--output", output_file,
"--lang", self.language
"--lang", self.language,
"--model-dir", self.model_dir
]

result = subprocess.run(
Expand Down
Loading