Skip to content

Commit

Permalink
Add initial support for traineddata files in zip format
Browse files Browse the repository at this point in the history
This requires libminizip-dev.

So far, only little-endian builds of tesseract work with the new format.
More work is needed for the training tools and for big-endian support.

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed May 12, 2017
1 parent f98d731 commit e0fa1ed
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 0 deletions.
1 change: 1 addition & 0 deletions api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)

tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
tesseract_LDADD += $(minizip_LIBS)

if T_WIN
tesseract_LDADD += -ltiff
Expand Down
2 changes: 2 additions & 0 deletions ccutil/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp

AM_CPPFLAGS += $(minizip_CFLAGS)

if T_WIN
AM_CPPFLAGS += -I$(top_srcdir)/vs2010/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
noinst_HEADERS += ../vs2010/port/strtok_r.h
Expand Down
115 changes: 115 additions & 0 deletions ccutil/tessdatamanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@

#include "tessdatamanager.h"

#include <string>

#include <stdio.h>
#include <unzip.h>

#include "helpers.h"
#include "serialis.h"
Expand All @@ -33,6 +36,108 @@

namespace tesseract {

#define WITH_ZIP 1

#if WITH_ZIP

bool TessdataManager::LoadZipFile(const char *filename) {
bool result = false;
fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, filename);
std::string zipfile(filename);
unzFile uf = unzOpen(zipfile.c_str());
if (uf == nullptr) {
zipfile += ".zip";
uf = unzOpen(zipfile.c_str());
}
if (uf != nullptr) {
fprintf(stderr, "unzOpen(%s) passed\n", zipfile.c_str());
unz_global_info global_info;
int err;
err = unzGetGlobalInfo(uf, &global_info);
if (err == UNZ_OK) {
fprintf(stderr,
"unzGetGlobalInfo(...) passed, zip file with %lu entries\n",
global_info.number_entry);
}
unz_file_info file_info;
char fileName[32];
char extraField[32];
char comment[32];
//~ $1 = {version = 798, version_needed = 20, flag = 0, compression_method = 8, dosDate = 1252768343, crc = 2481269679, compressed_size = 7131663, uncompressed_size = 16109842,
//~ size_filename = 15, size_file_extra = 24, size_file_comment = 0, disk_num_start = 0, internal_fa = 0, external_fa = 2175008768, tmu_date = {tm_sec = 46, tm_min = 18,
//~ tm_hour = 23, tm_mday = 11, tm_mon = 4, tm_year = 2017}}
for (unsigned i = 0; i < global_info.number_entry; i++) {
err = unzGetCurrentFileInfo(uf, &file_info,
fileName, sizeof(fileName),
extraField, sizeof(extraField),
comment, sizeof(comment));
if (err == UNZ_OK) {
//~ fprintf(stderr,
//~ "unzGetCurrentFileInfo(...) passed, file %s, %lu byte\n",
//~ fileName, file_info.uncompressed_size);

char *suffix = strchr(fileName, '.');
if (suffix != nullptr) {
TessdataType type;
if (TessdataTypeFromFileSuffix(suffix + 1, &type)) {
fprintf(stderr,
"TessdataTypeFromFileSuffix(...) passed, type %d\n", type);
err = unzOpenCurrentFilePassword(uf, nullptr);
if (err != UNZ_OK) {
fprintf(stderr, "unzOpenCurrentFilePassword(...) failed, err %d\n", err);
} else {
entries_[type].resize_no_init(file_info.uncompressed_size);
err = unzReadCurrentFile(uf, &entries_[type][0], file_info.uncompressed_size);
if (err < UNZ_OK) {
fprintf(stderr, "unzReadCurrentFile(...) failed, err %d\n", err);
}
err = unzCloseCurrentFile(uf);
if (err != UNZ_OK) {
fprintf(stderr, "unzCloseCurrentFile(...) failed\n");
}
}
}
}
}
//~ err = unzGoToFirstFile(uf);

err = unzGoToNextFile(uf);
if (err != UNZ_OK) {
fprintf(stderr, "unzGoToNextFile(...) failed\n");
}
}
is_loaded_ = true;
err = unzClose(uf);
if (err != UNZ_OK) {
fprintf(stderr, "unzClose(...) failed\n");
}
result = true;
} else {
fprintf(stderr, "unzOpen(%s) failed\n", zipfile.c_str());
perror(zipfile.c_str());
}
return result;
}

bool TessdataManager::Init(const char *data_file_name) {
fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, data_file_name);
GenericVector<char> data;
if (reader_ == nullptr) {
return LoadZipFile(data_file_name);
} else {
if (!(*reader_)(data_file_name, &data)) return false;
}
return LoadMemBuffer(data_file_name, &data[0], data.size());
}

// Placeholder for loading a traineddata file from memory in zip builds.
// The buffer (data, size) is not examined: the call is only traced to stderr
// and always reports failure.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
                                    int size) {
  const bool loaded = false;
  fprintf(stderr, "TessdataManager::%s(%s,...)\n", __func__, name);
  return loaded;
}

#else

bool TessdataManager::Init(const char *data_file_name) {
GenericVector<char> data;
if (reader_ == nullptr) {
Expand Down Expand Up @@ -112,6 +217,8 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
}
}

#endif

// Resets to the initial state, keeping the reader.
void TessdataManager::Clear() {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
Expand All @@ -135,13 +242,16 @@ void TessdataManager::Directory() const {
// Points fp at the in-memory data of the component with the given type,
// loading the traineddata file first if that has not happened yet.
// Returns false if loading fails or if the component is empty.
bool TessdataManager::GetComponent(TessdataType type, TFile *fp) {
  fprintf(stderr, "TessdataManager::%s(%d,...)\n", __func__, type);
  if (!is_loaded_) {
    // Lazy initialisation from the remembered file name.
    if (!Init(data_file_name_.string())) return false;
  }
  auto &entry = entries_[type];
  if (entry.empty()) return false;
  fp->Open(&entry[0], entry.size());
  fp->set_swap(swap_);
  return true;
}

#if !WITH_ZIP

bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
Expand Down Expand Up @@ -199,6 +309,7 @@ bool TessdataManager::ExtractToFile(const char *filename) {
if (entries_[type].empty()) return false;
return SaveDataToFile(entries_[type], filename);
}
#endif

bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
TessdataType *type) {
Expand All @@ -213,6 +324,8 @@ bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
return false;
}

#if !WITH_ZIP

bool TessdataManager::TessdataTypeFromFileName(const char *filename,
TessdataType *type) {
// Get the file suffix (extension)
Expand All @@ -221,4 +334,6 @@ bool TessdataManager::TessdataTypeFromFileName(const char *filename,
return TessdataTypeFromFileSuffix(suffix, type);
}

#endif

} // namespace tesseract
2 changes: 2 additions & 0 deletions ccutil/tessdatamanager.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ class TessdataManager {

private:

bool LoadZipFile(const char *filename);

// Saves to the given filename.
bool SaveFile(const STRING &filename, FileWriter writer) const;

Expand Down
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,8 @@ else
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
fi
PKG_CHECK_MODULES([minizip], [minizip], [have_minizip=true], [have_minizip=false])
AM_CONDITIONAL([ENABLE_TRAINING], true)
# Check location of icu headers
Expand Down

0 comments on commit e0fa1ed

Please sign in to comment.