Skip to content

Commit

Permalink
Add initial support for traineddata files in zip format
Browse files Browse the repository at this point in the history
This requires libzip-dev or libminizip-dev.

So far, little-endian builds of Tesseract work with the new format.
More work is needed for the training tools and for big-endian support.

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed May 13, 2017
1 parent aa3222f commit 33bb2ea
Show file tree
Hide file tree
Showing 6 changed files with 221 additions and 5 deletions.
7 changes: 7 additions & 0 deletions api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)

tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
# Link the tesseract executable against one ZIP library.
# libzip takes precedence; minizip is used only when the HAVE_LIBZIP
# conditional is false and HAVE_MINIZIP is true.
if HAVE_LIBZIP
tesseract_LDADD += $(libzip_LIBS)
else
if HAVE_MINIZIP
tesseract_LDADD += $(minizip_LIBS)
endif
endif

if T_WIN
tesseract_LDADD += -ltiff
Expand Down
8 changes: 8 additions & 0 deletions ccutil/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp

# Add the compiler flags of one ZIP library to the preprocessor flags.
# libzip takes precedence; the minizip flags are used only when the
# HAVE_LIBZIP conditional is false and HAVE_MINIZIP is true.
if HAVE_LIBZIP
AM_CPPFLAGS += $(libzip_CFLAGS)
else
if HAVE_MINIZIP
AM_CPPFLAGS += $(minizip_CFLAGS)
endif
endif

if T_WIN
AM_CPPFLAGS += -I$(top_srcdir)/vs2010/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
noinst_HEADERS += ../vs2010/port/strtok_r.h
Expand Down
152 changes: 152 additions & 0 deletions ccutil/tessdatamanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,21 @@
#pragma warning(disable:4244) // Conversion warnings
#endif

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

#include "tessdatamanager.h"

#include <string>

#include <stdio.h>
#if defined(HAVE_LIBZIP)
#include <zip.h>
#elif defined(HAVE_MINIZIP)
#include <unzip.h>
#endif // ZIP supported

#include "helpers.h"
#include "serialis.h"
Expand All @@ -33,9 +45,144 @@

namespace tesseract {

#if defined(HAVE_LIBZIP)
bool TessdataManager::LoadZipFile(const char *filename) {
bool result = false;
fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, filename);
std::string zipfile(filename);
int err;
zip_t *uf = zip_open(zipfile.c_str(), ZIP_RDONLY, &err);
if (uf == nullptr) {
zipfile += ".zip";
uf = zip_open(zipfile.c_str(), ZIP_RDONLY, &err);
}
if (uf != nullptr) {
fprintf(stderr, "zip_open(%s) passed\n", zipfile.c_str());
int64_t nEntries = zip_get_num_entries(uf, ZIP_FL_UNCHANGED);
for (int i = 0; i < nEntries; i++) {
zip_stat_t zipStat;
if (zip_stat_index(uf, i, ZIP_FL_UNCHANGED, &zipStat) == 0 &&
(zipStat.valid & ZIP_STAT_NAME) && (zipStat.valid & ZIP_STAT_SIZE)) {
//~ fprintf(stderr,
//~ "zip_get_name(...) passed, file %s\n", zipStat.name);

TessdataType type;
if (TessdataTypeFromFileName(zipStat.name, &type)) {
fprintf(stderr,
"TessdataTypeFromFileName(%s, ...) passed, type %d\n",
zipStat.name, type);
zip_file_t *zipFile = zip_fopen_index(uf, i, ZIP_FL_UNCHANGED);
if (zipFile == nullptr) {
fprintf(stderr, "zip_fopen_index(...) failed\n");
} else {
entries_[type].resize_no_init(zipStat.size);
if (zip_fread(zipFile, &entries_[type][0], zipStat.size) !=
static_cast<int64_t>(zipStat.size)) {
fprintf(stderr, "zip_fread(...) failed\n");
}
zip_fclose(zipFile);
}
}
}
}
is_loaded_ = true;
err = zip_close(uf);
if (err != 0) {
fprintf(stderr, "zip_close(...) failed\n");
}
result = true;
} else {
fprintf(stderr, "zip_open(%s) failed\n", zipfile.c_str());

}
return result;
}
#elif defined(HAVE_MINIZIP)
// Loads every recognised tessdata component from a zip archive into
// entries_ (minizip backend).
//
// filename is tried as given first and then with ".zip" appended.
// Archive members whose names TessdataTypeFromFileName() does not recognise
// are ignored. A component that is empty, cannot be opened, cannot be read
// completely, or fails the check done by unzCloseCurrentFile() is left empty
// in entries_ (problems are reported on stderr) instead of holding partially
// read, uninitialised data.
//
// Returns true, and sets is_loaded_, only if at least one component was read
// completely. Returns false if filename is null, the archive cannot be
// opened, its global info cannot be read, or it contains no readable
// component.
bool TessdataManager::LoadZipFile(const char *filename) {
  if (filename == nullptr) {
    fprintf(stderr, "TessdataManager::%s: no file name given\n", __func__);
    return false;
  }
  fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, filename);

  std::string zipfile(filename);
  unzFile uf = unzOpen(zipfile.c_str());
  if (uf == nullptr) {
    // Second attempt: the same name with a ".zip" suffix.
    zipfile += ".zip";
    uf = unzOpen(zipfile.c_str());
  }
  if (uf == nullptr) {
    fprintf(stderr, "unzOpen(%s) failed\n", zipfile.c_str());
    perror(zipfile.c_str());
    return false;
  }
  fprintf(stderr, "unzOpen(%s) passed\n", zipfile.c_str());

  // Without valid global info the entry count is unknown, so give up instead
  // of iterating over an uninitialised counter.
  unz_global_info global_info;
  int err = unzGetGlobalInfo(uf, &global_info);
  if (err != UNZ_OK) {
    fprintf(stderr, "unzGetGlobalInfo(...) failed, err %d\n", err);
    unzClose(uf);
    return false;
  }
  fprintf(stderr,
          "unzGetGlobalInfo(...) passed, zip file with %lu entries\n",
          global_info.number_entry);

  bool loaded_any = false;
  std::string entry_name;
  for (unsigned long i = 0; i < global_info.number_entry; ++i) {
    // unzOpen() leaves the first entry selected; advance only between
    // entries so that no step past the last entry is attempted.
    if (i > 0) {
      err = unzGoToNextFile(uf);
      if (err != UNZ_OK) {
        fprintf(stderr, "unzGoToNextFile(...) failed\n");
        break;  // Cannot advance; continuing would re-read the same entry.
      }
    }

    // First query only the metadata to learn the length of the name, then
    // fetch the name into a buffer with room for the terminating NUL.
    unz_file_info file_info;
    err = unzGetCurrentFileInfo(uf, &file_info, nullptr, 0, nullptr, 0,
                                nullptr, 0);
    if (err == UNZ_OK) {
      entry_name.assign(file_info.size_filename + 1, '\0');
      err = unzGetCurrentFileInfo(
          uf, &file_info, &entry_name[0],
          static_cast<unsigned long>(entry_name.size()), nullptr, 0,
          nullptr, 0);
    }
    if (err != UNZ_OK) {
      fprintf(stderr, "unzGetCurrentFileInfo(...) failed, err %d\n", err);
      continue;
    }

    TessdataType type;
    if (!TessdataTypeFromFileName(entry_name.c_str(), &type)) {
      continue;  // Not a tessdata component.
    }
    fprintf(stderr,
            "TessdataTypeFromFileName(%s, ...) passed, type %d\n",
            entry_name.c_str(), type);

    if (file_info.uncompressed_size == 0) {
      // Nothing to read; also avoids taking the address of element 0 of an
      // empty vector below.
      entries_[type].resize_no_init(0);
      continue;
    }

    err = unzOpenCurrentFilePassword(uf, nullptr);
    if (err != UNZ_OK) {
      fprintf(stderr, "unzOpenCurrentFilePassword(...) failed, err %d\n", err);
      continue;
    }

    // NOTE(review): resize_no_init() presumably takes an int; entries larger
    // than INT_MAX are not guarded against here - confirm.
    entries_[type].resize_no_init(file_info.uncompressed_size);
    // unzReadCurrentFile() returns the number of bytes read (negative on
    // error), so anything but the full size is a failure.
    const int bytes_read = unzReadCurrentFile(uf, &entries_[type][0],
                                              file_info.uncompressed_size);
    bool entry_ok =
        bytes_read > 0 &&
        static_cast<unsigned long>(bytes_read) == file_info.uncompressed_size;
    if (!entry_ok) {
      fprintf(stderr, "unzReadCurrentFile(...) failed, result %d\n",
              bytes_read);
    }
    // Closing reports a CRC mismatch of fully read data, so it takes part in
    // the decision whether the component is usable.
    err = unzCloseCurrentFile(uf);
    if (err != UNZ_OK) {
      fprintf(stderr, "unzCloseCurrentFile(...) failed\n");
      entry_ok = false;
    }

    if (entry_ok) {
      loaded_any = true;
    } else {
      // Drop the unusable buffer so nobody consumes garbage.
      entries_[type].resize_no_init(0);
    }
  }

  err = unzClose(uf);
  if (err != UNZ_OK) {
    fprintf(stderr, "unzClose(...) failed\n");
  }

  if (loaded_any) {
    is_loaded_ = true;
  }
  return loaded_any;
}
#endif // ZIP supported

bool TessdataManager::Init(const char *data_file_name) {
fprintf(stderr, "TessdataManager::%s(%s)\n", __func__, data_file_name);
GenericVector<char> data;
if (reader_ == nullptr) {
#if defined(HAVE_MINIZIP)
if (LoadZipFile(data_file_name)) return true;
#endif // HAVE_MINIZIP
if (!LoadDataFromFile(data_file_name, &data)) return false;
} else {
if (!(*reader_)(data_file_name, &data)) return false;
Expand All @@ -46,6 +193,7 @@ bool TessdataManager::Init(const char *data_file_name) {
// Loads from the given memory buffer as if a file.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
int size) {
// TODO: This method supports only the proprietary file format.
data_file_name_ = name;
TFile fp;
fp.Open(data, size);
Expand Down Expand Up @@ -76,6 +224,7 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
// Saves to the given filename.
bool TessdataManager::SaveFile(const STRING &filename,
FileWriter writer) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
GenericVector<char> data;
Serialize(&data);
Expand All @@ -87,6 +236,7 @@ bool TessdataManager::SaveFile(const STRING &filename,

// Serializes to the given vector.
void TessdataManager::Serialize(GenericVector<char> *data) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
// Compute the offset_table and total size.
inT64 offset_table[TESSDATA_NUM_ENTRIES];
Expand Down Expand Up @@ -146,6 +296,7 @@ bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
// Load individual tessdata components from files.
// TODO: This method supports only the proprietary file format.
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
TessdataType type;
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
Expand Down Expand Up @@ -178,6 +329,7 @@ bool TessdataManager::OverwriteComponents(
char **component_filenames,
int num_new_components) {
// Open the files with the new components.
// TODO: This method supports only the proprietary file format.
for (int i = 0; i < num_new_components; ++i) {
TessdataType type;
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
Expand Down
2 changes: 2 additions & 0 deletions ccutil/tessdatamanager.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ class TessdataManager {

private:

bool LoadZipFile(const char *filename);

// Saves to the given filename.
bool SaveFile(const STRING &filename, FileWriter writer) const;

Expand Down
25 changes: 25 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,31 @@ else
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
fi
# Probe for optional archive libraries (for traineddata files in zip format).
# libarchive and zziplib are only probed: their Automake conditionals are
# hard-wired to false, although the matching HAVE_ preprocessor macros are
# still defined when pkg-config finds the library.
AM_CONDITIONAL([HAVE_LIBARCHIVE], false)
AM_CONDITIONAL([HAVE_ZZIPLIB], false)
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
if $have_libarchive; then
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
fi
# libzip: the result drives both the HAVE_LIBZIP Automake conditional and the
# HAVE_LIBZIP preprocessor macro.
PKG_CHECK_MODULES([libzip], [libzip], [have_libzip=true], [have_libzip=false])
AM_CONDITIONAL([HAVE_LIBZIP], [$have_libzip])
if $have_libzip; then
AC_DEFINE([HAVE_LIBZIP], [], [Enable libzip])
fi
# minizip: handled like libzip. Both macros are defined when both libraries
# are installed.
PKG_CHECK_MODULES([minizip], [minizip], [have_minizip=true], [have_minizip=false])
AM_CONDITIONAL([HAVE_MINIZIP], [$have_minizip])
if $have_minizip; then
AC_DEFINE([HAVE_MINIZIP], [], [Enable minizip])
fi
PKG_CHECK_MODULES([zziplib], [zziplib], [have_zziplib=true], [have_zziplib=false])
if $have_zziplib; then
AC_DEFINE([HAVE_ZZIPLIB], [], [Enable zziplib])
fi
AM_CONDITIONAL([ENABLE_TRAINING], true)
# Check location of icu headers
Expand Down
32 changes: 27 additions & 5 deletions training/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ AM_CPPFLAGS += \

EXTRA_DIST = language-specific.sh tesstrain.sh tesstrain_utils.sh

# TODO: training programs can not be linked to shared library created
# with -fvisibility
# TODO: training programs can not be linked to shared library created
# with -fvisibility
if VISIBILITY
AM_LDFLAGS += -all-static
endif
Expand All @@ -26,9 +26,9 @@ noinst_HEADERS = \
noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la

libtesseract_training_la_LIBADD = \
../cutil/libtesseract_cutil.la
../cutil/libtesseract_cutil.la
# ../api/libtesseract.la

libtesseract_training_la_SOURCES = \
boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \
fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \
Expand Down Expand Up @@ -374,5 +374,27 @@ mftraining_LDADD += $(LEPTONICA_LIBS)
set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
shapeclustering_LDADD += $(LEPTONICA_LIBS)
text2image_LDADD += $(LEPTONICA_LIBS)
unicharset_extractor_LDFLAGS += $(LEPTONICA_LIBS)
unicharset_extractor_LDFLAGS += $(LEPTONICA_LIBS)
wordlist2dawg_LDADD += $(LEPTONICA_LIBS)

# Link the training tools against a single ZIP library, chosen the same way
# as for the tesseract executable: libzip when the HAVE_LIBZIP conditional is
# true, otherwise minizip when HAVE_MINIZIP is true.
if HAVE_LIBZIP
ambiguous_words_LDADD += $(libzip_LIBS)
classifier_tester_LDADD += $(libzip_LIBS)
cntraining_LDADD += $(libzip_LIBS)
combine_tessdata_LDADD += $(libzip_LIBS)
lstmeval_LDADD += $(libzip_LIBS)
lstmtraining_LDADD += $(libzip_LIBS)
mftraining_LDADD += $(libzip_LIBS)
set_unicharset_properties_LDADD += $(libzip_LIBS)
shapeclustering_LDADD += $(libzip_LIBS)
wordlist2dawg_LDADD += $(libzip_LIBS)
else
if HAVE_MINIZIP
ambiguous_words_LDADD += $(minizip_LIBS)
classifier_tester_LDADD += $(minizip_LIBS)
cntraining_LDADD += $(minizip_LIBS)
combine_tessdata_LDADD += $(minizip_LIBS)
lstmeval_LDADD += $(minizip_LIBS)
lstmtraining_LDADD += $(minizip_LIBS)
mftraining_LDADD += $(minizip_LIBS)
set_unicharset_properties_LDADD += $(minizip_LIBS)
shapeclustering_LDADD += $(minizip_LIBS)
wordlist2dawg_LDADD += $(minizip_LIBS)
endif
endif

0 comments on commit 33bb2ea

Please sign in to comment.