Skip to content

Commit

Permalink
New API method and unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
nguyenq committed Nov 18, 2015
1 parent 935516d commit dbe6903
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 2 deletions.
10 changes: 10 additions & 0 deletions src/main/java/net/sourceforge/tess4j/ITesseract.java
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,14 @@ public enum RenderedFormat {
* @throws TesseractException
*/
void createDocuments(String[] filenames, String[] outputbases, List<RenderedFormat> formats) throws TesseractException;

/**
* Gets segmented regions of the input image at the given page iterator
* level.
*
* @param bi input image
* @param level TessPageIteratorLevel enum
* @return list of <code>Rectangle</code>, one per segmented region
* @throws TesseractException if the regions cannot be obtained
*/
List<Rectangle> getRegions(BufferedImage bi, int level) throws TesseractException;
}
45 changes: 43 additions & 2 deletions src/main/java/net/sourceforge/tess4j/Tesseract.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,18 @@

import com.sun.jna.Pointer;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.IntByReference;
import com.sun.jna.ptr.PointerByReference;
import java.awt.Rectangle;
import java.awt.image.*;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import javax.imageio.IIOImage;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Boxa;
import static net.sourceforge.lept4j.ILeptonica.L_CLONE;
import net.sourceforge.lept4j.Leptonica;
import static net.sourceforge.tess4j.ITessAPI.TRUE;

import net.sourceforge.tess4j.ITessAPI.TessBaseAPI;
import net.sourceforge.tess4j.ITessAPI.TessOcrEngineMode;
Expand All @@ -33,7 +37,6 @@
import net.sourceforge.tess4j.util.ImageIOHelper;
import net.sourceforge.tess4j.util.LoggHelper;
import net.sourceforge.tess4j.util.PdfUtilities;
import net.sourceforge.tess4j.util.Utils;
import org.slf4j.*;

/**
Expand Down Expand Up @@ -561,6 +564,44 @@ private void createDocuments(String filename, TessResultRenderer renderer) throw
}
}

/**
 * Gets segmented regions of the input image at the given page iterator
 * level.
 *
 * @param bi input image
 * @param level TessPageIteratorLevel enum
 * @return list of <code>Rectangle</code>, one per component box returned
 * by the engine; empty if no boxes were returned
 * @throws TesseractException if an I/O error occurs while processing the
 * image
 */
@Override
public List<Rectangle> getRegions(BufferedImage bi, int level) throws TesseractException {
    init();
    setTessVariables();

    try {
        List<Rectangle> list = new ArrayList<Rectangle>();
        setImage(bi, null);

        Boxa boxes = api.TessBaseAPIGetComponentImages(handle, level, TRUE, null, null);
        if (boxes == null) {
            // Nothing to iterate over and nothing to release.
            return list;
        }

        Leptonica leptInstance = Leptonica.INSTANCE;
        try {
            int boxCount = leptInstance.boxaGetCount(boxes);
            for (int i = 0; i < boxCount; i++) {
                Box box = leptInstance.boxaGetBox(boxes, i, L_CLONE);
                if (box == null) {
                    continue;
                }
                list.add(new Rectangle(box.x, box.y, box.w, box.h));

                // L_CLONE hands back a new reference-counted handle; release
                // it once the coordinates have been copied into Java.
                leptInstance.boxDestroy(new PointerByReference(box.getPointer()));
            }
        } finally {
            // The Boxa is allocated on the native side and is not released
            // by dispose(), so free it explicitly.
            leptInstance.boxaDestroy(new PointerByReference(boxes.getPointer()));
        }

        return list;
    } catch (IOException ioe) {
        // log the failure and report it to the caller
        logger.error(ioe.getMessage(), ioe);
        throw new TesseractException(ioe);
    } finally {
        dispose();
    }
}

/**
* Releases all of the native resources used by this instance.
*/
Expand Down
42 changes: 42 additions & 0 deletions src/main/java/net/sourceforge/tess4j/Tesseract1.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
import java.nio.ByteBuffer;
import java.util.*;
import javax.imageio.IIOImage;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Boxa;
import static net.sourceforge.lept4j.ILeptonica.L_CLONE;
import net.sourceforge.lept4j.Leptonica1;
import static net.sourceforge.tess4j.ITessAPI.TRUE;

import net.sourceforge.tess4j.util.ImageIOHelper;
import net.sourceforge.tess4j.util.LoggHelper;
Expand Down Expand Up @@ -527,6 +532,43 @@ private void createDocuments(String filename, TessResultRenderer renderer) throw
// }
}

/**
 * Gets segmented regions of the input image at the given page iterator
 * level.
 *
 * @param bi input image
 * @param level TessPageIteratorLevel enum
 * @return list of <code>Rectangle</code>, one per component box returned
 * by the engine; empty if no boxes were returned
 * @throws TesseractException if an I/O error occurs while processing the
 * image
 */
@Override
public List<Rectangle> getRegions(BufferedImage bi, int level) throws TesseractException {
    init();
    setTessVariables();

    try {
        List<Rectangle> list = new ArrayList<Rectangle>();
        setImage(bi, null);

        Boxa boxes = TessBaseAPIGetComponentImages(handle, level, TRUE, null, null);
        if (boxes == null) {
            // Nothing to iterate over and nothing to release.
            return list;
        }

        try {
            int boxCount = Leptonica1.boxaGetCount(boxes);
            for (int i = 0; i < boxCount; i++) {
                Box box = Leptonica1.boxaGetBox(boxes, i, L_CLONE);
                if (box == null) {
                    continue;
                }
                list.add(new Rectangle(box.x, box.y, box.w, box.h));

                // L_CLONE hands back a new reference-counted handle; release
                // it once the coordinates have been copied into Java.
                Leptonica1.boxDestroy(new com.sun.jna.ptr.PointerByReference(box.getPointer()));
            }
        } finally {
            // The Boxa is allocated on the native side and is not released
            // by dispose(), so free it explicitly.
            Leptonica1.boxaDestroy(new com.sun.jna.ptr.PointerByReference(boxes.getPointer()));
        }

        return list;
    } catch (IOException ioe) {
        // log the failure and report it to the caller
        logger.error(ioe.getMessage(), ioe);
        throw new TesseractException(ioe);
    } finally {
        dispose();
    }
}

/**
* Releases all of the native resources used by this instance.
*/
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/net/sourceforge/tess4j/Tesseract1Test.java
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,23 @@ public List<Word> getWords(File file, int pageIteratorLevel) {
}
}
}

/**
 * Exercises Tesseract1.getRegions on the eurotext.png sample at symbol
 * level, logs every returned box, and expects at least one region.
 */
@Test
public void testGetRegions() throws Exception {
    logger.info("getRegions at given TessPageIteratorLevel");
    BufferedImage bi = ImageIO.read(new File(testResourcesDataPath, "eurotext.png"));
    int level = TessPageIteratorLevel.RIL_SYMBOL;
    logger.info("PageIteratorLevel: " + Utils.getConstantName(level, TessPageIteratorLevel.class));

    List<Rectangle> regions = instance.getRegions(bi, level);
    int index = 0;
    for (Rectangle region : regions) {
        logger.info(String.format("Box[%d]: x=%d, y=%d, w=%d, h=%d",
                index, region.x, region.y, region.width, region.height));
        index++;
    }

    assertTrue(!regions.isEmpty());
}
}
19 changes: 19 additions & 0 deletions src/test/java/net/sourceforge/tess4j/TesseractTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -312,4 +312,23 @@ public List<Word> getTextElements(File file, int pageIteratorLevel) {
}
}
}

/**
 * Exercises Tesseract.getRegions on the eurotext.png sample at symbol
 * level, logs every returned box, and expects at least one region.
 */
@Test
public void testGetRegions() throws Exception {
    logger.info("getRegions at given TessPageIteratorLevel");
    BufferedImage bi = ImageIO.read(new File(testResourcesDataPath, "eurotext.png"));
    int level = TessPageIteratorLevel.RIL_SYMBOL;
    logger.info("PageIteratorLevel: " + Utils.getConstantName(level, TessPageIteratorLevel.class));

    List<Rectangle> regions = instance.getRegions(bi, level);
    int index = 0;
    for (Rectangle region : regions) {
        logger.info(String.format("Box[%d]: x=%d, y=%d, w=%d, h=%d",
                index, region.x, region.y, region.width, region.height));
        index++;
    }

    assertTrue(!regions.isEmpty());
}
}

5 comments on commit dbe6903

@Bahramudin
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi! Can we set the expected text before doing OCR? E.g., we tell Tesseract that the text can contain "7, 4, a, v, u, w, ..., etc." I think it will help accuracy, and it is good for the Chinese language, because in Chinese there are a lot of characters. We just want to recognize only one sentence that may contain some characters and numbers. Thanks

@nguyenq
Copy link
Owner Author

@nguyenq nguyenq commented on dbe6903 Dec 2, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use tessedit_char_whitelist variable.

@Bahramudin
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! It is what I was looking for

@4F2E4A2E
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Bahramudin please write a small snippet containing an example of how to apply this with tess4j, to help us document this.

@Bahramudin
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@4F2E4A2E Do you mean this: tesseract.setLanguage("chi_sim");
tesseract.setTessVariable("tessedit_char_whitelist", "012(3[45]6)789汉子");
String result = tesseract.doOCR(image);
System.out.println("result = " + result);

Please sign in to comment.