Skip to content

Commit

Permalink
vesseract: added image_to_boxes
Browse files Browse the repository at this point in the history
  • Loading branch information
SheatNoisette committed Nov 14, 2021
1 parent 51f5297 commit b3df1a3
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ all: format test

test:
@echo "Running tests..."
$(v_compiler) test .
$(v_compiler) -g test .

fmt:
$(v_compiler) fmt -w .
Expand Down
8 changes: 7 additions & 1 deletion alternatives.v
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,10 @@ pub fn image_to_alto_xml_path(image_path string) ?string {
[inline]
pub fn image_to_string_path(filepath string) ?string {
return image_to_string(image: filepath, lang: 'eng', args: '')
}
}

// image_to_boxes_path is a shorthand for image_to_boxes that takes only the
// path of the image. The language is fixed to 'eng' and no extra arguments
// are passed.
[inline]
pub fn image_to_boxes_path(filepath string) ?[]Tesseract_box {
	boxes := image_to_boxes(image: filepath, lang: 'eng', args: '') ?
	return boxes
}
53 changes: 49 additions & 4 deletions vesseract.v
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ import os
pub struct Tesseract_box {
pub:
letter string
x1 int
y1 int
x2 int
y2 int
x1 u32
y1 u32
x2 u32
y2 u32
page u32
}

// Used as a parameter
Expand Down Expand Up @@ -137,3 +138,47 @@ pub fn image_to_alto_xml(t Tesseract) ?string {
// Get XML
return xml
}

// image_to_boxes gets the bounding boxes detected by Tesseract for an image
// and returns them as an array of Tesseract_box.
//
// ' batch.nochop makebox' is appended to `t.args` before calling
// extract_text_tesseract; the resulting `<id>.box` file is then read,
// removed from disk and parsed line by line.
//
// An error is returned when extract_text_tesseract fails, or when the box
// file cannot be read or removed. Lines that do not consist of exactly six
// space-separated fields are skipped.
pub fn image_to_boxes(t Tesseract) ?[]Tesseract_box {
	// Run tesseract with bounding box detection
	result := extract_text_tesseract(
		image: t.image
		lang: t.lang
		args: t.args + ' batch.nochop makebox'
	) ?

	// Load the box file, then delete it (only the ".box" file is removed here)
	box_path := result.id + '.box'
	box_file := os.read_file(box_path) ?
	os.rm(box_path) ?

	// Hold results
	mut boxes := []Tesseract_box{}

	// split_into_lines accepts both LF and CRLF endings, so the last field
	// of a line never carries a stray '\r'
	for line in box_file.split_into_lines() {
		// Letter, x1, y1, x2, y2, page
		// Example: H 68 206 91 235 0
		content := line.split(' ')

		// Skip malformed lines
		if content.len != 6 {
			continue
		}

		boxes << Tesseract_box{
			letter: content[0]
			x1: content[1].u32()
			y1: content[2].u32()
			x2: content[3].u32()
			y2: content[4].u32()
			page: content[5].u32()
		}
	}

	return boxes
}
10 changes: 10 additions & 0 deletions vesseract_test.v
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,13 @@ fn test_image_to_alto_xml_path() {
xml := image_to_alto_xml_path('sample/demo.png') or { panic(err) }
assert xml.contains('http://www.loc.gov/standards/alto/ns-v3#')
}

// test_image_to_boxes runs image_to_boxes on the sample image and checks the
// number of boxes returned and the coordinates of the first one.
fn test_image_to_boxes() {
	boxes := image_to_boxes(image: 'sample/demo.png', lang: 'eng', args: '') or { panic(err) }

	// Check the length first so that an empty result fails on this assertion
	// instead of an out-of-bounds index on boxes[0]
	assert boxes.len == 16

	assert boxes[0].x1 == 68
	assert boxes[0].y1 == 206
	assert boxes[0].x2 == 91
	assert boxes[0].y2 == 235
}
9 changes: 6 additions & 3 deletions wrapper.v
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ pub:
output_filename string
// Tesseract output stdout
stdout_result string
// ID generated
id string
}

// Generate an id for a document to be processed
Expand Down Expand Up @@ -55,9 +57,9 @@ fn extract_text_tesseract(t Tesseract) ?Tesseract_output {
args << t.image

// Output tmp - Random ID
id := generate_id()
doc_id := generate_id()
// Output file (tesseract appends .txt)
args << id
args << doc_id

if t.lang.len > 0 {
args << '-l ' + t.lang
Expand All @@ -74,7 +76,8 @@ fn extract_text_tesseract(t Tesseract) ?Tesseract_output {
return Tesseract_output{
image_path: t.image
arguments: args
output_filename: id + '.txt'
output_filename: doc_id + '.txt'
stdout_result: stdout
id: doc_id
}
}

0 comments on commit b3df1a3

Please sign in to comment.