Add Language Detection Endpoint

- Implement a new endpoint for detecting the language of files. - Update tests and documentation related to the new endpoint. Closes #247
ifad · tagliala · Sep 27, 2024 · Sep 27, 2024 · a92d5e473dd94923da467b9fe721c55b697df3ea
commit a92d5e473dd94923da467b9fe721c55b697df3ea
diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml
diff --git a/README.md b/README.md
@@ -343,6 +343,30 @@ Response:
 
       ... PDF document body ...
 
+### Detect language
+
+This is a foreground document language detection request. The detected language
+will be returned as the response body.
+
+    POST /convert
+
+Params *(suggest using `multipart/form-data`)*:
+
+* `file`      - the file whose language should be detected
+* `action`    - `detect_language`
+
+#### Example:
+
+    POST /convert
+      file=... foo.docx ...
+      action=detect_language
+
+Response:
+
+      Content-Type: text/plain
+
+      en
+
 ## Callbacks
 
 When a document conversion is completed, an attempt will be made to POST a

diff --git a/config/app.yml b/config/app.yml
@@ -33,3 +33,4 @@ wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>
 # Other settings
 tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
 wkhtmltopdf_params: '-d 100 --encoding UTF-8'
+tesseract_available_languages: <%= ENV['TESSERACT_AVAILABLE_LANGUAGES'] %>
diff --git a/lib/app.rb b/lib/app.rb
@@ -192,6 +192,28 @@ class App < Sinatra::Base
       respond_with_error e
     end
 
+    #
+    # Detect document language
+    #
+    # POST params:
+    #  file     - the file whose language should be detected
+    post '/detect-language' do
+      upload = params[:file]
+      return respond 400, "missing file parameter" unless upload
+
+      unless upload.respond_to?(:fetch) && upload.fetch(:tempfile, nil).respond_to?(:read)
+        return respond 400, "invalid file parameter"
+      end
+
+      # Action name must match the task registered as 'detect_language' in heathen/task.rb
+      body = upload[:tempfile].read
+      content = Converter.new(logger: @logger).convert_file('detect_language', body)
+      content_type content.mime_type
+      content
+    rescue StandardError => e
+      respond_with_error e
+    end
+
     # Legacy method to convert files
     # Brought over from Heathen
     #

diff --git a/lib/config.rb b/lib/config.rb
@@ -45,6 +45,8 @@ class C_
     attr_accessor :tika_config_directory
     # @return [String] Params for wkhtmltopdf
     attr_accessor :wkhtmltopdf_params
+    # @return [Array<String>] Languages available to Tesseract for OCR. Defaults to `["eng"]`
+    attr_accessor :tesseract_available_languages
 
     def self.config_file_path
       Pathname.new File.expand_path('../config/app.yml', __dir__)
@@ -70,6 +72,7 @@ def self.config
 
         c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika'
         c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''
+        c.tesseract_available_languages = (yaml['tesseract_available_languages'] || 'eng').split(',')
 
         c
       end

diff --git a/lib/heathen.rb b/lib/heathen.rb
@@ -12,6 +12,7 @@
 require_relative 'heathen/processor'
 
 require_relative 'heathen/processor_methods/convert_image'
+require_relative 'heathen/processor_methods/detect_language'
 require_relative 'heathen/processor_methods/htmltotext'
 require_relative 'heathen/processor_methods/libreoffice'
 require_relative 'heathen/processor_methods/pdftotext'

diff --git a/lib/heathen/processor_methods/detect_language.rb b/lib/heathen/processor_methods/detect_language.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+module Heathen
+  class Processor
+    # Runs Tika in `--language` mode on the job's content file and stores the
+    # command's standard output as the new job content.
+    #
+    # @raise [ConversionFailed] when the command's exit status is not 0
+    def detect_language
+      tika = Colore::C_.tika_path
+      config_flag = "--config=#{Colore::TikaConfig.path_for_language_detection}"
+      executioner.execute(tika, config_flag, '--language', job.content_file, binary: true)
+      raise ConversionFailed.new unless executioner.last_exit_status == 0
+
+      job.content = executioner.stdout
+    end
+  end
+end
diff --git a/lib/heathen/task.rb b/lib/heathen/task.rb
@@ -103,3 +103,7 @@ def task_key(action, mime_type)
 Heathen::Task.register 'doc', '.*' do
   perform_task 'msoffice'
 end
+
+# Registers the `detect_language` action; its block calls `detect_language`.
+# The '.*' pattern presumably matches every MIME type — confirm against task_key.
+Heathen::Task.register('detect_language', '.*') { detect_language }
diff --git a/lib/tika_config.rb b/lib/tika_config.rb
@@ -20,7 +20,7 @@ module TikaConfig
           <parser class="org.apache.tika.parser.DefaultParser"></parser>
           <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
             <params>
-              <param name="language" type="string">%<language_alpha3>s</param>
+              <param name="language" type="string">%<alpha3_languages>s</param>
             </params>
           </parser>
         </parsers>
@@ -34,12 +34,12 @@ def tika_config_path
         Pathname.new File.expand_path(Colore::C_.tika_config_directory, __dir__)
       end
 
-      def path_for!(language_alpha3)
-        file = tika_config_path.join('ocr', VERSION, "tika.#{language_alpha3}.xml")
+      def path_for!(alpha3_languages) # Array of alpha-3 language codes
+        file = tika_config_path.join('ocr', VERSION, "tika.#{alpha3_languages.sort.join('-')}.xml")
         return file if file.file?
 
         FileUtils.mkdir_p(tika_config_path.join('ocr', VERSION))
-        file.write format(TEMPLATE, language_alpha3: language_alpha3)
+        file.write format(TEMPLATE, alpha3_languages: alpha3_languages.join('+')) # caller order, not sorted
         file
       end
     end
@@ -55,7 +55,15 @@ def path_for!(language_alpha3)
     def self.path_for(language)
       language_alpha3 = Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE
 
-      path_for!(language_alpha3)
+      path_for!([language_alpha3])
+    end
+
+    # Returns the path of the Tika configuration used for language detection,
+    # built from `Colore::C_.tesseract_available_languages` (written if missing).
+    #
+    # @return [Pathname] The path to the Tika configuration file for language detection
+    def self.path_for_language_detection
+      path_for!(Colore::C_.tesseract_available_languages)
     end
   end
 end
diff --git a/spec/heathen/processor_methods/detect_language_spec.rb b/spec/heathen/processor_methods/detect_language_spec.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Heathen::Processor do
+  # Default fixture is the English sample; the Arabic context overrides it.
+  let(:content) { fixture('heathen/quickfox.jpg').read }
+  let(:job) { Heathen::Job.new 'foo', content }
+  let(:processor) { described_class.new job: job, logger: spec_logger }
+
+  before do
+    setup_tika_config
+  end
+
+  after do
+    processor.clean_up
+    delete_tika_config
+  end
+
+  describe '#detect_language' do
+    let(:tesseract_available_languages) { %w[eng] }
+
+    before do
+      allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(tesseract_available_languages)
+
+      processor.detect_language
+    end
+
+    it 'detects document language' do
+      expect(job.content).to eq 'en'
+      expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
+    end
+
+    context 'with Arabic documents' do
+      let(:content) { fixture('heathen/quickfox.ar.jpg').read }
+
+      context 'when Arabic is not available in Tesseract' do
+        it 'does not detect Arabic' do
+          expect(job.content).not_to eq 'ar'
+        end
+      end
+
+      context 'when Arabic is available in Tesseract' do
+        let(:tesseract_available_languages) { %w[eng ara] }
+
+        it 'detects Arabic' do
+          expect(job.content).to eq 'ar'
+        end
+      end
+    end
+  end
+end
diff --git a/spec/integration/standard_tasks_spec.rb b/spec/integration/standard_tasks_spec.rb
@@ -21,6 +21,14 @@
     end
   end
 
+  describe 'detect_language' do
+    it 'runs' do
+      image = fixture('heathen/quickfox.jpg').read
+      detected = converter.convert('detect_language', image)
+      expect(detected.mime_type).to eq 'text/plain; charset=us-ascii'
+    end
+  end
+
   describe 'ocr_text' do
     it 'converts jpeg' do
       content = fixture('heathen/quickfox.jpg').read

diff --git a/spec/lib/tika_config_spec.rb b/spec/lib/tika_config_spec.rb
@@ -51,4 +51,22 @@
       end
     end
   end
+
+  describe '.path_for_language_detection' do
+    subject(:path_for_language_detection) { described_class.path_for_language_detection }
+
+    let(:ocr_config_dir) { tmp_tika_config_dir.join('ocr', described_class::VERSION) }
+
+    it 'returns the correct configuration file path' do
+      expect(path_for_language_detection).to eq ocr_config_dir.join('tika.eng.xml')
+    end
+
+    context 'when multiple languages are available' do
+      before { allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(%w[fra eng]) }
+
+      it 'returns the correct configuration file path' do
+        expect(path_for_language_detection).to eq ocr_config_dir.join('tika.eng-fra.xml')
+      end
+    end
+  end
 end