From 3b5afc4103e6e2fb46e2f41f96cad67931b2c4f1 Mon Sep 17 00:00:00 2001
From: Geremia Taglialatela
Date: Fri, 27 Sep 2024 11:32:44 +0200
Subject: [PATCH] Add Language Detection endpoint

Close #247

Test
---
 .rubocop_todo.yml                             |  3 +-
 README.md                                     | 24 ++++++++++++
 config/app.yml                                |  1 +
 lib/app.rb                                    | 22 +++++++++++
 lib/config.rb                                 |  3 ++
 .../processor_methods/detect_language.rb      | 18 +++++++++
 lib/heathen/task.rb                           |  4 ++
 lib/tika_config.rb                            | 22 ++++++++---
 .../processor_methods/detect_language_spec.rb | 37 +++++++++++++++++++
 spec/integration/standard_tasks_spec.rb       |  8 ++++
 spec/lib/tika_config_spec.rb                  | 29 +++++++++++++++
 11 files changed, 165 insertions(+), 6 deletions(-)
 create mode 100644 lib/heathen/processor_methods/detect_language.rb
 create mode 100644 spec/heathen/processor_methods/detect_language_spec.rb

diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml
index 41bc962..dce6afb 100644
--- a/.rubocop_todo.yml
+++ b/.rubocop_todo.yml
@@ -87,7 +87,7 @@ Metrics/BlockLength:
 
 # Configuration parameters: CountComments, CountAsOne.
 Metrics/ClassLength:
-  Max: 173
+  Max: 187
 
 # Configuration parameters: AllowedMethods, AllowedPatterns.
 Metrics/CyclomaticComplexity:
@@ -220,6 +220,7 @@ Style/Documentation:
     - 'lib/errors.rb'
     - 'lib/heathen/filename.rb'
     - 'lib/heathen/processor_methods/convert_image.rb'
+    - 'lib/heathen/processor_methods/detect_language.rb'
     - 'lib/heathen/processor_methods/htmltotext.rb'
     - 'lib/heathen/processor_methods/libreoffice.rb'
     - 'lib/heathen/processor_methods/pdftotext.rb'
diff --git a/README.md b/README.md
index d47507d..88358b7 100644
--- a/README.md
+++ b/README.md
@@ -343,6 +343,30 @@ Response:
 
     ... PDF document body ...
 
+### Detect language
+
+This is a foreground document language detection request. The detected language
+will be returned as the response body.
+
+    POST /convert
+
+Params *(suggest using `multipart/form-data`)*:
+
+* `file` - the file whose language is to be detected
+* `action` - `detect_language`
+
+#### Example:
+
+    POST /convert
+      file=... foo.docx ...
+      action=detect_language
+
+Response:
+
+    Content-Type: text/plain
+
+    en
+
 ## Callbacks
 
 When a document conversion is completed, an attempt will be made to POST a
diff --git a/config/app.yml b/config/app.yml
index 8ad03fc..66c86e5 100644
--- a/config/app.yml
+++ b/config/app.yml
@@ -33,3 +33,4 @@ wkhtmltopdf_path: <%= ENV['WKHTMLTOPDF_PATH'] %>
 # Other settings
 tika_config_directory: <%= ENV['TIKA_CONFIG_DIRECTORY'] %>
 wkhtmltopdf_params: '-d 100 --encoding UTF-8'
+tesseract_available_languages: <%= ENV['TESSERACT_AVAILABLE_LANGUAGES'] %>
diff --git a/lib/app.rb b/lib/app.rb
index 99bcd00..f754716 100644
--- a/lib/app.rb
+++ b/lib/app.rb
@@ -192,6 +192,28 @@ class App < Sinatra::Base
       respond_with_error e
     end
 
+    #
+    # Detect document language
+    #
+    # POST params:
+    #   file - the file whose language is to be detected
+    post '/detect-language' do
+      unless params[:file]
+        return respond 400, "missing file parameter"
+      end
+
+      unless params[:file].respond_to?(:fetch) && params[:file].fetch(:tempfile, nil).respond_to?(:read)
+        return respond 400, "invalid file parameter"
+      end
+
+      body = params[:file][:tempfile].read
+      content = Converter.new(logger: @logger).convert_file('detect_language', body)
+      content_type content.mime_type
+      content
+    rescue StandardError => e
+      respond_with_error e
+    end
+
     # Legacy method to convert files
     # Brought over from Heathen
     #
diff --git a/lib/config.rb b/lib/config.rb
index e94d90b..dce7a25 100644
--- a/lib/config.rb
+++ b/lib/config.rb
@@ -45,6 +45,8 @@ class C_
     attr_accessor :tika_config_directory
     # @return [String] Params for wkhtmltopdf
     attr_accessor :wkhtmltopdf_params
+    # @return [Array<String>] Languages available to Tesseract for OCR. Defaults to `["eng"]`
+    attr_accessor :tesseract_available_languages
 
     def self.config_file_path
       Pathname.new File.expand_path('../config/app.yml', __dir__)
@@ -70,6 +72,7 @@ def self.config
 
       c.tika_config_directory = yaml['tika_config_directory'] || '../tmp/tika'
       c.wkhtmltopdf_params = yaml['wkhtmltopdf_params'] || ''
+      c.tesseract_available_languages = (yaml['tesseract_available_languages'] || 'eng').to_s.split(',').map(&:strip)
 
       c
     end
diff --git a/lib/heathen/processor_methods/detect_language.rb b/lib/heathen/processor_methods/detect_language.rb
new file mode 100644
index 0000000..11c9be7
--- /dev/null
+++ b/lib/heathen/processor_methods/detect_language.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+module Heathen
+  class Processor
+    def detect_language
+      executioner.execute(
+        Colore::C_.tika_path,
+        "--config=#{Colore::C_.tika_config}",
+        '--language',
+        job.content_file,
+        binary: true
+      )
+      raise ConversionFailed if executioner.last_exit_status != 0
+
+      job.content = executioner.stdout
+    end
+  end
+end
diff --git a/lib/heathen/task.rb b/lib/heathen/task.rb
index 666b493..e421a5f 100644
--- a/lib/heathen/task.rb
+++ b/lib/heathen/task.rb
@@ -103,3 +103,7 @@ def task_key(action, mime_type)
 Heathen::Task.register 'doc', '.*' do
   perform_task 'msoffice'
 end
+
+Heathen::Task.register 'detect_language', '.*' do
+  detect_language
+end
diff --git a/lib/tika_config.rb b/lib/tika_config.rb
index 4232914..a77cb23 100644
--- a/lib/tika_config.rb
+++ b/lib/tika_config.rb
@@ -20,7 +20,7 @@ module TikaConfig
-              %<language_alpha3>s
+              %<alpha3_languages>s
@@ -34,14 +34,18 @@ def tika_config_path
         Pathname.new File.expand_path(Colore::C_.tika_config_directory, __dir__)
       end
 
-      def path_for!(language_alpha3)
-        file = tika_config_path.join('ocr', VERSION, "tika.#{language_alpha3}.xml")
+      def path_for!(alpha3_languages, filename:)
+        file = tika_config_path.join('ocr', VERSION, "tika.#{filename}.xml")
         return file if file.file?
 
         FileUtils.mkdir_p(tika_config_path.join('ocr', VERSION))
-        file.write format(TEMPLATE, language_alpha3: language_alpha3)
+        file.write format(TEMPLATE, alpha3_languages: alpha3_languages.join('+'))
         file
       end
+
+      def path_for_language!(language_alpha3)
+        path_for!([language_alpha3], filename: language_alpha3)
+      end
     end
 
     # Returns the file path of the Tika configuration for performing OCR
@@ -55,7 +59,15 @@ def path_for!(language_alpha3)
     def self.path_for(language)
       language_alpha3 = Colore::Utils.language_alpha3(language) || DEFAULT_LANGUAGE
 
-      path_for!(language_alpha3)
+      path_for_language!(language_alpha3)
+    end
+
+    # Returns the file path of the Tika configuration for performing language
+    # detection.
+    #
+    # @return [Pathname] The path to the Tika configuration file for language detection
+    def self.path_for_language_detection
+      path_for!(Colore::C_.tesseract_available_languages, filename: 'language_detection')
     end
   end
 end
diff --git a/spec/heathen/processor_methods/detect_language_spec.rb b/spec/heathen/processor_methods/detect_language_spec.rb
new file mode 100644
index 0000000..a4f3c56
--- /dev/null
+++ b/spec/heathen/processor_methods/detect_language_spec.rb
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Heathen::Processor do
+  let(:content) { fixture('heathen/quickfox.ar.jpg').read }
+  let(:job) { Heathen::Job.new 'foo', content }
+  let(:processor) { described_class.new job: job, logger: spec_logger }
+
+  after do
+    processor.clean_up
+  end
+
+  describe '#detect_language' do
+    before do
+      processor.detect_language
+    end
+
+    context 'with English documents' do
+      let(:content) { fixture('heathen/quickfox.jpg').read }
+
+      it 'detects English input file language' do
+        expect(job.content).to eq 'en'
+        expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
+      end
+    end
+
+    context 'with Arabic documents' do
+      let(:content) { fixture('heathen/quickfox.ar.jpg').read }
+
+      it 'detects Arabic input file language' do
+        expect(job.content).to eq 'ar'
+        expect(job.content.mime_type).to eq 'text/plain; charset=us-ascii'
+      end
+    end
+  end
+end
diff --git a/spec/integration/standard_tasks_spec.rb b/spec/integration/standard_tasks_spec.rb
index 14d8e01..b8e39e4 100644
--- a/spec/integration/standard_tasks_spec.rb
+++ b/spec/integration/standard_tasks_spec.rb
@@ -22,6 +22,14 @@
     end
   end
 
+  describe 'detect_language' do
+    it 'runs' do
+      content = fixture('heathen/quickfox.jpg').read
+      new_content = converter.convert 'detect_language', content
+      expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii'
+    end
+  end
+
   describe 'ocr_text' do
     it 'converts jpeg' do
       content = fixture('heathen/quickfox.jpg').read
diff --git a/spec/lib/tika_config_spec.rb b/spec/lib/tika_config_spec.rb
index 455c743..0808688 100644
--- a/spec/lib/tika_config_spec.rb
+++ b/spec/lib/tika_config_spec.rb
@@ -56,4 +56,33 @@
       end
     end
   end
+
+  describe '.path_for_language_detection' do
+    subject(:path_for_language_detection) { described_class.path_for_language_detection }
+
+    before do
+      allow(Colore::C_.config).to receive(:tesseract_available_languages).and_return(%w[eng fra])
+    end
+
+    it 'returns the correct configuration file path' do
+      expect(path_for_language_detection).to eq tika_test_config_path.join('ocr', described_class::VERSION, 'tika.language_detection.xml')
+    end
+
+    it 'includes all the available languages' do
+      expect(path_for_language_detection.read).to include('eng+fra')
+    end
+
+    context 'when the configuration file is already present' do
+      before do
+        allow(FileUtils).to receive(:mkdir_p)
+          .with(tika_test_config_path.join('ocr', described_class::VERSION))
+          .and_call_original
+      end
+
+      it 'does not overwrite it' do
+        2.times { described_class.path_for_language_detection }
+        expect(FileUtils).to have_received(:mkdir_p).once
+      end
+    end
+  end
 end