Skip to content

Commit

Permalink
Add Language Detection endpoint
Browse files Browse the repository at this point in the history
Close #247
  • Loading branch information
tagliala committed Sep 12, 2024
1 parent 9416507 commit c3591f9
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 0 deletions.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,28 @@ Response:

... PDF document body ...

### Detect language

This is a foreground document language detection request. The detected language
will be returned as the response body.

POST /detect-language

Params *(suggest using `multipart/form-data`)*:

* `file` - the file whose language should be detected

#### Example:

POST /detect-language
file=... foo.docx ...

Response:

Content-Type: text/plain

en

## Callbacks

When a document conversion is completed, an attempt will be made to POST a
Expand Down
24 changes: 24 additions & 0 deletions lib/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,30 @@ class App < Sinatra::Base
end
end

#
# Detect document language
#
# Reads the uploaded file and passes its bytes to the converter's
# 'detect_language' action. The converter result is returned as the response
# body, with the Content-Type taken from the result's mime_type.
#
# POST params:
# file - the uploaded file whose language should be detected
#
# Responds with 400 when the file parameter is missing or is not a readable
# upload; any StandardError raised while processing goes to respond_with_error.
post '/detect-language' do
  begin
    upload = params[:file]
    return respond(400, "missing file parameter") unless upload

    # An acceptable upload is hash-like and carries a readable :tempfile.
    readable = upload.respond_to?(:fetch) && upload.fetch(:tempfile, nil).respond_to?(:read)
    return respond(400, "invalid file parameter") unless readable

    file_bytes = upload[:tempfile].read
    detected = Converter.new(logger: @logger).convert_file('detect_language', file_bytes)
    content_type detected.mime_type
    detected
  rescue StandardError => e
    respond_with_error e
  end
end

# Legacy method to convert files
# Brought over from Heathen
#
Expand Down
18 changes: 18 additions & 0 deletions lib/heathen/processor_methods/detect_language.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

module Heathen
  class Processor
    # Detects the language of the job's PDF content.
    #
    # Runs the executable configured at Colore::C_.tika_path with the
    # `--language` flag against the job's content file, then replaces
    # job.content with whatever the command wrote to standard output.
    #
    # The job must first satisfy the 'application/pdf' mime type expectation.
    def detect_language
      expect_mime_type 'application/pdf'

      tika_binary = Colore::C_.tika_path
      input_path = job.content_file
      executioner.execute(tika_binary, '--language', input_path, binary: true)

      job.content = executioner.stdout
    end
  end
end
20 changes: 20 additions & 0 deletions spec/heathen/processor_methods/detect_language_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# frozen_string_literal: true

require 'spec_helper'

# Exercises Heathen::Processor#detect_language against a PDF fixture.
describe Heathen::Processor do
  let(:content) { File.read(fixture('heathen/quickfox.pdf')) }
  let(:job) { Heathen::Job.new('foo', content, 'en') }
  let(:processor) { described_class.new(job: job, logger: Logger.new($stderr)) }

  # Let the processor clean up after every example.
  after { processor.clean_up }

  describe '#detect_language' do
    it 'detects input file language' do
      processor.detect_language

      expect(job.content).to eq('en')
    end
  end
end

0 comments on commit c3591f9

Please sign in to comment.