Skip to content

Commit

Permalink
21941 replaces the <i18n> interpolation tag in i18n-tasks for our cus…
Browse files Browse the repository at this point in the history
…tom <var> tag

This allows DeepL to "read" the interpolation contents, which improves translation accuracy and also fixes the problem of interpolations getting pushed to the front of sentences

Closes bookingexperts/monorepo#21941
  • Loading branch information
barak-neeman committed Jul 24, 2024
1 parent 383db5d commit a5c1843
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 6 deletions.
85 changes: 83 additions & 2 deletions lib/i18n/tasks/translators/deepl_translator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,21 +60,102 @@ def options_for_plain

# @param [String] value
# @return [String] 'hello, %{name}' => 'hello, <i18n>%{name}</i18n>'
def replace_interpolations(value)
def original_replace_interpolations(value)
value.gsub(INTERPOLATION_KEY_RE, '<i18n>\0</i18n>')
end

# @param [String] untranslated
# @param [String] translated
# @return [String] 'hello, <i18n>%{name}</i18n>' => 'hello, %{name}'
def restore_interpolations(untranslated, translated)
def original_restore_interpolations(untranslated, translated)
return translated if untranslated !~ INTERPOLATION_KEY_RE

translated.gsub(%r{</?i18n>}, '')
rescue StandardError => e
raise_interpolation_error(untranslated, translated, e)
end

# deepl does a better job with interpolations when it doesn't have to deal
# with <br> tags, so we replace all of them with meaningless asterisk chains
BR_REGEXP = %r{(<br\s*/?>\s*)+}i.freeze
BR_SINGLE_MARKER = ' *** '
BR_DOUBLE_MARKER = ' ***** '

# letting deepl 'read' the interpolations gives better translations (and
# solves the problem of interpolations getting pushed all the way to the
# front of the sentence), however, deepl will also try to translate
# the interpolations and that gets messy.
# we use nonsense three-letter acronyms so deepl will 'read' them and leave
# them alone (the letter X also works very well, except in sentences with
# several consecutive interpolations because that reads X X X and deepl
# doesn't handle that well)
# deepl also needs to know if an interpolation will be a word or a number,
# for romance languages it matters. a little Spanish lesson to illustrate:
# "%{foo} betalingen" translates either to "facturas %{foo}"
# (openstaande betalingen -> facturas pendientes) or to "%{foo} facturas"
# (5 betalingen -> 5 facturas)
# for interpolation keys that are usually numeric, we pick a number
# instead of the three-letter acronym (more consistency in how we name
# interpolation keys would help)
LETTER_SUBS = %w[RYX QFN VLB XOG DWP ZMQ JZQ WVS LRX HPM].freeze
NUM_SUBS = %w[17 19 23 29 31 37 41 43 47 53].freeze

def key_substitution(stache, index)
case stache.gsub(/[^a-z]/, '')
when 'count', 'minutes', 'hours'
NUM_SUBS[index % NUM_SUBS.size]
else
LETTER_SUBS[index % LETTER_SUBS.size]
end
end

# BEX version of replace_interpolation
def replace_interpolations(value)
key_index = 0
value.gsub(INTERPOLATION_KEY_RE) do |stache|
sub = key_substitution(stache, key_index)
key_index += 1
"<var stache=\"#{stache}\" sub=\"#{sub}\">#{sub}</var>"
end.gsub(BR_REGEXP) do |br|
if br.downcase.count('b') == 2
# never more than two <br> in a row, it gets messy
BR_DOUBLE_MARKER
else
BR_SINGLE_MARKER
end
end
end

# reversing our substitutions should be straight-forward, but it's not
# because deepl gets creative. cases are explained inline.
def restore_interpolations(untranslated, translated)
translated.gsub(%r{(.?)<var stache="([^"]*)" sub="([^"]*)">([^<]*)</var>}) do
char = $1

Check failure on line 133 in lib/i18n/tasks/translators/deepl_translator.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Style/PerlBackrefs: Prefer ::Regexp.last_match(1) over $1.
stache = $2

Check failure on line 134 in lib/i18n/tasks/translators/deepl_translator.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Style/PerlBackrefs: Prefer ::Regexp.last_match(2) over $2.
sub = $3

Check failure on line 135 in lib/i18n/tasks/translators/deepl_translator.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Style/PerlBackrefs: Prefer ::Regexp.last_match(3) over $3.
body = $4

Check failure on line 136 in lib/i18n/tasks/translators/deepl_translator.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Style/PerlBackrefs: Prefer ::Regexp.last_match(4) over $4.
if body == sub
# deepl kept the 'sub' text inside the <var> tag and nothing else, clean.
"#{char}#{stache}"
elsif body.index(sub)
# deepl took some letters from outside the <var> tag and placed them
# inside the <var> e.g. task <var>"RYX</var>"
before, after = body.split(sub,2)

Check failure on line 143 in lib/i18n/tasks/translators/deepl_translator.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Layout/SpaceAfterComma: Space missing after comma.
"#{before}#{stache}#{after}"
elsif "#{char}#{body}".downcase == sub.downcase
# deepl took the first letter from inside the <var> tag and placed it
# immediately before the <var> tag e.g. R<var>yx</var>
stache
else
# instead of trying to look normal the fallback prints something
# obviously wrong hoping to get some attention and a manual fix
"!!!!!#{sub.inspect} (#{char.inspect} #{body.inspect})!!!!!"
end
end.gsub(BR_DOUBLE_MARKER, '<br /><br />').gsub(BR_SINGLE_MARKER, '<br />')
rescue StandardError => e
raise_interpolation_error(untranslated, translated, e)
end

def no_results_error_message
I18n.t('i18n_tasks.deepl_translate.errors.no_results')
end
Expand Down
82 changes: 78 additions & 4 deletions spec/deepl_translate_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@

text_test = [
'key',
"Hello, %{user} O'Neill! How are you? {{ Check out this Liquid tag, it should not be translated }} \
{% That applies to this Liquid tag as well %}",
"¡Hola, %{user} O'Neill! ¿Qué tal estás? {{ Check out this Liquid tag, it should not be translated }} \
{% That applies to this Liquid tag as well %}"
"Hello, %{user} O'Neill! How are you? {{ Check out this Liquid tag, it should not be translated }} {% That applies to this Liquid tag as well %}",

Check failure on line 12 in spec/deepl_translate_spec.rb

View workflow job for this annotation

GitHub Actions / lint

Layout/LineLength: Line is too long. [150/120]
"¡Hola, %{user} O'Neill! ¿Qué tal? {{ Check out this Liquid tag, it should not be translated }} {% That applies to this Liquid tag as well %}"

Check failure on line 13 in spec/deepl_translate_spec.rb

View workflow job for this annotation

GitHub Actions / lint

Layout/LineLength: Line is too long. [146/120]
]

html_test_plrl = [
Expand Down Expand Up @@ -92,4 +90,80 @@
end
end
end

# Don't expect deepl's answers to be exactly the same each run
describe 'in het Nederlands' do
let(:base_task) { I18n::Tasks::BaseTask.new }

before do
# this is a good place to go ENV['DEEPL_AUTH_KEY'] = '...' but do not commit it
skip 'temporarily disabled on JRuby due to https://github.com/jruby/jruby/issues/4802' if RUBY_ENGINE == 'jruby'
skip 'DEEPL_AUTH_KEY env var not set' unless ENV['DEEPL_AUTH_KEY']
end

it 'tells time' do
german, english, spanish =
translate_dutch(hours_and_minutes: "%{hours} uur en %{minutes} minuten")

Check failure on line 106 in spec/deepl_translate_spec.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Style/StringLiterals: Prefer single-quoted strings when you don't need string interpolation or special symbols.
expect(german).to eq '%{hours} Stunden und %{minutes} Minuten'
expect(english).to eq '%{hours} hours and %{minutes} minutes'
expect(spanish).to eq '%{hours} horas y %{minutes} minutos'
end

it 'counts' do
german, english, spanish =
translate_dutch(other: "%{count} taken")

Check failure on line 114 in spec/deepl_translate_spec.rb

View workflow job for this annotation

GitHub Actions / lint

[Correctable] Style/StringLiterals: Prefer single-quoted strings when you don't need string interpolation or special symbols.
expect(german).to eq '%{count} Aufgaben'
expect(english).to eq '%{count} tasks'
expect(spanish).to eq '%{count} tareas'
end

it 'assigns' do
german, english, spanish =
translate_dutch(assigned: 'Taak "%{todo}" toegewezen aan %{user}')
expect(german).to eq 'To-dos "%{todo}" zugewiesen an %{user}'
expect(english).to eq 'Task "%{todo}" assigned to %{user}'
expect(spanish).to eq 'Tarea "%{todo}" asignada a %{user}'
end

it 'sings' do
german, english, spanish =
translate_dutch(verse: "Ik zou zo graag een %{animal} kopen. Ik Zag %{count} beren %{food} smeren")
expect(german).to eq 'Ich würde so gerne einen %{animal} kaufen. Ich sah %{count} Bären, die %{food} schmierten'
# greasing is a funny way to say smeren, but we let it slide
expect(english).to eq 'I would so love to buy a %{animal}. I saw %{count} bears greasing %{food}'
expect(spanish).to eq 'Me encantaría comprar un %{animal}. Vi %{count} osos engrasando %{food}'
end

it 'sends emails' do
german, english, spanish =
translate_dutch(email_body_html: "{{ booking.greeting }},<br><br>Bijgevoegd ziet u een factuur van {{ park.name }} met factuurnummer {{ invoice.invoice_nr }}.<br />Volgens onze administratie had het verschuldigde bedrag van {{ locals.payment_collector_total }} op {{ locals.payment_collector_deadline }} moeten zijn betaald. Helaas hebben we nog geen betaling ontvangen.<br>")
puts german
puts english
puts spanish
expect(german).to eq '{{ booking.greeting }},<br /><br />Anbei finden Sie eine Rechnung von {{ park.name }} mit der Rechnungsnummer {{ invoice.invoice_nr }}.<br />Laut unserer Verwaltung hätte der von {{ locals.payment_collector_total }} geschuldete Betrag am {{ locals.payment_collector_deadline }} bezahlt werden müssen. Leider haben wir die Zahlungen noch nicht erhalten.<br />'
expect(english).to eq '{{ booking.greeting }},<br /><br />Attached please find an invoice from {{ park.name }} with invoice number {{ invoice.invoice_nr }}.<br />According to our records, the amount due from {{ locals.payment_collector_total }} on {{ locals.payment_collector_deadline }} should have been paid. Unfortunately, we have not yet received payment.<br />'
expect(spanish).to eq '{{ booking.greeting }},<br /><br />Adjuntamos una factura de {{ park.name }} con el número de factura {{ invoice.invoice_nr }}.<br />Según nuestros registros, el importe adeudado por {{ locals.payment_collector_total }} debería haber sido abonado en {{ locals.payment_collector_deadline }}. Lamentablemente, aún no hemos recibido el pago.<br />'
end

it 'asks itself why are you even translating this' do
german, english, spanish =
translate_dutch(action: '%{subject} %{verb} %{object}')
expect(german).to eq '%{subject} %{verb} %{object}'
expect(english).to eq '%{subject} %{verb} %{object}'
expect(spanish).to eq '%{subject} %{verb} %{object}'
end

def translate_dutch(dutch_pair)
key = dutch_pair.keys.first
phrase = dutch_pair[key]
locales = %w[de en-us es]
branches = locales.inject({}) do |hash, locale|
hash[locale] = { 'testing' => { key => phrase } }
hash
end
tree = build_tree(branches)
translations = base_task.translate_forest(tree, from: 'nl', backend: :deepl)
locales.map { |locale| translations[locale]['testing'][key].value.strip }
end
end
end

0 comments on commit a5c1843

Please sign in to comment.