From b1354697cc0e3f4a7ef8bd7ec502c2400a01b46a Mon Sep 17 00:00:00 2001 From: Barak Neeman Date: Wed, 24 Jul 2024 14:06:08 +0200 Subject: [PATCH] 21941 replaces the interpolation tag in i18n-tasks for our custom tag This allows DeepL to "read" the interpolation contents, which improves translation accuracy and also fixes the problem of interpolations getting pushed to the front of sentences Closes bookingexperts/support#21941 --- .../tasks/translators/deepl_translator.rb | 87 ++++++++++++++++- spec/deepl_translate_spec.rb | 97 ++++++++++++++++++- 2 files changed, 177 insertions(+), 7 deletions(-) diff --git a/lib/i18n/tasks/translators/deepl_translator.rb b/lib/i18n/tasks/translators/deepl_translator.rb index 5399b9ef..550b2e0a 100644 --- a/lib/i18n/tasks/translators/deepl_translator.rb +++ b/lib/i18n/tasks/translators/deepl_translator.rb @@ -3,7 +3,7 @@ require 'i18n/tasks/translators/base_translator' module I18n::Tasks::Translators - class DeeplTranslator < BaseTranslator + class DeeplTranslator < BaseTranslator # rubocop:disable Metrics/ClassLength # max allowed texts per request BATCH_SIZE = 50 # those languages must be specified with their sub-kind e.g en-us @@ -60,14 +60,14 @@ def options_for_plain # @param [String] value # @return [String] 'hello, %{name}' => 'hello, %{name}' - def replace_interpolations(value) + def original_replace_interpolations(value) value.gsub(INTERPOLATION_KEY_RE, '\0') end # @param [String] untranslated # @param [String] translated # @return [String] 'hello, %{name}' => 'hello, %{name}' - def restore_interpolations(untranslated, translated) + def original_restore_interpolations(untranslated, translated) return translated if untranslated !~ INTERPOLATION_KEY_RE translated.gsub(%r{}, '') @@ -75,6 +75,87 @@ def restore_interpolations(untranslated, translated) raise_interpolation_error(untranslated, translated, e) end + # deepl does a better job with interpolations when it doesn't have to deal + # with
tags, so we replace all of them with meaningless asterisk chains + BR_REGEXP = %r{(\s*)+}i.freeze + BR_SINGLE_MARKER = ' *** ' + BR_DOUBLE_MARKER = ' ***** ' + + # letting deepl 'read' the interpolations gives better translations (and + # solves the problem of interpolations getting pushed all the way to the + # front of the sentence), however, deepl will also try to translate + # the interpolations and that gets messy. + # we use nonsense three-letter acronyms so deepl will 'read' them and leave + # them alone (the letter X also works very well, except in sentences with + # several consecutive interpolations because that reads X X X and deepl + # doesn't handle that well) + # deepl also needs to know if an interpolation will be a word or a number, + # for romance languages it matters. a little Spanish lesson to illustrate: + # "%{foo} betalingen" translates either to "facturas %{foo}" + # (openstaande betalingen -> facturas pendientes) or to "%{foo} facturas" + # (5 betalingen -> 5 facturas) + # for interpolation keys that are usually numeric, we pick a number + # instead of the three-letter acronym (more consistency in how we name + # interpolation keys would help) + LETTER_SUBS = %w[RYX QFN VLB XOG DWP ZMQ JZQ WVS LRX HPM].freeze + NUM_SUBS = %w[17 19 23 29 31 37 41 43 47 53].freeze + + def sub_for_handle(handle, index) + case handle.gsub(/[^a-z]/, '') + when 'count', 'minutes', 'hours' + NUM_SUBS[index % NUM_SUBS.size] + else + LETTER_SUBS[index % LETTER_SUBS.size] + end + end + + # BEX version of replace_interpolation + def replace_interpolations(value) + index = 0 + value.gsub(INTERPOLATION_KEY_RE) do |handle| + sub = sub_for_handle(handle, index) + index += 1 + "#{sub}" + end.gsub(BR_REGEXP) do |br| + if br.downcase.count('b') == 2 + # never more than two
in a row, it gets messy + BR_DOUBLE_MARKER + else + BR_SINGLE_MARKER + end + end + end + + # reversing our substitutions should be straight-forward, but it's not + # because deepl gets creative. cases are explained inline. + def restore_interpolations(untranslated, translated) + translated.gsub(%r{(.?)([^<]*)}) do + char = ::Regexp.last_match(1) + handle = ::Regexp.last_match(2) + sub = ::Regexp.last_match(3) + body = ::Regexp.last_match(4) + if body == sub + # deepl kept the 'sub' text inside the tag and nothing else, clean. + "#{char}#{handle}" + elsif body.index(sub) + # deepl took some letters from outside the tag and placed them + # inside the e.g. task "RYX" + before, after = body.split(sub, 2) + "#{before}#{handle}#{after}" + elsif "#{char}#{body}".downcase == sub.downcase + # deepl took the first letter from inside the tag and placed it + # immediately before the tag e.g. Ryx + handle + else + # instead of trying to look normal the fallback prints something + # obviously wrong hoping to get some attention and a manual fix + "!!!!!#{sub.inspect} (#{char.inspect} #{body.inspect})!!!!!" + end + end.gsub(BR_DOUBLE_MARKER, '

').gsub(BR_SINGLE_MARKER, '
') + rescue StandardError => e + raise_interpolation_error(untranslated, translated, e) + end + def no_results_error_message I18n.t('i18n_tasks.deepl_translate.errors.no_results') end diff --git a/spec/deepl_translate_spec.rb b/spec/deepl_translate_spec.rb index 79494a68..aeb1f5b1 100644 --- a/spec/deepl_translate_spec.rb +++ b/spec/deepl_translate_spec.rb @@ -9,10 +9,10 @@ text_test = [ 'key', - "Hello, %{user} O'Neill! How are you? {{ Check out this Liquid tag, it should not be translated }} \ - {% That applies to this Liquid tag as well %}", - "¡Hola, %{user} O'Neill! ¿Qué tal estás? {{ Check out this Liquid tag, it should not be translated }} \ - {% That applies to this Liquid tag as well %}" + "Hello, %{user} O'Neill! How are you? {{ Check out this Liquid tag, it should not be translated }} " \ + '{% That applies to this Liquid tag as well %}', + "¡Hola, %{user} O'Neill! ¿Qué tal? {{ Check out this Liquid tag, it should not be translated }} " \ + '{% That applies to this Liquid tag as well %}' ] html_test_plrl = [ @@ -92,4 +92,93 @@ end end end + + # Don't expect deepl's answers to be exactly the same each run + describe 'translating Dutch into other languages' do + let(:base_task) { I18n::Tasks::BaseTask.new } + + before do + skip 'temporarily disabled on JRuby due to https://github.com/jruby/jruby/issues/4802' if RUBY_ENGINE == 'jruby' + skip 'DEEPL_AUTH_KEY env var not set' unless ENV['DEEPL_AUTH_KEY'] + end + + it 'tells time' do + german, english, spanish = + translate_dutch(hours_and_minutes: '%{hours} uur en %{minutes} minuten') + expect(german).to eq '%{hours} Stunden und %{minutes} Minuten' + expect(english).to eq '%{hours} hours and %{minutes} minutes' + expect(spanish).to eq '%{hours} horas y %{minutes} minutos' + end + + it 'counts' do + german, english, spanish = + translate_dutch(other: '%{count} taken') + expect(german).to eq '%{count} Aufgaben' + expect(english).to eq '%{count} tasks' + expect(spanish).to eq '%{count} tareas' + end + + it 'assigns' do + german, english, spanish = + translate_dutch(assigned: 'Taak "%{todo}" toegewezen aan %{user}') + expect(german).to eq 'To-dos "%{todo}" zugewiesen an %{user}' + expect(english).to eq 'Task "%{todo}" assigned to %{user}' + expect(spanish).to eq 'Tarea "%{todo}" asignada a %{user}' + end + + it 'sings' do + german, english, spanish = + translate_dutch(verse: 'Ik zou zo graag een %{animal} kopen. Ik zag %{count} beren %{food} smeren') + expect(german).to eq 'Ich würde so gerne einen %{animal} kaufen. Ich sah %{count} Bären, die %{food} schmierten' + # greasing is a funny way to say smeren, but we let it slide + expect(english).to eq 'I would so love to buy a %{animal}. I saw %{count} bears greasing %{food}' + expect(spanish).to eq 'Me encantaría comprar un %{animal}. Vi %{count} osos engrasando %{food}' + end + + it 'sends emails' do + german, english, spanish = + translate_dutch( + email_body_html: '{{ booking.greeting }},

Bijgevoegd ziet u een factuur van {{ park.name }} met ' \ + 'factuurnummer {{ invoice.invoice_nr }}.
Volgens onze administratie had het ' \ + 'verschuldigde bedrag van {{ locals.payment_collector_total }} op ' \ + '{{ locals.payment_collector_deadline }} moeten zijn betaald. Helaas hebben we nog ' \ + 'geen betaling ontvangen.
' + ) + expect(german).to eq '{{ booking.greeting }},

Anbei finden Sie eine Rechnung von {{ park.name }} ' \ + 'mit der Rechnungsnummer {{ invoice.invoice_nr }}.
Laut unserer Verwaltung hätte der ' \ + 'von {{ locals.payment_collector_total }} geschuldete Betrag ' \ + 'am {{ locals.payment_collector_deadline }} bezahlt werden müssen. Leider haben wir die ' \ + 'Zahlungen noch nicht erhalten.
' + expect(english).to eq '{{ booking.greeting }},

Attached please find an invoice from {{ park.name }} ' \ + 'with invoice number {{ invoice.invoice_nr }}.
According to our records, the amount ' \ + 'due from {{ locals.payment_collector_total }} on ' \ + '{{ locals.payment_collector_deadline }} should have been paid. Unfortunately, we have ' \ + 'not yet received payment.
' + expect(spanish).to eq '{{ booking.greeting }},

Adjuntamos una factura de {{ park.name }} con el ' \ + 'número de factura {{ invoice.invoice_nr }}.
Según nuestros registros, el importe ' \ + 'adeudado por {{ locals.payment_collector_total }} debería haber sido abonado en ' \ + '{{ locals.payment_collector_deadline }}. Lamentablemente, aún no hemos recibido ' \ + 'el pago.
' + end + + it 'asks itself why are you even translating this' do + german, english, spanish = + translate_dutch(action: '%{subject} %{verb} %{object}') + expect(german).to eq '%{subject} %{verb} %{object}' + expect(english).to eq '%{subject} %{verb} %{object}' + expect(spanish).to eq '%{subject} %{verb} %{object}' + end + + def translate_dutch(dutch_pair) + key = dutch_pair.keys.first + phrase = dutch_pair[key] + locales = %w[de en-us es] + branches = locales.each_with_object({}) do |locale, hash| + hash[locale] = { 'testing' => { key => phrase } } + end + tree = build_tree(branches) + translations = base_task.translate_forest(tree, from: 'nl', backend: :deepl) + locales.map { |locale| translations[locale]['testing'][key].value.strip } + end + end end