From 6df212c65d9947bada72921bf22b10b9d316cee3 Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 16:53:38 +0100 Subject: [PATCH 1/9] [pt] Adapt resources to multiwords, dictionary fixes --- .../resource/pt/dialect_alternations.txt | 2 + .../resource/pt/disambiguation.xml | 17 ++++ .../resource/pt/entities/misc.ent | 2 + .../languagetool/resource/pt/multiwords.txt | 97 ++++++++++--------- .../org/languagetool/rules/pt/grammar.xml | 8 +- 5 files changed, 75 insertions(+), 51 deletions(-) diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/dialect_alternations.txt b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/dialect_alternations.txt index 359f2266a3c2..5dd8bf804981 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/dialect_alternations.txt +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/dialect_alternations.txt @@ -7555,3 +7555,5 @@ groenlandesas=gronelandesas cátion=catíones cátions=catíones íons=íones +fato=facto +fatos=factos diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml index 34e7bdab8843..cb2857c31176 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/disambiguation.xml @@ -3167,6 +3167,23 @@ + + + + \p{Lu}.+ + &english_compounders; + + + + + + + + &english_compounders; + + + + diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/misc.ent b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/misc.ent index 19bd49759a70..d06a54d564af 100644 --- 
a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/misc.ent +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/entities/misc.ent @@ -207,3 +207,5 @@ limpo"> + + diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt index fa921ecab167..26da94673771 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt @@ -159,8 +159,8 @@ Guerra de Comando e Controlo NPFS000 Guerra Fria NPFS000 Guerra Santa NPFS000 Guerras Napoleónicas NPFP000 -home office NPMS000_ -Home Office NPMS000_ +home office NCMS000_ +Home Office NCMS000_ Hora H NPCS000 Idade Média NPFS000 Ilhas Mapia NPFPG00 @@ -1386,7 +1386,7 @@ La Tour NPCNG00_ La Toussuire NPCNG00_ La Valletta NPCNG00_ La Vanguardia NPFSO00_ -lan house NPFS000_ +lan house NCFS000_ lan houses NCFP000_ Land Rover NPCNO00_ LanguageTool Premium NPMSO00_ @@ -1437,8 +1437,8 @@ Mac OS X Server NPMS000 Mae Sai NPCNG00_ Manchester United NPMSG00_ Manuel Valls NPMSSP0_ -mare nostrum NPMSG00_ -Mare Nostrum NPMSG00_ +mare nostrum NCMSG00_ +Mare Nostrum NCMSG00_ María José NPFSSP0_ Mariah Carey NPFSSP0_ Mariano Rajoy NPMSSP0_ @@ -4189,7 +4189,6 @@ Adolf Otto Reinhold Windaus NPMS000_ Adolf von Baeyer NPMS000_ Adolfo Pérez Esquivel NPMS000_ Agatha Christie NPFS000_ -Agência Internacional de Energia Atómica NPMS000_ Ahmed Zewail NPMS000_ Akira Suzuki NPMS000_ Alan Heeger NPMS000_ @@ -9489,64 +9488,37 @@ casus belli NCMS000 Coitus interruptus NCMN000 coitus interruptus NCMN000 coup de force NCMS000 -Coup de force NCMS000 crème de la crème NCFS000 -Crème de la crème NCFS000 -Curriculum vitae NCMS000 curriculum vitae NCMS000 déjà vu NCMS000 -Déjà vu NCMS000 -Delirium tremens 
NCMN000 delirium tremens NCMN000 deus ex machina NCMN000 -Deus ex machina NCMN000 -Dolce vita NCFS000 dolce vita NCFS000 -Drag queen NCFS000 drag queen NCFS000 Drag queens NCFP000 drag queens NCFP000 fast food NCCS000 fast foods NCCP000 foc amic NCMS000 -Foc amic NCMS000 -Foie gras NCMS000 foie gras NCMS000 -Fumata blanca NCFS000 fumata blanca NCFS000 fumata negra NCFS000 -Fumata negra NCFS000 -Habeas corpus NCMS000 habeas corpus NCMS000 -Heavy metal NCMS000 heavy metal NCMS000 Internet Archive NPMS000_ Internet Explorer NPMS000_ jet lag NCMS000 -Jet lag NCMS000 -Laissez faire NCMS000 laissez faire NCMS000 -Ma non troppo RG_ ma non troppo RG_ making of NCMS000 -Making of NCMS000 -Mass media NCMN000 mass media NCMN000 mea culpa NCMN000 -Mea culpa NCMN000 melting pot NCMS000 -Melting pot NCMS000 -Modus operandi NCMN000 modus operandi NCMN000 -Modus vivendi NCMN000 modus vivendi NCMN000 nouvelle cuisine NCFS000 -Nouvelle cuisine NCFS000 -Pax romana NCFS000 pax romana NCFS000 -Persona non grata NCFS000 persona non grata NCFS000 -Prime time NCMS000 prime time NCMS000 rara avis NCFS000 Rara avis NCFS000 @@ -9651,8 +9623,8 @@ América Central NPFSG00_ América do Norte NPFSG00_ América do Sul NPFSG00_ América Latina NPFSG00_ -ampola capilar NPFS000 -ampolas capilares NPFP000 +ampola capilar NCFS000 +ampolas capilares NCFP000 Angela Merkel NPFS000_ Angelina Jolie NPFS000_ Anna Karenina NPFS000_ @@ -9673,11 +9645,11 @@ Atlântico Sul NPMS000_ Augusto Guerra NPMSO00_ Azeite de dendê NCMS000 azeite de dendê NCMS000 -bacalhau à Brás NPMS000 -Bacalhau à Brás NPMS000 +bacalhau à Brás NCMS000 +Bacalhau à Brás NCMS000 Bacalhau à Gomes Sá NPMS000 -bacalhau à Gomes Sá NPMS000 -bacalhau à Zé NPMS000 +bacalhau à Gomes Sá NCMS000 +bacalhau à Zé NCMS000 Bacalhau à Zé NPMS000 Backstreet Boys NPMP000_ batata rosti NCFS000_ @@ -9699,7 +9671,7 @@ Braga Jazz NPMSO00_ Braga Park NPMSG00_ Braga Parque NPMSG00_ Buenos Aires NPFSG00 -business as usual NPMS000_ +business as usual NCMS000_ cabeça de 
cartaz NCCS000_ Cabeça de cartaz NCCS000_ cabeça de casal NCCS000_ @@ -9786,8 +9758,8 @@ corte brunoise VMN0000_ Corte julienne VMN0000_ corte julienne VMN0000_ Corto Maltese NPMS000_ -cosa nostra NPFS000_ -Cosa nostra NPFS000_ +cosa nostra NCFS000_ +Cosa nostra NCFS000_ Costa do Marfim NPFSG00_ Costa Rica NPFSG00_ Cream cheese NCMS000_ @@ -9880,7 +9852,7 @@ hora H NCFS000_ I Guerra Mundial NPFS000_ II Guerra Mundial NPFS000_ III Guerra Mundial NPFS000_ -ilhas Cook NPFP000_ +Ilhas Cook NPFP000_ Impressoras laser NCFP000 impressoras laser NCFP000 indústria automóvel NCFS000 @@ -9911,8 +9883,8 @@ linfócitos T NCMP000_ lingua franca NCFS000_ linha SOS NCFS000_ linhas SOS NCFP000_ -Live CD NPMS000_ -live CD NPMS000_ +Live CD NCMS000_ +live CD NCMS000_ localização GPS NCFS000 localizações GPS NCFS000 Loch Ness NPMS000_ @@ -9938,7 +9910,7 @@ Manuel Maria NPMSO00_ Máquinas Multibanco NCFP000 máquinas Multibanco NCFP000 máquinas multibanco NCFP000 -mar de Bering NPMS000_ +Mar de Bering NPMS000_ Mar Mediterrâneo NPMS000_ Margaret Atwood NPFS000_ Maria João NPFSO00_ @@ -11452,8 +11424,16 @@ iPad Mini NCMS000_ iPad Pro NCMS000_ MacBook Pro NCMS000_ MacBook Air NCMS000_ +business center NCMS000_ +business centers NCMP000_ call center NCMS000_ +call centers NCMP000_ shopping center NCMS000_ +shopping centers NCMP000_ +contact center NCMS000_ +contact centers NCMP000_ +support center NCMS000_ +support centers NCMP000_ sex appeal NCMS000_ sex shop NCFS000_ sex shops NCFP000_ @@ -11532,7 +11512,7 @@ in medias res L_LATIM_ facultas docendi L_LATIM_ venia legendi L_LATIM_ Grand Prix NPMS000_ -Grand Slam NCMS000_ +grand slam NCMS000_ horribile dictu L_LATIM_ Diabetes mellitus L_LATIM_ Diabetes renalis L_LATIM_ @@ -12411,5 +12391,28 @@ Steaua Bucareste NPCSO00_ Estrela de Bucareste NPCSO00_ M'Banza Kongo NPCSG00_ Nottingham Forest NPCSO00_ +AS Saint-Étienne NPCSO00_ Mikhail Yuryevich NPMSS00_ +Hall of Fame NPMSO00_ +Rock and Roll Hall of Fame NPMSO00_ +Mao Tsé-Tung NPMSS00_ +Deng 
Xiaoping NPMSS00_ +composite card NCMS000_ +composite cards NCMP000_ +SIM card NCMS000_ +SIM cards NCMP000_ +lean office NCMS000_ +lean offices NCMP000_ +Heil Hitler I +Sieg Heil I +clarinete basset NCMS000_ +clarinetes basset NCMP000_ +clarinete de basset NCMS000_ +clarinetes de basset NCMP000_ +green card NCMS000_ +green cards NCMP000_ +red carpet NCCS000_ +red carpets NCCP000_ +touch screen NCFS000_ +touch screens NCFP000_ diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/grammar.xml b/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/grammar.xml index 40b917d2eb18..651dc829faa2 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/grammar.xml +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/grammar.xml @@ -44203,7 +44203,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. - + &particles_of; @@ -44275,7 +44275,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. Rocki Mountains National Parrk. - + @@ -44396,7 +44396,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. Sucre de caña - + - @@ -44520,7 +44520,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. Notáveis são o signor Gabriele Leone, Giovanni Battista Gervasio, Pietro Denis, que viajou muito entre 1750 e 1810. 
- + .+ - From ba0e768c6eb7499032946066b01d08e135e1cb38 Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 16:54:14 +0100 Subject: [PATCH 2/9] Add smart titlecase method to StringTools --- .../org/languagetool/tools/StringTools.java | 77 +++++++++++++++++++ .../languagetool/tools/StringToolsTest.java | 8 ++ 2 files changed, 85 insertions(+) diff --git a/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java b/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java index 1ed3c8a06539..021a47a2820a 100644 --- a/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java +++ b/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java @@ -18,6 +18,7 @@ */ package org.languagetool.tools; +import com.google.common.collect.Sets; import com.google.common.xml.XmlEscapers; import org.jetbrains.annotations.Contract; import org.jetbrains.annotations.Nullable; @@ -29,6 +30,8 @@ import java.text.Normalizer; import java.util.*; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; import static java.util.regex.Pattern.*; @@ -75,6 +78,42 @@ public enum ApiPrintMode { private static final Pattern NOT_WORD_STR = compile("[^\\p{L}]+", DOTALL); private static final Pattern PATTERN = compile("(?U)[^\\p{Space}\\p{Alnum}\\p{Punct}]"); private static final Pattern DIACRIT_MARKS = compile("[\\p{InCombiningDiacriticalMarks}]"); + // Sets of words used for titlecasing in a few locales; useful for named entities in foreign languages, esp. 
English
+  private static final Set<String> ENGLISH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
+    new HashSet<>(Arrays.asList("of", "in", "on", "the", "a", "an", "and", "or"))
+  );
+  private static final Set<String> PORTUGUESE_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
+    new HashSet<>(Arrays.asList("e", "ou", "que",
+      "de", "do", "dos", "da", "das",
+      "o", "a", "os", "as",
+      "no", "nos", "na", "nas",
+      "ao", "aos", "à", "às"))
+  );
+  private static final Set<String> FRENCH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
+    new HashSet<>(Arrays.asList("et", "ou", "que", "qui",
+      "de", "du", "des", "en",
+      "le", "les", "la",
+      "un", "une",
+      "à", "au", "aux"))
+  );
+  private static final Set<String> SPANISH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
+    new HashSet<>(Arrays.asList("y", "e", "o", "u", "que",
+      "el", "la", "los", "las",
+      "un", "unos", "una", "unas",
+      "del", "nel", "de", "en", "a", "al"))
+  );
+  private static final Set<String> GERMAN_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
+    new HashSet<>(Arrays.asList("von", "in", "im", "an", "am", "vom", "und", "oder", "dass", "ob",
+      "der", "die", "das", "dem", "den", "des",
+      "ein", "eines", "einem", "einen", "einer", "eine",
+      "kein", "keines", "keinem", "keinen", "keiner", "keine"))
+  );
+  private static final Set<String> DUTCH_TITLECASE_EXCEPTIONS = Collections.unmodifiableSet(
+    new HashSet<>(Arrays.asList("van", "in", "de", "het", "een", "en", "of"))
+  );
+
+
+  private static final Set<String> ALL_TITLECASE_EXCEPTIONS = collectAllTitleCaseExceptions();
 
   private StringTools() {
     // only static stuff
@@ -246,6 +285,44 @@ public static String uppercaseFirstChar(@Nullable String str, Language language)
     }
   }
 
+  private static Set<String> collectAllTitleCaseExceptions() {
+    // Union of the per-language lists; wrapped as unmodifiable because the result is shared static state.
+    return Collections.unmodifiableSet(Stream.of(ENGLISH_TITLECASE_EXCEPTIONS, PORTUGUESE_TITLECASE_EXCEPTIONS,
+        FRENCH_TITLECASE_EXCEPTIONS, SPANISH_TITLECASE_EXCEPTIONS, GERMAN_TITLECASE_EXCEPTIONS, DUTCH_TITLECASE_EXCEPTIONS)
+      .flatMap(Set::stream).collect(Collectors.toSet()));
+  }
+
+  /**
+   * Title case a string ignoring a list of words. These words are ignored due to titlecasing conventions in the most
+   * frequent languages. Differs from {@link #convertToTitleCaseIteratingChars(String)} in that it is less aggressive,
+   * i.e., we do not force titlecase in all caps words (e.g. IDEA does not become Idea). This method behaves the same
+   * regardless of the language, and is rather aggressive in its ignoring of words (language-specific rules may come later).
+   * @param str text whose words are separated by single spaces; {@code null} yields {@code null}
+   * @return the title-cased text; the first word is always capitalized, even when it is an exception word
+   */
+  @Contract("!null -> !null")
+  @Nullable
+  public static String titlecaseGlobal(@Nullable final String str) {
+    if (str == null) {
+      return null;
+    }
+    String[] strParts = str.split(" ");
+    if (strParts.length == 1) {
+      return uppercaseFirstChar(str);
+    }
+    StringJoiner titlecasedStr = new StringJoiner(" ");
+    for (int i = 0; i < strParts.length; i++) {
+      String strPart = strParts[i];
+      // Locale.ROOT: default-locale lowercasing (e.g. Turkish dotless i) would miss exceptions such as "in"
+      if (i > 0 && ALL_TITLECASE_EXCEPTIONS.contains(strPart.toLowerCase(Locale.ROOT))) {
+        titlecasedStr.add(lowercaseFirstCharIfCapitalized(strPart));
+      } else {
+        titlecasedStr.add(uppercaseFirstChar(strPart));
+      }
+    }
+    return titlecasedStr.toString();
+  }
+
   /**
    * Return str modified so that its first character is now an
    * lowercase character.
If str starts with non-alphabetic diff --git a/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java b/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java index a68b19a21c4c..67dc5ab1cb47 100644 --- a/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java +++ b/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java @@ -262,4 +262,12 @@ public void testIsCamelCase() { assertTrue(StringTools.isCamelCase("iSomeTHING")); } + @Test + public void testTitlecaseGlobal() { + assertEquals("The Lord of the Rings", StringTools.titlecaseGlobal("the lord of the rings")); + assertEquals("Rhythm and Blues", StringTools.titlecaseGlobal("rhythm And blues")); + assertEquals("Memória de Leitura", StringTools.titlecaseGlobal("memória de leitura")); + assertEquals("Fond du Lac", StringTools.titlecaseGlobal("fond du lac")); + assertEquals("El Niño de las Islas", StringTools.titlecaseGlobal("el niño de Las islas")); + } } From 916e7a675d0c441f9a8528b70ba98c8b5261ba1c Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 17:30:10 +0100 Subject: [PATCH 3/9] [pt] Fix multiword prepositions --- .../languagetool/resource/pt/multiwords.txt | 643 ++++-------------- .../languagetool/rules/pt/pt-BR/grammar.xml | 2 +- 2 files changed, 122 insertions(+), 523 deletions(-) diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt index 26da94673771..9bba8f7b9b03 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt/multiwords.txt @@ -274,29 +274,25 @@ ZX Spectrum NPMS000 ### Locuções ### #################################################### # a gente PP3CS000 -# Devido à SP000 -# devido à SP000 -# devido ao SP000 -# 
Devido ao SP000 -# Devido aos SP000 -# devido aos SP000 -# devido às SP000 -# Devido às SP000 -# Devido as SP000 -# devido as SP000 +# Devido à SPS00 +# devido à SPS00 +# devido ao SPS00 +# Devido ao SPS00 +# Devido aos SPS00 +# devido aos SPS00 +# devido às SPS00 +# Devido às SPS00 +# Devido as SPS00 +# devido as SPS00 # dezenas de RG # Dezenas de RG # É que RG # é que RG -À beira de SP000 -à beira de SP000 +à beira de SPS00 a bem dizer RG -A bem dizer RG -A bordo RG a bordo RG à Brás RG à bulhão pato RG -à Bulhão Pato RG a cada ano RG a cada dia RG a cada hora RG @@ -304,814 +300,417 @@ a cada mês RG a cada minuto RG a cada segundo RG a cada semana RG -a cargo de SP000 -A cargo de SP000 -A coberto de SP000 -a coberto de SP000 -à custa de SP000 -À custa de SP000 -A custo RG +a cargo de SPS00 +a coberto de SPS00 +à custa de SPS00 a custo RG a dada altura RG -A dada altura RG à direita RG -À direita RG -À disposição RG à disposição RG -À distância RG à distância RG à distância de RG -À distância de RG -À espreita SP000 -à espreita SP000 +à espreita SPS00 à esquerda RG -À esquerda RG -à face de SP000 -à face de SP000 -a favor de SP000 -A favor de SP000 -a fim de SP000 -A fim de SP000 +à face de SPS00 +a favor de SPS00 +a fim de SPS00 a fio RG -À frente RG à frente RG -à hora de SP000 -À hora de SP000 +à hora de SPS00 à Lagareiro RG -à luz de SP000 -À luz de SP000 -A mais de SP000 -a mais de SP000 -À maneira de SP000 -à maneira de SP000 -À medida de SP000 -à medida de SP000 -À medida que RG +à luz de SPS00 +a mais de SPS00 +à maneira de SPS00 à medida que RG a não ser que CS -A não ser que CS à noite RG -À noite RG a páginas tantas RG -A par de SP000 -a par de SP000 -À partida RG +a par de SPS00 à partida RG -A partir de SP000 -a partir de SP000 -A pé RG +a partir de SPS00 a pé RG -A posteriori RG a posteriori RG a prazo RG -A preceito RG a preceito RG -À pressa RG à pressa RG à primeira vista RG -À primeira vista RG a priori RG -A priori RG a propósito RG -A 
propósito RG -A qualquer momento RG a qualquer momento RG -A seu tempo RG a seu tempo RG -A sós RG a sós RG -À tarde RG à tarde RG -a tempo de SP000 -A tempo de SP000 +a tempo de SPS00 a termo RG -a título de SP000 -A título de SP000 -À toa RG +a título de SPS00 à toa RG -A toda a SP000 -a toda a SP000 +a toda a SPS00 a todo o pano RG -A todo o pano RG à tona RG -À tona RG a torto e a direito RG -A torto e a direito RG -À tripa-forra RG à tripa-forra RG -À venda em SP000 -à venda em SP000 -à vista de SP000 -À vista de SP000 -à volta de SP000 -À volta de SP000 -À vontade RG +à venda em SPS00 +à vista de SPS00 +à volta de SPS00 à vontade RG -Abaixo de SP000 -abaixo de SP000 -Acerca de SP000 -acerca de SP000 -acima de SP000 -Acima de SP000 +abaixo de SPS00 +acerca de SPS00 +acima de SPS00 afinal de contas RG -Afinal de contas RG -Agora que RG agora que RG ainda assim RG -Ainda assim RG ainda bem RG -Ainda bem RG -Ainda por cima RG ainda por cima RG -Ainda que CS ainda que CS -Além de SP000 -além de SP000 -Alguma vez RG +além de SPS00 alguma vez RG -Ano a ano RG ano a ano RG -Ano após ano RG ano após ano RG ao acaso RG -Ao acaso RG -ao cabo de SP000 -Ao cabo de SP000 +ao cabo de SPS00 ao calhas RG ao certo RG -Ao certo RG ao contrário RG -Ao contrário RG -ao contrário de SP000 -Ao contrário de SP000 -ao encontro de SP000 -Ao encontro de SP000 -Ao fim da tarde RG +ao contrário de SPS00 +ao encontro de SPS00 ao fim da tarde RG ao fundo RG -Ao fundo RG ao lado RG -Ao lado RG -ao lado de SP000 -Ao lado de SP000 +ao lado de SPS00 ao largo RG -Ao largo RG -Ao largo de SP000 -ao largo de SP000 -Ao longe RG +ao largo de SPS00 ao longe RG -Ao longo da semana RG ao longo da semana RG -ao longo de SP000 -Ao longo de SP000 -Ao longo do ano RG +ao longo de SPS00 ao longo do ano RG -Ao longo do dia RG ao longo do dia RG -Ao longo do mês RG ao longo do mês RG ao longo dos anos RG -Ao longo dos anos RG -Ao longo dos dias RG ao longo dos dias RG ao longo dos meses RG -Ao longo dos 
meses RG ao máximo RG -Ao máximo RG -Ao menos RG ao menos RG -Ao mesmo tempo RG ao mesmo tempo RG -Ao passo que CS ao passo que CS -ao pé de SP000 -Ao pé de SP000 -Ao todo RG +ao pé de SPS00 ao todo RG -Ao vivo RG ao vivo RG -Aos olhos de SP000 -aos olhos de SP000 -Aos poucos RG +aos olhos de SPS00 aos poucos RG apesar de CS -Apesar de CS apesar disso RG -Apesar disso RG -aquando de SP000 -Aquando de SP000 -aquém de SP000 -Aquém de SP000 +aquando de SPS00 +aquém de SPS00 aqui e ali RG -Aqui e ali RG -Às avessas RG às avessas RG às cegas RG -Às cegas RG às claras RG -Às claras RG -Às direitas RG às direitas RG -Às escondidas RG às escondidas RG -Às escuras RG às escuras RG às pressas RG -Às pressas RG às vezes RG -Às vezes RG -Assim como RG assim como RG até agora RG até ao momento RG -Até então RG até então RG até este momento RG até mesmo RG -Até mesmo RG até o momento RG até que ponto RG -Até que ponto RG -Atrás de SP000 -atrás de SP000 -Através de SP000 -através de SP000 +atrás de SPS00 +através de SPS00 bem como RG -Bem como RG -Cada vez RG cada vez RG -Cada vez mais RG cada vez mais RG -com base em SP000 -Com base em SP000 +com base em SPS00 com calma RG -Com calma RG -Com certeza RG com certeza RG com dificuldade RG -Com dificuldade RG -Com efeito RG com efeito RG -Com facilidade RG com facilidade RG -com vista a SP000 -Com vista a SP000 -Como que CS +com vista a SPS00 como que CS -Como se CS como se CS -da parte de SP000 -Da parte de SP000 +da parte de SPS00 dado que CS -Dado que CS -Daqui para a frente RG daqui para a frente RG -De acordo RG de acordo RG -de acordo SP000 -De acordo SP000 +de acordo SPS00 de algum modo RG -De algum modo RG de alguma forma RG de alguma forma RG -De alguma forma RG -De alguma forma RG de alguma maneira RG -De alguma maneira RG de alto a baixo RG -De alto a baixo RG -De bom grado RG de bom grado RG de bruços RG -De bruços RG -De certa forma RG de certa forma RG de certo modo RG -De certo modo RG de cima RG -De cima RG de cor RG 
-De cor RG -De costas RG de costas RG de dentro RG -De dentro RG de dia RG -De dia RG -De encontro a SP000 -de encontro a SP000 +de encontro a SPS00 de fato RG -De fato RG de fora RG -De fora RG -de fora SP000 -De fora SP000 -De forma a SP000 -de forma a SP000 -De forma alguma RG +de forma a SPS00 de forma alguma RG de forma alguma RN -De forma alguma RN de forma nenhuma RN -De forma nenhuma RN -De graça RG de graça RG -De imediato RG de imediato RG -De jeito nenhum RN de jeito nenhum RN -De longe RG de longe RG -De má vontade RG de má vontade RG -de maneira a SP000 -De maneira a SP000 +de maneira a SPS00 de maneira nenhuma RG -De maneira nenhuma RG de manhã RG -De manhã RG -De modo a SP000 -de modo a SP000 -De modo algum RN +de modo a SPS00 de modo algum RN -De modo que CS de modo que CS de muito RG -De muito RG de nenhuma forma RN -De nenhuma forma RN -De noite RG de noite RG de novo RG -De novo RG -De perto RG de perto RG de propósito RG -De propósito RG de qualquer forma RG -De qualquer forma RG de qualquer maneira RG -De qualquer maneira RG -De qualquer modo RG de qualquer modo RG -De quando em quando RG de quando em quando RG de repente RG -De repente RG de rompante RG -De rompante RG de súbito RG -De súbito RG -De supetão RG de supetão RG de tal forma RG -De tal forma RG -De tal modo que CS de tal modo que CS de tempos a tempos RG -De tempos a tempos RG -De tempos em tempos RG de tempos em tempos RG -de todo RG -De todo RG -De vez em quando RG +# de todo RG de vez em quando RG -debaixo de SP000 -Debaixo de SP000 -dentro de SP000 -Dentro de SP000 -Desde então RG +debaixo de SPS00 +dentro de SPS00 desde então RG -desde há SP000 -Desde há SP000 -Desde há já SP000 -desde há já SP000 -Desde já RG +desde há SPS00 +desde há já SPS00 desde já RG desde logo RG -Desde logo RG desde pequeno RG -Desde pequeno RG -Desde que CS desde que CS dessa maneira RG -Dessa maneira RG -Desse jeito RG desse jeito RG desse modo RG -Desse modo RG desta feita RG -Desta feita RG -Desta 
forma RG desta forma RG desta vez RG -Desta vez RG -detrás de SP000 -Detrás de SP000 -Devido a SP000 -devido a SP000 +detrás de SPS00 +devido a SPS00 dia a dia RG -Dia a dia RG -Dia após dia RG dia após dia RG -diante de SP000 -Diante de SP000 -Do que CS +diante de SPS00 do que CS -Dois a dois RG dois a dois RG duas a duas RG -Duas a duas RG -E por aí RG e por aí RG eis que RG -Eis que RG -Em abono de SP000 -em abono de SP000 +em abono de SPS00 em algum momento RG -Em algum momento RG em armazém RG -Em armazém RG -Em baixo RG em baixo RG -Em breve RG em breve RG -Em causa RG em causa RG -Em cima RG em cima RG -Em cima de SP000 -em cima de SP000 -Em comum RG +em cima de SPS00 em comum RG em conta RG -Em conta RG -Em curso RG em curso RG -Em detrimento de SP000 -em detrimento de SP000 -Em evidência RG +em detrimento de SPS00 em evidência RG em excesso RG -Em excesso RG -Em face de SP000 -em face de SP000 -em favor de SP000 -Em favor de SP000 -Em função de SP000 -em função de SP000 +em face de SPS00 +em favor de SPS00 +em função de SPS00 em geral RG -Em geral RG em grande parte RG -Em grande parte RG -em homenagem a SP000 -Em homenagem a SP000 -Em jeito de SP000 -em jeito de SP000 -Em larga medida RG +em homenagem a SPS00 +em jeito de SPS00 em larga medida RG em massa RG -Em massa RG em média RG -Em média RG em momento algum RG -Em momento algum RG -em nome de SP000 -Em nome de SP000 -Em obediência a SP000 -em obediência a SP000 -Em parte RG +em nome de SPS00 +em obediência a SPS00 em parte RG em partes RG -Em partes RG -Em primeiro lugar RG em primeiro lugar RG -Em prol de SP000 -em prol de SP000 -Em regra RG +em prol de SPS00 em regra RG -em relação a SP000 -Em relação a SP000 -Em segundo lugar RG +em relação a SPS00 em segundo lugar RG -Em seu favor RG em seu favor RG -Em silêncio RG em silêncio RG -Em simultâneo RG em simultâneo RG em stock RG -Em stock RG +em estoque RG em tempo real RG -Em tempo real RG -Em terceiro lugar RG em terceiro lugar RG -em termos de 
SP000 -Em termos de SP000 +em termos de SPS00 em toda a parte RG -Em toda a parte RG -em torno de SP000 -Em torno de SP000 +em torno de SPS00 em vão RG -Em vão RG -Em vez de SP000 -em vez de SP000 -em vias de SP000 -Em vias de SP000 -em virtude de SP000 -Em virtude de SP000 +em vez de SPS00 +em vias de SPS00 +em virtude de SPS00 em volta RG -Em volta RG -Enquanto que CS enquanto que CS -face a SP000 -Face a SP000 -Face ao SP000 -face ao SP000 -Face aos SP000 -face aos SP000 -Face às SP000 -face às SP000 -frente a SP000 -Frente a SP000 +face a SPS00 +Face a SPS00 +face ao SPS00 +face aos SPS00 +face às SPS00 +frente a SPS00 frente a frente RG -Frente a frente RG -graças a SP000 -Graças a SP000 -Graças a Deus I +graças a SPS00 +graças a Deus I hoje de manhã RG -Hoje de manhã RG -Hoje em dia RG hoje em dia RG hoje mesmo RG -Hoje mesmo RG -In loco SP000 -in loco SP000 +in loco SPS00 inferior a RG -Inferior a RG já que CS -Já que CS lado a lado RG -Lado a lado RG -Logo a seguir RG logo a seguir RG logo que RG -Logo que RG -longe de SP000 -Longe de SP000 -Mais nada RG +longe de SPS00 mais nada RG -Mais ou menos RG mais ou menos RG mais uma vez RG -Mais uma vez RG -Mar adentro RG mar adentro RG -Menos de RG menos de RG -Mês a mês RG mês a mês RG mês após mês RG -Mês após mês RG mesmo assim RG -Mesmo assim RG -Mesmo que CS mesmo que CS mesmo que RG -Mesmo que RG -Muitas vezes RG muitas vezes RG muito pouco RG -Na altura RG na altura RG na altura certa RG -Na altura certa RG -Na altura da cintura AQFS00 na altura da cintura AQFS00 na altura devida RG -Na altura devida RG na generalidade RG -Na generalidade RG -Na íntegra RG na íntegra RG -Na medida em que RG na medida em que RG na melhor das hipóteses RG -Na melhor das hipóteses RG -Na presença de SP000 -na presença de SP000 -Na sua maior parte RG +na presença de SPS00 na sua maior parte RG -Na sua maioria RG na sua maioria RG na verdade RG -Na verdade RG não obstante RG -Não obstante RG -Não só RG não só RG nem sequer RG 
-Nem sequer RG nessa altura RG -Nessa altura RG nesse sentido RG -Nesse sentido RG neste momento RG -Neste momento RG -Neste sentido RG neste sentido RG -no caso de SP000 -No caso de SP000 -no cimo de SP000 -No cimo de SP000 -No conceito de SP000 -no conceito de SP000 -no decorrer de SP000 -No decorrer de SP000 -No decurso de SP000 -no decurso de SP000 +no caso de SPS00 +no cimo de SPS00 +no conceito de SPS00 +no decorrer de SPS00 +no decurso de SPS00 no entanto RG -No entanto RG no máximo RG -No máximo RG -no meio de SP000 -No meio de SP000 -No mínimo RG +no meio de SPS00 no mínimo RG -No que se refere RG no que se refere RG -Nos termos de SP000 -nos termos de SP000 -Ontem de manhã RG +nos termos de SPS00 ontem de manhã RG ou seja RG -Ou seja RG -Outra vez RG outra vez RG -Para além SP000 -para além SP000 -para além de SP000 -Para além de SP000 -Para casa RG +para além SPS00 +para além de SPS00 para casa RG -Para cima RG para cima RG -Para dentro RG para dentro RG para fora RG -Para fora RG -Para fora de SP000 -para fora de SP000 -Para onde RG +para fora de SPS00 para onde RG -para os lados de SP000 -Para os lados de SP000 -Passo a passo RG +para os lados de SPS00 passo a passo RG pela primeira vez RG -Pela primeira vez RG pelo contrário RG -Pelo contrário RG -Pelo menos RG pelo menos RG -Pelos vistos RG pelos vistos RG per capita RG -Per capita RG -Perto de SP000 -perto de SP000 +perto de SPS00 pois então RG -Pois então RG -Pois que CS pois que CS por ali RG -Por ali RG por aqui RG -Por aqui RG -Por cabeça RG por cabeça RG -Por causa de SP000 -por causa de SP000 -Por certo RG +por causa de SPS00 por certo RG -Por cima de SP000 -por cima de SP000 +por cima de SPS00 por completo RG -Por completo RG -Por conta de SP000 -por conta de SP000 +por conta de SPS00 por dentro RG -Por dentro RG -Por detrás de SP000 -por detrás de SP000 -por entre SP000 -Por entre SP000 +por detrás de SPS00 +por entre SPS00 por excelência RG -Por excelência RG -Por exemplo RG por exemplo RG 
por fora RG -Por fora RG -Por habitante RG por habitante RG -Por iniciativa de SP000 -por iniciativa de SP000 -por meio de SP000 -Por meio de SP000 -Por ocasião de SP000 -por ocasião de SP000 +por iniciativa de SPS00 +por meio de SPS00 +por ocasião de SPS00 por onde RG -Por onde RG -por parte de SP000 -Por parte de SP000 +por parte de SPS00 por perto RG -Por perto RG por pouco RG -Por pouco RG -Por que RG por que RG por vezes RG -Por vezes RG -por via de SP000 -Por via de SP000 -Por volta de SP000 -por volta de SP000 +por via de SPS00 +por volta de SPS00 porta a porta RG -Porta a porta RG pouco a pouco RG -Pouco a pouco RG -Pra onde RG pra onde RG -próximo de SP000 -Próximo de SP000 -quando de SP000 -Quando de SP000 -Quanto a SP000 -quanto a SP000 -Quanto mais RG +próximo de SPS00 +quando de SPS00 +quanto a SPS00 quanto mais RG -Quem sabe RG quem sabe RG -referente a SP000 -Referente a SP000 -relativamente a SP000 -Relativamente a SP000 -Rumo a SP000 -rumo a SP000 +referente a SPS00 +relativamente a SPS00 +rumo a SPS00 se bem que CS -Se bem que CS -Se calhar RG se calhar RG -Sem calma RG sem calma RG sem certeza RG -Sem certeza RG -Sem dúvida RG sem dúvida RG -Semana a semana RG semana a semana RG -Semana após semana RG semana após semana RG sempre que RG -Sempre que RG -Sob pena de SP000 -sob pena de SP000 +sob pena de SPS00 superior a RG -Superior a RG -Tal como RG tal como RG tanto mais quanto RG -Tanto mais quanto RG -Tanto mais que CS tanto mais que CS todos os dias RG -Todos os dias RG -Um a um RG um a um RG um ao outro RG -Um ao outro RG um bocadinho RG -Um bocadinho RG Um bocado RG um bocado RG -Um pouco RG um pouco RG um pouquinho RG -Um pouquinho RG -Um tanto RG um tanto RG uma a uma RG -Uma a uma RG -Uma vez RG uma vez RG -Uma vez mais RG uma vez mais RG uma vez por todas RG -Uma vez por todas RG -Uma vez que CS uma vez que CS visto que CS -Visto que CS #################################################################### ### Proper nouns ### ### 
Partially from 14-11-2016 Catalan and Spanish multiword.txt ### diff --git a/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/pt-BR/grammar.xml b/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/pt-BR/grammar.xml index b32ac4f7c9e9..abbb62423578 100644 --- a/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/pt-BR/grammar.xml +++ b/languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt/pt-BR/grammar.xml @@ -1098,7 +1098,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA --> Ter - + Dom From a8c4f4f966444ed4f4ba63693b2883587bddfe26 Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 17:31:07 +0100 Subject: [PATCH 4/9] [pt] Update PT tokeniser - improve handling of percent signs (was: [50%OFF], will be: [50%, OFF]); - add some tests due to the latest dictionary version. --- .../pt/PortugueseWordTokenizer.java | 54 ++++++++++--------- .../pt/PortugueseWordTokenizerTest.java | 7 +++ 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java index 17e9e044356a..0be99a2fefbd 100644 --- a/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizer.java @@ -18,13 +18,11 @@ */ package org.languagetool.tokenizers.pt; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.StringTokenizer; +import com.google.common.base.Splitter; import org.languagetool.tagging.pt.PortugueseTagger; import org.languagetool.tokenizers.WordTokenizer; @@ -56,7 
+54,6 @@ public class PortugueseWordTokenizer extends WordTokenizer { // space between digits private static final Pattern DECIMAL_SPACE_PATTERN = compile("(?<=^|[\\s(])\\d{1,3}( [\\d]{3})+(?=[\\s(]|$)", CASE_INSENSITIVE| UNICODE_CASE); - // dots in numbers private static final Pattern DOTTED_NUMBERS_PATTERN = compile("([\\d])\\.([\\d])", CASE_INSENSITIVE| UNICODE_CASE); private static final String DOTTED_NUMBERS_REPL = "$1" + NON_BREAKING_DOT_SUBST + "$2"; @@ -79,31 +76,31 @@ public class PortugueseWordTokenizer extends WordTokenizer { private static final String HYPHEN_REPL = "$1" + HYPHEN_SUBST + "$2"; private static final Pattern NEARBY_HYPHENS_PATTERN = compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", CASE_INSENSITIVE | UNICODE_CASE); private static final String NEARBY_HYPHENS_REPL = "$1" + HYPHEN_SUBST + "$2" + HYPHEN_SUBST + "$3"; - - private final String PT_TOKENISING_CHARS = getTokenizingCharacters() + "⌈⌋″©"; + private final String PT_TOKENISING_CHARS = getTokenizingCharacters() + "⌈⌋″©%"; public PortugueseWordTokenizer() { tagger = new PortugueseTagger(); } @Override - public List tokenize(String text) { + public List tokenize(final String text) { + String tokenisedText = text; // it's really bad practice to reassign method params imo... 
- if (text.contains(",")) { - text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL); + if (tokenisedText.contains(",")) { + tokenisedText = DECIMAL_COMMA_PATTERN.matcher(tokenisedText).replaceAll(DECIMAL_COMMA_REPL); } // if period is not the last character in the sentence - int dotIndex = text.indexOf('.'); - boolean dotInsideSentence = dotIndex >= 0 && dotIndex < text.length() - 1; + int dotIndex = tokenisedText.indexOf('.'); + boolean dotInsideSentence = dotIndex >= 0 && dotIndex < tokenisedText.length() - 1; if (dotInsideSentence) { - text = DATE_PATTERN.matcher(text).replaceAll(DATE_PATTERN_REPL); - text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll(DOTTED_NUMBERS_REPL); - text = DOTTED_ORDINALS_PATTERN.matcher(text).replaceAll(DOTTED_ORDINALS_REPL); + tokenisedText = DATE_PATTERN.matcher(tokenisedText).replaceAll(DATE_PATTERN_REPL); + tokenisedText = DOTTED_NUMBERS_PATTERN.matcher(tokenisedText).replaceAll(DOTTED_NUMBERS_REPL); + tokenisedText = DOTTED_ORDINALS_PATTERN.matcher(tokenisedText).replaceAll(DOTTED_ORDINALS_REPL); } // 2 000 000 - Matcher spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(text); + Matcher spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(tokenisedText); if (spacedDecimalMatcher.find()) { StringBuffer sb = new StringBuffer(); do { @@ -113,29 +110,38 @@ public List tokenize(String text) { spacedDecimalMatcher.appendReplacement(sb, splitNumberAdjusted); } while (spacedDecimalMatcher.find()); spacedDecimalMatcher.appendTail(sb); - text = sb.toString(); + tokenisedText = sb.toString(); } // 12:25 - if (text.contains(":")) { - text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL); + if (tokenisedText.contains(":")) { + tokenisedText = COLON_NUMBERS_PATTERN.matcher(tokenisedText).replaceAll(COLON_NUMBERS_REPL); } - if (text.contains("-")) { - text = NEARBY_HYPHENS_PATTERN.matcher(text).replaceAll(NEARBY_HYPHENS_REPL); - text = HYPHEN_PATTERN.matcher(text).replaceAll(HYPHEN_REPL); + if 
(tokenisedText.contains("-")) { + tokenisedText = NEARBY_HYPHENS_PATTERN.matcher(tokenisedText).replaceAll(NEARBY_HYPHENS_REPL); + tokenisedText = HYPHEN_PATTERN.matcher(tokenisedText).replaceAll(HYPHEN_REPL); } List tokenList = new ArrayList<>(); - StringTokenizer st = new StringTokenizer(text, PT_TOKENISING_CHARS, true); + StringTokenizer st = new StringTokenizer(tokenisedText, PT_TOKENISING_CHARS, true); while (st.hasMoreElements()) { String token = st.nextToken(); + // make sure we join the % sign with the previous token, if it ends in a digit + if (Objects.equals(token, "%")) { + int lastIndex = tokenList.size() - 1; + String lastToken = tokenList.get(lastIndex); + if (lastToken.matches(".*\\d$")) { + tokenList.set(lastIndex, lastToken + "%"); + continue; + } + } token = token.replace(DECIMAL_COMMA_SUBST, ','); token = token.replace(NON_BREAKING_COLON_SUBST, ':'); token = token.replace(NON_BREAKING_SPACE_SUBST, ' '); // outside of if as we also replace back sentence-ending abbreviations token = token.replace(NON_BREAKING_DOT_SUBST, '.'); token = HYPHEN_SUBST.matcher(token).replaceAll("-"); - tokenList.addAll( wordsToAdd(token)); + tokenList.addAll(wordsToAdd(token)); } return joinEMailsAndUrls(tokenList); diff --git a/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java b/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java index 5b97e4c3e83b..b4ef979cb2c2 100644 --- a/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java +++ b/languagetool-language-modules/pt/src/test/java/org/languagetool/tokenizers/pt/PortugueseWordTokenizerTest.java @@ -67,6 +67,8 @@ public void testTokeniseHyphenatedSingleToken() { testTokenise("sex-appeal", new String[]{"sex-appeal"}); testTokenise("Aix-en-Provence", new String[]{"Aix-en-Provence"}); testTokenise("Montemor-o-Novo", new String[]{"Montemor-o-Novo"}); + 
testTokenise("Andorra-a-Velha", new String[]{"Andorra-a-Velha"}); + testTokenise("Tsé-Tung", new String[]{"Tsé-Tung"}); } @Test @@ -119,6 +121,11 @@ public void testTokeniseCurrency() { testTokenise("€2.000,00", new String[]{"€", "2.000,00"}); } + @Test + public void testTokeniseSplitsPercent() { + testTokenise("50%OFF", new String[]{"50%", "OFF"}); + } + @Test public void testTokeniseNumberAbbreviation() { testTokenise("Nº666", new String[]{"Nº666"}); // superscript 'o' From a77bd8d2792d6f37a10a2e06f7bd1154f1f5bfc1 Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 17:32:29 +0100 Subject: [PATCH 5/9] [pt] Add speller tests due to latest dictionary --- .../MorfologikPortugueseSpellerRuleTest.java | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java b/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java index 52c97feacd85..3e8ddbdc3daa 100644 --- a/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java +++ b/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java @@ -269,6 +269,16 @@ public void testPortugueseSymmetricalDialectDifferences() throws Exception { // assertTwoWayDialectError("bebê", "bebé"); } + @Test + public void testPortugueseAsymmetricalDialectDifferences() throws Exception { + // 'facto' is always invalid in pt-BR + assertSingleExactError("facto", ltBR, ruleBR, "fato", + "Possível erro de ortografia: esta é a grafia utilizada no português europeu.", + "MORFOLOGIK_RULE_PT_BR_DIALECT"); + // 'fato' is valid in pt-PT, albeit with another meaning + assertNoErrors("fato", ltPT, rulePT); + } + @Test public void testPortugueseSpellingAgreementVariation() throws Exception { // orthographic reforms @@ -437,6 +447,7 @@ public 
void testBrazilPortugueseSpellingCustomReplacements() throws Exception { assertSingleError("andância", ltBR, ruleBR, new String[]{"andança"}); assertSingleError("abto", ltBR, ruleBR, new String[]{"hábito"}); assertSingleError("logo nao", ltBR, ruleBR, new String[]{"não"}); + assertSingleError("kitchenette", ltBR, ruleBR, new String[]{"quitinete"}); } @Test @@ -557,7 +568,56 @@ public void testPortugueseSpellerIgnoresWordsFromIgnoreTXT() throws Exception { assertNoErrors("MERCEDES-BENZ", ltBR, ruleBR); assertNoErrors("Mercedes-Benz", ltBR, ruleBR); assertNoErrors("big band", ltBR, ruleBR); + assertNoErrors("Big band", ltBR, ruleBR); + assertNoErrors("Big Band", ltBR, ruleBR); assertNoErrors("BIG BANDS", ltBR, ruleBR); - } + // entry is "rhythm and blues" + assertNoErrors("rhythm and blues", ltBR, ruleBR); // same as file + assertNoErrors("Rhythm and blues", ltBR, ruleBR); // sentence-initial + assertNoErrors("Rhythm And Blues", ltBR, ruleBR); // title-case (naïve) + assertNoErrors("Rhythm and Blues", ltBR, ruleBR); // title-case (smart) + // entry is "stock car" + assertNoErrors("stock car", ltBR, ruleBR); + assertNoErrors("Stock Car", ltBR, ruleBR); + + assertNoErrors("Rock and Roll", ltBR, ruleBR); + assertNoErrors("Hall of Fame", ltBR, ruleBR); + assertNoErrors("Rock and Roll Hall of Fame", ltBR, ruleBR); + assertNoErrors("Rock And Roll Hall Of Fame", ltBR, ruleBR); // bad titlecasing, but we should accept + assertNoErrors("Chesapeake Bay Retriever", ltBR, ruleBR); + assertNoErrors("Pit Bull", ltBR, ruleBR); + assertNoErrors("Mao Tsé-Tung", ltBR, ruleBR); + assertNoErrors("Honoris Causa", ltBR, ruleBR); + } + + @Test public void testPortugueseSpellerEnglishCompounds() throws Exception { + // disambiguator rule + assertNoErrors("UntaggedWord Card", ltBR, ruleBR); // unknown word + assertNoErrors("Vaca Center", ltBR, ruleBR); // valid uppercase word + assertNoErrors("de Klerk Center", ltBR, ruleBR); // any proper noun, regardless of case + 
assertSingleError("caramba Center", ltBR, ruleBR, new String[]{"Conter", "Centre"}); // not in context + } + + @Test public void testPortugueseSpellerAcceptsArbitraryHyphenation() throws Exception { + assertNoErrors("Xai-Xai", ltBR, ruleBR); + assertNoErrors("Tsé-Tung", ltBR, ruleBR); + assertNoErrors("X-Men", ltBR, ruleBR); + assertNoErrors("t-shirts", ltBR, ruleBR); + assertNoErrors("além-mar", ltBR, ruleBR); + assertNoErrors("além-mares", ltBR, ruleBR); + assertNoErrors("baby-doll", ltBR, ruleBR); + assertNoErrors("baby-dolls", ltBR, ruleBR); + assertNoErrors("e-zine", ltBR, ruleBR); + assertNoErrors("e-zines", ltBR, ruleBR); + assertNoErrors("CD-ROM", ltBR, ruleBR); + assertNoErrors("CD-ROMs", ltBR, ruleBR); + assertSingleError("heavy-metal", ltBR, ruleBR, new String[]{"heavy metal"}); + } + + @Test public void testPortugueseSpellerAccepts50PercentOff() throws Exception { + // Tokenising rule; we need to add a rule to add the space ourselves, but at least it doesn't suggest nonsense + assertNoErrors("50%OFF", ltBR, ruleBR); + assertSingleError("50%oogabooga", ltBR, ruleBR, new String[]{}); + } } From 34d261ba013afb36118c9b3d34c385fae769895a Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 17:32:51 +0100 Subject: [PATCH 6/9] Add titlecasing step to MultiWordChunker class - multitoken suggestions were failing because we were only checking if they were present in the dictionary by upcasing their first letter; - this failed to account for titlecasing (either naively or a little more smartly), which is relatively frequent; - cf. stuff like "The Lord of the Rings". 
--- .../tagging/disambiguation/MultiWordChunker.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java b/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java index e37f5dd97cdf..c59c4e95c73a 100644 --- a/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java +++ b/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java @@ -22,6 +22,7 @@ import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.WordUtils; import org.jetbrains.annotations.Nullable; import org.languagetool.AnalyzedSentence; import org.languagetool.AnalyzedToken; @@ -137,11 +138,20 @@ private void fillMaps(Map mStartSpace, Map mSt String originalToken = interner.computeIfAbsent(tokenAndTag[0], Function.identity()); String tag = interner.computeIfAbsent((defaultTag != null ? defaultTag:tokenAndTag[1]), Function.identity()); tokens.add(originalToken); + // We now understand this to also allow for titlecase. 
if (allowFirstCapitalized) { String tokenFirstCapitalized = StringTools.uppercaseFirstChar(originalToken); if (!mFull.containsKey(tokenFirstCapitalized) && !originalToken.equals(tokenFirstCapitalized)) { tokens.add(tokenFirstCapitalized); } + String tokenNaivelyTitlecased = WordUtils.capitalize(originalToken); + if (!tokenNaivelyTitlecased.equals(tokenFirstCapitalized) && !mFull.containsKey(tokenNaivelyTitlecased) && !originalToken.equals(tokenNaivelyTitlecased)) { + tokens.add(tokenNaivelyTitlecased); + } + String tokenSmartlyTitlecased = StringTools.titlecaseGlobal(originalToken); + if (!tokenSmartlyTitlecased.equals(tokenNaivelyTitlecased) && !mFull.containsKey(tokenSmartlyTitlecased) && !originalToken.equals(tokenSmartlyTitlecased)) { + tokens.add(tokenSmartlyTitlecased); + } } if (allowAllUppercase) { String tokenAllUppercase = originalToken.toUpperCase(); From d14d5b10144742db90a99d70d100435c1c62d07e Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 17:40:56 +0100 Subject: [PATCH 7/9] [pt] Bump up dict to v0.12 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 43012ecf804f..2afdd69be05f 100644 --- a/pom.xml +++ b/pom.xml @@ -223,7 +223,7 @@ 0.6 2.1 - 0.11 + 0.12 0.1 1.0.2 1.3 From 57a9ef060d5496567b91ea8d21f82fe0f7ab2d1e Mon Sep 17 00:00:00 2001 From: p-goulart Date: Fri, 5 Jan 2024 20:32:20 +0100 Subject: [PATCH 8/9] Improve titlecase logic in MultiWordChunker --- .../disambiguation/MultiWordChunker.java | 71 +++++++++++++------ .../disambiguation/MultiWordChunkerTest.java | 19 +++++ .../MorfologikPortugueseSpellerRuleTest.java | 9 ++- 3 files changed, 76 insertions(+), 23 deletions(-) diff --git a/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java b/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java index c59c4e95c73a..a6a2a8dd4507 100644 --- 
a/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java +++ b/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java @@ -138,27 +138,27 @@ private void fillMaps(Map mStartSpace, Map mSt String originalToken = interner.computeIfAbsent(tokenAndTag[0], Function.identity()); String tag = interner.computeIfAbsent((defaultTag != null ? defaultTag:tokenAndTag[1]), Function.identity()); tokens.add(originalToken); - // We now understand this to also allow for titlecase. - if (allowFirstCapitalized) { - String tokenFirstCapitalized = StringTools.uppercaseFirstChar(originalToken); - if (!mFull.containsKey(tokenFirstCapitalized) && !originalToken.equals(tokenFirstCapitalized)) { - tokens.add(tokenFirstCapitalized); - } - String tokenNaivelyTitlecased = WordUtils.capitalize(originalToken); - if (!tokenNaivelyTitlecased.equals(tokenFirstCapitalized) && !mFull.containsKey(tokenNaivelyTitlecased) && !originalToken.equals(tokenNaivelyTitlecased)) { - tokens.add(tokenNaivelyTitlecased); - } - String tokenSmartlyTitlecased = StringTools.titlecaseGlobal(originalToken); - if (!tokenSmartlyTitlecased.equals(tokenNaivelyTitlecased) && !mFull.containsKey(tokenSmartlyTitlecased) && !originalToken.equals(tokenSmartlyTitlecased)) { - tokens.add(tokenSmartlyTitlecased); - } - } - if (allowAllUppercase) { - String tokenAllUppercase = originalToken.toUpperCase(); - if (!mFull.containsKey(tokenAllUppercase) && !originalToken.equals(tokenAllUppercase)) { - tokens.add(tokenAllUppercase); - } - } + tokens.addAll(getTokenLettercaseVariants(originalToken, mFull)); +// if (allowFirstCapitalized) { +// String tokenFirstCapitalized = StringTools.uppercaseFirstChar(originalToken); +// if (!mFull.containsKey(tokenFirstCapitalized) && !originalToken.equals(tokenFirstCapitalized)) { +// tokens.add(tokenFirstCapitalized); +// } +// String tokenNaivelyTitlecased = WordUtils.capitalize(originalToken); +// if 
(!tokenNaivelyTitlecased.equals(tokenFirstCapitalized) && !mFull.containsKey(tokenNaivelyTitlecased) && !originalToken.equals(tokenNaivelyTitlecased)) { +// tokens.add(tokenNaivelyTitlecased); +// } +// String tokenSmartlyTitlecased = StringTools.titlecaseGlobal(originalToken); +// if (!tokenSmartlyTitlecased.equals(tokenNaivelyTitlecased) && !mFull.containsKey(tokenSmartlyTitlecased) && !originalToken.equals(tokenSmartlyTitlecased)) { +// tokens.add(tokenSmartlyTitlecased); +// } +// } +// if (allowAllUppercase) { +// String tokenAllUppercase = originalToken.toUpperCase(); +// if (!mFull.containsKey(tokenAllUppercase) && !originalToken.equals(tokenAllUppercase)) { +// tokens.add(tokenAllUppercase); +// } +// } for (String token : tokens) { boolean containsSpace = token.indexOf(' ') > 0; String firstToken; @@ -195,6 +195,35 @@ private void fillMaps(Map mStartSpace, Map mSt } } + public List getTokenLettercaseVariants(String originalToken, Map tokenMap) { + List newTokens = new ArrayList<>(); + if (allowAllUppercase) { + String tokenAllUppercase = originalToken.toUpperCase(); + if (!tokenMap.containsKey(tokenAllUppercase) && !originalToken.equals(tokenAllUppercase)) { + newTokens.add(tokenAllUppercase); + } + } + // We now understand this to also allow for titlecase. 
+ if (allowFirstCapitalized) { + String tokenFirstCapitalized = StringTools.uppercaseFirstChar(originalToken); + if (!tokenMap.containsKey(tokenFirstCapitalized) && !originalToken.equals(tokenFirstCapitalized)) { + newTokens.add(tokenFirstCapitalized); + } + if (originalToken.split(" ").length > 1 && StringTools.startsWithLowercase(originalToken)) { // titlecasing is only relevant for multi-token entries + String tokenNaivelyTitlecased = WordUtils.capitalize(originalToken); + if (!tokenNaivelyTitlecased.equals(tokenFirstCapitalized) && !originalToken.equals(tokenNaivelyTitlecased)) { + newTokens.add(tokenNaivelyTitlecased); + } + String tokenSmartlyTitlecased = StringTools.titlecaseGlobal(originalToken); + if (!tokenSmartlyTitlecased.equals(tokenFirstCapitalized) && !tokenSmartlyTitlecased.equals(tokenNaivelyTitlecased) && + !originalToken.equals(tokenSmartlyTitlecased)) { + newTokens.add(tokenSmartlyTitlecased); + } + } + } + return newTokens; + } + @Override public AnalyzedSentence disambiguate(AnalyzedSentence input) throws IOException { return disambiguate(input, null); diff --git a/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java b/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java index 30da3b3f4fda..da24cf9bf473 100644 --- a/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java +++ b/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java @@ -6,7 +6,9 @@ import org.languagetool.tagging.xx.DemoTagger; import java.io.IOException; +import java.util.HashMap; import java.util.List; +import java.util.Map; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -99,4 +101,21 @@ public void testDisambiguate2RemoveOtherReadings() throws IOException { assertFalse(tokens[5].getReadings().toString().contains("FakePosTag")); } + @Test + public void 
testLettercaseVariants() throws IOException { + MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true,true); + Map map = new HashMap<>(); + map.put("rhythm and blues", new AnalyzedToken("rhythm and blues", "NCMS000_", "rhythm and blues")); + map.put("Vênus de Milo", new AnalyzedToken("Vênus de Milo", "NCFSS00_", "Vênus de Milo")); + List tokenVariantsRnB = multiWordChunker.getTokenLettercaseVariants("rhythm and blues", map); + assertTrue(tokenVariantsRnB.contains("Rhythm and blues")); // simple upcase of first word + assertTrue(tokenVariantsRnB.contains("Rhythm And Blues")); // naïve titlecase + assertTrue(tokenVariantsRnB.contains("Rhythm and Blues")); // smarter titlecase + assertTrue(tokenVariantsRnB.contains("RHYTHM AND BLUES")); // all caps + List tokenVariantsVenus = multiWordChunker.getTokenLettercaseVariants("Vênus de Milo", map); + assertFalse(tokenVariantsVenus.contains("Vênus De Milo")); // naïve titlecase + assertFalse(tokenVariantsVenus.contains("vênus de milo")); // downcased + assertTrue(tokenVariantsVenus.contains("VÊNUS DE MILO")); // all caps + } + } diff --git a/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java b/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java index 3e8ddbdc3daa..bc115375d51e 100644 --- a/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java +++ b/languagetool-language-modules/pt/src/test/java/org/languagetool/rules/pt/MorfologikPortugueseSpellerRuleTest.java @@ -580,12 +580,17 @@ public void testPortugueseSpellerIgnoresWordsFromIgnoreTXT() throws Exception { // entry is "stock car" assertNoErrors("stock car", ltBR, ruleBR); assertNoErrors("Stock Car", ltBR, ruleBR); + // entry is "Hall of Fame", so titlecase variants are not added + assertNoErrors("Hall of Fame", ltBR, ruleBR); + assertSingleError("Hall Of Fame", 
ltBR, ruleBR, new String[]{}); + assertErrorLength("hall of fame", 2, ltBR, ruleBR, new String[]{}); assertNoErrors("Rock and Roll", ltBR, ruleBR); assertNoErrors("Hall of Fame", ltBR, ruleBR); assertNoErrors("Rock and Roll Hall of Fame", ltBR, ruleBR); - assertNoErrors("Rock And Roll Hall Of Fame", ltBR, ruleBR); // bad titlecasing, but we should accept - assertNoErrors("Chesapeake Bay Retriever", ltBR, ruleBR); + assertSingleError("Rock And Roll Hall Of Fame", ltBR, ruleBR, new String[]{}); // bad titlecasing + assertNoErrors("Chesapeake Bay retriever", ltBR, ruleBR); + assertSingleError("Chesapeake Bay Retriever", ltBR, ruleBR, new String[]{}); // an annoying limitation assertNoErrors("Pit Bull", ltBR, ruleBR); assertNoErrors("Mao Tsé-Tung", ltBR, ruleBR); assertNoErrors("Honoris Causa", ltBR, ruleBR); From 1a0dbab5e952d2a26bb515dae03d4d02fe9c9bad Mon Sep 17 00:00:00 2001 From: p-goulart Date: Mon, 8 Jan 2024 11:05:03 +0100 Subject: [PATCH 9/9] Add titlecasing option to multi-word chunker - only Portuguese has it *on*, all other locales have it set to false; - add a simple StringTools method to check if all words in a multi-token string are lowercase (and tests). 
--- .../disambiguation/MultiWordChunker.java | 36 ++++++------------- .../org/languagetool/tools/StringTools.java | 13 +++++++ .../disambiguation/MultiWordChunkerTest.java | 4 +-- .../languagetool/tools/StringToolsTest.java | 8 +++++ .../ca/CatalanHybridDisambiguator.java | 4 +-- .../rules/de/GermanRuleDisambiguator.java | 6 ++-- .../en/EnglishHybridDisambiguator.java | 4 +-- .../es/SpanishHybridDisambiguator.java | 4 +-- .../fr/FrenchHybridDisambiguator.java | 2 +- .../pt/PortugueseHybridDisambiguator.java | 4 +-- 10 files changed, 46 insertions(+), 39 deletions(-) diff --git a/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java b/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java index a6a2a8dd4507..078b72723f6e 100644 --- a/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java +++ b/languagetool-core/src/main/java/org/languagetool/tagging/disambiguation/MultiWordChunker.java @@ -48,6 +48,7 @@ public class MultiWordChunker extends AbstractDisambiguator { private final String filename; private final boolean allowFirstCapitalized; private final boolean allowAllUppercase; + private final boolean allowTitlecase; private volatile boolean initialized; private Map mStartSpace; @@ -69,7 +70,7 @@ public class MultiWordChunker extends AbstractDisambiguator { * @param filename file text with multiwords and tags */ public MultiWordChunker(String filename) { - this(filename, false, false); + this(filename, false, false, false); } /** @@ -78,17 +79,21 @@ public MultiWordChunker(String filename) { * multiword can be capitalized * @param allowAllUppercase if set to {@code true}, the all uppercase * version of the multiword is allowed + * @param allowTitlecase if set to {@code true}, titlecased variants + * of multi-token words are accepted */ - public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase) { + public 
MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase, boolean allowTitlecase) { this.filename = filename; this.allowFirstCapitalized = allowFirstCapitalized; this.allowAllUppercase = allowAllUppercase; + this.allowTitlecase = allowTitlecase; } - public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase, String defaultTag) { + public MultiWordChunker(String filename, boolean allowFirstCapitalized, boolean allowAllUppercase, boolean allowTitlecase, String defaultTag) { this.filename = filename; this.allowFirstCapitalized = allowFirstCapitalized; this.allowAllUppercase = allowAllUppercase; + this.allowTitlecase = allowTitlecase; this.defaultTag = defaultTag; } @@ -139,26 +144,6 @@ private void fillMaps(Map mStartSpace, Map mSt String tag = interner.computeIfAbsent((defaultTag != null ? defaultTag:tokenAndTag[1]), Function.identity()); tokens.add(originalToken); tokens.addAll(getTokenLettercaseVariants(originalToken, mFull)); -// if (allowFirstCapitalized) { -// String tokenFirstCapitalized = StringTools.uppercaseFirstChar(originalToken); -// if (!mFull.containsKey(tokenFirstCapitalized) && !originalToken.equals(tokenFirstCapitalized)) { -// tokens.add(tokenFirstCapitalized); -// } -// String tokenNaivelyTitlecased = WordUtils.capitalize(originalToken); -// if (!tokenNaivelyTitlecased.equals(tokenFirstCapitalized) && !mFull.containsKey(tokenNaivelyTitlecased) && !originalToken.equals(tokenNaivelyTitlecased)) { -// tokens.add(tokenNaivelyTitlecased); -// } -// String tokenSmartlyTitlecased = StringTools.titlecaseGlobal(originalToken); -// if (!tokenSmartlyTitlecased.equals(tokenNaivelyTitlecased) && !mFull.containsKey(tokenSmartlyTitlecased) && !originalToken.equals(tokenSmartlyTitlecased)) { -// tokens.add(tokenSmartlyTitlecased); -// } -// } -// if (allowAllUppercase) { -// String tokenAllUppercase = originalToken.toUpperCase(); -// if (!mFull.containsKey(tokenAllUppercase) && 
!originalToken.equals(tokenAllUppercase)) { -// tokens.add(tokenAllUppercase); -// } -// } for (String token : tokens) { boolean containsSpace = token.indexOf(' ') > 0; String firstToken; @@ -203,13 +188,14 @@ public List getTokenLettercaseVariants(String originalToken, Map 1 && StringTools.startsWithLowercase(originalToken)) { // titlecasing is only relevant for multi-token entries + // Titlecasing is only relevant for multi-token entries, and only done for expressions that are entirely lowercase + // It is also limited to when first-letter capitalisation is allowed. + if (allowTitlecase && originalToken.split(" ").length > 1 && StringTools.allStartWithLowercase(originalToken)) { String tokenNaivelyTitlecased = WordUtils.capitalize(originalToken); if (!tokenNaivelyTitlecased.equals(tokenFirstCapitalized) && !originalToken.equals(tokenNaivelyTitlecased)) { newTokens.add(tokenNaivelyTitlecased); diff --git a/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java b/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java index 021a47a2820a..5f21f650d7dc 100644 --- a/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java +++ b/languagetool-core/src/main/java/org/languagetool/tools/StringTools.java @@ -256,6 +256,19 @@ public static boolean startsWithLowercase(String str) { return Character.isLowerCase(str.charAt(0)); } + public static boolean allStartWithLowercase(String str) { + String[] strParts = str.split(" "); + if (strParts.length < 2) { + return startsWithLowercase(str); + } + for (String strPart : strParts) { + if (!startsWithLowercase(strPart)) { + return false; + } + } + return true; + } + /** * Return str modified so that its first character is now an * uppercase character. 
If str starts with non-alphabetic diff --git a/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java b/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java index da24cf9bf473..d0f651862bc2 100644 --- a/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java +++ b/languagetool-core/src/test/java/org/languagetool/tagging/disambiguation/MultiWordChunkerTest.java @@ -39,7 +39,7 @@ public void setUp() throws Exception { @Test public void testDisambiguate1() throws IOException { - MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true, true); + MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true, true, true); AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence("ah for shame"); AnalyzedSentence disambiguated = multiWordChunker.disambiguate(analyzedSentence); @@ -103,7 +103,7 @@ public void testDisambiguate2RemoveOtherReadings() throws IOException { @Test public void testLettercaseVariants() throws IOException { - MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true,true); + MultiWordChunker multiWordChunker = new MultiWordChunker("/yy/multiwords.txt", true, true, true); Map map = new HashMap<>(); map.put("rhythm and blues", new AnalyzedToken("rhythm and blues", "NCMS000_", "rhythm and blues")); map.put("Vênus de Milo", new AnalyzedToken("Vênus de Milo", "NCFSS00_", "Vênus de Milo")); diff --git a/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java b/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java index 67dc5ab1cb47..d276b926424f 100644 --- a/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java +++ b/languagetool-core/src/test/java/org/languagetool/tools/StringToolsTest.java @@ -270,4 +270,12 @@ public void testTitlecaseGlobal() { assertEquals("Fond du Lac", 
StringTools.titlecaseGlobal("fond du lac")); assertEquals("El Niño de las Islas", StringTools.titlecaseGlobal("el niño de Las islas")); } + + @Test + public void testAllStartWithLowercase() { + assertTrue(StringTools.allStartWithLowercase("the lord of the rings")); + assertFalse(StringTools.allStartWithLowercase("the Fellowship of the Ring")); + assertTrue(StringTools.allStartWithLowercase("bilbo")); + assertFalse(StringTools.allStartWithLowercase("Baggins")); + } } diff --git a/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/disambiguation/ca/CatalanHybridDisambiguator.java b/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/disambiguation/ca/CatalanHybridDisambiguator.java index 3dc9abef2a3b..2171efbe5768 100644 --- a/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/disambiguation/ca/CatalanHybridDisambiguator.java +++ b/languagetool-language-modules/ca/src/main/java/org/languagetool/tagging/disambiguation/ca/CatalanHybridDisambiguator.java @@ -39,8 +39,8 @@ */ public class CatalanHybridDisambiguator extends AbstractDisambiguator { - private final MultiWordChunker chunker = new MultiWordChunker("/ca/multiwords.txt", true, true); - private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, "NPCN000"); + private final MultiWordChunker chunker = new MultiWordChunker("/ca/multiwords.txt", true, true, false); + private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, false,"NPCN000"); private final Disambiguator disambiguator; @Override diff --git a/languagetool-language-modules/de/src/main/java/org/languagetool/tagging/disambiguation/rules/de/GermanRuleDisambiguator.java b/languagetool-language-modules/de/src/main/java/org/languagetool/tagging/disambiguation/rules/de/GermanRuleDisambiguator.java index 213ded9ef6fc..b0ef4913a008 100644 --- 
a/languagetool-language-modules/de/src/main/java/org/languagetool/tagging/disambiguation/rules/de/GermanRuleDisambiguator.java +++ b/languagetool-language-modules/de/src/main/java/org/languagetool/tagging/disambiguation/rules/de/GermanRuleDisambiguator.java @@ -35,13 +35,13 @@ public class GermanRuleDisambiguator extends AbstractDisambiguator { private final Disambiguator disambiguator; private final MultiWordChunker multitokenSpeller = new MultiWordChunker( - "/de/multitoken-ignore.txt", false, false, MultiWordChunker.tagForNotAddingTags); + "/de/multitoken-ignore.txt", false, false, false, MultiWordChunker.tagForNotAddingTags); private final MultiWordChunker multitokenSpeller2 = new MultiWordChunker( - "/de/multitoken-suggest.txt", false, false, MultiWordChunker.tagForNotAddingTags); + "/de/multitoken-suggest.txt", false, false, false, MultiWordChunker.tagForNotAddingTags); private final MultiWordChunker multitokenSpeller3 = new MultiWordChunker( - "/spelling_global.txt", false, false, MultiWordChunker.tagForNotAddingTags); + "/spelling_global.txt", false, false, false, MultiWordChunker.tagForNotAddingTags); public GermanRuleDisambiguator(Language lang) { disambiguator = new XmlRuleDisambiguator(lang, true); diff --git a/languagetool-language-modules/en/src/main/java/org/languagetool/tagging/en/EnglishHybridDisambiguator.java b/languagetool-language-modules/en/src/main/java/org/languagetool/tagging/en/EnglishHybridDisambiguator.java index 83defe413ede..46a95b663eb1 100644 --- a/languagetool-language-modules/en/src/main/java/org/languagetool/tagging/en/EnglishHybridDisambiguator.java +++ b/languagetool-language-modules/en/src/main/java/org/languagetool/tagging/en/EnglishHybridDisambiguator.java @@ -42,8 +42,8 @@ */ public class EnglishHybridDisambiguator extends AbstractDisambiguator { - private final MultiWordChunker chunker = new MultiWordChunker("/en/multiwords.txt", true, true); - private final MultiWordChunker chunkerGlobal = new 
MultiWordChunker("/spelling_global.txt", true, true, MultiWordChunker.tagForNotAddingTags); + private final MultiWordChunker chunker = new MultiWordChunker("/en/multiwords.txt", true, true, false); + private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", true, true, false, MultiWordChunker.tagForNotAddingTags); private final Disambiguator disambiguator; public EnglishHybridDisambiguator(Language lang) { diff --git a/languagetool-language-modules/es/src/main/java/org/languagetool/tagging/disambiguation/es/SpanishHybridDisambiguator.java b/languagetool-language-modules/es/src/main/java/org/languagetool/tagging/disambiguation/es/SpanishHybridDisambiguator.java index dcc78559dc37..b5c96ec98f3f 100644 --- a/languagetool-language-modules/es/src/main/java/org/languagetool/tagging/disambiguation/es/SpanishHybridDisambiguator.java +++ b/languagetool-language-modules/es/src/main/java/org/languagetool/tagging/disambiguation/es/SpanishHybridDisambiguator.java @@ -42,8 +42,8 @@ */ public class SpanishHybridDisambiguator extends AbstractDisambiguator { - private final MultiWordChunker chunker = new MultiWordChunker("/es/multiwords.txt", true, true); - private final Disambiguator chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, "NPCN000"); + private final MultiWordChunker chunker = new MultiWordChunker("/es/multiwords.txt", true, true, false); + private final Disambiguator chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, false, "NPCN000"); private final Disambiguator disambiguator; public SpanishHybridDisambiguator(Language lang) { diff --git a/languagetool-language-modules/fr/src/main/java/org/languagetool/tagging/disambiguation/fr/FrenchHybridDisambiguator.java b/languagetool-language-modules/fr/src/main/java/org/languagetool/tagging/disambiguation/fr/FrenchHybridDisambiguator.java index 8d06cb2edb4f..dd0d3c317f90 100644 --- 
a/languagetool-language-modules/fr/src/main/java/org/languagetool/tagging/disambiguation/fr/FrenchHybridDisambiguator.java +++ b/languagetool-language-modules/fr/src/main/java/org/languagetool/tagging/disambiguation/fr/FrenchHybridDisambiguator.java @@ -40,7 +40,7 @@ public class FrenchHybridDisambiguator extends AbstractDisambiguator { - private final MultiWordChunker chunker = new MultiWordChunker("/fr/multiwords.txt", true, true); + private final MultiWordChunker chunker = new MultiWordChunker("/fr/multiwords.txt", true, true, false); private final Disambiguator disambiguator = new XmlRuleDisambiguator(new French(), true); public FrenchHybridDisambiguator() { diff --git a/languagetool-language-modules/pt/src/main/java/org/languagetool/tagging/disambiguation/pt/PortugueseHybridDisambiguator.java b/languagetool-language-modules/pt/src/main/java/org/languagetool/tagging/disambiguation/pt/PortugueseHybridDisambiguator.java index f4d06f67de9d..7357c52756c1 100644 --- a/languagetool-language-modules/pt/src/main/java/org/languagetool/tagging/disambiguation/pt/PortugueseHybridDisambiguator.java +++ b/languagetool-language-modules/pt/src/main/java/org/languagetool/tagging/disambiguation/pt/PortugueseHybridDisambiguator.java @@ -39,8 +39,8 @@ */ public class PortugueseHybridDisambiguator extends AbstractDisambiguator { - private final MultiWordChunker chunker = new MultiWordChunker("/pt/multiwords.txt", true, true); - private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, "NPCN000"); + private final MultiWordChunker chunker = new MultiWordChunker("/pt/multiwords.txt", true, true, true); + private final MultiWordChunker chunkerGlobal = new MultiWordChunker("/spelling_global.txt", false, true, true,"NPCN000"); private final Disambiguator disambiguator; public PortugueseHybridDisambiguator(Language lang) {