diff --git a/packages/botonic-plugin-contentful/package-lock.json b/packages/botonic-plugin-contentful/package-lock.json index f8ed403875..56d1bea03c 100644 --- a/packages/botonic-plugin-contentful/package-lock.json +++ b/packages/botonic-plugin-contentful/package-lock.json @@ -265,6 +265,14 @@ "@nlpjs/core": "^4.14.0" } }, + "@nlpjs/lang-uk": { + "version": "4.14.0", + "resolved": "https://registry.npmjs.org/@nlpjs/lang-uk/-/lang-uk-4.14.0.tgz", + "integrity": "sha512-cq/REeQFOL9fUTWc51kcVUt3X+06Vam2xStwzOo1WJqgs+jKCAOzSvJkqpyjrYv1Ru5RDfJ+yK2C/2L7kNXxnw==", + "requires": { + "@nlpjs/core": "^4.14.0" + } + }, "@nlpjs/language-min": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/@nlpjs/language-min/-/language-min-4.6.0.tgz", diff --git a/packages/botonic-plugin-contentful/package.json b/packages/botonic-plugin-contentful/package.json index 95ad278ec8..323b16df83 100644 --- a/packages/botonic-plugin-contentful/package.json +++ b/packages/botonic-plugin-contentful/package.json @@ -47,6 +47,7 @@ "@nlpjs/lang-tr": "^4.14.0", "@nlpjs/lang-el": "^4.14.0", "@nlpjs/lang-cs": "^4.14.0", + "@nlpjs/lang-uk": "^4.14.0", "@nlpjs/ner": "^4.14.0", "@nlpjs/similarity": "^4.14.0", "@types/joi": "^14.3.4", diff --git a/packages/botonic-plugin-contentful/src/nlp/locales.ts b/packages/botonic-plugin-contentful/src/nlp/locales.ts index de44b513a9..905a3be5ab 100644 --- a/packages/botonic-plugin-contentful/src/nlp/locales.ts +++ b/packages/botonic-plugin-contentful/src/nlp/locales.ts @@ -15,6 +15,7 @@ export const GERMAN = 'de' export const ROMANIAN = 'ro' export const GREEK = 'el' export const CZECH = 'cs' +export const UKRAINIAN = 'uk' export const SUPPORTED_LOCALES = [ SPANISH, @@ -30,6 +31,7 @@ export const SUPPORTED_LOCALES = [ ROMANIAN, GREEK, CZECH, + UKRAINIAN, ] export function checkLocale(locale: Locale): Locale { diff --git a/packages/botonic-plugin-contentful/src/nlp/stemmer.ts b/packages/botonic-plugin-contentful/src/nlp/stemmer.ts index 6a4ea15927..ce239aaaba 100644 --- a/packages/botonic-plugin-contentful/src/nlp/stemmer.ts +++ b/packages/botonic-plugin-contentful/src/nlp/stemmer.ts @@ -11,6 +11,7 @@ import StemmerPt from '@nlpjs/lang-pt/src/stemmer-pt' import StemmerRo from '@nlpjs/lang-ro/src/stemmer-ro' import StemmerRu from '@nlpjs/lang-ru/src/stemmer-ru' import StemmerTr from '@nlpjs/lang-tr/src/stemmer-tr' +import StemmerUk from '@nlpjs/lang-uk/src/stemmer-uk' import { Locale, rootLocale } from './locales' import { StemmerPl } from './stemmers/polish-stemmer' @@ -34,6 +35,7 @@ export const stemmers: { [key: string]: Stemmer } = { ro: new StemmerRo(), el: new StemmerEl(), cs: new StemmerCs(), + uk: new StemmerUk(), } export function stemmerFor(locale: Locale): Stemmer { diff --git a/packages/botonic-plugin-contentful/src/nlp/stopwords/stopwords-uk.ts b/packages/botonic-plugin-contentful/src/nlp/stopwords/stopwords-uk.ts new file mode 100644 index 0000000000..9f788b685d --- /dev/null +++ b/packages/botonic-plugin-contentful/src/nlp/stopwords/stopwords-uk.ts @@ -0,0 +1,76 @@ +// from https://github.com/stopwords-iso/stopwords-uk/blob/master/stopwords-uk.json +export const ukDefaultStopWords = [ + 'авжеж', + 'адже', + 'але', + 'б', + 'без', + 'був', + 'була', + 'були', + 'було', + 'бути', + 'більш', + 'вам', + 'вас', + 'весь', + 'вздовж', + 'ви', + 'вниз', + 'внизу', + 'вона', + 'вони', + 'воно', + 'все', + 'всередині', + 'всіх', + 'від', + 'він', + 'да', + 'давай', + 'давати', + 'де', + 'дещо', + 'для', + 'до', + 'з', + 'завжди', + 'замість', + 'й', + 'коли', + 'ледве', + 'майже', + 'ми', + 'навколо', + 'навіть', + 'нам', + 'от', + 'отже', + 'отож', + 'поза', + 'про', + 'під', + 'та', + 'так', + 'такий', + 'також', + 'те', + 'ти', + 'тобто', + 'тож', + 'тощо', + 'хоча', + 'це', + 'цей', + 'чи', + 'чого', + 'що', + 'як', + 'який', + 'якої', + 'є', + 'із', + 'інших', + 'їх', + 'її', +] diff --git a/packages/botonic-plugin-contentful/src/nlp/tokens.ts b/packages/botonic-plugin-contentful/src/nlp/tokens.ts index b059dcb9fa..4216a9552a 100644 --- a/packages/botonic-plugin-contentful/src/nlp/tokens.ts +++ b/packages/botonic-plugin-contentful/src/nlp/tokens.ts @@ -11,6 +11,7 @@ import TokenizerPt from '@nlpjs/lang-pt/src/tokenizer-pt' import TokenizerRo from '@nlpjs/lang-ro/src/tokenizer-ro' import TokenizerRu from '@nlpjs/lang-ru/src/tokenizer-ru' import TokenizerTr from '@nlpjs/lang-tr/src/tokenizer-tr' +import TokenizerUk from '@nlpjs/lang-uk/src/tokenizer-uk' import { Locale, rootLocale } from './locales' import * as locales from './locales' @@ -27,6 +28,7 @@ import { ptDefaultStopWords } from './stopwords/stopwords-pt' import { roDefaultStopWords } from './stopwords/stopwords-ro' import { ruDefaultStopWords } from './stopwords/stopwords-ru' import { trDefaultStopWords } from './stopwords/stopwords-tr' +import { ukDefaultStopWords } from './stopwords/stopwords-uk' export function countOccurrences(haystack: string, needle: string): number { let n = 0 @@ -99,6 +101,7 @@ const tokenizers: { [locale: string]: Tokenizer } = { [locales.ROMANIAN]: new TokenizerRo(), [locales.GREEK]: new TokenizerEl(), [locales.CZECH]: new TokenizerCs(), + [locales.UKRAINIAN]: new TokenizerUk(), } export function tokenizerPerLocale(locale: Locale): Tokenizer { @@ -129,4 +132,5 @@ export const DEFAULT_STOP_WORDS: { [key: string]: string[] } = { ro: roDefaultStopWords, el: elDefaultStopWords, cs: csDefaultStopWords, + uk: ukDefaultStopWords, } diff --git a/packages/botonic-plugin-contentful/src/typings.d.ts b/packages/botonic-plugin-contentful/src/typings.d.ts index afeb1ee14a..0eb85a9eb9 100644 --- a/packages/botonic-plugin-contentful/src/typings.d.ts +++ b/packages/botonic-plugin-contentful/src/typings.d.ts @@ -222,6 +222,20 @@ declare module '@nlpjs/lang-cs/src/stemmer-cs' { export = StemmerCs } +declare module '@nlpjs/lang-uk/src/tokenizer-uk' { + import { Tokenizer } from '@nlpjs/core/src' + + class TokenizerUk extends Tokenizer {} + export = TokenizerUk +} + +declare module '@nlpjs/lang-uk/src/stemmer-uk' { + import { BaseStemmer } from '@nlpjs/core/src' + + class StemmerUk extends BaseStemmer {} + export = StemmerUk +} + declare module 'sort-stream' { function sort(func: (a: any, b: any) => number): any export = sort diff --git a/packages/botonic-plugin-contentful/tests/nlp/normalizer.test.ts b/packages/botonic-plugin-contentful/tests/nlp/normalizer.test.ts index 79806badf7..92d7784f18 100644 --- a/packages/botonic-plugin-contentful/tests/nlp/normalizer.test.ts +++ b/packages/botonic-plugin-contentful/tests/nlp/normalizer.test.ts @@ -89,6 +89,16 @@ test.each([ new Word('koupit', 'koupit'), ], ], + [ + 'uk', + 'я з тобою розмовляю', + [ + new Word('я', 'я'), + Word.StopWord('з'), + new Word('тобою', 'тоб'), + new Word('розмовляю', 'розмовля'), + ], + ], ])( 'TEST: stemmer removes stopwords: lang=%s input="%j"', (locale: string, raw: string, words: Word[]) => { diff --git a/packages/botonic-plugin-contentful/tests/nlp/stemmer.test.ts b/packages/botonic-plugin-contentful/tests/nlp/stemmer.test.ts index f7f1a047cb..8cc88569aa 100644 --- a/packages/botonic-plugin-contentful/tests/nlp/stemmer.test.ts +++ b/packages/botonic-plugin-contentful/tests/nlp/stemmer.test.ts @@ -28,6 +28,7 @@ test.each([ ['el', 'ομιλία', ['ομιλ']], ['cs', 'mluvící', ['mluvic']], ['cs', 'psaní', ['psan']], + ['uk', 'розмовляючи', ['розмовляюч']], ])( 'TEST: stemmer removes final letters(%s) =>%j', (locale: string, raw: string, expected: string) => {