Skip to content

Commit

Permalink
Merge pull request #1114 from hubtype/contentful/uk
Browse files Browse the repository at this point in the history
chore(nlp): Ukrainian tokenizer, stemmer and stopwords added
  • Loading branch information
elozano98 authored Nov 19, 2020
2 parents e057e23 + 3ea39ab commit 6a95efa
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 0 deletions.
8 changes: 8 additions & 0 deletions packages/botonic-plugin-contentful/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions packages/botonic-plugin-contentful/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"@nlpjs/lang-tr": "^4.14.0",
"@nlpjs/lang-el": "^4.14.0",
"@nlpjs/lang-cs": "^4.14.0",
"@nlpjs/lang-uk": "^4.14.0",
"@nlpjs/ner": "^4.14.0",
"@nlpjs/similarity": "^4.14.0",
"@types/joi": "^14.3.4",
Expand Down
2 changes: 2 additions & 0 deletions packages/botonic-plugin-contentful/src/nlp/locales.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ export const GERMAN = 'de'
export const ROMANIAN = 'ro'
export const GREEK = 'el'
export const CZECH = 'cs'
export const UKRAINIAN = 'uk'

export const SUPPORTED_LOCALES = [
SPANISH,
Expand All @@ -30,6 +31,7 @@ export const SUPPORTED_LOCALES = [
ROMANIAN,
GREEK,
CZECH,
UKRAINIAN,
]

export function checkLocale(locale: Locale): Locale {
Expand Down
2 changes: 2 additions & 0 deletions packages/botonic-plugin-contentful/src/nlp/stemmer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import StemmerPt from '@nlpjs/lang-pt/src/stemmer-pt'
import StemmerRo from '@nlpjs/lang-ro/src/stemmer-ro'
import StemmerRu from '@nlpjs/lang-ru/src/stemmer-ru'
import StemmerTr from '@nlpjs/lang-tr/src/stemmer-tr'
import StemmerUk from '@nlpjs/lang-uk/src/stemmer-uk'

import { Locale, rootLocale } from './locales'
import { StemmerPl } from './stemmers/polish-stemmer'
Expand All @@ -34,6 +35,7 @@ export const stemmers: { [key: string]: Stemmer } = {
ro: new StemmerRo(),
el: new StemmerEl(),
cs: new StemmerCs(),
uk: new StemmerUk(),
}

export function stemmerFor(locale: Locale): Stemmer {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Ukrainian stop words, copied verbatim (same words, same order) from the
// stopwords-iso project:
// https://github.com/stopwords-iso/stopwords-uk/blob/master/stopwords-uk.json
// prettier-ignore
export const ukDefaultStopWords = [
  'авжеж', 'адже', 'але', 'б', 'без', 'був', 'була', 'були', 'було',
  'бути', 'більш', 'вам', 'вас', 'весь', 'вздовж', 'ви', 'вниз', 'внизу',
  'вона', 'вони', 'воно', 'все', 'всередині', 'всіх', 'від', 'він', 'да',
  'давай', 'давати', 'де', 'дещо', 'для', 'до', 'з', 'завжди', 'замість',
  'й', 'коли', 'ледве', 'майже', 'ми', 'навколо', 'навіть', 'нам', 'от',
  'отже', 'отож', 'поза', 'про', 'під', 'та', 'так', 'такий', 'також',
  'те', 'ти', 'тобто', 'тож', 'тощо', 'хоча', 'це', 'цей', 'чи', 'чого',
  'що', 'як', 'який', 'якої', 'є', 'із', 'інших', 'їх', 'її',
]
4 changes: 4 additions & 0 deletions packages/botonic-plugin-contentful/src/nlp/tokens.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import TokenizerPt from '@nlpjs/lang-pt/src/tokenizer-pt'
import TokenizerRo from '@nlpjs/lang-ro/src/tokenizer-ro'
import TokenizerRu from '@nlpjs/lang-ru/src/tokenizer-ru'
import TokenizerTr from '@nlpjs/lang-tr/src/tokenizer-tr'
import TokenizerUk from '@nlpjs/lang-uk/src/tokenizer-uk'

import { Locale, rootLocale } from './locales'
import * as locales from './locales'
Expand All @@ -27,6 +28,7 @@ import { ptDefaultStopWords } from './stopwords/stopwords-pt'
import { roDefaultStopWords } from './stopwords/stopwords-ro'
import { ruDefaultStopWords } from './stopwords/stopwords-ru'
import { trDefaultStopWords } from './stopwords/stopwords-tr'
import { ukDefaultStopWords } from './stopwords/stopwords-uk'

export function countOccurrences(haystack: string, needle: string): number {
let n = 0
Expand Down Expand Up @@ -99,6 +101,7 @@ const tokenizers: { [locale: string]: Tokenizer } = {
[locales.ROMANIAN]: new TokenizerRo(),
[locales.GREEK]: new TokenizerEl(),
[locales.CZECH]: new TokenizerCs(),
[locales.UKRAINIAN]: new TokenizerUk(),
}

export function tokenizerPerLocale(locale: Locale): Tokenizer {
Expand Down Expand Up @@ -129,4 +132,5 @@ export const DEFAULT_STOP_WORDS: { [key: string]: string[] } = {
ro: roDefaultStopWords,
el: elDefaultStopWords,
cs: csDefaultStopWords,
uk: ukDefaultStopWords,
}
14 changes: 14 additions & 0 deletions packages/botonic-plugin-contentful/src/typings.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,20 @@ declare module '@nlpjs/lang-cs/src/stemmer-cs' {
export = StemmerCs
}

// Ambient module declarations for the deep '@nlpjs/lang-uk' imports used in
// stemmer.ts and tokens.ts (presumably because the package ships no .d.ts
// for its src/ files — same pattern as the other lang-* modules above).
// `export =` matches the CommonJS `module.exports = Class` shape, which is
// why the consumers use plain default-style imports.
declare module '@nlpjs/lang-uk/src/tokenizer-uk' {
  import { Tokenizer } from '@nlpjs/core/src'

  // Ukrainian tokenizer; exposes only the base Tokenizer interface.
  class TokenizerUk extends Tokenizer {}
  export = TokenizerUk
}

declare module '@nlpjs/lang-uk/src/stemmer-uk' {
  import { BaseStemmer } from '@nlpjs/core/src'

  // Ukrainian stemmer; exposes only the base BaseStemmer interface.
  class StemmerUk extends BaseStemmer {}
  export = StemmerUk
}

declare module 'sort-stream' {
function sort(func: (a: any, b: any) => number): any
export = sort
Expand Down
10 changes: 10 additions & 0 deletions packages/botonic-plugin-contentful/tests/nlp/normalizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,16 @@ test.each<any>([
new Word('koupit', 'koupit'),
],
],
[
'uk',
'я з тобою розмовляю',
[
new Word('я', 'я'),
Word.StopWord('з'),
new Word('тобою', 'тоб'),
new Word('розмовляю', 'розмовля'),
],
],
])(
'TEST: stemmer removes stopwords: lang=%s input="%j"',
(locale: string, raw: string, words: Word[]) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ test.each<any>([
['el', 'ομιλία', ['ομιλ']],
['cs', 'mluvící', ['mluvic']],
['cs', 'psaní', ['psan']],
['uk', 'розмовляючи', ['розмовляюч']],
])(
'TEST: stemmer removes final letters(%s) =>%j',
(locale: string, raw: string, expected: string) => {
Expand Down

0 comments on commit 6a95efa

Please sign in to comment.