Skip to content

Commit

Permalink
Merge pull request #138 from FerroEduardo/enhancements
Browse files Browse the repository at this point in the history
Fixes emoji module, create normaliser tests and add automatic test
  • Loading branch information
thalesbertaglia authored Feb 27, 2023
2 parents 17c5cf8 + 6147ce6 commit a18e3c9
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 11 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CI workflow: installs the package and runs the pytest suite with coverage
# once for each Python version listed in the matrix below.
name: test

# Trigger on pushes to master and on every pull request.
on:
  push:
    branches:
      - master
  pull_request:

jobs:
  build:
    # NOTE(review): presumably pinned to 20.04 because newer runner images
    # no longer provide Python 3.6 -- confirm before bumping.
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        # Versions are quoted so that YAML does not read 3.10 as the float 3.1.
        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .
          python -m pip install pytest pytest-cov
      - name: Test with pytest
        run: |
          pytest tests/ --cov=enelvo/
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ python:
- "3.6"
- "3.7"
- "3.8"
- "3.9"
- "3.10"

before_install:
- pip install poetry
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
<h4 align="center">A flexible normaliser for user-generated content in Portuguese.</h4>

<p align="center">
<a href="https://github.com/thalesbertaglia/enelvo/actions/workflows/tests.yaml"><img alt="tests" src="https://github.com/thalesbertaglia/enelvo/actions/workflows/tests.yaml/badge.svg"></a>
<a href="https://travis-ci.org/thalesbertaglia/enelvo"><img alt="Build Status" src="https://travis-ci.org/thalesbertaglia/enelvo.svg?branch=master"></a>
<a href='https://coveralls.io/github/thalesbertaglia/enelvo?branch=master'><img src='https://coveralls.io/repos/github/thalesbertaglia/enelvo/badge.svg?branch=master' alt='Coverage Status' /></a>
<a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
Expand Down
2 changes: 0 additions & 2 deletions enelvo/candidate_generation/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@

# Author: Thales Bertaglia <thalesbertaglia@gmail.com>

import gensim
import pickle
from enelvo import metrics
from enelvo import candidate_scoring
from enelvo import utils
from enelvo.candidate_generation import baselines


Expand Down
1 change: 0 additions & 1 deletion enelvo/candidate_scoring/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

# Author: Thales Bertaglia <thalesbertaglia@gmail.com>

import pickle
from enelvo import metrics


Expand Down
4 changes: 2 additions & 2 deletions enelvo/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Preprocessing methods."""
from .tokenizer import Tokenizer
from emoji import UNICODE_EMOJI
import emoji
import string
import os.path

Expand Down Expand Up @@ -70,7 +70,7 @@ def sanitize(text, as_string=False):
clean = [
w.strip()
for w in tokens
if w not in emoticons and w not in UNICODE_EMOJI and len(w) != 0
if w not in emoticons and emoji.emoji_count(w) == 0 and len(w) != 0
]
return clean if not as_string else " ".join(clean)

Expand Down
5 changes: 2 additions & 3 deletions enelvo/preprocessing/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
# Adapted to Portuguese by Thales Bertaglia <thalesbertaglia@gmail.com>

import re
import string
from os import path
from emoji import UNICODE_EMOJI
import emoji

from html.entities import name2codepoint

Expand Down Expand Up @@ -72,7 +71,7 @@ def _isemoji(s):
len(s) == len("\U0001f4a9")
and any(l <= s <= u for l, u in emoji_ranges)
or s in emoji_flags
or s in UNICODE_EMOJI
or emoji.emoji_count(s) > 0
)


Expand Down
2 changes: 0 additions & 2 deletions enelvo/utils/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

# Author: Thales Bertaglia <thalesbertaglia@gmail.com>

from tabulate import tabulate


def evaluate_candidate_generation(list_corrections, list_candidates):
"""Returns the recall (in %) of candidate generation methods.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ editdistance = ">=0.6.0"
numpy = ">=1.19.5"
gensim = ">=4.1.2"
tabulate = ">=0.8.9"
emoji = ">=1.6.3"
emoji = "2.2.0"

[tool.poetry.dev-dependencies]
coveralls = ">=3.3.1"
Expand Down
58 changes: 58 additions & 0 deletions tests/test_normaliser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from enelvo import normaliser

def test_raw():
    """Check a default-constructed Normaliser against fixed input/output pairs.

    Per the cases below, the expected output is lowercased, keeps the emoji
    and shows each hashtag as the word 'hashtag'.
    """
    norm = normaliser.Normaliser()
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'hashtag > hashtag'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

def test_sanitize():
    """Check Normaliser(sanitize=True) against fixed input/output pairs.

    Per the cases below, the expected output no longer contains the emoji
    or the '>' symbol.
    """
    norm = normaliser.Normaliser(sanitize=True)
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'hashtag hashtag'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

def test_capitalize_pns():
    """Check Normaliser(capitalize_pns=True) against fixed input/output pairs.

    Per the cases below, 'Eduardo' is expected with its capital letter and
    the hashtag placeholders are expected as 'Hashtag'.
    """
    norm = normaliser.Normaliser(capitalize_pns=True)
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do Eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'Hashtag > Hashtag'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

def test_capitalize_inis():
    """Check Normaliser(capitalize_inis=True) against fixed input/output pairs.

    Per the cases below, only the first word of each expected output starts
    with a capital letter.
    """
    norm = normaliser.Normaliser(capitalize_inis=True)
    cases = [
        ('Que dia lindo', 'Que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'Hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'Vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'Domingo é dia de paredão no bbb'),
        ('#python > #javascript', 'Hashtag > hashtag'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

def test_capitalize_acs():
    """Check Normaliser(capitalize_acs=True) against fixed input/output pairs.

    Per the cases below, 'BBB' is expected in upper case while the rest of
    the output is lowercased.
    """
    norm = normaliser.Normaliser(capitalize_acs=True)
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no BBB'),
        ('#python > #javascript', 'hashtag > hashtag'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

def test_readable_tokenizer():
    """Check Normaliser(tokenizer='readable') against fixed input/output pairs.

    Per the cases below, hashtags are expected to come back as written
    ('#python', '#javascript') rather than as a placeholder word.
    """
    norm = normaliser.Normaliser(tokenizer='readable')
    cases = [
        ('Que dia lindo', 'que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'hoje eu vou dar uma voltinha 😀'),
        ('Vou dar uma passada na casa do Eduardo', 'vou dar uma passada na casa do eduardo'),
        ('Domingo é dia de paredão no BBB', 'domingo é dia de paredão no bbb'),
        ('#python > #javascript', '#python > #javascript'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

def test_all():
    """Check a Normaliser with every option enabled at once.

    Combines sanitize, the three capitalisation flags and the 'readable'
    tokenizer, then compares the output with the fixed expectations below.
    """
    norm = normaliser.Normaliser(
        sanitize=True,
        capitalize_pns=True,
        capitalize_inis=True,
        capitalize_acs=True,
        tokenizer='readable',
    )
    cases = [
        ('Que dia lindo', 'Que dia lindo'),
        ('Hoje eu vou dar uma voltinha 😀', 'Hoje eu vou dar uma voltinha'),
        ('Vou dar uma passada na casa do Eduardo', 'Vou dar uma passada na casa do Eduardo'),
        ('Domingo é dia de paredão no BBB', 'Domingo é dia de paredão no BBB'),
        ('#python > #javascript', 'python javascript'),
    ]
    # Cases run in order; the first mismatch fails the test.
    for sentence, expected in cases:
        assert norm.normalise(sentence) == expected

0 comments on commit a18e3c9

Please sign in to comment.