Skip to content

Commit

Permalink
Merge pull request #185 from bab2min/dev/fix_space
Browse files Browse the repository at this point in the history
`Kiwi.space()` 개선
  • Loading branch information
bab2min authored Oct 28, 2024
2 parents 68a1d1b + 0708b3e commit 38a64d4
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 25 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
/opt/python/cp38-cp38/bin/python -m pip install "readme-renderer==41.0" "cryptography<38" "twine<4" wheel numpy==`/opt/python/cp38-cp38/bin/python .github/workflows/numpy_version.py`
/opt/python/cp38-cp38/bin/python setup.py sdist
/opt/python/cp38-cp38/bin/python -m twine upload dist/*.tar.gz
for cp in cp37-cp37m cp38-cp38
for cp in cp38-cp38
do
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
USE_MIMALLOC=1 /opt/python/${cp}/bin/python setup.py build bdist_wheel
Expand Down Expand Up @@ -89,7 +89,7 @@ jobs:
yum install libffi-devel -y
/opt/python/cp311-cp311/bin/python -m pip install --upgrade pip setuptools
/opt/python/cp311-cp311/bin/python -m pip install twine wheel numpy==`/opt/python/cp311-cp311/bin/python .github/workflows/numpy_version.py`
for cp in cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312
for cp in cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313
do
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
USE_MIMALLOC=1 /opt/python/${cp}/bin/python setup.py build bdist_wheel
Expand All @@ -103,7 +103,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
python-version: [3.8, 3.9, "3.10", 3.11, 3.12, 3.13]

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -131,7 +131,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.7, 3.8, 3.9, "3.10", 3.11, 3.12]
python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
architecture: [x86, x64]

steps:
Expand Down Expand Up @@ -181,7 +181,7 @@ jobs:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
for cp in cp37-cp37m cp38-cp38 cp39-cp39
for cp in cp38-cp38 cp39-cp39
do
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
USE_MIMALLOC=1 /opt/python/${cp}/bin/python setup.py build bdist_wheel
Expand Down Expand Up @@ -223,7 +223,7 @@ jobs:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
for cp in cp310-cp310 cp311-cp311 cp312-cp312
for cp in cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313
do
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
USE_MIMALLOC=1 /opt/python/${cp}/bin/python setup.py build bdist_wheel
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/deploy_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
/opt/python/cp38-cp38/bin/python -m pip install "readme-renderer==41.0" "cryptography<38" "twine<4" wheel numpy==`/opt/python/cp38-cp38/bin/python .github/workflows/numpy_version.py`
/opt/python/cp38-cp38/bin/python setup.py sdist
/opt/python/cp38-cp38/bin/python -m twine upload --repository testpypi dist/*.tar.gz
for cp in cp37-cp37m cp38-cp38
for cp in cp38-cp38
do
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
USE_MIMALLOC=1 /opt/python/${cp}/bin/python setup.py build bdist_wheel
Expand Down Expand Up @@ -86,7 +86,7 @@ jobs:
yum install libffi-devel -y
/opt/python/cp311-cp311/bin/python -m pip install --upgrade pip setuptools
/opt/python/cp311-cp311/bin/python -m pip install twine wheel numpy==`/opt/python/cp311-cp311/bin/python .github/workflows/numpy_version.py`
for cp in cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312
for cp in cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313
do
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
USE_MIMALLOC=1 /opt/python/${cp}/bin/python setup.py build bdist_wheel
Expand All @@ -100,7 +100,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
python-version: [3.8, 3.9, "3.10", 3.11, 3.12, 3.13]

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -128,7 +128,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.7, 3.8, 3.9, "3.10", 3.11, 3.12]
python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
architecture: [x86, x64]

steps:
Expand Down Expand Up @@ -178,7 +178,7 @@ jobs:
TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
run: |
for cp in cp37-cp37m cp38-cp38 cp39-cp39
for cp in cp38-cp38 cp39-cp39
do
/opt/python/${cp}/bin/python -m pip install --upgrade pip setuptools
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
Expand Down Expand Up @@ -221,7 +221,7 @@ jobs:
TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
run: |
for cp in cp310-cp310 cp311-cp311 cp312-cp312
for cp in cp310-cp310 cp311-cp311 cp312-cp312 cp313-cp313
do
/opt/python/${cp}/bin/python -m pip install --upgrade pip setuptools
/opt/python/${cp}/bin/python -m pip install wheel numpy==`/opt/python/${cp}/bin/python .github/workflows/numpy_version.py`
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/numpy_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

def get_old_numpy_version(use_v1=False):
py_version = sys.version_info
if not use_v1 and py_version >= (3, 9):
return '2.0.0'
if not use_v1:
if py_version >= (3, 10): return '2.1.*'
if py_version >= (3, 9): return '2.0.*'
if py_version >= (3, 12): return '1.26.0'
if py_version >= (3, 11): return '1.24.0'
if py_version >= (3, 10): return '1.22.0'
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/pull_request_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
mv Kiwi/models/base/combiningRule.txt model/kiwipiepy_model/
mv Kiwi/models/base/skipbigram.mdl model/kiwipiepy_model/
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py`
/opt/python/${{ matrix.cp }}/bin/python -m pip install setuptools numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py`
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
Expand Down Expand Up @@ -66,7 +66,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
cp: [cp39-cp39, cp310-cp310, cp311-cp311, cp312-cp312]
cp: [cp39-cp39, cp310-cp310, cp311-cp311, cp312-cp312, cp313-cp313]

steps:
- uses: actions/checkout@v3
Expand All @@ -87,13 +87,13 @@ jobs:
mv Kiwi/models/base/combiningRule.txt model/kiwipiepy_model/
mv Kiwi/models/base/skipbigram.mdl model/kiwipiepy_model/
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py`
/opt/python/${{ matrix.cp }}/bin/python -m pip install setuptools numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py`
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
cd ..
/opt/python/${{ matrix.cp }}/bin/python setup.py build
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1`
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true
/opt/python/${{ matrix.cp }}/bin/python setup.py install
- name: Test kiwipiepy
run: |
Expand Down Expand Up @@ -121,7 +121,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.8, 3.9, "3.10", 3.11, 3.12]
python-version: [3.8, 3.9, "3.10", 3.11, 3.12, 3.13]

steps:
- uses: actions/checkout@v2
Expand All @@ -148,7 +148,7 @@ jobs:
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=arm64 USE_MIMALLOC=1 python setup.py build
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py build
python -m pip install numpy==`python .github/workflows/numpy_version.py v1`
python -m pip install numpy==`python .github/workflows/numpy_version.py v1` || true
MACOSX_DEPLOYMENT_TARGET=10.14 KIWI_CPU_ARCH=x86_64 USE_MIMALLOC=1 python setup.py install
- name: Archive binary
uses: actions/upload-artifact@v3
Expand Down Expand Up @@ -195,7 +195,7 @@ jobs:
python setup.py build install
cd ..
$env:USE_MIMALLOC = 1; python setup.py build
python -m pip install numpy==$(python .github/workflows/numpy_version.py v1)
Try { python -m pip install numpy==$(python .github/workflows/numpy_version.py v1) } Catch {}
$env:USE_MIMALLOC = 1; python setup.py install
- name: Archive binary
uses: actions/upload-artifact@v3
Expand All @@ -214,7 +214,7 @@ jobs:
strategy:
max-parallel: 8
matrix:
cp: [cp39-cp39, cp310-cp310, cp311-cp311, cp312-cp312]
cp: [cp39-cp39, cp310-cp310, cp311-cp311, cp312-cp312, cp313-cp313]
arch: [aarch64]

steps:
Expand All @@ -238,13 +238,13 @@ jobs:
mv Kiwi/models/base/combiningRule.txt model/kiwipiepy_model/
mv Kiwi/models/base/skipbigram.mdl model/kiwipiepy_model/
/opt/python/${{ matrix.cp }}/bin/python -m pip install wheel numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py`
/opt/python/${{ matrix.cp }}/bin/python -m pip install wheel setuptools numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py`
cd model
/opt/python/${{ matrix.cp }}/bin/python setup.py build install
cd ..
/opt/python/${{ matrix.cp }}/bin/python setup.py build
/opt/python/${{ matrix.cp }}/bin/python -m pip install wheel numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1`
/opt/python/${{ matrix.cp }}/bin/python -m pip install numpy==`/opt/python/${{ matrix.cp }}/bin/python .github/workflows/numpy_version.py v1` || true
/opt/python/${{ matrix.cp }}/bin/python setup.py install bdist_wheel
tar -zcvf /artifacts/build.tgz build/*
Expand Down
6 changes: 5 additions & 1 deletion kiwipiepy/_wrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,7 @@ def _space(arg):
if last < t.start:
if (t.tag.startswith('E') or t.tag.startswith('J') or t.tag.startswith('XS')
or t.tag == 'VX' and t.form in '하지'
or prev_tag == 'SN' and t.tag == 'NNB'
):
s = any_ws.sub('', raw[last:t.start])
else:
Expand All @@ -1726,7 +1727,10 @@ def _space(arg):
# 이전에 공백이 없는 경우만 삽입
chunks.append(' ')
if last < t.end:
s = any_ws.sub('', raw[last:t.end])
if t.tag.startswith('NN'):
s = t.form
else:
s = any_ws.sub('', raw[last:t.end])
if s: chunks.append(s)
last = t.end
prev_tag = t.tag
Expand Down
13 changes: 13 additions & 0 deletions test/test_kiwipiepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,19 @@ def test_space():
]))
assert res_a == [res0, res1, res2]

def test_space_with_multiword_token():
    """Check Kiwi.space() output when a user word itself contains a space.

    The user word '구미 1동' is registered with tag 'NNP'; both the spaced
    and the unspaced spelling of the input are expected to come back as
    '구미 1동'.
    """
    expected = '구미 1동'
    analyzer = Kiwi()
    analyzer.add_user_word('구미 1동', 'NNP', 10)

    # Same order as the original assertions: spaced input first, then unspaced.
    for text in ('구미 1동', '구미1동'):
        assert analyzer.space(text) == expected

def test_space_of_sn_nnb():
    """Check Kiwi.space() on a number + unit sequence ('3시 30분 45초').

    Whether every piece of the input is separated by a space or the input
    has no spaces at all, the expected result is '3시 30분 45초'.
    """
    expected = '3시 30분 45초'
    analyzer = Kiwi()

    # Input with a space between every number and the word that follows it.
    over_spaced = analyzer.space('3 시 30 분 45 초')
    assert over_spaced == expected

    # Input with no spaces at all.
    unspaced = analyzer.space('3시30분45초')
    assert unspaced == expected

def test_glue():
chunks = """KorQuAD 2.0은 총 100,000+ 쌍으로 구성된 한국어 질의응답 데이터셋이다. 기존 질의응답 표준 데이
터인 KorQuAD 1.0과의 차이점은 크게 세가지가 있는데 첫 번째는 주어지는 지문이 한두 문단이 아닌 위
Expand Down

0 comments on commit 38a64d4

Please sign in to comment.