Skip to content

Commit

Permalink
Implement fetch_all (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
lovit committed Oct 3, 2020
1 parent 0a3b2f6 commit 6a352cd
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 1 deletion.
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,33 @@ Wikitext format Korean corpus
[한국어 위키의 덤프 데이터](https://dumps.wikimedia.org/kowiki/)를 바탕으로 제작한 wikitext 형식의 텍스트 파일입니다.

Corpus size

- train : 26827837 lines (877754 articles)
- dev : 130568 lines (4433 articles)
- test : 134478 lines (4433 articles)

To fetch the data, run the script below. The three corpus files (train / dev / test) are then downloaded to ./data/

```
python fetch.py
```

This corpus is licensed under CC-BY-SA 3.0, the same license as kowiki. For details, visit https://www.creativecommons.org/licenses/by-sa/3.0/

## Fetch and load using Korpora

Korpora is an archive of Korean corpora, implemented in Python. We will provide fetch / load functions for this corpus in Korpora.

이 코퍼스는 Korpora 프로젝트에서 사용하는 기능을 제공할 예정입니다.

```python
from Korpora import Korpora

kowikitext = Korpora.load('kowikitext')

# or
Korpora.fetch('kowikitext')
```

## License

[CC-BY-SA 3.0](https://www.creativecommons.org/licenses/by-sa/3.0/), the license under which the [kowiki](https://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%A0%80%EC%9E%91%EA%B6%8C) dump dataset is released.
3 changes: 3 additions & 0 deletions fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Command-line entry point: `python fetch.py` fetches the train / dev / test splits.
from kowikitext import fetch_all


if __name__ == '__main__':
    # Guarded so that merely importing this module never starts a download.
    fetch_all()
7 changes: 7 additions & 0 deletions kowikitext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .about import __author__
from .about import __name__
from .about import __license__
from .about import __version__

from .utils import fetch
from .utils import fetch_all
4 changes: 4 additions & 0 deletions kowikitext/about.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Package metadata; kowikitext/__init__.py imports each of these names.
__author__ = 'lovit'
# NOTE(review): assigning `__name__` replaces this module's own `__name__` attribute — confirm this is intended.
__name__ = 'kowikitext'
__license__ = 'CC-BY-SA 3.0'
# Same string as the release tag that appears in the corpus download URLs.
__version__ = '20200920.v1'
80 changes: 80 additions & 0 deletions kowikitext/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
import zipfile
from tqdm import tqdm
from urllib import request


# Corpus files are stored in a `data/` directory that sits beside the package directory.
data_root = os.path.abspath(os.path.dirname(__file__))
data_root = os.path.abspath(os.path.join(data_root, '../data/'))
# `exist_ok=True` replaces the exists()-then-makedirs() pair, which could raise
# FileExistsError if another process created the directory between the two calls.
os.makedirs(data_root, exist_ok=True)

# Local path of the downloaded zip archive for each split, keyed by split name.
local_zips = {
    split: f'{data_root}/kowikitext_20200920.{split}.zip'
    for split in ('train', 'dev', 'test')
}

# Local path of the extracted text file for each split; `fetch` checks these to
# decide whether a split is already available.
# The date stamp now matches the 20200920 archives (it previously read 20200302),
# and the stray trailing space in the 'test' path is removed; either mismatch made
# the existence check fail on every call.
# NOTE(review): assumes each archive unpacks to `kowikitext_20200920.<split>` — confirm against the release assets.
local_texts = {
    'train': f'{data_root}/kowikitext_20200920.train',
    'dev': f'{data_root}/kowikitext_20200920.dev',
    'test': f'{data_root}/kowikitext_20200920.test',
}

# Download URL of the zip archive for each split, keyed by split name.
KOWIKITEXT_URLS = {
    split: f'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.{split}.zip'
    for split in ('train', 'dev', 'test')
}


def fetch_all():
    """Fetch the train, dev and test splits, in that order, by calling ``fetch`` on each."""
    for split in ('train', 'dev', 'test'):
        fetch(split)


def fetch(data_name):
    """Download and unpack one split of the corpus unless its text file already exists.

    Args:
        data_name (str): Split to fetch; one of 'train', 'dev' or 'test'.

    Returns:
        bool: True when the split's text file already exists, or after its
            archive has been extracted into ``data_root``.

    Raises:
        ValueError: If ``data_name`` is not one of the known splits.
    """
    if data_name not in ('train', 'dev', 'test'):
        raise ValueError('Check `data_name` is one of ["train", "dev", "test"]')
    if os.path.exists(local_texts[data_name]):
        return True
    zippath = local_zips[data_name]
    # Reuse an archive left by an earlier run instead of downloading it again.
    if not os.path.exists(zippath):
        download(KOWIKITEXT_URLS[data_name], zippath, data_name)
    with zipfile.ZipFile(zippath, 'r') as zip_ref:
        zip_ref.extractall(data_root)
    print(f'unzip {data_name}')
    # Return True here as well, so both the cached and the freshly extracted
    # paths give the same result (this path used to fall through to None).
    return True


def download(url, local_path, corpus_name):
    """Download ``url`` to ``local_path`` while showing a tqdm progress bar.

    The data is written to ``<local_path>.part`` first and moved into place only
    after the transfer finishes, so an interrupted download never leaves a
    truncated file at ``local_path``.

    Args:
        url (str): Address of the file to download.
        local_path (str): Destination path of the downloaded file.
        corpus_name (str): Label shown in the progress-bar description.
    """
    filename = os.path.basename(local_path)
    partial_path = f'{local_path}.part'
    try:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=f'[{corpus_name}] download {filename}') as t:
            request.urlretrieve(url, filename=partial_path, reporthook=_reporthook(t))
    except BaseException:
        # Also covers KeyboardInterrupt: remove the incomplete file, then re-raise unchanged.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    os.replace(partial_path, local_path)


def _reporthook(t):
""" ``reporthook`` to use with ``urllib.request`` that prints the process of the download.
Uses ``tqdm`` for progress bar.
**Reference:**
https://github.com/tqdm/tqdm
Args:
t (tqdm.tqdm) Progress bar.
Example:
>>> with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t: # doctest: +SKIP
... urllib.request.urlretrieve(file_url, filename=full_path, reporthook=reporthook(t))
"""
last_b = [0]

def inner(b=1, bsize=1, tsize=None):
"""
Args:
b (int, optional): Number of blocks just transferred [default: 1].
bsize (int, optional): Size of each block (in tqdm units) [default: 1].
tsize (int, optional): Total size (in tqdm units). If [default: None] remains unchanged.
"""
if tsize is not None:
t.total = tsize
t.update((b - last_b[0]) * bsize)
last_b[0] = b

return inner

0 comments on commit 6a352cd

Please sign in to comment.