diff --git a/README.md b/README.md index 1d67646..08cfc0d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,33 @@ Wikitext format Korean corpus [한국어 위키의 덤프 데이터](https://dumps.wikimedia.org/kowiki/)를 바탕을 제작한 wikitext 형식의 텍스트 파일입니다. Corpus size - - train : 26827837 lines (877754 articles) - dev : 130568 lines (4433 articles) - test : 134478 lines (4433 articles) + +To fetch the data, run the script below. The three corpus files (train / dev / test) are then downloaded to ./data/ + +``` +python fetch.py +``` + +This corpus is licensed under CC-BY-SA 3.0, the same license as kowiki. For details, visit https://www.creativecommons.org/licenses/by-sa/3.0/ + +## Fetch and load using Korpora + +Korpora is a Korean corpora archive implemented in Python. The fetch / load functions for this corpus will be provided through Korpora. + +이 코퍼스는 Korpora 프로젝트에서 사용하는 기능을 제공할 예정입니다. + +```python +from Korpora import Korpora + +kowikitext = Korpora.load('kowikitext') + +# or +Korpora.fetch('kowikitext') +``` + +## License + +[CC-BY-SA 3.0](https://www.creativecommons.org/licenses/by-sa/3.0/), under which the [kowiki](https://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%A0%80%EC%9E%91%EA%B6%8C) dump dataset is licensed diff --git a/fetch.py b/fetch.py new file mode 100644 index 0000000..52454e2 --- /dev/null +++ b/fetch.py @@ -0,0 +1,3 @@ +from kowikitext import fetch_all + +fetch_all() diff --git a/kowikitext/__init__.py b/kowikitext/__init__.py new file mode 100644 index 0000000..375179a --- /dev/null +++ b/kowikitext/__init__.py @@ -0,0 +1,7 @@ +from .about import __author__ +from .about import __name__ +from .about import __license__ +from .about import __version__ + +from .utils import fetch +from .utils import fetch_all diff --git a/kowikitext/about.py b/kowikitext/about.py new file mode 100644 index 0000000..7a0b2a6 --- /dev/null +++ b/kowikitext/about.py @@ -0,0 +1,4 @@ +__author__ = 'lovit' +__name__ = 'kowikitext' +__license__ = 'CC-BY-SA 3.0' +__version__ = '20200920.v1' \ No newline at end of file diff 
"""Download and unpack the kowikitext corpus release archives."""
import os
import zipfile
from tqdm import tqdm
from urllib import request


# Names of the corpus splits that can be fetched.
_DATA_NAMES = ('train', 'dev', 'test')

# All corpus files live in the ``data`` directory next to the package.
data_root = os.path.abspath(os.path.dirname(__file__))
data_root = os.path.abspath(os.path.join(data_root, '../data/'))
# ``exist_ok`` avoids the check-then-create race of ``exists`` + ``makedirs``.
os.makedirs(data_root, exist_ok=True)

# Where each downloaded archive is stored.
local_zips = {
    'train': f'{data_root}/kowikitext_20200920.train.zip',
    'dev': f'{data_root}/kowikitext_20200920.dev.zip',
    'test': f'{data_root}/kowikitext_20200920.test.zip'
}

# Where each extracted text file is expected after unzipping.
# NOTE(review): assumes each archive extracts to a file named like the
# archive without the ``.zip`` suffix -- confirm against the release assets.
local_texts = {
    'train': f'{data_root}/kowikitext_20200920.train',
    'dev': f'{data_root}/kowikitext_20200920.dev',
    'test': f'{data_root}/kowikitext_20200920.test'
}

KOWIKITEXT_URLS = {
    'train': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.train.zip',
    'dev': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.dev.zip',
    'test': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.test.zip'
}


def fetch_all():
    """Fetch every corpus split (train, dev and test) in turn."""
    for data_name in _DATA_NAMES:
        fetch(data_name)


def fetch(data_name):
    """Make sure the text file of one corpus split exists under ``data_root``.

    The archive is downloaded only when it is not already on disk, and it is
    extracted only when the expected text file is missing.

    Args:
        data_name (str): One of ``'train'``, ``'dev'`` or ``'test'``.

    Returns:
        bool: ``True`` once the split has been found or extracted.

    Raises:
        ValueError: If ``data_name`` is not a known split name.
    """
    if data_name not in _DATA_NAMES:
        raise ValueError('Check `data_name` is one of ["train", "dev", "test"]')
    if os.path.exists(local_texts[data_name]):
        return True
    zippath = local_zips[data_name]
    if not os.path.exists(zippath):
        download(KOWIKITEXT_URLS[data_name], zippath, data_name)
    with zipfile.ZipFile(zippath, 'r') as zip_ref:
        zip_ref.extractall(data_root)
    print(f'unzip {data_name}')
    return True


def download(url, local_path, corpus_name):
    """Download ``url`` to ``local_path`` while showing a progress bar.

    The data is first written to ``<local_path>.part`` and renamed only after
    the transfer finishes, so an interrupted download never leaves a truncated
    file at ``local_path`` that a later ``fetch`` would treat as complete.

    Args:
        url (str): Address of the file to download.
        local_path (str): Destination path of the finished download.
        corpus_name (str): Label shown in the progress bar description.
    """
    filename = os.path.basename(local_path)
    partial_path = f'{local_path}.part'
    try:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=f'[{corpus_name}] download {filename}') as t:
            request.urlretrieve(url, filename=partial_path, reporthook=_reporthook(t))
        os.replace(partial_path, local_path)
    finally:
        # After a successful rename nothing is left to remove; after a
        # failure this discards the incomplete file.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def _reporthook(t):
    """Build a ``reporthook`` for ``urllib.request.urlretrieve``.

    The returned callable advances the ``tqdm`` progress bar ``t`` each time
    it is called.

    **Reference:**
        https://github.com/tqdm/tqdm

    Args:
        t (tqdm.tqdm): Progress bar to update.

    Returns:
        callable: Hook taking ``(b, bsize, tsize)``.

    Example:
        >>> with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:  # doctest: +SKIP
        ...   urllib.request.urlretrieve(file_url, filename=full_path, reporthook=_reporthook(t))
    """
    # One-element list so the closure can update the last seen block count.
    last_b = [0]

    def inner(b=1, bsize=1, tsize=None):
        """Advance the bar by the blocks transferred since the previous call.

        Args:
            b (int, optional): Number of blocks transferred so far [default: 1].
            bsize (int, optional): Size of each block (in tqdm units) [default: 1].
            tsize (int, optional): Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    return inner