Implement fetch_all (#4)

lovit · Oct 3, 2020 · 6a352cd · 6a352cd
1 parent 0a3b2f6
commit 6a352cd
Show file tree

Hide file tree

Showing 5 changed files with 121 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -5,7 +5,33 @@ Wikitext format Korean corpus
 [한국어 위키의 덤프 데이터](https://dumps.wikimedia.org/kowiki/)를 바탕을 제작한 wikitext 형식의 텍스트 파일입니다.
 
 Corpus size
-
 - train : 26827837 lines (877754 articles)
 - dev : 130568 lines (4433 articles)
 - test : 134478 lines (4433 articles)
+
+To fetch the data, run the script below. The three corpus files (train / dev / test) are then downloaded to ./data/
+
+```
+python fetch.py
+```
+
+This corpus is licensed under CC-BY-SA 3.0, the same license as kowiki. For details, visit https://www.creativecommons.org/licenses/by-sa/3.0/
+
+## Fetch and load using Korpora
+
+Korpora is a Korean corpora archive implemented in Python. We will provide fetch / load functions for this corpus in Korpora.
+
+이 코퍼스는 Korpora 프로젝트에서 사용하는 기능을 제공할 예정입니다.
+
+```python
+from Korpora import Korpora
+
+kowikitext = Korpora.load('kowikitext')
+
+# or
+Korpora.fetch('kowikitext')
+```
+
+## License
+
+[CC-BY-SA 3.0](https://www.creativecommons.org/licenses/by-sa/3.0/), the license under which the [kowiki](https://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%A0%80%EC%9E%91%EA%B6%8C) dump dataset is distributed.
diff --git a/fetch.py b/fetch.py
@@ -0,0 +1,3 @@
+# Script entry point: running `python fetch.py` fetches every corpus split.
+from kowikitext import fetch_all
+
+# Fetches the 'train', 'dev' and 'test' splits one after another.
+fetch_all()
diff --git a/kowikitext/__init__.py b/kowikitext/__init__.py
@@ -0,0 +1,7 @@
+from .about import __author__
+from .about import __name__
+from .about import __license__
+from .about import __version__
+
+from .utils import fetch
+from .utils import fetch_all
diff --git a/kowikitext/about.py b/kowikitext/about.py
@@ -0,0 +1,4 @@
+# Package metadata; kowikitext/__init__.py imports each of these names.
+__author__ = 'lovit'
+# NOTE(review): this assignment replaces the module's own `__name__` value — confirm that is intended.
+__name__ = 'kowikitext'
+__license__ = 'CC-BY-SA 3.0'
+# Same string as the release tag that appears in the corpus download URLs.
+__version__ = '20200920.v1'
diff --git a/kowikitext/utils.py b/kowikitext/utils.py
@@ -0,0 +1,80 @@
+import os
+import zipfile
+from tqdm import tqdm
+from urllib import request
+
+
+# Directory holding the downloaded archives and the extracted corpus files:
+# the `data` directory that sits next to this package's directory.
+data_root = os.path.abspath(os.path.dirname(__file__))
+data_root = os.path.abspath(os.path.join(data_root, '../data/'))
+# `exist_ok=True` creates the directory only when missing, without a separate
+# existence check that could race with another process.
+os.makedirs(data_root, exist_ok=True)
+
+# Local path of the zip archive for each split.
+local_zips = {
+    'train': f'{data_root}/kowikitext_20200920.train.zip',
+    'dev': f'{data_root}/kowikitext_20200920.dev.zip',
+    'test': f'{data_root}/kowikitext_20200920.test.zip'
+}
+
+# Local path of the extracted text file for each split; `fetch` uses these to
+# decide whether a split is already available.
+# NOTE(review): assumes each archive extracts to a file named like the zip
+# without its `.zip` suffix — confirm against the release assets.
+local_texts = {
+    'train': f'{data_root}/kowikitext_20200920.train',
+    'dev': f'{data_root}/kowikitext_20200920.dev',
+    'test': f'{data_root}/kowikitext_20200920.test'
+}
+
+# Download URL of the zip archive for each split.
+KOWIKITEXT_URLS = {
+    'train': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.train.zip',
+    'dev': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.dev.zip',
+    'test': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.test.zip'
+}
+
+
+def fetch_all():
+    """Fetch the 'train', 'dev' and 'test' splits, in that order."""
+    for split in ('train', 'dev', 'test'):
+        fetch(split)
+
+
+def fetch(data_name):
+    """Download and extract one corpus split unless its text file already exists.
+
+    Args:
+        data_name (str): One of 'train', 'dev' or 'test'.
+
+    Returns:
+        bool: True when the text file was already present, or after the
+        archive has been extracted into `data_root`.
+
+    Raises:
+        ValueError: If `data_name` is not one of the known splits.
+    """
+    if data_name not in ['train', 'dev', 'test']:
+        raise ValueError('Check `data_name` is one of ["train", "dev", "test"]')
+    if os.path.exists(local_texts[data_name]):
+        return True
+    zippath = local_zips[data_name]
+    if not os.path.exists(zippath):
+        try:
+            download(KOWIKITEXT_URLS[data_name], zippath, data_name)
+        except BaseException:
+            # An interrupted download (including Ctrl-C) would leave a
+            # truncated archive that later runs mistake for a complete one.
+            if os.path.exists(zippath):
+                os.remove(zippath)
+            raise
+    # Extract regardless of whether the archive was just downloaded or was
+    # already on disk; otherwise an existing zip would never be unpacked.
+    with zipfile.ZipFile(zippath, 'r') as zip_ref:
+        zip_ref.extractall(data_root)
+    print(f'unzip {data_name}')
+    return True
+
+
+def download(url, local_path, corpus_name):
+    """Retrieve `url` into `local_path` while showing a tqdm progress bar.
+
+    Args:
+        url (str): Address of the file to retrieve.
+        local_path (str): Destination path; its base name appears in the bar's description.
+        corpus_name (str): Label shown in brackets at the start of the bar's description.
+    """
+    label = f'[{corpus_name}] download {os.path.basename(local_path)}'
+    with tqdm(unit='B', unit_scale=True, miniters=1, desc=label) as progress:
+        hook = _reporthook(progress)
+        request.urlretrieve(url, filename=local_path, reporthook=hook)
+
+
+def _reporthook(t):
+    """Build a ``reporthook`` callback for ``urllib.request`` that drives a ``tqdm`` bar.
+
+    **Reference:**
+    https://github.com/tqdm/tqdm
+
+    Args:
+        t (tqdm.tqdm): Progress bar to update.
+
+    Returns:
+        callable: Hook accepting ``(b, bsize, tsize)`` that advances ``t`` by the
+        difference between ``b`` and the ``b`` of its previous call, times ``bsize``.
+
+    Example:
+        >>> with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:  # doctest: +SKIP
+        ...   urllib.request.urlretrieve(file_url, filename=full_path, reporthook=_reporthook(t))
+    """
+    previous_blocks = 0
+
+    def inner(b=1, bsize=1, tsize=None):
+        """Advance the bar for one progress report.
+
+        Args:
+            b (int, optional): Block count reported by the caller [default: 1].
+            bsize (int, optional): Size of each block (in tqdm units) [default: 1].
+            tsize (int, optional): Total size (in tqdm units). If None [default],
+                the bar's total is left unchanged.
+        """
+        nonlocal previous_blocks
+        if tsize is not None:
+            t.total = tsize
+        newly_transferred = b - previous_blocks
+        t.update(newly_transferred * bsize)
+        # Remember this report so the next call only adds the difference.
+        previous_blocks = b
+
+    return inner