-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
121 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""Command-line entry point: download and extract every split of the corpus."""
from kowikitext import fetch_all


# Guard the call so that merely importing this module does not start a download.
if __name__ == '__main__':
    fetch_all()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from .about import __author__ | ||
from .about import __name__ | ||
from .about import __license__ | ||
from .about import __version__ | ||
|
||
from .utils import fetch | ||
from .utils import fetch_all |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Package metadata; the package ``__init__`` imports each of these names.
__author__ = "lovit"
__name__ = "kowikitext"
__license__ = "CC-BY-SA 3.0"
__version__ = "20200920.v1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import os | ||
import zipfile | ||
from tqdm import tqdm | ||
from urllib import request | ||
|
||
|
||
# Directory holding the downloaded archives and the extracted text files:
# the ``data`` folder one level above the directory containing this module.
data_root = os.path.abspath(os.path.dirname(__file__))
data_root = os.path.abspath(os.path.join(data_root, '../data/'))
# ``exist_ok`` removes the window between an existence check and the creation.
os.makedirs(data_root, exist_ok=True)

# Local path of the downloaded zip archive for each split.
local_zips = {
    'train': f'{data_root}/kowikitext_20200920.train.zip',
    'dev': f'{data_root}/kowikitext_20200920.dev.zip',
    'test': f'{data_root}/kowikitext_20200920.test.zip',
}

# Local path of the extracted text file for each split; ``fetch`` tests these
# paths to decide whether a split is already available.
# NOTE(review): assumes each archive extracts to a file named like the archive
# without its ``.zip`` suffix -- confirm against the release contents.
local_texts = {
    'train': f'{data_root}/kowikitext_20200920.train',
    'dev': f'{data_root}/kowikitext_20200920.dev',
    'test': f'{data_root}/kowikitext_20200920.test',
}

# Download URL of the zip archive for each split.
KOWIKITEXT_URLS = {
    'train': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.train.zip',
    'dev': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.dev.zip',
    'test': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.test.zip',
}
|
||
|
||
def fetch_all():
    """Fetch the 'train', 'dev' and 'test' splits, in that order, via ``fetch``."""
    for split in ('train', 'dev', 'test'):
        fetch(split)
|
||
|
||
def fetch(data_name):
    """Make one split of the corpus available as an extracted file.

    If the path recorded in ``local_texts`` already exists nothing is done.
    Otherwise the zip archive is downloaded (unless it is already present at
    the path recorded in ``local_zips``) and extracted into ``data_root``.

    Args:
        data_name (str): Split to fetch; one of 'train', 'dev', 'test'.

    Returns:
        bool: ``True``, both when the split was already present and after it
        has been extracted, so the result can be used as a success flag.

    Raises:
        ValueError: If ``data_name`` is not one of the three split names.
    """
    if data_name not in ('train', 'dev', 'test'):
        raise ValueError('Check `data_name` is one of ["train", "dev", "test"]')
    if os.path.exists(local_texts[data_name]):
        return True
    zippath = local_zips[data_name]
    if not os.path.exists(zippath):
        download(KOWIKITEXT_URLS[data_name], zippath, data_name)
    with zipfile.ZipFile(zippath, 'r') as zip_ref:
        zip_ref.extractall(data_root)
    print(f'unzip {data_name}')
    # Report success on this path too, instead of falling through to None.
    return True
|
||
|
||
def download(url, local_path, corpus_name):
    """Download ``url`` to ``local_path`` while showing a tqdm progress bar.

    The data is first written to ``<local_path>.part`` and only moved to
    ``local_path`` once the transfer has finished, so an interrupted download
    never leaves a truncated file at ``local_path``.

    Args:
        url (str): Address of the file to download.
        local_path (str): Destination path of the downloaded file.
        corpus_name (str): Label shown in the progress bar description.
    """
    filename = os.path.basename(local_path)
    partial_path = f'{local_path}.part'
    try:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=f'[{corpus_name}] download {filename}') as t:
            request.urlretrieve(url, filename=partial_path, reporthook=_reporthook(t))
        os.replace(partial_path, local_path)
    finally:
        # After a successful move the partial file is gone and this is a no-op;
        # after a failure it removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
|
||
|
||
def _reporthook(t): | ||
""" ``reporthook`` to use with ``urllib.request`` that prints the process of the download. | ||
Uses ``tqdm`` for progress bar. | ||
**Reference:** | ||
https://github.com/tqdm/tqdm | ||
Args: | ||
t (tqdm.tqdm) Progress bar. | ||
Example: | ||
>>> with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t: # doctest: +SKIP | ||
... urllib.request.urlretrieve(file_url, filename=full_path, reporthook=reporthook(t)) | ||
""" | ||
last_b = [0] | ||
|
||
def inner(b=1, bsize=1, tsize=None): | ||
""" | ||
Args: | ||
b (int, optional): Number of blocks just transferred [default: 1]. | ||
bsize (int, optional): Size of each block (in tqdm units) [default: 1]. | ||
tsize (int, optional): Total size (in tqdm units). If [default: None] remains unchanged. | ||
""" | ||
if tsize is not None: | ||
t.total = tsize | ||
t.update((b - last_b[0]) * bsize) | ||
last_b[0] = b | ||
|
||
return inner |