Fix CI with change of name of nlp #7054

Merged · 5 commits · Sep 10, 2020
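For context: the Hugging Face `nlp` library was renamed to `datasets`, and this PR updates the CI configs, the examples, and the `Trainer` integration accordingly. The short sketch below is not part of the diff; the WMT pair mirrors the `download_wmt.py` example further down, and the API is unchanged apart from the package name:

```python
# Minimal sketch of the rename, assuming the package is installed
# via `pip install datasets` (previously `pip install nlp`).

# Old:
# import nlp
# ds = nlp.load_dataset("wmt16", "ro-en")

# New: same API, new module/package name.
import datasets

ds = datasets.load_dataset("wmt16", "ro-en")
print(ds["train"].num_rows)
```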
6 changes: 3 additions & 3 deletions .circleci/config.yml
@@ -77,7 +77,7 @@ jobs:
- v0.3-torch_and_tf-{{ checksum "setup.py" }}
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install git+https://github.com/huggingface/nlp
- run: pip install git+https://github.com/huggingface/datasets
- run: pip install .[sklearn,tf-cpu,torch,testing]
- run: pip install codecov pytest-cov
- save_cache:
@@ -104,7 +104,7 @@ jobs:
- v0.3-torch-{{ checksum "setup.py" }}
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install git+https://github.com/huggingface/nlp
- run: pip install git+https://github.com/huggingface/datasets
- run: pip install .[sklearn,torch,testing]
- save_cache:
key: v0.3-torch-{{ checksum "setup.py" }}
@@ -129,7 +129,7 @@ jobs:
- v0.3-tf-{{ checksum "setup.py" }}
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install git+https://github.com/huggingface/nlp
- run: pip install git+https://github.com/huggingface/datasets
- run: pip install .[sklearn,tf-cpu,testing]
- save_cache:
key: v0.3-tf-{{ checksum "setup.py" }}
2 changes: 1 addition & 1 deletion .github/workflows/self-push.yml
@@ -46,7 +46,7 @@ jobs:
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/nlp
pip install git+https://github.com/huggingface/datasets

- name: Are GPUs recognized by our DL frameworks
run: |
2 changes: 1 addition & 1 deletion .github/workflows/self-scheduled.yml
@@ -43,7 +43,7 @@ jobs:
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/nlp
pip install git+https://github.com/huggingface/datasets

- name: Are GPUs recognized by our DL frameworks
run: |
2 changes: 1 addition & 1 deletion examples/longform-qa/README.md
@@ -1,5 +1,5 @@
# Long Form Question Answering

This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗nlp](https://github.com/huggingface/nlp) libraries.
This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries.

You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html).
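As a usage note (not part of the diff), loading the demo's data after the rename looks like the calls updated in `eli5_app.py` below; a minimal sketch, assuming the default 🤗datasets cache and enough disk space for the Wikipedia snippets:

```python
import datasets  # formerly `nlp`

# ELI5 splits used by the long form QA training and demo code.
eli5 = datasets.load_dataset("eli5", name="LFQA_reddit")
eli5_train = eli5["train_eli5"]

# 100-word Wikipedia passages used as the retrieval corpus.
wiki40b_passages = datasets.load_dataset(path="wiki_snippets", name="wiki40b_en_100_0")["train"]

print(eli5_train.num_rows, wiki40b_passages.num_rows)
```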
6 changes: 3 additions & 3 deletions examples/longform-qa/eli5_app.py
@@ -1,5 +1,5 @@
import datasets
import faiss
import nlp
import numpy as np
import streamlit as st
import torch
@@ -45,7 +45,7 @@ def load_models():
def load_indexes():
if LOAD_DENSE_INDEX:
faiss_res = faiss.StandardGpuResources()
wiki40b_passages = nlp.load_dataset(path="wiki_snippets", name="wiki40b_en_100_0")["train"]
wiki40b_passages = datasets.load_dataset(path="wiki_snippets", name="wiki40b_en_100_0")["train"]
wiki40b_passage_reps = np.memmap(
"wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat",
dtype="float32",
@@ -63,7 +63,7 @@ def load_indexes():

@st.cache(allow_output_mutation=True)
def load_train_data():
eli5 = nlp.load_dataset("eli5", name="LFQA_reddit")
eli5 = datasets.load_dataset("eli5", name="LFQA_reddit")
eli5_train = eli5["train_eli5"]
eli5_train_q_reps = np.memmap(
"eli5_questions_reps.dat", dtype="float32", mode="r", shape=(eli5_train.num_rows, 128)
2 changes: 1 addition & 1 deletion examples/longform-qa/eli5_utils.py
@@ -4,8 +4,8 @@
from random import choice, randint
from time import time

import datasets # noqa: F401
import faiss # noqa: F401
import nlp # noqa: F401
import numpy as np
import pandas as pd
import torch
2 changes: 1 addition & 1 deletion examples/requirements.txt
@@ -12,7 +12,7 @@ faiss
streamlit
elasticsearch
pandas
nlp
datasets
fire
pytest
conllu
10 changes: 5 additions & 5 deletions examples/seq2seq/download_wmt.py
@@ -5,25 +5,25 @@


def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir=None) -> None:
"""Download a dataset using the nlp package and save it to the format expected by finetune.py
"""Download a dataset using the datasets package and save it to the format expected by finetune.py
Format of save_dir: train.source, train.target, val.source, val.target, test.source, test.target.

Args:
src_lang: <str> source language
tgt_lang: <str> target language
dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list run `import nlp; print([d.id for d in nlp.list_datasets() if "wmt" in d.id])`
dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list run `import datasets; print([d.id for d in datasets.list_datasets() if "wmt" in d.id])`
save_dir: <str>, where to save the datasets, defaults to f'{dataset}-{src_lang}-{tgt_lang}'

Usage:
>>> download_wmt_dataset('ro', 'en', dataset='wmt16') # saves to wmt16-ro-en
"""
try:
import nlp
import datasets
except (ModuleNotFoundError, ImportError):
raise ImportError("run pip install nlp")
raise ImportError("run pip install datasets")
pair = f"{src_lang}-{tgt_lang}"
print(f"Converting {dataset}-{pair}")
ds = nlp.load_dataset(dataset, pair)
ds = datasets.load_dataset(dataset, pair)
if save_dir is None:
save_dir = f"{dataset}-{pair}"
save_dir = Path(save_dir)
2 changes: 1 addition & 1 deletion setup.cfg
@@ -7,6 +7,7 @@ known_first_party = transformers
known_third_party =
absl
conllu
datasets
elasticsearch
fairseq
faiss
@@ -16,7 +17,6 @@ known_third_party =
git
h5py
matplotlib
nlp
nltk
numpy
packaging
2 changes: 1 addition & 1 deletion src/transformers/__init__.py
@@ -83,7 +83,7 @@
add_start_docstrings,
cached_path,
is_apex_available,
is_nlp_available,
is_datasets_available,
is_psutil_available,
is_py3nvml_available,
is_tf_available,
10 changes: 5 additions & 5 deletions src/transformers/file_utils.py
@@ -66,12 +66,12 @@


try:
import nlp # noqa: F401
import datasets # noqa: F401

_nlp_available = True
_datasets_available = True

except ImportError:
_nlp_available = False
_datasets_available = False

try:
from torch.hub import _get_torch_home
@@ -155,8 +155,8 @@ def is_torch_tpu_available():
return _torch_tpu_available


def is_nlp_available():
return _nlp_available
def is_datasets_available():
return _datasets_available


def is_psutil_available():
30 changes: 15 additions & 15 deletions src/transformers/trainer.py
@@ -20,7 +20,7 @@
from tqdm.auto import tqdm, trange

from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from .file_utils import is_nlp_available, is_torch_tpu_available
from .file_utils import is_datasets_available, is_torch_tpu_available
from .integrations import (
default_hp_search_backend,
is_comet_available,
@@ -65,8 +65,8 @@
_use_native_amp = True
from torch.cuda.amp import autocast

if is_nlp_available():
import nlp
if is_datasets_available():
import datasets

if is_torch_tpu_available():
import torch_xla.core.xla_model as xm
@@ -179,10 +179,10 @@ class Trainer:
:obj:`eval_dataset`. Will default to :func:`~transformers.default_data_collator` if no ``tokenizer`` is
provided, an instance of :func:`~transformers.DataCollatorWithPadding` otherwise.
train_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
The dataset to use for training. If it is an :obj:`nlp.Dataset`, columns not accepted by the
The dataset to use for training. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
The dataset to use for evaluation. If it is an :obj:`nlp.Dataset`, columns not accepted by the
The dataset to use for evaluation. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
tokenizer (:class:`PreTrainedTokenizerBase`, `optional`):
The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the
Expand Down Expand Up @@ -280,10 +280,10 @@ def __init__(
FutureWarning,
)

if is_nlp_available():
if isinstance(train_dataset, nlp.Dataset):
if is_datasets_available():
if isinstance(train_dataset, datasets.Dataset):
self._remove_unused_columns(self.train_dataset, description="training")
if isinstance(eval_dataset, nlp.Dataset):
if isinstance(eval_dataset, datasets.Dataset):
self._remove_unused_columns(self.eval_dataset, description="evaluation")

self.global_step = None
@@ -294,7 +294,7 @@ def __init__(
self.hp_search_backend = None
self.use_tune_checkpoints = False

def _remove_unused_columns(self, dataset: "nlp.Dataset", description: Optional[str] = None):
def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
if not self.args.remove_unused_columns:
return
# Inspect model forward signature to keep only the arguments it accepts.
@@ -364,12 +364,12 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa

Args:
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
If provided, will override :obj:`self.eval_dataset`. If it is an :obj:`nlp.Dataset`, columns not
If provided, will override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, columns not
accepted by the ``model.forward()`` method are automatically removed.
"""
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
elif eval_dataset is not None and is_nlp_available() and isinstance(eval_dataset, nlp.Dataset):
elif eval_dataset is not None and is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
self._remove_unused_columns(eval_dataset, description="evaluation")
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
eval_sampler = self._get_eval_sampler(eval_dataset)
@@ -393,10 +393,10 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:

Args:
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
The test dataset to use. If it is an :obj:`nlp.Dataset`, columns not accepted by the
The test dataset to use. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
"""
if is_nlp_available() and isinstance(test_dataset, nlp.Dataset):
if is_datasets_available() and isinstance(test_dataset, datasets.Dataset):
self._remove_unused_columns(test_dataset, description="test")
test_sampler = self._get_eval_sampler(test_dataset)

@@ -1200,7 +1200,7 @@ def evaluate(self, eval_dataset: Optional[Dataset] = None) -> Dict[str, float]:

Args:
eval_dataset (:obj:`Dataset`, `optional`):
Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`nlp.Dataset`,
Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`,
columns not accepted by the ``model.forward()`` method are automatically removed.

Returns:
@@ -1227,7 +1227,7 @@ def predict(self, test_dataset: Dataset) -> PredictionOutput:

Args:
test_dataset (:obj:`Dataset`):
Dataset to run the predictions on. If it is an :obj:`nlp.Dataset`, columns not accepted by the
Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.

Returns:
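The docstring changes above describe the key behavior: when a `datasets.Dataset` is passed to `Trainer`, columns that `model.forward()` does not accept are removed automatically (see `_remove_unused_columns`). A minimal sketch of that behavior from the user side follows; the toy model and training arguments are illustrative only, and the updated test below exercises the same path with the repository's own helpers:

```python
import datasets
import numpy as np
import torch
from transformers import Trainer, TrainingArguments


class ToyRegression(torch.nn.Module):
    """Illustrative model: forward() only accepts `input_x` and `labels`."""

    def __init__(self):
        super().__init__()
        self.a = torch.nn.Parameter(torch.zeros(1))
        self.b = torch.nn.Parameter(torch.zeros(1))

    def forward(self, input_x=None, labels=None):
        y = self.a * input_x + self.b
        loss = torch.nn.functional.mse_loss(y, labels)
        return (loss, y)


x = np.random.normal(size=(64,)).astype(np.float32)
y = 2.0 * x + 3.0
z = np.random.normal(size=(64,)).astype(np.float32)

# "extra" is not an argument of forward(), so Trainer drops that column
# automatically because train_dataset is a datasets.Dataset.
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
trainer = Trainer(
    model=ToyRegression(),
    args=TrainingArguments(output_dir="toy_output"),
    train_dataset=train_dataset,
)
trainer.train()
```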
8 changes: 4 additions & 4 deletions tests/test_trainer.py
@@ -1,6 +1,6 @@
import unittest

import nlp
import datasets
import numpy as np

from transformers import AutoTokenizer, TrainingArguments, is_torch_available
@@ -200,11 +200,11 @@ def test_predict(self):
x = trainer.eval_dataset.x
self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

def test_trainer_with_nlp(self):
def test_trainer_with_datasets(self):
np.random.seed(42)
x = np.random.normal(size=(64,)).astype(np.float32)
y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,))
train_dataset = nlp.Dataset.from_dict({"input_x": x, "label": y})
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

# Base training. Should have the same results as test_reproducible_training
model = RegressionModel()
@@ -222,7 +222,7 @@ def test_trainer_with_nlp(self):

# Adding one column not used by the model should have no impact
z = np.random.normal(size=(64,)).astype(np.float32)
train_dataset = nlp.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
model = RegressionModel()
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()