From 56d5b50292bd182f30f67fbfefbc3d4f7bef6663 Mon Sep 17 00:00:00 2001
From: Mark Neumann
Date: Tue, 27 Jun 2017 11:57:04 -0700
Subject: [PATCH] fix pylint

---
 allennlp/data/data_generator.py        |   2 +-
 allennlp/data/dataset_readers/squad.py |   4 +-
 allennlp/data/vocabulary.py            |   5 +-
 allennlp/layers/embeddings.py          |   4 +-
 tests/data/data_generator_to_fix.py    | 322 ++++++++++++-------------
 tests/data/embeddings_test.py          |  41 +---
 6 files changed, 171 insertions(+), 207 deletions(-)

diff --git a/allennlp/data/data_generator.py b/allennlp/data/data_generator.py
index 75da4fb18a4..a082c910f24 100644
--- a/allennlp/data/data_generator.py
+++ b/allennlp/data/data_generator.py
@@ -177,7 +177,7 @@ def __adaptive_grouping(self, instances: List[Instance]):
 
     @staticmethod
     def sort_dataset_by_padding(dataset: Dataset,
-                                sorting_keys: List[Tuple[str, str]],
+                                sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
                                 padding_noise: float=0.0) -> List[Instance]:
         """
         Sorts the ``Instances`` in this ``Dataset`` by their padding lengths, using the keys in
diff --git a/allennlp/data/dataset_readers/squad.py b/allennlp/data/dataset_readers/squad.py
index c3886d71327..04c61d4d715 100644
--- a/allennlp/data/dataset_readers/squad.py
+++ b/allennlp/data/dataset_readers/squad.py
@@ -73,7 +73,9 @@ def __init__(self,
         # Maps question indices to question strings
         self._id_to_question = {}
 
-    def _get_sentence_choices(self, question_id: int, answer_id: int) -> Tuple[List[str], int]:
+    def _get_sentence_choices(self,
+                              question_id: int,
+                              answer_id: int) -> Tuple[List[str], int]:  # pylint: disable=invalid-sequence-index
         # Because sentences and questions have different indices, we need this to hold tuples of
         # ("sentence", id) or ("question", id), instead of just single ids.
         negative_sentences = set()
diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py
index 683b0e3fbad..0de630bc099 100644
--- a/allennlp/data/vocabulary.py
+++ b/allennlp/data/vocabulary.py
@@ -1,13 +1,14 @@
 from collections import defaultdict
-from ..common.util import namespace_match
 from typing import Dict, List, Union
 import codecs
 import logging
-
 import tqdm
 
+from ..common.util import namespace_match
+
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
+
 class _NamespaceDependentDefaultDict(defaultdict):
     """
     Sometimes certain namespaces need padding (like "tokens") and some don't (like
diff --git a/allennlp/layers/embeddings.py b/allennlp/layers/embeddings.py
index 16f0a1c73de..02cb1f55fd9 100644
--- a/allennlp/layers/embeddings.py
+++ b/allennlp/layers/embeddings.py
@@ -74,13 +74,13 @@ def __init__(self,
         if self.padding_index is not None:
             self.weight.data[self.padding_index].fill_(0)
 
-    def forward(self, input):
+    def forward(self, inputs):  # pylint: disable=arguments-differ
         padding_index = self.padding_index if self.padding_index is not None else -1
         return self._backend.Embedding(padding_index,
                                        self.max_norm,
                                        self.norm_type,
                                        self.scale_grad_by_freq,
-                                       self.sparse)(input, self.weight)
+                                       self.sparse)(inputs, self.weight)
 
 
 def get_pretrained_embedding_layer(embeddings_filename: str,
diff --git a/tests/data/data_generator_to_fix.py b/tests/data/data_generator_to_fix.py
index 189f4f32f99..020556e811c 100644
--- a/tests/data/data_generator_to_fix.py
+++ b/tests/data/data_generator_to_fix.py
@@ -1,161 +1,161 @@
-# pylint: disable=no-self-use,invalid-name
-import numpy
-
-from allennlp.common.params import Params
-from allennlp.data.data_generator import DataGenerator
-from allennlp.testing.test_case import AllenNlpTestCase
-
-
-class TestDataGenerator(AllenNlpTestCase):
-    def setUp(self):
-        super(TestDataGenerator, self).setUp()
-        self.text_trainer = FakeTextTrainer()
-        self.instances = [
-                FakeInstance(0, 5, 3, 2),
-                FakeInstance(1, 4, 3, 2),
-                FakeInstance(2, 4, 1, 2),
-                FakeInstance(3, 9, 3, 2),
-                FakeInstance(4, 8, 3, 2),
-                FakeInstance(5, 2, 1, 2),
-                FakeInstance(6, 3, 3, 2),
-                FakeInstance(7, 3, 3, 3),
-                FakeInstance(8, 1, 1, 2),
-                FakeInstance(9, 1, 1, 3),
-                ]
-
-    def test_instances_are_sorted_by_sorting_keys(self):
-        params = Params({
-                'dynamic_padding': True,
-                'padding_noise': 0.0,
-                })
-        generator = DataGenerator(self.text_trainer, params)
-        batches = generator.create_generator(IndexedDataset(self.instances))
-        assert generator.last_num_batches == 4
-        one_epoch_arrays = [next(batches) for _ in range(4)]
-        one_epoch_arrays.sort(key=lambda x: x[0][0])
-        assert self.as_list(one_epoch_arrays[0][0]) == [1, 0, 4]
-        assert self.as_list(one_epoch_arrays[1][0]) == [3]
-        assert self.as_list(one_epoch_arrays[2][0]) == [6, 7, 2]
-        assert self.as_list(one_epoch_arrays[3][0]) == [8, 9, 5]
-
-    def test_batches_are_consistent_with_no_repermuting(self):
-        params = Params({
-                'padding_noise': 0.0,
-                'sort_every_epoch': False,
-                'dynamic_padding': True,
-                })
-        generator = DataGenerator(self.text_trainer, params)
-        batches = generator.create_generator(IndexedDataset(self.instances))
-        assert generator.last_num_batches == 4
-        first_epoch_arrays = [next(batches) for _ in range(4)]
-        second_epoch_arrays = [next(batches) for _ in range(4)]
-        first_epoch_arrays.sort(key=lambda x: x[0][0])
-        second_epoch_arrays.sort(key=lambda x: x[0][0])
-        first_epoch = [self.as_list(x[0]) for x in first_epoch_arrays]
-        second_epoch = [self.as_list(x[0]) for x in second_epoch_arrays]
-        assert first_epoch == second_epoch
-
-    def test_biggest_batch_first(self):
-        params = Params({
-                'padding_noise': 0.0,
-                'dynamic_padding': True,
-                'biggest_batch_first': True,
-                })
-        generator = DataGenerator(self.text_trainer, params)
-        batches = generator.create_generator(IndexedDataset(self.instances))
-        biggest_batches = [next(batches) for _ in range(2)]
-        assert self.as_list(biggest_batches[0][0]) == [3]
-        assert self.as_list(biggest_batches[1][0]) == [1, 0, 4]
-
-    def test_adaptive_grouping(self):
-        params = Params({
-                'padding_noise': 0.0,
-                'dynamic_padding': True,
-                'adaptive_batch_sizes': True,
-                'adaptive_memory_usage_constant': 130,
-                })
-        generator = DataGenerator(self.text_trainer, params)
-        batches = generator.create_generator(IndexedDataset(self.instances))
-        assert generator.last_num_batches == 4
-        one_epoch_arrays = [next(batches) for _ in range(4)]
-        one_epoch_arrays.sort(key=lambda x: x[0][0])
-        assert self.as_list(one_epoch_arrays[0][0]) == [0, 4]
-        assert self.as_list(one_epoch_arrays[1][0]) == [3]
-        assert self.as_list(one_epoch_arrays[2][0]) == [7, 2, 1]
-        assert self.as_list(one_epoch_arrays[3][0]) == [8, 9, 5, 6]
-
-    def test_sort_every_batch_actually_adds_noise_every_batch(self):
-        # We're just going to get two epoch's worth of batches, and make sure that they're
-        # different.
-        params = Params({
-                'padding_noise': 0.8,
-                'sort_every_epoch': True,
-                'dynamic_padding': True,
-                })
-        generator = DataGenerator(self.text_trainer, params)
-        batches = generator.create_generator(IndexedDataset(self.instances))
-        assert generator.last_num_batches == 4
-        first_epoch_arrays = [next(batches) for _ in range(4)]
-        second_epoch_arrays = [next(batches) for _ in range(4)]
-        first_epoch_arrays.sort(key=lambda x: x[0][0])
-        second_epoch_arrays.sort(key=lambda x: x[0][0])
-        first_epoch = [self.as_list(x[0]) for x in first_epoch_arrays]
-        second_epoch = [self.as_list(x[0]) for x in second_epoch_arrays]
-        assert first_epoch != second_epoch
-
-    def test_maximum_batch_size_is_actually_a_maximum(self):
-        params = Params({
-                'padding_noise': 0.0,
-                'dynamic_padding': True,
-                'adaptive_batch_sizes': True,
-                'adaptive_memory_usage_constant': 50,
-                'maximum_batch_size': 2,
-                })
-        generator = DataGenerator(self.text_trainer, params)
-        batches = generator.create_generator(IndexedDataset(self.instances))
-        assert generator.last_num_batches == 7
-        one_epoch_arrays = [next(batches) for _ in range(7)]
-        one_epoch_arrays.sort(key=lambda x: x[0][0])
-        print([self.as_list(x[0]) for x in one_epoch_arrays])
-        assert self.as_list(one_epoch_arrays[0][0]) == [0]
-        assert self.as_list(one_epoch_arrays[1][0]) == [2, 1]
-        assert self.as_list(one_epoch_arrays[2][0]) == [3]
-        assert self.as_list(one_epoch_arrays[3][0]) == [4]
-        assert self.as_list(one_epoch_arrays[4][0]) == [5, 6]
-        assert self.as_list(one_epoch_arrays[5][0]) == [7]
-        assert self.as_list(one_epoch_arrays[6][0]) == [8, 9]
-
-    def as_list(self, array):
-        return list(numpy.squeeze(array, axis=-1))
-
-
-class FakeInstance:
-    def __init__(self, index, a_length, b_length, c_length):
-        self.index = index
-        self.a_length = a_length
-        self.b_length = b_length
-        self.c_length = c_length
-
-    def get_padding_lengths(self):
-        return {'a': self.a_length, 'b': self.b_length, 'c': self.c_length}
-
-    def pad(self, lengths):
-        pass
-
-    def as_training_data(self):
-        return numpy.asarray([self.index]), numpy.asarray([self.index])
-
-
-class FakeTextTrainer:
-    batch_size = 3
-    a_length = None
-    b_length = None
-    c_length = None
-    def get_instance_sorting_keys(self):
-        return ['a', 'b', 'c']
-
-    def get_padding_lengths(self):
-        return {'a': self.a_length, 'b': self.b_length, 'c': self.c_length}
-
-    def get_padding_memory_scaling(self, lengths):
-        return lengths['a'] * lengths['b'] * lengths['c']
+# # pylint: disable=no-self-use,invalid-name
+# import numpy
+#
+# from allennlp.common.params import Params
+# from allennlp.data.data_generator import DataGenerator
+# from allennlp.testing.test_case import AllenNlpTestCase
+#
+#
+# class TestDataGenerator(AllenNlpTestCase):
+#     def setUp(self):
+#         super(TestDataGenerator, self).setUp()
+#         self.text_trainer = FakeTextTrainer()
+#         self.instances = [
+#                 FakeInstance(0, 5, 3, 2),
+#                 FakeInstance(1, 4, 3, 2),
+#                 FakeInstance(2, 4, 1, 2),
+#                 FakeInstance(3, 9, 3, 2),
+#                 FakeInstance(4, 8, 3, 2),
+#                 FakeInstance(5, 2, 1, 2),
+#                 FakeInstance(6, 3, 3, 2),
+#                 FakeInstance(7, 3, 3, 3),
+#                 FakeInstance(8, 1, 1, 2),
+#                 FakeInstance(9, 1, 1, 3),
+#                 ]
+#
+#     def test_instances_are_sorted_by_sorting_keys(self):
+#         params = Params({
+#                 'dynamic_padding': True,
+#                 'padding_noise': 0.0,
+#                 })
+#         generator = DataGenerator(self.text_trainer, params)
+#         batches = generator.create_generator(IndexedDataset(self.instances))
+#         assert generator.last_num_batches == 4
+#         one_epoch_arrays = [next(batches) for _ in range(4)]
+#         one_epoch_arrays.sort(key=lambda x: x[0][0])
+#         assert self.as_list(one_epoch_arrays[0][0]) == [1, 0, 4]
+#         assert self.as_list(one_epoch_arrays[1][0]) == [3]
+#         assert self.as_list(one_epoch_arrays[2][0]) == [6, 7, 2]
+#         assert self.as_list(one_epoch_arrays[3][0]) == [8, 9, 5]
+#
+#     def test_batches_are_consistent_with_no_repermuting(self):
+#         params = Params({
+#                 'padding_noise': 0.0,
+#                 'sort_every_epoch': False,
+#                 'dynamic_padding': True,
+#                 })
+#         generator = DataGenerator(self.text_trainer, params)
+#         batches = generator.create_generator(IndexedDataset(self.instances))
+#         assert generator.last_num_batches == 4
+#         first_epoch_arrays = [next(batches) for _ in range(4)]
+#         second_epoch_arrays = [next(batches) for _ in range(4)]
+#         first_epoch_arrays.sort(key=lambda x: x[0][0])
+#         second_epoch_arrays.sort(key=lambda x: x[0][0])
+#         first_epoch = [self.as_list(x[0]) for x in first_epoch_arrays]
+#         second_epoch = [self.as_list(x[0]) for x in second_epoch_arrays]
+#         assert first_epoch == second_epoch
+#
+#     def test_biggest_batch_first(self):
+#         params = Params({
+#                 'padding_noise': 0.0,
+#                 'dynamic_padding': True,
+#                 'biggest_batch_first': True,
+#                 })
+#         generator = DataGenerator(self.text_trainer, params)
+#         batches = generator.create_generator(IndexedDataset(self.instances))
+#         biggest_batches = [next(batches) for _ in range(2)]
+#         assert self.as_list(biggest_batches[0][0]) == [3]
+#         assert self.as_list(biggest_batches[1][0]) == [1, 0, 4]
+#
+#     def test_adaptive_grouping(self):
+#         params = Params({
+#                 'padding_noise': 0.0,
+#                 'dynamic_padding': True,
+#                 'adaptive_batch_sizes': True,
+#                 'adaptive_memory_usage_constant': 130,
+#                 })
+#         generator = DataGenerator(self.text_trainer, params)
+#         batches = generator.create_generator(IndexedDataset(self.instances))
+#         assert generator.last_num_batches == 4
+#         one_epoch_arrays = [next(batches) for _ in range(4)]
+#         one_epoch_arrays.sort(key=lambda x: x[0][0])
+#         assert self.as_list(one_epoch_arrays[0][0]) == [0, 4]
+#         assert self.as_list(one_epoch_arrays[1][0]) == [3]
+#         assert self.as_list(one_epoch_arrays[2][0]) == [7, 2, 1]
+#         assert self.as_list(one_epoch_arrays[3][0]) == [8, 9, 5, 6]
+#
+#     def test_sort_every_batch_actually_adds_noise_every_batch(self):
+#         # We're just going to get two epoch's worth of batches, and make sure that they're
+#         # different.
+#         params = Params({
+#                 'padding_noise': 0.8,
+#                 'sort_every_epoch': True,
+#                 'dynamic_padding': True,
+#                 })
+#         generator = DataGenerator(self.text_trainer, params)
+#         batches = generator.create_generator(IndexedDataset(self.instances))
+#         assert generator.last_num_batches == 4
+#         first_epoch_arrays = [next(batches) for _ in range(4)]
+#         second_epoch_arrays = [next(batches) for _ in range(4)]
+#         first_epoch_arrays.sort(key=lambda x: x[0][0])
+#         second_epoch_arrays.sort(key=lambda x: x[0][0])
+#         first_epoch = [self.as_list(x[0]) for x in first_epoch_arrays]
+#         second_epoch = [self.as_list(x[0]) for x in second_epoch_arrays]
+#         assert first_epoch != second_epoch
+#
+#     def test_maximum_batch_size_is_actually_a_maximum(self):
+#         params = Params({
+#                 'padding_noise': 0.0,
+#                 'dynamic_padding': True,
+#                 'adaptive_batch_sizes': True,
+#                 'adaptive_memory_usage_constant': 50,
+#                 'maximum_batch_size': 2,
+#                 })
+#         generator = DataGenerator(self.text_trainer, params)
+#         batches = generator.create_generator(IndexedDataset(self.instances))
+#         assert generator.last_num_batches == 7
+#         one_epoch_arrays = [next(batches) for _ in range(7)]
+#         one_epoch_arrays.sort(key=lambda x: x[0][0])
+#         print([self.as_list(x[0]) for x in one_epoch_arrays])
+#         assert self.as_list(one_epoch_arrays[0][0]) == [0]
+#         assert self.as_list(one_epoch_arrays[1][0]) == [2, 1]
+#         assert self.as_list(one_epoch_arrays[2][0]) == [3]
+#         assert self.as_list(one_epoch_arrays[3][0]) == [4]
+#         assert self.as_list(one_epoch_arrays[4][0]) == [5, 6]
+#         assert self.as_list(one_epoch_arrays[5][0]) == [7]
+#         assert self.as_list(one_epoch_arrays[6][0]) == [8, 9]
+#
+#     def as_list(self, array):
+#         return list(numpy.squeeze(array, axis=-1))
+#
+#
+# class FakeInstance:
+#     def __init__(self, index, a_length, b_length, c_length):
+#         self.index = index
+#         self.a_length = a_length
+#         self.b_length = b_length
+#         self.c_length = c_length
+#
+#     def get_padding_lengths(self):
+#         return {'a': self.a_length, 'b': self.b_length, 'c': self.c_length}
+#
+#     def pad(self, lengths):
+#         pass
+#
+#     def as_training_data(self):
+#         return numpy.asarray([self.index]), numpy.asarray([self.index])
+#
+#
+# class FakeTextTrainer:
+#     batch_size = 3
+#     a_length = None
+#     b_length = None
+#     c_length = None
+#     def get_instance_sorting_keys(self):
+#         return ['a', 'b', 'c']
+#
+#     def get_padding_lengths(self):
+#         return {'a': self.a_length, 'b': self.b_length, 'c': self.c_length}
+#
+#     def get_padding_memory_scaling(self, lengths):
+#         return lengths['a'] * lengths['b'] * lengths['c']
diff --git a/tests/data/embeddings_test.py b/tests/data/embeddings_test.py
index d523ae5aa47..5d2826076b4 100644
--- a/tests/data/embeddings_test.py
+++ b/tests/data/embeddings_test.py
@@ -3,11 +3,9 @@
 import numpy
 import pytest
 
-from allennlp.common.checks import ConfigurationError
+
 from allennlp.data.vocabulary import Vocabulary
 from allennlp.layers.embeddings import get_pretrained_embedding_layer
-
-#from allennlp.models.text_classification import ClassificationModel
 from allennlp.testing.test_case import AllenNlpTestCase
 
 
@@ -69,40 +67,3 @@ def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
         embedding_layer = get_pretrained_embedding_layer(embeddings_filename, vocab)
         word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
         assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
-
-    @pytest.mark.skip
-    def test_embedding_will_not_project_random_embeddings(self):
-        self.write_pretrained_vector_files()
-        self.write_true_false_model_files()
-        with pytest.raises(ConfigurationError):
-            args = {
-                    "embeddings": {
-                            "words": {
-                                    "dimension": 5,
-                                    "project": True,
-                                    "fine_tune": True,
-                                    "dropout": 0.2
-                                    }
-                            }
-                    }
-            model = self.get_model(ClassificationModel, args)
-            model.train()
-
-    @pytest.mark.skip
-    def test_projection_dim_not_equal_to_pretrained_dim_with_no_projection_flag_raises_error(self):
-        self.write_pretrained_vector_files()
-        self.write_true_false_model_files()
-        with pytest.raises(ConfigurationError):
-            args = {
-                    "embeddings": {
-                            "words": {
-                                    "dimension": 13,
-                                    "pretrained_file": self.PRETRAINED_VECTORS_GZIP,
-                                    "project": False,
-                                    "fine_tune": False,
-                                    "dropout": 0.2
-                                    }
-                            }
-                    }
-            model = self.get_model(ClassificationModel, args)
-            model.train()
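
Editor's note: for readers skimming the patch, a minimal sketch of the two inline suppression patterns this commit applies: a targeted ``# pylint: disable=invalid-sequence-index`` on subscripted ``Tuple`` annotations (which some pylint releases report as a false positive), and ``# pylint: disable=arguments-differ`` where an override renames a parameter to stop shadowing the ``input`` builtin. The sketch is illustrative only; the class and function names below are hypothetical and not from the repository.

from typing import List, Tuple


def sorting_namespaces(sorting_keys: List[Tuple[str, str]]) -> List[str]:  # pylint: disable=invalid-sequence-index
    # Subscripted ``Tuple[...]`` annotations can trip pylint's invalid-sequence-index
    # check, so the annotation carries a targeted inline disable.
    return [namespace for _, namespace in sorting_keys]


class BaseLayer:
    def forward(self, input):  # the parent names its argument ``input``
        return input


class ScaledLayer(BaseLayer):
    def forward(self, inputs):  # pylint: disable=arguments-differ
        # Renaming the parameter (to avoid shadowing the ``input`` builtin) makes the
        # signature differ from the parent's, so arguments-differ is disabled for
        # this intentional override.
        return super(ScaledLayer, self).forward(inputs)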