From 66db31af2f8b40fae89bfd2d9e2f8b1738d8c48a Mon Sep 17 00:00:00 2001 From: Jan Stypka Date: Sun, 8 Oct 2017 12:46:55 +0200 Subject: [PATCH 1/6] api: rename nb_epochs to epochs everywhere --- README.md | 4 ++-- magpie/config.py | 2 +- magpie/main.py | 20 ++++++++++++-------- magpie/tests/test_api.py | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 63c6aa6..2adf4de 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Magpie is a deep learning tool for multi-label text classification. It learns on >>> from magpie import MagpieModel >>> magpie = MagpieModel() >>> magpie.init_word_vectors('/path/to/corpus', vec_dim=100) ->>> magpie.train('/path/to/corpus', ['label1', 'label2', 'label3'], nb_epochs=3) +>>> magpie.train('/path/to/corpus', ['label1', 'label2', 'label3'], epochs=3) Training... >>> magpie.predict_from_text('Well, that was quick!') [('label1', 0.96), ('label3', 0.65), ('label2', 0.21)] @@ -44,7 +44,7 @@ magpie.init_word_vectors('data/hep-categories', vec_dim=100) If you plan to reuse the trained word representations, you might want to save them and pass in the constructor to `MagpieModel` next time. For the training, just type: ```python labels = ['Gravitation and Cosmology', 'Experiment-HEP', 'Theory-HEP'] -magpie.train('data/hep-categories', labels, test_ratio=0.2, nb_epochs=30) +magpie.train('data/hep-categories', labels, test_ratio=0.2, epochs=30) ``` By providing the `test_ratio` argument, the model splits data into train & test datasets (in this example into 80/20 ratio) and evaluates itself after every epoch displaying it's current loss and accuracy. The default value of `test_ratio` is 0 meaning that all the data will be used for training. diff --git a/magpie/config.py b/magpie/config.py index 9c5c3d8..1054a43 100644 --- a/magpie/config.py +++ b/magpie/config.py @@ -11,7 +11,7 @@ # Training parameters BATCH_SIZE = 64 -NB_EPOCHS = 1 +EPOCHS = 1 # Number of tokens to save from the abstract, zero padded SAMPLE_LENGTH = 200 diff --git a/magpie/main.py b/magpie/main.py index 5b35ff6..b1557e2 100644 --- a/magpie/main.py +++ b/magpie/main.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals, print_function, division +import math import os import sys from six import string_types @@ -9,7 +10,7 @@ from magpie.base.document import Document from magpie.base.word2vec import train_word2vec, fit_scaler -from magpie.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, NB_EPOCHS +from magpie.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, EPOCHS from magpie.nn.input_data import get_data_for_model from magpie.nn.models import get_nn_model from magpie.utils import save_to_disk, load_from_disk @@ -38,7 +39,7 @@ def __init__(self, keras_model=None, word2vec_model=None, scaler=None, def train(self, train_dir, vocabulary, test_dir=None, callbacks=None, nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0, - nb_epochs=NB_EPOCHS, verbose=1): + epochs=EPOCHS, verbose=1): """ Train the model on given data :param train_dir: directory with data files. Text files should end with @@ -51,7 +52,7 @@ def train(self, train_dir, vocabulary, test_dir=None, callbacks=None, :param batch_size: size of one batch :param test_ratio: the ratio of samples that will be withheld from training and used for testing. This can be overridden by test_dir. - :param nb_epochs: number of epochs to train + :param epochs: number of epochs to train :param verbose: 0, 1 or 2. As in Keras. 
:return: History object @@ -99,7 +100,7 @@ def train(self, train_dir, vocabulary, test_dir=None, callbacks=None, x_train, y_train, batch_size=batch_size, - nb_epoch=nb_epochs, + epochs=epochs, validation_data=test_data, validation_split=test_ratio, callbacks=callbacks or [], @@ -108,7 +109,7 @@ def train(self, train_dir, vocabulary, test_dir=None, callbacks=None, def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None, nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, - nb_epochs=NB_EPOCHS, verbose=1): + epochs=EPOCHS, verbose=1): """ Train the model on given data :param train_dir: directory with data files. Text files should end with @@ -119,7 +120,7 @@ def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None, :param callbacks: objects passed to the Keras fit function as callbacks :param nn_model: string defining the NN architecture e.g. 'crnn' :param batch_size: size of one batch - :param nb_epochs: number of epochs to train + :param epochs: number of epochs to train :param verbose: 0, 1 or 2. As in Keras. :return: History object @@ -163,10 +164,13 @@ def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None, scaler=self.scaler, ) + nb_of_files = len({filename[:-4] for filename in os.listdir(train_dir)}) + steps_per_epoch = math.ceil(nb_of_files / batch_size) + return self.keras_model.fit_generator( train_generator, - len({filename[:-4] for filename in os.listdir(train_dir)}), - nb_epochs, + steps_per_epoch=steps_per_epoch, + epochs=epochs, validation_data=test_data, callbacks=callbacks or [], verbose=verbose, diff --git a/magpie/tests/test_api.py b/magpie/tests/test_api.py index e3d4970..8038061 100644 --- a/magpie/tests/test_api.py +++ b/magpie/tests/test_api.py @@ -17,7 +17,7 @@ def test_integrity(self): from magpie import MagpieModel model = MagpieModel() model.init_word_vectors(DATA_DIR, vec_dim=100) - history = model.train(DATA_DIR, labels, test_ratio=0.3, nb_epochs=3) + history = model.train(DATA_DIR, labels, test_ratio=0.3, epochs=3) assert history is not None # Do a simple prediction From 28b8b9d39f53d8327dbf658048a81b7046ae398f Mon Sep 17 00:00:00 2001 From: Jan Stypka Date: Sun, 8 Oct 2017 13:41:21 +0200 Subject: [PATCH 2/6] nn: rewrite the NN models to use Keras 2.0 API --- magpie/nn/models.py | 61 +++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/magpie/nn/models.py b/magpie/nn/models.py index e049978..2d1df6b 100644 --- a/magpie/nn/models.py +++ b/magpie/nn/models.py @@ -1,8 +1,6 @@ -from keras.layers.convolutional import MaxPooling1D, Convolution1D -from keras.layers.core import Flatten, Dropout, Dense, Merge -from keras.layers.normalization import BatchNormalization -from keras.layers.recurrent import GRU -from keras.models import Sequential +from keras.layers import Input, Dense, GRU, Dropout, BatchNormalization, \ + MaxPooling1D, Conv1D, Flatten, Concatenate +from keras.models import Model from magpie.config import SAMPLE_LENGTH @@ -18,31 +16,33 @@ def get_nn_model(nn_model, embedding, output_length): def cnn(embedding_size, output_length): """ Create and return a keras model of a CNN """ + NB_FILTER = 256 NGRAM_LENGTHS = [1, 2, 3, 4, 5] - conv_layers = [] + conv_layers, inputs = [], [] + for ngram_length in NGRAM_LENGTHS: - ngram_layer = Sequential() - ngram_layer.add(Convolution1D( + current_input = Input(shape=(SAMPLE_LENGTH, embedding_size)) + inputs.append(current_input) + + convolution = Conv1D( NB_FILTER, ngram_length, - input_dim=embedding_size, - 
input_length=SAMPLE_LENGTH, - init='lecun_uniform', + kernel_initializer='lecun_uniform', activation='tanh', - )) - pool_length = SAMPLE_LENGTH - ngram_length + 1 - ngram_layer.add(MaxPooling1D(pool_length=pool_length)) - conv_layers.append(ngram_layer) + )(current_input) - model = Sequential() - model.add(Merge(conv_layers, mode='concat')) + pool_size = SAMPLE_LENGTH - ngram_length + 1 + pooling = MaxPooling1D(pool_size=pool_size)(convolution) + conv_layers.append(pooling) - model.add(Dropout(0.5)) - model.add(Flatten()) + merged = Concatenate()(conv_layers) + dropout = Dropout(0.5)(merged) + flattened = Flatten()(dropout) + outputs = Dense(output_length, activation='sigmoid')(flattened) - model.add(Dense(output_length, activation='sigmoid')) + model = Model(inputs=inputs, outputs=outputs) model.compile( loss='binary_crossentropy', @@ -57,20 +57,21 @@ def rnn(embedding_size, output_length): """ Create and return a keras model of a RNN """ HIDDEN_LAYER_SIZE = 256 - model = Sequential() + inputs = Input(shape=(SAMPLE_LENGTH, embedding_size)) - model.add(GRU( + gru = GRU( HIDDEN_LAYER_SIZE, - input_dim=embedding_size, - input_length=SAMPLE_LENGTH, - init='glorot_uniform', - inner_init='normal', + input_shape=(SAMPLE_LENGTH, embedding_size), + kernel_initializer="glorot_uniform", + recurrent_initializer='normal', activation='relu', - )) - model.add(BatchNormalization()) - model.add(Dropout(0.1)) + )(inputs) + + batch_normalization = BatchNormalization()(gru) + dropout = Dropout(0.1)(batch_normalization) + outputs = Dense(output_length, activation='sigmoid')(dropout) - model.add(Dense(output_length, activation='sigmoid')) + model = Model(inputs=inputs, outputs=outputs) model.compile( loss='binary_crossentropy', From 058f55b3a1705fe17384da640f1f288f1117c60e Mon Sep 17 00:00:00 2001 From: Jan Stypka Date: Sun, 8 Oct 2017 14:24:25 +0200 Subject: [PATCH 3/6] tests: add a separate test for RNN and batch training --- magpie/tests/test_api.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/magpie/tests/test_api.py b/magpie/tests/test_api.py index 8038061..eec0248 100644 --- a/magpie/tests/test_api.py +++ b/magpie/tests/test_api.py @@ -8,7 +8,7 @@ class TestAPI(unittest.TestCase): """ Basic integration test """ - def test_integrity(self): + def test_cnn_train(self): # Get them labels! with io.open(DATA_DIR + '.labels', 'r') as f: labels = {line.rstrip('\n') for line in f} @@ -17,7 +17,28 @@ def test_integrity(self): from magpie import MagpieModel model = MagpieModel() model.init_word_vectors(DATA_DIR, vec_dim=100) - history = model.train(DATA_DIR, labels, test_ratio=0.3, epochs=3) + history = model.train(DATA_DIR, labels, nn_model='cnn', test_ratio=0.3, epochs=3) + assert history is not None + + # Do a simple prediction + predictions = model.predict_from_text("Black holes are cool!") + assert len(predictions) == len(labels) + + # Assert the hell out of it! + for lab, val in predictions: + assert lab in labels + assert 0 <= val <= 1 + + def test_rnn_batch_train(self): + # Get them labels! 
+ with io.open(DATA_DIR + '.labels', 'r') as f: + labels = {line.rstrip('\n') for line in f} + + # Run the model + from magpie import MagpieModel + model = MagpieModel() + model.init_word_vectors(DATA_DIR, vec_dim=100) + history = model.batch_train(DATA_DIR, labels, nn_model='rnn', epochs=3) assert history is not None # Do a simple prediction From 80750873ab6fc765bb9c0f556093ed72e0ed4ddd Mon Sep 17 00:00:00 2001 From: Jan Stypka Date: Sun, 8 Oct 2017 14:38:42 +0200 Subject: [PATCH 4/6] api: rename MagpieModel to Magpie --- README.md | 13 ++++++------- magpie/__init__.py | 2 +- magpie/main.py | 2 +- magpie/tests/test_api.py | 8 ++++---- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 2adf4de..258a482 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,7 @@ Magpie is a deep learning tool for multi-label text classification. It learns on ## Very short introduction ``` ->>> from magpie import MagpieModel ->>> magpie = MagpieModel() +>>> magpie = Magpie() >>> magpie.init_word_vectors('/path/to/corpus', vec_dim=100) >>> magpie.train('/path/to/corpus', ['label1', 'label2', 'label3'], epochs=3) Training... @@ -24,9 +23,9 @@ $ ls data/hep-categories Before you train the model, you need to build appropriate word vector representations for your corpus. In theory, you can train them on a different corpus or reuse already trained ones ([tutorial](http://rare-technologies.com/word2vec-tutorial/)), however Magpie enables you to do that as well. ```python -from magpie import MagpieModel +from magpie import Magpie -magpie = MagpieModel() +magpie = Magpie() magpie.train_word2vec('data/hep-categories', vec_dim=100) ``` @@ -41,7 +40,7 @@ You would usually want to combine those two steps, by simply running: magpie.init_word_vectors('data/hep-categories', vec_dim=100) ``` -If you plan to reuse the trained word representations, you might want to save them and pass in the constructor to `MagpieModel` next time. For the training, just type: +If you plan to reuse the trained word representations, you might want to save them and pass in the constructor to `Magpie` next time. For the training, just type: ```python labels = ['Gravitation and Cosmology', 'Experiment-HEP', 'Theory-HEP'] magpie.train('data/hep-categories', labels, test_ratio=0.2, epochs=30) @@ -63,7 +62,7 @@ Trained models can be used for prediction with methods: ('Theory-HEP', 0.20917746)] ``` ## Saving & loading the model -A `MagpieModel` object consists of three components - the word2vec mappings, a scaler and a `keras` model. In order to train Magpie you can either provide the word2vec mappings and a scaler in advance or let the program compute them for you on the training data. Usually you would want to train them yourself on a full dataset and reuse them afterwards. You can use the provided functions for that purpose: +A `Magpie` object consists of three components - the word2vec mappings, a scaler and a `keras` model. In order to train Magpie you can either provide the word2vec mappings and a scaler in advance or let the program compute them for you on the training data. Usually you would want to train them yourself on a full dataset and reuse them afterwards. 
You can use the provided functions for that purpose: ```python magpie.save_word2vec_model('/save/my/embeddings/here') @@ -74,7 +73,7 @@ magpie.save_model('/save/my/model/here.h5') When you want to reinitialize your trained model, you can run: ```python -magpie = MagpieModel( +magpie = Magpie( keras_model='/save/my/model/here.h5', word2vec_model='/save/my/embeddings/here', scaler='/save/my/scaler/here', diff --git a/magpie/__init__.py b/magpie/__init__.py index d95e68d..fb6d077 100644 --- a/magpie/__init__.py +++ b/magpie/__init__.py @@ -1 +1 @@ -from .main import MagpieModel +from .main import Magpie diff --git a/magpie/main.py b/magpie/main.py index b1557e2..8172748 100644 --- a/magpie/main.py +++ b/magpie/main.py @@ -16,7 +16,7 @@ from magpie.utils import save_to_disk, load_from_disk -class MagpieModel(object): +class Magpie(object): def __init__(self, keras_model=None, word2vec_model=None, scaler=None, labels=None): diff --git a/magpie/tests/test_api.py b/magpie/tests/test_api.py index eec0248..24eca29 100644 --- a/magpie/tests/test_api.py +++ b/magpie/tests/test_api.py @@ -2,6 +2,8 @@ import os import unittest +from magpie import Magpie + # This one is hacky, but I'm too lazy to do it properly! PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'hep-categories') @@ -14,8 +16,7 @@ def test_cnn_train(self): labels = {line.rstrip('\n') for line in f} # Run the model - from magpie import MagpieModel - model = MagpieModel() + model = Magpie() model.init_word_vectors(DATA_DIR, vec_dim=100) history = model.train(DATA_DIR, labels, nn_model='cnn', test_ratio=0.3, epochs=3) assert history is not None @@ -35,8 +36,7 @@ def test_rnn_batch_train(self): labels = {line.rstrip('\n') for line in f} # Run the model - from magpie import MagpieModel - model = MagpieModel() + model = Magpie() model.init_word_vectors(DATA_DIR, vec_dim=100) history = model.batch_train(DATA_DIR, labels, nn_model='rnn', epochs=3) assert history is not None From 28b47ccb6f8b93d02df7db9645b0eaeecfc46a8d Mon Sep 17 00:00:00 2001 From: Jan Stypka Date: Sun, 8 Oct 2017 14:48:38 +0200 Subject: [PATCH 5/6] api: bump the version to 2.0 :tada: --- README.md | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 258a482..33a02b2 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ or just pass the objects directly! The package is not on PyPi, but you can get it directly from GitHub: ``` -$ pip install git+https://github.com/inspirehep/magpie.git@v1.0 +$ pip install git+https://github.com/inspirehep/magpie.git@v2.0 ``` If you encounter any problems with the installation, make sure to install the correct versions of dependencies listed in `setup.py` file. diff --git a/setup.py b/setup.py index cb844ff..a29f133 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.0', + version='2.0', description='Automatic text classification tool', # long_description=long_description, @@ -73,7 +73,7 @@ 'scipy~=0.18', 'gensim~=0.13', 'scikit-learn~=0.18', - 'keras~=1.2.2', + 'keras~=2.0', 'h5py~=2.6', ], From 0b7f6f985dd98de11f02c0442591167e8bdd2bdf Mon Sep 17 00:00:00 2001 From: Jan Stypka Date: Sun, 8 Oct 2017 17:59:21 +0200 Subject: [PATCH 6/6] docs: add a paragraph about v1.0 vs v2.0 --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 33a02b2..fbe6da6 100644 --- a/README.md +++ b/README.md @@ -90,5 +90,8 @@ $ pip install git+https://github.com/inspirehep/magpie.git@v2.0 ``` If you encounter any problems with the installation, make sure to install the correct versions of dependencies listed in `setup.py` file. +## Magpie v1.0 vs v2.0 +Magpie v1.0 depends on Keras v1.X, while Magpie v2.0 on Keras v2.X. You can install and use either of those, but bear in mind that only v2.0 will be developed in the future. If you have troubles with installation, make sure that both Magpie and Keras have the same major version. + ## Contact If you have any problems, feel free to open an issue. We'll do our best to help :+1:
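
A minimal before/after sketch of the user-facing changes this series makes (the `MagpieModel` → `Magpie` rename from patch 4 and the `nb_epochs` → `epochs` argument rename from patch 1), assembled from the README examples shown in the patches above; the corpus path and labels are the `hep-categories` ones used throughout the README:

```python
# Magpie v1.0 (Keras 1.x) -- the pre-series API, for comparison:
#   from magpie import MagpieModel
#   magpie = MagpieModel()
#   magpie.train('data/hep-categories', labels, test_ratio=0.2, nb_epochs=30)

# Magpie v2.0 (Keras 2.x) -- the API after this series:
from magpie import Magpie

labels = ['Gravitation and Cosmology', 'Experiment-HEP', 'Theory-HEP']

magpie = Magpie()
magpie.init_word_vectors('data/hep-categories', vec_dim=100)   # train word2vec + scaler
magpie.train('data/hep-categories', labels, test_ratio=0.2, epochs=30)

print(magpie.predict_from_text('Black holes are cool!'))        # [(label, confidence), ...]
```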
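
For readers skimming the interleaved diff in patch 2, the sketch below restates the Sequential-to-functional-API pattern it applies to the CNN in `magpie/nn/models.py`. It is illustrative rather than a copy of the final file: `SAMPLE_LENGTH` follows `magpie/config.py`, the embedding size of 100 matches the `vec_dim=100` used in the README examples, the three-label output mirrors the README's label list, and the `compile()` call is omitted because the patch only shows its `binary_crossentropy` loss.

```python
from keras.layers import (Input, Dense, Conv1D, MaxPooling1D,
                          Concatenate, Dropout, Flatten)
from keras.models import Model

SAMPLE_LENGTH = 200        # from magpie/config.py
embedding_size = 100       # matches vec_dim=100 in the README examples
NB_FILTER = 256
NGRAM_LENGTHS = [1, 2, 3, 4, 5]

# Keras 1.x built one Sequential() per n-gram branch and joined them with
# Merge(mode='concat'); in Keras 2.x each branch is an explicit tensor graph.
inputs, branches = [], []
for ngram_length in NGRAM_LENGTHS:
    branch_input = Input(shape=(SAMPLE_LENGTH, embedding_size))
    conv = Conv1D(
        NB_FILTER, ngram_length,
        kernel_initializer='lecun_uniform',   # was init= in Keras 1.x
        activation='tanh',
    )(branch_input)
    # pool_size replaces the Keras 1.x pool_length argument
    pooled = MaxPooling1D(pool_size=SAMPLE_LENGTH - ngram_length + 1)(conv)
    inputs.append(branch_input)
    branches.append(pooled)

merged = Concatenate()(branches)              # replaces Merge(conv_layers, mode='concat')
flattened = Flatten()(Dropout(0.5)(merged))
outputs = Dense(3, activation='sigmoid')(flattened)   # 3 labels, as in the README example

model = Model(inputs=inputs, outputs=outputs)
model.summary()
```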