Adding SQuAD 2.0 support
jacobdevlin-google committed Nov 15, 2018
1 parent 9d81f96 commit 6045470
Showing 6 changed files with 268 additions and 95 deletions.
80 changes: 79 additions & 1 deletion README.md
@@ -1,5 +1,11 @@
# BERT

**\*\*\*\*\* New November 15th, 2018: SOTA SQuAD 2.0 System \*\*\*\*\***

We released code changes to reproduce our 83% F1 SQuAD 2.0 system, which is
currently 1st place on the leaderboard by 3%. See the SQuAD 2.0 section of the
README for details.

**\*\*\*\*\* New November 5th, 2018: Third-party PyTorch and Chainer versions of
BERT available \*\*\*\*\***

@@ -342,7 +348,7 @@ python run_classifier.py \
--output_dir=/tmp/mrpc_output/
```

-### SQuAD
+### SQuAD 1.1

The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. BERT (at the time of the release) obtains state-of-the-art
@@ -435,6 +441,78 @@ If you fine-tune for one epoch on
be even better, but you will need to convert TriviaQA into the SQuAD json
format.

### SQuAD 2.0

This model is also implemented and documented in `run_squad.py`.

To run on SQuAD 2.0, you will first need to download the dataset. The necessary
files can be found here:

* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)

Download these to some directory `$SQUAD_DIR`.
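For example, one minimal way to fetch all three (assuming `wget` is available; any download tool works):

```shell
mkdir -p $SQUAD_DIR
wget -O $SQUAD_DIR/train-v2.0.json https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
wget -O $SQUAD_DIR/dev-v2.0.json https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
wget -O $SQUAD_DIR/evaluate-v2.0.py "https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/"
```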

On Cloud TPU you can run with BERT-Large as follows:

```shell
python run_squad.py \
--vocab_file=$BERT_LARGE_DIR/vocab.txt \
--bert_config_file=$BERT_LARGE_DIR/bert_config.json \
--init_checkpoint=$BERT_LARGE_DIR/bert_model.ckpt \
--do_train=True \
--train_file=$SQUAD_DIR/train-v2.0.json \
--do_predict=True \
--predict_file=$SQUAD_DIR/dev-v2.0.json \
--train_batch_size=24 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=384 \
--doc_stride=128 \
--output_dir=gs://some_bucket/squad_large/ \
--use_tpu=True \
--tpu_name=$TPU_NAME \
--version_2_with_negative=True
```

We assume you have copied everything from the output directory to a local
directory called `./squad/`. The initial dev set predictions will be at
`./squad/predictions.json`, and the differences between the score of no answer
("") and the best non-null answer for each question will be in the file
`./squad/null_odds.json`.
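One way to do the copy, assuming the Cloud SDK's `gsutil` is installed and
authenticated (bucket path matching the `--output_dir` used above):

```shell
mkdir -p ./squad/
gsutil -m cp gs://some_bucket/squad_large/*.json ./squad/
```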

Run this script to tune a threshold for predicting null versus non-null answers:

```shell
python $SQUAD_DIR/evaluate-v2.0.py $SQUAD_DIR/dev-v2.0.json \
  ./squad/predictions.json --na-prob-file ./squad/null_odds.json
```
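If you would rather capture the threshold in a shell variable, one possible
approach is the following; it assumes the evaluation script prints its metrics
as JSON to stdout, which the official SQuAD 2.0 script does when no output file
is specified:

```shell
THRESH=$(python $SQUAD_DIR/evaluate-v2.0.py $SQUAD_DIR/dev-v2.0.json \
  ./squad/predictions.json --na-prob-file ./squad/null_odds.json \
  | python -c "import json, sys; print(json.load(sys.stdin)['best_f1_thresh'])")
```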

Assume the script outputs "best_f1_thresh" THRESH. (Typical values are between
-1.0 and -5.0.) You can now re-run the model to generate predictions with the
derived threshold, or alternatively you can extract the appropriate answers from
`./squad/nbest_predictions.json` (a sketch of that option follows the command
below).

```shell
python run_squad.py \
--vocab_file=$BERT_LARGE_DIR/vocab.txt \
--bert_config_file=$BERT_LARGE_DIR/bert_config.json \
--init_checkpoint=$BERT_LARGE_DIR/bert_model.ckpt \
--do_train=False \
--train_file=$SQUAD_DIR/train-v2.0.json \
--do_predict=True \
--predict_file=$SQUAD_DIR/dev-v2.0.json \
--train_batch_size=24 \
--learning_rate=3e-5 \
--num_train_epochs=2.0 \
--max_seq_length=384 \
--doc_stride=128 \
--output_dir=gs://some_bucket/squad_large/ \
--use_tpu=True \
--tpu_name=$TPU_NAME \
--version_2_with_negative=True \
--null_score_diff_threshold=$THRESH
```
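Alternatively, here is a minimal sketch of the post-processing route, which
applies the tuned threshold to `./squad/nbest_predictions.json` and
`./squad/null_odds.json` directly instead of re-running the model. The file
layouts are assumed from the outputs described above; treat this as
illustrative, not as part of the released code:

```python
import json

# Hypothetical threshold taken from evaluate-v2.0.py's "best_f1_thresh" output.
THRESH = -2.0

with open("./squad/null_odds.json") as f:
  # qid -> (score of no answer) - (score of best non-null answer)
  null_odds = json.load(f)
with open("./squad/nbest_predictions.json") as f:
  # qid -> list of candidate answers, best first
  nbest = json.load(f)

predictions = {}
for qid, candidates in nbest.items():
  # Highest-ranked non-empty candidate, or "" if every candidate is empty.
  best_non_null = next((c["text"] for c in candidates if c["text"]), "")
  # Mirror the rule used with --null_score_diff_threshold: predict
  # "no answer" when the null score wins by more than the threshold.
  predictions[qid] = "" if null_odds[qid] > THRESH else best_non_null

with open("./squad/predictions.json", "w") as f:
  json.dump(predictions, f, indent=2)
```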

### Out-of-memory issues

All experiments in the paper were fine-tuned on a Cloud TPU, which has 64GB of
61 changes: 27 additions & 34 deletions modeling.py
@@ -469,11 +469,6 @@ def embedding_postprocessor(input_tensor,
   seq_length = input_shape[1]
   width = input_shape[2]
 
-  if seq_length > max_position_embeddings:
-    raise ValueError("The seq length (%d) cannot be greater than "
-                     "`max_position_embeddings` (%d)" %
-                     (seq_length, max_position_embeddings))
-
   output = input_tensor
 
   if use_token_type:
@@ -494,37 +489,35 @@
     output += token_type_embeddings
 
   if use_position_embeddings:
-    full_position_embeddings = tf.get_variable(
-        name=position_embedding_name,
-        shape=[max_position_embeddings, width],
-        initializer=create_initializer(initializer_range))
-    # Since the position embedding table is a learned variable, we create it
-    # using a (long) sequence length `max_position_embeddings`. The actual
-    # sequence length might be shorter than this, for faster training of
-    # tasks that do not have long sequences.
-    #
-    # So `full_position_embeddings` is effectively an embedding table
-    # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
-    # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
-    # perform a slice.
-    if seq_length < max_position_embeddings:
+    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
+    with tf.control_dependencies([assert_op]):
+      full_position_embeddings = tf.get_variable(
+          name=position_embedding_name,
+          shape=[max_position_embeddings, width],
+          initializer=create_initializer(initializer_range))
+      # Since the position embedding table is a learned variable, we create it
+      # using a (long) sequence length `max_position_embeddings`. The actual
+      # sequence length might be shorter than this, for faster training of
+      # tasks that do not have long sequences.
+      #
+      # So `full_position_embeddings` is effectively an embedding table
+      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
+      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
+      # perform a slice.
       position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                      [seq_length, -1])
-    else:
-      position_embeddings = full_position_embeddings
-
-    num_dims = len(output.shape.as_list())
-
-    # Only the last two dimensions are relevant (`seq_length` and `width`), so
-    # we broadcast among the first dimensions, which is typically just
-    # the batch size.
-    position_broadcast_shape = []
-    for _ in range(num_dims - 2):
-      position_broadcast_shape.append(1)
-    position_broadcast_shape.extend([seq_length, width])
-    position_embeddings = tf.reshape(position_embeddings,
-                                     position_broadcast_shape)
-    output += position_embeddings
+      num_dims = len(output.shape.as_list())
+
+      # Only the last two dimensions are relevant (`seq_length` and `width`), so
+      # we broadcast among the first dimensions, which is typically just
+      # the batch size.
+      position_broadcast_shape = []
+      for _ in range(num_dims - 2):
+        position_broadcast_shape.append(1)
+      position_broadcast_shape.extend([seq_length, width])
+      position_embeddings = tf.reshape(position_embeddings,
+                                       position_broadcast_shape)
+      output += position_embeddings
 
   output = layer_norm_and_dropout(output, dropout_prob)
   return output
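For context on the `modeling.py` change above: the Python-side `ValueError`
could only fire when `seq_length` was known at graph-construction time, whereas
`tf.assert_less_equal` wrapped in `tf.control_dependencies` turns the check
into a graph op that also works for dynamic shapes. A standalone sketch of the
pattern (not taken from the repo):

```python
import numpy as np
import tensorflow as tf  # TF 1.x, as used by this repository

max_position_embeddings = 512

# A toy input whose sequence length is only known at run time.
inputs = tf.placeholder(tf.float32, shape=[None, None, 768])
seq_length = tf.shape(inputs)[1]  # a tensor, not a Python int

# The assertion becomes an op in the graph; ops created inside the
# control_dependencies block run only after the check passes.
assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
with tf.control_dependencies([assert_op]):
  position_ids = tf.range(seq_length)

with tf.Session() as sess:
  feed = {inputs: np.zeros((2, 5, 768), np.float32)}
  print(sess.run(position_ids, feed_dict=feed))  # [0 1 2 3 4]
```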
1 change: 1 addition & 0 deletions modeling_test.py
@@ -164,6 +164,7 @@ def assert_all_tensors_reachable(self, sess, outputs):
     graph = sess.graph
 
     ignore_strings = [
+        "^.*/assert_less_equal/.*$",
         "^.*/dilation_rate$",
         "^.*/Tensordot/concat$",
         "^.*/Tensordot/concat/axis$",
3 changes: 1 addition & 2 deletions run_classifier.py
@@ -607,9 +607,8 @@ def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
         num_labels, use_one_hot_embeddings)
 
     tvars = tf.trainable_variables()
-
+    initialized_variable_names = {}
     scaffold_fn = None
-    initialized_variable_names = []
     if init_checkpoint:
       (assignment_map, initialized_variable_names
       ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)