Padding examples for TPU eval/predictions and checking case match
jacobdevlin-google committed Dec 18, 2018
1 parent b8ba348 commit f39e881
Showing 8 changed files with 164 additions and 34 deletions.
12 changes: 7 additions & 5 deletions README.md
@@ -5,7 +5,7 @@

We uploaded a new multilingual model which does *not* perform any normalization
on the input (no lower casing, accent stripping, or Unicode normalization), and
additionally includes Thai and Mongolian.

**It is recommended to use this version for developing multilingual models,
especially on languages with non-Latin alphabets.**
@@ -38,8 +38,9 @@ repository.

We have made two new BERT models available:

* **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
(Not recommended, use `Multilingual Cased` instead)**: 102 languages,
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
@@ -228,8 +229,9 @@ The links to the models are here (right-click, 'Save link as...' on the name):
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
(Not recommended, use `Multilingual Cased` instead)**: 102 languages,
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
3 changes: 1 addition & 2 deletions create_pretraining_data.py
@@ -20,9 +20,8 @@

import collections
import random

import tokenization
import tensorflow as tf
import tokenization

flags = tf.flags

2 changes: 1 addition & 1 deletion multilingual.md
@@ -297,7 +297,7 @@ chosen because they are the top 100 languages with the largest Wikipedias:
* Volapük
* Waray-Waray
* Welsh
* West
* West Frisian
* Western Punjabi
* Yoruba

5 changes: 4 additions & 1 deletion optimization.py
@@ -76,6 +76,9 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)

# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
@@ -137,7 +140,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(param_name):
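The distinction drawn in that comment — decaying weights outside of Adam's moment estimates rather than folding an L2 term into the loss — can be seen in a minimal NumPy sketch. This is not code from the commit; `adam_step` and all constants below are made up for illustration.

```python
import numpy as np

def adam_step(grad, m, v, beta1=0.9, beta2=0.999, eps=1e-6):
  """One Adam moment update; returns the raw update direction and new moments."""
  m = beta1 * m + (1.0 - beta1) * grad
  v = beta2 * v + (1.0 - beta2) * grad * grad
  return m / (np.sqrt(v) + eps), m, v

param, m, v = 1.0, 0.0, 0.0
grad, lr, weight_decay = 0.5, 1e-4, 0.01

# L2-in-the-loss: the decay term is added to the gradient, so it flows through
# the m/v accumulators and gets rescaled by them.
l2_update, _, _ = adam_step(grad + weight_decay * param, m, v)

# Decoupled decay (the approach the comment describes): run Adam on the plain
# gradient, then add the decay term to the finished update.
decoupled_update, new_m, new_v = adam_step(grad, m, v)
decoupled_update += weight_decay * param

param = param - lr * decoupled_update
```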
109 changes: 89 additions & 20 deletions run_classifier.py
@@ -145,14 +145,33 @@ def __init__(self, guid, text_a, text_b=None, label=None):
self.label = label


class PaddingInputExample(object):
"""Fake example so the num input examples is a multiple of the batch size.
When running eval/predict on the TPU, we need to pad the number of examples
to be a multiple of the batch size, because the TPU requires a fixed batch
size. The alternative is to drop the last batch, which is bad because it means
the entire output data won't be generated.
We use this class instead of `None` because treating `None` as padding
batches could cause silent errors.
"""


class InputFeatures(object):
"""A single set of features of data."""

def __init__(self, input_ids, input_mask, segment_ids, label_id):
def __init__(self,
input_ids,
input_mask,
segment_ids,
label_id,
is_real_example=True):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
self.is_real_example = is_real_example


class DataProcessor(object):
@@ -358,6 +377,15 @@ def _create_examples(self, lines, set_type):
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""

if isinstance(example, PaddingInputExample):
return InputFeatures(
input_ids=[0] * max_seq_length,
input_mask=[0] * max_seq_length,
segment_ids=[0] * max_seq_length,
label_id=0,
is_real_example=False)

label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
@@ -393,7 +421,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
@@ -443,7 +471,8 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
label_id=label_id,
is_real_example=True)
return feature


@@ -469,9 +498,12 @@ def create_int_feature(values):
features["input_mask"] = create_int_feature(feature.input_mask)
features["segment_ids"] = create_int_feature(feature.segment_ids)
features["label_ids"] = create_int_feature([feature.label_id])
features["is_real_example"] = create_int_feature(
[int(feature.is_real_example)])

tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()


def file_based_input_fn_builder(input_file, seq_length, is_training,
@@ -483,6 +515,7 @@ def file_based_input_fn_builder(input_file, seq_length, is_training,
"input_mask": tf.FixedLenFeature([seq_length], tf.int64),
"segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.FixedLenFeature([], tf.int64),
"is_real_example": tf.FixedLenFeature([], tf.int64),
}

def _decode_record(record, name_to_features):
@@ -599,6 +632,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
label_ids = features["label_ids"]
is_real_example = None
if "is_real_example" in features:
is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
else:
is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

is_training = (mode == tf.estimator.ModeKeys.TRAIN)

@@ -643,24 +681,28 @@ def tpu_scaffold():
scaffold_fn=scaffold_fn)
elif mode == tf.estimator.ModeKeys.EVAL:

def metric_fn(per_example_loss, label_ids, logits):
def metric_fn(per_example_loss, label_ids, logits, is_real_example):
predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
accuracy = tf.metrics.accuracy(label_ids, predictions)
loss = tf.metrics.mean(per_example_loss)
accuracy = tf.metrics.accuracy(
labels=label_ids, predictions=predictions, weights=is_real_example)
loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
return {
"eval_accuracy": accuracy,
"eval_loss": loss,
}

eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
eval_metrics = (metric_fn,
[per_example_loss, label_ids, logits, is_real_example])
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode,
loss=total_loss,
eval_metrics=eval_metrics,
scaffold_fn=scaffold_fn)
else:
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn)
mode=mode,
predictions={"probabilities": probabilities},
scaffold_fn=scaffold_fn)
return output_spec

return model_fn
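To see why giving padding rows a weight of 0.0 leaves the reported metrics untouched, here is a small NumPy check that mirrors the weighted `tf.metrics.accuracy` call above; the label and prediction values are invented for illustration.

```python
import numpy as np

label_ids = np.array([1, 0, 1, 0, 0])                # last two rows are padding
predictions = np.array([1, 0, 0, 1, 1])
is_real_example = np.array([1.0, 1.0, 1.0, 0.0, 0.0])

# Weighted accuracy: padding rows (weight 0.0) contribute nothing to either the
# numerator or the denominator, mirroring tf.metrics.accuracy(..., weights=...).
correct = (label_ids == predictions).astype(np.float64)
weighted_accuracy = (correct * is_real_example).sum() / is_real_example.sum()

assert weighted_accuracy == 2.0 / 3.0   # only the three real rows are counted
```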
@@ -748,6 +790,9 @@ def main(_):
"xnli": XnliProcessor,
}

tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
FLAGS.init_checkpoint)

if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
raise ValueError(
"At least one of `do_train`, `do_eval` or `do_predict' must be True.")
@@ -836,22 +881,33 @@ def main(_):

if FLAGS.do_eval:
eval_examples = processor.get_dev_examples(FLAGS.data_dir)
num_actual_eval_examples = len(eval_examples)
if FLAGS.use_tpu:
# TPU requires a fixed batch size for all batches, therefore the number
# of examples must be a multiple of the batch size, or else examples
# will get dropped. So we pad with fake examples which are ignored
# later on. These do NOT count towards the metric (all tf.metrics
# support a per-instance weight, and these get a weight of 0.0).
while len(eval_examples) % FLAGS.eval_batch_size != 0:
eval_examples.append(PaddingInputExample())

eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
file_based_convert_examples_to_features(
eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

tf.logging.info("***** Running evaluation *****")
tf.logging.info(" Num examples = %d", len(eval_examples))
tf.logging.info(" Num examples = %d (%d actual, %d padding)",
len(eval_examples), num_actual_eval_examples,
len(eval_examples) - num_actual_eval_examples)
tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

# This tells the estimator to run through the entire set.
eval_steps = None
# However, if running eval on the TPU, you will need to specify the
# number of steps.
if FLAGS.use_tpu:
# Eval will be slightly WRONG on the TPU because it will truncate
# the last batch.
eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
assert len(eval_examples) % FLAGS.eval_batch_size == 0
eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

eval_drop_remainder = True if FLAGS.use_tpu else False
eval_input_fn = file_based_input_fn_builder(
@@ -871,20 +927,26 @@ def main(_):

if FLAGS.do_predict:
predict_examples = processor.get_test_examples(FLAGS.data_dir)
num_actual_predict_examples = len(predict_examples)
if FLAGS.use_tpu:
# TPU requires a fixed batch size for all batches, therefore the number
# of examples must be a multiple of the batch size, or else examples
# will get dropped. So we pad with fake examples which are ignored
# later on.
while len(predict_examples) % FLAGS.predict_batch_size != 0:
predict_examples.append(PaddingInputExample())

predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
file_based_convert_examples_to_features(predict_examples, label_list,
FLAGS.max_seq_length, tokenizer,
predict_file)

tf.logging.info("***** Running prediction*****")
tf.logging.info(" Num examples = %d", len(predict_examples))
tf.logging.info(" Num examples = %d (%d actual, %d padding)",
len(predict_examples), num_actual_predict_examples,
len(predict_examples) - num_actual_predict_examples)
tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

if FLAGS.use_tpu:
# Warning: According to tpu_estimator.py Prediction on TPU is an
# experimental feature and hence not supported here
raise ValueError("Prediction in TPU not supported")

predict_drop_remainder = True if FLAGS.use_tpu else False
predict_input_fn = file_based_input_fn_builder(
input_file=predict_file,
@@ -896,11 +958,18 @@ def main(_):

output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
with tf.gfile.GFile(output_predict_file, "w") as writer:
num_written_lines = 0
tf.logging.info("***** Predict results *****")
for prediction in result:
for (i, prediction) in enumerate(result):
probabilities = prediction["probabilities"]
if i >= num_actual_predict_examples:
break
output_line = "\t".join(
str(class_probability) for class_probability in prediction) + "\n"
str(class_probability)
for class_probability in probabilities) + "\n"
writer.write(output_line)
num_written_lines += 1
assert num_written_lines == num_actual_predict_examples


if __name__ == "__main__":
3 changes: 3 additions & 0 deletions run_squad.py
@@ -1096,6 +1096,9 @@ def close(self):

def validate_flags_or_throw(bert_config):
"""Validate the input FLAGS or throw an exception."""
tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
FLAGS.init_checkpoint)

if not FLAGS.do_train and not FLAGS.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")

56 changes: 55 additions & 1 deletion tokenization.py
@@ -19,11 +19,62 @@
from __future__ import print_function

import collections
import re
import unicodedata
import six
import tensorflow as tf


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
"uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
"multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
]

cased_models = [
"cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
"multi_cased_L-12_H-768_A-12"
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = "False"
case_name = "lowercased"
opposite_flag = "True"

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = "True"
case_name = "cased"
opposite_flag = "False"

if is_bad_config:
raise ValueError(
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
"However, `%s` seems to be a %s model, so you "
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
"how the model was pre-training. If this error is wrong, please "
"just comment out this check." % (actual_flag, init_checkpoint,
model_name, case_name, opposite_flag))
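A quick illustration of the heuristic follows; the checkpoint paths are hypothetical and only need to end in a known model directory followed by `bert_model.ckpt` for the regex above to pick up the model name.

```python
# Consistent: an uncased checkpoint with --do_lower_case=True passes silently.
validate_case_matches_checkpoint(
    do_lower_case=True,
    init_checkpoint="/tmp/uncased_L-12_H-768_A-12/bert_model.ckpt")

# Inconsistent: a cased checkpoint with --do_lower_case=True raises ValueError.
validate_case_matches_checkpoint(
    do_lower_case=True,
    init_checkpoint="/tmp/cased_L-12_H-768_A-12/bert_model.ckpt")
```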


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
@@ -84,7 +135,10 @@ def load_vocab(vocab_file):

def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
return [vocab[item] for item in items]
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
