Padding examples for TPU eval/predictions and checking case match
jacobdevlin-google committed Dec 18, 2018
1 parent b8ba348 commit f39e881
Showing 8 changed files with 164 additions and 34 deletions.
12 changes: 7 additions & 5 deletions README.md
@@ -5,7 +5,7 @@

We uploaded a new multilingual model which does *not* perform any normalization
on the input (no lower casing, accent stripping, or Unicode normalization), and
additionally includes Thai and Mongolian.

**It is recommended to use this version for developing multilingual models,
especially on languages with non-Latin alphabets.**
@@ -38,8 +38,9 @@ repository.

We have made two new BERT models available:

* **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
(Not recommended, use `Multilingual Cased` instead)**: 102 languages,
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
@@ -228,8 +229,9 @@ The links to the models are here (right-click, 'Save link as...' on the name):
24-layer, 1024-hidden, 16-heads, 340M parameters
* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)
(Not recommended, use `Multilingual Cased` instead)**: 102 languages,
12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
3 changes: 1 addition & 2 deletions create_pretraining_data.py
@@ -20,9 +20,8 @@

import collections
import random

import tokenization
import tensorflow as tf
import tokenization

flags = tf.flags

2 changes: 1 addition & 1 deletion multilingual.md
@@ -297,7 +297,7 @@ chosen because they are the top 100 languages with the largest Wikipedias:
* Volapük
* Waray-Waray
* Welsh
* West
* West Frisian
* Western Punjabi
* Yoruba

5 changes: 4 additions & 1 deletion optimization.py
@@ -76,6 +76,9 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)

# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
@@ -137,7 +140,7 @@ def apply_gradients(self, grads_and_vars, global_step=None, name=None):
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(param_name):
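The distinction drawn in that comment — decaying weights outside of Adam's moment estimates rather than folding an L2 term into the loss — can be seen in a minimal NumPy sketch. This is not code from the commit; `adam_step` and all constants below are made up for illustration.

```python
import numpy as np

def adam_step(grad, m, v, beta1=0.9, beta2=0.999, eps=1e-6):
  """One Adam moment update; returns the raw update direction and new moments."""
  m = beta1 * m + (1.0 - beta1) * grad
  v = beta2 * v + (1.0 - beta2) * grad * grad
  return m / (np.sqrt(v) + eps), m, v

param, m, v = 1.0, 0.0, 0.0
grad, lr, weight_decay = 0.5, 1e-4, 0.01

# L2-in-the-loss: the decay term is added to the gradient, so it flows through
# the m/v accumulators and gets rescaled by them.
l2_update, _, _ = adam_step(grad + weight_decay * param, m, v)

# Decoupled decay (the approach the comment describes): run Adam on the plain
# gradient, then add the decay term to the finished update.
decoupled_update, new_m, new_v = adam_step(grad, m, v)
decoupled_update += weight_decay * param

param = param - lr * decoupled_update
```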
109 changes: 89 additions & 20 deletions run_classifier.py
@@ -145,14 +145,33 @@ def __init__(self, guid, text_a, text_b=None, label=None):
self.label = label


class PaddingInputExample(object):
"""Fake example so the num input examples is a multiple of the batch size.
When running eval/predict on the TPU, we need to pad the number of examples
to be a multiple of the batch size, because the TPU requires a fixed batch
size. The alternative is to drop the last batch, which is bad because it means
the entire output data won't be generated.
We use this class instead of `None` because treating `None` as padding
batches could cause silent errors.
"""


class InputFeatures(object):
"""A single set of features of data."""

def __init__(self, input_ids, input_mask, segment_ids, label_id):
def __init__(self,
input_ids,
input_mask,
segment_ids,
label_id,
is_real_example=True):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
self.is_real_example = is_real_example


class DataProcessor(object):
@@ -358,6 +377,15 @@ def _create_examples(self, lines, set_type):
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""

if isinstance(example, PaddingInputExample):
return InputFeatures(
input_ids=[0] * max_seq_length,
input_mask=[0] * max_seq_length,
segment_ids=[0] * max_seq_length,
label_id=0,
is_real_example=False)

label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
@@ -393,7 +421,7 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
@@ -443,7 +471,8 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
label_id=label_id,
is_real_example=True)
return feature


@@ -469,9 +498,12 @@ def create_int_feature(values):
features["input_mask"] = create_int_feature(feature.input_mask)
features["segment_ids"] = create_int_feature(feature.segment_ids)
features["label_ids"] = create_int_feature([feature.label_id])
features["is_real_example"] = create_int_feature(
[int(feature.is_real_example)])

tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()


def file_based_input_fn_builder(input_file, seq_length, is_training,
@@ -483,6 +515,7 @@ def file_based_input_fn_builder(input_file, seq_length, is_training,
"input_mask": tf.FixedLenFeature([seq_length], tf.int64),
"segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
"label_ids": tf.FixedLenFeature([], tf.int64),
"is_real_example": tf.FixedLenFeature([], tf.int64),
}

def _decode_record(record, name_to_features):
@@ -599,6 +632,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
label_ids = features["label_ids"]
is_real_example = None
if "is_real_example" in features:
is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
else:
is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

is_training = (mode == tf.estimator.ModeKeys.TRAIN)

@@ -643,24 +681,28 @@ def tpu_scaffold():
scaffold_fn=scaffold_fn)
elif mode == tf.estimator.ModeKeys.EVAL:

def metric_fn(per_example_loss, label_ids, logits):
def metric_fn(per_example_loss, label_ids, logits, is_real_example):
predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
accuracy = tf.metrics.accuracy(label_ids, predictions)
loss = tf.metrics.mean(per_example_loss)
accuracy = tf.metrics.accuracy(
labels=label_ids, predictions=predictions, weights=is_real_example)
loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
return {
"eval_accuracy": accuracy,
"eval_loss": loss,
}

eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
eval_metrics = (metric_fn,
[per_example_loss, label_ids, logits, is_real_example])
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode,
loss=total_loss,
eval_metrics=eval_metrics,
scaffold_fn=scaffold_fn)
else:
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, predictions=probabilities, scaffold_fn=scaffold_fn)
mode=mode,
predictions={"probabilities": probabilities},
scaffold_fn=scaffold_fn)
return output_spec

return model_fn
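To see why giving padding rows a weight of 0.0 leaves the reported metrics untouched, here is a small NumPy check that mirrors the weighted `tf.metrics.accuracy` call above; the label and prediction values are invented for illustration.

```python
import numpy as np

label_ids = np.array([1, 0, 1, 0, 0])                # last two rows are padding
predictions = np.array([1, 0, 0, 1, 1])
is_real_example = np.array([1.0, 1.0, 1.0, 0.0, 0.0])

# Weighted accuracy: padding rows (weight 0.0) contribute nothing to either the
# numerator or the denominator, mirroring tf.metrics.accuracy(..., weights=...).
correct = (label_ids == predictions).astype(np.float64)
weighted_accuracy = (correct * is_real_example).sum() / is_real_example.sum()

assert weighted_accuracy == 2.0 / 3.0   # only the three real rows are counted
```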
@@ -748,6 +790,9 @@ def main(_):
"xnli": XnliProcessor,
}

tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
FLAGS.init_checkpoint)

if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
raise ValueError(
"At least one of `do_train`, `do_eval` or `do_predict' must be True.")
@@ -836,22 +881,33 @@ def main(_):

if FLAGS.do_eval:
eval_examples = processor.get_dev_examples(FLAGS.data_dir)
num_actual_eval_examples = len(eval_examples)
if FLAGS.use_tpu:
# TPU requires a fixed batch size for all batches, therefore the number
# of examples must be a multiple of the batch size, or else examples
# will get dropped. So we pad with fake examples which are ignored
# later on. These do NOT count towards the metric (all tf.metrics
# support a per-instance weight, and these get a weight of 0.0).
while len(eval_examples) % FLAGS.eval_batch_size != 0:
eval_examples.append(PaddingInputExample())

eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
file_based_convert_examples_to_features(
eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

tf.logging.info("***** Running evaluation *****")
tf.logging.info(" Num examples = %d", len(eval_examples))
tf.logging.info(" Num examples = %d (%d actual, %d padding)",
len(eval_examples), num_actual_eval_examples,
len(eval_examples) - num_actual_eval_examples)
tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

# This tells the estimator to run through the entire set.
eval_steps = None
# However, if running eval on the TPU, you will need to specify the
# number of steps.
if FLAGS.use_tpu:
# Eval will be slightly WRONG on the TPU because it will truncate
# the last batch.
eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
assert len(eval_examples) % FLAGS.eval_batch_size == 0
eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

eval_drop_remainder = True if FLAGS.use_tpu else False
eval_input_fn = file_based_input_fn_builder(
@@ -871,20 +927,26 @@ def main(_):

if FLAGS.do_predict:
predict_examples = processor.get_test_examples(FLAGS.data_dir)
num_actual_predict_examples = len(predict_examples)
if FLAGS.use_tpu:
# TPU requires a fixed batch size for all batches, therefore the number
# of examples must be a multiple of the batch size, or else examples
# will get dropped. So we pad with fake examples which are ignored
# later on.
while len(predict_examples) % FLAGS.predict_batch_size != 0:
predict_examples.append(PaddingInputExample())

predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
file_based_convert_examples_to_features(predict_examples, label_list,
FLAGS.max_seq_length, tokenizer,
predict_file)

tf.logging.info("***** Running prediction*****")
tf.logging.info(" Num examples = %d", len(predict_examples))
tf.logging.info(" Num examples = %d (%d actual, %d padding)",
len(predict_examples), num_actual_predict_examples,
len(predict_examples) - num_actual_predict_examples)
tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

if FLAGS.use_tpu:
# Warning: According to tpu_estimator.py Prediction on TPU is an
# experimental feature and hence not supported here
raise ValueError("Prediction in TPU not supported")

predict_drop_remainder = True if FLAGS.use_tpu else False
predict_input_fn = file_based_input_fn_builder(
input_file=predict_file,
@@ -896,11 +958,18 @@ def main(_):

output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
with tf.gfile.GFile(output_predict_file, "w") as writer:
num_written_lines = 0
tf.logging.info("***** Predict results *****")
for prediction in result:
for (i, prediction) in enumerate(result):
probabilities = prediction["probabilities"]
if i >= num_actual_predict_examples:
break
output_line = "\t".join(
str(class_probability) for class_probability in prediction) + "\n"
str(class_probability)
for class_probability in probabilities) + "\n"
writer.write(output_line)
num_written_lines += 1
assert num_written_lines == num_actual_predict_examples


if __name__ == "__main__":
3 changes: 3 additions & 0 deletions run_squad.py
@@ -1096,6 +1096,9 @@ def close(self):

def validate_flags_or_throw(bert_config):
"""Validate the input FLAGS or throw an exception."""
tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
FLAGS.init_checkpoint)

if not FLAGS.do_train and not FLAGS.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")

56 changes: 55 additions & 1 deletion tokenization.py
@@ -19,11 +19,62 @@
from __future__ import print_function

import collections
import re
import unicodedata
import six
import tensorflow as tf


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
"uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
"multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
]

cased_models = [
"cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
"multi_cased_L-12_H-768_A-12"
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = "False"
case_name = "lowercased"
opposite_flag = "True"

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = "True"
case_name = "cased"
opposite_flag = "False"

if is_bad_config:
raise ValueError(
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
"However, `%s` seems to be a %s model, so you "
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
"how the model was pre-training. If this error is wrong, please "
"just comment out this check." % (actual_flag, init_checkpoint,
model_name, case_name, opposite_flag))
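A quick illustration of the heuristic follows; the checkpoint paths are hypothetical and only need to end in a known model directory followed by `bert_model.ckpt` for the regex above to pick up the model name.

```python
# Consistent: an uncased checkpoint with --do_lower_case=True passes silently.
validate_case_matches_checkpoint(
    do_lower_case=True,
    init_checkpoint="/tmp/uncased_L-12_H-768_A-12/bert_model.ckpt")

# Inconsistent: a cased checkpoint with --do_lower_case=True raises ValueError.
validate_case_matches_checkpoint(
    do_lower_case=True,
    init_checkpoint="/tmp/cased_L-12_H-768_A-12/bert_model.ckpt")
```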


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
@@ -84,7 +135,10 @@ def load_vocab(vocab_file):

def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
return [vocab[item] for item in items]
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
