[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
@@ -0,0 +1,442 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
import tokenization
import tensorflow as tf
flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_string("input_file", None,
"Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
"output_file", None,
"Output TF example file (or comma-separated list of files).")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
"do_lower_case", True,
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20,
"Maximum number of masked LM predictions per sequence.")
flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
flags.DEFINE_integer(
"dupe_factor", 10,
"Number of times to duplicate the input data (with different masks).")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_float(
"short_seq_prob", 0.1,
"Probability of creating sequences which are shorter than the "
"maximum length.")
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.masked_lm_labels]))
s += "\n"
return s
def __repr__(self):
return self.__str__()
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_files):
"""Create TF example files from `TrainingInstance`s."""
writers = []
for output_file in output_files:
writers.append(tf.python_io.TFRecordWriter(output_file))
writer_index = 0
total_written = 0
for (inst_index, instance) in enumerate(instances):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
features["next_sentence_labels"] = create_int_feature([next_sentence_label])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writers[writer_index].write(tf_example.SerializeToString())
writer_index = (writer_index + 1) % len(writers)
total_written += 1
if inst_index < 20:
tf.logging.info("*** Example ***")
tf.logging.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in instance.tokens]))
for feature_name in features.keys():
feature = features[feature_name]
values = []
if feature.int64_list.value:
values = feature.int64_list.value
elif feature.float_list.value:
values = feature.float_list.value
tf.logging.info(
"%s: %s" % (feature_name, " ".join([str(x) for x in values])))
for writer in writers:
writer.close()
tf.logging.info("Wrote %d total instances", total_written)
def create_int_feature(values):
feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return feature
def create_float_feature(values):
feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
return feature
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
with tf.gfile.GFile(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
tokens = tokenizer.tokenize(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(0, len(all_documents) - 1)
if random_document_index != document_index:
break
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
input_files = []
for input_pattern in FLAGS.input_file.split(","):
input_files.extend(tf.gfile.Glob(input_pattern))
tf.logging.info("*** Reading from input files ***")
for input_file in input_files:
tf.logging.info(" %s", input_file)
rng = random.Random(FLAGS.random_seed)
instances = create_training_instances(
input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
rng)
output_files = FLAGS.output_file.split(",")
tf.logging.info("*** Writing to output files ***")
for output_file in output_files:
tf.logging.info(" %s", output_file)
write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
FLAGS.max_predictions_per_seq, output_files)
if __name__ == "__main__":
flags.mark_flag_as_required("input_file")
flags.mark_flag_as_required("output_file")
flags.mark_flag_as_required("vocab_file")
tf.app.run()
@@ -0,0 +1,419 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import collections
import json
import re
import modeling
import tokenization
import tensorflow as tf
flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_string("input_file", None, "")
flags.DEFINE_string("output_file", None, "")
flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
flags.DEFINE_string(
"bert_config_file", None,
"The config json file corresponding to the pre-trained BERT model. "
"This specifies the model architecture.")
flags.DEFINE_integer(
"max_seq_length", 128,
"The maximum total input sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated, and sequences shorter "
"than this will be padded.")
flags.DEFINE_string(
"init_checkpoint", None,
"Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
"do_lower_case", True,
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
flags.DEFINE_string("master", None,
"If using a TPU, the address of the master.")
flags.DEFINE_integer(
"num_tpu_cores", 8,
"Only used if `use_tpu` is True. Total number of TPU cores to use.")
flags.DEFINE_bool(
"use_one_hot_embeddings", False,
"If True, tf.one_hot will be used for embedding lookups, otherwise "
"tf.nn.embedding_lookup will be used. On TPUs, this should be True "
"since it is much faster.")
class InputExample(object):
def __init__(self, unique_id, text_a, text_b):
self.unique_id = unique_id
self.text_a = text_a
self.text_b = text_b
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
self.unique_id = unique_id
self.tokens = tokens
self.input_ids = input_ids
self.input_mask = input_mask
self.input_type_ids = input_type_ids
def input_fn_builder(features, seq_length):
"""Creates an `input_fn` closure to be passed to TPUEstimator."""
all_unique_ids = []
all_input_ids = []
all_input_mask = []
all_input_type_ids = []
for feature in features:
all_unique_ids.append(feature.unique_id)
all_input_ids.append(feature.input_ids)
all_input_mask.append(feature.input_mask)
all_input_type_ids.append(feature.input_type_ids)
def input_fn(params):
"""The actual input function."""
batch_size = params["batch_size"]
num_examples = len(features)
# This is for demo purposes and does NOT scale to large data sets. We do
# not use Dataset.from_generator() because that uses tf.py_func which is
# not TPU compatible. The right way to load data is with TFRecordReader.
d = tf.data.Dataset.from_tensor_slices({
"unique_ids":
tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
"input_ids":
tf.constant(
all_input_ids, shape=[num_examples, seq_length],
dtype=tf.int32),
"input_mask":
tf.constant(
all_input_mask,
shape=[num_examples, seq_length],
dtype=tf.int32),
"input_type_ids":
tf.constant(
all_input_type_ids,
shape=[num_examples, seq_length],
dtype=tf.int32),
})
d = d.batch(batch_size=batch_size, drop_remainder=False)
return d
return input_fn
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
use_one_hot_embeddings):
"""Returns `model_fn` closure for TPUEstimator."""
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
"""The `model_fn` for TPUEstimator."""
unique_ids = features["unique_ids"]
input_ids = features["input_ids"]
input_mask = features["input_mask"]
input_type_ids = features["input_type_ids"]
model = modeling.BertModel(
config=bert_config,
is_training=False,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=input_type_ids,
use_one_hot_embeddings=use_one_hot_embeddings)
if mode != tf.estimator.ModeKeys.PREDICT:
raise ValueError("Only PREDICT modes are supported: %s" % (mode))
tvars = tf.trainable_variables()
scaffold_fn = None
(assignment_map,
initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
tvars, init_checkpoint)
if use_tpu:
def tpu_scaffold():
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
return tf.train.Scaffold()
scaffold_fn = tpu_scaffold
else:
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
tf.logging.info("**** Trainable Variables ****")
for var in tvars:
init_string = ""
if var.name in initialized_variable_names:
init_string = ", *INIT_FROM_CKPT*"
tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
init_string)
all_layers = model.get_all_encoder_layers()
predictions = {
"unique_id": unique_ids,
}
for (i, layer_index) in enumerate(layer_indexes):
predictions["layer_output_%d" % i] = all_layers[layer_index]
output_spec = tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
return output_spec
return model_fn
def convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0:(seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
if ex_index < 5:
tf.logging.info("*** Example ***")
tf.logging.info("unique_id: %s" % (example.unique_id))
tf.logging.info("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
tf.logging.info(
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
features.append(
InputFeatures(
unique_id=example.unique_id,
tokens=tokens,
input_ids=input_ids,
input_mask=input_mask,
input_type_ids=input_type_ids))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with tf.gfile.GFile(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
text_a = None
text_b = None
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
text_a = line
else:
text_a = m.group(1)
text_b = m.group(2)
examples.append(
InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
unique_id += 1
return examples
def main(_):
tf.logging.set_verbosity(tf.logging.INFO)
layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
master=FLAGS.master,
tpu_config=tf.contrib.tpu.TPUConfig(
num_shards=FLAGS.num_tpu_cores,
per_host_input_for_training=is_per_host))
examples = read_examples(FLAGS.input_file)
features = convert_examples_to_features(
examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
unique_id_to_feature[feature.unique_id] = feature
model_fn = model_fn_builder(
bert_config=bert_config,
init_checkpoint=FLAGS.init_checkpoint,
layer_indexes=layer_indexes,
use_tpu=FLAGS.use_tpu,
use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator = tf.contrib.tpu.TPUEstimator(
use_tpu=FLAGS.use_tpu,
model_fn=model_fn,
config=run_config,
predict_batch_size=FLAGS.batch_size)
input_fn = input_fn_builder(
features=features, seq_length=FLAGS.max_seq_length)
with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
"w")) as writer:
for result in estimator.predict(input_fn, yield_single_examples=True):
unique_id = int(result["unique_id"])
feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = result["layer_output_%d" % j]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(float(x), 6) for x in layer_output[i:(i + 1)].flat
]
all_layers.append(layers)
features = collections.OrderedDict()
features["token"] = token
features["layers"] = all_layers
all_features.append(features)
output_json["features"] = all_features
writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
flags.mark_flag_as_required("input_file")
flags.mark_flag_as_required("vocab_file")
flags.mark_flag_as_required("bert_config_file")
flags.mark_flag_as_required("init_checkpoint")
flags.mark_flag_as_required("output_file")
tf.app.run()
@@ -0,0 +1,35 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
initializer=None, regularizer=None,
trainable=True,
*args, **kwargs):
"""Custom variable getter that forces trainable variables to be stored in
float32 precision and then casts them to the training precision.
"""
storage_dtype = tf.float32 if trainable else dtype
variable = getter(name, shape, dtype=storage_dtype,
initializer=initializer, regularizer=regularizer,
trainable=trainable,
*args, **kwargs)
if trainable and dtype != tf.float32:
variable = tf.cast(variable, dtype)
return variable
@@ -0,0 +1,141 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import copy
import json
import math
import re
import six
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.python.ops import init_ops
import numpy
from tensorflow.python.ops import array_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import nn
def fused_layer_norm(inputs,
center=True,
scale=True,
activation_fn=None,
reuse=None,
variables_collections=None,
outputs_collections=None,
trainable=True,
begin_norm_axis=1,
begin_params_axis=-1,
scope=None,
use_fused_batch_norm=False):
with tf.variable_scope(
scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
inputs = ops.convert_to_tensor(inputs)
inputs_shape = inputs.shape
inputs_rank = inputs_shape.ndims
if inputs_rank is None:
raise ValueError('Inputs %s has undefined rank.' % inputs.name)
dtype = inputs.dtype.base_dtype
if begin_norm_axis < 0:
begin_norm_axis = inputs_rank + begin_norm_axis
if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
'must be < rank(inputs) (%d)' %
(begin_params_axis, begin_norm_axis, inputs_rank))
params_shape = inputs_shape[begin_params_axis:]
if not params_shape.is_fully_defined():
raise ValueError(
'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
(inputs.name, begin_params_axis, inputs_shape))
# Allocate parameters for the beta and gamma of the normalization.
beta, gamma = None, None
if center:
beta_collections = utils.get_variable_collections(variables_collections,
'beta')
beta = variables.model_variable(
'beta',
shape=params_shape,
dtype=dtype,
initializer=init_ops.zeros_initializer(),
collections=beta_collections,
trainable=trainable)
if scale:
gamma_collections = utils.get_variable_collections(
variables_collections, 'gamma')
gamma = variables.model_variable(
'gamma',
shape=params_shape,
dtype=dtype,
initializer=init_ops.ones_initializer(),
collections=gamma_collections,
trainable=trainable)
if use_fused_batch_norm:
# get static TensorShape if fully defined,
# otherwise retrieve shape tensor
norm_shape = inputs.shape[begin_norm_axis:]
if norm_shape.is_fully_defined():
bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
else:
norm_shape = tf.shape(inputs)[begin_norm_axis:]
bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
if inputs.get_shape().is_fully_defined():
outputs_shape = inputs.get_shape()
else:
outputs_shape = tf.shape(inputs)
inputs = array_ops.reshape(inputs, bn_shape)
if inputs.get_shape().is_fully_defined():
# static inputs TensorShape fully defined after reshape.
ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32)
zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32)
else:
# static inputs TensorShape NOT fully defined after reshape.
# must use dynamic shape, which means these input tensors
# have to be created at runtime, which causes a slowdown.
scale_shape = tf.shape(inputs)[1]
ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
outputs, mean, variance = nn.fused_batch_norm(
inputs,
ones, zeros,
epsilon=1e-4,
data_format="NCHW")
outputs = array_ops.reshape(outputs, outputs_shape)
if center and scale:
outputs = outputs * gamma + beta
elif center:
outputs = outputs + beta
elif scale:
outputs = outputs * gamma
else:
# Calculate the moments on the last axis (layer activations).
norm_axes = list(range(begin_norm_axis, inputs_rank))
mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
# Compute layer normalization using the batch_normalization function.
variance_epsilon = 1e-4
outputs = nn.batch_normalization(
inputs,
mean,
variance,
offset=beta,
scale=gamma,
variance_epsilon=variance_epsilon)
outputs.set_shape(inputs_shape)
if activation_fn is not None:
outputs = activation_fn(outputs)
return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
@@ -0,0 +1,36 @@
# coding=utf-8
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
initializer=None, regularizer=None,
trainable=True,
*args, **kwargs):
"""Custom variable getter that forces trainable variables to be stored in
float32 precision and then casts them to the training precision.
"""
storage_dtype = tf.float32 if trainable else dtype
variable = getter(name, shape, dtype=storage_dtype,
initializer=initializer, regularizer=regularizer,
trainable=trainable,
*args, **kwargs)
if trainable and dtype != tf.float32:
variable = tf.cast(variable, dtype)
return variable
def get_custom_getter(compute_type):
return float32_variable_storage_getter if compute_type == tf.float16 else None
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,439 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
from npu_bridge.estimator.npu import npu_loss_scale_optimizer as lso
from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False, num_accumulation_steps=1,
optimizer_type="adam", allreduce_post_accumulation=False):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
# avoid step change in learning rate at end of warmup phase
if optimizer_type == "adam":
power = 1.0
decayed_learning_rate_at_crossover_point = init_lr * (
(1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
else:
power = 0.5
decayed_learning_rate_at_crossover_point = init_lr
adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, adjusted_init_lr))
learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=power,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
if optimizer_type == "lamb":
print("Initializing LAMB Optimizer")
optimizer = LAMBOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
else:
print("Initializing ADAM Weight Decay Optimizer")
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = AdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-4,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
optimizer = NPUDistributedOptimizer(optimizer)
if tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]:
opt_tmp = optimizer
if tf.flags.FLAGS.npu_bert_loss_scale == 0:
loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(init_loss_scale=tf.flags.FLAGS.init_loss_scale_value, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
elif tf.flags.FLAGS.npu_bert_loss_scale >= 1:
loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=tf.flags.FLAGS.npu_bert_loss_scale)
else:
raise ValueError("Invalid loss scale: %d" % tf.flags.FLAGS.npu_bert_loss_scale)
optimizer = lso.NPULossScaleOptimizer(opt_tmp, loss_scale_manager, is_distributed=tf.flags.FLAGS.distributed)
tvars = tf.trainable_variables()
grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)
if num_accumulation_steps > 1:
local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
initializer=tf.zeros_initializer)
batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
initializer=tf.ones_initializer)
accum_vars = [tf.get_variable(
name=tvar.name.split(":")[0] + "/accum",
shape=tvar.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]
reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
local_step = tf.cond(reset_step, lambda:local_step.assign(tf.ones_like(local_step)), lambda:local_step.assign_add(1))
with tf.name_scope(accumulate_step):
grads_and_vars_and_accums = [(gv[0],gv[1],accum_vars[i]) for i, gv in enumerate(grads_and_vars) if gv[0] is not None]
grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))
all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if (tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]) and (manual_fp16 or use_fp16) else tf.constant(True, dtype=tf.bool)
batch_finite = tf.cond(reset_step,
lambda: batch_finite.assign(tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
lambda:batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))
# This is how the model was pre-trained.
# ensure global norm is a finite number
# to prevent clip_by_global_norm from having a hizzy fit.
if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
(clipped_grads, _) = tf.clip_by_global_norm(
grads, clip_norm=1.0,
use_norm=tf.cond(
all_are_finite,
lambda: tf.global_norm(grads),
lambda: tf.constant(1.0)))
else:
with tf.name_scope("clip_grads"):
clipped_grads = [
(tf.clip_by_norm(grad, clip_norm=1.0))
if grad is not None else (grad, var) for grad in grads
]
accum_vars = tf.cond(reset_step,
lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])
def update(accum_vars):
with tf.name_scope("opt_update"):
if allreduce_post_accumulation and hvd is not None:
accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var), compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) if isinstance(accum_var, tf.IndexedSlices)
else hvd.allreduce(accum_var, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) for accum_var in accum_vars]
return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)
update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool), name="update_step")
update_op = tf.cond(update_step,
lambda: update(accum_vars), lambda: tf.no_op())
new_global_step = tf.cond(tf.math.logical_and(update_step, tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)), lambda: global_step+1, lambda: global_step)
new_global_step = tf.identity(new_global_step, name='step_update')
train_op = tf.group(update_op, [global_step.assign(new_global_step)])
else:
grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
grads, tvars = list(zip(*grads_and_vars))
if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
all_are_finite = tf.reduce_all(
[tf.reduce_all(tf.is_finite(g)) for g in grads]) if (tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]) and (use_fp16 or manual_fp16) else tf.constant(True, dtype=tf.bool)
# This is how the model was pre-trained.
# ensure global norm is a finite number
# to prevent clip_by_global_norm from having a hizzy fit.
if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
(clipped_grads, _) = tf.clip_by_global_norm(
grads, clip_norm=1.0,
use_norm=tf.cond(
all_are_finite,
lambda: tf.global_norm(grads),
lambda: tf.constant(1.0)))
else:
with tf.name_scope("clip_grads"):
clipped_grads = [
(tf.clip_by_norm(grad, clip_norm=1.0))
if grad is not None else (grad, var) for grad in grads
]
with tf.name_scope("apply_grads"):
train_op = optimizer.apply_gradients(
list(zip(clipped_grads, tvars)), global_step=global_step)
#if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
# new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
#else:
# new_global_step = global_step + 1
#new_global_step = tf.identity(new_global_step, name='step_update')
#train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
class AdamWeightDecayOptimizer(tf.train.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-4,
exclude_from_weight_decay=None,
name="AdamWeightDecayOptimizer"):
"""Constructs a AdamWeightDecayOptimizer."""
super(AdamWeightDecayOptimizer, self).__init__(False, name)
self.learning_rate = tf.identity(learning_rate, name='learning_rate')
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self.exclude_from_weight_decay = exclude_from_weight_decay
def apply_gradients(self, grads_and_vars, global_step=None, name=None,
manual_fp16=False):
"""See base class."""
assignments = []
for (grad, param) in grads_and_vars:
with tf.name_scope("apply_one_adam"):
if grad is None or param is None:
continue
param_name = self._get_variable_name(param.name)
has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
if has_shadow:
# create shadow fp32 weights for fp16 variable
param_fp32 = tf.get_variable(
name=param_name + "/shadow",
dtype=tf.float32,
trainable=False,
initializer=tf.cast(param.initialized_value(),tf.float32))
else:
param_fp32 = param
m = tf.get_variable(
name=param_name + "/adam_m",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
v = tf.get_variable(
name=param_name + "/adam_v",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
# Standard Adam update.
next_m = (
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
next_v = (
tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
tf.square(grad)))
update = next_m / (tf.sqrt(next_v) + self.epsilon)
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(param_name):
update += self.weight_decay_rate * param_fp32
update_with_lr = self.learning_rate * update
next_param = param_fp32 - update_with_lr
if has_shadow:
# cast shadow fp32 weights to fp16 and assign to trainable variable
param.assign(tf.cast(next_param, param.dtype.base_dtype))
assignments.extend(
[param_fp32.assign(next_param),
m.assign(next_m),
v.assign(next_v)])
new_global_step = global_step + 1
new_global_step = tf.identity(new_global_step, name='step_update')
assignments.extend([global_step.assign(new_global_step)])
return tf.group(*assignments, name=name)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _get_variable_name(self, param_name):
"""Get the variable name from the tensor name."""
m = re.match("^(.*):\\d+$", param_name)
if m is not None:
param_name = m.group(1)
return param_name
class LAMBOptimizer(tf.train.Optimizer):
"""A LAMB optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=None,
name="LAMBOptimizer"):
"""Constructs a LAMBOptimizer."""
super(LAMBOptimizer, self).__init__(False, name)
self.learning_rate = tf.identity(learning_rate, name='learning_rate')
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self.exclude_from_weight_decay = exclude_from_weight_decay
self.steps = 0
def apply_gradients(self, grads_and_vars, global_step=None, name=None,
manual_fp16=False):
"""See base class."""
assignments = []
for (grad, param) in grads_and_vars:
with tf.name_scope("apply_one_lamb"):
if grad is None or param is None:
continue
param_name = self._get_variable_name(param.name)
has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
if has_shadow:
# create shadow fp32 weights for fp16 variable
param_fp32 = tf.get_variable(
name=param_name + "/shadow",
dtype=tf.float32,
trainable=False,
initializer=tf.cast(param.initialized_value(),tf.float32))
else:
param_fp32 = param
m = tf.get_variable(
name=param_name + "/adam_m",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
v = tf.get_variable(
name=param_name + "/adam_v",
shape=param.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer())
# LAMB update
next_m = (
tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
next_v = (
tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
tf.square(grad)))
self.steps += 1
beta1_correction = (1 - self.beta_1 ** self.steps)
beta2_correction = (1 - self.beta_2 ** self.steps)
next_m_unbiased = next_m / beta1_correction
next_v_unbiased = next_v / beta2_correction
update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(param_name):
update += self.weight_decay_rate * param_fp32
w_norm = linalg_ops.norm(param, ord=2)
g_norm = linalg_ops.norm(update, ord=2)
ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
update_with_lr = ratio * self.learning_rate * update
next_param = param_fp32 - update_with_lr
if has_shadow:
# cast shadow fp32 weights to fp16 and assign to trainable variable
param.assign(tf.cast(next_param, param.dtype.base_dtype))
assignments.extend(
[param_fp32.assign(next_param),
m.assign(next_m),
v.assign(next_v)])
new_global_step = global_step + 1
new_global_step = tf.identity(new_global_step, name='step_update')
assignments.extend([global_step.assign(new_global_step)])
return tf.group(*assignments, name=name)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _get_variable_name(self, param_name):
"""Get the variable name from the tensor name."""
m = re.match("^(.*):\\d+$", param_name)
if m is not None:
param_name = m.group(1)
return param_name
@@ -0,0 +1,784 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run masked LM/next sentence masked_lm pre-training for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import modeling
import optimization
import tensorflow as tf
import glob
from utils import LogEvalRunHook
from tensorflow.core.protobuf import rewriter_config_pb2
from gpu_environment import get_custom_getter
from npu_bridge.estimator.npu.npu_config import *
from npu_bridge.estimator.npu.npu_estimator import *
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
# import hwlog
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
os.environ['WHICH_OP'] = 'GEOP'
os.environ['NEW_GE_FE_ID'] = '1'
os.environ['GE_AICPU_FLAG'] = '1'
os.environ['GE_USE_STATIC_MEMORY'] = '1'
os.environ['OPTION_EXEC_HCCL_FLAG'] = '1'
os.environ['HCCL_CONNECT_TIMEOUT'] = '600'
flags = tf.flags
FLAGS = flags.FLAGS
## Required parameters
flags.DEFINE_string(
"bert_config_file", None,
"The config json file corresponding to the pre-trained BERT model. "
"This specifies the model architecture.")
flags.DEFINE_string(
"input_files_dir", "./data",
"Directory with input files, comma separated or single directory.")
flags.DEFINE_string(
"eval_files_dir", None,
"Directory with eval files, comma separated or single directory. ")
flags.DEFINE_string(
"output_dir", "./models",
"The output directory where the model checkpoints will be written.")
## Other parameters
flags.DEFINE_string(
"init_checkpoint", None,
"Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string(
"optimizer_type", "lamb",
"Optimizer used for training - LAMB or ADAM")
flags.DEFINE_integer(
"max_seq_length", 128,
"The maximum total input sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated, and sequences shorter "
"than this will be padded. Must match data generation.")
flags.DEFINE_integer(
"max_predictions_per_seq", 20,
"Maximum number of masked LM predictions per sequence. "
"Must match data generation.")
flags.DEFINE_bool("do_train", True, "Whether to run training.")
flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_integer("train_batch_size", 64, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_float("learning_rate", 1e-4, "The initial learning rate for Adam.")
flags.DEFINE_integer("num_train_steps", 1000000, "Number of training steps.")
flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
flags.DEFINE_integer("save_checkpoints_steps", 10000,
"How often to save the model checkpoint.")
flags.DEFINE_integer("display_loss_steps", 10,
"How often to print loss")
flags.DEFINE_integer("iterations_per_loop", 1000,
"How many steps to make in each estimator call.")
flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
flags.DEFINE_integer("num_accumulation_steps", 1,
"Number of accumulation steps before gradient update."
"Global batch size = num_accumulation_steps * train_batch_size")
flags.DEFINE_bool("allreduce_post_accumulation", False, "Whether to all reduce after accumulation of N steps or after each step")
flags.DEFINE_bool(
"verbose_logging", False,
"If true, all of the trainable parameters are printed")
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic on GPU. "
"Manual casting is done instead of using AMP")
flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.")
flags.DEFINE_bool("distributed", True, "Whether to use multi-npu")
flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')
flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')
flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')
flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")
flags.DEFINE_integer("npu_bert_loss_scale", 0, "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")
flags.DEFINE_bool("npu_bert_clip_by_global_norm", False, "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")
flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined gelu op')
flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')
flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')
flags.DEFINE_integer('init_loss_scale_value', 2**32, 'Initial loss scale value for loss scale optimizer')
# report samples/sec, total loss and learning rate during training
class _LogSessionRunHook(tf.train.SessionRunHook):
def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
self.global_batch_size = global_batch_size
self.display_every = display_every
self.hvd_rank = hvd_rank
self.num_accumulation_steps = num_accumulation_steps
def after_create_session(self, session, coord):
self.elapsed_secs = 0.
self.count = 0
self.all_count = 0
self.avg_loss = 0.0
def before_run(self, run_context):
self.t0 = time.time()
if self.num_accumulation_steps <= 1:
if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
return tf.train.SessionRunArgs(
fetches=['global_step:0', 'total_loss:0',
'learning_rate:0', 'nsp_loss:0',
'mlm_loss:0', 'loss_scale:0', 'apply_grads/All:0'])
else:
return tf.train.SessionRunArgs(
fetches=['global_step:0', 'total_loss:0',
'learning_rate:0', 'nsp_loss:0',
'mlm_loss:0'])
else:
if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
return tf.train.SessionRunArgs(
fetches=['global_step:0', 'update_step:0', 'total_loss:0',
'learning_rate:0', 'nsp_loss:0',
'mlm_loss:0', 'loss_scale:0'])
else:
return tf.train.SessionRunArgs(
fetches=['global_step:0', 'update_step:0', 'total_loss:0',
'learning_rate:0', 'nsp_loss:0',
'mlm_loss:0'])
def after_run(self, run_context, run_values):
self.elapsed_secs += time.time() - self.t0
if self.num_accumulation_steps <=1:
if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler, custom_arg = run_values.results
else:
global_step, total_loss, lr, nsp_loss, mlm_loss = run_values. \
results
update_step = True
else:
if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
global_step, update_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
else:
global_step, update_step, total_loss, lr, nsp_loss, mlm_loss = run_values.\
results
print_step = global_step + 1 # One-based index for printing.
self.avg_loss += total_loss
self.all_count += 1
if update_step:
self.count += 1
dt = self.elapsed_secs / self.count
sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
avg_loss_step = self.avg_loss / self.all_count
if self.hvd_rank >= 0:
if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e Loss scale = %6.4e isFinite = %6i' %
(self.hvd_rank, print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr, loss_scaler, custom_arg), flush=True)
hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
else:
print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
(self.hvd_rank, print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr), flush=True)
hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
else:
if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e Loss scale = %6.4e isFinite = %6i' %
(print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr, loss_scaler, custom_arg), flush=True)
hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
else:
print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
(print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr), flush=True)
hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
self.elapsed_secs = 0.
self.count = 0
self.avg_loss = 0.0
self.all_count = 0
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
num_train_steps, num_warmup_steps,
use_one_hot_embeddings, hvd=None):
"""Returns `model_fn` closure for TPUEstimator."""
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
"""The `model_fn` for TPUEstimator."""
tf.logging.info("*** Features ***")
for name in sorted(features.keys()):
tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
input_ids = features["input_ids"]
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
masked_lm_positions = features["masked_lm_positions"]
masked_lm_ids = features["masked_lm_ids"]
masked_lm_weights = features["masked_lm_weights"]
next_sentence_labels = features["next_sentence_labels"]
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
model = modeling.BertModel(
config=bert_config,
is_training=is_training,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=segment_ids,
use_one_hot_embeddings=use_one_hot_embeddings,
compute_type=tf.float16 if FLAGS.manual_fp16 else tf.float32)
(masked_lm_loss,
masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
bert_config, model.get_sequence_output(), model.get_embedding_table(),
masked_lm_positions, masked_lm_ids,
masked_lm_weights)
(next_sentence_loss, next_sentence_example_loss,
next_sentence_log_probs) = get_next_sentence_output(
bert_config, model.get_pooled_output(), next_sentence_labels)
masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
total_loss = masked_lm_loss + next_sentence_loss
total_loss = tf.identity(total_loss, name='total_loss')
tvars = tf.trainable_variables()
initialized_variable_names = {}
if init_checkpoint and (hvd is None or hvd.rank() == 0):
print("Loading checkpoint", init_checkpoint)
(assignment_map, initialized_variable_names
) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
if FLAGS.verbose_logging:
tf.logging.info("**** Trainable Variables ****")
for var in tvars:
init_string = ""
if var.name in initialized_variable_names:
init_string = ", *INIT_FROM_CKPT*"
tf.logging.info(" %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
init_string)
output_spec = None
if mode == tf.estimator.ModeKeys.TRAIN:
train_op = optimization.create_optimizer(
total_loss, learning_rate, num_train_steps, num_warmup_steps,
hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation)
output_spec = tf.estimator.EstimatorSpec(
mode=mode,
loss=total_loss,
train_op=train_op)
elif mode == tf.estimator.ModeKeys.EVAL:
def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
masked_lm_weights, next_sentence_example_loss,
next_sentence_log_probs, next_sentence_labels):
"""Computes the loss and accuracy of the model."""
masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
[-1, masked_lm_log_probs.shape[-1]])
masked_lm_predictions = tf.argmax(
masked_lm_log_probs, axis=-1, output_type=tf.int32)
masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
masked_lm_accuracy = tf.metrics.accuracy(
labels=masked_lm_ids,
predictions=masked_lm_predictions,
weights=masked_lm_weights)
masked_lm_mean_loss = tf.metrics.mean(
values=masked_lm_example_loss, weights=masked_lm_weights)
next_sentence_log_probs = tf.reshape(
next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
next_sentence_predictions = tf.argmax(
next_sentence_log_probs, axis=-1, output_type=tf.int32)
next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
next_sentence_accuracy = tf.metrics.accuracy(
labels=next_sentence_labels, predictions=next_sentence_predictions)
next_sentence_mean_loss = tf.metrics.mean(
values=next_sentence_example_loss)
return {
"masked_lm_accuracy": masked_lm_accuracy,
"masked_lm_loss": masked_lm_mean_loss,
"next_sentence_accuracy": next_sentence_accuracy,
"next_sentence_loss": next_sentence_mean_loss,
}
eval_metric_ops = metric_fn(
masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
masked_lm_weights, next_sentence_example_loss,
next_sentence_log_probs, next_sentence_labels
)
output_spec = tf.estimator.EstimatorSpec(
mode=mode,
loss=total_loss,
eval_metric_ops=eval_metric_ops)
else:
raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
return output_spec
return model_fn
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
label_ids, label_weights):
"""Get loss and log probs for the masked LM."""
input_tensor = gather_indexes(input_tensor, positions)
with tf.variable_scope("cls/predictions"):
# We apply one more non-linear transformation before the output layer.
# This matrix is not used after pre-training.
with tf.variable_scope("transform", custom_getter=get_custom_getter(compute_type=tf.float16 if FLAGS.use_fp16_cls else tf.float32)):
if FLAGS.use_fp16_cls:
input_tensor = tf.cast(input_tensor, tf.float16)
input_tensor = tf.layers.dense(
input_tensor,
units=bert_config.hidden_size,
activation=modeling.get_activation(bert_config.hidden_act),
kernel_initializer=modeling.create_initializer(
bert_config.initializer_range))
input_tensor = tf.cast(input_tensor, tf.float32)
input_tensor = modeling.layer_norm(input_tensor)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
output_bias = tf.get_variable(
"output_bias",
shape=[bert_config.vocab_size],
initializer=tf.zeros_initializer())
if FLAGS.use_fp16_cls:
input_tensor = tf.cast(input_tensor, tf.float16)
logits = tf.matmul(input_tensor, tf.cast(output_weights, tf.float16), transpose_b=True)
logits = tf.cast(logits, tf.float32)
else:
logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
label_ids = tf.reshape(label_ids, [-1])
label_weights = tf.reshape(label_weights, [-1])
one_hot_labels = tf.one_hot(
label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
# The `positions` tensor might be zero-padded (if the sequence is too
# short to have the maximum number of predictions). The `label_weights`
# tensor has a value of 1.0 for every real prediction and 0.0 for the
# padding predictions.
per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
numerator = tf.reduce_sum(label_weights * per_example_loss)
denominator = tf.reduce_sum(label_weights) + 1e-5
loss = numerator / denominator
return (loss, per_example_loss, log_probs)
def get_next_sentence_output(bert_config, input_tensor, labels):
"""Get loss and log probs for the next sentence prediction."""
# Simple binary classification. Note that 0 is "next sentence" and 1 is
# "random sentence". This weight matrix is not used after pre-training.
with tf.variable_scope("cls/seq_relationship"):
output_weights = tf.get_variable(
"output_weights",
shape=[2, bert_config.hidden_size],
initializer=modeling.create_initializer(bert_config.initializer_range))
output_bias = tf.get_variable(
"output_bias", shape=[2], initializer=tf.zeros_initializer())
if FLAGS.use_fp16_cls:
input_tensor = tf.cast(input_tensor, tf.float16)
logits = tf.matmul(input_tensor, tf.cast(output_weights, tf.float16), transpose_b=True)
logits = tf.cast(logits, tf.float32)
else:
logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
labels = tf.reshape(labels, [-1])
one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
return (loss, per_example_loss, log_probs)
def gather_indexes(sequence_tensor, positions):
"""Gathers the vectors at the specific positions over a minibatch."""
sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
batch_size = sequence_shape[0]
seq_length = sequence_shape[1]
width = sequence_shape[2]
flat_offsets = tf.reshape(
tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.reshape(sequence_tensor,
[batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
def input_fn_builder(input_files,
batch_size,
max_seq_length,
max_predictions_per_seq,
is_training,
num_cpu_threads=4,
hvd=None):
"""Creates an `input_fn` closure to be passed to Estimator."""
def input_fn():
"""The actual input function."""
name_to_features = {
"input_ids":
tf.FixedLenFeature([max_seq_length], tf.int64),
"input_mask":
tf.FixedLenFeature([max_seq_length], tf.int64),
"segment_ids":
tf.FixedLenFeature([max_seq_length], tf.int64),
"masked_lm_positions":
tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
"masked_lm_ids":
tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
"masked_lm_weights":
tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
"next_sentence_labels":
tf.FixedLenFeature([1], tf.int64),
}
# For training, we want a lot of parallel reading and shuffling.
# For eval, we want no shuffling and parallel reading doesn't matter.
if is_training:
d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
if FLAGS.distributed:
#rank_size = int(os.getenv('RANK_SIZE'))
#rank_id = int(os.getenv('RANK_INDEX'))
#device_id = int(os.getenv('DEVICE_ID'))
#local_rank = rank_id * 8 + device_id
#print('RANK_SIZE=', rank_size, ' RANK_ID=', local_rank)
rank_size = int(os.getenv('RANK_SIZE'))
rank_id = int(os.getenv('RANK_ID'))
print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
d = d.shard(rank_size, rank_id)
d = d.repeat()
if not FLAGS.npu_bert_debug:
d = d.shuffle(buffer_size=len(input_files))
# `cycle_length` is the number of parallel files that get read.
if not FLAGS.npu_bert_debug:
#cycle_length = min(num_cpu_threads, len(input_files))
cycle_length = min(num_cpu_threads, int(len(input_files)/int(os.getenv('RANK_SIZE'))))
else:
cycle_length = 1
# `sloppy` mode means that the interleaving is not exact. This adds
# even more randomness to the training pipeline.
#d = d.apply(
# tf.contrib.data.parallel_interleave(
# tf.data.TFRecordDataset,
# sloppy=(not FLAGS.npu_bert_debug),
# cycle_length=cycle_length))
d = d.interleave(
tf.data.TFRecordDataset,
cycle_length=cycle_length,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
if not FLAGS.npu_bert_debug:
d = d.shuffle(buffer_size=100)
else:
d = tf.data.TFRecordDataset(input_files)
# Since we evaluate for a fixed number of steps we don't want to encounter
# out-of-range exceptions.
d = d.repeat()
# We must `drop_remainder` on training because the TPU requires fixed
# size dimensions. For eval, we assume we are evaluating on the CPU or GPU
# and we *don't* want to drop the remainder, otherwise we wont cover
# every sample.
d = d.apply(
tf.contrib.data.map_and_batch(
lambda record: _decode_record(record, name_to_features),
batch_size=batch_size,
num_parallel_batches=num_cpu_threads,
drop_remainder=True))
return d
return input_fn
def _decode_record(record, name_to_features):
"""Decodes a record to a TensorFlow example."""
example = tf.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in list(example.keys()):
t = example[name]
if t.dtype == tf.int64:
t = tf.to_int32(t)
example[name] = t
return example
def main(_):
for name, value in FLAGS.__flags.items():
print("name:", name, " ", FLAGS[name].value)
tf.logging.set_verbosity(tf.logging.INFO)
if not FLAGS.do_train and not FLAGS.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if FLAGS.use_fp16:
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
if FLAGS.horovod:
import horovod.tensorflow as hvd
hvd.init()
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
if FLAGS.npu_gather:
if FLAGS.distributed and bert_config.num_hidden_layers == 24:
#from hccl.split.api import set_split_strategy_by_idx
from hccl.split.api import set_split_strategy_by_size
#set_split_strategy_by_idx([8,72,136,200,264,328,392,397])
set_split_strategy_by_size([10,10,10,10,15,15,15,15])
if FLAGS.distributed and bert_config.num_hidden_layers == 12:
from hccl.split.api import set_split_strategy_by_idx
set_split_strategy_by_idx([8,56,104,152,200,205])
if FLAGS.distributed and bert_config.num_hidden_layers == 6:
from hccl.split.api import set_split_strategy_by_idx
set_split_strategy_by_idx([8,40,72,104,109])
tf.gfile.MakeDirs(FLAGS.output_dir)
input_files = []
for input_file_dir in FLAGS.input_files_dir.split(","):
input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
input_files.sort()
print("Input Files:", input_files)
if FLAGS.horovod and len(input_files) < hvd.size():
raise ValueError("Input Files must be sharded")
if FLAGS.use_fp16 and FLAGS.manual_fp16:
raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
config = tf.ConfigProto()
if FLAGS.horovod:
config.gpu_options.visible_device_list = str(hvd.local_rank())
if hvd.rank() == 0:
tf.logging.info("***** Configuaration *****")
for key in FLAGS.__flags.keys():
tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
tf.logging.info("**************************")
# config.gpu_options.per_process_gpu_memory_fraction = 0.7
if FLAGS.use_xla:
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
#run_config = tf.estimator.RunConfig(
run_config = NPURunConfig(
model_dir=FLAGS.output_dir,
save_summary_steps=0,
session_config=config,
save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
# This variable controls how often estimator reports examples/sec.
# Default value is every 100 steps.
# When --report_loss is True, we set to very large value to prevent
# default info reporting from estimator.
# Ideally we should set it to None, but that does not work.
log_step_count_steps=1 if FLAGS.report_loss else 100,
enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
iterations_per_loop=FLAGS.iterations_per_loop,
hcom_parallel=FLAGS.hcom_parallel)
if FLAGS.distributed:
rank_size = int(os.getenv('RANK_SIZE'))
model_fn = model_fn_builder(
bert_config=bert_config,
init_checkpoint=FLAGS.init_checkpoint,
learning_rate=FLAGS.learning_rate,
num_train_steps=FLAGS.num_train_steps,
num_warmup_steps=FLAGS.num_warmup_steps,
use_one_hot_embeddings=False,
hvd=None if not FLAGS.horovod else hvd)
training_hooks = []
"""
if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
if FLAGS.horovod and hvd.size() > 1:
training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
"""
if FLAGS.report_loss:
global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.distributed else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
#estimator = tf.estimator.Estimator(
estimator = NPUEstimator(
model_fn=model_fn,
config=run_config,
job_start_file=FLAGS.npu_bert_job_start_file)
if FLAGS.do_train:
tf.logging.info("***** Running training *****")
tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
train_input_fn = input_fn_builder(
input_files=input_files,
batch_size=FLAGS.train_batch_size,
max_seq_length=FLAGS.max_seq_length,
max_predictions_per_seq=FLAGS.max_predictions_per_seq,
is_training=True,
hvd=None if not FLAGS.horovod else hvd)
estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
tf.logging.info("***** Running evaluation *****")
tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
eval_files = []
for eval_file_dir in FLAGS.eval_files_dir.split(","):
eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))
eval_input_fn = input_fn_builder(
input_files=eval_files,
batch_size=FLAGS.eval_batch_size,
max_seq_length=FLAGS.max_seq_length,
max_predictions_per_seq=FLAGS.max_predictions_per_seq,
is_training=False,
hvd=None if not FLAGS.horovod else hvd)
eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
eval_start_time = time.time()
result = estimator.evaluate(
input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
eval_time_elapsed = time.time() - eval_start_time
eval_time_wo_overhead = eval_hooks[-1].total_time
num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
tf.logging.info("-----------------------------")
tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
eval_hooks[-1].count * FLAGS.eval_batch_size)
tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
(eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
tf.logging.info("Summary Inference Statistics on EVAL set")
tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
tf.logging.info("-----------------------------")
output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
tf.logging.info("***** Eval results *****")
for key in sorted(result.keys()):
tf.logging.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if key == 'masked_lm_accuracy':
hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
elif key == 'next_sentence_accuracy ':
hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
elif key == 'global_step':
hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
elif key == 'loss':
hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
elif key == 'masked_lm_loss':
hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
elif key == 'next_sentence_loss ':
hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
else:
pass
if __name__ == "__main__":
hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
config_info = get_model_parameter("tensorflow_config")
initinal_data = {"base_lr": 0.01, "dataset": "cn-clue/en-wiki", "optimizer": "Adam", "loss_scale": 512}
flags.mark_flag_as_required("input_files_dir")
flags.mark_flag_as_required("eval_files_dir")
flags.mark_flag_as_required("bert_config_file")
flags.mark_flag_as_required("output_dir")
flags.mark_flag_as_required("npu_bert_job_start_file")
hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
if FLAGS.use_xla and FLAGS.manual_fp16:
print('WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.')
print(' This warning message will be removed when the underlying')
print(' issues have been fixed and you are running a TF version')
print(' that has that fix.')
tf.app.run()
@@ -0,0 +1,215 @@
"""
Multiclass
from:
https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
"""
__author__ = "Guillaume Genthial"
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
def precision(labels, predictions, num_classes, pos_indices=None,
weights=None, average='micro'):
"""Multi-class precision metric for Tensorflow
Parameters
----------
labels : Tensor of tf.int32 or tf.int64
The true labels
predictions : Tensor of tf.int32 or tf.int64
The predictions, same shape as labels
num_classes : int
The number of classes
pos_indices : list of int, optional
The indices of the positive classes, default is all
weights : Tensor of tf.int32, optional
Mask, must be of compatible shape with labels
average : str, optional
'micro': counts the total number of true positives, false
positives, and false negatives for the classes in
`pos_indices` and infer the metric from it.
'macro': will compute the metric separately for each class in
`pos_indices` and average. Will not account for class
imbalance.
'weighted': will compute the metric separately for each class in
`pos_indices` and perform a weighted average by the total
number of true labels for each class.
Returns
-------
tuple of (scalar float Tensor, update_op)
"""
cm, op = _streaming_confusion_matrix(
labels, predictions, num_classes, weights)
pr, _, _ = metrics_from_confusion_matrix(
cm, pos_indices, average=average)
op, _, _ = metrics_from_confusion_matrix(
op, pos_indices, average=average)
return (pr, op)
def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
average='micro'):
"""Multi-class recall metric for Tensorflow
Parameters
----------
labels : Tensor of tf.int32 or tf.int64
The true labels
predictions : Tensor of tf.int32 or tf.int64
The predictions, same shape as labels
num_classes : int
The number of classes
pos_indices : list of int, optional
The indices of the positive classes, default is all
weights : Tensor of tf.int32, optional
Mask, must be of compatible shape with labels
average : str, optional
'micro': counts the total number of true positives, false
positives, and false negatives for the classes in
`pos_indices` and infer the metric from it.
'macro': will compute the metric separately for each class in
`pos_indices` and average. Will not account for class
imbalance.
'weighted': will compute the metric separately for each class in
`pos_indices` and perform a weighted average by the total
number of true labels for each class.
Returns
-------
tuple of (scalar float Tensor, update_op)
"""
cm, op = _streaming_confusion_matrix(
labels, predictions, num_classes, weights)
_, re, _ = metrics_from_confusion_matrix(
cm, pos_indices, average=average)
_, op, _ = metrics_from_confusion_matrix(
op, pos_indices, average=average)
return (re, op)
def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
average='micro'):
return fbeta(labels, predictions, num_classes, pos_indices, weights,
average)
def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
average='micro', beta=1):
"""Multi-class fbeta metric for Tensorflow
Parameters
----------
labels : Tensor of tf.int32 or tf.int64
The true labels
predictions : Tensor of tf.int32 or tf.int64
The predictions, same shape as labels
num_classes : int
The number of classes
pos_indices : list of int, optional
The indices of the positive classes, default is all
weights : Tensor of tf.int32, optional
Mask, must be of compatible shape with labels
average : str, optional
'micro': counts the total number of true positives, false
positives, and false negatives for the classes in
`pos_indices` and infer the metric from it.
'macro': will compute the metric separately for each class in
`pos_indices` and average. Will not account for class
imbalance.
'weighted': will compute the metric separately for each class in
`pos_indices` and perform a weighted average by the total
number of true labels for each class.
beta : int, optional
Weight of precision in harmonic mean
Returns
-------
tuple of (scalar float Tensor, update_op)
"""
cm, op = _streaming_confusion_matrix(
labels, predictions, num_classes, weights)
_, _, fbeta = metrics_from_confusion_matrix(
cm, pos_indices, average=average, beta=beta)
_, _, op = metrics_from_confusion_matrix(
op, pos_indices, average=average, beta=beta)
return (fbeta, op)
def safe_div(numerator, denominator):
"""Safe division, return 0 if denominator is 0"""
numerator, denominator = tf.to_float(numerator), tf.to_float(denominator)
zeros = tf.zeros_like(numerator, dtype=numerator.dtype)
denominator_is_zero = tf.equal(denominator, zeros)
return tf.where(denominator_is_zero, zeros, numerator / denominator)
def pr_re_fbeta(cm, pos_indices, beta=1):
"""Uses a confusion matrix to compute precision, recall and fbeta"""
num_classes = cm.shape[0]
neg_indices = [i for i in range(num_classes) if i not in pos_indices]
cm_mask = np.ones([num_classes, num_classes])
cm_mask[neg_indices, neg_indices] = 0
diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask))
cm_mask = np.ones([num_classes, num_classes])
cm_mask[:, neg_indices] = 0
tot_pred = tf.reduce_sum(cm * cm_mask)
cm_mask = np.ones([num_classes, num_classes])
cm_mask[neg_indices, :] = 0
tot_gold = tf.reduce_sum(cm * cm_mask)
pr = safe_div(diag_sum, tot_pred)
re = safe_div(diag_sum, tot_gold)
fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re)
return pr, re, fbeta
def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro',
beta=1):
"""Precision, Recall and F1 from the confusion matrix
Parameters
----------
cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes)
The streaming confusion matrix.
pos_indices : list of int, optional
The indices of the positive classes
beta : int, optional
Weight of precision in harmonic mean
average : str, optional
'micro', 'macro' or 'weighted'
"""
num_classes = cm.shape[0]
if pos_indices is None:
pos_indices = [i for i in range(num_classes)]
if average == 'micro':
return pr_re_fbeta(cm, pos_indices, beta)
elif average in {'macro', 'weighted'}:
precisions, recalls, fbetas, n_golds = [], [], [], []
for idx in pos_indices:
pr, re, fbeta = pr_re_fbeta(cm, [idx], beta)
precisions.append(pr)
recalls.append(re)
fbetas.append(fbeta)
cm_mask = np.zeros([num_classes, num_classes])
cm_mask[idx, :] = 1
n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask)))
if average == 'macro':
pr = tf.reduce_mean(precisions)
re = tf.reduce_mean(recalls)
fbeta = tf.reduce_mean(fbetas)
return pr, re, fbeta
if average == 'weighted':
n_gold = tf.reduce_sum(n_golds)
pr_sum = sum(p * n for p, n in zip(precisions, n_golds))
pr = safe_div(pr_sum, n_gold)
re_sum = sum(r * n for r, n in zip(recalls, n_golds))
re = safe_div(re_sum, n_gold)
fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds))
fbeta = safe_div(fbeta_sum, n_gold)
return pr, re, fbeta
else:
raise NotImplementedError()
@@ -0,0 +1,451 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import tensorflow as tf
import re
import os
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""
# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.
if not init_checkpoint:
return
m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
if m is None:
return
model_name = m.group(1)
lower_models = [
"uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
"multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
]
cased_models = [
"cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
"multi_cased_L-12_H-768_A-12"
]
is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = "False"
case_name = "lowercased"
opposite_flag = "True"
if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = "True"
case_name = "cased"
opposite_flag = "False"
if is_bad_config:
raise ValueError(
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
"However, `%s` seems to be a %s model, so you "
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
"how the model was pre-training. If this error is wrong, please "
"just comment out this check." % (actual_flag, init_checkpoint,
model_name, case_name, opposite_flag))
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r") as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
else:
vocab_file = pretrained_model_name
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file)
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, do_lower_case)
except FileNotFoundError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name))
tokenizer = None
return tokenizer
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
@@ -0,0 +1,62 @@
import tensorflow as tf
import time
# report latency and throughput during eval
class LogEvalRunHook(tf.train.SessionRunHook):
def __init__(self, global_batch_size, hvd_rank=-1):
self.global_batch_size = global_batch_size
self.hvd_rank = hvd_rank
self.total_time = 0.0
self.count = 0
self.skipped = 0
self.time_list = []
def before_run(self, run_context):
self.t0 = time.time()
def after_run(self, run_context, run_values):
elapsed_secs = time.time() - self.t0
self.count += 1
# Removing first 2 (arbitrary) number of startup iterations from perf evaluations
if self.count <= 2:
print("Skipping time record for ", self.count, " due to overhead")
self.skipped += 1
else:
self.time_list.append(elapsed_secs)
self.total_time += elapsed_secs
# report throughput during training
class LogTrainRunHook(tf.train.SessionRunHook):
def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000):
self.global_batch_size = global_batch_size
self.hvd_rank = hvd_rank
self.save_checkpoints_steps = save_checkpoints_steps
self.total_time = 0.0
self.count = 0 # Holds number of iterations, including skipped iterations for fp16 loss scaling
def after_create_session(self, session, coord):
self.init_global_step = session.run(tf.train.get_global_step())
def before_run(self, run_context):
self.t0 = time.time()
return tf.train.SessionRunArgs(
fetches=['step_update:0'])
def after_run(self, run_context, run_values):
elapsed_secs = time.time() - self.t0
self.global_step = run_values.results[0]
self.count += 1
# Removing first step + first two steps after every checkpoint save
if (self.global_step - self.init_global_step) % self.save_checkpoints_steps <= 1:
print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")
else:
self.total_time += elapsed_secs
def end(self, session):
num_global_steps = self.global_step - self.init_global_step
self.skipped = (num_global_steps // self.save_checkpoints_steps) * 2 + \
min(2, num_global_steps % self.save_checkpoints_steps) - 1