[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,56 @@
# Bert-Base_tensorflow训练说明
### 1. 模型训练参数配置
在train/yaml/Bert-Base.yaml中修改相应配置, 配置项含义:
```
tensorflow_config:
#layer层数有6和12两种,中文数据集用 bert_base_layer6_cn.json/bert_base_layer12_cn.json,英文数据集用 bert_base_layer6_en.json/bert_base_layer12_en.json
bert_config_file: bert_base_layer6_cn.json
#数据集句子长度为256时,max_seq_length 和 max_predictions_per_seq 分别设置为 256 和 40;句子长度为128时,分别设置为 128 和 20
max_seq_length: 128
max_predictions_per_seq: 20
# 最佳性能train_batch_size为160
train_batch_size: 160
learning_rate: 1e-4
num_warmup_steps: 100
num_train_steps: 1000
optimizer_type: adam
manual_fp16: True
use_fp16_cls: True
input_files_dir: 数据集路径
eval_files_dir: 数据集路径
npu_bert_debug: False
npu_bert_use_tdt: True
distributed: True
do_train: True
do_eval: False
num_accumulation_steps: 1
iterations_per_loop: 100
npu_bert_loss_scale: 0
save_checkpoints_steps: 1000
npu_bert_clip_by_global_norm: False
# docker 镜像名称:版本号
docker_image: c73:b021
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
mpirun_ip: 90.90.140.199:8,90.90.140.229:8
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
device_group_1p: 6
device_group_2p: 0 1
device_group_4p: 0 1 2 3
```
------
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
@@ -0,0 +1,442 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
import tokenization
import tensorflow as tf
# Command-line flags for the pre-training data generator. Values are read
# through `FLAGS` inside `main`.
flags = tf.flags
FLAGS = flags.FLAGS
# Input text, output TFRecord and vocabulary locations. They default to None;
# the `__main__` guard at the bottom of this file marks all three as required.
flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")
flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")
# Forwarded to `tokenization.FullTokenizer` in `main`.
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
# Every written example is padded to exactly `max_seq_length` ids and
# `max_predictions_per_seq` masked positions.
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
flags.DEFINE_integer("max_predictions_per_seq", 20,
                     "Maximum number of masked LM predictions per sequence.")
# Seeds the `random.Random` instance that drives all shuffling and masking.
flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
# Each document is turned into instances `dupe_factor` times.
flags.DEFINE_integer(
    "dupe_factor", 10,
    "Number of times to duplicate the input data (with different masks).")
flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
flags.DEFINE_float(
    "short_seq_prob", 0.1,
    "Probability of creating sequences which are shorter than the "
    "maximum length.")
class TrainingInstance(object):
  """One masked sentence-pair example prepared for BERT pre-training.

  Attributes:
    tokens: token strings of the full sequence (after masking).
    segment_ids: one segment id per entry of `tokens`.
    masked_lm_positions: indexes into `tokens` that were selected for masking.
    masked_lm_labels: the original tokens at `masked_lm_positions`.
    is_random_next: whether segment B was drawn from a random document.
  """

  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
               is_random_next):
    self.tokens = tokens
    self.segment_ids = segment_ids
    self.is_random_next = is_random_next
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels

  def __str__(self):
    """Return a multi-line, human-readable dump of every field."""
    printable = tokenization.printable_text
    rows = [
        "tokens: %s" % (" ".join([printable(tok) for tok in self.tokens])),
        "segment_ids: %s" % (" ".join([str(sid) for sid in self.segment_ids])),
        "is_random_next: %s" % self.is_random_next,
        "masked_lm_positions: %s" % (" ".join(
            [str(pos) for pos in self.masked_lm_positions])),
        "masked_lm_labels: %s" % (" ".join(
            [printable(label) for label in self.masked_lm_labels])),
    ]
    # Each row is newline-terminated and the dump ends with one blank line.
    return "\n".join(rows) + "\n\n"

  def __repr__(self):
    return self.__str__()
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
  """Serialize `TrainingInstance`s as `tf.train.Example`s into TFRecord files.

  Instances are written round-robin across `output_files`. Every example is
  zero-padded to `max_seq_length` ids and `max_predictions_per_seq` masked
  positions; padded masked positions carry weight 0.0.

  Args:
    instances: iterable of `TrainingInstance`.
    tokenizer: object providing `convert_tokens_to_ids(tokens)`.
    max_seq_length: exact length of the id/mask/segment features.
    max_predictions_per_seq: length the masked-LM features are padded to.
    output_files: list of TFRecord paths to create.
  """
  writers = []
  total_written = 0
  try:
    # Writers are opened inside the `try` so that a failure while opening a
    # later file, or while writing, still closes everything opened so far.
    for output_file in output_files:
      writers.append(tf.python_io.TFRecordWriter(output_file))

    writer_index = 0
    for (inst_index, instance) in enumerate(instances):
      input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
      input_mask = [1] * len(input_ids)
      segment_ids = list(instance.segment_ids)
      assert len(input_ids) <= max_seq_length

      # Zero-pad the three per-token features up to `max_seq_length`.
      seq_padding = [0] * (max_seq_length - len(input_ids))
      input_ids.extend(seq_padding)
      input_mask.extend(seq_padding)
      segment_ids.extend(seq_padding)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      masked_lm_positions = list(instance.masked_lm_positions)
      masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
      masked_lm_weights = [1.0] * len(masked_lm_ids)

      # Pad the masked-LM features; weight 0.0 marks padded predictions.
      num_pad_predictions = max_predictions_per_seq - len(masked_lm_positions)
      masked_lm_positions.extend([0] * num_pad_predictions)
      masked_lm_ids.extend([0] * num_pad_predictions)
      masked_lm_weights.extend([0.0] * num_pad_predictions)

      next_sentence_label = 1 if instance.is_random_next else 0

      features = collections.OrderedDict()
      features["input_ids"] = create_int_feature(input_ids)
      features["input_mask"] = create_int_feature(input_mask)
      features["segment_ids"] = create_int_feature(segment_ids)
      features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
      features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
      features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
      features["next_sentence_labels"] = create_int_feature(
          [next_sentence_label])

      tf_example = tf.train.Example(
          features=tf.train.Features(feature=features))

      writers[writer_index].write(tf_example.SerializeToString())
      writer_index = (writer_index + 1) % len(writers)

      total_written += 1

      # Log the first 20 examples in full for manual inspection.
      if inst_index < 20:
        tf.logging.info("*** Example ***")
        tf.logging.info("tokens: %s", " ".join(
            [tokenization.printable_text(x) for x in instance.tokens]))

        for feature_name, feature in features.items():
          values = []
          if feature.int64_list.value:
            values = feature.int64_list.value
          elif feature.float_list.value:
            values = feature.float_list.value
          tf.logging.info(
              "%s: %s", feature_name, " ".join([str(x) for x in values]))
  finally:
    for writer in writers:
      writer.close()

  tf.logging.info("Wrote %d total instances", total_written)
def create_int_feature(values):
  """Wrap an iterable of integers in an int64-list `tf.train.Feature`."""
  int_list = tf.train.Int64List(value=list(values))
  return tf.train.Feature(int64_list=int_list)
def create_float_feature(values):
  """Wrap an iterable of floats in a float-list `tf.train.Feature`."""
  float_list = tf.train.FloatList(value=list(values))
  return tf.train.Feature(float_list=float_list)
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Build shuffled `TrainingInstance`s from raw text files.

  Expected input format:
    (1) One sentence per line. Lines should be real sentences rather than
        paragraphs or arbitrary spans, because sentence boundaries drive the
        "next sentence prediction" task.
    (2) A blank line between documents, so that next-sentence pairs never
        span two documents.

  Args:
    input_files: paths of the text files to read.
    tokenizer: object providing `tokenize(text)` and a `vocab` mapping.
    max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq:
      forwarded to `create_instances_from_document`.
    dupe_factor: number of passes made over the document list.
    rng: random generator providing `shuffle` (plus whatever
      `create_instances_from_document` needs).

  Returns:
    A shuffled list of `TrainingInstance`.
  """
  documents = [[]]
  for path in input_files:
    with tf.gfile.GFile(path, "r") as reader:
      while True:
        raw_line = tokenization.convert_to_unicode(reader.readline())
        if not raw_line:
          # An empty read (no newline at all) means end of file.
          break
        text = raw_line.strip()
        if not text:
          # A blank line opens a new document.
          documents.append([])
        sentence_tokens = tokenizer.tokenize(text)
        if sentence_tokens:
          documents[-1].append(sentence_tokens)

  # Drop documents that never received a sentence.
  documents = [doc for doc in documents if doc]
  rng.shuffle(documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for _ in range(dupe_factor):
    for doc_index, _doc in enumerate(documents):
      instances.extend(
          create_instances_from_document(
              documents, doc_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  return instances
def create_instances_from_document(
    all_documents, document_index, max_seq_length, short_seq_prob,
    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
  """Creates `TrainingInstance`s for a single document.

  Sentences of the document are accumulated into chunks of roughly
  `target_seq_length` tokens. Each chunk is split into a segment A and a
  segment B; B is either the remainder of the chunk ("actual next") or text
  taken from another document ("random next"). The pair is truncated, wrapped
  in [CLS]/[SEP] markers and masked.

  Args:
    all_documents: list of documents; each document is a list of sentences and
      each sentence is a list of tokens.
    document_index: index into `all_documents` of the document to process.
    max_seq_length: hard limit on the final sequence length, including the
      three special tokens.
    short_seq_prob: probability of using a shorter target length.
    masked_lm_prob: forwarded to `create_masked_lm_predictions`.
    max_predictions_per_seq: forwarded to `create_masked_lm_predictions`.
    vocab_words: forwarded to `create_masked_lm_predictions`.
    rng: random generator providing `random()` and `randint(a, b)`.

  Returns:
    A list of `TrainingInstance` built from this document.
  """
  document = all_documents[document_index]

  # Account for [CLS], [SEP], [SEP]
  max_num_tokens = max_seq_length - 3

  # We *usually* want to fill up the entire sequence since we are padding
  # to `max_seq_length` anyways, so short sequences are generally wasted
  # computation. However, we *sometimes*
  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
  # sequences to minimize the mismatch between pre-training and fine-tuning.
  # The `target_seq_length` is just a rough target however, whereas
  # `max_seq_length` is a hard limit.
  # Note that the target length is drawn once per call, i.e. once per
  # document, not once per instance.
  target_seq_length = max_num_tokens
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)

  # We DON'T just concatenate all of the tokens from a document into a long
  # sequence and choose an arbitrary split point because this would make the
  # next sentence prediction task too easy. Instead, we split the input into
  # segments "A" and "B" based on the actual "sentences" provided by the user
  # input.
  instances = []
  current_chunk = []
  current_length = 0
  i = 0
  while i < len(document):
    segment = document[i]
    current_chunk.append(segment)
    current_length += len(segment)
    # Emit an instance once the chunk is long enough or the document ends.
    if i == len(document) - 1 or current_length >= target_seq_length:
      if current_chunk:
        # `a_end` is how many segments from `current_chunk` go into the `A`
        # (first) sentence.
        a_end = 1
        if len(current_chunk) >= 2:
          a_end = rng.randint(1, len(current_chunk) - 1)

        tokens_a = []
        for j in range(a_end):
          tokens_a.extend(current_chunk[j])

        tokens_b = []
        # Random next
        # A single-sentence chunk has nothing left for B, so it always takes
        # the random-next path.
        is_random_next = False
        if len(current_chunk) == 1 or rng.random() < 0.5:
          is_random_next = True
          target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.
          # NOTE(review): if all 10 draws return `document_index` (always the
          # case when there is only one document), B is taken from the current
          # document but still labelled as random.
          for _ in range(10):
            random_document_index = rng.randint(0, len(all_documents) - 1)
            if random_document_index != document_index:
              break

          random_document = all_documents[random_document_index]
          random_start = rng.randint(0, len(random_document) - 1)
          for j in range(random_start, len(random_document)):
            tokens_b.extend(random_document[j])
            if len(tokens_b) >= target_b_length:
              break
          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.
          num_unused_segments = len(current_chunk) - a_end
          i -= num_unused_segments
        # Actual next
        else:
          is_random_next = False
          for j in range(a_end, len(current_chunk)):
            tokens_b.extend(current_chunk[j])
        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

        assert len(tokens_a) >= 1
        assert len(tokens_b) >= 1

        # Assemble "[CLS] A [SEP] B [SEP]"; segment id 0 covers [CLS], A and
        # the first [SEP], segment id 1 covers B and the final [SEP].
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
          tokens.append(token)
          segment_ids.append(0)

        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
          tokens.append(token)
          segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        (tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
      # Start a fresh chunk for the next instance.
      current_chunk = []
      current_length = 0
    i += 1

  return instances
# A masked position together with the original token found there.
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
  """Select positions to mask and build the masked-LM targets.

  "[CLS]" and "[SEP]" are never selected. Each selected token is replaced by
  "[MASK]" 80% of the time, kept unchanged 10% of the time and replaced by a
  random vocabulary word 10% of the time.

  Args:
    tokens: input token list; it is not modified.
    masked_lm_prob: fraction of `len(tokens)` to select (at least one token).
    max_predictions_per_seq: upper bound on the number of selections.
    vocab_words: list of words to draw random replacements from.
    rng: random generator providing `shuffle`, `random` and `randint`.

  Returns:
    A tuple `(output_tokens, masked_lm_positions, masked_lm_labels)` where the
    positions are in ascending order and the labels are the original tokens.
  """
  candidates = [
      position for (position, token) in enumerate(tokens)
      if token not in ("[CLS]", "[SEP]")
  ]
  rng.shuffle(candidates)

  requested = int(round(len(tokens) * masked_lm_prob))
  num_to_predict = min(max_predictions_per_seq, max(1, requested))

  output_tokens = list(tokens)
  selected = []
  for position in candidates:
    if len(selected) >= num_to_predict:
      break

    if rng.random() < 0.8:
      # 80% of the time, replace with [MASK]
      replacement = "[MASK]"
    elif rng.random() < 0.5:
      # 10% of the time, keep original
      replacement = tokens[position]
    else:
      # 10% of the time, replace with random word
      replacement = vocab_words[rng.randint(0, len(vocab_words) - 1)]

    output_tokens[position] = replacement
    selected.append(MaskedLmInstance(index=position, label=tokens[position]))

  selected.sort(key=lambda item: item.index)
  masked_lm_positions = [item.index for item in selected]
  masked_lm_labels = [item.label for item in selected]
  return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
  """Shrink two token lists in place until they fit in `max_num_tokens`.

  One token at a time is removed from whichever list is currently longer
  (`tokens_b` on a tie). Whether the token comes off the front or the back is
  decided by `rng.random()`, which adds randomness and avoids a positional
  bias.
  """
  while len(tokens_a) + len(tokens_b) > max_num_tokens:
    longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
    assert len(longer) >= 1

    drop_from_front = rng.random() < 0.5
    if drop_from_front:
      longer.pop(0)
    else:
      del longer[-1]
def main(_):
  """Read the input text, build training instances and write TFRecords."""
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # `input_file` is a comma-separated list of glob patterns.
  input_files = [
      path
      for pattern in FLAGS.input_file.split(",")
      for path in tf.gfile.Glob(pattern)
  ]

  tf.logging.info("*** Reading from input files ***")
  for path in input_files:
    tf.logging.info(" %s", path)

  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq, random.Random(FLAGS.random_seed))

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for path in output_files:
    tf.logging.info(" %s", path)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)
if __name__ == "__main__":
  # All three paths are mandatory; fail early if one is missing.
  for required_flag in ("input_file", "output_file", "vocab_file"):
    flags.mark_flag_as_required(required_flag)
  tf.app.run()
@@ -0,0 +1,419 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import collections
import json
import re
import modeling
import tokenization
import tensorflow as tf
# Command-line flags for the feature-extraction script. Values are read
# through `FLAGS` inside `main`.
flags = tf.flags
FLAGS = flags.FLAGS
# Input text file and output file; both are marked as required in the
# `__main__` guard at the bottom of this file.
flags.DEFINE_string("input_file", None, "")
flags.DEFINE_string("output_file", None, "")
# Comma-separated list of integers; `main` parses it and uses each value to
# index the list returned by `model.get_all_encoder_layers()`.
flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")
flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
# Passed to the estimator as `predict_batch_size`.
flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
# TPU-related settings; forwarded to the TPU run config and estimator.
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
flags.DEFINE_string("master", None,
                    "If using a TPU, the address of the master.")
flags.DEFINE_integer(
    "num_tpu_cores", 8,
    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
flags.DEFINE_bool(
    "use_one_hot_embeddings", False,
    "If True, tf.one_hot will be used for embedding lookups, otherwise "
    "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
    "since it is much faster.")
class InputExample(object):
  """Raw text of one input line: an id plus one or two text segments."""

  def __init__(self, unique_id, text_a, text_b):
    # `text_b` is None for single-segment inputs.
    self.unique_id, self.text_a, self.text_b = unique_id, text_a, text_b
class InputFeatures(object):
  """Model-ready features for one example.

  Attributes:
    unique_id: id of the example the features were built from.
    tokens: token strings, including the special markers.
    input_ids: vocabulary ids of the tokens, padded.
    input_mask: 1 for real tokens, 0 for padding.
    input_type_ids: segment id per position.
  """

  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
    self.unique_id, self.tokens = unique_id, tokens
    self.input_ids, self.input_mask = input_ids, input_mask
    self.input_type_ids = input_type_ids
def input_fn_builder(features, seq_length):
  """Return an `input_fn` closure that serves `features` to TPUEstimator.

  Args:
    features: list of objects exposing `unique_id`, `input_ids`, `input_mask`
      and `input_type_ids`.
    seq_length: length of each per-token feature row.
  """
  # Split the feature objects into four parallel columns in a single pass.
  columns = ([], [], [], [])
  for feature in features:
    row = (feature.unique_id, feature.input_ids, feature.input_mask,
           feature.input_type_ids)
    for column, value in zip(columns, row):
      column.append(value)
  all_unique_ids, all_input_ids, all_input_mask, all_input_type_ids = columns

  def input_fn(params):
    """Build the batched dataset; batch size comes from `params`."""
    batch_size = params["batch_size"]
    num_examples = len(features)
    matrix_shape = [num_examples, seq_length]

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    unique_ids = tf.constant(
        all_unique_ids, shape=[num_examples], dtype=tf.int32)
    input_ids = tf.constant(
        all_input_ids, shape=matrix_shape, dtype=tf.int32)
    input_mask = tf.constant(
        all_input_mask, shape=matrix_shape, dtype=tf.int32)
    input_type_ids = tf.constant(
        all_input_type_ids, shape=matrix_shape, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices({
        "unique_ids": unique_ids,
        "input_ids": input_ids,
        "input_mask": input_mask,
        "input_type_ids": input_type_ids,
    })
    # Keep the final partial batch so that every example is predicted.
    return dataset.batch(batch_size=batch_size, drop_remainder=False)

  return input_fn
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
  """Return a `model_fn` closure for TPUEstimator that emits encoder layers.

  Args:
    bert_config: configuration object passed to `modeling.BertModel`.
    init_checkpoint: checkpoint the variables are initialized from.
    layer_indexes: indexes into the list of encoder layers to output.
    use_tpu: when True the checkpoint is restored inside a scaffold function.
    use_one_hot_embeddings: forwarded to `modeling.BertModel`.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Build the prediction graph; only PREDICT mode is accepted."""
    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    tvars = tf.trainable_variables()
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)

    scaffold_fn = None
    if not use_tpu:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    else:
      # On TPU the restore is deferred to the scaffold function.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      restored = var.name in initialized_variable_names
      init_string = ", *INIT_FROM_CKPT*" if restored else ""
      tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {"unique_id": unique_ids}
    for (slot, layer_index) in enumerate(layer_indexes):
      predictions["layer_output_%d" % slot] = all_layers[layer_index]

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)

  return model_fn
def convert_examples_to_features(examples, seq_length, tokenizer):
  """Convert `InputExample`s into padded `InputFeatures`.

  Args:
    examples: iterable of `InputExample`.
    seq_length: exact length of every produced id/mask/type list.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    A list with one `InputFeatures` per example.
  """

  def _joined(values):
    return " ".join([str(x) for x in values])

  features = []
  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None

    if tokens_b:
      # Trims `tokens_a` and `tokens_b` in place. "- 3" leaves room for
      # [CLS], [SEP], [SEP].
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    elif len(tokens_a) > seq_length - 2:
      # "- 2" leaves room for [CLS] and [SEP].
      tokens_a = tokens_a[0:(seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # "type_ids" mark whether a position belongs to the first or the second
    # sequence. The embeddings for `type=0` and `type=1` were learned during
    # pre-training and are added to the wordpiece and position embeddings.
    # This is not *strictly* necessary since [SEP] already separates the
    # sequences, but it makes the concept of sequences easier to learn.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"]
    tokens.extend(tokens_a)
    tokens.append("[SEP]")
    input_type_ids = [0] * len(tokens)

    if tokens_b:
      tokens.extend(tokens_b)
      tokens.append("[SEP]")
      input_type_ids.extend([1] * (len(tokens_b) + 1))

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (seq_length - len(input_ids))
    input_ids.extend(padding)
    input_mask.extend(padding)
    input_type_ids.extend(padding)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    # Log the first five examples in full for manual inspection.
    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in tokens]))
      tf.logging.info("input_ids: %s" % _joined(input_ids))
      tf.logging.info("input_mask: %s" % _joined(input_mask))
      tf.logging.info("input_type_ids: %s" % _joined(input_type_ids))

    features.append(
        InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))
  return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
  """Read one `InputExample` per line of `input_file`.

  A line of the form "A ||| B" yields a two-segment example; any other line
  becomes a single-segment example with `text_b=None`. Ids are assigned
  sequentially starting at 0.
  """
  pair_pattern = re.compile(r"^(.*) \|\|\| (.*)$")
  examples = []
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      raw_line = tokenization.convert_to_unicode(reader.readline())
      if not raw_line:
        # An empty read means end of file.
        break
      stripped = raw_line.strip()
      found = pair_pattern.match(stripped)
      if found is None:
        text_a, text_b = stripped, None
      else:
        text_a, text_b = found.group(1), found.group(2)
      # The next id always equals the number of examples read so far.
      examples.append(
          InputExample(unique_id=len(examples), text_a=text_a, text_b=text_b))
  return examples
def main(_):
  """Run BERT over the input file and write per-token layer outputs as JSON.

  One JSON object is written per input example, one per line.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_config = tf.contrib.tpu.TPUConfig(
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=is_per_host)
  run_config = tf.contrib.tpu.RunConfig(
      master=FLAGS.master, tpu_config=tpu_config)

  examples = read_examples(FLAGS.input_file)

  features = convert_examples_to_features(
      examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)

  # Predictions carry only the id, so keep a lookup back to the tokens.
  unique_id_to_feature = {}
  for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      layer_indexes=layer_indexes,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=FLAGS.batch_size)

  input_fn = input_fn_builder(
      features=features, seq_length=FLAGS.max_seq_length)

  with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
                                               "w")) as writer:
    for result in estimator.predict(input_fn, yield_single_examples=True):
      unique_id = int(result["unique_id"])
      feature = unique_id_to_feature[unique_id]

      token_records = []
      for (i, token) in enumerate(feature.tokens):
        per_layer = []
        for (j, layer_index) in enumerate(layer_indexes):
          layer_output = result["layer_output_%d" % j]
          # Row `i` of the layer output, rounded to 6 decimals.
          values = [round(float(x), 6) for x in layer_output[i:(i + 1)].flat]
          per_layer.append(collections.OrderedDict(
              [("index", layer_index), ("values", values)]))
        token_records.append(collections.OrderedDict(
            [("token", token), ("layers", per_layer)]))

      output_json = collections.OrderedDict(
          [("linex_index", unique_id), ("features", token_records)])
      writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
  # Every one of these flags is mandatory; fail early if one is missing.
  for required_flag in ("input_file", "vocab_file", "bert_config_file",
                        "init_checkpoint", "output_file"):
    flags.mark_flag_as_required(required_flag)
  tf.app.run()
@@ -0,0 +1,35 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
  """Custom variable getter that stores trainable variables in float32.

  Trainable variables are created through `getter` with dtype `tf.float32`
  and, when the requested `dtype` differs, cast to `dtype` before being
  returned. Non-trainable variables are created with the requested `dtype`
  and returned unchanged.
  """
  if trainable:
    storage_dtype = tf.float32
  else:
    storage_dtype = dtype

  stored = getter(name, shape, dtype=storage_dtype,
                  initializer=initializer, regularizer=regularizer,
                  trainable=trainable,
                  *args, **kwargs)

  needs_cast = trainable and dtype != tf.float32
  return tf.cast(stored, dtype) if needs_cast else stored
@@ -0,0 +1,141 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import copy
import json
import math
import re
import six
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.python.ops import init_ops
import numpy
from tensorflow.python.ops import array_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import nn
def fused_layer_norm(inputs,
                     center=True,
                     scale=True,
                     activation_fn=None,
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     trainable=True,
                     begin_norm_axis=1,
                     begin_params_axis=-1,
                     scope=None,
                     use_fused_batch_norm=False):
  """Layer normalization with an optional `fused_batch_norm`-based path.

  Args:
    inputs: value convertible to a tensor; its rank must be statically known.
    center: if True, create a zeros-initialized `beta` and add it.
    scale: if True, create a ones-initialized `gamma` and multiply by it.
    activation_fn: optional callable applied to the normalized output.
    reuse: forwarded to `tf.variable_scope`.
    variables_collections: forwarded to `utils.get_variable_collections` for
      `beta` and `gamma`.
    outputs_collections: forwarded to `utils.collect_named_outputs`.
    trainable: forwarded to `variables.model_variable` for `beta`/`gamma`.
    begin_norm_axis: first axis that is normalized over; a negative value is
      offset by the input rank.
    begin_params_axis: `beta`/`gamma` take the shape
      `inputs.shape[begin_params_axis:]`, which must be fully defined.
    scope: forwarded to `tf.variable_scope` (default name 'LayerNorm').
    use_fused_batch_norm: if True, normalize through `nn.fused_batch_norm` on
      a reshaped input; otherwise use `nn.moments` + `nn.batch_normalization`.

  Returns:
    The result of `utils.collect_named_outputs` for the normalized tensor.

  Raises:
    ValueError: if the input rank is undefined, if an axis argument is not
      smaller than the rank, or if the parameter shape is not fully defined.
  """
  with tf.variable_scope(
      scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
    inputs = ops.convert_to_tensor(inputs)
    inputs_shape = inputs.shape
    inputs_rank = inputs_shape.ndims
    if inputs_rank is None:
      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    dtype = inputs.dtype.base_dtype
    # Only `begin_norm_axis` is converted to a non-negative index; a negative
    # `begin_params_axis` is used as-is in the slice below.
    if begin_norm_axis < 0:
      begin_norm_axis = inputs_rank + begin_norm_axis
    if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                       'must be < rank(inputs) (%d)' %
                       (begin_params_axis, begin_norm_axis, inputs_rank))
    params_shape = inputs_shape[begin_params_axis:]
    if not params_shape.is_fully_defined():
      raise ValueError(
          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
          (inputs.name, begin_params_axis, inputs_shape))
    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    if center:
      beta_collections = utils.get_variable_collections(variables_collections,
                                                        'beta')
      beta = variables.model_variable(
          'beta',
          shape=params_shape,
          dtype=dtype,
          initializer=init_ops.zeros_initializer(),
          collections=beta_collections,
          trainable=trainable)
    if scale:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma = variables.model_variable(
          'gamma',
          shape=params_shape,
          dtype=dtype,
          initializer=init_ops.ones_initializer(),
          collections=gamma_collections,
          trainable=trainable)
    if use_fused_batch_norm:
      # get static TensorShape if fully defined,
      # otherwise retrieve shape tensor
      norm_shape = inputs.shape[begin_norm_axis:]
      if norm_shape.is_fully_defined():
        bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
      else:
        norm_shape = tf.shape(inputs)[begin_norm_axis:]
        bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
      # Remember the original shape so the output can be reshaped back.
      if inputs.get_shape().is_fully_defined():
        outputs_shape = inputs.get_shape()
      else:
        outputs_shape = tf.shape(inputs)
      # All normalized dimensions are flattened into the last axis and all
      # leading dimensions into axis 1, which is the channel axis under the
      # "NCHW" format used below.
      inputs = array_ops.reshape(inputs, bn_shape)
      # Identity scale/offset for the batch-norm call; gamma/beta are applied
      # afterwards. These are always float32, independent of the input dtype.
      if inputs.get_shape().is_fully_defined():
        # static inputs TensorShape fully defined after reshape.
        ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32)
        zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32)
      else:
        # static inputs TensorShape NOT fully defined after reshape.
        # must use dynamic shape, which means these input tensors
        # have to be created at runtime, which causes a slowdown.
        scale_shape = tf.shape(inputs)[1]
        ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
        zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
      outputs, mean, variance = nn.fused_batch_norm(
          inputs,
          ones, zeros,
          epsilon=1e-4,
          data_format="NCHW")
      outputs = array_ops.reshape(outputs, outputs_shape)
      # Apply the learned scale and offset that exist.
      if center and scale:
        outputs = outputs * gamma + beta
      elif center:
        outputs = outputs + beta
      elif scale:
        outputs = outputs * gamma
    else:
      # Calculate the moments on the last axis (layer activations).
      norm_axes = list(range(begin_norm_axis, inputs_rank))
      mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
      # Compute layer normalization using the batch_normalization function.
      # The same epsilon (1e-4) is used as in the fused path above.
      variance_epsilon = 1e-4
      outputs = nn.batch_normalization(
          inputs,
          mean,
          variance,
          offset=beta,
          scale=gamma,
          variance_epsilon=variance_epsilon)
      # NOTE(review): assumed to belong to this non-fused branch - confirm
      # against the original file layout.
      outputs.set_shape(inputs_shape)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
@@ -0,0 +1,36 @@
# coding=utf-8
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Variable getter that keeps trainable variables in float32 storage.

    Trainable variables are created through `getter` with a float32 dtype and
    the returned value is cast to the requested `dtype` when that differs from
    float32. Non-trainable variables are created with the requested `dtype`
    and returned as-is.
    """
    if not trainable:
        # Nothing to up-cast: create the variable exactly as requested.
        return getter(name, shape, *args,
                      dtype=dtype,
                      initializer=initializer,
                      regularizer=regularizer,
                      trainable=trainable,
                      **kwargs)

    # Master copy lives in float32 regardless of the compute precision.
    master = getter(name, shape, *args,
                    dtype=tf.float32,
                    initializer=initializer,
                    regularizer=regularizer,
                    trainable=trainable,
                    **kwargs)
    if dtype != tf.float32:
        return tf.cast(master, dtype)
    return master
def get_custom_getter(compute_type):
    """Pick the variable custom getter for the given compute precision.

    Returns `float32_variable_storage_getter` when `compute_type` equals
    `tf.float16`, and `None` (no custom getter) for any other value.
    """
    if compute_type == tf.float16:
        return float32_variable_storage_getter
    return None
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,439 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
from npu_bridge.estimator.npu import npu_loss_scale_optimizer as lso
from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False, num_accumulation_steps=1,
                     optimizer_type="adam", allreduce_post_accumulation=False):
    """Creates an optimizer training op.

    Builds a polynomial-decay learning-rate schedule with optional linear
    warmup, instantiates either `LAMBOptimizer` or `AdamWeightDecayOptimizer`,
    wraps it in `NPUDistributedOptimizer` (plus Horovod and the NPU loss-scale
    optimizer when requested), clips the gradients and returns the op that
    applies them.

    Besides its arguments the function reads these command-line flags:
    `npu_bert_loss_scale`, `init_loss_scale_value`, `distributed` and
    `npu_bert_clip_by_global_norm`.

    Args:
      loss: Loss tensor to differentiate.
      init_lr: Initial (peak) learning rate, a Python number.
      num_train_steps: Number of steps over which the rate decays to 0.
      num_warmup_steps: Number of linear-warmup steps; a falsy value disables
        warmup.
      hvd: Optional Horovod module handle. When `None` no Horovod call is made.
      manual_fp16: Whether manual fp16 casting is in use.
      use_fp16: Whether automatic mixed precision is in use.
      num_accumulation_steps: Gradients are accumulated over this many calls
        before being applied; 1 disables accumulation.
      optimizer_type: `"lamb"` selects LAMB, anything else selects Adam with
        weight decay.
      allreduce_post_accumulation: With Horovod, all-reduce the accumulated
        gradients once per update instead of every step.

    Returns:
      The training op.

    Raises:
      ValueError: If `npu_bert_loss_scale` is neither `None`, -1, 0 nor >= 1.
    """
    npu_flags = tf.flags.FLAGS
    global_step = tf.train.get_or_create_global_step()

    def _hvd_compression():
        # `Compression` is reached through the Horovod module handle; the bare
        # name is never imported by this file.
        if use_fp16 or manual_fp16:
            return hvd.Compression.fp16
        return hvd.Compression.none

    # avoid step change in learning rate at end of warmup phase
    if optimizer_type == "adam":
        power = 1.0
        decayed_learning_rate_at_crossover_point = init_lr * (
            (1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
    else:
        power = 0.5
        decayed_learning_rate_at_crossover_point = init_lr

    # `adjusted_init_lr` is only reported; the schedule below starts from
    # `init_lr`.
    adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
    print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, adjusted_init_lr))

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=power,
        cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = (
            (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

    if optimizer_type == "lamb":
        print("Initializing LAMB Optimizer")
        optimizer = LAMBOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        print("Initializing ADAM Weight Decay Optimizer")
        # It is recommended that you use this optimizer for fine tuning, since this
        # is how the model was trained (note that the Adam m/v variables are NOT
        # loaded from init_checkpoint.)
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-4,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
        optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=_hvd_compression())

    optimizer = NPUDistributedOptimizer(optimizer)

    # Loss scaling: -1/None disables it, 0 selects dynamic scaling, >= 1 is a
    # fixed scale.
    loss_scale_enabled = npu_flags.npu_bert_loss_scale not in [None, -1]
    if loss_scale_enabled:
        if npu_flags.npu_bert_loss_scale == 0:
            loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(init_loss_scale=npu_flags.init_loss_scale_value, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
        elif npu_flags.npu_bert_loss_scale >= 1:
            loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=npu_flags.npu_bert_loss_scale)
        else:
            raise ValueError("Invalid loss scale: %d" % npu_flags.npu_bert_loss_scale)
        optimizer = lso.NPULossScaleOptimizer(optimizer, loss_scale_manager, is_distributed=npu_flags.distributed)

    tvars = tf.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)

    # Finite-ness of the gradients is only checked when loss scaling is
    # combined with fp16 compute.
    check_finite = loss_scale_enabled and (manual_fp16 or use_fp16)

    if num_accumulation_steps > 1:
        local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
                                     initializer=tf.zeros_initializer)
        batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
                                       initializer=tf.ones_initializer)
        accum_vars = [tf.get_variable(
            name=tvar.name.split(":")[0] + "/accum",
            shape=tvar.shape.as_list(),
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]

        # `local_step` counts micro-steps within one accumulation window; the
        # window restarts when the previous one has just been applied.
        reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
        local_step = tf.cond(reset_step, lambda: local_step.assign(tf.ones_like(local_step)), lambda: local_step.assign_add(1))

        # The scope name must be a string literal (it used to reference an
        # undefined variable).
        # NOTE(review): the extent of this scope only affects op names; the
        # `update_step` / `step_update` ops below must stay outside it because
        # they are fetched by their unscoped names.
        with tf.name_scope("accumulate_step"):
            grads_and_vars_and_accums = [(gv[0], gv[1], accum_vars[i]) for i, gv in enumerate(grads_and_vars) if gv[0] is not None]
            grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))

            if check_finite:
                all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads])
            else:
                all_are_finite = tf.constant(True, dtype=tf.bool)
            batch_finite = tf.cond(reset_step,
                                   lambda: batch_finite.assign(tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
                                   lambda: batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))

            # This is how the model was pre-trained.
            # ensure global norm is a finite number
            # to prevent clip_by_global_norm from having a hizzy fit.
            if npu_flags.npu_bert_clip_by_global_norm:
                (clipped_grads, _) = tf.clip_by_global_norm(
                    grads, clip_norm=1.0,
                    use_norm=tf.cond(
                        all_are_finite,
                        lambda: tf.global_norm(grads),
                        lambda: tf.constant(1.0)))
            else:
                with tf.name_scope("clip_grads"):
                    # `grads` holds no None entries (filtered above), so every
                    # gradient is clipped individually.
                    clipped_grads = [tf.clip_by_norm(grad, clip_norm=1.0) for grad in grads]

            # First micro-step of a window overwrites the accumulators, later
            # ones add to them.
            accum_vars = tf.cond(reset_step,
                                 lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
                                 lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])

        def update(accum_vars):
            """Apply the accumulated gradients (all-reducing first if asked)."""
            with tf.name_scope("opt_update"):
                if allreduce_post_accumulation and hvd is not None:
                    accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var), compression=_hvd_compression()) if isinstance(accum_var, tf.IndexedSlices)
                                  else hvd.allreduce(accum_var, compression=_hvd_compression()) for accum_var in accum_vars]
                return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)

        update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool), name="update_step")
        update_op = tf.cond(update_step,
                            lambda: update(accum_vars), lambda: tf.no_op())

        # Without Horovod there is nothing to reduce over: the local flag is
        # the global one. (`hvd` defaults to None, so an unconditional
        # `hvd.allreduce` would fail here.)
        if hvd is not None:
            window_finite = tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)
        else:
            window_finite = batch_finite
        new_global_step = tf.cond(tf.math.logical_and(update_step, window_finite), lambda: global_step + 1, lambda: global_step)
        new_global_step = tf.identity(new_global_step, name='step_update')
        train_op = tf.group(update_op, [global_step.assign(new_global_step)])
    else:
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
        grads, tvars = list(zip(*grads_and_vars))

        # This is how the model was pre-trained.
        # ensure global norm is a finite number
        # to prevent clip_by_global_norm from having a hizzy fit.
        if npu_flags.npu_bert_clip_by_global_norm:
            if check_finite:
                all_are_finite = tf.reduce_all(
                    [tf.reduce_all(tf.is_finite(g)) for g in grads])
            else:
                all_are_finite = tf.constant(True, dtype=tf.bool)
            (clipped_grads, _) = tf.clip_by_global_norm(
                grads, clip_norm=1.0,
                use_norm=tf.cond(
                    all_are_finite,
                    lambda: tf.global_norm(grads),
                    lambda: tf.constant(1.0)))
        else:
            with tf.name_scope("clip_grads"):
                # `grads` holds no None entries (filtered above).
                clipped_grads = [tf.clip_by_norm(grad, clip_norm=1.0) for grad in grads]

        # No separate global-step update here: `global_step` is handed to
        # `apply_gradients`, which is expected to advance it.
        with tf.name_scope("apply_grads"):
            train_op = optimizer.apply_gradients(
                list(zip(clipped_grads, tvars)), global_step=global_step)
    return train_op
class AdamWeightDecayOptimizer(tf.train.Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay."""

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-4,
                 exclude_from_weight_decay=None,
                 name="AdamWeightDecayOptimizer"):
        """Constructs a AdamWeightDecayOptimizer.

        Args:
          learning_rate: Learning rate (number or tensor); it is wrapped in an
            identity op named `learning_rate`.
          weight_decay_rate: Decoupled weight-decay coefficient; a falsy value
            disables weight decay.
          beta_1: Decay rate of the first-moment estimate.
          beta_2: Decay rate of the second-moment estimate.
          epsilon: Constant added to the denominator of the update.
          exclude_from_weight_decay: Optional list of regex patterns; variables
            whose name matches any of them are not decayed.
          name: Optimizer name passed to the base class.
        """
        super(AdamWeightDecayOptimizer, self).__init__(False, name)

        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay

    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
                        manual_fp16=False):
        """See base class.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs; pairs with a
            `None` member are skipped.
          global_step: Optional step variable. When given it is incremented by
            one through an op named `step_update`; when `None` no step update
            is added.
          name: Optional name of the returned group op.
          manual_fp16: When True, non-float32 variables get a float32 shadow
            copy that carries the update.

        Returns:
          An op grouping all variable, moment and step assignments.
        """
        assignments = []
        for (grad, param) in grads_and_vars:
            with tf.name_scope("apply_one_adam"):
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)
                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
                if has_shadow:
                    # create shadow fp32 weights for fp16 variable
                    param_fp32 = tf.get_variable(
                        name=param_name + "/shadow",
                        dtype=tf.float32,
                        trainable=False,
                        initializer=tf.cast(param.initialized_value(), tf.float32))
                else:
                    param_fp32 = param

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                # Standard Adam update.
                next_m = (
                    tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
                next_v = (
                    tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                              tf.square(grad)))

                update = next_m / (tf.sqrt(next_v) + self.epsilon)

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param_fp32

                update_with_lr = self.learning_rate * update

                next_param = param_fp32 - update_with_lr

                if has_shadow:
                    # cast shadow fp32 weights to fp16 and assign to trainable
                    # variable. The assign op must be part of the returned
                    # group, otherwise it is never executed in graph mode.
                    assignments.append(
                        param.assign(tf.cast(next_param, param.dtype.base_dtype)))

                assignments.extend(
                    [param_fp32.assign(next_param),
                     m.assign(next_m),
                     v.assign(next_v)])

        # `global_step` is optional (default None); only advance it when given.
        if global_step is not None:
            new_global_step = global_step + 1
            new_global_step = tf.identity(new_global_step, name='step_update')
            assignments.append(global_step.assign(new_global_step))
        return tf.group(*assignments, name=name)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name."""
        # Strip a trailing ":<output index>" if present.
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
class LAMBOptimizer(tf.train.Optimizer):
    """A LAMB optimizer that includes "correct" L2 weight decay."""

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="LAMBOptimizer"):
        """Constructs a LAMBOptimizer.

        Args:
          learning_rate: Learning rate (number or tensor); it is wrapped in an
            identity op named `learning_rate`.
          weight_decay_rate: Decoupled weight-decay coefficient; a falsy value
            disables weight decay.
          beta_1: Decay rate of the first-moment estimate.
          beta_2: Decay rate of the second-moment estimate.
          epsilon: Constant added to the denominator of the update.
          exclude_from_weight_decay: Optional list of regex patterns; variables
            whose name matches any of them are not decayed.
          name: Optimizer name passed to the base class.
        """
        super(LAMBOptimizer, self).__init__(False, name)

        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay
        # Kept for backward compatibility; bias correction is driven by
        # `global_step` rather than by this Python-side counter.
        self.steps = 0

    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
                        manual_fp16=False):
        """See base class.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs; pairs with a
            `None` member are skipped.
          global_step: Step variable. It supplies the step count used for bias
            correction and is incremented by one through an op named
            `step_update`.
          name: Optional name of the returned group op.
          manual_fp16: When True, non-float32 variables get a float32 shadow
            copy that carries the update.

        Returns:
          An op grouping all variable, moment and step assignments.

        Raises:
          ValueError: If `global_step` is None.
        """
        if global_step is None:
            raise ValueError(
                "LAMBOptimizer.apply_gradients requires `global_step`: it is "
                "needed for the bias correction of the moment estimates.")

        # One-based training step as a tensor. A Python counter bumped inside
        # the loop below would be evaluated once per parameter while the graph
        # is built, and would stay constant during training.
        steps = tf.cast(global_step, tf.float32) + 1.0
        beta1_correction = (1 - self.beta_1 ** steps)
        beta2_correction = (1 - self.beta_2 ** steps)

        assignments = []
        for (grad, param) in grads_and_vars:
            with tf.name_scope("apply_one_lamb"):
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)
                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
                if has_shadow:
                    # create shadow fp32 weights for fp16 variable
                    param_fp32 = tf.get_variable(
                        name=param_name + "/shadow",
                        dtype=tf.float32,
                        trainable=False,
                        initializer=tf.cast(param.initialized_value(), tf.float32))
                else:
                    param_fp32 = param

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                # LAMB update
                next_m = (
                    tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
                next_v = (
                    tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                              tf.square(grad)))

                next_m_unbiased = next_m / beta1_correction
                next_v_unbiased = next_v / beta2_correction

                update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param_fp32

                # Layer-wise trust ratio ||w|| / ||update||, falling back to 1
                # when either norm is zero. The float32 copy of the weights is
                # used so both norms share a dtype.
                w_norm = linalg_ops.norm(param_fp32, ord=2)
                g_norm = linalg_ops.norm(update, ord=2)
                ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
                    math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)

                update_with_lr = ratio * self.learning_rate * update

                next_param = param_fp32 - update_with_lr

                if has_shadow:
                    # cast shadow fp32 weights to fp16 and assign to trainable
                    # variable. The assign op must be part of the returned
                    # group, otherwise it is never executed in graph mode.
                    assignments.append(
                        param.assign(tf.cast(next_param, param.dtype.base_dtype)))

                assignments.extend(
                    [param_fp32.assign(next_param),
                     m.assign(next_m),
                     v.assign(next_v)])

        new_global_step = global_step + 1
        new_global_step = tf.identity(new_global_step, name='step_update')
        assignments.append(global_step.assign(new_global_step))
        return tf.group(*assignments, name=name)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name."""
        # Strip a trailing ":<output index>" if present.
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
@@ -0,0 +1,784 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run masked LM/next sentence masked_lm pre-training for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import modeling
import optimization
import tensorflow as tf
import glob
from utils import LogEvalRunHook
from tensorflow.core.protobuf import rewriter_config_pb2
from gpu_environment import get_custom_getter
from npu_bridge.estimator.npu.npu_config import *
from npu_bridge.estimator.npu.npu_estimator import *
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
# `sys` is imported explicitly here instead of relying on the npu_bridge
# star-imports above to provide the name.
import sys

# Make the shared benchmark logging helpers importable.
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
# import hwlog
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter

# Runtime switches exported through the environment at import time.
os.environ['WHICH_OP'] = 'GEOP'
os.environ['NEW_GE_FE_ID'] = '1'
os.environ['GE_AICPU_FLAG'] = '1'
os.environ['GE_USE_STATIC_MEMORY'] = '1'
os.environ['OPTION_EXEC_HCCL_FLAG'] = '1'
os.environ['HCCL_CONNECT_TIMEOUT'] = '600'

flags = tf.flags
FLAGS = flags.FLAGS
## Required parameters
flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")
flags.DEFINE_string(
    "input_files_dir", "./data",
    "Directory with input files, comma separated or single directory.")
flags.DEFINE_string(
    "eval_files_dir", None,
    "Directory with eval files, comma separated or single directory. ")
flags.DEFINE_string(
    "output_dir", "./models",
    "The output directory where the model checkpoints will be written.")

## Other parameters
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string(
    "optimizer_type", "lamb",
    "Optimizer used for training - LAMB or ADAM")
flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded. Must match data generation.")
flags.DEFINE_integer(
    "max_predictions_per_seq", 20,
    "Maximum number of masked LM predictions per sequence. "
    "Must match data generation.")
flags.DEFINE_bool("do_train", True, "Whether to run training.")
flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_integer("train_batch_size", 64, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_float("learning_rate", 1e-4, "The initial learning rate for Adam.")
flags.DEFINE_integer("num_train_steps", 1000000, "Number of training steps.")
flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
flags.DEFINE_integer("save_checkpoints_steps", 10000,
                     "How often to save the model checkpoint.")
flags.DEFINE_integer("display_loss_steps", 10,
                     "How often to print loss")
flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")
flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
# The two help fragments are concatenated, so the first one needs its trailing
# space.
flags.DEFINE_integer("num_accumulation_steps", 1,
                     "Number of accumulation steps before gradient update. "
                     "Global batch size = num_accumulation_steps * train_batch_size")
flags.DEFINE_bool("allreduce_post_accumulation", False, "Whether to all reduce after accumulation of N steps or after each step")
flags.DEFINE_bool(
    "verbose_logging", False,
    "If true, all of the trainable parameters are printed")
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic on GPU. "
                  "Manual casting is done instead of using AMP")
flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.")
flags.DEFINE_bool("distributed", True, "Whether to use multi-npu")

# NPU-specific switches.
flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')
flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')
flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')
flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")
flags.DEFINE_integer("npu_bert_loss_scale", 0, "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")
flags.DEFINE_bool("npu_bert_clip_by_global_norm", False, "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")
# Help text previously duplicated the gelu description.
flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined dropout op')
flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')
flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')
flags.DEFINE_integer('init_loss_scale_value', 2**32, 'Initial loss scale value for loss scale optimizer')
# report samples/sec, total loss and learning rate during training
class _LogSessionRunHook(tf.train.SessionRunHook):
    """Session hook that prints throughput, losses and learning rate.

    The tensors to report are fetched by name (`global_step:0`,
    `total_loss:0`, `learning_rate:0`, `nsp_loss:0`, `mlm_loss:0` and,
    depending on the configuration, `update_step:0`, `loss_scale:0` and
    `apply_grads/All:0`).
    """

    def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
        """Store the reporting configuration.

        Args:
          global_batch_size: Batch size used for the throughput figure.
          num_accumulation_steps: Gradient accumulation steps; values > 1 make
            the hook fetch `update_step:0` as well.
          display_every: Stored on the instance; not consulted by this hook.
          hvd_rank: Rank to print; a negative value omits the rank prefix.
        """
        self.global_batch_size = global_batch_size
        self.display_every = display_every
        self.hvd_rank = hvd_rank
        self.num_accumulation_steps = num_accumulation_steps

    @staticmethod
    def _dynamic_loss_scale():
        """True when dynamic loss scaling is on together with fp16 compute."""
        return (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16)

    def after_create_session(self, session, coord):
        """Reset the timing and loss accumulators."""
        self.elapsed_secs = 0.
        self.count = 0
        self.all_count = 0
        self.avg_loss = 0.0

    def before_run(self, run_context):
        """Start the timer and request the tensors to report."""
        self.t0 = time.time()
        accumulating = self.num_accumulation_steps > 1

        fetches = ['global_step:0']
        if accumulating:
            fetches.append('update_step:0')
        fetches.extend(['total_loss:0', 'learning_rate:0',
                        'nsp_loss:0', 'mlm_loss:0'])
        if self._dynamic_loss_scale():
            fetches.append('loss_scale:0')
            if not accumulating:
                # The finite-gradients flag is only fetched without
                # accumulation.
                fetches.append('apply_grads/All:0')
        return tf.train.SessionRunArgs(fetches=fetches)

    def after_run(self, run_context, run_values):
        """Accumulate statistics and print one report line per update step."""
        self.elapsed_secs += time.time() - self.t0

        # Unpack in the same order `before_run` requested the fetches.
        results = list(run_values.results)
        global_step = results.pop(0)
        if self.num_accumulation_steps <= 1:
            update_step = True
        else:
            update_step = results.pop(0)
        total_loss, lr, nsp_loss, mlm_loss = results[:4]
        extras = results[4:]
        loss_scaler = extras[0] if extras else None
        # Absent when accumulating; the report then omits the isFinite field
        # instead of referencing a value that was never fetched.
        is_finite = extras[1] if len(extras) > 1 else None

        print_step = global_step + 1  # One-based index for printing.
        self.avg_loss += total_loss
        self.all_count += 1
        if update_step:
            self.count += 1
            dt = self.elapsed_secs / self.count
            sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
            avg_loss_step = self.avg_loss / self.all_count

            message = ('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
                       (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
            if self.hvd_rank >= 0:
                message = ('Rank = %2d :: ' % self.hvd_rank) + message
            if loss_scaler is not None:
                message += ' Loss scale = %6.4e' % loss_scaler
            if is_finite is not None:
                message += ' isFinite = %6i' % is_finite
            print(message, flush=True)
            hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
            hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)

            # Start a fresh reporting window.
            self.elapsed_secs = 0.
            self.count = 0
            self.avg_loss = 0.0
            self.all_count = 0
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps,
                     use_one_hot_embeddings, hvd=None):
    """Build the `model_fn` closure handed to the Estimator.

    The returned function creates the BERT model, the masked-LM and
    next-sentence losses, and an `EstimatorSpec` for TRAIN or EVAL mode.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Estimator `model_fn`; only TRAIN and EVAL modes are supported."""
        tf.logging.info("*** Features ***")
        for key in sorted(features.keys()):
            tf.logging.info(" name = %s, shape = %s" % (key, features[key].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        training = (mode == tf.estimator.ModeKeys.TRAIN)
        if FLAGS.manual_fp16:
            compute_type = tf.float16
        else:
            compute_type = tf.float32

        model = modeling.BertModel(
            config=bert_config,
            is_training=training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            compute_type=compute_type)

        # Masked-LM head, then next-sentence head.
        mlm_outputs = get_masked_lm_output(
            bert_config, model.get_sequence_output(), model.get_embedding_table(),
            masked_lm_positions, masked_lm_ids,
            masked_lm_weights)
        masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = mlm_outputs

        nsp_outputs = get_next_sentence_output(
            bert_config, model.get_pooled_output(), next_sentence_labels)
        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = nsp_outputs

        # Named identities so the losses can be fetched by tensor name.
        masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
        next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
        total_loss = tf.identity(masked_lm_loss + next_sentence_loss,
                                 name='total_loss')

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        restore_here = hvd is None or hvd.rank() == 0
        if init_checkpoint and restore_here:
            print("Loading checkpoint", init_checkpoint)
            assignment_map, initialized_variable_names = (
                modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint))
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if FLAGS.verbose_logging:
            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                else:
                    init_string = ""
                tf.logging.info(" %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
                                init_string)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)

        if mode != tf.estimator.ModeKeys.EVAL:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                      masked_lm_weights, next_sentence_example_loss,
                      next_sentence_log_probs, next_sentence_labels):
            """Build accuracy and mean-loss metrics for both heads."""
            # Masked-LM metrics, weighted by `masked_lm_weights`.
            mlm_log_probs = tf.reshape(masked_lm_log_probs,
                                       [-1, masked_lm_log_probs.shape[-1]])
            mlm_predictions = tf.argmax(
                mlm_log_probs, axis=-1, output_type=tf.int32)
            mlm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            mlm_ids = tf.reshape(masked_lm_ids, [-1])
            mlm_weights = tf.reshape(masked_lm_weights, [-1])
            mlm_accuracy = tf.metrics.accuracy(
                labels=mlm_ids,
                predictions=mlm_predictions,
                weights=mlm_weights)
            mlm_mean_loss = tf.metrics.mean(
                values=mlm_example_loss, weights=mlm_weights)

            # Next-sentence metrics.
            nsp_log_probs = tf.reshape(
                next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
            nsp_predictions = tf.argmax(
                nsp_log_probs, axis=-1, output_type=tf.int32)
            nsp_labels = tf.reshape(next_sentence_labels, [-1])
            nsp_accuracy = tf.metrics.accuracy(
                labels=nsp_labels, predictions=nsp_predictions)
            nsp_mean_loss = tf.metrics.mean(
                values=next_sentence_example_loss)

            return {
                "masked_lm_accuracy": mlm_accuracy,
                "masked_lm_loss": mlm_mean_loss,
                "next_sentence_accuracy": nsp_accuracy,
                "next_sentence_loss": nsp_mean_loss,
            }

        eval_metric_ops = metric_fn(
            masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
            masked_lm_weights, next_sentence_example_loss,
            next_sentence_log_probs, next_sentence_labels
        )
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metric_ops=eval_metric_ops)

    return model_fn
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Builds the masked-LM head and returns its loss and log-probabilities.

  Args:
    bert_config: Config object; `hidden_size`, `hidden_act`,
      `initializer_range` and `vocab_size` are read from it.
    input_tensor: Encoder output, handed to `gather_indexes` together with
      `positions`.
    output_weights: Matrix multiplied (transposed) against the transformed
      activations to produce the vocabulary logits.
    positions: Positions forwarded to `gather_indexes`.
    label_ids: Target ids; flattened and one-hot encoded over `vocab_size`.
    label_weights: Per-prediction weights; flattened before use.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)` where `loss` is the
    weighted sum of the per-example losses divided by the sum of the weights
    (plus 1e-5).
  """
  half_precision = FLAGS.use_fp16_cls
  transform_dtype = tf.float16 if half_precision else tf.float32

  gathered = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # Extra non-linear transformation ahead of the output layer. Per the
    # original note, this matrix is not used after pre-training.
    with tf.variable_scope(
        "transform",
        custom_getter=get_custom_getter(compute_type=transform_dtype)):
      if half_precision:
        gathered = tf.cast(gathered, tf.float16)
      hidden = tf.layers.dense(
          gathered,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      # Layer norm is always applied in float32.
      hidden = modeling.layer_norm(tf.cast(hidden, tf.float32))

    # Per the original note, the output weights are shared with the input
    # embeddings; only this per-token bias is specific to the output layer.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())

    if half_precision:
      # Do the projection in float16, then return to float32 for the
      # bias add and softmax.
      logits = tf.matmul(
          tf.cast(hidden, tf.float16),
          tf.cast(output_weights, tf.float16),
          transpose_b=True)
      logits = tf.cast(logits, tf.float32)
    else:
      logits = tf.matmul(
          tf.cast(hidden, tf.float32), output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    flat_label_ids = tf.reshape(label_ids, [-1])
    flat_label_weights = tf.reshape(label_weights, [-1])
    one_hot_labels = tf.one_hot(
        flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # Per the original note, `positions` may be zero-padded when a sequence
    # has fewer than the maximum number of predictions; `label_weights` is
    # 1.0 for real predictions and 0.0 for padding, so padding drops out of
    # the weighted average below.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    weighted_loss_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
    weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
    loss = weighted_loss_sum / weight_total

  return (loss, per_example_loss, log_probs)
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Builds the next-sentence-prediction head.

  Args:
    bert_config: Config object; `hidden_size` and `initializer_range` are
      read from it.
    input_tensor: Activations multiplied against the 2-row output matrix.
    labels: Class labels; flattened and one-hot encoded with depth 2.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)`; `loss` is the mean of
    `per_example_loss`.
  """
  # Two-way classification. Per the original note, 0 is "next sentence" and
  # 1 is "random sentence", and this weight matrix is not used after
  # pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    if FLAGS.use_fp16_cls:
      # Project in float16, then cast the logits back to float32.
      half_input = tf.cast(input_tensor, tf.float16)
      half_weights = tf.cast(output_weights, tf.float16)
      logits = tf.cast(
          tf.matmul(half_input, half_weights, transpose_b=True), tf.float32)
    else:
      logits = tf.matmul(
          tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)

    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    flat_labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
def gather_indexes(sequence_tensor, positions):
  """Picks the vectors at `positions` out of every sequence in the batch.

  The rank-3 `sequence_tensor` is flattened to
  `[batch_size * seq_length, width]`, and each row of `positions` is shifted
  by that row's offset into the flattened tensor before a single `tf.gather`.

  Args:
    sequence_tensor: Rank-3 tensor (rank is checked by
      `modeling.get_shape_list`).
    positions: int32 positions within each sequence; row i is offset by
      `i * seq_length`.

  Returns:
    The gathered rows of the flattened tensor, one per entry of `positions`.
  """
  shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
  batch_size, seq_length, width = shape[0], shape[1], shape[2]

  # Start index of every example inside the flattened tensor, as a column
  # vector so it is added to each row of `positions`.
  row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
  flat_offsets = tf.reshape(row_starts, [-1, 1])
  flat_positions = tf.reshape(positions + flat_offsets, [-1])

  flat_sequence_tensor = tf.reshape(
      sequence_tensor, [batch_size * seq_length, width])
  return tf.gather(flat_sequence_tensor, flat_positions)
def input_fn_builder(input_files,
                     batch_size,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4,
                     hvd=None):
  """Creates an `input_fn` closure to be passed to Estimator.

  Args:
    input_files: List of TFRecord file paths.
    batch_size: Number of examples per batch.
    max_seq_length: Fixed length of `input_ids`, `input_mask` and
      `segment_ids` in every record.
    max_predictions_per_seq: Fixed length of the `masked_lm_*` features.
    is_training: If True, files are (optionally) sharded across ranks,
      repeated, shuffled and interleaved; otherwise they are read
      sequentially and repeated.
    num_cpu_threads: Upper bound for the interleave cycle length and the
      number of parallel batches.
    hvd: Accepted for interface compatibility; not used by this function.

  Returns:
    A zero-argument `input_fn` returning a batched `tf.data.Dataset`.
  """

  def input_fn():
    """The actual input function."""
    name_to_features = {
        "input_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask":
            tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids":
            tf.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels":
            tf.FixedLenFeature([1], tf.int64),
    }

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if is_training:
      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
      if FLAGS.distributed:
        # Each rank reads a disjoint subset of the input files.
        rank_size = int(os.getenv('RANK_SIZE'))
        rank_id = int(os.getenv('RANK_ID'))
        print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
        d = d.shard(rank_size, rank_id)
      d = d.repeat()
      if not FLAGS.npu_bert_debug:
        d = d.shuffle(buffer_size=len(input_files))

      # `cycle_length` is the number of parallel files that get read.
      if not FLAGS.npu_bert_debug:
        # Bound the cycle length by the number of files per rank. RANK_SIZE
        # may be unset in a single-device run, so default it to 1, and never
        # let the result drop to 0 (fewer files than ranks), which
        # `interleave` does not accept.
        num_ranks = max(1, int(os.getenv('RANK_SIZE', '1')))
        files_per_rank = len(input_files) // num_ranks
        cycle_length = max(1, min(num_cpu_threads, files_per_rank))
      else:
        cycle_length = 1

      d = d.interleave(
          tf.data.TFRecordDataset,
          cycle_length=cycle_length,
          num_parallel_calls=tf.data.experimental.AUTOTUNE)
      if not FLAGS.npu_bert_debug:
        d = d.shuffle(buffer_size=100)
    else:
      d = tf.data.TFRecordDataset(input_files)
      # Since we evaluate for a fixed number of steps we don't want to
      # encounter out-of-range exceptions.
      d = d.repeat()

    # `drop_remainder=True` is passed for both training and eval here, so
    # every batch has exactly `batch_size` examples.
    d = d.apply(
        tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            num_parallel_batches=num_cpu_threads,
            drop_remainder=True))
    return d

  return input_fn
def _decode_record(record, name_to_features):
  """Parses one serialized example and narrows its int64 features to int32.

  Args:
    record: Serialized example passed to `tf.parse_single_example`.
    name_to_features: Feature spec passed to `tf.parse_single_example`.

  Returns:
    The parsed feature dict, with every int64 tensor replaced by its int32
    cast.
  """
  parsed = tf.parse_single_example(record, name_to_features)

  # Per the original note, tf.Example stores integers as int64 while the
  # accelerator side only supports int32, so every int64 feature is cast.
  # Iterate over a snapshot because the dict is updated inside the loop.
  for feature_name, tensor in list(parsed.items()):
    if tensor.dtype == tf.int64:
      parsed[feature_name] = tf.to_int32(tensor)

  return parsed
def main(_):
  """Script entry point: builds the NPU estimator, then trains and/or evaluates.

  Reads all settings from `FLAGS`. When `do_train` is set, trains up to
  `num_train_steps`. When `do_eval` is set (and, under Horovod, only on rank
  0), evaluates for `max_eval_steps`, logs throughput statistics, writes the
  metrics to `<output_dir>/eval_results.txt` and reports them through
  `hwlog`.

  Raises:
    ValueError: If neither `do_train` nor `do_eval` is set, if Horovod is
      enabled with fewer input files than workers, or if both `use_fp16`
      (AMP) and `manual_fp16` are enabled.
  """
  for name in FLAGS.__flags.keys():
    print("name:", name, " ", FLAGS[name].value)

  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  if FLAGS.use_fp16:
    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

  if FLAGS.horovod:
    import horovod.tensorflow as hvd
    hvd.init()

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  # Pick the HCCL gradient split strategy matching the model depth.
  # NOTE(review): the 12- and 6-layer cases are assumed to sit under
  # `npu_gather` like the 24-layer one - confirm against the original file.
  if FLAGS.npu_gather:
    if FLAGS.distributed and bert_config.num_hidden_layers == 24:
      from hccl.split.api import set_split_strategy_by_size
      set_split_strategy_by_size([10,10,10,10,15,15,15,15])
    if FLAGS.distributed and bert_config.num_hidden_layers == 12:
      from hccl.split.api import set_split_strategy_by_idx
      set_split_strategy_by_idx([8,56,104,152,200,205])
    if FLAGS.distributed and bert_config.num_hidden_layers == 6:
      from hccl.split.api import set_split_strategy_by_idx
      set_split_strategy_by_idx([8,40,72,104,109])

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_file_dir in FLAGS.input_files_dir.split(","):
    input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))

  input_files.sort()
  print("Input Files:", input_files)

  if FLAGS.horovod and len(input_files) < hvd.size():
    raise ValueError("Input Files must be sharded")
  if FLAGS.use_fp16 and FLAGS.manual_fp16:
    raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")

  config = tf.ConfigProto()
  if FLAGS.horovod:
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.rank() == 0:
      tf.logging.info("***** Configuaration *****")
      for key in FLAGS.__flags.keys():
        tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
      tf.logging.info("**************************")

  if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT

  run_config = NPURunConfig(
      model_dir=FLAGS.output_dir,
      save_summary_steps=0,
      session_config=config,
      # Under Horovod only rank 0 writes checkpoints.
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
      # Controls how often the estimator reports examples/sec: every step
      # when --report_loss is set, otherwise every 100 steps.
      log_step_count_steps=1 if FLAGS.report_loss else 100,
      enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
      iterations_per_loop=FLAGS.iterations_per_loop,
      hcom_parallel=FLAGS.hcom_parallel)

  if FLAGS.distributed:
    rank_size = int(os.getenv('RANK_SIZE'))

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd)

  training_hooks = []
  if FLAGS.report_loss:
    # `rank_size` is only defined (and only read) in distributed mode.
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.distributed else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
    training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))

  estimator = NPUEstimator(
      model_fn=model_fn,
      config=run_config,
      job_start_file=FLAGS.npu_bert_job_start_file)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        batch_size=FLAGS.train_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True,
        hvd=None if not FLAGS.horovod else hvd)

    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)

  if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

    eval_files = []
    for eval_file_dir in FLAGS.eval_files_dir.split(","):
      eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))

    eval_input_fn = input_fn_builder(
        input_files=eval_files,
        batch_size=FLAGS.eval_batch_size,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False,
        hvd=None if not FLAGS.horovod else hvd)

    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
    eval_start_time = time.time()
    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)

    eval_time_elapsed = time.time() - eval_start_time
    eval_time_wo_overhead = eval_hooks[-1].total_time

    num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size

    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

    tf.logging.info("-----------------------------")
    tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                    eval_hooks[-1].count * FLAGS.eval_batch_size)
    tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                    (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
    tf.logging.info("Summary Inference Statistics on EVAL set")
    tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
    tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
    tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    tf.logging.info("-----------------------------")

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info(" %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
        # The keys compared here must equal the metric names returned by
        # `metric_fn` exactly; a stray trailing space would make a branch
        # unreachable and silently drop that metric from the hwlog output.
        if key == 'masked_lm_accuracy':
          hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
        elif key == 'next_sentence_accuracy':
          hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
        elif key == 'global_step':
          hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
        elif key == 'loss':
          hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
        elif key == 'masked_lm_loss':
          hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
        elif key == 'next_sentence_loss':
          hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
if __name__ == "__main__":
  # hwlog writes relative to the directory that contains this script.
  hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

  cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
  config_info = get_model_parameter("tensorflow_config")
  # Fixed benchmark metadata reported through hwlog below. (The variable
  # name keeps its original spelling in case other code in this module
  # reads it.)
  initinal_data = {"base_lr": 0.01, "dataset": "cn-clue/en-wiki", "optimizer": "Adam", "loss_scale": 512}

  for required_flag in ("input_files_dir",
                        "eval_files_dir",
                        "bert_config_file",
                        "output_dir",
                        "npu_bert_job_start_file"):
    flags.mark_flag_as_required(required_flag)

  # Environment / configuration remarks, emitted in a fixed order. The hwlog
  # key constant is looked up by name right before each call.
  for hwlog_key_name, remark_value in (
      ("CPU_INFO", cpu_info),
      ("NPU_INFO", npu_info),
      ("OS_INFO", os_info),
      ("FRAMEWORK_INFO", framework_info),
      ("BENCHMARK_VERSION", benchmark_version),
      ("CONFIG_INFO", config_info),
      ("BASE_LR", initinal_data.get("base_lr")),
      ("DATASET", initinal_data.get("dataset")),
      ("OPT_NAME", initinal_data.get("optimizer")),
      ("LOSS_SCALE", initinal_data.get("loss_scale")),
  ):
    hwlog.remark_print(key=getattr(hwlog, hwlog_key_name), value=remark_value)

  if FLAGS.use_xla and FLAGS.manual_fp16:
    for warning_line in (
        'WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.',
        ' This warning message will be removed when the underlying',
        ' issues have been fixed and you are running a TF version',
        ' that has that fix.',
    ):
      print(warning_line)

  tf.app.run()
@@ -0,0 +1,215 @@
"""
Multiclass
from:
https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
"""
__author__ = "Guillaume Genthial"
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
def precision(labels, predictions, num_classes, pos_indices=None,
              weights=None, average='micro'):
    """Streaming multi-class precision.

    A streaming confusion matrix is built from the inputs; precision is then
    derived from it by `metrics_from_confusion_matrix`.

    Parameters
    ----------
    labels : Tensor of tf.int32 or tf.int64
        Ground-truth class ids.
    predictions : Tensor of tf.int32 or tf.int64
        Predicted class ids, same shape as `labels`.
    num_classes : int
        Total number of classes.
    pos_indices : list of int, optional
        Classes treated as positive; all classes when omitted.
    weights : Tensor of tf.int32, optional
        Mask, shape-compatible with `labels`.
    average : str, optional
        'micro', 'macro' or 'weighted'; forwarded unchanged to
        `metrics_from_confusion_matrix`.

    Returns
    -------
    tuple of (scalar float Tensor, update_op)
        The precision computed from the accumulated matrix, and the
        precision computed from the matrix-update op.
    """
    total_cm, update_cm = _streaming_confusion_matrix(
        labels, predictions, num_classes, weights)
    # Index 0 of the (precision, recall, fbeta) triple is precision.
    value = metrics_from_confusion_matrix(
        total_cm, pos_indices, average=average)[0]
    update_op = metrics_from_confusion_matrix(
        update_cm, pos_indices, average=average)[0]
    return (value, update_op)
def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
           average='micro'):
    """Streaming multi-class recall.

    A streaming confusion matrix is built from the inputs; recall is then
    derived from it by `metrics_from_confusion_matrix`.

    Parameters
    ----------
    labels : Tensor of tf.int32 or tf.int64
        Ground-truth class ids.
    predictions : Tensor of tf.int32 or tf.int64
        Predicted class ids, same shape as `labels`.
    num_classes : int
        Total number of classes.
    pos_indices : list of int, optional
        Classes treated as positive; all classes when omitted.
    weights : Tensor of tf.int32, optional
        Mask, shape-compatible with `labels`.
    average : str, optional
        'micro', 'macro' or 'weighted'; forwarded unchanged to
        `metrics_from_confusion_matrix`.

    Returns
    -------
    tuple of (scalar float Tensor, update_op)
        The recall computed from the accumulated matrix, and the recall
        computed from the matrix-update op.
    """
    total_cm, update_cm = _streaming_confusion_matrix(
        labels, predictions, num_classes, weights)
    # Index 1 of the (precision, recall, fbeta) triple is recall.
    value = metrics_from_confusion_matrix(
        total_cm, pos_indices, average=average)[1]
    update_op = metrics_from_confusion_matrix(
        update_cm, pos_indices, average=average)[1]
    return (value, update_op)
def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
       average='micro'):
    """Streaming multi-class F1: `fbeta` called with its default `beta`.

    All arguments are forwarded to `fbeta` unchanged; see that function for
    their meaning and for the returned `(value, update_op)` tuple.
    """
    return fbeta(
        labels,
        predictions,
        num_classes,
        pos_indices=pos_indices,
        weights=weights,
        average=average)
def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
          average='micro', beta=1):
    """Streaming multi-class F-beta score.

    A streaming confusion matrix is built from the inputs; the F-beta score
    is then derived from it by `metrics_from_confusion_matrix`.

    Parameters
    ----------
    labels : Tensor of tf.int32 or tf.int64
        Ground-truth class ids.
    predictions : Tensor of tf.int32 or tf.int64
        Predicted class ids, same shape as `labels`.
    num_classes : int
        Total number of classes.
    pos_indices : list of int, optional
        Classes treated as positive; all classes when omitted.
    weights : Tensor of tf.int32, optional
        Mask, shape-compatible with `labels`.
    average : str, optional
        'micro', 'macro' or 'weighted'; forwarded unchanged to
        `metrics_from_confusion_matrix`.
    beta : int, optional
        The beta of the F-beta score; forwarded unchanged to
        `metrics_from_confusion_matrix`.

    Returns
    -------
    tuple of (scalar float Tensor, update_op)
        The score computed from the accumulated matrix, and the score
        computed from the matrix-update op.
    """
    total_cm, update_cm = _streaming_confusion_matrix(
        labels, predictions, num_classes, weights)
    # Index 2 of the (precision, recall, fbeta) triple is the F-beta score.
    value = metrics_from_confusion_matrix(
        total_cm, pos_indices, average=average, beta=beta)[2]
    update_op = metrics_from_confusion_matrix(
        update_cm, pos_indices, average=average, beta=beta)[2]
    return (value, update_op)
def safe_div(numerator, denominator):
    """Element-wise division that yields 0 wherever the denominator is 0.

    Both arguments are converted to float first.
    """
    num = tf.to_float(numerator)
    den = tf.to_float(denominator)
    zero = tf.zeros_like(num, dtype=num.dtype)
    # The quotient is computed everywhere; `tf.where` then substitutes 0 at
    # the positions where the denominator equals 0.
    return tf.where(tf.equal(den, zero), zero, num / den)
def pr_re_fbeta(cm, pos_indices, beta=1):
    """Derives precision, recall and F-beta from a confusion matrix.

    Three 0/1 masks select the relevant cells of `cm`:
      * the diagonal cells of the positive classes,
      * every cell in a positive-class column (denominator of precision),
      * every cell in a positive-class row (denominator of recall).

    The score is `(1 + beta^2) * pr * re / (beta^2 * pr + re)`; all divisions
    go through `safe_div`, so empty denominators give 0.

    Returns a `(precision, recall, fbeta)` tuple.
    """
    num_classes = cm.shape[0]
    neg_indices = [i for i in range(num_classes) if i not in pos_indices]

    # Diagonal entries restricted to the positive classes.
    diag_mask = np.ones([num_classes, num_classes])
    diag_mask[neg_indices, neg_indices] = 0
    diag_sum = tf.reduce_sum(tf.diag_part(cm * diag_mask))

    # Everything in the positive-class columns.
    column_mask = np.ones([num_classes, num_classes])
    column_mask[:, neg_indices] = 0
    tot_pred = tf.reduce_sum(cm * column_mask)

    # Everything in the positive-class rows.
    row_mask = np.ones([num_classes, num_classes])
    row_mask[neg_indices, :] = 0
    tot_gold = tf.reduce_sum(cm * row_mask)

    pr = safe_div(diag_sum, tot_pred)
    re = safe_div(diag_sum, tot_gold)
    score = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re)

    return pr, re, score
def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro',
                                  beta=1):
    """Computes precision, recall and F-beta from a confusion matrix.

    Parameters
    ----------
    cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes)
        The streaming confusion matrix.
    pos_indices : list of int, optional
        Classes treated as positive; all classes when omitted.
    average : str, optional
        'micro' scores all positive classes jointly; 'macro' takes the plain
        mean of the per-class scores; 'weighted' averages the per-class
        scores weighted by the sum of each class's row of `cm`.
    beta : int, optional
        The beta of the F-beta score, forwarded to `pr_re_fbeta`.

    Returns
    -------
    tuple of (precision, recall, fbeta)

    Raises
    ------
    NotImplementedError
        If `average` is not one of the three supported modes.
    """
    num_classes = cm.shape[0]
    if pos_indices is None:
        pos_indices = [i for i in range(num_classes)]

    if average == 'micro':
        return pr_re_fbeta(cm, pos_indices, beta)

    if average not in {'macro', 'weighted'}:
        raise NotImplementedError()

    # Score each positive class on its own, and record the sum of its row
    # of `cm`.
    precisions, recalls, fbetas, n_golds = [], [], [], []
    for idx in pos_indices:
        class_pr, class_re, class_fb = pr_re_fbeta(cm, [idx], beta)
        precisions.append(class_pr)
        recalls.append(class_re)
        fbetas.append(class_fb)
        row_mask = np.zeros([num_classes, num_classes])
        row_mask[idx, :] = 1
        n_golds.append(tf.to_float(tf.reduce_sum(cm * row_mask)))

    if average == 'macro':
        pr = tf.reduce_mean(precisions)
        re = tf.reduce_mean(recalls)
        fb = tf.reduce_mean(fbetas)
        return pr, re, fb

    # 'weighted': average of the per-class scores weighted by the row sums.
    n_gold = tf.reduce_sum(n_golds)
    pr_sum = sum(p * n for p, n in zip(precisions, n_golds))
    pr = safe_div(pr_sum, n_gold)
    re_sum = sum(r * n for r, n in zip(recalls, n_golds))
    re = safe_div(re_sum, n_gold)
    fb_sum = sum(f * n for f, n in zip(fbetas, n_golds))
    fb = safe_div(fb_sum, n_gold)
    return pr, re, fb
@@ -0,0 +1,451 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import tensorflow as tf
import re
import os
# Every published vocabulary lives at "<prefix>/<model name>-vocab.txt".
_PRETRAINED_VOCAB_URL_TEMPLATE = (
    "https://s3.amazonaws.com/models.huggingface.co/bert/{}-vocab.txt")

# Maps a pre-trained model name to the URL of its vocabulary file.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    model_name: _PRETRAINED_VOCAB_URL_TEMPLATE.format(model_name)
    for model_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-chinese',
    )
}
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Raises if `do_lower_case` contradicts the casing implied by the checkpoint.

  The model name is taken from the directory that directly contains
  `bert_model.ckpt` in `init_checkpoint` and compared against two lists of
  known model names. Nothing is checked when `init_checkpoint` is empty,
  when the path does not have that shape, or when the name is unknown.

  Args:
    do_lower_case: The casing flag supplied by the user.
    init_checkpoint: Checkpoint path, or a falsy value for "none".

  Raises:
    ValueError: If a known lower-cased model is used with
      `do_lower_case=False`, or a known cased model with
      `do_lower_case=True`.
  """
  # The casing is supplied by the user and is not stored with the
  # checkpoint, so it can only be detected heuristically from the path.
  if not init_checkpoint:
    return

  match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
  if match is None:
    return
  model_name = match.group(1)

  lower_models = (
      "uncased_L-24_H-1024_A-16",
      "uncased_L-12_H-768_A-12",
      "multilingual_L-12_H-768_A-12",
      "chinese_L-12_H-768_A-12",
  )
  cased_models = (
      "cased_L-12_H-768_A-12",
      "cased_L-24_H-1024_A-16",
      "multi_cased_L-12_H-768_A-12",
  )

  # (flag the user passed, casing of the model, flag they should pass)
  mismatch = None
  if model_name in lower_models and not do_lower_case:
    mismatch = ("False", "lowercased", "True")
  if model_name in cased_models and do_lower_case:
    mismatch = ("True", "cased", "False")

  if mismatch is None:
    return

  actual_flag, case_name, opposite_flag = mismatch
  raise ValueError(
      "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
      "However, `%s` seems to be a %s model, so you "
      "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
      "how the model was pre-training. If this error is wrong, please "
      "just comment out this check." % (actual_flag, init_checkpoint,
                                        model_name, case_name, opposite_flag))
def convert_to_unicode(text):
  """Returns `text` as `str`, decoding `bytes` as UTF-8.

  Undecodable byte sequences are dropped ("ignore" error handling).

  Raises:
    ValueError: If `text` is neither `str` nor `bytes`.
  """
  if isinstance(text, str):
    return text
  if isinstance(text, bytes):
    return text.decode("utf-8", "ignore")
  raise ValueError("Unsupported string type: %s" % (type(text)))
def printable_text(text):
  """Returns `text` as a `str` suitable for `print` or `tf.logging`.

  `str` input is returned unchanged; `bytes` input is decoded as UTF-8 with
  undecodable sequences dropped.

  Raises:
    ValueError: If `text` is neither `str` nor `bytes`.
  """
  if isinstance(text, str):
    return text
  if isinstance(text, bytes):
    return text.decode("utf-8", "ignore")
  raise ValueError("Unsupported string type: %s" % (type(text)))
def load_vocab(vocab_file):
  """Loads a vocabulary file into an ordered token -> index dictionary.

  The file is read as UTF-8, one token per line; each line is stripped of
  surrounding whitespace and mapped to its zero-based line number. If a
  token occurs more than once, the index of its last occurrence is kept.

  Args:
    vocab_file: Path of the vocabulary file.

  Returns:
    A `collections.OrderedDict` mapping token to index, in file order.
  """
  vocab = collections.OrderedDict()
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(vocab_file, "r", encoding="utf-8") as reader:
    for index, line in enumerate(reader):
      vocab[line.strip()] = index
  return vocab
def convert_by_vocab(vocab, items):
  """Looks up every element of `items` in `vocab` and returns the results.

  Works in either direction (tokens -> ids or ids -> tokens) depending on
  the mapping passed in. A missing key raises `KeyError`.
  """
  return [vocab[item] for item in items]
def whitespace_tokenize(text):
  """Strips `text` and splits it on runs of whitespace.

  Returns an empty list when nothing but whitespace is left.
  """
  # `str.split()` with no argument already yields [] for an empty string.
  return text.strip().split()
class FullTokenizer(object):
  """End-to-end tokenizer: basic tokenization followed by WordPiece."""

  def __init__(self, vocab_file, do_lower_case=True):
    """Loads the vocabulary and builds the two tokenization stages.

    Args:
      vocab_file: Path passed to `load_vocab`.
      do_lower_case: Forwarded to `BasicTokenizer`.
    """
    self.vocab = load_vocab(vocab_file)
    # Reverse mapping, index -> token.
    self.inv_vocab = {index: token for token, index in self.vocab.items()}
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

  def tokenize(self, text):
    """Returns the WordPiece tokens of `text`, in order."""
    pieces = []
    for word in self.basic_tokenizer.tokenize(text):
      pieces.extend(self.wordpiece_tokenizer.tokenize(word))
    return pieces

  def convert_tokens_to_ids(self, tokens):
    """Maps tokens to vocabulary indices."""
    return convert_by_vocab(self.vocab, tokens)

  def convert_ids_to_tokens(self, ids):
    """Maps vocabulary indices back to tokens."""
    return convert_by_vocab(self.inv_vocab, ids)
class BertTokenizer(object):
  """End-to-end tokenizer: punctuation splitting followed by WordPiece."""

  def __init__(self, vocab_file, do_lower_case=True):
    """Loads the vocabulary and builds the two tokenization stages.

    Args:
      vocab_file: Path of an existing vocabulary file.
      do_lower_case: Forwarded to `BasicTokenizer`.

    Raises:
      ValueError: If `vocab_file` is not an existing file.
    """
    if not os.path.isfile(vocab_file):
      raise ValueError(
          "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
          "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    # Reverse mapping, index -> token, in vocabulary order.
    self.ids_to_tokens = collections.OrderedDict(
        (index, token) for token, index in self.vocab.items())
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

  def tokenize(self, text):
    """Returns the WordPiece tokens of `text`, in order."""
    pieces = []
    for word in self.basic_tokenizer.tokenize(text):
      pieces.extend(self.wordpiece_tokenizer.tokenize(word))
    return pieces

  def convert_tokens_to_ids(self, tokens):
    """Converts a sequence of tokens into ids using the vocab."""
    return [self.vocab[token] for token in tokens]

  def convert_ids_to_tokens(self, ids):
    """Converts a sequence of ids into wordpiece tokens using the vocab."""
    return [self.ids_to_tokens[index] for index in ids]

  @classmethod
  def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
    """Builds a tokenizer from a known model name or a vocabulary path/URL.

    Names found in `PRETRAINED_VOCAB_ARCHIVE_MAP` are replaced by their
    mapped URL; anything else is used as given. The location is then
    resolved through `cached_path`.

    Returns:
      The tokenizer, or None when `FileNotFoundError` is raised while
      resolving or loading the vocabulary (an error is logged in that case).
    """
    # NOTE(review): `cached_path` and `logger` are not among the imports
    # visible at the top of this file - confirm they are defined elsewhere
    # in the module, otherwise this method fails with NameError.
    vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP.get(
        pretrained_model_name, pretrained_model_name)

    # Redirect to the cache, if necessary.
    try:
      resolved_vocab_file = cached_path(vocab_file)
      if resolved_vocab_file == vocab_file:
        logger.info("loading vocabulary file {}".format(vocab_file))
      else:
        logger.info("loading vocabulary file {} from cache at {}".format(
            vocab_file, resolved_vocab_file))
      return cls(resolved_vocab_file, do_lower_case)
    except FileNotFoundError:
      logger.error(
          "Model name '{}' was not found in model name list ({}). "
          "We assumed '{}' was a path or url but couldn't find any file "
          "associated to this path or url.".format(
              pretrained_model_name,
              ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
              pretrained_model_name))
      return None
class BasicTokenizer(object):
  """Basic tokenization: cleanup, CJK spacing, lower-casing, punctuation."""

  # Inclusive code point ranges treated as "Chinese characters".
  _CJK_RANGES = (
      (0x4E00, 0x9FFF),
      (0x3400, 0x4DBF),
      (0x20000, 0x2A6DF),
      (0x2A700, 0x2B73F),
      (0x2B740, 0x2B81F),
      (0x2B820, 0x2CEAF),
      (0xF900, 0xFAFF),
      (0x2F800, 0x2FA1F),
  )

  def __init__(self, do_lower_case=True):
    """Constructs a BasicTokenizer.

    Args:
      do_lower_case: Whether to lower case the input.
    """
    self.do_lower_case = do_lower_case

  def tokenize(self, text):
    """Tokenizes a piece of text and returns the list of tokens."""
    cleaned = self._clean_text(convert_to_unicode(text))

    # CJK characters are surrounded with spaces so that each becomes its own
    # token. Per the original note, this was added for the multilingual and
    # Chinese models and is applied to every model.
    spaced = self._tokenize_chinese_chars(cleaned)

    pieces = []
    for word in whitespace_tokenize(spaced):
      if self.do_lower_case:
        # Accents are only stripped together with lower-casing.
        word = self._run_strip_accents(word.lower())
      pieces.extend(self._run_split_on_punc(word))

    return whitespace_tokenize(" ".join(pieces))

  def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    # NFD separates base characters from combining marks (category "Mn"),
    # which are then dropped.
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn")

  def _run_split_on_punc(self, text):
    """Splits `text` so that every punctuation character stands alone."""
    groups = []
    in_word = False  # True while the last group is an open run of non-punctuation.
    for ch in text:
      if _is_punctuation(ch):
        groups.append([ch])
        in_word = False
      else:
        if not in_word:
          groups.append([])
          in_word = True
        groups[-1].append(ch)
    return ["".join(group) for group in groups]

  def _tokenize_chinese_chars(self, text):
    """Adds whitespace around any CJK character."""
    return "".join(
        (" " + ch + " ") if self._is_chinese_char(ord(ch)) else ch
        for ch in text)

  def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # A "chinese character" is anything in the CJK Unicode blocks listed in
    # `_CJK_RANGES`:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Per the original note, these blocks do not cover all Japanese and
    # Korean characters: Hangul, Hiragana and Katakana live in other blocks
    # and are handled like any other space-separated script.
    return any(low <= cp <= high for low, high in self._CJK_RANGES)

  def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    kept = []
    for ch in text:
      code = ord(ch)
      # Drop NUL, the Unicode replacement character and control characters.
      if code == 0 or code == 0xfffd or _is_control(ch):
        continue
      kept.append(" " if _is_whitespace(ch) else ch)
    return "".join(kept)
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """Constructs a WordpieceTokenizer.

        Args:
            vocab: Container of known word pieces; only membership tests
                (`piece in vocab`) are performed on it.
            unk_token: Token emitted for words that cannot be split into
                known pieces or that are too long.
            max_input_chars_per_word: Words longer than this many characters
                are replaced by `unk_token` without attempting a split.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should
                have already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """
        text = convert_to_unicode(text)
        output_tokens = []
        for token in whitespace_tokenize(text):
            length = len(token)
            if length > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            sub_tokens = []
            is_bad = False
            start = 0
            while start < length:
                # Try the longest remaining substring first, then shrink it.
                for end in range(length, start, -1):
                    piece = token[start:end]
                    if start > 0:
                        # Non-initial pieces carry the continuation prefix.
                        piece = "##" + piece
                    if piece in self.vocab:
                        sub_tokens.append(piece)
                        start = end
                        break
                else:
                    # No prefix of the remainder is in the vocabulary, so the
                    # whole word is unknown.
                    is_bad = True
                    break

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
def _is_whitespace(char):
    """Checks whether `char` is a whitespace character.

    Args:
        char: A single-character string.

    Returns:
        True for space, tab, newline, carriage return and any character in
        Unicode category "Zs"; False otherwise.
    """
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"
def _is_control(char):
    """Checks whether `char` is a control character.

    Args:
        char: A single-character string.

    Returns:
        True when the Unicode category of `char` starts with "C", except for
        tab, newline and carriage return, which always yield False.
    """
    # These are technically control characters but we count them as whitespace
    # characters.
    if char in ("\t", "\n", "\r"):
        return False
    return unicodedata.category(char).startswith("C")
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character.

    Args:
        char: A single-character string.

    Returns:
        True for every non-letter/number printable ASCII character and for any
        character whose Unicode category starts with "P"; False otherwise.
    """
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if 33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126:
        return True
    return unicodedata.category(char).startswith("P")
@@ -0,0 +1,62 @@
import tensorflow as tf
import time
# report latency and throughput during eval
class LogEvalRunHook(tf.train.SessionRunHook):
    """Session hook that times every `run` call made during evaluation.

    The first two calls are counted in `skipped` and excluded from
    `time_list` and `total_time`.
    """

    def __init__(self, global_batch_size, hvd_rank=-1):
        """Stores the batch size and rank and resets all counters."""
        self.global_batch_size = global_batch_size
        self.hvd_rank = hvd_rank

        self.count = 0
        self.skipped = 0
        self.total_time = 0.0
        self.time_list = []

    def before_run(self, run_context):
        """Records the wall-clock time at which the step starts."""
        self.t0 = time.time()

    def after_run(self, run_context, run_values):
        """Accumulates the duration of the step that just finished."""
        step_seconds = time.time() - self.t0
        self.count += 1

        # Removing first 2 (arbitrary) number of startup iterations from perf evaluations
        if self.count > 2:
            self.time_list.append(step_seconds)
            self.total_time += step_seconds
            return

        print("Skipping time record for ", self.count, " due to overhead")
        self.skipped += 1
# report throughput during training
class LogTrainRunHook(tf.train.SessionRunHook):
    """Session hook that accumulates step time during training.

    Steps whose offset from the initial global step is 0 or 1 modulo
    `save_checkpoints_steps` are left out of `total_time`.
    """

    def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000):
        """Stores the configuration and resets the time and step counters."""
        self.global_batch_size = global_batch_size
        self.hvd_rank = hvd_rank
        self.save_checkpoints_steps = save_checkpoints_steps

        # Holds number of iterations, including skipped iterations for fp16 loss scaling
        self.count = 0
        self.total_time = 0.0

    def after_create_session(self, session, coord):
        """Reads the global step the session starts from."""
        self.init_global_step = session.run(tf.train.get_global_step())

    def before_run(self, run_context):
        """Starts the step timer and requests the `step_update:0` tensor."""
        self.t0 = time.time()
        return tf.train.SessionRunArgs(fetches=['step_update:0'])

    def after_run(self, run_context, run_values):
        """Adds the elapsed time of the step unless it falls in a skip window."""
        step_seconds = time.time() - self.t0
        self.global_step = run_values.results[0]
        self.count += 1

        # Removing first step + first two steps after every checkpoint save
        steps_done = self.global_step - self.init_global_step
        if steps_done % self.save_checkpoints_steps > 1:
            self.total_time += step_seconds
        else:
            print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")

    def end(self, session):
        """Computes `skipped`, the number of steps excluded from `total_time`."""
        num_global_steps = self.global_step - self.init_global_step
        full_periods = num_global_steps // self.save_checkpoints_steps
        remainder = num_global_steps % self.save_checkpoints_steps
        self.skipped = full_periods * 2 + min(2, remainder) - 1
@@ -0,0 +1,14 @@
{
"server_count": "1",
"server_list": [{
"device": [
{
"device_id": "0",
"device_ip": "192.168.10.101",
"rank_id": "0"
}],
"server_id": "127.0.0.1"
}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,49 @@
{
"server_count": "1",
"server_list": [{
"device": [
{
"device_id": "0",
"device_ip": "192.168.10.101",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.168.11.101",
"rank_id": "1"
},
{
"device_id": "2",
"device_ip": "192.168.12.101",
"rank_id": "2"
},
{
"device_id": "3",
"device_ip": "192.168.13.101",
"rank_id": "3"
},
{
"device_id": "4",
"device_ip": "192.168.10.100",
"rank_id": "4"
},
{
"device_id": "5",
"device_ip": "192.168.11.100",
"rank_id": "5"
},
{
"device_id": "6",
"device_ip": "192.168.12.100",
"rank_id": "6"
},
{
"device_id": "7",
"device_ip": "192.168.13.100",
"rank_id": "7"
}],
"server_id": "127.0.0.1"
}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,14 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 21136
}
@@ -0,0 +1,14 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
@@ -0,0 +1,14 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 21136
}
@@ -0,0 +1,14 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 6,
"type_vocab_size": 2,
"vocab_size": 30522
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,39 @@
#!/bin/bash
# Exports the library, Python and tool paths plus runtime switches used by the
# Ascend NPU training jobs. The script only exports variables, so it has to be
# sourced to have any effect (presumably this is the config/npu_set_env.sh
# sourced by scripts/train.sh — confirm).
#
# NOTE(review): $utilDir is appended to PYTHONPATH below but is never set in
# this script; it must already be defined by the sourcing shell.
# NOTE(review): LD_LIBRARY_PATH is replaced outright, whereas PYTHONPATH and
# PATH are appended to.

# Reference values for an ascend-toolkit installation (disabled):
#toolkit env
#export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
# Reference values for an NNAE installation (disabled):
#nnae env
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/
#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
#export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
# Use the NNAE paths when /usr/local/Ascend/nnae/latest exists, otherwise the
# ascend-toolkit paths. Both branches also add the mpirun4.0 lib and bin
# directories.
if [ -d /usr/local/Ascend/nnae/latest ];then
# NOTE(review): the tfplugin site-packages directory appears twice in this
# PYTHONPATH value.
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
# Runtime switches exported for every job regardless of the branch above.
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export SOC_VERSION=Ascend910
# Optional graph/model dump switches (disabled):
#export DUMP_GE_GRAPH=2
#export DUMP_GRAPH_LEVEL=3
#export PRINT_MODEL=1
export SLOG_PRINT_TO_STDOUT=0
export HCCL_CONNECT_TIMEOUT=600
# system env
#ulimit -c unlimited
@@ -0,0 +1,67 @@
#!/bin/bash
# Launches BERT-Base training: one scripts/train.sh process per device on this
# host, or a single mpirun invocation in cluster mode.
#
# Positional arguments:
#   $1  rank_size  number of devices taking part in the job
#   $2  yamlPath   path to the yaml configuration file
#   $3  toolsPath  directory containing get_params_for_yaml.sh
#   $4  CLUSTER        (read only inside a docker container) "True" for cluster mode
#   $5  MPIRUN_ALL_IP  (read only inside a docker container) whitespace-separated host IPs
rank_size=$1
yamlPath=$2
toolsPath=$3
# /.dockerenv exists only inside a docker container; the two extra arguments
# are ignored everywhere else.
if [ -f /.dockerenv ];then
CLUSTER=$4
MPIRUN_ALL_IP="$5"
export CLUSTER=${CLUSTER}
fi
# Parent directory of the directory holding this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)
# The timestamp names the result directory and is handed to every train.sh.
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
train_job_dir=${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] see more config info in ${currentDir}/config"
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train result in ${train_job_dir}"
# Load the configuration from the yaml file by evaluating the helper's output
# (presumably shell variable assignments for the keys under tensorflow_config,
# e.g. device_group_1p and mpirun_ip — confirm against get_params_for_yaml.sh).
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
# Device list: if no devices are configured for this rank_size, or rank_size is
# 8 or more, use devices 0..rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi
# Take the first character of the space-stripped device list as the first
# device id; its directory is the one whose hw_bert.log gets linked below.
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`
# user env
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=0
export DEVICE_INDEX=$DEVICE_ID
if [ x"${CLUSTER}" == x"True" ];then
# Cluster mode: link the hw log of device directory 0 into the job directory.
# ln hw log
ln -snf ${train_job_dir}/0/hw_bert.log ${train_job_dir}
# Copy the yaml file to the same path on every other host in MPIRUN_ALL_IP.
this_ip=$(hostname -I |awk '{print $1}')
for ip in $MPIRUN_ALL_IP;do
if [ x"$ip" != x"$this_ip" ];then
scp $yamlPath root@$ip:$yamlPath
fi
done
export PATH=$PATH:/usr/local/mpirun4.0/bin
# Start train.sh through mpirun on the hosts listed in mpirun_ip; the fifth
# argument "True" selects cluster mode inside train.sh.
mpirun -H ${mpirun_ip} \
--bind-to none -map-by slot\
--allow-run-as-root \
--mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
--prefix /usr/local/mpirun4.0/ \
${currentDir}/scripts/train.sh 0 $currtime $yamlPath 0 True ${toolsPath} ${rank_size}
else
# Single-host mode: link the hw log of the first device into the job directory.
# ln hw log
ln -snf ${train_job_dir}/${first_device_id}/hw_bert.log ${train_job_dir}
# Start one background train.sh per device; rank ids are assigned in list order.
rank_id=0
for device_id in ${device_group};do
#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ./main.log
${currentDir}/scripts/train.sh $device_id $currtime $yamlPath $rank_id solo ${toolsPath} ${rank_size} &
let rank_id++
done
fi
# Block until every background training process has exited.
wait
@@ -0,0 +1,157 @@
#!/bin/bash
# Runs BERT pre-training on a single device (solo mode) or as one MPI rank
# (cluster mode) and records the outcome in the job's log files.
#
# Positional arguments:
#   $1  device id
#   $2  timestamp that names the training_job_<timestamp> result directory
#   $3  path to the yaml configuration file
#   $4  rank id (ignored when $5 is "True")
#   $5  "True" for cluster mode; any other value selects solo mode
#   $6  directory containing get_params_for_yaml.sh
#   $7  rank size
# Example: train.sh 0 $currtime $yamlPath 0 True ${toolsPath} ${rank_size}
device_id=$1
currtime=$2
yamlPath=$3
toolsPath=$6
rank_size=$7
export YAML_PATH=$3

# Parent directory of the directory holding this script.
mainDir=$(cd "$(dirname "$0")/.."; pwd)
mkdir -p ${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
export train_job_dir=${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
cd ${train_job_dir}

# utilDir is appended to PYTHONPATH by npu_set_env.sh. The original script
# first exported the plain .../utils directory and immediately overwrote it
# with this value; only the effective assignment is kept.
export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils/atlasboost"; pwd)

source ${mainDir}/config/npu_set_env.sh

# Load the configuration from the yaml file by evaluating the helper's output.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")

# Name of the benchmark log file; must be "hw_" followed by the lower-case
# model name.
export REMARK_LOG_FILE=hw_bert.log
# Make the benchmark logging module importable.
benchmark_log_path=${mainDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

export JOB_ID=9999001
export RANK_TABLE_FILE=${mainDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
export DEVICE_INDEX=$DEVICE_ID
export RANK_INDEX=0
export PROFILING_OPTIONS=${PROFILING_OPTIONS}
export FP_POINT=${FP_POINT}
export BP_POINT=${BP_POINT}

# Map the "True"/other setting onto the lower-case true/false values exported
# below. Both variables are set from the same test: the original re-tested
# PROFILING_MODE against "True" after it had already been rewritten to
# lower case, so AICPU_PROFILING_MODE could never become true. The value is
# quoted so that an unset PROFILING_MODE no longer makes `[` fail.
if [ x"${PROFILING_MODE}" == x"True" ]; then
    export PROFILING_MODE=true
    export AICPU_PROFILING_MODE=true
else
    export PROFILING_MODE=false
    export AICPU_PROFILING_MODE=false
fi

if [ x"${device_id}" = x ]; then
    # The original wrote to ${exec_path}, which is never defined, and exited
    # with status 0; log into the job directory and report the failure.
    echo "turing train fail" >> ${train_job_dir}/train_${device_id}.log
    exit 1
else
    export DEVICE_ID=${device_id}
fi

# Dump the environment for debugging. The original redirected to
# ${currentDir}, which this script never defines, so the file landed in the
# filesystem root; it now goes next to the other job logs.
env > ${train_job_dir}/env_${device_id}.log

cd ${train_job_dir}

if [ x"$5" != x"True" ];then
    rank_id=$4
    export RANK_ID=$4
else
    # Cluster mode: obtain rank and device id from atlasboost. The rank is the
    # last word of the helper's output; the device id is the text between
    # "deviceid = " and " phyid=" in that output.
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    # Copy the json file(s) found in the job directory over the rank table.
    hccljson=${train_job_dir}/*.json
    cp ${hccljson} ${mainDir}/config/${rank_size}p.json
fi

# Dump the environment again; in cluster mode device_id may have changed above.
env > ${train_job_dir}/env_${device_id}.log

# Create and enter the per-device working directory.
mkdir -p ${train_job_dir}/${device_id}/ckpt${DEVICE_ID}
cd ${train_job_dir}/${device_id}

startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`

# Start training; stdout and stderr go to the per-device train log.
python3.7 ${mainDir}/code/pretrain/run_pretraining.py \
    --bert_config_file=${mainDir}/config/${bert_config_file} \
    --max_seq_length=${max_seq_length} \
    --max_predictions_per_seq=${max_predictions_per_seq} \
    --train_batch_size=${train_batch_size} \
    --learning_rate=${learning_rate} \
    --num_warmup_steps=${num_warmup_steps} \
    --num_train_steps=${num_train_steps} \
    --optimizer_type=${optimizer_type} \
    --manual_fp16=${manual_fp16} \
    --use_fp16_cls=${use_fp16_cls} \
    --input_files_dir=${input_files_dir} \
    --eval_files_dir=${eval_files_dir} \
    --npu_bert_debug=${npu_bert_debug} \
    --npu_bert_use_tdt=${npu_bert_use_tdt} \
    --do_train=${do_train} \
    --do_eval=${do_eval} \
    --num_accumulation_steps=${num_accumulation_steps} \
    --npu_bert_job_start_file=None \
    --iterations_per_loop=${iterations_per_loop} \
    --npu_bert_loss_scale=${npu_bert_loss_scale} \
    --distributed=${distributed} \
    --save_checkpoints_steps=${save_checkpoints_steps} \
    --npu_bert_clip_by_global_norm=${npu_bert_clip_by_global_norm} \
    --output_dir=${train_job_dir}/${device_id}/ckpt${DEVICE_ID} > ${train_job_dir}/train_${device_id}.log 2>&1
# $? must be read immediately after the python command above.
if [ $? -eq 0 ] ;then
    echo ":::ABK 1.0.0 bert train success"
    echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/train_${device_id}.log
    echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/${device_id}/hw_bert.log
else
    echo ":::ABK 1.0.0 bert train failed"
    echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/train_${device_id}.log
    echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/${device_id}/hw_bert.log
fi

endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
# $(( )) replaces the deprecated $[ ] arithmetic form.
sumTime=$(( endTime_s - startTime_s ))
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}"
echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_bert.log
@@ -0,0 +1,54 @@
# Bert-Large_tensorflow训练说明
### 1. 模型训练参数配置
在train/yaml/Bert-Large.yaml中修改相应配置, 配置项含义:
```
tensorflow_config:
#中文数据用 bert_config_large_cn.json 英文用bert_config_large_en.json
bert_config_file: bert_config_large_cn.json
#数据集句子长度是256时 设置为 256,40,句子长度是128时设置为128,20
max_seq_length: 128
max_predictions_per_seq: 20
# 最佳性能train_batch_size为96,如果超显存,可调小至32
train_batch_size: 96
learning_rate: 3.125e-5
num_warmup_steps: 100
num_train_steps: 1000
optimizer_type: adam
manual_fp16: True
use_fp16_cls: True
input_files_dir: /home/BertData/cn-wiki-128/
eval_files_dir: /home/BertData/cn-wiki-128/
do_train: True
do_eval: True
num_accumulation_steps: 1
iterations_per_loop: 100
npu_bert_loss_scale: 0
save_checkpoints_steps: 1000
npu_bert_clip_by_global_norm: False
# docker 镜像名称:版本号
docker_image: c73:b021
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
mpirun_ip: 90.90.140.199:8,90.90.140.229:8
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
device_group_1p: 0
device_group_2p: 0 1
device_group_4p: 0 1 2 3
```
------
@@ -0,0 +1,31 @@
# How to Contribute
BERT needs to maintain permanent compatibility with the pre-trained model files,
so we do not plan to make any major changes to this library (other than what was
promised in the README). However, we can accept small patches related to
re-factoring and documentation. To submit contributions, there are just a few
small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
@@ -0,0 +1,31 @@
# Image for the BERT TensorFlow workspace: an NVIDIA TensorFlow base image plus
# data-preparation tools and the TensorRT inference server client.
# Base image; can be overridden at build time with --build-arg FROM_IMAGE_NAME.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.08-py3
# Named stage that is only used as the source of the COPY --from=trt below.
# NOTE(review): presumably the tensorrtserver_client image has to be available
# before this build — confirm.
FROM tensorrtserver_client as trt
FROM ${FROM_IMAGE_NAME}
# System packages and Python packages installed into the final image.
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl3
RUN pip install toposort networkx pytest nltk tqdm html2text progressbar
WORKDIR /workspace
# Third-party repositories cloned into /workspace.
RUN git clone https://github.com/openai/gradient-checkpointing.git
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git
RUN git clone https://github.com/titipata/pubmed_parser
RUN pip3 install /workspace/pubmed_parser
#Copy the perf_client over
COPY --from=trt /workspace/install/ /workspace/install/
#Install the python wheel with pip
RUN pip install /workspace/install/python/tensorrtserver*.whl
# Copy the build context into /workspace/bert and put it on PYTHONPATH.
WORKDIR /workspace/bert
COPY . .
ENV PYTHONPATH /workspace/bert
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
# NOTE(review): the PATH entry below starts with a double slash; looks like a
# typo for /workspace/install/bin — confirm before changing.
ENV PATH //workspace/install/bin:${PATH}
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@@ -0,0 +1,4 @@
BERT TensorFlow
This repository includes software from https://github.com/google-research/bert
licensed under the Apache License, Version 2.0 (the "License")
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,567 @@
# BioBert For TensorFlow
This folder provides a script and recipe to train BERT for TensorFlow to achieve state-of-the-art accuracy on *biomedical text-mining* and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Pre-training](#pre-training)
* [Fine tuning](#fine-tuning)
* [Multi-node](#multi-node)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Pre-training accuracy](#pre-training-accuracy)
* [Fine-tuning accuracy](#fine-tuning-accuracy)
* [Fine-tuning accuracy for NER Chem](#fine-tuning-accuracy-for-ner-chem)
* [Training stability test](#training-stability-test)
* [Fine-tuning stability test](#fine-tuning-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
* [Pre-training training performance: multi-node on DGX-1 16G](#pre-training-training-performance-multi-node-on-dgx-1-16g)
* [Fine-tuning training performance for NER on DGX-1 16G](#fine-tuning-training-performance-for-ner-on-dgx-1-16g)
* [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
* [Fine-tuning training performance for NER on DGX-1 32G](#fine-tuning-training-performance-for-ner-on-dgx-1-32g)
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
* [Pre-training training performance: multi-node on DGX-2H 32G](#pre-training-training-performance-multi-node-on-dgx-2h-32g)
* [Fine-tuning training performance for NER on DGX-2 32G](#fine-tuning-training-performance-for-ner-on-dgx-2-32g)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
In the original [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper, pre-training is done on [Wikipedia](https://dumps.wikimedia.org/) and [Books Corpus](http://yknzhu.wixsite.com/mbweb), with state-of-the-art results demonstrated on [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (Stanford Question Answering Dataset) benchmark.
Meanwhile, many works, including [BioBERT](https://arxiv.org/pdf/1901.08746.pdf), [SciBERT](https://arxiv.org/pdf/1903.10676.pdf), [NCBI-BERT](https://arxiv.org/pdf/1906.05474.pdf), [ClinicalBERT (MIT)](https://arxiv.org/pdf/1904.03323.pdf), [ClinicalBERT (NYU, Princeton)](https://arxiv.org/pdf/1904.05342.pdf), and others at [BioNLP19 workshop](https://aclweb.org/aclwiki/BioNLP_Workshop), show that additional pre-training of BERT on large biomedical text corpus such as [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) results in better performance in biomedical text-mining tasks.
This repository provides scripts and recipe to adopt the [NVIDIA BERT code-base](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) to achieve state-of-the-art results in the following biomedical text-mining benchmark tasks:
- [BC5CDR-disease](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/track-3-cdr/) A Named-Entity-Recognition task to recognize diseases mentioned in a collection of 1500 PubMed titles and abstracts ([Li et al., 2016](https://academic.oup.com/database/article/doi/10.1093/database/baw068/2630414))
- [BC5CDR-chemical](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/track-3-cdr/) A Named-Entity-Recognition task to recognize chemicals mentioned in a collection of 1500 PubMed titles and abstracts ([Li et al., 2016](https://academic.oup.com/database/article/doi/10.1093/database/baw068/2630414))
- [ChemProt](https://biocreative.bioinformatics.udel.edu/news/corpora/) A Relation-Extraction task to determine chemical-protein interactions in a collection of 1820 PubMed abstracts ([Krallinger et al., 2017](https://biocreative.bioinformatics.udel.edu/media/store/files/2017/ProceedingsBCVI_v2.pdf?page=141))
## Quick Start Guide
To pretrain or fine tune your model for BioMedical tasks using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/TensorFlow/LanguageModeling/BERT
```
2. Build the BERT TensorFlow NGC container.
```bash
bash scripts/docker/build.sh
```
3. Download and preprocess the PubMed dataset.
To download and preprocess pre-training data as well as the required vocab files, run the following script:
```bash
bash biobert/scripts/biobert_data_download.sh
```
Datasets for finetuning can be obtained from this [repository](https://github.com/ncbi-nlp/BLUE_Benchmark/releases/tag/0.1)
Place them in `/workspace/bert/data/biobert/` to be automatically picked up by our scripts.
4. Start an interactive session in the NGC container to run training/inference.
After you build the container image and download the data, you can start an interactive CLI session as follows:
```bash
bash scripts/docker/launch.sh
```
5. Download the pre-trained checkpoint, vocabulary, and configuration files.
We have uploaded checkpoints for fine tuning and pre-training on biomedical corpora to the NGC Model Registry. You can download them directly from the [NGC model catalog](https://ngc.nvidia.com/catalog/models).
Place our `BioBERT checkpoints` in the `results/` directory to easily access them in your scripts.
6. Start pre-training.
From within the container, you can use the following script to run the 1st phase of the pre-training using cased vocabulary:
```bash
bash biobert/scripts/run_pretraining-pubmed_base_phase_1.sh <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpus> <warmup_steps> <train_steps> <num_accumulation_steps> <save_checkpoint_steps> <eval_batch_size>
```
For the 2nd phase of the pre-training, issue:
```bash
bash biobert/scripts/run_pretraining-pubmed_base_phase_2.sh <path_to_phase_1_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpus> <warmup_steps> <train_steps> <num_accumulation_steps> <save_checkpoint_steps> <eval_batch_size>
```
Refer to the [Multi-node section](#multi-node) for details on utilizing multiple nodes for faster pretraining.
7. Start fine tuning.
The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art biomedical text-mining system.
From within the container, you can use the following script to run fine-tuning for NER.
Note: The scripts assume you are running on 16 V100 32GB GPUs. If you are running on GPUs with less than 32GB of memory or on fewer GPUs, the batch size, learning rate and number of GPUs need to be adjusted.
For NER on disease entities:
```bash
bash biobert/scripts/ner_bc5cdr-disease.sh <init_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpu> <seq_length> <bert_model> <eval_batch_size> <epochs>
```
For NER on chemical entities:
```bash
bash biobert/scripts/ner_bc5cdr-chem.sh <init_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpu> <seq_length> <bert_model> <eval_batch_size> <epochs>
```
For relation extraction, issue:
```
bash biobert/scripts/rel_chemprot.sh <init_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpu> <seq_length> <bert_model> <eval_batch_size> <epochs>
```
8. Start validation/evaluation.
The `biobert/scripts/run_biobert_finetuning_inference.sh` script runs inference on a checkpoint fine tuned for a specific task and evaluates the validity of predictions on the basis of F1, precision and recall scores.
```bash
bash biobert/scripts/run_biobert_finetuning_inference.sh <task> <init_checkpoint> <bert_model> <cased> <precision> <use_xla> <batch_size>
```
For FP16 inference for NER on the BC5CDR Chemical task with XLA using a DGX-2 V100 32G, run:
```bash
bash biobert/scripts/run_biobert_finetuning_inference.sh ner_bc5cdr-chem /results/model.ckpt base false fp16 true 16
```
Tasks `ner_bc5cdr-chem`, `ner_bc5cdr-disease` and `rel_chemprot` are currently supported.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In addition to BERT TensorFlow files, the most important files added for NER and RE fine tuning tasks are:
* `run_ner.py` - Serves as an entry point for NER training.
* `run_re.py` - Serves as an entry point for RE training.
The `biobert/scripts/` folder encapsulates all the one-click scripts required for running various functionalities supported such as:
* `ner_bc5cdr-chem.sh` - Runs NER training and inference on the BC5CDR Chemical dataset using the `run_ner.py` file.
* `ner_bc5cdr-disease.sh` - Runs NER training and inference on the BC5CDR Disease dataset using the `run_ner.py` file.
* `rel_chemprot.sh` - Runs RE training and inference on the ChemProt dataset using the `run_re.py` file.
* `run_pretraining_pubmed_base_phase_*.sh` - Runs pre-training with LAMB optimizer using the `run_pretraining.py` file in two phases. Phase 1 does training with sequence length = 128. In phase 2, the remaining 10% of the training is done with sequence length = 512.
* `biobert_data_download.sh` - Downloads the PubMed dataset and Vocab files using files in the `data/` folder.
* `run_biobert_finetuning_inference.sh` - Runs task specific inference using a fine tuned checkpoint.
### Parameters
Aside from the options to set hyperparameters, some relevant options to control the behaviour of the `run_ner.py` and `run_re.py` scripts are:
```
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--vocab_file: The vocabulary file that the BERT model was trained on.
--output_dir: The output directory where the model checkpoints will be written.
--[no]do_eval: Whether to run evaluation on the dev set. (default: 'false')
--[no]do_predict: Whether to run evaluation on the test set. (default: 'false')
--[no]do_train: Whether to run training. (default: 'false')
--learning_rate: The initial learning rate for Adam.(default: '5e-06')(a number)
--max_seq_length: The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded.(default: '384')(an integer)
--predict_batch_size: Total batch size for predictions.(default: '8')(an integer)
--train_batch_size: Total batch size for training (default: '8')(an integer)
--[no]use_fp16: Whether to enable AMP ops.(default: 'false')
--[no]use_xla: Whether to enable XLA JIT compilation.(default: 'false')
--init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
--num_train_epochs: Total number of training epochs to perform.(default: '3.0')(a number)
```
Note: When initializing from a checkpoint using `--init_checkpoint` and a corpus of your choice, keep in mind that `bert_config_file` and `vocab_file` should remain unchanged.
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option with the Python file, for example:
```bash
python run_ner.py --help
python run_re.py --help
```
### Getting the data
For pre-training BERT, we use the PubMed Dataset. For PubMed, we extract the xml files which are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
The next step is to run `create_pretraining_data.py` with the document level corpus as input, which generates input data and labels for the masked language modeling and next sentence prediction tasks. Pre-training can also be performed on any corpus of your choice. The collection of data generation scripts are intended to be modular to allow modifications for additional preprocessing steps or to use additional data. They can hence easily be modified for an arbitrary corpus.
The preparation of an individual pre-training dataset is described in the `create_biobert_datasets_from_start.sh` script found in the `data/` folder. The component steps to prepare the datasets are as follows:
1. Data download and extract - the dataset is downloaded and extracted.
2. Clean and format - document tags, etc. are removed from the dataset. The end result of this step is a `{dataset_name_one_article_per_line}.txt` file that contains the entire corpus. Each line in the text file contains an entire document from the corpus. One file per dataset is created in the `formatted_one_article_per_line` folder.
3. Sharding - the sentence segmented corpus file is split into a number of smaller text documents. The sharding is configured so that a document will not be split between two shards. Sentence segmentation is performed at this time using NLTK.
4. TFRecord file creation - each text file shard is processed by the `create_pretraining_data.py` script to produce a corresponding TFRecord file. The script generates input data and labels for masked language modeling and sentence prediction tasks for the input text shard.
For fine tuning BioBERT for the task of Named Entity Recognition and Relation Extraction Tasks, we use BC5CDR and Chemprot Datasets. BC5CDR corpus consists of 1500 PubMed articles with 4409 annotated chemicals, 5818 diseases and 3116 chemical-disease interactions.
ChemProt corpus consists of text exhaustively annotated by hand with mentions of chemical compounds/drugs and genes/proteins, as well as 22 different types of compound-protein relations focussing on 5 important relation classes. It was preprocessed following [Lim and Kang](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6014134/) guidelines.
#### Dataset guidelines
The procedure to prepare a text corpus for pre-training is described in the previous section. This section provides additional insight into how exactly raw text is processed so that it is ready for pre-training.
First, raw text is tokenized using [WordPiece tokenization](https://arxiv.org/pdf/1609.08144.pdf). A [CLS] token is inserted at the start of every sequence, and the two sentences in the sequence are separated by a [SEP] token.
Note: BERT pre-training looks at pairs of sentences at a time. A sentence embedding token [A] is added to the first sentence and token [B] to the next.
BERT pre-training optimizes for two unsupervised classification tasks. The first is Masked Language Modelling (Masked LM). One training instance of Masked LM is a single modified sentence. Each token in the sentence has a 15% chance of being replaced by a [MASK] token. The chosen token is replaced with [MASK] 80% of the time, 10% with another random token and the remaining 10% with the same token. The task is then to predict the original token.
The second task is next sentence prediction. One training instance of BERT pre-training is two sentences (a sentence pair). A sentence pair may be constructed by simply taking two adjacent sentences from a single document, or by pairing up two random sentences with equal probability. The goal of this task is to predict whether or not the second sentence followed the first in the original document.
The `create_pretraining_data.py` script takes in raw text and creates training instances for both pre-training tasks.
#### Multi-dataset
We are able to combine multiple datasets into a single dataset for pre-training on a diverse text corpus. Once TFRecords have been created for each component dataset, you can create a combined dataset by adding the directory to `*FILES_DIR` in `run_pretraining_*.sh`. This will feed all matching files to the input pipeline in `run_pretraining.py`. However, in the training process, only one TFRecord file is consumed at a time, therefore, the training instances of any given training batch will all belong to the same source dataset.
### Training process
The training process consists of two steps: pre-training and fine tuning.
#### Pre-training
BERT is designed to pre-train deep bidirectional language representations. The following scripts pre-train BERT on the PubMed dataset. These scripts are general and can be used for pre-training language representations on additional corpora of biomedical text.
Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `biobert/scripts/run_pretraining_pubmed_base_phase_1.sh` and `biobert/scripts/run_pretraining_pubmed_base_phase_2.sh` scripts.
The `biobert/scripts/run_pretraining_pubmed_base_phase*.sh` scripts run a job on a single node that trains the BERT-base model from scratch using the PubMed Corpus dataset as training data. By default, the training script:
- Runs on 16 GPUs
- Has FP16 precision enabled
- Is XLA enabled
- Creates a log file containing all the output
- Saves a checkpoint every 5000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Evaluates the model at the end of each phase
- Phase 1
- Runs 19531 steps with 1953 warmup steps
- Sets Maximum sequence length as 128
- Sets Global Batch size as 64K
- Phase 2
- Runs 4340 steps with 434 warm-up steps
- Sets Maximum sequence length as 512
- Sets Global Batch size as 32K
- Should start from Phase1's final checkpoint
These parameters train PubMed with reasonable accuracy on a DGX-2 with 32GB V100 cards.
For example:
```bash
biobert/scripts/run_pretraining-pubmed_base_phase_1.sh <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpus> <warmup_steps> <train_steps> <num_accumulation_steps> <save_checkpoint_steps> <eval_batch_size>
```
Where:
- `<train_batch_size>` is the per-GPU batch size used for training. Batch size varies with precision; larger batch sizes run more efficiently but require more memory.
- `<learning_rate>` is the learning rate. The default of 3.2e-5 is good for a global batch size of 64K.
- `<cased>` is set to `true` or `false` depending on whether the model should be trained on cased or uncased data.
- `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. Specifically:
- `fp32` is 32-bit IEEE single precision floats.
- `fp16` is Automatic rewrite of TensorFlow compute graph to take advantage of 16-bit arithmetic whenever it is safe.
- `<num_gpus>` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
- `<warmup_steps>` is the number of warm-up steps at the start of training.
- `<train_steps>` is the total number of training steps.
- `<save_checkpoint_steps>` controls how often checkpoints are saved. Default is 5000 steps.
- `<num_accumulation_steps>` is used to mimic higher batch sizes in the respective phase by accumulating gradients N times before weight update.
- `<bert_model>` is used to indicate whether to pretrain BERT Large or BERT Base model.
- `<eval_batch_size>` is per-GPU batch size used for evaluation after training.
The following sample code trains phase 1 of BERT-base from scratch on a single DGX-2 using FP16 arithmetic and uncased data.
```bash
biobert/scripts/run_pretraining-pubmed_base_phase_1.sh 128 3.2e-5 false fp16 true 16 1953 19531 32 5000 80
```
#### Fine tuning
Fine tuning is performed using the `run_ner.py` script along with parameters defined in `biobert/scripts/ner_bc5cdr*.sh`.
For example, `biobert/scripts/ner_bc5cdr-chem.sh` script trains a model and performs evaluation on the BC5CDR Chemical dataset. By default, the training script:
- Trains on BERT Base Uncased Model
- Uses 16 GPUs and batch size of 8 on each GPU
- Has FP16 precision enabled
- Is XLA enabled
- Runs for 10 epochs
- Evaluation is done at the end of training. To skip evaluation, modify `--do_eval` and `--do_predict` to `False`.
This script outputs checkpoints to the `/results` directory, by default, inside the container. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. The training log contains information about:
- Loss for the final step
- Training and evaluation performance
- F1, Precision and Recall on the Test Set of BC5CDR Chemical after evaluation.
The summary after training is printed in the following format:
```bash
0: /results/biobert_finetune_ner_chem_191028154209/test_labels.txt
0: /results/biobert_finetune_ner_chem_191028154209/test_labels_errs.txt
0: processed 124669 tokens with 5433 phrases; found: 5484 phrases; correct: 5102.
0: accuracy: 99.26%; precision: 93.03%; recall: 93.91%; FB1: 93.47
0: : precision: 93.03%; recall: 93.91%; FB1: 93.47 5484
```
Multi-GPU training is enabled with the Horovod TensorFlow module. The following example runs training on 16 GPUs:
```bash
BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
DATA_DIR=data/biobert/BC5CDR/chem
mpi_command="mpirun -np 16 -H localhost:16 \
--allow-run-as-root -bind-to none -map-by slot \
-x NCCL_DEBUG=INFO \
-x LD_LIBRARY_PATH \
-x PATH -mca pml ob1 -mca btl ^openib" \
python run_ner.py --horovod --use_fp16 --use_xla \
--vocab_file=$BERT_DIR/vocab.txt \
--bert_config_file=$BERT_DIR/bert_config.json \
--output_dir=/results --data_dir=$DATA_DIR"
```
#### Multi-node
Multi-node runs can be launched on a pyxis/enroot Slurm cluster (see [Requirements](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT#requirements)) with the `biobert/scripts/run_biobert.sub` script with the following command for a 4-node DGX2 example for both phase 1 and phase 2:
```bash
BATCHSIZE=128 LEARNING_RATE='8e-6' NUM_ACCUMULATION_STEPS=8 PHASE=1 sbatch -N4 --ntasks-per-node=16 biobert/scripts/run_biobert.sub
BATCHSIZE=16 LEARNING_RATE='3.2e-5' NUM_ACCUMULATION_STEPS=32 PHASE=2 sbatch -N4 --ntasks-per-node=16 biobert/scripts/run_biobert.sub
```
Checkpoint after phase 1 will be saved in `checkpointdir` specified in `biobert/scripts/run_biobert.sub`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
Variables to re-run the [Training performance results](#training-performance-results) are available in the `configurations.yml` file.
The batch variables `BATCHSIZE`, `LEARNING_RATE`, `NUM_ACCUMULATION_STEPS` refer to the Python arguments `train_batch_size`, `learning_rate`, `num_accumulation_steps` respectively.
The variable `PHASE` refers to phase specific arguments available in `biobert/scripts/run_biobert.sub`.
Note that the `biobert/scripts/run_biobert.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase.
Refer to the file contents to see the full list of variables to adjust for your system.
### Inference process
Inference on a fine tuned model for Bio Medical tasks is performed using the `run_ner.py` or `run_re.py` script along with parameters defined in `biobert/scripts/run_biobert_finetuning_inference.sh`. Inference is supported on a single GPU.
The `biobert/scripts/run_biobert_finetuning_inference.sh` script performs evaluation on ChemProt or BC5CDR datasets depending on the task specified. By default, the inferencing script:
- Uses BC5CDR Chemical dataset
- Has FP16 precision enabled
- Is XLA enabled
- Evaluates the latest checkpoint present in `/results` with a batch size of 16.
This script computes F1, Precision and Recall scores. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file.
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
Both of these benchmarking scripts enable you to run a number of epochs, extract performance numbers, and run the BERT model for fine tuning.
#### Training performance benchmark
Training benchmarking can be performed by running the script:
``` bash
biobert/scripts/biobert_finetune_training_benchmark.sh <task> <num_gpu> <bert_model> <cased>
```
This script runs 2 epochs by default on the NER BC5CDR dataset and extracts performance numbers for various batch sizes and sequence lengths in both FP16 and FP32. These numbers are saved at `/results/tf_bert_biobert_<task>_training_benchmark__<bert_model>_<cased/uncased>_num_gpu_<num_gpu>_<DATESTAMP>`
#### Inference performance benchmark
Inference benchmarking can be performed by running the script:
``` bash
biobert/scripts/biobert_finetune_inference_benchmark.sh <task> <bert_model> <cased>
```
This script runs inference on the test and dev sets and extracts performance and latency numbers for various batch sizes and sequence lengths in both FP16 with XLA and FP32 without XLA. These numbers are saved at `/results/tf_bert_biobert_<task>_training_benchmark__<bert_model>_<cased/uncased>_num_gpu_<num_gpu>_<DATESTAMP>`
## Results
The following sections provide detailed results of downstream fine-tuning task on NER and RE benchmark tasks.
### Training accuracy results
#### Pre-training accuracy
Our results were obtained by running the `scripts/run_pretraining_lamb.sh` training script in the TensorFlow 19.08-py3 NGC container.
| **DGX System** | **Nodes** | **Precision** | **Batch Size/GPU: Phase1, Phase2** | **Accumulation Steps: Phase1, Phase2** | **Time to Train (Hrs)** | **Final Loss** |
|----------------|-----------|---------------|------------------------------------|----------------------------------------|----------------|-------------------------|
| DGX2H | 4 | FP16 | 128, 16 | 8, 32 | 19.14 | 0.88 |
| DGX2H | 16 | FP16 | 128, 16 | 2, 8 | 4.81 | 0.86 |
| DGX2H | 32 | FP16 | 128, 16 | 1, 4 | 2.65 | 0.87 |
#### Fine-tuning accuracy
| **Task** | **F1** | **Precision** | **Recall** |
|:-------:|:----:|:----:|:----:|
| NER BC5CDR-chemical | 93.47 | 93.03 | 93.91 |
| NER BC5CDR-disease | 86.22 | 85.05 | 87.43 |
| RE Chemprot | 76.27 | 77.62 | 74.98 |
##### Fine-tuning accuracy for NER Chem
Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container.
| **DGX System** | **Batch size / GPU** | **F1 - FP32** | **F1- mixed precision** | **Time to Train - FP32 (Minutes)** | **Time to Train - mixed precision (Minutes)** |
|:---:|:----:|:----:|:---:|:----:|:----:|
| DGX-1 16G | 64 |93.33|93.40|23.95|14.13|
| DGX-1 32G | 64 |93.31|93.36|24.35|12.63|
| DGX-2 32G | 64 |93.66|93.47|12.26|8.16|
### Training stability test
#### Fine-tuning stability test:
The following table compares F1 scores across 5 different training runs on the NER Chemical task with different seeds, for both FP16 and FP32. The runs showcase consistent convergence on all 5 seeds with very little deviation.
| **16 x V100 GPUs** | **seed 1** | **seed 2** | **seed 3** | **seed 4** | **seed 5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
| F1 Score (FP16) | 93.13 | 92.92 | 93.34 | 93.66 | 93.47 | 93.3 | 0.29 |
| F1 Score (FP32) | 93.1 | 93.28 | 93.33 | 93.45 | 93.17 | 93.27 | 0.14 |
### Training performance results
#### Training performance: NVIDIA DGX-1 (8x V100 16G)
##### Pre-training training performance: multi-node on DGX-1 16G
Our results were obtained by running the `biobert/scripts/run_biobert.sub` training script in the TensorFlow 19.08-py3 NGC container using multiple NVIDIA DGX-1 with 8x V100 16G GPUs. Performance (in sentences per second) is the steady state throughput.
| **Nodes** | **Sequence Length**| **Batch size / GPU: mixed precision, FP32** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
|:-------:|:-----:|:-------:|:-------:|:-------:|:-------------:|:------:|:------:|
| 1 | 128 | 64,32 | 2762.06 | 744.48 | 3.71 | 1.00 | 1.00 |
| 4 | 128 | 64,32 | 10283.08 | 2762.88 | 3.72 | 3.72 | 3.71 |
| 16 | 128 | 64,32 | 39051.69 | 10715.14 | 3.64 | 14.14 | 14.39 |
| 32 | 128 | 64,32 | 76077.39 | 21104.87 | 3.60 | 27.54 | 28.35 |
| 1 | 512 | 8,8 | 432.33 | 160.38 | 2.70 | 1.00 | 1.00 |
| 4 | 512 | 8,8 | 1593.00 | 604.36 | 2.64 | 3.68 | 3.77 |
| 16 | 512 | 8,8 | 5941.82 | 2356.44 | 2.52 | 13.74 | 14.69 |
| 32 | 512 | 8,8 | 11483.73 | 4631.29 | 2.48 | 26.56 | 28.88 |
Note: The respective values for FP32 runs that use a batch size of 16, 2 in sequence lengths 128 and 512 respectively are not available due to out of memory errors that arise.
##### Fine-tuning training performance for NER on DGX-1 16G
Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance (in sentences per second) is the mean throughput from 2 epochs.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
| 1 | 64 | 147.71 | 348.84 | 2.36 | 1.00 | 1.00 |
| 4 | 64 | 583.78 | 1145.46 | 1.96 | 3.95 | 3.28 |
| 8 | 64 | 981.22 | 1964.85 | 2.00 | 6.64 | 5.63 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### Training performance: NVIDIA DGX-1 (8x V100 32G)
##### Fine-tuning training performance for NER on DGX-1 32G
Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance (in sentences per second) is the mean throughput from 2 epochs.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
| 1 | 64 | 144.1 | 417.39 | 2.89 | 1.00 | 1.00 |
| 4 | 64 | 525.15 | 1354.14 | 2.57 | 3.64 | 3.24 |
| 8 | 64 | 969.4 | 2341.39 | 2.41 | 6.73 | 5.61 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### Training performance: NVIDIA DGX-2 (16x V100 32G)
##### Pre-training training performance: multi-node on DGX-2H 32G
Our results were obtained by running the `biobert/scripts/run_biobert.sub` training script in the TensorFlow 19.08-py3 NGC container using multiple NVIDIA DGX-2H with 16x V100 32G GPUs. Performance (in sentences per second) is the steady state throughput.
| **Nodes** | **Sequence Length**| **Batch size / GPU: mixed precision, FP32** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
|:-------:|:-----:|:-------:|:-------:|:-------:|:-------------:|:------:|:------:|
| 1 | 128 | 128,128 | 7772.18 | 2165.04 | 3.59 | 1.00 | 1.00 |
| 4 | 128 | 128,128 | 29785.31 | 8516.90 | 3.50 | 3.83 | 3.93 |
| 16 | 128 | 128,128 | 115581.29 | 33699.15 | 3.43 | 14.87 | 15.57 |
| 32 | 128 | 128,128 | 226156.53 | 66996.73 | 3.38 | 29.10 | 30.94 |
| 64 | 128 | 128,128 | 444955.74 | 133424.95 | 3.33 | 57.25 | 61.63 |
| 1 | 512 | 16,16 | 1260.06 | 416.92 | 3.02 | 1.00 | 1.00 |
| 4 | 512 | 16,16 | 4781.19 | 1626.76 | 2.94 | 3.79 | 3.90 |
| 16 | 512 | 16,16 | 18405.65 | 6418.09 | 2.87 | 14.61 | 15.39 |
| 32 | 512 | 16,16 | 36071.06 | 12713.67 | 2.84 | 28.63 | 30.49 |
| 64 | 512 | 16,16 | 69950.86 | 25245.96 | 2.77 | 55.51 | 60.55 |
##### Fine-tuning training performance for NER on DGX-2 32G
Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs. Performance (in sentences per second) is the mean throughput from 2 epochs.
| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
| 1 | 64 | 139.59 | 475.54 | 3.4 | 1.00 | 1.00 |
| 4 | 64 | 517.08 | 1544.01 | 2.98 | 3.70 | 3.25 |
| 8 | 64 | 1009.84 | 2695.34 | 2.66 | 7.23 | 5.67 |
| 16 | 64 | 1997.73 | 4268.81 | 2.13 | 14.31 | 8.98 |
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
## Release notes
### Changelog
November 2019
- Initial release
### Known issues
- There are no known issues with the model.
@@ -0,0 +1,302 @@
# Python version of the evaluation script from CoNLL'00.
# Originates from: https://github.com/spyysalo/conlleval.py
# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
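# Added: evaluate(iterable, options=None) accepts any iterable of lines, so it
# can be called directly without going through the command line; return_report()
# returns the report as a list of strings instead of printing it.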
import sys
import re
import codecs
from collections import defaultdict, namedtuple
# Sentinel value of the --delimiter option: when the delimiter equals this,
# evaluate() splits each line on any run of whitespace (str.split() with no
# argument) instead of on a specific character.
ANY_SPACE = '<SPACE>'
class FormatError(Exception):
    """Raised by evaluate() when an input line has an unexpected number of columns."""
# Result record produced by calculate_metrics(): raw true-positive /
# false-positive / false-negative counts plus precision, recall and F-score.
Metrics = namedtuple(
    'Metrics',
    ['tp', 'fp', 'fn', 'prec', 'rec', 'fscore'],
)
class EvalCounts(object):
    """Running tallies filled in by evaluate() while it walks the tagged lines."""

    def __init__(self):
        # Per-type tallies, keyed by chunk type; unseen types read as 0.
        self.t_found_guessed = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_correct_chunk = defaultdict(int)

        # Overall tallies.
        self.token_counter = 0   # token counter (ignores sentence breaks)
        self.found_guessed = 0   # number of identified chunks
        self.found_correct = 0   # number of chunks in corpus
        self.correct_tags = 0    # number of correct chunk tags
        self.correct_chunk = 0   # number of correctly identified chunks
def parse_args(argv):
    """Parse evaluation options from *argv* (a list that excludes the program name).

    Returns the argparse namespace with ``boundary``, ``delimiter``, ``otag``
    and the optional positional ``file``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='evaluate tagging results using CoNLL criteria',
    )
    parser.add_argument('-b', '--boundary', default='-X-', metavar='STR',
                        help='sentence boundary')
    parser.add_argument('-d', '--delimiter', default=ANY_SPACE, metavar='CHAR',
                        help='character delimiting items in input')
    parser.add_argument('-o', '--otag', default='O', metavar='CHAR',
                        help='alternative outside tag')
    # Optional input path; None means the caller falls back to another source.
    parser.add_argument('file', nargs='?', default=None)

    options = parser.parse_args(argv)
    return options
def parse_tag(t):
    """Split a tag such as ``B-Chemical`` at its first hyphen.

    Returns a ``(prefix, type)`` pair; a tag with no hyphen (e.g. ``O``) is
    returned as ``(t, '')``.
    """
    match = re.match(r'^([^-]*)-(.*)$', t)
    if not match:
        return (t, '')
    return match.groups()
def evaluate(iterable, options=None):
    """Score tagged lines and return the accumulated EvalCounts.

    Each non-blank line must contain at least three columns; the last column
    is the predicted tag and the one before it is the gold tag. A blank line,
    or a line whose first column equals ``options.boundary``, is treated as a
    sentence boundary.

    Args:
        iterable: any iterable of text lines (an open file, a list, ...).
        options: namespace as returned by parse_args(); when None the
            parse_args([]) defaults are used.

    Returns:
        EvalCounts with overall and per-type chunk/token tallies.

    Raises:
        FormatError: if a line has fewer than three columns or a column count
            that differs from the first non-blank line.
    """
    if options is None:
        options = parse_args([])    # use defaults

    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunk is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previous chunk tag in corpus
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previously identified chunk tag

    for i, line in enumerate(iterable):
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            # ''.split(sep) yields [''] rather than []; map an empty line to
            # "no features" so it is handled as a sentence boundary below.
            features = line.split(options.delimiter) if line else []

        # Only non-blank lines define / are checked against the column count;
        # otherwise a leading blank line would pin the expected count to 0 and
        # every following data line would be rejected.
        if features:
            if num_features is None:
                num_features = len(features)
            elif num_features != len(features):
                raise FormatError('unexpected number of features: %d (%d) at line %d\n%s' %
                                  (len(features), num_features, i, line))

        if len(features) == 0 or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            if (end_correct and end_guessed and
                    last_guessed_type == last_correct_type):
                # Gold and predicted chunk ended together with the same type.
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                # Boundaries or types diverged: the open chunk is wrong.
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    # A chunk still open (and still matching) at end of input counts as correct.
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts
def uniq(iterable):
    """Return the items of *iterable* as a list, dropping repeats and keeping first-seen order."""
    already_seen = set()
    ordered = []
    for item in iterable:
        if item in already_seen:
            continue
        already_seen.add(item)
        ordered.append(item)
    return ordered
def calculate_metrics(correct, guessed, total):
    """Derive precision, recall and F-score from chunk counts.

    Args:
        correct: number of correctly identified chunks (true positives).
        guessed: number of chunks that were predicted.
        total: number of chunks in the gold data.

    Returns:
        Metrics(tp, fp, fn, prec, rec, fscore); any ratio whose denominator
        is zero is reported as 0.
    """
    tp = correct
    fp = guessed - correct
    fn = total - correct

    if tp + fp == 0:
        prec = 0
    else:
        prec = 1.*tp / (tp + fp)

    if tp + fn == 0:
        rec = 0
    else:
        rec = 1.*tp / (tp + fn)

    if prec + rec == 0:
        fscore = 0
    else:
        fscore = 2 * prec * rec / (prec + rec)

    return Metrics(tp, fp, fn, prec, rec, fscore)
def metrics(counts):
    """Compute overall and per-type Metrics from an EvalCounts instance.

    Returns:
        ``(overall, by_type)`` where ``by_type`` maps each chunk type seen in
        either the gold or the predicted tags to its own Metrics.
    """
    overall = calculate_metrics(
        counts.correct_chunk, counts.found_guessed, counts.found_correct
    )

    # Types from the gold side first, then any that only appear in predictions.
    chunk_types = uniq(
        list(counts.t_found_correct) + list(counts.t_found_guessed)
    )
    by_type = {
        chunk_type: calculate_metrics(
            counts.t_correct_chunk[chunk_type],
            counts.t_found_guessed[chunk_type],
            counts.t_found_correct[chunk_type],
        )
        for chunk_type in chunk_types
    }
    return overall, by_type
def report(counts, out=None):
    """Write the evaluation summary for *counts* to *out*.

    Args:
        counts: EvalCounts to summarise.
        out: object with a ``write`` method; defaults to ``sys.stdout``.
    """
    stream = sys.stdout if out is None else out
    emit = stream.write

    overall, by_type = metrics(counts)

    # Header: corpus size and raw chunk counts.
    emit('processed %d tokens with %d phrases; ' %
         (counts.token_counter, counts.found_correct))
    emit('found: %d phrases; correct: %d.\n' %
         (counts.found_guessed, counts.correct_chunk))

    # Overall scores are only printed when at least one token was processed
    # (token accuracy would otherwise divide by zero).
    if counts.token_counter > 0:
        emit('accuracy: %6.2f%%; ' %
             (100.*counts.correct_tags/counts.token_counter))
        emit('precision: %6.2f%%; ' % (100.*overall.prec))
        emit('recall: %6.2f%%; ' % (100.*overall.rec))
        emit('FB1: %6.2f\n' % (100.*overall.fscore))

    # One line per chunk type, in sorted order.
    for chunk_type, scores in sorted(by_type.items()):
        emit('%17s: ' % chunk_type)
        emit('precision: %6.2f%%; ' % (100.*scores.prec))
        emit('recall: %6.2f%%; ' % (100.*scores.rec))
        emit('FB1: %6.2f %d\n' %
             (100.*scores.fscore, counts.t_found_guessed[chunk_type]))
def report_notprint(counts, out=None):
    """Build the evaluation summary for *counts* as a list of strings.

    Produces the same text as report(), but returns it instead of writing it.

    Args:
        counts: EvalCounts to summarise.
        out: unused; kept so the signature stays parallel to report().

    Returns:
        A list of newline-terminated strings: the header line, the overall
        scores line (only when at least one token was processed), then one
        line per chunk type in sorted order.
    """
    overall, by_type = metrics(counts)
    c = counts

    final_report = [
        'processed %d tokens with %d phrases; ' %
        (c.token_counter, c.found_correct) +
        'found: %d phrases; correct: %d.\n' %
        (c.found_guessed, c.correct_chunk)
    ]

    # Token accuracy divides by the token count, so skip it for empty input.
    if c.token_counter > 0:
        final_report.append(
            'accuracy: %6.2f%%; ' % (100.*c.correct_tags/c.token_counter) +
            'precision: %6.2f%%; ' % (100.*overall.prec) +
            'recall: %6.2f%%; ' % (100.*overall.rec) +
            'FB1: %6.2f\n' % (100.*overall.fscore)
        )

    for i, m in sorted(by_type.items()):
        final_report.append(
            '%17s: ' % i +
            'precision: %6.2f%%; ' % (100.*m.prec) +
            'recall: %6.2f%%; ' % (100.*m.rec) +
            'FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])
        )
    return final_report
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk ended between the previous and the current word.

    Args:
        prev_tag, tag: previous and current chunk tags (B, I, E, S, O, ...).
        prev_type, type_: previous and current chunk types.

    Returns:
        True if the previous word closed a chunk, False otherwise.
    """
    # E and S close a chunk by definition; '[' and ']' chunks are assumed to
    # have length 1, so they always end immediately.
    if prev_tag in ('E', 'S', '[', ']'):
        return True

    # An open chunk (B or I) ends when the next word starts a new chunk or
    # lies outside any chunk.
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True

    # A change of type also closes whatever chunk was open.
    return prev_tag not in ('O', '.') and prev_type != type_
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk started between the previous and the current word.

    Args:
        prev_tag, tag: previous and current chunk tags (B, I, E, S, O, ...).
        prev_type, type_: previous and current chunk types.

    Returns:
        True if the current word opens a chunk, False otherwise.
    """
    # B and S open a chunk by definition; '[' and ']' chunks are assumed to
    # have length 1, so they always start one.
    if tag in ('B', 'S', '[', ']'):
        return True

    # An inside/end tag right after a closed chunk or an outside word can only
    # belong to a new chunk.
    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
        return True

    # A change of type on a non-outside word also opens a chunk.
    return tag not in ('O', '.') and prev_type != type_
def main(argv):
    """Command-line entry point: evaluate a file (or stdin) and print the report.

    Args:
        argv: full argument vector including the program name at index 0.
    """
    args = parse_args(argv[1:])

    if args.file is not None:
        with open(args.file) as f:
            counts = evaluate(f, args)
    else:
        # No file argument given: read the tagged lines from standard input.
        counts = evaluate(sys.stdin, args)

    report(counts)
def return_report(input_file):
    """Evaluate the tagged file at *input_file* with default options.

    Returns:
        The report as a list of strings (see report_notprint()).
    """
    with open(input_file, "r") as handle:
        tallies = evaluate(handle)
    report_lines = report_notprint(tallies)
    return report_lines
if __name__ == '__main__':
    # Run the command-line interface (optional file argument, default STDIN).
    # The previous body called return_report() on a developer-specific
    # absolute path and discarded the result, so running the script printed
    # nothing and failed on any other machine.
    sys.exit(main(sys.argv))
@@ -0,0 +1,51 @@
import os
import numpy as np
import pandas as pd
import sklearn.metrics
import argparse
# Command-line options:
#   --output_path  TSV of model scores, one row per example, no header.
#   --answer_path  TSV of gold examples; must contain a "label" column.
#   --task         which scoring scheme to apply.
parser = argparse.ArgumentParser(description='')
parser.add_argument('--output_path', type=str, help='')
parser.add_argument('--answer_path', type=str, help='')
# `choices` makes an unsupported task fail at argument parsing instead of
# falling through both branches below and crashing on an undefined `results`.
parser.add_argument('--task', type=str, default="binary", choices=["binary", "chemprot"],
                    help='default:binary, possible other options:{chemprot}')
args = parser.parse_args()


testdf = pd.read_csv(args.answer_path, sep="\t", index_col=0)
preddf = pd.read_csv(args.output_path, sep="\t", header=None)

# Predicted class for each example = index of the highest-scoring column.
pred = [preddf.iloc[i].tolist() for i in preddf.index]
pred_class = [np.argmax(v) for v in pred]

results = dict()

# binary
if args.task == "binary":
    # Per-class scores; index 1 is the positive class, and recall of class 0
    # is reported as specificity.
    p, r, f, s = sklearn.metrics.precision_recall_fscore_support(y_pred=pred_class, y_true=testdf["label"])
    results["f1 score"] = f[1]
    results["recall"] = r[1]
    results["precision"] = p[1]
    results["specificity"] = r[0]

# chemprot
# micro-average of 5 target classes
# see "Potent pairing: ensemble of long short-term memory networks and support vector machine for chemical-protein relation extraction (Mehryary, 2018)" for details
elif args.task == "chemprot":
    # Map the string labels to integer ids in sorted order so they can be
    # compared with the argmax column indices.
    str_to_int_mapper = dict()
    for i, v in enumerate(sorted(testdf["label"].unique())):
        str_to_int_mapper[v] = i
    test_answer = [str_to_int_mapper[v] for v in testdf["label"]]

    # Only label ids 0-4 enter the micro-average.
    p, r, f, s = sklearn.metrics.precision_recall_fscore_support(y_pred=pred_class, y_true=test_answer, labels=[0, 1, 2, 3, 4], average="micro")
    results["f1 score"] = f
    results["recall"] = r
    results["precision"] = p

for k, v in results.items():
    print("{:11s} : {:.2%}".format(k, v))
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Run the BioBERT dataset creation script inside the `bert` image, with the
# current directory mounted at /workspace/bert. The mount spec is quoted so a
# working directory containing spaces is passed to docker as a single argument.
docker run --runtime=nvidia -v "$PWD":/workspace/bert \
    --rm --shm-size=1g --ulimit memlock=-1 \
    --ulimit stack=67108864 --ipc=host -t -i \
    bert bash -c "bash data/create_biobert_datasets_from_start.sh"
@@ -0,0 +1,187 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Inference benchmark for the BioBERT fine-tuning tasks.
# Positional arguments (all optional):
#   $1 task        ner_bc5cdr-chem | ner_bc5cdr-disease | rel_chemprot
#   $2 bert_model  "large" selects L-24_H-1024_A-16, anything else L-12_H-768_A-12
#   $3 cased       "true" selects the cased weights, anything else uncased
task=${1:-"ner_bc5cdr-chem"}
bert_model=${2:-"base"}
cased=${3:-"false"}

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
    case_flag="--do_lower_case=False"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
    case_flag="--do_lower_case=True"
fi

if [ "$bert_model" = "large" ] ; then
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
else
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
fi

DATESTAMP=`date +'%y%m%d%H%M%S'`
printf -v TAG "tf_bert_biobert_%s_inference_benchmark_%s_%s" "$task" "$bert_model" "$CASING_DIR_PREFIX"
OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
mkdir -p ${OUTPUT_DIR}

# Sweep sequence length x batch size x precision for the selected task and
# append one result row per configuration to $LOGFILE.
# Reads DATASET_DIR, python_bin, run_script and task_name set by the caller.
run_benchmark() {
    # NOTE: log names and the header keep their existing "training" wording.
    LOGFILE="${OUTPUT_DIR}/${task}_training_benchmark_bert_${bert_model}.log"
    echo "Training performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
    echo "Precision Sequence Length Batch size Performance(sent/sec)" >> $LOGFILE

    for seq_length in 128 512; do
        for batch_size in 8 32 64; do
            for precision in fp16 fp32; do
                # The loop variable is seq_length; the previously used
                # ${seq_len} was never set, so directories for both sequence
                # lengths collided and the log column was empty.
                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_length}_prec_${precision}_bs_${batch_size}
                mkdir -p ${res_dir}
                tmp_file="${res_dir}/${task}_training_benchmark.log"

                if [ "$precision" = "fp16" ] ; then
                    echo "fp16 activated!"
                    use_fp16="--use_fp16"
                    use_xla_tag="--use_xla"
                else
                    echo "fp32 activated!"
                    use_fp16=""
                    use_xla_tag=""
                fi

                # Optional flags are deliberately left unquoted so that an
                # empty value expands to no argument at all.
                $python_bin $run_script \
                    --do_prepare=true \
                    --do_eval=true \
                    --do_predict=true \
                    --task_name="$task_name" \
                    --vocab_file=$BERT_DIR/vocab.txt \
                    --bert_config_file=$BERT_DIR/bert_config.json \
                    --init_checkpoint="$BERT_DIR/bert_model.ckpt" \
                    --data_dir=$DATASET_DIR \
                    --output_dir=$res_dir \
                    --eval_batch_size=$batch_size \
                    --predict_batch_size=$batch_size \
                    --max_seq_length=$seq_length \
                    $use_fp16 $use_xla_tag $case_flag |& tee $tmp_file

                # Take the last reported throughput line from the run's log.
                perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | tail -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
                echo "$precision $seq_length $batch_size $perf" >> $LOGFILE

            done
        done
    done
}

# Per-task settings; the interpreter name is kept exactly as each task's
# branch originally invoked it.
case "$task" in
    ner_bc5cdr-chem)
        DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/chem
        python_bin=python
        run_script=/workspace/bert/run_ner.py
        task_name="bc5cdr"
        run_benchmark
        ;;
    ner_bc5cdr-disease)
        DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
        python_bin=python3
        run_script=/workspace/bert/run_ner.py
        task_name="bc5cdr"
        run_benchmark
        ;;
    rel_chemprot)
        DATASET_DIR=/workspace/bert/data/biobert/ChemProt
        python_bin=python3
        run_script=/workspace/bert/run_re.py
        task_name="chemprot"
        run_benchmark
        ;;
    *)
        echo "Benchmarking for " $task "currently not supported. Sorry!"
        ;;
esac
@@ -0,0 +1,203 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Training benchmark for the BioBERT fine-tuning tasks.
# Positional arguments (all optional):
#   $1 task        ner_bc5cdr-chem | ner_bc5cdr-disease | rel_chemprot
#   $2 num_gpu     number of processes/GPUs; values > 1 launch through mpirun
#   $3 bert_model  "large" selects L-24_H-1024_A-16, anything else L-12_H-768_A-12
#   $4 cased       "true" selects the cased weights, anything else uncased
task=${1:-"ner_bc5cdr-chem"}
num_gpu=${2:-"2"}
bert_model=${3:-"base"}
cased=${4:-"false"}

epochs=2.0

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
    case_flag="--do_lower_case=False"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
    case_flag="--do_lower_case=True"
fi

if [ "$bert_model" = "large" ] ; then
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
else
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
fi

if [ $num_gpu -gt 1 ] ; then
    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib"
    use_hvd="--horovod"
else
    mpi_command=""
    use_hvd=""
fi

DATESTAMP=`date +'%y%m%d%H%M%S'`
printf -v TAG "tf_bert_biobert_%s_training_benchmark_%s_%s_num_gpu_%d" "$task" "$bert_model" "$CASING_DIR_PREFIX" "$num_gpu"
OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
mkdir -p ${OUTPUT_DIR}

# Sweep sequence length x batch size x precision for the selected task and
# append one result row per configuration to $LOGFILE.
# Reads DATASET_DIR, python_bin, run_script and task_name set by the caller.
run_benchmark() {
    LOGFILE="${OUTPUT_DIR}/${task}_training_benchmark_bert_${bert_model}_gpu_${num_gpu}.log"
    echo "Training performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
    echo "Precision Sequence Length Batch size Performance(sent/sec)" >> $LOGFILE

    for seq_length in 128 512; do
        for train_batch_size in 8 32 64; do
            for precision in fp16 fp32; do
                # The loop variable is train_batch_size; the previously used
                # ${batch_size} was never set, so runs with different batch
                # sizes shared one output directory.
                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${precision}_bs_${train_batch_size}
                mkdir -p ${res_dir}
                tmp_file="${res_dir}/${task}_training_benchmark.log"

                if [ "$precision" = "fp16" ] ; then
                    echo "fp16 activated!"
                    use_fp16="--use_fp16"
                    use_xla_tag="--use_xla"
                else
                    echo "fp32 activated!"
                    use_fp16=""
                    use_xla_tag=""
                fi

                # Optional flags are deliberately left unquoted so that an
                # empty value expands to no argument at all.
                $mpi_command $python_bin $run_script \
                    --do_prepare=true \
                    --do_train=true \
                    --do_eval=true \
                    --do_predict=true \
                    --task_name="$task_name" \
                    --vocab_file=$BERT_DIR/vocab.txt \
                    --bert_config_file=$BERT_DIR/bert_config.json \
                    --init_checkpoint="$BERT_DIR/bert_model.ckpt" \
                    --num_train_epochs=$epochs \
                    --data_dir=$DATASET_DIR \
                    --output_dir=$res_dir \
                    --train_batch_size=$train_batch_size \
                    --max_seq_length=$seq_length \
                    $use_hvd $use_fp16 $use_xla_tag $case_flag |& tee $tmp_file

                # Take the first reported throughput line from the run's log.
                perf=`cat $tmp_file | grep -F 'Throughput Average (sentences/sec) =' | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
                echo "$precision $seq_length $train_batch_size $perf" >> $LOGFILE

            done
        done
    done
}

# Per-task settings; the interpreter name is kept exactly as each task's
# branch originally invoked it.
case "$task" in
    ner_bc5cdr-chem)
        DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/chem
        python_bin=python
        run_script=/workspace/bert/run_ner.py
        task_name="bc5cdr"
        run_benchmark
        ;;
    ner_bc5cdr-disease)
        DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
        python_bin=python3
        run_script=/workspace/bert/run_ner.py
        task_name="bc5cdr"
        run_benchmark
        ;;
    rel_chemprot)
        DATASET_DIR=/workspace/bert/data/biobert/ChemProt
        python_bin=python3
        run_script=/workspace/bert/run_re.py
        task_name="chemprot"
        run_benchmark
        ;;
    *)
        echo "Benchmarking for " $task "currently not supported. Sorry!"
        ;;
esac
@@ -0,0 +1,86 @@
#!/bin/bash
# Fine-tune and evaluate BERT for NER on the BC5CDR chemical data set.
echo "Container nvidia build = " $NVIDIA_BUILD_ID

# Positional arguments (all optional; the default follows ":-").
init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
train_batch_size=${2:-8}
learning_rate=${3:-3.125e-6}
cased=${4:-false}
precision=${5:-"fp16"}
use_xla=${6:-"true"}
num_gpu=${7:-"16"}
seq_length=${8:-128}
bert_model=${9:-"base"}
eval_batch_size=${10:-8} #Eval and Predict BS is assumed to be same
epochs=${11:-"10.0"}

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
    case_flag="--do_lower_case=False"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
    case_flag="--do_lower_case=True"
fi

if [ "$bert_model" = "large" ] ; then
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
else
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
fi

# Global batch size = per-process batch size x number of processes.
export GBS=$(expr $train_batch_size \* $num_gpu)
printf -v TAG "tf_bert_biobert_ner_bc5cdr_chem_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`

DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/chem
OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
mkdir -p ${OUTPUT_DIR}

use_fp16=""
if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
    use_fp16="--use_fp16"
fi

if [ "$use_xla" = "true" ] ; then
    use_xla_tag="--use_xla"
    echo "XLA activated"
else
    use_xla_tag=""
fi

if [ $num_gpu -gt 1 ] ; then
    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib"
    use_hvd="--horovod"
else
    mpi_command=""
    use_hvd=""
fi

# Launch through $mpi_command (empty for a single process). The launcher was
# previously referenced as the unset variable $mpi, so mpirun was never
# invoked even though --horovod was still passed for num_gpu > 1.
$mpi_command python /workspace/bert/run_ner.py \
    --do_prepare=true \
    --do_train=true \
    --do_eval=true \
    --do_predict=true \
    --task_name=bc5cdr \
    --vocab_file=$BERT_DIR/vocab.txt \
    --bert_config_file=$BERT_DIR/bert_config.json \
    --init_checkpoint=$init_checkpoint \
    --num_train_epochs=$epochs \
    --data_dir=$DATASET_DIR \
    --output_dir=$OUTPUT_DIR \
    --learning_rate=$learning_rate \
    --train_batch_size=$train_batch_size \
    --eval_batch_size=$eval_batch_size \
    --predict_batch_size=$eval_batch_size \
    --max_seq_length=$seq_length \
    $use_hvd $use_fp16 $use_xla_tag $case_flag
@@ -0,0 +1,85 @@
#!/bin/bash
# Fine-tune and evaluate BERT for NER on the BC5CDR disease data set.
echo "Container nvidia build = " $NVIDIA_BUILD_ID

# Positional arguments (all optional; the default follows ":-").
init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
train_batch_size=${2:-8}
learning_rate=${3:-3.125e-6}
cased=${4:-false}
precision=${5:-"fp16"}
use_xla=${6:-"true"}
num_gpu=${7:-"16"}
seq_length=${8:-128}
bert_model=${9:-"base"}
eval_batch_size=${10:-8} #Eval and Predict BS is assumed to be same
epochs=${11:-"100.0"}

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
    case_flag="--do_lower_case=False"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
    case_flag="--do_lower_case=True"
fi

if [ "$bert_model" = "large" ] ; then
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
else
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
fi

# Global batch size = per-process batch size x number of processes.
export GBS=$(expr $train_batch_size \* $num_gpu)
printf -v TAG "tf_bert_biobert_ner_bc5cdr_disease_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`

DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
mkdir -p ${OUTPUT_DIR}

use_fp16=""
if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
    use_fp16="--use_fp16"
fi

if [ "$use_xla" = "true" ] ; then
    use_xla_tag="--use_xla"
    echo "XLA activated"
else
    use_xla_tag=""
fi

if [ $num_gpu -gt 1 ] ; then
    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib"
    use_hvd="--horovod"
else
    mpi_command=""
    use_hvd=""
fi

# $use_hvd and $use_fp16 are left unquoted on purpose: when empty they must
# expand to no argument at all. Quoted, an empty value is passed to the
# program as a stray "" positional argument.
$mpi_command python3 /workspace/bert/run_ner.py \
    --do_prepare=true \
    --do_train=true \
    --do_eval=true \
    --do_predict=true \
    --task_name="bc5cdr" \
    --vocab_file=$BERT_DIR/vocab.txt \
    --bert_config_file=$BERT_DIR/bert_config.json \
    --init_checkpoint=$init_checkpoint \
    --num_train_epochs=$epochs \
    --data_dir=$DATASET_DIR \
    --output_dir=$OUTPUT_DIR \
    --learning_rate=$learning_rate \
    --train_batch_size=$train_batch_size \
    --eval_batch_size=$eval_batch_size \
    --predict_batch_size=$eval_batch_size \
    --max_seq_length=$seq_length \
    $use_hvd $use_fp16 $use_xla_tag $case_flag
@@ -0,0 +1,87 @@
#!/bin/bash
# Fine-tune BERT for relation extraction on ChemProt, then score the
# predictions with re_eval.py.
echo "Container nvidia build = " $NVIDIA_BUILD_ID

# Positional arguments (all optional; the default follows ":-").
init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
train_batch_size=${2:-64}
learning_rate=${3:-1.5e-6}
cased=${4:-false}
precision=${5:-"fp16"}
use_xla=${6:-"true"}
num_gpu=${7:-"16"}
seq_length=${8:-512}
bert_model=${9:-"base"}
eval_batch_size=${10:-16} #Eval and Predict BS is assumed to be same
epochs=${11:-"3.0"}

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
    case_flag="--do_lower_case=False"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
    case_flag="--do_lower_case=True"
fi

if [ "$bert_model" = "large" ] ; then
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
else
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
fi

# Global batch size = per-process batch size x number of processes.
export GBS=$(expr $train_batch_size \* $num_gpu)
printf -v TAG "tf_bert_biobert_rel_chemprot_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`

DATASET_DIR=/workspace/bert/data/biobert/ChemProt
OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
mkdir -p ${OUTPUT_DIR}

use_fp16=""
if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
    use_fp16="--use_fp16"
fi

if [ "$use_xla" = "true" ] ; then
    use_xla_tag="--use_xla"
    echo "XLA activated"
else
    use_xla_tag=""
fi

if [ $num_gpu -gt 1 ] ; then
    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib"
    use_hvd="--horovod"
else
    mpi_command=""
    use_hvd=""
fi

# $use_hvd and $use_fp16 are left unquoted on purpose: when empty they must
# expand to no argument at all. Quoted, an empty value is passed to the
# program as a stray "" positional argument.
$mpi_command python3 /workspace/bert/run_re.py \
    --do_prepare=true \
    --do_train=true \
    --do_eval=true \
    --do_predict=true \
    --task_name="chemprot" \
    --vocab_file=$BERT_DIR/vocab.txt \
    --bert_config_file=$BERT_DIR/bert_config.json \
    --init_checkpoint=$init_checkpoint \
    --num_train_epochs=$epochs \
    --data_dir=$DATASET_DIR \
    --output_dir=$OUTPUT_DIR \
    --learning_rate=$learning_rate \
    --train_batch_size=$train_batch_size \
    --eval_batch_size=$eval_batch_size \
    --predict_batch_size=$eval_batch_size \
    --max_seq_length=$seq_length \
    $use_hvd $use_fp16 $use_xla_tag $case_flag

# Score the prediction file against the gold test set; the metrics are shown
# on the terminal and also saved next to the predictions.
python3 /workspace/bert/biobert/re_eval.py --task=chemprot --output_path=$OUTPUT_DIR/test_results.tsv \
    --answer_path=$DATASET_DIR/test.tsv |& tee $OUTPUT_DIR/test_results.txt
@@ -0,0 +1,87 @@
#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on the first error or unset variable, and echo each command.
set -eux

readonly docker_image="nvcr.io/nvidia/tensorflow:19.08-py3"
readonly datadir="/raid/data/bert"
readonly checkpointdir="$PWD/checkpoints"

# host:container mount pairs handed to srun's --container-mounts.
readonly mounts=".:/workspace/bert,${datadir}:/workspace/bert/data,${checkpointdir}:/results"

# Environment overrides: DO_LOWER_CASE (1 = uncased weights),
# DO_BERT_BASE (1 = L-12_H-768_A-12, otherwise L-24_H-1024_A-16).
DO_LOWER_CASE=${DO_LOWER_CASE:-1}
if [ "$DO_LOWER_CASE" == "1" ]; then
    CASING_DIR_PREFIX="uncased"
else
    CASING_DIR_PREFIX="cased"
fi

DO_BERT_BASE=${DO_BERT_BASE:-1}
if [ "$DO_BERT_BASE" == "1" ]; then
    CASING_DIR_SUFFIX="L-12_H-768_A-12"
else
    CASING_DIR_SUFFIX="L-24_H-1024_A-16"
fi

# Create the per-phase checkpoint directories once on every node.
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/biobert_phase_1"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/biobert_phase_2"

# Phase 1: sequence length 128. The data directories use the same absolute
# /workspace/bert/data/tfrecord prefix as phase 2; they were previously
# relative paths, which do not point at the mounted data set.
PHASE1="\
    --train_batch_size=${BATCHSIZE:-128} \
    --learning_rate=${LEARNING_RATE:-3.2e-5} \
    --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-128} \
    --input_files_dir=/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training \
    --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --num_train_steps=19531 \
    --num_warmup_steps=1953 \
    --output_dir=/results/biobert_phase_1 \
    "

# Phase 2: sequence length 512, initialised from the final phase-1 checkpoint.
PHASE2="\
    --train_batch_size=${BATCHSIZE:-16} \
    --learning_rate=${LEARNING_RATE:-6.4e-5} \
    --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-512} \
    --input_files_dir=/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training \
    --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test \
    --max_seq_length=512 \
    --max_predictions_per_seq=80 \
    --num_train_steps=4340 \
    --num_warmup_steps=434 \
    --output_dir=/results/biobert_phase_2 \
    --init_checkpoint=/results/biobert_phase_1/model.ckpt-19531 \
    "

PHASES=( "$PHASE1" "$PHASE2" )

# PHASE is 1-based (default 1); the array index is PHASE-1.
PHASE=${PHASE:-1}

BERT_CMD="\
    python /workspace/bert/run_pretraining.py \
    ${PHASES[$((PHASE-1))]} \
    --bert_config_file=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_${CASING_DIR_SUFFIX}/bert_config.json \
    --vocab_file=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_${CASING_DIR_SUFFIX}/vocab.txt \
    --do_train=True \
    --do_eval=True \
    --save_checkpoints_steps=5000 \
    --horovod --use_fp16 --use_xla \
    --allreduce_post_accumulation=True \
    --eval_batch_size=8"

srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${BERT_CMD}"
@@ -0,0 +1,122 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Run BioBERT fine-tuning inference (prepare + eval + predict) for one task.
#
# Positional arguments (all optional):
#   $1 task             ner_bc5cdr-chem | ner_bc5cdr-disease | rel_chemprot
#   $2 init_checkpoint  checkpoint passed to --init_checkpoint
#   $3 bert_model       "large" selects the L-24_H-1024_A-16 weights directory,
#                       anything else the L-12_H-768_A-12 one
#   $4 cased            "true" selects the cased weights and --do_lower_case=False
#   $5 precision        "fp16" adds --use_fp16
#   $6 use_xla          "true" adds --use_xla
#   $7 batch_size       used for both --eval_batch_size and --predict_batch_size
#
# Exits with status 1 when the task name is not one of the supported values.
task=${1:-"ner_bc5cdr-chem"}
init_checkpoint=${2:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
bert_model=${3:-"base"}
cased=${4:-"false"}
precision=${5:-"fp16"}
use_xla=${6:-"true"}
batch_size=${7:-"16"}

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
    case_flag="--do_lower_case=False"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
    case_flag="--do_lower_case=True"
fi

if [ "$bert_model" = "large" ] ; then
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
else
    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
fi

use_fp16=""
if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
    use_fp16="--use_fp16"
fi

if [ "$use_xla" = "true" ] ; then
    use_xla_tag="--use_xla"
    echo "XLA activated"
else
    use_xla_tag=""
fi

DATESTAMP=`date +'%y%m%d%H%M%S'`

# NOTE: $use_fp16 and $use_xla_tag are deliberately left unquoted below so
# that an empty value expands to nothing instead of an empty "" argument.
if [ "$task" = "ner_bc5cdr-chem" ] ; then

    printf -v TAG "tf_bert_biobert_ner_bc5cdr_chem_inference_%s_%s" "$bert_model" "$precision"
    DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/chem
    OUTPUT_DIR=/results/${TAG}_${DATESTAMP}

    # python3, like every other branch of this script
    python3 /workspace/bert/run_ner.py \
        --do_prepare=true \
        --do_eval=true \
        --do_predict=true \
        --task_name="bc5cdr" \
        --vocab_file=$BERT_DIR/vocab.txt \
        --bert_config_file=$BERT_DIR/bert_config.json \
        --init_checkpoint=$init_checkpoint \
        --data_dir=$DATASET_DIR \
        --output_dir=$OUTPUT_DIR \
        --eval_batch_size=$batch_size \
        --predict_batch_size=$batch_size \
        --max_seq_length=128 \
        $use_fp16 $use_xla_tag $case_flag

elif [ "$task" = "ner_bc5cdr-disease" ] ; then

    printf -v TAG "tf_bert_biobert_ner_bc5cdr_disease_inference_%s_%s" "$bert_model" "$precision"
    DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
    OUTPUT_DIR=/results/${TAG}_${DATESTAMP}

    python3 /workspace/bert/run_ner.py \
        --do_prepare=true \
        --do_eval=true \
        --do_predict=true \
        --task_name="bc5cdr" \
        --vocab_file=$BERT_DIR/vocab.txt \
        --bert_config_file=$BERT_DIR/bert_config.json \
        --init_checkpoint=$init_checkpoint \
        --data_dir=$DATASET_DIR \
        --output_dir=$OUTPUT_DIR \
        --eval_batch_size=$batch_size \
        --predict_batch_size=$batch_size \
        --max_seq_length=128 \
        $use_fp16 $use_xla_tag $case_flag

elif [ "$task" = "rel_chemprot" ] ; then

    printf -v TAG "tf_bert_biobert_rel_chemprot_inference_%s_%s_" "$bert_model" "$precision"
    DATASET_DIR=/workspace/bert/data/biobert/ChemProt
    OUTPUT_DIR=/results/${TAG}_${DATESTAMP}

    python3 /workspace/bert/run_re.py \
        --do_prepare=true \
        --do_eval=true \
        --do_predict=true \
        --task_name="chemprot" \
        --vocab_file=$BERT_DIR/vocab.txt \
        --bert_config_file=$BERT_DIR/bert_config.json \
        --init_checkpoint=$init_checkpoint \
        --data_dir=$DATASET_DIR \
        --output_dir=$OUTPUT_DIR \
        --eval_batch_size=$batch_size \
        --predict_batch_size=$batch_size \
        --max_seq_length=512 \
        $use_fp16 $use_xla_tag $case_flag

    # Score the relation-extraction predictions against the gold test set
    python3 /workspace/bert/biobert/re_eval.py --task=chemprot --output_path=$OUTPUT_DIR/test_results.tsv \
        --answer_path=$DATASET_DIR/test.tsv |& tee $OUTPUT_DIR/test_results.txt

else

    echo "Benchmarking for " $task "currently not supported. Sorry!"
    # Report the failure to the caller instead of exiting successfully
    exit 1

fi
@@ -0,0 +1,87 @@
#! /bin/bash
# Single-node BioBERT pretraining, phase 1 (max_seq_length=128, 20 masked
# predictions per sequence) on the PubMed baseline TFRecords.
#
# Positional arguments (all optional; default in parentheses):
#   $1  train_batch_size        (128)
#   $2  learning_rate           (9.625e-5)
#   $3  cased                   (false)  "true" selects the cased weights/data
#   $4  precision               (fp16)   "fp16" adds --use_fp16
#   $5  use_xla                 (true)   "true" adds --use_xla
#   $6  num_gpus                (16)     >1 launches through mpirun with --horovod
#   $7  warmup_steps            (1953)
#   $8  train_steps             (19531)
#   $9  num_accumulation_steps  (32)
#   $10 save_checkpoint_steps   (5000)
#   $11 eval_batch_size         (80)
echo "Container nvidia build = " $NVIDIA_BUILD_ID

train_batch_size=${1:-128}
learning_rate=${2:-"9.625e-5"}
cased=${3:-false}
precision=${4:-"fp16"}
use_xla=${5:-"true"}
num_gpus=${6:-16}
warmup_steps=${7:-"1953"}
train_steps=${8:-19531}
num_accumulation_steps=${9:-32}
save_checkpoint_steps=${10:-5000}
eval_batch_size=${11:-80}

use_fp16=""
if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
    use_fp16="--use_fp16"
fi

if [ "$use_xla" = "true" ] ; then
    use_xla_tag="--use_xla"
    echo "XLA activated"
else
    use_xla_tag=""
fi

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
fi

BERT_CONFIG=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12/bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=${RESULTS_DIR}/biobert_phase_1
mkdir -p ${CHECKPOINTS_DIR}

# Phase 1 starts from the downloaded Google pretrained weights
INIT_CHECKPOINT=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12/bert_model.ckpt

INPUT_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training"
EVAL_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test"

# Multi-GPU runs go through mpirun with one process per GPU and enable Horovod.
# (The variable is num_gpus; the previous $num_gpu was always empty.)
if [ "$num_gpus" -gt 1 ] ; then
    mpi_command="mpirun -np $num_gpus -H localhost:$num_gpus \
        --allow-run-as-root -bind-to none -map-by slot \
        -x NCCL_DEBUG=INFO \
        -x LD_LIBRARY_PATH \
        -x PATH -mca pml ob1 -mca btl ^openib"
    use_hvd="--horovod"
else
    mpi_command=""
    use_hvd=""
fi

# Global batch size = per-GPU batch * GPUs * gradient accumulation steps
export GBS=$(( train_batch_size * num_gpus * num_accumulation_steps ))
printf -v TAG "tf_bert_bio_1n_phase1_cased_%s_%s_gbs%d" "$cased" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"

# $mpi_command and the optional flags are intentionally unquoted: the launcher
# must word-split into separate arguments and empty flags must vanish.
$mpi_command python3 /workspace/bert/run_pretraining.py \
    --input_files_dir=$INPUT_FILES_DIR \
    --eval_files_dir=$EVAL_FILES_DIR \
    --output_dir=$CHECKPOINTS_DIR \
    --bert_config_file=$BERT_CONFIG \
    --do_train=True \
    --do_eval=True \
    --train_batch_size=$train_batch_size \
    --eval_batch_size=$eval_batch_size \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --num_train_steps=$train_steps \
    --num_warmup_steps=$warmup_steps \
    --save_checkpoints_steps=$save_checkpoint_steps \
    --num_accumulation_steps=$num_accumulation_steps \
    --learning_rate=$learning_rate \
    --report_loss \
    $use_hvd $use_fp16 $use_xla_tag \
    --init_checkpoint=$INIT_CHECKPOINT |& tee $LOGFILE
@@ -0,0 +1,85 @@
#! /bin/bash
# Single-node BioBERT pretraining, phase 2 (max_seq_length=512, 80 masked
# predictions per sequence) on the PubMed baseline TFRecords.
#
# Positional arguments (default in parentheses):
#   $1  init_checkpoint         (none)   checkpoint to start from
#   $2  train_batch_size        (16)
#   $3  learning_rate           (2.9e-4)
#   $4  cased                   (false)  "true" selects the cased weights/data
#   $5  precision               (fp16)   "fp16" adds --use_fp16
#   $6  use_xla                 (true)   "true" adds --use_xla
#   $7  num_gpus                (16)     >1 launches through mpirun with --horovod
#   $8  warmup_steps            (434)
#   $9  train_steps             (4340)
#   $10 num_accumulation_steps  (128)
#   $11 save_checkpoint_steps   (5000)
#   $12 eval_batch_size         (26)
echo "Container nvidia build = " $NVIDIA_BUILD_ID

init_checkpoint=${1}
train_batch_size=${2:-16}
learning_rate=${3:-"2.9e-4"}
cased=${4:-false}
precision=${5:-"fp16"}
use_xla=${6:-true}
num_gpus=${7:-16}
warmup_steps=${8:-"434"}
train_steps=${9:-4340}
num_accumulation_steps=${10:-128}
save_checkpoint_steps=${11:-5000}
eval_batch_size=${12:-26}

if [ -z "$init_checkpoint" ] ; then
    # Still runs (as before), but make the empty --init_checkpoint visible
    echo "Warning: no init_checkpoint given as first argument; --init_checkpoint will be empty" >&2
fi

use_fp16=""
if [ "$precision" = "fp16" ] ; then
    echo "fp16 activated!"
    use_fp16="--use_fp16"
fi

if [ "$use_xla" = "true" ] ; then
    use_xla_tag="--use_xla"
    echo "XLA activated"
else
    use_xla_tag=""
fi

if [ "$cased" = "true" ] ; then
    DO_LOWER_CASE=0
    CASING_DIR_PREFIX="cased"
else
    DO_LOWER_CASE=1
    CASING_DIR_PREFIX="uncased"
fi

BERT_CONFIG=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12/bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=${RESULTS_DIR}/biobert_phase_2
mkdir -p ${CHECKPOINTS_DIR}

INPUT_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training"
EVAL_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test"

# Multi-GPU runs go through mpirun with one process per GPU and enable Horovod.
# (The variable is num_gpus; the previous $num_gpu was always empty.)
if [ "$num_gpus" -gt 1 ] ; then
    mpi_command="mpirun -np $num_gpus -H localhost:$num_gpus \
        --allow-run-as-root -bind-to none -map-by slot \
        -x NCCL_DEBUG=INFO \
        -x LD_LIBRARY_PATH \
        -x PATH -mca pml ob1 -mca btl ^openib"
    use_hvd="--horovod"
else
    mpi_command=""
    use_hvd=""
fi

# Global batch size = per-GPU batch * GPUs * gradient accumulation steps
export GBS=$(( train_batch_size * num_gpus * num_accumulation_steps ))
printf -v TAG "tf_bert_bio_1n_phase2_cased_%s_%s_gbs%d" "$cased" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"

# $mpi_command and the optional flags are intentionally unquoted: the launcher
# must word-split into separate arguments and empty flags must vanish.
$mpi_command python3 /workspace/bert/run_pretraining.py \
    --input_files_dir=$INPUT_FILES_DIR \
    --eval_files_dir=$EVAL_FILES_DIR \
    --output_dir=$CHECKPOINTS_DIR \
    --bert_config_file=$BERT_CONFIG \
    --do_train=True \
    --do_eval=True \
    --train_batch_size=$train_batch_size \
    --eval_batch_size=$eval_batch_size \
    --max_seq_length=512 \
    --max_predictions_per_seq=80 \
    --num_train_steps=$train_steps \
    --num_warmup_steps=$warmup_steps \
    --save_checkpoints_steps=$save_checkpoint_steps \
    --num_accumulation_steps=$num_accumulation_steps \
    --learning_rate=$learning_rate \
    --report_loss \
    $use_hvd $use_xla_tag $use_fp16 \
    --init_checkpoint=$init_checkpoint |& tee $LOGFILE
@@ -0,0 +1,206 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 1 DGX1 node, phase 1
bert--DGX1:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "1"
BATCHSIZE: "8"
LEARNING_RATE: "7.5e-4"
NUM_ACCUMULATION_STEPS: "1024"
PHASE: "1"
# 4 DGX1 nodes, phase 1
bert--DGX1_n4:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "4"
BATCHSIZE: "8"
LEARNING_RATE: "1.875e-4"
NUM_ACCUMULATION_STEPS: "256"
PHASE: "1"
# 16 DGX1 nodes, phase 1
bert--DGX1_n16:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "16"
BATCHSIZE: "8"
LEARNING_RATE: "4.6875e-5"
NUM_ACCUMULATION_STEPS: "64"
PHASE: "1"
# 32 DGX1 nodes, phase 1
bert--DGX1_n32:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "32"
BATCHSIZE: "8"
LEARNING_RATE: "2.34375e-5"
NUM_ACCUMULATION_STEPS: "32"
PHASE: "1"
# 1 DGX2 node, phase 1
bert--DGX2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "1"
BATCHSIZE: "32"
LEARNING_RATE: "3.75e-4"
NUM_ACCUMULATION_STEPS: "128"
PHASE: "1"
# 4 DGX2 nodes, phase 1
bert--DGX2_n4:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "4"
BATCHSIZE: "32"
LEARNING_RATE: "9.375e-5"
NUM_ACCUMULATION_STEPS: "32"
PHASE: "1"
# 16 DGX2 nodes, phase 1
bert--DGX2_n16:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "16"
BATCHSIZE: "256"
LEARNING_RATE: "3.75e-4"
NUM_ACCUMULATION_STEPS: "4"
PHASE: "1"
# 32 DGX2 nodes, phase 1
bert--DGX2_n32:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "32"
BATCHSIZE: "32"
LEARNING_RATE: "2.34375e-5"
NUM_ACCUMULATION_STEPS: "8"
PHASE: "1"
# 1 DGX1 node, phase 2
bert--DGX1_n1p2:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "1"
BATCHSIZE: "2"
LEARNING_RATE: "5e-4"
NUM_ACCUMULATION_STEPS: "4096"
PHASE: "2"
# 4 DGX1 nodes, phase 2
bert--DGX1_n4p2:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "4"
BATCHSIZE: "2"
LEARNING_RATE: "1.25e-4"
NUM_ACCUMULATION_STEPS: "512"
PHASE: "2"
# 16 DGX1 nodes, phase 2
bert--DGX1_n16p2:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "16"
BATCHSIZE: "2"
LEARNING_RATE: "1.5625e-5"
NUM_ACCUMULATION_STEPS: "128"
PHASE: "2"
# 32 DGX1 nodes, phase 2
bert--DGX1_n32p2:
<<: *BERT_ON_CLUSTER
<<: *DGX1
variables:
<<: *DGX1_VARS
NNODES: "32"
BATCHSIZE: "2"
LEARNING_RATE: "1.5625e-5"
NUM_ACCUMULATION_STEPS: "64"
PHASE: "2"
# 1 DGX2 node, phase 2
bert--DGX2_n1p2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "1"
BATCHSIZE: "8"
LEARNING_RATE: "2.5e-5"
NUM_ACCUMULATION_STEPS: "256"
PHASE: "2"
# 4 DGX2 nodes, phase 2
bert--DGX2_n4p2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "4"
BATCHSIZE: "8"
LEARNING_RATE: "6.25e-5"
NUM_ACCUMULATION_STEPS: "64"
PHASE: "2"
# 16 DGX2 nodes, phase 2
bert--DGX2_n16p2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "16"
BATCHSIZE: "8"
LEARNING_RATE: "1.5625e-5"
NUM_ACCUMULATION_STEPS: "16"
PHASE: "2"
# 32 DGX2 nodes, phase 2
bert--DGX2_n32p2:
<<: *BERT_ON_CLUSTER
<<: *DGX2
variables:
<<: *DGX2_VARS
NNODES: "32"
BATCHSIZE: "8"
LEARNING_RATE: "7.8125e-6"
NUM_ACCUMULATION_STEPS: "8"
PHASE: "2"
@@ -0,0 +1,26 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
class BooksDownloader:
    """Download BooksCorpus by invoking the external ``download_files.py`` script."""

    def __init__(self, save_path):
        """Store the base download directory.

        Args:
            save_path: Base directory; ``<save_path>/bookscorpus`` is passed to
                the download script as its ``--out`` argument.
        """
        self.save_path = save_path

    def download(self):
        """Run the BooksCorpus download script and wait for it to finish.

        Raises:
            subprocess.CalledProcessError: If the script exits with a non-zero
                status.
        """
        # The arguments are passed as a list (no shell) so that a save_path
        # containing spaces or shell metacharacters reaches the script verbatim.
        command = [
            'python3', '/workspace/bookcorpus/download_files.py',
            '--list', '/workspace/bookcorpus/url_list.jsonl',
            '--out', self.save_path + '/bookscorpus',
            '--trash-bad-count',
        ]
        subprocess.run(command, check=True)
@@ -0,0 +1,32 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
class BookscorpusTextFormatting:
    """Merge a directory of BooksCorpus ``.txt`` files into a single text file."""

    def __init__(self, books_path, output_filename, recursive = False):
        """Store the merge configuration.

        Args:
            books_path: Directory containing the ``.txt`` book files.
            output_filename: Path of the merged file to write.
            recursive: When true, also pick up ``.txt`` files in
                subdirectories of ``books_path``.
        """
        self.books_path = books_path
        self.recursive = recursive
        self.output_filename = output_filename

    # This puts one book per line
    def merge(self):
        """Write every book as one line, followed by a blank line.

        Blank input lines are dropped; the remaining lines are stripped and
        joined with single spaces. Books are processed in sorted path order so
        the output is reproducible.
        """
        # "**" only has an effect together with recursive=True, so the
        # pattern and the flag are switched together.
        pattern = '**/*.txt' if self.recursive else '*.txt'
        filenames = sorted(glob.glob(self.books_path + '/' + pattern,
                                     recursive=self.recursive))
        output_path = os.path.abspath(self.output_filename)

        # Explicit UTF-8 so the result does not depend on the platform locale
        with open(self.output_filename, mode='w', encoding='utf-8', newline='\n') as ofile:
            for filename in filenames:
                # Never feed the (just truncated) output file back into itself
                if os.path.abspath(filename) == output_path:
                    continue
                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
                    for line in file:
                        text = line.strip()
                        if text:
                            ofile.write(text + ' ')
                ofile.write("\n\n")
@@ -0,0 +1,120 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
from WikiDownloader import WikiDownloader
from BooksDownloader import BooksDownloader
from GLUEDownloader import GLUEDownloader
from SquadDownloader import SquadDownloader
from PubMedDownloader import PubMedDownloader
class Downloader:
    """Dispatch a dataset name to the downloader class that fetches it."""

    def __init__(self, dataset_name, save_path):
        """Store which dataset to fetch and where to put it.

        Args:
            dataset_name: One of the names handled by ``download`` (for
                example ``'bookscorpus'``, ``'wikicorpus_en'``, ``'squad'``),
                or ``'all'``.
            save_path: Base directory handed to every concrete downloader.
        """
        self.dataset_name = dataset_name
        self.save_path = save_path

    def _steps_for(self, dataset_name):
        """Return the ordered ``(method, args)`` steps for a name, or None if unknown."""
        single = {
            'bookscorpus': (self.download_bookscorpus, ()),
            'wikicorpus_en': (self.download_wikicorpus, ('en',)),
            'wikicorpus_zh': (self.download_wikicorpus, ('zh',)),
            'pubmed_baseline': (self.download_pubmed, ('baseline',)),
            'pubmed_daily_update': (self.download_pubmed, ('daily_update',)),
            'pubmed_fulltext': (self.download_pubmed, ('fulltext',)),
            'pubmed_open_access': (self.download_pubmed, ('open_access',)),
            'google_pretrained_weights': (self.download_google_pretrained_weights, ()),
            'nvidia_pretrained_weights': (self.download_nvidia_pretrained_weights, ()),
            'CoLA': (self.download_glue, ('CoLA',)),
            'MNLI': (self.download_glue, ('MNLI',)),
            'MRPC': (self.download_glue, ('MRPC',)),
            'squad': (self.download_squad, ()),
        }
        if dataset_name == 'all':
            # Dict insertion order above is the order used for 'all'
            return list(single.values())
        if dataset_name in single:
            return [single[dataset_name]]
        return None

    def download(self):
        """Download the dataset(s) selected by ``self.dataset_name``.

        Raises:
            AssertionError: If ``dataset_name`` is not recognised. Raised
                explicitly (not via ``assert``) so the check survives
                ``python -O``.
        """
        steps = self._steps_for(self.dataset_name)
        if steps is None:
            print(self.dataset_name)
            raise AssertionError('Unknown dataset_name provided to downloader')
        for method, args in steps:
            method(*args)

    def download_bookscorpus(self):
        """Download BooksCorpus."""
        downloader = BooksDownloader(self.save_path)
        downloader.download()

    def download_wikicorpus(self, language):
        """Download the Wikipedia corpus for ``language`` ('en' or 'zh' are used here)."""
        downloader = WikiDownloader(language, self.save_path)
        downloader.download()

    def download_pubmed(self, subset):
        """Download the given PubMed ``subset``."""
        downloader = PubMedDownloader(subset, self.save_path)
        downloader.download()

    def download_google_pretrained_weights(self):
        """Download Google's pretrained BERT checkpoints."""
        downloader = GooglePretrainedWeightDownloader(self.save_path)
        downloader.download()

    def download_nvidia_pretrained_weights(self):
        """Download NVIDIA's pretrained weights."""
        downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
        downloader.download()

    def download_glue(self, glue_task_name):
        """Download the GLUE task named ``glue_task_name``."""
        downloader = GLUEDownloader(glue_task_name, self.save_path)
        downloader.download()

    def download_squad(self):
        """Download SQuAD."""
        downloader = SquadDownloader(self.save_path)
        downloader.download()
@@ -0,0 +1,109 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
import urllib
import sys
import zipfile
import io
# URLLIB is the module that provides urlretrieve(): urllib.request on
# Python 3, plain urllib on Python 2. The submodule is imported explicitly
# because "import urllib" alone does not guarantee urllib.request is loaded.
if sys.version_info >= (3, 0):
    import urllib.request
    URLLIB = urllib.request
else:
    URLLIB = urllib
class GLUEDownloader:
    """Download one GLUE task into ``save_path``.

    Zip-based tasks are downloaded and extracted. MRPC is assembled from the
    raw MSR paraphrase files plus a list of dev ids. The diagnostic set is a
    single TSV file.
    """

    def __init__(self, task, save_path):
        """Record the task and make sure ``save_path`` exists.

        Args:
            task: Key of ``TASK2PATH`` naming the GLUE task to fetch.
            save_path: Directory to download into; created if missing.
        """
        # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
        self.TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
                          "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
                          "MRPC":{"mrpc_dev": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
                                  "mrpc_train": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt',
                                  "mrpc_test": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'},
                          "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
                          "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
                          "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
                          "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
                          "QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
                          "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
                          "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
                          "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}

        self.save_path = save_path
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.task = task

    def download(self):
        """Download ``self.task`` using the strategy that task requires."""
        if self.task == 'MRPC':
            self.download_mrpc()
        elif self.task == 'diagnostic':
            self.download_diagnostic()
        else:
            self.download_and_extract(self.task)

    def download_and_extract(self, task):
        """Download the zip archive for ``task`` and extract it into ``save_path``.

        The archive is stored temporarily as ``<task>.zip`` in the current
        working directory and removed after extraction.

        Raises:
            KeyError: If ``task`` is not a key of ``TASK2PATH``.
        """
        print("Downloading and extracting %s..." % task)
        data_file = "%s.zip" % task
        URLLIB.urlretrieve(self.TASK2PATH[task], data_file)
        print(data_file,"\n\n\n")
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(self.save_path)
        os.remove(data_file)
        print("\tCompleted!")

    def download_diagnostic(self):
        """Download the diagnostic TSV to ``<save_path>/diagnostic/diagnostic.tsv``.

        ``download`` dispatches here for the 'diagnostic' task; without this
        method that path failed with AttributeError.
        """
        # NOTE(review): directory and file name follow the GLUE reference
        # download script linked in __init__ - confirm against consumers.
        print("Downloading and extracting diagnostic...")
        diagnostic_dir = os.path.join(self.save_path, "diagnostic")
        if not os.path.isdir(diagnostic_dir):
            os.mkdir(diagnostic_dir)
        data_file = os.path.join(diagnostic_dir, "diagnostic.tsv")
        URLLIB.urlretrieve(self.TASK2PATH["diagnostic"], data_file)
        print("\tCompleted!")

    def download_mrpc(self):
        """Download the raw MRPC files and write train/dev/test TSV splits."""
        print("Processing MRPC...")
        mrpc_dir = os.path.join(self.save_path, "MRPC")
        if not os.path.isdir(mrpc_dir):
            os.mkdir(mrpc_dir)

        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")

        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_train"], mrpc_train_file)
        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_test"], mrpc_test_file)
        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_dev"], mrpc_dev_file)

        self._write_mrpc_splits(mrpc_dir)
        print("\tCompleted!")

    def _write_mrpc_splits(self, mrpc_dir):
        """Build train.tsv, dev.tsv and test.tsv from the raw files in ``mrpc_dir``.

        Expects ``msr_paraphrase_train.txt``, ``msr_paraphrase_test.txt`` and
        ``dev_ids.tsv`` to already be present in ``mrpc_dir``.
        """
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")

        # A set of id pairs gives O(1) membership tests per training row
        with io.open(mrpc_dev_file, encoding='utf-8') as ids_fh:
            dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

        with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
                io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
                io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
            header = data_fh.readline()
            train_fh.write(header)
            dev_fh.write(header)
            for row in data_fh:
                label, id1, id2, s1, s2 = row.strip().split('\t')
                # Rows whose id pair is listed in dev_ids.tsv go to dev
                target_fh = dev_fh if (id1, id2) in dev_ids else train_fh
                target_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

        with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
                io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
            data_fh.readline()  # skip the original header
            # The test split replaces the label column with a running index
            test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
            for idx, row in enumerate(data_fh):
                _label, id1, id2, s1, s2 = row.strip().split('\t')
                test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
@@ -0,0 +1,158 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os
import urllib.request
import zipfile
class GooglePretrainedWeightDownloader:
    """Download Google's pretrained BERT archives and verify their contents.

    Every archive listed in ``model_urls`` is downloaded into
    ``<save_path>/google_pretrained_weights``, unzipped there, and each
    extracted file is checked against the SHA-256 digests in ``model_sha``.
    """

    def __init__(self, save_path):
        """Create the target directory and set up the URL and checksum tables.

        Args:
            save_path: Base directory; the weights go into its
                ``google_pretrained_weights`` subdirectory (created if missing).
        """
        self.save_path = save_path + '/google_pretrained_weights'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        # Download urls: model key -> (archive url, archive file name)
        self.model_urls = {
            'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
            'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
            'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
            'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
            'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
            'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
            'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
        }

        # SHA256sum verification for file download integrity (and checking for changes from the download source over time)
        self.bert_base_uncased_sha = {
            'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
            'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
            'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
            'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
        }

        self.bert_large_uncased_sha = {
            'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
            'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
            'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
            'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
        }

        self.bert_base_cased_sha = {
            'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
            'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
            'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
            'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
        }

        self.bert_large_cased_sha = {
            'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
            'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
            'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
            'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
        }

        self.bert_base_multilingual_cased_sha = {
            'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
            'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
            'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
            'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
            'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
        }

        self.bert_large_multilingual_uncased_sha = {
            'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
            'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
            'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
            'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
            'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
        }

        self.bert_base_chinese_sha = {
            'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
            'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
            'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
            'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
            'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
        }

        # Relate SHA to urls for loop below
        self.model_sha = {
            'bert_base_uncased': self.bert_base_uncased_sha,
            'bert_large_uncased': self.bert_large_uncased_sha,
            'bert_base_cased': self.bert_base_cased_sha,
            'bert_large_cased': self.bert_large_cased_sha,
            'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha,
            'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha,
            'bert_base_chinese': self.bert_base_chinese_sha
        }

    # Helper to get sha256sum of a file
    def sha256sum(self, filename):
        """Return the hex SHA-256 digest of ``filename``.

        The file is read in 128 KiB chunks into a reusable buffer, so memory
        use does not grow with the file size.
        """
        h = hashlib.sha256()
        b = bytearray(128*1024)
        mv = memoryview(b)
        with open(filename, 'rb', buffering=0) as f:
            # readinto() returns 0 at end of file, which stops the iterator
            for n in iter(lambda : f.readinto(mv), 0):
                h.update(mv[:n])

        return h.hexdigest()

    def download(self):
        """Download, unzip and checksum every archive in ``model_urls``.

        Mismatching checksums are reported on stdout; they do not raise.
        """
        # Local import: shutil is not among this module's top-level imports
        import shutil

        # Iterate over urls: download, unzip, verify sha256sum
        found_mismatch_sha = False
        for model, (url, archive_name) in self.model_urls.items():
            archive_path = self.save_path + '/' + archive_name

            print('Downloading', url)
            # Stream to disk instead of holding the whole archive in memory,
            # and make sure the connection is closed afterwards.
            with urllib.request.urlopen(url) as response, \
                    open(archive_path, 'wb') as handle:
                shutil.copyfileobj(response, handle)

            print('Unzipping', archive_path)
            with zipfile.ZipFile(archive_path, 'r') as archive:
                archive.extractall(self.save_path)

            # Each archive unpacks into a directory named after the zip file
            extracted_dir = archive_path[:-4]
            for extracted_file, sha in self.model_sha[model].items():
                extracted_path = extracted_dir + '/' + extracted_file
                if sha != self.sha256sum(extracted_path):
                    found_mismatch_sha = True
                    print('SHA256sum does not match on file:', extracted_file, 'from download url:', url)
                else:
                    print(extracted_path, '\t', 'verified')

        if not found_mismatch_sha:
            print("All downloads pass sha256sum verification.")

    def serialize(self):
        """Placeholder; currently does nothing."""
        pass

    def deserialize(self):
        """Placeholder; currently does nothing."""
        pass

    def listAvailableWeights(self):
        """Print the key of every downloadable model."""
        print("Available Weight Datasets")
        for item in self.model_urls:
            print(item)

    def listLocallyStoredWeights(self):
        """Placeholder; currently does nothing."""
        pass
@@ -0,0 +1,27 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class NVIDIAPretrainedWeightDownloader:
    """Placeholder downloader for NVIDIA pretrained weights (not implemented)."""

    def __init__(self, save_path):
        """Create ``<save_path>/nvidia_pretrained_weights`` if it does not exist.

        Args:
            save_path: Base download directory.
        """
        self.save_path = save_path + '/nvidia_pretrained_weights'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def download(self):
        """Always fail: downloading NVIDIA weights is not implemented.

        Raises:
            AssertionError: Always. Raised explicitly rather than through an
                ``assert`` statement, which ``python -O`` would strip and turn
                into a silent no-op.
        """
        raise AssertionError('NVIDIAPretrainedWeightDownloader not implemented yet.')
@@ -0,0 +1,93 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import glob
import gzip
import os
import urllib.request
import shutil
import sys
class PubMedDownloader:
    """Download one subset of the NCBI PubMed/PMC FTP archives and gunzip it."""

    def __init__(self, subset, save_path):
        """Prepare the target directory and the table of FTP base URLs.

        Args:
            subset: Key into ``self.download_urls`` ('baseline',
                'daily_update', 'fulltext' or 'open_access').
            save_path: Parent directory; files go to ``<save_path>/pubmed//<subset>``.
        """
        self.subset = subset

        # The path string (including the doubled '/') is built exactly as
        # before so that the value of self.save_path does not change.
        self.save_path = save_path + '/pubmed' + '/' + '/' + subset

        # One call creates both the 'pubmed' level and the subset level.
        os.makedirs(self.save_path, exist_ok=True)

        self.download_urls = {
            'baseline' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/',
            'daily_update' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/',
            'fulltext' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/',
            'open_access' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/'
        }

    def download(self):
        """Download every matching file of the configured subset, then gunzip them."""
        print('subset:', self.subset)
        url = self.download_urls[self.subset]
        self.download_files(url)
        self.extract_files()

    def download_files(self, url):
        """Fetch the directory listing at ``url`` and download the files of this subset.

        The listing is obtained by running ``curl`` on ``url``; the last
        space-separated token of each listing line is taken as a file name.
        Files that already exist in ``self.save_path`` are not downloaded again.

        Args:
            url: Base URL of the remote directory.  It is now honoured as
                passed (it used to be silently replaced by the subset's
                default URL, which is what ``download`` passes anyway).

        Raises:
            AssertionError: if ``self.subset`` is not a known subset.
        """
        if self.subset in ('fulltext', 'open_access'):
            # 'fulltext' selects the comm_use archives, 'open_access' the
            # non_comm_use ones; the prefix is the part before the first '.'.
            prefix = 'comm_use' if self.subset == 'fulltext' else 'non_comm_use'

            def wanted(line):
                return line.endswith('xml.tar.gz') and \
                    line.split(' ')[-1].split('.')[0] == prefix
        elif self.subset in ('baseline', 'daily_update'):
            def wanted(line):
                return line.endswith('.gz')
        else:
            # Explicit raise so the check survives `python -O`.
            raise AssertionError('Invalid PubMed dataset/subset specified.')

        # `url` comes from the hard-coded table above, not from user input.
        with os.popen('curl ' + url) as pipe:
            listing = pipe.read()

        for line in listing.splitlines():
            if wanted(line):
                self._download_if_missing(url, line.split(' ')[-1])

    def _download_if_missing(self, base_url, name):
        """Stream ``base_url + name`` into ``self.save_path`` unless the file already exists."""
        target = os.path.join(self.save_path, name)
        if os.path.isfile(target):
            return

        print('Downloading', target)
        # Write to a temporary name and rename at the end, so an interrupted
        # transfer is never mistaken for a complete file on the next run.
        partial = target + '.part'
        with urllib.request.urlopen(base_url + name) as response, open(partial, "wb") as handle:
            # Stream in chunks instead of holding the whole file in memory.
            shutil.copyfileobj(response, handle)
        os.replace(partial, target)

    def extract_files(self):
        """Gunzip every ``*.xml.gz`` in ``self.save_path`` next to the original file."""
        for file in glob.glob(self.save_path + '/*.xml.gz'):
            print('file:', file)
            # file[:-3] drops the trailing '.gz'; the compressed file is kept.
            with gzip.GzipFile(file, mode='rb') as compressed, open(file[:-3], mode='wb') as out:
                shutil.copyfileobj(compressed, out)
@@ -0,0 +1,44 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import pubmed_parser as pmp
class PubMedTextFormatting:
    """Merge the abstracts of PubMed XML files into one text file, one abstract per line."""

    def __init__(self, pubmed_path, output_filename, recursive = False):
        """Store the input directory, the output file name and the glob ``recursive`` flag."""
        self.pubmed_path = pubmed_path
        self.recursive = recursive
        self.output_filename = output_filename

    # This puts one article per line
    def merge(self):
        """Write every non-empty abstract found under ``self.pubmed_path`` to the output file.

        Each file matching ``<pubmed_path>/*.xml*`` is parsed with
        ``pubmed_parser.parse_medline_xml``.  Lines of an abstract shorter
        than 30 characters are dropped; the remaining lines are stripped and
        joined with single spaces, and each abstract is terminated by a
        blank line.
        """
        print('PubMed path:', self.pubmed_path)
        with open(self.output_filename, mode='w', newline='\n') as ofile:
            # NOTE(review): '*.xml*' matches both 'x.xml' and 'x.xml.gz'; if both
            # are present in the directory each abstract may be ingested twice --
            # confirm whether that is intended.
            for filename in glob.glob(self.pubmed_path + '/*.xml*', recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)
                for dict_out in dicts_out:
                    if not dict_out['abstract']:
                        continue
                    try:
                        for line in dict_out['abstract'].splitlines():
                            if len(line) < 30:
                                continue
                            ofile.write(line.strip() + " ")
                        ofile.write("\n\n")
                    except Exception as err:
                        # Best effort: report the bad record, close it off with a
                        # blank line and carry on.  `Exception` (not a bare except)
                        # lets KeyboardInterrupt / SystemExit propagate.
                        print('Warning: skipping abstract in', filename, '-', repr(err))
                        ofile.write("\n\n")
                        continue
@@ -0,0 +1,32 @@
Steps to reproduce datasets from web
1) Build the container
* docker build -t bert_tf .
2) Run the container interactively
* nvidia-docker run -it --ipc=host bert_tf
* Optional: Mount data volumes
* -v yourpath:/workspace/bert/data/wikipedia_corpus/download
* -v yourpath:/workspace/bert/data/wikipedia_corpus/extracted_articles
* -v yourpath:/workspace/bert/data/wikipedia_corpus/raw_data
* -v yourpath:/workspace/bert/data/wikipedia_corpus/intermediate_files
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_file_single
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_files_sharded
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
* -v yourpath:/workspace/bert/data/bookcorpus/download
* -v yourpath:/workspace/bert/data/bookcorpus/final_text_file_single
* -v yourpath:/workspace/bert/data/bookcorpus/final_text_files_sharded
* -v yourpath:/workspace/bert/data/bookcorpus/final_tfrecords_sharded
* Optional: Select visible GPUs
* -e CUDA_VISIBLE_DEVICES=0
**Inside of the container starting here**
3) Download pretrained weights (they contain vocab files for preprocessing)
* cd data/pretrained_models_google && python3 download_models.py
4) "One-click" SQuAD download
* cd /workspace/bert/data/squad && . squad_download.sh
5) "One-click" Wikipedia data download and prep (provides tfrecords)
* Set your configuration in data/wikipedia_corpus/config.sh
* cd /workspace/bert/data/wikipedia_corpus && ./run_preprocessing.sh
6) "One-click" BookCorpus data download and prep (provides tfrecords)
* Set your configuration in data/wikipedia_corpus/config.sh
* cd /workspace/bert/data/bookcorpus && ./run_preprocessing.sh
@@ -0,0 +1,54 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
import urllib.request
import sys
class SquadDownloader:
    """Download the SQuAD v1.1 and v2.0 datasets and their evaluation scripts."""

    def __init__(self, save_path):
        """Create ``<save_path>/squad`` with ``v1.1`` and ``v2.0`` subdirectories.

        Args:
            save_path: Parent directory under which the ``squad`` tree is created.
        """
        self.save_path = save_path + '/squad'

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        for subdir in ('', '/v1.1', '/v2.0'):
            os.makedirs(self.save_path + subdir, exist_ok=True)

        # key: source URL, value: destination path relative to self.save_path
        self.download_urls = {
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
            'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
            'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
        }

    def download(self):
        """Fetch every entry of ``self.download_urls`` that is not already on disk."""
        for url, file in self.download_urls.items():
            target = self.save_path + '/' + file

            print('Downloading:', url)
            if os.path.isfile(target):
                print('** Download file already exists, skipping download')
                continue

            # Download under a temporary name and rename when complete, so an
            # interrupted transfer is not skipped as "already exists" next time.
            partial = target + '.part'
            with urllib.request.urlopen(url) as response, open(partial, "wb") as handle:
                handle.write(response.read())
            os.replace(partial, target)
@@ -0,0 +1,331 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from itertools import islice
import multiprocessing
import os
import statistics
class Sharding:
    """Split one-article-per-line text files into training and test shards.

    Intended call order: ``load_articles``, ``segment_articles_into_sentences``,
    ``distribute_articles_over_shards``, ``write_shards_to_disk``.
    """

    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
        """Record the configuration and create the (empty) shard tables.

        Args:
            input_files: Non-empty list of paths to one-article-per-line text files.
            output_name_prefix: Path prefix of every shard file name.
            n_training_shards: Number of training shards (> 0).
            n_test_shards: Number of test shards (> 0).
            fraction_test_set: Fraction of all sentences targeted at the test shards.
        """
        assert len(input_files) > 0, 'The input file list must contain at least one file.'
        assert n_training_shards > 0, 'There must be at least one output shard.'
        assert n_test_shards > 0, 'There must be at least one output shard.'

        self.n_training_shards = n_training_shards
        self.n_test_shards = n_test_shards
        self.fraction_test_set = fraction_test_set

        self.input_files = input_files

        self.output_name_prefix = output_name_prefix
        self.output_training_identifier = '_training'
        self.output_test_identifier = '_test'
        self.output_file_extension = '.txt'

        self.articles = {}    # key: integer identifier, value: article text (one line)
        self.sentences = {}    # key: integer identifier, value: list of sentences
        self.output_training_files = {}    # key: filename, value: list of article ids to go into file
        self.output_test_files = {}  # key: filename, value: list of article ids to go into file

        self.init_output_files()

    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
    def load_articles(self):
        """Read every non-blank line of every input file into ``self.articles``.

        Ids are assigned sequentially from 0 across all input files.
        """
        print('Start: Loading Articles')

        global_article_count = 0
        for input_file in self.input_files:
            print('input file:', input_file)
            with open(input_file, mode='r', newline='\n') as f:
                for line in f:
                    if line.strip():
                        self.articles[global_article_count] = line.rstrip()
                        global_article_count += 1

        print('End: Loading Articles: There are', len(self.articles), 'articles.')

    def segment_articles_into_sentences(self, segmenter):
        """Fill ``self.sentences`` by running ``segmenter.segment_string`` on every article.

        Articles are loaded first if ``load_articles`` has not been called yet.
        Segmentation runs serially in the calling process.

        Args:
            segmenter: Object with a ``segment_string(article)`` method whose
                result supports ``len`` and iteration (e.g. a list of sentences).
        """
        print('Start: Sentence Segmentation')
        # `==` / `!=` instead of `is` / `is not`: identity comparison against an
        # int literal only works by CPython implementation accident.
        if len(self.articles) == 0:
            self.load_articles()

        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'

        # TODO: WIP: multiprocessing (create independent ranges and spawn processes).
        # The previous unreachable 'manager' / 'queue' experiments were removed;
        # only the serial path was ever executed.
        for i, article in enumerate(self.articles):
            self.sentences[i] = segmenter.segment_string(self.articles[article])

            if i % 5000 == 0:
                print('Segmenting article', i)

        print('End: Sentence Segmentation')

    def init_output_files(self):
        """Create one empty article-id list per training and per test shard file name."""
        print('Start: Init Output Files')
        assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
        assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'

        for i in range(self.n_training_shards):
            name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
            self.output_training_files[name] = []

        for i in range(self.n_test_shards):
            name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
            self.output_test_files[name] = []

        print('End: Init Output Files')

    def get_sentences_per_shard(self, shard):
        """Return the total number of sentences of the article ids listed in ``shard``."""
        return sum(len(self.sentences[article_id]) for article_id in shard)

    def _sentence_totals(self, shards):
        """Return the per-shard sentence totals of ``shards`` in its iteration order."""
        return [self.get_sentences_per_shard(shards[name]) for name in shards]

    def distribute_articles_over_shards(self):
        """Assign every article id to exactly one training or test shard.

        Whole articles are never split.  A first pass seeds every shard with
        one of the largest remaining articles; later passes greedily add the
        largest article that still fits under the nominal shard size, which
        is raised step by step when no progress is being made.
        """
        print('Start: Distribute Articles Over Shards')
        assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'

        # Create dictionary with - key: sentence count per article, value: article id number
        sentence_counts = defaultdict(list)

        max_sentences = 0
        total_sentences = 0

        for article_id in self.sentences:
            current_length = len(self.sentences[article_id])
            sentence_counts[current_length].append(article_id)
            max_sentences = max(max_sentences, current_length)
            total_sentences += current_length

        n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
        nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
        nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards

        consumed_article_set = set()
        unused_article_set = set(self.articles.keys())

        # Make first pass and add one article worth of lines per file
        for file in self.output_training_files:
            current_article_id = sentence_counts[max_sentences][-1]
            sentence_counts[max_sentences].pop(-1)
            self.output_training_files[file].append(current_article_id)
            consumed_article_set.add(current_article_id)
            unused_article_set.remove(current_article_id)

            # Maintain the max sentence count
            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                max_sentences -= 1

            if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
                nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
                print('Warning: A single article contains more than the nominal number of sentences per training shard.')

        for file in self.output_test_files:
            current_article_id = sentence_counts[max_sentences][-1]
            sentence_counts[max_sentences].pop(-1)
            self.output_test_files[file].append(current_article_id)
            consumed_article_set.add(current_article_id)
            unused_article_set.remove(current_article_id)

            # Maintain the max sentence count
            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                max_sentences -= 1

            if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
                nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
                print('Warning: A single article contains more than the nominal number of sentences per test shard.')

        training_counts = self._sentence_totals(self.output_training_files)
        test_counts = self._sentence_totals(self.output_test_files)

        training_median = statistics.median(training_counts)
        test_median = statistics.median(test_counts)

        # Make subsequent passes over files to find articles to add without going over limit
        history_remaining = []
        n_history_remaining = 4

        while len(consumed_article_set) < len(self.articles):
            for fidx, file in enumerate(self.output_training_files):
                nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)

                # Maintain the max sentence count
                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                    max_sentences -= 1

                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
                    nominal_next_article_size -= 1

                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median:
                    continue    # skip adding to this file, will come back later if no file can accept unused articles

                current_article_id = sentence_counts[nominal_next_article_size][-1]
                sentence_counts[nominal_next_article_size].pop(-1)

                self.output_training_files[file].append(current_article_id)
                consumed_article_set.add(current_article_id)
                unused_article_set.remove(current_article_id)

            for fidx, file in enumerate(self.output_test_files):
                nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)

                # Maintain the max sentence count
                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                    max_sentences -= 1

                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
                    nominal_next_article_size -= 1

                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median:
                    continue    # skip adding to this file, will come back later if no file can accept unused articles

                current_article_id = sentence_counts[nominal_next_article_size][-1]
                sentence_counts[nominal_next_article_size].pop(-1)

                self.output_test_files[file].append(current_article_id)
                consumed_article_set.add(current_article_id)
                unused_article_set.remove(current_article_id)

            # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
            if len(history_remaining) == n_history_remaining:
                history_remaining.pop(0)
            history_remaining.append(len(unused_article_set))

            # True when the last few passes all left the same number of articles
            # unplaced (also true while only one pass has been recorded).
            history_same = True
            for i in range(1, len(history_remaining)):
                history_same = history_same and (history_remaining[i-1] == history_remaining[i])

            if history_same:
                nominal_sentences_per_training_shard += 1
                # nominal_sentences_per_test_shard += 1

            # The per-shard totals are only refreshed once per pass, so within a
            # pass every shard is judged against the totals of the previous pass.
            training_counts = self._sentence_totals(self.output_training_files)
            test_counts = self._sentence_totals(self.output_test_files)

            training_median = statistics.median(training_counts)
            test_median = statistics.median(test_counts)

            print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')

        if len(unused_article_set) != 0:
            print('Warning: Some articles did not make it into output files.')

        for shard in self.output_training_files:
            print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))

        for shard in self.output_test_files:
            print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))

        print('End: Distribute Articles Over Shards')

    def write_shards_to_disk(self):
        """Write every training and test shard to its file."""
        print('Start: Write Shards to Disk')
        for shard in self.output_training_files:
            self.write_single_shard(shard, self.output_training_files[shard], 'training')

        for shard in self.output_test_files:
            self.write_single_shard(shard, self.output_test_files[shard], 'test')

        print('End: Write Shards to Disk')

    def write_single_shard(self, shard_name, shard, split):
        """Write one shard, one sentence per line with a blank line after each article.

        Args:
            shard_name: Shard file path; ``split`` is inserted as a directory
                between its directory part and its file name.
            shard: List of article ids belonging to the shard.
            split: Sub-directory name ('training' or 'test'); it must already exist.
        """
        shard_split = os.path.split(shard_name)
        shard_name = shard_split[0] + '/' + split + '/' + shard_split[1]

        with open(shard_name, mode='w', newline='\n') as f:
            for article_id in shard:
                for line in self.sentences[article_id]:
                    f.write(line + '\n')

                f.write('\n')  # Line break between articles
# NLTK is imported here, right before the segmenter class that uses it,
# rather than with the imports at the top of the file.
import nltk

# Executed at import time: every import of this module triggers this call.
# Presumably it fetches the 'punkt' tokenizer data required by
# nltk.tokenize.sent_tokenize below (and may need network access) -- confirm.
nltk.download('punkt')
class NLTKSegmenter:
    """Sentence segmenter that delegates to ``nltk.tokenize.sent_tokenize``."""

    # Was misspelled `__init`, which Python treats as an ordinary
    # (name-mangled) method and never calls on construction.
    def __init__(self):
        """Nothing to configure."""
        pass

    def segment_string(self, article):
        """Return the result of ``nltk.tokenize.sent_tokenize(article)``."""
        return nltk.tokenize.sent_tokenize(article)
@@ -0,0 +1,58 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bz2
import os
import urllib.request
import sys
import subprocess
class WikiDownloader:
    """Download the latest Wikipedia articles dump for a language and decompress it."""

    def __init__(self, language, save_path):
        """Create ``<save_path>/wikicorpus_<language>`` and set up the URL tables.

        Args:
            language: Language code; only the keys of ``self.download_urls``
                ('en', 'zh') can actually be downloaded.
            save_path: Parent directory of the per-language download directory.
        """
        self.save_path = save_path + '/wikicorpus_' + language

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.save_path, exist_ok=True)

        self.language = language
        self.download_urls = {
            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        }

        self.output_files = {
            'en' : 'wikicorpus_en.xml.bz2',
            'zh' : 'wikicorpus_zh.xml.bz2'
        }

    def download(self):
        """Download the dump unless it is already on disk, then decompress it with ``bzip2``.

        Raises:
            AssertionError: if ``self.language`` has no entry in ``self.download_urls``.
            subprocess.CalledProcessError: if ``bzip2`` exits with a non-zero status.
        """
        if self.language not in self.download_urls:
            # Explicit raise so the check survives `python -O`.
            raise AssertionError('WikiDownloader not implemented for this language yet.')

        url = self.download_urls[self.language]
        filename = self.output_files[self.language]
        target = self.save_path + '/' + filename

        print('Downloading:', url)
        if os.path.isfile(target):
            print('** Download file already exists, skipping download')
        else:
            # Local import: shutil is not among this file's top-level imports.
            import shutil

            # Stream to a temporary name and rename when complete: the dump is
            # far too large to buffer in memory, and an interrupted transfer
            # must not be mistaken for a finished download on the next run.
            partial = target + '.part'
            with urllib.request.urlopen(url) as response, open(partial, "wb") as handle:
                shutil.copyfileobj(response, handle)
            os.replace(partial, target)

        # Always unzipping since this is relatively fast and will overwrite.
        # -f is required for the overwrite: without it bzip2 refuses to replace
        # an existing output file and a second run fails.  Passing a list
        # (no shell) keeps paths containing spaces intact.
        print('Unzipping:', self.output_files[self.language])
        subprocess.run(['bzip2', '-dkf', target], check=True)
@@ -0,0 +1,46 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
class WikicorpusTextFormatting:
    """Flatten ``<doc>``-delimited ``wiki_*`` files into a one-article-per-line text file."""

    def __init__(self, wiki_path, output_filename, recursive = False):
        """Remember the input directory, the output file and the glob ``recursive`` flag."""
        self.wiki_path, self.output_filename = wiki_path, output_filename
        self.recursive = recursive

    # This puts one article per line
    def merge(self):
        """Write each article found under ``self.wiki_path`` as one line followed by a blank line.

        Every ``wiki_*`` file in every direct sub-directory of
        ``self.wiki_path`` is scanned.  Lines between a ``<doc id=`` line and
        the next ``</doc>`` line form an article; its first line and all
        empty lines are dropped, and the rest is right-stripped and joined
        with single spaces.
        """
        with open(self.output_filename, mode='w', newline='\n') as merged:
            for subdir in glob.glob(self.wiki_path + '/*/', recursive=False):
                for shard_path in glob.glob(subdir + 'wiki_*', recursive=self.recursive):
                    print(shard_path)
                    buffered = []
                    inside_doc = False

                    with open(shard_path, mode='r', newline='\n') as shard:
                        for raw in shard:
                            if '<doc id=' in raw:
                                # Opening tag: start collecting, the tag itself is dropped.
                                inside_doc = True
                                continue

                            if '</doc>' not in raw:
                                # Ordinary line: keep it only while inside an article.
                                if inside_doc:
                                    buffered.append(raw)
                                continue

                            # Closing tag: flush the article, skipping its first line.
                            inside_doc = False
                            pieces = [text.rstrip() + " " for text in buffered[1:] if text != '\n']
                            merged.write("".join(pieces))
                            merged.write("\n\n")
                            buffered = []
@@ -0,0 +1,12 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,387 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import BookscorpusTextFormatting
import Downloader
import TextSharding
import WikicorpusTextFormatting
import PubMedTextFormatting
import argparse
import itertools
import multiprocessing
import os
import pprint
import subprocess
def main(args):
working_dir = os.environ['BERT_PREP_WORKING_DIR']
print('Working Directory:', working_dir)
print('Action:', args.action)
print('Dataset Name:', args.dataset)
if args.input_files:
args.input_files = args.input_files.split(',')
hdf5_tfrecord_folder_prefix = "/lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+ "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+ "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \
+ "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100))
directory_structure = {
'download' : working_dir + '/download', # Downloaded and decompressed
'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor)
'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same
'sharded' : working_dir + '/sharded',
'tfrecord' : working_dir + '/tfrecord' + hdf5_tfrecord_folder_prefix,
'hdf5': working_dir + '/hdf5'+ hdf5_tfrecord_folder_prefix,
}
print('\nDirectory Structure:')
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(directory_structure)
print('')
if args.action == 'download':
if not os.path.exists(directory_structure['download']):
os.makedirs(directory_structure['download'])
downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
downloader.download()
elif args.action == 'text_formatting':
assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \
and args.dataset != 'squad' and args.dataset != 'MRPC' and args.dataset != 'CoLA' and \
args.dataset != 'MNLI', 'Cannot perform text_formatting on pretrained weights'
if not os.path.exists(directory_structure['extracted']):
os.makedirs(directory_structure['extracted'])
if not os.path.exists(directory_structure['formatted']):
os.makedirs(directory_structure['formatted'])
if args.dataset == 'bookscorpus':
books_path = directory_structure['download'] + '/bookscorpus'
#books_path = directory_structure['download']
output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
books_formatter.merge()
elif args.dataset == 'wikicorpus_en':
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
elif args.dataset == 'wikicorpus_zh':
assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
elif args.dataset == 'pubmed_baseline':
pubmed_path = directory_structure['download'] + '/pubmed' + '/baseline'
output_filename = directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt'
pubmed_formatter = PubMedTextFormatting.PubMedTextFormatting(pubmed_path, output_filename, recursive=True)
pubmed_formatter.merge()
elif args.action == 'sharding':
# Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset or 'pubmed' in args.dataset:
if args.input_files is None:
if args.dataset == 'bookscorpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
elif args.dataset == 'wikicorpus_en':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
elif args.dataset == 'wikicorpus_zh':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
elif args.dataset == 'books_wiki_en_corpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
elif args.dataset == 'pubmed_baseline':
args.input_files = [directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt']
output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
if not os.path.exists(directory_structure['sharded']):
os.makedirs(directory_structure['sharded'])
if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/training'):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/training')
if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/test'):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/test')
# Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
# it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
# Different languages (e.g., Chinese simplified/traditional) may require translation and
# other packages to be called from here -- just add a conditional branch for those extra steps
segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
sharding.load_articles()
sharding.segment_articles_into_sentences(segmenter)
sharding.distribute_articles_over_shards()
sharding.write_shards_to_disk()
else:
assert False, 'Unsupported dataset for sharding'
elif args.action == 'create_tfrecord_files':
if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)
if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/training'):
os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/training')
if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/test'):
os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/test')
last_process = None
def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'):
bert_preprocessing_command = 'python /workspace/bert/utils/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
output_file_prefix = args.dataset
for i in range(args.n_training_shards):
last_process = create_record_worker(output_file_prefix + '_training', i, 'tfrecord', 'training')
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(output_file_prefix + '_test', i, 'tfrecord', 'test')
last_process.wait()
elif args.action == 'create_hdf5_files':
assert False, 'HDF5 format not fully supported in this release.'
if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
last_process = None
def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
bert_preprocessing_command = 'python /workspace/bert/utils/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + args.max_seq_length
bert_preprocessing_command += ' --max_predictions_per_seq=' + args.max_predictions_per_seq
bert_preprocessing_command += ' --masked_lm_prob=' + args.masked_lm_prob
bert_preprocessing_command += ' --random_seed=' + args.random_seed
bert_preprocessing_command += ' --dupe_factor=' + args.dupe_factor
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
for i in range(args.n_training_shards):
create_record_worker(args.output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
create_record_worker(args.output_file_prefix + '_test', i)
last_process.wait()
if __name__ == "__main__":
    # Command-line entry point: declare every option understood by the
    # preprocessing app, parse argv, and hand the namespace to main().
    parser = argparse.ArgumentParser(
        description='Preprocessing Application for Everything BERT-related'
    )

    # `choices` are lists rather than set literals so that argparse prints
    # them in declaration order in --help and in "invalid choice" errors
    # (set iteration order for strings changes from run to run).
    parser.add_argument(
        '--action',
        type=str,
        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
        choices=[
            'download',               # Download and verify md5/sha sums
            'text_formatting',        # Convert into a file that contains one article/book per line
            'sharding',               # Convert previous formatted text into shards containing one sentence per line
            'create_tfrecord_files',  # Turn each shard into a TFrecord with masking and next sentence prediction info
            'create_hdf5_files',      # Turn each shard into a HDF5 file with masking and next sentence prediction info
        ]
    )

    parser.add_argument(
        '--dataset',
        type=str,
        help='Specify the dataset to perform --action on',
        choices=[
            'bookscorpus',
            'wikicorpus_en',
            'wikicorpus_zh',
            'books_wiki_en_corpus',
            'pubmed_baseline',
            'pubmed_daily_update',
            'pubmed_fulltext',
            'pubmed_open_access',
            'google_pretrained_weights',
            'nvidia_pretrained_weights',
            'squad',
            'MRPC',
            'CoLA',
            'MNLI',
            'all',
        ]
    )

    parser.add_argument(
        '--input_files',
        type=str,
        help='Specify the input files in a comma-separated list (no spaces)'
    )

    parser.add_argument(
        '--n_training_shards',
        type=int,
        help='Specify the number of training shards to generate',
        default=1472
    )

    parser.add_argument(
        '--n_test_shards',
        type=int,
        help='Specify the number of test shards to generate',
        default=1472
    )

    parser.add_argument(
        '--fraction_test_set',
        type=float,
        help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
        default=0.1
    )

    parser.add_argument(
        '--segmentation_method',
        type=str,
        help='Specify your choice of sentence segmentation',
        choices=[
            'nltk',
        ],
        default='nltk'
    )

    parser.add_argument(
        '--n_processes',
        type=int,
        help='Specify the max number of processes to allow at one time',
        default=4
    )

    parser.add_argument(
        '--random_seed',
        type=int,
        help='Specify the base seed to use for any random number generation',
        default=12345
    )

    parser.add_argument(
        '--dupe_factor',
        type=int,
        help='Specify the duplication factor',
        default=5
    )

    parser.add_argument(
        '--masked_lm_prob',
        type=float,
        help='Specify the probability for masked lm',
        default=0.15
    )

    parser.add_argument(
        '--max_seq_length',
        type=int,
        help='Specify the maximum sequence length',
        default=512
    )

    parser.add_argument(
        '--max_predictions_per_seq',
        type=int,
        help='Specify the maximum number of masked words per sequence',
        default=20
    )

    # Integer rather than a boolean switch: callers pass --do_lower_case=0
    # to select cased processing.
    parser.add_argument(
        '--do_lower_case',
        type=int,
        help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
        default=1
    )

    parser.add_argument(
        '--vocab_file',
        type=str,
        help='Specify absolute path to vocab file to use'
    )

    parser.add_argument(
        '--skip_wikiextractor',
        type=int,
        help='Specify whether to skip wikiextractor step 0=False, 1=True',
        default=0
    )

    # NOTE(review): this help text is identical to the one for --action and
    # looks copy-pasted; confirm what this option is meant to do and reword.
    parser.add_argument(
        '--interactive_json_config_generator',
        type=str,
        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
    )

    args = parser.parse_args()
    main(args)
@@ -0,0 +1,55 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort with a clear message when the working directory is not provided;
# without this guard every command below would try to run "/bertPrep.py".
: "${BERT_PREP_WORKING_DIR:?BERT_PREP_WORKING_DIR must be set to the directory containing bertPrep.py}"
export BERT_PREP_WORKING_DIR

# All paths are quoted so a working directory containing spaces still works.
BERT_PREP="${BERT_PREP_WORKING_DIR}/bertPrep.py"
GOOGLE_WEIGHTS_DIR="${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights"

# Download
python3 "${BERT_PREP}" --action download --dataset pubmed_baseline
python3 "${BERT_PREP}" --action download --dataset google_pretrained_weights  # Includes vocab

# Properly format the text files
python3 "${BERT_PREP}" --action text_formatting --dataset pubmed_baseline

# Shard the text files
python3 "${BERT_PREP}" --action sharding --dataset pubmed_baseline

# NOTE(review): the four create_tfrecord_files runs below all target the same
# dataset; confirm each run writes to a distinct output directory, otherwise
# later runs overwrite the records produced by earlier ones.

### BERT BASE

## UNCASED

# Create TFRecord files Phase 1
python3 "${BERT_PREP}" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \
  --max_predictions_per_seq 20 --vocab_file "${GOOGLE_WEIGHTS_DIR}/uncased_L-12_H-768_A-12/vocab.txt"

# Create TFRecord files Phase 2
python3 "${BERT_PREP}" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \
  --max_predictions_per_seq 80 --vocab_file "${GOOGLE_WEIGHTS_DIR}/uncased_L-12_H-768_A-12/vocab.txt"

## CASED

# Create TFRecord files Phase 1
python3 "${BERT_PREP}" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \
  --max_predictions_per_seq 20 --vocab_file "${GOOGLE_WEIGHTS_DIR}/cased_L-12_H-768_A-12/vocab.txt" \
  --do_lower_case=0

# Create TFRecord files Phase 2
python3 "${BERT_PREP}" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \
  --max_predictions_per_seq 80 --vocab_file "${GOOGLE_WEIGHTS_DIR}/cased_L-12_H-768_A-12/vocab.txt" \
  --do_lower_case=0
@@ -0,0 +1,46 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort with a clear message when the working directory is not provided;
# without this guard every command below would try to run "/bertPrep.py".
: "${BERT_PREP_WORKING_DIR:?BERT_PREP_WORKING_DIR must be set to the directory containing bertPrep.py}"
export BERT_PREP_WORKING_DIR

# All paths are quoted so a working directory containing spaces still works.
BERT_PREP="${BERT_PREP_WORKING_DIR}/bertPrep.py"
VOCAB_FILE="${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"

# Download
python3 "${BERT_PREP}" --action download --dataset bookscorpus
python3 "${BERT_PREP}" --action download --dataset wikicorpus_en
python3 "${BERT_PREP}" --action download --dataset google_pretrained_weights  # Includes vocab
python3 "${BERT_PREP}" --action download --dataset squad
python3 "${BERT_PREP}" --action download --dataset "CoLA"
python3 "${BERT_PREP}" --action download --dataset "MRPC"
python3 "${BERT_PREP}" --action download --dataset "MNLI"

# Properly format the text files
python3 "${BERT_PREP}" --action text_formatting --dataset bookscorpus
python3 "${BERT_PREP}" --action text_formatting --dataset wikicorpus_en

# Shard the text files (group wiki+books then shard)
python3 "${BERT_PREP}" --action sharding --dataset books_wiki_en_corpus

# NOTE(review): both create_tfrecord_files runs below target the same dataset;
# confirm each run writes to a distinct output directory, otherwise Phase 2
# overwrites the records produced by Phase 1.

# Create TFRecord files Phase 1
python3 "${BERT_PREP}" --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 128 \
  --max_predictions_per_seq 20 --vocab_file "${VOCAB_FILE}"

# Create TFRecord files Phase 2
python3 "${BERT_PREP}" --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 512 \
  --max_predictions_per_seq 80 --vocab_file "${VOCAB_FILE}"
Binary file not shown.

After

Width:  |  Height:  |  Size: 208 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30528
}
@@ -0,0 +1,419 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import collections
import json
import re
import modeling
import tokenization
import tensorflow as tf
# Command-line flag registry for this script; parsed values are read via FLAGS.
flags = tf.flags
FLAGS = flags.FLAGS
# Text file to read examples from: one example per line, optionally two text
# segments separated by " ||| " (this is what read_examples parses).
flags.DEFINE_string("input_file", None, "")
# Destination file; main() writes one JSON object per line to it.
flags.DEFINE_string("output_file", None, "")
# Comma-separated integers used to index the model's list of encoder layers.
# Presumably negative values count back from the last layer -- confirm against
# modeling.BertModel.get_all_encoder_layers().
flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
flags.DEFINE_string(
"bert_config_file", None,
"The config json file corresponding to the pre-trained BERT model. "
"This specifies the model architecture.")
flags.DEFINE_integer(
"max_seq_length", 128,
"The maximum total input sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated, and sequences shorter "
"than this will be padded.")
flags.DEFINE_string(
"init_checkpoint", None,
"Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_bool(
"do_lower_case", True,
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
# Passed to TPUEstimator as predict_batch_size.
flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
flags.DEFINE_string("master", None,
"If using a TPU, the address of the master.")
flags.DEFINE_integer(
"num_tpu_cores", 8,
"Only used if `use_tpu` is True. Total number of TPU cores to use.")
flags.DEFINE_bool(
"use_one_hot_embeddings", False,
"If True, tf.one_hot will be used for embedding lookups, otherwise "
"tf.nn.embedding_lookup will be used. On TPUs, this should be True "
"since it is much faster.")
class InputExample(object):
    """A raw example: an id, a first text segment and an optional second one."""

    def __init__(self, unique_id, text_a, text_b):
        # Plain value holder; no validation or copying is performed.
        self.unique_id, self.text_a, self.text_b = unique_id, text_a, text_b
class InputFeatures(object):
    """Model-ready features for one example.

    Stores the example id, its token strings, and the three per-position
    integer lists (`input_ids`, `input_mask`, `input_type_ids`) exactly as
    given.
    """

    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        self.unique_id, self.tokens = unique_id, tokens
        self.input_ids, self.input_mask = input_ids, input_mask
        self.input_type_ids = input_type_ids
def input_fn_builder(features, seq_length):
    """Build the `input_fn` closure that feeds `features` to TPUEstimator.

    Args:
        features: sequence of objects exposing `unique_id`, `input_ids`,
            `input_mask` and `input_type_ids`.
        seq_length: second dimension of the 2-D constant tensors that hold the
            per-token lists.

    Returns:
        A function taking the estimator `params` dict (reads
        `params["batch_size"]`) and returning a batched `tf.data.Dataset`.
    """
    # Gather each field into its own column in a single pass over `features`.
    unique_ids, token_ids, masks, segment_ids = [], [], [], []
    for item in features:
        unique_ids.append(item.unique_id)
        token_ids.append(item.input_ids)
        masks.append(item.input_mask)
        segment_ids.append(item.input_type_ids)

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]
        example_count = len(features)
        matrix_shape = [example_count, seq_length]

        def as_int32(values, shape):
            return tf.constant(values, shape=shape, dtype=tf.int32)

        # This is for demo purposes and does NOT scale to large data sets. We
        # do not use Dataset.from_generator() because that uses tf.py_func
        # which is not TPU compatible. The right way to load data is with
        # TFRecordReader.
        tensors = {
            "unique_ids": as_int32(unique_ids, [example_count]),
            "input_ids": as_int32(token_ids, matrix_shape),
            "input_mask": as_int32(masks, matrix_shape),
            "input_type_ids": as_int32(segment_ids, matrix_shape),
        }
        dataset = tf.data.Dataset.from_tensor_slices(tensors)
        # Keep the final partial batch so every example gets a prediction.
        return dataset.batch(batch_size=batch_size, drop_remainder=False)

    return input_fn
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
    """Build the prediction-only `model_fn` closure for TPUEstimator.

    Args:
        bert_config: configuration object passed to `modeling.BertModel`.
        init_checkpoint: checkpoint used to initialise trainable variables.
        layer_indexes: indexes into the encoder layer list to export.
        use_tpu: when true, checkpoint initialisation is deferred to a
            scaffold function instead of running while the graph is built.
        use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
        A `model_fn(features, labels, mode, params)` callable.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""
        unique_ids = features["unique_ids"]
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        input_type_ids = features["input_type_ids"]

        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=input_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        if mode != tf.estimator.ModeKeys.PREDICT:
            raise ValueError("Only PREDICT modes are supported: %s" % (mode))

        trainable_vars = tf.trainable_variables()
        checkpoint_info = modeling.get_assignment_map_from_checkpoint(
            trainable_vars, init_checkpoint)
        assignment_map, initialized_variable_names = checkpoint_info

        scaffold_fn = None
        if not use_tpu:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        else:

            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold

        # Report every trainable variable and whether the checkpoint covers it.
        tf.logging.info("**** Trainable Variables ****")
        for variable in trainable_vars:
            from_checkpoint = variable.name in initialized_variable_names
            init_string = ", *INIT_FROM_CKPT*" if from_checkpoint else ""
            tf.logging.info(" name = %s, shape = %s%s", variable.name,
                            variable.shape, init_string)

        encoder_layers = model.get_all_encoder_layers()

        # Output keys are numbered by position in `layer_indexes`, not by the
        # layer index itself.
        predictions = {"unique_id": unique_ids}
        for position, layer_index in enumerate(layer_indexes):
            predictions["layer_output_%d" % position] = encoder_layers[layer_index]

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)

    return model_fn
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Tokenize `examples` and pack each into fixed-length `InputFeatures`.

    Args:
        examples: iterable of objects with `unique_id`, `text_a`, `text_b`.
        seq_length: exact length of every produced id/mask/type list.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A list with one `InputFeatures` per example, in input order.
    """
    packed = []
    for position, example in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None

        if tokens_b:
            # Trims `tokens_a` and `tokens_b` in place, leaving room for
            # [CLS], [SEP], [SEP] (hence "- 3").
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        elif len(tokens_a) > seq_length - 2:
            # Single segment: leave room for [CLS] and [SEP] (hence "- 2").
            tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # "type_ids" mark whether a position belongs to the first or the second
        # sequence; [CLS] and the first [SEP] count as part of the first one.
        tokens = ["[CLS]"]
        tokens.extend(tokens_a)
        tokens.append("[SEP]")
        input_type_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append("[SEP]")
            input_type_ids.extend([1] * (len(tokens) - len(input_type_ids)))

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad all three lists up to the sequence length (a non-positive
        # pad width extends by nothing).
        padding = [0] * (seq_length - len(input_ids))
        input_ids.extend(padding)
        input_mask.extend(padding)
        input_type_ids.extend(padding)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        # Log the first few examples for a quick sanity check.
        if position < 5:
            printable_tokens = [tokenization.printable_text(x) for x in tokens]
            tf.logging.info("*** Example ***")
            tf.logging.info("unique_id: %s" % (example.unique_id))
            tf.logging.info("tokens: %s" % " ".join(printable_tokens))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info(
                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

        packed.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return packed
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    Each line is one example. A line of the form "<a> ||| <b>" yields a
    two-segment example; any other line becomes `text_a` with `text_b` set to
    None. Ids are assigned sequentially from 0 in file order.
    """
    pair_pattern = re.compile(r"^(.*) \|\|\| (.*)$")
    examples = []
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            raw_line = tokenization.convert_to_unicode(reader.readline())
            # readline() yields an empty string only at end of file; blank
            # lines still carry their newline and are kept as examples.
            if not raw_line:
                break
            stripped = raw_line.strip()
            match = pair_pattern.match(stripped)
            if match is None:
                text_a, text_b = stripped, None
            else:
                text_a, text_b = match.group(1), match.group(2)
            # The running list length doubles as the next sequential id.
            examples.append(
                InputExample(
                    unique_id=len(examples), text_a=text_a, text_b=text_b))
    return examples
def main(_):
    """Run BERT in PREDICT mode over FLAGS.input_file and dump features.

    Writes one JSON object per example to FLAGS.output_file. Each object holds
    the example id under "linex_index" and, under "features", one entry per
    token with the values of every layer requested in FLAGS.layers.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    layer_indexes = [int(index) for index in FLAGS.layers.split(",")]

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_config = tf.contrib.tpu.TPUConfig(
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master, tpu_config=tpu_config)

    examples = read_examples(FLAGS.input_file)

    features = convert_examples_to_features(
        examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)

    # Lookup used to match each prediction back to its tokens.
    unique_id_to_feature = {item.unique_id: item for item in features}

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(
        features=features, seq_length=FLAGS.max_seq_length)

    output_stream = tf.gfile.Open(FLAGS.output_file, "w")
    with codecs.getwriter("utf-8")(output_stream) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]

            token_records = []
            for token_pos, token in enumerate(feature.tokens):
                layer_records = []
                for output_pos, layer_index in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % output_pos]
                    # Slice out this token's row and flatten it to rounded
                    # Python floats so it can be JSON-encoded.
                    token_row = layer_output[token_pos:(token_pos + 1)]
                    layer_records.append(collections.OrderedDict([
                        ("index", layer_index),
                        ("values", [round(float(x), 6) for x in token_row.flat]),
                    ]))
                token_records.append(collections.OrderedDict([
                    ("token", token),
                    ("layers", layer_records),
                ]))

            output_json = collections.OrderedDict([
                ("linex_index", unique_id),
                ("features", token_records),
            ])
            writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
    # Every one of these flags defaults to None and must be given explicitly.
    for required_flag in ("input_file", "vocab_file", "bert_config_file",
                          "init_checkpoint", "output_file"):
        flags.mark_flag_as_required(required_flag)
    tf.app.run()
@@ -0,0 +1,35 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Custom variable getter that stores trainable variables in float32.

    Trainable variables are created through `getter` with dtype float32 and
    then cast to the requested `dtype` when it differs; non-trainable
    variables are created with `dtype` as requested and returned unchanged.
    """
    if trainable:
        storage_dtype = tf.float32
    else:
        storage_dtype = dtype

    stored = getter(name, shape, *args,
                    dtype=storage_dtype,
                    initializer=initializer,
                    regularizer=regularizer,
                    trainable=trainable,
                    **kwargs)

    # Only trainable variables were forced to float32, so only they may need
    # casting back to the precision the caller asked for.
    if trainable and dtype != tf.float32:
        return tf.cast(stored, dtype)
    return stored
@@ -0,0 +1,141 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import copy
import json
import math
import re
import six
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.contrib.framework.python.ops import variables
from tensorflow.python.ops import init_ops
import numpy
from tensorflow.python.ops import array_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import nn
def fused_layer_norm(inputs,
                     center=True,
                     scale=True,
                     activation_fn=None,
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     trainable=True,
                     begin_norm_axis=1,
                     begin_params_axis=-1,
                     scope=None,
                     use_fused_batch_norm=False):
    """Layer normalization with an optional fused-batch-norm code path.

    Args:
        inputs: value convertible to a tensor with a statically known rank.
        center: if true, create a `beta` variable and add it to the output.
        scale: if true, create a `gamma` variable and multiply the output by it.
        activation_fn: optional callable applied to the normalized output.
        reuse: forwarded to `tf.variable_scope`.
        variables_collections: collections for the `beta`/`gamma` variables.
        outputs_collections: collections the output is added to.
        trainable: whether `beta` and `gamma` are trainable.
        begin_norm_axis: first axis normalized over; axes from here to the
            last one are normalized. Negative values count from the end.
        begin_params_axis: `beta`/`gamma` take the shape
            `inputs.shape[begin_params_axis:]`.
        scope: optional variable scope name (default name 'LayerNorm').
        use_fused_batch_norm: if true, compute the normalization with
            `nn.fused_batch_norm` on a reshaped view of the inputs; otherwise
            use `nn.moments` + `nn.batch_normalization`.

    Returns:
        The result of `utils.collect_named_outputs` for the normalized
        (and optionally activated) tensor.

    Raises:
        ValueError: if the rank of `inputs` is undefined, if either axis is
            not smaller than the rank, or if the parameter shape is not
            fully defined.
    """
    with tf.variable_scope(
            scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.shape
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        # Only begin_norm_axis is converted to a non-negative index here; a
        # negative begin_params_axis is used as-is in the slice below.
        if begin_norm_axis < 0:
            begin_norm_axis = inputs_rank + begin_norm_axis
        if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
            raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                             'must be < rank(inputs) (%d)' %
                             (begin_params_axis, begin_norm_axis, inputs_rank))
        params_shape = inputs_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (inputs.name, begin_params_axis, inputs_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(variables_collections,
                                                              'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer(),
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer(),
                collections=gamma_collections,
                trainable=trainable)
        if use_fused_batch_norm:
            # get static TensorShape if fully defined,
            # otherwise retrieve shape tensor
            norm_shape = inputs.shape[begin_norm_axis:]
            # Target layout [1, rows, 1, normalized_size]: everything before
            # begin_norm_axis is folded into the second dimension.
            if norm_shape.is_fully_defined():
                bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
            else:
                norm_shape = tf.shape(inputs)[begin_norm_axis:]
                bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
            # Remember the original shape so the result can be reshaped back.
            if inputs.get_shape().is_fully_defined():
                outputs_shape = inputs.get_shape()
            else:
                outputs_shape = tf.shape(inputs)
            inputs = array_ops.reshape(inputs, bn_shape)
            # Unit scale / zero offset for the fused op; gamma and beta are
            # applied separately after reshaping back.
            if inputs.get_shape().is_fully_defined():
                # static inputs TensorShape fully defined after reshape.
                ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32)
                zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32)
            else:
                # static inputs TensorShape NOT fully defined after reshape.
                # must use dynamic shape, which means these input tensors
                # have to be created at runtime, which causes a slowdown.
                scale_shape = tf.shape(inputs)[1]
                ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
                zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
            # mean and variance are returned by the op but not used below.
            outputs, mean, variance = nn.fused_batch_norm(
                inputs,
                ones, zeros,
                epsilon=1e-4,
                data_format="NCHW")
            outputs = array_ops.reshape(outputs, outputs_shape)
            if center and scale:
                outputs = outputs * gamma + beta
            elif center:
                outputs = outputs + beta
            elif scale:
                outputs = outputs * gamma
        else:
            # Calculate the moments on the last axis (layer activations).
            norm_axes = list(range(begin_norm_axis, inputs_rank))
            mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
            # Compute layer normalization using the batch_normalization function.
            # Same epsilon as the fused path above.
            variance_epsilon = 1e-4
            outputs = nn.batch_normalization(
                inputs,
                mean,
                variance,
                offset=beta,
                scale=gamma,
                variance_epsilon=variance_epsilon)
            outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
@@ -0,0 +1,36 @@
# coding=utf-8
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
import numpy as np
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Variable getter keeping trainable variables in float32 storage.

    The variable is created via `getter` with float32 storage when it is
    trainable (with `dtype` otherwise). A trainable variable whose requested
    `dtype` is not float32 is returned as a cast to that `dtype`.
    """
    creation_kwargs = dict(
        kwargs,
        dtype=tf.float32 if trainable else dtype,
        initializer=initializer,
        regularizer=regularizer,
        trainable=trainable)
    stored = getter(name, shape, *args, **creation_kwargs)

    must_cast = trainable and dtype != tf.float32
    return tf.cast(stored, dtype) if must_cast else stored
def get_custom_getter(compute_type):
    """Return the float32-storage getter for float16 compute, else None."""
    if compute_type == tf.float16:
        return float32_variable_storage_getter
    return None
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,277 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import json
import random
import re
import modeling
import six
import tensorflow as tf
class BertModelTest(tf.test.TestCase):
    """Tests for `modeling.BertModel` and `modeling.BertConfig`."""

    class BertModelTester(object):
        """Builds a `modeling.BertModel` on random inputs and checks shapes."""

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_input_mask=True,
                     use_token_type_ids=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     initializer_range=0.02,
                     scope=None):
            """Stores the owning test case and the model hyperparameters."""
            # `parent` is the TestCase whose assert helpers check_output uses.
            self.parent = parent
            # Input dimensions and optional-input switches.
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_input_mask = use_input_mask
            self.use_token_type_ids = use_token_type_ids
            # Values forwarded to modeling.BertConfig in create_model.
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.scope = scope

        def create_model(self):
            """Creates random inputs and a BertModel; returns its output dict."""
            token_shape = [self.batch_size, self.seq_length]

            input_ids = BertModelTest.ids_tensor(token_shape, self.vocab_size)
            input_mask = (
                BertModelTest.ids_tensor(token_shape, vocab_size=2)
                if self.use_input_mask else None)
            token_type_ids = (
                BertModelTest.ids_tensor(token_shape, self.type_vocab_size)
                if self.use_token_type_ids else None)

            config = modeling.BertConfig(
                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range)

            model = modeling.BertModel(
                config=config,
                is_training=self.is_training,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=token_type_ids,
                scope=self.scope)

            return {
                "embedding_output": model.get_embedding_output(),
                "sequence_output": model.get_sequence_output(),
                "pooled_output": model.get_pooled_output(),
                "all_encoder_layers": model.get_all_encoder_layers(),
            }

        def check_output(self, result):
            """Asserts the shapes of the embedding, sequence and pooled outputs."""
            per_token_shape = [
                self.batch_size, self.seq_length, self.hidden_size]
            for key in ("embedding_output", "sequence_output"):
                self.parent.assertAllEqual(result[key].shape, per_token_shape)

            self.parent.assertAllEqual(result["pooled_output"].shape,
                                       [self.batch_size, self.hidden_size])

    def test_default(self):
        """Runs the tester with its default hyperparameters."""
        self.run_tester(BertModelTest.BertModelTester(self))

    def test_config_to_json_string(self):
        """Checks that BertConfig fields survive a JSON round trip."""
        config = modeling.BertConfig(vocab_size=99, hidden_size=37)
        decoded = json.loads(config.to_json_string())
        self.assertEqual(decoded["vocab_size"], 99)
        self.assertEqual(decoded["hidden_size"], 37)

    def run_tester(self, tester):
        """Builds the tester's model, runs it, and validates the results."""
        with self.test_session() as sess:
            model_outputs = tester.create_model()
            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())
            sess.run(init_op)
            tester.check_output(sess.run(model_outputs))

            self.assert_all_tensors_reachable(sess, [init_op, model_outputs])

    @classmethod
    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
        """Creates a random int32 tensor of the shape within the vocab size."""
        if rng is None:
            rng = random.Random()

        # Number of scalar entries is the product of all dimensions.
        num_values = 1
        for dim in shape:
            num_values *= dim

        values = [rng.randint(0, vocab_size - 1) for _ in range(num_values)]

        return tf.constant(
            value=values, dtype=tf.int32, shape=shape, name=name)

    def assert_all_tensors_reachable(self, sess, outputs):
        """Checks that all the tensors in the graph are reachable from outputs."""
        # Ops whose names fully match one of these patterns are not reported.
        ignore_regexes = [
            re.compile(pattern) for pattern in (
                "^.*/assert_less_equal/.*$",
                "^.*/dilation_rate$",
                "^.*/Tensordot/concat$",
                "^.*/Tensordot/concat/axis$",
                "^testing/.*$",
            )
        ]

        unreachable = [
            op for op in self.get_unreachable_ops(sess.graph, outputs)
            if not any(regex.match(op.name) is not None
                       for regex in ignore_regexes)
        ]

        self.assertEqual(
            len(unreachable), 0, "The following ops are unreachable: %s" %
            (" ".join([x.name for x in unreachable])))

    @classmethod
    def get_unreachable_ops(cls, graph, outputs):
        """Finds all of the tensors in graph that are unreachable from outputs."""
        outputs = cls.flatten_recursive(outputs)

        # tensor name -> names of the ops that produce it
        producers = collections.defaultdict(list)
        # op name -> names of its input tensors followed by its output tensors
        op_tensors = collections.defaultdict(list)
        # output tensor name of an "Assign" op -> names of that op's inputs
        assign_inputs = collections.defaultdict(list)

        for op in graph.get_operations():
            for tensor in op.inputs:
                op_tensors[op.name].append(tensor.name)
            for tensor in op.outputs:
                producers[tensor.name].append(op.name)
                op_tensors[op.name].append(tensor.name)
            if str(op.type) == "Assign":
                for out_tensor in op.outputs:
                    for in_tensor in op.inputs:
                        assign_inputs[out_tensor.name].append(in_tensor.name)

        # Each input of an Assign op is linked to that op's output and to the
        # op's other inputs, so visiting one of them visits the rest.
        assign_groups = collections.defaultdict(list)
        for out_name, group in assign_inputs.items():
            for member in group:
                assign_groups[member].append(out_name)
                for other in group:
                    if member != other:
                        assign_groups[member].append(other)

        # Depth-first walk over tensor names, starting from the outputs.
        seen = set()
        pending = [x.name for x in outputs]
        while pending:
            current = pending.pop()
            if current in seen:
                continue
            seen.add(current)

            for op_name in producers.get(current, ()):
                for tensor_name in op_tensors.get(op_name, ()):
                    if tensor_name not in pending:
                        pending.append(tensor_name)

            for linked_name in assign_groups.get(current, ()):
                if linked_name not in pending:
                    pending.append(linked_name)

        # An op is unreachable if any of its input/output tensors was not seen.
        unreachable_ops = []
        for op in graph.get_operations():
            tensor_names = ([x.name for x in op.inputs] +
                            [x.name for x in op.outputs])
            if any(tensor_name not in seen for tensor_name in tensor_names):
                unreachable_ops.append(op)
        return unreachable_ops

    @classmethod
    def flatten_recursive(cls, item):
        """Flattens (potentially nested) a tuple/dictionary/list to a list."""
        if isinstance(item, (list, tuple)):
            children = list(item)
        elif isinstance(item, dict):
            # Only the dictionary values are kept; keys are dropped.
            children = [value for (_, value) in six.iteritems(item)]
        else:
            # Anything that is not a list/tuple/dict is a leaf.
            return [item]

        flattened = []
        for child in children:
            flattened.extend(cls.flatten_recursive(child))
        return flattened
# Run the TensorFlow test runner when this file is executed as a script.
if __name__ == "__main__":
    tf.test.main()
@@ -0,0 +1,305 @@
## Models
There are two multilingual models currently available. We do not plan to release
more single-language models, but we may release `BERT-Large` versions of these
two in the future:
* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
parameters
**The `Multilingual Cased (New)` model also fixes normalization issues in many
languages, so it is recommended in languages with non-Latin alphabets (and is
often better for most languages with Latin alphabets). When using this model,
make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other
scripts.**
See the [list of languages](#list-of-languages) that the Multilingual model
supports. The Multilingual model does include Chinese (and English), but if your
fine-tuning data is Chinese-only, then the Chinese model will likely produce
better results.
## Results
To evaluate these systems, we use the
[XNLI dataset](https://github.com/facebookresearch/XNLI), which is a
version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the
dev and test sets have been translated (by humans) into 15 languages. Note that
the training set was *machine* translated (we used the translations provided by
XNLI, not Google NMT). For clarity, we only report on 6 languages below:
<!-- mdformat off(no table) -->
| System | English | Chinese | Spanish | German | Arabic | Urdu |
| --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
| XNLI Baseline - Translate Train | 73.7 | 67.0 | 68.8 | 66.5 | 65.8 | 56.6 |
| XNLI Baseline - Translate Test | 73.7 | 68.3 | 70.7 | 68.7 | 66.8 | 59.3 |
| BERT - Translate Train Cased | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6 |
| BERT - Translate Train Uncased | 81.4 | 74.2 | 77.3 | 75.2 | 70.5 | 61.7 |
| BERT - Translate Test Uncased | 81.4 | 70.1 | 74.9 | 74.4 | 70.4 | **62.1** |
| BERT - Zero Shot Uncased | 81.4 | 63.8 | 74.3 | 70.5 | 62.1 | 58.3 |
<!-- mdformat on -->
The first two rows are baselines from the XNLI paper and the last four rows are
our results with BERT.
**Translate Train** means that the MultiNLI training set was machine translated
from English into the foreign language. So training and evaluation were both
done in the foreign language. Unfortunately, training was done on
machine-translated data, so it is impossible to quantify how much of the lower
accuracy (compared to English) is due to the quality of the machine translation
vs. the quality of the pre-trained model.
**Translate Test** means that the XNLI test set was machine translated from the
foreign language into English. So training and evaluation were both done on
English. However, test evaluation was done on machine-translated English, so the
accuracy depends on the quality of the machine translation system.
**Zero Shot** means that the Multilingual BERT system was fine-tuned on English
MultiNLI, and then evaluated on the foreign language XNLI test. In this case,
machine translation was not involved at all in either the pre-training or
fine-tuning.
Note that the English result is worse than the 84.2 MultiNLI baseline because
this training used Multilingual BERT rather than English-only BERT. This implies
that for high-resource languages, the Multilingual model is somewhat worse than
a single-language model. However, it is not feasible for us to train and
maintain dozens of single-language models. Therefore, if your goal is to maximize
performance with a language other than English or Chinese, you might find it
beneficial to run pre-training for additional steps starting from our
Multilingual model on data from your language of interest.
Here is a comparison of training Chinese models with the Multilingual
`BERT-Base` and Chinese-only `BERT-Base`:
System | Chinese
----------------------- | -------
XNLI Baseline | 67.0
BERT Multilingual Model | 74.2
BERT Chinese-only Model | 77.2
Similar to English, the single-language model does 3% better than the
Multilingual model.
## Fine-tuning Example
The multilingual model does **not** require any special consideration or API
changes. We did update the implementation of `BasicTokenizer` in
`tokenization.py` to support Chinese character tokenization, so please update if
you forked it. However, we did not change the tokenization API.
To test the new models, we did modify `run_classifier.py` to add support for the
[XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language
version of MultiNLI where the dev/test sets have been human-translated, and the
training set has been machine-translated.
To run the fine-tuning code, please download the
[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the
[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip)
and then unpack both .zip files into some directory `$XNLI_DIR`.
When running fine-tuning on XNLI, note that the language is hard-coded into
`run_classifier.py` (Chinese by default), so please modify `XnliProcessor` if
you want to run on another language.
This is a large dataset, so training will take a few hours on a GPU
(or about 30 minutes on a Cloud TPU). To run an experiment quickly for
debugging, just set `num_train_epochs` to a small value like `0.1`.
```shell
export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12
export XNLI_DIR=/path/to/xnli
python run_classifier.py \
--task_name=XNLI \
--do_train=true \
--do_eval=true \
--data_dir=$XNLI_DIR \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--max_seq_length=128 \
--train_batch_size=32 \
--learning_rate=5e-5 \
--num_train_epochs=2.0 \
--output_dir=/tmp/xnli_output/
```
With the Chinese-only model, the results should look something like this:
```
***** Eval results *****
eval_accuracy = 0.774116
eval_loss = 0.83554
global_step = 24543
loss = 0.74603
```
## Details
### Data Source and Sampling
The languages chosen were the
[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).
The entire Wikipedia dump for each language (excluding user and talk pages) was
taken as the training data for each language.
However, the size of the Wikipedia for a given language varies greatly, and
therefore low-resource languages may be "under-represented" in terms of the
neural network model (under the assumption that languages are "competing" for
limited model capacity to some extent).
However, the size of a Wikipedia also correlates with the number of speakers of
a language, and we also don't want to overfit the model by performing thousands
of epochs over a tiny Wikipedia for a particular language.
To balance these two factors, we performed exponentially smoothed weighting of
the data during pre-training data creation (and WordPiece vocab creation). In
other words, let's say that the probability of a language is *P(L)*, e.g.,
*P(English) = 0.21* means that after concatenating all of the Wikipedias
together, 21% of our data is English. We exponentiate each probability by some
factor *S* and then re-normalize, and sample from that distribution. In our case
we use *S=0.7*. So, high-resource languages like English will be under-sampled,
and low-resource languages like Icelandic will be over-sampled. E.g., in the
original distribution English would be sampled 1000x more than Icelandic, but
after smoothing it's only sampled 100x more.
### Tokenization
For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are
weighted the same way as the data, so low-resource languages are upweighted by
some factor. We intentionally do *not* use any marker to denote the input
language (so that zero-shot training can work).
Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
characters, we add spaces around every character in the
[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\))
before applying WordPiece. This means that Chinese is effectively
character-tokenized. Note that the CJK Unicode block only includes
Chinese-origin characters and does *not* include Hangul Korean or
Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like
all other languages.
For all other languages, we apply the
[same recipe as English](https://github.com/google-research/bert#tokenization):
(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace
tokenization. We understand that accent markers have substantial meaning in some
languages, but felt that the benefits of reducing the effective vocabulary make
up for this. Generally the strong contextual models of BERT should make up for
any ambiguity introduced by stripping accent markers.
### List of Languages
The multilingual model supports the following languages. These languages were
chosen because they are the top 100 languages with the largest Wikipedias:
* Afrikaans
* Albanian
* Arabic
* Aragonese
* Armenian
* Asturian
* Azerbaijani
* Bashkir
* Basque
* Bavarian
* Belarusian
* Bengali
* Bishnupriya Manipuri
* Bosnian
* Breton
* Bulgarian
* Burmese
* Catalan
* Cebuano
* Chechen
* Chinese (Simplified)
* Chinese (Traditional)
* Chuvash
* Croatian
* Czech
* Danish
* Dutch
* English
* Estonian
* Finnish
* French
* Galician
* Georgian
* German
* Greek
* Gujarati
* Haitian
* Hebrew
* Hindi
* Hungarian
* Icelandic
* Ido
* Indonesian
* Irish
* Italian
* Japanese
* Javanese
* Kannada
* Kazakh
* Kirghiz
* Korean
* Latin
* Latvian
* Lithuanian
* Lombard
* Low Saxon
* Luxembourgish
* Macedonian
* Malagasy
* Malay
* Malayalam
* Marathi
* Minangkabau
* Nepali
* Newar
* Norwegian (Bokmal)
* Norwegian (Nynorsk)
* Occitan
* Persian (Farsi)
* Piedmontese
* Polish
* Portuguese
* Punjabi
* Romanian
* Russian
* Scots
* Serbian
* Serbo-Croatian
* Sicilian
* Slovak
* Slovenian
* South Azerbaijani
* Spanish
* Sundanese
* Swahili
* Swedish
* Tagalog
* Tajik
* Tamil
* Tatar
* Telugu
* Turkish
* Ukrainian
* Urdu
* Uzbek
* Vietnamese
* Volapük
* Waray-Waray
* Welsh
* West Frisian
* Western Punjabi
* Yoruba
The **Multilingual Cased (New)** release contains additionally **Thai** and
**Mongolian**, which were not included in the original release.
@@ -0,0 +1,173 @@
```
# Licensed under the Apache License, Version 2.0 (the "License")
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
```
<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">
# Table Of Contents
- [BERT Question Answering Inference/Fine-Tuning with Mixed Precision](#bert-question-answering-inferencefine-tuning-with-mixed-precision)
- [BioBERT Named-Entity Recognition Inference with Mixed Precision](#biobert-named-entity-recognition-inference-with-mixed-precision)
# BERT Question Answering Inference/Fine-Tuning with Mixed Precision
## 1. Overview
Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks.
The original paper can be found here: https://arxiv.org/abs/1810.04805.
NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.
### 1.a Learning objectives
This repository contains multiple notebooks which demonstrate:
- Inference on QA task with BERT Large model
- The use/download of pretrained NVIDIA BERT models
- Fine-Tuning on SQuaD 2.0 Dataset
- Use of Mixed Precision for Inference and Fine-Tuning
Here is a short description of each relevant file:
- _bert_squad_tf_inference.ipynb_ : BERT Q&A Inference with TF Checkpoint model
- _bert_squad_tf_finetuning.ipynb_ : BERT Fine-Tuning on SQuaD dataset
## 2. Quick Start Guide
### 2.a Build the BERT TensorFlow NGC container:
To run the notebook you first need to build the Bert TensorFlow container using the following command from the main directory of this repository:
``` bash
docker build . --rm -t bert
```
### 2.b Dataset
We need to download the vocabulary and the bert_config files:
``` python3
python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
```
This is only needed during fine-tuning in order to download the Squad dataset:
``` python3
python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
```
### 2.c Start of the NGC container to run inference:
Once the image is built, you need to run the container with the `--publish
0.0.0.0:8888:8888` option to publish Jupyter's port `8888` to the host machine
at port `8888` over all network interfaces (`0.0.0.0`):
```bash
nvidia-docker run \
-v $PWD:/workspace/bert \
-v $PWD/results:/results \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--publish 0.0.0.0:8888:8888 \
-it bert:latest bash
```
Then you can use the following command within the BERT Tensorflow container under
`/workspace/bert`:
```bash
jupyter notebook --ip=0.0.0.0 --allow-root
```
And navigate a web browser to the IP address or hostname of the host machine
at port `8888`:
```
http://[host machine]:8888
```
Use the token listed in the output from running the `jupyter` command to log
in, for example:
```
http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b
```
# BioBERT Named-Entity Recognition Inference with Mixed Precision
## 1. Overview
Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks.
BioBERT is a domain-specific version of BERT that has been trained on PubMed abstracts.
The original BioBERT paper can be found here: https://arxiv.org/abs/1901.08746
NVIDIA's BioBERT is an optimized version of the implementation presented in the paper, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.
### 1.a Learning objectives
This repository contains an example notebook that demonstrates:
- Inference on NER task with BioBERT model
- The use/download of fine-tuned NVIDIA BioBERT models
- Use of Mixed Precision for Inference
Here is a short description of the relevant file:
- _biobert_ner_tf_inference.ipynb_ : BioBERT Inference with TF Checkpoint model
## 2. Quick Start Guide
### 2.a Build the BERT TensorFlow NGC container:
To run the notebook you first need to build the Bert TensorFlow container using the following command from the main directory of this repository:
``` bash
docker build . --rm -t bert
```
### 2.b Start of the NGC container to run inference:
Once the image is built, you need to run the container with the `--publish
0.0.0.0:8888:8888` option to publish Jupyter's port `8888` to the host machine
at port `8888` over all network interfaces (`0.0.0.0`):
```bash
nvidia-docker run \
-v $PWD:/workspace/bert \
-v $PWD/results:/results \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--publish 0.0.0.0:8888:8888 \
-it bert:latest bash
```
Then you can use the following commands within the BERT Tensorflow container under
`/workspace/bert`:
Install spaCy. You'll use this to pre-process text and to visualize the results using displaCy.
```
pip install spacy
python -m spacy download en_core_web_sm
```
Launch Jupyter.
```bash
jupyter notebook --ip=0.0.0.0 --allow-root
```
And navigate a web browser to the IP address or hostname of the host machine
at port `8888`:
```
http://[host machine]:8888
```
Use the token listed in the output from running the `jupyter` command to log
in, for example:
```
http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b
```
@@ -0,0 +1,624 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# BERT Question Answering Fine-Tuning with Mixed Precision"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Overview\n",
"\n",
"Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
"\n",
"The original paper can be found here: https://arxiv.org/abs/1810.04805.\n",
"\n",
"NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.a Learning objectives\n",
"\n",
"This notebook demonstrates:\n",
"- Fine-Tuning on Question Answering (QA) task with BERT Large model\n",
"- The use/download of pretrained NVIDIA BERT models\n",
"- Use of Mixed Precision for Training"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Requirements\n",
"\n",
"Please refer to Section 2. of the ReadMe file"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. BERT Question Answering Task\n",
"\n",
"Here we run QA fine-tuning on a pre-trained BERT model.\n",
"To fine-tune we will use the [SQuaD 1.1 Dataset](https://rajpurkar.github.io/SQuAD-explorer/) which contains 100,000+ question-answer pairs on 500+ articles."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"data_dir = '/workspace/bert/data/download'\n",
"\n",
"# SQuAD json for training\n",
"train_file = os.path.join(data_dir, 'squad/v1.1/train-v1.1.json')\n",
"# json for inference\n",
"predict_file = os.path.join(data_dir, 'squad/v1.1/dev-v1.1.json')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.a Mixed Precision\n",
"\n",
"Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
"\n",
"For information about:\n",
"- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
"- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
"- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we control mixed precision execution with the following flag: "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"use_fp16 = True;\n",
"\n",
"import os\n",
"os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE\"] = \"1\" if use_fp16 else \"0\" \n",
"\n",
"# For detailed debug uncomment the following line:\n",
"#os.environ[\"TF_CPP_VMODULE\"]=\"auto_mixed_precision=2\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Pre-Trained NVIDIA BERT TF Models\n",
"\n",
"Based on the model size, we have the following two default configurations of BERT.\n",
"\n",
"| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |\n",
"|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|\n",
"|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|\n",
"|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|\n",
"\n",
"We will use large pre-trained models available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
"There are many configurations available; in particular, we will download and use the following:\n",
"\n",
"**bert_tf_large_fp16_384**\n",
"\n",
"Which is pre-trained using the Wikipedia and Book corpus datasets as training data. \n",
"We will fine-tune on the SQuaD 1.1 Dataset."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create the folders for the pre-trained models:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# bert_tf_large_fp16_384\n",
"DATA_DIR_FP16 = '/workspace/bert/data/download/pretrained_model_fp16'\n",
"!mkdir -p $DATA_DIR_FP16\n",
"!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_for_tensorflow.zip \\\n",
"https://api.ngc.nvidia.com/v2/models/nvidia/bert_for_tensorflow/versions/1/zip\n",
"!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_for_tensorflow.zip "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the code that follows we will refer to this model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"notebooks_dir = '/workspace/bert/notebooks'\n",
"\n",
"working_dir = '/workspace/bert'\n",
"if working_dir not in sys.path:\n",
" sys.path.append(working_dir)\n",
"\n",
"init_checkpoint = os.path.join(data_dir, 'pretrained_model_fp16/model.ckpt-1000000')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Running QA task fine-tuning\n",
"\n",
"In order to run Q-A fine-tuning we will follow step-by-step a simplified flow implemented in run_squad.py:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import run_squad\n",
"\n",
"import json\n",
"import tensorflow as tf\n",
"import modeling\n",
"import tokenization\n",
"import time\n",
"import random\n",
"\n",
"import optimization\n",
"\n",
"tf.logging.set_verbosity(tf.logging.INFO)\n",
"\n",
"# Create the output directory where all the results are saved.\n",
"output_dir = os.path.join(working_dir, 'results')\n",
"tf.gfile.MakeDirs(output_dir)\n",
"\n",
"# The config json file corresponding to the pre-trained BERT model.\n",
"# This specifies the model architecture.\n",
"bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')\n",
"\n",
"# The vocabulary file that the BERT model was trained on.\n",
"vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')\n",
"\n",
"# Whether to lower case the input text. \n",
"# Should be True for uncased models and False for cased models.\n",
"do_lower_case = True\n",
" \n",
"# Total batch size for predictions\n",
"predict_batch_size = 1\n",
"params = dict([('batch_size', predict_batch_size)])\n",
"\n",
"# The maximum total input sequence length after WordPiece tokenization. \n",
"# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
"max_seq_length = 384\n",
"\n",
"# When splitting up a long document into chunks, how much stride to take between chunks.\n",
"doc_stride = 128\n",
"\n",
"# The maximum number of tokens for the question. \n",
"# Questions longer than this will be truncated to this length.\n",
"max_query_length = 64\n",
"\n",
"# This is a WA to use flags from here:\n",
"flags = tf.flags\n",
"\n",
"if 'f' not in tf.flags.FLAGS: \n",
" tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
"FLAGS = flags.FLAGS\n",
"# FLAGS.verbose_logging = True\n",
"\n",
"# The total number of n-best predictions to generate in the nbest_predictions.json output file.\n",
"n_best_size = 20\n",
"\n",
"# The maximum length of an answer that can be generated. \n",
"# This is needed because the start and end predictions are not conditioned on one another.\n",
"max_answer_length = 30\n",
"\n",
"# The initial learning rate for Adam\n",
"learning_rate = 5e-6\n",
"\n",
"# Total batch size for training\n",
"train_batch_size = 3\n",
"\n",
"# Proportion of training to perform linear learning rate warmup for\n",
"warmup_proportion = 0.1\n",
"\n",
"# # Total number of training epochs to perform (results will improve if trained with epochs)\n",
"num_train_epochs = 2\n",
"\n",
"global_batch_size = train_batch_size\n",
"training_hooks = []\n",
"training_hooks.append(run_squad.LogTrainRunHook(global_batch_size, 0))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create the tokenizer and the training tf_record:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Validate the casing config consistency with the checkpoint name.\n",
"tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
"\n",
"# Create the tokenizer.\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
" \n",
"# Load the configuration from file\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"\n",
"config = tf.ConfigProto(log_device_placement=True) \n",
"\n",
"run_config = tf.estimator.RunConfig(\n",
" model_dir=output_dir,\n",
" session_config=config,\n",
" save_checkpoints_steps=1000,\n",
" keep_checkpoint_max=1)\n",
"\n",
"# Read the training examples from the training file:\n",
"train_examples = run_squad.read_squad_examples(input_file=train_file, is_training=True)\n",
"\n",
"num_train_steps = int(len(train_examples) / global_batch_size * num_train_epochs)\n",
"num_warmup_steps = int(num_train_steps * warmup_proportion)\n",
"\n",
"# Pre-shuffle the input to avoid having to make a very large shuffle\n",
"# buffer in in the `input_fn`.\n",
"rng = random.Random(12345)\n",
"rng.shuffle(train_examples)\n",
"\n",
"start_index = 0 \n",
"end_index = len(train_examples)\n",
"tmp_filenames = os.path.join(output_dir, \"train.tf_record\")\n",
"\n",
"# We write to a temporary file to avoid storing very large constant tensors\n",
"# in memory.\n",
"train_writer = run_squad.FeatureWriter(\n",
" filename=tmp_filenames,\n",
" is_training=True)\n",
"\n",
"run_squad.convert_examples_to_features(\n",
" examples=train_examples[start_index:end_index],\n",
" tokenizer=tokenizer,\n",
" max_seq_length=max_seq_length,\n",
" doc_stride=doc_stride,\n",
" max_query_length=max_query_length,\n",
" is_training=True,\n",
" output_fn=train_writer.process_feature)\n",
"\n",
"train_writer.close()\n",
"\n",
"tf.logging.info(\"***** Running training *****\")\n",
"tf.logging.info(\" Num orig examples = %d\", end_index - start_index)\n",
"tf.logging.info(\" Num split examples = %d\", train_writer.num_features)\n",
"tf.logging.info(\" Batch size = %d\", train_batch_size)\n",
"tf.logging.info(\" Num steps = %d\", num_train_steps)\n",
"tf.logging.info(\" LR = %f\", learning_rate)\n",
"\n",
"del train_examples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We need to create the model for the estimator:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n",
" unique_ids = features[\"unique_ids\"]\n",
" input_ids = features[\"input_ids\"]\n",
" input_mask = features[\"input_mask\"]\n",
" segment_ids = features[\"segment_ids\"]\n",
" \n",
" is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
"\n",
" (start_logits, end_logits) = run_squad.create_model(\n",
" bert_config=bert_config,\n",
" is_training=is_training,\n",
" input_ids=input_ids,\n",
" input_mask=input_mask,\n",
" segment_ids=segment_ids,\n",
" use_one_hot_embeddings=False)\n",
"\n",
" tvars = tf.trainable_variables()\n",
"\n",
" initialized_variable_names = {}\n",
" if init_checkpoint:\n",
" (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
" tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
"\n",
" output_spec = None\n",
" if mode == tf.estimator.ModeKeys.TRAIN:\n",
" seq_length = modeling.get_shape_list(input_ids)[1]\n",
" \n",
" def compute_loss(logits, positions):\n",
" one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32)\n",
" log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
" loss = -tf.reduce_mean(tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\n",
" return loss\n",
"\n",
" start_positions = features[\"start_positions\"]\n",
" end_positions = features[\"end_positions\"]\n",
" start_loss = compute_loss(start_logits, start_positions)\n",
" end_loss = compute_loss(end_logits, end_positions)\n",
" total_loss = (start_loss + end_loss) / 2.0\n",
" \n",
" train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, None, False, use_fp16)\n",
" \n",
" output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op)\n",
" \n",
" elif mode == tf.estimator.ModeKeys.PREDICT:\n",
" predictions = {\n",
" \"unique_ids\": unique_ids,\n",
" \"start_logits\": start_logits,\n",
" \"end_logits\": end_logits,\n",
" }\n",
" output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
"\n",
" return output_spec\n",
"\n",
"estimator = tf.estimator.Estimator(\n",
" model_fn=model_fn,\n",
" config=run_config,\n",
" params=params)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.a Fine Tuning\n",
"\n",
"Fine tuning is performed using the run_squad.py.\n",
"\n",
"The run_squad.sh script trains a model and performs evaluation on the SQuaD v1.1 dataset. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"train_input_fn = run_squad.input_fn_builder(\n",
" input_file=tmp_filenames,\n",
" batch_size=train_batch_size,\n",
" seq_length=max_seq_length,\n",
" is_training=True,\n",
" drop_remainder=True,\n",
" hvd=None)\n",
"\n",
"train_start_time = time.time()\n",
"estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=num_train_steps)\n",
"train_time_elapsed = time.time() - train_start_time\n",
"train_time_wo_startup = training_hooks[-1].total_time\n",
"\n",
"avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_wo_startup if train_time_wo_startup else 0\n",
"\n",
"tf.logging.info(\"-----------------------------\")\n",
"tf.logging.info(\"Total Training Time = %0.2f Training Time W/O start up overhead = %0.2f \"\n",
" \"Sentences processed = %d\", train_time_elapsed, train_time_wo_startup,\n",
" num_train_steps * global_batch_size)\n",
"tf.logging.info(\"Training Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
"tf.logging.info(\"-----------------------------\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.b Inference\n",
"\n",
"Now we run inference with the fine-tuned model just saved:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"eval_examples = run_squad.read_squad_examples(\n",
" input_file=predict_file, is_training=False)\n",
"\n",
"eval_writer = run_squad.FeatureWriter(\n",
" filename=os.path.join(output_dir, \"eval.tf_record\"),\n",
" is_training=False)\n",
"\n",
"eval_features = []\n",
"def append_feature(feature):\n",
" eval_features.append(feature)\n",
" eval_writer.process_feature(feature)\n",
"\n",
"\n",
"# Loads a data file into a list of InputBatch's\n",
"run_squad.convert_examples_to_features(\n",
" examples=eval_examples,\n",
" tokenizer=tokenizer,\n",
" max_seq_length=max_seq_length,\n",
" doc_stride=doc_stride,\n",
" max_query_length=max_query_length,\n",
" is_training=False,\n",
" output_fn=append_feature)\n",
"\n",
"eval_writer.close()\n",
"\n",
"tf.logging.info(\"***** Running predictions *****\")\n",
"tf.logging.info(\" Num orig examples = %d\", len(eval_examples))\n",
"tf.logging.info(\" Num split examples = %d\", len(eval_features))\n",
"tf.logging.info(\" Batch size = %d\", predict_batch_size)\n",
"\n",
"predict_input_fn = run_squad.input_fn_builder(\n",
" input_file=eval_writer.filename,\n",
" batch_size=predict_batch_size,\n",
" seq_length=max_seq_length,\n",
" is_training=False,\n",
" drop_remainder=False)\n",
"\n",
"all_results = []\n",
"eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]\n",
"eval_start_time = time.time()\n",
"for result in estimator.predict(\n",
" predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=None):\n",
" unique_id = int(result[\"unique_ids\"])\n",
" start_logits = [float(x) for x in result[\"start_logits\"].flat]\n",
" end_logits = [float(x) for x in result[\"end_logits\"].flat]\n",
" all_results.append(\n",
" run_squad.RawResult(\n",
" unique_id=unique_id,\n",
" start_logits=start_logits,\n",
" end_logits=end_logits))\n",
"\n",
"eval_time_elapsed = time.time() - eval_start_time\n",
"eval_time_wo_startup = eval_hooks[-1].total_time\n",
"num_sentences = eval_hooks[-1].count * predict_batch_size\n",
"avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup\n",
"\n",
"tf.logging.info(\"-----------------------------\")\n",
"tf.logging.info(\"Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f \"\n",
" \"Sentences processed = %d\", eval_time_elapsed, eval_time_wo_startup,\n",
" num_sentences)\n",
"tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
"tf.logging.info(\"-----------------------------\")\n",
"\n",
"output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
"output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
"output_null_log_odds_file = os.path.join(output_dir, \"null_odds.json\")\n",
"\n",
"run_squad.write_predictions(eval_examples, eval_features, all_results,\n",
" n_best_size, max_answer_length,\n",
" do_lower_case, output_prediction_file,\n",
" output_nbest_file, output_null_log_odds_file)\n",
"\n",
"tf.logging.info(\"Inference Results:\")\n",
"\n",
"# Here we show only the prediction results, nbest prediction is also available in the output directory\n",
"results = \"\"\n",
"with open(output_prediction_file, 'r') as json_file:\n",
" data = json.load(json_file)\n",
" for question in eval_examples:\n",
" results += \"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(question.qas_id, question.question_text, data[question.qas_id])\n",
"\n",
"\n",
"from IPython.display import display, HTML\n",
"display(HTML(\"<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>\".format(results))) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.b Evaluation\n",
"\n",
"Let's run evaluation using the script in the SQuaD1.1 folder and our fine-tuned model:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!python /workspace/bert/data/download/squad/v1.1/evaluate-v1.1.py \\\n",
" $predict_file \\\n",
" $output_dir/predictions.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. What's next\n",
"\n",
"Now that you have fine-tuned a BERT model you may want to take a look ad the run_squad script which containd more options for fine-tuning."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,577 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# BERT Question Answering Inference with Mixed Precision\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Overview\n",
"\n",
"Bidirectional Embedding Representations from Transformers (BERT), is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
"\n",
"The original paper can be found here: https://arxiv.org/abs/1810.04805.\n",
"\n",
"NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUS for faster training times while maintaining target accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.a Learning objectives\n",
"\n",
"This notebook demonstrates:\n",
"- Inference on QA task with BERT Large model\n",
"- The use/download of fine-tuned NVIDIA BERT models\n",
"- Use of Mixed Precision for Inference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Requirements\n",
"\n",
"Please refer to the ReadMe file"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. BERT Inference: Question Answering\n",
"\n",
"We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
"\n",
"Here we use a BERT model fine-tuned on a [SQuaD 2.0 Dataset](https://rajpurkar.github.io/SQuAD-explorer/) which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.a Paragraph and Queries\n",
"\n",
"In this example we will ask our BERT model questions related to the following paragraph:\n",
"\n",
"**The Apollo Program**\n",
"_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
"\n",
"The questions and relative answers expected are shown below:\n",
"\n",
" - **Q1:** \"What project put the first Americans into space?\" \n",
" - **A1:** \"Project Mercury\"\n",
" - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
" - **A2:** \"The Apollo program\"\n",
" - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
" - **A3:** \"1968\"\n",
" - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
" - **A4:** \"John F. Kennedy\"\n",
" - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
" - **A5:** \"Soviet Union\"\n",
" - **Q6:** \"How long did Project Apollo run?\"\n",
" - **A6:** \"1961 to 1972\"\n",
" - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
" - **A7:** \"Gemini Mission\"\n",
" - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
" - **A8:** \"Skylab\"\n",
" \n",
"---\n",
"\n",
"The paragraph and the questions can be easily customized by changing the code below:\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile input.json\n",
"{\"data\": \n",
" [\n",
" {\"title\": \"Project Apollo\",\n",
" \"paragraphs\": [\n",
" {\"context\":\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\", \n",
" \"qas\": [\n",
" { \"question\": \"What project put the first Americans into space?\", \n",
" \"id\": \"Q1\"\n",
" },\n",
" { \"question\": \"What program was created to carry out these projects and missions?\",\n",
" \"id\": \"Q2\"\n",
" },\n",
" { \"question\": \"What year did the first manned Apollo flight occur?\",\n",
" \"id\": \"Q3\"\n",
" }, \n",
" { \"question\": \"What President is credited with the original notion of putting Americans in space?\",\n",
" \"id\": \"Q4\"\n",
" },\n",
" { \"question\": \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\",\n",
" \"id\": \"Q5\"\n",
" },\n",
" { \"question\": \"How long did Project Apollo run?\",\n",
" \"id\": \"Q6\"\n",
" }, \n",
" { \"question\": \"What program helped develop space travel techniques that Project Apollo used?\",\n",
" \"id\": \"Q7\"\n",
" }, \n",
" {\"question\": \"What space station supported three manned missions in 1973-1974?\",\n",
" \"id\": \"Q8\"\n",
" } \n",
"]}]}]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"notebooks_dir = '/workspace/bert/notebooks'\n",
"data_dir = '/workspace/bert/data/download'\n",
"\n",
"working_dir = '/workspace/bert'\n",
"if working_dir not in sys.path:\n",
" sys.path.append(working_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_file = os.path.join(notebooks_dir, 'input.json')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.b Mixed Precision\n",
"\n",
"Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
"\n",
"For information about:\n",
"- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
"- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
"- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we control mixed precision execution with the environmental variable:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION\"] = \"1\" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can choose the mixed precision model (which takes much less time to train than the fp32 version) without losing accuracy, with the following flag: "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"use_mixed_precision_model = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To effectively evaluate the speedup of mixed precision try a bigger workload by uncommenting the following line:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#input_file = '/workspace/bert/data/download/squad/v2.0/dev-v2.0.json'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Fine-Tuned NVIDIA BERT TF Models\n",
"\n",
"Based on the model size, we have the following two default configurations of BERT.\n",
"\n",
"| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |\n",
"|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|\n",
"|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|\n",
"|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|\n",
"\n",
"We will take advantage of the fine-tuned models available on NGC (NVIDIA GPU Cluster, https://ngc.nvidia.com).\n",
"Among the many configurations available we will download these two:\n",
"\n",
" - **bert_tf_v2_large_fp32_384**\n",
"\n",
" - **bert_tf_v2_large_fp16_384**\n",
"\n",
"Which are trained on the SQuaD 2.0 Dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# bert_tf_v2_large_fp32_384\n",
"DATA_DIR_FP32='/workspace/bert/data/download/finetuned_model_fp32'\n",
"!mkdir -p $DATA_DIR_FP32\n",
"!wget -nc -q --show-progress -O $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \\\n",
"https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp32_384/versions/1/zip\n",
"!unzip -n -d $DATA_DIR_FP32/ $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \n",
" \n",
"# bert_tf_v2_large_fp16_384\n",
"DATA_DIR_FP16='/workspace/bert/data/download/finetuned_model_fp16'\n",
"!mkdir -p $DATA_DIR_FP16\n",
"!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip \\\n",
"https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp16_384/versions/1/zip\n",
"!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the code that follows we will refer to these models."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Running QA task inference\n",
"\n",
"In order to run QA inference we will follow step-by-step the flow implemented in run_squad.py.\n",
"\n",
"Configuration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import run_squad\n",
"import json\n",
"import tensorflow as tf\n",
"import modeling\n",
"import tokenization\n",
"import time\n",
"import random\n",
"\n",
"tf.logging.set_verbosity(tf.logging.INFO)\n",
"\n",
"# Create the output directory where all the results are saved.\n",
"output_dir = os.path.join(working_dir, 'results')\n",
"tf.gfile.MakeDirs(output_dir)\n",
"\n",
"# The config json file corresponding to the pre-trained BERT model.\n",
"# This specifies the model architecture.\n",
"bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')\n",
"\n",
"# The vocabulary file that the BERT model was trained on.\n",
"vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')\n",
"\n",
"# Depending on the mixed precision flag we use different fine-tuned model\n",
"if use_mixed_precision_model:\n",
" init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp16/model.ckpt-8144')\n",
"else:\n",
" init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp32/model.ckpt-8144')\n",
"\n",
"# Whether to lower case the input text. \n",
"# Should be True for uncased models and False for cased models.\n",
"do_lower_case = True\n",
" \n",
"# Total batch size for predictions\n",
"predict_batch_size = 1\n",
"params = dict([('batch_size', predict_batch_size)])\n",
"\n",
"# The maximum total input sequence length after WordPiece tokenization. \n",
"# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
"max_seq_length = 384\n",
"\n",
"# When splitting up a long document into chunks, how much stride to take between chunks.\n",
"doc_stride = 128\n",
"\n",
"# The maximum number of tokens for the question. \n",
"# Questions longer than this will be truncated to this length.\n",
"max_query_length = 64\n",
"\n",
"# This is a WA to use flags from here:\n",
"flags = tf.flags\n",
"\n",
"if 'f' not in tf.flags.FLAGS: \n",
" tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
"FLAGS = flags.FLAGS\n",
"\n",
"# The total number of n-best predictions to generate in the nbest_predictions.json output file.\n",
"n_best_size = 20\n",
"\n",
"# The maximum length of an answer that can be generated. \n",
"# This is needed because the start and end predictions are not conditioned on one another.\n",
"max_answer_length = 30"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's define the tokenizer and create the model for the estimator:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Validate the casing config consistency with the checkpoint name.\n",
"tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
"\n",
"# Create the tokenizer.\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
"\n",
"# Load the configuration from file\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"\n",
"def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n",
" unique_ids = features[\"unique_ids\"]\n",
" input_ids = features[\"input_ids\"]\n",
" input_mask = features[\"input_mask\"]\n",
" segment_ids = features[\"segment_ids\"]\n",
"\n",
" (start_logits, end_logits) = run_squad.create_model(\n",
" bert_config=bert_config,\n",
" is_training=False,\n",
" input_ids=input_ids,\n",
" input_mask=input_mask,\n",
" segment_ids=segment_ids,\n",
" use_one_hot_embeddings=False)\n",
"\n",
" tvars = tf.trainable_variables()\n",
"\n",
" initialized_variable_names = {}\n",
" (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
" tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
" output_spec = None\n",
" predictions = {\"unique_ids\": unique_ids,\n",
" \"start_logits\": start_logits,\n",
" \"end_logits\": end_logits}\n",
" output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
" return output_spec\n",
"\n",
"config = tf.ConfigProto(log_device_placement=True) \n",
"\n",
"run_config = tf.estimator.RunConfig(\n",
" model_dir=None,\n",
" session_config=config,\n",
" save_checkpoints_steps=1000,\n",
" keep_checkpoint_max=1)\n",
"\n",
"estimator = tf.estimator.Estimator(\n",
" model_fn=model_fn,\n",
" config=run_config,\n",
" params=params)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.a Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"eval_examples = run_squad.read_squad_examples(\n",
" input_file=input_file, is_training=False)\n",
"\n",
"eval_writer = run_squad.FeatureWriter(\n",
" filename=os.path.join(output_dir, \"eval.tf_record\"),\n",
" is_training=False)\n",
"\n",
"eval_features = []\n",
"def append_feature(feature):\n",
" eval_features.append(feature)\n",
" eval_writer.process_feature(feature)\n",
"\n",
"\n",
"# Loads a data file into a list of InputBatch's\n",
"run_squad.convert_examples_to_features(\n",
" examples=eval_examples,\n",
" tokenizer=tokenizer,\n",
" max_seq_length=max_seq_length,\n",
" doc_stride=doc_stride,\n",
" max_query_length=max_query_length,\n",
" is_training=False,\n",
" output_fn=append_feature)\n",
"\n",
"eval_writer.close()\n",
"\n",
"tf.logging.info(\"***** Running predictions *****\")\n",
"tf.logging.info(\" Num orig examples = %d\", len(eval_examples))\n",
"tf.logging.info(\" Num split examples = %d\", len(eval_features))\n",
"tf.logging.info(\" Batch size = %d\", predict_batch_size)\n",
"\n",
"predict_input_fn = run_squad.input_fn_builder(\n",
" input_file=eval_writer.filename,\n",
" batch_size=predict_batch_size,\n",
" seq_length=max_seq_length,\n",
" is_training=False,\n",
" drop_remainder=False)\n",
"\n",
"all_results = []\n",
"eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]\n",
"eval_start_time = time.time()\n",
"for result in estimator.predict(\n",
" predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=init_checkpoint):\n",
" unique_id = int(result[\"unique_ids\"])\n",
" start_logits = [float(x) for x in result[\"start_logits\"].flat]\n",
" end_logits = [float(x) for x in result[\"end_logits\"].flat]\n",
" all_results.append(\n",
" run_squad.RawResult(\n",
" unique_id=unique_id,\n",
" start_logits=start_logits,\n",
" end_logits=end_logits))\n",
"\n",
"eval_time_elapsed = time.time() - eval_start_time\n",
"\n",
"eval_time_wo_startup = eval_hooks[-1].total_time\n",
"num_sentences = eval_hooks[-1].count * predict_batch_size\n",
"avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup\n",
"\n",
"tf.logging.info(\"-----------------------------\")\n",
"tf.logging.info(\"Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f \"\n",
" \"Sentences processed = %d\", eval_time_elapsed, eval_time_wo_startup,\n",
" num_sentences)\n",
"tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
"tf.logging.info(\"-----------------------------\")\n",
"\n",
"output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
"output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
"output_null_log_odds_file = os.path.join(output_dir, \"null_odds.json\")\n",
"\n",
"run_squad.write_predictions(eval_examples, eval_features, all_results,\n",
" n_best_size, max_answer_length,\n",
" do_lower_case, output_prediction_file,\n",
" output_nbest_file, output_null_log_odds_file)\n",
"\n",
"tf.logging.info(\"Inference Results:\")\n",
"\n",
"# Here we show only the prediction results, nbest prediction is also available in the output directory\n",
"results = \"\"\n",
"with open(output_prediction_file, 'r') as json_file:\n",
" data = json.load(json_file)\n",
" for question in eval_examples:\n",
" results += \"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(question.qas_id, question.question_text, data[question.qas_id])\n",
"\n",
"\n",
"from IPython.display import display, HTML\n",
"display(HTML(\"<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>\".format(results))) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. What's next"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that you are familiar with running QA Inference on BERT, using mixed precision, you may want to try\n",
"your own paragraphs and queries. \n",
"\n",
"You may also want to take a look to the notebook __bert_squad_tf_finetuning.ipynb__ on how to run fine-tuning on BERT, available in the same directory."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -0,0 +1,765 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jDXroBuNw60P"
},
"outputs": [],
"source": [
"# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/notebooks/bert_squad_tf_inference_colab.ipynb#scrollTo=5hRb96NKE3X0\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "k-XnFINow60d"
},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# BERT Question Answering Inference with Mixed Precision\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "TfF7V662w60j"
},
"source": [
"## 1. Overview\n",
"\n",
"Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
"\n",
"The original paper can be found here: https://arxiv.org/abs/1810.04805.\n",
"\n",
"NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ah3Lv9zyw60l"
},
"source": [
"### 1.a Learning objectives\n",
"\n",
"This notebook demonstrates:\n",
"- Inference on QA task with BERT Large model\n",
"- The use/download of fine-tuned NVIDIA BERT models\n",
"- Use of Mixed Precision for Inference"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hxNJ8HByw60o"
},
"source": [
"## 2. Requirements\n",
"\n",
"### 2.a GPU\n",
"\n",
"Before running this notebook, please set the Colab runtime environment to GPU via the menu *Runtime => Change runtime type => GPU*.\n",
"\n",
"This demo will work on any NVIDIA GPU with CUDA cores, though for improved FP16 inference, a Volta, Turing or newer generation GPU with Tensor cores is desired. On Google Colab, this normally means a T4 GPU. If you are assigned an older K80 GPU, another trial at another time might give you a T4 GPU."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hxNJ8HByw60o"
},
"source": [
"### 2.b Download the required files from NVIDIA-Github:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "KV_WnOY4zUa_"
},
"outputs": [],
"source": [
"!wget -nc -q --show-progress -O ./master.zip \\\n",
"https://github.com/NVIDIA/DeepLearningExamples/archive/master.zip\n",
"!unzip -q -n -d . ./master.zip "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5D7i7Pao5qoj"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"WORKSPACE_DIR='./DeepLearningExamples-master/TensorFlow/LanguageModeling/BERT/'\n",
"os.chdir(WORKSPACE_DIR)\n",
"print (os.getcwd())"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mjlZbP0dw60r"
},
"source": [
"## 3. BERT Inference: Question Answering\n",
"\n",
"We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
"\n",
"Here we use a BERT model fine-tuned on the [SQuAD 2.0 dataset](https://rajpurkar.github.io/SQuAD-explorer/), which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mOc16svBw60t"
},
"source": [
"### 3.a Paragraph and Queries\n",
"\n",
"In this example we will ask our BERT model questions related to the following paragraph:\n",
"\n",
"**The Apollo Program**\n",
"_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
"\n",
"The questions and their expected answers are shown below:\n",
"\n",
" - **Q1:** \"What project put the first Americans into space?\" \n",
" - **A1:** \"Project Mercury\"\n",
" - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
" - **A2:** \"The Apollo program\"\n",
" - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
" - **A3:** \"1968\"\n",
" - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
" - **A4:** \"John F. Kennedy\"\n",
" - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
" - **A5:** \"Soviet Union\"\n",
" - **Q6:** \"How long did Project Apollo run?\"\n",
" - **A6:** \"1961 to 1972\"\n",
" - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
" - **A7:** \"Gemini Mission\"\n",
" - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
" - **A8:** \"Skylab\"\n",
" \n",
"---\n",
"\n",
"The paragraph and the questions can be easily customized by changing the code below:\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "srU0TT1Iw60v"
},
"outputs": [],
"source": [
"%%writefile input.json\n",
"{\"data\": \n",
" [\n",
" {\"title\": \"Project Apollo\",\n",
" \"paragraphs\": [\n",
" {\"context\":\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\", \n",
" \"qas\": [\n",
" { \"question\": \"What project put the first Americans into space?\", \n",
" \"id\": \"Q1\"\n",
" },\n",
" { \"question\": \"What program was created to carry out these projects and missions?\",\n",
" \"id\": \"Q2\"\n",
" },\n",
" { \"question\": \"What year did the first manned Apollo flight occur?\",\n",
" \"id\": \"Q3\"\n",
" }, \n",
" { \"question\": \"What President is credited with the original notion of putting Americans in space?\",\n",
" \"id\": \"Q4\"\n",
" },\n",
" { \"question\": \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\",\n",
" \"id\": \"Q5\"\n",
" },\n",
" { \"question\": \"How long did Project Apollo run?\",\n",
" \"id\": \"Q6\"\n",
" }, \n",
" { \"question\": \"What program helped develop space travel techniques that Project Apollo used?\",\n",
" \"id\": \"Q7\"\n",
" }, \n",
" {\"question\": \"What space station supported three manned missions in 1973-1974?\",\n",
" \"id\": \"Q8\"\n",
" } \n",
"]}]}]}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ujyka-8Iw603"
},
"outputs": [],
"source": [
"import sys\n",
"\n",
"working_dir = os.getcwd();\n",
"data_dir = os.path.join(working_dir, 'data/download');\n",
"if working_dir not in sys.path:\n",
" sys.path.append(working_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "6gA3-6LVw61D"
},
"outputs": [],
"source": [
"input_file = os.path.join(working_dir, 'input.json')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "D9p8XaBnw61N"
},
"source": [
"### 3.b Mixed Precision\n",
"\n",
"Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
"\n",
"For information about:\n",
"- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
"- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
"- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ceeYPqQcw61P"
},
"source": [
"In this notebook we control mixed precision execution with the following environment variable:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "k4jIJevFw61R"
},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION\"] = \"1\" "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "rt_4-ZA5w61Y"
},
"source": [
"We can choose the mixed precision model (which takes much less time to train than the fp32 version) without losing accuracy, with the following flag: "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BRdclfEaw61Z"
},
"outputs": [],
"source": [
"use_mixed_precision_model = True"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "iu4Jb5puw61p"
},
"source": [
"## 4. Fine-Tuned NVIDIA BERT TF Models\n",
"\n",
"Based on the model size, we have the following two default configurations of BERT.\n",
"\n",
"| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |\n",
"|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|\n",
"|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|\n",
"|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|\n",
"\n",
"We will take advantage of the fine-tuned models available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
"Among the many configurations available we will download these two:\n",
"\n",
" - **bert_tf_v2_large_fp32_384**\n",
"\n",
" - **bert_tf_v2_large_fp16_384**\n",
"\n",
"Both models were fine-tuned on the SQuAD 2.0 dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5JWKZfP8w61t"
},
"outputs": [],
"source": [
"# bert_tf_v2_large_fp32_384\n",
"DATA_DIR_FP32 = os.path.join(data_dir, 'finetuned_model_fp32')\n",
"!mkdir -p $DATA_DIR_FP32\n",
"!wget -nc -q --show-progress -O $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \\\n",
"https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp32_384/versions/1/zip\n",
"!unzip -n -d $DATA_DIR_FP32/ $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \n",
" \n",
"# bert_tf_v2_large_fp16_384\n",
"DATA_DIR_FP16 = os.path.join(data_dir, 'finetuned_model_fp16')\n",
"!mkdir -p $DATA_DIR_FP16\n",
"!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip \\\n",
"https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp16_384/versions/1/zip\n",
"!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "GrFrZickw61z"
},
"source": [
"In the code that follows we will refer to these models."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "cU8mGJDa1FfX"
},
"source": [
"Download the Google pretrained weights and vocab file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5hRb96NKE3X0"
},
"outputs": [],
"source": [
"os.chdir(\"./data\");\n",
"from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader\n",
"gd = GooglePretrainedWeightDownloader(data_dir)\n",
"gd.download()\n",
"os.chdir(\"..\");"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "VY1Dipam15DE"
},
"source": [
"We need the horovod package:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "jqAJob92C2wA"
},
"outputs": [],
"source": [
"try:\n",
" __import__(\"horovod\")\n",
"except ImportError:\n",
" os.system(\"pip install horovod\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "5NuuGNsDw611"
},
"source": [
"## 5. Running QA task inference\n",
"\n",
"In order to run QA inference we will follow step-by-step the flow implemented in run_squad.py.\n",
"\n",
"Configuration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "_c2qCQ9-w613"
},
"outputs": [],
"source": [
"import run_squad\n",
"import json\n",
"import tensorflow as tf\n",
"import modeling\n",
"import tokenization\n",
"import time\n",
"import random\n",
"\n",
"tf.logging.set_verbosity(tf.logging.INFO)\n",
"\n",
"# Create the output directory where all the results are saved.\n",
"output_dir = os.path.join(working_dir, 'results')\n",
"tf.gfile.MakeDirs(output_dir)\n",
"\n",
"# The config json file corresponding to the pre-trained BERT model.\n",
"# This specifies the model architecture.\n",
"bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')\n",
"\n",
"# The vocabulary file that the BERT model was trained on.\n",
"vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')\n",
"\n",
"# Depending on the mixed precision flag we use different fine-tuned model\n",
"if use_mixed_precision_model:\n",
" init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp16/model.ckpt-8144')\n",
"else:\n",
" init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp32/model.ckpt-8144')\n",
"\n",
"# Whether to lower case the input text. \n",
"# Should be True for uncased models and False for cased models.\n",
"do_lower_case = True\n",
" \n",
"# Total batch size for predictions\n",
"predict_batch_size = 1\n",
"params = dict([('batch_size', predict_batch_size)])\n",
"\n",
"# The maximum total input sequence length after WordPiece tokenization. \n",
"# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
"max_seq_length = 384\n",
"\n",
"# When splitting up a long document into chunks, how much stride to take between chunks.\n",
"doc_stride = 128\n",
"\n",
"# The maximum number of tokens for the question. \n",
"# Questions longer than this will be truncated to this length.\n",
"max_query_length = 64\n",
"\n",
"# This is a WA to use flags from here:\n",
"flags = tf.flags\n",
"\n",
"if 'f' not in tf.flags.FLAGS: \n",
" tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
"FLAGS = flags.FLAGS\n",
"\n",
"# The total number of n-best predictions to generate in the nbest_predictions.json output file.\n",
"n_best_size = 20\n",
"\n",
"# The maximum length of an answer that can be generated. \n",
"# This is needed because the start and end predictions are not conditioned on one another.\n",
"max_answer_length = 30"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "2h_eLUgPw618"
},
"source": [
"Let's define the tokenizer and create the model for the estimator:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RXHdoUb9w619"
},
"outputs": [],
"source": [
"# Validate the casing config consistency with the checkpoint name.\n",
"tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
"\n",
"# Create the tokenizer.\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
"\n",
"# Load the configuration from file\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"\n",
"def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n",
" unique_ids = features[\"unique_ids\"]\n",
" input_ids = features[\"input_ids\"]\n",
" input_mask = features[\"input_mask\"]\n",
" segment_ids = features[\"segment_ids\"]\n",
"\n",
" (start_logits, end_logits) = run_squad.create_model(\n",
" bert_config=bert_config,\n",
" is_training=False,\n",
" input_ids=input_ids,\n",
" input_mask=input_mask,\n",
" segment_ids=segment_ids,\n",
" use_one_hot_embeddings=False)\n",
"\n",
" tvars = tf.trainable_variables()\n",
"\n",
" initialized_variable_names = {}\n",
" (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
" tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
" output_spec = None\n",
" predictions = {\"unique_ids\": unique_ids,\n",
" \"start_logits\": start_logits,\n",
" \"end_logits\": end_logits}\n",
" output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
" return output_spec\n",
"\n",
"config = tf.ConfigProto(log_device_placement=True) \n",
"\n",
"run_config = tf.estimator.RunConfig(\n",
" model_dir=None,\n",
" session_config=config,\n",
" save_checkpoints_steps=1000,\n",
" keep_checkpoint_max=1)\n",
"\n",
"estimator = tf.estimator.Estimator(\n",
" model_fn=model_fn,\n",
" config=run_config,\n",
" params=params)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xSKkf4JLw62E"
},
"source": [
"### 5.a Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "3OKhc349w62F",
"scrolled": true
},
"outputs": [],
"source": [
"eval_examples = run_squad.read_squad_examples(\n",
" input_file=input_file, is_training=False)\n",
"\n",
"eval_writer = run_squad.FeatureWriter(\n",
" filename=os.path.join(output_dir, \"eval.tf_record\"),\n",
" is_training=False)\n",
"\n",
"eval_features = []\n",
"def append_feature(feature):\n",
" eval_features.append(feature)\n",
" eval_writer.process_feature(feature)\n",
"\n",
"\n",
"# Loads a data file into a list of InputBatch's\n",
"run_squad.convert_examples_to_features(\n",
" examples=eval_examples,\n",
" tokenizer=tokenizer,\n",
" max_seq_length=max_seq_length,\n",
" doc_stride=doc_stride,\n",
" max_query_length=max_query_length,\n",
" is_training=False,\n",
" output_fn=append_feature)\n",
"\n",
"eval_writer.close()\n",
"\n",
"tf.logging.info(\"***** Running predictions *****\")\n",
"tf.logging.info(\" Num orig examples = %d\", len(eval_examples))\n",
"tf.logging.info(\" Num split examples = %d\", len(eval_features))\n",
"tf.logging.info(\" Batch size = %d\", predict_batch_size)\n",
"\n",
"predict_input_fn = run_squad.input_fn_builder(\n",
" input_file=eval_writer.filename,\n",
" batch_size=predict_batch_size,\n",
" seq_length=max_seq_length,\n",
" is_training=False,\n",
" drop_remainder=False)\n",
"\n",
"all_results = []\n",
"eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]\n",
"eval_start_time = time.time()\n",
"for result in estimator.predict(\n",
" predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=init_checkpoint):\n",
" unique_id = int(result[\"unique_ids\"])\n",
" start_logits = [float(x) for x in result[\"start_logits\"].flat]\n",
" end_logits = [float(x) for x in result[\"end_logits\"].flat]\n",
" all_results.append(\n",
" run_squad.RawResult(\n",
" unique_id=unique_id,\n",
" start_logits=start_logits,\n",
" end_logits=end_logits))\n",
"\n",
"eval_time_elapsed = time.time() - eval_start_time\n",
"\n",
"eval_time_wo_startup = eval_hooks[-1].total_time\n",
"num_sentences = eval_hooks[-1].count * predict_batch_size\n",
"avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup\n",
"\n",
"tf.logging.info(\"-----------------------------\")\n",
"tf.logging.info(\"Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f \"\n",
" \"Sentences processed = %d\", eval_time_elapsed, eval_time_wo_startup,\n",
" num_sentences)\n",
"tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
"tf.logging.info(\"-----------------------------\")\n",
"\n",
"output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
"output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
"output_null_log_odds_file = os.path.join(output_dir, \"null_odds.json\")\n",
"\n",
"run_squad.write_predictions(eval_examples, eval_features, all_results,\n",
" n_best_size, max_answer_length,\n",
" do_lower_case, output_prediction_file,\n",
" output_nbest_file, output_null_log_odds_file)\n",
"\n",
"tf.logging.info(\"Inference Results:\")\n",
"\n",
"# Here we show only the prediction results, nbest prediction is also available in the output directory\n",
"results = \"\"\n",
"with open(output_prediction_file, 'r') as json_file:\n",
" data = json.load(json_file)\n",
" for question in eval_examples:\n",
" results += \"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(question.qas_id, question.question_text, data[question.qas_id])\n",
"\n",
"\n",
"from IPython.display import display, HTML\n",
"display(HTML(\"<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>\".format(results))) "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "EMT0sKxHw62L"
},
"source": [
"## 6. What's next"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mKBM_UD6w62N"
},
"source": [
"Now that you are familiar with running QA Inference on BERT, using mixed precision, you may want to try\n",
"your own paragraphs and queries. \n",
"\n",
"You may also want to take a look at the notebook __bert_squad_tf_finetuning.ipynb__, available in the same directory, to learn how to run fine-tuning on BERT."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "bert_squad_tf_inference.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
@@ -0,0 +1,610 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
"#\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
"\n",
"# BioBERT Named-Entity Recognition Inference with Mixed Precision\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Overview\n",
"\n",
"Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations that obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
"\n",
"BioBERT is a domain specific version of BERT that has been trained on PubMed abstracts.\n",
"\n",
"The original BioBERT paper can be found here: https://arxiv.org/abs/1901.08746\n",
"\n",
"NVIDIA's BioBERT is an optimized version of the implementation presented in the paper, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.a Learning objectives\n",
"\n",
"This notebook demonstrates:\n",
"- Inference on NER task with BioBERT model\n",
"- The use/download of fine-tuned NVIDIA BioBERT models\n",
"- Use of Mixed Precision for Inference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Requirements\n",
"\n",
"Please refer to the ReadMe file"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. BioBERT Inference: Named-Entity Recognition\n",
"\n",
"We can run inference on a fine-tuned BioBERT model for tasks like Named-Entity Recognition.\n",
"\n",
"Here we use a BioBERT model fine-tuned on a [BC5CDR-disease Dataset](https://www.ncbi.nlm.nih.gov/research/bionlp/Data/) which consists of 1500 PubMed articles with 5818 annotated diseases."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.a Extract Disease Information from Text\n",
"\n",
"In this example we will use Named-Entity Recognition model created using BioBERT to extract disease information from the following paragraph:\n",
"\n",
"**Input Text**\n",
"\n",
"_\"The authors describe the case of a 56 - year - old woman with chronic, severe heart failure \n",
"secondary to dilated cardiomyopathy and absence of significant ventricular arrhythmias \n",
"who developed QT prolongation and torsade de pointes ventricular tachycardia during one cycle \n",
"of intermittent low dose (2.5 mcg/kg per min) dobutamine. \n",
"This report of torsade de pointes ventricular tachycardia during intermittent dobutamine \n",
"supports the hypothesis that unpredictable fatal arrhythmias may occur even with low doses \n",
"and in patients with no history of significant rhythm disturbances.\n",
"The mechanisms of proarrhythmic effects of Dubutamine are discussed.\"_\n",
"\n",
"**Output visualized using displaCy**\n",
"\n",
"<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">The authors describe the case of a 56 year old woman with chronic , severe \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" heart failure \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"secondary to \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" dilated cardiomyopathy \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"and absence of significant \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" ventricular arrhythmias \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"who developed QT \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" prolongation \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"and torsade de pointes \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" ventricular tachycardia \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"during one cycle of intermittent low dose ( 2.5 mcg / kg per min ) dobutamine . This report of torsade de pointes \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" ventricular tachycardia \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"during intermittent dobutamine supports the hypothesis that unpredictable fatal \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" arrhythmias \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
"may occur even with low doses and in patients with no history of significant \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
" rhythm disturbances \n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
"</mark>\n",
". The mechanisms of proarrhythmic effects of Dubutamine are discussed . </div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text= \"\"\"\n",
"The authors describe the case of a 56 year old woman with chronic, severe heart failure\n",
"secondary to dilated cardiomyopathy and absence of significant ventricular arrhythmias\n",
"who developed QT prolongation and torsade de pointes ventricular tachycardia during one cycle\n",
"of intermittent low dose (2.5 mcg/kg per min) dobutamine.\n",
"This report of torsade de pointes ventricular tachycardia during intermittent dobutamine\n",
"supports the hypothesis that unpredictable fatal arrhythmias may occur even with low doses\n",
"and in patients with no history of significant rhythm disturbances.\n",
"The mechanisms of proarrhythmic effects of Dubutamine are discussed.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"notebooks_dir = '/workspace/bert/notebooks'\n",
"working_dir = '/workspace/bert'\n",
"if working_dir not in sys.path:\n",
" sys.path.append(working_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the text into the IOB tags format seen during training, using dummy placeholder labels\n",
"import spacy\n",
"nlp = spacy.load(\"en_core_web_sm\")\n",
"\n",
"text = text.strip()\n",
"doc = nlp(text)\n",
"input_file = os.path.join(notebooks_dir, 'input.tsv')\n",
"with open(os.path.join(input_file), 'w') as wf: \n",
" for word in doc:\n",
" if word.text is '\\n':\n",
" continue\n",
" wf.write(word.text + '\\tO\\n')\n",
" wf.write('\\n') # Indicate end of text"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.b Mixed Precision\n",
"\n",
"Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
"\n",
"For information about:\n",
"- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
"- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
"- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we control mixed precision execution with the following environment variable:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION\"] = \"1\" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model we'll use was trained with mixed precision, which takes much less time to train than the fp32 version, without losing accuracy. So we'll need to set the following flag: "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"use_mixed_precision_model = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Fine-Tuned NVIDIA BioBERT TF Models\n",
"\n",
"We have the following Named Entity Recognition models fine-tuned from BioBERT available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
"\n",
"| **Model** | **Description** |\n",
"|:---------:|:----------:|\n",
"|BioBERT NER BC5CDR Disease | NER model to extract disease information from text, trained on the BC5CDR-Disease dataset |\n",
"|BioBERT NER BC5CDR Chemical | NER model to extract chemical information from text, trained on the BC5CDR-Chemical dataset. |\n",
"\n",
"\n",
"For this example, we will download the Disease NER model trained on the BC5CDR-disease Dataset.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# biobert_uncased_base_ner_disease\n",
"DATA_DIR_FP16='/workspace/bert/data/download/finetuned_model_fp16'\n",
"!mkdir -p $DATA_DIR_FP16\n",
"!wget -nc -q --show-progress -O $DATA_DIR_FP16/biobert_uncased_base_ner_disease.zip \\\n",
"https://api.ngc.nvidia.com/v2/models/nvidia/biobert_uncased_base_ner_disease/versions/1/zip\n",
"!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/biobert_uncased_base_ner_disease.zip "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the code that follows we will refer to these models."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Running NER task inference\n",
"\n",
"In order to run NER inference we will follow step-by-step the flow implemented in run_ner.py."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.a Configure Things"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import run_ner\n",
"from run_ner import BC5CDRProcessor, model_fn_builder, file_based_input_fn_builder, filed_based_convert_examples_to_features, result_to_pair\n",
"\n",
"import os, sys\n",
"import time\n",
"\n",
"import tensorflow as tf\n",
"import modeling\n",
"import tokenization\n",
"\n",
"tf.logging.set_verbosity(tf.logging.ERROR)\n",
"\n",
"# Create the output directory where all the results are saved.\n",
"output_dir = os.path.join(working_dir, 'output')\n",
"tf.gfile.MakeDirs(output_dir)\n",
"\n",
"# The config json file corresponding to the pre-trained BERT model.\n",
"# This specifies the model architecture.\n",
"bert_config_file = os.path.join(DATA_DIR_FP16, 'bert_config.json')\n",
"\n",
"# The vocabulary file that the BERT model was trained on.\n",
"vocab_file = os.path.join(DATA_DIR_FP16, 'vocab.txt')\n",
"\n",
"init_checkpoint = os.path.join(DATA_DIR_FP16, 'model.ckpt-10251')\n",
"\n",
"# Whether to lower case the input text. \n",
"# Should be True for uncased models and False for cased models.\n",
"# The BioBERT available in NGC is uncased\n",
"do_lower_case = True\n",
" \n",
"# Total batch size for predictions\n",
"predict_batch_size = 1\n",
"params = dict([('batch_size', predict_batch_size)])\n",
"\n",
"# The maximum total input sequence length after WordPiece tokenization. \n",
"# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
"max_seq_length = 128\n",
"\n",
"# This is a WA to use flags from here:\n",
"flags = tf.flags\n",
"\n",
"if 'f' not in tf.flags.FLAGS: \n",
" tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
"FLAGS = flags.FLAGS\n",
"\n",
"FLAGS.output_dir = output_dir"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.b Define Tokenizer & Create Estimator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Validate the casing config consistency with the checkpoint name.\n",
"tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
"\n",
"# Create the tokenizer.\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
"\n",
"# Load the configuration from file\n",
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
"\n",
"\n",
"# Use the data processor for BC5CDR\n",
"processor = BC5CDRProcessor()\n",
"# Get labels in the index order that was used during training\n",
"label_list = processor.get_labels()\n",
"\n",
"# Reverse index the labels. This will be used later when evaluating predictions.\n",
"id2label = {}\n",
"for (i, label) in enumerate(label_list, 1):\n",
" id2label[i] = label\n",
"\n",
"\n",
"config = tf.ConfigProto(log_device_placement=True) \n",
"run_config = tf.estimator.RunConfig(\n",
" model_dir=None,\n",
" session_config=config,\n",
" save_checkpoints_steps=1000,\n",
" keep_checkpoint_max=1)\n",
"\n",
"\n",
"# Use model function builder to create the model function\n",
"model_fn = model_fn_builder(\n",
" bert_config=bert_config,\n",
" num_labels=len(label_list) + 1,\n",
" init_checkpoint=init_checkpoint,\n",
" use_fp16=use_mixed_precision_model)\n",
"\n",
"estimator = tf.estimator.Estimator(\n",
" model_fn=model_fn,\n",
" config=run_config,\n",
" params=params)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.c Run Inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the input data using the BC5CDR processor\n",
"predict_examples = processor.get_test_examples(notebooks_dir, file_name='input.tsv')\n",
"\n",
"\n",
"# Convert to tf_records and save it\n",
"predict_file = os.path.join(output_dir, \"predict.tf_record\")\n",
"filed_based_convert_examples_to_features(predict_examples, label_list,\n",
" max_seq_length, tokenizer,\n",
" predict_file)\n",
"\n",
"\n",
"tf.logging.info(\"***** Running predictions *****\")\n",
"tf.logging.info(\" Num orig examples = %d\", len(predict_examples))\n",
"tf.logging.info(\" Batch size = %d\", predict_batch_size)\n",
"\n",
"# Run prediction on this tf_record file\n",
"predict_input_fn = file_based_input_fn_builder(\n",
" input_file=predict_file,\n",
" batch_size=predict_batch_size,\n",
" seq_length=max_seq_length,\n",
" is_training=False,\n",
" drop_remainder=False)\n",
"\n",
"\n",
"pred_start_time = time.time()\n",
"\n",
"predictions = estimator.predict(input_fn=predict_input_fn)\n",
"predictions = list(predictions)\n",
"\n",
"pred_time_elapsed = time.time() - pred_start_time\n",
"\n",
"tf.logging.info(\"-----------------------------\")\n",
"tf.logging.info(\"Total Inference Time = %0.2f\", pred_time_elapsed)\n",
"# tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
"tf.logging.info(\"-----------------------------\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.d Save Predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's now process the predictions and save them to file(s)\n",
"tf.logging.info(\"Save Predictions:\")\n",
"\n",
"# File containing the list of predictions as IOB tags\n",
"output_predict_file = os.path.join(FLAGS.output_dir, \"label_test.txt\")\n",
"# File containing the list of words, the dummy token and the predicted IOB tag\n",
"test_labels_file = os.path.join(FLAGS.output_dir, \"test_labels.txt\")\n",
"test_labels_err_file = os.path.join(FLAGS.output_dir, \"test_labels_errs.txt\")\n",
"\n",
"with tf.gfile.Open(output_predict_file, 'w') as writer, \\\n",
" tf.gfile.Open(test_labels_file, 'w') as tl, \\\n",
" tf.gfile.Open(test_labels_err_file, 'w') as tle:\n",
" i=0\n",
" for prediction in estimator.predict(input_fn=predict_input_fn, yield_single_examples=True):\n",
" output_line = \"\\n\".join(id2label[id] for id in prediction if id != 0) + \"\\n\"\n",
" writer.write(output_line)\n",
" result_to_pair(predict_examples[i], prediction, id2label, tl, tle)\n",
" i = i + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5.e Visualize Predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's create a function that formats the predictions for display using displaCy\n",
"def predictions_for_displacy(predict_examples, predictions, id2label):\n",
" processed_text = ''\n",
" entities = []\n",
" current_pos = 0\n",
" start_pos = 0\n",
" end_pos = 0\n",
" end_detected = False\n",
" prev_label = ''\n",
"\n",
" for predict_line, pred_ids in zip(predict_examples, predictions):\n",
" words = str(predict_line.text).split(' ')\n",
" labels = str(predict_line.label).split(' ')\n",
"\n",
" # get from CLS to SEP\n",
" pred_labels = []\n",
" for id in pred_ids:\n",
" if id == 0:\n",
" continue\n",
" curr_label = id2label[id]\n",
" if curr_label == '[CLS]':\n",
" continue\n",
" elif curr_label == '[SEP]':\n",
" break\n",
" elif curr_label == 'X':\n",
" continue\n",
" pred_labels.append(curr_label)\n",
"\n",
" for tok, label, pred_label in zip(words, labels, pred_labels):\n",
" if pred_label is 'B':\n",
" start_pos = current_pos\n",
" elif pred_label is 'I' and prev_label is not 'B' and prev_label is not 'I':\n",
" start_pos = current_pos\n",
" elif pred_label is 'O' and (prev_label is 'B' or prev_label is 'I'):\n",
" end_pos = current_pos\n",
" end_detected = True\n",
"\n",
" if end_detected:\n",
" entities.append({'start':start_pos, 'end': end_pos, 'label': 'DISEASE'})\n",
" start_pos = 0\n",
" end_pos = 0\n",
" end_detected = False\n",
"\n",
" processed_text = processed_text + tok + ' '\n",
" current_pos = current_pos + len(tok) + 1\n",
" prev_label = pred_label\n",
"\n",
" #Handle entity at the very end\n",
" if start_pos > 0 and end_detected is False:\n",
" entities.append({'start':start_pos, 'end': current_pos, 'label': 'DISEASE'})\n",
" \n",
" displacy_input = [{\"text\": processed_text,\n",
" \"ents\": entities,\n",
" \"title\": None}]\n",
" \n",
" return displacy_input"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the predictions to the Named Entities format required by displaCy and visualize\n",
"displacy_input = predictions_for_displacy(predict_examples, predictions, id2label)\n",
"html = spacy.displacy.render(displacy_input, style=\"ent\", manual=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. What's next"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that you are familiar with running NER Inference on BioBERT, using mixed precision, you may want to try extracting disease information from other biomedical text. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,31 @@
{"data":
[
{"title": "Project Apollo",
"paragraphs": [
{"context":"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.",
"qas": [
{ "question": "What project put the first Americans into space?",
"id": "Q1"
},
{ "question": "What program was created to carry out these projects and missions?",
"id": "Q2"
},
{ "question": "What year did the first manned Apollo flight occur?",
"id": "Q3"
},
{ "question": "What President is credited with the original notion of putting Americans in space?",
"id": "Q4"
},
{ "question": "Who did the U.S. collaborate with on an Earth orbit mission in 1975?",
"id": "Q5"
},
{ "question": "How long did Project Apollo run?",
"id": "Q6"
},
{ "question": "What program helped develop space travel techniques that Project Apollo used?",
"id": "Q7"
},
{"question": "What space station supported three manned missions in 1973-1974?",
"id": "Q8"
}
]}]}]}
@@ -0,0 +1,467 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from npu_bridge.estimator.npu.npu_optimizer import NPUOptimizer
from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
from npu_bridge.estimator import npu_ops
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False,
num_accumulation_steps=1,
optimizer_type="adam", allreduce_post_accumulation=False):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
# avoid step change in learning rate at end of warmup phase
if optimizer_type == "adam":
power = 1.0
decayed_learning_rate_at_crossover_point = init_lr * (
(1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
else:
power = 0.5
decayed_learning_rate_at_crossover_point = init_lr
adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (
decayed_learning_rate_at_crossover_point, adjusted_init_lr))
learning_rate = tf.constant(value=adjusted_init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=power,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
if optimizer_type == "lamb":
print("Initializing LAMB Optimizer")
optimizer = LAMBOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
else:
print("Initializing ADAM Weight Decay Optimizer")
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = AdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True,
compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
if tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]:
if tf.flags.FLAGS.npu_bert_loss_scale == 0:
loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(
init_loss_scale=tf.flags.FLAGS.init_loss_scale_value, incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2, decr_ratio=0.5)
elif tf.flags.FLAGS.npu_bert_loss_scale >= 1:
loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=tf.flags.FLAGS.npu_bert_loss_scale)
else:
raise ValueError("Invalid loss scale: %d" % tf.flags.FLAGS.npu_bert_loss_scale)
optimizer = NPUOptimizer(optimizer, loss_scale_manager, is_distributed=tf.flags.FLAGS.distributed,
is_loss_scale=True, is_tailing_optimization=tf.flags.FLAGS.npu_bert_tail_optimize)
else:
optimizer = NPUOptimizer(optimizer, is_distributed=tf.flags.FLAGS.distributed)
tvars = tf.trainable_variables()
grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)
if num_accumulation_steps > 1:
local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
initializer=tf.zeros_initializer)
batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
initializer=tf.ones_initializer)
accum_vars = [tf.get_variable(
name=tvar.name.split(":")[0] + "/accum",
shape=tvar.shape.as_list(),
dtype=tf.float32,
trainable=False,
initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]
reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
local_step = tf.cond(reset_step, lambda: local_step.assign(tf.ones_like(local_step)),
lambda: local_step.assign_add(1))
with tf.name_scope(accumulate_step):
grads_and_vars_and_accums = [(gv[0], gv[1], accum_vars[i]) for i, gv in enumerate(grads_and_vars) if
gv[0] is not None]
grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))
all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if (
tf.flags.FLAGS.npu_bert_loss_scale not in [
None, -1]) and (
manual_fp16 or use_fp16) else tf.constant(
True, dtype=tf.bool)
batch_finite = tf.cond(reset_step,
lambda: batch_finite.assign(
tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
lambda: batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))
# This is how the model was pre-trained.
# ensure global norm is a finite number
# to prevent clip_by_global_norm from having a hizzy fit.
if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
(clipped_grads, _) = tf.clip_by_global_norm(
grads, clip_norm=1.0,
use_norm=tf.cond(
all_are_finite,
lambda: tf.global_norm(grads),
lambda: tf.constant(1.0)))
else:
with tf.name_scope("clip_grads"):
clipped_grads = [
(tf.clip_by_norm(grad, clip_norm=1.0))
if grad is not None else (grad, var) for grad in grads
]
accum_vars = tf.cond(reset_step,
lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])
def update(accum_vars):
with tf.name_scope("opt_update"):
if allreduce_post_accumulation and hvd is not None:
accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var),
compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) if isinstance(
accum_var, tf.IndexedSlices)
else hvd.allreduce(accum_var,
compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
for accum_var in accum_vars]
return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)
update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool),
name="update_step")
update_op = tf.cond(update_step,
lambda: update(accum_vars), lambda: tf.no_op())
new_global_step = tf.cond(
tf.math.logical_and(update_step, tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)),
lambda: global_step + 1, lambda: global_step)
new_global_step = tf.identity(new_global_step, name='step_update')
train_op = tf.group(update_op, [global_step.assign(new_global_step)])
else:
grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
grads, tvars = list(zip(*grads_and_vars))
if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
all_are_finite = tf.constant(True, dtype=tf.bool)
# This is how the model was pre-trained.
# ensure global norm is a finite number
# to prevent clip_by_global_norm from having a hizzy fit.
if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
(clipped_grads, _) = tf.clip_by_global_norm(
grads, clip_norm=1.0,
use_norm=tf.cond(
all_are_finite,
lambda: tf.global_norm(grads),
lambda: tf.constant(1.0)))
else:
with tf.name_scope("clip_grads"):
clipped_grads = [
(tf.clip_by_norm(grad, clip_norm=1.0))
if grad is not None else (grad, var) for grad in grads
]
with tf.name_scope("apply_grads"):
train_op = optimizer.apply_gradients(
list(zip(clipped_grads, tvars)), global_step=global_step)
# if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
# new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
# else:
# new_global_step = global_step + 1
# new_global_step = tf.identity(new_global_step, name='step_update')
# train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
class AdamWeightDecayOptimizer(tf.train.Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay.

    Weight decay is added to the parameter update itself rather than to the
    loss, so it does not interact with the Adam moment estimates. The moment
    estimates are used without bias correction.
    """

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="AdamWeightDecayOptimizer"):
        """Constructs a AdamWeightDecayOptimizer.

        Args:
          learning_rate: Scalar (tensor or number); wrapped in an identity op
            named 'learning_rate'.
          weight_decay_rate: Decay coefficient; a falsy value disables decay.
          beta_1: Decay rate of the first-moment estimate.
          beta_2: Decay rate of the second-moment estimate.
          epsilon: Constant added to the denominator of the update.
          exclude_from_weight_decay: Optional iterable of regex patterns;
            variables whose name matches any pattern are not decayed.
          name: Name passed to the base optimizer.
        """
        super(AdamWeightDecayOptimizer, self).__init__(False, name)
        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay

    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
                        manual_fp16=False):
        """See base class.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs; pairs with a
            None member are skipped.
          global_step: Optional step variable. When given it is incremented by
            one as part of the returned op; when None no step update is added.
          name: Optional name for the returned grouped op.
          manual_fp16: When True, non-float32 variables get a float32
            "/shadow" copy that receives the update.

        Returns:
          A grouped op that performs all variable updates.
        """
        assignments = []
        for (grad, param) in grads_and_vars:
            with tf.name_scope("apply_one_adam"):
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)
                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
                if has_shadow:
                    # create shadow fp32 weights for fp16 variable
                    param_fp32 = tf.get_variable(
                        name=param_name + "/shadow",
                        dtype=tf.float32,
                        trainable=False,
                        initializer=tf.cast(param.initialized_value(), tf.float32))
                else:
                    param_fp32 = param

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                if tf.flags.FLAGS.npu_bert_use_fused_adam_momentum:
                    # NOTE(review): the fused ops receive param_fp32 only; when
                    # has_shadow is True the fp16 variable is presumably not
                    # refreshed on this path -- confirm against npu_ops.
                    if self._do_use_weight_decay(param_name):
                        assignments.extend([npu_ops.adam_apply_one_with_decay_assign(
                            grad, v, m, param_fp32, self.learning_rate,
                            self.beta_1, 1.0 - self.beta_1, self.beta_2, 1.0 - self.beta_2,
                            self.weight_decay_rate, self.epsilon)])
                    else:
                        assignments.extend([npu_ops.adam_apply_one_assign(
                            grad, v, m, param_fp32, self.learning_rate, self.beta_1,
                            1.0 - self.beta_1, self.beta_2, 1.0 - self.beta_2, self.epsilon)])
                else:
                    # Standard Adam update.
                    next_m = (
                        tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
                    next_v = (
                        tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                                  tf.square(grad)))

                    update = next_m / (tf.sqrt(next_v) + self.epsilon)

                    # Just adding the square of the weights to the loss function is *not*
                    # the correct way of using L2 regularization/weight decay with Adam,
                    # since that will interact with the m and v parameters in strange ways.
                    #
                    # Instead we want to decay the weights in a manner that doesn't interact
                    # with the m/v parameters. This is equivalent to adding the square
                    # of the weights to the loss with plain (non-momentum) SGD.
                    if self._do_use_weight_decay(param_name):
                        update += self.weight_decay_rate * param_fp32

                    update_with_lr = self.learning_rate * update
                    next_param = param_fp32 - update_with_lr

                    updates = [param_fp32.assign(next_param),
                               m.assign(next_m),
                               v.assign(next_v)]
                    if has_shadow:
                        # Cast the fp32 result back into the trainable fp16
                        # variable. The assign op must be part of the returned
                        # group, otherwise it is never executed in graph mode.
                        updates.append(
                            param.assign(tf.cast(next_param, param.dtype.base_dtype)))
                    assignments.extend(updates)

        # global_step is optional in the signature; only advance it when given.
        if global_step is not None:
            new_global_step = global_step + 1
            new_global_step = tf.identity(new_global_step, name='step_update')
            assignments.extend([global_step.assign(new_global_step)])
        return tf.group(*assignments, name=name)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name (strips a ':<n>' suffix)."""
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
class LAMBOptimizer(tf.train.Optimizer):
    """A LAMB optimizer that includes "correct" L2 weight decay.

    Each variable's Adam-style update (with bias-corrected moments) is scaled
    by the ratio of the weight norm to the update norm before being applied.
    """

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="LAMBOptimizer"):
        """Constructs a LAMBOptimizer.

        Args:
          learning_rate: Scalar (tensor or number); wrapped in an identity op
            named 'learning_rate'.
          weight_decay_rate: Decay coefficient; a falsy value disables decay.
          beta_1: Decay rate of the first-moment estimate.
          beta_2: Decay rate of the second-moment estimate.
          epsilon: Constant added to the denominator of the update.
          exclude_from_weight_decay: Optional iterable of regex patterns;
            variables whose name matches any pattern are not decayed.
          name: Name passed to the base optimizer.
        """
        super(LAMBOptimizer, self).__init__(False, name)
        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay
        # Kept only so existing readers of this attribute do not break; the
        # bias correction is derived from `global_step` in apply_gradients.
        self.steps = 0

    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
                        manual_fp16=False):
        """See base class.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs; pairs with a
            None member are skipped.
          global_step: Step variable. Required: it drives the bias correction
            and is incremented by one as part of the returned op.
          name: Optional name for the returned grouped op.
          manual_fp16: When True, non-float32 variables get a float32
            "/shadow" copy that receives the update.

        Returns:
          A grouped op that performs all variable updates.

        Raises:
          ValueError: If `global_step` is None.
        """
        if global_step is None:
            raise ValueError(
                "LAMBOptimizer.apply_gradients requires a global_step to "
                "compute the bias correction.")

        # The bias correction must depend on the training step. A Python
        # counter bumped while the graph is being built would instead freeze
        # a different constant into the graph for every variable.
        next_step = global_step + 1
        steps = tf.cast(next_step, tf.float32)
        beta1_correction = 1.0 - tf.pow(self.beta_1, steps)
        beta2_correction = 1.0 - tf.pow(self.beta_2, steps)

        assignments = []
        for (grad, param) in grads_and_vars:
            with tf.name_scope("apply_one_lamb"):
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)
                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
                if has_shadow:
                    # create shadow fp32 weights for fp16 variable
                    param_fp32 = tf.get_variable(
                        name=param_name + "/shadow",
                        dtype=tf.float32,
                        trainable=False,
                        initializer=tf.cast(param.initialized_value(), tf.float32))
                else:
                    param_fp32 = param

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                # LAMB update
                next_m = (
                    tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
                next_v = (
                    tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                              tf.square(grad)))

                next_m_unbiased = next_m / beta1_correction
                next_v_unbiased = next_v / beta2_correction

                update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param_fp32

                # Use the fp32 weights for the norm so that it has the same
                # dtype as the update norm (param_fp32 is `param` itself when
                # no shadow copy exists).
                w_norm = linalg_ops.norm(param_fp32, ord=2)
                g_norm = linalg_ops.norm(update, ord=2)
                # Trust ratio; falls back to 1.0 when either norm is zero.
                ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
                    math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)

                update_with_lr = ratio * self.learning_rate * update
                next_param = param_fp32 - update_with_lr

                updates = [param_fp32.assign(next_param),
                           m.assign(next_m),
                           v.assign(next_v)]
                if has_shadow:
                    # Cast the fp32 result back into the trainable fp16
                    # variable. The assign op must be part of the returned
                    # group, otherwise it is never executed in graph mode.
                    updates.append(
                        param.assign(tf.cast(next_param, param.dtype.base_dtype)))
                assignments.extend(updates)

        new_global_step = tf.identity(next_step, name='step_update')
        assignments.extend([global_step.assign(new_global_step)])
        return tf.group(*assignments, name=name)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name (strips a ':<n>' suffix)."""
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
@@ -0,0 +1,48 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import optimization
import tensorflow as tf
class OptimizationTest(tf.test.TestCase):
    """Convergence check for `optimization.AdamWeightDecayOptimizer`."""

    def test_adam(self):
        """Minimising mean((x - w)^2) for 100 steps should drive w to x."""
        with self.test_session() as session:
            weights = tf.get_variable(
                "w",
                shape=[3],
                initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
            target = tf.constant([0.4, 0.2, -0.5])
            loss = tf.reduce_mean(tf.square(target - weights))

            trainable = tf.trainable_variables()
            gradients = tf.gradients(loss, trainable)
            step = tf.train.get_or_create_global_step()

            adam = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
            train_op = adam.apply_gradients(zip(gradients, trainable), step)

            # Initialise both global and local variables before training.
            session.run(tf.group(tf.global_variables_initializer(),
                                 tf.local_variables_initializer()))

            remaining = 100
            while remaining > 0:
                session.run(train_op)
                remaining -= 1

            final_weights = session.run(weights)
            self.assertAllClose(
                final_weights.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)


if __name__ == "__main__":
    tf.test.main()
@@ -0,0 +1,73 @@
#!/bin/bash
#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --overcommit
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SLURM batch script that launches run_pretraining.py inside a container
# for one of two pre-training phases.
#
# Environment overrides read below:
#   PHASE                    1 (default) or 2; selects PHASE1 / PHASE2 args
#   BATCHSIZE                --train_batch_size (default 16 / 2)
#   LEARNING_RATE            --learning_rate (default 1.875e-4 / 1.25e-4)
#   NUM_ACCUMULATION_STEPS   --num_accumulation_steps (default 128 / 512)

# Abort on errors and on unset variables; echo every command.
set -eux
readonly docker_image="nvcr.io/nvidia/tensorflow:19.08-py3"
readonly datadir="/raid/data/bert"
readonly checkpointdir="$PWD/checkpoints"
# Container mounts: current directory -> /workspace/bert, dataset ->
# /workspace/bert/data, checkpoint directory -> /results.
readonly mounts=".:/workspace/bert,${datadir}:/workspace/bert/data,${checkpointdir}:/results"
# Create the per-phase output directories, one task per allocated node.
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/phase_1"
srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/phase_2"
# Phase 1 arguments: sequence length 128, 20 predictions per sequence.
PHASE1="\
--train_batch_size=${BATCHSIZE:-16} \
--learning_rate=${LEARNING_RATE:-1.875e-4} \
--num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-128} \
--input_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training \
--eval_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--num_train_steps=7038 \
--num_warmup_steps=2000 \
--output_dir=/results/phase_1 \
"
# Phase 2 arguments: sequence length 512, 80 predictions per sequence;
# starts from the phase 1 checkpoint at step 7038.
PHASE2="\
--train_batch_size=${BATCHSIZE:-2} \
--learning_rate=${LEARNING_RATE:-1.25e-4} \
--num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-512} \
--input_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training \
--eval_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test \
--max_seq_length=512 \
--max_predictions_per_seq=80 \
--num_train_steps=1564 \
--num_warmup_steps=200 \
--output_dir=/results/phase_2 \
--init_checkpoint=/results/phase_1/model.ckpt-7038 \
"
PHASES=( "$PHASE1" "$PHASE2" )
# PHASE is 1-based; it is converted to a 0-based array index below.
PHASE=${PHASE:-1}
# Full training command: phase-specific arguments plus the options shared
# by both phases.
BERT_CMD="\
python /workspace/bert/run_pretraining.py \
${PHASES[$((PHASE-1))]} \
--bert_config_file=/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json \
--do_train=True \
--do_eval=True \
--save_checkpoints_steps=100 \
--horovod --use_fp16 --use_xla \
--allreduce_post_accumulation=True \
--eval_batch_size=8"
# Run the command in the container image with the mounts defined above.
srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${BERT_CMD}"
@@ -0,0 +1,706 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import csv
import os
import modeling
import optimization
import tokenization
import tensorflow as tf
import horovod.tensorflow as hvd
import time
from utils.utils import LogEvalRunHook, LogTrainRunHook
from utils.create_glue_data import *
import numpy as np
# Command-line flags for the fine-tuning runner. Flags whose default is None
# under "Required parameters" are expected to be supplied by the caller.
flags = tf.flags
FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string(
    "data_dir", None,
    "The input data dir. Should contain the .tsv files (or other data files) "
    "for the task.")
flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")
flags.DEFINE_string("task_name", None, "The name of the task to train.")
flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_string(
    "output_dir", None,
    "The output directory where the model checkpoints will be written.")

## Other parameters
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")
flags.DEFINE_bool("do_train", False, "Whether to run training.")
flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_bool(
    "do_predict", False,
    "Whether to run the model in inference mode on the test set.")
flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
flags.DEFINE_bool("use_trt", False, "Whether to use TF-TRT")
flags.DEFINE_float("num_train_epochs", 3.0,
                   "Total number of training epochs to perform.")
flags.DEFINE_float(
    "warmup_proportion", 0.1,
    "Proportion of training to perform linear learning rate warmup for. "
    "E.g., 0.1 = 10% of training.")
flags.DEFINE_integer("save_checkpoints_steps", 1000,
                     "How often to save the model checkpoint.")
flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")
# The two help fragments are concatenated, so the first one must end with a
# separator to keep the rendered help text readable.
flags.DEFINE_integer("num_accumulation_steps", 1,
                     "Number of accumulation steps before gradient update. "
                     "Global batch size = num_accumulation_steps * train_batch_size")
flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
flags.DEFINE_bool(
    "verbose_logging", False,
    "If true, all of the warnings related to data processing will be printed. "
    "A number of warnings are expected for a normal SQuAD evaluation.")
def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training,
                                drop_remainder, hvd=None):
    """Return an Estimator `input_fn` that reads examples from a TFRecord file.

    Args:
      input_file: Path(s) handed to `tf.data.TFRecordDataset`.
      batch_size: Number of examples per batch.
      seq_length: Length of the fixed-size sequence features.
      is_training: When True the data is (optionally) sharded per Horovod
        rank, repeated and shuffled.
      drop_remainder: Forwarded to `map_and_batch`.
      hvd: Optional Horovod module used for sharding during training.

    Returns:
      A zero-argument callable that builds the batched dataset.
    """
    feature_spec = {
        key: tf.FixedLenFeature([seq_length], tf.int64)
        for key in ("input_ids", "input_mask", "segment_ids")
    }
    feature_spec["label_ids"] = tf.FixedLenFeature([], tf.int64)

    def _parse_example(serialized):
        """Parse one serialized tf.Example, narrowing int64 features to int32."""
        parsed = tf.parse_single_example(serialized, feature_spec)
        # tf.Example only stores int64, so every int64 feature is cast down.
        for key in list(parsed.keys()):
            tensor = parsed[key]
            if tensor.dtype == tf.int64:
                parsed[key] = tf.to_int32(tensor)
        return parsed

    def input_fn():
        """Build the dataset for one Estimator call."""
        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            # Shard first so each rank repeats and shuffles only its own slice.
            if hvd is not None:
                dataset = dataset.shard(hvd.size(), hvd.rank())
            dataset = dataset.repeat().shuffle(buffer_size=100)

        batcher = tf.contrib.data.map_and_batch(
            _parse_example,
            batch_size=batch_size,
            drop_remainder=drop_remainder)
        return dataset.apply(batcher)

    return input_fn
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model.

    A linear classification layer is placed on top of the pooled BERT output.
    The result tensors are given fixed names under the "loss" scope
    ('cls_loss', 'cls_per_example_loss', 'cls_logits', 'cls_probabilities').

    Args:
      bert_config: Configuration passed to `modeling.BertModel`.
      is_training: Whether to build the training graph (enables dropout on
        the pooled output).
      input_ids: Token id tensor passed to the BERT model.
      input_mask: Mask tensor passed to the BERT model.
      segment_ids: Segment id tensor, passed as `token_type_ids`.
      labels: Integer class labels, one-hot encoded to `num_labels` classes.
      num_labels: Number of output classes.
      use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

    Returns:
      Tuple `(loss, per_example_loss, logits, probabilities)`.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
        compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias, name='cls_logits')
        probabilities = tf.nn.softmax(logits, axis=-1, name='cls_probabilities')
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # The name has to be attached to the negation, not to the inner sum:
        # anything that looks the tensor up by name must get the
        # cross-entropy itself rather than its negative.
        per_example_loss = tf.negative(
            tf.reduce_sum(one_hot_labels * log_probs, axis=-1),
            name='cls_per_example_loss')
        loss = tf.reduce_mean(per_example_loss, name='cls_loss')

        return (loss, per_example_loss, logits, probabilities)
def get_frozen_tftrt_model(bert_config, shape, num_labels, use_one_hot_embeddings, init_checkpoint):
"""Build the inference classifier, freeze it and convert it with TF-TRT.

The classification graph is built on placeholders, its variables are
initialised from `init_checkpoint`, the variables are folded into constants,
and the frozen graph is converted by `TrtGraphConverter`. The converted
graph is also written to "frozen_modelTRT.pb" (a relative path).

Args:
bert_config: Configuration forwarded to `create_model`.
shape: Shape used for the input_ids / input_mask / segment_ids placeholders.
num_labels: Number of output classes, forwarded to `create_model`.
use_one_hot_embeddings: Forwarded to `create_model`.
init_checkpoint: Checkpoint the trainable variables are initialised from.

Returns:
The graph object returned by `converter.convert()`.
"""
tf_config = tf.ConfigProto()
# Names of the tensors produced by create_model that must survive freezing
# and be left out of the TensorRT conversion.
output_node_names = ['loss/cls_loss', 'loss/cls_per_example_loss', 'loss/cls_logits', 'loss/cls_probabilities']
with tf.Session(config=tf_config) as tf_sess:
input_ids = tf.placeholder(tf.int32, shape, 'input_ids')
input_mask = tf.placeholder(tf.int32, shape, 'input_mask')
segment_ids = tf.placeholder(tf.int32, shape, 'segment_ids')
label_ids = tf.placeholder(tf.int32, (None), 'label_ids')
# Built with is_training=False; the returned tensors are not needed here
# because the outputs are addressed by name.
create_model(bert_config, False, input_ids, input_mask, segment_ids, label_ids,
num_labels, use_one_hot_embeddings)
tvars = tf.trainable_variables()
(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
tf_sess.run(tf.global_variables_initializer())
print("LOADED!")
tf.logging.info("**** Trainable Variables ****")
# Log, for every trainable variable, whether it was found in the checkpoint.
for var in tvars:
init_string = ""
if var.name in initialized_variable_names:
init_string = ", *INIT_FROM_CKPT*"
else:
init_string = ", *NOTTTTTTTTTTTTTTTTTTTTT"
tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string)
# Replace variables by constants so the graph is self-contained.
frozen_graph = tf.graph_util.convert_variables_to_constants(tf_sess,
tf_sess.graph.as_graph_def(), output_node_names)
# Node count before conversion, reported after the conversion below.
num_nodes = len(frozen_graph.node)
print('Converting graph using TensorFlow-TensorRT...')
from tensorflow.python.compiler.tensorrt import trt_convert as trt
# The workspace limit evaluates to just under 4 GiB; precision follows
# FLAGS.use_fp16.
converter = trt.TrtGraphConverter(
input_graph_def=frozen_graph,
nodes_blacklist=output_node_names,
max_workspace_size_bytes=(4096 << 20) - 1000,
precision_mode = "FP16" if FLAGS.use_fp16 else "FP32",
minimum_segment_size=4,
is_dynamic_op=True,
maximum_cached_engines=1000
)
frozen_graph = converter.convert()
print('Total node count before and after TF-TRT conversion:',
num_nodes, '->', len(frozen_graph.node))
print('TRT node count:',
len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']))
# Persist the converted graph next to the current working directory.
with tf.gfile.GFile("frozen_modelTRT.pb", "wb") as f:
f.write(frozen_graph.SerializeToString())
return frozen_graph
def model_fn_builder(task_name, bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps,
                     use_one_hot_embeddings, hvd=None):
    """Build the Estimator `model_fn` for a classification task.

    Args:
      task_name: Lower-case task name; "cola" switches the eval metric to MCC.
      bert_config: Configuration forwarded to the model builders.
      num_labels: Number of output classes.
      init_checkpoint: Optional checkpoint used to initialise the variables.
      learning_rate: Forwarded to `optimization.create_optimizer`.
      num_train_steps: Forwarded to `optimization.create_optimizer`.
      num_warmup_steps: Forwarded to `optimization.create_optimizer`.
      use_one_hot_embeddings: Forwarded to the model builders.
      hvd: Optional Horovod module; when given, only rank 0 restores from
        `init_checkpoint`.

    Returns:
      The `model_fn` closure.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Construct the `EstimatorSpec` for the requested `mode`."""

        def metric_fn(per_example_loss, label_ids, logits):
            """Return the eval metric dict: MCC for "cola", else accuracy and loss."""
            predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
            if task_name != "cola":
                return {
                    "eval_accuracy": tf.metrics.accuracy(
                        labels=label_ids, predictions=predicted),
                    "eval_loss": tf.metrics.mean(values=per_example_loss),
                }

            # Matthews correlation coefficient from the confusion-matrix counts.
            false_neg, false_neg_op = tf.metrics.false_negatives(
                labels=label_ids, predictions=predicted)
            false_pos, false_pos_op = tf.metrics.false_positives(
                labels=label_ids, predictions=predicted)
            true_pos, true_pos_op = tf.metrics.true_positives(
                labels=label_ids, predictions=predicted)
            true_neg, true_neg_op = tf.metrics.true_negatives(
                labels=label_ids, predictions=predicted)

            numerator = true_pos * true_neg - false_pos * false_neg
            denominator = ((true_pos + false_pos) * (true_pos + false_neg)
                           * (true_neg + false_pos) * (true_neg + false_neg)) ** 0.5
            mcc = numerator / denominator
            mcc_update = tf.group(false_neg_op, true_neg_op, true_pos_op, false_pos_op,
                                  tf.identity(mcc, name="MCC"))
            return {"MCC": (mcc, mcc_update)}

        tf.logging.info("*** Features ***")
        for feature_name in sorted(features.keys()):
            tf.logging.info(
                " name = %s, shape = %s" % (feature_name, features[feature_name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Inference through a frozen TF-TRT graph: the result tensors are
        # imported by name and no trainable graph is built.
        if FLAGS.use_trt and not is_training:
            trt_graph = get_frozen_tftrt_model(
                bert_config, input_ids.shape, num_labels, use_one_hot_embeddings,
                init_checkpoint)
            (total_loss, per_example_loss, logits, probabilities) = tf.import_graph_def(
                trt_graph,
                input_map={'input_ids':input_ids, 'input_mask':input_mask,
                           'segment_ids':segment_ids, 'label_ids':label_ids},
                return_elements=['loss/cls_loss:0', 'loss/cls_per_example_loss:0',
                                 'loss/cls_logits:0', 'loss/cls_probabilities:0'],
                name='')
            if mode == tf.estimator.ModeKeys.PREDICT:
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, predictions={"probabilities": probabilities})
            elif mode == tf.estimator.ModeKeys.EVAL:
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metric_ops=metric_fn(per_example_loss, label_ids, logits))
            return output_spec

        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        restore_here = hvd is None or hvd.rank() == 0
        if init_checkpoint and restore_here:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if FLAGS.verbose_logging:
            tf.logging.info("**** Trainable Variables ****")
            for variable in tvars:
                restored = variable.name in initialized_variable_names
                init_string = ", *INIT_FROM_CKPT*" if restored else ""
                tf.logging.info(" name = %s, shape = %s%s", variable.name,
                                variable.shape, init_string)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                hvd, False, FLAGS.use_fp16, FLAGS.num_accumulation_steps)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)

        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metric_ops)

        # Any other mode: expose the class probabilities as predictions.
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions=probabilities)

    return model_fn
# This function is not used by this file but is still used by the Colab and
# people who depend on it.
def input_fn_builder(features, batch_size, seq_length, is_training, drop_remainder, hvd=None):
    """Return an Estimator `input_fn` that serves in-memory features.

    Args:
      features: Sequence of objects exposing `input_ids`, `input_mask`,
        `segment_ids` and `label_id`.
      batch_size: Number of examples per batch.
      seq_length: Length of each per-example sequence feature.
      is_training: When True the data is (optionally) sharded per Horovod
        rank, repeated and shuffled.
      drop_remainder: Forwarded to `Dataset.batch`.
      hvd: Optional Horovod module used for sharding during training.

    Returns:
      A zero-argument callable that builds the batched dataset.
    """
    all_input_ids = [item.input_ids for item in features]
    all_input_mask = [item.input_mask for item in features]
    all_segment_ids = [item.segment_ids for item in features]
    all_label_ids = [item.label_id for item in features]

    def input_fn():
        """Build the dataset for one Estimator call."""
        num_examples = len(features)
        matrix_shape = [num_examples, seq_length]

        # This is for demo purposes and does NOT scale to large data sets. We do
        # not use Dataset.from_generator() because that uses tf.py_func which is
        # not TPU compatible. The right way to load data is with TFRecordReader.
        tensors = {
            "input_ids": tf.constant(
                all_input_ids, shape=matrix_shape, dtype=tf.int32),
            "input_mask": tf.constant(
                all_input_mask, shape=matrix_shape, dtype=tf.int32),
            "segment_ids": tf.constant(
                all_segment_ids, shape=matrix_shape, dtype=tf.int32),
            "label_ids": tf.constant(
                all_label_ids, shape=[num_examples], dtype=tf.int32),
        }
        dataset = tf.data.Dataset.from_tensor_slices(tensors)

        if is_training:
            # Shard first so each rank repeats and shuffles only its own slice.
            if hvd is not None:
                dataset = dataset.shard(hvd.size(), hvd.rank())
            dataset = dataset.repeat().shuffle(buffer_size=100)

        return dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder)

    return input_fn
def _latency_at_fraction(sorted_times, fraction):
  """Return the slowest latency among the fastest `fraction` of the samples.

  For a non-empty prefix this equals
  `max(sorted_times[:int(len(sorted_times) * fraction)])`. When that prefix
  would be empty (too few samples) the fastest sample is returned instead, and
  NaN is returned when there are no samples at all, so `max()` is never called
  on an empty sequence.

  Args:
    sorted_times: Per-step durations in ascending order.
    fraction: Share of the samples to consider, in (0, 1].

  Returns:
    The selected duration, or NaN when `sorted_times` is empty.
  """
  if len(sorted_times) == 0:
    return float("nan")
  cutoff = max(int(len(sorted_times) * fraction), 1)
  return sorted_times[cutoff - 1]


def _sentences_per_second(num_sentences, seconds):
  """Return `num_sentences / seconds`, or 0.0 when no time was recorded."""
  if seconds <= 0:
    return 0.0
  return num_sentences * 1.0 / seconds


def _log_inference_stats(hook, batch_size, time_elapsed, dataset_name):
  """Log the latency/throughput summary gathered by an eval/predict run hook.

  Args:
    hook: Hook exposing `total_time`, `time_list`, `count` and `skipped`.
    batch_size: Batch size used for the run.
    time_elapsed: Wall-clock seconds of the whole run, overhead included.
    dataset_name: Text appended to "Summary Inference Statistics on ".
  """
  time_wo_overhead = hook.total_time
  time_list = hook.time_list
  time_list.sort()  # in place, as the percentile lookup needs ascending order
  num_sentences = (hook.count - hook.skipped) * batch_size

  avg = np.mean(time_list) if len(time_list) > 0 else float("nan")
  cf_50 = _latency_at_fraction(time_list, 0.50)
  cf_90 = _latency_at_fraction(time_list, 0.90)
  cf_95 = _latency_at_fraction(time_list, 0.95)
  cf_99 = _latency_at_fraction(time_list, 0.99)
  cf_100 = _latency_at_fraction(time_list, 1)
  ss_sentences_per_second = _sentences_per_second(num_sentences, time_wo_overhead)

  tf.logging.info("-----------------------------")
  tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", time_elapsed,
                  hook.count * batch_size)
  tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", time_wo_overhead,
                  num_sentences)
  tf.logging.info("Summary Inference Statistics on %s", dataset_name)
  tf.logging.info("Batch size = %d", batch_size)
  tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
  tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
  tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
  tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
  tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
  tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
  tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
  tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
  tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
  tf.logging.info("-----------------------------")


def main(_):
  """Fine-tune, evaluate and/or run prediction for a BERT sentence classifier.

  Everything is driven by `FLAGS`: the selected task processor loads the
  examples, they are converted to TFRecord files under `FLAGS.output_dir`, and
  a `tf.estimator.Estimator` is trained / evaluated / used for prediction
  according to `do_train`, `do_eval` and `do_predict`. With `FLAGS.horovod`
  the training examples are split across ranks, and evaluation and prediction
  run on rank 0 only.

  Raises:
    ValueError: If no run mode is enabled, if `max_seq_length` exceeds the
      model's `max_position_embeddings`, or if `task_name` is unknown.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.horovod:
    hvd.init()
  if FLAGS.use_fp16:
    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
  processors = {
      "cola": ColaProcessor,
      "mnli": MnliProcessor,
      "mrpc": MrpcProcessor,
      "xnli": XnliProcessor,
  }

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Single-process defaults; overridden below when Horovod is enabled.
  master_process = True
  training_hooks = []
  global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
  hvd_rank = 0

  config = tf.ConfigProto()
  if FLAGS.horovod:
    tf.logging.info("Multi-GPU training with TF Horovod")
    tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
    master_process = (hvd.rank() == 0)
    hvd_rank = hvd.rank()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.size() > 1:
      training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
  if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
  # Only the master process gets a model_dir and a checkpoint interval.
  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir if master_process else None,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
      keep_checkpoint_max=1)

  if master_process:
    tf.logging.info("***** Configuaration *****")
    for key in FLAGS.__flags.keys():
      tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
    tf.logging.info("**************************")

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  # Appended last so the training statistics below can read it back as
  # training_hooks[-1].
  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    start_index = 0
    end_index = len(train_examples)
    tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

    if FLAGS.horovod:
      # One record file per rank; the first `remainder` ranks take one extra
      # example so that every example is assigned exactly once.
      tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
      num_examples_per_rank = len(train_examples) // hvd.size()
      remainder = len(train_examples) % hvd.size()
      if hvd.rank() < remainder:
        start_index = hvd.rank() * (num_examples_per_rank+1)
        end_index = start_index + num_examples_per_rank + 1
      else:
        start_index = hvd.rank() * num_examples_per_rank + remainder
        end_index = start_index + (num_examples_per_rank)

  model_fn = model_fn_builder(
      task_name=task_name,
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:
    file_based_convert_examples_to_features(
        train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
    tf.logging.info("***** Running training *****")
    tf.logging.info(" Num examples = %d", len(train_examples))
    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info(" Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=tmp_filenames,
        batch_size=FLAGS.train_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        hvd=None if not FLAGS.horovod else hvd)

    train_start_time = time.time()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
    train_time_elapsed = time.time() - train_start_time
    train_hook = training_hooks[-1]
    train_time_wo_overhead = train_hook.total_time
    measured_sentences = (num_train_steps - train_hook.skipped) * global_batch_size
    # Guarded divisions: the hook may have recorded no time at all when every
    # step was skipped.
    avg_sentences_per_second = _sentences_per_second(
        num_train_steps * global_batch_size, train_time_elapsed)
    ss_sentences_per_second = _sentences_per_second(
        measured_sentences, train_time_wo_overhead)

    if master_process:
      tf.logging.info("-----------------------------")
      tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                      num_train_steps * global_batch_size)
      tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                      measured_sentences)
      tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
      tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
      tf.logging.info("-----------------------------")

  if FLAGS.do_eval and master_process:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info(" Num examples = %d", len(eval_examples))
    tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

    eval_drop_remainder = False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        batch_size=FLAGS.eval_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
    eval_start_time = time.time()
    result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)
    eval_time_elapsed = time.time() - eval_start_time

    _log_inference_stats(
        eval_hooks[-1], FLAGS.eval_batch_size, eval_time_elapsed, "EVAL set")

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info(" %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict and master_process:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info(" Num examples = %d", len(predict_examples))
    tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        batch_size=FLAGS.predict_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
    predict_start_time = time.time()

    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
      tf.logging.info("***** Predict results *****")
      for prediction in estimator.predict(input_fn=predict_input_fn, hooks=predict_hooks,
                                          yield_single_examples=False):
        output_line = "\t".join(
            str(class_probability) for class_probability in prediction) + "\n"
        writer.write(output_line)

    predict_time_elapsed = time.time() - predict_start_time

    _log_inference_stats(
        predict_hooks[-1], FLAGS.predict_batch_size, predict_time_elapsed, "TEST SET")
if __name__ == "__main__":
  # Register the mandatory flags so a missing one is reported at flag-parsing
  # time, then hand control to the TensorFlow app runner.
  for required_flag in ("data_dir", "task_name", "vocab_file",
                        "bert_config_file", "output_dir"):
    flags.mark_flag_as_required(required_flag)
  tf.app.run()
@@ -0,0 +1,314 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner with TF-Hub."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import optimization
import run_classifier
import tokenization
import tensorflow as tf
import tensorflow_hub as hub
# Shorthand for the TensorFlow flags module and its parsed-values object.
flags = tf.flags
FLAGS = flags.FLAGS
# The only flag defined by this script; every other FLAGS.* value read below
# has to be registered elsewhere (presumably by the imported `run_classifier`
# module -- confirm).
flags.DEFINE_string(
    "bert_hub_module_handle", None,
    "Handle for the BERT TF-Hub module.")
def create_model(is_training, input_ids, input_mask, segment_ids, labels,
                 num_labels, bert_hub_module_handle):
  """Build the classification head on top of a BERT TF-Hub module.

  Args:
    is_training: Whether the graph is built for training; selects the
      "train" module tag and enables dropout on the pooled output.
    input_ids: Token-id tensor fed to the module.
    input_mask: Attention-mask tensor fed to the module.
    segment_ids: Segment-id tensor fed to the module.
    labels: Integer class labels, one per example.
    num_labels: Number of output classes.
    bert_hub_module_handle: Handle of the TF-Hub module to load.

  Returns:
    Tuple `(loss, per_example_loss, logits, probabilities)`.
  """
  module_tags = {"train"} if is_training else set()
  encoder = hub.Module(bert_hub_module_handle, tags=module_tags, trainable=True)
  encoder_outputs = encoder(
      inputs={
          "input_ids": input_ids,
          "input_mask": input_mask,
          "segment_ids": segment_ids,
      },
      signature="tokens",
      as_dict=True)

  # Whole-segment classification uses the pooled output; a token-level task
  # would read encoder_outputs["sequence_output"] instead.
  pooled = encoder_outputs["pooled_output"]
  width = pooled.shape[-1].value

  weights = tf.get_variable(
      "output_weights", [num_labels, width],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # keep_prob=0.9 corresponds to a dropout rate of 0.1.
      pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    projected = tf.matmul(pooled, weights, transpose_b=True)
    logits = tf.nn.bias_add(projected, bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Cross-entropy against the one-hot encoded labels.
    targets = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(targets * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, logits, probabilities)
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps, use_tpu, bert_hub_module_handle):
  """Return a `model_fn` closure suitable for `TPUEstimator`.

  Args:
    num_labels: Number of output classes.
    learning_rate: Learning rate handed to the optimizer factory.
    num_train_steps: Total number of training steps.
    num_warmup_steps: Number of warm-up steps.
    use_tpu: Forwarded to `optimization.create_optimizer`.
    bert_hub_module_handle: Handle of the BERT TF-Hub module.

  Returns:
    A function `(features, labels, mode, params) -> TPUEstimatorSpec`.
  """

  def _eval_metrics(per_example_loss, label_ids, logits):
    """Compute streaming accuracy and mean loss for evaluation."""
    predicted = tf.argmax(logits, axis=-1, output_type=tf.int32)
    return {
        "eval_accuracy": tf.metrics.accuracy(label_ids, predicted),
        "eval_loss": tf.metrics.mean(per_example_loss),
    }

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Build the graph for `mode` and wrap it in a `TPUEstimatorSpec`."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    label_ids = features["label_ids"]
    modes = tf.estimator.ModeKeys

    (total_loss, per_example_loss, logits, probabilities) = create_model(
        mode == modes.TRAIN,
        features["input_ids"],
        features["input_mask"],
        features["segment_ids"],
        label_ids,
        num_labels,
        bert_hub_module_handle)

    if mode == modes.TRAIN:
      # NOTE(review): `use_tpu` is passed as the fifth positional argument;
      # another caller of optimization.create_optimizer in this code base
      # passes a Horovod handle in that position -- confirm the signature.
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op)

    if mode == modes.EVAL:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(_eval_metrics, [per_example_loss, label_ids, logits]))

    if mode == modes.PREDICT:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, predictions={"probabilities": probabilities})

    raise ValueError(
        "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode))

  return model_fn
def create_tokenizer_from_hub_module(bert_hub_module_handle):
  """Build a `FullTokenizer` from the vocab and casing stored in a Hub module.

  The module's "tokenization_info" signature is evaluated in a throw-away
  graph and session so the caller's default graph is left untouched.

  Args:
    bert_hub_module_handle: Handle of the BERT TF-Hub module.

  Returns:
    A `tokenization.FullTokenizer` configured with the module's vocab file and
    lower-casing setting.
  """
  with tf.Graph().as_default():
    module = hub.Module(bert_hub_module_handle)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)
def main(_):
  """Fine-tune, evaluate and/or predict with a BERT TF-Hub classifier.

  Driven by `FLAGS`: the selected task processor loads the examples, the
  tokenizer is derived from the Hub module, and a `TPUEstimator` is run in the
  modes enabled through `do_train`, `do_eval` and `do_predict`.

  Raises:
    ValueError: If no run mode is enabled or `task_name` is unknown.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "cola": run_classifier.ColaProcessor,
      "mnli": run_classifier.MnliProcessor,
      "mrpc": run_classifier.MrpcProcessor,
  }

  # Prediction is a supported mode below, so a predict-only run is accepted.
  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      num_labels=len(label_list),
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      bert_hub_module_handle=FLAGS.bert_hub_module_handle)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    train_features = run_classifier.convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer)
    tf.logging.info("***** Running training *****")
    tf.logging.info(" Num examples = %d", len(train_examples))
    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info(" Num steps = %d", num_train_steps)
    # The input-fn builders take the batch size as an explicit argument, so it
    # must be supplied here.
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        batch_size=FLAGS.train_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    eval_features = run_classifier.convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info(" Num examples = %d", len(eval_examples))
    tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      # Eval will be slightly WRONG on the TPU because it will truncate
      # the last batch.
      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = run_classifier.input_fn_builder(
        features=eval_features,
        batch_size=FLAGS.eval_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info(" %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    if FLAGS.use_tpu:
      # Discard batch remainder if running on TPU
      n = len(predict_examples)
      predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)]

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    run_classifier.file_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info(" Num examples = %d", len(predict_examples))
    tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = run_classifier.file_based_input_fn_builder(
        input_file=predict_file,
        batch_size=FLAGS.predict_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=FLAGS.use_tpu)

    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
      tf.logging.info("***** Predict results *****")
      for prediction in result:
        probabilities = prediction["probabilities"]
        output_line = "\t".join(
            str(class_probability)
            for class_probability in probabilities) + "\n"
        writer.write(output_line)
if __name__ == "__main__":
  # Register the mandatory flags so a missing one is reported at flag-parsing
  # time, then hand control to the TensorFlow app runner.
  for required_flag in ("data_dir", "task_name", "bert_hub_module_handle",
                        "output_dir"):
    flags.mark_flag_as_required(required_flag)
  tf.app.run()
@@ -0,0 +1,871 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
Copyright 2018 The Google AI Language Team Authors.
BASED ON Google_BERT.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os, sys
import pickle
import tensorflow as tf
import numpy as np
sys.path.append("/workspace/bert")
from biobert.conlleval import evaluate, report_notprint
import modeling
import optimization
import tokenization
import tf_metrics
import time
import horovod.tensorflow as hvd
from utils.utils import LogEvalRunHook, LogTrainRunHook
# Shorthand for the TensorFlow flags module and its parsed-values object.
flags = tf.flags
FLAGS = flags.FLAGS

# --- Task, data locations and model files ---------------------------------
flags.DEFINE_string(
    "task_name", "NER", "The name of the task to train."
)
flags.DEFINE_string(
    "data_dir", None,
    "The input datadir.",
)
flags.DEFINE_string(
    "output_dir", None,
    "The output directory where the model checkpoints will be written."
)
flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model."
)
flags.DEFINE_string(
    "vocab_file", None,
    "The vocabulary file that the BERT model was trained on.")
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model)."
)

# --- Tokenization ----------------------------------------------------------
flags.DEFINE_bool(
    "do_lower_case", False,
    "Whether to lower case the input text."
)
flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization."
)

# --- Run modes -------------------------------------------------------------
flags.DEFINE_bool(
    "do_train", False,
    "Whether to run training."
)
flags.DEFINE_bool(
    "do_eval", False,
    "Whether to run eval on the dev set.")
flags.DEFINE_bool(
    "do_predict", False,
    "Whether to run the model in inference mode on the test set.")

# --- Batch sizes -----------------------------------------------------------
flags.DEFINE_integer(
    "train_batch_size", 64,
    "Total batch size for training.")
flags.DEFINE_integer(
    "eval_batch_size", 16,
    "Total batch size for eval.")
flags.DEFINE_integer(
    "predict_batch_size", 16,
    "Total batch size for predict.")

# --- Optimisation schedule -------------------------------------------------
flags.DEFINE_float(
    "learning_rate", 5e-6,
    "The initial learning rate for Adam.")
flags.DEFINE_float(
    "num_train_epochs", 10.0,
    "Total number of training epochs to perform.")
flags.DEFINE_float(
    "warmup_proportion", 0.1,
    "Proportion of training to perform linear learning rate warmup for. "
    "E.g., 0.1 = 10% of training.")

# --- Checkpointing and estimator loop --------------------------------------
flags.DEFINE_integer(
    "save_checkpoints_steps", 1000,
    "How often to save the model checkpoint.")
flags.DEFINE_integer(
    "iterations_per_loop", 1000,
    "How many steps to make in each estimator call.")

# --- Runtime / hardware options --------------------------------------------
# `tf.flags` is the same module object bound to `flags` above.
tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
class InputExample(object):
    """One raw example: an id, its text and (optionally) its label string."""

    def __init__(self, guid, text, label=None):
        """Store the fields of a single example.

        Args:
          guid: Unique id for the example.
          text: string. The untokenized text of the example.
          label: (Optional) string. The label of the example. Callers omit it
            when no label is available.
        """
        self.guid, self.text, self.label = guid, text, label
class InputFeatures(object):
    """Container for the four model-ready lists produced for one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids, ):
        """Keep references to the supplied feature lists, unmodified."""
        (self.input_ids,
         self.input_mask,
         self.segment_ids,
         self.label_ids) = (input_ids, input_mask, segment_ids, label_ids)
class DataProcessor(object):
    """Base class for data converters for sequence labelling data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @staticmethod
    def _split_sentence(words, labels, max_len=30):
        """Turn one sentence into `[label_string, word_string]` chunks.

        While more than `max_len` tokens remain, a chunk is cut right after
        the last token labelled 'O' among the first `max_len` tokens. When
        there is no such token (or it is the very first one) a single token is
        cut off. Whatever remains forms the final chunk.

        Args:
          words: Tokens of the sentence.
          labels: Labels aligned one-to-one with `words`.
          max_len: Longest sentence that is kept in one piece.

        Returns:
          List of `[labels joined by spaces, words joined by spaces]` pairs;
          empty when the sentence is empty.
        """
        assert len(words) == len(labels)
        chunks = []
        while len(words) > max_len:
            cut = 1
            # Scan backwards for the last 'O' inside the window.
            for idx in range(max_len - 1, -1, -1):
                if labels[idx] == 'O':
                    cut = idx + 1
                    break
            chunks.append([
                ' '.join([label for label in labels[:cut] if len(label) > 0]),
                ' '.join([word for word in words[:cut] if len(word) > 0]),
            ])
            words = words[cut:]
            labels = labels[cut:]
        if words:
            chunks.append([
                ' '.join([label for label in labels if len(label) > 0]),
                ' '.join([word for word in words if len(word) > 0]),
            ])
        return chunks

    @classmethod
    def _read_data(cls, input_file):
        """Read a blank-line separated BIO file into sentence chunks.

        Each non-blank line contributes its first whitespace-separated field
        as the word and its last field as the label. Sentences longer than 30
        tokens are split by `_split_sentence`.

        Args:
          input_file: Path of the file to read.

        Returns:
          List of `[label_string, word_string]` pairs.
        """
        lines = []
        words = []
        labels = []
        with tf.gfile.Open(input_file, "r") as f:
            for line in f:
                fields = line.strip().split()
                if fields:
                    words.append(fields[0])
                    labels.append(fields[-1])
                    continue
                # Blank line: the current sentence is complete.
                lines.extend(cls._split_sentence(words, labels))
                words = []
                labels = []
        # Emit the last sentence when the file does not end with a blank line;
        # it used to be dropped silently.
        lines.extend(cls._split_sentence(words, labels))
        return lines
class BC5CDRProcessor(DataProcessor):
    """Processor for data stored as train.tsv / devel.tsv / test.tsv."""

    def get_train_examples(self, data_dir):
        """Return training examples built from train.tsv followed by devel.tsv."""
        rows = []
        for split_file in ("train.tsv", "devel.tsv"):
            rows = rows + self._read_data(os.path.join(data_dir, split_file))
        return self._create_example(rows, "train")

    def get_dev_examples(self, data_dir, file_name="devel.tsv"):
        """Return dev examples read from `file_name` inside `data_dir`."""
        rows = self._read_data(os.path.join(data_dir, file_name))
        return self._create_example(rows, "dev")

    def get_test_examples(self, data_dir, file_name="test.tsv"):
        """Return test examples read from `file_name` inside `data_dir`."""
        rows = self._read_data(os.path.join(data_dir, file_name))
        return self._create_example(rows, "test")

    def get_labels(self):
        """Return the label vocabulary used by this processor."""
        return ["B", "I", "O", "X", "[CLS]", "[SEP]"]

    def _create_example(self, lines, set_type):
        """Wrap `[label_string, word_string]` rows into `InputExample`s.

        The guid of each example is "<set_type>-<row index>".
        """
        return [
            InputExample(
                guid="%s-%s" % (set_type, idx),
                text=tokenization.convert_to_unicode(row[1]),
                label=tokenization.convert_to_unicode(row[0]))
            for idx, row in enumerate(lines)
        ]
class CLEFEProcessor(DataProcessor):
    """Processor for data stored as Training.tsv / Development.tsv / Test.tsv."""

    def get_train_examples(self, data_dir):
        """Return training examples from Training.tsv followed by Development.tsv."""
        lines1 = self._read_data2(os.path.join(data_dir, "Training.tsv"))
        lines2 = self._read_data2(os.path.join(data_dir, "Development.tsv"))
        return self._create_example(
            lines1 + lines2, "train"
        )

    def get_dev_examples(self, data_dir, file_name="Development.tsv"):
        """Return dev examples read from `file_name` inside `data_dir`."""
        return self._create_example(
            self._read_data2(os.path.join(data_dir, file_name)), "dev"
        )

    def get_test_examples(self, data_dir, file_name="Test.tsv"):
        """Return test examples read from `file_name` inside `data_dir`."""
        return self._create_example(
            self._read_data2(os.path.join(data_dir, file_name)), "test")

    def get_labels(self):
        """Return the label vocabulary used by this processor."""
        return ["B", "I", "O", "X", "[CLS]", "[SEP]"]

    def _create_example(self, lines, set_type):
        """Wrap `[label_string, word_string]` rows into `InputExample`s.

        The guid of each example is "<set_type>-<row index>".
        """
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
            examples.append(InputExample(guid=guid, text=text, label=label))
        return examples

    @classmethod
    def _read_data2(cls, input_file):
        """Read a blank-line separated BIO file, skipping '###' lines.

        Each remaining non-blank line contributes its first
        whitespace-separated field as the word and its last field as the
        label. Unlike `_read_data`, long sentences are not split.

        Args:
          input_file: Path of the file to read.

        Returns:
          List of `[label_string, word_string]` pairs, one per sentence.
        """
        lines = []
        words = []
        labels = []

        def flush():
            """Append the pending sentence, if any, to `lines`."""
            assert len(words) == len(labels)
            if len(words) == 0:
                return
            l = ' '.join([label for label in labels if len(label) > 0])
            w = ' '.join([word for word in words if len(word) > 0])
            lines.append([l, w])

        with tf.gfile.Open(input_file, "r") as f:
            for line in f:
                contends = line.strip()
                if len(contends) == 0:
                    flush()
                    words = []
                    labels = []
                    continue
                if contends.startswith('###'):
                    continue
                fields = contends.split()
                words.append(fields[0])
                labels.append(fields[-1])
        # Emit the last sentence when the file does not end with a blank line;
        # it used to be dropped silently.
        flush()
        return lines
class I2b22012Processor(CLEFEProcessor):
    """CLEFE-style processor with a typed B-/I- label vocabulary."""

    def get_labels(self):
        """Return all B-<type> labels, then all I-<type> labels, then the specials."""
        entity_types = ['CLINICAL_DEPT', 'EVIDENTIAL', 'OCCURRENCE',
                        'PROBLEM', 'TEST', 'TREATMENT']
        begin_labels = ['B-' + entity for entity in entity_types]
        inside_labels = ['I-' + entity for entity in entity_types]
        return begin_labels + inside_labels + ["O", "X", "[CLS]", "[SEP]"]
def write_tokens(tokens, labels, mode):
    """Append `token label` lines for the test split to a file in output_dir.

    Nothing happens unless `mode` is "test". Padding tokens ("**NULL**") are
    skipped. The file is `token_test.txt` under `FLAGS.output_dir`; it is
    appended to when it already exists and created otherwise.

    Args:
      tokens: Token strings of one example.
      labels: Labels aligned with `tokens`; each is written via `str()`.
      mode: Name of the data split being processed.
    """
    if mode != "test":
        return
    path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
    open_mode = 'a' if tf.gfile.Exists(path) else 'w'
    # The context manager closes the file even when a write fails.
    with tf.gfile.Open(path, open_mode) as wf:
        for token, label in zip(tokens, labels):
            if token != "**NULL**":
                wf.write(token + ' ' + str(label) + '\n')
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
    """Convert one `InputExample` into padded `InputFeatures`.

    Args:
      ex_index: Position of the example; the first five are logged.
      example: Object with space-separated `text` and `label` strings and a
        `guid`.
      label_list: Label vocabulary; ids are assigned starting at 1.
      max_seq_length: Length every output list is padded to.
      tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.
      mode: Not used by this function.

    Returns:
      An `InputFeatures` whose four lists all have length `max_seq_length`.
    """
    # Ids start at 1; 0 is what the padding positions receive below.
    label_map = {label: idx for (idx, label) in enumerate(label_list, 1)}

    # Persist the mapping once, the first time an example is converted.
    label2id_file = os.path.join(FLAGS.output_dir, 'label2id.pkl')
    if not tf.gfile.Exists(label2id_file):
        with tf.gfile.Open(label2id_file, 'wb') as w:
            pickle.dump(label_map, w)

    words = example.text.split(' ')
    word_labels = example.label.split(' ')

    # The first word piece keeps the word's label, the others get "X".
    tokens = []
    labels = []
    for position, word in enumerate(words):
        pieces = tokenizer.tokenize(word)
        tokens.extend(pieces)
        word_label = word_labels[position]
        for offset in range(len(pieces)):
            labels.append(word_label if offset == 0 else "X")

    # Leave room for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    if len(tokens) > budget:
        tokens = tokens[0:budget]
        labels = labels[0:budget]

    ntokens = ["[CLS]"] + list(tokens) + ["[SEP]"]
    segment_ids = [0] * len(ntokens)
    label_ids = (
        [label_map["[CLS]"]]
        + [label_map[tag] for tag in labels]
        + [label_map["[SEP]"]]
    )

    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad every list up to max_seq_length.
    padding = max_seq_length - len(input_ids)
    if padding > 0:
        input_ids.extend([0] * padding)
        input_mask.extend([0] * padding)
        segment_ids.extend([0] * padding)
        label_ids.extend([0] * padding)
        ntokens.extend(["**NULL**"] * padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length

    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        for caption, values in (("input_ids", input_ids),
                                ("input_mask", input_mask),
                                ("segment_ids", segment_ids),
                                ("label_ids", label_ids)):
            tf.logging.info(caption + ": %s" % " ".join([str(x) for x in values]))

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
    )
def filed_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file, mode=None):
    """Convert examples to features and write them to a TFRecord file.

    Every example is converted with `convert_single_example` and serialized as
    a `tf.train.Example` holding four int64 lists: `input_ids`, `input_mask`,
    `segment_ids` and `label_ids`.

    Args:
        examples: Sequence of examples accepted by `convert_single_example`.
        label_list: Forwarded unchanged to `convert_single_example`.
        max_seq_length: Forwarded unchanged to `convert_single_example`.
        tokenizer: Forwarded unchanged to `convert_single_example`.
        output_file: Path of the TFRecord file to create.
        mode: Forwarded unchanged to `convert_single_example`.
    """

    def create_int_feature(values):
        # Wrap an iterable of ints as an int64 tf.train.Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    # Use the writer as a context manager so the records are flushed and the
    # file is closed even when converting an example raises.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 5000 == 0:
                tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
            feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer,
                                             mode)
            # Ordered so the serialized feature map has a stable key order.
            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            features["label_ids"] = create_int_feature(feature.label_ids)
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training, drop_remainder, hvd=None):
    """Create an Estimator `input_fn` that reads fixed-length TFRecords.

    Args:
        input_file: File name(s) handed to `tf.data.TFRecordDataset`.
        batch_size: Batch size used by `map_and_batch`.
        seq_length: Length of every int64 feature list in a record.
        is_training: When true the dataset is (optionally sharded,) repeated
            and shuffled with a buffer of 100 records.
        drop_remainder: Forwarded to `map_and_batch`.
        hvd: Optional Horovod-like module; when given during training the
            dataset is sharded by `hvd.size()` / `hvd.rank()`.

    Returns:
        A function `input_fn(params)` returning the batched dataset.
    """
    name_to_features = {
        key: tf.FixedLenFeature([seq_length], tf.int64)
        for key in ("input_ids", "input_mask", "segment_ids", "label_ids")
    }

    def _decode_record(record, name_to_features):
        # Parse one serialized example and down-cast every int64 tensor to int32.
        parsed = tf.parse_single_example(record, name_to_features)
        for key in list(parsed.keys()):
            tensor = parsed[key]
            parsed[key] = tf.to_int32(tensor) if tensor.dtype == tf.int64 else tensor
        return parsed

    def input_fn(params):
        # `params` is accepted for the Estimator contract; the batch size comes
        # from the enclosing builder instead.
        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            if hvd is not None:
                dataset = dataset.shard(hvd.size(), hvd.rank())
            dataset = dataset.repeat()
            dataset = dataset.shuffle(buffer_size=100)
        parse_and_batch = tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder)
        return dataset.apply(parse_and_batch)

    return input_fn
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, labels, num_labels, use_one_hot_embeddings):
    """Build the BERT token-classification head and its loss.

    A linear layer (`output_weights`, `output_bias`) is applied to every
    position of BERT's sequence output; the loss is the mean softmax
    cross-entropy over all positions.

    Returns:
        Tuple `(loss, per_example_loss, logits, predict)` where `logits` is
        reshaped to `[-1, FLAGS.max_seq_length, num_labels]`,
        `per_example_loss` is the cross-entropy summed over the label axis,
        and `predict` is the argmax over the label axis.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)
    sequence_output = bert.get_sequence_output()
    hidden_size = sequence_output.shape[-1].value

    # Classifier parameters are created outside the "loss" scope.
    output_weight = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            sequence_output = tf.nn.dropout(sequence_output, keep_prob=0.9)
        # Flatten all positions so a single matmul scores every token.
        flat_output = tf.reshape(sequence_output, [-1, hidden_size])
        flat_logits = tf.matmul(flat_output, output_weight, transpose_b=True)
        flat_logits = tf.nn.bias_add(flat_logits, output_bias)
        logits = tf.reshape(flat_logits, [-1, FLAGS.max_seq_length, num_labels])

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        probabilities = tf.nn.softmax(logits, axis=-1)
        predict = tf.argmax(probabilities, axis=-1)
    return (loss, per_example_loss, logits, predict)
def model_fn_builder(bert_config, num_labels, init_checkpoint=None, learning_rate=None,
                     num_train_steps=None, num_warmup_steps=None,
                     use_one_hot_embeddings=False, hvd=None, use_fp16=False):
    """Return a `model_fn` closure for `tf.estimator.Estimator` (NER task).

    Args:
        bert_config: BERT configuration passed to `create_model`.
        num_labels: Number of output classes of the token classifier.
        init_checkpoint: Optional checkpoint used to initialize variables.
            With Horovod only rank 0 loads it.
        learning_rate: Forwarded to `optimization.create_optimizer`.
        num_train_steps: Forwarded to `optimization.create_optimizer`.
        num_warmup_steps: Forwarded to `optimization.create_optimizer`.
        use_one_hot_embeddings: Forwarded to `create_model`.
        hvd: Optional Horovod module, or None for single-process runs.
        use_fp16: Forwarded to `optimization.create_optimizer`.
    """

    def model_fn(features, labels, mode, params):
        # `labels` and `params` are part of the Estimator signature; the label
        # ids are read from `features` instead.
        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits, predicts) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        if init_checkpoint and (hvd is None or hvd.rank() == 0):
            (assignment_map,
             initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            # One call is enough: it only rewires the variable initializers.
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, logits):
                # Macro-averaged precision/recall/F1 restricted to label ids 1 and 2.
                # NOTE(review): presumably those ids are the entity tags of the
                # label map - confirm against the processor's label list.
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                precision = tf_metrics.precision(label_ids, predictions, num_labels, [1, 2], average="macro")
                recall = tf_metrics.recall(label_ids, predictions, num_labels, [1, 2], average="macro")
                f = tf_metrics.f1(label_ids, predictions, num_labels, [1, 2], average="macro")
                return {
                    "eval_precision": precision,
                    "eval_recall": recall,
                    "eval_f": f,
                }

            eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metric_ops)
        else:
            # PREDICT: expose the argmax label ids directly.
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode, predictions=predicts)
        return output_spec

    return model_fn
def result_to_pair(predict_line, pred_ids, id2label, writer, err_writer):
    """Write one sentence as `token gold predicted` lines followed by a blank line.

    Predicted ids are decoded with `id2label`; id 0, `[CLS]` and `X` are
    skipped and decoding stops at `[SEP]`. If the number of decoded labels
    differs from the number of words, the example is reported to
    `err_writer` and the predictions are truncated or padded with `O`.

    Args:
        predict_line: Object with `guid`, `text` and `label` attributes; `text`
            and `label` are space-separated and must have equal length.
        pred_ids: Iterable of predicted label ids for the sentence.
        id2label: Mapping from label id to label string.
        writer: File-like object receiving the aligned output.
        err_writer: File-like object receiving mismatch reports.
    """
    words = str(predict_line.text).split(' ')
    gold_labels = str(predict_line.label).split(' ')
    if len(words) != len(gold_labels):
        tf.logging.error('Text and label not equal')
        tf.logging.error(predict_line.text)
        tf.logging.error(predict_line.label)
        exit(1)

    # Decode the span between [CLS] and [SEP], dropping padding and sub-token marks.
    pred_labels = []
    for label_id in pred_ids:
        if label_id == 0:
            continue
        label_name = id2label[label_id]
        if label_name == '[SEP]':
            break
        if label_name in ('[CLS]', 'X'):
            continue
        pred_labels.append(label_name)

    missing = len(words) - len(pred_labels)
    if missing != 0:
        # Record the offending example before repairing the length mismatch.
        err_writer.write(predict_line.guid + '\n')
        err_writer.write(predict_line.text + '\n')
        err_writer.write(predict_line.label + '\n')
        err_writer.write(' '.join([str(i) for i in pred_ids]) + '\n')
        err_writer.write(' '.join([id2label.get(i, '**NULL**') for i in pred_ids]) + '\n\n')
        if missing < 0:
            pred_labels = pred_labels[:len(words)]
        else:
            pred_labels += ['O'] * missing

    for tok, label, pred_label in zip(words, gold_labels, pred_labels):
        writer.write(' '.join((tok, label, pred_label)) + '\n')
    writer.write('\n')
def main(_):
    """Train, evaluate and/or run prediction for a BERT NER task.

    The work done is selected by `FLAGS.do_train`, `FLAGS.do_eval` and
    `FLAGS.do_predict`; at least one of them must be set. Evaluation and
    prediction only run on the master process (Horovod rank 0, or the only
    process when Horovod is off).

    Raises:
        ValueError: If no mode flag is set, if `FLAGS.max_seq_length` exceeds
            the model's `max_position_embeddings`, or if `FLAGS.task_name` is
            not a known task.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    processors = {
        "bc5cdr": BC5CDRProcessor,
        "clefe": CLEFEProcessor,
        'i2b2': I2b22012Processor
    }
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        # The message names all three flags that the condition checks.
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.gfile.MakeDirs(FLAGS.output_dir)
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    def rate(count, seconds):
        # Throughput helper; a hook that recorded no time yields 0 instead of
        # a ZeroDivisionError.
        return count * 1.0 / seconds if seconds > 0 else 0.0

    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    # Only the master process writes checkpoints into the output directory.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.logging.info("***** Configuaration *****")
        for key in FLAGS.__flags.keys():
            tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
        tf.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # Must stay the last training hook: it is read back as training_hooks[-1].
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            # Every rank converts a contiguous slice of the examples into its
            # own record file; the first `remainder` ranks take one extra.
            tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list) + 1,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.horovod else hvd,
        use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config)

    if FLAGS.do_train:
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num examples = %d", len(train_examples))
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = rate(
            (num_train_steps - training_hooks[-1].skipped) * global_batch_size, train_time_wo_overhead)

        if master_process:
            tf.logging.info("-----------------------------")
            tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                            num_train_steps * global_batch_size)
            tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                            (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
            tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
            tf.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(
            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Num examples = %d", len(eval_examples))
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.Open(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length, tokenizer,
                                                 predict_file, mode="test")

        # NOTE(security): pickle.load can execute arbitrary code. The file is
        # presumably written by this script's own feature conversion - make
        # sure output_dir is not writable by untrusted parties.
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
            label2id = pickle.load(rf)
        id2label = {value: key for key, value in label2id.items()}

        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
        if tf.gfile.Exists(token_path):
            tf.gfile.Remove(token_path)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info(" Num examples = %d", len(predict_examples))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
        test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
        test_labels_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt")
        with tf.gfile.Open(output_predict_file, 'w') as writer, \
                tf.gfile.Open(test_labels_file, 'w') as tl, \
                tf.gfile.Open(test_labels_err_file, 'w') as tle:
            print(id2label)
            predictions = estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks,
                                            yield_single_examples=True)
            for index, prediction in enumerate(predictions):
                # One decoded label per line; id 0 is skipped.
                output_line = "\n".join(id2label[label_id] for label_id in prediction if label_id != 0) + "\n"
                writer.write(output_line)
                result_to_pair(predict_examples[index], prediction, id2label, tl, tle)

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()

        def latency_at(fraction):
            # time_list is sorted, so the largest of its first k entries is
            # entry k - 1. k is clamped to at least 1 so runs with very few
            # timed batches report a value instead of failing on an empty slice.
            if not time_list:
                return 0.0
            return time_list[max(int(len(time_list) * fraction), 1) - 1]

        num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size

        avg = np.mean(time_list) if time_list else 0.0
        cf_50 = latency_at(0.50)
        cf_90 = latency_at(0.90)
        cf_95 = latency_at(0.95)
        cf_99 = latency_at(0.99)
        cf_100 = latency_at(1)
        ss_sentences_per_second = rate(num_sentences, eval_time_wo_overhead)

        tf.logging.info("-----------------------------")
        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                        eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                        (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
        tf.logging.info("Summary Inference Statistics")
        tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
        tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
        tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
        tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
        tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
        tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
        tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        tf.logging.info("-----------------------------")

        # Score the aligned token/gold/predicted file written above.
        tf.logging.info('Reading: %s', test_labels_file)
        with tf.gfile.Open(test_labels_file, "r") as f:
            counts = evaluate(f)
        eval_result = report_notprint(counts)
        print(''.join(eval_result))
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
            fd.write(''.join(eval_result))
if __name__ == "__main__":
    # These flags have no default value, so the script refuses to start
    # unless each of them is supplied on the command line.
    for required_flag in ("data_dir", "task_name", "vocab_file",
                          "bert_config_file", "output_dir"):
        flags.mark_flag_as_required(required_flag)
    tf.app.run()
@@ -0,0 +1,818 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run masked LM/next sentence masked_lm pre-training for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import modeling
import optimization
import tensorflow as tf
import glob
from utils.utils import LogEvalRunHook
from tensorflow.core.protobuf import rewriter_config_pb2
from gpu_environment import get_custom_getter
from npu_bridge.estimator.npu.npu_config import *
from npu_bridge.estimator.npu.npu_estimator import *
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
import sys
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
# import hwlog
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# Process-wide environment switches set at import time, before any session
# is created. NOTE(review): presumably read by the NPU runtime behind
# npu_bridge - confirm each key against the platform documentation.
os.environ.update({
    'WHICH_OP': 'GEOP',
    'NEW_GE_FE_ID': '1',
    'GE_AICPU_FLAG': '1',
    'GE_USE_STATIC_MEMORY': '1',
    'OPTION_EXEC_HCCL_FLAG': '1',
    'HCCL_CONNECT_TIMEOUT': '600',
})
# Command-line flags of the pre-training script. Everything below only
# registers flags; values are read through FLAGS after tf.app.run() parses argv.
flags = tf.flags
FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")
flags.DEFINE_string(
    "input_files_dir", None,
    "Directory with input files, comma separated or single directory.")
flags.DEFINE_string(
    "eval_files_dir", None,
    "Directory with eval files, comma separated or single directory. ")
flags.DEFINE_string(
    "output_dir", None,
    "The output directory where the model checkpoints will be written.")

## Other parameters
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_string(
    "optimizer_type", "lamb",
    "Optimizer used for training - LAMB or ADAM")
flags.DEFINE_integer(
    "max_seq_length", 512,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded. Must match data generation.")
flags.DEFINE_integer(
    "max_predictions_per_seq", 80,
    "Maximum number of masked LM predictions per sequence. "
    "Must match data generation.")
flags.DEFINE_bool("do_train", False, "Whether to run training.")
flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.")
flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
flags.DEFINE_integer("save_checkpoints_steps", 1000,
                     "How often to save the model checkpoint.")
flags.DEFINE_integer("display_loss_steps", 10,
                     "How often to print loss")
flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")
flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
# The two help sentences are separate string literals; the trailing space
# keeps them from running together in --help output.
flags.DEFINE_integer("num_accumulation_steps", 1,
                     "Number of accumulation steps before gradient update. "
                     "Global batch size = num_accumulation_steps * train_batch_size")
flags.DEFINE_bool("allreduce_post_accumulation", False,
                  "Whether to all reduce after accumulation of N steps or after each step")
flags.DEFINE_bool(
    "verbose_logging", False,
    "If true, all of the trainable parameters are printed")
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
flags.DEFINE_bool("manual_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU. "
                                        "Manual casting is done instead of using AMP")
flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
flags.DEFINE_bool("use_fp16_cls", False, "Whether to use fp16 in cls and pooler.")
flags.DEFINE_bool("distributed", False, "Whether to use multi-npu")
flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')
flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')
flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')
flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")
flags.DEFINE_integer("npu_bert_loss_scale", -1,
                     "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")
flags.DEFINE_bool("npu_bert_clip_by_global_norm", True,
                  "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")
# Help text corrected: this flag is about dropout, not gelu.
flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined dropout op')
flags.DEFINE_bool('npu_bert_tail_optimize', False, 'Whether to use npu allreduce tail optimization')
flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')
flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')
flags.DEFINE_integer('init_loss_scale_value', 2 ** 32, 'Initial loss scale value for loss scale optimizer')
flags.DEFINE_bool('npu_bert_use_fused_batch_norm', False,
                  'Whether to use fused batch norm implementation in fused_layer_norm')
flags.DEFINE_bool('npu_bert_use_fused_adam_momentum', True, 'Whether to use fused apply and assign in adam')
flags.DEFINE_integer('graph_memory_max_size', 27 * 1024 * 1024 * 1024, 'feature map memory max size')
flags.DEFINE_integer('variable_memory_max_size', 4 * 1024 * 1024 * 1024, 'variable memory max size')
# report samples/sec, total loss and learning rate during training
class _LogSessionRunHook(tf.train.SessionRunHook):
    """Session hook that prints throughput, losses and learning rate.

    On every `session.run` it fetches the named tensors `global_step`,
    `total_loss`, `learning_rate`, `nsp_loss` and `mlm_loss` (plus
    `update_step` when gradients are accumulated and `loss_scale` when
    dynamic loss scaling is active) and prints one line per update step.
    """

    def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
        self.global_batch_size = global_batch_size
        self.display_every = display_every
        # A negative rank means "do not print a rank prefix".
        self.hvd_rank = hvd_rank
        self.num_accumulation_steps = num_accumulation_steps

    def after_create_session(self, session, coord):
        # Start with empty accumulators for the first reporting window.
        self.elapsed_secs = 0.
        self.count = 0
        self.all_count = 0
        self.avg_loss = 0.0

    def _reports_loss_scale(self):
        # Dynamic loss scaling (flag value 0) combined with any fp16 mode adds
        # the `loss_scale` tensor to the fetch list.
        return (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16)

    def before_run(self, run_context):
        self.t0 = time.time()
        fetches = ['global_step:0']
        if self.num_accumulation_steps > 1:
            fetches.append('update_step:0')
        fetches += ['total_loss:0', 'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0']
        if self._reports_loss_scale():
            fetches.append('loss_scale:0')
        return tf.train.SessionRunArgs(fetches=fetches)

    def after_run(self, run_context, run_values):
        self.elapsed_secs += time.time() - self.t0

        # Unpack in the same order the fetches were requested in before_run.
        fetched = list(run_values.results)
        with_loss_scale = self._reports_loss_scale()
        loss_scaler = fetched.pop() if with_loss_scale else None
        if self.num_accumulation_steps <= 1:
            global_step, total_loss, lr, nsp_loss, mlm_loss = fetched
            update_step = True
        else:
            global_step, update_step, total_loss, lr, nsp_loss, mlm_loss = fetched

        print_step = global_step + 1  # One-based index for printing.
        self.avg_loss += total_loss
        self.all_count += 1
        if not update_step:
            return

        self.count += 1
        dt = self.elapsed_secs / self.count
        sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
        avg_loss_step = self.avg_loss / self.all_count

        message = (
            'Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
            (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
        if self.hvd_rank >= 0:
            message = 'Rank = %2d :: ' % self.hvd_rank + message
        if with_loss_scale:
            message += ' Loss scale = %6.4e' % loss_scaler
        print(message, flush=True)
        hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
        hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)

        # Begin a new reporting window.
        self.elapsed_secs = 0.
        self.count = 0
        self.avg_loss = 0.0
        self.all_count = 0
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps,
                     use_one_hot_embeddings, hvd=None):
    """Create the pre-training `model_fn` (masked LM + next sentence).

    Args:
        bert_config: BERT configuration for `modeling.BertModel`.
        init_checkpoint: Optional checkpoint to initialize from; with Horovod
            only rank 0 loads it.
        learning_rate: Forwarded to `optimization.create_optimizer`.
        num_train_steps: Forwarded to `optimization.create_optimizer`.
        num_warmup_steps: Forwarded to `optimization.create_optimizer`.
        use_one_hot_embeddings: Forwarded to `modeling.BertModel`.
        hvd: Optional Horovod module, or None.

    Returns:
        A `model_fn(features, labels, mode, params)` producing an
        `EstimatorSpec` for TRAIN and EVAL; other modes raise ValueError.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Build the graph for one Estimator mode."""
        tf.logging.info("*** Features ***")
        for feature_name in sorted(features.keys()):
            tf.logging.info(" name = %s, shape = %s" % (feature_name, features[feature_name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        training = (mode == tf.estimator.ModeKeys.TRAIN)
        bert = modeling.BertModel(
            config=bert_config,
            is_training=training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            compute_type=tf.float16 if FLAGS.manual_fp16 else tf.float32)

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs) = get_masked_lm_output(
             bert_config, bert.get_sequence_output(), bert.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)
        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
             bert_config, bert.get_pooled_output(), next_sentence_labels)

        # Named identities give the losses fixed tensor names that can be
        # fetched by name (e.g. 'mlm_loss:0').
        masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
        next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
        total_loss = tf.identity(masked_lm_loss + next_sentence_loss, name='total_loss')

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        if init_checkpoint and (hvd is None or hvd.rank() == 0):
            print("Loading checkpoint", init_checkpoint)
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if FLAGS.verbose_logging:
            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                from_ckpt = ", *INIT_FROM_CKPT*" if var.name in initialized_variable_names else ""
                tf.logging.info(" %d :: name = %s, shape = %s%s",
                                0 if hvd is None else hvd.rank(),
                                var.name, var.shape, from_ckpt)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type,
                FLAGS.allreduce_post_accumulation)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)

        if mode != tf.estimator.ModeKeys.EVAL:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                      masked_lm_weights, next_sentence_example_loss,
                      next_sentence_log_probs, next_sentence_labels):
            """Compute streaming accuracy and mean loss for both objectives."""
            # Masked LM: flatten everything and weight by the prediction mask.
            mlm_log_probs = tf.reshape(masked_lm_log_probs,
                                       [-1, masked_lm_log_probs.shape[-1]])
            mlm_predictions = tf.argmax(mlm_log_probs, axis=-1, output_type=tf.int32)
            mlm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            mlm_ids = tf.reshape(masked_lm_ids, [-1])
            mlm_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_accuracy = tf.metrics.accuracy(
                labels=mlm_ids,
                predictions=mlm_predictions,
                weights=mlm_weights)
            masked_lm_mean_loss = tf.metrics.mean(
                values=mlm_example_loss, weights=mlm_weights)

            # Next sentence: unweighted accuracy and mean loss.
            nsp_log_probs = tf.reshape(
                next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
            nsp_predictions = tf.argmax(nsp_log_probs, axis=-1, output_type=tf.int32)
            nsp_labels = tf.reshape(next_sentence_labels, [-1])
            next_sentence_accuracy = tf.metrics.accuracy(
                labels=nsp_labels, predictions=nsp_predictions)
            next_sentence_mean_loss = tf.metrics.mean(
                values=next_sentence_example_loss)

            return {
                "masked_lm_accuracy": masked_lm_accuracy,
                "masked_lm_loss": masked_lm_mean_loss,
                "next_sentence_accuracy": next_sentence_accuracy,
                "next_sentence_loss": next_sentence_mean_loss,
            }

        eval_metric_ops = metric_fn(
            masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
            masked_lm_weights, next_sentence_example_loss,
            next_sentence_log_probs, next_sentence_labels)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metric_ops=eval_metric_ops)

    return model_fn
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Builds the masked-LM prediction head on top of the encoder output.

    Args:
      bert_config: config object; `hidden_size`, `hidden_act`,
        `initializer_range` and `vocab_size` are read from it.
      input_tensor: encoder sequence output; `gather_indexes` picks the
        vectors at `positions` out of it.
      output_weights: embedding table, reused (transposed) as the output
        projection matrix.
      positions: positions of the tokens to predict.
      label_ids: target token ids; flattened before use.
      label_weights: per-prediction weights; flattened before use.

    Returns:
      A `(loss, per_example_loss, log_probs)` tuple. `loss` is the sum of
      `label_weights * per_example_loss` divided by the sum of the weights
      (plus 1e-5 to avoid a zero denominator).
    """
    fp16_head = FLAGS.use_fp16_cls
    head_dtype = tf.float16 if fp16_head else tf.float32
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear projection ahead of the output layer. This
        # matrix is only needed for pre-training.
        with tf.variable_scope(
                "transform",
                custom_getter=get_custom_getter(compute_type=head_dtype)):
            if fp16_head:
                hidden = tf.cast(hidden, tf.float16)
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            # Layer norm is always applied in float32.
            hidden = tf.cast(hidden, tf.float32)
            hidden = modeling.layer_norm(hidden)

        # The projection shares the input embedding matrix; only the
        # per-token bias belongs to this head alone.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        if fp16_head:
            hidden = tf.cast(hidden, tf.float16)
            logits = tf.matmul(
                hidden, tf.cast(output_weights, tf.float16), transpose_b=True)
            logits = tf.cast(logits, tf.float32)
        else:
            logits = tf.matmul(
                tf.cast(hidden, tf.float32), output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # `positions` may be zero-padded when a sequence has fewer than the
        # maximum number of predictions; the weights are expected to be 1.0
        # for real predictions and 0.0 for that padding.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        weighted_loss_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
        weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_loss_sum / weight_total

    return (loss, per_example_loss, log_probs)
def get_next_sentence_output(bert_config, input_tensor, labels):
    """Builds the next-sentence-prediction head.

    This is a two-way classifier: label 0 means "next sentence" and label 1
    means "random sentence". Its weight matrix is only used in pre-training.

    Args:
      bert_config: config object; `hidden_size` and `initializer_range` are
        read from it.
      input_tensor: pooled encoder output fed to the classifier.
      labels: next-sentence labels; flattened before use.

    Returns:
      A `(loss, per_example_loss, log_probs)` tuple, where `loss` is the mean
      of `per_example_loss`.
    """
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=modeling.create_initializer(
                bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        if FLAGS.use_fp16_cls:
            # Run the projection in half precision, then return to float32
            # for the bias add and softmax.
            pooled = tf.cast(input_tensor, tf.float16)
            logits = tf.matmul(
                pooled, tf.cast(output_weights, tf.float16), transpose_b=True)
            logits = tf.cast(logits, tf.float32)
        else:
            logits = tf.matmul(
                tf.cast(input_tensor, tf.float32), output_weights,
                transpose_b=True)

        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        flat_labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
def gather_indexes(sequence_tensor, positions):
    """Picks the vectors at `positions` out of every sequence in the batch.

    Args:
      sequence_tensor: rank-3 tensor (rank is checked by
        `modeling.get_shape_list`); its dimensions are read as
        batch size, sequence length and width.
      positions: per-example positions inside each sequence.

    Returns:
      The gathered vectors, taken from the tensor flattened to
      `[batch_size * seq_length, width]`.
    """
    shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
    batch_size, seq_length, width = shape[0], shape[1], shape[2]

    # Offset of each example's first row in the flattened tensor.
    row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    flat_offsets = tf.reshape(row_starts, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(
        sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flat_sequence_tensor, flat_positions)
def input_fn_builder(input_files,
                     batch_size,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4,
                     hvd=None):
    """Creates an `input_fn` closure to be passed to Estimator.

    Args:
      input_files: list of TFRecord file paths.
      batch_size: number of examples per batch.
      max_seq_length: fixed length of the `input_ids`, `input_mask` and
        `segment_ids` features.
      max_predictions_per_seq: fixed length of the `masked_lm_*` features.
      is_training: when True the files are sharded (if `FLAGS.distributed`),
        repeated, shuffled and interleaved; otherwise they are read
        sequentially and repeated.
      num_cpu_threads: upper bound for the interleave cycle length and the
        number of parallel batches.
      hvd: unused here; kept so existing callers keep working.

    Returns:
      A zero-argument function that builds and returns the `tf.data` dataset.
    """

    def input_fn():
        """The actual input function."""
        name_to_features = {
            "input_ids":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
                tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
                tf.FixedLenFeature([1], tf.int64),
        }

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            if FLAGS.distributed:
                rank_size = int(os.getenv('RANK_SIZE'))
                rank_id = int(os.getenv('RANK_ID'))
                print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
                d = d.shard(rank_size, rank_id)
            d = d.repeat()
            if not FLAGS.npu_bert_debug:
                d = d.shuffle(buffer_size=len(input_files))

            # `cycle_length` is the number of files read in parallel.
            if not FLAGS.npu_bert_debug:
                # RANK_SIZE may be unset on a non-distributed run; treat that
                # as a single rank instead of failing on int(None).
                env_rank_size = int(os.getenv('RANK_SIZE', '1'))
                files_per_rank = len(input_files) // env_rank_size
                # interleave() needs a cycle length of at least 1, which the
                # plain min() does not guarantee when there are fewer files
                # than ranks.
                cycle_length = max(1, min(num_cpu_threads, files_per_rank))
            else:
                cycle_length = 1

            d = d.interleave(tf.data.TFRecordDataset, cycle_length=cycle_length,
                             num_parallel_calls=tf.data.experimental.AUTOTUNE)
            if not FLAGS.npu_bert_debug:
                d = d.shuffle(buffer_size=100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # Since we evaluate for a fixed number of steps we don't want to
            # encounter out-of-range exceptions.
            d = d.repeat()

        # The remainder is dropped in both modes so every batch has the same
        # fixed size.
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                num_parallel_batches=num_cpu_threads,
                drop_remainder=True))
        return d

    return input_fn
def _decode_record(record, name_to_features):
    """Parses one serialized record and narrows its int64 features to int32.

    Args:
      record: serialized `tf.Example`.
      name_to_features: feature spec passed to `tf.parse_single_example`.

    Returns:
      The parsed feature dict, with every int64 tensor cast to int32.
    """
    example = tf.parse_single_example(record, name_to_features)

    # tf.Example only stores int64 integers; the rest of the pipeline works
    # with int32, so narrow them here.
    for name, tensor in list(example.items()):
        if tensor.dtype == tf.int64:
            example[name] = tf.to_int32(tensor)
    return example
def main(_):
    """Runs BERT pre-training and/or evaluation as selected by FLAGS.

    Loads the BERT config, collects the input files, builds an
    `NPUEstimator`, trains when `--do_train` is set and evaluates when
    `--do_eval` is set. Evaluation metrics are written to `eval_results.txt`
    under `FLAGS.output_dir` and forwarded to `hwlog`.

    Raises:
      ValueError: if neither `do_train` nor `do_eval` is set, if Horovod is
        enabled with fewer input files than workers, or if both `use_fp16`
        and `manual_fp16` are enabled.
    """
    for name in FLAGS.__flags:
        print("name:", name, " ", FLAGS[name].value)
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    if FLAGS.horovod:
        import horovod.tensorflow as hvd
        hvd.init()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Gradient-fusion split points depend on the encoder depth.
    if FLAGS.npu_gather:
        if FLAGS.distributed and bert_config.num_hidden_layers == 24:
            from hccl.split.api import set_split_strategy_by_idx
            set_split_strategy_by_idx([49, 113, 177, 241, 305, 353, 385, 397])
        if FLAGS.distributed and bert_config.num_hidden_layers == 12:
            from hccl.split.api import set_split_strategy_by_idx
            set_split_strategy_by_idx([8, 56, 104, 152, 200, 205])
        if FLAGS.distributed and bert_config.num_hidden_layers == 6:
            from hccl.split.api import set_split_strategy_by_idx
            set_split_strategy_by_idx([8, 40, 72, 104, 109])

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_file_dir in FLAGS.input_files_dir.split(","):
        input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
    input_files.sort()
    print("Input Files:", input_files)

    if FLAGS.horovod and len(input_files) < hvd.size():
        raise ValueError("Input Files must be sharded")
    if FLAGS.use_fp16 and FLAGS.manual_fp16:
        raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")

    # Not used below; retained from the upstream script.
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    config = tf.ConfigProto()
    if FLAGS.horovod:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.rank() == 0:
            tf.logging.info("***** Configuaration *****")
            for key in FLAGS.__flags.keys():
                tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
            tf.logging.info("**************************")

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT

    run_config = NPURunConfig(
        model_dir=FLAGS.output_dir,
        save_summary_steps=0,
        session_config=config,
        # Only the first Horovod rank (or a non-Horovod run) saves checkpoints.
        save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
        # Step-rate logging interval: every step with --report_loss,
        # otherwise every 100 steps.
        log_step_count_steps=1 if FLAGS.report_loss else 100,
        enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
        iterations_per_loop=FLAGS.iterations_per_loop,
        is_tailing_optimization=FLAGS.npu_bert_tail_optimize,
        hcom_parallel=FLAGS.hcom_parallel,
        graph_memory_max_size=FLAGS.graph_memory_max_size,
        variable_memory_max_size=FLAGS.variable_memory_max_size)

    # RANK_SIZE is only read (and required) for distributed runs.
    rank_size = int(os.getenv('RANK_SIZE')) if FLAGS.distributed else None

    # The learning rate is scaled linearly with the number of ranks.
    learning_rate = FLAGS.learning_rate
    if FLAGS.distributed:
        learning_rate = FLAGS.learning_rate * rank_size

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.horovod else hvd)

    # The Horovod-specific hook setup of the upstream script (rank-0-only loss
    # reporting and BroadcastGlobalVariablesHook) is intentionally not used.
    training_hooks = []
    if FLAGS.report_loss:
        global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
        if FLAGS.distributed:
            global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))

    estimator = NPUEstimator(
        model_fn=model_fn,
        config=run_config,
        job_start_file=FLAGS.npu_bert_job_start_file)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.train_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.horovod else hvd)
        estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

        eval_files = []
        for eval_file_dir in FLAGS.eval_files_dir.split(","):
            eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))

        eval_input_fn = input_fn_builder(
            input_files=eval_files,
            batch_size=FLAGS.eval_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.horovod else hvd)

        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
        eval_start_time = time.time()
        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.logging.info("-----------------------------")
        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                        eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                        (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
        tf.logging.info("Summary Inference Statistics on EVAL set")
        tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
        tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
        tf.logging.info("-----------------------------")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
                # The keys compared here must match the metric names exactly;
                # a trailing space would silently skip the hwlog report.
                if key == 'masked_lm_accuracy':
                    hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
                elif key == 'next_sentence_accuracy':
                    hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
                elif key == 'global_step':
                    hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
                elif key == 'loss':
                    hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
                elif key == 'masked_lm_loss':
                    hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
                elif key == 'next_sentence_loss':
                    hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
if __name__ == "__main__":
    # Script entry point: report the environment and fixed run metadata to
    # hwlog, declare the mandatory flags, then hand control to tf.app.run().
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

    (cpu_info, npu_info, framework_info,
     os_info, benchmark_version) = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")
    initinal_data = {
        "base_lr": 0.01,
        "dataset": "cn-clue/en-wiki",
        "optimizer": "Adam",
        "loss_scale": 512,
    }

    # Environment description.
    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)

    # Fixed run metadata (these values are constants, not read from FLAGS).
    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))

    for required_flag in ("input_files_dir",
                          "eval_files_dir",
                          "bert_config_file",
                          "output_dir",
                          "npu_bert_job_start_file"):
        flags.mark_flag_as_required(required_flag)

    if FLAGS.use_xla and FLAGS.manual_fp16:
        print('WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.')
        print(' This warning message will be removed when the underlying')
        print(' issues have been fixed and you are running a TF version')
        print(' that has that fix.')

    tf.app.run()
@@ -0,0 +1,939 @@
# coding=utf-8
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import csv
import logging
import os, sys
import numpy as np
import tensorflow as tf
sys.path.append("/workspace/bert")
import modeling
import optimization
import tokenization
import time
import horovod.tensorflow as hvd
from utils.utils import LogEvalRunHook, LogTrainRunHook
# Command-line flag registration for the fine-tuning runner. `flags` is a
# shorthand for `tf.flags`; `FLAGS` is the object the values are read from.
flags = tf.flags
FLAGS = flags.FLAGS
## Required parameters
# These default to None. NOTE(review): they are presumably marked as required
# further down in this script - not visible from here, confirm.
flags.DEFINE_string(
"data_dir", None,
"The input data dir. Should contain the .tsv files (or other data files) "
"for the task.")
flags.DEFINE_string(
"bert_config_file", None,
"The config json file corresponding to the pre-trained BERT model. "
"This specifies the model architecture.")
flags.DEFINE_string("task_name", None, "The name of the task to train.")
flags.DEFINE_string("vocab_file", None,
"The vocabulary file that the BERT model was trained on.")
flags.DEFINE_string(
"output_dir", None,
"The output directory where the model checkpoints will be written.")
## Other parameters
flags.DEFINE_string(
"init_checkpoint", None,
"Initial checkpoint (usually from a pre-trained BERT model).")
flags.DEFINE_bool(
"do_lower_case", True,
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")
flags.DEFINE_integer(
"max_seq_length", 128,
"The maximum total input sequence length after WordPiece tokenization. "
"Sequences longer than this will be truncated, and sequences shorter "
"than this will be padded.")
# Run-mode switches; all three default to False.
flags.DEFINE_bool("do_train", False, "Whether to run training.")
flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
flags.DEFINE_bool(
"do_predict", False,
"Whether to run the model in inference mode on the test set.")
# Batch sizes per run mode.
flags.DEFINE_integer("train_batch_size", 16, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
# Optimisation schedule.
flags.DEFINE_float("learning_rate", 5e-6, "The initial learning rate for Adam.")
flags.DEFINE_float("num_train_epochs", 3.0,
"Total number of training epochs to perform.")
flags.DEFINE_float(
"warmup_proportion", 0.1,
"Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10% of training.")
flags.DEFINE_integer("save_checkpoints_steps", 1000,
"How often to save the model checkpoint.")
flags.DEFINE_integer("iterations_per_loop", 1000,
"How many steps to make in each estimator call.")
# Registered through `tf.flags` directly, which is the same module as `flags`.
tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
# Multi-GPU and precision / compilation options; all default to False.
flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
class InputExample(object):
    """One raw (untokenized) example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Stores the example fields as given.

        Args:
          guid: Unique id for the example.
          text_a: string. Untokenized text of the first sequence; the only
            text needed for single-sequence tasks.
          text_b: (Optional) string. Untokenized text of the second sequence;
            only needed for sequence-pair tasks.
          label: (Optional) string. Label of the example; expected for train
            and dev examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label
class PaddingInputExample(object):
    """Placeholder example used to pad the input up to a full batch.

    Eval/predict on a device that needs a fixed batch size requires the number
    of examples to be a multiple of the batch size. Dropping the last batch
    instead would leave part of the output ungenerated.

    A dedicated class is used rather than `None` because treating `None` as a
    padding batch could hide errors silently.
    """
class InputFeatures(object):
    """Model-ready features for a single example."""

    def __init__(self,
                 input_ids,
                 input_mask,
                 segment_ids,
                 label_id,
                 is_real_example=True):
        """Stores the feature values as given.

        Args:
          input_ids: token ids.
          input_mask: attention mask aligned with `input_ids`.
          segment_ids: segment (token type) ids aligned with `input_ids`.
          label_id: integer label id.
          is_real_example: False for padding examples, True otherwise.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.is_real_example = is_real_example
class DataProcessor(object):
    """Abstract converter from data files to sequence-classification examples."""

    def get_train_examples(self, data_dir):
        """Returns the `InputExample`s of the train set; subclasses override."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Returns the `InputExample`s of the dev set; subclasses override."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Returns the `InputExample`s used for prediction; subclasses override."""
        raise NotImplementedError()

    def get_labels(self):
        """Returns the list of labels of the data set; subclasses override."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab-separated file and returns its rows as a list of lists."""
        with tf.gfile.Open(input_file, "r") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
class _ChemProtProcessor(DataProcessor):
    """Processor for the ChemProt data set.

    Subclasses supply the label set through `get_labels()`.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir, file_name="dev.tsv"):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, file_name)), "dev")

    def get_test_examples(self, data_dir, file_name="test.tsv"):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, file_name)), "test")

    def _create_examples(self, lines, set_type):
        """Creates examples from parsed TSV rows.

        Args:
          lines: rows of the TSV file; the first row is a header and is
            skipped. Column 0 is the guid, column 1 the text and column 2
            the label.
          set_type: "train", "dev" or "test". For "test" the label column is
            ignored and the last entry of `get_labels()` is used instead.

        Returns:
          A list of `InputExample`s.

        Raises:
          SystemExit: with status 1 when a non-test row has no label column;
            the offending row is logged first.
        """
        examples = []
        for (i, line) in enumerate(lines):
            # skip header
            if i == 0:
                continue
            guid = line[0]
            text_a = tokenization.convert_to_unicode(line[1])
            if set_type == "test":
                label = self.get_labels()[-1]
            else:
                try:
                    label = tokenization.convert_to_unicode(line[2])
                except IndexError:
                    logging.exception(line)
                    # sys.exit() rather than the site-provided exit(), which
                    # is meant for interactive use and may be unavailable.
                    sys.exit(1)
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
class ChemProtProcessor(_ChemProtProcessor):
    """ChemProt processor with the concrete relation label set."""

    def get_labels(self):
        """Returns the relation labels followed by the "false" label."""
        relation_labels = ["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"]
        return relation_labels + ["false"]
class MedNLIProcessor(DataProcessor):
    """Processor for sentence-pair TSV data labelled contradiction/entailment/neutral."""

    def get_train_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir, file_name="dev.tsv"):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, file_name))
        return self._create_examples(rows, "dev")

    def get_test_examples(self, data_dir, file_name="test.tsv"):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, file_name))
        return self._create_examples(rows, "test")

    def get_labels(self):
        """See base class."""
        return ['contradiction', 'entailment', 'neutral']

    def _create_examples(self, lines, set_type):
        """Builds `InputExample`s from parsed rows, skipping the header row.

        Column 0 is the label, column 1 the guid, columns 2 and 3 the two
        texts. For the "test" set the label column is ignored and the last
        entry of `get_labels()` is used instead.
        """
        is_test = set_type == "test"
        rows = iter(lines)
        next(rows, None)  # drop the header row
        examples = []
        for row in rows:
            guid = row[1]
            text_a = tokenization.convert_to_unicode(row[2])
            text_b = tokenization.convert_to_unicode(row[3])
            if is_test:
                label = self.get_labels()[-1]
            else:
                label = tokenization.convert_to_unicode(row[0])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
    """Turns one `InputExample` into one fixed-length `InputFeatures`.

    Args:
      ex_index: index of the example; the first five are logged.
      example: an `InputExample`, or a `PaddingInputExample` (which yields
        all-zero features flagged as not real).
      label_list: labels; a label's position in the list becomes its id.
      max_seq_length: length every feature list is truncated/padded to.
      tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
      The `InputFeatures` for the example.
    """
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)

    label_map = {label: i for (i, label) in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None

    if tokens_b:
        # Trim the pair in place; "- 3" leaves room for [CLS], [SEP], [SEP].
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    elif len(tokens_a) > max_seq_length - 2:
        # "- 2" leaves room for [CLS] and [SEP].
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    # Layout used by BERT:
    #   pair:    [CLS] a-tokens [SEP] b-tokens [SEP]   type ids 0...0 1...1
    #   single:  [CLS] a-tokens [SEP]                  type ids 0...0
    # The type ids tell the model which of the two sequences a token belongs
    # to. The vector at [CLS] serves as the "sentence vector" for
    # classification, which only makes sense because the whole model is
    # fine-tuned.
    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    if tokens_b:
        tokens += list(tokens_b) + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    if ex_index < 5:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)
def file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file):
    """Converts a set of `InputExample`s to a TFRecord file.

    Args:
      examples: sequence of examples to convert (its length is logged).
      label_list: labels passed through to `convert_single_example`.
      max_seq_length: fixed feature length passed through to
        `convert_single_example`.
      tokenizer: tokenizer passed through to `convert_single_example`.
      output_file: path of the TFRecord file to write.
    """

    def create_int_feature(values):
        """Wraps an iterable of ints in an int64 `tf.train.Feature`."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    # The context manager closes the writer even when a conversion fails
    # part-way through.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

            feature = convert_single_example(ex_index, example, label_list,
                                             max_seq_length, tokenizer)

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            features["label_ids"] = create_int_feature([feature.label_id])
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])

            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training,
                                drop_remainder, hvd=None):
    """Builds an `input_fn` closure that reads features from a TFRecord file.

    Args:
      input_file: TFRecord file to read.
      batch_size: number of examples per batch.
      seq_length: fixed length of the `input_ids`, `input_mask` and
        `segment_ids` features.
      is_training: when True the data is (optionally sharded,) repeated and
        shuffled.
      drop_remainder: forwarded to `map_and_batch`.
      hvd: optional Horovod module; when given, training data is sharded by
        `hvd.size()` / `hvd.rank()`.

    Returns:
      A function taking `params` and returning the dataset.
    """
    int_scalar = tf.FixedLenFeature([], tf.int64)
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": int_scalar,
        "is_real_example": int_scalar,
    }

    def _decode_record(record, name_to_features):
        """Parses a serialized record and casts its int64 features to int32."""
        example = tf.parse_single_example(record, name_to_features)
        # tf.Example only stores int64 integers; narrow them to int32.
        for name, tensor in list(example.items()):
            if tensor.dtype == tf.int64:
                example[name] = tf.to_int32(tensor)
        return example

    def input_fn(params):
        """The actual input function; `params` is accepted but not read."""
        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            # Training wants sharding across workers, repetition and shuffling;
            # eval reads the file once, in order.
            if hvd is not None:
                dataset = dataset.shard(hvd.size(), hvd.rank())
            dataset = dataset.repeat().shuffle(buffer_size=100)

        return dataset.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

    return input_fn
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Builds a BERT encoder topped with a single softmax classification layer.

  The classifier works on the pooled (whole-segment) output of the encoder;
  `model.get_sequence_output()` would be the entry point for token-level
  tasks instead.

  Args:
    bert_config: Configuration object passed to `modeling.BertModel`.
    is_training: Enables dropout on the pooled output and is forwarded to the
      encoder.
    input_ids: Token id tensor fed to the encoder.
    input_mask: Mask tensor fed to the encoder.
    segment_ids: Passed to the encoder as `token_type_ids`.
    labels: Integer class ids, one-hot encoded to depth `num_labels`.
    num_labels: Number of output classes.
    use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

  Returns:
    A tuple `(loss, per_example_loss, logits, probabilities)` where `loss` is
    the mean of `per_example_loss`.
  """
  encoder = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  pooled = encoder.get_pooled_output()
  hidden_size = pooled.shape[-1].value

  # Classifier parameters are created outside the "loss" scope.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # keep_prob=0.9 corresponds to a dropout rate of 0.1.
      pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    logits = tf.nn.bias_add(
        tf.matmul(pooled, output_weights, transpose_b=True), output_bias)
    probabilities = tf.nn.softmax(logits, axis=-1)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Cross-entropy against the one-hot targets.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, logits, probabilities)
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=None,
                     num_train_steps=None, num_warmup_steps=None,
                     use_one_hot_embeddings=False, hvd=None, use_fp16=False):
  """Returns a `model_fn` closure producing `tf.estimator.EstimatorSpec`s.

  Args:
    bert_config: Configuration forwarded to `create_model`.
    num_labels: Number of classification classes.
    init_checkpoint: Checkpoint to initialise variables from; when falsy no
      initialisation is set up. With `hvd` given, only rank 0 sets it up.
    learning_rate: Forwarded to `optimization.create_optimizer`.
    num_train_steps: Forwarded to `optimization.create_optimizer`.
    num_warmup_steps: Forwarded to `optimization.create_optimizer`.
    use_one_hot_embeddings: Forwarded to `create_model`.
    hvd: Optional object exposing `rank()`; also forwarded to the optimizer.
    use_fp16: Forwarded to `optimization.create_optimizer`.

  Returns:
    A function `model_fn(features, labels, mode, params)`.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the graph for TRAIN, EVAL or prediction, depending on `mode`."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    # Per-example weights for the eval metrics; every example counts fully
    # when the feature is absent.
    is_real_example = (
        tf.cast(features["is_real_example"], dtype=tf.float32)
        if "is_real_example" in features
        else tf.ones(tf.shape(label_ids), dtype=tf.float32))

    training = mode == tf.estimator.ModeKeys.TRAIN
    total_loss, per_example_loss, logits, probabilities = create_model(
        bert_config, training, input_ids, input_mask, segment_ids, label_ids,
        num_labels, use_one_hot_embeddings)

    tvars = tf.trainable_variables()
    restored_names = {}
    if init_checkpoint and (hvd is None or hvd.rank() == 0):
      assignment_map, restored_names = (
          modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint))
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      suffix = ", *INIT_FROM_CKPT*" if var.name in restored_names else ""
      tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                      suffix)

    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd,
          False, use_fp16)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
      # Accuracy and mean loss, both weighted by `is_real_example`.
      predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
      accuracy = tf.metrics.accuracy(
          labels=label_ids, predictions=predictions, weights=is_real_example)
      mean_loss = tf.metrics.mean(
          values=per_example_loss, weights=is_real_example)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metric_ops={
              "eval_accuracy": accuracy,
              "eval_loss": mean_loss,
          })

    # Any other mode: expose the class probabilities as predictions.
    return tf.estimator.EstimatorSpec(
        mode=mode, predictions={"probabilities": probabilities})

  return model_fn
# This function is not used by this file but is still used by the Colab and
# people who depend on it.
def input_fn_builder(features, seq_length, is_training, drop_remainder):
  """Builds an `input_fn` that serves in-memory features as constants.

  All features are embedded in the graph as `tf.constant`s, so this is only
  suitable for small, demo-sized data sets.

  Args:
    features: Sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`.
    seq_length: Length of each per-example id/mask vector.
    is_training: When true the dataset is repeated and shuffled.
    drop_remainder: Forwarded to `Dataset.batch`.

  Returns:
    A callable `input_fn(params)`; the batch size is read from
    `params["batch_size"]`.
  """
  all_input_ids = [feature.input_ids for feature in features]
  all_input_mask = [feature.input_mask for feature in features]
  all_segment_ids = [feature.segment_ids for feature in features]
  all_label_ids = [feature.label_id for feature in features]

  def input_fn(params):
    """Creates the batched dataset from the captured feature lists."""
    batch_size = params["batch_size"]
    num_examples = len(features)
    per_token_shape = [num_examples, seq_length]

    def _int32_constant(values, shape):
      return tf.constant(values, shape=shape, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": _int32_constant(all_input_ids, per_token_shape),
        "input_mask": _int32_constant(all_input_mask, per_token_shape),
        "segment_ids": _int32_constant(all_segment_ids, per_token_shape),
        "label_ids": _int32_constant(all_label_ids, [num_examples]),
    })

    if is_training:
      dataset = dataset.repeat().shuffle(buffer_size=100)

    return dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder)

  return input_fn
# This function is not used by this file but is still used by the Colab and
# people who depend on it.
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer):
  """Converts every example with `convert_single_example`, in order.

  Args:
    examples: Sized iterable of examples to convert.
    label_list: Forwarded to `convert_single_example`.
    max_seq_length: Forwarded to `convert_single_example`.
    tokenizer: Forwarded to `convert_single_example`.

  Returns:
    A list with one converted feature object per input example.
  """
  converted = []
  for position, item in enumerate(examples):
    # Progress message every 10000 examples (including the very first one).
    if position % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (position, len(examples)))
    converted.append(
        convert_single_example(position, item, label_list, max_seq_length,
                               tokenizer))
  return converted
def _latency_at_confidence(sorted_times, fraction):
  """Returns the slowest latency among the fastest `fraction` of the samples.

  Args:
    sorted_times: Per-batch latencies sorted in ascending order.
    fraction: Share of samples to consider, e.g. 0.95.

  Returns:
    The selected latency, or 0.0 when there are no samples.
  """
  if not sorted_times:
    return 0.0
  # Always keep at least one sample so that very short runs (where
  # int(len * fraction) == 0) do not end up with an empty prefix.
  cutoff = max(int(len(sorted_times) * fraction), 1)
  return sorted_times[cutoff - 1]


def _throughput(num_sentences, seconds):
  """Returns sentences per second, or 0.0 when no time was recorded."""
  if not seconds:
    return 0.0
  return num_sentences * 1.0 / seconds


def main(_):
  """Runs fine-tuning, evaluation and/or prediction as selected by FLAGS.

  Training runs on every worker; evaluation and prediction run only on the
  master process (rank 0 when Horovod is enabled). Results are written to
  `eval_results.txt` and `test_results.tsv` inside `FLAGS.output_dir`.

  Raises:
    ValueError: If none of `do_train`/`do_eval`/`do_predict` is set, if
      `max_seq_length` exceeds the model's position embeddings, or if the
      task name is unknown.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  if FLAGS.horovod:
    hvd.init()
  if FLAGS.use_fp16:
    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
  processors = {
      "chemprot": ChemProtProcessor,
      'mednli': MedNLIProcessor,
  }

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()

  label_list = processor.get_labels()

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Single-process defaults; overridden below when Horovod is enabled.
  master_process = True
  training_hooks = []
  global_batch_size = FLAGS.train_batch_size
  hvd_rank = 0

  config = tf.ConfigProto()
  if FLAGS.horovod:
    global_batch_size = FLAGS.train_batch_size * hvd.size()
    master_process = (hvd.rank() == 0)
    hvd_rank = hvd.rank()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    if hvd.size() > 1:
      training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

  if FLAGS.use_xla:
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
  # Only the master process gets a model directory and checkpoint schedule.
  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir if master_process else None,
      session_config=config,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
      keep_checkpoint_max=1)

  if master_process:
    tf.logging.info("***** Configuaration *****")
    for key in FLAGS.__flags.keys():
      tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
    tf.logging.info("**************************")

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  # Must stay the last hook: the timing report below reads training_hooks[-1].
  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    start_index = 0
    end_index = len(train_examples)
    tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

    if FLAGS.horovod:
      # Each rank converts a contiguous slice of the examples into its own
      # file; the first `remainder` ranks take one extra example.
      tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
      num_examples_per_rank = len(train_examples) // hvd.size()
      remainder = len(train_examples) % hvd.size()
      if hvd.rank() < remainder:
        start_index = hvd.rank() * (num_examples_per_rank+1)
        end_index = start_index + num_examples_per_rank + 1
      else:
        start_index = hvd.rank() * num_examples_per_rank + remainder
        end_index = start_index + (num_examples_per_rank)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      # The learning rate is scaled by the number of Horovod workers.
      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_one_hot_embeddings=False,
      hvd=None if not FLAGS.horovod else hvd,
      use_fp16=FLAGS.use_fp16)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config)

  if FLAGS.do_train:
    file_based_convert_examples_to_features(
        train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
    tf.logging.info("***** Running training *****")
    tf.logging.info(" Num examples = %d", len(train_examples))
    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info(" Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=tmp_filenames,
        batch_size=FLAGS.train_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        hvd=None if not FLAGS.horovod else hvd)

    train_start_time = time.time()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
    train_time_elapsed = time.time() - train_start_time
    train_time_wo_overhead = training_hooks[-1].total_time
    timed_train_sentences = (num_train_steps - training_hooks[-1].skipped) * global_batch_size
    # _throughput guards against a zero duration, e.g. when the run is
    # shorter than the number of steps the hook skips.
    avg_sentences_per_second = _throughput(
        num_train_steps * global_batch_size, train_time_elapsed)
    ss_sentences_per_second = _throughput(
        timed_train_sentences, train_time_wo_overhead)

    if master_process:
      tf.logging.info("-----------------------------")
      tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
                      num_train_steps * global_batch_size)
      tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
                      timed_train_sentences)
      tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
      tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
      tf.logging.info("-----------------------------")

  if FLAGS.do_eval and master_process:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(
        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None

    eval_drop_remainder = False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        batch_size=FLAGS.eval_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info(" %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict and master_process:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    num_actual_predict_examples = len(predict_examples)
    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info(" Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = False
    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        batch_size=FLAGS.predict_batch_size,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder)

    eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
    eval_start_time = time.time()

    # One tab-separated line of class probabilities per test example.
    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as writer:
      num_written_lines = 0
      tf.logging.info("***** Predict results *****")
      for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks,
                                          yield_single_examples=True):
        probabilities = prediction["probabilities"]
        output_line = "\t".join(
            str(class_probability)
            for class_probability in probabilities) + "\n"
        writer.write(output_line)
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples

    eval_time_elapsed = time.time() - eval_start_time
    eval_time_wo_overhead = eval_hooks[-1].total_time

    time_list = eval_hooks[-1].time_list
    time_list.sort()
    num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size

    # With very few timed batches the original prefix slices could be empty
    # and max() would raise; the helpers fall back to safe values instead.
    avg = np.mean(time_list) if time_list else 0.0
    cf_50 = _latency_at_confidence(time_list, 0.50)
    cf_90 = _latency_at_confidence(time_list, 0.90)
    cf_95 = _latency_at_confidence(time_list, 0.95)
    cf_99 = _latency_at_confidence(time_list, 0.99)
    cf_100 = _latency_at_confidence(time_list, 1)
    ss_sentences_per_second = _throughput(num_sentences, eval_time_wo_overhead)

    tf.logging.info("-----------------------------")
    tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
                    eval_hooks[-1].count * FLAGS.predict_batch_size)
    tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
                    (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
    tf.logging.info("Summary Inference Statistics")
    tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
    tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
    tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
    tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
    tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
    tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
    tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
    tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
    tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
    tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
    tf.logging.info("-----------------------------")
if __name__ == "__main__":
  # Register the command-line flags that must be supplied, then hand control
  # to the TensorFlow app runner.
  for required_flag in ("data_dir", "task_name", "vocab_file",
                        "bert_config_file", "output_dir"):
    flags.mark_flag_as_required(required_flag)
  tf.app.run()
File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More