[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,56 @@
|
||||
# Bert-Base_tensorflow训练说明
|
||||
|
||||
### 1. 模型训练参数配置
|
||||
|
||||
在train/yaml/Bert-Base.yaml中修改相应配置, 配置项含义:
|
||||
|
||||
```
|
||||
tensorflow_config:
|
||||
# layer 层数有 6 和 12 两种: 中文数据集用 bert_base_layer6_cn.json/bert_base_layer12_cn.json, 英文数据集用 bert_base_layer6_en.json/bert_base_layer12_en.json
|
||||
bert_config_file: bert_base_layer6_cn.json
|
||||
#数据集句子长度是256时 设置为 256,40,句子长度是128时设置为128,20
|
||||
max_seq_length: 128
|
||||
max_predictions_per_seq: 20
|
||||
|
||||
# 最佳性能train_batch_size为160
|
||||
train_batch_size: 160
|
||||
learning_rate: 1e-4
|
||||
num_warmup_steps: 100
|
||||
num_train_steps: 1000
|
||||
optimizer_type: adam
|
||||
manual_fp16: True
|
||||
use_fp16_cls: True
|
||||
input_files_dir: 数据集路径
|
||||
eval_files_dir: 数据集路径
|
||||
npu_bert_debug: False
|
||||
npu_bert_use_tdt: True
|
||||
distributed: True
|
||||
do_train: True
|
||||
do_eval: False
|
||||
num_accumulation_steps: 1
|
||||
iterations_per_loop: 100
|
||||
npu_bert_loss_scale: 0
|
||||
save_checkpoints_steps: 1000
|
||||
npu_bert_clip_by_global_norm: False
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.140.199:8,90.90.140.229:8
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 6
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
```
|
||||
|
||||
------
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
+13
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"max_position_embeddings": 512,
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"type_vocab_size": 2,
|
||||
"vocab_size": 30522
|
||||
}
|
||||
+21128
File diff suppressed because it is too large
Load Diff
+442
@@ -0,0 +1,442 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import random
|
||||
import tokenization
|
||||
import tensorflow as tf
|
||||
|
||||
# Command-line flag definitions for the pre-training data generator.
# `FLAGS` exposes the parsed values; `main` reads every flag defined here.
flags = tf.flags

FLAGS = flags.FLAGS

# --- Input / output locations (marked as required in the __main__ guard). ---
flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")

flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

# --- Tokenization. ---
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

# --- Sequence geometry: total length and number of masked-LM slots. ---
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")

flags.DEFINE_integer("max_predictions_per_seq", 20,
                     "Maximum number of masked LM predictions per sequence.")

# --- Randomization controls. The seed is used to build `random.Random`. ---
flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")

flags.DEFINE_integer(
    "dupe_factor", 10,
    "Number of times to duplicate the input data (with different masks).")

flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")

flags.DEFINE_float(
    "short_seq_prob", 0.1,
    "Probability of creating sequences which are shorter than the "
    "maximum length.")
|
||||
|
||||
|
||||
class TrainingInstance(object):
  """Container for one pre-training example (a pair of text segments).

  Attributes:
    tokens: Tokens of the assembled sequence.
    segment_ids: Per-token segment id, parallel to `tokens`.
    is_random_next: Truthy when the second segment was sampled at random.
    masked_lm_positions: Positions in `tokens` selected for masked-LM
      prediction.
    masked_lm_labels: Original tokens at `masked_lm_positions`.
  """

  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
               is_random_next):
    # Sequence content.
    self.tokens = tokens
    self.segment_ids = segment_ids
    # Masked-LM targets.
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels
    # Next-sentence-prediction target.
    self.is_random_next = is_random_next

  def __str__(self):
    """Return a multi-line dump: one `name: value` line per field."""
    to_printable = tokenization.printable_text
    lines = [
        "tokens: %s\n" % " ".join([to_printable(tok) for tok in self.tokens]),
        "segment_ids: %s\n" % " ".join(
            [str(seg) for seg in self.segment_ids]),
        "is_random_next: %s\n" % self.is_random_next,
        "masked_lm_positions: %s\n" % " ".join(
            [str(pos) for pos in self.masked_lm_positions]),
        "masked_lm_labels: %s\n" % " ".join(
            [to_printable(label) for label in self.masked_lm_labels]),
        # Trailing blank line separates consecutive instances in logs.
        "\n",
    ]
    return "".join(lines)

  def __repr__(self):
    """Use the same rendering as `__str__`."""
    return self.__str__()
|
||||
|
||||
|
||||
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
  """Create TF example files from `TrainingInstance`s.

  Each instance is converted to a `tf.train.Example` with zero-padded,
  fixed-length features and written round-robin across `output_files`.

  Args:
    instances: Iterable of `TrainingInstance` objects.
    tokenizer: Object providing `convert_tokens_to_ids(tokens)`.
    max_seq_length: Length every `input_ids` / `input_mask` / `segment_ids`
      feature is padded to. Longer instances trigger an assertion.
    max_predictions_per_seq: Length the masked-LM features are padded to.
    output_files: List of TFRecord paths to write.

  Raises:
    AssertionError: If an instance is longer than `max_seq_length`.
  """
  writers = []
  total_written = 0

  # The writers hold open file handles. Close them even when opening a later
  # file or serializing an instance fails, so nothing is leaked.
  try:
    for output_file in output_files:
      writers.append(tf.python_io.TFRecordWriter(output_file))

    writer_index = 0

    for (inst_index, instance) in enumerate(instances):
      input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
      input_mask = [1] * len(input_ids)
      segment_ids = list(instance.segment_ids)
      assert len(input_ids) <= max_seq_length

      # Zero-pad up to the sequence length; mask 0 marks padding.
      while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      masked_lm_positions = list(instance.masked_lm_positions)
      masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
      masked_lm_weights = [1.0] * len(masked_lm_ids)

      # Padded prediction slots get weight 0.0.
      while len(masked_lm_positions) < max_predictions_per_seq:
        masked_lm_positions.append(0)
        masked_lm_ids.append(0)
        masked_lm_weights.append(0.0)

      next_sentence_label = 1 if instance.is_random_next else 0

      features = collections.OrderedDict()
      features["input_ids"] = create_int_feature(input_ids)
      features["input_mask"] = create_int_feature(input_mask)
      features["segment_ids"] = create_int_feature(segment_ids)
      features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
      features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
      features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
      features["next_sentence_labels"] = create_int_feature(
          [next_sentence_label])

      tf_example = tf.train.Example(
          features=tf.train.Features(feature=features))

      # Distribute examples round-robin over the output shards.
      writers[writer_index].write(tf_example.SerializeToString())
      writer_index = (writer_index + 1) % len(writers)

      total_written += 1

      # Log the first few examples for manual inspection.
      if inst_index < 20:
        tf.logging.info("*** Example ***")
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in instance.tokens]))

        for feature_name, feature in features.items():
          values = []
          if feature.int64_list.value:
            values = feature.int64_list.value
          elif feature.float_list.value:
            values = feature.float_list.value
          tf.logging.info(
              "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
  finally:
    for writer in writers:
      writer.close()

  tf.logging.info("Wrote %d total instances", total_written)
|
||||
|
||||
|
||||
def create_int_feature(values):
  """Wrap an iterable of integers in a `tf.train.Feature` (int64 list)."""
  int64_list = tf.train.Int64List(value=list(values))
  return tf.train.Feature(int64_list=int64_list)
|
||||
|
||||
|
||||
def create_float_feature(values):
  """Wrap an iterable of floats in a `tf.train.Feature` (float list)."""
  float_list = tf.train.FloatList(value=list(values))
  return tf.train.Feature(float_list=float_list)
|
||||
|
||||
|
||||
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
  """Build shuffled `TrainingInstance`s from raw text files.

  Expected input format:
    * One sentence per line (sentence boundaries drive the "next sentence
      prediction" task, so lines should be real sentences rather than
      paragraphs or arbitrary spans).
    * A blank line between documents, so that next-sentence pairs never
      cross a document boundary.

  Args:
    input_files: Paths of the text files to read.
    tokenizer: Object providing `tokenize(text)` and a `vocab` mapping.
    max_seq_length: Forwarded to `create_instances_from_document`.
    dupe_factor: Number of passes over the corpus; each pass draws fresh
      random values from `rng`.
    short_seq_prob: Forwarded to `create_instances_from_document`.
    masked_lm_prob: Forwarded to `create_instances_from_document`.
    max_predictions_per_seq: Forwarded to `create_instances_from_document`.
    rng: Random generator providing `shuffle` (passed on to the helpers).

  Returns:
    A shuffled list of training instances.
  """
  # Start with one open (empty) document; blank lines open a new one.
  documents = [[]]

  for path in input_files:
    with tf.gfile.GFile(path, "r") as reader:
      while True:
        raw_line = tokenization.convert_to_unicode(reader.readline())
        if not raw_line:
          # End of file.
          break
        text = raw_line.strip()

        # An empty line is a document delimiter.
        if not text:
          documents.append([])
        sentence_tokens = tokenizer.tokenize(text)
        if sentence_tokens:
          documents[-1].append(sentence_tokens)

  # Drop documents that ended up with no sentences, then randomize order.
  documents = [doc for doc in documents if doc]
  rng.shuffle(documents)

  vocab_words = list(tokenizer.vocab.keys())
  instances = []
  for _ in range(dupe_factor):
    for document_index, _doc in enumerate(documents):
      new_instances = create_instances_from_document(
          documents, document_index, max_seq_length, short_seq_prob,
          masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
      instances.extend(new_instances)

  rng.shuffle(instances)
  return instances
|
||||
|
||||
|
||||
def create_instances_from_document(
    all_documents, document_index, max_seq_length, short_seq_prob,
    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
  """Creates `TrainingInstance`s for a single document.

  Sentences of the document are accumulated into chunks of roughly
  `target_seq_length` tokens. Each chunk is split into an "A" part and a "B"
  part; "B" is either the real continuation or text sampled from a randomly
  chosen document. The pair is truncated, wrapped with [CLS]/[SEP], masked and
  stored as one instance.

  Args:
    all_documents: List of documents; each document is a list of sentences and
      each sentence a list of tokens.
    document_index: Index into `all_documents` of the document to process.
    max_seq_length: Hard limit on the sequence length including the three
      special tokens.
    short_seq_prob: Probability of picking a shorter random target length.
    masked_lm_prob: Forwarded to `create_masked_lm_predictions`.
    max_predictions_per_seq: Forwarded to `create_masked_lm_predictions`.
    vocab_words: Forwarded to `create_masked_lm_predictions`.
    rng: Random generator providing `random()` and `randint()`.

  Returns:
    List of `TrainingInstance` objects built from the document.
  """
  document = all_documents[document_index]

  # Account for [CLS], [SEP], [SEP]
  max_num_tokens = max_seq_length - 3

  # We *usually* want to fill up the entire sequence since we are padding
  # to `max_seq_length` anyways, so short sequences are generally wasted
  # computation. However, we *sometimes*
  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
  # sequences to minimize the mismatch between pre-training and fine-tuning.
  # The `target_seq_length` is just a rough target however, whereas
  # `max_seq_length` is a hard limit.
  target_seq_length = max_num_tokens
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)

  # We DON'T just concatenate all of the tokens from a document into a long
  # sequence and choose an arbitrary split point because this would make the
  # next sentence prediction task too easy. Instead, we split the input into
  # segments "A" and "B" based on the actual "sentences" provided by the user
  # input.
  instances = []
  current_chunk = []
  current_length = 0
  i = 0
  while i < len(document):
    segment = document[i]
    current_chunk.append(segment)
    current_length += len(segment)
    # Emit an instance when the chunk is long enough or the document ends.
    if i == len(document) - 1 or current_length >= target_seq_length:
      if current_chunk:
        # `a_end` is how many segments from `current_chunk` go into the `A`
        # (first) sentence.
        a_end = 1
        if len(current_chunk) >= 2:
          a_end = rng.randint(1, len(current_chunk) - 1)

        tokens_a = []
        for j in range(a_end):
          tokens_a.extend(current_chunk[j])

        tokens_b = []
        # Random next
        # A single-segment chunk has no real continuation, so it always takes
        # the random branch; otherwise the branch is chosen with p = 0.5.
        is_random_next = False
        if len(current_chunk) == 1 or rng.random() < 0.5:
          is_random_next = True
          target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.
          # NOTE(review): after 10 failed draws (e.g. a one-document corpus)
          # the current document itself is used.
          for _ in range(10):
            random_document_index = rng.randint(0, len(all_documents) - 1)
            if random_document_index != document_index:
              break

          random_document = all_documents[random_document_index]
          random_start = rng.randint(0, len(random_document) - 1)
          for j in range(random_start, len(random_document)):
            tokens_b.extend(random_document[j])
            if len(tokens_b) >= target_b_length:
              break
          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.
          num_unused_segments = len(current_chunk) - a_end
          i -= num_unused_segments
        # Actual next
        else:
          is_random_next = False
          for j in range(a_end, len(current_chunk)):
            tokens_b.extend(current_chunk[j])
        # Trim the pair in place so it fits next to the three special tokens.
        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

        assert len(tokens_a) >= 1
        assert len(tokens_b) >= 1

        # Assemble "[CLS] A [SEP] B [SEP]" with segment id 0 for the A half
        # (including [CLS] and the first [SEP]) and 1 for the B half.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
          tokens.append(token)
          segment_ids.append(0)

        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
          tokens.append(token)
          segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        (tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
      # Start a fresh chunk for the next instance.
      current_chunk = []
      current_length = 0
    i += 1

  return instances
|
||||
|
||||
|
||||
# Pairs a masked position with the original token found there.
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
  """Select tokens for the masked-LM objective and corrupt them.

  Args:
    tokens: Token list of the full sequence (including [CLS]/[SEP]).
    masked_lm_prob: Fraction of `len(tokens)` to select (at least one token,
      before the cap below is applied).
    max_predictions_per_seq: Upper bound on the number of selected tokens.
    vocab_words: Sequence of vocabulary tokens used for random replacement.
    rng: Random generator providing `shuffle`, `random` and `randint`.

  Returns:
    A tuple `(output_tokens, masked_lm_positions, masked_lm_labels)`:
    a corrupted copy of `tokens`, the selected positions in ascending order,
    and the original tokens at those positions.
  """
  special_tokens = ("[CLS]", "[SEP]")

  # Every position except the special markers is a candidate.
  cand_indexes = [
      position for (position, token) in enumerate(tokens)
      if token not in special_tokens
  ]
  rng.shuffle(cand_indexes)

  output_tokens = list(tokens)

  wanted = max(1, int(round(len(tokens) * masked_lm_prob)))
  num_to_predict = min(max_predictions_per_seq, wanted)

  masked_lms = []
  covered_indexes = set()
  for index in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    if index in covered_indexes:
      continue
    covered_indexes.add(index)

    if rng.random() < 0.8:
      # 80% of the time, replace with [MASK].
      replacement = "[MASK]"
    elif rng.random() < 0.5:
      # 10% of the time, keep the original token.
      replacement = tokens[index]
    else:
      # 10% of the time, replace with a random vocabulary word.
      replacement = vocab_words[rng.randint(0, len(vocab_words) - 1)]

    output_tokens[index] = replacement
    masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))

  # Report predictions in sequence order rather than selection order.
  masked_lms.sort(key=lambda item: item.index)

  masked_lm_positions = [item.index for item in masked_lms]
  masked_lm_labels = [item.label for item in masked_lms]

  return (output_tokens, masked_lm_positions, masked_lm_labels)
|
||||
|
||||
|
||||
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
  """Shrink two token lists in place until they jointly fit a budget.

  One token at a time is removed from the longer list (from `tokens_b` on a
  tie) until `len(tokens_a) + len(tokens_b) <= max_num_tokens`.

  Args:
    tokens_a: First token list; mutated in place.
    tokens_b: Second token list; mutated in place.
    max_num_tokens: Maximum combined length allowed.
    rng: Random generator providing `random()`.
  """
  while len(tokens_a) + len(tokens_b) > max_num_tokens:
    if len(tokens_a) > len(tokens_b):
      victim = tokens_a
    else:
      victim = tokens_b
    assert len(victim) >= 1

    # Drop from the front or the back with equal probability to add more
    # randomness and avoid a positional bias.
    drop_front = rng.random() < 0.5
    if drop_front:
      del victim[0]
    else:
      victim.pop()
|
||||
|
||||
|
||||
def main(_):
  """Entry point: read text, build training instances, write TFRecords."""
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # `input_file` is a comma-separated list of glob patterns.
  input_files = [
      path
      for input_pattern in FLAGS.input_file.split(",")
      for path in tf.gfile.Glob(input_pattern)
  ]

  tf.logging.info("*** Reading from input files ***")
  for path in input_files:
    tf.logging.info("  %s", path)

  # A seeded generator keeps data generation reproducible.
  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
      FLAGS.max_predictions_per_seq, rng)

  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for path in output_files:
    tf.logging.info("  %s", path)

  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)
|
||||
|
||||
|
||||
# Script entry point: the three path flags have no usable default (None), so
# they are marked as required before the app runner parses flags and calls
# `main`.
if __name__ == "__main__":
  flags.mark_flag_as_required("input_file")
  flags.mark_flag_as_required("output_file")
  flags.mark_flag_as_required("vocab_file")
  tf.app.run()
|
||||
+419
@@ -0,0 +1,419 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Extract pre-computed feature vectors from BERT."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import codecs
|
||||
import collections
|
||||
import json
|
||||
import re
|
||||
|
||||
import modeling
|
||||
import tokenization
|
||||
import tensorflow as tf
|
||||
|
||||
# Command-line flag definitions for the feature-extraction script.
# `FLAGS` exposes the parsed values.
flags = tf.flags

FLAGS = flags.FLAGS

# Path of the text file passed to `read_examples` (help text left empty).
flags.DEFINE_string("input_file", None, "")

# Path opened for writing the extraction results (help text left empty).
flags.DEFINE_string("output_file", None, "")

# Comma-separated integers; each is used as an index into the list of encoder
# layer outputs, so negative values count from the last layer.
flags.DEFINE_string("layers", "-1,-2,-3,-4", "")

flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")

flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded.")

flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")

# --- TPU-related options. ---
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")

flags.DEFINE_string("master", None,
                    "If using a TPU, the address of the master.")

flags.DEFINE_integer(
    "num_tpu_cores", 8,
    "Only used if `use_tpu` is True. Total number of TPU cores to use.")

flags.DEFINE_bool(
    "use_one_hot_embeddings", False,
    "If True, tf.one_hot will be used for embedding lookups, otherwise "
    "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
    "since it is much faster.")
|
||||
|
||||
|
||||
class InputExample(object):
  """One raw input example: an id plus one or two pieces of text.

  Attributes:
    unique_id: Identifier of the example.
    text_a: First text.
    text_b: Second text, or None when the example has a single text.
  """

  def __init__(self, unique_id, text_a, text_b):
    self.text_a = text_a
    self.text_b = text_b
    self.unique_id = unique_id
|
||||
|
||||
|
||||
class InputFeatures(object):
  """Model-ready features for a single example.

  Attributes:
    unique_id: Identifier of the source example.
    tokens: Tokens of the sequence, including the special markers.
    input_ids: Vocabulary ids for the sequence.
    input_mask: Mask parallel to `input_ids`.
    input_type_ids: Segment ids parallel to `input_ids`.
  """

  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
    # Identity of the example and its readable form.
    self.unique_id = unique_id
    self.tokens = tokens
    # Numeric inputs.
    self.input_type_ids = input_type_ids
    self.input_mask = input_mask
    self.input_ids = input_ids
|
||||
|
||||
|
||||
def input_fn_builder(features, seq_length):
  """Build an `input_fn` closure that serves `features` from memory.

  Args:
    features: Sized collection of `InputFeatures`.
    seq_length: Length of each per-token feature vector.

  Returns:
    A function taking `params` (with key "batch_size") and returning a
    batched `tf.data.Dataset`.
  """
  # Snapshot the per-example fields column-wise at build time.
  all_unique_ids = [feature.unique_id for feature in features]
  all_input_ids = [feature.input_ids for feature in features]
  all_input_mask = [feature.input_mask for feature in features]
  all_input_type_ids = [feature.input_type_ids for feature in features]

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]
    num_examples = len(features)
    matrix_shape = [num_examples, seq_length]

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    tensors = {
        "unique_ids":
            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
        "input_ids":
            tf.constant(all_input_ids, shape=matrix_shape, dtype=tf.int32),
        "input_mask":
            tf.constant(all_input_mask, shape=matrix_shape, dtype=tf.int32),
        "input_type_ids":
            tf.constant(
                all_input_type_ids, shape=matrix_shape, dtype=tf.int32),
    }
    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    # Keep the final partial batch so every example is served.
    return dataset.batch(batch_size=batch_size, drop_remainder=False)

  return input_fn
|
||||
|
||||
|
||||
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator.

  Args:
    bert_config: Configuration object passed to `modeling.BertModel`.
    init_checkpoint: Checkpoint path used to initialize trainable variables.
    layer_indexes: Indexes into the list of encoder layer outputs to export.
    use_tpu: If true, checkpoint initialization is deferred to a scaffold
      function instead of being run while the graph is built.
    use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

  Returns:
    A `model_fn(features, labels, mode, params)` that supports only PREDICT
    mode.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    # Inference only: the model is built with is_training=False.
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    # Map graph variables to checkpoint variables for initialization.
    tvars = tf.trainable_variables()
    scaffold_fn = None
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    if use_tpu:

      # On TPU the initialization is wrapped in a scaffold function that is
      # handed to the estimator spec below.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    # Log every trainable variable and whether it is loaded from the
    # checkpoint.
    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {
        "unique_id": unique_ids,
    }

    # Output keys are numbered by position in `layer_indexes`, not by the
    # layer index itself.
    for (i, layer_index) in enumerate(layer_indexes):
      predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn
|
||||
|
||||
|
||||
def convert_examples_to_features(examples, seq_length, tokenizer):
  """Convert `InputExample`s into padded `InputFeatures`.

  The layout follows the BERT convention:
    (a) Sequence pairs:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    (b) Single sequences:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids: 0     0   0   0  0     0 0

  Args:
    examples: Iterable of examples with `unique_id`, `text_a` and `text_b`.
    seq_length: Fixed length of every produced feature vector.
    tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    List of `InputFeatures`, one per example, in input order.
  """
  features = []
  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None

    if tokens_b:
      # Trim the pair in place; "- 3" leaves room for [CLS], [SEP], [SEP].
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    elif len(tokens_a) > seq_length - 2:
      # Single sequence; "- 2" leaves room for [CLS] and [SEP].
      tokens_a = tokens_a[0:(seq_length - 2)]

    # First segment (type id 0), wrapped in [CLS] ... [SEP].
    tokens = ["[CLS]"]
    tokens.extend(tokens_a)
    tokens.append("[SEP]")
    input_type_ids = [0] * len(tokens)

    # Optional second segment (type id 1), terminated by [SEP].
    if tokens_b:
      tokens.extend(tokens_b)
      tokens.append("[SEP]")
      input_type_ids.extend([1] * (len(tokens_b) + 1))

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length (no-op when already long enough).
    pad_len = seq_length - len(input_ids)
    if pad_len > 0:
      zeros = [0] * pad_len
      input_ids.extend(zeros)
      input_mask.extend(zeros)
      input_type_ids.extend(zeros)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    # Log the first few examples for manual inspection.
    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(tok) for tok in tokens]))
      tf.logging.info("input_ids: %s" % " ".join(
          [str(value) for value in input_ids]))
      tf.logging.info("input_mask: %s" % " ".join(
          [str(value) for value in input_mask]))
      tf.logging.info("input_type_ids: %s" % " ".join(
          [str(value) for value in input_type_ids]))

    feature = InputFeatures(
        unique_id=example.unique_id,
        tokens=tokens,
        input_ids=input_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids)
    features.append(feature)
  return features
|
||||
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b):
|
||||
tokens_a.pop()
|
||||
else:
|
||||
tokens_b.pop()
|
||||
|
||||
|
||||
def read_examples(input_file):
  """Read `InputExample`s from a text file, one example per line.

  A line of the form "first ||| second" yields a two-text example; any other
  line becomes a single-text example with `text_b` set to None. Examples are
  numbered from 0 in file order.

  Args:
    input_file: Path of the file to read.

  Returns:
    List of `InputExample` objects.
  """
  examples = []
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      raw_line = tokenization.convert_to_unicode(reader.readline())
      if not raw_line:
        # End of file.
        break
      content = raw_line.strip()
      pair = re.match(r"^(.*) \|\|\| (.*)$", content)
      if pair is not None:
        text_a, text_b = pair.group(1), pair.group(2)
      else:
        text_a, text_b = content, None
      # The running count of examples doubles as the unique id.
      examples.append(
          InputExample(unique_id=len(examples), text_a=text_a, text_b=text_b))
  return examples
|
||||
|
||||
|
||||
def main(_):
    """Extract per-token layer activations and write them as JSON lines.

    Reads examples from `FLAGS.input_file`, runs them through the estimator
    built from `FLAGS.bert_config_file` / `FLAGS.init_checkpoint`, and writes
    one JSON object per prediction to `FLAGS.output_file`. Each object holds
    the example id under the key "linex_index" and, for every token, the
    values of each layer listed in `FLAGS.layers`, rounded to 6 decimals.

    Args:
      _: Unused positional argument supplied by `tf.app.run`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    layer_indexes = [int(index) for index in FLAGS.layers.split(",")]
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    tpu_config = tf.contrib.tpu.TPUConfig(
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master, tpu_config=tpu_config)

    examples = read_examples(FLAGS.input_file)
    features = convert_examples_to_features(
        examples=examples,
        seq_length=FLAGS.max_seq_length,
        tokenizer=tokenizer)

    # Predictions only carry the id, so keep a lookup back to the tokens.
    unique_id_to_feature = {
        feature.unique_id: feature for feature in features
    }

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(
        features=features, seq_length=FLAGS.max_seq_length)

    output_stream = tf.gfile.Open(FLAGS.output_file, "w")
    with codecs.getwriter("utf-8")(output_stream) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]

            token_records = []
            for token_pos, token in enumerate(feature.tokens):
                layer_records = []
                for output_pos, layer_index in enumerate(layer_indexes):
                    activations = result["layer_output_%d" % output_pos]
                    token_row = activations[token_pos:(token_pos + 1)]
                    layer_records.append(collections.OrderedDict([
                        ("index", layer_index),
                        ("values",
                         [round(float(x), 6) for x in token_row.flat]),
                    ]))
                token_records.append(collections.OrderedDict([
                    ("token", token),
                    ("layers", layer_records),
                ]))

            output_json = collections.OrderedDict([
                ("linex_index", unique_id),
                ("features", token_records),
            ])
            writer.write(json.dumps(output_json) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Mark the mandatory flags (in the same order as before) so a missing
    # value is reported at flag-parsing time, then hand control to
    # tf.app.run().
    for required_flag in ("input_file", "vocab_file", "bert_config_file",
                          "init_checkpoint", "output_file"):
        flags.mark_flag_as_required(required_flag)
    tf.app.run()
|
||||
@@ -0,0 +1,35 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
|
||||
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Custom variable getter that stores trainable variables in float32.

    Trainable variables are requested from `getter` as float32 and, when the
    caller asked for a different `dtype`, the returned value is a cast of that
    variable to `dtype`. Non-trainable variables are requested with `dtype`
    unchanged and returned as is.

    Args:
      getter: The underlying variable getter to delegate to.
      name: Variable name, forwarded to `getter`.
      shape: Variable shape, forwarded to `getter`.
      dtype: The dtype the caller wants to compute with.
      initializer: Forwarded to `getter`.
      regularizer: Forwarded to `getter`.
      trainable: Whether the variable is trainable; selects float32 storage.
      *args: Extra positional arguments forwarded to `getter`.
      **kwargs: Extra keyword arguments forwarded to `getter`.

    Returns:
      The variable returned by `getter`, or a cast of it to `dtype`.
    """
    if trainable:
        storage_dtype = tf.float32
    else:
        storage_dtype = dtype

    stored = getter(name, shape, *args,
                    dtype=storage_dtype,
                    initializer=initializer,
                    regularizer=regularizer,
                    trainable=trainable,
                    **kwargs)

    needs_cast = trainable and dtype != tf.float32
    if not needs_cast:
        return stored
    return tf.cast(stored, dtype)
|
||||
|
||||
+141
@@ -0,0 +1,141 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import six
|
||||
import tensorflow as tf
|
||||
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.contrib.layers.python.layers import utils
|
||||
from tensorflow.contrib.framework.python.ops import variables
|
||||
from tensorflow.python.ops import init_ops
|
||||
import numpy
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.ops import nn
|
||||
|
||||
def fused_layer_norm(inputs,
                     center=True,
                     scale=True,
                     activation_fn=None,
                     reuse=None,
                     variables_collections=None,
                     outputs_collections=None,
                     trainable=True,
                     begin_norm_axis=1,
                     begin_params_axis=-1,
                     scope=None,
                     use_fused_batch_norm=False):
    """Layer normalization with an optional `fused_batch_norm` code path.

    Normalizes `inputs` over the axes `begin_norm_axis:` and applies the
    learned offset `beta` (when `center`) and scale `gamma` (when `scale`),
    both of shape `inputs.shape[begin_params_axis:]`.

    Args:
      inputs: Tensor (or value convertible to one) with a known rank.
      center: If true, create and add the `beta` offset.
      scale: If true, create and multiply by the `gamma` scale.
      activation_fn: Optional callable applied to the normalized output.
      reuse: Forwarded to `tf.variable_scope`.
      variables_collections: Collections for the created variables, resolved
        per variable through `utils.get_variable_collections`.
      outputs_collections: Collections the output is added to.
      trainable: Whether `beta` / `gamma` are trainable.
      begin_norm_axis: First axis to normalize over; negative values count
        from the end.
      begin_params_axis: First axis covered by `beta` / `gamma`.
      scope: Optional variable scope; defaults to 'LayerNorm'.
      use_fused_batch_norm: If true, reshape to a 4-D tensor and normalize
        with `nn.fused_batch_norm` (NCHW) instead of `nn.moments` +
        `nn.batch_normalization`.

    Returns:
      The result of `utils.collect_named_outputs` for the normalized (and
      optionally activated) tensor.

    Raises:
      ValueError: If the rank of `inputs` is unknown, if either axis is not
        smaller than the rank, or if the parameter shape is not fully defined.
    """
    # Both code paths use the same epsilon.
    norm_epsilon = 1e-4

    with tf.variable_scope(
            scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.shape
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError(
                'Inputs %s has undefined rank.' % inputs.name)
        param_dtype = inputs.dtype.base_dtype

        if begin_norm_axis < 0:
            begin_norm_axis = inputs_rank + begin_norm_axis
        axis_out_of_range = (begin_params_axis >= inputs_rank or
                             begin_norm_axis >= inputs_rank)
        if axis_out_of_range:
            raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                             'must be < rank(inputs) (%d)' %
                             (begin_params_axis, begin_norm_axis, inputs_rank))

        params_shape = inputs_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
                (inputs.name, begin_params_axis, inputs_shape))

        # Allocate parameters for the beta and gamma of the normalization
        # (beta first, then gamma).
        beta = None
        gamma = None
        if center:
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=param_dtype,
                initializer=init_ops.zeros_initializer(),
                collections=utils.get_variable_collections(
                    variables_collections, 'beta'),
                trainable=trainable)
        if scale:
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=param_dtype,
                initializer=init_ops.ones_initializer(),
                collections=utils.get_variable_collections(
                    variables_collections, 'gamma'),
                trainable=trainable)

        if use_fused_batch_norm:
            # Use the static TensorShape when it is fully defined, otherwise
            # fall back to the runtime shape tensor.
            norm_shape = inputs.shape[begin_norm_axis:]
            if norm_shape.is_fully_defined():
                bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
            else:
                norm_shape = tf.shape(inputs)[begin_norm_axis:]
                bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]

            if inputs.get_shape().is_fully_defined():
                outputs_shape = inputs.get_shape()
            else:
                outputs_shape = tf.shape(inputs)

            reshaped = array_ops.reshape(inputs, bn_shape)
            if reshaped.get_shape().is_fully_defined():
                # Static shape is fully defined after the reshape.
                channel_count = reshaped.get_shape()[1]
            else:
                # Static shape NOT fully defined after the reshape: the
                # scale/offset inputs must be built from the dynamic shape at
                # runtime, which causes a slowdown.
                channel_count = tf.shape(reshaped)[1]
            ones = array_ops.ones(channel_count, dtype=dtypes.float32)
            zeros = array_ops.zeros(channel_count, dtype=dtypes.float32)

            # The batch mean/variance outputs are not needed.
            outputs, _, _ = nn.fused_batch_norm(
                reshaped,
                ones, zeros,
                epsilon=norm_epsilon,
                data_format="NCHW")
            outputs = array_ops.reshape(outputs, outputs_shape)

            # Scale first, then shift.
            if scale:
                outputs = outputs * gamma
            if center:
                outputs = outputs + beta
        else:
            # Calculate the moments on the last axes (layer activations).
            norm_axes = list(range(begin_norm_axis, inputs_rank))
            mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
            # Compute layer normalization using the batch_normalization
            # function.
            outputs = nn.batch_normalization(
                inputs,
                mean,
                variance,
                offset=beta,
                scale=gamma,
                variance_epsilon=norm_epsilon)
            outputs.set_shape(inputs_shape)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(
            outputs_collections, sc.name, outputs)
|
||||
|
||||
+36
@@ -0,0 +1,36 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
|
||||
def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
                                    initializer=None, regularizer=None,
                                    trainable=True,
                                    *args, **kwargs):
    """Custom variable getter that forces float32 storage for trainables.

    A trainable variable is always created through `getter` as float32; if the
    requested `dtype` is not float32 the caller receives a cast of it to
    `dtype`. A non-trainable variable is created with `dtype` and returned
    untouched.

    Args:
      getter: The wrapped variable getter.
      name: Variable name passed through to `getter`.
      shape: Variable shape passed through to `getter`.
      dtype: The precision the caller wants to compute in.
      initializer: Passed through to `getter`.
      regularizer: Passed through to `getter`.
      trainable: Whether the variable is trainable.
      *args: Additional positional arguments for `getter`.
      **kwargs: Additional keyword arguments for `getter`.

    Returns:
      The created variable, or its cast to `dtype`.
    """
    forwarded = dict(initializer=initializer,
                     regularizer=regularizer,
                     trainable=trainable)
    forwarded.update(kwargs)

    if not trainable:
        # Nothing to protect: keep whatever precision was requested.
        return getter(name, shape, *args, dtype=dtype, **forwarded)

    master_copy = getter(name, shape, *args, dtype=tf.float32, **forwarded)
    if dtype != tf.float32:
        master_copy = tf.cast(master_copy, dtype)
    return master_copy
|
||||
|
||||
def get_custom_getter(compute_type):
    """Return the variable getter to use for `compute_type`.

    Args:
      compute_type: The dtype the model computes in.

    Returns:
      `float32_variable_storage_getter` when `compute_type` is `tf.float16`,
      otherwise None (no custom getter).
    """
    if compute_type == tf.float16:
        return float32_variable_storage_getter
    return None
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,439 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Functions and classes related to optimization (weight updates)."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import linalg_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
|
||||
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
|
||||
from npu_bridge.estimator.npu import npu_loss_scale_optimizer as lso
|
||||
from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
|
||||
|
||||
def _all_gradients_finite(grads, check_overflow):
    """Return a scalar bool tensor telling whether every gradient is finite.

    When `check_overflow` is false no per-gradient ops are built and a
    constant True is returned.
    """
    if not check_overflow:
        return tf.constant(True, dtype=tf.bool)
    return tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads])


def _clip_gradients(grads, all_are_finite, use_global_norm):
    """Clip `grads` to norm 1.0, jointly (global norm) or tensor by tensor."""
    if use_global_norm:
        # This is how the model was pre-trained. The norm handed to
        # clip_by_global_norm is replaced by 1.0 when a non-finite gradient
        # was detected, so that it is always a finite number.
        clipped_grads, _ = tf.clip_by_global_norm(
            grads, clip_norm=1.0,
            use_norm=tf.cond(
                all_are_finite,
                lambda: tf.global_norm(grads),
                lambda: tf.constant(1.0)))
        return clipped_grads
    with tf.name_scope("clip_grads"):
        # `grads` never contains None here: callers filter those out first.
        return [tf.clip_by_norm(grad, clip_norm=1.0) for grad in grads]


def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False, num_accumulation_steps=1,
                     optimizer_type="adam", allreduce_post_accumulation=False):
    """Creates an optimizer training op.

    Builds a polynomially decayed learning rate with optional linear warmup,
    wraps the selected optimizer for NPU distributed training (and NPU loss
    scaling when enabled), computes and clips gradients, and returns the op
    that applies them. With `num_accumulation_steps > 1` gradients are summed
    into `<var>/accum` variables and applied every `num_accumulation_steps`
    calls.

    Reads the flags `npu_bert_loss_scale`, `init_loss_scale_value`,
    `distributed` and `npu_bert_clip_by_global_norm` from `tf.flags.FLAGS`.

    Args:
      loss: Scalar loss tensor to minimize.
      init_lr: Initial (peak) learning rate.
      num_train_steps: Number of steps over which the rate decays to 0.
      num_warmup_steps: Number of linear warmup steps; a falsy value (0 or
        None) disables warmup.
      hvd: Optional `horovod.tensorflow` module; None disables Horovod.
      manual_fp16: Together with `use_fp16`, enables fp16 Horovod compression
        and the gradient overflow check.
      use_fp16: See `manual_fp16`.
      num_accumulation_steps: Number of micro-batches accumulated per update.
      optimizer_type: "lamb" selects `LAMBOptimizer`; anything else selects
        `AdamWeightDecayOptimizer`. "adam" uses decay power 1.0, every other
        value 0.5.
      allreduce_post_accumulation: With Horovod, all-reduce the accumulated
        gradients once per update instead of wrapping the optimizer.

    Returns:
      The training op.

    Raises:
      ValueError: If `npu_bert_loss_scale` is neither None, -1, 0 nor >= 1.
    """
    global_step = tf.train.get_or_create_global_step()
    half_precision = use_fp16 or manual_fp16
    # The warmup block below treats a falsy value as "no warmup"; do the same
    # here so None does not crash in float().
    warmup_steps = num_warmup_steps or 0

    # avoid step change in learning rate at end of warmup phase
    if optimizer_type == "adam":
        power = 1.0
        decayed_learning_rate_at_crossover_point = init_lr * (
            (1.0 - float(warmup_steps) / float(num_train_steps)) ** power)
    else:
        power = 0.5
        decayed_learning_rate_at_crossover_point = init_lr

    # adjusted_init_lr is only reported, never used for training, so a zero
    # crossover rate (warmup == train steps) must not abort graph building.
    # NOTE(review): the schedule below starts from init_lr, not from
    # adjusted_init_lr — confirm this is intended.
    if decayed_learning_rate_at_crossover_point:
        adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
    else:
        adjusted_init_lr = float("inf")
    print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, adjusted_init_lr))

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=power,
        cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = (
            (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

    if optimizer_type == "lamb":
        print("Initializing LAMB Optimizer")
        optimizer = LAMBOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        print("Initializing ADAM Weight Decay Optimizer")
        # It is recommended that you use this optimizer for fine tuning, since
        # this is how the model was trained (note that the Adam m/v variables
        # are NOT loaded from init_checkpoint.)
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-4,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
        # `Compression` is reached through the hvd module: no bare
        # `Compression` name is imported in this file.
        optimizer = hvd.DistributedOptimizer(
            optimizer, sparse_as_dense=True,
            compression=hvd.Compression.fp16 if half_precision else hvd.Compression.none)

    optimizer = NPUDistributedOptimizer(optimizer)
    loss_scale = tf.flags.FLAGS.npu_bert_loss_scale
    loss_scaling_enabled = loss_scale not in [None, -1]
    if loss_scaling_enabled:
        opt_tmp = optimizer
        if loss_scale == 0:
            # 0 selects dynamic loss scaling.
            loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(init_loss_scale=tf.flags.FLAGS.init_loss_scale_value, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
        elif loss_scale >= 1:
            # Values >= 1 are used as a fixed loss scale.
            loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=loss_scale)
        else:
            raise ValueError("Invalid loss scale: %d" % loss_scale)
        optimizer = lso.NPULossScaleOptimizer(opt_tmp, loss_scale_manager, is_distributed=tf.flags.FLAGS.distributed)

    # Non-finite gradients are only looked for when loss scaling is active
    # and half precision is in use.
    check_overflow = loss_scaling_enabled and half_precision

    tvars = tf.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)

    if num_accumulation_steps > 1:
        local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
                                     initializer=tf.zeros_initializer)
        batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
                                       initializer=tf.ones_initializer)
        accum_vars = [tf.get_variable(
            name=tvar.name.split(":")[0] + "/accum",
            shape=tvar.shape.as_list(),
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]

        # The lambdas below run while tf.cond builds the graph, i.e. before
        # the name on the left-hand side is rebound, so they still see the
        # variables created above.
        reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
        local_step = tf.cond(reset_step, lambda: local_step.assign(tf.ones_like(local_step)), lambda: local_step.assign_add(1))

        # NOTE(review): the scope name was an undefined bare identifier; the
        # string literal below and the extent of this scope are reconstructed
        # — confirm against the intended graph naming.
        with tf.name_scope("accumulate_step"):
            grads_and_vars_and_accums = [(gv[0], gv[1], accum_vars[i]) for i, gv in enumerate(grads_and_vars) if gv[0] is not None]
            grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))

            all_are_finite = _all_gradients_finite(grads, check_overflow)
            batch_finite = tf.cond(
                reset_step,
                lambda: batch_finite.assign(tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
                lambda: batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))

            clipped_grads = _clip_gradients(
                grads, all_are_finite,
                tf.flags.FLAGS.npu_bert_clip_by_global_norm)

            # Start a fresh sum on the first micro-batch, add otherwise.
            accum_vars = tf.cond(
                reset_step,
                lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
                lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])

        def update(accum_vars):
            with tf.name_scope("opt_update"):
                if allreduce_post_accumulation and hvd is not None:
                    compression = hvd.Compression.fp16 if half_precision else hvd.Compression.none
                    accum_vars = [
                        hvd.allreduce(tf.convert_to_tensor(accum_var), compression=compression)
                        if isinstance(accum_var, tf.IndexedSlices)
                        else hvd.allreduce(accum_var, compression=compression)
                        for accum_var in accum_vars]
                return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)

        update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool), name="update_step")
        update_op = tf.cond(update_step,
                            lambda: update(accum_vars), lambda: tf.no_op())

        # Only consult the other workers when Horovod is actually in use;
        # hvd defaults to None.
        if hvd is not None:
            batch_finite_everywhere = tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)
        else:
            batch_finite_everywhere = batch_finite
        new_global_step = tf.cond(tf.math.logical_and(update_step, batch_finite_everywhere), lambda: global_step + 1, lambda: global_step)
        new_global_step = tf.identity(new_global_step, name='step_update')
        # NOTE(review): the optimizers in this file also increment
        # global_step inside apply_gradients, so on this path the step is
        # assigned twice — verify the resulting step count.
        train_op = tf.group(update_op, [global_step.assign(new_global_step)])
    else:
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
        grads, tvars = list(zip(*grads_and_vars))

        use_global_norm = tf.flags.FLAGS.npu_bert_clip_by_global_norm
        # The finiteness check is only needed by the global-norm clipping.
        all_are_finite = (
            _all_gradients_finite(grads, check_overflow)
            if use_global_norm else None)
        clipped_grads = _clip_gradients(grads, all_are_finite, use_global_norm)

        # global_step is passed to apply_gradients; no separate increment is
        # added here.
        with tf.name_scope("apply_grads"):
            train_op = optimizer.apply_gradients(
                list(zip(clipped_grads, tvars)), global_step=global_step)
    return train_op
|
||||
|
||||
|
||||
class AdamWeightDecayOptimizer(tf.train.Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay."""

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-4,
                 exclude_from_weight_decay=None,
                 name="AdamWeightDecayOptimizer"):
        """Constructs a AdamWeightDecayOptimizer.

        Args:
          learning_rate: Scalar learning rate (tensor or Python number).
          weight_decay_rate: Decoupled weight-decay coefficient; a falsy value
            disables weight decay.
          beta_1: Decay rate of the first-moment estimate.
          beta_2: Decay rate of the second-moment estimate.
          epsilon: Constant added to the denominator of the update.
          exclude_from_weight_decay: Optional list of regex patterns; a
            variable whose name matches any of them is not decayed.
          name: Name of the optimizer.
        """
        super(AdamWeightDecayOptimizer, self).__init__(False, name)

        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay

    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
                        manual_fp16=False):
        """See base class.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs; pairs with a
            None gradient or variable are skipped.
          global_step: Optional variable incremented by one as part of the
            returned op. When None, no step update is added.
          name: Optional name for the returned op.
          manual_fp16: If true, variables that are not float32 are updated
            through a float32 "<name>/shadow" copy.

        Returns:
          An op grouping all variable, moment and step assignments.
        """
        assignments = []
        for (grad, param) in grads_and_vars:
            with tf.name_scope("apply_one_adam"):
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)
                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
                if has_shadow:
                    # create shadow fp32 weights for fp16 variable
                    param_fp32 = tf.get_variable(
                        name=param_name + "/shadow",
                        dtype=tf.float32,
                        trainable=False,
                        initializer=tf.cast(param.initialized_value(), tf.float32))
                else:
                    param_fp32 = param

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                # Standard Adam update.
                next_m = (
                    tf.multiply(self.beta_1, m) +
                    tf.multiply(1.0 - self.beta_1, grad))
                next_v = (
                    tf.multiply(self.beta_2, v) +
                    tf.multiply(1.0 - self.beta_2, tf.square(grad)))

                update = next_m / (tf.sqrt(next_v) + self.epsilon)

                # Just adding the square of the weights to the loss function
                # is *not* the correct way of using L2 regularization/weight
                # decay with Adam, since that will interact with the m and v
                # parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that
                # doesn't interact with the m/v parameters. This is equivalent
                # to adding the square of the weights to the loss with plain
                # (non-momentum) SGD.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param_fp32

                update_with_lr = self.learning_rate * update

                next_param = param_fp32 - update_with_lr

                if has_shadow:
                    # Cast shadow fp32 weights to fp16 and assign to the
                    # trainable variable. The assign op must be part of the
                    # returned group, otherwise it is never executed.
                    assignments.append(
                        param.assign(tf.cast(next_param, param.dtype.base_dtype)))
                assignments.extend(
                    [param_fp32.assign(next_param),
                     m.assign(next_m),
                     v.assign(next_v)])

        # global_step is optional (default None): only advance it when given.
        if global_step is not None:
            new_global_step = tf.identity(global_step + 1, name='step_update')
            assignments.append(global_step.assign(new_global_step))
        return tf.group(*assignments, name=name)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name (drops a ":<n>" suffix)."""
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
|
||||
|
||||
|
||||
class LAMBOptimizer(tf.train.Optimizer):
    """A LAMB optimizer that includes "correct" L2 weight decay."""

    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="LAMBOptimizer"):
        """Constructs a LAMBOptimizer.

        Args:
          learning_rate: Scalar learning rate (tensor or Python number).
          weight_decay_rate: Decoupled weight-decay coefficient; a falsy value
            disables weight decay.
          beta_1: Decay rate of the first-moment estimate.
          beta_2: Decay rate of the second-moment estimate.
          epsilon: Constant added to the denominator of the update.
          exclude_from_weight_decay: Optional list of regex patterns; a
            variable whose name matches any of them is not decayed.
          name: Name of the optimizer.
        """
        super(LAMBOptimizer, self).__init__(False, name)

        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay
        # Python-side counter used for the bias correction in apply_gradients.
        self.steps = 0

    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
                        manual_fp16=False):
        """See base class.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs; pairs with a
            None gradient or variable are skipped.
          global_step: Optional variable incremented by one as part of the
            returned op. When None, no step update is added.
          name: Optional name for the returned op.
          manual_fp16: If true, variables that are not float32 are updated
            through a float32 "<name>/shadow" copy.

        Returns:
          An op grouping all variable, moment and step assignments.
        """
        assignments = []
        for (grad, param) in grads_and_vars:
            with tf.name_scope("apply_one_lamb"):
                if grad is None or param is None:
                    continue

                param_name = self._get_variable_name(param.name)
                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
                if has_shadow:
                    # create shadow fp32 weights for fp16 variable
                    param_fp32 = tf.get_variable(
                        name=param_name + "/shadow",
                        dtype=tf.float32,
                        trainable=False,
                        initializer=tf.cast(param.initialized_value(), tf.float32))
                else:
                    param_fp32 = param

                m = tf.get_variable(
                    name=param_name + "/adam_m",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())
                v = tf.get_variable(
                    name=param_name + "/adam_v",
                    shape=param.shape.as_list(),
                    dtype=tf.float32,
                    trainable=False,
                    initializer=tf.zeros_initializer())

                # LAMB update
                next_m = (
                    tf.multiply(self.beta_1, m) +
                    tf.multiply(1.0 - self.beta_1, grad))
                next_v = (
                    tf.multiply(self.beta_2, v) +
                    tf.multiply(1.0 - self.beta_2, tf.square(grad)))

                # NOTE(review): `self.steps` is a Python int bumped once per
                # variable while the graph is built, so the bias correction
                # below is a per-variable constant rather than a function of
                # the training step — confirm this is intended.
                self.steps += 1
                beta1_correction = (1 - self.beta_1 ** self.steps)
                beta2_correction = (1 - self.beta_2 ** self.steps)

                next_m_unbiased = next_m / beta1_correction
                next_v_unbiased = next_v / beta2_correction

                update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)

                # Just adding the square of the weights to the loss function
                # is *not* the correct way of using L2 regularization/weight
                # decay with Adam, since that will interact with the m and v
                # parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that
                # doesn't interact with the m/v parameters. This is equivalent
                # to adding the square of the weights to the loss with plain
                # (non-momentum) SGD.
                if self._do_use_weight_decay(param_name):
                    update += self.weight_decay_rate * param_fp32

                # Layer-wise trust ratio ||w|| / ||update||; falls back to 1.0
                # when either norm is zero.
                w_norm = linalg_ops.norm(param, ord=2)
                g_norm = linalg_ops.norm(update, ord=2)
                ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
                    math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)

                update_with_lr = ratio * self.learning_rate * update

                next_param = param_fp32 - update_with_lr

                if has_shadow:
                    # Cast shadow fp32 weights to fp16 and assign to the
                    # trainable variable. The assign op must be part of the
                    # returned group, otherwise it is never executed.
                    assignments.append(
                        param.assign(tf.cast(next_param, param.dtype.base_dtype)))
                assignments.extend(
                    [param_fp32.assign(next_param),
                     m.assign(next_m),
                     v.assign(next_v)])

        # global_step is optional (default None): only advance it when given.
        if global_step is not None:
            new_global_step = tf.identity(global_step + 1, name='step_update')
            assignments.append(global_step.assign(new_global_step))
        return tf.group(*assignments, name=name)

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay_rate:
            return False
        if self.exclude_from_weight_decay:
            for r in self.exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True

    def _get_variable_name(self, param_name):
        """Get the variable name from the tensor name (drops a ":<n>" suffix)."""
        m = re.match("^(.*):\\d+$", param_name)
        if m is not None:
            param_name = m.group(1)
        return param_name
|
||||
+784
@@ -0,0 +1,784 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Run masked LM/next sentence masked_lm pre-training for BERT."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import time
|
||||
import modeling
|
||||
import optimization
|
||||
import tensorflow as tf
|
||||
import glob
|
||||
from utils import LogEvalRunHook
|
||||
from tensorflow.core.protobuf import rewriter_config_pb2
|
||||
from gpu_environment import get_custom_getter
|
||||
|
||||
from npu_bridge.estimator.npu.npu_config import *
|
||||
from npu_bridge.estimator.npu.npu_estimator import *
|
||||
from npu_bridge.estimator.npu.npu_config import NPURunConfig
|
||||
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
|
||||
|
||||
# `sys` is imported explicitly: the file's import block never imports it, so
# the `sys.path.append` below would otherwise only work if one of the wildcard
# npu_bridge imports happened to leak the name.
import sys

# Make the shared `benchmark_log` package importable before importing from it.
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter

# Environment switches set at import time.
# NOTE(review): presumably consumed by the Ascend GE/HCCL runtime; their exact
# meaning is not visible in this file -- confirm against the platform docs.
os.environ['WHICH_OP'] = 'GEOP'
os.environ['NEW_GE_FE_ID'] = '1'
os.environ['GE_AICPU_FLAG'] = '1'
os.environ['GE_USE_STATIC_MEMORY'] = '1'
os.environ['OPTION_EXEC_HCCL_FLAG'] = '1'
os.environ['HCCL_CONNECT_TIMEOUT'] = '600'
|
||||
|
||||
flags = tf.flags

FLAGS = flags.FLAGS

## Required parameters
flags.DEFINE_string(
    "bert_config_file", None,
    "The config json file corresponding to the pre-trained BERT model. "
    "This specifies the model architecture.")

flags.DEFINE_string(
    "input_files_dir", "./data",
    "Directory with input files, comma separated or single directory.")

flags.DEFINE_string(
    "eval_files_dir", None,
    "Directory with eval files, comma separated or single directory. ")

flags.DEFINE_string(
    "output_dir", "./models",
    "The output directory where the model checkpoints will be written.")

## Other parameters
flags.DEFINE_string(
    "init_checkpoint", None,
    "Initial checkpoint (usually from a pre-trained BERT model).")

flags.DEFINE_string(
    "optimizer_type", "lamb",
    "Optimizer used for training - LAMB or ADAM")

flags.DEFINE_integer(
    "max_seq_length", 128,
    "The maximum total input sequence length after WordPiece tokenization. "
    "Sequences longer than this will be truncated, and sequences shorter "
    "than this will be padded. Must match data generation.")

flags.DEFINE_integer(
    "max_predictions_per_seq", 20,
    "Maximum number of masked LM predictions per sequence. "
    "Must match data generation.")

flags.DEFINE_bool("do_train", True, "Whether to run training.")

flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")

flags.DEFINE_integer("train_batch_size", 64, "Total batch size for training.")

flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")

flags.DEFINE_float("learning_rate", 1e-4, "The initial learning rate for Adam.")

flags.DEFINE_integer("num_train_steps", 1000000, "Number of training steps.")

flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")

flags.DEFINE_integer("save_checkpoints_steps", 10000,
                     "How often to save the model checkpoint.")

flags.DEFINE_integer("display_loss_steps", 10,
                     "How often to print loss")

flags.DEFINE_integer("iterations_per_loop", 1000,
                     "How many steps to make in each estimator call.")

flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")

# Help text fix: the two concatenated sentences previously ran together
# ("...update.Global batch size...").
flags.DEFINE_integer("num_accumulation_steps", 1,
                     "Number of accumulation steps before gradient update. "
                     "Global batch size = num_accumulation_steps * train_batch_size")

flags.DEFINE_bool("allreduce_post_accumulation", False, "Whether to all reduce after accumulation of N steps or after each step")

flags.DEFINE_bool(
    "verbose_logging", False,
    "If true, all of the trainable parameters are printed")

flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")

flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")

flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic on GPU. "
                  "Manual casting is done instead of using AMP")

flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")

flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")

flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.")

flags.DEFINE_bool("distributed", True, "Whether to use multi-npu")

flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')

flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')

flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')

flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")

flags.DEFINE_integer("npu_bert_loss_scale", 0, "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")

flags.DEFINE_bool("npu_bert_clip_by_global_norm", False, "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")

# Help text fix: this flag's description was copy-pasted from
# `npu_bert_fused_gelu` and described the wrong op.
flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined dropout op')

flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')

flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')

flags.DEFINE_integer('init_loss_scale_value', 2**32, 'Initial loss scale value for loss scale optimizer')
|
||||
|
||||
# report samples/sec, total loss and learning rate during training
class _LogSessionRunHook(tf.train.SessionRunHook):
    """Session hook that prints throughput, losses and learning rate.

    On every `session.run` it fetches the named tensors `global_step:0`,
    `total_loss:0`, `learning_rate:0`, `nsp_loss:0` and `mlm_loss:0`, plus:

      * `update_step:0` when `num_accumulation_steps > 1`;
      * `loss_scale:0` when dynamic loss scaling is active;
      * `apply_grads/All:0` (printed as "isFinite") when dynamic loss scaling
        is active and gradients are not accumulated.

    The same figures are forwarded to `hwlog.remark_print`.
    """

    def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
        """Store the reporting configuration.

        Args:
          global_batch_size: batch size used in the throughput computation.
          num_accumulation_steps: gradient accumulation step count; values
            above 1 make the hook fetch `update_step:0` as well.
          display_every: stored on the instance; not consulted by this hook.
          hvd_rank: when >= 0 the printed line is prefixed with the rank.
        """
        self.global_batch_size = global_batch_size
        self.display_every = display_every
        self.hvd_rank = hvd_rank
        self.num_accumulation_steps = num_accumulation_steps

    def after_create_session(self, session, coord):
        """Reset the timing and loss accumulators."""
        self.elapsed_secs = 0.
        self.count = 0
        self.all_count = 0
        self.avg_loss = 0.0

    def _uses_dynamic_loss_scale(self):
        """Return True when dynamic loss scaling is on for fp16 training."""
        return (FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16)

    def before_run(self, run_context):
        """Start the step timer and request the tensors to report."""
        self.t0 = time.time()
        accumulating = self.num_accumulation_steps > 1

        fetches = ['global_step:0']
        if accumulating:
            fetches.append('update_step:0')
        fetches.extend(['total_loss:0', 'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0'])
        if self._uses_dynamic_loss_scale():
            fetches.append('loss_scale:0')
            if not accumulating:
                # The finite-gradients indicator is only fetched without
                # accumulation.
                fetches.append('apply_grads/All:0')
        return tf.train.SessionRunArgs(fetches=fetches)

    def after_run(self, run_context, run_values):
        """Accumulate statistics and print a report on every update step."""
        self.elapsed_secs += time.time() - self.t0
        accumulating = self.num_accumulation_steps > 1
        dynamic_scale = self._uses_dynamic_loss_scale()

        # Unpack in the same order `before_run` requested the tensors.
        fetched = list(run_values.results)
        global_step = fetched.pop(0)
        update_step = fetched.pop(0) if accumulating else True
        total_loss, lr, nsp_loss, mlm_loss = fetched[:4]
        loss_scaler = fetched[4] if dynamic_scale else None
        # Previously this name was left undefined in the accumulation +
        # dynamic-loss-scale case and the print below raised NameError.
        custom_arg = fetched[5] if (dynamic_scale and not accumulating) else None

        print_step = global_step + 1  # One-based index for printing.
        self.avg_loss += total_loss
        self.all_count += 1
        if not update_step:
            return

        self.count += 1
        dt = self.elapsed_secs / self.count
        # NOTE(review): scaled by iterations_per_loop, presumably because one
        # session run executes that many steps on device -- confirm.
        sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
        avg_loss_step = self.avg_loss / self.all_count

        message = ('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
                   (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
        if self.hvd_rank >= 0:
            message = ('Rank = %2d :: ' % self.hvd_rank) + message
        if dynamic_scale:
            message += ' Loss scale = %6.4e' % loss_scaler
            if custom_arg is not None:
                message += ' isFinite = %6i' % custom_arg
        print(message, flush=True)

        hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
        hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)

        # Start a fresh measurement window after each report.
        self.elapsed_secs = 0.
        self.count = 0
        self.avg_loss = 0.0
        self.all_count = 0
|
||||
|
||||
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps,
                     use_one_hot_embeddings, hvd=None):
    """Build the `model_fn` closure handed to the Estimator.

    Args:
      bert_config: configuration object passed to `modeling.BertModel` and to
        the masked-LM / next-sentence heads.
      init_checkpoint: optional checkpoint path used to initialise trainable
        variables; skipped when falsy.
      learning_rate: forwarded to `optimization.create_optimizer`.
      num_train_steps: forwarded to `optimization.create_optimizer`.
      num_warmup_steps: forwarded to `optimization.create_optimizer`.
      use_one_hot_embeddings: forwarded to `modeling.BertModel`.
      hvd: optional object exposing `rank()`; also forwarded to
        `optimization.create_optimizer`. `None` disables the rank checks.

    Returns:
      A `model_fn(features, labels, mode, params)` callable that returns a
      `tf.estimator.EstimatorSpec` for TRAIN or EVAL mode.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Build the BERT pre-training graph for one Estimator mode.

        Raises:
          ValueError: if `mode` is neither TRAIN nor EVAL.
        """

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # The encoder computes in fp16 when --manual_fp16 is set, fp32 otherwise.
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            compute_type=tf.float16 if FLAGS.manual_fp16 else tf.float32)

        (masked_lm_loss,
         masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(), model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids,
             masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
             bert_config, model.get_pooled_output(), next_sentence_labels)

        # Give the losses fixed tensor names so that hooks can fetch them by
        # name ("mlm_loss:0", "nsp_loss:0", "total_loss:0").
        masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
        next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
        total_loss = masked_lm_loss + next_sentence_loss
        total_loss = tf.identity(total_loss, name='total_loss')

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        # Only initialise from the checkpoint when there is no `hvd` object or
        # this process is rank 0.
        if init_checkpoint and (hvd is None or hvd.rank() == 0):
            print("Loading checkpoint", init_checkpoint)
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)

            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if FLAGS.verbose_logging:
            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                tf.logging.info(" %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
                                init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                          masked_lm_weights, next_sentence_example_loss,
                          next_sentence_log_probs, next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                # Flatten everything so each masked position is one row.
                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                                 [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(
                    masked_lm_log_probs, axis=-1, output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                # Both masked-LM metrics are weighted by `masked_lm_weights`.
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_log_probs = tf.reshape(
                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
                next_sentence_predictions = tf.argmax(
                    next_sentence_log_probs, axis=-1, output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels, predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metric_ops = metric_fn(
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights, next_sentence_example_loss,
                next_sentence_log_probs, next_sentence_labels
            )
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metric_ops)
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

        return output_spec

    return model_fn
|
||||
|
||||
|
||||
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Compute the masked-LM loss and log probabilities.

    Args:
      bert_config: provides `hidden_size`, `hidden_act`, `initializer_range`
        and `vocab_size`.
      input_tensor: encoder sequence output; the vectors at `positions` are
        selected with `gather_indexes`.
      output_weights: matrix multiplied (transposed) with the transformed
        vectors to produce vocabulary logits.
      positions: indices of the masked tokens within each sequence.
      label_ids: target token ids for the masked positions.
      label_weights: per-position weights used to average the loss.

    Returns:
      A tuple `(loss, per_example_loss, log_probs)`.
    """
    hidden = gather_indexes(input_tensor, positions)
    use_half = FLAGS.use_fp16_cls
    transform_dtype = tf.float16 if use_half else tf.float32

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear projection ahead of the output layer; the
        # dense layer runs in `transform_dtype`, layer norm always in fp32.
        with tf.variable_scope("transform", custom_getter=get_custom_getter(compute_type=transform_dtype)):
            if use_half:
                hidden = tf.cast(hidden, tf.float16)
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            hidden = tf.cast(hidden, tf.float32)
            hidden = modeling.layer_norm(hidden)

        # Output-only bias, one entry per vocabulary token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())

        if use_half:
            half_hidden = tf.cast(hidden, tf.float16)
            half_weights = tf.cast(output_weights, tf.float16)
            logits = tf.cast(
                tf.matmul(half_hidden, half_weights, transpose_b=True),
                tf.float32)
        else:
            logits = tf.matmul(tf.cast(hidden, tf.float32), output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # Weighted mean of the per-position cross entropy; the 1e-5 term
        # keeps the division defined when all weights are zero.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        weighted_loss_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
        weight_sum = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_loss_sum / weight_sum

    return (loss, per_example_loss, log_probs)
|
||||
|
||||
|
||||
def get_next_sentence_output(bert_config, input_tensor, labels):
    """Compute the next-sentence-prediction loss and log probabilities.

    Args:
      bert_config: provides `hidden_size` and `initializer_range`.
      input_tensor: pooled encoder output fed to the 2-way classifier.
      labels: next-sentence labels; flattened before use.

    Returns:
      A tuple `(loss, per_example_loss, log_probs)`.
    """
    # Two-class classifier whose variables live under "cls/seq_relationship".
    with tf.variable_scope("cls/seq_relationship"):
        weight_init = modeling.create_initializer(bert_config.initializer_range)
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=weight_init)
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        if FLAGS.use_fp16_cls:
            # fp16 matmul, with the logits cast back to fp32 for the loss.
            half_input = tf.cast(input_tensor, tf.float16)
            half_weights = tf.cast(output_weights, tf.float16)
            logits = tf.cast(
                tf.matmul(half_input, half_weights, transpose_b=True),
                tf.float32)
        else:
            logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        return (tf.reduce_mean(per_example_loss), per_example_loss, log_probs)
|
||||
|
||||
|
||||
def gather_indexes(sequence_tensor, positions):
    """Select the vectors at `positions` from every sequence in the batch.

    Args:
      sequence_tensor: rank-3 tensor (rank is checked by
        `modeling.get_shape_list`); its dimensions are treated as
        batch size, sequence length and width.
      positions: per-sequence indices into the sequence dimension.

    Returns:
      The gathered rows of `sequence_tensor` flattened to
      `[batch_size * seq_length, width]`, one row per entry of `positions`.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Offset each sequence's positions by where it starts in the flat view.
    sequence_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    flat_offsets = tf.reshape(sequence_starts, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(
        sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flat_sequence_tensor, flat_positions)
|
||||
|
||||
|
||||
def input_fn_builder(input_files,
                     batch_size,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4,
                     hvd=None):
    """Create an `input_fn` closure to be passed to the Estimator.

    Args:
      input_files: list of TFRecord file paths.
      batch_size: number of examples per batch.
      max_seq_length: length of the `input_ids`, `input_mask` and
        `segment_ids` features.
      max_predictions_per_seq: length of the `masked_lm_*` features.
      is_training: when True the files are sharded per rank (if
        `--distributed`), shuffled and interleaved; otherwise they are read
        in order.
      num_cpu_threads: upper bound on the interleave cycle length and the
        number of parallel batches.
      hvd: accepted for interface compatibility; not used here.

    Returns:
      A zero-argument `input_fn` returning a batched `tf.data.Dataset`.
    """

    def input_fn():
        """Build the dataset of decoded, batched examples."""

        name_to_features = {
            "input_ids":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
                tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
                tf.FixedLenFeature([1], tf.int64),
        }

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            if FLAGS.distributed:
                # Each rank reads a disjoint subset of the input files.
                rank_size = int(os.getenv('RANK_SIZE'))
                rank_id = int(os.getenv('RANK_ID'))
                print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
                d = d.shard(rank_size, rank_id)
            d = d.repeat()
            if not FLAGS.npu_bert_debug:
                d = d.shuffle(buffer_size=len(input_files))

            # `cycle_length` is the number of parallel files that get read.
            if not FLAGS.npu_bert_debug:
                # RANK_SIZE defaults to 1 so a run without the variable no
                # longer fails on int(None). The result is clamped to at
                # least 1 because `interleave` rejects cycle_length == 0,
                # which happened with fewer files than ranks.
                files_per_rank = len(input_files) // int(os.getenv('RANK_SIZE', '1'))
                cycle_length = max(1, min(num_cpu_threads, files_per_rank))
            else:
                cycle_length = 1

            d = d.interleave(
                tf.data.TFRecordDataset,
                cycle_length=cycle_length,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
            if not FLAGS.npu_bert_debug:
                d = d.shuffle(buffer_size=100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # Since we evaluate for a fixed number of steps we don't want to
            # encounter out-of-range exceptions.
            d = d.repeat()

        # Incomplete final batches are dropped in both modes, so every batch
        # has exactly `batch_size` examples.
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                num_parallel_batches=num_cpu_threads,
                drop_remainder=True))
        return d

    return input_fn
|
||||
|
||||
|
||||
def _decode_record(record, name_to_features):
    """Parse one serialized example and narrow its int64 features to int32.

    Args:
      record: a serialized `tf.Example`.
      name_to_features: feature spec passed to `tf.parse_single_example`.

    Returns:
      The parsed feature dict, with every `tf.int64` tensor converted using
      `tf.to_int32` and all other tensors left unchanged.
    """
    parsed = tf.parse_single_example(record, name_to_features)

    # Snapshot the keys first because the dict is updated while looping.
    for feature_name in list(parsed.keys()):
        tensor = parsed[feature_name]
        parsed[feature_name] = (
            tf.to_int32(tensor) if tensor.dtype == tf.int64 else tensor)

    return parsed
|
||||
|
||||
|
||||
def main(_):
|
||||
for name, value in FLAGS.__flags.items():
|
||||
print("name:", name, " ", FLAGS[name].value)
|
||||
|
||||
tf.logging.set_verbosity(tf.logging.INFO)
|
||||
|
||||
if not FLAGS.do_train and not FLAGS.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
|
||||
if FLAGS.use_fp16:
|
||||
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
|
||||
|
||||
if FLAGS.horovod:
|
||||
import horovod.tensorflow as hvd
|
||||
hvd.init()
|
||||
|
||||
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
|
||||
|
||||
if FLAGS.npu_gather:
|
||||
if FLAGS.distributed and bert_config.num_hidden_layers == 24:
|
||||
#from hccl.split.api import set_split_strategy_by_idx
|
||||
from hccl.split.api import set_split_strategy_by_size
|
||||
#set_split_strategy_by_idx([8,72,136,200,264,328,392,397])
|
||||
set_split_strategy_by_size([10,10,10,10,15,15,15,15])
|
||||
if FLAGS.distributed and bert_config.num_hidden_layers == 12:
|
||||
from hccl.split.api import set_split_strategy_by_idx
|
||||
set_split_strategy_by_idx([8,56,104,152,200,205])
|
||||
if FLAGS.distributed and bert_config.num_hidden_layers == 6:
|
||||
from hccl.split.api import set_split_strategy_by_idx
|
||||
set_split_strategy_by_idx([8,40,72,104,109])
|
||||
|
||||
tf.gfile.MakeDirs(FLAGS.output_dir)
|
||||
|
||||
input_files = []
|
||||
for input_file_dir in FLAGS.input_files_dir.split(","):
|
||||
input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
|
||||
|
||||
input_files.sort()
|
||||
print("Input Files:", input_files)
|
||||
|
||||
if FLAGS.horovod and len(input_files) < hvd.size():
|
||||
raise ValueError("Input Files must be sharded")
|
||||
if FLAGS.use_fp16 and FLAGS.manual_fp16:
|
||||
raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
|
||||
|
||||
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
|
||||
config = tf.ConfigProto()
|
||||
if FLAGS.horovod:
|
||||
config.gpu_options.visible_device_list = str(hvd.local_rank())
|
||||
if hvd.rank() == 0:
|
||||
tf.logging.info("***** Configuaration *****")
|
||||
for key in FLAGS.__flags.keys():
|
||||
tf.logging.info(' {}: {}'.format(key, getattr(FLAGS, key)))
|
||||
tf.logging.info("**************************")
|
||||
|
||||
# config.gpu_options.per_process_gpu_memory_fraction = 0.7
|
||||
if FLAGS.use_xla:
|
||||
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
|
||||
config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
|
||||
|
||||
#run_config = tf.estimator.RunConfig(
|
||||
run_config = NPURunConfig(
|
||||
model_dir=FLAGS.output_dir,
|
||||
save_summary_steps=0,
|
||||
session_config=config,
|
||||
save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
|
||||
# This variable controls how often estimator reports examples/sec.
|
||||
# Default value is every 100 steps.
|
||||
# When --report_loss is True, we set to very large value to prevent
|
||||
# default info reporting from estimator.
|
||||
# Ideally we should set it to None, but that does not work.
|
||||
log_step_count_steps=1 if FLAGS.report_loss else 100,
|
||||
enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
|
||||
iterations_per_loop=FLAGS.iterations_per_loop,
|
||||
hcom_parallel=FLAGS.hcom_parallel)
|
||||
|
||||
if FLAGS.distributed:
|
||||
rank_size = int(os.getenv('RANK_SIZE'))
|
||||
model_fn = model_fn_builder(
|
||||
bert_config=bert_config,
|
||||
init_checkpoint=FLAGS.init_checkpoint,
|
||||
learning_rate=FLAGS.learning_rate,
|
||||
num_train_steps=FLAGS.num_train_steps,
|
||||
num_warmup_steps=FLAGS.num_warmup_steps,
|
||||
use_one_hot_embeddings=False,
|
||||
hvd=None if not FLAGS.horovod else hvd)
|
||||
|
||||
training_hooks = []
|
||||
"""
|
||||
if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
|
||||
global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
|
||||
training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
|
||||
if FLAGS.horovod and hvd.size() > 1:
|
||||
training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
|
||||
"""
|
||||
if FLAGS.report_loss:
|
||||
global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.distributed else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
|
||||
training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
|
||||
|
||||
|
||||
#estimator = tf.estimator.Estimator(
|
||||
estimator = NPUEstimator(
|
||||
model_fn=model_fn,
|
||||
config=run_config,
|
||||
job_start_file=FLAGS.npu_bert_job_start_file)
|
||||
|
||||
if FLAGS.do_train:
|
||||
tf.logging.info("***** Running training *****")
|
||||
tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
|
||||
train_input_fn = input_fn_builder(
|
||||
input_files=input_files,
|
||||
batch_size=FLAGS.train_batch_size,
|
||||
max_seq_length=FLAGS.max_seq_length,
|
||||
max_predictions_per_seq=FLAGS.max_predictions_per_seq,
|
||||
is_training=True,
|
||||
hvd=None if not FLAGS.horovod else hvd)
|
||||
|
||||
estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
|
||||
|
||||
if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
|
||||
tf.logging.info("***** Running evaluation *****")
|
||||
tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
|
||||
|
||||
eval_files = []
|
||||
for eval_file_dir in FLAGS.eval_files_dir.split(","):
|
||||
eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))
|
||||
|
||||
eval_input_fn = input_fn_builder(
|
||||
input_files=eval_files,
|
||||
batch_size=FLAGS.eval_batch_size,
|
||||
max_seq_length=FLAGS.max_seq_length,
|
||||
max_predictions_per_seq=FLAGS.max_predictions_per_seq,
|
||||
is_training=False,
|
||||
hvd=None if not FLAGS.horovod else hvd)
|
||||
|
||||
eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
|
||||
eval_start_time = time.time()
|
||||
result = estimator.evaluate(
|
||||
input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
|
||||
|
||||
eval_time_elapsed = time.time() - eval_start_time
|
||||
eval_time_wo_overhead = eval_hooks[-1].total_time
|
||||
|
||||
num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
|
||||
|
||||
ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
|
||||
|
||||
tf.logging.info("-----------------------------")
|
||||
tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
|
||||
eval_hooks[-1].count * FLAGS.eval_batch_size)
|
||||
tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
|
||||
(eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
|
||||
tf.logging.info("Summary Inference Statistics on EVAL set")
|
||||
tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
|
||||
tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
|
||||
tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
|
||||
tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
|
||||
tf.logging.info("-----------------------------")
|
||||
|
||||
output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
|
||||
with tf.gfile.GFile(output_eval_file, "w") as writer:
|
||||
tf.logging.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
tf.logging.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
if key == 'masked_lm_accuracy':
|
||||
hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
|
||||
elif key == 'next_sentence_accuracy ':
|
||||
hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
|
||||
elif key == 'global_step':
|
||||
hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
|
||||
elif key == 'loss':
|
||||
hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
|
||||
elif key == 'masked_lm_loss':
|
||||
hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
|
||||
elif key == 'next_sentence_loss ':
|
||||
hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
|
||||
else:
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Point hwlog.ROOT_DIR at the directory that contains this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")
    initinal_data = {"base_lr": 0.01, "dataset": "cn-clue/en-wiki", "optimizer": "Adam", "loss_scale": 512}

    # Flags that must be supplied on the command line.
    for _required_flag in ("input_files_dir",
                           "eval_files_dir",
                           "bert_config_file",
                           "output_dir",
                           "npu_bert_job_start_file"):
        flags.mark_flag_as_required(_required_flag)

    # Report the environment description gathered above, in a fixed order.
    for _remark_key, _remark_value in ((hwlog.CPU_INFO, cpu_info),
                                       (hwlog.NPU_INFO, npu_info),
                                       (hwlog.OS_INFO, os_info),
                                       (hwlog.FRAMEWORK_INFO, framework_info),
                                       (hwlog.BENCHMARK_VERSION, benchmark_version),
                                       (hwlog.CONFIG_INFO, config_info)):
        hwlog.remark_print(key=_remark_key, value=_remark_value)

    # Report the hard-coded training metadata from `initinal_data`.
    for _remark_key, _field_name in ((hwlog.BASE_LR, "base_lr"),
                                     (hwlog.DATASET, "dataset"),
                                     (hwlog.OPT_NAME, "optimizer"),
                                     (hwlog.LOSS_SCALE, "loss_scale")):
        hwlog.remark_print(key=_remark_key, value=initinal_data.get(_field_name))

    if FLAGS.use_xla and FLAGS.manual_fp16:
        for _warning_line in ('WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.',
                              ' This warning message will be removed when the underlying',
                              ' issues have been fixed and you are running a TF version',
                              ' that has that fix.'):
            print(_warning_line)

    tf.app.run()
|
||||
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Multiclass
|
||||
from:
|
||||
https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
|
||||
|
||||
"""
|
||||
|
||||
__author__ = "Guillaume Genthial"
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
|
||||
|
||||
|
||||
def precision(labels, predictions, num_classes, pos_indices=None,
              weights=None, average='micro'):
    """Streaming multi-class precision.

    A streaming confusion matrix is built from `labels` and `predictions`;
    precision is then derived from both the accumulated matrix (the metric
    value) and its update op (the metric update).

    Parameters
    ----------
    labels : Tensor of tf.int32 or tf.int64
        The true labels.
    predictions : Tensor of tf.int32 or tf.int64
        The predictions, same shape as labels.
    num_classes : int
        The number of classes.
    pos_indices : list of int, optional
        The indices of the positive classes, default is all.
    weights : Tensor of tf.int32, optional
        Mask, must be of compatible shape with labels.
    average : str, optional
        'micro': counts the total number of true positives, false
            positives, and false negatives for the classes in
            `pos_indices` and infers the metric from it.
        'macro': computes the metric separately for each class in
            `pos_indices` and averages. Does not account for class
            imbalance.
        'weighted': computes the metric separately for each class in
            `pos_indices` and performs a weighted average by the total
            number of true labels for each class.

    Returns
    -------
    tuple of (scalar float Tensor, update_op)
    """
    confusion, confusion_update = _streaming_confusion_matrix(
        labels, predictions, num_classes, weights)
    # Index 0 of the (precision, recall, fbeta) triple is the precision.
    metric_value = metrics_from_confusion_matrix(
        confusion, pos_indices, average=average)[0]
    metric_update = metrics_from_confusion_matrix(
        confusion_update, pos_indices, average=average)[0]
    return (metric_value, metric_update)
|
||||
|
||||
|
||||
def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
           average='micro'):
    """Streaming multi-class recall.

    A streaming confusion matrix is built from `labels` and `predictions`;
    recall is then derived from both the accumulated matrix (the metric
    value) and its update op (the metric update).

    Parameters
    ----------
    labels : Tensor of tf.int32 or tf.int64
        The true labels.
    predictions : Tensor of tf.int32 or tf.int64
        The predictions, same shape as labels.
    num_classes : int
        The number of classes.
    pos_indices : list of int, optional
        The indices of the positive classes, default is all.
    weights : Tensor of tf.int32, optional
        Mask, must be of compatible shape with labels.
    average : str, optional
        'micro': counts the total number of true positives, false
            positives, and false negatives for the classes in
            `pos_indices` and infers the metric from it.
        'macro': computes the metric separately for each class in
            `pos_indices` and averages. Does not account for class
            imbalance.
        'weighted': computes the metric separately for each class in
            `pos_indices` and performs a weighted average by the total
            number of true labels for each class.

    Returns
    -------
    tuple of (scalar float Tensor, update_op)
    """
    confusion, confusion_update = _streaming_confusion_matrix(
        labels, predictions, num_classes, weights)
    # Index 1 of the (precision, recall, fbeta) triple is the recall.
    metric_value = metrics_from_confusion_matrix(
        confusion, pos_indices, average=average)[1]
    metric_update = metrics_from_confusion_matrix(
        confusion_update, pos_indices, average=average)[1]
    return (metric_value, metric_update)
|
||||
|
||||
|
||||
def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
       average='micro'):
    """Streaming multi-class F1 score.

    Thin wrapper that forwards every argument to `fbeta` and leaves `beta`
    at that function's default. See `fbeta` for the meaning of the
    arguments and of the returned (value, update_op) tuple.
    """
    return fbeta(labels=labels,
                 predictions=predictions,
                 num_classes=num_classes,
                 pos_indices=pos_indices,
                 weights=weights,
                 average=average)
|
||||
|
||||
|
||||
def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
          average='micro', beta=1):
    """Streaming multi-class F-beta score.

    A streaming confusion matrix is built from `labels` and `predictions`;
    the F-beta score is then derived from both the accumulated matrix (the
    metric value) and its update op (the metric update).

    Parameters
    ----------
    labels : Tensor of tf.int32 or tf.int64
        The true labels.
    predictions : Tensor of tf.int32 or tf.int64
        The predictions, same shape as labels.
    num_classes : int
        The number of classes.
    pos_indices : list of int, optional
        The indices of the positive classes, default is all.
    weights : Tensor of tf.int32, optional
        Mask, must be of compatible shape with labels.
    average : str, optional
        'micro': counts the total number of true positives, false
            positives, and false negatives for the classes in
            `pos_indices` and infers the metric from it.
        'macro': computes the metric separately for each class in
            `pos_indices` and averages. Does not account for class
            imbalance.
        'weighted': computes the metric separately for each class in
            `pos_indices` and performs a weighted average by the total
            number of true labels for each class.
    beta : int, optional
        Weight of precision in harmonic mean.

    Returns
    -------
    tuple of (scalar float Tensor, update_op)
    """
    confusion, confusion_update = _streaming_confusion_matrix(
        labels, predictions, num_classes, weights)
    # Index 2 of the (precision, recall, fbeta) triple is the F-beta score.
    score_value = metrics_from_confusion_matrix(
        confusion, pos_indices, average=average, beta=beta)[2]
    score_update = metrics_from_confusion_matrix(
        confusion_update, pos_indices, average=average, beta=beta)[2]
    return (score_value, score_update)
|
||||
|
||||
|
||||
def safe_div(numerator, denominator):
    """Divide element-wise, yielding 0 wherever the denominator equals 0.

    Both operands are cast to float before the division.
    """
    top = tf.to_float(numerator)
    bottom = tf.to_float(denominator)
    zero_fill = tf.zeros_like(top, dtype=top.dtype)
    bottom_is_zero = tf.equal(bottom, zero_fill)
    # The quotient is still computed everywhere; tf.where then picks the
    # zero fill at the positions where the denominator is zero.
    return tf.where(bottom_is_zero, zero_fill, top / bottom)
|
||||
|
||||
|
||||
def pr_re_fbeta(cm, pos_indices, beta=1):
    """Compute precision, recall and F-beta from a confusion matrix.

    Only the classes listed in `pos_indices` count as positive. Three masks
    are applied to `cm`:

    * the diagonal restricted to positive classes (correct positives),
    * the columns of positive classes (total used as precision denominator),
    * the rows of positive classes (total used as recall denominator).

    Returns the tuple (precision, recall, fbeta).
    """
    num_classes = cm.shape[0]
    negatives = [cls for cls in range(num_classes) if cls not in pos_indices]
    mask_shape = [num_classes, num_classes]

    # Correct predictions: diagonal entries, minus those of negative classes.
    diag_mask = np.ones(mask_shape)
    diag_mask[negatives, negatives] = 0
    correct = tf.reduce_sum(tf.diag_part(cm * diag_mask))

    # Everything in the columns of the positive classes.
    column_mask = np.ones(mask_shape)
    column_mask[:, negatives] = 0
    predicted_total = tf.reduce_sum(cm * column_mask)

    # Everything in the rows of the positive classes.
    row_mask = np.ones(mask_shape)
    row_mask[negatives, :] = 0
    gold_total = tf.reduce_sum(cm * row_mask)

    precision_value = safe_div(correct, predicted_total)
    recall_value = safe_div(correct, gold_total)
    fbeta_value = safe_div((1. + beta**2) * precision_value * recall_value,
                           beta**2 * precision_value + recall_value)

    return precision_value, recall_value, fbeta_value
|
||||
|
||||
|
||||
def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro',
                                  beta=1):
    """Precision, recall and F-beta from the confusion matrix.

    Parameters
    ----------
    cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes)
        The streaming confusion matrix.
    pos_indices : list of int, optional
        The indices of the positive classes; defaults to every class.
    average : str, optional
        'micro', 'macro' or 'weighted'.
    beta : int, optional
        Weight of precision in harmonic mean.

    Returns
    -------
    tuple of (precision, recall, fbeta)

    Raises
    ------
    NotImplementedError
        If `average` is not one of the three supported modes.
    """
    num_classes = cm.shape[0]
    if pos_indices is None:
        pos_indices = list(range(num_classes))

    if average == 'micro':
        return pr_re_fbeta(cm, pos_indices, beta)
    if average not in {'macro', 'weighted'}:
        raise NotImplementedError()

    # One-vs-rest metrics for each positive class, plus the sum of that
    # class's row of the confusion matrix (used as its weight).
    class_precisions, class_recalls, class_fbetas, class_golds = [], [], [], []
    for idx in pos_indices:
        class_pr, class_re, class_fb = pr_re_fbeta(cm, [idx], beta)
        class_precisions.append(class_pr)
        class_recalls.append(class_re)
        class_fbetas.append(class_fb)
        row_mask = np.zeros([num_classes, num_classes])
        row_mask[idx, :] = 1
        class_golds.append(tf.to_float(tf.reduce_sum(cm * row_mask)))

    if average == 'macro':
        macro_pr = tf.reduce_mean(class_precisions)
        macro_re = tf.reduce_mean(class_recalls)
        macro_fb = tf.reduce_mean(class_fbetas)
        return macro_pr, macro_re, macro_fb

    # average == 'weighted': weight every class by its row sum.
    gold_total = tf.reduce_sum(class_golds)
    weighted_pr = safe_div(
        sum(p * n for p, n in zip(class_precisions, class_golds)), gold_total)
    weighted_re = safe_div(
        sum(r * n for r, n in zip(class_recalls, class_golds)), gold_total)
    weighted_fb = safe_div(
        sum(f * n for f, n in zip(class_fbetas, class_golds)), gold_total)
    return weighted_pr, weighted_re, weighted_fb
|
||||
@@ -0,0 +1,451 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Tokenization classes."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import unicodedata
|
||||
import six
|
||||
import tensorflow as tf
|
||||
import re
|
||||
import os
|
||||
|
||||
|
||||
# Maps a pretrained model name to the URL of its vocabulary file. Every URL
# follows the same pattern, so the map is generated from the model names.
_VOCAB_URL_TEMPLATE = "https://s3.amazonaws.com/models.huggingface.co/bert/{}-vocab.txt"

PRETRAINED_VOCAB_ARCHIVE_MAP = {
    model_name: _VOCAB_URL_TEMPLATE.format(model_name)
    for model_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-chinese',
    )
}
|
||||
|
||||
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Check that `do_lower_case` agrees with the checkpoint's model name.

    The casing is supplied by the user and is not stored with the
    checkpoint, so it is detected heuristically: the directory name directly
    in front of `/bert_model.ckpt` is compared against lists of known
    lowercased and cased model names.

    Args:
      do_lower_case: The casing option chosen by the user.
      init_checkpoint: Checkpoint path; nothing is checked when it is empty
        or does not match the expected `<model_name>/bert_model.ckpt` form.

    Raises:
      ValueError: If the model name is a known one and contradicts
        `do_lower_case`.
    """
    if not init_checkpoint:
        return

    match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    if match is None:
        return
    model_name = match.group(1)

    lower_models = [
        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
    ]
    cased_models = [
        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12"
    ]

    # (flag as passed, kind of model, flag that should have been passed)
    mismatch = None
    if model_name in lower_models and not do_lower_case:
        mismatch = ("False", "lowercased", "True")
    if model_name in cased_models and do_lower_case:
        mismatch = ("True", "cased", "False")

    if mismatch is None:
        return

    actual_flag, case_name, opposite_flag = mismatch
    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-training. If this error is wrong, please "
        "just comment out this check." % (actual_flag, init_checkpoint,
                                          model_name, case_name, opposite_flag))
|
||||
|
||||
|
||||
|
||||
def convert_to_unicode(text):
    """Return `text` as `str`, decoding `bytes` input as UTF-8.

    Bytes that are not valid UTF-8 are dropped. Any other input type raises
    ValueError.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
|
||||
|
||||
|
||||
def printable_text(text):
    """Return `text` as a `str` suitable for print or `tf.logging`.

    `str` input is returned unchanged and `bytes` input is decoded as UTF-8
    (invalid bytes are dropped). Any other input type raises ValueError.
    """
    if not isinstance(text, (str, bytes)):
        raise ValueError("Unsupported string type: %s" % (type(text)))
    return text if isinstance(text, str) else text.decode("utf-8", "ignore")
|
||||
|
||||
|
||||
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index dictionary.

    The file holds one token per line; a token's index is its zero-based
    line number. Surrounding whitespace is stripped from every token. If a
    token occurs more than once, the index of its last occurrence is kept.

    The file is always decoded as UTF-8. Relying on the locale's default
    encoding would break non-ASCII vocabularies (for example the Chinese
    one) in environments running under a non-UTF-8 locale.

    Args:
      vocab_file: Path of the vocabulary file.

    Returns:
      A `collections.OrderedDict` mapping token to index.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        # Text-mode iteration already yields `str`, so no extra conversion
        # is required; blank lines still consume an index.
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab
|
||||
|
||||
|
||||
def convert_by_vocab(vocab, items):
    """Map every element of `items` through `vocab`, preserving order.

    Works in either direction (tokens -> ids or ids -> tokens) depending on
    the mapping passed in. An element missing from `vocab` raises KeyError.
    """
    return [vocab[item] for item in items]
|
||||
|
||||
|
||||
def whitespace_tokenize(text):
    """Strip `text` and split it on runs of whitespace.

    Returns an empty list when nothing but whitespace is left.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []
|
||||
|
||||
|
||||
class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and build the two underlying tokenizers.

        Args:
          vocab_file: Path of the vocabulary file, passed to `load_vocab`.
          do_lower_case: Forwarded to `BasicTokenizer`.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup: id -> token.
        self.inv_vocab = dict((idx, tok) for tok, idx in self.vocab.items())
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Split `text` into basic tokens, then each of those into word pieces."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Look up every token of `tokens` in the vocabulary."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Look up every id of `ids` in the inverse vocabulary."""
        return convert_by_vocab(self.inv_vocab, ids)
|
||||
|
||||
|
||||
class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and build the two underlying tokenizers.

        Args:
          vocab_file: Path of an existing vocabulary file.
          do_lower_case: Forwarded to `BasicTokenizer`.

        Raises:
          ValueError: If `vocab_file` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup (id -> token), kept in vocabulary order.
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Split `text` into basic tokens, then each of those into word pieces."""
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            split_tokens.extend(self.wordpiece_tokenizer.tokenize(token))
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        return [self.vocab[token] for token in tokens]

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids into wordpiece tokens using the vocab."""
        return [self.ids_to_tokens[i] for i in ids]

    @classmethod
    def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
        """Instantiate a tokenizer from a model name or a vocabulary path.

        A name listed in `PRETRAINED_VOCAB_ARCHIVE_MAP` is translated to its
        registered location; any other value is used as the vocabulary
        location directly.

        NOTE: this method used to call `cached_path` and `logger`, neither
        of which is imported or defined in this module, so every call raised
        NameError. No download helper is available here, so only locations
        that are existing local files can be resolved; messages go through
        `tf.logging`.

        Args:
          pretrained_model_name: Model name or path of a vocabulary file.
          do_lower_case: Forwarded to the constructor.

        Returns:
          The tokenizer, or None (after logging an error) when the
          vocabulary location is not an existing local file.
        """
        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
        else:
            vocab_file = pretrained_model_name

        if not os.path.isfile(vocab_file):
            tf.logging.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name))
            return None

        tf.logging.info("loading vocabulary file {}".format(vocab_file))
        # Instantiate tokenizer.
        return cls(vocab_file, do_lower_case)
|
||||
|
||||
|
||||
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self, do_lower_case=True):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Tokenizes a piece of text and returns the list of tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))
        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        spaced = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                # Accents are stripped only when lower casing is enabled.
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text.

        The text is NFD-normalized and every character of Unicode category
        "Mn" is removed.
        """
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Splits a piece of text so each punctuation character stands alone."""
        groups = []
        begin_new_group = True
        for char in text:
            if _is_punctuation(char):
                # A punctuation character forms its own group and ends the
                # group that was being built.
                groups.append([char])
                begin_new_group = True
                continue
            if begin_new_group:
                groups.append([])
            begin_new_group = False
            groups[-1].append(char)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        return "".join(
            (" " + ch + " ") if self._is_chinese_char(ord(ch)) else ch
            for ch in text)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        kept = []
        for char in text:
            cp = ord(char)
            # Drop NUL, the Unicode replacement character and control chars.
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            # Every kind of whitespace becomes a plain space.
            kept.append(" " if _is_whitespace(char) else char)
        return "".join(kept)
|
||||
|
||||
|
||||
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """Store the vocabulary and the tokenization limits.

        Args:
          vocab: Container of known word pieces (tested with `in`).
          unk_token: Token emitted for words that cannot be split.
          max_input_chars_per_word: Words longer than this become `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        text = convert_to_unicode(text)

        output_tokens = []
        for token in whitespace_tokenize(text):
            length = len(token)
            if length > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            pieces = []
            start = 0
            while start < length:
                # Try the longest remaining substring first and shrink it
                # from the right until a vocabulary entry is found.
                found = None
                for end in range(length, start, -1):
                    candidate = token[start:end]
                    if start > 0:
                        # Non-initial pieces carry the "##" prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        found = (candidate, end)
                        break
                if found is None:
                    # No piece matches here: the whole word is unknown.
                    pieces = None
                    break
                pieces.append(found[0])
                start = found[1]

            if pieces is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(pieces)
        return output_tokens
|
||||
|
||||
|
||||
def _is_whitespace(char):
|
||||
"""Checks whether `chars` is a whitespace character."""
|
||||
# \t, \n, and \r are technically contorl characters but we treat them
|
||||
# as whitespace since they are generally considered as such.
|
||||
if char == " " or char == "\t" or char == "\n" or char == "\r":
|
||||
return True
|
||||
cat = unicodedata.category(char)
|
||||
if cat == "Zs":
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _is_control(char):
|
||||
"""Checks whether `chars` is a control character."""
|
||||
# These are technically control characters but we count them as whitespace
|
||||
# characters.
|
||||
if char == "\t" or char == "\n" or char == "\r":
|
||||
return False
|
||||
cat = unicodedata.category(char)
|
||||
if cat.startswith("C"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _is_punctuation(char):
|
||||
"""Checks whether `chars` is a punctuation character."""
|
||||
cp = ord(char)
|
||||
# We treat all non-letter/number ASCII as punctuation.
|
||||
# Characters such as "^", "$", and "`" are not in the Unicode
|
||||
# Punctuation class but we treat them as punctuation anyways, for
|
||||
# consistency.
|
||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
|
||||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
|
||||
return True
|
||||
cat = unicodedata.category(char)
|
||||
if cat.startswith("P"):
|
||||
return True
|
||||
return False
|
||||
@@ -0,0 +1,62 @@
|
||||
import tensorflow as tf
|
||||
import time
|
||||
|
||||
# report latency and throughput during eval
|
||||
class LogEvalRunHook(tf.train.SessionRunHook):
    """Session hook that times every evaluation `run` call.

    The first two calls are counted in `skipped` and left out of the timing;
    every later call adds its wall-clock duration to `time_list` and
    `total_time`. `count` is the total number of calls seen.
    """

    def __init__(self, global_batch_size, hvd_rank=-1):
        """Store the configuration and reset all counters."""
        self.global_batch_size = global_batch_size
        self.hvd_rank = hvd_rank
        self.count = 0
        self.skipped = 0
        self.total_time = 0.0
        self.time_list = []

    def before_run(self, run_context):
        """Record the start time of the upcoming `run` call."""
        self.t0 = time.time()

    def after_run(self, run_context, run_values):
        """Account for the `run` call that has just finished."""
        elapsed_secs = time.time() - self.t0
        self.count += 1

        if self.count > 2:
            self.time_list.append(elapsed_secs)
            self.total_time += elapsed_secs
            return

        # Removing first 2 (arbitrary) number of startup iterations from perf evaluations
        print("Skipping time record for ", self.count, " due to overhead")
        self.skipped += 1
|
||||
|
||||
# report throughput during training
|
||||
class LogTrainRunHook(tf.train.SessionRunHook):
    """Session hook that accumulates the wall-clock time of training steps.

    Steps whose offset from the initial global step, taken modulo
    `save_checkpoints_steps`, is 0 or 1 are left out of `total_time`.
    """

    def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000):
        """Store the configuration and reset the counters."""
        self.global_batch_size = global_batch_size
        self.hvd_rank = hvd_rank
        self.save_checkpoints_steps = save_checkpoints_steps

        self.total_time = 0.0
        # Holds number of iterations, including skipped iterations for fp16 loss scaling
        self.count = 0

    def after_create_session(self, session, coord):
        """Remember the global step at which this session starts."""
        self.init_global_step = session.run(tf.train.get_global_step())

    def before_run(self, run_context):
        """Start the timer and request the 'step_update:0' tensor."""
        self.t0 = time.time()
        return tf.train.SessionRunArgs(fetches=['step_update:0'])

    def after_run(self, run_context, run_values):
        """Record the global step and, unless skipped, the elapsed time."""
        elapsed_secs = time.time() - self.t0
        self.global_step = run_values.results[0]
        self.count += 1

        # Removing first step + first two steps after every checkpoint save
        steps_since_start = self.global_step - self.init_global_step
        if steps_since_start % self.save_checkpoints_steps > 1:
            self.total_time += elapsed_secs
        else:
            print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")

    def end(self, session):
        """Compute `skipped`, the number of steps left out of the timing."""
        num_global_steps = self.global_step - self.init_global_step

        # Two steps per completed checkpoint interval, up to two for the
        # partial interval at the end, minus one.
        full_intervals = num_global_steps // self.save_checkpoints_steps
        leftover_steps = num_global_steps % self.save_checkpoints_steps
        self.skipped = full_intervals * 2 + min(2, leftover_steps) - 1
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{
|
||||
"device": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.10.101",
|
||||
"rank_id": "0"
|
||||
}],
|
||||
"server_id": "127.0.0.1"
|
||||
}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{
|
||||
"device": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.10.101",
|
||||
"rank_id": "0"
|
||||
},
|
||||
{
|
||||
"device_id": "1",
|
||||
"device_ip": "192.168.11.101",
|
||||
"rank_id": "1"
|
||||
},
|
||||
{
|
||||
"device_id": "2",
|
||||
"device_ip": "192.168.12.101",
|
||||
"rank_id": "2"
|
||||
},
|
||||
{
|
||||
"device_id": "3",
|
||||
"device_ip": "192.168.13.101",
|
||||
"rank_id": "3"
|
||||
},
|
||||
{
|
||||
"device_id": "4",
|
||||
"device_ip": "192.168.10.100",
|
||||
"rank_id": "4"
|
||||
},
|
||||
{
|
||||
"device_id": "5",
|
||||
"device_ip": "192.168.11.100",
|
||||
"rank_id": "5"
|
||||
},
|
||||
{
|
||||
"device_id": "6",
|
||||
"device_ip": "192.168.12.100",
|
||||
"rank_id": "6"
|
||||
},
|
||||
{
|
||||
"device_id": "7",
|
||||
"device_ip": "192.168.13.100",
|
||||
"rank_id": "7"
|
||||
}],
|
||||
"server_id": "127.0.0.1"
|
||||
}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"max_position_embeddings": 512,
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"type_vocab_size": 2,
|
||||
"vocab_size": 21136
|
||||
}
|
||||
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"max_position_embeddings": 512,
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 12,
|
||||
"type_vocab_size": 2,
|
||||
"vocab_size": 30522
|
||||
}
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"max_position_embeddings": 512,
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 6,
|
||||
"type_vocab_size": 2,
|
||||
"vocab_size": 21136
|
||||
}
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"hidden_size": 768,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 3072,
|
||||
"max_position_embeddings": 512,
|
||||
"num_attention_heads": 12,
|
||||
"num_hidden_layers": 6,
|
||||
"type_vocab_size": 2,
|
||||
"vocab_size": 30522
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Environment for running TensorFlow on Ascend 910.
#
# Meant to be sourced. The caller is expected to have set $utilDir, which is
# appended to PYTHONPATH. The nnae runtime is used when it is installed;
# otherwise the ascend-toolkit paths are used.

# Settings that do not depend on the installed package.
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export SOC_VERSION=Ascend910
export SLOG_PRINT_TO_STDOUT=0
export HCCL_CONNECT_TIMEOUT=600

# Optional debugging switches, disabled by default.
#export DUMP_GE_GRAPH=2
#export DUMP_GRAPH_LEVEL=3
#export PRINT_MODEL=1
#ulimit -c unlimited

if [ ! -d /usr/local/Ascend/nnae/latest ]; then
    # ascend-toolkit installation
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
    export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
    export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
else
    # nnae installation
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
    export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
fi
|
||||
@@ -0,0 +1,67 @@
|
||||
#!/bin/bash
# Launch BERT-base pretraining, either as one background train.sh per local
# device or, in cluster mode, as a single mpirun job.
#
# Arguments:
#   $1  rank_size      number of devices taking part in the job
#   $2  yamlPath       benchmark yaml; its "tensorflow_config" section is loaded
#   $3  toolsPath      directory containing get_params_for_yaml.sh
#   $4  CLUSTER        "True" selects the mpirun path (read only inside docker)
#   $5  MPIRUN_ALL_IP  whitespace-separated host IPs that receive the yaml
#                      (read only inside docker)
rank_size=$1
yamlPath=$2
toolsPath=$3
if [ -f /.dockerenv ]; then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi

currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$(date +%Y%m%d%H%M%S)
train_job_dir=${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
mkdir -p "${train_job_dir}"
echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] see more config info in ${currentDir}/config"
echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] train result in ${train_job_dir}"

# Load the yaml configuration as shell variables.
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "tensorflow_config")

# Device list: use device_group_<N>p from the yaml; when it is not set, or
# when rank_size is 8 or more, select devices 0..rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ "${rank_size}" -ge 8 ]; then
    device_group="$(seq 0 "$((rank_size - 1))")"
fi

# First device id of the group: its directory holds the hw log that is
# linked into the job directory below. Taking the first token (rather than
# the first character) also works for ids of two or more digits.
read -r first_device_id _ <<< "${device_group}"

# user env
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=0
export DEVICE_INDEX=$DEVICE_ID

if [ x"${CLUSTER}" == x"True" ]; then
    # Link the hw log of device 0 into the job directory.
    ln -snf "${train_job_dir}/0/hw_bert.log" "${train_job_dir}"
    this_ip=$(hostname -I | awk '{print $1}')
    # Copy the yaml to every other host, at the same path.
    for ip in $MPIRUN_ALL_IP; do
        if [ x"$ip" != x"$this_ip" ]; then
            scp "$yamlPath" "root@$ip:$yamlPath"
        fi
    done
    export PATH=$PATH:/usr/local/mpirun4.0/bin
    # Each continuation needs a space before the backslash, otherwise the
    # neighbouring arguments are joined into one word.
    mpirun -H "${mpirun_ip}" \
        --bind-to none -map-by slot \
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
        --prefix /usr/local/mpirun4.0/ \
        "${currentDir}/scripts/train.sh" 0 "$currtime" "$yamlPath" 0 True "${toolsPath}" "${rank_size}"
else
    # Link the hw log of the first device into the job directory.
    ln -snf "${train_job_dir}/${first_device_id}/hw_bert.log" "${train_job_dir}"
    rank_id=0
    # device_group is intentionally unquoted: one train.sh per listed id.
    for device_id in ${device_group}; do
        "${currentDir}/scripts/train.sh" "$device_id" "$currtime" "$yamlPath" "$rank_id" solo "${toolsPath}" "${rank_size}" &
        rank_id=$((rank_id + 1))
    done
fi
# Block until every background train.sh has finished.
wait
|
||||
|
||||
|
||||
@@ -0,0 +1,157 @@
|
||||
#!/bin/bash
# Run BERT-base pretraining for one device and record the result.
#
# Arguments:
#   $1  device_id   device to train on (re-derived from atlasboost in cluster mode)
#   $2  currtime    timestamp naming the training_job_<currtime> result directory
#   $3  yamlPath    benchmark yaml; its "tensorflow_config" section is loaded
#   $4  rank_id     rank of this process (used unless $5 is "True")
#   $5  mode        "True" for an mpirun cluster launch, anything else otherwise
#   $6  toolsPath   directory containing get_params_for_yaml.sh
#   $7  rank_size   number of devices taking part in the job
#
# Exits with the status of the training process, or 1 when no device id is given.
device_id=$1
currtime=$2
yamlPath=$3
toolsPath=$6
rank_size=$7

export YAML_PATH=$3

mainDir=$(cd "$(dirname "$0")/.."; pwd)

export train_job_dir=${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
mkdir -p "${train_job_dir}"
cd "${train_job_dir}" || exit 1

# Read by npu_set_env.sh, which appends it to PYTHONPATH.
export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils/atlasboost"; pwd)
source "${mainDir}/config/npu_set_env.sh"

# Load the yaml configuration as shell variables.
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "tensorflow_config")

# Name of the benchmark marker log: must be "hw_" followed by the lowercase model name.
export REMARK_LOG_FILE=hw_bert.log
# Make the benchmark logging module importable.
benchmark_log_path=${mainDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

export JOB_ID=9999001
export RANK_TABLE_FILE=${mainDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}

export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
export DEVICE_INDEX=$DEVICE_ID
export RANK_INDEX=0

export PROFILING_OPTIONS=${PROFILING_OPTIONS}
export FP_POINT=${FP_POINT}
export BP_POINT=${BP_POINT}

# Map the yaml value "True" to the lowercase form and set both profiling
# switches from the same test. Testing PROFILING_MODE a second time after it
# has been rewritten to true/false can never match "True", which would leave
# AICPU_PROFILING_MODE permanently false.
if [ "${PROFILING_MODE}" == True ]; then
    export PROFILING_MODE=true
    export AICPU_PROFILING_MODE=true
else
    export PROFILING_MODE=false
    export AICPU_PROFILING_MODE=false
fi

if [ x"${device_id}" = x ]; then
    # No device id: log into the job directory and report failure to the caller.
    echo "turing train fail" >> "${train_job_dir}/train_${device_id}.log"
    exit 1
else
    export DEVICE_ID=${device_id}
fi

# Snapshot of the environment, kept with the other job artifacts.
env > "${train_job_dir}/env_${device_id}.log"

cd "${train_job_dir}" || exit 1

if [ x"$5" != x"True" ]; then
    rank_id=$4
    export RANK_ID=$4
else
    # Cluster mode: ask atlasboost for this process' rank. Everything the
    # command prints is captured; the rank is printed last.
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
        device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
        atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    # Collapse all whitespace (including newlines) to single spaces.
    device_id_mo=$(echo $device_id_mo)
    # Last space-separated token: the rank.
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    # Text between the last "deviceid = " and the following " phyid=".
    # NOTE(review): depends on the message format atlasboost prints - confirm.
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    # Install the *.json found in the job directory as the rank table.
    cp "${train_job_dir}"/*.json "${mainDir}/config/${rank_size}p.json"
fi
# Second snapshot: device_id and RANK_ID may have changed above.
env > "${train_job_dir}/env_${device_id}.log"

# Per-device working directory and checkpoint directory.
mkdir -p "${train_job_dir}/${device_id}/ckpt${DEVICE_ID}"
cd "${train_job_dir}/${device_id}" || exit 1

startTime_s=$(date +%s)

# Start training; stdout and stderr go to the per-device training log.
python3.7 "${mainDir}/code/pretrain/run_pretraining.py" \
    --bert_config_file="${mainDir}/config/${bert_config_file}" \
    --max_seq_length=${max_seq_length} \
    --max_predictions_per_seq=${max_predictions_per_seq} \
    --train_batch_size=${train_batch_size} \
    --learning_rate=${learning_rate} \
    --num_warmup_steps=${num_warmup_steps} \
    --num_train_steps=${num_train_steps} \
    --optimizer_type=${optimizer_type} \
    --manual_fp16=${manual_fp16} \
    --use_fp16_cls=${use_fp16_cls} \
    --input_files_dir="${input_files_dir}" \
    --eval_files_dir="${eval_files_dir}" \
    --npu_bert_debug=${npu_bert_debug} \
    --npu_bert_use_tdt=${npu_bert_use_tdt} \
    --do_train=${do_train} \
    --do_eval=${do_eval} \
    --num_accumulation_steps=${num_accumulation_steps} \
    --npu_bert_job_start_file=None \
    --iterations_per_loop=${iterations_per_loop} \
    --npu_bert_loss_scale=${npu_bert_loss_scale} \
    --distributed=${distributed} \
    --save_checkpoints_steps=${save_checkpoints_steps} \
    --npu_bert_clip_by_global_norm=${npu_bert_clip_by_global_norm} \
    --output_dir="${train_job_dir}/${device_id}/ckpt${DEVICE_ID}" > "${train_job_dir}/train_${device_id}.log" 2>&1
# Keep the status: $? is overwritten by every later command.
train_status=$?

if [ ${train_status} -eq 0 ]; then
    echo ":::ABK 1.0.0 bert train success"
    echo ":::ABK 1.0.0 bert train success" >> "${train_job_dir}/train_${device_id}.log"
    echo ":::ABK 1.0.0 bert train success" >> "${train_job_dir}/${device_id}/hw_bert.log"
else
    echo ":::ABK 1.0.0 bert train failed"
    echo ":::ABK 1.0.0 bert train failed" >> "${train_job_dir}/train_${device_id}.log"
    echo ":::ABK 1.0.0 bert train failed" >> "${train_job_dir}/${device_id}/hw_bert.log"
fi

# Elapsed wall-clock time, reported as hours:minutes:seconds.
endTime_s=$(date +%s)
sumTime=$(( endTime_s - startTime_s ))
hour=$(( sumTime / 3600 ))
min=$(( (sumTime - hour * 3600) / 60 ))
sec=$(( sumTime - hour * 3600 - min * 60 ))
echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}"
echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" >> "${train_job_dir}/${device_id}/hw_bert.log"

# Let the launcher (or mpirun) see whether training succeeded.
exit ${train_status}
|
||||
Reference in New Issue
Block a user