[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,56 @@
+# Bert-Base_tensorflow训练说明
+
+### 1. 模型训练参数配置
+
+在train/yaml/Bert-Base.yaml中修改相应配置， 配置项含义:
+
+```
+ tensorflow_config:
+    #layer层数有6和12两种，中文数据集用 bert_base_layer6_cn.json/bert_base_layer12_cn.json 英文用bert_base_layer6_en.json/bert_base_layer12_en.json
+    bert_config_file: bert_base_layer6_cn.json
+    #数据集句子长度是256时，max_seq_length 和 max_predictions_per_seq 分别设置为 256 和 40；句子长度是128时分别设置为 128 和 20
+    max_seq_length: 128
+    max_predictions_per_seq: 20
+    
+    # 最佳性能train_batch_size为160 
+    train_batch_size: 160
+    learning_rate: 1e-4
+    num_warmup_steps: 100
+    num_train_steps: 1000
+    optimizer_type: adam
+    manual_fp16: True
+    use_fp16_cls: True
+    input_files_dir: 数据集路径
+    eval_files_dir: 数据集路径
+    npu_bert_debug: False
+    npu_bert_use_tdt: True
+    distributed: True
+    do_train: True
+    do_eval: False
+    num_accumulation_steps: 1
+    iterations_per_loop: 100
+    npu_bert_loss_scale: 0
+    save_checkpoints_steps: 1000
+    npu_bert_clip_by_global_norm: False
+
+    # docker 镜像名称:版本号
+    docker_image: c73:b021
+
+    # 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
+    mpirun_ip: 90.90.140.199:8,90.90.140.229:8
+
+    # 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
+    device_group_1p: 6
+    device_group_2p: 0 1
+    device_group_4p: 0 1 2 3
+```
+
+------
+
+
+
+
+
+
+
+    
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
@@ -0,0 +1,13 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 2,
+  "vocab_size": 30522 
+}
@@ -0,0 +1,442 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Create masked LM/next sentence masked_lm TF examples for BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import random
+import tokenization
+import tensorflow as tf
+
+# Command-line flags for the pre-training data generator. `input_file`,
+# `output_file` and `vocab_file` have no usable default and are marked as
+# required in the `__main__` guard at the bottom of this module.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+# Each comma-separated entry is expanded with tf.gfile.Glob() in main().
+flags.DEFINE_string("input_file", None,
+                    "Input raw text file (or comma-separated list of files).")
+
+# When several files are given, instances are written to them round-robin.
+flags.DEFINE_string(
+    "output_file", None,
+    "Output TF example file (or comma-separated list of files).")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
+
+flags.DEFINE_integer("max_predictions_per_seq", 20,
+                     "Maximum number of masked LM predictions per sequence.")
+
+# Seeds the random.Random instance that drives shuffling and masking.
+flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
+
+flags.DEFINE_integer(
+    "dupe_factor", 10,
+    "Number of times to duplicate the input data (with different masks).")
+
+flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
+
+flags.DEFINE_float(
+    "short_seq_prob", 0.1,
+    "Probability of creating sequences which are shorter than the "
+    "maximum length.")
+
+
+class TrainingInstance(object):
+  """A single training instance (sentence pair)."""
+
+  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
+               is_random_next):
+    self.tokens = tokens
+    self.segment_ids = segment_ids
+    self.is_random_next = is_random_next
+    self.masked_lm_positions = masked_lm_positions
+    self.masked_lm_labels = masked_lm_labels
+
+  def __str__(self):
+    s = ""
+    s += "tokens: %s\n" % (" ".join(
+        [tokenization.printable_text(x) for x in self.tokens]))
+    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
+    s += "is_random_next: %s\n" % self.is_random_next
+    s += "masked_lm_positions: %s\n" % (" ".join(
+        [str(x) for x in self.masked_lm_positions]))
+    s += "masked_lm_labels: %s\n" % (" ".join(
+        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
+    s += "\n"
+    return s
+
+  def __repr__(self):
+    return self.__str__()
+
+
+def write_instance_to_example_files(instances, tokenizer, max_seq_length,
+                                    max_predictions_per_seq, output_files):
+  """Create TF example files from `TrainingInstance`s."""
+  writers = []
+  for output_file in output_files:
+    writers.append(tf.python_io.TFRecordWriter(output_file))
+
+  writer_index = 0
+
+  total_written = 0
+  for (inst_index, instance) in enumerate(instances):
+    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
+    input_mask = [1] * len(input_ids)
+    segment_ids = list(instance.segment_ids)
+    assert len(input_ids) <= max_seq_length
+
+    while len(input_ids) < max_seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      segment_ids.append(0)
+
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+
+    masked_lm_positions = list(instance.masked_lm_positions)
+    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
+    masked_lm_weights = [1.0] * len(masked_lm_ids)
+
+    while len(masked_lm_positions) < max_predictions_per_seq:
+      masked_lm_positions.append(0)
+      masked_lm_ids.append(0)
+      masked_lm_weights.append(0.0)
+
+    next_sentence_label = 1 if instance.is_random_next else 0
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(input_ids)
+    features["input_mask"] = create_int_feature(input_mask)
+    features["segment_ids"] = create_int_feature(segment_ids)
+    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
+    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
+    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
+    features["next_sentence_labels"] = create_int_feature([next_sentence_label])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+
+    writers[writer_index].write(tf_example.SerializeToString())
+    writer_index = (writer_index + 1) % len(writers)
+
+    total_written += 1
+
+    if inst_index < 20:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in instance.tokens]))
+
+      for feature_name in features.keys():
+        feature = features[feature_name]
+        values = []
+        if feature.int64_list.value:
+          values = feature.int64_list.value
+        elif feature.float_list.value:
+          values = feature.float_list.value
+        tf.logging.info(
+            "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
+
+  for writer in writers:
+    writer.close()
+
+  tf.logging.info("Wrote %d total instances", total_written)
+
+
+def create_int_feature(values):
+  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+  return feature
+
+
+def create_float_feature(values):
+  feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+  return feature
+
+
+def create_training_instances(input_files, tokenizer, max_seq_length,
+                              dupe_factor, short_seq_prob, masked_lm_prob,
+                              max_predictions_per_seq, rng):
+  """Create `TrainingInstance`s from raw text."""
+  all_documents = [[]]
+
+  # Input file format:
+  # (1) One sentence per line. These should ideally be actual sentences, not
+  # entire paragraphs or arbitrary spans of text. (Because we use the
+  # sentence boundaries for the "next sentence prediction" task).
+  # (2) Blank lines between documents. Document boundaries are needed so
+  # that the "next sentence prediction" task doesn't span between documents.
+  for input_file in input_files:
+    with tf.gfile.GFile(input_file, "r") as reader:
+      while True:
+        line = tokenization.convert_to_unicode(reader.readline())
+        if not line:
+          break
+        line = line.strip()
+
+        # Empty lines are used as document delimiters
+        if not line:
+          all_documents.append([])
+        tokens = tokenizer.tokenize(line)
+        if tokens:
+          all_documents[-1].append(tokens)
+
+  # Remove empty documents
+  all_documents = [x for x in all_documents if x]
+  rng.shuffle(all_documents)
+
+  vocab_words = list(tokenizer.vocab.keys())
+  instances = []
+  for _ in range(dupe_factor):
+    for document_index in range(len(all_documents)):
+      instances.extend(
+          create_instances_from_document(
+              all_documents, document_index, max_seq_length, short_seq_prob,
+              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
+
+  rng.shuffle(instances)
+  return instances
+
+
+def create_instances_from_document(
+    all_documents, document_index, max_seq_length, short_seq_prob,
+    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
+  """Creates `TrainingInstance`s for a single document.
+
+  Sentences of the document are accumulated into chunks of roughly
+  `target_seq_length` tokens. Each chunk is split into a segment A and a
+  segment B; B is either the rest of the chunk ("actual next") or text taken
+  from a randomly chosen document ("random next"). The pair is truncated,
+  wrapped in [CLS]/[SEP] markers and masked.
+
+  Args:
+    all_documents: List of documents; each document is a list of sentences
+      and each sentence is a list of tokens.
+    document_index: Index into `all_documents` of the document to process.
+    max_seq_length: Hard limit on the sequence length including the three
+      special tokens.
+    short_seq_prob: Probability of picking a shorter target length for this
+      document.
+    masked_lm_prob: Forwarded to `create_masked_lm_predictions`.
+    max_predictions_per_seq: Forwarded to `create_masked_lm_predictions`.
+    vocab_words: Forwarded to `create_masked_lm_predictions`.
+    rng: Random source providing `random()` and `randint(a, b)`.
+
+  Returns:
+    A list of `TrainingInstance` built from the document.
+  """
+  document = all_documents[document_index]
+
+  # Account for [CLS], [SEP], [SEP]
+  max_num_tokens = max_seq_length - 3
+
+  # We *usually* want to fill up the entire sequence since we are padding
+  # to `max_seq_length` anyways, so short sequences are generally wasted
+  # computation. However, we *sometimes*
+  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+  # sequences to minimize the mismatch between pre-training and fine-tuning.
+  # The `target_seq_length` is just a rough target however, whereas
+  # `max_seq_length` is a hard limit.
+  # NOTE: the target is drawn once here and reused for every chunk of this
+  # document.
+  target_seq_length = max_num_tokens
+  if rng.random() < short_seq_prob:
+    target_seq_length = rng.randint(2, max_num_tokens)
+
+  # We DON'T just concatenate all of the tokens from a document into a long
+  # sequence and choose an arbitrary split point because this would make the
+  # next sentence prediction task too easy. Instead, we split the input into
+  # segments "A" and "B" based on the actual "sentences" provided by the user
+  # input.
+  instances = []
+  current_chunk = []
+  current_length = 0
+  i = 0
+  while i < len(document):
+    segment = document[i]
+    current_chunk.append(segment)
+    current_length += len(segment)
+    # Emit an instance when the chunk is long enough or the document ends.
+    if i == len(document) - 1 or current_length >= target_seq_length:
+      if current_chunk:
+        # `a_end` is how many segments from `current_chunk` go into the `A`
+        # (first) sentence.
+        a_end = 1
+        if len(current_chunk) >= 2:
+          a_end = rng.randint(1, len(current_chunk) - 1)
+
+        tokens_a = []
+        for j in range(a_end):
+          tokens_a.extend(current_chunk[j])
+
+        tokens_b = []
+        # Random next
+        # (forced when the chunk has a single segment, since nothing would
+        # be left for segment B).
+        is_random_next = False
+        if len(current_chunk) == 1 or rng.random() < 0.5:
+          is_random_next = True
+          target_b_length = target_seq_length - len(tokens_a)
+
+          # This should rarely go for more than one iteration for large
+          # corpora. However, just to be careful, we try to make sure that
+          # the random document is not the same as the document
+          # we're processing.
+          # NOTE: after 10 unsuccessful draws (e.g. when there is only one
+          # document) the current document itself is used.
+          for _ in range(10):
+            random_document_index = rng.randint(0, len(all_documents) - 1)
+            if random_document_index != document_index:
+              break
+
+          random_document = all_documents[random_document_index]
+          random_start = rng.randint(0, len(random_document) - 1)
+          for j in range(random_start, len(random_document)):
+            tokens_b.extend(random_document[j])
+            if len(tokens_b) >= target_b_length:
+              break
+          # We didn't actually use these segments so we "put them back" so
+          # they don't go to waste.
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        # Trim the pair in place so it fits next to the special tokens.
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        # Layout: [CLS] A... [SEP] B... [SEP]; segment id 0 covers [CLS], A
+        # and the first [SEP], segment id 1 covers B and the final [SEP].
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      # Start a fresh chunk for the next instance.
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    cand_indexes.append(i)
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    if index in covered_indexes:
+      continue
+    covered_indexes.add(index)
+
+    masked_token = None
+    # 80% of the time, replace with [MASK]
+    if rng.random() < 0.8:
+      masked_token = "[MASK]"
+    else:
+      # 10% of the time, keep original
+      if rng.random() < 0.5:
+        masked_token = tokens[index]
+      # 10% of the time, replace with random word
+      else:
+        masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
+
+    output_tokens[index] = masked_token
+
+    masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+
+  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+
+  masked_lm_positions = []
+  masked_lm_labels = []
+  for p in masked_lms:
+    masked_lm_positions.append(p.index)
+    masked_lm_labels.append(p.label)
+
+  return (output_tokens, masked_lm_positions, masked_lm_labels)
+
+
+def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
+  """Truncates a pair of sequences to a maximum sequence length."""
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_num_tokens:
+      break
+
+    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+    assert len(trunc_tokens) >= 1
+
+    # We want to sometimes truncate from the front and sometimes from the
+    # back to add more randomness and avoid biases.
+    if rng.random() < 0.5:
+      del trunc_tokens[0]
+    else:
+      trunc_tokens.pop()
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  input_files = []
+  for input_pattern in FLAGS.input_file.split(","):
+    input_files.extend(tf.gfile.Glob(input_pattern))
+
+  tf.logging.info("*** Reading from input files ***")
+  for input_file in input_files:
+    tf.logging.info("  %s", input_file)
+
+  rng = random.Random(FLAGS.random_seed)
+  instances = create_training_instances(
+      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
+      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
+      rng)
+
+  output_files = FLAGS.output_file.split(",")
+  tf.logging.info("*** Writing to output files ***")
+  for output_file in output_files:
+    tf.logging.info("  %s", output_file)
+
+  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
+                                  FLAGS.max_predictions_per_seq, output_files)
+
+
+# Script entry point: the three file flags have no default, so they are
+# declared mandatory before control is handed to tf.app.run().
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("output_file")
+  flags.mark_flag_as_required("vocab_file")
+  tf.app.run()
@@ -0,0 +1,419 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract pre-computed feature vectors from BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import codecs
+import collections
+import json
+import re
+
+import modeling
+import tokenization
+import tensorflow as tf
+
+# Command-line flags for the feature-extraction script. The first three
+# flags carry no help text, so their meaning is described here.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+# Text file read by read_examples(): one example per line, with an optional
+# second text separated by " ||| ".
+flags.DEFINE_string("input_file", None, "")
+
+# Destination file; main() writes one JSON object per line to it.
+flags.DEFINE_string("output_file", None, "")
+
+# Comma-separated integers, used to index the result of
+# model.get_all_encoder_layers() in model_fn.
+flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
+
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+flags.DEFINE_string("master", None,
+                    "If using a TPU, the address of the master.")
+
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
+
+flags.DEFINE_bool(
+    "use_one_hot_embeddings", False,
+    "If True, tf.one_hot will be used for embedding lookups, otherwise "
+    "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
+    "since it is much faster.")
+
+
+class InputExample(object):
+
+  def __init__(self, unique_id, text_a, text_b):
+    self.unique_id = unique_id
+    self.text_a = text_a
+    self.text_b = text_b
+
+
+class InputFeatures(object):
+  """A single set of features of data."""
+
+  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
+    self.unique_id = unique_id
+    self.tokens = tokens
+    self.input_ids = input_ids
+    self.input_mask = input_mask
+    self.input_type_ids = input_type_ids
+
+
+def input_fn_builder(features, seq_length):
+  """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+  all_unique_ids = []
+  all_input_ids = []
+  all_input_mask = []
+  all_input_type_ids = []
+
+  for feature in features:
+    all_unique_ids.append(feature.unique_id)
+    all_input_ids.append(feature.input_ids)
+    all_input_mask.append(feature.input_mask)
+    all_input_type_ids.append(feature.input_type_ids)
+
+  def input_fn(params):
+    """The actual input function."""
+    batch_size = params["batch_size"]
+
+    num_examples = len(features)
+
+    # This is for demo purposes and does NOT scale to large data sets. We do
+    # not use Dataset.from_generator() because that uses tf.py_func which is
+    # not TPU compatible. The right way to load data is with TFRecordReader.
+    d = tf.data.Dataset.from_tensor_slices({
+        "unique_ids":
+            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
+        "input_ids":
+            tf.constant(
+                all_input_ids, shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_mask":
+            tf.constant(
+                all_input_mask,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_type_ids":
+            tf.constant(
+                all_input_type_ids,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+    })
+
+    d = d.batch(batch_size=batch_size, drop_remainder=False)
+    return d
+
+  return input_fn
+
+
+def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
+                     use_one_hot_embeddings):
+  """Returns `model_fn` closure for TPUEstimator."""
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """The `model_fn` for TPUEstimator."""
+
+    unique_ids = features["unique_ids"]
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    input_type_ids = features["input_type_ids"]
+
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=False,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=input_type_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings)
+
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      raise ValueError("Only PREDICT modes are supported: %s" % (mode))
+
+    tvars = tf.trainable_variables()
+    scaffold_fn = None
+    (assignment_map,
+     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
+         tvars, init_checkpoint)
+    if use_tpu:
+
+      def tpu_scaffold():
+        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        return tf.train.Scaffold()
+
+      scaffold_fn = tpu_scaffold
+    else:
+      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    tf.logging.info("**** Trainable Variables ****")
+    for var in tvars:
+      init_string = ""
+      if var.name in initialized_variable_names:
+        init_string = ", *INIT_FROM_CKPT*"
+      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                      init_string)
+
+    all_layers = model.get_all_encoder_layers()
+
+    predictions = {
+        "unique_id": unique_ids,
+    }
+
+    for (i, layer_index) in enumerate(layer_indexes):
+      predictions["layer_output_%d" % i] = all_layers[layer_index]
+
+    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
+    return output_spec
+
+  return model_fn
+
+
+def convert_examples_to_features(examples, seq_length, tokenizer):
+  """Loads a data file into a list of `InputBatch`s."""
+
+  features = []
+  for (ex_index, example) in enumerate(examples):
+    tokens_a = tokenizer.tokenize(example.text_a)
+
+    tokens_b = None
+    if example.text_b:
+      tokens_b = tokenizer.tokenize(example.text_b)
+
+    if tokens_b:
+      # Modifies `tokens_a` and `tokens_b` in place so that the total
+      # length is less than the specified length.
+      # Account for [CLS], [SEP], [SEP] with "- 3"
+      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
+    else:
+      # Account for [CLS] and [SEP] with "- 2"
+      if len(tokens_a) > seq_length - 2:
+        tokens_a = tokens_a[0:(seq_length - 2)]
+
+    # The convention in BERT is:
+    # (a) For sequence pairs:
+    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+    # (b) For single sequences:
+    #  tokens:   [CLS] the dog is hairy . [SEP]
+    #  type_ids: 0     0   0   0  0     0 0
+    #
+    # Where "type_ids" are used to indicate whether this is the first
+    # sequence or the second sequence. The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_length:
+      break
+    if len(tokens_a) > len(tokens_b):
+      tokens_a.pop()
+    else:
+      tokens_b.pop()
+
+
+def read_examples(input_file):
+  """Read a list of `InputExample`s from an input file."""
+  examples = []
+  unique_id = 0
+  with tf.gfile.GFile(input_file, "r") as reader:
+    while True:
+      line = tokenization.convert_to_unicode(reader.readline())
+      if not line:
+        break
+      line = line.strip()
+      text_a = None
+      text_b = None
+      m = re.match(r"^(.*) \|\|\| (.*)$", line)
+      if m is None:
+        text_a = line
+      else:
+        text_a = m.group(1)
+        text_b = m.group(2)
+      examples.append(
+          InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
+      unique_id += 1
+  return examples
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      master=FLAGS.master,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  examples = read_examples(FLAGS.input_file)
+
+  features = convert_examples_to_features(
+      examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
+
+  unique_id_to_feature = {}
+  for feature in features:
+    unique_id_to_feature[feature.unique_id] = feature
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      layer_indexes=layer_indexes,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      predict_batch_size=FLAGS.batch_size)
+
+  input_fn = input_fn_builder(
+      features=features, seq_length=FLAGS.max_seq_length)
+
+  with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
+                                               "w")) as writer:
+    for result in estimator.predict(input_fn, yield_single_examples=True):
+      unique_id = int(result["unique_id"])
+      feature = unique_id_to_feature[unique_id]
+      output_json = collections.OrderedDict()
+      output_json["linex_index"] = unique_id
+      all_features = []
+      for (i, token) in enumerate(feature.tokens):
+        all_layers = []
+        for (j, layer_index) in enumerate(layer_indexes):
+          layer_output = result["layer_output_%d" % j]
+          layers = collections.OrderedDict()
+          layers["index"] = layer_index
+          layers["values"] = [
+              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+          ]
+          all_layers.append(layers)
+        features = collections.OrderedDict()
+        features["token"] = token
+        features["layers"] = all_layers
+        all_features.append(features)
+      output_json["features"] = all_features
+      writer.write(json.dumps(output_json) + "\n")
+
+
+# Script entry point: these five flags default to None, so they are declared
+# mandatory before control is handed to tf.app.run().
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("init_checkpoint")
+  flags.mark_flag_as_required("output_file")
+  tf.app.run()
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import tensorflow as tf
+import numpy as np
+
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import copy
+import json
+import math
+import re
+import six
+import tensorflow as tf
+
+from tensorflow.python.framework import ops
+from tensorflow.contrib.layers.python.layers import utils
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.ops import init_ops
+import numpy
+from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import nn
+
+def fused_layer_norm(inputs,
+               center=True,
+               scale=True,
+               activation_fn=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               begin_norm_axis=1,
+               begin_params_axis=-1,
+               scope=None,
+               use_fused_batch_norm=False):
+  """Layer normalization with an optional fused-batch-norm code path.
+
+  Normalizes `inputs` over the axes `begin_norm_axis .. rank-1` and applies the
+  optional `gamma` (scale) and `beta` (offset) variables, whose shape is
+  `inputs.shape[begin_params_axis:]`. Both code paths use a fixed epsilon of
+  1e-4.
+
+  Args:
+    inputs: Value convertible to a tensor; its rank must be statically known.
+    center: If True, create a zero-initialized `beta` and add it to the output.
+    scale: If True, create a one-initialized `gamma` and multiply by it.
+    activation_fn: Optional callable applied to the normalized output.
+    reuse: Passed to `tf.variable_scope` as its `reuse` argument.
+    variables_collections: Collections for `beta`/`gamma`, resolved through
+      `utils.get_variable_collections`.
+    outputs_collections: Collections the output is added to.
+    trainable: Whether `beta` and `gamma` are created as trainable.
+    begin_norm_axis: First axis to normalize over; a negative value is taken
+      relative to the input rank.
+    begin_params_axis: First axis covered by `beta`/`gamma`.
+    scope: Optional variable scope name; defaults to 'LayerNorm'.
+    use_fused_batch_norm: If True, normalize through `nn.fused_batch_norm` on
+      a reshaped input instead of `nn.moments` + `nn.batch_normalization`.
+
+  Returns:
+    The result of `utils.collect_named_outputs` for the normalized (and
+    optionally activated) tensor.
+
+  Raises:
+    ValueError: If the rank of `inputs` is undefined, if either axis argument
+      is >= the input rank, or if the parameter shape is not fully defined.
+  """
+  with tf.variable_scope(
+      scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    inputs_shape = inputs.shape
+    inputs_rank = inputs_shape.ndims
+    if inputs_rank is None:
+      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+    dtype = inputs.dtype.base_dtype
+    # Only begin_norm_axis is converted to a non-negative index here;
+    # begin_params_axis is used as given when slicing the shape below.
+    if begin_norm_axis < 0:
+      begin_norm_axis = inputs_rank + begin_norm_axis
+    if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
+      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
+                       'must be < rank(inputs) (%d)' %
+                       (begin_params_axis, begin_norm_axis, inputs_rank))
+    params_shape = inputs_shape[begin_params_axis:]
+    if not params_shape.is_fully_defined():
+      raise ValueError(
+          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
+          (inputs.name, begin_params_axis, inputs_shape))
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    if center:
+      beta_collections = utils.get_variable_collections(variables_collections,
+                                                        'beta')
+      beta = variables.model_variable(
+          'beta',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=init_ops.zeros_initializer(),
+          collections=beta_collections,
+          trainable=trainable)
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma = variables.model_variable(
+          'gamma',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=init_ops.ones_initializer(),
+          collections=gamma_collections,
+          trainable=trainable)
+    if use_fused_batch_norm:
+      # Fused path: reshape to [1, rows, 1, norm_size] so that, with the NCHW
+      # layout used below, dimension 1 plays the role of the channel axis of
+      # fused_batch_norm.
+      # get static TensorShape if fully defined,
+      # otherwise retrieve shape tensor
+      norm_shape = inputs.shape[begin_norm_axis:]
+      if norm_shape.is_fully_defined():
+        bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
+      else:
+        norm_shape = tf.shape(inputs)[begin_norm_axis:]
+        bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
+      # Remember the original shape so the output can be reshaped back.
+      if inputs.get_shape().is_fully_defined():
+        outputs_shape = inputs.get_shape()
+      else:
+        outputs_shape = tf.shape(inputs)
+      inputs = array_ops.reshape(inputs, bn_shape)
+      # fused_batch_norm gets identity scale/offset (ones/zeros); gamma and
+      # beta are applied separately after reshaping back.
+      if inputs.get_shape().is_fully_defined():
+        # static inputs TensorShape fully defined after reshape.
+        ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32)
+        zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32)
+      else:
+        # static inputs TensorShape NOT fully defined after reshape.
+        # must use dynamic shape, which means these input tensors
+        # have to be created at runtime, which causes a slowdown.
+        scale_shape = tf.shape(inputs)[1]
+        ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
+        zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
+      # The returned mean and variance are not used afterwards.
+      outputs, mean, variance = nn.fused_batch_norm(
+          inputs,
+          ones, zeros,
+          epsilon=1e-4,
+          data_format="NCHW")
+      outputs = array_ops.reshape(outputs, outputs_shape)
+      if center and scale:
+        outputs = outputs * gamma + beta
+      elif center:
+        outputs = outputs + beta
+      elif scale:
+        outputs = outputs * gamma
+    else:
+      # Calculate the moments on the last axis (layer activations).
+      norm_axes = list(range(begin_norm_axis, inputs_rank))
+      mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
+      # Compute layer normalization using the batch_normalization function.
+      variance_epsilon = 1e-4
+      outputs = nn.batch_normalization(
+          inputs,
+          mean,
+          variance,
+          offset=beta,
+          scale=gamma,
+          variance_epsilon=variance_epsilon)
+      outputs.set_shape(inputs_shape)
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+import numpy as np
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
+def get_custom_getter(compute_type):
+    return float32_variable_storage_getter if compute_type == tf.float16 else None
@@ -0,0 +1,439 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator.npu import npu_loss_scale_optimizer as lso
+from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
+
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False, num_accumulation_steps=1,
+                     optimizer_type="adam", allreduce_post_accumulation=False):
+  """Creates an optimizer training op."""
+  global_step = tf.train.get_or_create_global_step()
+  
+  # avoid step change in learning rate at end of warmup phase
+  if optimizer_type == "adam":
+      power = 1.0
+      decayed_learning_rate_at_crossover_point = init_lr * (
+                  (1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
+  else:
+      power = 0.5
+      decayed_learning_rate_at_crossover_point = init_lr
+
+  adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
+  print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, adjusted_init_lr))
+
+  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
+
+  # Implements linear decay of the learning rate.
+  learning_rate = tf.train.polynomial_decay(
+      learning_rate,
+      global_step,
+      num_train_steps,
+      end_learning_rate=0.0,
+      power=power,
+      cycle=False)
+
+  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
+  # learning rate will be `global_step/num_warmup_steps * init_lr`.
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  if optimizer_type == "lamb":
+      print("Initializing LAMB Optimizer")
+      optimizer = LAMBOptimizer(
+          learning_rate=learning_rate,
+          weight_decay_rate=0.01,
+          beta_1=0.9,
+          beta_2=0.999,
+          epsilon=1e-6,
+          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+  else:
+      print("Initializing ADAM Weight Decay Optimizer")
+      # It is recommended that you use this optimizer for fine tuning, since this
+      # is how the model was trained (note that the Adam m/v variables are NOT
+      # loaded from init_checkpoint.)
+      optimizer = AdamWeightDecayOptimizer(
+          learning_rate=learning_rate,
+          weight_decay_rate=0.01,
+          beta_1=0.9,
+          beta_2=0.999,
+          epsilon=1e-4,
+          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
+    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
+
+  optimizer = NPUDistributedOptimizer(optimizer)
+  if tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]:
+    opt_tmp = optimizer
+    if tf.flags.FLAGS.npu_bert_loss_scale == 0:
+      loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(init_loss_scale=tf.flags.FLAGS.init_loss_scale_value, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
+    elif tf.flags.FLAGS.npu_bert_loss_scale >= 1:
+      loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=tf.flags.FLAGS.npu_bert_loss_scale)
+    else:
+      raise ValueError("Invalid loss scale: %d" % tf.flags.FLAGS.npu_bert_loss_scale)
+    optimizer = lso.NPULossScaleOptimizer(opt_tmp, loss_scale_manager, is_distributed=tf.flags.FLAGS.distributed)
+
+  tvars = tf.trainable_variables()
+  grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)
+
+  if num_accumulation_steps > 1:
+      local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
+                                   initializer=tf.zeros_initializer)
+      batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
+                                     initializer=tf.ones_initializer)
+      accum_vars = [tf.get_variable(
+          name=tvar.name.split(":")[0] + "/accum",
+          shape=tvar.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]
+
+      reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
+      local_step = tf.cond(reset_step, lambda:local_step.assign(tf.ones_like(local_step)), lambda:local_step.assign_add(1))
+
+      with tf.name_scope(accumulate_step):
+        grads_and_vars_and_accums = [(gv[0],gv[1],accum_vars[i]) for i, gv in enumerate(grads_and_vars) if gv[0] is not None]
+        grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))
+
+        all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if (tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]) and (manual_fp16 or use_fp16) else tf.constant(True, dtype=tf.bool)
+        batch_finite = tf.cond(reset_step,
+          lambda: batch_finite.assign(tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
+          lambda:batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))
+
+      # This is how the model was pre-trained.
+      # ensure global norm is a finite number
+      # to prevent clip_by_global_norm from having a hizzy fit.
+      if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        (clipped_grads, _) = tf.clip_by_global_norm(
+            grads, clip_norm=1.0,
+            use_norm=tf.cond(
+                all_are_finite,
+                lambda: tf.global_norm(grads),
+                lambda: tf.constant(1.0)))
+      else:
+        with tf.name_scope("clip_grads"):
+          clipped_grads = [
+            (tf.clip_by_norm(grad, clip_norm=1.0))
+            if grad is not None else (grad, var) for grad in grads
+          ]
+
+      accum_vars = tf.cond(reset_step,
+              lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
+              lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])
+
+      def update(accum_vars):
+        with tf.name_scope("opt_update"):
+          if allreduce_post_accumulation and hvd is not None:
+              accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var), compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) if isinstance(accum_var, tf.IndexedSlices)
+                            else hvd.allreduce(accum_var, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) for accum_var in accum_vars]
+          return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)
+
+      update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool), name="update_step")
+      update_op = tf.cond(update_step,
+                          lambda: update(accum_vars), lambda: tf.no_op())
+
+      new_global_step = tf.cond(tf.math.logical_and(update_step, tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)), lambda: global_step+1, lambda: global_step)
+      new_global_step = tf.identity(new_global_step, name='step_update')
+      train_op = tf.group(update_op, [global_step.assign(new_global_step)])
+  else:
+      grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
+      grads, tvars = list(zip(*grads_and_vars))
+ 
+      if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        all_are_finite = tf.reduce_all(
+            [tf.reduce_all(tf.is_finite(g)) for g in grads]) if (tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]) and (use_fp16 or manual_fp16) else tf.constant(True, dtype=tf.bool)
+
+      # This is how the model was pre-trained.
+      # ensure global norm is a finite number
+      # to prevent clip_by_global_norm from having a hizzy fit.
+      if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        (clipped_grads, _) = tf.clip_by_global_norm(
+          grads, clip_norm=1.0,
+          use_norm=tf.cond(
+              all_are_finite,
+              lambda: tf.global_norm(grads),
+              lambda: tf.constant(1.0)))
+      else:
+        with tf.name_scope("clip_grads"):
+          clipped_grads = [
+            (tf.clip_by_norm(grad, clip_norm=1.0))
+            if grad is not None else (grad, var) for grad in grads
+          ]
+      
+      with tf.name_scope("apply_grads"):
+        train_op = optimizer.apply_gradients(
+          list(zip(clipped_grads, tvars)), global_step=global_step)
+
+      #if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+      #  new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
+      #else:
+      #  new_global_step = global_step + 1
+      #new_global_step = tf.identity(new_global_step, name='step_update')
+      #train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-4,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs a AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+      manual_fp16=False):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      with tf.name_scope("apply_one_adam"):
+        if grad is None or param is None:
+          continue
+
+        param_name = self._get_variable_name(param.name)
+        has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+        if has_shadow:
+          # create shadow fp32 weights for fp16 variable
+          param_fp32 = tf.get_variable(
+              name=param_name + "/shadow",
+              dtype=tf.float32,
+              trainable=False,
+              initializer=tf.cast(param.initialized_value(),tf.float32))
+        else:
+          param_fp32 = param
+
+        m = tf.get_variable(
+            name=param_name + "/adam_m",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+        v = tf.get_variable(
+            name=param_name + "/adam_v",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+
+        # Standard Adam update.
+        next_m = (
+            tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+        next_v = (
+            tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                      tf.square(grad)))
+
+        update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+        # Just adding the square of the weights to the loss function is *not*
+        # the correct way of using L2 regularization/weight decay with Adam,
+        # since that will interact with the m and v parameters in strange ways.
+        #
+        # Instead we want to decay the weights in a manner that doesn't interact
+        # with the m/v parameters. This is equivalent to adding the square
+        # of the weights to the loss with plain (non-momentum) SGD.
+        if self._do_use_weight_decay(param_name):
+          update += self.weight_decay_rate * param_fp32
+
+        update_with_lr = self.learning_rate * update
+
+        next_param = param_fp32 - update_with_lr
+
+        if has_shadow:
+          # cast shadow fp32 weights to fp16 and assign to trainable variable
+          param.assign(tf.cast(next_param, param.dtype.base_dtype))
+        assignments.extend(
+            [param_fp32.assign(next_param),
+             m.assign(next_m),
+             v.assign(next_v)])
+    new_global_step = global_step + 1
+    new_global_step = tf.identity(new_global_step, name='step_update')
+    assignments.extend([global_step.assign(new_global_step)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
+
+
+class LAMBOptimizer(tf.train.Optimizer):
+  """A LAMB optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="LAMBOptimizer"):
+    """Constructs a LAMBOptimizer."""
+    super(LAMBOptimizer, self).__init__(False, name)
+
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+    self.steps = 0
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+      manual_fp16=False):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      with tf.name_scope("apply_one_lamb"):
+        if grad is None or param is None:
+          continue
+
+        param_name = self._get_variable_name(param.name)
+        has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+        if has_shadow:
+          # create shadow fp32 weights for fp16 variable
+          param_fp32 = tf.get_variable(
+              name=param_name + "/shadow",
+              dtype=tf.float32,
+              trainable=False,
+              initializer=tf.cast(param.initialized_value(),tf.float32))
+        else:
+          param_fp32 = param
+
+        m = tf.get_variable(
+            name=param_name + "/adam_m",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+        v = tf.get_variable(
+            name=param_name + "/adam_v",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+
+        # LAMB update
+        next_m = (
+            tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+        next_v = (
+            tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                      tf.square(grad)))
+
+        self.steps += 1
+        beta1_correction = (1 - self.beta_1 ** self.steps)
+        beta2_correction = (1 - self.beta_2 ** self.steps)
+
+        next_m_unbiased = next_m / beta1_correction
+        next_v_unbiased = next_v / beta2_correction
+
+        update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)
+
+        # Just adding the square of the weights to the loss function is *not*
+        # the correct way of using L2 regularization/weight decay with Adam,
+        # since that will interact with the m and v parameters in strange ways.
+        #
+        # Instead we want to decay the weights in a manner that doesn't interact
+        # with the m/v parameters. This is equivalent to adding the square
+        # of the weights to the loss with plain (non-momentum) SGD.
+        if self._do_use_weight_decay(param_name):
+          update += self.weight_decay_rate * param_fp32
+
+        w_norm = linalg_ops.norm(param, ord=2)
+        g_norm = linalg_ops.norm(update, ord=2)
+        ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
+            math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
+
+        update_with_lr = ratio * self.learning_rate * update
+
+        next_param = param_fp32 - update_with_lr
+
+        if has_shadow:
+          # cast shadow fp32 weights to fp16 and assign to trainable variable
+          param.assign(tf.cast(next_param, param.dtype.base_dtype))
+        assignments.extend(
+            [param_fp32.assign(next_param),
+             m.assign(next_m),
+             v.assign(next_v)])
+    new_global_step = global_step + 1
+    new_global_step = tf.identity(new_global_step, name='step_update')
+    assignments.extend([global_step.assign(new_global_step)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
@@ -0,0 +1,784 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run masked LM/next sentence masked_lm pre-training for BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import modeling
+import optimization
+import tensorflow as tf
+import glob
+from utils import LogEvalRunHook
+from tensorflow.core.protobuf import rewriter_config_pb2
+from gpu_environment import get_custom_getter
+
+from npu_bridge.estimator.npu.npu_config import *
+from npu_bridge.estimator.npu.npu_estimator import *
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
+
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
+# import hwlog
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+os.environ['WHICH_OP'] = 'GEOP'
+os.environ['NEW_GE_FE_ID'] = '1'
+os.environ['GE_AICPU_FLAG'] = '1'
+os.environ['GE_USE_STATIC_MEMORY'] = '1'
+os.environ['OPTION_EXEC_HCCL_FLAG'] = '1'
+os.environ['HCCL_CONNECT_TIMEOUT'] = '600'
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+## Required parameters
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string(
+    "input_files_dir", "./data",
+    "Directory with input files, comma separated or single directory.")
+
+flags.DEFINE_string(
+    "eval_files_dir", None,
+    "Directory with eval files, comma separated or single directory. ")
+
+flags.DEFINE_string(
+    "output_dir", "./models",
+    "The output directory where the model checkpoints will be written.")
+
+## Other parameters
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string(
+    "optimizer_type", "lamb",
+    "Optimizer used for training - LAMB or ADAM")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded. Must match data generation.")
+
+flags.DEFINE_integer(
+    "max_predictions_per_seq", 20,
+    "Maximum number of masked LM predictions per sequence. "
+    "Must match data generation.")
+
+flags.DEFINE_bool("do_train", True, "Whether to run training.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_integer("train_batch_size", 64, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_float("learning_rate", 1e-4, "The initial learning rate for Adam.")
+
+flags.DEFINE_integer("num_train_steps", 1000000, "Number of training steps.")
+
+flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 10000,
+                     "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("display_loss_steps", 10,
+                     "How often to print loss")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+
+flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
+
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update." 
+                      "Global batch size = num_accumulation_steps * train_batch_size")
+
+flags.DEFINE_bool("allreduce_post_accumulation", False, "Whether to all reduce after accumulation of N steps or after each step")
+
+flags.DEFINE_bool(
+    "verbose_logging", False,
+    "If true, all of the trainable parameters are printed")
+
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
+
+flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
+
+flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic on GPU. "
+                                        "Manual casting is done instead of using AMP")
+
+flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+
+flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
+
+flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.")
+
+flags.DEFINE_bool("distributed", True, "Whether to use multi-npu")
+
+flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')
+
+flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')
+
+flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')
+
+flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")
+
+flags.DEFINE_integer("npu_bert_loss_scale", 0, "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")
+
+flags.DEFINE_bool("npu_bert_clip_by_global_norm", False, "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")
+
+flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined gelu op')
+
+flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')
+
+flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')
+
+flags.DEFINE_integer('init_loss_scale_value', 2**32, 'Initial loss scale value for loss scale optimizer')
+
+# report samples/sec, total loss and learning rate during training
+class _LogSessionRunHook(tf.train.SessionRunHook):
+  def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
+    self.global_batch_size = global_batch_size
+    self.display_every = display_every
+    self.hvd_rank = hvd_rank
+    self.num_accumulation_steps = num_accumulation_steps
+  def after_create_session(self, session, coord):
+    self.elapsed_secs = 0.
+    self.count = 0
+    self.all_count = 0
+    self.avg_loss = 0.0
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+    if self.num_accumulation_steps <= 1:
+        if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
+            return tf.train.SessionRunArgs(
+                fetches=['global_step:0', 'total_loss:0',
+                         'learning_rate:0', 'nsp_loss:0',
+                         'mlm_loss:0', 'loss_scale:0', 'apply_grads/All:0'])
+        else:
+            return tf.train.SessionRunArgs(
+                fetches=['global_step:0', 'total_loss:0',
+                         'learning_rate:0', 'nsp_loss:0',
+                         'mlm_loss:0'])
+    else:
+        if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
+          return tf.train.SessionRunArgs(
+              fetches=['global_step:0', 'update_step:0', 'total_loss:0',
+                       'learning_rate:0', 'nsp_loss:0',
+                       'mlm_loss:0', 'loss_scale:0'])
+        else:
+          return tf.train.SessionRunArgs(
+              fetches=['global_step:0', 'update_step:0', 'total_loss:0',
+                       'learning_rate:0', 'nsp_loss:0',
+                       'mlm_loss:0'])
+  def after_run(self, run_context, run_values):
+    self.elapsed_secs += time.time() - self.t0
+    if self.num_accumulation_steps <=1:
+        if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
+            global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler, custom_arg = run_values.results
+        else:
+            global_step, total_loss, lr, nsp_loss, mlm_loss = run_values. \
+                results
+        update_step = True
+    else:
+        if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
+          global_step, update_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results
+        else:
+          global_step, update_step, total_loss, lr, nsp_loss, mlm_loss = run_values.\
+              results
+    print_step = global_step + 1 # One-based index for printing.
+    self.avg_loss += total_loss
+    self.all_count += 1
+    if update_step:
+        self.count += 1
+        dt = self.elapsed_secs / self.count
+        sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
+        avg_loss_step = self.avg_loss / self.all_count
+        if self.hvd_rank >= 0:
+          if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
+            print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e Loss scale = %6.4e isFinite = %6i' %
+                  (self.hvd_rank, print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr, loss_scaler, custom_arg), flush=True)
+            hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
+            hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
+          else:
+            print('Rank = %2d :: Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
+                  (self.hvd_rank, print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr), flush=True)
+            hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
+            hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
+        else:
+          if (tf.flags.FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16):
+            print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e Loss scale = %6.4e isFinite = %6i' %
+                  (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr, loss_scaler, custom_arg), flush=True)
+            hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
+            hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
+          else:
+            print('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
+                  (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr), flush=True)
+            hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
+            hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
+        self.elapsed_secs = 0.
+        self.count = 0
+        self.avg_loss = 0.0
+        self.all_count = 0
+
+def model_fn_builder(bert_config, init_checkpoint, learning_rate,
+                     num_train_steps, num_warmup_steps,
+                     use_one_hot_embeddings, hvd=None):
+  """Returns a `model_fn` closure to be handed to the estimator.
+
+  Args:
+    bert_config: model configuration passed to `modeling.BertModel` and to the
+      two output heads.
+    init_checkpoint: optional checkpoint path; when set, trainable variables
+      are initialised from it (only on rank 0 when `hvd` is given).
+    learning_rate: forwarded to `optimization.create_optimizer`.
+    num_train_steps: forwarded to `optimization.create_optimizer`.
+    num_warmup_steps: forwarded to `optimization.create_optimizer`.
+    use_one_hot_embeddings: forwarded to `modeling.BertModel`.
+    hvd: optional Horovod module; `None` disables the rank checks.
+
+  Returns:
+    A function `model_fn(features, labels, mode, params)` that returns a
+    `tf.estimator.EstimatorSpec` for TRAIN or EVAL mode.
+  """
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """Builds the BERT pre-training graph for the given estimator `mode`.
+
+    Raises:
+      ValueError: if `mode` is neither TRAIN nor EVAL.
+    """
+
+    tf.logging.info("*** Features ***")
+    for name in sorted(features.keys()):
+      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    segment_ids = features["segment_ids"]
+    masked_lm_positions = features["masked_lm_positions"]
+    masked_lm_ids = features["masked_lm_ids"]
+    masked_lm_weights = features["masked_lm_weights"]
+    next_sentence_labels = features["next_sentence_labels"]
+
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    # The encoder computes in fp16 when --manual_fp16 is set, fp32 otherwise.
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=is_training,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=segment_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings,
+        compute_type=tf.float16 if FLAGS.manual_fp16 else tf.float32)
+
+    (masked_lm_loss,
+     masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
+         bert_config, model.get_sequence_output(), model.get_embedding_table(), 
+         masked_lm_positions, masked_lm_ids, 
+         masked_lm_weights)
+
+    (next_sentence_loss, next_sentence_example_loss,
+     next_sentence_log_probs) = get_next_sentence_output(
+         bert_config, model.get_pooled_output(), next_sentence_labels)
+
+    # Give the losses stable tensor names: _LogSessionRunHook fetches
+    # 'mlm_loss:0', 'nsp_loss:0' and 'total_loss:0' by name.
+    masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
+    next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
+    total_loss = masked_lm_loss + next_sentence_loss
+    total_loss = tf.identity(total_loss, name='total_loss')
+
+    tvars = tf.trainable_variables()
+
+    # Stays empty unless a checkpoint is loaded; only used for the verbose
+    # variable listing below.
+    initialized_variable_names = {}
+    if init_checkpoint and (hvd is None or hvd.rank() == 0):
+      print("Loading checkpoint", init_checkpoint)
+      (assignment_map, initialized_variable_names
+      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+
+      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    if FLAGS.verbose_logging:
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+          init_string = ""
+          if var.name in initialized_variable_names:
+            init_string = ", *INIT_FROM_CKPT*"
+          tf.logging.info("  %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
+                          init_string)
+
+    output_spec = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      train_op = optimization.create_optimizer(
+          total_loss, learning_rate, num_train_steps, num_warmup_steps,
+          hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation)
+
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          train_op=train_op)
+    elif mode == tf.estimator.ModeKeys.EVAL:
+
+      def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
+                    masked_lm_weights, next_sentence_example_loss,
+                    next_sentence_log_probs, next_sentence_labels):
+        """Computes the loss and accuracy of the model."""
+        # Flatten everything so each masked position is one row.
+        masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
+                                         [-1, masked_lm_log_probs.shape[-1]])
+        masked_lm_predictions = tf.argmax(
+            masked_lm_log_probs, axis=-1, output_type=tf.int32)
+        masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
+        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
+        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
+        # Both masked-LM metrics are weighted by `masked_lm_weights`.
+        masked_lm_accuracy = tf.metrics.accuracy(
+            labels=masked_lm_ids,
+            predictions=masked_lm_predictions,
+            weights=masked_lm_weights)
+        masked_lm_mean_loss = tf.metrics.mean(
+            values=masked_lm_example_loss, weights=masked_lm_weights)
+
+        next_sentence_log_probs = tf.reshape(
+            next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
+        next_sentence_predictions = tf.argmax(
+            next_sentence_log_probs, axis=-1, output_type=tf.int32)
+        next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
+        next_sentence_accuracy = tf.metrics.accuracy(
+            labels=next_sentence_labels, predictions=next_sentence_predictions)
+        next_sentence_mean_loss = tf.metrics.mean(
+            values=next_sentence_example_loss)
+
+        return {
+            "masked_lm_accuracy": masked_lm_accuracy,
+            "masked_lm_loss": masked_lm_mean_loss,
+            "next_sentence_accuracy": next_sentence_accuracy,
+            "next_sentence_loss": next_sentence_mean_loss,
+        }
+
+      eval_metric_ops = metric_fn(
+          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
+          masked_lm_weights, next_sentence_example_loss,
+          next_sentence_log_probs, next_sentence_labels
+      )
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          eval_metric_ops=eval_metric_ops)
+    else:
+      raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
+
+    return output_spec
+
+  return model_fn
+
+
+def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
+                         label_ids, label_weights):
+  """Get loss and log probs for the masked LM."""
+  input_tensor = gather_indexes(input_tensor, positions)
+
+  with tf.variable_scope("cls/predictions"):
+    # We apply one more non-linear transformation before the output layer.
+    # This matrix is not used after pre-training.
+    with tf.variable_scope("transform", custom_getter=get_custom_getter(compute_type=tf.float16 if FLAGS.use_fp16_cls else tf.float32)):
+      if FLAGS.use_fp16_cls:
+        input_tensor = tf.cast(input_tensor, tf.float16)
+      input_tensor = tf.layers.dense(
+          input_tensor,
+          units=bert_config.hidden_size,
+          activation=modeling.get_activation(bert_config.hidden_act),
+          kernel_initializer=modeling.create_initializer(
+              bert_config.initializer_range))
+      input_tensor = tf.cast(input_tensor, tf.float32)
+      input_tensor = modeling.layer_norm(input_tensor)
+
+    # The output weights are the same as the input embeddings, but there is
+    # an output-only bias for each token.
+    output_bias = tf.get_variable(
+        "output_bias",
+        shape=[bert_config.vocab_size],
+        initializer=tf.zeros_initializer())
+    if FLAGS.use_fp16_cls:
+      input_tensor = tf.cast(input_tensor, tf.float16)
+      logits = tf.matmul(input_tensor, tf.cast(output_weights, tf.float16), transpose_b=True)
+      logits = tf.cast(logits, tf.float32)
+    else:
+      logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+    label_ids = tf.reshape(label_ids, [-1])
+    label_weights = tf.reshape(label_weights, [-1])
+
+    one_hot_labels = tf.one_hot(
+        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
+
+    # The `positions` tensor might be zero-padded (if the sequence is too
+    # short to have the maximum number of predictions). The `label_weights`
+    # tensor has a value of 1.0 for every real prediction and 0.0 for the
+    # padding predictions.
+    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
+    numerator = tf.reduce_sum(label_weights * per_example_loss)
+    denominator = tf.reduce_sum(label_weights) + 1e-5
+    loss = numerator / denominator
+
+  return (loss, per_example_loss, log_probs)
+
+
+def get_next_sentence_output(bert_config, input_tensor, labels):
+  """Get loss and log probs for the next sentence prediction."""
+
+  # Simple binary classification. Note that 0 is "next sentence" and 1 is
+  # "random sentence". This weight matrix is not used after pre-training.
+  with tf.variable_scope("cls/seq_relationship"):
+    output_weights = tf.get_variable(
+        "output_weights",
+        shape=[2, bert_config.hidden_size],
+        initializer=modeling.create_initializer(bert_config.initializer_range))
+    output_bias = tf.get_variable(
+        "output_bias", shape=[2], initializer=tf.zeros_initializer())
+
+    if FLAGS.use_fp16_cls:
+      input_tensor = tf.cast(input_tensor, tf.float16)
+      logits = tf.matmul(input_tensor, tf.cast(output_weights, tf.float16), transpose_b=True)
+      logits = tf.cast(logits, tf.float32)
+    else:
+      logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+    labels = tf.reshape(labels, [-1])
+    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
+    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+    loss = tf.reduce_mean(per_example_loss)
+    return (loss, per_example_loss, log_probs)
+
+
+def gather_indexes(sequence_tensor, positions):
+  """Gathers the vectors at the specific positions over a minibatch."""
+  sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
+  batch_size = sequence_shape[0]
+  seq_length = sequence_shape[1]
+  width = sequence_shape[2]
+
+  flat_offsets = tf.reshape(
+      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
+  flat_positions = tf.reshape(positions + flat_offsets, [-1])
+  flat_sequence_tensor = tf.reshape(sequence_tensor,
+                                    [batch_size * seq_length, width])
+  output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
+  return output_tensor
+
+
+def input_fn_builder(input_files,
+                     batch_size,
+                     max_seq_length,
+                     max_predictions_per_seq,
+                     is_training,
+                     num_cpu_threads=4,
+                     hvd=None):
+  """Creates an `input_fn` closure to be passed to Estimator."""
+
+  def input_fn():
+    """The actual input function."""
+
+    name_to_features = {
+        "input_ids":
+            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "input_mask":
+            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "segment_ids":
+            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "masked_lm_positions":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_ids":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_weights":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
+        "next_sentence_labels":
+            tf.FixedLenFeature([1], tf.int64),
+    }
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    if is_training:
+      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
+      if FLAGS.distributed: 
+        #rank_size = int(os.getenv('RANK_SIZE'))
+        #rank_id = int(os.getenv('RANK_INDEX'))
+        #device_id = int(os.getenv('DEVICE_ID'))
+        #local_rank = rank_id * 8 + device_id
+        #print('RANK_SIZE=', rank_size, ' RANK_ID=', local_rank)
+          rank_size = int(os.getenv('RANK_SIZE'))
+          rank_id = int(os.getenv('RANK_ID'))
+          print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
+          d = d.shard(rank_size, rank_id)
+      d = d.repeat()
+      if not FLAGS.npu_bert_debug:
+        d = d.shuffle(buffer_size=len(input_files))
+
+      # `cycle_length` is the number of parallel files that get read.
+      if not FLAGS.npu_bert_debug:
+        #cycle_length = min(num_cpu_threads, len(input_files))
+        cycle_length = min(num_cpu_threads, int(len(input_files)/int(os.getenv('RANK_SIZE'))))
+      else:
+        cycle_length = 1
+
+      # `sloppy` mode means that the interleaving is not exact. This adds
+      # even more randomness to the training pipeline.
+      #d = d.apply(
+      #    tf.contrib.data.parallel_interleave(
+      #        tf.data.TFRecordDataset,
+      #        sloppy=(not FLAGS.npu_bert_debug),
+      #        cycle_length=cycle_length))
+      d = d.interleave(
+          tf.data.TFRecordDataset,
+          cycle_length=cycle_length,
+          num_parallel_calls=tf.data.experimental.AUTOTUNE)
+      if not FLAGS.npu_bert_debug:
+        d = d.shuffle(buffer_size=100)
+    else:
+      d = tf.data.TFRecordDataset(input_files)
+      # Since we evaluate for a fixed number of steps we don't want to encounter
+      # out-of-range exceptions.
+      d = d.repeat()
+
+    # We must `drop_remainder` on training because the TPU requires fixed
+    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
+    # and we *don't* want to drop the remainder, otherwise we wont cover
+    # every sample.
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            num_parallel_batches=num_cpu_threads,
+            drop_remainder=True))
+    return d
+
+  return input_fn
+
+
+def _decode_record(record, name_to_features):
+  """Decodes a record to a TensorFlow example."""
+  example = tf.parse_single_example(record, name_to_features)
+
+  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+  # So cast all int64 to int32.
+  for name in list(example.keys()):
+    t = example[name]
+    if t.dtype == tf.int64:
+      t = tf.to_int32(t)
+    example[name] = t
+
+  return example
+
+
+def main(_):
+  for name, value in FLAGS.__flags.items():
+    print("name:", name, "      ", FLAGS[name].value)
+  
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if not FLAGS.do_train and not FLAGS.do_eval:
+    raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+  if FLAGS.use_fp16:
+    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+
+  if FLAGS.horovod:
+    import horovod.tensorflow as hvd
+    hvd.init()
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  if FLAGS.npu_gather:
+    if FLAGS.distributed and bert_config.num_hidden_layers == 24:
+      #from hccl.split.api import set_split_strategy_by_idx
+      from hccl.split.api import set_split_strategy_by_size
+      #set_split_strategy_by_idx([8,72,136,200,264,328,392,397])
+      set_split_strategy_by_size([10,10,10,10,15,15,15,15])
+    if FLAGS.distributed and bert_config.num_hidden_layers == 12:
+      from hccl.split.api import set_split_strategy_by_idx
+      set_split_strategy_by_idx([8,56,104,152,200,205])
+    if FLAGS.distributed and bert_config.num_hidden_layers == 6:
+      from hccl.split.api import set_split_strategy_by_idx
+      set_split_strategy_by_idx([8,40,72,104,109])
+    
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  input_files = []
+  for input_file_dir in FLAGS.input_files_dir.split(","):
+    input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
+
+  input_files.sort()
+  print("Input Files:", input_files)
+
+  if FLAGS.horovod and len(input_files) < hvd.size():
+      raise ValueError("Input Files must be sharded")
+  if FLAGS.use_fp16 and FLAGS.manual_fp16:
+      raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  config = tf.ConfigProto()
+  if FLAGS.horovod:
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+    if hvd.rank() == 0:
+      tf.logging.info("***** Configuaration *****")
+      for key in FLAGS.__flags.keys():
+          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+      tf.logging.info("**************************")
+
+#    config.gpu_options.per_process_gpu_memory_fraction = 0.7
+  if FLAGS.use_xla: 
+      config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+      config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
+
+  #run_config = tf.estimator.RunConfig(
+  run_config = NPURunConfig(
+      model_dir=FLAGS.output_dir,
+      save_summary_steps=0,
+      session_config=config,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
+      # This variable controls how often estimator reports examples/sec.
+      # Default value is every 100 steps.
+      # When --report_loss is True, we set to very large value to prevent
+      # default info reporting from estimator.
+      # Ideally we should set it to None, but that does not work.
+      log_step_count_steps=1 if FLAGS.report_loss else 100,
+      enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
+      iterations_per_loop=FLAGS.iterations_per_loop,
+      hcom_parallel=FLAGS.hcom_parallel)
+
+  if FLAGS.distributed:
+    rank_size = int(os.getenv('RANK_SIZE'))
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=FLAGS.num_train_steps,
+      num_warmup_steps=FLAGS.num_warmup_steps,
+      use_one_hot_embeddings=False,
+      hvd=None if not FLAGS.horovod else hvd)
+
+  training_hooks = []
+  """
+  if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
+    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
+    training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
+  if FLAGS.horovod and hvd.size() > 1:
+    training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
+  """
+  if FLAGS.report_loss:
+    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.distributed else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
+    training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
+
+
+  #estimator = tf.estimator.Estimator(
+  estimator = NPUEstimator(
+      model_fn=model_fn,
+      config=run_config,
+      job_start_file=FLAGS.npu_bert_job_start_file)
+
+  if FLAGS.do_train:
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    train_input_fn = input_fn_builder(
+        input_files=input_files,
+        batch_size=FLAGS.train_batch_size,
+        max_seq_length=FLAGS.max_seq_length,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        is_training=True,
+        hvd=None if not FLAGS.horovod else hvd)
+
+    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
+
+  if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    eval_files = []
+    for eval_file_dir in FLAGS.eval_files_dir.split(","):
+        eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))
+
+    eval_input_fn = input_fn_builder(
+        input_files=eval_files,
+        batch_size=FLAGS.eval_batch_size,
+        max_seq_length=FLAGS.max_seq_length,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        is_training=False,
+        hvd=None if not FLAGS.horovod else hvd)
+
+    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
+    eval_start_time = time.time()
+    result = estimator.evaluate(
+        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
+
+    eval_time_elapsed = time.time() - eval_start_time
+    eval_time_wo_overhead = eval_hooks[-1].total_time
+
+    num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
+
+    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
+
+    tf.logging.info("-----------------------------")
+    tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
+                    eval_hooks[-1].count * FLAGS.eval_batch_size)
+    tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
+                    (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
+    tf.logging.info("Summary Inference Statistics on EVAL set")
+    tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
+    tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+    tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+    tf.logging.info("-----------------------------")
+
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+        if key == 'masked_lm_accuracy':
+             hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
+        elif key == 'next_sentence_accuracy ':
+             hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
+        elif key == 'global_step':
+             hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
+        elif key == 'loss':
+             hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
+        elif key == 'masked_lm_loss':
+             hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
+        elif key == 'next_sentence_loss ':
+             hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
+        else:
+             pass
+
+
+if __name__ == "__main__":
+  hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+  cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+  config_info = get_model_parameter("tensorflow_config")
+  initinal_data = {"base_lr": 0.01, "dataset": "cn-clue/en-wiki", "optimizer": "Adam", "loss_scale": 512}
+  flags.mark_flag_as_required("input_files_dir")
+  flags.mark_flag_as_required("eval_files_dir")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  flags.mark_flag_as_required("npu_bert_job_start_file")
+  hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+  hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+  hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+  hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+  hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+  hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+  hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+  hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+  hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+  hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+  if FLAGS.use_xla and FLAGS.manual_fp16:
+    print('WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.')
+    print('         This warning message will be removed when the underlying')
+    print('         issues have been fixed and you are running a TF version')
+    print('         that has that fix.')
+  tf.app.run()
@@ -0,0 +1,215 @@
+"""
+Multiclass
+from: 
+https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
+
+"""
+
+__author__ = "Guillaume Genthial"
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
+
+
+def precision(labels, predictions, num_classes, pos_indices=None,
+              weights=None, average='micro'):
+    """Streaming multi-class precision built on a confusion matrix.
+
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        Ground-truth class ids.
+    predictions : Tensor of tf.int32 or tf.int64
+        Predicted class ids, same shape as `labels`.
+    num_classes : int
+        Total number of classes.
+    pos_indices : list of int, optional
+        Classes treated as positive; all classes when omitted.
+    weights : Tensor of tf.int32, optional
+        Mask with a shape compatible with `labels`.
+    average : str, optional
+        'micro': pool true positives, false positives and false
+            negatives over the classes in `pos_indices`.
+        'macro': unweighted mean of the per-class metric; ignores
+            class imbalance.
+        'weighted': per-class metric averaged with the number of true
+            labels of each class as weight.
+
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    confusion, confusion_update = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    # Index 0 of the (precision, recall, fbeta) triple is the precision.
+    value = metrics_from_confusion_matrix(
+        confusion, pos_indices, average=average)[0]
+    update_op = metrics_from_confusion_matrix(
+        confusion_update, pos_indices, average=average)[0]
+    return (value, update_op)
+
+
+def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
+           average='micro'):
+    """Streaming multi-class recall built on a confusion matrix.
+
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        Ground-truth class ids.
+    predictions : Tensor of tf.int32 or tf.int64
+        Predicted class ids, same shape as `labels`.
+    num_classes : int
+        Total number of classes.
+    pos_indices : list of int, optional
+        Classes treated as positive; all classes when omitted.
+    weights : Tensor of tf.int32, optional
+        Mask with a shape compatible with `labels`.
+    average : str, optional
+        'micro': pool true positives, false positives and false
+            negatives over the classes in `pos_indices`.
+        'macro': unweighted mean of the per-class metric; ignores
+            class imbalance.
+        'weighted': per-class metric averaged with the number of true
+            labels of each class as weight.
+
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    confusion, confusion_update = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    # Index 1 of the (precision, recall, fbeta) triple is the recall.
+    value = metrics_from_confusion_matrix(
+        confusion, pos_indices, average=average)[1]
+    update_op = metrics_from_confusion_matrix(
+        confusion_update, pos_indices, average=average)[1]
+    return (value, update_op)
+
+
+def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
+       average='micro'):
+    """Multi-class F1: `fbeta` evaluated with its default `beta` of 1."""
+    return fbeta(labels, predictions, num_classes,
+                 pos_indices=pos_indices, weights=weights, average=average)
+
+
+def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
+          average='micro', beta=1):
+    """Streaming multi-class F-beta built on a confusion matrix.
+
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        Ground-truth class ids.
+    predictions : Tensor of tf.int32 or tf.int64
+        Predicted class ids, same shape as `labels`.
+    num_classes : int
+        Total number of classes.
+    pos_indices : list of int, optional
+        Classes treated as positive; all classes when omitted.
+    weights : Tensor of tf.int32, optional
+        Mask with a shape compatible with `labels`.
+    average : str, optional
+        'micro': pool true positives, false positives and false
+            negatives over the classes in `pos_indices`.
+        'macro': unweighted mean of the per-class metric; ignores
+            class imbalance.
+        'weighted': per-class metric averaged with the number of true
+            labels of each class as weight.
+    beta : int, optional
+        Relative weight of recall versus precision in the harmonic mean
+        (beta > 1 favours recall, beta < 1 favours precision).
+
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    confusion, confusion_update = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    # Index 2 of the (precision, recall, fbeta) triple is the F-beta score.
+    value = metrics_from_confusion_matrix(
+        confusion, pos_indices, average=average, beta=beta)[2]
+    update_op = metrics_from_confusion_matrix(
+        confusion_update, pos_indices, average=average, beta=beta)[2]
+    return (value, update_op)
+
+
+def safe_div(numerator, denominator):
+    """Float division that yields 0 wherever the denominator equals 0."""
+    num = tf.to_float(numerator)
+    den = tf.to_float(denominator)
+    zero = tf.zeros_like(num, dtype=num.dtype)
+    return tf.where(tf.equal(den, zero), zero, num / den)
+
+
+def pr_re_fbeta(cm, pos_indices, beta=1):
+    """Derive (precision, recall, fbeta) tensors from confusion matrix `cm`.
+
+    Only the classes listed in `pos_indices` count as positive; every other
+    class is masked out of the numerator and the denominators.
+    """
+    size = cm.shape[0]
+    negatives = [c for c in range(size) if c not in pos_indices]
+
+    # Correct hits: diagonal entries, with negative classes zeroed.
+    diag_mask = np.ones([size, size])
+    diag_mask[negatives, negatives] = 0
+    hits = tf.reduce_sum(tf.diag_part(cm * diag_mask))
+
+    # Precision denominator: everything outside the negative columns.
+    column_mask = np.ones([size, size])
+    column_mask[:, negatives] = 0
+    predicted_total = tf.reduce_sum(cm * column_mask)
+
+    # Recall denominator: everything outside the negative rows.
+    row_mask = np.ones([size, size])
+    row_mask[negatives, :] = 0
+    gold_total = tf.reduce_sum(cm * row_mask)
+
+    pr = safe_div(hits, predicted_total)
+    re = safe_div(hits, gold_total)
+    score = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re)
+
+    return pr, re, score
+
+
+def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro',
+                                  beta=1):
+    """Compute (precision, recall, fbeta) from a confusion matrix.
+
+    Parameters
+    ----------
+    cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes)
+        The streaming confusion matrix.
+    pos_indices : list of int, optional
+        Classes treated as positive; all classes when omitted.
+    beta : int, optional
+        Relative weight of recall versus precision in the harmonic mean.
+    average : str, optional
+        'micro', 'macro' or 'weighted'.
+
+    Raises
+    ------
+    NotImplementedError
+        If `average` is none of the three supported modes.
+    """
+    num_classes = cm.shape[0]
+    if pos_indices is None:
+        pos_indices = list(range(num_classes))
+
+    if average == 'micro':
+        return pr_re_fbeta(cm, pos_indices, beta)
+    if average not in {'macro', 'weighted'}:
+        raise NotImplementedError()
+
+    # One-vs-rest metrics per positive class, plus that class's gold count.
+    precisions, recalls, fbetas, n_golds = [], [], [], []
+    for class_id in pos_indices:
+        class_pr, class_re, class_fb = pr_re_fbeta(cm, [class_id], beta)
+        precisions.append(class_pr)
+        recalls.append(class_re)
+        fbetas.append(class_fb)
+        row_selector = np.zeros([num_classes, num_classes])
+        row_selector[class_id, :] = 1
+        n_golds.append(tf.to_float(tf.reduce_sum(cm * row_selector)))
+
+    if average == 'macro':
+        pr = tf.reduce_mean(precisions)
+        re = tf.reduce_mean(recalls)
+        fbeta = tf.reduce_mean(fbetas)
+        return pr, re, fbeta
+
+    # 'weighted': average the per-class values using gold counts as weights.
+    n_gold = tf.reduce_sum(n_golds)
+    pr = safe_div(sum(p * n for p, n in zip(precisions, n_golds)), n_gold)
+    re = safe_div(sum(r * n for r, n in zip(recalls, n_golds)), n_gold)
+    fbeta = safe_div(sum(f * n for f, n in zip(fbetas, n_golds)), n_gold)
+    return pr, re, fbeta
@@ -0,0 +1,451 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+import tensorflow as tf
+import re
+import os
+
+
+# Maps a pretrained model shortcut name to the URL of its vocabulary file.
+# Looked up by BertTokenizer.from_pretrained before treating the name as a path.
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+}
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Raises ValueError when `do_lower_case` contradicts the checkpoint name.
+
+  The casing is not stored with the checkpoint, so it is guessed from the
+  directory that holds `bert_model.ckpt`. Unknown model names, paths that do
+  not match that layout, and an empty `init_checkpoint` are accepted silently.
+  """
+  if not init_checkpoint:
+    return
+
+  match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if match is None:
+    return
+  model_name = match.group(1)
+
+  lowercased_names = (
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12",
+  )
+  cased_names = (
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12",
+  )
+
+  # (flag value passed, casing the model name implies, flag value expected)
+  if model_name in cased_names and do_lower_case:
+    actual_flag, case_name, opposite_flag = "True", "cased", "False"
+  elif model_name in lowercased_names and not do_lower_case:
+    actual_flag, case_name, opposite_flag = "False", "lowercased", "True"
+  else:
+    return
+
+  raise ValueError(
+      "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+      "However, `%s` seems to be a %s model, so you "
+      "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+      "how the model was pre-training. If this error is wrong, please "
+      "just comment out this check." % (actual_flag, init_checkpoint,
+                                        model_name, case_name, opposite_flag))
+
+
+
+def convert_to_unicode(text):
+    """Returns `text` as `str`; `bytes` are decoded as UTF-8, dropping invalid bytes.
+
+    Raises ValueError for any other input type.
+    """
+    if isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    if isinstance(text, str):
+        return text
+    raise ValueError("Unsupported string type: %s" % (type(text)))
+
+
+def printable_text(text):
+    """Returns `text` as a `str` suitable for print or `tf.logging`.
+
+    `str` input is returned unchanged, `bytes` input is decoded as UTF-8 with
+    invalid bytes dropped, and any other type raises ValueError.
+    """
+    if not isinstance(text, (str, bytes)):
+        raise ValueError("Unsupported string type: %s" % (type(text)))
+    return text if isinstance(text, str) else text.decode("utf-8", "ignore")
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into an ordered token -> index dictionary.
+
+    Args:
+      vocab_file: Path to a UTF-8 text file with one token per line.
+
+    Returns:
+      A `collections.OrderedDict` mapping each whitespace-stripped line to its
+      zero-based line number. If a token occurs more than once, it keeps its
+      first position in the ordering but maps to the last line number seen.
+    """
+    vocab = collections.OrderedDict()
+    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
+    # locale's preferred encoding, which garbles or rejects non-ASCII tokens
+    # (e.g. the Chinese vocabulary) on hosts whose locale is not UTF-8.
+    # Text mode already yields `str`, so no further conversion is required.
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        for index, line in enumerate(reader):
+            vocab[line.strip()] = index
+    return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Looks up every element of `items` in `vocab` and returns the results as a list.
+
+  Works in either direction (tokens -> ids or ids -> tokens) depending on the
+  mapping passed in; a missing key raises whatever `vocab[item]` raises.
+  """
+  return [vocab[entry] for entry in items]
+
+
+def whitespace_tokenize(text):
+    """Strips `text` and splits it on runs of whitespace.
+
+    Returns an empty list when nothing is left after stripping.
+    """
+    stripped = text.strip()
+    return stripped.split() if stripped else []
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenization: basic tokenization followed by WordPiece."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    """Loads the vocabulary from `vocab_file` and builds both sub-tokenizers."""
+    self.vocab = load_vocab(vocab_file)
+    # Reverse mapping (id -> token) used by convert_ids_to_tokens.
+    self.inv_vocab = dict((index, token) for token, index in self.vocab.items())
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    """Returns the word pieces of `text` as a flat list, in order."""
+    pieces = []
+    for word in self.basic_tokenizer.tokenize(text):
+      pieces.extend(self.wordpiece_tokenizer.tokenize(word))
+    return pieces
+
+  def convert_tokens_to_ids(self, tokens):
+    """Maps tokens to their vocabulary ids."""
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    """Maps vocabulary ids back to their tokens."""
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BertTokenizer(object):
+    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+
+    def __init__(self, vocab_file, do_lower_case=True):
+        """Builds the tokenizer from a local vocabulary file.
+
+        Args:
+          vocab_file: Path to an existing vocabulary file (one token per line).
+          do_lower_case: Passed through to `BasicTokenizer`.
+
+        Raises:
+          ValueError: If `vocab_file` is not an existing file.
+        """
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        # Reverse mapping (id -> token), kept in vocabulary order.
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        """Returns the word pieces of `text` as a flat list, in order."""
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            split_tokens.extend(self.wordpiece_tokenizer.tokenize(token))
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab."""
+        return [self.vocab[token] for token in tokens]
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids in wordpiece tokens using the vocab."""
+        return [self.ids_to_tokens[i] for i in ids]
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
+        """Instantiates a tokenizer from a pretrained model name or a vocab path.
+
+        Args:
+          pretrained_model_name: A key of `PRETRAINED_VOCAB_ARCHIVE_MAP`, or a
+            path to a vocabulary file.
+          do_lower_case: Passed through to the constructor.
+
+        Returns:
+          The tokenizer, or None (after logging an error) when the vocabulary
+          file cannot be found.
+        """
+        # Imported locally: this module defines no logger of its own.
+        import logging
+        log = logging.getLogger(__name__)
+
+        vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP.get(
+            pretrained_model_name, pretrained_model_name)
+        # `cached_path` (a download-and-cache helper) is neither imported nor
+        # defined in this module, so calling it unconditionally raised
+        # NameError. Use it only if the module has been given one; otherwise
+        # accept local files and report everything else as not found.
+        resolver = globals().get("cached_path")
+        try:
+            if resolver is not None:
+                resolved_vocab_file = resolver(vocab_file)
+            elif os.path.isfile(vocab_file):
+                resolved_vocab_file = vocab_file
+            else:
+                raise FileNotFoundError(vocab_file)
+            if resolved_vocab_file == vocab_file:
+                log.info("loading vocabulary file {}".format(vocab_file))
+            else:
+                log.info("loading vocabulary file {} from cache at {}".format(
+                    vocab_file, resolved_vocab_file))
+            # Instantiate tokenizer.
+            tokenizer = cls(resolved_vocab_file, do_lower_case)
+        except FileNotFoundError:
+            log.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    pretrained_model_name))
+            tokenizer = None
+        return tokenizer
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+
+        Args:
+          do_lower_case: Whether to lower case the input (accents are stripped
+            as part of the same step).
+        """
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text and returns the list of tokens."""
+        cleaned = self._clean_text(convert_to_unicode(text))
+        # CJK handling was added on November 1st, 2018 for the multilingual
+        # and Chinese models. It is applied to the English models as well,
+        # which is harmless: they were not trained on Chinese data, although
+        # their vocabulary contains some Chinese characters because English
+        # Wikipedia does.
+        spaced = self._tokenize_chinese_chars(cleaned)
+
+        pieces = []
+        for word in whitespace_tokenize(spaced):
+            if self.do_lower_case:
+                word = self._run_strip_accents(word.lower())
+            pieces.extend(self._run_split_on_punc(word))
+
+        return whitespace_tokenize(" ".join(pieces))
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        decomposed = unicodedata.normalize("NFD", text)
+        # After NFD decomposition, accents are separate "Mn" (nonspacing
+        # mark) code points, which are dropped here.
+        return "".join(
+            ch for ch in decomposed if unicodedata.category(ch) != "Mn")
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        groups = []
+        start_new_word = True
+        for ch in text:
+            if _is_punctuation(ch):
+                # Each punctuation character becomes its own token.
+                groups.append([ch])
+                start_new_word = True
+                continue
+            if start_new_word:
+                groups.append([])
+                start_new_word = False
+            groups[-1].append(ch)
+
+        return ["".join(group) for group in groups]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        return "".join(
+            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
+            for ch in text)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # A "chinese character" here is anything in the CJK Unicode blocks:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Despite the name, these blocks do NOT cover all Japanese and Korean
+        # characters. Hangul, Hiragana and Katakana live in other blocks; they
+        # write space-separated words, so they get no special treatment.
+        return (
+            0x4E00 <= cp <= 0x9FFF or
+            0x3400 <= cp <= 0x4DBF or
+            0x20000 <= cp <= 0x2A6DF or
+            0x2A700 <= cp <= 0x2B73F or
+            0x2B740 <= cp <= 0x2B81F or
+            0x2B820 <= cp <= 0x2CEAF or
+            0xF900 <= cp <= 0xFAFF or
+            0x2F800 <= cp <= 0x2FA1F)
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        kept = []
+        for ch in text:
+            # Drop NUL, the replacement character U+FFFD and control chars.
+            if ord(ch) in (0, 0xfffd) or _is_control(ch):
+                continue
+            kept.append(" " if _is_whitespace(ch) else ch)
+        return "".join(kept)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        # vocab: container tested with `in` for each candidate word piece.
+        self.vocab = vocab
+        # unk_token: emitted in place of a word that is too long or that
+        # cannot be fully split into pieces found in `vocab`.
+        self.unk_token = unk_token
+        # Words with more characters than this are replaced by `unk_token`.
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer`.
+
+    Returns:
+      A list of wordpiece tokens. A word that cannot be split entirely into
+      vocabulary pieces contributes a single `unk_token`.
+    """
+
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            # Overlong words are not searched at all.
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                # Try the longest remaining substring first and shrink it from
+                # the right until a vocabulary entry is found.
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    # Pieces that do not start the word carry a "##" prefix.
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                # No piece starting at `start` exists: give up on this word.
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                # Continue right after the piece that was just matched.
+                start = end
+
+            if is_bad:
+                # Pieces matched so far are discarded for an unsplittable word.
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters, but they are treated
+    # as whitespace since they are generally considered as such.
+    if char in (" ", "\t", "\n", "\r"):
+        return True
+    # "Zs" is the Unicode space-separator category.
+    return unicodedata.category(char) == "Zs"
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # Tab, newline and carriage return are technically control characters,
+    # but they are counted as whitespace instead.
+    if char in ("\t", "\n", "\r"):
+        return False
+    # Every Unicode category whose name starts with "C" counts as control.
+    return unicodedata.category(char).startswith("C")
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    code = ord(char)
+    # All non-letter/number ASCII is treated as punctuation. Characters such
+    # as "^", "$", and "`" are not in the Unicode Punctuation class, but they
+    # are counted as punctuation anyway, for consistency.
+    if (33 <= code <= 47 or 58 <= code <= 64 or
+            91 <= code <= 96 or 123 <= code <= 126):
+        return True
+    return unicodedata.category(char).startswith("P")
@@ -0,0 +1,62 @@
+import tensorflow as tf
+import time
+
+# report latency and throughput during eval
+class LogEvalRunHook(tf.train.SessionRunHook):
+  """Session hook that times every `run` call, leaving out the first two."""
+
+  def __init__(self, global_batch_size, hvd_rank=-1):
+    self.global_batch_size = global_batch_size
+    self.hvd_rank = hvd_rank
+    self.count = 0         # run calls seen so far, skipped ones included
+    self.skipped = 0       # run calls excluded from the timing statistics
+    self.total_time = 0.0  # sum of the recorded durations, in seconds
+    self.time_list = []    # one recorded duration per counted run call
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+
+  def after_run(self, run_context, run_values):
+    elapsed_secs = time.time() - self.t0
+    self.count += 1
+
+    # Past the first 2 (arbitrary) startup iterations: record the timing.
+    if self.count > 2:
+      self.time_list.append(elapsed_secs)
+      self.total_time += elapsed_secs
+      return
+
+    print("Skipping time record for ", self.count, " due to overhead")
+    self.skipped += 1
+
+# report throughput during training
+class LogTrainRunHook(tf.train.SessionRunHook):
+  """Session hook that accumulates per-step wall time during training.
+
+  Steps whose offset from the initial global step, modulo
+  `save_checkpoints_steps`, is 0 or 1 are left out of `total_time`.
+  """
+
+  def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000):
+    self.global_batch_size = global_batch_size
+    self.hvd_rank = hvd_rank
+    self.save_checkpoints_steps = save_checkpoints_steps
+
+    self.total_time = 0.0
+    self.count = 0 # Holds number of iterations, including skipped iterations for fp16 loss scaling
+
+  def after_create_session(self, session, coord):
+    # Remember the global step the session starts from, so later steps can be
+    # measured relative to it.
+    self.init_global_step = session.run(tf.train.get_global_step())
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+    # Also fetch the tensor named 'step_update:0' with every run call; its
+    # value is stored as the current global step in after_run.
+    return tf.train.SessionRunArgs(
+        fetches=['step_update:0'])
+
+  def after_run(self, run_context, run_values):
+    elapsed_secs = time.time() - self.t0
+    self.global_step = run_values.results[0]
+    self.count += 1
+
+    # Removing first step + first two steps after every checkpoint save
+    if (self.global_step - self.init_global_step) % self.save_checkpoints_steps <= 1:
+      print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")
+    else:
+      self.total_time += elapsed_secs
+
+  def end(self, session):
+    # NOTE(review): self.global_step is only assigned in after_run, so this
+    # raises AttributeError if end() is reached before any step ran -- confirm
+    # that cannot happen in practice.
+    num_global_steps = self.global_step - self.init_global_step
+
+    # Estimate of how many steps after_run left out of total_time: two per
+    # full checkpoint interval, plus up to two in the trailing partial
+    # interval, minus one.
+    self.skipped = (num_global_steps // self.save_checkpoints_steps) * 2 + \
+                   min(2, num_global_steps % self.save_checkpoints_steps) - 1
@@ -0,0 +1,14 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [
+            {
+                "device_id": "0",
+                "device_ip": "192.168.10.101",
+                "rank_id": "0"
+            }],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,49 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [
+            {
+                "device_id": "0",
+                "device_ip": "192.168.10.101",
+                "rank_id": "0"
+            },
+            {
+                "device_id": "1",
+                "device_ip": "192.168.11.101",
+                "rank_id": "1"
+            },
+            {
+                "device_id": "2",
+                "device_ip": "192.168.12.101",
+                "rank_id": "2"
+            },
+            {
+                "device_id": "3",
+                "device_ip": "192.168.13.101",
+                "rank_id": "3"
+            },
+            {
+                "device_id": "4",
+                "device_ip": "192.168.10.100",
+                "rank_id": "4"
+            },
+            {
+                "device_id": "5",
+                "device_ip": "192.168.11.100",
+                "rank_id": "5"
+            },
+            {
+                "device_id": "6",
+                "device_ip": "192.168.12.100",
+                "rank_id": "6"
+            },
+            {
+                "device_id": "7",
+                "device_ip": "192.168.13.100",
+                "rank_id": "7"
+            }],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 2,
+  "vocab_size": 21136
+}
+
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
+
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "type_vocab_size": 2,
+  "vocab_size": 21136
+}
+
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
+
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Ascend NPU environment setup; presumably the config/npu_set_env.sh that train.sh sources, so the exports land in the caller's shell.
+# Reference only: manual environment for an ascend-toolkit install (kept commented out).
+#export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
+#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
+#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+
+# Reference only: manual environment for an nnae install (kept commented out).
+#export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/
+#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
+#export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
+#export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+# Active configuration: use the nnae install when its directory exists, otherwise fall back to ascend-toolkit; both branches also add the mpirun4.0 lib/bin directories.
+if [ -d /usr/local/Ascend/nnae/latest ];then
+	export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
+	export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+else
+	export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
+	export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+	
+fi
+# NOTE: LD_LIBRARY_PATH above is overwritten rather than appended; PYTHONPATH and PATH are appended, and $utilDir is expected to be set by the caller before sourcing.
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+#export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=0
+export HCCL_CONNECT_TIMEOUT=600
+# NOTE(review): SLOG_PRINT_TO_STDOUT=0 presumably keeps device logs off stdout and HCCL_CONNECT_TIMEOUT is presumably in seconds - confirm against the Ascend documentation.
+
+# System settings: uncomment the line below to allow unlimited-size core dumps.
+#ulimit -c unlimited
@@ -0,0 +1,67 @@
+#!/bin/bash
+rank_size=$1
+yamlPath=$2
+toolsPath=$3
+if [ -f /.dockerenv ];then
+        CLUSTER=$4
+        MPIRUN_ALL_IP="$5"
+        export CLUSTER=${CLUSTER}
+fi
+
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+currtime=`date +%Y%m%d%H%M%S`
+mkdir -p ${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+train_job_dir=${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] see more config info in ${currentDir}/config"
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train result in ${train_job_dir}"
+
+# 从 yaml 获取配置
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+# device 列表, 若无指定 device 根据 rank_size 顺序选择
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# get last device id in device_group, hw log in performance from the dir named first_device_id  
+device_group_str=`echo ${device_group} | sed 's/ //g'`
+first_device_id=`echo ${device_group_str: 0:1}`
+
+# user env
+export JOB_ID=9999001
+export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=0
+export DEVICE_INDEX=$DEVICE_ID
+
+if [ x"${CLUSTER}" == x"True" ];then
+    # ln hw log
+    ln -snf ${train_job_dir}/0/hw_bert.log ${train_job_dir}
+    this_ip=$(hostname -I |awk '{print $1}')
+    for ip in $MPIRUN_ALL_IP;do
+        if [ x"$ip" != x"$this_ip" ];then
+            scp $yamlPath root@$ip:$yamlPath
+        fi
+    done
+    export PATH=$PATH:/usr/local/mpirun4.0/bin
+    mpirun -H ${mpirun_ip} \
+    --bind-to none -map-by slot\
+    --allow-run-as-root \
+    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
+    --prefix /usr/local/mpirun4.0/ \
+    ${currentDir}/scripts/train.sh 0 $currtime $yamlPath 0 True ${toolsPath} ${rank_size}
+else
+    # ln hw log
+    ln -snf ${train_job_dir}/${first_device_id}/hw_bert.log ${train_job_dir}
+    rank_id=0
+    for device_id in ${device_group};do
+      #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ./main.log
+      ${currentDir}/scripts/train.sh $device_id $currtime $yamlPath $rank_id solo ${toolsPath} ${rank_size} &
+      let rank_id++
+    done
+fi
+wait
+
+
@@ -0,0 +1,157 @@
+#!/bin/bash
+# 0 $currtime $yamlPath  0 cluster ${toolsPath}
+device_id=$1
+currtime=$2
+yamlPath=$3
+toolsPath=$6
+rank_size=$7
+
+
+export YAML_PATH=$3
+
+mainDir=$(cd "$(dirname "$0")/.."; pwd)
+
+mkdir -p ${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+export train_job_dir=${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+
+
+#exec_path=${train_job_dir}
+
+cd ${train_job_dir}
+
+export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils"; pwd)
+export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils/atlasboost"; pwd)
+source ${mainDir}/config/npu_set_env.sh
+
+
+# 从 yaml 获取配置
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+# 声明变量
+export REMARK_LOG_FILE=hw_bert.log  # 打点日志文件名称， 必须hw_后跟模型名称小写
+# 添加日志打点模块路径
+benchmark_log_path=${mainDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
+export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
+
+export JOB_ID=9999001
+export RANK_TABLE_FILE=${mainDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=${device_id}
+export DEVICE_INDEX=$DEVICE_ID
+export RANK_INDEX=0
+
+
+export PROFILING_OPTIONS=${PROFILING_OPTIONS}
+export FP_POINT=${FP_POINT}
+export BP_POINT=${BP_POINT}
+
+if [ ${PROFILING_MODE} == True ];
+then
+	export PROFILING_MODE=true
+else
+	export PROFILING_MODE=false
+fi
+
+if [ ${PROFILING_MODE} == True ];
+then
+	export AICPU_PROFILING_MODE=true
+else
+        export AICPU_PROFILING_MODE=false
+fi
+
+
+if  [ x"${device_id}" = x ] ;
+then
+    echo "turing train fail" >> ${exec_path}/train_${device_id}.log
+    exit
+else
+    export DEVICE_ID=${device_id}
+fi
+
+
+env > ${currentDir}/env_${device_id}.log
+
+cd ${train_job_dir}
+
+if [ x"$5" != x"True" ];then
+        rank_id=$4
+        export RANK_ID=$4
+else
+        device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
+                device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
+                atlasboost.set_device_id(device_id);print(atlasboost.rank())")
+        device_id_mo=`echo $device_id_mo`
+        rank_id=${device_id_mo##* }
+        #echo rank_id is $rank_id
+        export RANK_ID=${rank_id}
+        device=${device_id_mo##*deviceid = }
+        device_id=${device%% phyid=*}
+        export DEVICE_ID=${device_id}
+        #echo device_id is $device_id
+        hccljson=${train_job_dir}/*.json
+        cp ${hccljson} ${mainDir}/config/${rank_size}p.json
+fi
+env > ${currentDir}/env_${device_id}.log
+#mkdir exec path
+
+
+mkdir -p ${train_job_dir}/${device_id}/ckpt${DEVICE_ID}
+cd ${train_job_dir}/${device_id}
+
+startTime=`date +%Y%m%d-%H:%M:%S`
+startTime_s=`date +%s`
+
+
+#start exec
+python3.7 ${mainDir}/code/pretrain/run_pretraining.py \
+    --bert_config_file=${mainDir}/config/${bert_config_file} \
+    --max_seq_length=${max_seq_length} \
+    --max_predictions_per_seq=${max_predictions_per_seq} \
+    --train_batch_size=${train_batch_size} \
+    --learning_rate=${learning_rate} \
+    --num_warmup_steps=${num_warmup_steps} \
+    --num_train_steps=${num_train_steps} \
+    --optimizer_type=${optimizer_type} \
+    --manual_fp16=${manual_fp16} \
+    --use_fp16_cls=${use_fp16_cls} \
+    --input_files_dir=${input_files_dir} \
+    --eval_files_dir=${eval_files_dir} \
+    --npu_bert_debug=${npu_bert_debug} \
+    --npu_bert_use_tdt=${npu_bert_use_tdt} \
+    --do_train=${do_train} \
+    --do_eval=${do_eval} \
+    --num_accumulation_steps=${num_accumulation_steps} \
+    --npu_bert_job_start_file=None \
+    --iterations_per_loop=${iterations_per_loop} \
+    --npu_bert_loss_scale=${npu_bert_loss_scale} \
+    --distributed=${distributed} \
+    --save_checkpoints_steps=${save_checkpoints_steps} \
+    --npu_bert_clip_by_global_norm=${npu_bert_clip_by_global_norm} \
+    --output_dir=${train_job_dir}/${device_id}/ckpt${DEVICE_ID} > ${train_job_dir}/train_${device_id}.log 2>&1
+
+
+if [ $? -eq 0 ] ;then
+    echo ":::ABK 1.0.0 bert train success"
+    echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/train_${device_id}.log
+    echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/${device_id}/hw_bert.log
+else
+    echo ":::ABK 1.0.0 bert train failed"
+    echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/train_${device_id}.log
+    echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/${device_id}/hw_bert.log
+fi
+
+endTime=`date +%Y%m%d-%H:%M:%S`
+endTime_s=`date +%s`
+sumTime=$[ $endTime_s - $startTime_s ]
+hour=$(( $sumTime/3600 ))
+min=$(( ($sumTime-${hour}*3600)/60 ))
+sec=$(( $sumTime-${hour}*3600-${min}*60 ))
+echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}"
+echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_bert.log
+
+#if [ x"$5" == x"solo" ];
+#then
+#    /bin/cp -f hw_bert.log $perfDir/hw_bert.log
+#fi