[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,56 @@
+# Bert-Base_tensorflow训练说明
+
+### 1. 模型训练参数配置
+
+在train/yaml/Bert-Base.yaml中修改相应配置， 配置项含义:
+
+```
+ tensorflow_config:
+    #layer层数有6和12两种，中文数据集用 bert_base_layer6_cn.json/bert_base_layer12_cn.json，英文数据集用 bert_base_layer6_en.json/bert_base_layer12_en.json
+    bert_config_file: bert_base_layer6_cn.json
+    #数据集句子长度是256时，max_seq_length 和 max_predictions_per_seq 分别设置为 256 和 40；句子长度是128时，分别设置为 128 和 20
+    max_seq_length: 128
+    max_predictions_per_seq: 20
+    
+    # 最佳性能train_batch_size为160 
+    train_batch_size: 160
+    learning_rate: 1e-4
+    num_warmup_steps: 100
+    num_train_steps: 1000
+    optimizer_type: adam
+    manual_fp16: True
+    use_fp16_cls: True
+    input_files_dir: 数据集路径
+    eval_files_dir: 数据集路径
+    npu_bert_debug: False
+    npu_bert_use_tdt: True
+    distributed: True
+    do_train: True
+    do_eval: False
+    num_accumulation_steps: 1
+    iterations_per_loop: 100
+    npu_bert_loss_scale: 0
+    save_checkpoints_steps: 1000
+    npu_bert_clip_by_global_norm: False
+
+    # docker 镜像名称:版本号
+    docker_image: c73:b021
+
+    # 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
+    mpirun_ip: 90.90.140.199:8,90.90.140.229:8
+
+    # 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
+    device_group_1p: 6
+    device_group_2p: 0 1
+    device_group_4p: 0 1 2 3
+```
+
+------
+
+
+
+
+
+
+
+    
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
@@ -0,0 +1,13 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 2,
+  "vocab_size": 30522 
+}
@@ -0,0 +1,442 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Create masked LM/next sentence masked_lm TF examples for BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import random
+import tokenization
+import tensorflow as tf
+
+# Command-line flags that configure the pre-training data generator.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_file", None,
+                    "Input raw text file (or comma-separated list of files).")
+
+flags.DEFINE_string(
+    "output_file", None,
+    "Output TF example file (or comma-separated list of files).")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
+
+flags.DEFINE_integer("max_predictions_per_seq", 20,
+                     "Maximum number of masked LM predictions per sequence.")
+
+flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
+
+flags.DEFINE_integer(
+    "dupe_factor", 10,
+    "Number of times to duplicate the input data (with different masks).")
+
+flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
+
+flags.DEFINE_float(
+    "short_seq_prob", 0.1,
+    "Probability of creating sequences which are shorter than the "
+    "maximum length.")
+
+
+class TrainingInstance(object):
+  """A single training instance (sentence pair)."""
+
+  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
+               is_random_next):
+    self.tokens = tokens
+    self.segment_ids = segment_ids
+    self.is_random_next = is_random_next
+    self.masked_lm_positions = masked_lm_positions
+    self.masked_lm_labels = masked_lm_labels
+
+  def __str__(self):
+    s = ""
+    s += "tokens: %s\n" % (" ".join(
+        [tokenization.printable_text(x) for x in self.tokens]))
+    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
+    s += "is_random_next: %s\n" % self.is_random_next
+    s += "masked_lm_positions: %s\n" % (" ".join(
+        [str(x) for x in self.masked_lm_positions]))
+    s += "masked_lm_labels: %s\n" % (" ".join(
+        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
+    s += "\n"
+    return s
+
+  def __repr__(self):
+    return self.__str__()
+
+
+def write_instance_to_example_files(instances, tokenizer, max_seq_length,
+                                    max_predictions_per_seq, output_files):
+  """Create TF example files from `TrainingInstance`s."""
+  writers = []
+  for output_file in output_files:
+    writers.append(tf.python_io.TFRecordWriter(output_file))
+
+  writer_index = 0
+
+  total_written = 0
+  for (inst_index, instance) in enumerate(instances):
+    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
+    input_mask = [1] * len(input_ids)
+    segment_ids = list(instance.segment_ids)
+    assert len(input_ids) <= max_seq_length
+
+    while len(input_ids) < max_seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      segment_ids.append(0)
+
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+
+    masked_lm_positions = list(instance.masked_lm_positions)
+    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
+    masked_lm_weights = [1.0] * len(masked_lm_ids)
+
+    while len(masked_lm_positions) < max_predictions_per_seq:
+      masked_lm_positions.append(0)
+      masked_lm_ids.append(0)
+      masked_lm_weights.append(0.0)
+
+    next_sentence_label = 1 if instance.is_random_next else 0
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(input_ids)
+    features["input_mask"] = create_int_feature(input_mask)
+    features["segment_ids"] = create_int_feature(segment_ids)
+    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
+    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
+    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
+    features["next_sentence_labels"] = create_int_feature([next_sentence_label])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+
+    writers[writer_index].write(tf_example.SerializeToString())
+    writer_index = (writer_index + 1) % len(writers)
+
+    total_written += 1
+
+    if inst_index < 20:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in instance.tokens]))
+
+      for feature_name in features.keys():
+        feature = features[feature_name]
+        values = []
+        if feature.int64_list.value:
+          values = feature.int64_list.value
+        elif feature.float_list.value:
+          values = feature.float_list.value
+        tf.logging.info(
+            "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
+
+  for writer in writers:
+    writer.close()
+
+  tf.logging.info("Wrote %d total instances", total_written)
+
+
+def create_int_feature(values):
+  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+  return feature
+
+
+def create_float_feature(values):
+  feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+  return feature
+
+
+def create_training_instances(input_files, tokenizer, max_seq_length,
+                              dupe_factor, short_seq_prob, masked_lm_prob,
+                              max_predictions_per_seq, rng):
+  """Create `TrainingInstance`s from raw text.
+
+  Args:
+    input_files: paths of the text files to read.
+    tokenizer: object providing `tokenize(line)` and a `vocab` mapping.
+    max_seq_length: forwarded to `create_instances_from_document`.
+    dupe_factor: number of passes made over the full document list.
+    short_seq_prob: forwarded to `create_instances_from_document`.
+    masked_lm_prob: forwarded to `create_instances_from_document`.
+    max_predictions_per_seq: forwarded to `create_instances_from_document`.
+    rng: random generator used for both shuffles and passed to the helper.
+
+  Returns:
+    A shuffled list with the instances produced for every document on
+    every pass.
+  """
+  # Start with one open (empty) document that the first sentences go into.
+  all_documents = [[]]
+
+  # Input file format:
+  # (1) One sentence per line. These should ideally be actual sentences, not
+  # entire paragraphs or arbitrary spans of text. (Because we use the
+  # sentence boundaries for the "next sentence prediction" task).
+  # (2) Blank lines between documents. Document boundaries are needed so
+  # that the "next sentence prediction" task doesn't span between documents.
+  for input_file in input_files:
+    with tf.gfile.GFile(input_file, "r") as reader:
+      while True:
+        line = tokenization.convert_to_unicode(reader.readline())
+        # An empty read (before stripping) ends the file.
+        if not line:
+          break
+        line = line.strip()
+
+        # Empty lines are used as document delimiters
+        if not line:
+          all_documents.append([])
+        # The blank line is still tokenized; it is only skipped because the
+        # resulting token list is falsy (presumably empty -- TODO confirm
+        # against the tokenizer).
+        tokens = tokenizer.tokenize(line)
+        if tokens:
+          all_documents[-1].append(tokens)
+
+  # Remove empty documents
+  all_documents = [x for x in all_documents if x]
+  rng.shuffle(all_documents)
+
+  vocab_words = list(tokenizer.vocab.keys())
+  instances = []
+  # Each pass re-processes every document with the shared `rng`.
+  for _ in range(dupe_factor):
+    for document_index in range(len(all_documents)):
+      instances.extend(
+          create_instances_from_document(
+              all_documents, document_index, max_seq_length, short_seq_prob,
+              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
+
+  rng.shuffle(instances)
+  return instances
+
+
+def create_instances_from_document(
+    all_documents, document_index, max_seq_length, short_seq_prob,
+    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
+  """Creates `TrainingInstance`s for a single document.
+
+  Sentences of the document are accumulated into chunks of roughly
+  `target_seq_length` tokens. Each chunk is split into a segment A and a
+  segment B, where B is either the remainder of the chunk or text taken
+  from a randomly selected document.
+
+  Args:
+    all_documents: list of documents, each a list of token lists.
+    document_index: index in `all_documents` of the document to process.
+    max_seq_length: hard limit on the instance length including the
+      [CLS] and two [SEP] tokens.
+    short_seq_prob: probability of using a shorter random target length.
+    masked_lm_prob: forwarded to `create_masked_lm_predictions`.
+    max_predictions_per_seq: forwarded to `create_masked_lm_predictions`.
+    vocab_words: forwarded to `create_masked_lm_predictions`.
+    rng: random generator driving every random choice.
+
+  Returns:
+    A list of `TrainingInstance` objects.
+
+  Raises:
+    AssertionError: if segment A or segment B is empty after truncation.
+  """
+  document = all_documents[document_index]
+
+  # Account for [CLS], [SEP], [SEP]
+  max_num_tokens = max_seq_length - 3
+
+  # We *usually* want to fill up the entire sequence since we are padding
+  # to `max_seq_length` anyways, so short sequences are generally wasted
+  # computation. However, we *sometimes*
+  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+  # sequences to minimize the mismatch between pre-training and fine-tuning.
+  # The `target_seq_length` is just a rough target however, whereas
+  # `max_seq_length` is a hard limit.
+  target_seq_length = max_num_tokens
+  if rng.random() < short_seq_prob:
+    target_seq_length = rng.randint(2, max_num_tokens)
+
+  # We DON'T just concatenate all of the tokens from a document into a long
+  # sequence and choose an arbitrary split point because this would make the
+  # next sentence prediction task too easy. Instead, we split the input into
+  # segments "A" and "B" based on the actual "sentences" provided by the user
+  # input.
+  instances = []
+  current_chunk = []
+  current_length = 0
+  i = 0
+  while i < len(document):
+    segment = document[i]
+    current_chunk.append(segment)
+    current_length += len(segment)
+    # Emit an instance at the end of the document or once the chunk has
+    # reached the target length.
+    if i == len(document) - 1 or current_length >= target_seq_length:
+      if current_chunk:
+        # `a_end` is how many segments from `current_chunk` go into the `A`
+        # (first) sentence.
+        a_end = 1
+        if len(current_chunk) >= 2:
+          a_end = rng.randint(1, len(current_chunk) - 1)
+
+        tokens_a = []
+        for j in range(a_end):
+          tokens_a.extend(current_chunk[j])
+
+        tokens_b = []
+        # Random next
+        is_random_next = False
+        # A single-sentence chunk has nothing left for B, so it always takes
+        # the random-next branch.
+        if len(current_chunk) == 1 or rng.random() < 0.5:
+          is_random_next = True
+          target_b_length = target_seq_length - len(tokens_a)
+
+          # This should rarely go for more than one iteration for large
+          # corpora. However, just to be careful, we try to make sure that
+          # the random document is not the same as the document
+          # we're processing.
+          # NOTE: if all 10 draws return `document_index`, the current
+          # document itself is used as the "random" document.
+          for _ in range(10):
+            random_document_index = rng.randint(0, len(all_documents) - 1)
+            if random_document_index != document_index:
+              break
+
+          random_document = all_documents[random_document_index]
+          random_start = rng.randint(0, len(random_document) - 1)
+          for j in range(random_start, len(random_document)):
+            tokens_b.extend(random_document[j])
+            if len(tokens_b) >= target_b_length:
+              break
+          # We didn't actually use these segments so we "put them back" so
+          # they don't go to waste.
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        # Layout: [CLS] A... [SEP] B... [SEP]; segment id 0 covers [CLS], A
+        # and the first [SEP], segment id 1 covers B and the final [SEP].
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      # Start a fresh chunk for the next instance.
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    cand_indexes.append(i)
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    if index in covered_indexes:
+      continue
+    covered_indexes.add(index)
+
+    masked_token = None
+    # 80% of the time, replace with [MASK]
+    if rng.random() < 0.8:
+      masked_token = "[MASK]"
+    else:
+      # 10% of the time, keep original
+      if rng.random() < 0.5:
+        masked_token = tokens[index]
+      # 10% of the time, replace with random word
+      else:
+        masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
+
+    output_tokens[index] = masked_token
+
+    masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+
+  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+
+  masked_lm_positions = []
+  masked_lm_labels = []
+  for p in masked_lms:
+    masked_lm_positions.append(p.index)
+    masked_lm_labels.append(p.label)
+
+  return (output_tokens, masked_lm_positions, masked_lm_labels)
+
+
+def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
+  """Truncates a pair of sequences to a maximum sequence length."""
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_num_tokens:
+      break
+
+    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+    assert len(trunc_tokens) >= 1
+
+    # We want to sometimes truncate from the front and sometimes from the
+    # back to add more randomness and avoid biases.
+    if rng.random() < 0.5:
+      del trunc_tokens[0]
+    else:
+      trunc_tokens.pop()
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  input_files = []
+  for input_pattern in FLAGS.input_file.split(","):
+    input_files.extend(tf.gfile.Glob(input_pattern))
+
+  tf.logging.info("*** Reading from input files ***")
+  for input_file in input_files:
+    tf.logging.info("  %s", input_file)
+
+  rng = random.Random(FLAGS.random_seed)
+  instances = create_training_instances(
+      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
+      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
+      rng)
+
+  output_files = FLAGS.output_file.split(",")
+  tf.logging.info("*** Writing to output files ***")
+  for output_file in output_files:
+    tf.logging.info("  %s", output_file)
+
+  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
+                                  FLAGS.max_predictions_per_seq, output_files)
+
+
+# Script entry point: mark the three mandatory flags as required, then start
+# the application through tf.app.run().
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("output_file")
+  flags.mark_flag_as_required("vocab_file")
+  tf.app.run()
@@ -0,0 +1,419 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract pre-computed feature vectors from BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import codecs
+import collections
+import json
+import re
+
+import modeling
+import tokenization
+import tensorflow as tf
+
+# Command-line flags that configure the feature-extraction script.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_file", None, "")
+
+flags.DEFINE_string("output_file", None, "")
+
+# Comma-separated layer indexes; main() parses each entry with int().
+flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
+
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+flags.DEFINE_string("master", None,
+                    "If using a TPU, the address of the master.")
+
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
+
+flags.DEFINE_bool(
+    "use_one_hot_embeddings", False,
+    "If True, tf.one_hot will be used for embedding lookups, otherwise "
+    "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
+    "since it is much faster.")
+
+
+class InputExample(object):
+
+  def __init__(self, unique_id, text_a, text_b):
+    self.unique_id = unique_id
+    self.text_a = text_a
+    self.text_b = text_b
+
+
+class InputFeatures(object):
+  """A single set of features of data."""
+
+  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
+    self.unique_id = unique_id
+    self.tokens = tokens
+    self.input_ids = input_ids
+    self.input_mask = input_mask
+    self.input_type_ids = input_type_ids
+
+
+def input_fn_builder(features, seq_length):
+  """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+  all_unique_ids = []
+  all_input_ids = []
+  all_input_mask = []
+  all_input_type_ids = []
+
+  for feature in features:
+    all_unique_ids.append(feature.unique_id)
+    all_input_ids.append(feature.input_ids)
+    all_input_mask.append(feature.input_mask)
+    all_input_type_ids.append(feature.input_type_ids)
+
+  def input_fn(params):
+    """The actual input function."""
+    batch_size = params["batch_size"]
+
+    num_examples = len(features)
+
+    # This is for demo purposes and does NOT scale to large data sets. We do
+    # not use Dataset.from_generator() because that uses tf.py_func which is
+    # not TPU compatible. The right way to load data is with TFRecordReader.
+    d = tf.data.Dataset.from_tensor_slices({
+        "unique_ids":
+            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
+        "input_ids":
+            tf.constant(
+                all_input_ids, shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_mask":
+            tf.constant(
+                all_input_mask,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_type_ids":
+            tf.constant(
+                all_input_type_ids,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+    })
+
+    d = d.batch(batch_size=batch_size, drop_remainder=False)
+    return d
+
+  return input_fn
+
+
+def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
+                     use_one_hot_embeddings):
+  """Returns `model_fn` closure for TPUEstimator.
+
+  Args:
+    bert_config: configuration object passed to `modeling.BertModel`.
+    init_checkpoint: checkpoint path the trainable variables are initialized
+      from.
+    layer_indexes: indexes into the model's encoder layers; one prediction
+      output is emitted per entry.
+    use_tpu: if true, checkpoint initialization is deferred to a scaffold
+      function instead of being done while the graph is built.
+    use_one_hot_embeddings: forwarded to `modeling.BertModel`.
+
+  Returns:
+    A `model_fn(features, labels, mode, params)` that only supports PREDICT
+    mode.
+  """
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """The `model_fn` for TPUEstimator.
+
+    Raises:
+      ValueError: if `mode` is not `tf.estimator.ModeKeys.PREDICT`.
+    """
+
+    unique_ids = features["unique_ids"]
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    input_type_ids = features["input_type_ids"]
+
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=False,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=input_type_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings)
+
+    # The mode is checked only after the model graph has been built.
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      raise ValueError("Only PREDICT modes are supported: %s" % (mode))
+
+    tvars = tf.trainable_variables()
+    scaffold_fn = None
+    (assignment_map,
+     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
+         tvars, init_checkpoint)
+    if use_tpu:
+
+      # On TPU the checkpoint is loaded inside the scaffold function that is
+      # handed to the estimator spec.
+      def tpu_scaffold():
+        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        return tf.train.Scaffold()
+
+      scaffold_fn = tpu_scaffold
+    else:
+      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    # Log every trainable variable and whether it is listed among the names
+    # initialized from the checkpoint.
+    tf.logging.info("**** Trainable Variables ****")
+    for var in tvars:
+      init_string = ""
+      if var.name in initialized_variable_names:
+        init_string = ", *INIT_FROM_CKPT*"
+      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                      init_string)
+
+    all_layers = model.get_all_encoder_layers()
+
+    predictions = {
+        "unique_id": unique_ids,
+    }
+
+    # Output keys are numbered by position in `layer_indexes`, not by the
+    # layer index itself.
+    for (i, layer_index) in enumerate(layer_indexes):
+      predictions["layer_output_%d" % i] = all_layers[layer_index]
+
+    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
+    return output_spec
+
+  return model_fn
+
+
+def convert_examples_to_features(examples, seq_length, tokenizer):
+  """Converts `InputExample`s into padded `InputFeatures`.
+
+  Args:
+    examples: iterable of objects with `unique_id`, `text_a` and `text_b`.
+    seq_length: length every feature sequence is truncated/padded to.
+    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
+
+  Returns:
+    A list with one `InputFeatures` per example, in input order.
+
+  Raises:
+    AssertionError: if a padded sequence does not have length `seq_length`.
+  """
+
+  features = []
+  for (ex_index, example) in enumerate(examples):
+    tokens_a = tokenizer.tokenize(example.text_a)
+
+    tokens_b = None
+    if example.text_b:
+      tokens_b = tokenizer.tokenize(example.text_b)
+
+    if tokens_b:
+      # Modifies `tokens_a` and `tokens_b` in place so that the total
+      # length is at most the specified length.
+      # Account for [CLS], [SEP], [SEP] with "- 3"
+      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
+    else:
+      # Account for [CLS] and [SEP] with "- 2"
+      if len(tokens_a) > seq_length - 2:
+        tokens_a = tokens_a[0:(seq_length - 2)]
+
+    # The convention in BERT is:
+    # (a) For sequence pairs:
+    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+    # (b) For single sequences:
+    #  tokens:   [CLS] the dog is hairy . [SEP]
+    #  type_ids: 0     0   0   0  0     0 0
+    #
+    # Where "type_ids" are used to indicate whether this is the first
+    # sequence or the second sequence. The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    # Log only the first five examples as a sanity check.
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    # `tokens` is stored unpadded; only the id/mask/type sequences are padded.
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_length:
+      break
+    if len(tokens_a) > len(tokens_b):
+      tokens_a.pop()
+    else:
+      tokens_b.pop()
+
+
+def read_examples(input_file):
+  """Read a list of `InputExample`s from an input file."""
+  examples = []
+  unique_id = 0
+  with tf.gfile.GFile(input_file, "r") as reader:
+    while True:
+      line = tokenization.convert_to_unicode(reader.readline())
+      if not line:
+        break
+      line = line.strip()
+      text_a = None
+      text_b = None
+      m = re.match(r"^(.*) \|\|\| (.*)$", line)
+      if m is None:
+        text_a = line
+      else:
+        text_a = m.group(1)
+        text_b = m.group(2)
+      examples.append(
+          InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
+      unique_id += 1
+  return examples
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      master=FLAGS.master,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  examples = read_examples(FLAGS.input_file)
+
+  features = convert_examples_to_features(
+      examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
+
+  unique_id_to_feature = {}
+  for feature in features:
+    unique_id_to_feature[feature.unique_id] = feature
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      layer_indexes=layer_indexes,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      predict_batch_size=FLAGS.batch_size)
+
+  input_fn = input_fn_builder(
+      features=features, seq_length=FLAGS.max_seq_length)
+
+  with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
+                                               "w")) as writer:
+    for result in estimator.predict(input_fn, yield_single_examples=True):
+      unique_id = int(result["unique_id"])
+      feature = unique_id_to_feature[unique_id]
+      output_json = collections.OrderedDict()
+      output_json["linex_index"] = unique_id
+      all_features = []
+      for (i, token) in enumerate(feature.tokens):
+        all_layers = []
+        for (j, layer_index) in enumerate(layer_indexes):
+          layer_output = result["layer_output_%d" % j]
+          layers = collections.OrderedDict()
+          layers["index"] = layer_index
+          layers["values"] = [
+              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+          ]
+          all_layers.append(layers)
+        features = collections.OrderedDict()
+        features["token"] = token
+        features["layers"] = all_layers
+        all_features.append(features)
+      output_json["features"] = all_features
+      writer.write(json.dumps(output_json) + "\n")
+
+
+# Script entry point: mark the five mandatory flags as required, then start
+# the application through tf.app.run().
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("init_checkpoint")
+  flags.mark_flag_as_required("output_file")
+  tf.app.run()
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import tensorflow as tf
+import numpy as np
+
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import copy
+import json
+import math
+import re
+import six
+import tensorflow as tf
+
+from tensorflow.python.framework import ops
+from tensorflow.contrib.layers.python.layers import utils
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.ops import init_ops
+import numpy
+from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import nn
+
+def fused_layer_norm(inputs,
+               center=True,
+               scale=True,
+               activation_fn=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               begin_norm_axis=1,
+               begin_params_axis=-1,
+               scope=None,
+               use_fused_batch_norm=False):
+  """Layer-normalizes `inputs`, optionally through `nn.fused_batch_norm`.
+
+  Args:
+    inputs: Value convertible to a tensor whose rank is statically known.
+    center: If True, create a zero-initialized `beta` variable and add it to
+      the normalized tensor.
+    scale: If True, create a one-initialized `gamma` variable and multiply the
+      normalized tensor by it.
+    activation_fn: Optional callable applied to the normalized tensor.
+    reuse: Forwarded to `tf.variable_scope`.
+    variables_collections: Passed to `utils.get_variable_collections` to pick
+      the collections for `beta` and `gamma`.
+    outputs_collections: Passed to `utils.collect_named_outputs`.
+    trainable: Forwarded to the creation of `beta` and `gamma`.
+    begin_norm_axis: First axis of the span that is normalized (the span runs
+      to the last axis). Negative values count from the end.
+    begin_params_axis: Start of the slice of the input shape that is used as
+      the shape of `beta` and `gamma`.
+    scope: Variable scope; defaults to 'LayerNorm' when None.
+    use_fused_batch_norm: If True, reshape the input and normalize it with
+      `nn.fused_batch_norm`; otherwise use `nn.moments` followed by
+      `nn.batch_normalization`.
+
+  Returns:
+    The value of `utils.collect_named_outputs` for the normalized (and, if
+    requested, activated) tensor.
+
+  Raises:
+    ValueError: If the rank of `inputs` is unknown, if either axis argument is
+      >= the rank, or if the parameter shape is not fully defined.
+  """
+  with tf.variable_scope(
+      scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    inputs_shape = inputs.shape
+    inputs_rank = inputs_shape.ndims
+    if inputs_rank is None:
+      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+    dtype = inputs.dtype.base_dtype
+    # Only begin_norm_axis is converted to a non-negative index; a negative
+    # begin_params_axis is used as-is by the slice below.
+    if begin_norm_axis < 0:
+      begin_norm_axis = inputs_rank + begin_norm_axis
+    if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
+      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
+                       'must be < rank(inputs) (%d)' %
+                       (begin_params_axis, begin_norm_axis, inputs_rank))
+    params_shape = inputs_shape[begin_params_axis:]
+    if not params_shape.is_fully_defined():
+      raise ValueError(
+          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
+          (inputs.name, begin_params_axis, inputs_shape))
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    if center:
+      beta_collections = utils.get_variable_collections(variables_collections,
+                                                        'beta')
+      beta = variables.model_variable(
+          'beta',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=init_ops.zeros_initializer(),
+          collections=beta_collections,
+          trainable=trainable)
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma = variables.model_variable(
+          'gamma',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=init_ops.ones_initializer(),
+          collections=gamma_collections,
+          trainable=trainable)
+    if use_fused_batch_norm:
+      # get static TensorShape if fully defined,
+      # otherwise retrieve shape tensor
+      norm_shape = inputs.shape[begin_norm_axis:]
+      # bn_shape is a 4-D shape for data_format="NCHW": the normalized span is
+      # flattened into the last dimension and everything before it is folded
+      # into the second (channel) dimension.
+      if norm_shape.is_fully_defined():
+        bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
+      else:
+        norm_shape = tf.shape(inputs)[begin_norm_axis:]
+        bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
+      # Remember the original shape so the result can be reshaped back.
+      if inputs.get_shape().is_fully_defined():
+        outputs_shape = inputs.get_shape()
+      else:
+        outputs_shape = tf.shape(inputs)
+      inputs = array_ops.reshape(inputs, bn_shape)
+      # The fused kernel is given a unit scale and a zero offset (float32),
+      # one entry per channel; gamma and beta are applied further below.
+      if inputs.get_shape().is_fully_defined():
+        # static inputs TensorShape fully defined after reshape.
+        ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32)
+        zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32)
+      else:
+        # static inputs TensorShape NOT fully defined after reshape.
+        # must use dynamic shape, which means these input tensors
+        # have to be created at runtime, which causes a slowdown.
+        scale_shape = tf.shape(inputs)[1]
+        ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
+        zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
+      # mean and variance are returned by the kernel but not used afterwards.
+      outputs, mean, variance = nn.fused_batch_norm(
+          inputs,
+          ones, zeros,
+          epsilon=1e-4,
+          data_format="NCHW")
+      outputs = array_ops.reshape(outputs, outputs_shape)
+      # Apply the learned affine transform on the tensor in its original shape.
+      if center and scale:
+        outputs = outputs * gamma + beta
+      elif center:
+        outputs = outputs + beta
+      elif scale:
+        outputs = outputs * gamma
+    else:
+      # Calculate the moments on the last axis (layer activations).
+      norm_axes = list(range(begin_norm_axis, inputs_rank))
+      mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
+      # Compute layer normalization using the batch_normalization function.
+      # Same epsilon (1e-4) as the fused branch above.
+      variance_epsilon = 1e-4
+      outputs = nn.batch_normalization(
+          inputs,
+          mean,
+          variance,
+          offset=beta,
+          scale=gamma,
+          variance_epsilon=variance_epsilon)
+      outputs.set_shape(inputs_shape)
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+import numpy as np
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
+def get_custom_getter(compute_type):
+    return float32_variable_storage_getter if compute_type == tf.float16 else None
@@ -0,0 +1,439 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator.npu import npu_loss_scale_optimizer as lso
+from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
+
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False, num_accumulation_steps=1,
+                     optimizer_type="adam", allreduce_post_accumulation=False):
+  """Creates an optimizer training op."""
+  global_step = tf.train.get_or_create_global_step()
+  
+  # avoid step change in learning rate at end of warmup phase
+  if optimizer_type == "adam":
+      power = 1.0
+      decayed_learning_rate_at_crossover_point = init_lr * (
+                  (1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
+  else:
+      power = 0.5
+      decayed_learning_rate_at_crossover_point = init_lr
+
+  adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
+  print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (decayed_learning_rate_at_crossover_point, adjusted_init_lr))
+
+  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
+
+  # Implements linear decay of the learning rate.
+  learning_rate = tf.train.polynomial_decay(
+      learning_rate,
+      global_step,
+      num_train_steps,
+      end_learning_rate=0.0,
+      power=power,
+      cycle=False)
+
+  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
+  # learning rate will be `global_step/num_warmup_steps * init_lr`.
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  if optimizer_type == "lamb":
+      print("Initializing LAMB Optimizer")
+      optimizer = LAMBOptimizer(
+          learning_rate=learning_rate,
+          weight_decay_rate=0.01,
+          beta_1=0.9,
+          beta_2=0.999,
+          epsilon=1e-6,
+          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+  else:
+      print("Initializing ADAM Weight Decay Optimizer")
+      # It is recommended that you use this optimizer for fine tuning, since this
+      # is how the model was trained (note that the Adam m/v variables are NOT
+      # loaded from init_checkpoint.)
+      optimizer = AdamWeightDecayOptimizer(
+          learning_rate=learning_rate,
+          weight_decay_rate=0.01,
+          beta_1=0.9,
+          beta_2=0.999,
+          epsilon=1e-4,
+          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
+    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
+
+  optimizer = NPUDistributedOptimizer(optimizer)
+  if tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]:
+    opt_tmp = optimizer
+    if tf.flags.FLAGS.npu_bert_loss_scale == 0:
+      loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(init_loss_scale=tf.flags.FLAGS.init_loss_scale_value, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
+    elif tf.flags.FLAGS.npu_bert_loss_scale >= 1:
+      loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=tf.flags.FLAGS.npu_bert_loss_scale)
+    else:
+      raise ValueError("Invalid loss scale: %d" % tf.flags.FLAGS.npu_bert_loss_scale)
+    optimizer = lso.NPULossScaleOptimizer(opt_tmp, loss_scale_manager, is_distributed=tf.flags.FLAGS.distributed)
+
+  tvars = tf.trainable_variables()
+  grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)
+
+  if num_accumulation_steps > 1:
+      local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
+                                   initializer=tf.zeros_initializer)
+      batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
+                                     initializer=tf.ones_initializer)
+      accum_vars = [tf.get_variable(
+          name=tvar.name.split(":")[0] + "/accum",
+          shape=tvar.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]
+
+      reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
+      local_step = tf.cond(reset_step, lambda:local_step.assign(tf.ones_like(local_step)), lambda:local_step.assign_add(1))
+
+      with tf.name_scope(accumulate_step):
+        grads_and_vars_and_accums = [(gv[0],gv[1],accum_vars[i]) for i, gv in enumerate(grads_and_vars) if gv[0] is not None]
+        grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))
+
+        all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if (tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]) and (manual_fp16 or use_fp16) else tf.constant(True, dtype=tf.bool)
+        batch_finite = tf.cond(reset_step,
+          lambda: batch_finite.assign(tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
+          lambda:batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))
+
+      # This is how the model was pre-trained.
+      # ensure global norm is a finite number
+      # to prevent clip_by_global_norm from having a hizzy fit.
+      if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        (clipped_grads, _) = tf.clip_by_global_norm(
+            grads, clip_norm=1.0,
+            use_norm=tf.cond(
+                all_are_finite,
+                lambda: tf.global_norm(grads),
+                lambda: tf.constant(1.0)))
+      else:
+        with tf.name_scope("clip_grads"):
+          clipped_grads = [
+            (tf.clip_by_norm(grad, clip_norm=1.0))
+            if grad is not None else (grad, var) for grad in grads
+          ]
+
+      accum_vars = tf.cond(reset_step,
+              lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
+              lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])
+
+      def update(accum_vars):
+        with tf.name_scope("opt_update"):
+          if allreduce_post_accumulation and hvd is not None:
+              accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var), compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) if isinstance(accum_var, tf.IndexedSlices)
+                            else hvd.allreduce(accum_var, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) for accum_var in accum_vars]
+          return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)
+
+      update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool), name="update_step")
+      update_op = tf.cond(update_step,
+                          lambda: update(accum_vars), lambda: tf.no_op())
+
+      new_global_step = tf.cond(tf.math.logical_and(update_step, tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)), lambda: global_step+1, lambda: global_step)
+      new_global_step = tf.identity(new_global_step, name='step_update')
+      train_op = tf.group(update_op, [global_step.assign(new_global_step)])
+  else:
+      grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
+      grads, tvars = list(zip(*grads_and_vars))
+ 
+      if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        all_are_finite = tf.reduce_all(
+            [tf.reduce_all(tf.is_finite(g)) for g in grads]) if (tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]) and (use_fp16 or manual_fp16) else tf.constant(True, dtype=tf.bool)
+
+      # This is how the model was pre-trained.
+      # ensure global norm is a finite number
+      # to prevent clip_by_global_norm from having a hizzy fit.
+      if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        (clipped_grads, _) = tf.clip_by_global_norm(
+          grads, clip_norm=1.0,
+          use_norm=tf.cond(
+              all_are_finite,
+              lambda: tf.global_norm(grads),
+              lambda: tf.constant(1.0)))
+      else:
+        with tf.name_scope("clip_grads"):
+          clipped_grads = [
+            (tf.clip_by_norm(grad, clip_norm=1.0))
+            if grad is not None else (grad, var) for grad in grads
+          ]
+      
+      with tf.name_scope("apply_grads"):
+        train_op = optimizer.apply_gradients(
+          list(zip(clipped_grads, tvars)), global_step=global_step)
+
+      #if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+      #  new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
+      #else:
+      #  new_global_step = global_step + 1
+      #new_global_step = tf.identity(new_global_step, name='step_update')
+      #train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-4,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs a AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+      manual_fp16=False):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      with tf.name_scope("apply_one_adam"):
+        if grad is None or param is None:
+          continue
+
+        param_name = self._get_variable_name(param.name)
+        has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+        if has_shadow:
+          # create shadow fp32 weights for fp16 variable
+          param_fp32 = tf.get_variable(
+              name=param_name + "/shadow",
+              dtype=tf.float32,
+              trainable=False,
+              initializer=tf.cast(param.initialized_value(),tf.float32))
+        else:
+          param_fp32 = param
+
+        m = tf.get_variable(
+            name=param_name + "/adam_m",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+        v = tf.get_variable(
+            name=param_name + "/adam_v",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+
+        # Standard Adam update.
+        next_m = (
+            tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+        next_v = (
+            tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                      tf.square(grad)))
+
+        update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+        # Just adding the square of the weights to the loss function is *not*
+        # the correct way of using L2 regularization/weight decay with Adam,
+        # since that will interact with the m and v parameters in strange ways.
+        #
+        # Instead we want to decay the weights in a manner that doesn't interact
+        # with the m/v parameters. This is equivalent to adding the square
+        # of the weights to the loss with plain (non-momentum) SGD.
+        if self._do_use_weight_decay(param_name):
+          update += self.weight_decay_rate * param_fp32
+
+        update_with_lr = self.learning_rate * update
+
+        next_param = param_fp32 - update_with_lr
+
+        if has_shadow:
+          # cast shadow fp32 weights to fp16 and assign to trainable variable
+          param.assign(tf.cast(next_param, param.dtype.base_dtype))
+        assignments.extend(
+            [param_fp32.assign(next_param),
+             m.assign(next_m),
+             v.assign(next_v)])
+    new_global_step = global_step + 1
+    new_global_step = tf.identity(new_global_step, name='step_update')
+    assignments.extend([global_step.assign(new_global_step)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
+
+
+class LAMBOptimizer(tf.train.Optimizer):
+  """A LAMB optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="LAMBOptimizer"):
+    """Constructs a LAMBOptimizer."""
+    super(LAMBOptimizer, self).__init__(False, name)
+
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+    self.steps = 0
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+      manual_fp16=False):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      with tf.name_scope("apply_one_lamb"):
+        if grad is None or param is None:
+          continue
+
+        param_name = self._get_variable_name(param.name)
+        has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+        if has_shadow:
+          # create shadow fp32 weights for fp16 variable
+          param_fp32 = tf.get_variable(
+              name=param_name + "/shadow",
+              dtype=tf.float32,
+              trainable=False,
+              initializer=tf.cast(param.initialized_value(),tf.float32))
+        else:
+          param_fp32 = param
+
+        m = tf.get_variable(
+            name=param_name + "/adam_m",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+        v = tf.get_variable(
+            name=param_name + "/adam_v",
+            shape=param.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer())
+
+        # LAMB update
+        next_m = (
+            tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+        next_v = (
+            tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                      tf.square(grad)))
+
+        self.steps += 1
+        beta1_correction = (1 - self.beta_1 ** self.steps)
+        beta2_correction = (1 - self.beta_2 ** self.steps)
+
+        next_m_unbiased = next_m / beta1_correction
+        next_v_unbiased = next_v / beta2_correction
+
+        update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)
+
+        # Just adding the square of the weights to the loss function is *not*
+        # the correct way of using L2 regularization/weight decay with Adam,
+        # since that will interact with the m and v parameters in strange ways.
+        #
+        # Instead we want to decay the weights in a manner that doesn't interact
+        # with the m/v parameters. This is equivalent to adding the square
+        # of the weights to the loss with plain (non-momentum) SGD.
+        if self._do_use_weight_decay(param_name):
+          update += self.weight_decay_rate * param_fp32
+
+        w_norm = linalg_ops.norm(param, ord=2)
+        g_norm = linalg_ops.norm(update, ord=2)
+        ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
+            math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
+
+        update_with_lr = ratio * self.learning_rate * update
+
+        next_param = param_fp32 - update_with_lr
+
+        if has_shadow:
+          # cast shadow fp32 weights to fp16 and assign to trainable variable
+          param.assign(tf.cast(next_param, param.dtype.base_dtype))
+        assignments.extend(
+            [param_fp32.assign(next_param),
+             m.assign(next_m),
+             v.assign(next_v)])
+    new_global_step = global_step + 1
+    new_global_step = tf.identity(new_global_step, name='step_update')
+    assignments.extend([global_step.assign(new_global_step)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
@@ -0,0 +1,784 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run masked LM/next sentence masked_lm pre-training for BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import modeling
+import optimization
+import tensorflow as tf
+import glob
+from utils import LogEvalRunHook
+from tensorflow.core.protobuf import rewriter_config_pb2
+from gpu_environment import get_custom_getter
+
+from npu_bridge.estimator.npu.npu_config import *
+from npu_bridge.estimator.npu.npu_estimator import *
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
+
+# `sys` is imported explicitly here: the file's import block does not import
+# it, so the call below would otherwise rely on a wildcard import leaking it.
+import sys
+
+# Make the shared benchmark logging package importable; the relative path is
+# resolved against this file's own directory.
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
+# import hwlog
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+
+# Process-wide environment switches, set at import time before any session
+# is created.
+# NOTE(review): presumably consumed by the NPU (GE/HCCL) runtime -- confirm
+# the meaning of each value against the NPU software documentation.
+os.environ['WHICH_OP'] = 'GEOP'
+os.environ['NEW_GE_FE_ID'] = '1'
+os.environ['GE_AICPU_FLAG'] = '1'
+os.environ['GE_USE_STATIC_MEMORY'] = '1'
+os.environ['OPTION_EXEC_HCCL_FLAG'] = '1'
+os.environ['HCCL_CONNECT_TIMEOUT'] = '600'
+
+# Command-line flag definitions. Every flag below is read through `FLAGS`
+# elsewhere in this file.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+## Required parameters
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string(
+    "input_files_dir", "./data",
+    "Directory with input files, comma separated or single directory.")
+
+flags.DEFINE_string(
+    "eval_files_dir", None,
+    "Directory with eval files, comma separated or single directory. ")
+
+flags.DEFINE_string(
+    "output_dir", "./models",
+    "The output directory where the model checkpoints will be written.")
+
+## Other parameters
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string(
+    "optimizer_type", "lamb",
+    "Optimizer used for training - LAMB or ADAM")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded. Must match data generation.")
+
+flags.DEFINE_integer(
+    "max_predictions_per_seq", 20,
+    "Maximum number of masked LM predictions per sequence. "
+    "Must match data generation.")
+
+flags.DEFINE_bool("do_train", True, "Whether to run training.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_integer("train_batch_size", 64, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_float("learning_rate", 1e-4, "The initial learning rate for Adam.")
+
+flags.DEFINE_integer("num_train_steps", 1000000, "Number of training steps.")
+
+flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 10000,
+                     "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("display_loss_steps", 10,
+                     "How often to print loss")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+
+flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
+
+# The two help fragments are concatenated, so the first one must end with a
+# space to keep the sentences apart in --help output.
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update. "
+                     "Global batch size = num_accumulation_steps * train_batch_size")
+
+flags.DEFINE_bool("allreduce_post_accumulation", False, "Whether to all reduce after accumulation of N steps or after each step")
+
+flags.DEFINE_bool(
+    "verbose_logging", False,
+    "If true, all of the trainable parameters are printed")
+
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
+
+flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
+
+flags.DEFINE_bool("manual_fp16", True, "Whether to use fp32 or fp16 arithmetic on GPU. "
+                                        "Manual casting is done instead of using AMP")
+
+flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+
+flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
+
+flags.DEFINE_bool("use_fp16_cls", True, "Whether to use fp16 in cls and pooler.")
+
+flags.DEFINE_bool("distributed", True, "Whether to use multi-npu")
+
+flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')
+
+flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')
+
+flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')
+
+flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")
+
+flags.DEFINE_integer("npu_bert_loss_scale", 0, "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")
+
+flags.DEFINE_bool("npu_bert_clip_by_global_norm", False, "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")
+
+# Help text previously duplicated the gelu flag's description.
+flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined dropout op')
+
+flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')
+
+flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')
+
+flags.DEFINE_integer('init_loss_scale_value', 2**32, 'Initial loss scale value for loss scale optimizer')
+
+# report samples/sec, total loss and learning rate during training
+class _LogSessionRunHook(tf.train.SessionRunHook):
+  """Prints throughput, losses and learning rate after each session run.
+
+  Tensors are fetched by graph name ('global_step:0', 'total_loss:0', ...),
+  so the model/optimizer code must create tensors with exactly these names.
+
+  Args:
+    global_batch_size: number of samples per step, used for the throughput
+      figure.
+    num_accumulation_steps: when greater than 1, 'update_step:0' is fetched
+      as well and a line is printed only when it evaluates truthy.
+    display_every: stored on the instance; not consulted by this hook.
+    hvd_rank: when >= 0, every printed line is prefixed with this rank.
+  """
+
+  def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
+    self.global_batch_size = global_batch_size
+    self.display_every = display_every
+    self.hvd_rank = hvd_rank
+    self.num_accumulation_steps = num_accumulation_steps
+
+  def after_create_session(self, session, coord):
+    """Resets the timing and loss accumulators."""
+    self.elapsed_secs = 0.
+    self.count = 0
+    self.all_count = 0
+    self.avg_loss = 0.0
+
+  def _uses_dynamic_loss_scale(self):
+    """True when loss scale is dynamic (flag value 0) and fp16 is enabled."""
+    return (FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16)
+
+  def _fetch_names(self):
+    """Returns the tensor names to fetch, in a fixed order.
+
+    'apply_grads/All:0' (printed as isFinite) is only fetched when gradient
+    accumulation is off, matching the tensors requested before this refactor.
+    """
+    accumulating = self.num_accumulation_steps > 1
+    names = ['global_step:0']
+    if accumulating:
+      names.append('update_step:0')
+    names.extend(['total_loss:0', 'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0'])
+    if self._uses_dynamic_loss_scale():
+      names.append('loss_scale:0')
+      if not accumulating:
+        names.append('apply_grads/All:0')
+    return names
+
+  def before_run(self, run_context):
+    """Starts the step timer and requests the tensors to report."""
+    self.t0 = time.time()
+    return tf.train.SessionRunArgs(fetches=self._fetch_names())
+
+  def after_run(self, run_context, run_values):
+    """Accumulates timing/loss and prints one report line per update step."""
+    self.elapsed_secs += time.time() - self.t0
+    fetched = dict(zip(self._fetch_names(), run_values.results))
+
+    global_step = fetched['global_step:0']
+    total_loss = fetched['total_loss:0']
+    lr = fetched['learning_rate:0']
+    nsp_loss = fetched['nsp_loss:0']
+    mlm_loss = fetched['mlm_loss:0']
+    # Without accumulation every step is an update step.
+    update_step = fetched.get('update_step:0', True)
+    # Both are None when not fetched. Previously the accumulation path
+    # formatted an unassigned `custom_arg` and raised NameError.
+    loss_scaler = fetched.get('loss_scale:0')
+    is_finite = fetched.get('apply_grads/All:0')
+
+    print_step = global_step + 1 # One-based index for printing.
+    self.avg_loss += total_loss
+    self.all_count += 1
+    if not update_step:
+      return
+
+    self.count += 1
+    dt = self.elapsed_secs / self.count
+    sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
+    avg_loss_step = self.avg_loss / self.all_count
+
+    message = ('Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
+               (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
+    if self.hvd_rank >= 0:
+      message = ('Rank = %2d :: ' % self.hvd_rank) + message
+    if loss_scaler is not None:
+      message += ' Loss scale = %6.4e' % loss_scaler
+    if is_finite is not None:
+      message += ' isFinite = %6i' % is_finite
+    print(message, flush=True)
+    hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
+    hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
+
+    # Start a fresh measurement window after each report.
+    self.elapsed_secs = 0.
+    self.count = 0
+    self.avg_loss = 0.0
+    self.all_count = 0
+
+def model_fn_builder(bert_config, init_checkpoint, learning_rate,
+                     num_train_steps, num_warmup_steps,
+                     use_one_hot_embeddings, hvd=None):
+  """Returns a `model_fn` closure for the estimator built in `main`.
+
+  Args:
+    bert_config: configuration object passed to `modeling.BertModel` and to
+      the two output heads.
+    init_checkpoint: optional checkpoint path; when set, trainable variables
+      are initialized from it (only on rank 0 if `hvd` is given).
+    learning_rate: forwarded to `optimization.create_optimizer`.
+    num_train_steps: forwarded to `optimization.create_optimizer`.
+    num_warmup_steps: forwarded to `optimization.create_optimizer`.
+    use_one_hot_embeddings: forwarded to `modeling.BertModel`.
+    hvd: optional Horovod module; None when Horovod is not used.
+
+  Returns:
+    A function `(features, labels, mode, params) -> tf.estimator.EstimatorSpec`.
+  """
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """Builds the pre-training graph and returns an EstimatorSpec.
+
+    Supports only TRAIN and EVAL modes; any other mode raises ValueError.
+    """
+
+    tf.logging.info("*** Features ***")
+    for name in sorted(features.keys()):
+      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    segment_ids = features["segment_ids"]
+    masked_lm_positions = features["masked_lm_positions"]
+    masked_lm_ids = features["masked_lm_ids"]
+    masked_lm_weights = features["masked_lm_weights"]
+    next_sentence_labels = features["next_sentence_labels"]
+
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    # The encoder computes in fp16 when --manual_fp16 is set, fp32 otherwise.
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=is_training,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=segment_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings,
+        compute_type=tf.float16 if FLAGS.manual_fp16 else tf.float32)
+
+    (masked_lm_loss,
+     masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
+         bert_config, model.get_sequence_output(), model.get_embedding_table(), 
+         masked_lm_positions, masked_lm_ids, 
+         masked_lm_weights)
+
+    (next_sentence_loss, next_sentence_example_loss,
+     next_sentence_log_probs) = get_next_sentence_output(
+         bert_config, model.get_pooled_output(), next_sentence_labels)
+
+    # The explicit names below are the ones _LogSessionRunHook fetches
+    # ('mlm_loss:0', 'nsp_loss:0', 'total_loss:0'); do not rename them.
+    masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
+    next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
+    total_loss = masked_lm_loss + next_sentence_loss
+    total_loss = tf.identity(total_loss, name='total_loss')
+
+    tvars = tf.trainable_variables()
+
+    # Restore from the initial checkpoint; with Horovod only rank 0 does so.
+    initialized_variable_names = {}
+    if init_checkpoint and (hvd is None or hvd.rank() == 0):
+      print("Loading checkpoint", init_checkpoint)
+      (assignment_map, initialized_variable_names
+      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+
+      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    if FLAGS.verbose_logging:
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+          init_string = ""
+          if var.name in initialized_variable_names:
+            init_string = ", *INIT_FROM_CKPT*"
+          tf.logging.info("  %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape,
+                          init_string)
+
+    output_spec = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      train_op = optimization.create_optimizer(
+          total_loss, learning_rate, num_train_steps, num_warmup_steps,
+          hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type, FLAGS.allreduce_post_accumulation)
+
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          train_op=train_op)
+    elif mode == tf.estimator.ModeKeys.EVAL:
+
+      def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
+                    masked_lm_weights, next_sentence_example_loss,
+                    next_sentence_log_probs, next_sentence_labels):
+        """Computes the loss and accuracy of the model.
+
+        Returns a dict with streaming accuracy and mean loss for both the
+        masked-LM head (weighted by `masked_lm_weights`) and the
+        next-sentence head.
+        """
+        masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
+                                         [-1, masked_lm_log_probs.shape[-1]])
+        masked_lm_predictions = tf.argmax(
+            masked_lm_log_probs, axis=-1, output_type=tf.int32)
+        masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
+        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
+        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
+        masked_lm_accuracy = tf.metrics.accuracy(
+            labels=masked_lm_ids,
+            predictions=masked_lm_predictions,
+            weights=masked_lm_weights)
+        masked_lm_mean_loss = tf.metrics.mean(
+            values=masked_lm_example_loss, weights=masked_lm_weights)
+
+        next_sentence_log_probs = tf.reshape(
+            next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
+        next_sentence_predictions = tf.argmax(
+            next_sentence_log_probs, axis=-1, output_type=tf.int32)
+        next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
+        next_sentence_accuracy = tf.metrics.accuracy(
+            labels=next_sentence_labels, predictions=next_sentence_predictions)
+        next_sentence_mean_loss = tf.metrics.mean(
+            values=next_sentence_example_loss)
+
+        return {
+            "masked_lm_accuracy": masked_lm_accuracy,
+            "masked_lm_loss": masked_lm_mean_loss,
+            "next_sentence_accuracy": next_sentence_accuracy,
+            "next_sentence_loss": next_sentence_mean_loss,
+        }
+
+      eval_metric_ops = metric_fn(
+          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
+          masked_lm_weights, next_sentence_example_loss,
+          next_sentence_log_probs, next_sentence_labels
+      )
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          eval_metric_ops=eval_metric_ops)
+    else:
+      raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
+
+    return output_spec
+
+  return model_fn
+
+
+def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
+                         label_ids, label_weights):
+  """Get loss and log probs for the masked LM.
+
+  Args:
+    bert_config: provides `hidden_size`, `hidden_act`, `initializer_range`
+      and `vocab_size`.
+    input_tensor: encoder sequence output; rows at `positions` are gathered
+      with `gather_indexes`.
+    output_weights: matrix multiplied (transposed) with the transformed
+      hidden states to produce vocabulary logits.
+    positions: masked positions within each sequence.
+    label_ids: target token ids, flattened inside this function.
+    label_weights: per-prediction weights, flattened inside this function.
+
+  Returns:
+    Tuple `(loss, per_example_loss, log_probs)` where `loss` is the
+    weight-normalized mean of `per_example_loss`.
+  """
+  input_tensor = gather_indexes(input_tensor, positions)
+
+  with tf.variable_scope("cls/predictions"):
+    # We apply one more non-linear transformation before the output layer.
+    # This matrix is not used after pre-training.
+    with tf.variable_scope("transform", custom_getter=get_custom_getter(compute_type=tf.float16 if FLAGS.use_fp16_cls else tf.float32)):
+      if FLAGS.use_fp16_cls:
+        input_tensor = tf.cast(input_tensor, tf.float16)
+      input_tensor = tf.layers.dense(
+          input_tensor,
+          units=bert_config.hidden_size,
+          activation=modeling.get_activation(bert_config.hidden_act),
+          kernel_initializer=modeling.create_initializer(
+              bert_config.initializer_range))
+      # Layer norm is always applied to an fp32 tensor.
+      input_tensor = tf.cast(input_tensor, tf.float32)
+      input_tensor = modeling.layer_norm(input_tensor)
+
+    # The output weights are the same as the input embeddings, but there is
+    # an output-only bias for each token.
+    output_bias = tf.get_variable(
+        "output_bias",
+        shape=[bert_config.vocab_size],
+        initializer=tf.zeros_initializer())
+    # With --use_fp16_cls the vocabulary matmul runs in fp16 and the logits
+    # are cast back to fp32 before the bias add and softmax.
+    if FLAGS.use_fp16_cls:
+      input_tensor = tf.cast(input_tensor, tf.float16)
+      logits = tf.matmul(input_tensor, tf.cast(output_weights, tf.float16), transpose_b=True)
+      logits = tf.cast(logits, tf.float32)
+    else:
+      logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+    label_ids = tf.reshape(label_ids, [-1])
+    label_weights = tf.reshape(label_weights, [-1])
+
+    one_hot_labels = tf.one_hot(
+        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
+
+    # The `positions` tensor might be zero-padded (if the sequence is too
+    # short to have the maximum number of predictions). The `label_weights`
+    # tensor has a value of 1.0 for every real prediction and 0.0 for the
+    # padding predictions.
+    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
+    numerator = tf.reduce_sum(label_weights * per_example_loss)
+    # The small constant avoids division by zero when all weights are 0.
+    denominator = tf.reduce_sum(label_weights) + 1e-5
+    loss = numerator / denominator
+
+  return (loss, per_example_loss, log_probs)
+
+
+def get_next_sentence_output(bert_config, input_tensor, labels):
+  """Get loss and log probs for the next sentence prediction.
+
+  Args:
+    bert_config: provides `hidden_size` and `initializer_range`.
+    input_tensor: pooled encoder output fed to the 2-way classifier.
+    labels: next-sentence labels, flattened inside this function.
+
+  Returns:
+    Tuple `(loss, per_example_loss, log_probs)` where `loss` is the mean of
+    `per_example_loss`.
+  """
+
+  # Simple binary classification. Note that 0 is "next sentence" and 1 is
+  # "random sentence". This weight matrix is not used after pre-training.
+  with tf.variable_scope("cls/seq_relationship"):
+    output_weights = tf.get_variable(
+        "output_weights",
+        shape=[2, bert_config.hidden_size],
+        initializer=modeling.create_initializer(bert_config.initializer_range))
+    output_bias = tf.get_variable(
+        "output_bias", shape=[2], initializer=tf.zeros_initializer())
+
+    # With --use_fp16_cls the matmul runs in fp16 and the logits are cast
+    # back to fp32 before the bias add and softmax.
+    if FLAGS.use_fp16_cls:
+      input_tensor = tf.cast(input_tensor, tf.float16)
+      logits = tf.matmul(input_tensor, tf.cast(output_weights, tf.float16), transpose_b=True)
+      logits = tf.cast(logits, tf.float32)
+    else:
+      logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+    labels = tf.reshape(labels, [-1])
+    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
+    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+    loss = tf.reduce_mean(per_example_loss)
+    return (loss, per_example_loss, log_probs)
+
+
+def gather_indexes(sequence_tensor, positions):
+  """Picks the vectors at `positions` out of every sequence in the batch.
+
+  The rank-3 `sequence_tensor` is flattened to two dimensions, and each
+  per-sequence position is shifted by its row's starting offset so a single
+  `tf.gather` can serve the whole minibatch.
+  """
+  batch_size, seq_length, width = modeling.get_shape_list(
+      sequence_tensor, expected_rank=3)
+
+  # Offset of the first token of every sequence in the flattened tensor.
+  row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
+  absolute_positions = tf.reshape(
+      positions + tf.reshape(row_starts, [-1, 1]), [-1])
+  flattened = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
+  return tf.gather(flattened, absolute_positions)
+
+
+def input_fn_builder(input_files,
+                     batch_size,
+                     max_seq_length,
+                     max_predictions_per_seq,
+                     is_training,
+                     num_cpu_threads=4,
+                     hvd=None):
+  """Creates an `input_fn` closure to be passed to Estimator.
+
+  Args:
+    input_files: list of TFRecord file paths.
+    batch_size: batch size of the returned dataset.
+    max_seq_length: length of the per-token features in each record.
+    max_predictions_per_seq: length of the masked-LM features in each record.
+    is_training: when True, files are sharded across ranks (if
+      --distributed), repeated, shuffled and interleaved; otherwise they are
+      read sequentially and repeated.
+    num_cpu_threads: upper bound for the interleave cycle length and the
+      number of parallel batches.
+    hvd: accepted for interface compatibility; not used here.
+
+  Returns:
+    A zero-argument function returning a batched `tf.data.Dataset`.
+  """
+
+  def input_fn():
+    """The actual input function."""
+
+    name_to_features = {
+        "input_ids":
+            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "input_mask":
+            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "segment_ids":
+            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "masked_lm_positions":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_ids":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_weights":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
+        "next_sentence_labels":
+            tf.FixedLenFeature([1], tf.int64),
+    }
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    if is_training:
+      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
+      if FLAGS.distributed:
+        # Each rank reads a disjoint shard of the file list; RANK_SIZE and
+        # RANK_ID must be set in the environment for distributed runs.
+        rank_size = int(os.getenv('RANK_SIZE'))
+        rank_id = int(os.getenv('RANK_ID'))
+        print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
+        d = d.shard(rank_size, rank_id)
+      d = d.repeat()
+      if not FLAGS.npu_bert_debug:
+        d = d.shuffle(buffer_size=len(input_files))
+
+      # `cycle_length` is the number of parallel files that get read.
+      if not FLAGS.npu_bert_debug:
+        # RANK_SIZE defaults to 1 so runs without the variable set do not
+        # crash on int(None). The result is clamped to at least 1 because
+        # interleave needs a positive cycle_length, and the quotient is 0
+        # when there are fewer files than ranks.
+        files_per_rank = len(input_files) // int(os.getenv('RANK_SIZE', '1'))
+        cycle_length = max(1, min(num_cpu_threads, files_per_rank))
+      else:
+        cycle_length = 1
+
+      # Interleave records from `cycle_length` files at a time.
+      d = d.interleave(
+          tf.data.TFRecordDataset,
+          cycle_length=cycle_length,
+          num_parallel_calls=tf.data.experimental.AUTOTUNE)
+      if not FLAGS.npu_bert_debug:
+        d = d.shuffle(buffer_size=100)
+    else:
+      d = tf.data.TFRecordDataset(input_files)
+      # Since we evaluate for a fixed number of steps we don't want to encounter
+      # out-of-range exceptions.
+      d = d.repeat()
+
+    # Batches always have a fixed size: `drop_remainder=True` is applied for
+    # both training and eval.
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            num_parallel_batches=num_cpu_threads,
+            drop_remainder=True))
+    return d
+
+  return input_fn
+
+
+def _decode_record(record, name_to_features):
+  """Parses one serialized record and narrows its int64 features to int32."""
+  parsed = tf.parse_single_example(record, name_to_features)
+
+  # tf.Example stores integers as int64; every such feature is cast to
+  # int32, all other features are passed through unchanged.
+  for key in list(parsed.keys()):
+    tensor = parsed[key]
+    parsed[key] = tf.to_int32(tensor) if tensor.dtype == tf.int64 else tensor
+
+  return parsed
+
+
+def main(_):
+  """Runs BERT pre-training and/or evaluation as selected by FLAGS.
+
+  Builds the NPU run config and estimator, trains when --do_train is set,
+  and evaluates when --do_eval is set, writing the metrics to
+  `<output_dir>/eval_results.txt` and to the benchmark log.
+
+  Raises:
+    ValueError: if neither --do_train nor --do_eval is set, if Horovod is
+      used with fewer input files than workers, or if both --use_fp16 and
+      --manual_fp16 are enabled.
+  """
+  for name in FLAGS.__flags.keys():
+    print("name:", name, "      ", FLAGS[name].value)
+
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if not FLAGS.do_train and not FLAGS.do_eval:
+    raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+  if FLAGS.use_fp16:
+    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+
+  if FLAGS.horovod:
+    import horovod.tensorflow as hvd
+    hvd.init()
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  # Allreduce gradient-split strategy, chosen per encoder depth.
+  # NOTE(review): the index lists presumably depend on the number and order
+  # of trainable variables -- confirm before changing the model graph.
+  if FLAGS.npu_gather:
+    if FLAGS.distributed and bert_config.num_hidden_layers == 24:
+      from hccl.split.api import set_split_strategy_by_size
+      set_split_strategy_by_size([10,10,10,10,15,15,15,15])
+    if FLAGS.distributed and bert_config.num_hidden_layers == 12:
+      from hccl.split.api import set_split_strategy_by_idx
+      set_split_strategy_by_idx([8,56,104,152,200,205])
+    if FLAGS.distributed and bert_config.num_hidden_layers == 6:
+      from hccl.split.api import set_split_strategy_by_idx
+      set_split_strategy_by_idx([8,40,72,104,109])
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  input_files = []
+  for input_file_dir in FLAGS.input_files_dir.split(","):
+    input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
+
+  input_files.sort()
+  print("Input Files:", input_files)
+
+  if FLAGS.horovod and len(input_files) < hvd.size():
+      raise ValueError("Input Files must be sharded")
+  if FLAGS.use_fp16 and FLAGS.manual_fp16:
+      raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  config = tf.ConfigProto()
+  if FLAGS.horovod:
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+    if hvd.rank() == 0:
+      tf.logging.info("***** Configuaration *****")
+      for key in FLAGS.__flags.keys():
+          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+      tf.logging.info("**************************")
+
+  if FLAGS.use_xla:
+      config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+      config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
+
+  run_config = NPURunConfig(
+      model_dir=FLAGS.output_dir,
+      save_summary_steps=0,
+      session_config=config,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
+      # This variable controls how often estimator reports examples/sec.
+      # NOTE(review): an earlier comment here described using a very large
+      # value when --report_loss is True, but the code passes 1 -- confirm
+      # which is intended.
+      log_step_count_steps=1 if FLAGS.report_loss else 100,
+      enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
+      iterations_per_loop=FLAGS.iterations_per_loop,
+      hcom_parallel=FLAGS.hcom_parallel)
+
+  if FLAGS.distributed:
+    rank_size = int(os.getenv('RANK_SIZE'))
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=FLAGS.num_train_steps,
+      num_warmup_steps=FLAGS.num_warmup_steps,
+      use_one_hot_embeddings=False,
+      hvd=None if not FLAGS.horovod else hvd)
+
+  training_hooks = []
+  if FLAGS.report_loss:
+    # `rank_size` is only defined (and only read) when --distributed is set.
+    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.distributed else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
+    training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
+
+  estimator = NPUEstimator(
+      model_fn=model_fn,
+      config=run_config,
+      job_start_file=FLAGS.npu_bert_job_start_file)
+
+  if FLAGS.do_train:
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    train_input_fn = input_fn_builder(
+        input_files=input_files,
+        batch_size=FLAGS.train_batch_size,
+        max_seq_length=FLAGS.max_seq_length,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        is_training=True,
+        hvd=None if not FLAGS.horovod else hvd)
+
+    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
+
+  if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    eval_files = []
+    for eval_file_dir in FLAGS.eval_files_dir.split(","):
+        eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))
+
+    eval_input_fn = input_fn_builder(
+        input_files=eval_files,
+        batch_size=FLAGS.eval_batch_size,
+        max_seq_length=FLAGS.max_seq_length,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        is_training=False,
+        hvd=None if not FLAGS.horovod else hvd)
+
+    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
+    eval_start_time = time.time()
+    result = estimator.evaluate(
+        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
+
+    eval_time_elapsed = time.time() - eval_start_time
+    eval_time_wo_overhead = eval_hooks[-1].total_time
+
+    num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
+
+    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
+
+    tf.logging.info("-----------------------------")
+    tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
+                    eval_hooks[-1].count * FLAGS.eval_batch_size)
+    tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
+                    (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
+    tf.logging.info("Summary Inference Statistics on EVAL set")
+    tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
+    tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+    tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+    tf.logging.info("-----------------------------")
+
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+        # Mirror the known metrics into the benchmark log. The keys must
+        # match the metric names exactly: the next-sentence keys used to
+        # carry a trailing space and therefore never matched.
+        if key == 'masked_lm_accuracy':
+             hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
+        elif key == 'next_sentence_accuracy':
+             hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
+        elif key == 'global_step':
+             hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
+        elif key == 'loss':
+             hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
+        elif key == 'masked_lm_loss':
+             hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
+        elif key == 'next_sentence_loss':
+             hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
+
+
+# Script entry point: record environment/benchmark metadata through hwlog,
+# declare the required flags, then hand control to `main` via tf.app.run().
+if __name__ == "__main__":
+  # Root the benchmark log at the directory containing this script.
+  hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+  cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+  config_info = get_model_parameter("tensorflow_config")
+  # Static metadata written to the benchmark log below.
+  # NOTE(review): these values are hard-coded literals and are not derived
+  # from FLAGS (e.g. --learning_rate, --optimizer_type) -- confirm they
+  # describe the actual run.
+  initinal_data = {"base_lr": 0.01, "dataset": "cn-clue/en-wiki", "optimizer": "Adam", "loss_scale": 512}
+  # These flags must be supplied on the command line.
+  flags.mark_flag_as_required("input_files_dir")
+  flags.mark_flag_as_required("eval_files_dir")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  flags.mark_flag_as_required("npu_bert_job_start_file")
+  hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+  hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+  hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+  hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+  hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+  hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+  hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+  hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+  hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+  hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+  # NOTE(review): FLAGS is read here before tf.app.run() is called; this
+  # presumably relies on the tf.flags wrapper parsing argv on first access
+  # -- confirm for the TensorFlow version in use.
+  if FLAGS.use_xla and FLAGS.manual_fp16:
+    print('WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.')
+    print('         This warning message will be removed when the underlying')
+    print('         issues have been fixed and you are running a TF version')
+    print('         that has that fix.')
+  tf.app.run()
@@ -0,0 +1,215 @@
+"""
+Multiclass
+from: 
+https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
+
+"""
+
+__author__ = "Guillaume Genthial"
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
+
+
+def precision(labels, predictions, num_classes, pos_indices=None,
+              weights=None, average='micro'):
+    """Multi-class precision metric for Tensorflow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infer the metric from it.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    pr, _, _ = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average)
+    op, _, _ = metrics_from_confusion_matrix(
+        op, pos_indices, average=average)
+    return (pr, op)
+
+
+def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
+           average='micro'):
+    """Multi-class recall metric for Tensorflow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infer the metric from it.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    _, re, _ = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average)
+    _, op, _ = metrics_from_confusion_matrix(
+        op, pos_indices, average=average)
+    return (re, op)
+
+
+def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
+       average='micro'):
+    return fbeta(labels, predictions, num_classes, pos_indices, weights,
+                 average)
+
+
+def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
+          average='micro', beta=1):
+    """Multi-class fbeta metric for Tensorflow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infer the metric from it.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    beta : int, optional
+        Weight of precision in harmonic mean
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    _, _, fbeta = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average, beta=beta)
+    _, _, op = metrics_from_confusion_matrix(
+        op, pos_indices, average=average, beta=beta)
+    return (fbeta, op)
+
+
+def safe_div(numerator, denominator):
+    """Safe division, return 0 if denominator is 0"""
+    numerator, denominator = tf.to_float(numerator), tf.to_float(denominator)
+    zeros = tf.zeros_like(numerator, dtype=numerator.dtype)
+    denominator_is_zero = tf.equal(denominator, zeros)
+    return tf.where(denominator_is_zero, zeros, numerator / denominator)
+
+
+def pr_re_fbeta(cm, pos_indices, beta=1):
+    """Uses a confusion matrix to compute precision, recall and fbeta"""
+    num_classes = cm.shape[0]
+    neg_indices = [i for i in range(num_classes) if i not in pos_indices]
+    cm_mask = np.ones([num_classes, num_classes])
+    cm_mask[neg_indices, neg_indices] = 0
+    diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask))
+
+    cm_mask = np.ones([num_classes, num_classes])
+    cm_mask[:, neg_indices] = 0
+    tot_pred = tf.reduce_sum(cm * cm_mask)
+
+    cm_mask = np.ones([num_classes, num_classes])
+    cm_mask[neg_indices, :] = 0
+    tot_gold = tf.reduce_sum(cm * cm_mask)
+
+    pr = safe_div(diag_sum, tot_pred)
+    re = safe_div(diag_sum, tot_gold)
+    fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re)
+
+    return pr, re, fbeta
+
+
+def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro',
+                                  beta=1):
+    """Precision, Recall and F1 from the confusion matrix
+    Parameters
+    ----------
+    cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes)
+        The streaming confusion matrix.
+    pos_indices : list of int, optional
+        The indices of the positive classes
+    beta : int, optional
+        Weight of precision in harmonic mean
+    average : str, optional
+        'micro', 'macro' or 'weighted'
+    """
+    num_classes = cm.shape[0]
+    if pos_indices is None:
+        pos_indices = [i for i in range(num_classes)]
+
+    if average == 'micro':
+        return pr_re_fbeta(cm, pos_indices, beta)
+    elif average in {'macro', 'weighted'}:
+        precisions, recalls, fbetas, n_golds = [], [], [], []
+        for idx in pos_indices:
+            pr, re, fbeta = pr_re_fbeta(cm, [idx], beta)
+            precisions.append(pr)
+            recalls.append(re)
+            fbetas.append(fbeta)
+            cm_mask = np.zeros([num_classes, num_classes])
+            cm_mask[idx, :] = 1
+            n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask)))
+
+        if average == 'macro':
+            pr = tf.reduce_mean(precisions)
+            re = tf.reduce_mean(recalls)
+            fbeta = tf.reduce_mean(fbetas)
+            return pr, re, fbeta
+        if average == 'weighted':
+            n_gold = tf.reduce_sum(n_golds)
+            pr_sum = sum(p * n for p, n in zip(precisions, n_golds))
+            pr = safe_div(pr_sum, n_gold)
+            re_sum = sum(r * n for r, n in zip(recalls, n_golds))
+            re = safe_div(re_sum, n_gold)
+            fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds))
+            fbeta = safe_div(fbeta_sum, n_gold)
+            return pr, re, fbeta
+
+    else:
+        raise NotImplementedError()
@@ -0,0 +1,451 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import unicodedata
+import six
+import tensorflow as tf
+import re
+import os
+
+
+# Maps a pretrained-model shortcut name to the URL of its vocabulary file.
+# Looked up by BertTokenizer.from_pretrained; names not listed here are
+# treated there as a path or URL.
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+}
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-training. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+    if isinstance(text, str):
+        return text
+    elif isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    else:
+        raise ValueError("Unsupported string type: %s" % (type(text)))
+
+
+def printable_text(text):
+    """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+    # These functions want `str` for both Python2 and Python3, but in one case
+    # it's a Unicode string and in the other it's a byte string.
+    if isinstance(text, str):
+        return text
+    elif isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    else:
+        raise ValueError("Unsupported string type: %s" % (type(text)))
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    index = 0
+    with open(vocab_file, "r") as reader:
+        while True:
+            token = convert_to_unicode(reader.readline())
+            if not token:
+                break
+            token = token.strip()
+            vocab[token] = index
+            index += 1
+    return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a peice of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenziation."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BertTokenizer(object):
+    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+
+    def __init__(self, vocab_file, do_lower_case=True):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+    def tokenize(self, text):
+        split_tokens = []
+        for token in self.basic_tokenizer.tokenize(text):
+            for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                split_tokens.append(sub_token)
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab."""
+        ids = []
+        for token in tokens:
+            ids.append(self.vocab[token])
+        return ids
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids in wordpiece tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            tokens.append(self.ids_to_tokens[i])
+        return tokens
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, do_lower_case=True):
+        """
+    Instantiate a PreTrainedBertModel from a pre-trained model file.
+    Download and cache the pre-trained model file if needed.
+    """
+        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            vocab_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file)
+            if resolved_vocab_file == vocab_file:
+
+                logger.info("loading vocabulary file {}".format(vocab_file))
+            else:
+                logger.info("loading vocabulary file {} from cache at {}".format(
+                    vocab_file, resolved_vocab_file))
+            # Instantiate tokenizer.
+            tokenizer = cls(resolved_vocab_file, do_lower_case)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    pretrained_model_name))
+            tokenizer = None
+        return tokenizer
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text."""
+        text = convert_to_unicode(text)
+        text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer.
+
+    Returns:
+      A list of wordpiece tokens.
+    """
+
+        text = convert_to_unicode(text)
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically contorl characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `chars` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `chars` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
@@ -0,0 +1,62 @@
+import tensorflow as tf
+import time
+
+# report latency and throughput during eval
+class LogEvalRunHook(tf.train.SessionRunHook):
+  def __init__(self, global_batch_size, hvd_rank=-1):
+    self.global_batch_size = global_batch_size
+    self.hvd_rank = hvd_rank
+    self.total_time = 0.0
+    self.count = 0
+    self.skipped = 0
+    self.time_list = []
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+
+  def after_run(self, run_context, run_values):
+    elapsed_secs = time.time() - self.t0
+    self.count += 1
+
+    # Removing first 2 (arbitrary) number of startup iterations from perf evaluations
+    if self.count <= 2:
+      print("Skipping time record for ", self.count, " due to overhead")
+      self.skipped += 1
+    else:
+      self.time_list.append(elapsed_secs)
+      self.total_time += elapsed_secs
+
+# report throughput during training
+class LogTrainRunHook(tf.train.SessionRunHook):
+  """Session hook that accumulates per-run wall time during training.
+
+  Run calls that fall on, or one step after, a multiple of
+  `save_checkpoints_steps` (counted from the initial global step) are
+  excluded from `total_time`.
+  """
+
+  def __init__(self, global_batch_size, hvd_rank=-1, save_checkpoints_steps=1000):
+    self.global_batch_size = global_batch_size
+    self.hvd_rank = hvd_rank
+    self.save_checkpoints_steps = save_checkpoints_steps
+
+    self.total_time = 0.0
+    self.count = 0 # Holds number of iterations, including skipped iterations for fp16 loss scaling
+
+  def after_create_session(self, session, coord):
+    # Remember the global step the session starts from.
+    self.init_global_step = session.run(tf.train.get_global_step())
+
+  def before_run(self, run_context):
+    self.t0 = time.time()
+    # Also fetch the tensor named 'step_update:0'; after_run reads its value
+    # as the current global step.
+    return tf.train.SessionRunArgs(
+        fetches=['step_update:0'])
+
+  def after_run(self, run_context, run_values):
+    elapsed_secs = time.time() - self.t0
+    self.global_step = run_values.results[0]
+    self.count += 1
+
+    # Removing first step + first two steps after every checkpoint save
+    # (the remainder below is 0 or 1 for those steps).
+    if (self.global_step - self.init_global_step) % self.save_checkpoints_steps <= 1:
+      print("Skipping time record for ", self.global_step, " due to checkpoint-saving/warmup overhead")
+    else:
+      self.total_time += elapsed_secs
+
+  def end(self, session):
+    # NOTE(review): self.global_step is only assigned in after_run, so this
+    # raises AttributeError if the session ends before any run call.
+    num_global_steps = self.global_step - self.init_global_step
+
+    # Two skipped steps per full checkpoint interval, plus up to two in the
+    # trailing partial interval, minus one.
+    self.skipped = (num_global_steps // self.save_checkpoints_steps) * 2 + \
+                   min(2, num_global_steps % self.save_checkpoints_steps) - 1
@@ -0,0 +1,14 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [
+            {
+                "device_id": "0",
+                "device_ip": "192.168.10.101",
+                "rank_id": "0"
+            }],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,49 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [
+            {
+                "device_id": "0",
+                "device_ip": "192.168.10.101",
+                "rank_id": "0"
+            },
+            {
+                "device_id": "1",
+                "device_ip": "192.168.11.101",
+                "rank_id": "1"
+            },
+            {
+                "device_id": "2",
+                "device_ip": "192.168.12.101",
+                "rank_id": "2"
+            },
+            {
+                "device_id": "3",
+                "device_ip": "192.168.13.101",
+                "rank_id": "3"
+            },
+            {
+                "device_id": "4",
+                "device_ip": "192.168.10.100",
+                "rank_id": "4"
+            },
+            {
+                "device_id": "5",
+                "device_ip": "192.168.11.100",
+                "rank_id": "5"
+            },
+            {
+                "device_id": "6",
+                "device_ip": "192.168.12.100",
+                "rank_id": "6"
+            },
+            {
+                "device_id": "7",
+                "device_ip": "192.168.13.100",
+                "rank_id": "7"
+            }],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 2,
+  "vocab_size": 21136
+}
+
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
+
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "type_vocab_size": 2,
+  "vocab_size": 21136
+}
+
@@ -0,0 +1,14 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
+
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Ascend NPU environment setup. $utilDir is appended to PYTHONPATH below and is expected to be set by the caller.
+# Commented-out reference settings for an ascend-toolkit installation; the active values are chosen by the if/else below.
+#export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
+#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
+#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+
+# Commented-out reference settings for an nnae installation; the active values are chosen by the if/else below.
+#export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/
+#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
+#export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
+#export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+
+if [ -d /usr/local/Ascend/nnae/latest ];then  # use the nnae paths when that directory exists, otherwise the ascend-toolkit paths
+	export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:$utilDir
+	export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+else
+	export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$utilDir
+	export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+	
+fi
+
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+#export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=0
+export HCCL_CONNECT_TIMEOUT=600
+
+
+# System settings (core dumps are left disabled)
+#ulimit -c unlimited
@@ -0,0 +1,67 @@
+#!/bin/bash
+rank_size=$1
+yamlPath=$2
+toolsPath=$3
+if [ -f /.dockerenv ];then
+        CLUSTER=$4
+        MPIRUN_ALL_IP="$5"
+        export CLUSTER=${CLUSTER}
+fi
+
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+currtime=`date +%Y%m%d%H%M%S`
+mkdir -p ${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+train_job_dir=${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] see more config info in ${currentDir}/config"
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train result in ${train_job_dir}"
+
+# Load the "tensorflow_config" section of the yaml file; the helper's output is eval'ed, so the yaml must be trusted
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+# Device list: device_group_<rank_size>p from the config; when it is unset, or rank_size >= 8, use 0..rank_size-1 in order
+eval device_group=\$device_group_${rank_size}p
+if [ -z "${device_group}" ] || [ "${rank_size}" -ge 8 ];then
+    device_group="$(seq 0 $((rank_size - 1)))"
+fi
+
+# First device id in device_group: the hw log symlink created below points into the directory named after it.
+# The whole first field is taken (not only its first character), so multi-digit ids are not truncated.
+read -r first_device_id _ <<< "$(echo ${device_group})"
+
+# User environment for the launcher process: job id, rank table matching rank_size, and device 0 as the default device
+export JOB_ID=9999001
+export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=0
+export DEVICE_INDEX=$DEVICE_ID
+
+if [ x"${CLUSTER}" == x"True" ];then
+    # Symlink <job dir>/hw_bert.log to the hw_bert.log under sub-directory 0
+    ln -snf ${train_job_dir}/0/hw_bert.log ${train_job_dir}
+    this_ip=$(hostname -I |awk '{print $1}')  # first address reported by hostname -I
+    for ip in $MPIRUN_ALL_IP;do  # copy the yaml config to every node other than this one
+        if [ x"$ip" != x"$this_ip" ];then
+            scp $yamlPath root@$ip:$yamlPath
+        fi
+    done
+    export PATH=$PATH:/usr/local/mpirun4.0/bin
+    mpirun -H ${mpirun_ip} \
+    --bind-to none -map-by slot\
+    --allow-run-as-root \
+    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
+    --prefix /usr/local/mpirun4.0/ \
+    ${currentDir}/scripts/train.sh 0 $currtime $yamlPath 0 True ${toolsPath} ${rank_size}
+else
+    # Symlink <job dir>/hw_bert.log to the hw_bert.log of the first device in device_group
+    ln -snf ${train_job_dir}/${first_device_id}/hw_bert.log ${train_job_dir}
+    rank_id=0
+    for device_id in ${device_group};do
+      # Start one train.sh per device in the background; rank ids are assigned in list order
+      ${currentDir}/scripts/train.sh $device_id $currtime $yamlPath $rank_id solo ${toolsPath} ${rank_size} &
+      let rank_id++
+    done
+fi
+wait  # block until every background train.sh started above has finished
+
+
@@ -0,0 +1,157 @@
+#!/bin/bash
+# 0 $currtime $yamlPath  0 cluster ${toolsPath}
+device_id=$1
+currtime=$2
+yamlPath=$3
+toolsPath=$6
+rank_size=$7
+
+
+export YAML_PATH=$3
+
+mainDir=$(cd "$(dirname "$0")/.."; pwd)
+
+mkdir -p ${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+export train_job_dir=${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
+
+
+#exec_path=${train_job_dir}
+
+cd ${train_job_dir}
+
+export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils"; pwd)
+export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils/atlasboost"; pwd)
+source ${mainDir}/config/npu_set_env.sh
+
+
+# 从 yaml 获取配置
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+# 声明变量
+export REMARK_LOG_FILE=hw_bert.log  # 打点日志文件名称， 必须hw_后跟模型名称小写
+# 添加日志打点模块路径
+benchmark_log_path=${mainDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
+export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
+
+export JOB_ID=9999001
+export RANK_TABLE_FILE=${mainDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=${device_id}
+export DEVICE_INDEX=$DEVICE_ID
+export RANK_INDEX=0
+
+
+export PROFILING_OPTIONS=${PROFILING_OPTIONS}
+export FP_POINT=${FP_POINT}
+export BP_POINT=${BP_POINT}
+
+if [ ${PROFILING_MODE} == True ];
+then
+	export PROFILING_MODE=true
+else
+	export PROFILING_MODE=false
+fi
+
+if [ ${PROFILING_MODE} == True ];
+then
+	export AICPU_PROFILING_MODE=true
+else
+        export AICPU_PROFILING_MODE=false
+fi
+
+
+if  [ x"${device_id}" = x ] ;
+then
+    echo "turing train fail" >> ${exec_path}/train_${device_id}.log
+    exit
+else
+    export DEVICE_ID=${device_id}
+fi
+
+
+env > ${currentDir}/env_${device_id}.log
+
+cd ${train_job_dir}
+
+if [ x"$5" != x"True" ];then  # not launched with the cluster flag: the caller passes the rank id as $4
+        rank_id=$4
+        export RANK_ID=$4
+else
+        device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
+                device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
+                atlasboost.set_device_id(device_id);print(atlasboost.rank())")
+        device_id_mo=`echo $device_id_mo`  # collapse the captured output onto one line
+        rank_id=${device_id_mo##* }
+        # The last word of the captured output is the value printed by print(atlasboost.rank()).
+        export RANK_ID=${rank_id}
+        device=${device_id_mo##*deviceid = }
+        device_id=${device%% phyid=*}
+        export DEVICE_ID=${device_id}
+        # NOTE(review): the two expansions above assume the output contains "deviceid = <id> phyid=..." — confirm against atlasboost's log format.
+        hccljson=${train_job_dir}/*.json
+        cp ${hccljson} ${mainDir}/config/${rank_size}p.json
+fi
+# currentDir is never assigned in this script; unless the caller exported it, write into the job directory instead of "/".
+env > "${currentDir:-${train_job_dir}}/env_${device_id}.log"
+
+
+mkdir -p ${train_job_dir}/${device_id}/ckpt${DEVICE_ID}
+cd ${train_job_dir}/${device_id}
+
+startTime=`date +%Y%m%d-%H:%M:%S`
+startTime_s=`date +%s`  # start time in epoch seconds, used for the total-time report after training
+
+# Launch pre-training with the hyper-parameters loaded from the yaml config. stdout and stderr go to
+# <job dir>/train_<device_id>.log; checkpoints go to <job dir>/<device_id>/ckpt<DEVICE_ID>.
+python3.7 ${mainDir}/code/pretrain/run_pretraining.py \
+    --bert_config_file=${mainDir}/config/${bert_config_file} \
+    --max_seq_length=${max_seq_length} \
+    --max_predictions_per_seq=${max_predictions_per_seq} \
+    --train_batch_size=${train_batch_size} \
+    --learning_rate=${learning_rate} \
+    --num_warmup_steps=${num_warmup_steps} \
+    --num_train_steps=${num_train_steps} \
+    --optimizer_type=${optimizer_type} \
+    --manual_fp16=${manual_fp16} \
+    --use_fp16_cls=${use_fp16_cls} \
+    --input_files_dir=${input_files_dir} \
+    --eval_files_dir=${eval_files_dir} \
+    --npu_bert_debug=${npu_bert_debug} \
+    --npu_bert_use_tdt=${npu_bert_use_tdt} \
+    --do_train=${do_train} \
+    --do_eval=${do_eval} \
+    --num_accumulation_steps=${num_accumulation_steps} \
+    --npu_bert_job_start_file=None \
+    --iterations_per_loop=${iterations_per_loop} \
+    --npu_bert_loss_scale=${npu_bert_loss_scale} \
+    --distributed=${distributed} \
+    --save_checkpoints_steps=${save_checkpoints_steps} \
+    --npu_bert_clip_by_global_norm=${npu_bert_clip_by_global_norm} \
+    --output_dir=${train_job_dir}/${device_id}/ckpt${DEVICE_ID} > ${train_job_dir}/train_${device_id}.log 2>&1
+
+
+# $? is still the exit status of the training command: report it on stdout, in the training log and in the marker log.
+if [ $? -eq 0 ];then
+    train_result="success"
+else
+    train_result="failed"
+fi
+echo ":::ABK 1.0.0 bert train ${train_result}"
+echo ":::ABK 1.0.0 bert train ${train_result}" >> ${train_job_dir}/train_${device_id}.log
+echo ":::ABK 1.0.0 bert train ${train_result}" >> ${train_job_dir}/${device_id}/hw_bert.log
+
+endTime=`date +%Y%m%d-%H:%M:%S`
+endTime_s=`date +%s`
+sumTime=$(( endTime_s - startTime_s ))  # $(( )) replaces the deprecated $[ ] arithmetic form
+hour=$(( sumTime / 3600 ))
+min=$(( (sumTime - hour * 3600) / 60 ))
+sec=$(( sumTime - hour * 3600 - min * 60 ))
+echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}"
+echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_bert.log
+
+#if [ x"$5" == x"solo" ];
+#then
+#    /bin/cp -f hw_bert.log $perfDir/hw_bert.log
+#fi
@@ -0,0 +1,54 @@
+# Bert-Large_tensorflow训练说明
+
+### 1. 模型训练参数配置
+
+在train/yaml/Bert-Large.yaml中修改相应配置， 配置项含义:
+
+```
+tensorflow_config:
+    #中文数据用 bert_config_large_cn.json 英文用bert_config_large_en.json
+    bert_config_file: bert_config_large_cn.json
+    #数据集句子长度为256时，max_seq_length 和 max_predictions_per_seq 分别设置为 256 和 40；句子长度为128时分别设置为 128 和 20
+    max_seq_length: 128
+    max_predictions_per_seq: 20
+    
+    # 最佳性能train_batch_size为96，如果超显存，可调小至32 
+    train_batch_size: 96
+    learning_rate: 3.125e-5
+    num_warmup_steps: 100
+    num_train_steps: 1000
+    optimizer_type: adam
+    manual_fp16: True
+    use_fp16_cls: True
+    input_files_dir: /home/BertData/cn-wiki-128/
+    eval_files_dir: /home/BertData/cn-wiki-128/ 
+    do_train: True
+    do_eval: True
+    num_accumulation_steps: 1
+    iterations_per_loop: 100
+    npu_bert_loss_scale: 0
+    save_checkpoints_steps: 1000
+    npu_bert_clip_by_global_norm: False
+
+    # docker 镜像名称:版本号
+    docker_image: c73:b021
+
+    # 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
+    mpirun_ip: 90.90.140.199:8,90.90.140.229:8
+
+    # 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
+    device_group_1p: 0
+    device_group_2p: 0 1
+    device_group_4p: 0 1 2 3
+    
+```
+
+------
+
+
+
+
+
+
+
+    
@@ -0,0 +1,31 @@
+# How to Contribute
+
+BERT needs to maintain permanent compatibility with the pre-trained model files,
+so we do not plan to make any major changes to this library (other than what was
+promised in the README). However, we can accept small patches related to
+re-factoring and documentation. To submit contributions, there are just a few
+small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
@@ -0,0 +1,31 @@
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.08-py3
+# Stage "trt": image expected to provide the TensorRT Inference Server client under /workspace/install.
+FROM tensorrtserver_client AS trt
+# Final stage, built on the base image selected by FROM_IMAGE_NAME.
+FROM ${FROM_IMAGE_NAME}
+
+RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl3 && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir toposort networkx pytest nltk tqdm html2text progressbar
+
+WORKDIR /workspace
+RUN git clone https://github.com/openai/gradient-checkpointing.git
+RUN git clone https://github.com/attardi/wikiextractor.git
+RUN git clone https://github.com/soskek/bookcorpus.git
+RUN git clone https://github.com/titipata/pubmed_parser
+
+RUN pip3 install --no-cache-dir /workspace/pubmed_parser
+
+# Copy the perf_client over
+COPY --from=trt /workspace/install/ /workspace/install/
+
+# Install the python wheel with pip
+RUN pip install --no-cache-dir /workspace/install/python/tensorrtserver*.whl
+
+WORKDIR /workspace/bert
+COPY . .
+# Runtime environment: make the BERT sources importable and expose the copied client binaries and libraries.
+ENV PYTHONPATH=/workspace/bert
+ENV BERT_PREP_WORKING_DIR=/workspace/bert/data
+ENV PATH=/workspace/install/bin:${PATH}
+ENV LD_LIBRARY_PATH=/workspace/install/lib:${LD_LIBRARY_PATH}
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
@@ -0,0 +1,4 @@
+BERT TensorFlow
+
+This repository includes software from https://github.com/google-research/bert
+licensed under the Apache License, Version 2.0 (the "License")
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
@@ -0,0 +1,567 @@
+# BioBert For TensorFlow
+
+This folder provides a script and recipe to train BERT for TensorFlow to achieve state-of-the-art accuracy on *biomedical text-mining* and is tested and maintained by NVIDIA.
+
+## Table Of Contents
+
+* [Model overview](#model-overview)
+* [Quick Start Guide](#quick-start-guide)
+* [Advanced](#advanced)
+  * [Scripts and sample code](#scripts-and-sample-code)
+  * [Parameters](#parameters)
+  * [Command-line options](#command-line-options)
+  * [Getting the data](#getting-the-data)
+    * [Dataset guidelines](#dataset-guidelines)
+    * [Multi-dataset](#multi-dataset)
+  * [Training process](#training-process)
+    * [Pre-training](#pre-training)
+    * [Fine tuning](#fine-tuning)
+    * [Multi-node](#multi-node)
+  * [Inference process](#inference-process)
+* [Performance](#performance)
+  * [Benchmarking](#benchmarking)
+    * [Training performance benchmark](#training-performance-benchmark)
+    * [Inference performance benchmark](#inference-performance-benchmark)
+* [Results](#results)
+  * [Training accuracy results](#training-accuracy-results)
+    * [Pre-training accuracy](#pre-training-accuracy)
+    * [Fine-tuning accuracy](#fine-tuning-accuracy)
+      * [Fine-tuning accuracy for NER Chem](#fine-tuning-accuracy-for-ner-chem)
+  * [Training stability test](#training-stability-test)
+    * [Fine-tuning stability test](#fine-tuning-stability-test)
+  * [Training performance results](#training-performance-results)
+    * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
+      * [Pre-training training performance: multi-node on DGX-1 16G](#pre-training-training-performance-multi-node-on-dgx-1-16g)
+      * [Fine-tuning training performance for NER on DGX-1 16G](#fine-tuning-training-performance-for-ner-on-dgx-1-16g)
+    * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32g)
+      * [Fine-tuning training performance for NER on DGX-1 32G](#fine-tuning-training-performance-for-ner-on-dgx-1-32g)
+    * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g)
+      * [Pre-training training performance: multi-node on DGX-2H 32G](#pre-training-training-performance-multi-node-on-dgx-2h-32g)
+      * [Fine-tuning training performance for NER on DGX-2 32G](#fine-tuning-training-performance-for-ner-on-dgx-2-32g)
+* [Release notes](#release-notes)
+  * [Changelog](#changelog)
+  * [Known issues](#known-issues)
+
+
+
+## Model overview
+
+In the original [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper, pre-training is done on [Wikipedia](https://dumps.wikimedia.org/) and [Books Corpus](http://yknzhu.wixsite.com/mbweb), with state-of-the-art results demonstrated on [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (Stanford Question Answering Dataset) benchmark.
+
+Meanwhile, many works, including [BioBERT](https://arxiv.org/pdf/1901.08746.pdf), [SciBERT](https://arxiv.org/pdf/1903.10676.pdf), [NCBI-BERT](https://arxiv.org/pdf/1906.05474.pdf), [ClinicalBERT (MIT)](https://arxiv.org/pdf/1904.03323.pdf), [ClinicalBERT (NYU, Princeton)](https://arxiv.org/pdf/1904.05342.pdf), and others at [BioNLP’19 workshop](https://aclweb.org/aclwiki/BioNLP_Workshop), show that additional pre-training of BERT on large biomedical text corpus such as [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) results in better performance in biomedical text-mining tasks.
+
+This repository provides scripts and recipe to adopt the [NVIDIA BERT code-base](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) to achieve state-of-the-art results in the following biomedical text-mining benchmark tasks:
+
+- [BC5CDR-disease](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/track-3-cdr/) A Named-Entity-Recognition task to recognize diseases mentioned in a collection of 1500 PubMed titles and abstracts ([Li et al., 2016](https://academic.oup.com/database/article/doi/10.1093/database/baw068/2630414))
+
+- [BC5CDR-chemical](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/track-3-cdr/) A Named-Entity-Recognition task to recognize chemicals mentioned in a collection of 1500 PubMed titles and abstracts ([Li et al., 2016](https://academic.oup.com/database/article/doi/10.1093/database/baw068/2630414))
+
+- [ChemProt](https://biocreative.bioinformatics.udel.edu/news/corpora/) A Relation-Extraction task to determine chemical-protein interactions in a collection of 1820 PubMed abstracts ([Krallinger et al., 2017](https://biocreative.bioinformatics.udel.edu/media/store/files/2017/ProceedingsBCVI_v2.pdf?page=141))
+
+
+## Quick Start Guide
+
+To pretrain or fine tune your model for BioMedical tasks using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the BERT model.
+
+1. Clone the repository.
+
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/TensorFlow/LanguageModeling/BERT
+```
+
+2. Build the BERT TensorFlow NGC container.
+
+```bash
+bash scripts/docker/build.sh
+```
+
+3. Download and preprocess the PubMed dataset.
+
+To download and preprocess pre-training data as well as the required vocab files, run the following script:
+
+
+```bash
+bash biobert/scripts/biobert_data_download.sh
+```
+
+Datasets for finetuning can be obtained from this [repository](https://github.com/ncbi-nlp/BLUE_Benchmark/releases/tag/0.1)
+
+Place them in `/workspace/bert/data/biobert/` to be automatically picked up by our scripts.
+
+4. Start an interactive session in the NGC container to run training/inference.
+
+After you build the container image and download the data, you can start an interactive CLI session as follows:
+
+```bash
+bash scripts/docker/launch.sh
+```
+
+5. Download the pre-trained checkpoint, vocabulary, and configuration files.
+
+We have uploaded checkpoints for fine tuning and pre-training on biomedical corpora to the NGC Model Registry. You can download them directly from the [NGC model catalog](https://ngc.nvidia.com/catalog/models).
+
+Place the BioBERT checkpoints in the `results/` directory so that your scripts can access them easily.
+
+6. Start pre-training.
+
+From within the container, you can use the following script to run the 1st phase of the pre-training using cased vocabulary:
+
+```bash
+bash biobert/scripts/run_pretraining-pubmed_base_phase_1.sh <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpus> <warmup_steps> <train_steps> <num_accumulation_steps> <save_checkpoint_steps> <eval_batch_size>
+```
+
+For the 2nd phase of the pre-training, issue:
+
+```bash
+bash biobert/scripts/run_pretraining-pubmed_base_phase_2.sh <path_to_phase_1_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpus> <warmup_steps> <train_steps> <num_accumulation_steps> <save_checkpoint_steps> <eval_batch_size>
+```
+
+
+Refer to the [Multi-node section](#multi-node) for details on utilizing multiple nodes for faster pre-training.
+
+7. Start fine tuning.
+
+The above pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art biomedical text-mining system.
+From within the container, you can use the following scripts to run fine-tuning for NER.
+
+Note: The scripts assume you are running on 16 V100 32GB GPUs. If your GPUs have less than 32GB of memory, or you have fewer GPUs, the batch size, learning rate and number of GPUs need to be adjusted.
+
+For NER on disease entities:
+
+```bash
+bash biobert/scripts/ner_bc5cdr-disease.sh  <init_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpu> <seq_length> <bert_model> <eval_batch_size> <epochs>
+```
+
+For NER on chemical entities:
+
+```bash
+bash biobert/scripts/ner_bc5cdr-chem.sh  <init_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpu> <seq_length> <bert_model> <eval_batch_size> <epochs>
+```
+
+For relation extraction, issue:
+
+```
+bash biobert/scripts/rel_chemprot.sh <init_checkpoint> <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpu> <seq_length> <bert_model> <eval_batch_size> <epochs>
+```
+
+8. Start validation/evaluation.
+
+The `biobert/scripts/run_biobert_finetuning_inference.sh` script runs inference on a checkpoint fine tuned for a specific task and evaluates the validity of predictions on the basis of F1, precision and recall scores.
+
+```bash
+bash biobert/scripts/run_biobert_finetuning_inference.sh <task> <init_checkpoint> <bert_model> <cased> <precision> <use_xla> <batch_size>
+```
+
+For FP16 inference for NER on the BC5CDR Chemical task with XLA using a DGX-2 V100 32G, run:
+```bash
+bash biobert/scripts/run_biobert_finetuning_inference.sh ner_bc5cdr-chem /results/model.ckpt base false fp16 true 16
+```
+
+Tasks `ner_bc5cdr-chem`, `ner_bc5cdr-disease` and `rel_chemprot` are currently supported.
+
+## Advanced
+
+The following sections provide greater details of the dataset, running training and inference, and the training results.
+
+### Scripts and sample code
+
+In addition to BERT TensorFlow files, the most important files added for NER and RE fine tuning tasks are:
+* `run_ner.py` - Serves as an entry point for NER training.
+* `run_re.py` - Serves as an entry point for RE training.
+
+The `biobert/scripts/` folder encapsulates all the one-click scripts required for running various functionalities supported such as:
+* `ner_bc5cdr-chem.sh` - Runs NER training and inference on the BC5CDR Chemical dataset using the `run_ner.py` file.
+* `ner_bc5cdr-disease.sh` - Runs NER training and inference on the BC5CDR Disease dataset using the `run_ner.py` file.
+* `rel_chemprot.sh` - Runs RE training and inference on the ChemProt dataset using the `run_re.py` file.
+* `run_pretraining_pubmed_base_phase_*.sh` - Runs pre-training with LAMB optimizer using the `run_pretraining.py` file in two phases. Phase 1 does training with sequence length = 128. In phase 2, the remaining 10% of the training is done with sequence length = 512.
+* `biobert_data_download.sh` - Downloads the PubMed dataset and Vocab files using files in the `data/` folder.
+* `run_biobert_finetuning_inference.sh` - Runs task specific inference using a fine tuned checkpoint.
+
+
+### Parameters
+
+Aside from the options to set hyperparameters, some relevant options to control the behaviour of the `run_ner.py` and `run_re.py` scripts are:
+
+```
+  --bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
+  --vocab_file: The vocabulary file that the BERT model was trained on.
+  --output_dir: The output directory where the model checkpoints will be written.
+  --[no]do_eval: Whether to run evaluation on the dev set. (default: 'false')
+  --[no]do_predict: Whether to run evaluation on the test set. (default: 'false')
+  --[no]do_train: Whether to run training. (default: 'false')
+  --learning_rate: The initial learning rate for Adam.(default: '5e-06')(a number)
+  --max_seq_length: The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded.(default: '384')(an integer)
+  --predict_batch_size: Total batch size for predictions.(default: '8')(an integer)
+  --train_batch_size: Total batch size for training (default: '8')(an integer)
+  --[no]use_fp16: Whether to enable AMP ops.(default: 'false')
+  --[no]use_xla: Whether to enable XLA JIT compilation.(default: 'false')
+  --init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
+  --num_train_epochs: Total number of training epochs to perform.(default: '3.0')(a number)
+
+```
+
+Note: When initializing from a checkpoint using `--init_checkpoint` and a corpus of your choice, keep in mind that `bert_config_file` and `vocab_file` should remain unchanged.
+
+### Command-line options
+
+To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option with the Python file, for example:
+
+```bash
+python run_ner.py --help
+python run_re.py --help
+```
+### Getting the data
+
+For pre-training BERT, we use the PubMed Dataset. For PubMed, we extract the xml files which are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences.
+
+The next step is to run `create_pretraining_data.py` with the document level corpus as input, which generates input data and labels for the masked language modeling and next sentence prediction tasks. Pre-training can also be performed on any corpus of your choice. The collection of data generation scripts are intended to be modular to allow modifications for additional preprocessing steps or to use additional data. They can hence easily be modified for an arbitrary corpus.
+
+The preparation of an individual pre-training dataset is described in the `create_biobert_datasets_from_start.sh` script found in the `data/` folder. The component steps to prepare the datasets are as follows:
+
+1.  Data download and extract - the dataset is downloaded and extracted.
+2.  Clean and format - document tags, etc. are removed from the dataset. The end result of this step is a `{dataset_name_one_article_per_line}.txt` file that contains the entire corpus. Each line in the text file contains an entire document from the corpus. One file per dataset is created in the `formatted_one_article_per_line` folder.
+3.  Sharding - the sentence segmented corpus file is split into a number of smaller text documents. The sharding is configured so that a document will not be split between two shards. Sentence segmentation is performed at this time using NLTK.
+4.  TFRecord file creation - each text file shard is processed by the `create_pretraining_data.py` script to produce a corresponding TFRecord file. The script generates input data and labels for masked language modeling and sentence prediction tasks for the input text shard.
+
+
+For fine tuning BioBERT for the task of Named Entity Recognition and Relation Extraction Tasks, we use BC5CDR and Chemprot Datasets. BC5CDR corpus consists of 1500 PubMed articles with 4409 annotated chemicals, 5818 diseases and 3116 chemical-disease interactions.
+ChemProt corpus consists of text exhaustively annotated by hand with mentions of chemical compounds/drugs and genes/proteins, as well as 22 different types of compound-protein relations focussing on 5 important relation classes. It was preprocessed following [Lim and Kang](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6014134/) guidelines.
+
+#### Dataset guidelines
+
+The procedure to prepare a text corpus for pre-training is described in the previous section. This section provides additional insight into how exactly raw text is processed so that it is ready for pre-training.
+
+First, raw text is tokenized using [WordPiece tokenization](https://arxiv.org/pdf/1609.08144.pdf). A [CLS] token is inserted at the start of every sequence, and the two sentences in the sequence are separated by a [SEP] token.
+
+Note: BERT pre-training looks at pairs of sentences at a time. A sentence embedding token [A] is added to the first sentence and token [B] to the next.
+
+BERT pre-training optimizes for two unsupervised classification tasks. The first is Masked Language Modelling (Masked LM). One training instance of Masked LM is a single modified sentence. Each token in the sentence has a 15% chance of being selected for prediction. A selected token is replaced with [MASK] 80% of the time, replaced with another random token 10% of the time, and left unchanged the remaining 10% of the time. The task is then to predict the original token.
+
+The second task is next sentence prediction. One training instance of BERT pre-training is two sentences (a sentence pair). A sentence pair may be constructed by simply taking two adjacent sentences from a single document, or by pairing up two random sentences with equal probability. The goal of this task is to predict whether or not the second sentence followed the first in the original document.
+
+The `create_pretraining_data.py` script takes in raw text and creates training instances for both pre-training tasks.
+
+#### Multi-dataset
+
+We are able to combine multiple datasets into a single dataset for pre-training on a diverse text corpus. Once TFRecords have been created for each component dataset, you can create a combined dataset by adding the directory to `*FILES_DIR` in `run_pretraining_*.sh`. This will feed all matching files to the input pipeline in `run_pretraining.py`. However, in the training process, only one TFRecord file is consumed at a time, therefore, the training instances of any given training batch will all belong to the same source dataset.
+
+
+
+### Training process
+
+The training process consists of two steps: pre-training and fine tuning.
+
+#### Pre-training
+
+BERT is designed to pre-train deep bidirectional language representations. The following scripts pre-train BERT on the PubMed dataset. These scripts are general and can be used for pre-training language representations on additional corpora of biomedical text.
+
+Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `biobert/scripts/run_pretraining_pubmed_base_phase_1.sh` and `biobert/scripts/run_pretraining_pubmed_base_phase_2.sh` scripts.
+
+The `biobert/scripts/run_pretraining_pubmed_base_phase*.sh` scripts run a job on a single node that trains the BERT-base model from scratch using the PubMed Corpus dataset as training data. By default, the training script:
+- Runs on 16 GPUs
+- Has FP16 precision enabled
+- Is XLA enabled
+- Creates a log file containing all the output
+- Saves a checkpoint every 5000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results, and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
+- Evaluates the model at the end of each phase
+
+- Phase 1
+    - Runs 19531 steps with 1953 warmup steps
+    - Sets Maximum sequence length as 128
+    - Sets Global Batch size as 64K
+
+- Phase 2
+    - Runs 4340 steps with 434 warm-up steps
+    - Sets Maximum sequence length as 512
+    - Sets Global Batch size as 32K
+    - Should start from Phase1's final checkpoint
+
+These parameters train PubMed with reasonable accuracy on a DGX-2 with 32GB V100 cards.
+
+For example:
+```bash
+biobert/scripts/run_pretraining-pubmed_base_phase_1.sh <train_batch_size> <learning_rate> <cased> <precision> <use_xla> <num_gpus> <warmup_steps> <train_steps> <num_accumulation_steps> <save_checkpoint_steps> <eval_batch_size>
+```
+
+Where:
+- `<train_batch_size>` is the per-GPU batch size used for training. Batch size varies with precision; larger batch sizes run more efficiently, but require more memory.
+
+- `<learning_rate>` is the learning rate. The default rate of 3.2e-5 is good for a global batch size of 64K.
+
+- `<cased>` is set to `true` or `false` depending on whether the model should be trained on cased or uncased data.
+
+- `<precision>` is the type of math in your model, can be either `fp32` or `fp16`. Specifically:
+
+    - `fp32` is 32-bit IEEE single precision floats.
+    - `fp16` is an automatic rewrite of the TensorFlow compute graph to take advantage of 16-bit arithmetic whenever it is safe.
+
+- `<num_gpus>` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
+
+- `<warmup_steps>` is the number of warm-up steps at the start of training.
+
+- `<train_steps>` is the total number of training steps.
+
+- `<save_checkpoint_steps>` controls how often checkpoints are saved. Default is 5000 steps.
+
+- `<num_accumulation_steps>` is used to mimic higher batch sizes in the respective phase by accumulating gradients N times before weight update.
+
+- `<bert_model>` is used to indicate whether to pretrain BERT Large or BERT Base model.
+
+- `<eval_batch_size>` is per-GPU batch size used for evaluation after training.
+
+The following sample code trains phase 1 of BERT-base from scratch on a single DGX-2 using FP16 arithmetic and uncased data.
+
+```bash
+biobert/scripts/run_pretraining-pubmed_base_phase_1.sh 128 3.2e-5 false fp16 true 16 1953 19531 32 5000 80
+```
+
+#### Fine tuning
+
+Fine tuning is performed using the `run_ner.py` script along with parameters defined in `biobert/scripts/ner_bc5cdr*.sh`.
+
+For example, `biobert/scripts/ner_bc5cdr-chem.sh` script trains a model and performs evaluation on the BC5CDR Chemical dataset. By default, the training script:
+
+- Trains on BERT Base Uncased Model
+- Uses 16 GPUs and batch size of 8 on each GPU
+- Has FP16 precision enabled
+- Is XLA enabled
+- Runs for 10 epochs
+- Evaluation is done at the end of training. To skip evaluation, modify `--do_eval` and  `--do_predict` to `False`.
+
+This script outputs checkpoints to the `/results` directory, by default, inside the container. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. The training log contains information about:
+- Loss for the final step
+- Training and evaluation performance
+- F1, Precision and Recall on the Test Set of BC5CDR Chemical after evaluation.
+
+The summary after training is printed in the following format:
+```bash
+ 0: /results/biobert_finetune_ner_chem_191028154209/test_labels.txt
+ 0: /results/biobert_finetune_ner_chem_191028154209/test_labels_errs.txt
+ 0: processed 124669 tokens with 5433 phrases; found: 5484 phrases; correct: 5102.
+ 0: accuracy:  99.26%; precision:  93.03%; recall:  93.91%; FB1:  93.47
+ 0:                  : precision:  93.03%; recall:  93.91%; FB1:  93.47  5484
+```
+
+Multi-GPU training is enabled with the Horovod TensorFlow module. The following example runs training on 16 GPUs:
+
+```bash
+BERT_DIR=data/download/google_pretrained_weights/uncased_L-12_H-768_A-12
+DATA_DIR=data/biobert/BC5CDR/chem
+
+mpi_command="mpirun -np 16 -H localhost:16 \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+
+$mpi_command python run_ner.py --horovod --use_fp16 --use_xla \
+    --vocab_file=$BERT_DIR/vocab.txt \
+    --bert_config_file=$BERT_DIR/bert_config.json \
+    --output_dir=/results --data_dir=$DATA_DIR
+```
+
+#### Multi-node
+
+Multi-node runs can be launched on a pyxis/enroot Slurm cluster (see [Requirements](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT#requirements)) with the `biobert/scripts/run_biobert.sub` script with the following command for a 4-node DGX2 example for both phase 1 and phase 2:
+
+```bash
+BATCHSIZE=128 LEARNING_RATE='8e-6' NUM_ACCUMULATION_STEPS=8 PHASE=1 sbatch -N4 --ntasks-per-node=16 biobert/scripts/run_biobert.sub
+BATCHSIZE=16 LEARNING_RATE='3.2e-5' NUM_ACCUMULATION_STEPS=32 PHASE=2 sbatch -N4 --ntasks-per-node=16 biobert/scripts/run_biobert.sub
+```
+
+Checkpoint after phase 1 will be saved in `checkpointdir` specified in `biobert/scripts/run_biobert.sub`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
+
+Variables to re-run the [Training performance results](#training-performance-results) are available in the `configurations.yml` file.
+
+The batch variables `BATCHSIZE`, `LEARNING_RATE`, `NUM_ACCUMULATION_STEPS` refer to the Python arguments `train_batch_size`, `learning_rate`, `num_accumulation_steps` respectively.
+The variable `PHASE` refers to phase specific arguments available in `biobert/scripts/run_biobert.sub`.
+
+Note that the `biobert/scripts/run_biobert.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `datadir` handle the location of the files for each phase.
+
+Refer to the file contents to see the full list of variables to adjust for your system.
+
+### Inference process
+
+Inference on a fine tuned model for Bio Medical tasks is performed using the `run_ner.py` or `run_re.py` script along with parameters defined in `biobert/scripts/run_biobert_finetuning_inference.sh`. Inference is supported on a single GPU.
+
+The `biobert/scripts/run_biobert_finetuning_inference.sh` script performs evaluation on ChemProt or BC5CDR datasets depending on the task specified. By default, the inferencing script:
+
+- Uses BC5CDR Chemical dataset
+- Has FP16 precision enabled
+- Is XLA enabled
+- Evaluates the latest checkpoint present in `/results` with a batch size of 16.
+
+This script computes F1, Precision and Recall scores. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file.
+
+## Performance
+
+### Benchmarking
+
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+
+Both of these benchmarking scripts enable you to run a number of epochs, extract performance numbers, and run the BERT model for fine tuning.
+
+#### Training performance benchmark
+
+Training benchmarking can be performed by running the script:
+``` bash
+biobert/scripts/biobert_finetune_training_benchmark.sh <task> <num_gpu> <bert_model> <cased>
+```
+
+This script runs 2 epochs by default on the NER BC5CDR dataset and extracts performance numbers for various batch sizes and sequence lengths in both FP16 and FP32. These numbers are saved at `/results/tf_bert_biobert_<task>_training_benchmark__<bert_model>_<cased/uncased>_num_gpu_<num_gpu>_<DATESTAMP>`
+
+#### Inference performance benchmark
+
+Inference benchmarking can be performed by running the script:
+``` bash
+biobert/scripts/biobert_finetune_inference_benchmark.sh <task> <bert_model> <cased>
+```
+
+This script runs inference on the test and dev sets and extracts performance and latency numbers for various batch sizes and sequence lengths in both FP16 with XLA and FP32 without XLA. These numbers are saved at `/results/tf_bert_biobert_<task>_training_benchmark__<bert_model>_<cased/uncased>_num_gpu_<num_gpu>_<DATESTAMP>`
+
+## Results
+
+The following sections provide detailed results of downstream fine-tuning task on NER and RE benchmark tasks.
+
+### Training accuracy results
+
+#### Pre-training accuracy
+
+Our results were obtained by running the `scripts/run_pretraining_lamb.sh` training script in the TensorFlow 19.08-py3 NGC container.
+
+| **DGX System** | **Nodes** | **Precision** | **Batch Size/GPU: Phase1, Phase2** | **Accumulation Steps: Phase1, Phase2** | **Time to Train (Hrs)** | **Final Loss** |
+|----------------|-----------|---------------|------------------------------------|----------------------------------------|----------------|-------------------------|
+| DGX2H | 4 | FP16 | 128, 16 | 8, 32 | 19.14 | 0.88 |
+| DGX2H | 16 | FP16 | 128, 16 | 2, 8 | 4.81  | 0.86 |
+| DGX2H | 32 | FP16 | 128, 16 | 1, 4 | 2.65  | 0.87 |
+
+#### Fine-tuning accuracy
+
+| **Task** | **F1** | **Precision** | **Recall** |
+|:-------:|:----:|:----:|:----:|
+| NER BC5CDR-chemical | 93.47 | 93.03 | 93.91 |
+| NER BC5CDR-disease | 86.22 | 85.05 | 87.43 |
+| RE Chemprot | 76.27 | 77.62 | 74.98 |
+
+##### Fine-tuning accuracy for NER Chem
+
+Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container.
+
+| **DGX System** | **Batch size / GPU** | **F1 - FP32** | **F1- mixed precision** | **Time to Train - FP32 (Minutes)** | **Time to Train - mixed precision (Minutes)** |
+|:---:|:----:|:----:|:---:|:----:|:----:|
+| DGX-1 16G | 64 |93.33|93.40|23.95|14.13|
+| DGX-1 32G | 64 |93.31|93.36|24.35|12.63|
+| DGX-2 32G | 64 |93.66|93.47|12.26|8.16|
+
+
+### Training stability test
+
+#### Fine-tuning stability test:
+
+The following table compares F1 scores across 5 different training runs on the NER Chemical task with different seeds, for both FP16 and FP32. The runs showcase consistent convergence on all 5 seeds with very little deviation.
+
+| **16 x V100 GPUs** | **seed 1** | **seed 2** | **seed 3** | **seed 4** | **seed 5** | **mean** | **std** |
+|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
+| F1 Score (FP16)  | 93.13     | 92.92   | 93.34   | 93.66   | 93.47   | 93.3  | 0.29 |
+| F1 Score (FP32)  | 93.1      | 93.28   | 93.33   | 93.45   | 93.17   | 93.27 | 0.14 |
+
+
+### Training performance results
+
+#### Training performance: NVIDIA DGX-1 (8x V100 16G)
+
+##### Pre-training training performance: multi-node on DGX-1 16G
+
+Our results were obtained by running the `biobert/scripts/run_biobert.sub` training script in the TensorFlow 19.08-py3 NGC container using multiple NVIDIA DGX-1 with 8x V100 16G GPUs. Performance (in sentences per second) is the steady state throughput.
+
+| **Nodes** | **Sequence Length**| **Batch size / GPU: mixed precision, FP32** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
+|:-------:|:-----:|:-------:|:-------:|:-------:|:-------------:|:------:|:------:|
+| 1  | 128 | 64,32 | 2762.06  | 744.48   | 3.71 | 1.00  | 1.00  |
+| 4  | 128 | 64,32 | 10283.08 | 2762.88  | 3.72 | 3.72  | 3.71  |
+| 16 | 128 | 64,32 | 39051.69 | 10715.14 | 3.64 | 14.14 | 14.39 |
+| 32 | 128 | 64,32 | 76077.39 | 21104.87 | 3.60 | 27.54 | 28.35 |
+| 1  | 512 | 8,8   | 432.33   | 160.38   | 2.70 | 1.00  | 1.00  |
+| 4  | 512 | 8,8   | 1593.00  | 604.36   | 2.64 | 3.68  | 3.77  |
+| 16 | 512 | 8,8   | 5941.82  | 2356.44  | 2.52 | 13.74 | 14.69 |
+| 32 | 512 | 8,8   | 11483.73 | 4631.29  | 2.48 | 26.56 | 28.88 |
+
+Note: The respective values for FP32 runs that use a batch size of 16, 2 in sequence lengths 128 and 512 respectively are not available due to out of memory errors that arise.
+
+##### Fine-tuning training performance for NER on DGX-1 16G
+
+Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance (in sentences per second) is the mean throughput from 2 epochs.
+
+| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
+|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
+| 1 | 64 | 147.71 | 348.84  | 2.36 | 1.00 | 1.00 |
+| 4 | 64 | 583.78 | 1145.46 | 1.96 | 3.95 | 3.28 |
+| 8 | 64 | 981.22 | 1964.85 | 2.00 | 6.64 | 5.63 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+#### Training performance: NVIDIA DGX-1 (8x V100 32G)
+
+
+##### Fine-tuning training performance for NER on DGX-1 32G
+
+Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance (in sentences per second) is the mean throughput from 2 epochs.
+
+
+| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
+|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
+| 1 | 64 | 144.1 | 417.39  | 2.89 | 1.00 | 1.00 |
+| 4 | 64 | 525.15 | 1354.14 | 2.57 | 3.64 | 3.24 |
+| 8 | 64 | 969.4 | 2341.39 | 2.41 | 6.73 | 5.61 |
+
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+#### Training performance: NVIDIA DGX-2 (16x V100 32G)
+
+
+##### Pre-training training performance: multi-node on DGX-2H 32G
+
+Our results were obtained by running the `biobert/scripts/run_biobert.sub` training script in the TensorFlow 19.08-py3 NGC container using multiple NVIDIA DGX-2H with 16x V100 32G GPUs. Performance (in sentences per second) is the steady state throughput.
+
+
+| **Nodes** | **Sequence Length**| **Batch size / GPU: mixed precision, FP32** | **Throughput - mixed precision** | **Throughput - FP32** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - mixed precision** | **Weak scaling - FP32** |
+|:-------:|:-----:|:-------:|:-------:|:-------:|:-------------:|:------:|:------:|
+| 1  | 128 | 128,128 | 7772.18   | 2165.04   | 3.59 | 1.00  | 1.00  |
+| 4  | 128 | 128,128 | 29785.31  | 8516.90   | 3.50 | 3.83  | 3.93  |
+| 16 | 128 | 128,128 | 115581.29 | 33699.15  | 3.43 | 14.87 | 15.57 |
+| 32 | 128 | 128,128 | 226156.53 | 66996.73  | 3.38 | 29.10 | 30.94 |
+| 64 | 128 | 128,128 | 444955.74 | 133424.95 | 3.33 | 57.25 | 61.63 |
+| 1  | 512 | 16,16   | 1260.06   | 416.92    | 3.02 | 1.00  | 1.00  |
+| 4  | 512 | 16,16   | 4781.19   | 1626.76   | 2.94 | 3.79  | 3.90  |
+| 16 | 512 | 16,16   | 18405.65  | 6418.09   | 2.87 | 14.61 | 15.39 |
+| 32 | 512 | 16,16   | 36071.06  | 12713.67  | 2.84 | 28.63 | 30.49 |
+| 64 | 512 | 16,16   | 69950.86  | 25245.96  | 2.77 | 55.51 | 60.55 |
+
+
+##### Fine-tuning training performance for NER on DGX-2 32G
+
+Our results were obtained by running the `biobert/scripts/ner_bc5cdr-chem.sh` training script in the TensorFlow 19.08-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs. Performance (in sentences per second) is the mean throughput from 2 epochs.
+
+| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
+|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
+| 1 | 64 | 139.59 | 475.54  | 3.4 | 1.00 | 1.00 |
+| 4 | 64 | 517.08 | 1544.01 | 2.98 | 3.70 | 3.25 |
+| 8 | 64 | 1009.84 | 2695.34 | 2.66 | 7.23 | 5.67 |
+| 16 | 64 | 1997.73 | 4268.81 | 2.13 | 14.31 | 8.98 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+## Release notes
+
+### Changelog
+
+November 2019
+- Initial release
+
+### Known issues
+
+
+- There are no known issues with the model.
+
+
+
@@ -0,0 +1,302 @@
+# Python version of the evaluation script from CoNLL'00.
+# Originates from: https://github.com/spyysalo/conlleval.py
+
+
+# Intentional differences:
+# - accept any space as delimiter by default
+# - optional file argument (default STDIN)
+# - option to set boundary (-b argument)
+# - LaTeX output (-l argument) not supported
+# - raw tags (-r argument) not supported
+
+# Also provides report_notprint(counts) and return_report(input_file), which
+# return the report as a list of strings instead of printing it.
+
+import sys
+import re
+import codecs
+from collections import defaultdict, namedtuple
+
+ANY_SPACE = '<SPACE>'
+
+
+class FormatError(Exception):
+    """Raised when an input line does not have the expected number of columns."""
+
+# Result record: raw counts (tp, fp, fn) plus derived precision, recall, F-score.
+Metrics = namedtuple('Metrics', ['tp', 'fp', 'fn', 'prec', 'rec', 'fscore'])
+
+
+class EvalCounts(object):
+    """Mutable tallies filled in while scanning tagged input.
+
+    Attributes:
+        correct_chunk: number of correctly identified chunks
+        correct_tags: number of correct chunk tags
+        found_correct: number of chunks in corpus
+        found_guessed: number of identified chunks
+        token_counter: token counter (ignores sentence breaks)
+        t_correct_chunk, t_found_correct, t_found_guessed: the corresponding
+            chunk counts keyed by chunk type; unseen types read as zero.
+    """
+
+    def __init__(self):
+        # overall tallies
+        self.correct_chunk = 0
+        self.correct_tags = 0
+        self.found_correct = 0
+        self.found_guessed = 0
+        self.token_counter = 0
+
+        # tallies by chunk type
+        self.t_correct_chunk = defaultdict(int)
+        self.t_found_correct = defaultdict(int)
+        self.t_found_guessed = defaultdict(int)
+
+
+def parse_args(argv):
+    """Parse the command-line options of the evaluation script.
+
+    Args:
+        argv: list of argument strings, without the program name.
+
+    Returns:
+        argparse.Namespace with ``boundary``, ``delimiter``, ``otag`` and
+        ``file`` attributes.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description='evaluate tagging results using CoNLL criteria',
+    )
+    parser.add_argument('-b', '--boundary', default='-X-', metavar='STR',
+                        help='sentence boundary')
+    parser.add_argument('-d', '--delimiter', default=ANY_SPACE, metavar='CHAR',
+                        help='character delimiting items in input')
+    parser.add_argument('-o', '--otag', default='O', metavar='CHAR',
+                        help='alternative outside tag')
+    parser.add_argument('file', nargs='?', default=None)
+
+    options = parser.parse_args(argv)
+    return options
+
+
+def parse_tag(t):
+    """Split a tag such as ``B-PER`` at its first hyphen.
+
+    Returns:
+        A ``(prefix, type)`` tuple; ``(t, '')`` when the pattern does not match.
+    """
+    match = re.match(r'^([^-]*)-(.*)$', t)
+    if match is None:
+        return (t, '')
+    return match.groups()
+
+
+def evaluate(iterable, options=None):
+    """Count chunk-level and token-level agreement between gold and predicted tags.
+
+    Each line is split into columns (on any whitespace by default, otherwise on
+    ``options.delimiter``). The first column is the token, the second-to-last
+    the gold tag and the last the predicted tag. Empty lines and lines whose
+    first column equals ``options.boundary`` act as sentence boundaries.
+
+    Args:
+        iterable: iterable of text lines.
+        options: namespace as returned by parse_args(); defaults are used
+            when None.
+
+    Returns:
+        EvalCounts holding the accumulated tallies.
+
+    Raises:
+        FormatError: if a non-empty line has a different number of columns
+            than the first line, or fewer than three columns.
+    """
+    if options is None:
+        options = parse_args([])    # use defaults
+
+    counts = EvalCounts()
+    num_features = None       # number of features per line
+    in_correct = False        # currently processed chunks is correct until now
+    last_correct = 'O'        # previous chunk tag in corpus
+    last_correct_type = ''    # type of previous chunk tag in corpus
+    last_guessed = 'O'        # previously identified chunk tag
+    last_guessed_type = ''    # type of previously identified chunk tag
+
+    for i, line in enumerate(iterable):
+        line = line.rstrip('\r\n')
+
+        if options.delimiter == ANY_SPACE:
+            features = line.split()
+        else:
+            features = line.split(options.delimiter)
+
+        # the column count of the first line is the reference for all others
+        if num_features is None:
+            num_features = len(features)
+        elif num_features != len(features) and len(features) != 0:
+            raise FormatError('unexpected number of features: %d (%d) at line %d\n%s' %
+                              (len(features), num_features, i, line))
+
+        # normalise sentence breaks to a boundary token tagged outside
+        if len(features) == 0 or features[0] == options.boundary:
+            features = [options.boundary, 'O', 'O']
+        if len(features) < 3:
+            raise FormatError('unexpected number of features in line %s' % line)
+
+        # last column: predicted tag; second-to-last: gold tag; first: token
+        guessed, guessed_type = parse_tag(features.pop())
+        correct, correct_type = parse_tag(features.pop())
+        first_item = features.pop(0)
+
+        if first_item == options.boundary:
+            guessed = 'O'
+
+        end_correct = end_of_chunk(last_correct, correct,
+                                   last_correct_type, correct_type)
+        end_guessed = end_of_chunk(last_guessed, guessed,
+                                   last_guessed_type, guessed_type)
+        start_correct = start_of_chunk(last_correct, correct,
+                                       last_correct_type, correct_type)
+        start_guessed = start_of_chunk(last_guessed, guessed,
+                                       last_guessed_type, guessed_type)
+
+        if in_correct:
+            # both chunks end here with the same type: count a correct chunk
+            if (end_correct and end_guessed and
+                last_guessed_type == last_correct_type):
+                in_correct = False
+                counts.correct_chunk += 1
+                counts.t_correct_chunk[last_correct_type] += 1
+            # otherwise any divergence invalidates the chunk being tracked
+            elif (end_correct != end_guessed or guessed_type != correct_type):
+                in_correct = False
+
+        # a chunk can only be correct if both sides start it together
+        if start_correct and start_guessed and guessed_type == correct_type:
+            in_correct = True
+
+        if start_correct:
+            counts.found_correct += 1
+            counts.t_found_correct[correct_type] += 1
+        if start_guessed:
+            counts.found_guessed += 1
+            counts.t_found_guessed[guessed_type] += 1
+        # boundary lines are not tokens and do not affect tag accuracy
+        if first_item != options.boundary:
+            if correct == guessed and guessed_type == correct_type:
+                counts.correct_tags += 1
+            counts.token_counter += 1
+
+        last_guessed = guessed
+        last_correct = correct
+        last_guessed_type = guessed_type
+        last_correct_type = correct_type
+
+    # a chunk still being tracked when the input ends counts as correct
+    if in_correct:
+        counts.correct_chunk += 1
+        counts.t_correct_chunk[last_correct_type] += 1
+
+    return counts
+
+
+
+def uniq(iterable):
+    """Return the items of *iterable* as a list, keeping only first occurrences in order."""
+    seen = set()
+    ordered = []
+    for item in iterable:
+        if item in seen:
+            continue
+        seen.add(item)
+        ordered.append(item)
+    return ordered
+
+
+def calculate_metrics(correct, guessed, total):
+    """Derive precision, recall and F-score from three chunk counts.
+
+    Args:
+        correct: number of guessed chunks that are correct (true positives).
+        guessed: number of chunks that were guessed.
+        total: number of chunks in the reference.
+
+    Returns:
+        Metrics(tp, fp, fn, prec, rec, fscore); a score is 0 when its
+        denominator is zero.
+    """
+    tp = correct
+    fp = guessed - correct
+    fn = total - correct
+
+    if tp + fp == 0:
+        precision = 0
+    else:
+        precision = 1.*tp / (tp + fp)
+
+    if tp + fn == 0:
+        recall = 0
+    else:
+        recall = 1.*tp / (tp + fn)
+
+    if precision + recall == 0:
+        fscore = 0
+    else:
+        fscore = 2 * precision * recall / (precision + recall)
+
+    return Metrics(tp, fp, fn, precision, recall, fscore)
+
+
+def metrics(counts):
+    """Compute overall and per-type scores from an EvalCounts.
+
+    Returns:
+        ``(overall, by_type)`` where ``overall`` is a Metrics and ``by_type``
+        maps each chunk type seen in the reference or the guesses to a Metrics.
+    """
+    overall = calculate_metrics(
+        counts.correct_chunk, counts.found_guessed, counts.found_correct
+    )
+    seen_types = uniq(
+        list(counts.t_found_correct) + list(counts.t_found_guessed)
+    )
+    by_type = {
+        chunk_type: calculate_metrics(
+            counts.t_correct_chunk[chunk_type],
+            counts.t_found_guessed[chunk_type],
+            counts.t_found_correct[chunk_type]
+        )
+        for chunk_type in seen_types
+    }
+    return overall, by_type
+
+
+def report(counts, out=None):
+    """Write a CoNLL-style evaluation summary.
+
+    Args:
+        counts: EvalCounts to summarise.
+        out: object with a ``write`` method; ``sys.stdout`` when None.
+    """
+    stream = sys.stdout if out is None else out
+    emit = stream.write
+
+    overall, by_type = metrics(counts)
+
+    emit('processed %d tokens with %d phrases; ' %
+         (counts.token_counter, counts.found_correct))
+    emit('found: %d phrases; correct: %d.\n' %
+         (counts.found_guessed, counts.correct_chunk))
+
+    # the overall line is skipped when no tokens were processed
+    if counts.token_counter > 0:
+        emit('accuracy: %6.2f%%; ' %
+             (100.*counts.correct_tags/counts.token_counter))
+        emit('precision: %6.2f%%; ' % (100.*overall.prec))
+        emit('recall: %6.2f%%; ' % (100.*overall.rec))
+        emit('FB1: %6.2f\n' % (100.*overall.fscore))
+
+    for chunk_type, scores in sorted(by_type.items()):
+        emit('%17s: ' % chunk_type)
+        emit('precision: %6.2f%%; ' % (100.*scores.prec))
+        emit('recall: %6.2f%%; ' % (100.*scores.rec))
+        emit('FB1: %6.2f  %d\n' %
+             (100.*scores.fscore, counts.t_found_guessed[chunk_type]))
+
+
+def report_notprint(counts, out=None):
+    """Build the evaluation summary and return it instead of writing it.
+
+    Args:
+        counts: EvalCounts to summarise.
+        out: accepted but not used by this function.
+
+    Returns:
+        List of newline-terminated strings, one per report line.
+    """
+    overall, by_type = metrics(counts)
+
+    header = 'processed %d tokens with %d phrases; ' % (
+        counts.token_counter, counts.found_correct)
+    header += 'found: %d phrases; correct: %d.\n' % (
+        counts.found_guessed, counts.correct_chunk)
+    final_report = [header]
+
+    # the overall line is skipped when no tokens were processed
+    if counts.token_counter > 0:
+        summary = 'accuracy: %6.2f%%; ' % (
+            100.*counts.correct_tags/counts.token_counter)
+        summary += 'precision: %6.2f%%; ' % (100.*overall.prec)
+        summary += 'recall: %6.2f%%; ' % (100.*overall.rec)
+        summary += 'FB1: %6.2f\n' % (100.*overall.fscore)
+        final_report.append(summary)
+
+    for chunk_type, scores in sorted(by_type.items()):
+        row = '%17s: ' % chunk_type
+        row += 'precision: %6.2f%%; ' % (100.*scores.prec)
+        row += 'recall: %6.2f%%; ' % (100.*scores.rec)
+        row += 'FB1: %6.2f  %d\n' % (
+            100.*scores.fscore, counts.t_found_guessed[chunk_type])
+        final_report.append(row)
+
+    return final_report
+
+
+def end_of_chunk(prev_tag, tag, prev_type, type_):
+    """Tell whether a chunk ended between the previous and the current word.
+
+    Args:
+        prev_tag: chunk tag of the previous word.
+        tag: chunk tag of the current word.
+        prev_type: chunk type of the previous word.
+        type_: chunk type of the current word.
+
+    Returns:
+        True if a chunk ended, False otherwise.
+    """
+    # E and S always close a chunk; bracket chunks are assumed to have length 1
+    if prev_tag in ('E', 'S', ']', '['):
+        return True
+
+    # an open B/I chunk is closed by a new chunk start or an outside tag
+    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
+        return True
+
+    # a type change closes whatever chunk the previous word was part of
+    return prev_tag not in ('O', '.') and prev_type != type_
+
+
+def start_of_chunk(prev_tag, tag, prev_type, type_):
+    """Tell whether a chunk started between the previous and the current word.
+
+    Args:
+        prev_tag: chunk tag of the previous word.
+        tag: chunk tag of the current word.
+        prev_type: chunk type of the previous word.
+        type_: chunk type of the current word.
+
+    Returns:
+        True if a chunk started, False otherwise.
+    """
+    # B and S always open a chunk; bracket chunks are assumed to have length 1
+    if tag in ('B', 'S', '[', ']'):
+        return True
+
+    # E or I with no open chunk before it implicitly opens one
+    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
+        return True
+
+    # a type change opens a new chunk unless the current word is outside
+    return tag not in ('O', '.') and prev_type != type_
+
+
+def main(argv):
+    """Command-line entry point: evaluate the given file (or STDIN) and print the report.
+
+    Args:
+        argv: full argument vector including the program name.
+    """
+    options = parse_args(argv[1:])
+
+    if options.file is not None:
+        with open(options.file) as handle:
+            counts = evaluate(handle, options)
+    else:
+        counts = evaluate(sys.stdin, options)
+
+    report(counts)
+
+
+def return_report(input_file):
+    """Evaluate the tagged file at *input_file* with default options.
+
+    Returns:
+        The report as a list of strings, as produced by report_notprint().
+    """
+    with open(input_file, "r") as handle:
+        return report_notprint(evaluate(handle))
+
+if __name__ == '__main__':
+    # Run the command-line interface (optional file argument, default STDIN)
+    # instead of evaluating a fixed, machine-specific path and discarding the
+    # returned report.
+    sys.exit(main(sys.argv))
@@ -0,0 +1,51 @@
+import os
+import numpy as np
+import pandas as pd
+import sklearn.metrics
+import argparse
+
+
+# Score classification predictions against a labelled answer file and print
+# precision / recall / F1 (plus specificity for the binary task).
+parser = argparse.ArgumentParser(
+    description='Compute precision/recall/F1 of predicted class scores against gold labels.')
+parser.add_argument('--output_path', type=str, required=True,
+                    help='TSV file without header; one row of per-class scores per example')
+parser.add_argument('--answer_path', type=str, required=True,
+                    help='TSV file with an index column and a "label" column')
+# Restricting the choices makes an unsupported task fail at argument parsing
+# instead of with a NameError on `results` at the end of the script.
+parser.add_argument('--task', type=str,  default="binary", choices=["binary", "chemprot"],
+                    help='default:binary, possible other options:{chemprot}')
+args = parser.parse_args()
+
+
+testdf = pd.read_csv(args.answer_path, sep="\t", index_col=0)
+preddf = pd.read_csv(args.output_path, sep="\t", header=None)
+
+# predicted class = column with the highest score in each row
+pred_class = np.argmax(preddf.values, axis=1)
+
+results = dict()
+
+# binary
+if args.task == "binary":
+    p,r,f,s = sklearn.metrics.precision_recall_fscore_support(y_pred=pred_class, y_true=testdf["label"])
+    results["f1 score"] = f[1]
+    results["recall"] = r[1]
+    results["precision"] = p[1]
+    # recall of class 0 is the true-negative rate
+    results["specificity"] = r[0]
+
+# chemprot
+# micro-average of 5 target classes
+# see "Potent pairing: ensemble of long short-term memory networks and support vector machine for chemical-protein relation extraction (Mehryary, 2018)" for details
+elif args.task == "chemprot":
+    # map string labels to integer ids in sorted order
+    str_to_int_mapper = dict()
+    for i,v in enumerate(sorted(testdf["label"].unique())):
+        str_to_int_mapper[v] = i
+    test_answer = [str_to_int_mapper[v] for v in testdf["label"]]
+
+    p,r,f,s = sklearn.metrics.precision_recall_fscore_support(y_pred=pred_class, y_true=test_answer, labels=[0,1,2,3,4], average="micro")
+    results["f1 score"] = f
+    results["recall"] = r
+    results["precision"] = p
+
+for k,v in results.items():
+    print("{:11s} : {:.2%}".format(k,v))
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run the dataset-creation script inside the "bert" image with the current
+# directory mounted at /workspace/bert. $PWD is quoted so that a working
+# directory containing spaces is still passed as a single -v argument.
+docker run --runtime=nvidia -v "$PWD":/workspace/bert \
+    --rm --shm-size=1g --ulimit memlock=-1 \
+    --ulimit stack=67108864 --ipc=host -t -i \
+    bert bash -c "bash data/create_biobert_datasets_from_start.sh"
@@ -0,0 +1,187 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Positional arguments: task name, BERT size (base|large), cased (true|false).
+task=${1:-"ner_bc5cdr-chem"}
+bert_model=${2:-"base"}
+cased=${3:-"false"}
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+    case_flag="--do_lower_case=False"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+    case_flag="--do_lower_case=True"
+fi
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
+else
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
+fi
+
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+printf -v TAG "tf_bert_biobert_%s_inference_benchmark_%s_%s" "$task" "$bert_model" "$CASING_DIR_PREFIX"
+OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+mkdir -p ${OUTPUT_DIR}
+
+# Sweep sequence length x batch size x precision for one task and append one
+# throughput row per configuration to $LOGFILE.
+# Usage: run_benchmark TASK_NAME DATASET_DIR COMMAND...
+#   TASK_NAME    value passed as --task_name
+#   DATASET_DIR  value passed as --data_dir
+#   COMMAND...   interpreter and script to launch
+run_benchmark() {
+    local task_name=$1
+    DATASET_DIR=$2
+    shift 2    # the remaining words are the command to launch
+
+    # NOTE: the log names keep the existing "training" wording so that
+    # result paths do not change.
+    LOGFILE="${OUTPUT_DIR}/${task}_training_benchmark_bert_${bert_model}.log"
+
+    echo "Training performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
+    echo "Precision Sequence Length   Batch size  Performance(sent/sec)" >> $LOGFILE
+
+    for seq_length in 128 512; do
+        for batch_size in 8 32 64; do
+            for precision in fp16 fp32; do
+                # use the loop variable seq_length so that every configuration
+                # gets its own result directory
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_sl_${seq_length}_prec_${precision}_bs_${batch_size}
+                mkdir -p ${res_dir}
+                tmp_file="${res_dir}/${task}_training_benchmark.log"
+
+                # an array expands to zero words for fp32, so no empty
+                # argument reaches the python script
+                precision_flags=()
+                if [ "$precision" = "fp16" ] ; then
+                    echo "fp16 activated!"
+                    precision_flags=(--use_fp16 --use_xla)
+                else
+                    echo "fp32 activated!"
+                fi
+
+                "$@" \
+                --do_prepare=true \
+                --do_eval=true \
+                --do_predict=true \
+                --task_name="$task_name" \
+                --vocab_file=$BERT_DIR/vocab.txt \
+                --bert_config_file=$BERT_DIR/bert_config.json \
+                --init_checkpoint="$BERT_DIR/bert_model.ckpt" \
+                --data_dir=$DATASET_DIR \
+                --output_dir=$res_dir \
+                --eval_batch_size=$batch_size \
+                --predict_batch_size=$batch_size \
+                --max_seq_length=$seq_length \
+                "${precision_flags[@]}" $case_flag  |& tee $tmp_file
+
+                # last reported throughput figure of the run
+                perf=`grep -F 'Throughput Average (sentences/sec) =' "$tmp_file" | tail -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
+                echo "$precision $seq_length  $batch_size $perf" >> $LOGFILE
+
+            done
+        done
+    done
+}
+
+if [ "$task" = "ner_bc5cdr-chem" ] ; then
+
+    run_benchmark bc5cdr /workspace/bert/data/biobert/BC5CDR/chem \
+        python /workspace/bert/run_ner.py
+
+elif [ "$task" = "ner_bc5cdr-disease" ] ; then
+
+    run_benchmark bc5cdr /workspace/bert/data/biobert/BC5CDR/disease \
+        python3 /workspace/bert/run_ner.py
+
+elif [ "$task" = "rel_chemprot" ] ; then
+
+    run_benchmark chemprot /workspace/bert/data/biobert/ChemProt \
+        python3 /workspace/bert/run_re.py
+
+else
+
+    echo "Benchmarking for " $task "currently not supported. Sorry!"
+
+fi
@@ -0,0 +1,203 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Positional arguments: task name, number of GPUs, BERT size (base|large),
+# cased (true|false).
+task=${1:-"ner_bc5cdr-chem"}
+num_gpu=${2:-"2"}
+bert_model=${3:-"base"}
+cased=${4:-"false"}
+
+
+epochs=2.0
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+    case_flag="--do_lower_case=False"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+    case_flag="--do_lower_case=True"
+fi
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
+else
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
+fi
+
+# multi-GPU runs are launched through mpirun with Horovod enabled
+if [ "$num_gpu" -gt 1 ] ; then
+    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+    use_hvd="--horovod"
+else
+    mpi_command=""
+    use_hvd=""
+fi
+
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+printf -v TAG "tf_bert_biobert_%s_training_benchmark_%s_%s_num_gpu_%d" "$task" "$bert_model" "$CASING_DIR_PREFIX" "$num_gpu"
+OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+mkdir -p ${OUTPUT_DIR}
+
+# Sweep sequence length x batch size x precision for one task and append one
+# throughput row per configuration to $LOGFILE.
+# Usage: run_benchmark TASK_NAME DATASET_DIR COMMAND...
+#   TASK_NAME    value passed as --task_name
+#   DATASET_DIR  value passed as --data_dir
+#   COMMAND...   launcher, interpreter and script to run
+run_benchmark() {
+    local task_name=$1
+    DATASET_DIR=$2
+    shift 2    # the remaining words are the command to launch
+
+    LOGFILE="${OUTPUT_DIR}/${task}_training_benchmark_bert_${bert_model}_gpu_${num_gpu}.log"
+
+    echo "Training performance benchmarking for BERT $bert_model from $BERT_DIR" >> $LOGFILE
+    echo "Precision Sequence Length   Batch size  Performance(sent/sec)" >> $LOGFILE
+
+    for seq_length in 128 512; do
+        for train_batch_size in 8 32 64; do
+            for precision in fp16 fp32; do
+                # use the loop variable train_batch_size so that every batch
+                # size gets its own output directory
+                res_dir=${OUTPUT_DIR}/bert_${bert_model}_gpu_${num_gpu}_sl_${seq_length}_prec_${precision}_bs_${train_batch_size}
+                mkdir -p ${res_dir}
+                tmp_file="${res_dir}/${task}_training_benchmark.log"
+
+                # an array expands to zero words for fp32, so no empty
+                # argument reaches the python script
+                precision_flags=()
+                if [ "$precision" = "fp16" ] ; then
+                    echo "fp16 activated!"
+                    precision_flags=(--use_fp16 --use_xla)
+                else
+                    echo "fp32 activated!"
+                fi
+
+                # $use_hvd is left unquoted so that it disappears when empty
+                "$@" \
+                --do_prepare=true \
+                --do_train=true \
+                --do_eval=true \
+                --do_predict=true \
+                --task_name="$task_name" \
+                --vocab_file=$BERT_DIR/vocab.txt \
+                --bert_config_file=$BERT_DIR/bert_config.json \
+                --init_checkpoint="$BERT_DIR/bert_model.ckpt" \
+                --num_train_epochs=$epochs \
+                --data_dir=$DATASET_DIR \
+                --output_dir=$res_dir \
+                --train_batch_size=$train_batch_size \
+                --max_seq_length=$seq_length \
+                $use_hvd "${precision_flags[@]}" $case_flag |& tee $tmp_file
+
+                # first reported throughput figure of the run
+                perf=`grep -F 'Throughput Average (sentences/sec) =' "$tmp_file" | head -1 | awk -F'= ' '{print $2}' | awk -F' sen' '{print $1}'`
+                echo "$precision  $seq_length  $train_batch_size $perf" >> $LOGFILE
+
+            done
+        done
+    done
+}
+
+if [ "$task" = "ner_bc5cdr-chem" ] ; then
+
+    run_benchmark bc5cdr /workspace/bert/data/biobert/BC5CDR/chem \
+        $mpi_command python /workspace/bert/run_ner.py
+
+elif [ "$task" = "ner_bc5cdr-disease" ] ; then
+
+    run_benchmark bc5cdr /workspace/bert/data/biobert/BC5CDR/disease \
+        $mpi_command python3 /workspace/bert/run_ner.py
+
+elif [ "$task" = "rel_chemprot" ] ; then
+
+    run_benchmark chemprot /workspace/bert/data/biobert/ChemProt \
+        $mpi_command python3 /workspace/bert/run_re.py
+
+else
+
+    echo "Benchmarking for " $task "currently not supported. Sorry!"
+
+fi
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+# Positional arguments, all optional.
+init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+train_batch_size=${2:-8}
+learning_rate=${3:-3.125e-6}
+cased=${4:-false}
+precision=${5:-"fp16"}
+use_xla=${6:-"true"}
+num_gpu=${7:-"16"}
+seq_length=${8:-128}
+bert_model=${9:-"base"}
+eval_batch_size=${10:-8} #Eval and Predict BS is assumed to be same
+epochs=${11:-"10.0"}
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+    case_flag="--do_lower_case=False"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+    case_flag="--do_lower_case=True"
+fi
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
+else
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
+fi
+
+
+# global batch size = per-GPU batch size times number of GPUs
+export GBS=$(expr $train_batch_size \* $num_gpu)
+printf -v TAG "tf_bert_biobert_ner_bc5cdr_chem_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+
+
+DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/chem
+OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+mkdir -p ${OUTPUT_DIR}
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+
+# multi-GPU runs are launched through mpirun with Horovod enabled
+if [ "$num_gpu" -gt 1 ] ; then
+    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+    use_hvd="--horovod"
+else
+    mpi_command=""
+    use_hvd=""
+fi
+
+# The launcher variable is $mpi_command; an undefined name here would expand
+# to nothing and start a single process even when --horovod is passed.
+$mpi_command python /workspace/bert/run_ner.py \
+  --do_prepare=true \
+  --do_train=true \
+  --do_eval=true \
+  --do_predict=true \
+  --task_name=bc5cdr \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --num_train_epochs=$epochs \
+  --data_dir=$DATASET_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --learning_rate=$learning_rate \
+  --train_batch_size=$train_batch_size \
+  --eval_batch_size=$eval_batch_size \
+  --predict_batch_size=$eval_batch_size \
+  --max_seq_length=$seq_length \
+  $use_hvd $use_fp16 $use_xla_tag $case_flag
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+train_batch_size=${2:-8}
+learning_rate=${3:-3.125e-6}
+cased=${4:-false}
+precision=${5:-"fp16"}
+use_xla=${6:-"true"}
+num_gpu=${7:-"16"}
+seq_length=${8:-128}
+bert_model=${9:-"base"}
+eval_batch_size=${10:-8} #Eval and Predict BS is assumed to be same
+epochs=${11:-"100.0"}
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+    case_flag="--do_lower_case=False"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+    case_flag="--do_lower_case=True"
+fi
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
+else
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
+fi
+
+export GBS=$(expr $train_batch_size \* $num_gpu)
+printf -v TAG "tf_bert_biobert_ner_bc5cdr_disease_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+
+
+
+DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
+OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+mkdir -p ${OUTPUT_DIR}
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+if [ $num_gpu -gt 1 ] ; then
+    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+    use_hvd="--horovod"
+else
+    mpi_command=""
+    use_hvd=""
+fi
+
+$mpi_command python3 /workspace/bert/run_ner.py \
+  --do_prepare=true \
+  --do_train=true \
+  --do_eval=true \
+  --do_predict=true \
+  --task_name="bc5cdr" \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --num_train_epochs=$epochs \
+  --data_dir=$DATASET_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --learning_rate=$learning_rate \
+  --train_batch_size=$train_batch_size \
+  --eval_batch_size=$eval_batch_size \
+  --predict_batch_size=$eval_batch_size \
+  --max_seq_length=$seq_length \
+  "$use_hvd" "$use_fp16" $use_xla_tag $case_flag
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+init_checkpoint=${1:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+train_batch_size=${2:-64}
+learning_rate=${3:-1.5e-6}
+cased=${4:-false}
+precision=${5:-"fp16"}
+use_xla=${6:-"true"}
+num_gpu=${7:-"16"}
+seq_length=${8:-512}
+bert_model=${9:-"base"}
+eval_batch_size=${10:-16} #Eval and Predict BS is assumed to be same
+epochs=${11:-"3.0"}
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+    case_flag="--do_lower_case=False"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+    case_flag="--do_lower_case=True"
+fi
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
+else
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
+fi
+
+export GBS=$(expr $train_batch_size \* $num_gpu)
+printf -v TAG "tf_bert_biobert_rel_chemprot_%s_%s_gbs%d" "$bert_model" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+
+
+DATASET_DIR=/workspace/bert/data/biobert/ChemProt
+OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+mkdir -p ${OUTPUT_DIR}
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+if [ $num_gpu -gt 1 ] ; then
+    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+    use_hvd="--horovod"
+else
+    mpi_command=""
+    use_hvd=""
+fi
+
+$mpi_command python3 /workspace/bert/run_re.py \
+  --do_prepare=true \
+  --do_train=true \
+  --do_eval=true \
+  --do_predict=true \
+  --task_name="chemprot" \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --num_train_epochs=$epochs \
+  --data_dir=$DATASET_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --learning_rate=$learning_rate \
+  --train_batch_size=$train_batch_size \
+  --eval_batch_size=$eval_batch_size \
+  --predict_batch_size=$eval_batch_size \
+  --max_seq_length=$seq_length \
+  "$use_hvd" "$use_fp16" $use_xla_tag $case_flag
+
+python3 /workspace/bert/biobert/re_eval.py --task=chemprot --output_path=$OUTPUT_DIR/test_results.tsv \
+  --answer_path=$DATASET_DIR/test.tsv |& tee $OUTPUT_DIR/test_results.txt
@@ -0,0 +1,87 @@
+#!/bin/bash
+#SBATCH --exclusive
+#SBATCH --mem=0
+#SBATCH --overcommit
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eux
+
+readonly docker_image="nvcr.io/nvidia/tensorflow:19.08-py3"
+readonly datadir="/raid/data/bert"
+readonly checkpointdir="$PWD/checkpoints"
+
+readonly mounts=".:/workspace/bert,${datadir}:/workspace/bert/data,${checkpointdir}:/results"
+
+DO_LOWER_CASE=${DO_LOWER_CASE:-1}
+if [ "$DO_LOWER_CASE" == "1" ]; then
+  CASING_DIR_PREFIX="uncased"
+else
+  CASING_DIR_PREFIX="cased"
+fi
+
+DO_BERT_BASE=${DO_BERT_BASE:-1}
+if [ "$DO_BERT_BASE" == "1" ]; then
+  CASING_DIR_SUFFIX="L-12_H-768_A-12"
+else
+  CASING_DIR_SUFFIX="L-24_H-1024_A-16"
+fi
+
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/biobert_phase_1"
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/biobert_phase_2"
+
+PHASE1="\
+     --train_batch_size=${BATCHSIZE:-128} \
+     --learning_rate=${LEARNING_RATE:-3.2e-5} \
+     --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-128} \
+     --input_files_dir=lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training \
+     --eval_files_dir=lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test \
+     --max_seq_length=128 \
+     --max_predictions_per_seq=20 \
+     --num_train_steps=19531 \
+     --num_warmup_steps=1953 \
+     --output_dir=/results/biobert_phase_1 \
+     "
+
+PHASE2="\
+     --train_batch_size=${BATCHSIZE:-16} \
+     --learning_rate=${LEARNING_RATE:-6.4e-5} \
+     --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-512} \
+     --input_files_dir=/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training \
+     --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test \
+     --max_seq_length=512 \
+     --max_predictions_per_seq=80 \
+     --num_train_steps=4340 \
+     --num_warmup_steps=434 \
+     --output_dir=/results/biobert_phase_2 \
+     --init_checkpoint=/results/biobert_phase_1/model.ckpt-19531 \
+    "
+
+PHASES=( "$PHASE1" "$PHASE2" )
+
+PHASE=${PHASE:-1}
+
+BERT_CMD="\
+    python /workspace/bert/run_pretraining.py \
+     ${PHASES[$((PHASE-1))]} \
+     --bert_config_file=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_${CASING_DIR_SUFFIX}/bert_config.json \
+     --vocab_file=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_${CASING_DIR_SUFFIX}/vocab.txt \
+     --do_train=True \
+     --do_eval=True \
+     --save_checkpoints_steps=5000 \
+     --horovod --use_fp16 --use_xla \
+     --allreduce_post_accumulation=True \
+     --eval_batch_size=8"
+
+srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${BERT_CMD}"
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+task=${1:-"ner_bc5cdr-chem"}
+init_checkpoint=${2:-"/results/biobert_tf_uncased_base/model.ckpt-4340"}
+bert_model=${3:-"base"}
+cased=${4:-"false"}
+precision=${5:-"fp16"}
+use_xla=${6:-"true"}
+batch_size=${7:-"16"}
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+    case_flag="--do_lower_case=False"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+    case_flag="--do_lower_case=True"
+fi
+
+if [ "$bert_model" = "large" ] ; then
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-24_H-1024_A-16
+else
+    export BERT_DIR=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12
+fi
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+
+if [ "$task" = "ner_bc5cdr-chem" ] ; then
+
+  printf -v TAG "tf_bert_biobert_ner_bc5cdr_chem_inference_%s_%s" "$bert_model" "$precision"
+  DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/chem
+  OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+
+  python /workspace/bert/run_ner.py \
+  --do_prepare=true \
+  --do_eval=true \
+  --do_predict=true \
+  --task_name="bc5cdr" \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --data_dir=$DATASET_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --eval_batch_size=$batch_size \
+  --predict_batch_size=$batch_size \
+  --max_seq_length=128 \
+  $use_fp16 $use_xla_tag $case_flag
+
+elif [ "$task" = "ner_bc5cdr-disease" ] ; then
+  printf -v TAG "tf_bert_biobert_ner_bc5cdr_disease_inference_%s_%s" "$bert_model" "$precision"
+  DATASET_DIR=/workspace/bert/data/biobert/BC5CDR/disease
+  OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+
+  python3 /workspace/bert/run_ner.py \
+  --do_prepare=true \
+  --do_eval=true \
+  --do_predict=true \
+  --task_name="bc5cdr" \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --data_dir=$DATASET_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --eval_batch_size=$batch_size \
+  --predict_batch_size=$batch_size \
+  --max_seq_length=128 \
+  "$use_fp16" $use_xla_tag $case_flag
+
+elif [ "$task" = "rel_chemprot" ] ; then
+  printf -v TAG "tf_bert_biobert_rel_chemprot_inference_%s_%s_" "$bert_model" "$precision"
+  DATASET_DIR=/workspace/bert/data/biobert/ChemProt
+  OUTPUT_DIR=/results/${TAG}_${DATESTAMP}
+
+  python3 /workspace/bert/run_re.py \
+  --do_prepare=true \
+  --do_eval=true \
+  --do_predict=true \
+  --task_name="chemprot" \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_checkpoint=$init_checkpoint \
+  --data_dir=$DATASET_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --eval_batch_size=$batch_size \
+  --predict_batch_size=$batch_size \
+  --max_seq_length=512 \
+  "$use_fp16" $use_xla_tag $case_flag
+
+  python3 /workspace/bert/biobert/re_eval.py --task=chemprot --output_path=$OUTPUT_DIR/test_results.tsv \
+  --answer_path=$DATASET_DIR/test.tsv |& tee $OUTPUT_DIR/test_results.txt
+
+else
+
+    echo "Benchmarking for " $task "currently not supported. Sorry!"
+
+fi
@@ -0,0 +1,87 @@
+#! /bin/bash
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+train_batch_size=${1:-128}
+learning_rate=${2:-"9.625e-5"}
+cased=${3:-false}
+precision=${4:-"fp16"}
+use_xla=${5:-"true"}
+num_gpus=${6:-16}
+warmup_steps=${7:-"1953"}
+train_steps=${8:-19531}
+num_accumulation_steps=${9:-32}
+save_checkpoint_steps=${10:-5000}
+eval_batch_size=${11:-80}
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+fi
+
+BERT_CONFIG=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12/bert_config.json
+RESULTS_DIR=/results
+CHECKPOINTS_DIR=${RESULTS_DIR}/biobert_phase_1
+mkdir -p ${CHECKPOINTS_DIR}
+
+INIT_CHECKPOINT=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12/bert_model.ckpt
+
+INPUT_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training"
+EVAL_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test"
+
+
+if [ $num_gpu -gt 1 ] ; then
+    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+    use_hvd="--horovod"
+else
+    mpi_command=""
+    use_hvd=""
+fi
+
+
+export GBS=$(expr $train_batch_size \* $num_gpus \* num_accumulation_steps)
+printf -v TAG "tf_bert_bio_1n_phase1_cased_%s_%s_gbs%d" "$cased" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+printf "Logs written to %s\n" "$LOGFILE"
+
+
+$mpi python3 /workspace/bert/run_pretraining.py \
+ --input_files_dir=$INPUT_FILES_DIR \
+ --eval_files_dir=$EVAL_FILES_DIR \
+ --output_dir=$CHECKPOINTS_DIR \
+ --bert_config_file=$BERT_CONFIG \
+ --do_train=True \
+ --do_eval=True \
+ --train_batch_size=$train_batch_size \
+ --eval_batch_size=$eval_batch_size \
+ --max_seq_length=128 \
+ --max_predictions_per_seq=20 \
+ --num_train_steps=$train_steps \
+ --num_warmup_steps=$warmup_steps \
+ --save_checkpoints_steps=$save_checkpoint_steps \
+ --num_accumulation_steps=$num_accumulation_steps \
+ --learning_rate=$learning_rate \
+ --report_loss \
+ --$use_hvd $use_fp16 $use_xla_tag \
+ --init_checkpoint=$INIT_CHECKPOINT |& tee $LOGFILE
@@ -0,0 +1,85 @@
+#! /bin/bash
+
+echo "Container nvidia build = " $NVIDIA_BUILD_ID
+
+init_checkpoint=${1}
+train_batch_size=${2:-16}
+learning_rate=${3:-"2.9e-4"}
+cased=${4:-false}
+precision=${5:-"fp16"}
+use_xla=${6:-true}
+num_gpus=${7:-16}
+warmup_steps=${8:-"434"}
+train_steps=${9:-4340}
+num_accumulation_steps=${10:-128}
+save_checkpoint_steps=${11:-5000}
+eval_batch_size=${12:-26}
+
+
+use_fp16=""
+if [ "$precision" = "fp16" ] ; then
+        echo "fp16 activated!"
+        use_fp16="--use_fp16"
+fi
+
+if [ "$use_xla" = "true" ] ; then
+    use_xla_tag="--use_xla"
+    echo "XLA activated"
+else
+    use_xla_tag=""
+fi
+
+if [ "$cased" = "true" ] ; then
+    DO_LOWER_CASE=0
+    CASING_DIR_PREFIX="cased"
+else
+    DO_LOWER_CASE=1
+    CASING_DIR_PREFIX="uncased"
+fi
+
+BERT_CONFIG=/workspace/bert/data/download/google_pretrained_weights/${CASING_DIR_PREFIX}_L-12_H-768_A-12/bert_config.json
+RESULTS_DIR=/results
+CHECKPOINTS_DIR=${RESULTS_DIR}/biobert_phase_2
+mkdir -p ${CHECKPOINTS_DIR}
+
+INPUT_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/training"
+EVAL_FILES_DIR="/workspace/bert/data/tfrecord/lower_case_${DO_LOWER_CASE}_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/pubmed_baseline/test"
+
+if [ $num_gpu -gt 1 ] ; then
+    mpi_command="mpirun -np $num_gpu -H localhost:$num_gpu \
+    --allow-run-as-root -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH -mca pml ob1 -mca btl ^openib"
+    use_hvd="--horovod"
+else
+    mpi_command=""
+    use_hvd=""
+fi
+
+export GBS=$(expr $train_batch_size \* $num_gpus \* num_accumulation_steps)
+printf -v TAG "tf_bert_bio_1n_phase2_cased_%s_%s_gbs%d" "$cased" "$precision" $GBS
+DATESTAMP=`date +'%y%m%d%H%M%S'`
+LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
+printf "Logs written to %s\n" "$LOGFILE"
+
+
+$mpi python3 /workspace/bert/run_pretraining.py \
+ --input_files_dir=$INPUT_FILES_DIR \
+ --eval_files_dir=$EVAL_FILES_DIR \
+ --output_dir=$CHECKPOINTS_DIR \
+ --bert_config_file=$BERT_CONFIG \
+ --do_train=True \
+ --do_eval=True \
+ --train_batch_size=$train_batch_size \
+ --eval_batch_size=$eval_batch_size \
+ --max_seq_length=512 \
+ --max_predictions_per_seq=80 \
+ --num_train_steps=$train_steps \
+ --num_warmup_steps=$warmup_steps \
+ --save_checkpoints_steps=$save_checkpoint_steps \
+ --num_accumulation_steps=$num_accumulation_steps \
+ --learning_rate=$learning_rate \
+ --report_loss \
+ --$use_hvd $use_xla_tag $use_fp16 \
+ --init_checkpoint=$INIT_CHECKPOINT |& tee $LOGFILE
@@ -0,0 +1,206 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#1 DGX1 phase1
+bert--DGX1:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "1"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "7.5e-4"
+    NUM_ACCUMULATION_STEPS: "1024"
+    PHASE: "1"
+
+#4 DGX1 phase1
+bert--DGX1_n4:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "4"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "1.875e-4"
+    NUM_ACCUMULATION_STEPS: "256"
+    PHASE: "1"
+
+#16 DGX1 phase1
+bert--DGX1_n16:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "16"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "4.6875e-5"
+    NUM_ACCUMULATION_STEPS: "64"
+    PHASE: "1"
+
+#32 DGX1 phase1
+bert--DGX1_n32:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "32"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "2.34375e-5"
+    NUM_ACCUMULATION_STEPS: "32"
+    PHASE: "1"
+
+#1 DGX2 phase1
+bert--DGX2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "1"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "3.75e-4"
+    NUM_ACCUMULATION_STEPS: "128"
+    PHASE: "1"
+
+#4 DGX2 phase1
+bert--DGX2_n4:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "4"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "9.375e-5"
+    NUM_ACCUMULATION_STEPS: "32"
+    PHASE: "1"
+
+#16 DGX2 phase1
+bert--DGX2_n16:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "16"
+    BATCHSIZE: "256"
+    LEARNING_RATE: "3.75e-4"
+    NUM_ACCUMULATION_STEPS: "4"
+    PHASE: "1"
+
+#32 DGX2 phase1
+bert--DGX2_n32:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "32"
+    BATCHSIZE: "32"
+    LEARNING_RATE: "2.34375e-5"
+    NUM_ACCUMULATION_STEPS: "8"
+    PHASE: "1"
+
+#1 DGX1 phase2
+bert--DGX1_n1p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "1"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "5e-4"
+    NUM_ACCUMULATION_STEPS: "4096"
+    PHASE: "2"
+
+#4 DGX1 phase2
+bert--DGX1_n4p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "4"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "1.25e-4"
+    NUM_ACCUMULATION_STEPS: "512"
+    PHASE: "2"
+
+#16 DGX1 phase2
+bert--DGX1_n16p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "16"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "1.5625e-5"
+    NUM_ACCUMULATION_STEPS: "128"
+    PHASE: "2"
+
+#32 DGX1 phase2
+bert--DGX1_n32p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX1
+  variables:
+    <<: *DGX1_VARS
+    NNODES: "32"
+    BATCHSIZE: "2"
+    LEARNING_RATE: "1.5625e-5"
+    NUM_ACCUMULATION_STEPS: "64"
+    PHASE: "2"
+
+#1 DGX2 phase2
+bert--DGX2_n1p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "1"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "2.5e-5"
+    NUM_ACCUMULATION_STEPS: "256"
+    PHASE: "2"
+
+#4 DGX2 phase2
+bert--DGX2_n4p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "4"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "6.25e-5"
+    NUM_ACCUMULATION_STEPS: "64"
+    PHASE: "2"
+
+#16 DGX2 phase2
+bert--DGX2_n16p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "16"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "1.5625e-5"
+    NUM_ACCUMULATION_STEPS: "16"
+    PHASE: "2"
+
+#32 DGX2 phase2
+bert--DGX2_n32p2:
+  <<: *BERT_ON_CLUSTER
+  <<: *DGX2
+  variables:
+    <<: *DGX2_VARS
+    NNODES: "32"
+    BATCHSIZE: "8"
+    LEARNING_RATE: "7.8125e-6"
+    NUM_ACCUMULATION_STEPS: "8"
+    PHASE: "2"
+
@@ -0,0 +1,26 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+
+class BooksDownloader:
+    def __init__(self, save_path):
+        self.save_path = save_path
+        pass
+
+
+    def download(self):
+        bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out'
+        bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
+        bookscorpus_download_command += ' --trash-bad-count'
+        bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True)
@@ -0,0 +1,32 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class BookscorpusTextFormatting:
+    def __init__(self, books_path, output_filename, recursive = False):
+        self.books_path = books_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one book per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True):
+                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
+                    for line in file:
+                        if line.strip() != '':
+                            ofile.write(line.strip() + ' ')
+                ofile.write("\n\n")
@@ -0,0 +1,120 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
+from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
+from WikiDownloader import WikiDownloader
+from BooksDownloader import BooksDownloader
+from GLUEDownloader import GLUEDownloader
+from SquadDownloader import SquadDownloader
+from PubMedDownloader import PubMedDownloader
+
+class Downloader:
+    def __init__(self, dataset_name, save_path):
+        self.dataset_name = dataset_name
+        self.save_path = save_path
+
+
+    def download(self):
+        if self.dataset_name == 'bookscorpus':
+            self.download_bookscorpus()
+
+        elif self.dataset_name == 'wikicorpus_en':
+            self.download_wikicorpus('en')
+
+        elif self.dataset_name == 'wikicorpus_zh':
+            self.download_wikicorpus('zh')
+
+        elif self.dataset_name == 'pubmed_baseline':
+            self.download_pubmed('baseline')
+
+        elif self.dataset_name == 'pubmed_daily_update':
+            self.download_pubmed('daily_update')
+
+        elif self.dataset_name == 'pubmed_fulltext':
+            self.download_pubmed('fulltext')
+
+        elif self.dataset_name == 'pubmed_open_access':
+            self.download_pubmed('open_access')
+
+        elif self.dataset_name == 'google_pretrained_weights':
+            self.download_google_pretrained_weights()
+
+        elif self.dataset_name == 'nvidia_pretrained_weights':
+            self.download_nvidia_pretrained_weights()
+
+        elif self.dataset_name == 'MRPC':
+            self.download_glue(self.dataset_name)
+
+        elif self.dataset_name == 'MNLI':
+            self.download_glue(self.dataset_name)
+
+        elif self.dataset_name == 'CoLA':
+            self.download_glue(self.dataset_name)
+
+        elif self.dataset_name == 'squad':
+            self.download_squad()
+
+        elif self.dataset_name == 'all':
+            self.download_bookscorpus()
+            self.download_wikicorpus('en')
+            self.download_wikicorpus('zh')
+            self.download_pubmed('baseline')
+            self.download_pubmed('daily_update')
+            self.download_pubmed('fulltext')
+            self.download_pubmed('open_access')
+            self.download_google_pretrained_weights()
+            self.download_nvidia_pretrained_weights()
+            self.download_glue("CoLA")
+            self.download_glue("MNLI")
+            self.download_glue("MRPC")
+            self.download_squad()
+
+        else:
+            print(self.dataset_name)
+            assert False, 'Unknown dataset_name provided to downloader'
+
+
+    def download_bookscorpus(self):
+        downloader = BooksDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_wikicorpus(self, language):
+        downloader = WikiDownloader(language, self.save_path)
+        downloader.download()
+
+
+    def download_pubmed(self, subset):
+        downloader = PubMedDownloader(subset, self.save_path)
+        downloader.download()
+
+
+    def download_google_pretrained_weights(self):
+        downloader = GooglePretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_nvidia_pretrained_weights(self):
+        downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_glue(self, glue_task_name):
+        downloader = GLUEDownloader(glue_task_name, self.save_path)
+        downloader.download()
+
+
+    def download_squad(self):
+        downloader = SquadDownloader(self.save_path)
+        downloader.download()
@@ -0,0 +1,109 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib
+import sys
+import zipfile
+import io
+
+URLLIB=urllib
+if sys.version_info >= (3, 0):
+    URLLIB=urllib.request
+
+class GLUEDownloader:
+    def __init__(self, task, save_path):
+
+        # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
+
+        self.TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
+                     "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
+                     "MRPC":{"mrpc_dev": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
+                            "mrpc_train": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt',
+                            "mrpc_test": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'},
+                     "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
+                     "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
+                     "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
+                     "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
+                     "QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
+                     "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
+                     "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
+                     "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
+
+
+        self.save_path = save_path
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.task = task
+
+    def download(self):
+
+        if self.task == 'MRPC':
+            self.download_mrpc()
+        elif self.task == 'diagnostic':
+            self.download_diagnostic()
+        else:
+            self.download_and_extract(self.task)
+
+    def download_and_extract(self, task):
+        print("Downloading and extracting %s..." % task)
+        data_file = "%s.zip" % task
+        URLLIB.urlretrieve(self.TASK2PATH[task], data_file)
+        print(data_file,"\n\n\n")
+        with zipfile.ZipFile(data_file) as zip_ref:
+            zip_ref.extractall(self.save_path)
+        os.remove(data_file)
+        print("\tCompleted!")
+
+    def download_mrpc(self):
+        print("Processing MRPC...")
+        mrpc_dir = os.path.join(self.save_path, "MRPC")
+        if not os.path.isdir(mrpc_dir):
+            os.mkdir(mrpc_dir)
+
+        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+        mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
+        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+
+        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_train"], mrpc_train_file)
+        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_test"], mrpc_test_file)
+        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_dev"], mrpc_dev_file)
+
+        dev_ids = []
+        with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
+            for row in ids_fh:
+                dev_ids.append(row.strip().split('\t'))
+
+        with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
+                io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
+                io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
+            header = data_fh.readline()
+            train_fh.write(header)
+            dev_fh.write(header)
+            for row in data_fh:
+                label, id1, id2, s1, s2 = row.strip().split('\t')
+                if [id1, id2] in dev_ids:
+                    dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+                else:
+                    train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+
+        with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
+                io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
+            header = data_fh.readline()
+            test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+            for idx, row in enumerate(data_fh):
+                label, id1, id2, s1, s2 = row.strip().split('\t')
+                test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+        print("\tCompleted!")
@@ -0,0 +1,158 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hashlib
+import os
+import urllib.request
+import zipfile
+
+class GooglePretrainedWeightDownloader:
+    def __init__(self, save_path):
+        self.save_path = save_path + '/google_pretrained_weights'
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        # Download urls
+        self.model_urls = {
+            'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
+            'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
+            'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
+            'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
+            'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
+            'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
+            'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
+        }
+
+        # SHA256sum verification for file download integrity (and checking for changes from the download source over time)
+        self.bert_base_uncased_sha = {
+            'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
+            'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
+            'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
+            'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
+            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
+        }
+
+        self.bert_large_uncased_sha = {
+            'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
+            'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
+            'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
+            'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
+            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
+        }
+
+        self.bert_base_cased_sha = {
+            'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
+            'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
+            'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
+            'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
+            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
+        }
+
+        self.bert_large_cased_sha = {
+            'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
+            'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
+            'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
+            'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
+            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
+        }
+
+        self.bert_base_multilingual_cased_sha = {
+            'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
+            'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
+            'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
+            'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
+            'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
+        }
+
+        self.bert_large_multilingual_uncased_sha = {
+            'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
+            'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
+            'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
+            'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
+            'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
+        }
+
+        self.bert_base_chinese_sha = {
+            'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
+            'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
+            'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
+            'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
+            'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
+        }
+
+        # Relate SHA to urls for loop below
+        self.model_sha = {
+            'bert_base_uncased': self.bert_base_uncased_sha,
+            'bert_large_uncased': self.bert_large_uncased_sha,
+            'bert_base_cased': self.bert_base_cased_sha,
+            'bert_large_cased': self.bert_large_cased_sha,
+            'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha,
+            'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha,
+            'bert_base_chinese': self.bert_base_chinese_sha
+        }
+
+    # Helper to get sha256sum of a file
+    def sha256sum(self, filename):
+      h  = hashlib.sha256()
+      b  = bytearray(128*1024)
+      mv = memoryview(b)
+      with open(filename, 'rb', buffering=0) as f:
+        for n in iter(lambda : f.readinto(mv), 0):
+          h.update(mv[:n])
+
+      return h.hexdigest()
+
+    def download(self):
+        # Iterate over urls: download, unzip, verify sha256sum
+        found_mismatch_sha = False
+        for model in self.model_urls:
+          url = self.model_urls[model][0]
+          file = self.save_path + '/' + self.model_urls[model][1]
+
+          print('Downloading', url)
+          response = urllib.request.urlopen(url)
+          with open(file, 'wb') as handle:
+            handle.write(response.read())
+
+          print('Unzipping', file)
+          zip = zipfile.ZipFile(file, 'r')
+          zip.extractall(self.save_path)
+          zip.close()
+
+          sha_dict = self.model_sha[model]
+          for extracted_file in sha_dict:
+            sha = sha_dict[extracted_file]
+            if sha != self.sha256sum(file[:-4] + '/' + extracted_file):
+              found_mismatch_sha = True
+              print('SHA256sum does not match on file:', extracted_file, 'from download url:', url)
+            else:
+              print(file[:-4] + '/' + extracted_file, '\t', 'verified')
+
+        if not found_mismatch_sha:
+          print("All downloads pass sha256sum verification.")
+
+    def serialize(self):
+        pass
+
+    def deserialize(self):
+        pass
+
+    def listAvailableWeights(self):
+        print("Available Weight Datasets")
+        for item in self.model_urls:
+            print(item)
+
+    def listLocallyStoredWeights(self):
+        pass
+
@@ -0,0 +1,27 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+class NVIDIAPretrainedWeightDownloader:
+    def __init__(self, save_path):
+        self.save_path = save_path + '/nvidia_pretrained_weights'
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        pass
+
+
+    def download(self):
+        assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.'
@@ -0,0 +1,93 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import glob
+import gzip
+import os
+import urllib.request
+import shutil
+import sys
+
+class PubMedDownloader:
+    def __init__(self, subset, save_path):
+        self.subset = subset
+        # Modifying self.save_path in two steps to handle creation of subdirectories
+        self.save_path = save_path + '/pubmed' + '/'
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.save_path = self.save_path + '/' + subset
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.download_urls = {
+            'baseline' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/',
+            'daily_update' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/',
+            'fulltext' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/',
+            'open_access' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/'
+        }
+
+
+    def download(self):
+        print('subset:', self.subset)
+        url = self.download_urls[self.subset]
+        self.download_files(url)
+        self.extract_files()
+
+
+    def download_files(self, url):
+        url = self.download_urls[self.subset]
+        output = os.popen('curl ' + url).read()
+
+        if self.subset == 'fulltext' or self.subset == 'open_access':
+            line_split = 'comm_use' if self.subset == 'fulltext' else 'non_comm_use'
+            for line in output.splitlines():
+                if line[-10:] == 'xml.tar.gz' and \
+                        line.split(' ')[-1].split('.')[0] == line_split:
+                    file = os.path.join(self.save_path, line.split(' ')[-1])
+                    if not os.path.isfile(file):
+                        print('Downloading', file)
+                        response = urllib.request.urlopen(url + line.split(' ')[-1])
+                        with open(file, "wb") as handle:
+                            handle.write(response.read())
+
+        elif self.subset == 'baseline' or self.subset == 'daily_update':
+            for line in output.splitlines():
+                if line[-3:] == '.gz':
+                    file = os.path.join(self.save_path, line.split(' ')[-1])
+                    if not os.path.isfile(file):
+                        print('Downloading', file)
+                        response = urllib.request.urlopen(url + line.split(' ')[-1])
+                        with open(file, "wb") as handle:
+                            handle.write(response.read())
+        else:
+            assert False, 'Invalid PubMed dataset/subset specified.'
+
+    def extract_files(self):
+        files = glob.glob(self.save_path + '/*.xml.gz')
+
+        for file in files:
+            print('file:', file)
+            input = gzip.GzipFile(file, mode='rb')
+            s = input.read()
+            input.close()
+
+            out = open(file[:-3], mode='wb')
+            out.write(s)
+            out.close()
+
+
+
@@ -0,0 +1,44 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+import pubmed_parser as pmp
+
+class PubMedTextFormatting:
+    def __init__(self, pubmed_path, output_filename, recursive = False):
+        self.pubmed_path = pubmed_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        print('PubMed path:', self.pubmed_path)
+
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for filename in glob.glob(self.pubmed_path + '/*.xml*', recursive=self.recursive):
+                print('file:', filename)
+                dicts_out = pmp.parse_medline_xml(filename)
+                for dict_out in dicts_out:
+                    if not dict_out['abstract']:
+                        continue
+                    try:
+                        for line in dict_out['abstract'].splitlines():
+                            if len(line) < 30:
+                                continue
+                            ofile.write(line.strip() + " ")
+                        ofile.write("\n\n")
+                    except:
+                        ofile.write("\n\n")
+                        continue
@@ -0,0 +1,32 @@
+Steps to reproduce datasets from web
+
+1) Build the container
+  * docker build -t bert_tf .
+2) Run the container interactively
+  * nvidia-docker run -it --ipc=host bert_tf
+  * Optional: Mount data volumes
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/download
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/extracted_articles
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/raw_data
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/intermediate_files
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_file_single
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_files_sharded
+    * -v yourpath:/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
+    * -v yourpath:/workspace/bert/data/bookcorpus/download
+    * -v yourpath:/workspace/bert/data/bookcorpus/final_text_file_single
+    * -v yourpath:/workspace/bert/data/bookcorpus/final_text_files_sharded
+    * -v yourpath:/workspace/bert/data/bookcorpus/final_tfrecords_sharded
+  * Optional: Select visible GPUs
+    * -e CUDA_VISIBLE_DEVICES=0
+
+**Inside of the container, starting here**
+3) Download pretrained weights (they contain vocab files for preprocessing)
+  * cd data/pretrained_models_google && python3 download_models.py
+4) "One-click" SQuAD download
+  * cd /workspace/bert/data/squad && . squad_download.sh
+5) "One-click" Wikipedia data download and prep (provides tfrecords)
+  * Set your configuration in data/wikipedia_corpus/config.sh
+  * cd /data/wikipedia_corpus && ./run_preprocessing.sh
+6) "One-click" BookCorpus data download and prep (provides tfrecords)
+  * Set your configuration in data/wikipedia_corpus/config.sh
+  * cd /data/bookcorpus && ./run_preprocessing.sh
@@ -0,0 +1,54 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+
+class SquadDownloader:
+    def __init__(self, save_path):
+        self.save_path = save_path + '/squad'
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        if not os.path.exists(self.save_path + '/v1.1'):
+            os.makedirs(self.save_path + '/v1.1')
+
+        if not os.path.exists(self.save_path + '/v2.0'):
+            os.makedirs(self.save_path + '/v2.0')
+
+        self.download_urls = {
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
+            'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
+            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
+            'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
+        }
+
+    def download(self):
+        for item in self.download_urls:
+            url = item
+            file = self.download_urls[item]
+
+            print('Downloading:', url)
+            if os.path.isfile(self.save_path + '/' + file):
+                print('** Download file already exists, skipping download')
+            else:
+                response = urllib.request.urlopen(url)
+                with open(self.save_path + '/' + file, "wb") as handle:
+                    handle.write(response.read())
+
+
@@ -0,0 +1,331 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from itertools import islice
+
+import multiprocessing
+import os
+import statistics
+
+class Sharding:
+    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
+        """Store the sharding configuration and register the empty output shards.
+
+        Args:
+            input_files: Non-empty list of input text file paths.
+            output_name_prefix: Prefix used to build every shard file name.
+            n_training_shards: Number of training shards (must be > 0).
+            n_test_shards: Number of test shards (must be > 0).
+            fraction_test_set: Fraction of all sentences targeted at the test shards.
+        """
+        assert len(input_files) > 0, 'The input file list must contain at least one file.'
+        assert n_training_shards > 0, 'There must be at least one output shard.'
+        assert n_test_shards > 0, 'There must be at least one output shard.'
+
+        # What to read and how to split it
+        self.input_files = input_files
+        self.fraction_test_set = fraction_test_set
+        self.n_training_shards, self.n_test_shards = n_training_shards, n_test_shards
+
+        # Pieces of the shard file names: <prefix><identifier>_<index><extension>
+        self.output_name_prefix = output_name_prefix
+        self.output_training_identifier, self.output_test_identifier = '_training', '_test'
+        self.output_file_extension = '.txt'
+
+        # Working storage, filled by the later pipeline steps
+        self.articles = dict()                 # key: integer identifier, value: article text (one input line)
+        self.sentences = dict()                # key: integer identifier, value: list of sentences
+        self.output_training_files = dict()    # key: filename, value: list of article ids to go into file
+        self.output_test_files = dict()        # key: filename, value: list of article ids to go into file
+
+        self.init_output_files()
+
+
+    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
+    def load_articles(self):
+        """Read all input files into self.articles, one article per non-blank line.
+
+        Articles are numbered consecutively from 0 across all input files;
+        trailing whitespace is removed from each stored line.
+        """
+        print('Start: Loading Articles')
+
+        next_article_id = 0
+        for path in self.input_files:
+            print('input file:', path)
+            with open(path, mode='r', newline='\n') as handle:
+                for raw_line in handle:
+                    # Whitespace-only lines are separators, not articles
+                    if not raw_line.strip():
+                        continue
+                    self.articles[next_article_id] = raw_line.rstrip()
+                    next_article_id += 1
+
+        print('End: Loading Articles: There are', len(self.articles), 'articles.')
+
+
+    def segment_articles_into_sentences(self, segmenter):
+        """Split every loaded article into sentences and store them in self.sentences.
+
+        If self.articles is still empty, load_articles() is called first.
+
+        Args:
+            segmenter: Object exposing ``segment_string(article)``; its return
+                value is stored as the sentence list of that article.
+
+        Raises:
+            AssertionError: If there are still no articles after loading.
+        """
+        print('Start: Sentence Segmentation')
+        # Compare by value: `len(x) is 0` is an identity test on ints and
+        # triggers a SyntaxWarning on Python 3.8+.
+        if len(self.articles) == 0:
+            self.load_articles()
+
+        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'
+
+        # TODO: multiprocessing (create independent ranges and spawn processes).
+        # Segmentation currently runs serially in the calling process.
+        for i, article_id in enumerate(self.articles):
+            # Key by article id so self.sentences always shares its keys with
+            # self.articles (for load_articles() output this equals the index).
+            self.sentences[article_id] = segmenter.segment_string(self.articles[article_id])
+
+            if i % 5000 == 0:
+                print('Segmenting article', i)
+
+        print('End: Sentence Segmentation')
+
+
+    def init_output_files(self):
+        """Register one empty article list per training and test shard file name.
+
+        File names are ``<prefix><identifier>_<index><extension>``. Called by
+        the constructor; both shard dictionaries must be empty on entry.
+
+        Raises:
+            AssertionError: If either shard dictionary already holds entries.
+        """
+        print('Start: Init Output Files')
+        # Value comparison instead of `is 0` (identity test on an int literal)
+        assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
+        assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
+
+        for i in range(self.n_training_shards):
+            name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
+            self.output_training_files[name] = []
+
+        for i in range(self.n_test_shards):
+            name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
+            self.output_test_files[name] = []
+
+        print('End: Init Output Files')
+
+
+    def get_sentences_per_shard(self, shard):
+        """Return the total number of sentences over all article ids in ``shard``.
+
+        Args:
+            shard: Iterable of article ids that are keys of self.sentences.
+        """
+        return sum(len(self.sentences[article_id]) for article_id in shard)
+
+
+    def distribute_articles_over_shards(self):
+        """Assign every segmented article to exactly one training or test shard.
+
+        The goal is a roughly equal number of sentences per shard. The first
+        pass seeds each shard with the largest remaining article. Later passes
+        give each shard at most one more article per pass, choosing the largest
+        article that still fits under the shard's nominal sentence budget.
+        If the number of unplaced articles stops changing, the training budget
+        is raised by one sentence per pass until placement resumes.
+
+        Results are appended to the lists in self.output_training_files and
+        self.output_test_files.
+
+        Raises:
+            AssertionError: If there are fewer articles than shards.
+        """
+        print('Start: Distribute Articles Over Shards')
+        assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'
+
+        # Create dictionary with - key: sentence count per article, value: list of article ids
+        sentence_counts = defaultdict(list)
+
+        max_sentences = 0
+        total_sentences = 0
+
+        for article_id in self.sentences:
+            current_length = len(self.sentences[article_id])
+            sentence_counts[current_length].append(article_id)
+            max_sentences = max(max_sentences, current_length)
+            total_sentences += current_length
+
+        n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
+        nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
+        nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards
+
+        consumed_article_set = set()
+        unused_article_set = set(self.articles.keys())
+
+        # Make first pass and add one article worth of lines per file
+        for file in self.output_training_files:
+            current_article_id = sentence_counts[max_sentences][-1]
+            sentence_counts[max_sentences].pop(-1)
+            self.output_training_files[file].append(current_article_id)
+            consumed_article_set.add(current_article_id)
+            unused_article_set.remove(current_article_id)
+
+            # Maintain the max sentence count
+            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                max_sentences -= 1
+
+            # A single oversized article defines the minimum feasible budget
+            if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
+                nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
+                print('Warning: A single article contains more than the nominal number of sentences per training shard.')
+
+        for file in self.output_test_files:
+            current_article_id = sentence_counts[max_sentences][-1]
+            sentence_counts[max_sentences].pop(-1)
+            self.output_test_files[file].append(current_article_id)
+            consumed_article_set.add(current_article_id)
+            unused_article_set.remove(current_article_id)
+
+            # Maintain the max sentence count
+            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                max_sentences -= 1
+
+            if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
+                nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
+                print('Warning: A single article contains more than the nominal number of sentences per test shard.')
+
+        training_counts = []
+        test_counts = []
+
+        for shard in self.output_training_files:
+            training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
+
+        for shard in self.output_test_files:
+            test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
+
+        training_median = statistics.median(training_counts)
+        test_median = statistics.median(test_counts)
+
+        # Make subsequent passes over files to find articles to add without going over limit
+        history_remaining = []    # unplaced-article counts of the most recent passes
+        n_history_remaining = 4
+
+        while len(consumed_article_set) < len(self.articles):
+            for fidx, file in enumerate(self.output_training_files):
+                # Largest article size that still fits into this shard's budget
+                nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)
+
+                # Maintain the max sentence count
+                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                    max_sentences -= 1
+
+                # Step down to the nearest size for which an article is still available
+                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
+                    nominal_next_article_size -= 1
+
+                # `== 0` rather than `is 0`: value comparison, not object identity
+                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median:
+                    continue    # skip adding to this file, will come back later if no file can accept unused articles
+
+                current_article_id = sentence_counts[nominal_next_article_size][-1]
+                sentence_counts[nominal_next_article_size].pop(-1)
+
+                self.output_training_files[file].append(current_article_id)
+                consumed_article_set.add(current_article_id)
+                unused_article_set.remove(current_article_id)
+
+            for fidx, file in enumerate(self.output_test_files):
+                nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)
+
+                # Maintain the max sentence count
+                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
+                    max_sentences -= 1
+
+                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
+                    nominal_next_article_size -= 1
+
+                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median:
+                    continue    # skip adding to this file, will come back later if no file can accept unused articles
+
+                current_article_id = sentence_counts[nominal_next_article_size][-1]
+                sentence_counts[nominal_next_article_size].pop(-1)
+
+                self.output_test_files[file].append(current_article_id)
+                consumed_article_set.add(current_article_id)
+                unused_article_set.remove(current_article_id)
+
+            # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
+            if len(history_remaining) == n_history_remaining:
+                history_remaining.pop(0)
+            history_remaining.append(len(unused_article_set))
+
+            history_same = True
+            for i in range(1, len(history_remaining)):
+                history_same = history_same and (history_remaining[i-1] == history_remaining[i])
+
+            # Only the training budget is raised; the test-shard bump is disabled.
+            if history_same:
+                nominal_sentences_per_training_shard += 1
+                # nominal_sentences_per_test_shard += 1
+
+            # Per-shard counts are refreshed once per pass, not after every placement
+            training_counts = []
+            test_counts = []
+            for shard in self.output_training_files:
+                training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
+
+            for shard in self.output_test_files:
+                test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
+
+            training_median = statistics.median(training_counts)
+            test_median = statistics.median(test_counts)
+
+            print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')
+
+
+        if len(unused_article_set) != 0:
+            print('Warning: Some articles did not make it into output files.')
+
+
+        for shard in self.output_training_files:
+            print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))
+
+        for shard in self.output_test_files:
+            print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))
+
+        print('End: Distribute Articles Over Shards')
+
+
+    def write_shards_to_disk(self):
+        """Write every training shard, then every test shard, via write_single_shard()."""
+        print('Start: Write Shards to Disk')
+        splits = (
+            ('training', self.output_training_files),
+            ('test', self.output_test_files),
+        )
+        for split, shards in splits:
+            for shard_name, article_ids in shards.items():
+                self.write_single_shard(shard_name, article_ids, split)
+
+        print('End: Write Shards to Disk')
+
+
+    def write_single_shard(self, shard_name, shard, split):
+        """Write one shard to ``<dirname(shard_name)>/<split>/<basename(shard_name)>``.
+
+        Each sentence goes on its own line and a blank line follows every article.
+
+        Args:
+            shard_name: Shard file path as registered by init_output_files().
+            shard: Iterable of article ids (keys of self.sentences) to write.
+            split: Sub-directory name inserted before the file name
+                ('training' or 'test' when called from write_shards_to_disk()).
+        """
+        directory, filename = os.path.split(shard_name)
+        # os.path.join keeps a directory-less shard name relative; plain
+        # '' + '/' + split would point at the filesystem root instead.
+        target_dir = os.path.join(directory, split)
+        os.makedirs(target_dir, exist_ok=True)
+
+        with open(os.path.join(target_dir, filename), mode='w', newline='\n') as f:
+            for article_id in shard:
+                for line in self.sentences[article_id]:
+                    f.write(line + '\n')
+
+                f.write('\n')  # Line break between articles
+
+
+# NOTE(review): this import sits after the Sharding class rather than at the
+# top of the file, and the nltk.download('punkt') call below runs every time
+# the module is imported. Presumably 'punkt' is the resource that
+# nltk.tokenize.sent_tokenize (used by NLTKSegmenter) needs - confirm.
+import nltk
+
+nltk.download('punkt')
+
+class NLTKSegmenter:
+    """Sentence segmenter that delegates to nltk.tokenize.sent_tokenize."""
+
+    def __init__(self):
+        # Nothing to set up; the method was previously misspelled `__init`,
+        # so it was never used as the constructor.
+        pass
+
+    def segment_string(self, article):
+        """Return the result of nltk.tokenize.sent_tokenize for ``article``."""
+        return nltk.tokenize.sent_tokenize(article)
+
@@ -0,0 +1,58 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+import subprocess
+
+class WikiDownloader:
+    """Download and decompress the latest Wikipedia article dump for a language.
+
+    Files are stored under ``<save_path>/wikicorpus_<language>``, which is
+    created when the object is constructed. Only 'en' and 'zh' have URLs.
+    """
+
+    def __init__(self, language, save_path):
+        """Create the target directory and register the per-language URLs.
+
+        Args:
+            language: Language code used to select the dump URL.
+            save_path: Directory under which the corpus folder is created.
+        """
+        self.save_path = save_path + '/wikicorpus_' + language
+
+        # exist_ok avoids the check-then-create race of os.path.exists()
+        os.makedirs(self.save_path, exist_ok=True)
+
+        self.language = language
+        self.download_urls = {
+            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+        }
+
+        self.output_files = {
+            'en' : 'wikicorpus_en.xml.bz2',
+            'zh' : 'wikicorpus_zh.xml.bz2'
+        }
+
+
+    def download(self):
+        """Download the dump unless it is already on disk, then decompress it.
+
+        Raises:
+            AssertionError: If no URL is registered for self.language.
+            subprocess.CalledProcessError: If ``bzip2`` exits with an error.
+        """
+        import shutil
+
+        # Raised explicitly so the check still fires when Python runs with -O
+        # (a plain `assert False` would be stripped and silently do nothing).
+        if self.language not in self.download_urls:
+            raise AssertionError('WikiDownloader not implemented for this language yet.')
+
+        url = self.download_urls[self.language]
+        filename = self.output_files[self.language]
+        target = self.save_path + '/' + filename
+
+        print('Downloading:', url)
+        if os.path.isfile(target):
+            print('** Download file already exists, skipping download')
+        else:
+            # Stream in chunks instead of holding the whole dump in memory,
+            # and rename only on success so that a partial download is never
+            # mistaken for a complete file on the next run.
+            partial = target + '.part'
+            with urllib.request.urlopen(url) as response, open(partial, "wb") as handle:
+                shutil.copyfileobj(response, handle)
+            os.replace(partial, target)
+
+        # Always unzipping since this is relatively fast and will overwrite
+        # (-f is what makes bzip2 overwrite an existing output file).
+        print('Unzipping:', self.output_files[self.language])
+        # Argument list without a shell: paths containing spaces or shell
+        # metacharacters are passed through unchanged.
+        subprocess.run(['bzip2', '-dkf', target], check=True)
+
@@ -0,0 +1,46 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    """Merge extracted ``wiki_*`` files into one text file with one article per line."""
+
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        """Remember where to read the extracted files and where to write the result."""
+        self.output_filename = output_filename
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+
+
+    # This puts one article per line
+    def merge(self):
+        """Scan ``<wiki_path>/*/wiki_*`` and write each ``<doc>`` body as one line.
+
+        Within an article the first collected line is dropped, lines that are
+        exactly a newline are skipped, and the remaining lines are right-stripped
+        and written with a trailing space. Every article ends with a blank line.
+        """
+        with open(self.output_filename, mode='w', newline='\n') as sink:
+            for subdir in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for path in glob.glob(subdir + 'wiki_*', recursive=self.recursive):
+                    print(path)
+                    body = []
+                    inside_doc = False
+
+                    with open(path, mode='r', newline='\n') as source:
+                        for raw in source:
+                            if '<doc id=' in raw:
+                                inside_doc = True
+                                continue
+
+                            if '</doc>' in raw:
+                                inside_doc = False
+                                # body[0] is skipped; the rest is flattened onto one line
+                                flattened = [text.rstrip() + " " for text in body[1:] if text != '\n']
+                                sink.write(''.join(flattened))
+                                sink.write("\n\n")
+                                body = []
+                                continue
+
+                            if inside_doc:
+                                body.append(raw)
@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,387 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import BookscorpusTextFormatting
+import Downloader
+import TextSharding
+import WikicorpusTextFormatting
+import PubMedTextFormatting
+
+import argparse
+import itertools
+import multiprocessing
+import os
+import pprint
+import subprocess
+
+
+def main(args):
+    """Run one preprocessing step, selected by ``args.action``, for ``args.dataset``.
+
+    Actions handled here: 'download', 'text_formatting', 'sharding',
+    'create_tfrecord_files' and 'create_hdf5_files' (the last one always fails
+    its leading assertion). All paths are derived from the
+    BERT_PREP_WORKING_DIR environment variable.
+
+    Args:
+        args: Parsed command-line namespace. Attributes read here include
+            action, dataset, input_files, the masking/sequence options used in
+            the folder name, n_training_shards, n_test_shards,
+            fraction_test_set, n_processes, skip_wikiextractor and vocab_file.
+
+    Raises:
+        KeyError: If BERT_PREP_WORKING_DIR is not set.
+        AssertionError: For unsupported action/dataset combinations.
+    """
+    working_dir = os.environ['BERT_PREP_WORKING_DIR']
+
+    print('Working Directory:', working_dir)
+    print('Action:', args.action)
+    print('Dataset Name:', args.dataset)
+
+    # --input_files arrives as one comma-separated string
+    if args.input_files:
+        args.input_files = args.input_files.split(',')
+
+    # Folder name encoding every option that affects the generated records, so
+    # outputs produced with different settings do not overwrite each other.
+    hdf5_tfrecord_folder_prefix = "/lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+                                  + "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+                                  + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \
+                                  + "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100))
+    directory_structure = {
+        'download' : working_dir + '/download',    # Downloaded and decompressed
+        'extracted' : working_dir +'/extracted',    # Extracted from whatever the initial format is (e.g., wikiextractor)
+        'formatted' : working_dir + '/formatted_one_article_per_line',    # This is the level where all sources should look the same
+        'sharded' : working_dir + '/sharded',
+        'tfrecord' : working_dir + '/tfrecord' + hdf5_tfrecord_folder_prefix,
+        'hdf5': working_dir + '/hdf5'+ hdf5_tfrecord_folder_prefix,
+    }
+
+    print('\nDirectory Structure:')
+    pp = pprint.PrettyPrinter(indent=2)
+    pp.pprint(directory_structure)
+    print('')
+
+    if args.action == 'download':
+        if not os.path.exists(directory_structure['download']):
+            os.makedirs(directory_structure['download'])
+
+        downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
+        downloader.download()
+
+    elif args.action == 'text_formatting':
+        # Datasets listed here are rejected for this action
+        assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \
+               and args.dataset != 'squad' and args.dataset != 'MRPC' and args.dataset != 'CoLA' and \
+               args.dataset != 'MNLI', 'Cannot perform text_formatting on pretrained weights'
+
+        if not os.path.exists(directory_structure['extracted']):
+            os.makedirs(directory_structure['extracted'])
+
+        if not os.path.exists(directory_structure['formatted']):
+            os.makedirs(directory_structure['formatted'])
+
+        if args.dataset == 'bookscorpus':
+            books_path = directory_structure['download'] + '/bookscorpus'
+            #books_path = directory_structure['download']
+            output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
+            books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
+            books_formatter.merge()
+
+        elif args.dataset == 'wikicorpus_en':
+            # The extractor runs unless --skip_wikiextractor is non-zero
+            if args.skip_wikiextractor == 0:
+                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
+                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
+                print('WikiExtractor Command:', wikiextractor_command)
+                # NOTE(review): shell=True with a concatenated string; paths with
+                # spaces or shell metacharacters would break this command.
+                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+
+            wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
+            output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
+            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
+            wiki_formatter.merge()
+
+        elif args.dataset == 'wikicorpus_zh':
+            # NOTE(review): everything after this assert is unreachable in a normal
+            # run, but WOULD execute under `python -O`, which strips assert statements.
+            assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
+            if args.skip_wikiextractor == 0:
+                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
+                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
+                print('WikiExtractor Command:', wikiextractor_command)
+                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
+
+            wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
+            output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
+            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
+            wiki_formatter.merge()
+
+        elif args.dataset == 'pubmed_baseline':
+            pubmed_path = directory_structure['download'] + '/pubmed' + '/baseline'
+            output_filename = directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt'
+            pubmed_formatter = PubMedTextFormatting.PubMedTextFormatting(pubmed_path, output_filename, recursive=True)
+            pubmed_formatter.merge()
+
+    elif args.action == 'sharding':
+        # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
+        if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset or 'pubmed' in args.dataset:
+            # Fall back to the default formatted file(s) of the dataset.
+            # NOTE(review): a dataset name that passes the substring test above but
+            # matches none of the branches below leaves args.input_files as None.
+            if args.input_files is None:
+                if args.dataset == 'bookscorpus':
+                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
+                elif args.dataset == 'wikicorpus_en':
+                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
+                elif args.dataset == 'wikicorpus_zh':
+                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
+                elif args.dataset == 'books_wiki_en_corpus':
+                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
+                elif args.dataset == 'pubmed_baseline':
+                    args.input_files = [directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt']
+
+            output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
+
+            if not os.path.exists(directory_structure['sharded']):
+                os.makedirs(directory_structure['sharded'])
+
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
+                
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/training'):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/training')
+                
+            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/test'):
+                os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/test')
+
+            # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
+            # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
+            # Different languages (e.g., Chinese simplified/traditional) may require translation and
+            # other packages to be called from here -- just add a conditional branch for those extra steps
+            segmenter = TextSharding.NLTKSegmenter()
+            sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
+
+            sharding.load_articles()
+            sharding.segment_articles_into_sentences(segmenter)
+            sharding.distribute_articles_over_shards()
+            sharding.write_shards_to_disk()
+
+        else:
+            assert False, 'Unsupported dataset for sharding'
+
+    elif args.action == 'create_tfrecord_files':
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)
+        
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/training'):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/training')
+            
+        if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/test'):
+            os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/test')
+
+        last_process = None
+
+        # Starts one create_pretraining_data.py process for a single shard and
+        # returns its Popen handle.
+        def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'):
+            bert_preprocessing_command = 'python /workspace/bert/utils/create_pretraining_data.py'
+            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
+            bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
+            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
+            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
+            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
+            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
+            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
+
+            # This assignment creates a local name (there is no `nonlocal`); the
+            # handle reaches the caller only through the return value below.
+            last_process = bert_preprocessing_process
+
+            # This could be better optimized (fine if all take equal time)
+            # Throttle: block on every shard whose id is a positive multiple of n_processes
+            if shard_id % args.n_processes == 0 and shard_id > 0:
+                bert_preprocessing_process.wait()
+
+            return last_process
+
+        output_file_prefix = args.dataset
+
+        for i in range(args.n_training_shards):
+            last_process = create_record_worker(output_file_prefix + '_training', i, 'tfrecord', 'training')
+
+        # NOTE(review): only the most recently started process is awaited here;
+        # earlier processes are not explicitly waited for and their exit codes
+        # are never checked.
+        last_process.wait()
+
+        for i in range(args.n_test_shards):
+            last_process = create_record_worker(output_file_prefix + '_test', i, 'tfrecord', 'test')
+
+        last_process.wait()
+
+
+    elif args.action == 'create_hdf5_files':
+        # NOTE(review): the code below is unreachable in a normal run but WOULD
+        # execute under `python -O`. As written it could not work: the worker
+        # returns nothing, so the outer last_process stays None; the numeric
+        # options are concatenated without str(); and args.output_file_prefix is
+        # not set anywhere in this function.
+        assert False, 'HDF5 format not fully supported in this release.'
+
+        if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
+            os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
+
+        last_process = None
+
+        def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
+            bert_preprocessing_command = 'python /workspace/bert/utils/create_pretraining_data.py'
+            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
+            bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
+            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
+            bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
+            bert_preprocessing_command += ' --max_seq_length=' + args.max_seq_length
+            bert_preprocessing_command += ' --max_predictions_per_seq=' + args.max_predictions_per_seq
+            bert_preprocessing_command += ' --masked_lm_prob=' + args.masked_lm_prob
+            bert_preprocessing_command += ' --random_seed=' + args.random_seed
+            bert_preprocessing_command += ' --dupe_factor=' + args.dupe_factor
+            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
+
+            last_process = bert_preprocessing_process
+
+            # This could be better optimized (fine if all take equal time)
+            if shard_id % args.n_processes == 0 and shard_id > 0:
+                bert_preprocessing_process.wait()
+
+        for i in range(args.n_training_shards):
+            create_record_worker(args.output_file_prefix + '_training', i)
+
+        last_process.wait()
+
+        for i in range(args.n_test_shards):
+            create_record_worker(args.output_file_prefix + '_test', i)
+
+        last_process.wait()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Preprocessing Application for Everything BERT-related'
+    )
+
+    parser.add_argument(
+        '--action',
+        type=str,
+        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
+        choices={
+            'download',                   # Download and verify mdf5/sha sums
+            'text_formatting',            # Convert into a file that contains one article/book per line
+            'sharding',                   # Convert previous formatted text into shards containing one sentence per line
+            'create_tfrecord_files',      # Turn each shard into a TFrecord with masking and next sentence prediction info
+            'create_hdf5_files'           # Turn each shard into a HDF5 file with masking and next sentence prediction info
+        }
+    )
+
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        help='Specify the dataset to perform --action on',
+        choices={
+            'bookscorpus',
+            'wikicorpus_en',
+            'wikicorpus_zh',
+            'books_wiki_en_corpus',
+            'pubmed_baseline',
+            'pubmed_daily_update',
+            'pubmed_fulltext',
+            'pubmed_open_access',
+            'google_pretrained_weights',
+            'nvidia_pretrained_weights',
+            'squad',
+            'MRPC',
+            'CoLA',
+            'MNLI',
+            'all'
+        }
+    )
+
+    parser.add_argument(
+        '--input_files',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+
+    parser.add_argument(
+        '--n_training_shards',
+        type=int,
+        help='Specify the number of training shards to generate',
+        default=1472
+    )
+
+    parser.add_argument(
+        '--n_test_shards',
+        type=int,
+        help='Specify the number of test shards to generate',
+        default=1472
+    )
+
+    parser.add_argument(
+        '--fraction_test_set',
+        type=float,
+        help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
+        default=0.1
+    )
+
+    parser.add_argument(
+        '--segmentation_method',
+        type=str,
+        help='Specify your choice of sentence segmentation',
+        choices={
+            'nltk'
+        },
+        default='nltk'
+    )
+
+    parser.add_argument(
+        '--n_processes',
+        type=int,
+        help='Specify the max number of processes to allow at one time',
+        default=4
+    )
+
+    parser.add_argument(
+        '--random_seed',
+        type=int,
+        help='Specify the base seed to use for any random number generation',
+        default=12345
+    )
+
+    parser.add_argument(
+        '--dupe_factor',
+        type=int,
+        help='Specify the duplication factor',
+        default=5
+    )
+
+    parser.add_argument(
+        '--masked_lm_prob',
+        type=float,
+        help='Specify the probability for masked lm',
+        default=0.15
+    )
+
+    parser.add_argument(
+        '--max_seq_length',
+        type=int,
+        help='Specify the maximum sequence length',
+        default=512
+    )
+
+    parser.add_argument(
+        '--max_predictions_per_seq',
+        type=int,
+        help='Specify the maximum number of masked words per sequence',
+        default=20
+    )
+
+    parser.add_argument(
+        '--do_lower_case',
+        type=int,
+        help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
+        default=1
+    )
+
+    parser.add_argument(
+        '--vocab_file',
+        type=str,
+        help='Specify absolute path to vocab file to use)'
+    )
+
+    parser.add_argument(
+        '--skip_wikiextractor',
+        type=int,
+        help='Specify whether to skip wikiextractor step 0=False, 1=True',
+        default=0
+    )
+
+    parser.add_argument(
+        '--interactive_json_config_generator',
+        type=str,
+        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
+    )
+
+    args = parser.parse_args()
+    main(args)
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Abort with a clear message when the working directory is not configured;
+# otherwise every command below would silently resolve to "/bertPrep.py".
+export BERT_PREP_WORKING_DIR="${BERT_PREP_WORKING_DIR:?BERT_PREP_WORKING_DIR must point to the data preparation directory}"
+
+# All expansions are quoted so that a working directory containing spaces
+# is passed to python3 as a single argument.
+
+# Download
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset pubmed_baseline
+
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset google_pretrained_weights  # Includes vocab
+
+# Properly format the text files
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action text_formatting --dataset pubmed_baseline
+
+
+# Shard the text files
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action sharding --dataset pubmed_baseline
+
+### BERT BASE
+
+## UNCASED
+
+# Create TFRecord files Phase 1
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \
+ --max_predictions_per_seq 20 --vocab_file "${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-12_H-768_A-12/vocab.txt"
+
+
+# Create TFRecord files Phase 2
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \
+ --max_predictions_per_seq 80 --vocab_file "${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-12_H-768_A-12/vocab.txt"
+
+
+## CASED
+
+# Create TFRecord files Phase 1
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \
+ --max_predictions_per_seq 20 --vocab_file "${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/cased_L-12_H-768_A-12/vocab.txt" \
+ --do_lower_case=0
+
+
+# Create TFRecord files Phase 2
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \
+ --max_predictions_per_seq 80 --vocab_file "${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/cased_L-12_H-768_A-12/vocab.txt" \
+ --do_lower_case=0
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Abort with a clear message when the working directory is not configured;
+# otherwise every command below would silently resolve to "/bertPrep.py".
+export BERT_PREP_WORKING_DIR="${BERT_PREP_WORKING_DIR:?BERT_PREP_WORKING_DIR must point to the data preparation directory}"
+
+# All expansions are quoted so that a working directory containing spaces
+# is passed to python3 as a single argument.
+
+# Download
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset bookscorpus
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset wikicorpus_en
+
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset google_pretrained_weights  # Includes vocab
+
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset squad
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset "CoLA"
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset "MRPC"
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action download --dataset "MNLI"
+
+
+# Properly format the text files
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action text_formatting --dataset bookscorpus
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action text_formatting --dataset wikicorpus_en
+
+
+# Shard the text files (group wiki+books then shard)
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action sharding --dataset books_wiki_en_corpus
+
+
+# Create TFRecord files Phase 1
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 128 \
+ --max_predictions_per_seq 20 --vocab_file "${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
+
+
+# Create TFRecord files Phase 2
+python3 "${BERT_PREP_WORKING_DIR}/bertPrep.py" --action create_tfrecord_files --dataset books_wiki_en_corpus --max_seq_length 512 \
+ --max_predictions_per_seq 80 --vocab_file "${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"
@@ -0,0 +1,13 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "type_vocab_size": 2,
+  "vocab_size": 30528
+}
@@ -0,0 +1,419 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract pre-computed feature vectors from BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import codecs
+import collections
+import json
+import re
+
+import modeling
+import tokenization
+import tensorflow as tf
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("input_file", None, "")
+
+flags.DEFINE_string("output_file", None, "")
+
+flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
+
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+flags.DEFINE_string("master", None,
+                    "If using a TPU, the address of the master.")
+
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
+
+flags.DEFINE_bool(
+    "use_one_hot_embeddings", False,
+    "If True, tf.one_hot will be used for embedding lookups, otherwise "
+    "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
+    "since it is much faster.")
+
+
+class InputExample(object):
+
+  def __init__(self, unique_id, text_a, text_b):
+    self.unique_id = unique_id
+    self.text_a = text_a
+    self.text_b = text_b
+
+
+class InputFeatures(object):
+  """A single set of features of data."""
+
+  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
+    self.unique_id = unique_id
+    self.tokens = tokens
+    self.input_ids = input_ids
+    self.input_mask = input_mask
+    self.input_type_ids = input_type_ids
+
+
+def input_fn_builder(features, seq_length):
+  """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+  all_unique_ids = []
+  all_input_ids = []
+  all_input_mask = []
+  all_input_type_ids = []
+
+  for feature in features:
+    all_unique_ids.append(feature.unique_id)
+    all_input_ids.append(feature.input_ids)
+    all_input_mask.append(feature.input_mask)
+    all_input_type_ids.append(feature.input_type_ids)
+
+  def input_fn(params):
+    """The actual input function."""
+    batch_size = params["batch_size"]
+
+    num_examples = len(features)
+
+    # This is for demo purposes and does NOT scale to large data sets. We do
+    # not use Dataset.from_generator() because that uses tf.py_func which is
+    # not TPU compatible. The right way to load data is with TFRecordReader.
+    d = tf.data.Dataset.from_tensor_slices({
+        "unique_ids":
+            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
+        "input_ids":
+            tf.constant(
+                all_input_ids, shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_mask":
+            tf.constant(
+                all_input_mask,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_type_ids":
+            tf.constant(
+                all_input_type_ids,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+    })
+
+    d = d.batch(batch_size=batch_size, drop_remainder=False)
+    return d
+
+  return input_fn
+
+
+def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
+                     use_one_hot_embeddings):
+  """Returns `model_fn` closure for TPUEstimator."""
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """The `model_fn` for TPUEstimator."""
+
+    unique_ids = features["unique_ids"]
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    input_type_ids = features["input_type_ids"]
+
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=False,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=input_type_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings)
+
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      raise ValueError("Only PREDICT modes are supported: %s" % (mode))
+
+    tvars = tf.trainable_variables()
+    scaffold_fn = None
+    (assignment_map,
+     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
+         tvars, init_checkpoint)
+    if use_tpu:
+
+      def tpu_scaffold():
+        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        return tf.train.Scaffold()
+
+      scaffold_fn = tpu_scaffold
+    else:
+      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    tf.logging.info("**** Trainable Variables ****")
+    for var in tvars:
+      init_string = ""
+      if var.name in initialized_variable_names:
+        init_string = ", *INIT_FROM_CKPT*"
+      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                      init_string)
+
+    all_layers = model.get_all_encoder_layers()
+
+    predictions = {
+        "unique_id": unique_ids,
+    }
+
+    for (i, layer_index) in enumerate(layer_indexes):
+      predictions["layer_output_%d" % i] = all_layers[layer_index]
+
+    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
+    return output_spec
+
+  return model_fn
+
+
+def convert_examples_to_features(examples, seq_length, tokenizer):
+  """Loads a data file into a list of `InputBatch`s."""
+
+  features = []
+  for (ex_index, example) in enumerate(examples):
+    tokens_a = tokenizer.tokenize(example.text_a)
+
+    tokens_b = None
+    if example.text_b:
+      tokens_b = tokenizer.tokenize(example.text_b)
+
+    if tokens_b:
+      # Modifies `tokens_a` and `tokens_b` in place so that the total
+      # length is less than the specified length.
+      # Account for [CLS], [SEP], [SEP] with "- 3"
+      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
+    else:
+      # Account for [CLS] and [SEP] with "- 2"
+      if len(tokens_a) > seq_length - 2:
+        tokens_a = tokens_a[0:(seq_length - 2)]
+
+    # The convention in BERT is:
+    # (a) For sequence pairs:
+    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+    # (b) For single sequences:
+    #  tokens:   [CLS] the dog is hairy . [SEP]
+    #  type_ids: 0     0   0   0  0     0 0
+    #
+    # Where "type_ids" are used to indicate whether this is the first
+    # sequence or the second sequence. The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_length:
+      break
+    if len(tokens_a) > len(tokens_b):
+      tokens_a.pop()
+    else:
+      tokens_b.pop()
+
+
+def read_examples(input_file):
+  """Read a list of `InputExample`s from an input file."""
+  examples = []
+  unique_id = 0
+  with tf.gfile.GFile(input_file, "r") as reader:
+    while True:
+      line = tokenization.convert_to_unicode(reader.readline())
+      if not line:
+        break
+      line = line.strip()
+      text_a = None
+      text_b = None
+      m = re.match(r"^(.*) \|\|\| (.*)$", line)
+      if m is None:
+        text_a = line
+      else:
+        text_a = m.group(1)
+        text_b = m.group(2)
+      examples.append(
+          InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
+      unique_id += 1
+  return examples
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      master=FLAGS.master,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  examples = read_examples(FLAGS.input_file)
+
+  features = convert_examples_to_features(
+      examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
+
+  unique_id_to_feature = {}
+  for feature in features:
+    unique_id_to_feature[feature.unique_id] = feature
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      layer_indexes=layer_indexes,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      predict_batch_size=FLAGS.batch_size)
+
+  input_fn = input_fn_builder(
+      features=features, seq_length=FLAGS.max_seq_length)
+
+  with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
+                                               "w")) as writer:
+    for result in estimator.predict(input_fn, yield_single_examples=True):
+      unique_id = int(result["unique_id"])
+      feature = unique_id_to_feature[unique_id]
+      output_json = collections.OrderedDict()
+      output_json["linex_index"] = unique_id
+      all_features = []
+      for (i, token) in enumerate(feature.tokens):
+        all_layers = []
+        for (j, layer_index) in enumerate(layer_indexes):
+          layer_output = result["layer_output_%d" % j]
+          layers = collections.OrderedDict()
+          layers["index"] = layer_index
+          layers["values"] = [
+              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+          ]
+          all_layers.append(layers)
+        features = collections.OrderedDict()
+        features["token"] = token
+        features["layers"] = all_layers
+        all_features.append(features)
+      output_json["features"] = all_features
+      writer.write(json.dumps(output_json) + "\n")
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("init_checkpoint")
+  flags.mark_flag_as_required("output_file")
+  tf.app.run()
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import tensorflow as tf
+import numpy as np
+
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import copy
+import json
+import math
+import re
+import six
+import tensorflow as tf
+
+from tensorflow.python.framework import ops
+from tensorflow.contrib.layers.python.layers import utils
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.ops import init_ops
+import numpy
+from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import nn
+
+def fused_layer_norm(inputs,
+               center=True,
+               scale=True,
+               activation_fn=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               begin_norm_axis=1,
+               begin_params_axis=-1,
+               scope=None,
+               use_fused_batch_norm=False):
+  """Layer normalization with an optional `fused_batch_norm`-based path.
+
+  Args:
+    inputs: Value convertible to a tensor with a statically known rank.
+    center: If True, a `beta` variable is created and added as offset.
+    scale: If True, a `gamma` variable is created and used as multiplier.
+    activation_fn: Optional callable applied to the normalized outputs.
+    reuse: Forwarded to `tf.variable_scope`.
+    variables_collections: Collections for `beta`/`gamma`, resolved through
+      `utils.get_variable_collections`.
+    outputs_collections: Collections the outputs are added to.
+    trainable: Whether `beta`/`gamma` are created as trainable variables.
+    begin_norm_axis: First axis that is normalized over; a negative value is
+      counted from the end.
+    begin_params_axis: First axis covered by the shape of `beta`/`gamma`.
+    scope: Optional variable scope; the default scope name is 'LayerNorm'.
+    use_fused_batch_norm: If True, normalize through `nn.fused_batch_norm`
+      on a reshaped input instead of `nn.moments`/`nn.batch_normalization`.
+
+  Returns:
+    The result of `utils.collect_named_outputs` for the normalized (and
+    optionally activated) outputs.
+
+  Raises:
+    ValueError: If the rank of `inputs` is undefined, if either axis is
+      >= rank(inputs), or if the parameter shape is not fully defined.
+  """
+  with tf.variable_scope(
+      scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    inputs_shape = inputs.shape
+    inputs_rank = inputs_shape.ndims
+    if inputs_rank is None:
+      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+    dtype = inputs.dtype.base_dtype
+    # Only begin_norm_axis is converted from a negative to a positive index;
+    # begin_params_axis is used as given in the slice below.
+    if begin_norm_axis < 0:
+      begin_norm_axis = inputs_rank + begin_norm_axis
+    if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
+      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
+                       'must be < rank(inputs) (%d)' %
+                       (begin_params_axis, begin_norm_axis, inputs_rank))
+    params_shape = inputs_shape[begin_params_axis:]
+    if not params_shape.is_fully_defined():
+      raise ValueError(
+          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
+          (inputs.name, begin_params_axis, inputs_shape))
+    # Allocate parameters for the beta and gamma of the normalization.
+    # Either one stays None when `center` / `scale` is False.
+    beta, gamma = None, None
+    if center:
+      beta_collections = utils.get_variable_collections(variables_collections,
+                                                        'beta')
+      beta = variables.model_variable(
+          'beta',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=init_ops.zeros_initializer(),
+          collections=beta_collections,
+          trainable=trainable)
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma = variables.model_variable(
+          'gamma',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=init_ops.ones_initializer(),
+          collections=gamma_collections,
+          trainable=trainable)
+    if use_fused_batch_norm:
+      # get static TensorShape if fully defined,
+      # otherwise retrieve shape tensor
+      norm_shape = inputs.shape[begin_norm_axis:]
+      # The input is reshaped to [1, -1, 1, prod(normalized dims)] so that
+      # axis 1 (the channel axis for "NCHW") enumerates the slices that are
+      # normalized independently.
+      if norm_shape.is_fully_defined():
+        bn_shape = [1, -1, 1, numpy.prod(norm_shape.as_list())]
+      else:
+        norm_shape = tf.shape(inputs)[begin_norm_axis:]
+        bn_shape = [1, -1, 1, tf.reduce_prod(norm_shape)]
+      # Remember the original shape (static if possible) to undo the reshape.
+      if inputs.get_shape().is_fully_defined():
+        outputs_shape = inputs.get_shape()
+      else:
+        outputs_shape = tf.shape(inputs)
+      inputs = array_ops.reshape(inputs, bn_shape)
+      # Scale and offset handed to fused_batch_norm are constant ones/zeros
+      # in float32; gamma/beta are applied separately after the reshape back.
+      if inputs.get_shape().is_fully_defined():
+        # static inputs TensorShape fully defined after reshape.
+        ones = array_ops.ones(inputs.get_shape()[1], dtype=dtypes.float32)
+        zeros = array_ops.zeros(inputs.get_shape()[1], dtype=dtypes.float32)
+      else:
+        # static inputs TensorShape NOT fully defined after reshape.
+        # must use dynamic shape, which means these input tensors
+        # have to be created at runtime, which causes a slowdown.
+        scale_shape = tf.shape(inputs)[1]
+        ones = array_ops.ones(scale_shape, dtype=dtypes.float32)
+        zeros = array_ops.zeros(scale_shape, dtype=dtypes.float32)
+      outputs, mean, variance = nn.fused_batch_norm(
+          inputs,
+          ones, zeros,
+          epsilon=1e-4,
+          data_format="NCHW")
+      outputs = array_ops.reshape(outputs, outputs_shape)
+      # Apply the learned scale/offset, depending on which of them exist.
+      if center and scale:
+        outputs = outputs * gamma + beta
+      elif center:
+        outputs = outputs + beta
+      elif scale:
+        outputs = outputs * gamma
+    else:
+      # Calculate the moments over all axes from begin_norm_axis onwards.
+      norm_axes = list(range(begin_norm_axis, inputs_rank))
+      mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
+      # Compute layer normalization using the batch_normalization function.
+      # Same epsilon as in the fused path above.
+      variance_epsilon = 1e-4
+      outputs = nn.batch_normalization(
+          inputs,
+          mean,
+          variance,
+          offset=beta,
+          scale=gamma,
+          variance_epsilon=variance_epsilon)
+      outputs.set_shape(inputs_shape)
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+import numpy as np
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
+def get_custom_getter(compute_type):
+    return float32_variable_storage_getter if compute_type == tf.float16 else None
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import random
+import re
+
+import modeling
+import six
+import tensorflow as tf
+
+
+class BertModelTest(tf.test.TestCase):
+
+  class BertModelTester(object):
+
+    def __init__(self,
+                 parent,
+                 batch_size=13,
+                 seq_length=7,
+                 is_training=True,
+                 use_input_mask=True,
+                 use_token_type_ids=True,
+                 vocab_size=99,
+                 hidden_size=32,
+                 num_hidden_layers=5,
+                 num_attention_heads=4,
+                 intermediate_size=37,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=16,
+                 initializer_range=0.02,
+                 scope=None):
+      self.parent = parent
+      self.batch_size = batch_size
+      self.seq_length = seq_length
+      self.is_training = is_training
+      self.use_input_mask = use_input_mask
+      self.use_token_type_ids = use_token_type_ids
+      self.vocab_size = vocab_size
+      self.hidden_size = hidden_size
+      self.num_hidden_layers = num_hidden_layers
+      self.num_attention_heads = num_attention_heads
+      self.intermediate_size = intermediate_size
+      self.hidden_act = hidden_act
+      self.hidden_dropout_prob = hidden_dropout_prob
+      self.attention_probs_dropout_prob = attention_probs_dropout_prob
+      self.max_position_embeddings = max_position_embeddings
+      self.type_vocab_size = type_vocab_size
+      self.initializer_range = initializer_range
+      self.scope = scope
+
+    def create_model(self):
+      input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
+                                           self.vocab_size)
+
+      input_mask = None
+      if self.use_input_mask:
+        input_mask = BertModelTest.ids_tensor(
+            [self.batch_size, self.seq_length], vocab_size=2)
+
+      token_type_ids = None
+      if self.use_token_type_ids:
+        token_type_ids = BertModelTest.ids_tensor(
+            [self.batch_size, self.seq_length], self.type_vocab_size)
+
+      config = modeling.BertConfig(
+          vocab_size=self.vocab_size,
+          hidden_size=self.hidden_size,
+          num_hidden_layers=self.num_hidden_layers,
+          num_attention_heads=self.num_attention_heads,
+          intermediate_size=self.intermediate_size,
+          hidden_act=self.hidden_act,
+          hidden_dropout_prob=self.hidden_dropout_prob,
+          attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+          max_position_embeddings=self.max_position_embeddings,
+          type_vocab_size=self.type_vocab_size,
+          initializer_range=self.initializer_range)
+
+      model = modeling.BertModel(
+          config=config,
+          is_training=self.is_training,
+          input_ids=input_ids,
+          input_mask=input_mask,
+          token_type_ids=token_type_ids,
+          scope=self.scope)
+
+      outputs = {
+          "embedding_output": model.get_embedding_output(),
+          "sequence_output": model.get_sequence_output(),
+          "pooled_output": model.get_pooled_output(),
+          "all_encoder_layers": model.get_all_encoder_layers(),
+      }
+      return outputs
+
+    def check_output(self, result):
+      self.parent.assertAllEqual(
+          result["embedding_output"].shape,
+          [self.batch_size, self.seq_length, self.hidden_size])
+
+      self.parent.assertAllEqual(
+          result["sequence_output"].shape,
+          [self.batch_size, self.seq_length, self.hidden_size])
+
+      self.parent.assertAllEqual(result["pooled_output"].shape,
+                                 [self.batch_size, self.hidden_size])
+
+  def test_default(self):
+    self.run_tester(BertModelTest.BertModelTester(self))
+
+  def test_config_to_json_string(self):
+    config = modeling.BertConfig(vocab_size=99, hidden_size=37)
+    obj = json.loads(config.to_json_string())
+    self.assertEqual(obj["vocab_size"], 99)
+    self.assertEqual(obj["hidden_size"], 37)
+
+  def run_tester(self, tester):
+    with self.test_session() as sess:
+      ops = tester.create_model()
+      init_op = tf.group(tf.global_variables_initializer(),
+                         tf.local_variables_initializer())
+      sess.run(init_op)
+      output_result = sess.run(ops)
+      tester.check_output(output_result)
+
+      self.assert_all_tensors_reachable(sess, [init_op, ops])
+
+  @classmethod
+  def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
+    """Creates a random int32 tensor of the shape within the vocab size."""
+    if rng is None:
+      rng = random.Random()
+
+    total_dims = 1
+    for dim in shape:
+      total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+      values.append(rng.randint(0, vocab_size - 1))
+
+    return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
+
+  def assert_all_tensors_reachable(self, sess, outputs):
+    """Checks that all the tensors in the graph are reachable from outputs."""
+    graph = sess.graph
+
+    ignore_strings = [
+        "^.*/assert_less_equal/.*$",
+        "^.*/dilation_rate$",
+        "^.*/Tensordot/concat$",
+        "^.*/Tensordot/concat/axis$",
+        "^testing/.*$",
+    ]
+
+    ignore_regexes = [re.compile(x) for x in ignore_strings]
+
+    unreachable = self.get_unreachable_ops(graph, outputs)
+    filtered_unreachable = []
+    for x in unreachable:
+      do_ignore = False
+      for r in ignore_regexes:
+        m = r.match(x.name)
+        if m is not None:
+          do_ignore = True
+      if do_ignore:
+        continue
+      filtered_unreachable.append(x)
+    unreachable = filtered_unreachable
+
+    self.assertEqual(
+        len(unreachable), 0, "The following ops are unreachable: %s" %
+        (" ".join([x.name for x in unreachable])))
+
+  @classmethod
+  def get_unreachable_ops(cls, graph, outputs):
+    """Finds all of the tensors in graph that are unreachable from outputs."""
+    outputs = cls.flatten_recursive(outputs)
+    output_to_op = collections.defaultdict(list)
+    op_to_all = collections.defaultdict(list)
+    assign_out_to_in = collections.defaultdict(list)
+
+    for op in graph.get_operations():
+      for x in op.inputs:
+        op_to_all[op.name].append(x.name)
+      for y in op.outputs:
+        output_to_op[y.name].append(op.name)
+        op_to_all[op.name].append(y.name)
+      if str(op.type) == "Assign":
+        for y in op.outputs:
+          for x in op.inputs:
+            assign_out_to_in[y.name].append(x.name)
+
+    assign_groups = collections.defaultdict(list)
+    for out_name in assign_out_to_in.keys():
+      name_group = assign_out_to_in[out_name]
+      for n1 in name_group:
+        assign_groups[n1].append(out_name)
+        for n2 in name_group:
+          if n1 != n2:
+            assign_groups[n1].append(n2)
+
+    seen_tensors = {}
+    stack = [x.name for x in outputs]
+    while stack:
+      name = stack.pop()
+      if name in seen_tensors:
+        continue
+      seen_tensors[name] = True
+
+      if name in output_to_op:
+        for op_name in output_to_op[name]:
+          if op_name in op_to_all:
+            for input_name in op_to_all[op_name]:
+              if input_name not in stack:
+                stack.append(input_name)
+
+      expanded_names = []
+      if name in assign_groups:
+        for assign_name in assign_groups[name]:
+          expanded_names.append(assign_name)
+
+      for expanded_name in expanded_names:
+        if expanded_name not in stack:
+          stack.append(expanded_name)
+
+    unreachable_ops = []
+    for op in graph.get_operations():
+      is_unreachable = False
+      all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
+      for name in all_names:
+        if name not in seen_tensors:
+          is_unreachable = True
+      if is_unreachable:
+        unreachable_ops.append(op)
+    return unreachable_ops
+
+  @classmethod
+  def flatten_recursive(cls, item):
+    """Flattens (potentially nested) a tuple/dictionary/list to a list."""
+    output = []
+    if isinstance(item, list):
+      output.extend(item)
+    elif isinstance(item, tuple):
+      output.extend(list(item))
+    elif isinstance(item, dict):
+      for (_, v) in six.iteritems(item):
+        output.append(v)
+    else:
+      return [item]
+
+    flat_output = []
+    for x in output:
+      flat_output.extend(cls.flatten_recursive(x))
+    return flat_output
+
+
+# Run the test cases defined in this module when executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,305 @@
+## Models
+
+There are two multilingual models currently available. We do not plan to release
+more single-language models, but we may release `BERT-Large` versions of these
+two in the future:
+
+*   **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
+    104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
+    102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
+    Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
+    parameters
+
+**The `Multilingual Cased (New)` model also fixes normalization issues in many
+languages, so it is recommended in languages with non-Latin alphabets (and is
+often better for most languages with Latin alphabets). When using this model,
+make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other
+scripts.**
+
+See the [list of languages](#list-of-languages) that the Multilingual model
+supports. The Multilingual model does include Chinese (and English), but if your
+fine-tuning data is Chinese-only, then the Chinese model will likely produce
+better results.
+
+## Results
+
+To evaluate these systems, we use the
+[XNLI dataset](https://github.com/facebookresearch/XNLI), which is a
+version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the
+dev and test sets have been translated (by humans) into 15 languages. Note that
+the training set was *machine* translated (we used the translations provided by
+XNLI, not Google NMT). For clarity, we only report on 6 languages below:
+
+<!-- mdformat off(no table) -->
+
+| System                            | English  | Chinese  | Spanish  | German   | Arabic   | Urdu     |
+| --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
+| XNLI Baseline - Translate Train   | 73.7     | 67.0     | 68.8     | 66.5     | 65.8     | 56.6     |
+| XNLI Baseline - Translate Test    | 73.7     | 68.3     | 70.7     | 68.7     | 66.8     | 59.3     |
+| BERT - Translate Train Cased      | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6     |
+| BERT - Translate Train Uncased    | 81.4     | 74.2     | 77.3     | 75.2     | 70.5     | 61.7     |
+| BERT - Translate Test Uncased     | 81.4     | 70.1     | 74.9     | 74.4     | 70.4     | **62.1** |
+| BERT - Zero Shot Uncased          | 81.4     | 63.8     | 74.3     | 70.5     | 62.1     | 58.3     |
+
+<!-- mdformat on -->
+
+The first two rows are baselines from the XNLI paper and the last four rows are
+our results with BERT.
+
+**Translate Train** means that the MultiNLI training set was machine translated
+from English into the foreign language. So training and evaluation were both
+done in the foreign language. Unfortunately, training was done on
+machine-translated data, so it is impossible to quantify how much of the lower
+accuracy (compared to English) is due to the quality of the machine translation
+vs. the quality of the pre-trained model.
+
+**Translate Test** means that the XNLI test set was machine translated from the
+foreign language into English. So training and evaluation were both done on
+English. However, test evaluation was done on machine-translated English, so the
+accuracy depends on the quality of the machine translation system.
+
+**Zero Shot** means that the Multilingual BERT system was fine-tuned on English
+MultiNLI, and then evaluated on the foreign language XNLI test. In this case,
+machine translation was not involved at all in either the pre-training or
+fine-tuning.
+
+Note that the English result is worse than the 84.2 MultiNLI baseline because
+this training used Multilingual BERT rather than English-only BERT. This implies
+that for high-resource languages, the Multilingual model is somewhat worse than
+a single-language model. However, it is not feasible for us to train and
+maintain dozens of single-language models. Therefore, if your goal is to maximize
+performance with a language other than English or Chinese, you might find it
+beneficial to run pre-training for additional steps starting from our
+Multilingual model on data from your language of interest.
+
+Here is a comparison of training Chinese models with the Multilingual
+`BERT-Base` and Chinese-only `BERT-Base`:
+
+System                  | Chinese
+----------------------- | -------
+XNLI Baseline           | 67.0
+BERT Multilingual Model | 74.2
+BERT Chinese-only Model | 77.2
+
+Similar to English, the single-language model does 3% better than the
+Multilingual model.
+
+## Fine-tuning Example
+
+The multilingual model does **not** require any special consideration or API
+changes. We did update the implementation of `BasicTokenizer` in
+`tokenization.py` to support Chinese character tokenization, so please update if
+you forked it. However, we did not change the tokenization API.
+
+To test the new models, we did modify `run_classifier.py` to add support for the
+[XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language
+version of MultiNLI where the dev/test sets have been human-translated, and the
+training set has been machine-translated.
+
+To run the fine-tuning code, please download the
+[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the
+[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip)
+and then unpack both .zip files into some directory `$XNLI_DIR`.
+
+The commands below run fine-tuning on XNLI. The language is hard-coded into
+`run_classifier.py` (Chinese by default), so please modify `XnliProcessor` if
+you want to run on another language.
+
+This is a large dataset, so training will take a few hours on a GPU
+(or about 30 minutes on a Cloud TPU). To run an experiment quickly for
+debugging, just set `num_train_epochs` to a small value like `0.1`.
+
+```shell
+export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12
+export XNLI_DIR=/path/to/xnli
+
+python run_classifier.py \
+  --task_name=XNLI \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$XNLI_DIR \
+  --vocab_file=$BERT_BASE_DIR/vocab.txt \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=5e-5 \
+  --num_train_epochs=2.0 \
+  --output_dir=/tmp/xnli_output/
+```
+
+With the Chinese-only model, the results should look something like this:
+
+```
+ ***** Eval results *****
+eval_accuracy = 0.774116
+eval_loss = 0.83554
+global_step = 24543
+loss = 0.74603
+```
+
+## Details
+
+### Data Source and Sampling
+
+The languages chosen were the
+[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).
+The entire Wikipedia dump for each language (excluding user and talk pages) was
+taken as the training data for each language.
+
+However, the size of the Wikipedia for a given language varies greatly, and
+therefore low-resource languages may be "under-represented" in terms of the
+neural network model (under the assumption that languages are "competing" for
+limited model capacity to some extent).
+
+However, the size of a Wikipedia also correlates with the number of speakers of
+a language, and we also don't want to overfit the model by performing thousands
+of epochs over a tiny Wikipedia for a particular language.
+
+To balance these two factors, we performed exponentially smoothed weighting of
+the data during pre-training data creation (and WordPiece vocab creation). In
+other words, let's say that the probability of a language is *P(L)*, e.g.,
+*P(English) = 0.21* means that after concatenating all of the Wikipedias
+together, 21% of our data is English. We exponentiate each probability by some
+factor *S* and then re-normalize, and sample from that distribution. In our case
+we use *S=0.7*. So, high-resource languages like English will be under-sampled,
+and low-resource languages like Icelandic will be over-sampled. E.g., in the
+original distribution English would be sampled 1000x more than Icelandic, but
+after smoothing it's only sampled 100x more.
+
+### Tokenization
+
+For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are
+weighted the same way as the data, so low-resource languages are upweighted by
+some factor. We intentionally do *not* use any marker to denote the input
+language (so that zero-shot training can work).
+
+Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
+characters, we add spaces around every character in the
+[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\))
+before applying WordPiece. This means that Chinese is effectively
+character-tokenized. Note that the CJK Unicode block only includes
+Chinese-origin characters and does *not* include Hangul Korean or
+Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like
+all other languages.
+
+For all other languages, we apply the
+[same recipe as English](https://github.com/google-research/bert#tokenization):
+(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace
+tokenization. We understand that accent markers have substantial meaning in some
+languages, but felt that the benefits of reducing the effective vocabulary make
+up for this. Generally the strong contextual models of BERT should make up for
+any ambiguity introduced by stripping accent markers.
+
+### List of Languages
+
+The multilingual model supports the following languages. These languages were
+chosen because they are the top 100 languages with the largest Wikipedias:
+
+*   Afrikaans
+*   Albanian
+*   Arabic
+*   Aragonese
+*   Armenian
+*   Asturian
+*   Azerbaijani
+*   Bashkir
+*   Basque
+*   Bavarian
+*   Belarusian
+*   Bengali
+*   Bishnupriya Manipuri
+*   Bosnian
+*   Breton
+*   Bulgarian
+*   Burmese
+*   Catalan
+*   Cebuano
+*   Chechen
+*   Chinese (Simplified)
+*   Chinese (Traditional)
+*   Chuvash
+*   Croatian
+*   Czech
+*   Danish
+*   Dutch
+*   English
+*   Estonian
+*   Finnish
+*   French
+*   Galician
+*   Georgian
+*   German
+*   Greek
+*   Gujarati
+*   Haitian
+*   Hebrew
+*   Hindi
+*   Hungarian
+*   Icelandic
+*   Ido
+*   Indonesian
+*   Irish
+*   Italian
+*   Japanese
+*   Javanese
+*   Kannada
+*   Kazakh
+*   Kirghiz
+*   Korean
+*   Latin
+*   Latvian
+*   Lithuanian
+*   Lombard
+*   Low Saxon
+*   Luxembourgish
+*   Macedonian
+*   Malagasy
+*   Malay
+*   Malayalam
+*   Marathi
+*   Minangkabau
+*   Nepali
+*   Newar
+*   Norwegian (Bokmal)
+*   Norwegian (Nynorsk)
+*   Occitan
+*   Persian (Farsi)
+*   Piedmontese
+*   Polish
+*   Portuguese
+*   Punjabi
+*   Romanian
+*   Russian
+*   Scots
+*   Serbian
+*   Serbo-Croatian
+*   Sicilian
+*   Slovak
+*   Slovenian
+*   South Azerbaijani
+*   Spanish
+*   Sundanese
+*   Swahili
+*   Swedish
+*   Tagalog
+*   Tajik
+*   Tamil
+*   Tatar
+*   Telugu
+*   Turkish
+*   Ukrainian
+*   Urdu
+*   Uzbek
+*   Vietnamese
+*   Volapük
+*   Waray-Waray
+*   Welsh
+*   West Frisian
+*   Western Punjabi
+*   Yoruba
+
+The **Multilingual Cased (New)** release contains additionally **Thai** and
+**Mongolian**, which were not included in the original release.
@@ -0,0 +1,173 @@
+```
+# Licensed under the Apache License, Version 2.0 (the "License")
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+```
+<img src="http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png" style="width: 90px; float: right;">
+
+
+# Table Of Contents
+- [BERT Question Answering Inference/Fine-Tuning with Mixed Precision](#bert-question-answering-inferencefine-tuning-with-mixed-precision)
+- [BioBERT Named-Entity Recognition Inference with Mixed Precision](#biobert-named-entity-recognition-inference-with-mixed-precision)
+
+
+# BERT Question Answering Inference/Fine-Tuning with Mixed Precision
+
+## 1. Overview
+
+Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks.
+
+The original paper can be found here: https://arxiv.org/abs/1810.04805.
+
+NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.
+
+### 1.a Learning objectives
+
+This repository contains multiple notebooks which demonstrate:
+- Inference on QA task with BERT Large model
+- The use/download of pretrained NVIDIA BERT models
+- Fine-Tuning on SQuAD 2.0 Dataset
+- Use of Mixed Precision for Inference and Fine-Tuning
+
+Here is a short description of each relevant file:
+ - _bert_squad_tf_inference.ipynb_ : BERT Q&A Inference with TF Checkpoint model
+ - _bert_squad_tf_finetuning.ipynb_ : BERT Fine-Tuning on SQuAD dataset
+
+## 2. Quick Start Guide
+
+### 2.a Build the BERT TensorFlow NGC container:
+To run the notebook you first need to build the Bert TensorFlow container using the following command from the main directory of this repository:
+
+``` bash
+docker build . --rm -t bert
+```
+### 2.b Dataset
+
+We need to download the vocabulary and the bert_config files:
+
+``` python3
+python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights  # Includes vocab
+```
+
+This is only needed during fine-tuning in order to download the SQuAD dataset:
+
+``` python3
+python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
+```
+
+### 2.c Start of the NGC container to run inference:
+Once the image is built, you need to run the container with the `--publish
+0.0.0.0:8888:8888` option to publish Jupyter's port `8888` to the host machine
+at port `8888` over all network interfaces (`0.0.0.0`):
+
+```bash
+nvidia-docker run \
+  -v $PWD:/workspace/bert \
+  -v $PWD/results:/results \
+  --shm-size=1g \
+  --ulimit memlock=-1 \
+  --ulimit stack=67108864 \
+  --publish 0.0.0.0:8888:8888 \
+  -it bert:latest bash
+```
+
+Then you can use the following command within the BERT Tensorflow container under
+`/workspace/bert`:
+
+```bash
+jupyter notebook --ip=0.0.0.0 --allow-root
+```
+
+And navigate a web browser to the IP address or hostname of the host machine
+at port `8888`:
+
+```
+http://[host machine]:8888
+```
+
+Use the token listed in the output from running the `jupyter` command to log
+in, for example:
+
+```
+http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b
+```
+
+
+# BioBERT Named-Entity Recognition Inference with Mixed Precision
+
+## 1. Overview
+
+Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks.
+
+BioBERT is a domain specific version of BERT that has been trained on PubMed abstracts.
+
+The original BioBERT paper can be found here: https://arxiv.org/abs/1901.08746
+
+NVIDIA's BioBERT is an optimized version of the implementation presented in the paper, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.
+
+### 1.a Learning objectives
+
+This repository contains an example notebook that demonstrates:
+- Inference on NER task with BioBERT model
+- The use/download of fine-tuned NVIDIA BioBERT models
+- Use of Mixed Precision for Inference
+
+Here is a short description of the relevant file:
+ - _biobert_ner_tf_inference.ipynb_ : BioBERT Inference with TF Checkpoint model
+ 
+## 2. Quick Start Guide
+
+### 2.a Build the BERT TensorFlow NGC container:
+To run the notebook you first need to build the Bert TensorFlow container using the following command from the main directory of this repository:
+
+``` bash
+docker build . --rm -t bert
+```
+### 2.b Start of the NGC container to run inference:
+Once the image is built, you need to run the container with the `--publish
+0.0.0.0:8888:8888` option to publish Jupyter's port `8888` to the host machine
+at port `8888` over all network interfaces (`0.0.0.0`):
+
+```bash
+nvidia-docker run \
+  -v $PWD:/workspace/bert \
+  -v $PWD/results:/results \
+  --shm-size=1g \
+  --ulimit memlock=-1 \
+  --ulimit stack=67108864 \
+  --publish 0.0.0.0:8888:8888 \
+  -it bert:latest bash
+```
+
+Then you can use the following commands within the BERT Tensorflow container under
+`/workspace/bert`:
+
+
+Install spaCy. You'll use this to pre-process text and to visualize the results using displaCy.
+```
+pip install spacy
+python -m spacy download en_core_web_sm
+```
+
+Launch Jupyter.
+```bash
+jupyter notebook --ip=0.0.0.0 --allow-root
+```
+
+And navigate a web browser to the IP address or hostname of the host machine
+at port `8888`:
+
+```
+http://[host machine]:8888
+```
+
+Use the token listed in the output from running the `jupyter` command to log
+in, for example:
+
+```
+http://[host machine]:8888/?token=aae96ae9387cd28151868fee318c3b3581a2d794f3b25c6b
+```
+
@@ -0,0 +1,624 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
+    "\n",
+    "# BERT Question Answering Fine-Tuning with Mixed Precision"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Overview\n",
+    "\n",
+    "Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
+    "\n",
+    "The original paper can be found here: https://arxiv.org/abs/1810.04805.\n",
+    "\n",
+    "NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.a Learning objectives\n",
+    "\n",
+    "This notebook demonstrates:\n",
+    "- Fine-Tuning on Question Answering (QA) task with BERT Large model\n",
+    "- The use/download of pretrained NVIDIA BERT models\n",
+    "- Use of Mixed Precision for Training"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Requirements\n",
+    "\n",
+    "Please refer to Section 2 of the README file."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. BERT Question Answering Task\n",
+    "\n",
+    "Here we run QA fine-tuning on a pre-trained BERT model.\n",
+    "To fine-tune we will use the [SQuAD 1.1 Dataset](https://rajpurkar.github.io/SQuAD-explorer/) which contains 100,000+ question-answer pairs on 500+ articles."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "data_dir =  '/workspace/bert/data/download'\n",
+    "\n",
+    "# SQuAD json for training\n",
+    "train_file = os.path.join(data_dir, 'squad/v1.1/train-v1.1.json')\n",
+    "# json for inference\n",
+    "predict_file = os.path.join(data_dir, 'squad/v1.1/dev-v1.1.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.a Mixed Precision\n",
+    "\n",
+    "Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
+    "\n",
+    "For information about:\n",
+    "- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
+    "- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
+    "- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook we control mixed precision execution with the following flag: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "use_fp16 = True;\n",
+    "\n",
+    "import os\n",
+    "os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE\"] = \"1\" if use_fp16 else \"0\" \n",
+    "\n",
+    "# For detailed debug uncomment the following line:\n",
+    "#os.environ[\"TF_CPP_VMODULE\"]=\"auto_mixed_precision=2\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Pre-Trained NVIDIA BERT TF Models\n",
+    "\n",
+    "Based on the model size, we have the following two default configurations of BERT.\n",
+    "\n",
+    "| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |\n",
+    "|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|\n",
+    "|BERTBASE |12 encoder| 768| 12|4 x  768|512|110M|\n",
+    "|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|\n",
+    "\n",
+    "We will use large pre-trained models available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
+    "There are many configurations available; in particular we will download and use the following:\n",
+    "\n",
+    "**bert_tf_large_fp16_384**\n",
+    "\n",
+    "Which is pre-trained using the Wikipedia and Book corpus datasets as training data. \n",
+    "We will fine-tune on the SQuAD 1.1 Dataset."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's create the folders for the pre-trained models:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# bert_tf_large_fp16_384\n",
+    "DATA_DIR_FP16 = '/workspace/bert/data/download/pretrained_model_fp16'\n",
+    "!mkdir -p $DATA_DIR_FP16\n",
+    "!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_for_tensorflow.zip \\\n",
+    "https://api.ngc.nvidia.com/v2/models/nvidia/bert_for_tensorflow/versions/1/zip\n",
+    "!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_for_tensorflow.zip     "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the code that follows we will refer to this model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "notebooks_dir = '/workspace/bert/notebooks'\n",
+    "\n",
+    "working_dir = '/workspace/bert'\n",
+    "if working_dir not in sys.path:\n",
+    "    sys.path.append(working_dir)\n",
+    "\n",
+    "init_checkpoint = os.path.join(data_dir, 'pretrained_model_fp16/model.ckpt-1000000')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Running QA task fine-tuning\n",
+    "\n",
+    "In order to run QA fine-tuning we will follow, step by step, a simplified version of the flow implemented in run_squad.py:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import run_squad\n",
+    "\n",
+    "import json\n",
+    "import tensorflow as tf\n",
+    "import modeling\n",
+    "import tokenization\n",
+    "import time\n",
+    "import random\n",
+    "\n",
+    "import optimization\n",
+    "\n",
+    "tf.logging.set_verbosity(tf.logging.INFO)\n",
+    "\n",
+    "# Create the output directory where all the results are saved.\n",
+    "output_dir = os.path.join(working_dir, 'results')\n",
+    "tf.gfile.MakeDirs(output_dir)\n",
+    "\n",
+    "# The config json file corresponding to the pre-trained BERT model.\n",
+    "# This specifies the model architecture.\n",
+    "bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')\n",
+    "\n",
+    "# The vocabulary file that the BERT model was trained on.\n",
+    "vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')\n",
+    "\n",
+    "# Whether to lower case the input text. \n",
+    "# Should be True for uncased models and False for cased models.\n",
+    "do_lower_case = True\n",
+    "  \n",
+    "# Total batch size for predictions\n",
+    "predict_batch_size = 1\n",
+    "params = dict([('batch_size', predict_batch_size)])\n",
+    "\n",
+    "# The maximum total input sequence length after WordPiece tokenization. \n",
+    "# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
+    "max_seq_length = 384\n",
+    "\n",
+    "# When splitting up a long document into chunks, how much stride to take between chunks.\n",
+    "doc_stride = 128\n",
+    "\n",
+    "# The maximum number of tokens for the question. \n",
+    "# Questions longer than this will be truncated to this length.\n",
+    "max_query_length = 64\n",
+    "\n",
+    "# This is a workaround to use flags from here:\n",
+    "flags = tf.flags\n",
+    "\n",
+    "if 'f' not in tf.flags.FLAGS: \n",
+    "    tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
+    "FLAGS = flags.FLAGS\n",
+    "# FLAGS.verbose_logging = True\n",
+    "\n",
+    "# The total number of n-best predictions to generate in the nbest_predictions.json output file.\n",
+    "n_best_size = 20\n",
+    "\n",
+    "# The maximum length of an answer that can be generated. \n",
+    "# This is needed  because the start and end predictions are not conditioned on one another.\n",
+    "max_answer_length = 30\n",
+    "\n",
+    "# The initial learning rate for Adam\n",
+    "learning_rate = 5e-6\n",
+    "\n",
+    "# Total batch size for training\n",
+    "train_batch_size = 3\n",
+    "\n",
+    "# Proportion of training to perform linear learning rate warmup for\n",
+    "warmup_proportion = 0.1\n",
+    "\n",
+    "# Total number of training epochs to perform (results will improve if trained with more epochs)\n",
+    "num_train_epochs = 2\n",
+    "\n",
+    "global_batch_size = train_batch_size\n",
+    "training_hooks = []\n",
+    "training_hooks.append(run_squad.LogTrainRunHook(global_batch_size, 0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's create the tokenizer and the training tf_record:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Validate the casing config consistency with the checkpoint name.\n",
+    "tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
+    "\n",
+    "# Create the tokenizer.\n",
+    "tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+    "    \n",
+    "# Load the configuration from file\n",
+    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
+    "\n",
+    "config = tf.ConfigProto(log_device_placement=True) \n",
+    "\n",
+    "run_config = tf.estimator.RunConfig(\n",
+    "      model_dir=output_dir,\n",
+    "      session_config=config,\n",
+    "      save_checkpoints_steps=1000,\n",
+    "      keep_checkpoint_max=1)\n",
+    "\n",
+    "# Read the training examples from the training file:\n",
+    "train_examples = run_squad.read_squad_examples(input_file=train_file, is_training=True)\n",
+    "\n",
+    "num_train_steps = int(len(train_examples) / global_batch_size * num_train_epochs)\n",
+    "num_warmup_steps = int(num_train_steps * warmup_proportion)\n",
+    "\n",
+    "# Pre-shuffle the input to avoid having to make a very large shuffle\n",
+    "# buffer in the `input_fn`.\n",
+    "rng = random.Random(12345)\n",
+    "rng.shuffle(train_examples)\n",
+    "\n",
+    "start_index = 0 \n",
+    "end_index = len(train_examples)\n",
+    "tmp_filenames = os.path.join(output_dir, \"train.tf_record\")\n",
+    "\n",
+    "# We write to a temporary file to avoid storing very large constant tensors\n",
+    "# in memory.\n",
+    "train_writer = run_squad.FeatureWriter(\n",
+    "    filename=tmp_filenames,\n",
+    "    is_training=True)\n",
+    "\n",
+    "run_squad.convert_examples_to_features(\n",
+    "    examples=train_examples[start_index:end_index],\n",
+    "    tokenizer=tokenizer,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    doc_stride=doc_stride,\n",
+    "    max_query_length=max_query_length,\n",
+    "    is_training=True,\n",
+    "    output_fn=train_writer.process_feature)\n",
+    "\n",
+    "train_writer.close()\n",
+    "\n",
+    "tf.logging.info(\"***** Running training *****\")\n",
+    "tf.logging.info(\"  Num orig examples = %d\", end_index - start_index)\n",
+    "tf.logging.info(\"  Num split examples = %d\", train_writer.num_features)\n",
+    "tf.logging.info(\"  Batch size = %d\", train_batch_size)\n",
+    "tf.logging.info(\"  Num steps = %d\", num_train_steps)\n",
+    "tf.logging.info(\"  LR = %f\", learning_rate)\n",
+    "\n",
+    "del train_examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We need to create the model for the estimator:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
+    "    unique_ids = features[\"unique_ids\"]\n",
+    "    input_ids = features[\"input_ids\"]\n",
+    "    input_mask = features[\"input_mask\"]\n",
+    "    segment_ids = features[\"segment_ids\"]\n",
+    "    \n",
+    "    is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
+    "\n",
+    "    (start_logits, end_logits) = run_squad.create_model(\n",
+    "        bert_config=bert_config,\n",
+    "        is_training=is_training,\n",
+    "        input_ids=input_ids,\n",
+    "        input_mask=input_mask,\n",
+    "        segment_ids=segment_ids,\n",
+    "        use_one_hot_embeddings=False)\n",
+    "\n",
+    "    tvars = tf.trainable_variables()\n",
+    "\n",
+    "    initialized_variable_names = {}\n",
+    "    if init_checkpoint:\n",
+    "        (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
+    "        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
+    "\n",
+    "    output_spec = None\n",
+    "    if mode == tf.estimator.ModeKeys.TRAIN:\n",
+    "        seq_length = modeling.get_shape_list(input_ids)[1]\n",
+    "        \n",
+    "        def compute_loss(logits, positions):\n",
+    "            one_hot_positions = tf.one_hot(positions, depth=seq_length, dtype=tf.float32)\n",
+    "            log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
+    "            loss = -tf.reduce_mean(tf.reduce_sum(one_hot_positions * log_probs, axis=-1))\n",
+    "            return loss\n",
+    "\n",
+    "        start_positions = features[\"start_positions\"]\n",
+    "        end_positions = features[\"end_positions\"]\n",
+    "        start_loss = compute_loss(start_logits, start_positions)\n",
+    "        end_loss = compute_loss(end_logits, end_positions)\n",
+    "        total_loss = (start_loss + end_loss) / 2.0\n",
+    "        \n",
+    "        train_op = optimization.create_optimizer(total_loss, learning_rate, num_train_steps, num_warmup_steps, None, False, use_fp16)\n",
+    "        \n",
+    "        output_spec = tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op)\n",
+    "    \n",
+    "    elif mode == tf.estimator.ModeKeys.PREDICT:\n",
+    "        predictions = {\n",
+    "            \"unique_ids\": unique_ids,\n",
+    "            \"start_logits\": start_logits,\n",
+    "            \"end_logits\": end_logits,\n",
+    "        }\n",
+    "        output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
+    "\n",
+    "    return output_spec\n",
+    "\n",
+    "estimator = tf.estimator.Estimator(\n",
+    "  model_fn=model_fn,\n",
+    "  config=run_config,\n",
+    "  params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.a Fine Tuning\n",
+    "\n",
+    "Fine-tuning is performed using run_squad.py.\n",
+    "\n",
+    "The run_squad.sh script trains a model and performs evaluation on the SQuAD v1.1 dataset. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "train_input_fn = run_squad.input_fn_builder(\n",
+    "    input_file=tmp_filenames,\n",
+    "    batch_size=train_batch_size,\n",
+    "    seq_length=max_seq_length,\n",
+    "    is_training=True,\n",
+    "    drop_remainder=True,\n",
+    "    hvd=None)\n",
+    "\n",
+    "train_start_time = time.time()\n",
+    "estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=num_train_steps)\n",
+    "train_time_elapsed = time.time() - train_start_time\n",
+    "train_time_wo_startup = training_hooks[-1].total_time\n",
+    "\n",
+    "avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_wo_startup if train_time_wo_startup else 0\n",
+    "\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "tf.logging.info(\"Total Training Time = %0.2f Training Time W/O start up overhead = %0.2f \"\n",
+    "                \"Sentences processed = %d\", train_time_elapsed, train_time_wo_startup,\n",
+    "                num_train_steps * global_batch_size)\n",
+    "tf.logging.info(\"Training Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
+    "tf.logging.info(\"-----------------------------\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.b Inference\n",
+    "\n",
+    "Now we run inference with the fine-tuned model just saved:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_examples = run_squad.read_squad_examples(\n",
+    "        input_file=predict_file, is_training=False)\n",
+    "\n",
+    "eval_writer = run_squad.FeatureWriter(\n",
+    "    filename=os.path.join(output_dir, \"eval.tf_record\"),\n",
+    "    is_training=False)\n",
+    "\n",
+    "eval_features = []\n",
+    "def append_feature(feature):\n",
+    "    eval_features.append(feature)\n",
+    "    eval_writer.process_feature(feature)\n",
+    "\n",
+    "\n",
+    "# Loads a data file into a list of InputBatch's\n",
+    "run_squad.convert_examples_to_features(\n",
+    "    examples=eval_examples,\n",
+    "    tokenizer=tokenizer,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    doc_stride=doc_stride,\n",
+    "    max_query_length=max_query_length,\n",
+    "    is_training=False,\n",
+    "    output_fn=append_feature)\n",
+    "\n",
+    "eval_writer.close()\n",
+    "\n",
+    "tf.logging.info(\"***** Running predictions *****\")\n",
+    "tf.logging.info(\"  Num orig examples = %d\", len(eval_examples))\n",
+    "tf.logging.info(\"  Num split examples = %d\", len(eval_features))\n",
+    "tf.logging.info(\"  Batch size = %d\", predict_batch_size)\n",
+    "\n",
+    "predict_input_fn = run_squad.input_fn_builder(\n",
+    "    input_file=eval_writer.filename,\n",
+    "    batch_size=predict_batch_size,\n",
+    "    seq_length=max_seq_length,\n",
+    "    is_training=False,\n",
+    "    drop_remainder=False)\n",
+    "\n",
+    "all_results = []\n",
+    "eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]\n",
+    "eval_start_time = time.time()\n",
+    "for result in estimator.predict(\n",
+    "        predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=None):\n",
+    "    unique_id = int(result[\"unique_ids\"])\n",
+    "    start_logits = [float(x) for x in result[\"start_logits\"].flat]\n",
+    "    end_logits = [float(x) for x in result[\"end_logits\"].flat]\n",
+    "    all_results.append(\n",
+    "      run_squad.RawResult(\n",
+    "          unique_id=unique_id,\n",
+    "          start_logits=start_logits,\n",
+    "          end_logits=end_logits))\n",
+    "\n",
+    "eval_time_elapsed = time.time() - eval_start_time\n",
+    "eval_time_wo_startup = eval_hooks[-1].total_time\n",
+    "num_sentences = eval_hooks[-1].count * predict_batch_size\n",
+    "avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup\n",
+    "\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "tf.logging.info(\"Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f \"\n",
+    "                \"Sentences processed = %d\", eval_time_elapsed, eval_time_wo_startup,\n",
+    "                num_sentences)\n",
+    "tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "\n",
+    "output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
+    "output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
+    "output_null_log_odds_file = os.path.join(output_dir, \"null_odds.json\")\n",
+    "\n",
+    "run_squad.write_predictions(eval_examples, eval_features, all_results,\n",
+    "                  n_best_size, max_answer_length,\n",
+    "                  do_lower_case, output_prediction_file,\n",
+    "                  output_nbest_file, output_null_log_odds_file)\n",
+    "\n",
+    "tf.logging.info(\"Inference Results:\")\n",
+    "\n",
+    "# Here we show only the prediction results, nbest prediction is also available in the output directory\n",
+    "results = \"\"\n",
+    "with open(output_prediction_file, 'r') as json_file:\n",
+    "    data = json.load(json_file)\n",
+    "    for question in eval_examples:\n",
+    "        results += \"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(question.qas_id, question.question_text, data[question.qas_id])\n",
+    "\n",
+    "\n",
+    "from IPython.display import display, HTML\n",
+    "display(HTML(\"<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>\".format(results)))        "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.c Evaluation\n",
+    "\n",
+    "Let's run evaluation using the script in the SQuAD 1.1 folder and our fine-tuned model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python /workspace/bert/data/download/squad/v1.1/evaluate-v1.1.py \\\n",
+    "    $predict_file \\\n",
+    "    $output_dir/predictions.json"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. What's next\n",
+    "\n",
+    "Now that you have fine-tuned a BERT model you may want to take a look at the run_squad script, which contains more options for fine-tuning."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,577 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
+    "\n",
+    "# BERT Question Answering Inference with Mixed Precision\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Overview\n",
+    "\n",
+    "Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
+    "\n",
+    "The original paper can be found here: https://arxiv.org/abs/1810.04805.\n",
+    "\n",
+    "NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.a Learning objectives\n",
+    "\n",
+    "This notebook demonstrates:\n",
+    "- Inference on QA task with BERT Large model\n",
+    "- The use/download of fine-tuned NVIDIA BERT models\n",
+    "- Use of Mixed Precision for Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Requirements\n",
+    "\n",
+    "Please refer to the ReadMe file"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. BERT Inference: Question Answering\n",
+    "\n",
+    "We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
+    "\n",
+    "Here we use a BERT model fine-tuned on a [SQuAD 2.0 Dataset](https://rajpurkar.github.io/SQuAD-explorer/) which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.a Paragraph and Queries\n",
+    "\n",
+    "In this example we will ask our BERT model questions related to the following paragraph:\n",
+    "\n",
+    "**The Apollo Program**\n",
+    "_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
+    "\n",
+    "The questions and their expected answers are shown below:\n",
+    "\n",
+    " - **Q1:** \"What project put the first Americans into space?\" \n",
+    "  - **A1:** \"Project Mercury\"\n",
+    " - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
+    "  - **A2:** \"The Apollo program\"\n",
+    " - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
+    "  - **A3:** \"1968\"\n",
+    " - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
+    "  - **A4:** \"John F. Kennedy\"\n",
+    " - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
+    "  - **A5:** \"Soviet Union\"\n",
+    " - **Q6:** \"How long did Project Apollo run?\"\n",
+    "  - **A6:** \"1961 to 1972\"\n",
+    " - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
+    "  - **A7:** \"Gemini Mission\"\n",
+    " - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
+    "  - **A8:** \"Skylab\"\n",
+    "  \n",
+    "---\n",
+    "\n",
+    "The paragraph and the questions can be easily customized by changing the code below:\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%writefile input.json\n",
+    "{\"data\": \n",
+    " [\n",
+    "     {\"title\": \"Project Apollo\",\n",
+    "      \"paragraphs\": [\n",
+    "          {\"context\":\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\", \n",
+    "           \"qas\": [\n",
+    "               { \"question\": \"What project put the first Americans into space?\", \n",
+    "                 \"id\": \"Q1\"\n",
+    "               },\n",
+    "               { \"question\": \"What program was created to carry out these projects and missions?\",\n",
+    "                 \"id\": \"Q2\"\n",
+    "               },\n",
+    "               { \"question\": \"What year did the first manned Apollo flight occur?\",\n",
+    "                 \"id\": \"Q3\"\n",
+    "               },                \n",
+    "               { \"question\": \"What President is credited with the original notion of putting Americans in space?\",\n",
+    "                 \"id\": \"Q4\"\n",
+    "               },\n",
+    "               { \"question\": \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\",\n",
+    "                 \"id\": \"Q5\"\n",
+    "               },\n",
+    "               { \"question\": \"How long did Project Apollo run?\",\n",
+    "                 \"id\": \"Q6\"\n",
+    "               },               \n",
+    "               { \"question\": \"What program helped develop space travel techniques that Project Apollo used?\",\n",
+    "                 \"id\": \"Q7\"\n",
+    "               },                \n",
+    "               {\"question\": \"What space station supported three manned missions in 1973-1974?\",\n",
+    "                 \"id\": \"Q8\"\n",
+    "               }                \n",
+    "]}]}]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "notebooks_dir = '/workspace/bert/notebooks'\n",
+    "data_dir = '/workspace/bert/data/download'\n",
+    "\n",
+    "working_dir = '/workspace/bert'\n",
+    "if working_dir not in sys.path:\n",
+    "    sys.path.append(working_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_file = os.path.join(notebooks_dir, 'input.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.b Mixed Precision\n",
+    "\n",
+    "Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
+    "\n",
+    "For information about:\n",
+    "- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
+    "- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
+    "- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook we control mixed precision execution with the environment variable:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION\"] = \"1\" "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can choose the mixed precision model (which takes much less time to train than the fp32 version) without losing accuracy, with the following flag: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "use_mixed_precision_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To effectively evaluate the speedup of mixed precision try a bigger workload by uncommenting the following line:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#input_file = '/workspace/bert/data/download/squad/v2.0/dev-v2.0.json'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Fine-Tuned NVIDIA BERT TF Models\n",
+    "\n",
+    "Based on the model size, we have the following two default configurations of BERT.\n",
+    "\n",
+    "| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |\n",
+    "|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|\n",
+    "|BERTBASE |12 encoder| 768| 12|4 x  768|512|110M|\n",
+    "|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|\n",
+    "\n",
+    "We will take advantage of the fine-tuned models available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
+    "Among the many configurations available we will download these two:\n",
+    "\n",
+    " - **bert_tf_v2_large_fp32_384**\n",
+    "\n",
+    " - **bert_tf_v2_large_fp16_384**\n",
+    "\n",
+    "Both are trained on the SQuAD 2.0 dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# bert_tf_v2_large_fp32_384\n",
+    "DATA_DIR_FP32='/workspace/bert/data/download/finetuned_model_fp32'\n",
+    "!mkdir -p $DATA_DIR_FP32\n",
+    "!wget -nc -q --show-progress -O $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \\\n",
+    "https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp32_384/versions/1/zip\n",
+    "!unzip -n -d $DATA_DIR_FP32/ $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \n",
+    "    \n",
+    "# bert_tf_v2_large_fp16_384\n",
+    "DATA_DIR_FP16='/workspace/bert/data/download/finetuned_model_fp16'\n",
+    "!mkdir -p $DATA_DIR_FP16\n",
+    "!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip \\\n",
+    "https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp16_384/versions/1/zip\n",
+    "!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the code that follows we will refer to these models."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Running QA task inference\n",
+    "\n",
+    "In order to run QA inference we will follow step-by-step the flow implemented in run_squad.py.\n",
+    "\n",
+    "Configuration:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import run_squad\n",
+    "import json\n",
+    "import tensorflow as tf\n",
+    "import modeling\n",
+    "import tokenization\n",
+    "import time\n",
+    "import random\n",
+    "\n",
+    "tf.logging.set_verbosity(tf.logging.INFO)\n",
+    "\n",
+    "# Create the output directory where all the results are saved.\n",
+    "output_dir = os.path.join(working_dir, 'results')\n",
+    "tf.gfile.MakeDirs(output_dir)\n",
+    "\n",
+    "# The config json file corresponding to the pre-trained BERT model.\n",
+    "# This specifies the model architecture.\n",
+    "bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')\n",
+    "\n",
+    "# The vocabulary file that the BERT model was trained on.\n",
+    "vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')\n",
+    "\n",
+    "# Depending on the mixed precision flag we use different fine-tuned model\n",
+    "if use_mixed_precision_model:\n",
+    "    init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp16/model.ckpt-8144')\n",
+    "else:\n",
+    "    init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp32/model.ckpt-8144')\n",
+    "\n",
+    "# Whether to lower case the input text. \n",
+    "# Should be True for uncased models and False for cased models.\n",
+    "do_lower_case = True\n",
+    "  \n",
+    "# Total batch size for predictions\n",
+    "predict_batch_size = 1\n",
+    "params = dict([('batch_size', predict_batch_size)])\n",
+    "\n",
+    "# The maximum total input sequence length after WordPiece tokenization. \n",
+    "# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
+    "max_seq_length = 384\n",
+    "\n",
+    "# When splitting up a long document into chunks, how much stride to take between chunks.\n",
+    "doc_stride = 128\n",
+    "\n",
+    "# The maximum number of tokens for the question. \n",
+    "# Questions longer than this will be truncated to this length.\n",
+    "max_query_length = 64\n",
+    "\n",
+    "# Workaround (WA) to make tf.flags usable from the notebook: define the 'f' flag if it is missing\n",
+    "flags = tf.flags\n",
+    "\n",
+    "if 'f' not in tf.flags.FLAGS: \n",
+    "    tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
+    "FLAGS = flags.FLAGS\n",
+    "\n",
+    "# The total number of n-best predictions to generate in the nbest_predictions.json output file.\n",
+    "n_best_size = 20\n",
+    "\n",
+    "# The maximum length of an answer that can be generated. \n",
+    "# This is needed  because the start and end predictions are not conditioned on one another.\n",
+    "max_answer_length = 30"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's define the tokenizer and create the model for the estimator:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Validate the casing config consistency with the checkpoint name.\n",
+    "tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
+    "\n",
+    "# Create the tokenizer.\n",
+    "tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+    "\n",
+    "# Load the configuration from file\n",
+    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
+    "\n",
+    "def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
+    "    unique_ids = features[\"unique_ids\"]\n",
+    "    input_ids = features[\"input_ids\"]\n",
+    "    input_mask = features[\"input_mask\"]\n",
+    "    segment_ids = features[\"segment_ids\"]\n",
+    "\n",
+    "    (start_logits, end_logits) = run_squad.create_model(\n",
+    "        bert_config=bert_config,\n",
+    "        is_training=False,\n",
+    "        input_ids=input_ids,\n",
+    "        input_mask=input_mask,\n",
+    "        segment_ids=segment_ids,\n",
+    "        use_one_hot_embeddings=False)\n",
+    "\n",
+    "    tvars = tf.trainable_variables()\n",
+    "\n",
+    "    initialized_variable_names = {}\n",
+    "    (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
+    "    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
+    "    output_spec = None\n",
+    "    predictions = {\"unique_ids\": unique_ids,\n",
+    "                   \"start_logits\": start_logits,\n",
+    "                   \"end_logits\": end_logits}\n",
+    "    output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
+    "    return output_spec\n",
+    "\n",
+    "config = tf.ConfigProto(log_device_placement=True) \n",
+    "\n",
+    "run_config = tf.estimator.RunConfig(\n",
+    "      model_dir=None,\n",
+    "      session_config=config,\n",
+    "      save_checkpoints_steps=1000,\n",
+    "      keep_checkpoint_max=1)\n",
+    "\n",
+    "estimator = tf.estimator.Estimator(\n",
+    "  model_fn=model_fn,\n",
+    "  config=run_config,\n",
+    "  params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.a Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "eval_examples = run_squad.read_squad_examples(\n",
+    "        input_file=input_file, is_training=False)\n",
+    "\n",
+    "eval_writer = run_squad.FeatureWriter(\n",
+    "    filename=os.path.join(output_dir, \"eval.tf_record\"),\n",
+    "    is_training=False)\n",
+    "\n",
+    "eval_features = []\n",
+    "def append_feature(feature):\n",
+    "    eval_features.append(feature)\n",
+    "    eval_writer.process_feature(feature)\n",
+    "\n",
+    "\n",
+    "# Loads a data file into a list of InputBatch's\n",
+    "run_squad.convert_examples_to_features(\n",
+    "    examples=eval_examples,\n",
+    "    tokenizer=tokenizer,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    doc_stride=doc_stride,\n",
+    "    max_query_length=max_query_length,\n",
+    "    is_training=False,\n",
+    "    output_fn=append_feature)\n",
+    "\n",
+    "eval_writer.close()\n",
+    "\n",
+    "tf.logging.info(\"***** Running predictions *****\")\n",
+    "tf.logging.info(\"  Num orig examples = %d\", len(eval_examples))\n",
+    "tf.logging.info(\"  Num split examples = %d\", len(eval_features))\n",
+    "tf.logging.info(\"  Batch size = %d\", predict_batch_size)\n",
+    "\n",
+    "predict_input_fn = run_squad.input_fn_builder(\n",
+    "    input_file=eval_writer.filename,\n",
+    "    batch_size=predict_batch_size,\n",
+    "    seq_length=max_seq_length,\n",
+    "    is_training=False,\n",
+    "    drop_remainder=False)\n",
+    "\n",
+    "all_results = []\n",
+    "eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]\n",
+    "eval_start_time = time.time()\n",
+    "for result in estimator.predict(\n",
+    "        predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=init_checkpoint):\n",
+    "    unique_id = int(result[\"unique_ids\"])\n",
+    "    start_logits = [float(x) for x in result[\"start_logits\"].flat]\n",
+    "    end_logits = [float(x) for x in result[\"end_logits\"].flat]\n",
+    "    all_results.append(\n",
+    "      run_squad.RawResult(\n",
+    "          unique_id=unique_id,\n",
+    "          start_logits=start_logits,\n",
+    "          end_logits=end_logits))\n",
+    "\n",
+    "eval_time_elapsed = time.time() - eval_start_time\n",
+    "\n",
+    "eval_time_wo_startup = eval_hooks[-1].total_time\n",
+    "num_sentences = eval_hooks[-1].count * predict_batch_size\n",
+    "avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup\n",
+    "\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "tf.logging.info(\"Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f \"\n",
+    "                \"Sentences processed = %d\", eval_time_elapsed, eval_time_wo_startup,\n",
+    "                num_sentences)\n",
+    "tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "\n",
+    "output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
+    "output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
+    "output_null_log_odds_file = os.path.join(output_dir, \"null_odds.json\")\n",
+    "\n",
+    "run_squad.write_predictions(eval_examples, eval_features, all_results,\n",
+    "                  n_best_size, max_answer_length,\n",
+    "                  do_lower_case, output_prediction_file,\n",
+    "                  output_nbest_file, output_null_log_odds_file)\n",
+    "\n",
+    "tf.logging.info(\"Inference Results:\")\n",
+    "\n",
+    "# Here we show only the prediction results, nbest prediction is also available in the output directory\n",
+    "results = \"\"\n",
+    "with open(output_prediction_file, 'r') as json_file:\n",
+    "    data = json.load(json_file)\n",
+    "    for question in eval_examples:\n",
+    "        results += \"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(question.qas_id, question.question_text, data[question.qas_id])\n",
+    "\n",
+    "\n",
+    "from IPython.display import display, HTML\n",
+    "display(HTML(\"<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>\".format(results)))        "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. What's next"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that you are familiar with running QA Inference on BERT, using mixed precision, you may want to try\n",
+    "your own paragraphs and queries. \n",
+    "\n",
+    "You may also want to take a look at the notebook __bert_squad_tf_finetuning.ipynb__, available in the same directory, to see how to run fine-tuning on BERT."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,765 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "jDXroBuNw60P"
+   },
+   "outputs": [],
+   "source": [
+    "# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/notebooks/bert_squad_tf_inference_colab.ipynb#scrollTo=5hRb96NKE3X0\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "k-XnFINow60d"
+   },
+   "source": [
+    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
+    "\n",
+    "# BERT Question Answering Inference with Mixed Precision\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "TfF7V662w60j"
+   },
+   "source": [
+    "## 1. Overview\n",
+    "\n",
+    "Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
+    "\n",
+    "The original paper can be found here: https://arxiv.org/abs/1810.04805.\n",
+    "\n",
+    "NVIDIA's BERT 19.10 is an optimized version of Google's official implementation, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Ah3Lv9zyw60l"
+   },
+   "source": [
+    "### 1.a Learning objectives\n",
+    "\n",
+    "This notebook demonstrates:\n",
+    "- Inference on QA task with BERT Large model\n",
+    "- The use/download of fine-tuned NVIDIA BERT models\n",
+    "- Use of Mixed Precision for Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "hxNJ8HByw60o"
+   },
+   "source": [
+    "## 2. Requirements\n",
+    "\n",
+    "### 2.a GPU\n",
+    "\n",
+    "Before running this notebook, please set the Colab runtime environment to GPU via the menu *Runtime => Change runtime type => GPU*.\n",
+    "\n",
+    "This demo will work on any NVIDIA GPU with CUDA cores, though for improved FP16 inference, a Volta, Turing or newer generation GPU with Tensor cores is desired.  On Google Colab, this normally means a T4 GPU. If you are assigned an older K80 GPU, another trial at another time might give you a T4 GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "hxNJ8HByw60o"
+   },
+   "source": [
+    "### 2.b Download the required files from NVIDIA-Github:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "KV_WnOY4zUa_"
+   },
+   "outputs": [],
+   "source": [
+    "!wget -nc -q --show-progress -O ./master.zip \\\n",
+    "https://github.com/NVIDIA/DeepLearningExamples/archive/master.zip\n",
+    "!unzip -q -n -d . ./master.zip "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "5D7i7Pao5qoj"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "WORKSPACE_DIR='./DeepLearningExamples-master/TensorFlow/LanguageModeling/BERT/'\n",
+    "os.chdir(WORKSPACE_DIR)\n",
+    "print (os.getcwd())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "mjlZbP0dw60r"
+   },
+   "source": [
+    "## 3. BERT Inference: Question Answering\n",
+    "\n",
+    "We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
+    "\n",
+    "Here we use a BERT model fine-tuned on a [SQuAD 2.0 Dataset](https://rajpurkar.github.io/SQuAD-explorer/) which contains 100,000+ question-answer pairs on 500+ articles combined with over 50,000 new, unanswerable questions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "mOc16svBw60t"
+   },
+   "source": [
+    "### 3.a Paragraph and Queries\n",
+    "\n",
+    "In this example we will ask our BERT model questions related to the following paragraph:\n",
+    "\n",
+    "**The Apollo Program**\n",
+    "_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
+    "\n",
+    "The questions and their expected answers are shown below:\n",
+    "\n",
+    " - **Q1:** \"What project put the first Americans into space?\" \n",
+    "  - **A1:** \"Project Mercury\"\n",
+    " - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
+    "  - **A2:** \"The Apollo program\"\n",
+    " - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
+    "  - **A3:** \"1968\"\n",
+    " - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
+    "  - **A4:** \"John F. Kennedy\"\n",
+    " - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
+    "  - **A5:** \"Soviet Union\"\n",
+    " - **Q6:** \"How long did Project Apollo run?\"\n",
+    "  - **A6:** \"1961 to 1972\"\n",
+    " - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
+    "  - **A7:** \"Gemini Mission\"\n",
+    " - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
+    "  - **A8:** \"Skylab\"\n",
+    "  \n",
+    "---\n",
+    "\n",
+    "The paragraph and the questions can be easily customized by changing the code below:\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "srU0TT1Iw60v"
+   },
+   "outputs": [],
+   "source": [
+    "%%writefile input.json\n",
+    "{\"data\": \n",
+    " [\n",
+    "     {\"title\": \"Project Apollo\",\n",
+    "      \"paragraphs\": [\n",
+    "          {\"context\":\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\", \n",
+    "           \"qas\": [\n",
+    "               { \"question\": \"What project put the first Americans into space?\", \n",
+    "                 \"id\": \"Q1\"\n",
+    "               },\n",
+    "               { \"question\": \"What program was created to carry out these projects and missions?\",\n",
+    "                 \"id\": \"Q2\"\n",
+    "               },\n",
+    "               { \"question\": \"What year did the first manned Apollo flight occur?\",\n",
+    "                 \"id\": \"Q3\"\n",
+    "               },                \n",
+    "               { \"question\": \"What President is credited with the original notion of putting Americans in space?\",\n",
+    "                 \"id\": \"Q4\"\n",
+    "               },\n",
+    "               { \"question\": \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\",\n",
+    "                 \"id\": \"Q5\"\n",
+    "               },\n",
+    "               { \"question\": \"How long did Project Apollo run?\",\n",
+    "                 \"id\": \"Q6\"\n",
+    "               },               \n",
+    "               { \"question\": \"What program helped develop space travel techniques that Project Apollo used?\",\n",
+    "                 \"id\": \"Q7\"\n",
+    "               },                \n",
+    "               {\"question\": \"What space station supported three manned missions in 1973-1974?\",\n",
+    "                 \"id\": \"Q8\"\n",
+    "               }                \n",
+    "]}]}]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "ujyka-8Iw603"
+   },
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "working_dir = os.getcwd();\n",
+    "data_dir = os.path.join(working_dir, 'data/download');\n",
+    "if working_dir not in sys.path:\n",
+    "    sys.path.append(working_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "6gA3-6LVw61D"
+   },
+   "outputs": [],
+   "source": [
+    "input_file = os.path.join(working_dir, 'input.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "D9p8XaBnw61N"
+   },
+   "source": [
+    "### 3.b Mixed Precision\n",
+    "\n",
+    "Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
+    "\n",
+    "For information about:\n",
+    "- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
+    "- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
+    "- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "ceeYPqQcw61P"
+   },
+   "source": [
+    "In this notebook we control mixed precision execution with the following environment variable:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "k4jIJevFw61R"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION\"] = \"1\" "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "rt_4-ZA5w61Y"
+   },
+   "source": [
+    "We can choose the mixed precision model (which takes much less time to train than the fp32 version) without losing accuracy, with the following flag: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "BRdclfEaw61Z"
+   },
+   "outputs": [],
+   "source": [
+    "use_mixed_precision_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "iu4Jb5puw61p"
+   },
+   "source": [
+    "## 4. Fine-Tuned NVIDIA BERT TF Models\n",
+    "\n",
+    "Based on the model size, we have the following two default configurations of BERT.\n",
+    "\n",
+    "| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |\n",
+    "|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|\n",
+    "|BERTBASE |12 encoder| 768| 12|4 x  768|512|110M|\n",
+    "|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|\n",
+    "\n",
+    "We will take advantage of the fine-tuned models available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
+    "Among the many configurations available we will download these two:\n",
+    "\n",
+    " - **bert_tf_v2_large_fp32_384**\n",
+    "\n",
+    " - **bert_tf_v2_large_fp16_384**\n",
+    "\n",
+    "Both were fine-tuned on the SQuAD 2.0 dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "5JWKZfP8w61t"
+   },
+   "outputs": [],
+   "source": [
+    "# bert_tf_v2_large_fp32_384\n",
+    "DATA_DIR_FP32 = os.path.join(data_dir, 'finetuned_model_fp32')\n",
+    "!mkdir -p $DATA_DIR_FP32\n",
+    "!wget -nc -q --show-progress -O $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \\\n",
+    "https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp32_384/versions/1/zip\n",
+    "!unzip -n -d $DATA_DIR_FP32/ $DATA_DIR_FP32/bert_tf_v2_large_fp32_384.zip \n",
+    "    \n",
+    "# bert_tf_v2_large_fp16_384\n",
+    "DATA_DIR_FP16  = os.path.join(data_dir, 'finetuned_model_fp16')\n",
+    "!mkdir -p $DATA_DIR_FP16\n",
+    "!wget -nc -q --show-progress -O $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip \\\n",
+    "https://api.ngc.nvidia.com/v2/models/nvidia/bert_tf_v2_large_fp16_384/versions/1/zip\n",
+    "!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/bert_tf_v2_large_fp16_384.zip "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "GrFrZickw61z"
+   },
+   "source": [
+    "In the code that follows we will refer to these models."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "cU8mGJDa1FfX"
+   },
+   "source": [
+    "Download the Google pretrained weights and vocab file:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "5hRb96NKE3X0"
+   },
+   "outputs": [],
+   "source": [
+    "os.chdir(\"./data\");\n",
+    "from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader\n",
+    "gd = GooglePretrainedWeightDownloader(data_dir)\n",
+    "gd.download()\n",
+    "os.chdir(\"..\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "VY1Dipam15DE"
+   },
+   "source": [
+    "We need the horovod package:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "jqAJob92C2wA"
+   },
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    __import__(\"horovod\")\n",
+    "except ImportError:\n",
+    "    os.system(\"pip install horovod\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "5NuuGNsDw611"
+   },
+   "source": [
+    "## 5. Running QA task inference\n",
+    "\n",
+    "In order to run QA inference we will follow step-by-step the flow implemented in run_squad.py.\n",
+    "\n",
+    "Configuration:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "_c2qCQ9-w613"
+   },
+   "outputs": [],
+   "source": [
+    "import run_squad\n",
+    "import json\n",
+    "import tensorflow as tf\n",
+    "import modeling\n",
+    "import tokenization\n",
+    "import time\n",
+    "import random\n",
+    "\n",
+    "tf.logging.set_verbosity(tf.logging.INFO)\n",
+    "\n",
+    "# Create the output directory where all the results are saved.\n",
+    "output_dir = os.path.join(working_dir, 'results')\n",
+    "tf.gfile.MakeDirs(output_dir)\n",
+    "\n",
+    "# The config json file corresponding to the pre-trained BERT model.\n",
+    "# This specifies the model architecture.\n",
+    "bert_config_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json')\n",
+    "\n",
+    "# The vocabulary file that the BERT model was trained on.\n",
+    "vocab_file = os.path.join(data_dir, 'google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt')\n",
+    "\n",
+    "# Depending on the mixed precision flag we use different fine-tuned model\n",
+    "if use_mixed_precision_model:\n",
+    "    init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp16/model.ckpt-8144')\n",
+    "else:\n",
+    "    init_checkpoint = os.path.join(data_dir, 'finetuned_model_fp32/model.ckpt-8144')\n",
+    "\n",
+    "# Whether to lower case the input text. \n",
+    "# Should be True for uncased models and False for cased models.\n",
+    "do_lower_case = True\n",
+    "  \n",
+    "# Total batch size for predictions\n",
+    "predict_batch_size = 1\n",
+    "params = dict([('batch_size', predict_batch_size)])\n",
+    "\n",
+    "# The maximum total input sequence length after WordPiece tokenization. \n",
+    "# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
+    "max_seq_length = 384\n",
+    "\n",
+    "# When splitting up a long document into chunks, how much stride to take between chunks.\n",
+    "doc_stride = 128\n",
+    "\n",
+    "# The maximum number of tokens for the question. \n",
+    "# Questions longer than this will be truncated to this length.\n",
+    "max_query_length = 64\n",
+    "\n",
+    "# This is a WA to use flags from here:\n",
+    "flags = tf.flags\n",
+    "\n",
+    "if 'f' not in tf.flags.FLAGS: \n",
+    "    tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
+    "FLAGS = flags.FLAGS\n",
+    "\n",
+    "# The total number of n-best predictions to generate in the nbest_predictions.json output file.\n",
+    "n_best_size = 20\n",
+    "\n",
+    "# The maximum length of an answer that can be generated. \n",
+    "# This is needed  because the start and end predictions are not conditioned on one another.\n",
+    "max_answer_length = 30"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "2h_eLUgPw618"
+   },
+   "source": [
+    "Let's define the tokenizer and create the model for the estimator:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "RXHdoUb9w619"
+   },
+   "outputs": [],
+   "source": [
+    "# Validate the casing config consistency with the checkpoint name.\n",
+    "tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
+    "\n",
+    "# Create the tokenizer.\n",
+    "tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+    "\n",
+    "# Load the configuration from file\n",
+    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
+    "\n",
+    "def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
+    "    unique_ids = features[\"unique_ids\"]\n",
+    "    input_ids = features[\"input_ids\"]\n",
+    "    input_mask = features[\"input_mask\"]\n",
+    "    segment_ids = features[\"segment_ids\"]\n",
+    "\n",
+    "    (start_logits, end_logits) = run_squad.create_model(\n",
+    "        bert_config=bert_config,\n",
+    "        is_training=False,\n",
+    "        input_ids=input_ids,\n",
+    "        input_mask=input_mask,\n",
+    "        segment_ids=segment_ids,\n",
+    "        use_one_hot_embeddings=False)\n",
+    "\n",
+    "    tvars = tf.trainable_variables()\n",
+    "\n",
+    "    initialized_variable_names = {}\n",
+    "    (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)\n",
+    "    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
+    "    output_spec = None\n",
+    "    predictions = {\"unique_ids\": unique_ids,\n",
+    "                   \"start_logits\": start_logits,\n",
+    "                   \"end_logits\": end_logits}\n",
+    "    output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
+    "    return output_spec\n",
+    "\n",
+    "config = tf.ConfigProto(log_device_placement=True) \n",
+    "\n",
+    "run_config = tf.estimator.RunConfig(\n",
+    "      model_dir=None,\n",
+    "      session_config=config,\n",
+    "      save_checkpoints_steps=1000,\n",
+    "      keep_checkpoint_max=1)\n",
+    "\n",
+    "estimator = tf.estimator.Estimator(\n",
+    "  model_fn=model_fn,\n",
+    "  config=run_config,\n",
+    "  params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "xSKkf4JLw62E"
+   },
+   "source": [
+    "### 5.a Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "3OKhc349w62F",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "eval_examples = run_squad.read_squad_examples(\n",
+    "        input_file=input_file, is_training=False)\n",
+    "\n",
+    "eval_writer = run_squad.FeatureWriter(\n",
+    "    filename=os.path.join(output_dir, \"eval.tf_record\"),\n",
+    "    is_training=False)\n",
+    "\n",
+    "eval_features = []\n",
+    "def append_feature(feature):\n",
+    "    eval_features.append(feature)\n",
+    "    eval_writer.process_feature(feature)\n",
+    "\n",
+    "\n",
+    "# Loads a data file into a list of InputBatch's\n",
+    "run_squad.convert_examples_to_features(\n",
+    "    examples=eval_examples,\n",
+    "    tokenizer=tokenizer,\n",
+    "    max_seq_length=max_seq_length,\n",
+    "    doc_stride=doc_stride,\n",
+    "    max_query_length=max_query_length,\n",
+    "    is_training=False,\n",
+    "    output_fn=append_feature)\n",
+    "\n",
+    "eval_writer.close()\n",
+    "\n",
+    "tf.logging.info(\"***** Running predictions *****\")\n",
+    "tf.logging.info(\"  Num orig examples = %d\", len(eval_examples))\n",
+    "tf.logging.info(\"  Num split examples = %d\", len(eval_features))\n",
+    "tf.logging.info(\"  Batch size = %d\", predict_batch_size)\n",
+    "\n",
+    "predict_input_fn = run_squad.input_fn_builder(\n",
+    "    input_file=eval_writer.filename,\n",
+    "    batch_size=predict_batch_size,\n",
+    "    seq_length=max_seq_length,\n",
+    "    is_training=False,\n",
+    "    drop_remainder=False)\n",
+    "\n",
+    "all_results = []\n",
+    "eval_hooks = [run_squad.LogEvalRunHook(predict_batch_size)]\n",
+    "eval_start_time = time.time()\n",
+    "for result in estimator.predict(\n",
+    "        predict_input_fn, yield_single_examples=True, hooks=eval_hooks, checkpoint_path=init_checkpoint):\n",
+    "    unique_id = int(result[\"unique_ids\"])\n",
+    "    start_logits = [float(x) for x in result[\"start_logits\"].flat]\n",
+    "    end_logits = [float(x) for x in result[\"end_logits\"].flat]\n",
+    "    all_results.append(\n",
+    "      run_squad.RawResult(\n",
+    "          unique_id=unique_id,\n",
+    "          start_logits=start_logits,\n",
+    "          end_logits=end_logits))\n",
+    "\n",
+    "eval_time_elapsed = time.time() - eval_start_time\n",
+    "\n",
+    "eval_time_wo_startup = eval_hooks[-1].total_time\n",
+    "num_sentences = eval_hooks[-1].count * predict_batch_size\n",
+    "avg_sentences_per_second = num_sentences * 1.0 / eval_time_wo_startup\n",
+    "\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "tf.logging.info(\"Total Inference Time = %0.2f Inference Time W/O start up overhead = %0.2f \"\n",
+    "                \"Sentences processed = %d\", eval_time_elapsed, eval_time_wo_startup,\n",
+    "                num_sentences)\n",
+    "tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "\n",
+    "output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
+    "output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
+    "output_null_log_odds_file = os.path.join(output_dir, \"null_odds.json\")\n",
+    "\n",
+    "run_squad.write_predictions(eval_examples, eval_features, all_results,\n",
+    "                  n_best_size, max_answer_length,\n",
+    "                  do_lower_case, output_prediction_file,\n",
+    "                  output_nbest_file, output_null_log_odds_file)\n",
+    "\n",
+    "tf.logging.info(\"Inference Results:\")\n",
+    "\n",
+    "# Here we show only the prediction results, nbest prediction is also available in the output directory\n",
+    "results = \"\"\n",
+    "with open(output_prediction_file, 'r') as json_file:\n",
+    "    data = json.load(json_file)\n",
+    "    for question in eval_examples:\n",
+    "        results += \"<tr><td>{}</td><td>{}</td><td>{}</td></tr>\".format(question.qas_id, question.question_text, data[question.qas_id])\n",
+    "\n",
+    "\n",
+    "from IPython.display import display, HTML\n",
+    "display(HTML(\"<table><tr><th>Id</th><th>Question</th><th>Answer</th></tr>{}</table>\".format(results)))        "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "EMT0sKxHw62L"
+   },
+   "source": [
+    "## 6. What's next"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "mKBM_UD6w62N"
+   },
+   "source": [
+    "Now that you are familiar with running QA Inference on BERT, using mixed precision, you may want to try\n",
+    "your own paragraphs and queries. \n",
+    "\n",
+    "You may also want to take a look at the notebook __bert_squad_tf_finetuning.ipynb__, available in the same directory, to learn how to run fine-tuning on BERT."
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "name": "bert_squad_tf_inference.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
@@ -0,0 +1,610 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2019 NVIDIA Corporation. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
+    "\n",
+    "# BioBERT Named-Entity Recognition Inference with Mixed Precision\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Overview\n",
+    "\n",
+    "Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
+    "\n",
+    "BioBERT is a domain specific version of BERT that has been trained on PubMed abstracts.\n",
+    "\n",
+    "The original BioBERT paper can be found here: https://arxiv.org/abs/1901.08746\n",
+    "\n",
+    "NVIDIA's BioBERT is an optimized version of the implementation presented in the paper, leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.a Learning objectives\n",
+    "\n",
+    "This notebook demonstrates:\n",
+    "- Inference on NER task with BioBERT model\n",
+    "- The use/download of fine-tuned NVIDIA BioBERT models\n",
+    "- Use of Mixed Precision for Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Requirements\n",
+    "\n",
+    "Please refer to the ReadMe file"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. BioBERT Inference: Named-Entity Recognition\n",
+    "\n",
+    "We can run inference on a fine-tuned BioBERT model for tasks like Named-Entity Recognition.\n",
+    "\n",
+    "Here we use a BioBERT model fine-tuned on a [BC5CDR-disease Dataset](https://www.ncbi.nlm.nih.gov/research/bionlp/Data/) which consists of 1500 PubMed articles with 5818 annotated diseases."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.a Extract Disease Information from Text\n",
+    "\n",
+    "In this example we will use a Named-Entity Recognition model created using BioBERT to extract disease information from the following paragraph:\n",
+    "\n",
+    "**Input Text**\n",
+    "\n",
+    "_\"The authors describe the case of a 56 - year - old woman with chronic, severe heart failure \n",
+    "secondary to dilated cardiomyopathy and absence of significant ventricular arrhythmias \n",
+    "who developed QT prolongation and torsade de pointes ventricular tachycardia during one cycle \n",
+    "of intermittent low dose (2.5 mcg/kg per min) dobutamine. \n",
+    "This report of torsade de pointes ventricular tachycardia during intermittent dobutamine \n",
+    "supports the hypothesis that unpredictable fatal arrhythmias may occur even with low doses \n",
+    "and in patients with no history of significant rhythm disturbances.\n",
+    "The mechanisms of proarrhythmic effects of Dubutamine are discussed.\"_\n",
+    "\n",
+    "**Output visualized using displaCy**\n",
+    "\n",
+    "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">The authors describe the case of a 56 year old woman with chronic , severe \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    heart failure \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "secondary to \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    dilated cardiomyopathy \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "and absence of significant \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    ventricular arrhythmias \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "who developed QT \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    prolongation \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "and torsade de pointes \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    ventricular tachycardia \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "during one cycle of intermittent low dose ( 2.5 mcg / kg per min ) dobutamine . This report of torsade de pointes \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    ventricular tachycardia \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "during intermittent dobutamine supports the hypothesis that unpredictable fatal \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    arrhythmias \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    "may occur even with low doses and in patients with no history of significant \n",
+    "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
+    "    rhythm disturbances \n",
+    "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DISEASE</span>\n",
+    "</mark>\n",
+    ". The mechanisms of proarrhythmic effects of Dubutamine are discussed . </div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text= \"\"\"\n",
+    "The authors describe the case of a 56 year old woman with chronic, severe heart failure\n",
+    "secondary to dilated cardiomyopathy and absence of significant ventricular arrhythmias\n",
+    "who developed QT prolongation and torsade de pointes ventricular tachycardia during one cycle\n",
+    "of intermittent low dose (2.5 mcg/kg per min) dobutamine.\n",
+    "This report of torsade de pointes ventricular tachycardia during intermittent dobutamine\n",
+    "supports the hypothesis that unpredictable fatal arrhythmias may occur even with low doses\n",
+    "and in patients with no history of significant rhythm disturbances.\n",
+    "The mechanisms of proarrhythmic effects of Dubutamine are discussed.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "notebooks_dir = '/workspace/bert/notebooks'\n",
+    "working_dir = '/workspace/bert'\n",
+    "if working_dir not in sys.path:\n",
+    "    sys.path.append(working_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the text into the IOB tags format seen during training, using dummy placeholder labels\n",
+    "import spacy\n",
+    "nlp = spacy.load(\"en_core_web_sm\")\n",
+    "\n",
+    "text = text.strip()\n",
+    "doc = nlp(text)\n",
+    "input_file = os.path.join(notebooks_dir, 'input.tsv')\n",
+    "with open(os.path.join(input_file), 'w') as wf: \n",
+    "    for word in doc:\n",
+    "        if word.text is '\\n':\n",
+    "            continue\n",
+    "        wf.write(word.text + '\\tO\\n')\n",
+    "    wf.write('\\n') # Indicate end of text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.b Mixed Precision\n",
+    "\n",
+    "Mixed precision training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of tensor cores in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures.\n",
+    "\n",
+    "For information about:\n",
+    "- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.\n",
+    "- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.\n",
+    "- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook we control mixed precision execution with the environment variable:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"TF_ENABLE_AUTO_MIXED_PRECISION\"] = \"1\" "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The model we'll use was trained with mixed precision, which takes much less time to train than the fp32 version without losing accuracy, so we need to set the following flag: "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "use_mixed_precision_model = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Fine-Tuned NVIDIA BioBERT TF Models\n",
+    "\n",
+    "We have the following Named Entity Recognition models fine-tuned from BioBERT available on NGC (NVIDIA GPU Cloud, https://ngc.nvidia.com).\n",
+    "\n",
+    "| **Model** | **Description** |\n",
+    "|:---------:|:----------:|\n",
+    "|BioBERT NER BC5CDR Disease  | NER model to extract disease information from text, trained on the BC5CDR-Disease dataset |\n",
+    "|BioBERT NER BC5CDR Chemical | NER model to extract chemical information from text, trained on the BC5CDR-Chemical dataset. |\n",
+    "\n",
+    "\n",
+    "For this example, we will download the Disease NER model trained on the BC5CDR-Disease dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# biobert_uncased_base_ner_disease\n",
+    "DATA_DIR_FP16='/workspace/bert/data/download/finetuned_model_fp16'\n",
+    "!mkdir -p $DATA_DIR_FP16\n",
+    "!wget -nc -q --show-progress -O $DATA_DIR_FP16/biobert_uncased_base_ner_disease.zip \\\n",
+    "https://api.ngc.nvidia.com/v2/models/nvidia/biobert_uncased_base_ner_disease/versions/1/zip\n",
+    "!unzip -n -d $DATA_DIR_FP16/ $DATA_DIR_FP16/biobert_uncased_base_ner_disease.zip "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the code that follows we will refer to these models."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Running NER task inference\n",
+    "\n",
+    "In order to run NER inference we will follow step-by-step the flow implemented in run_ner.py."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.a Configure Things"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import run_ner\n",
+    "from run_ner import BC5CDRProcessor, model_fn_builder, file_based_input_fn_builder, filed_based_convert_examples_to_features, result_to_pair\n",
+    "\n",
+    "import os, sys\n",
+    "import time\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import modeling\n",
+    "import tokenization\n",
+    "\n",
+    "tf.logging.set_verbosity(tf.logging.ERROR)\n",
+    "\n",
+    "# Create the output directory where all the results are saved.\n",
+    "output_dir = os.path.join(working_dir, 'output')\n",
+    "tf.gfile.MakeDirs(output_dir)\n",
+    "\n",
+    "# The config json file corresponding to the pre-trained BERT model.\n",
+    "# This specifies the model architecture.\n",
+    "bert_config_file = os.path.join(DATA_DIR_FP16, 'bert_config.json')\n",
+    "\n",
+    "# The vocabulary file that the BERT model was trained on.\n",
+    "vocab_file = os.path.join(DATA_DIR_FP16, 'vocab.txt')\n",
+    "\n",
+    "init_checkpoint = os.path.join(DATA_DIR_FP16, 'model.ckpt-10251')\n",
+    "\n",
+    "# Whether to lower case the input text. \n",
+    "# Should be True for uncased models and False for cased models.\n",
+    "# The BioBERT available in NGC is uncased\n",
+    "do_lower_case = True\n",
+    "  \n",
+    "# Total batch size for predictions\n",
+    "predict_batch_size = 1\n",
+    "params = dict([('batch_size', predict_batch_size)])\n",
+    "\n",
+    "# The maximum total input sequence length after WordPiece tokenization. \n",
+    "# Sequences longer than this will be truncated, and sequences shorter than this will be padded.\n",
+    "max_seq_length = 128\n",
+    "\n",
+    "# This is a WA to use flags from here:\n",
+    "flags = tf.flags\n",
+    "\n",
+    "if 'f' not in tf.flags.FLAGS: \n",
+    "    tf.app.flags.DEFINE_string('f', '', 'kernel')\n",
+    "FLAGS = flags.FLAGS\n",
+    "\n",
+    "FLAGS.output_dir = output_dir"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.b Define Tokenizer & Create Estimator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Validate the casing config consistency with the checkpoint name.\n",
+    "tokenization.validate_case_matches_checkpoint(do_lower_case, init_checkpoint)\n",
+    "\n",
+    "# Create the tokenizer.\n",
+    "tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+    "\n",
+    "# Load the configuration from file\n",
+    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
+    "\n",
+    "\n",
+    "# Use the data processor for BC5CDR\n",
+    "processor = BC5CDRProcessor()\n",
+    "# Get labels in the index order that was used during training\n",
+    "label_list = processor.get_labels()\n",
+    "\n",
+    "# Reverse index the labels. This will be used later when evaluating predictions.\n",
+    "id2label = {}\n",
+    "for (i, label) in enumerate(label_list, 1):\n",
+    "    id2label[i] = label\n",
+    "\n",
+    "\n",
+    "config = tf.ConfigProto(log_device_placement=True) \n",
+    "run_config = tf.estimator.RunConfig(\n",
+    "      model_dir=None,\n",
+    "      session_config=config,\n",
+    "      save_checkpoints_steps=1000,\n",
+    "      keep_checkpoint_max=1)\n",
+    "\n",
+    "\n",
+    "# Use model function builder to create the model function\n",
+    "model_fn = model_fn_builder(\n",
+    "    bert_config=bert_config,\n",
+    "    num_labels=len(label_list) + 1,\n",
+    "    init_checkpoint=init_checkpoint,\n",
+    "    use_fp16=use_mixed_precision_model)\n",
+    "\n",
+    "estimator = tf.estimator.Estimator(\n",
+    "  model_fn=model_fn,\n",
+    "  config=run_config,\n",
+    "  params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.c Run Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the input data using the BC5CDR processor\n",
+    "predict_examples = processor.get_test_examples(notebooks_dir, file_name='input.tsv')\n",
+    "\n",
+    "\n",
+    "# Convert to tf_records and save it\n",
+    "predict_file = os.path.join(output_dir, \"predict.tf_record\")\n",
+    "filed_based_convert_examples_to_features(predict_examples, label_list,\n",
+    "                                         max_seq_length, tokenizer,\n",
+    "                                         predict_file)\n",
+    "\n",
+    "\n",
+    "tf.logging.info(\"***** Running predictions *****\")\n",
+    "tf.logging.info(\"  Num orig examples = %d\", len(predict_examples))\n",
+    "tf.logging.info(\"  Batch size = %d\", predict_batch_size)\n",
+    "\n",
+    "# Run prediction on this tf_record file\n",
+    "predict_input_fn = file_based_input_fn_builder(\n",
+    "    input_file=predict_file,\n",
+    "    batch_size=predict_batch_size,\n",
+    "    seq_length=max_seq_length,\n",
+    "    is_training=False,\n",
+    "    drop_remainder=False)\n",
+    "\n",
+    "\n",
+    "pred_start_time = time.time()\n",
+    "\n",
+    "predictions = estimator.predict(input_fn=predict_input_fn)\n",
+    "predictions = list(predictions)\n",
+    "\n",
+    "pred_time_elapsed = time.time() - pred_start_time\n",
+    "\n",
+    "tf.logging.info(\"-----------------------------\")\n",
+    "tf.logging.info(\"Total Inference Time = %0.2f\", pred_time_elapsed)\n",
+    "# tf.logging.info(\"Inference Performance = %0.4f sentences/sec\", avg_sentences_per_second)\n",
+    "tf.logging.info(\"-----------------------------\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.d Save Predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's now process the predictions and save them to file(s)\n",
+    "tf.logging.info(\"Save Predictions:\")\n",
+    "\n",
+    "# File containing the list of predictions as IOB tags\n",
+    "output_predict_file = os.path.join(FLAGS.output_dir, \"label_test.txt\")\n",
+    "# File containing the list of words, the dummy token and the predicted IOB tag\n",
+    "test_labels_file = os.path.join(FLAGS.output_dir, \"test_labels.txt\")\n",
+    "test_labels_err_file = os.path.join(FLAGS.output_dir, \"test_labels_errs.txt\")\n",
+    "\n",
+    "with tf.gfile.Open(output_predict_file, 'w') as writer, \\\n",
+    "        tf.gfile.Open(test_labels_file, 'w') as tl, \\\n",
+    "        tf.gfile.Open(test_labels_err_file, 'w') as tle:\n",
+    "    i=0\n",
+    "    for prediction in estimator.predict(input_fn=predict_input_fn, yield_single_examples=True):\n",
+    "        output_line = \"\\n\".join(id2label[id] for id in prediction if id != 0) + \"\\n\"\n",
+    "        writer.write(output_line)\n",
+    "        result_to_pair(predict_examples[i], prediction, id2label, tl, tle)\n",
+    "        i = i + 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.e Visualize Predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's create a function that formats the predictions for display using displaCy\n",
+    "def predictions_for_displacy(predict_examples, predictions, id2label):\n",
+    "    processed_text = ''\n",
+    "    entities = []\n",
+    "    current_pos = 0\n",
+    "    start_pos = 0\n",
+    "    end_pos = 0\n",
+    "    end_detected = False\n",
+    "    prev_label = ''\n",
+    "\n",
+    "    for predict_line, pred_ids in zip(predict_examples, predictions):\n",
+    "        words = str(predict_line.text).split(' ')\n",
+    "        labels = str(predict_line.label).split(' ')\n",
+    "\n",
+    "        # get from CLS to SEP\n",
+    "        pred_labels = []\n",
+    "        for id in pred_ids:\n",
+    "            if id == 0:\n",
+    "                continue\n",
+    "            curr_label = id2label[id]\n",
+    "            if curr_label == '[CLS]':\n",
+    "                continue\n",
+    "            elif curr_label == '[SEP]':\n",
+    "                break\n",
+    "            elif curr_label == 'X':\n",
+    "                continue\n",
+    "            pred_labels.append(curr_label)\n",
+    "\n",
+    "        for tok, label, pred_label in zip(words, labels, pred_labels):\n",
+    "            if pred_label is 'B':\n",
+    "                start_pos = current_pos\n",
+    "            elif pred_label is 'I' and prev_label is not 'B' and prev_label is not 'I':\n",
+    "                start_pos = current_pos\n",
+    "            elif pred_label is 'O' and (prev_label is 'B' or prev_label is 'I'):\n",
+    "                end_pos = current_pos\n",
+    "                end_detected = True\n",
+    "\n",
+    "            if end_detected:\n",
+    "                entities.append({'start':start_pos, 'end': end_pos, 'label': 'DISEASE'})\n",
+    "                start_pos = 0\n",
+    "                end_pos = 0\n",
+    "                end_detected = False\n",
+    "\n",
+    "            processed_text = processed_text + tok + ' '\n",
+    "            current_pos = current_pos + len(tok) + 1\n",
+    "            prev_label = pred_label\n",
+    "\n",
+    "    #Handle entity at the very end\n",
+    "    if start_pos > 0 and end_detected is False:\n",
+    "        entities.append({'start':start_pos, 'end': current_pos, 'label': 'DISEASE'})\n",
+    "    \n",
+    "    displacy_input = [{\"text\": processed_text,\n",
+    "                            \"ents\": entities,\n",
+    "                            \"title\": None}]\n",
+    "    \n",
+    "    return displacy_input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the predictions to the Named Entities format required by displaCy and visualize\n",
+    "displacy_input = predictions_for_displacy(predict_examples, predictions, id2label)\n",
+    "html = spacy.displacy.render(displacy_input, style=\"ent\", manual=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. What's next"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that you are familiar with running NER Inference on BioBERT, using mixed precision, you may want to try extracting disease information from other biomedical text. "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -0,0 +1,31 @@
+{"data": 
+ [
+     {"title": "Project Apollo",
+      "paragraphs": [
+          {"context":"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.", 
+           "qas": [
+               { "question": "What project put the first Americans into space?", 
+                 "id": "Q1"
+               },
+               { "question": "What program was created to carry out these projects and missions?",
+                 "id": "Q2"
+               },
+               { "question": "What year did the first manned Apollo flight occur?",
+                 "id": "Q3"
+               },                
+               { "question": "What President is credited with the original notion of putting Americans in space?",
+                 "id": "Q4"
+               },
+               { "question": "Who did the U.S. collaborate with on an Earth orbit mission in 1975?",
+                 "id": "Q5"
+               },
+               { "question": "How long did Project Apollo run?",
+                 "id": "Q6"
+               },               
+               { "question": "What program helped develop space travel techniques that Project Apollo used?",
+                 "id": "Q7"
+               },                
+               {"question": "What space station supported three manned missions in 1973-1974?",
+                 "id": "Q8"
+               }                
+]}]}]}
@@ -0,0 +1,467 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+
+from npu_bridge.estimator.npu.npu_optimizer import NPUOptimizer
+from npu_bridge.estimator.npu import npu_loss_scale_manager as lsm_lib
+from npu_bridge.estimator import npu_ops
+
+
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, hvd=None, manual_fp16=False, use_fp16=False,
+                     num_accumulation_steps=1,
+                     optimizer_type="adam", allreduce_post_accumulation=False):
+    """Creates an optimizer training op."""
+    global_step = tf.train.get_or_create_global_step()
+
+    # avoid step change in learning rate at end of warmup phase
+    if optimizer_type == "adam":
+        power = 1.0
+        decayed_learning_rate_at_crossover_point = init_lr * (
+                (1.0 - float(num_warmup_steps) / float(num_train_steps)) ** power)
+    else:
+        power = 0.5
+        decayed_learning_rate_at_crossover_point = init_lr
+
+    adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
+    print('decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e' % (
+    decayed_learning_rate_at_crossover_point, adjusted_init_lr))
+
+    learning_rate = tf.constant(value=adjusted_init_lr, shape=[], dtype=tf.float32)
+
+    # Implements linear decay of the learning rate.
+    learning_rate = tf.train.polynomial_decay(
+        learning_rate,
+        global_step,
+        num_train_steps,
+        end_learning_rate=0.0,
+        power=power,
+        cycle=False)
+
+    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
+    # learning rate will be `global_step/num_warmup_steps * init_lr`.
+    if num_warmup_steps:
+        global_steps_int = tf.cast(global_step, tf.int32)
+        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+        global_steps_float = tf.cast(global_steps_int, tf.float32)
+        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+        warmup_percent_done = global_steps_float / warmup_steps_float
+        warmup_learning_rate = init_lr * warmup_percent_done
+
+        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+        learning_rate = (
+                (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+    if optimizer_type == "lamb":
+        print("Initializing LAMB Optimizer")
+        optimizer = LAMBOptimizer(
+            learning_rate=learning_rate,
+            weight_decay_rate=0.01,
+            beta_1=0.9,
+            beta_2=0.999,
+            epsilon=1e-6,
+            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+    else:
+        print("Initializing ADAM Weight Decay Optimizer")
+        # It is recommended that you use this optimizer for fine tuning, since this
+        # is how the model was trained (note that the Adam m/v variables are NOT
+        # loaded from init_checkpoint.)
+        optimizer = AdamWeightDecayOptimizer(
+            learning_rate=learning_rate,
+            weight_decay_rate=0.01,
+            beta_1=0.9,
+            beta_2=0.999,
+            epsilon=1e-6,
+            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+    if hvd is not None and (num_accumulation_steps == 1 or (not allreduce_post_accumulation)):
+        optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True,
+                                             compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
+
+    if tf.flags.FLAGS.npu_bert_loss_scale not in [None, -1]:
+        if tf.flags.FLAGS.npu_bert_loss_scale == 0:
+            loss_scale_manager = lsm_lib.ExponentialUpdateLossScaleManager(
+                init_loss_scale=tf.flags.FLAGS.init_loss_scale_value, incr_every_n_steps=1000,
+                decr_every_n_nan_or_inf=2, decr_ratio=0.5)
+        elif tf.flags.FLAGS.npu_bert_loss_scale >= 1:
+            loss_scale_manager = lsm_lib.FixedLossScaleManager(loss_scale=tf.flags.FLAGS.npu_bert_loss_scale)
+        else:
+            raise ValueError("Invalid loss scale: %d" % tf.flags.FLAGS.npu_bert_loss_scale)
+        optimizer = NPUOptimizer(optimizer, loss_scale_manager, is_distributed=tf.flags.FLAGS.distributed,
+                                 is_loss_scale=True, is_tailing_optimization=tf.flags.FLAGS.npu_bert_tail_optimize)
+    else:
+        optimizer = NPUOptimizer(optimizer, is_distributed=tf.flags.FLAGS.distributed)
+
+    tvars = tf.trainable_variables()
+    grads_and_vars = optimizer.compute_gradients(loss * 1.0 / num_accumulation_steps, tvars)
+
+    if num_accumulation_steps > 1:
+        local_step = tf.get_variable(name="local_step", shape=[], dtype=tf.int32, trainable=False,
+                                     initializer=tf.zeros_initializer)
+        batch_finite = tf.get_variable(name="batch_finite", shape=[], dtype=tf.bool, trainable=False,
+                                       initializer=tf.ones_initializer)
+        accum_vars = [tf.get_variable(
+            name=tvar.name.split(":")[0] + "/accum",
+            shape=tvar.shape.as_list(),
+            dtype=tf.float32,
+            trainable=False,
+            initializer=tf.zeros_initializer()) for tvar in tf.trainable_variables()]
+
+        reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool)
+        local_step = tf.cond(reset_step, lambda: local_step.assign(tf.ones_like(local_step)),
+                             lambda: local_step.assign_add(1))
+
+        with tf.name_scope(accumulate_step):
+            grads_and_vars_and_accums = [(gv[0], gv[1], accum_vars[i]) for i, gv in enumerate(grads_and_vars) if
+                                         gv[0] is not None]
+            grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))
+
+            all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads]) if (
+                                                                                                           tf.flags.FLAGS.npu_bert_loss_scale not in [
+                                                                                                       None, -1]) and (
+                                                                                                           manual_fp16 or use_fp16) else tf.constant(
+                True, dtype=tf.bool)
+            batch_finite = tf.cond(reset_step,
+                                   lambda: batch_finite.assign(
+                                       tf.math.logical_and(tf.constant(True, dtype=tf.bool), all_are_finite)),
+                                   lambda: batch_finite.assign(tf.math.logical_and(batch_finite, all_are_finite)))
+
+        # This is how the model was pre-trained.
+        # ensure global norm is a finite number
+        # to prevent clip_by_global_norm from having a hizzy fit.
+        if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+            (clipped_grads, _) = tf.clip_by_global_norm(
+                grads, clip_norm=1.0,
+                use_norm=tf.cond(
+                    all_are_finite,
+                    lambda: tf.global_norm(grads),
+                    lambda: tf.constant(1.0)))
+        else:
+            with tf.name_scope("clip_grads"):
+                clipped_grads = [
+                    (tf.clip_by_norm(grad, clip_norm=1.0))
+                    if grad is not None else (grad, var) for grad in grads
+                ]
+
+        accum_vars = tf.cond(reset_step,
+                             lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(clipped_grads)],
+                             lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(clipped_grads)])
+
+        def update(accum_vars):
+            with tf.name_scope("opt_update"):
+                if allreduce_post_accumulation and hvd is not None:
+                    accum_vars = [hvd.allreduce(tf.convert_to_tensor(accum_var),
+                                                compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none) if isinstance(
+                        accum_var, tf.IndexedSlices)
+                                  else hvd.allreduce(accum_var,
+                                                     compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)
+                                  for accum_var in accum_vars]
+                return optimizer.apply_gradients(list(zip(accum_vars, tvars)), global_step=global_step)
+
+        update_step = tf.identity(tf.cast(tf.math.equal(local_step % num_accumulation_steps, 0), dtype=tf.bool),
+                                  name="update_step")
+        update_op = tf.cond(update_step,
+                            lambda: update(accum_vars), lambda: tf.no_op())
+
+        new_global_step = tf.cond(
+            tf.math.logical_and(update_step, tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)), tf.bool)),
+            lambda: global_step + 1, lambda: global_step)
+        new_global_step = tf.identity(new_global_step, name='step_update')
+        train_op = tf.group(update_op, [global_step.assign(new_global_step)])
+    else:
+        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
+        grads, tvars = list(zip(*grads_and_vars))
+
+        if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+            all_are_finite = tf.constant(True, dtype=tf.bool)
+
+        # This is how the model was pre-trained.
+        # ensure global norm is a finite number
+        # to prevent clip_by_global_norm from having a hizzy fit.
+        if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+            (clipped_grads, _) = tf.clip_by_global_norm(
+                grads, clip_norm=1.0,
+                use_norm=tf.cond(
+                    all_are_finite,
+                    lambda: tf.global_norm(grads),
+                    lambda: tf.constant(1.0)))
+        else:
+            with tf.name_scope("clip_grads"):
+                clipped_grads = [
+                    (tf.clip_by_norm(grad, clip_norm=1.0))
+                    if grad is not None else (grad, var) for grad in grads
+                ]
+
+        with tf.name_scope("apply_grads"):
+            train_op = optimizer.apply_gradients(
+                list(zip(clipped_grads, tvars)), global_step=global_step)
+
+        # if tf.flags.FLAGS.npu_bert_clip_by_global_norm:
+        #  new_global_step = tf.cond(all_are_finite, lambda: global_step + 1, lambda: global_step)
+        # else:
+        #  new_global_step = global_step + 1
+        # new_global_step = tf.identity(new_global_step, name='step_update')
+        # train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+    return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+    """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+    def __init__(self,
+                 learning_rate,
+                 weight_decay_rate=0.0,
+                 beta_1=0.9,
+                 beta_2=0.999,
+                 epsilon=1e-6,
+                 exclude_from_weight_decay=None,
+                 name="AdamWeightDecayOptimizer"):
+        """Constructs a AdamWeightDecayOptimizer."""
+        super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+        self.weight_decay_rate = weight_decay_rate
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+        self.exclude_from_weight_decay = exclude_from_weight_decay
+
+    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+                        manual_fp16=False):
+        """See base class."""
+        assignments = []
+        for (grad, param) in grads_and_vars:
+            with tf.name_scope("apply_one_adam"):
+                if grad is None or param is None:
+                    continue
+
+                param_name = self._get_variable_name(param.name)
+                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+                if has_shadow:
+                    # create shadow fp32 weights for fp16 variable
+                    param_fp32 = tf.get_variable(
+                        name=param_name + "/shadow",
+                        dtype=tf.float32,
+                        trainable=False,
+                        initializer=tf.cast(param.initialized_value(), tf.float32))
+                else:
+                    param_fp32 = param
+
+                m = tf.get_variable(
+                    name=param_name + "/adam_m",
+                    shape=param.shape.as_list(),
+                    dtype=tf.float32,
+                    trainable=False,
+                    initializer=tf.zeros_initializer())
+                v = tf.get_variable(
+                    name=param_name + "/adam_v",
+                    shape=param.shape.as_list(),
+                    dtype=tf.float32,
+                    trainable=False,
+                    initializer=tf.zeros_initializer())
+                if tf.flags.FLAGS.npu_bert_use_fused_adam_momentum:
+                    if self._do_use_weight_decay(param_name):
+                        assignments.extend([npu_ops.adam_apply_one_with_decay_assign(grad, v, m, param_fp32, self.learning_rate,
+                                                                                     self.beta_1, 1.0 - self.beta_1, self.beta_2, 1.0 - self.beta_2,
+                                                                                     self.weight_decay_rate, self.epsilon)])
+                    else:
+                        assignments.extend([npu_ops.adam_apply_one_assign(grad, v, m, param_fp32, self.learning_rate, self.beta_1,
+                                                                          1.0 - self.beta_1, self.beta_2, 1.0 - self.beta_2, self.epsilon)])
+                else:
+                    # Standard Adam update.
+                    next_m = (
+                            tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+                    next_v = (
+                            tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                                      tf.square(grad)))
+
+                    update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+                    # Just adding the square of the weights to the loss function is *not*
+                    # the correct way of using L2 regularization/weight decay with Adam,
+                    # since that will interact with the m and v parameters in strange ways.
+                    #
+                    # Instead we want to decay the weights in a manner that doesn't interact
+                    # with the m/v parameters. This is equivalent to adding the square
+                    # of the weights to the loss with plain (non-momentum) SGD.
+                    if self._do_use_weight_decay(param_name):
+                        update += self.weight_decay_rate * param_fp32
+
+                    update_with_lr = self.learning_rate * update
+
+                    next_param = param_fp32 - update_with_lr
+
+                    if has_shadow:
+                        # cast shadow fp32 weights to fp16 and assign to trainable variable
+                        param.assign(tf.cast(next_param, param.dtype.base_dtype))
+                    assignments.extend(
+                        [param_fp32.assign(next_param),
+                         m.assign(next_m),
+                         v.assign(next_v)])
+        new_global_step = global_step + 1
+        new_global_step = tf.identity(new_global_step, name='step_update')
+        assignments.extend([global_step.assign(new_global_step)])
+        return tf.group(*assignments, name=name)
+
+    def _do_use_weight_decay(self, param_name):
+        """Whether to use L2 weight decay for `param_name`."""
+        if not self.weight_decay_rate:
+            return False
+        if self.exclude_from_weight_decay:
+            for r in self.exclude_from_weight_decay:
+                if re.search(r, param_name) is not None:
+                    return False
+        return True
+
+    def _get_variable_name(self, param_name):
+        """Get the variable name from the tensor name."""
+        m = re.match("^(.*):\\d+$", param_name)
+        if m is not None:
+            param_name = m.group(1)
+        return param_name
+
+
+class LAMBOptimizer(tf.train.Optimizer):
+    """A LAMB optimizer that includes "correct" L2 weight decay."""
+
+    def __init__(self,
+                 learning_rate,
+                 weight_decay_rate=0.0,
+                 beta_1=0.9,
+                 beta_2=0.999,
+                 epsilon=1e-6,
+                 exclude_from_weight_decay=None,
+                 name="LAMBOptimizer"):
+        """Constructs a LAMBOptimizer."""
+        super(LAMBOptimizer, self).__init__(False, name)
+
+        self.learning_rate = tf.identity(learning_rate, name='learning_rate')
+        self.weight_decay_rate = weight_decay_rate
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+        self.exclude_from_weight_decay = exclude_from_weight_decay
+        self.steps = 0
+
+    def apply_gradients(self, grads_and_vars, global_step=None, name=None,
+                        manual_fp16=False):
+        """See base class."""
+        assignments = []
+        for (grad, param) in grads_and_vars:
+            with tf.name_scope("apply_one_lamb"):
+                if grad is None or param is None:
+                    continue
+
+                param_name = self._get_variable_name(param.name)
+                has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
+                if has_shadow:
+                    # create shadow fp32 weights for fp16 variable
+                    param_fp32 = tf.get_variable(
+                        name=param_name + "/shadow",
+                        dtype=tf.float32,
+                        trainable=False,
+                        initializer=tf.cast(param.initialized_value(), tf.float32))
+                else:
+                    param_fp32 = param
+
+                m = tf.get_variable(
+                    name=param_name + "/adam_m",
+                    shape=param.shape.as_list(),
+                    dtype=tf.float32,
+                    trainable=False,
+                    initializer=tf.zeros_initializer())
+                v = tf.get_variable(
+                    name=param_name + "/adam_v",
+                    shape=param.shape.as_list(),
+                    dtype=tf.float32,
+                    trainable=False,
+                    initializer=tf.zeros_initializer())
+
+                # LAMB update
+                next_m = (
+                        tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+                next_v = (
+                        tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                                  tf.square(grad)))
+
+                self.steps += 1
+                beta1_correction = (1 - self.beta_1 ** self.steps)
+                beta2_correction = (1 - self.beta_2 ** self.steps)
+
+                next_m_unbiased = next_m / beta1_correction
+                next_v_unbiased = next_v / beta2_correction
+
+                update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)
+
+                # Just adding the square of the weights to the loss function is *not*
+                # the correct way of using L2 regularization/weight decay with Adam,
+                # since that will interact with the m and v parameters in strange ways.
+                #
+                # Instead we want to decay the weights in a manner that doesn't interact
+                # with the m/v parameters. This is equivalent to adding the square
+                # of the weights to the loss with plain (non-momentum) SGD.
+                if self._do_use_weight_decay(param_name):
+                    update += self.weight_decay_rate * param_fp32
+
+                w_norm = linalg_ops.norm(param, ord=2)
+                g_norm = linalg_ops.norm(update, ord=2)
+                ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
+                    math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
+
+                update_with_lr = ratio * self.learning_rate * update
+
+                next_param = param_fp32 - update_with_lr
+
+                if has_shadow:
+                    # cast shadow fp32 weights to fp16 and assign to trainable variable
+                    param.assign(tf.cast(next_param, param.dtype.base_dtype))
+                assignments.extend(
+                    [param_fp32.assign(next_param),
+                     m.assign(next_m),
+                     v.assign(next_v)])
+        new_global_step = global_step + 1
+        new_global_step = tf.identity(new_global_step, name='step_update')
+        assignments.extend([global_step.assign(new_global_step)])
+        return tf.group(*assignments, name=name)
+
+    def _do_use_weight_decay(self, param_name):
+        """Whether to use L2 weight decay for `param_name`."""
+        if not self.weight_decay_rate:
+            return False
+        if self.exclude_from_weight_decay:
+            for r in self.exclude_from_weight_decay:
+                if re.search(r, param_name) is not None:
+                    return False
+        return True
+
+    def _get_variable_name(self, param_name):
+        """Get the variable name from the tensor name."""
+        m = re.match("^(.*):\\d+$", param_name)
+        if m is not None:
+            param_name = m.group(1)
+        return param_name
@@ -0,0 +1,48 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import optimization
+import tensorflow as tf
+
+
+class OptimizationTest(tf.test.TestCase):
+
+  def test_adam(self):
+    with self.test_session() as sess:
+      w = tf.get_variable(
+          "w",
+          shape=[3],
+          initializer=tf.constant_initializer([0.1, -0.2, -0.1]))
+      x = tf.constant([0.4, 0.2, -0.5])
+      loss = tf.reduce_mean(tf.square(x - w))
+      tvars = tf.trainable_variables()
+      grads = tf.gradients(loss, tvars)
+      global_step = tf.train.get_or_create_global_step()
+      optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2)
+      train_op = optimizer.apply_gradients(zip(grads, tvars), global_step)
+      init_op = tf.group(tf.global_variables_initializer(),
+                         tf.local_variables_initializer())
+      sess.run(init_op)
+      for _ in range(100):
+        sess.run(train_op)
+      w_np = sess.run(w)
+      self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2)
+
+
+# Hand control to the TensorFlow test runner when executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,73 @@
+#!/bin/bash
+#SBATCH --exclusive
+#SBATCH --mem=0
+#SBATCH --overcommit
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Abort on the first failing command (-e), treat unset variables as errors
+# (-u) and echo every command before running it (-x).
+set -eux
+
+# Container image and host directories used by the job.
+readonly docker_image="nvcr.io/nvidia/tensorflow:19.08-py3"
+readonly datadir="/raid/data/bert"
+readonly checkpointdir="$PWD/checkpoints"
+
+# host:container mount pairs - current directory, dataset and checkpoint dir.
+readonly mounts=".:/workspace/bert,${datadir}:/workspace/bert/data,${checkpointdir}:/results"
+
+
+# Create the per-phase checkpoint directories once on every allocated node.
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/phase_1"
+srun --ntasks="${SLURM_JOB_NUM_NODES}" --ntasks-per-node=1 mkdir -p "${checkpointdir}/phase_2"
+
+# Phase 1 arguments: sequence length 128, 20 predictions per sequence.
+# BATCHSIZE, LEARNING_RATE and NUM_ACCUMULATION_STEPS may be overridden from
+# the environment; the values after ':-' are the defaults.
+PHASE1="\
+     --train_batch_size=${BATCHSIZE:-16} \
+     --learning_rate=${LEARNING_RATE:-1.875e-4} \
+     --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-128} \
+     --input_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training \
+     --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test \
+     --max_seq_length=128 \
+     --max_predictions_per_seq=20 \
+     --num_train_steps=7038 \
+     --num_warmup_steps=2000 \
+     --output_dir=/results/phase_1 \
+     "
+
+# Phase 2 arguments: sequence length 512, 80 predictions per sequence. It
+# starts from the phase 1 checkpoint written at step 7038.
+PHASE2="\
+     --train_batch_size=${BATCHSIZE:-2} \
+     --learning_rate=${LEARNING_RATE:-1.25e-4} \
+     --num_accumulation_steps=${NUM_ACCUMULATION_STEPS:-512} \
+     --input_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/training \
+     --eval_files_dir=/workspace/bert/data/tfrecord/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/books_wiki_en_corpus/test \
+     --max_seq_length=512 \
+     --max_predictions_per_seq=80 \
+     --num_train_steps=1564 \
+     --num_warmup_steps=200 \
+     --output_dir=/results/phase_2 \
+     --init_checkpoint=/results/phase_1/model.ckpt-7038 \
+    "
+
+PHASES=( "$PHASE1" "$PHASE2" )
+
+# PHASE picks the argument set by 1-based index (1 or 2); defaults to phase 1.
+PHASE=${PHASE:-1}
+
+# Full command line executed inside the container: phase-specific arguments
+# followed by the options shared by both phases.
+BERT_CMD="\
+    python /workspace/bert/run_pretraining.py \
+     ${PHASES[$((PHASE-1))]} \
+     --bert_config_file=/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/bert_config.json \
+     --do_train=True \
+     --do_eval=True \
+     --save_checkpoints_steps=100 \
+     --horovod --use_fp16 --use_xla \
+     --allreduce_post_accumulation=True \
+     --eval_batch_size=8"
+
+# Launch the command under srun in the container with the mounts above.
+srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${BERT_CMD}"
@@ -0,0 +1,706 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import csv
+import os
+import modeling
+import optimization
+import tokenization
+import tensorflow as tf
+import horovod.tensorflow as hvd
+import time
+from utils.utils import LogEvalRunHook, LogTrainRunHook
+from utils.create_glue_data import *
+import numpy as np
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+## Required parameters
+flags.DEFINE_string(
+    "data_dir", None,
+    "The input data dir. Should contain the .tsv files (or other data files) "
+    "for the task.")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string("task_name", None, "The name of the task to train.")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written.")
+
+## Other parameters
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_bool("do_train", False, "Whether to run training.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_bool(
+    "do_predict", False,
+    "Whether to run the model in inference mode on the test set.")
+
+flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_bool("use_trt", False, "Whether to use TF-TRT")
+
+flags.DEFINE_float("num_train_epochs", 3.0,
+                   "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+    "warmup_proportion", 0.1,
+    "Proportion of training to perform linear learning rate warmup for. "
+    "E.g., 0.1 = 10% of training.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 1000,
+                     "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update" 
+                      "Global batch size = num_accumulation_steps * train_batch_size")
+flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
+
+flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
+
+flags.DEFINE_bool(
+    "verbose_logging", False,
+    "If true, all of the warnings related to data processing will be printed. "
+    "A number of warnings are expected for a normal SQuAD evaluation.")
+
+
+def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training,
+                                drop_remainder, hvd=None):
+  """Creates an `input_fn` closure to be passed to Estimator."""
+
+  name_to_features = {
+      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
+      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "label_ids": tf.FixedLenFeature([], tf.int64),
+  }
+
+  def _decode_record(record, name_to_features):
+    """Decodes a record to a TensorFlow example."""
+    example = tf.parse_single_example(record, name_to_features)
+
+    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+    # So cast all int64 to int32.
+    for name in list(example.keys()):
+      t = example[name]
+      if t.dtype == tf.int64:
+        t = tf.to_int32(t)
+      example[name] = t
+
+    return example
+
+  def input_fn():
+    """The actual input function."""
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    d = tf.data.TFRecordDataset(input_file)
+    if is_training:
+      if hvd is not None: d = d.shard(hvd.size(), hvd.rank())
+      d = d.repeat()
+      d = d.shuffle(buffer_size=100)
+
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder))
+
+    return d
+
+  return input_fn
+
+
+def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
+                 labels, num_labels, use_one_hot_embeddings):
+  """Creates a classification model."""
+  model = modeling.BertModel(
+      config=bert_config,
+      is_training=is_training,
+      input_ids=input_ids,
+      input_mask=input_mask,
+      token_type_ids=segment_ids,
+      use_one_hot_embeddings=use_one_hot_embeddings,
+      compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32)
+
+  # In the demo, we are doing a simple classification task on the entire
+  # segment.
+  #
+  # If you want to use the token-level output, use model.get_sequence_output()
+  # instead.
+  output_layer = model.get_pooled_output()
+
+  hidden_size = output_layer.shape[-1].value
+
+  output_weights = tf.get_variable(
+      "output_weights", [num_labels, hidden_size],
+      initializer=tf.truncated_normal_initializer(stddev=0.02))
+
+  output_bias = tf.get_variable(
+      "output_bias", [num_labels], initializer=tf.zeros_initializer())
+
+  with tf.variable_scope("loss"):
+    if is_training:
+      # I.e., 0.1 dropout
+      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+
+    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias, name='cls_logits')
+    probabilities = tf.nn.softmax(logits, axis=-1, name='cls_probabilities')
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+
+    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1, name='cls_per_example_loss')
+    loss = tf.reduce_mean(per_example_loss, name='cls_loss')
+
+    return (loss, per_example_loss, logits, probabilities)
+
+def get_frozen_tftrt_model(bert_config, shape, num_labels, use_one_hot_embeddings, init_checkpoint):
+  tf_config = tf.ConfigProto()
+  output_node_names = ['loss/cls_loss', 'loss/cls_per_example_loss', 'loss/cls_logits', 'loss/cls_probabilities']
+
+  with tf.Session(config=tf_config) as tf_sess:
+    input_ids = tf.placeholder(tf.int32, shape, 'input_ids')
+    input_mask = tf.placeholder(tf.int32, shape, 'input_mask')
+    segment_ids = tf.placeholder(tf.int32, shape, 'segment_ids')
+    label_ids = tf.placeholder(tf.int32, (None), 'label_ids')
+
+    create_model(bert_config, False, input_ids, input_mask, segment_ids, label_ids,
+            num_labels, use_one_hot_embeddings)
+
+    tvars = tf.trainable_variables()
+    (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+    tf_sess.run(tf.global_variables_initializer())
+    print("LOADED!")
+    tf.logging.info("**** Trainable Variables ****")
+    for var in tvars:
+      init_string = ""
+      if var.name in initialized_variable_names:
+        init_string = ", *INIT_FROM_CKPT*"
+      else:
+        init_string = ", *NOTTTTTTTTTTTTTTTTTTTTT"
+        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)
+
+    frozen_graph = tf.graph_util.convert_variables_to_constants(tf_sess, 
+            tf_sess.graph.as_graph_def(), output_node_names)
+
+    num_nodes = len(frozen_graph.node)
+    print('Converting graph using TensorFlow-TensorRT...')
+    from tensorflow.python.compiler.tensorrt import trt_convert as trt
+    converter = trt.TrtGraphConverter(
+        input_graph_def=frozen_graph,
+        nodes_blacklist=output_node_names,
+        max_workspace_size_bytes=(4096 << 20) - 1000,
+        precision_mode = "FP16" if FLAGS.use_fp16 else "FP32",
+        minimum_segment_size=4,
+        is_dynamic_op=True,
+        maximum_cached_engines=1000
+    )
+    frozen_graph = converter.convert()
+
+    print('Total node count before and after TF-TRT conversion:',
+          num_nodes, '->', len(frozen_graph.node))
+    print('TRT node count:',
+          len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']))
+    
+    with tf.gfile.GFile("frozen_modelTRT.pb", "wb") as f:
+      f.write(frozen_graph.SerializeToString())      
+        
+  return frozen_graph
+
+
+
+def model_fn_builder(task_name, bert_config, num_labels, init_checkpoint, learning_rate,
+                     num_train_steps, num_warmup_steps,
+                     use_one_hot_embeddings, hvd=None):
+  """Returns `model_fn` closure for Estimator.
+
+  The closure builds the classifier for TRAIN, EVAL and PREDICT modes. When
+  `FLAGS.use_trt` is set, the non-training modes use a frozen TF-TRT graph
+  instead of the regular model. `hvd`, when given, must expose `rank()`.
+  """
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """The `model_fn` for Estimator."""
+
+    def metric_fn(per_example_loss, label_ids, logits):
+        """Returns the eval metric ops: MCC for "cola", else accuracy and loss."""
+        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+        if task_name == "cola":
+            FN, FN_op = tf.metrics.false_negatives(labels=label_ids, predictions=predictions)
+            FP, FP_op = tf.metrics.false_positives(labels=label_ids, predictions=predictions)
+            TP, TP_op = tf.metrics.true_positives(labels=label_ids, predictions=predictions)
+            TN, TN_op = tf.metrics.true_negatives(labels=label_ids, predictions=predictions)
+
+            # Matthews correlation coefficient from the streaming confusion
+            # counts. NOTE(review): no guard for a zero denominator.
+            MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5
+            MCC_op = tf.group(FN_op, TN_op, TP_op, FP_op, tf.identity(MCC, name="MCC"))
+            return {"MCC": (MCC, MCC_op)}
+        else:
+            accuracy = tf.metrics.accuracy(
+                labels=label_ids, predictions=predictions)
+            loss = tf.metrics.mean(values=per_example_loss)
+            return {
+                "eval_accuracy": accuracy,
+                "eval_loss": loss,
+            }
+    tf.logging.info("*** Features ***")
+    for name in sorted(features.keys()):
+      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    segment_ids = features["segment_ids"]
+    label_ids = features["label_ids"]
+
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    # TF-TRT path (EVAL / PREDICT only): build the frozen, converted graph and
+    # import it with the feature tensors mapped onto its placeholders.
+    if not is_training and FLAGS.use_trt:
+        trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape, num_labels, use_one_hot_embeddings, init_checkpoint)
+        (total_loss, per_example_loss, logits, probabilities)  = tf.import_graph_def(trt_graph,
+                input_map={'input_ids':input_ids, 'input_mask':input_mask, 'segment_ids':segment_ids, 'label_ids':label_ids},
+                return_elements=['loss/cls_loss:0', 'loss/cls_per_example_loss:0', 'loss/cls_logits:0', 'loss/cls_probabilities:0'],
+                name='')
+        if mode == tf.estimator.ModeKeys.PREDICT:
+            # NOTE(review): predictions are a dict here but a bare tensor in
+            # the non-TRT PREDICT branch below - confirm callers handle both.
+            predictions = {"probabilities": probabilities}
+            output_spec = tf.estimator.EstimatorSpec(
+                mode=mode, predictions=predictions)
+        elif mode == tf.estimator.ModeKeys.EVAL:
+            eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
+            output_spec = tf.estimator.EstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                eval_metric_ops=eval_metric_ops)
+        return output_spec
+    (total_loss, per_example_loss, logits, probabilities) = create_model(
+        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
+        num_labels, use_one_hot_embeddings)
+
+    tvars = tf.trainable_variables()
+    initialized_variable_names = {}
+    # Only rank 0 (or a run without hvd) restores from the checkpoint;
+    # presumably other ranks receive the weights through a broadcast - verify.
+    if init_checkpoint and (hvd is None or hvd.rank() == 0):
+      (assignment_map, initialized_variable_names
+      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    if FLAGS.verbose_logging:
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+          init_string = ""
+          if var.name in initialized_variable_names:
+            init_string = ", *INIT_FROM_CKPT*"
+          tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                          init_string)
+
+    output_spec = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+
+      # NOTE(review): the meaning of the positional `False` depends on the
+      # signature of optimization.create_optimizer, not visible here.
+      train_op = optimization.create_optimizer(
+          total_loss, learning_rate, num_train_steps, num_warmup_steps,
+          hvd, False, FLAGS.use_fp16, FLAGS.num_accumulation_steps)
+
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          train_op=train_op)
+    elif mode == tf.estimator.ModeKeys.EVAL:
+      eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          eval_metric_ops=eval_metric_ops)
+    else:
+      # Any other mode (PREDICT): return the class probabilities.
+      output_spec = tf.estimator.EstimatorSpec(
+          mode=mode, predictions=probabilities)
+    return output_spec
+
+  return model_fn
+
+
+# This function is not used by this file but is still used by the Colab and
+# people who depend on it.
+def input_fn_builder(features, batch_size, seq_length, is_training, drop_remainder, hvd=None):
+  """Creates an `input_fn` closure to be passed to Estimator."""
+
+  all_input_ids = []
+  all_input_mask = []
+  all_segment_ids = []
+  all_label_ids = []
+
+  for feature in features:
+    all_input_ids.append(feature.input_ids)
+    all_input_mask.append(feature.input_mask)
+    all_segment_ids.append(feature.segment_ids)
+    all_label_ids.append(feature.label_id)
+
+  def input_fn():
+    """The actual input function."""
+
+    num_examples = len(features)
+
+    # This is for demo purposes and does NOT scale to large data sets. We do
+    # not use Dataset.from_generator() because that uses tf.py_func which is
+    # not TPU compatible. The right way to load data is with TFRecordReader.
+    d = tf.data.Dataset.from_tensor_slices({
+        "input_ids":
+            tf.constant(
+                all_input_ids, shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "input_mask":
+            tf.constant(
+                all_input_mask,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "segment_ids":
+            tf.constant(
+                all_segment_ids,
+                shape=[num_examples, seq_length],
+                dtype=tf.int32),
+        "label_ids":
+            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
+    })
+
+    if is_training:
+      if hvd is not None: d = d.shard(hvd.size(), hvd.rank())
+      d = d.repeat()
+      d = d.shuffle(buffer_size=100)
+
+    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
+    return d
+
+  return input_fn
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if FLAGS.horovod:
+    hvd.init()
+  if FLAGS.use_fp16:
+    os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+  processors = {
+      "cola": ColaProcessor,
+      "mnli": MnliProcessor,
+      "mrpc": MrpcProcessor,
+      "xnli": XnliProcessor,
+  }
+
+  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+    raise ValueError(
+        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+    raise ValueError(
+        "Cannot use sequence length %d because the BERT model "
+        "was only trained up to sequence length %d" %
+        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  task_name = FLAGS.task_name.lower()
+
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+
+  processor = processors[task_name]()
+
+  label_list = processor.get_labels()
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  master_process = True
+  training_hooks = []
+  global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
+  hvd_rank = 0
+
+  config = tf.ConfigProto()
+  if FLAGS.horovod:
+
+      tf.logging.info("Multi-GPU training with TF Horovod")
+      tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank())
+      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
+      master_process = (hvd.rank() == 0)
+      hvd_rank = hvd.rank()
+      config.gpu_options.visible_device_list = str(hvd.local_rank())
+      if hvd.size() > 1:
+          training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
+  if FLAGS.use_xla:
+    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+
+  run_config = tf.estimator.RunConfig(
+      model_dir=FLAGS.output_dir if master_process else None,
+      session_config=config,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
+      keep_checkpoint_max=1)
+
+  if master_process:
+      tf.logging.info("***** Configuaration *****")
+      for key in FLAGS.__flags.keys():
+          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+      tf.logging.info("**************************")
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))
+
+  if FLAGS.do_train:
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    num_train_steps = int(
+        len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+    start_index = 0
+    end_index = len(train_examples)
+    tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]
+
+    if FLAGS.horovod:
+      tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
+      num_examples_per_rank = len(train_examples) // hvd.size()
+      remainder = len(train_examples) % hvd.size()
+      if hvd.rank() < remainder:
+        start_index = hvd.rank() * (num_examples_per_rank+1)
+        end_index = start_index + num_examples_per_rank + 1
+      else:
+        start_index = hvd.rank() * num_examples_per_rank + remainder
+        end_index = start_index + (num_examples_per_rank)
+
+  model_fn = model_fn_builder(
+      task_name=task_name,
+      bert_config=bert_config,
+      num_labels=len(label_list),
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_one_hot_embeddings=False,
+      hvd=None if not FLAGS.horovod else hvd)
+
+  estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      config=run_config)
+
+  if FLAGS.do_train:
+
+    file_based_convert_examples_to_features(
+        train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
+
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Num examples = %d", len(train_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info("  Num steps = %d", num_train_steps)
+    train_input_fn = file_based_input_fn_builder(
+        input_file=tmp_filenames,
+        batch_size=FLAGS.train_batch_size,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True,
+        hvd=None if not FLAGS.horovod else hvd)
+
+    train_start_time = time.time()
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
+    train_time_elapsed = time.time() - train_start_time
+    train_time_wo_overhead = training_hooks[-1].total_time
+    avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
+    ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead
+
+    if master_process:
+        tf.logging.info("-----------------------------")
+        tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
+                        num_train_steps * global_batch_size)
+        tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
+                        (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
+        tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
+        tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+        tf.logging.info("-----------------------------")
+
+  if FLAGS.do_eval and master_process:
+    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
+    file_based_convert_examples_to_features(
+        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d", len(eval_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    eval_drop_remainder = False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        batch_size=FLAGS.eval_batch_size,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
+    eval_start_time = time.time()
+    result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)
+
+    eval_time_elapsed = time.time() - eval_start_time
+    eval_time_wo_overhead = eval_hooks[-1].total_time
+
+    time_list = eval_hooks[-1].time_list
+    time_list.sort()
+    num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
+
+    avg = np.mean(time_list)
+    cf_50 = max(time_list[:int(len(time_list) * 0.50)])
+    cf_90 = max(time_list[:int(len(time_list) * 0.90)])
+    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
+    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
+    cf_100 = max(time_list[:int(len(time_list) * 1)])
+    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
+
+    tf.logging.info("-----------------------------")
+    tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
+                    eval_hooks[-1].count * FLAGS.eval_batch_size)
+    tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
+                    (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
+    tf.logging.info("Summary Inference Statistics on EVAL set")
+    tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
+    tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+    tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
+    tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
+    tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
+    tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
+    tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
+    tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
+    tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+    tf.logging.info("-----------------------------")
+
+
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict and master_process:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    file_based_convert_examples_to_features(predict_examples, label_list,
+                                            FLAGS.max_seq_length, tokenizer,
+                                            predict_file)
+
+    tf.logging.info("***** Running prediction*****")
+    tf.logging.info("  Num examples = %d", len(predict_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_drop_remainder = False
+    predict_input_fn = file_based_input_fn_builder(
+        input_file=predict_file,
+        batch_size=FLAGS.predict_batch_size,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=predict_drop_remainder)
+
+    predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
+    predict_start_time = time.time()
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+        tf.logging.info("***** Predict results *****")
+        for prediction in estimator.predict(input_fn=predict_input_fn, hooks=predict_hooks,
+                                            yield_single_examples=False):
+            output_line = "\t".join(
+                str(class_probability) for class_probability in prediction) + "\n"
+            writer.write(output_line)
+
+
+    predict_time_elapsed = time.time() - predict_start_time
+    predict_time_wo_overhead = predict_hooks[-1].total_time
+
+    time_list = predict_hooks[-1].time_list
+    time_list.sort()
+    num_sentences = (predict_hooks[-1].count - predict_hooks[-1].skipped) * FLAGS.predict_batch_size
+
+    avg = np.mean(time_list)
+    cf_50 = max(time_list[:int(len(time_list) * 0.50)])
+    cf_90 = max(time_list[:int(len(time_list) * 0.90)])
+    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
+    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
+    cf_100 = max(time_list[:int(len(time_list) * 1)])
+    ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead
+
+    tf.logging.info("-----------------------------")
+    tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", predict_time_elapsed,
+                    predict_hooks[-1].count * FLAGS.predict_batch_size)
+    tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", predict_time_wo_overhead,
+                    (predict_hooks[-1].count - predict_hooks[-1].skipped) * FLAGS.predict_batch_size)
+
+    tf.logging.info("Summary Inference Statistics on TEST SET")
+    tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
+    tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+    tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+    tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
+    tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
+    tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
+    tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
+    tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
+    tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
+    tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+    tf.logging.info("-----------------------------")
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
@@ -0,0 +1,314 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner with TF-Hub."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import optimization
+import run_classifier
+import tokenization
+import tensorflow as tf
+import tensorflow_hub as hub
+
+# Short aliases for TensorFlow's command-line flag module and its parsed
+# values.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+# The only flag defined in this file. The other FLAGS.* names used below
+# (data_dir, task_name, output_dir, ...) are presumably registered as a side
+# effect of importing run_classifier -- TODO confirm.
+flags.DEFINE_string(
+    "bert_hub_module_handle", None,
+    "Handle for the BERT TF-Hub module.")
+
+
+def create_model(is_training, input_ids, input_mask, segment_ids, labels,
+                 num_labels, bert_hub_module_handle):
+  """Creates a classification model."""
+  tags = set()
+  if is_training:
+    tags.add("train")
+  bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True)
+  bert_inputs = dict(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids)
+  bert_outputs = bert_module(
+      inputs=bert_inputs,
+      signature="tokens",
+      as_dict=True)
+
+  # In the demo, we are doing a simple classification task on the entire
+  # segment.
+  #
+  # If you want to use the token-level output, use
+  # bert_outputs["sequence_output"] instead.
+  output_layer = bert_outputs["pooled_output"]
+
+  hidden_size = output_layer.shape[-1].value
+
+  output_weights = tf.get_variable(
+      "output_weights", [num_labels, hidden_size],
+      initializer=tf.truncated_normal_initializer(stddev=0.02))
+
+  output_bias = tf.get_variable(
+      "output_bias", [num_labels], initializer=tf.zeros_initializer())
+
+  with tf.variable_scope("loss"):
+    if is_training:
+      # I.e., 0.1 dropout
+      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+
+    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    probabilities = tf.nn.softmax(logits, axis=-1)
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+
+    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+    loss = tf.reduce_mean(per_example_loss)
+
+    return (loss, per_example_loss, logits, probabilities)
+
+
+def model_fn_builder(num_labels, learning_rate, num_train_steps,
+                     num_warmup_steps, use_tpu, bert_hub_module_handle):
+  """Returns `model_fn` closure for TPUEstimator."""
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """The `model_fn` for TPUEstimator."""
+
+    tf.logging.info("*** Features ***")
+    for name in sorted(features.keys()):
+      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    segment_ids = features["segment_ids"]
+    label_ids = features["label_ids"]
+
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    (total_loss, per_example_loss, logits, probabilities) = create_model(
+        is_training, input_ids, input_mask, segment_ids, label_ids, num_labels,
+        bert_hub_module_handle)
+
+    output_spec = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      train_op = optimization.create_optimizer(
+          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
+
+      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          train_op=train_op)
+    elif mode == tf.estimator.ModeKeys.EVAL:
+
+      def metric_fn(per_example_loss, label_ids, logits):
+        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+        accuracy = tf.metrics.accuracy(label_ids, predictions)
+        loss = tf.metrics.mean(per_example_loss)
+        return {
+            "eval_accuracy": accuracy,
+            "eval_loss": loss,
+        }
+
+      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
+      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          eval_metrics=eval_metrics)
+    elif mode == tf.estimator.ModeKeys.PREDICT:
+      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode, predictions={"probabilities": probabilities})
+    else:
+      raise ValueError(
+          "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode))
+
+    return output_spec
+
+  return model_fn
+
+
+def create_tokenizer_from_hub_module(bert_hub_module_handle):
+  """Get the vocab file and casing info from the Hub module."""
+  with tf.Graph().as_default():
+    bert_module = hub.Module(bert_hub_module_handle)
+    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
+    with tf.Session() as sess:
+      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
+                                            tokenization_info["do_lower_case"]])
+  return tokenization.FullTokenizer(
+      vocab_file=vocab_file, do_lower_case=do_lower_case)
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  processors = {
+      "cola": run_classifier.ColaProcessor,
+      "mnli": run_classifier.MnliProcessor,
+      "mrpc": run_classifier.MrpcProcessor,
+  }
+
+  if not FLAGS.do_train and not FLAGS.do_eval:
+    raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  task_name = FLAGS.task_name.lower()
+
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+
+  processor = processors[task_name]()
+
+  label_list = processor.get_labels()
+
+  tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  if FLAGS.do_train:
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+  model_fn = model_fn_builder(
+      num_labels=len(label_list),
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      bert_hub_module_handle=FLAGS.bert_hub_module_handle)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      eval_batch_size=FLAGS.eval_batch_size,
+      predict_batch_size=FLAGS.predict_batch_size)
+
+  if FLAGS.do_train:
+    train_features = run_classifier.convert_examples_to_features(
+        train_examples, label_list, FLAGS.max_seq_length, tokenizer)
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Num examples = %d", len(train_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info("  Num steps = %d", num_train_steps)
+    train_input_fn = run_classifier.input_fn_builder(
+        features=train_features,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True)
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+  if FLAGS.do_eval:
+    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+    eval_features = run_classifier.convert_examples_to_features(
+        eval_examples, label_list, FLAGS.max_seq_length, tokenizer)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d", len(eval_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      # Eval will be slightly WRONG on the TPU because it will truncate
+      # the last batch.
+      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = run_classifier.input_fn_builder(
+        features=eval_features,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    if FLAGS.use_tpu:
+      # Discard batch remainder if running on TPU
+      n = len(predict_examples)
+      predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)]
+
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    run_classifier.file_based_convert_examples_to_features(
+        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
+        predict_file)
+
+    tf.logging.info("***** Running prediction*****")
+    tf.logging.info("  Num examples = %d", len(predict_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_input_fn = run_classifier.file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=FLAGS.use_tpu)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+      tf.logging.info("***** Predict results *****")
+      for prediction in result:
+        probabilities = prediction["probabilities"]
+        output_line = "\t".join(
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
+        writer.write(output_line)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("bert_hub_module_handle")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
@@ -0,0 +1,871 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+"""
+Copyright 2018 The Google AI Language Team Authors.
+BASED ON Google_BERT.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os, sys
+import pickle
+
+import tensorflow as tf
+import numpy as np
+
+sys.path.append("/workspace/bert")
+
+from biobert.conlleval import evaluate, report_notprint
+import modeling
+import optimization
+import tokenization
+import tf_metrics
+
+import time
+import horovod.tensorflow as hvd
+from utils.utils import LogEvalRunHook, LogTrainRunHook
+
+# Short aliases for TensorFlow's command-line flag module and its parsed
+# values.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+# --- Task selection and input/output locations ---
+flags.DEFINE_string(
+    "task_name", "NER", "The name of the task to train."
+)
+
+flags.DEFINE_string(
+    "data_dir", None,
+    "The input datadir.",
+)
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written."
+)
+
+# --- Pre-trained model files ---
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model."
+)
+
+flags.DEFINE_string(
+    "vocab_file", None,
+    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model)."
+)
+
+# --- Text preprocessing ---
+flags.DEFINE_bool(
+    "do_lower_case", False,
+    "Whether to lower case the input text."
+)
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization."
+)
+
+# --- Run modes; all default to False ---
+flags.DEFINE_bool(
+    "do_train", False,
+    "Whether to run training."
+)
+
+flags.DEFINE_bool(
+    "do_eval", False,
+    "Whether to run eval on the dev set.")
+
+flags.DEFINE_bool(
+    "do_predict", False,
+    "Whether to run the model in inference mode on the test set.")
+
+# --- Batch sizes ---
+flags.DEFINE_integer(
+    "train_batch_size", 64,
+    "Total batch size for training.")
+
+flags.DEFINE_integer(
+    "eval_batch_size", 16,
+    "Total batch size for eval.")
+
+flags.DEFINE_integer(
+    "predict_batch_size", 16,
+    "Total batch size for predict.")
+
+# --- Optimization schedule ---
+flags.DEFINE_float(
+    "learning_rate", 5e-6,
+    "The initial learning rate for Adam.")
+
+flags.DEFINE_float(
+    "num_train_epochs", 10.0,
+    "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+    "warmup_proportion", 0.1,
+    "Proportion of training to perform linear learning rate warmup for. "
+    "E.g., 0.1 = 10% of training.")
+
+# --- Checkpointing and estimator loop ---
+flags.DEFINE_integer(
+    "save_checkpoints_steps", 1000,
+    "How often to save the model checkpoint.")
+
+flags.DEFINE_integer(
+    "iterations_per_loop", 1000,
+    "How many steps to make in each estimator call.")
+
+tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
+
+# --- Multi-GPU, precision and compiler switches; all default to False ---
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
+flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
+flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+
+    def __init__(self, guid, text, label=None):
+        """Constructs a InputExample.
+
+        Args:
+          guid: Unique id for the example.
+          text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+          label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text = text
+        self.label = label
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_ids, ):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_ids = label_ids
+        # self.label_mask = label_mask
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_data(cls, input_file):
+        """Reads a BIO data."""
+        with tf.gfile.Open(input_file, "r") as f:
+            lines = []
+            words = []
+            labels = []
+            for line in f:
+                contends = line.strip()
+                if len(contends) == 0:
+                    assert len(words) == len(labels)
+                    if len(words) > 30:
+                        # split if the sentence is longer than 30
+                        while len(words) > 30:
+                            tmplabel = labels[:30]
+                            for iidx in range(len(tmplabel)):
+                                if tmplabel.pop() == 'O':
+                                    break
+                            l = ' '.join(
+                                [label for label in labels[:len(tmplabel) + 1] if len(label) > 0])
+                            w = ' '.join(
+                                [word for word in words[:len(tmplabel) + 1] if len(word) > 0])
+                            lines.append([l, w])
+                            words = words[len(tmplabel) + 1:]
+                            labels = labels[len(tmplabel) + 1:]
+
+                    if len(words) == 0:
+                        continue
+                    l = ' '.join([label for label in labels if len(label) > 0])
+                    w = ' '.join([word for word in words if len(word) > 0])
+                    lines.append([l, w])
+                    words = []
+                    labels = []
+                    continue
+
+                word = line.strip().split()[0]
+                label = line.strip().split()[-1]
+                words.append(word)
+                labels.append(label)
+            return lines
+
+
+class BC5CDRProcessor(DataProcessor):
+    def get_train_examples(self, data_dir):
+        l1 = self._read_data(os.path.join(data_dir, "train.tsv"))
+        l2 = self._read_data(os.path.join(data_dir, "devel.tsv"))
+        return self._create_example(l1 + l2, "train")
+
+    def get_dev_examples(self, data_dir, file_name="devel.tsv"):
+        return self._create_example(
+            self._read_data(os.path.join(data_dir, file_name)), "dev"
+        )
+
+    def get_test_examples(self, data_dir, file_name="test.tsv"):
+        return self._create_example(
+            self._read_data(os.path.join(data_dir, file_name)), "test")
+
+    def get_labels(self):
+        return ["B", "I", "O", "X", "[CLS]", "[SEP]"]
+
+    def _create_example(self, lines, set_type):
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text = tokenization.convert_to_unicode(line[1])
+            label = tokenization.convert_to_unicode(line[0])
+            examples.append(InputExample(guid=guid, text=text, label=label))
+        return examples
+
+
+class CLEFEProcessor(DataProcessor):
+    def get_train_examples(self, data_dir):
+        lines1 = self._read_data2(os.path.join(data_dir, "Training.tsv"))
+        lines2 = self._read_data2(os.path.join(data_dir, "Development.tsv"))
+        return self._create_example(
+            lines1 + lines2, "train"
+        )
+
+    def get_dev_examples(self, data_dir, file_name="Development.tsv"):
+        return self._create_example(
+            self._read_data2(os.path.join(data_dir, file_name)), "dev"
+        )
+
+    def get_test_examples(self, data_dir, file_name="Test.tsv"):
+        return self._create_example(
+            self._read_data2(os.path.join(data_dir, file_name)), "test")
+
+    def get_labels(self):
+        return ["B", "I", "O", "X", "[CLS]", "[SEP]"]
+
+    def _create_example(self, lines, set_type):
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text = tokenization.convert_to_unicode(line[1])
+            label = tokenization.convert_to_unicode(line[0])
+            examples.append(InputExample(guid=guid, text=text, label=label))
+        return examples
+
+    @classmethod
+    def _read_data2(cls, input_file):
+        with tf.gfile.Open(input_file, "r") as f:
+            lines = []
+            words = []
+            labels = []
+            for line in f:
+                contends = line.strip()
+                if len(contends) == 0:
+                    assert len(words) == len(labels)
+                    if len(words) == 0:
+                        continue
+                    l = ' '.join([label for label in labels if len(label) > 0])
+                    w = ' '.join([word for word in words if len(word) > 0])
+                    lines.append([l, w])
+                    words = []
+                    labels = []
+                    continue
+                elif contends.startswith('###'):
+                    continue
+
+                word = line.strip().split()[0]
+                label = line.strip().split()[-1]
+                words.append(word)
+                labels.append(label)
+            return lines
+
+
+class I2b22012Processor(CLEFEProcessor):
+    def get_labels(self):
+        return ['B-CLINICAL_DEPT', 'B-EVIDENTIAL', 'B-OCCURRENCE', 'B-PROBLEM', 'B-TEST', 'B-TREATMENT', 'I-CLINICAL_DEPT', 'I-EVIDENTIAL', 'I-OCCURRENCE', 'I-PROBLEM', 'I-TEST', 'I-TREATMENT', "O", "X", "[CLS]", "[SEP]"]
+
+
+def write_tokens(tokens, labels, mode):
+    if mode == "test":
+        path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
+        if tf.gfile.Exists(path):
+            wf = tf.gfile.Open(path, 'a')
+        else:
+            wf = tf.gfile.Open(path, 'w')
+        for token, label in zip(tokens, labels):
+            if token != "**NULL**":
+                wf.write(token + ' ' + str(label) + '\n')
+        wf.close()
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode):
+    label_map = {}
+    for (i, label) in enumerate(label_list, 1):
+        label_map[label] = i
+    label2id_file = os.path.join(FLAGS.output_dir, 'label2id.pkl')
+    if not tf.gfile.Exists(label2id_file):
+        with tf.gfile.Open(label2id_file, 'wb') as w:
+            pickle.dump(label_map, w)
+    textlist = example.text.split(' ')
+    labellist = example.label.split(' ')
+    tokens = []
+    labels = []
+    for i, word in enumerate(textlist):
+        token = tokenizer.tokenize(word)
+        tokens.extend(token)
+        label_1 = labellist[i]
+        for m in range(len(token)):
+            if m == 0:
+                labels.append(label_1)
+            else:
+                labels.append("X")
+    # tokens = tokenizer.tokenize(example.text)
+    if len(tokens) >= max_seq_length - 1:
+        tokens = tokens[0:(max_seq_length - 2)]
+        labels = labels[0:(max_seq_length - 2)]
+    ntokens = []
+    segment_ids = []
+    label_ids = []
+    ntokens.append("[CLS]")
+    segment_ids.append(0)
+    # append("O") or append("[CLS]") not sure!
+    label_ids.append(label_map["[CLS]"])
+    for i, token in enumerate(tokens):
+        ntokens.append(token)
+        segment_ids.append(0)
+        label_ids.append(label_map[labels[i]])
+    ntokens.append("[SEP]")
+    segment_ids.append(0)
+    # append("O") or append("[SEP]") not sure!
+    label_ids.append(label_map["[SEP]"])
+    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+    input_mask = [1] * len(input_ids)
+    # label_mask = [1] * len(input_ids)
+    while len(input_ids) < max_seq_length:
+        input_ids.append(0)
+        input_mask.append(0)
+        segment_ids.append(0)
+        # we don't concerned about it!
+        label_ids.append(0)
+        ntokens.append("**NULL**")
+        # label_mask.append(0)
+    # print(len(input_ids))
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+    assert len(label_ids) == max_seq_length
+    # assert len(label_mask) == max_seq_length
+
+    if ex_index < 5:
+        tf.logging.info("*** Example ***")
+        tf.logging.info("guid: %s" % (example.guid))
+        tf.logging.info("tokens: %s" % " ".join(
+            [tokenization.printable_text(x) for x in tokens]))
+        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
+        # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
+
+    feature = InputFeatures(
+        input_ids=input_ids,
+        input_mask=input_mask,
+        segment_ids=segment_ids,
+        label_ids=label_ids,
+        # label_mask = label_mask
+    )
+    # write_tokens(ntokens, label_ids, mode)
+    return feature
+
+
+def filed_based_convert_examples_to_features(
+        examples, label_list, max_seq_length, tokenizer, output_file, mode=None):
+    writer = tf.python_io.TFRecordWriter(output_file)
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 5000 == 0:
+            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+        feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer,
+                                         mode)
+
+        def create_int_feature(values):
+            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+            return f
+
+        features = collections.OrderedDict()
+        features["input_ids"] = create_int_feature(feature.input_ids)
+        features["input_mask"] = create_int_feature(feature.input_mask)
+        features["segment_ids"] = create_int_feature(feature.segment_ids)
+        features["label_ids"] = create_int_feature(feature.label_ids)
+        # features["label_mask"] = create_int_feature(feature.label_mask)
+        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+        writer.write(tf_example.SerializeToString())
+
+
+def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training, drop_remainder, hvd=None):
+    name_to_features = {
+        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
+        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
+        "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
+        # "label_ids":tf.VarLenFeature(tf.int64),
+        # "label_mask": tf.FixedLenFeature([seq_length], tf.int64),
+    }
+
+    def _decode_record(record, name_to_features):
+        example = tf.parse_single_example(record, name_to_features)
+        for name in list(example.keys()):
+            t = example[name]
+            if t.dtype == tf.int64:
+                t = tf.to_int32(t)
+            example[name] = t
+        return example
+
+    def input_fn(params):
+        #batch_size = params["batch_size"]
+        d = tf.data.TFRecordDataset(input_file)
+        if is_training:
+            if hvd is not None: d = d.shard(hvd.size(), hvd.rank())
+            d = d.repeat()
+            d = d.shuffle(buffer_size=100)
+
+        d = d.apply(tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder
+        ))
+        return d
+
+    return input_fn
+
+
+def create_model(bert_config, is_training, input_ids, input_mask,
+                 segment_ids, labels, num_labels, use_one_hot_embeddings):
+    """Build the BERT encoder plus a per-token linear classifier and its loss.
+
+    Args:
+      bert_config: configuration handed to `modeling.BertModel`.
+      is_training: enables dropout (keep_prob 0.9) on the sequence output.
+      input_ids, input_mask, segment_ids: model inputs forwarded to `BertModel`.
+      labels: label ids, one-hot encoded with depth `num_labels` for the loss.
+      num_labels: size of the classifier output.
+      use_one_hot_embeddings: forwarded to `BertModel`.
+
+    Returns:
+      A tuple `(loss, per_example_loss, logits, predict)`: `logits` is reshaped
+      to [-1, FLAGS.max_seq_length, num_labels]; `per_example_loss` is the
+      cross entropy summed over the label axis (so one value per token
+      position); `loss` is its mean; `predict` is the argmax over the label axis.
+    """
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=is_training,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=segment_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings
+    )
+
+    # Per-token encoder output; its last dimension is the hidden size.
+    output_layer = model.get_sequence_output()
+
+    hidden_size = output_layer.shape[-1].value
+
+    # Classifier parameters are created outside the "loss" variable scope.
+    output_weight = tf.get_variable(
+        "output_weights", [num_labels, hidden_size],
+        initializer=tf.truncated_normal_initializer(stddev=0.02)
+    )
+    output_bias = tf.get_variable(
+        "output_bias", [num_labels], initializer=tf.zeros_initializer()
+    )
+    with tf.variable_scope("loss"):
+        if is_training:
+            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+        # Flatten all token positions, apply the linear layer, then restore
+        # the sequence dimension using the max_seq_length flag.
+        output_layer = tf.reshape(output_layer, [-1, hidden_size])
+        logits = tf.matmul(output_layer, output_weight, transpose_b=True)
+        logits = tf.nn.bias_add(logits, output_bias)
+        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])
+        # Disabled alternative that weighted the loss by input_mask:
+        # mask = tf.cast(input_mask,tf.float32)
+        # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask)
+        # return (loss, logits, predict)
+        ##########################################################################
+        # Active loss: plain softmax cross entropy over every position;
+        # input_mask is not applied, so padded positions contribute as well.
+        log_probs = tf.nn.log_softmax(logits, axis=-1)
+        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+        loss = tf.reduce_mean(per_example_loss)
+        probabilities = tf.nn.softmax(logits, axis=-1)
+        predict = tf.argmax(probabilities, axis=-1)
+        return (loss, per_example_loss, logits, predict)
+        ##########################################################################
+
+
+def model_fn_builder(bert_config, num_labels, init_checkpoint=None, learning_rate=None,
+                     num_train_steps=None, num_warmup_steps=None,
+                     use_one_hot_embeddings=False, hvd=None, use_fp16=False):
+    def model_fn(features, labels, mode, params):
+        tf.logging.info("*** Features ***")
+        for name in sorted(features.keys()):
+            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+        input_ids = features["input_ids"]
+        input_mask = features["input_mask"]
+        segment_ids = features["segment_ids"]
+        label_ids = features["label_ids"]
+        # label_mask = features["label_mask"]
+        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+        (total_loss, per_example_loss, logits, predicts) = create_model(
+            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
+            num_labels, use_one_hot_embeddings)
+        tvars = tf.trainable_variables()
+        initialized_variable_names = {}
+        scaffold_fn = None
+        if init_checkpoint and (hvd is None or hvd.rank() == 0):
+            (assignment_map,
+             initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
+                                                                                       init_checkpoint)
+            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        tf.logging.info("**** Trainable Variables ****")
+
+        for var in tvars:
+            init_string = ""
+            if var.name in initialized_variable_names:
+                init_string = ", *INIT_FROM_CKPT*"
+            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                            init_string)
+        output_spec = None
+        if mode == tf.estimator.ModeKeys.TRAIN:
+            train_op = optimization.create_optimizer(
+                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16)
+            output_spec = tf.estimator.EstimatorSpec(
+              mode=mode,
+              loss=total_loss,
+              train_op=train_op)
+        elif mode == tf.estimator.ModeKeys.EVAL:
+
+            def metric_fn(per_example_loss, label_ids, logits):
+                # def metric_fn(label_ids, logits):
+                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+                precision = tf_metrics.precision(label_ids, predictions, num_labels, [1, 2], average="macro")
+                recall = tf_metrics.recall(label_ids, predictions, num_labels, [1, 2], average="macro")
+                f = tf_metrics.f1(label_ids, predictions, num_labels, [1, 2], average="macro")
+                #
+                return {
+                    "eval_precision": precision,
+                    "eval_recall": recall,
+                    "eval_f": f,
+                    # "eval_loss": loss,
+                }
+
+            eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
+            output_spec = tf.estimator.EstimatorSpec(
+              mode=mode,
+              loss=total_loss,
+              eval_metric_ops=eval_metric_ops)
+        else:
+            output_spec = tf.estimator.EstimatorSpec(
+              mode=mode, predictions=predicts)#probabilities)
+        return output_spec
+
+    return model_fn
+
+
+def result_to_pair(predict_line, pred_ids, id2label, writer, err_writer):
+
+    words = str(predict_line.text).split(' ')
+    labels = str(predict_line.label).split(' ')
+    if len(words) != len(labels):
+        tf.logging.error('Text and label not equal')
+        tf.logging.error(predict_line.text)
+        tf.logging.error(predict_line.label)
+        exit(1)
+
+    # get from CLS to SEP
+    pred_labels = []
+    for id in pred_ids:
+        if id == 0:
+            continue
+        curr_label = id2label[id]
+        if curr_label == '[CLS]':
+            continue
+        elif curr_label == '[SEP]':
+            break
+        elif curr_label == 'X':
+            continue
+        pred_labels.append(curr_label)
+    if len(pred_labels) > len(words):
+        err_writer.write(predict_line.guid + '\n')
+        err_writer.write(predict_line.text + '\n')
+        err_writer.write(predict_line.label + '\n')
+        err_writer.write(' '.join([str(i) for i in pred_ids]) + '\n')
+        err_writer.write(' '.join([id2label.get(i, '**NULL**') for i in pred_ids]) + '\n\n')
+        pred_labels = pred_labels[:len(words)]
+    elif len(pred_labels) < len(words):
+        err_writer.write(predict_line.guid + '\n')
+        err_writer.write(predict_line.text + '\n')
+        err_writer.write(predict_line.label + '\n')
+        err_writer.write(' '.join([str(i) for i in pred_ids]) + '\n')
+        err_writer.write(' '.join([id2label.get(i, '**NULL**') for i in pred_ids]) + '\n\n')
+        pred_labels += ['O'] * (len(words) - len(pred_labels))
+
+    for tok, label, pred_label in zip(words, labels, pred_labels):
+        writer.write(tok + ' ' + label + ' ' + pred_label + '\n')
+    writer.write('\n')
+
+
+def main(_):
+    tf.logging.set_verbosity(tf.logging.INFO)
+
+    if FLAGS.horovod:
+      hvd.init()
+    if FLAGS.use_fp16:
+        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+
+    processors = {
+        "bc5cdr": BC5CDRProcessor,
+        "clefe": CLEFEProcessor,
+        'i2b2': I2b22012Processor
+    }
+    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+       raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+        raise ValueError(
+            "Cannot use sequence length %d because the BERT model "
+            "was only trained up to sequence length %d" %
+            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+    task_name = FLAGS.task_name.lower()
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+    processor = processors[task_name]()
+
+    label_list = processor.get_labels()
+
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+
+    master_process = True
+    training_hooks = []
+    global_batch_size = FLAGS.train_batch_size
+    hvd_rank = 0
+
+    config = tf.ConfigProto()
+    if FLAGS.horovod:
+      global_batch_size = FLAGS.train_batch_size * hvd.size()
+      master_process = (hvd.rank() == 0)
+      hvd_rank = hvd.rank()
+      config.gpu_options.visible_device_list = str(hvd.local_rank())
+      if hvd.size() > 1:
+        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
+
+    if FLAGS.use_xla:
+        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+    run_config = tf.estimator.RunConfig(
+      model_dir=FLAGS.output_dir if master_process else None,
+      session_config=config,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
+      keep_checkpoint_max=1)
+
+    if master_process:
+      tf.logging.info("***** Configuaration *****")
+      for key in FLAGS.__flags.keys():
+          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+      tf.logging.info("**************************")
+
+    train_examples = None
+    num_train_steps = None
+    num_warmup_steps = None
+    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))
+
+    if FLAGS.do_train:
+        train_examples = processor.get_train_examples(FLAGS.data_dir)
+        num_train_steps = int(
+            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
+        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+        start_index = 0
+        end_index = len(train_examples)
+        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]
+
+        if FLAGS.horovod:
+          tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
+          num_examples_per_rank = len(train_examples) // hvd.size()
+          remainder = len(train_examples) % hvd.size()
+          if hvd.rank() < remainder:
+            start_index = hvd.rank() * (num_examples_per_rank+1)
+            end_index = start_index + num_examples_per_rank + 1
+          else:
+            start_index = hvd.rank() * num_examples_per_rank + remainder
+            end_index = start_index + (num_examples_per_rank)
+
+    model_fn = model_fn_builder(
+        bert_config=bert_config,
+        num_labels=len(label_list) + 1,
+        init_checkpoint=FLAGS.init_checkpoint,
+        learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
+        num_train_steps=num_train_steps,
+        num_warmup_steps=num_warmup_steps,
+        use_one_hot_embeddings=False,
+        hvd=None if not FLAGS.horovod else hvd,
+        use_fp16=FLAGS.use_fp16)
+
+    estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      config=run_config)
+
+    if FLAGS.do_train:
+        #train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
+        #filed_based_convert_examples_to_features(
+        #    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
+        filed_based_convert_examples_to_features(
+          train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
+        tf.logging.info("***** Running training *****")
+        tf.logging.info("  Num examples = %d", len(train_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+        tf.logging.info("  Num steps = %d", num_train_steps)
+        train_input_fn = file_based_input_fn_builder(
+            input_file=tmp_filenames, #train_file,
+            batch_size=FLAGS.train_batch_size,
+            seq_length=FLAGS.max_seq_length,
+            is_training=True,
+            drop_remainder=True,
+            hvd=None if not FLAGS.horovod else hvd)
+        
+        #estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+        train_start_time = time.time()
+        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
+        train_time_elapsed = time.time() - train_start_time
+        train_time_wo_overhead = training_hooks[-1].total_time
+        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
+        ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead
+
+        if master_process:
+          tf.logging.info("-----------------------------")
+          tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
+                        num_train_steps * global_batch_size)
+          tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
+                        (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
+          tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
+          tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+          tf.logging.info("-----------------------------")
+
+    if FLAGS.do_eval and master_process:
+        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
+        filed_based_convert_examples_to_features(
+            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+        tf.logging.info("***** Running evaluation *****")
+        tf.logging.info("  Num examples = %d", len(eval_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+        eval_steps = None
+        eval_drop_remainder = False
+        eval_input_fn = file_based_input_fn_builder(
+            input_file=eval_file,
+            batch_size=FLAGS.eval_batch_size,
+            seq_length=FLAGS.max_seq_length,
+            is_training=False,
+            drop_remainder=eval_drop_remainder)
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+        with tf.gfile.Open(output_eval_file, "w") as writer:
+            tf.logging.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                tf.logging.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+    if FLAGS.do_predict and master_process:
+        predict_examples = processor.get_test_examples(FLAGS.data_dir)
+        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+        filed_based_convert_examples_to_features(predict_examples, label_list,
+                                                 FLAGS.max_seq_length, tokenizer,
+                                                 predict_file, mode="test")
+
+        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
+            label2id = pickle.load(rf)
+            id2label = {value: key for key, value in label2id.items()}
+        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
+        if tf.gfile.Exists(token_path):
+            tf.gfile.Remove(token_path)
+
+        tf.logging.info("***** Running prediction*****")
+        tf.logging.info("  Num examples = %d", len(predict_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+        predict_drop_remainder = False
+        predict_input_fn = file_based_input_fn_builder(
+            input_file=predict_file,
+            batch_size=FLAGS.predict_batch_size,
+            seq_length=FLAGS.max_seq_length,
+            is_training=False,
+            drop_remainder=predict_drop_remainder)
+
+        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
+        eval_start_time = time.time()
+
+        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
+        test_labels_file = os.path.join(FLAGS.output_dir, "test_labels.txt")
+        test_labels_err_file = os.path.join(FLAGS.output_dir, "test_labels_errs.txt")
+        with tf.gfile.Open(output_predict_file, 'w') as writer, \
+                tf.gfile.Open(test_labels_file, 'w') as tl, \
+                tf.gfile.Open(test_labels_err_file, 'w') as tle:
+            print(id2label)
+            i=0
+            for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks,
+                                                yield_single_examples=True):
+                output_line = "\n".join(id2label[id] for id in prediction if id != 0) + "\n"
+                writer.write(output_line)
+                result_to_pair(predict_examples[i], prediction, id2label, tl, tle)
+                i = i + 1
+
+        eval_time_elapsed = time.time() - eval_start_time
+        eval_time_wo_overhead = eval_hooks[-1].total_time
+
+        time_list = eval_hooks[-1].time_list
+        time_list.sort()
+        num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size
+
+        avg = np.mean(time_list)
+        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
+        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
+        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
+        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
+        cf_100 = max(time_list[:int(len(time_list) * 1)])
+        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
+
+        tf.logging.info("-----------------------------")
+        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
+                        eval_hooks[-1].count * FLAGS.predict_batch_size)
+        tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
+                        (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
+        tf.logging.info("Summary Inference Statistics")
+        tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
+        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+        tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
+        tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
+        tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
+        tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
+        tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
+        tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
+        tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+        tf.logging.info("-----------------------------")
+
+        tf.logging.info('Reading: %s', test_labels_file)
+        with tf.gfile.Open(test_labels_file, "r") as f:
+            counts = evaluate(f)
+        eval_result = report_notprint(counts)
+        print(''.join(eval_result))
+        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'test_results_conlleval.txt'), 'w') as fd:
+            fd.write(''.join(eval_result))
+
+
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("data_dir")
+    flags.mark_flag_as_required("task_name")
+    flags.mark_flag_as_required("vocab_file")
+    flags.mark_flag_as_required("bert_config_file")
+    flags.mark_flag_as_required("output_dir")
+    tf.app.run()
@@ -0,0 +1,818 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run masked LM/next sentence masked_lm pre-training for BERT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import modeling
+import optimization
+import tensorflow as tf
+import glob
+from utils.utils import LogEvalRunHook
+from tensorflow.core.protobuf import rewriter_config_pb2
+from gpu_environment import get_custom_getter
+
+from npu_bridge.estimator.npu.npu_config import *
+from npu_bridge.estimator.npu.npu_estimator import *
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
+
+import sys
+
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), '../../../../../utils/atlasboost'))
+# import hwlog
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+
+# Process-wide environment variables, set as a side effect of importing this
+# module (i.e. before any flag parsing or session creation happens).
+# NOTE(review): presumably these are read by the NPU graph engine / HCCL
+# runtime behind npu_bridge -- confirm the exact meaning of each key against
+# the runtime documentation before changing a value.
+os.environ['WHICH_OP'] = 'GEOP'
+os.environ['NEW_GE_FE_ID'] = '1'
+os.environ['GE_AICPU_FLAG'] = '1'
+os.environ['GE_USE_STATIC_MEMORY'] = '1'
+os.environ['OPTION_EXEC_HCCL_FLAG'] = '1'
+# NOTE(review): looks like a timeout in seconds -- confirm the unit.
+os.environ['HCCL_CONNECT_TIMEOUT'] = '600'
+
+# Module-level aliases: `flags` is used below to declare the command-line
+# options, `FLAGS` is the container the rest of the file reads them from.
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+## Required parameters
+# All four default to None. main() reads `bert_config_file`,
+# `input_files_dir` and `output_dir` unconditionally, and `eval_files_dir`
+# when --do_eval is set.
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string(
+    "input_files_dir", None,
+    "Directory with input files, comma separated or single directory.")
+
+flags.DEFINE_string(
+    "eval_files_dir", None,
+    "Directory with eval files, comma separated or single directory. ")
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written.")
+
+## Other parameters
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_string(
+    "optimizer_type", "lamb",
+    "Optimizer used for training - LAMB or ADAM")
+
+# The two sizes below are used as the fixed feature lengths when the input
+# records are parsed (see input_fn_builder).
+flags.DEFINE_integer(
+    "max_seq_length", 512,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded. Must match data generation.")
+
+flags.DEFINE_integer(
+    "max_predictions_per_seq", 80,
+    "Maximum number of masked LM predictions per sequence. "
+    "Must match data generation.")
+
+# main() raises ValueError unless at least one of do_train / do_eval is set.
+flags.DEFINE_bool("do_train", False, "Whether to run training.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+# When --distributed is set, main() multiplies this value by RANK_SIZE before
+# handing it to the model function.
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.")
+
+flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 1000,
+                     "How often to save the model checkpoint.")
+# Passed to _LogSessionRunHook as `display_every`.
+flags.DEFINE_integer("display_loss_steps", 10,
+                     "How often to print loss")
+
+# Forwarded to NPURunConfig and also used by _LogSessionRunHook to scale the
+# reported throughput.
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+
+flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.")
+
+# Values above 1 make the logging hook additionally fetch 'update_step:0' and
+# report only on runs where that tensor is true.
+flags.DEFINE_integer("num_accumulation_steps", 1,
+                     "Number of accumulation steps before gradient update. "
+                     "Global batch size = num_accumulation_steps * train_batch_size")
+
+# Forwarded to optimization.create_optimizer by the model function.
+flags.DEFINE_bool("allreduce_post_accumulation", False,
+                  "Whether to all reduce after accumulation of N steps or after each step")
+
+flags.DEFINE_bool(
+    "verbose_logging", False,
+    "If true, all of the trainable parameters are printed")
+
+# main() imports and initialises horovod.tensorflow only when this is set.
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
+
+# Enables the _LogSessionRunHook training hook in main().
+flags.DEFINE_bool("report_loss", True, "Whether to report total loss during training.")
+
+# manual_fp16 and use_fp16 are mutually exclusive: main() raises ValueError
+# when both are set.
+flags.DEFINE_bool("manual_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU. "
+                                        "Manual casting is done instead of using AMP")
+
+flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+
+# When set, main() exports TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1.
+flags.DEFINE_bool("use_fp16", False, "Whether to enable AMP ops.")
+
+# Controls the fp16 casts in get_masked_lm_output / get_next_sentence_output.
+flags.DEFINE_bool("use_fp16_cls", False, "Whether to use fp16 in cls and pooler.")
+
+# When set, RANK_SIZE / RANK_ID are read from the environment to shard the
+# input files and to scale the learning rate and the global batch size.
+flags.DEFINE_bool("distributed", False, "Whether to use multi-npu")
+
+flags.DEFINE_bool('npu_bert_fused_gelu', True, 'Whether to use npu defined gelu op')
+
+# In this file the flag disables the two dataset shuffles and forces an
+# interleave cycle length of 1 (see input_fn_builder).
+flags.DEFINE_bool('npu_bert_debug', False, 'If True, dropout and shuffle is disabled.')
+
+# Forwarded to NPURunConfig as `enable_data_pre_proc`.
+flags.DEFINE_bool('npu_bert_use_tdt', True, 'Whether to use tdt as dataset')
+
+# Forwarded to NPUEstimator as `job_start_file`.
+flags.DEFINE_string("npu_bert_job_start_file", None, "CSA job start file path.")
+
+# A value of 0 combined with manual_fp16 or use_fp16 makes the logging hook
+# fetch and print the 'loss_scale:0' tensor.
+flags.DEFINE_integer("npu_bert_loss_scale", -1,
+                     "Whether to use loss scale, -1 is disable, 0 is dynamic loss scale, >=1 is static loss scale")
+
+flags.DEFINE_bool("npu_bert_clip_by_global_norm", True,
+                  "Use clip_by_global_norm if True, or use clip_by_norm for each gradient")
+
+flags.DEFINE_bool('npu_bert_npu_dropout', True, 'Whether to use npu defined dropout op')
+
+# Forwarded to NPURunConfig as `is_tailing_optimization`.
+flags.DEFINE_bool('npu_bert_tail_optimize', False, 'Whether to use npu allreduce tail optimization')
+
+# In main() this flag also gates the selection of the HCCL gradient split
+# strategy for distributed runs.
+flags.DEFINE_bool('npu_gather', True, 'Whether to use gather_npu whose backward propagation avoids IndexedSlices')
+
+# Forwarded to NPURunConfig as `hcom_parallel`.
+flags.DEFINE_bool('hcom_parallel', True, 'Whether to use parallel allreduce')
+
+flags.DEFINE_integer('init_loss_scale_value', 2 ** 32, 'Initial loss scale value for loss scale optimizer')
+
+flags.DEFINE_bool('npu_bert_use_fused_batch_norm', False,
+                  'Whether to use fused batch norm implementation in fused_layer_norm')
+
+flags.DEFINE_bool('npu_bert_use_fused_adam_momentum', True, 'Whether to use fused apply and assign in adam')
+
+# Both memory limits are forwarded unchanged to NPURunConfig.
+# NOTE(review): presumably expressed in bytes (27 GiB and 4 GiB) -- confirm.
+flags.DEFINE_integer('graph_memory_max_size', 27 * 1024 * 1024 * 1024, 'feature map memory max size')
+
+flags.DEFINE_integer('variable_memory_max_size', 4 * 1024 * 1024 * 1024, 'variable memory max size')
+
+
+# report samples/sec, total loss and learning rate during training
+class _LogSessionRunHook(tf.train.SessionRunHook):
+    """Session hook that prints throughput, losses and learning rate.
+
+    On every `session.run` the hook fetches the named tensors
+    'global_step:0', 'total_loss:0', 'learning_rate:0', 'nsp_loss:0' and
+    'mlm_loss:0' (plus 'update_step:0' when gradient accumulation is enabled
+    and 'loss_scale:0' when dynamic loss scaling is enabled) and prints one
+    report line for every run in which the weights were updated.
+
+    Args:
+      global_batch_size: Number of samples per optimizer update; used as the
+        numerator of the throughput figure.
+      num_accumulation_steps: Gradient accumulation steps. Values above 1 make
+        the hook fetch 'update_step:0' and report only when it is true.
+      display_every: Stored on the instance but not read by this hook.
+      hvd_rank: When >= 0 the report line is prefixed with this rank.
+    """
+
+    def __init__(self, global_batch_size, num_accumulation_steps, display_every=10, hvd_rank=-1):
+        self.global_batch_size = global_batch_size
+        self.display_every = display_every
+        self.hvd_rank = hvd_rank
+        self.num_accumulation_steps = num_accumulation_steps
+
+    def after_create_session(self, session, coord):
+        """Zero the timing and loss accumulators once the session exists."""
+        self.elapsed_secs = 0.
+        self.count = 0
+        self.all_count = 0
+        self.avg_loss = 0.0
+
+    @staticmethod
+    def _reports_loss_scale():
+        """Return True when 'loss_scale:0' has to be fetched and printed."""
+        return bool((FLAGS.npu_bert_loss_scale == 0) and (FLAGS.manual_fp16 or FLAGS.use_fp16))
+
+    def before_run(self, run_context):
+        """Start the timer and request the tensors needed for the report."""
+        self.t0 = time.time()
+        fetches = ['global_step:0']
+        if self.num_accumulation_steps > 1:
+            fetches.append('update_step:0')
+        fetches.extend(['total_loss:0', 'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0'])
+        if self._reports_loss_scale():
+            fetches.append('loss_scale:0')
+        return tf.train.SessionRunArgs(fetches=fetches)
+
+    def after_run(self, run_context, run_values):
+        """Accumulate time and loss, and print a report on update steps."""
+        self.elapsed_secs += time.time() - self.t0
+        with_loss_scale = self._reports_loss_scale()
+
+        # The result order mirrors the fetch list built in before_run().
+        results = list(run_values.results)
+        global_step = results.pop(0)
+        if self.num_accumulation_steps <= 1:
+            update_step = True
+        else:
+            update_step = results.pop(0)
+        if with_loss_scale:
+            total_loss, lr, nsp_loss, mlm_loss, loss_scaler = results
+        else:
+            total_loss, lr, nsp_loss, mlm_loss = results
+
+        print_step = global_step + 1  # One-based index for printing.
+        self.avg_loss += total_loss
+        self.all_count += 1
+        if not update_step:
+            # Keep accumulating time and loss until the weights are updated.
+            return
+
+        self.count += 1
+        dt = self.elapsed_secs / self.count
+        sent_per_sec = self.global_batch_size / dt * FLAGS.iterations_per_loop
+        avg_loss_step = self.avg_loss / self.all_count
+
+        message = (
+            'Step = %6i Throughput = %11.1f MLM Loss = %10.4e NSP Loss = %10.4e Loss = %9.6f Average Loss = %9.6f LR = %6.4e' %
+            (print_step, sent_per_sec, mlm_loss, nsp_loss, total_loss, avg_loss_step, lr))
+        if self.hvd_rank >= 0:
+            message = 'Rank = %2d :: ' % self.hvd_rank + message
+        if with_loss_scale:
+            message += ' Loss scale = %6.4e' % loss_scaler
+        print(message, flush=True)
+        hwlog.remark_print(key=hwlog.CURRENT_STEP, value='%6i' % print_step)
+        hwlog.remark_print(key=hwlog.THROWOUT, value='%11.1f' % sent_per_sec)
+
+        # Reset only after a report, so the time and loss collected during
+        # non-updating accumulation runs are included in the next report.
+        self.elapsed_secs = 0.
+        self.count = 0
+        self.avg_loss = 0.0
+        self.all_count = 0
+
+
+def model_fn_builder(bert_config, init_checkpoint, learning_rate,
+                     num_train_steps, num_warmup_steps,
+                     use_one_hot_embeddings, hvd=None):
+    """Create the `model_fn` closure used for BERT pre-training.
+
+    Args:
+      bert_config: Configuration object passed to `modeling.BertModel` and to
+        the two output heads.
+      init_checkpoint: Optional checkpoint path used to initialise variables.
+      learning_rate: Forwarded to `optimization.create_optimizer`.
+      num_train_steps: Forwarded to `optimization.create_optimizer`.
+      num_warmup_steps: Forwarded to `optimization.create_optimizer`.
+      use_one_hot_embeddings: Forwarded unchanged to `modeling.BertModel`.
+      hvd: Optional Horovod module; `None` when Horovod is not in use.
+
+    Returns:
+      A `model_fn(features, labels, mode, params)` callable that returns a
+      `tf.estimator.EstimatorSpec` for the TRAIN and EVAL modes and raises
+      `ValueError` for any other mode.
+    """
+
+    def _eval_metrics(mlm_example_loss, mlm_log_probs, mlm_ids, mlm_weights,
+                      nsp_example_loss, nsp_log_probs, nsp_labels):
+        """Build the accuracy and mean-loss metrics for both heads."""
+        # Masked LM head: flatten everything to one row per prediction slot.
+        mlm_log_probs = tf.reshape(mlm_log_probs, [-1, mlm_log_probs.shape[-1]])
+        mlm_predictions = tf.argmax(mlm_log_probs, axis=-1, output_type=tf.int32)
+        mlm_example_loss = tf.reshape(mlm_example_loss, [-1])
+        mlm_ids = tf.reshape(mlm_ids, [-1])
+        mlm_weights = tf.reshape(mlm_weights, [-1])
+        mlm_accuracy = tf.metrics.accuracy(
+            labels=mlm_ids, predictions=mlm_predictions, weights=mlm_weights)
+        mlm_mean_loss = tf.metrics.mean(values=mlm_example_loss, weights=mlm_weights)
+
+        # Next sentence head.
+        nsp_log_probs = tf.reshape(nsp_log_probs, [-1, nsp_log_probs.shape[-1]])
+        nsp_predictions = tf.argmax(nsp_log_probs, axis=-1, output_type=tf.int32)
+        nsp_labels = tf.reshape(nsp_labels, [-1])
+        nsp_accuracy = tf.metrics.accuracy(labels=nsp_labels, predictions=nsp_predictions)
+        nsp_mean_loss = tf.metrics.mean(values=nsp_example_loss)
+
+        return {
+            "masked_lm_accuracy": mlm_accuracy,
+            "masked_lm_loss": mlm_mean_loss,
+            "next_sentence_accuracy": nsp_accuracy,
+            "next_sentence_loss": nsp_mean_loss,
+        }
+
+    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+        """Build the pre-training graph and wrap it in an EstimatorSpec."""
+
+        tf.logging.info("*** Features ***")
+        for feature_name in sorted(features.keys()):
+            tf.logging.info("  name = %s, shape = %s" % (feature_name, features[feature_name].shape))
+
+        input_ids = features["input_ids"]
+        input_mask = features["input_mask"]
+        segment_ids = features["segment_ids"]
+        masked_lm_positions = features["masked_lm_positions"]
+        masked_lm_ids = features["masked_lm_ids"]
+        masked_lm_weights = features["masked_lm_weights"]
+        next_sentence_labels = features["next_sentence_labels"]
+
+        training_mode = tf.estimator.ModeKeys.TRAIN
+        compute_type = tf.float16 if FLAGS.manual_fp16 else tf.float32
+
+        model = modeling.BertModel(
+            config=bert_config,
+            is_training=(mode == training_mode),
+            input_ids=input_ids,
+            input_mask=input_mask,
+            token_type_ids=segment_ids,
+            use_one_hot_embeddings=use_one_hot_embeddings,
+            compute_type=compute_type)
+
+        mlm_outputs = get_masked_lm_output(
+            bert_config, model.get_sequence_output(), model.get_embedding_table(),
+            masked_lm_positions, masked_lm_ids,
+            masked_lm_weights)
+        masked_lm_loss, masked_lm_example_loss, masked_lm_log_probs = mlm_outputs
+
+        nsp_outputs = get_next_sentence_output(
+            bert_config, model.get_pooled_output(), next_sentence_labels)
+        next_sentence_loss, next_sentence_example_loss, next_sentence_log_probs = nsp_outputs
+
+        # The explicit names are what the logging hook fetches by name.
+        masked_lm_loss = tf.identity(masked_lm_loss, name="mlm_loss")
+        next_sentence_loss = tf.identity(next_sentence_loss, name="nsp_loss")
+        total_loss = tf.identity(masked_lm_loss + next_sentence_loss, name='total_loss')
+
+        tvars = tf.trainable_variables()
+
+        initialized_variable_names = {}
+        if init_checkpoint and (hvd is None or hvd.rank() == 0):
+            print("Loading checkpoint", init_checkpoint)
+            assignment_map, initialized_variable_names = (
+                modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint))
+            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+        if FLAGS.verbose_logging:
+            tf.logging.info("**** Trainable Variables ****")
+            for var in tvars:
+                init_string = ", *INIT_FROM_CKPT*" if var.name in initialized_variable_names else ""
+                tf.logging.info("  %d :: name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name,
+                                var.shape,
+                                init_string)
+
+        if mode == training_mode:
+            train_op = optimization.create_optimizer(
+                total_loss, learning_rate, num_train_steps, num_warmup_steps,
+                hvd, FLAGS.manual_fp16, FLAGS.use_fp16, FLAGS.num_accumulation_steps, FLAGS.optimizer_type,
+                FLAGS.allreduce_post_accumulation)
+            return tf.estimator.EstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                train_op=train_op)
+
+        if mode == tf.estimator.ModeKeys.EVAL:
+            eval_metric_ops = _eval_metrics(
+                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
+                masked_lm_weights, next_sentence_example_loss,
+                next_sentence_log_probs, next_sentence_labels)
+            return tf.estimator.EstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                eval_metric_ops=eval_metric_ops)
+
+        raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))
+
+    return model_fn
+
+
+def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
+                         label_ids, label_weights):
+    """Compute the masked LM loss, per-prediction loss and log probabilities.
+
+    Returns:
+      A tuple `(loss, per_example_loss, log_probs)` where `loss` is the
+      `label_weights`-weighted mean of `per_example_loss`.
+    """
+    fp16_cls = FLAGS.use_fp16_cls
+    gathered = gather_indexes(input_tensor, positions)
+
+    with tf.variable_scope("cls/predictions"):
+        # One extra non-linear transformation precedes the output layer.
+        transform_getter = get_custom_getter(
+            compute_type=tf.float16 if fp16_cls else tf.float32)
+        with tf.variable_scope("transform", custom_getter=transform_getter):
+            if fp16_cls:
+                gathered = tf.cast(gathered, tf.float16)
+            transformed = tf.layers.dense(
+                gathered,
+                units=bert_config.hidden_size,
+                activation=modeling.get_activation(bert_config.hidden_act),
+                kernel_initializer=modeling.create_initializer(
+                    bert_config.initializer_range))
+            # Layer norm is always applied to a float32 tensor.
+            transformed = tf.cast(transformed, tf.float32)
+            transformed = modeling.layer_norm(transformed)
+
+        # `output_weights` is supplied by the caller; only the per-token
+        # bias is created here.
+        output_bias = tf.get_variable(
+            "output_bias",
+            shape=[bert_config.vocab_size],
+            initializer=tf.zeros_initializer())
+        if fp16_cls:
+            half_input = tf.cast(transformed, tf.float16)
+            logits = tf.matmul(half_input, tf.cast(output_weights, tf.float16), transpose_b=True)
+            logits = tf.cast(logits, tf.float32)
+        else:
+            logits = tf.matmul(tf.cast(transformed, tf.float32), output_weights, transpose_b=True)
+        logits = tf.nn.bias_add(logits, output_bias)
+        log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+        flat_label_ids = tf.reshape(label_ids, [-1])
+        flat_label_weights = tf.reshape(label_weights, [-1])
+
+        one_hot_labels = tf.one_hot(
+            flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
+
+        # `label_weights` is 1.0 for real predictions and 0.0 for padded
+        # slots, so padding contributes nothing to the loss; the small
+        # constant guards against a zero denominator.
+        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
+        weighted_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
+        weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
+        loss = weighted_sum / weight_total
+
+    return (loss, per_example_loss, log_probs)
+
+
+def get_next_sentence_output(bert_config, input_tensor, labels):
+    """Compute the next-sentence loss, per-example loss and log probabilities.
+
+    A two-class classifier is applied to `input_tensor`; the returned `loss`
+    is the mean of `per_example_loss`.
+    """
+    with tf.variable_scope("cls/seq_relationship"):
+        output_weights = tf.get_variable(
+            "output_weights",
+            shape=[2, bert_config.hidden_size],
+            initializer=modeling.create_initializer(bert_config.initializer_range))
+        output_bias = tf.get_variable(
+            "output_bias", shape=[2], initializer=tf.zeros_initializer())
+
+        if FLAGS.use_fp16_cls:
+            # Run the projection in float16, then return to float32.
+            half_input = tf.cast(input_tensor, tf.float16)
+            half_weights = tf.cast(output_weights, tf.float16)
+            logits = tf.cast(
+                tf.matmul(half_input, half_weights, transpose_b=True), tf.float32)
+        else:
+            logits = tf.matmul(tf.cast(input_tensor, tf.float32), output_weights, transpose_b=True)
+        logits = tf.nn.bias_add(logits, output_bias)
+        log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+        flat_labels = tf.reshape(labels, [-1])
+        one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
+        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+        loss = tf.reduce_mean(per_example_loss)
+
+    return (loss, per_example_loss, log_probs)
+
+
+def gather_indexes(sequence_tensor, positions):
+    """Pick the vectors at `positions` out of a rank-3 `sequence_tensor`.
+
+    The tensor is flattened to two dimensions and each position is shifted by
+    its row's offset (`row_index * seq_length`) so one `tf.gather` suffices.
+    """
+    batch_size, seq_length, width = modeling.get_shape_list(
+        sequence_tensor, expected_rank=3)
+
+    row_offsets = tf.reshape(
+        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
+    flat_positions = tf.reshape(positions + row_offsets, [-1])
+    flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
+    return tf.gather(flat_sequence, flat_positions)
+
+
+def input_fn_builder(input_files,
+                     batch_size,
+                     max_seq_length,
+                     max_predictions_per_seq,
+                     is_training,
+                     num_cpu_threads=4,
+                     hvd=None):
+    """Creates an `input_fn` closure to be passed to Estimator.
+
+    Args:
+      input_files: List of TFRecord file paths.
+      batch_size: Number of examples per batch.
+      max_seq_length: Fixed length of the `input_ids`, `input_mask` and
+        `segment_ids` features.
+      max_predictions_per_seq: Fixed length of the `masked_lm_*` features.
+      is_training: When True the files are (optionally) sharded, repeated,
+        shuffled and interleaved; otherwise they are read sequentially and
+        repeated.
+      num_cpu_threads: Upper bound for the interleave cycle length and the
+        number of parallel batches.
+      hvd: Accepted for interface compatibility; not used by this function.
+
+    Returns:
+      A zero-argument `input_fn` that returns the batched `tf.data.Dataset`.
+    """
+
+    def input_fn():
+        """The actual input function."""
+
+        name_to_features = {
+            "input_ids":
+                tf.FixedLenFeature([max_seq_length], tf.int64),
+            "input_mask":
+                tf.FixedLenFeature([max_seq_length], tf.int64),
+            "segment_ids":
+                tf.FixedLenFeature([max_seq_length], tf.int64),
+            "masked_lm_positions":
+                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+            "masked_lm_ids":
+                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+            "masked_lm_weights":
+                tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
+            "next_sentence_labels":
+                tf.FixedLenFeature([1], tf.int64),
+        }
+
+        # For training, we want a lot of parallel reading and shuffling.
+        # For eval, we want no shuffling and parallel reading doesn't matter.
+        if is_training:
+            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
+            if FLAGS.distributed:
+                rank_size = int(os.getenv('RANK_SIZE'))
+                rank_id = int(os.getenv('RANK_ID'))
+                print('RANK_SIZE=', rank_size, ' rank_id=', rank_id)
+                d = d.shard(rank_size, rank_id)
+            d = d.repeat()
+            if not FLAGS.npu_bert_debug:
+                d = d.shuffle(buffer_size=len(input_files))
+
+            # `cycle_length` is the number of parallel files that get read.
+            if not FLAGS.npu_bert_debug:
+                # RANK_SIZE may be unset in a single-device run; treat that as
+                # one rank instead of failing on int(None).
+                files_per_rank = len(input_files) // int(os.getenv('RANK_SIZE', '1'))
+                # interleave() requires a positive cycle length, so never go
+                # below 1 even when there are fewer files than ranks.
+                cycle_length = max(1, min(num_cpu_threads, files_per_rank))
+            else:
+                cycle_length = 1
+
+            d = d.interleave(tf.data.TFRecordDataset, cycle_length=cycle_length,
+                             num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+            if not FLAGS.npu_bert_debug:
+                d = d.shuffle(buffer_size=100)
+        else:
+            d = tf.data.TFRecordDataset(input_files)
+            # Since we evaluate for a fixed number of steps we don't want to encounter
+            # out-of-range exceptions.
+            d = d.repeat()
+
+        # `drop_remainder=True` is applied in both modes, so a trailing
+        # partial batch is always discarded and every batch has `batch_size`
+        # examples.
+        d = d.apply(
+            tf.contrib.data.map_and_batch(
+                lambda record: _decode_record(record, name_to_features),
+                batch_size=batch_size,
+                num_parallel_batches=num_cpu_threads,
+                drop_remainder=True))
+        return d
+
+    return input_fn
+
+
+def _decode_record(record, name_to_features):
+    """Parse one serialized example and narrow its int64 features to int32."""
+    parsed = tf.parse_single_example(record, name_to_features)
+
+    # Iterate over a snapshot of the items because entries are replaced
+    # while looping.
+    for key, tensor in list(parsed.items()):
+        if tensor.dtype == tf.int64:
+            parsed[key] = tf.to_int32(tensor)
+
+    return parsed
+
+
+def main(_):
+    for name, value in FLAGS.__flags.items():
+        print("name:", name, "      ", FLAGS[name].value)
+
+    tf.logging.set_verbosity(tf.logging.INFO)
+
+    if not FLAGS.do_train and not FLAGS.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    if FLAGS.use_fp16:
+        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+
+    if FLAGS.horovod:
+        import horovod.tensorflow as hvd
+        hvd.init()
+
+    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+    if FLAGS.npu_gather:
+        if FLAGS.distributed and bert_config.num_hidden_layers == 24:
+            from hccl.split.api import set_split_strategy_by_idx
+            set_split_strategy_by_idx([49, 113, 177, 241, 305, 353, 385, 397])
+        if FLAGS.distributed and bert_config.num_hidden_layers == 12:
+            from hccl.split.api import set_split_strategy_by_idx
+            set_split_strategy_by_idx([8, 56, 104, 152, 200, 205])
+        if FLAGS.distributed and bert_config.num_hidden_layers == 6:
+            from hccl.split.api import set_split_strategy_by_idx
+            set_split_strategy_by_idx([8, 40, 72, 104, 109])
+
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+    input_files = []
+    for input_file_dir in FLAGS.input_files_dir.split(","):
+        input_files.extend(tf.gfile.Glob(os.path.join(input_file_dir, "*")))
+
+    input_files.sort()
+    print("Input Files:", input_files)
+
+    if FLAGS.horovod and len(input_files) < hvd.size():
+        raise ValueError("Input Files must be sharded")
+    if FLAGS.use_fp16 and FLAGS.manual_fp16:
+        raise ValueError("AMP and Manual Mixed Precision Training are both activated! Error")
+
+    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+    config = tf.ConfigProto()
+    if FLAGS.horovod:
+        config.gpu_options.visible_device_list = str(hvd.local_rank())
+        if hvd.rank() == 0:
+            tf.logging.info("***** Configuaration *****")
+            for key in FLAGS.__flags.keys():
+                tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+            tf.logging.info("**************************")
+
+        #    config.gpu_options.per_process_gpu_memory_fraction = 0.7
+    if FLAGS.use_xla:
+        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+        config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT
+
+    # run_config = tf.estimator.RunConfig(
+    run_config = NPURunConfig(
+        model_dir=FLAGS.output_dir,
+        save_summary_steps=0,
+        session_config=config,
+        save_checkpoints_steps=FLAGS.save_checkpoints_steps if not FLAGS.horovod or hvd.rank() == 0 else None,
+        # This variable controls how often estimator reports examples/sec.
+        # Default value is every 100 steps.
+        # When --report_loss is True, we set to very large value to prevent
+        # default info reporting from estimator.
+        # Ideally we should set it to None, but that does not work.
+        log_step_count_steps=1 if FLAGS.report_loss else 100,
+        enable_data_pre_proc=FLAGS.npu_bert_use_tdt,
+        iterations_per_loop=FLAGS.iterations_per_loop,
+        is_tailing_optimization=FLAGS.npu_bert_tail_optimize,
+        hcom_parallel=FLAGS.hcom_parallel,
+        graph_memory_max_size=FLAGS.graph_memory_max_size,
+        variable_memory_max_size=FLAGS.variable_memory_max_size)
+
+    if FLAGS.distributed:
+        rank_size = int(os.getenv('RANK_SIZE'))
+    model_fn = model_fn_builder(
+        bert_config=bert_config,
+        init_checkpoint=FLAGS.init_checkpoint,
+        learning_rate=FLAGS.learning_rate if not (FLAGS.distributed) else FLAGS.learning_rate * rank_size,
+        num_train_steps=FLAGS.num_train_steps,
+        num_warmup_steps=FLAGS.num_warmup_steps,
+        use_one_hot_embeddings=False,
+        hvd=None if not FLAGS.horovod else hvd)
+
+    training_hooks = []
+    """
+    if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
+      global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.horovod else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size()
+      training_hooks.append(_LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
+    if FLAGS.horovod and hvd.size() > 1:
+      training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
+    """
+    if FLAGS.report_loss:
+        global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps if not FLAGS.distributed else FLAGS.train_batch_size * FLAGS.num_accumulation_steps * rank_size
+        training_hooks.append(
+            _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps, FLAGS.display_loss_steps))
+
+    # estimator = tf.estimator.Estimator(
+    estimator = NPUEstimator(
+        model_fn=model_fn,
+        config=run_config,
+        job_start_file=FLAGS.npu_bert_job_start_file)
+
+    if FLAGS.do_train:
+        tf.logging.info("***** Running training *****")
+        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+        train_input_fn = input_fn_builder(
+            input_files=input_files,
+            batch_size=FLAGS.train_batch_size,
+            max_seq_length=FLAGS.max_seq_length,
+            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+            is_training=True,
+            hvd=None if not FLAGS.horovod else hvd)
+
+        estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
+
+    if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
+        tf.logging.info("***** Running evaluation *****")
+        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+        eval_files = []
+        for eval_file_dir in FLAGS.eval_files_dir.split(","):
+            eval_files.extend(tf.gfile.Glob(os.path.join(eval_file_dir, "*")))
+
+        eval_input_fn = input_fn_builder(
+            input_files=eval_files,
+            batch_size=FLAGS.eval_batch_size,
+            max_seq_length=FLAGS.max_seq_length,
+            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+            is_training=False,
+            hvd=None if not FLAGS.horovod else hvd)
+
+        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
+        eval_start_time = time.time()
+        result = estimator.evaluate(
+            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
+
+        eval_time_elapsed = time.time() - eval_start_time
+        eval_time_wo_overhead = eval_hooks[-1].total_time
+
+        num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size
+
+        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
+
+        tf.logging.info("-----------------------------")
+        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
+                        eval_hooks[-1].count * FLAGS.eval_batch_size)
+        tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
+                        (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.eval_batch_size)
+        tf.logging.info("Summary Inference Statistics on EVAL set")
+        tf.logging.info("Batch size = %d", FLAGS.eval_batch_size)
+        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+        tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+        tf.logging.info("-----------------------------")
+
+        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+        with tf.gfile.GFile(output_eval_file, "w") as writer:
+            tf.logging.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                tf.logging.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+                if key == 'masked_lm_accuracy':
+                    hwlog.remark_print(key=hwlog.MASKED_LM_ACCURACY, value=str(result[key]))
+                elif key == 'next_sentence_accuracy ':
+                    hwlog.remark_print(key=hwlog.NEXT_SENTENCE_ACCURACY, value=str(result[key]))
+                elif key == 'global_step':
+                    hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=str(result[key]))
+                elif key == 'loss':
+                    hwlog.remark_print(key=hwlog.LOSS, value=str(result[key]))
+                elif key == 'masked_lm_loss':
+                    hwlog.remark_print(key=hwlog.MASKED_LM_LOSS, value=str(result[key]))
+                elif key == 'next_sentence_loss ':
+                    hwlog.remark_print(key=hwlog.NEXT_SENTENCE_LOSS, value=str(result[key]))
+                else:
+                    pass
+
+
+if __name__ == "__main__":
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+    config_info = get_model_parameter("tensorflow_config")
+    initinal_data = {"base_lr": 0.01, "dataset": "cn-clue/en-wiki", "optimizer": "Adam", "loss_scale": 512}
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    flags.mark_flag_as_required("input_files_dir")
+    flags.mark_flag_as_required("eval_files_dir")
+    flags.mark_flag_as_required("bert_config_file")
+    flags.mark_flag_as_required("output_dir")
+    flags.mark_flag_as_required("npu_bert_job_start_file")
+    if FLAGS.use_xla and FLAGS.manual_fp16:
+        print('WARNING! Combining --use_xla with --manual_fp16 may prevent convergence.')
+        print('         This warning message will be removed when the underlying')
+        print('         issues have been fixed and you are running a TF version')
+        print('         that has that fix.')
+    tf.app.run()
+
@@ -0,0 +1,939 @@
+# coding=utf-8
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import csv
+import logging
+import os, sys
+import numpy as np
+
+import tensorflow as tf
+
+sys.path.append("/workspace/bert")
+
+import modeling
+import optimization
+import tokenization
+
+import time
+import horovod.tensorflow as hvd
+from utils.utils import LogEvalRunHook, LogTrainRunHook
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+## Required parameters
+flags.DEFINE_string(
+    "data_dir", None,
+    "The input data dir. Should contain the .tsv files (or other data files) "
+    "for the task.")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string("task_name", None, "The name of the task to train.")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written.")
+
+## Other parameters
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_bool("do_train", False, "Whether to run training.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_bool(
+    "do_predict", False,
+    "Whether to run the model in inference mode on the test set.")
+
+flags.DEFINE_integer("train_batch_size", 16, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
+
+flags.DEFINE_float("learning_rate", 5e-6, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("num_train_epochs", 3.0,
+                   "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+    "warmup_proportion", 0.1,
+    "Proportion of training to perform linear learning rate warmup for. "
+    "E.g., 0.1 = 10% of training.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 1000,
+                     "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+
+tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
+
+flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs")
+flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.")
+flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.")
+
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs a InputExample.
+
+        Args:
+          guid: Unique id for the example.
+          text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+          text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+          label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+class PaddingInputExample(object):
+    """Fake example so the num input examples is a multiple of the batch size.
+
+    When running eval/predict on the TPU, we need to pad the number of examples
+    to be a multiple of the batch size, because the TPU requires a fixed batch
+    size. The alternative is to drop the last batch, which is bad because it means
+    the entire output data won't be generated.
+
+    We use this class instead of `None` because treating `None` as padding
+    battches could cause silent errors.
+    """
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 label_id,
+                 is_real_example=True):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+        self.is_real_example = is_real_example
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for prediction."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with tf.gfile.Open(input_file, "r") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                lines.append(line)
+            return lines
+
+
+class _ChemProtProcessor(DataProcessor):
+    """Processor for the ChemProt data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir, file_name="dev.tsv"):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, file_name)), "dev")
+
+    def get_test_examples(self, data_dir, file_name="test.tsv"):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, file_name)), "test")
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            # skip header
+            if i == 0:
+                continue
+            guid = line[0]
+            text_a = tokenization.convert_to_unicode(line[1])
+            if set_type == "test":
+                label = self.get_labels()[-1]
+            else:
+                try:
+                    label = tokenization.convert_to_unicode(line[2])
+                except IndexError:
+                    logging.exception(line)
+                    exit(1)
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class ChemProtProcessor(_ChemProtProcessor):
+    def get_labels(self):
+        """See base class."""
+        return ["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9", "false"]
+
+
+
+class MedNLIProcessor(DataProcessor):
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir, file_name="dev.tsv"):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, file_name)), "dev")
+
+    def get_test_examples(self, data_dir, file_name="test.tsv"):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, file_name)), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ['contradiction', 'entailment', 'neutral']
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = line[1]
+            text_a = tokenization.convert_to_unicode(line[2])
+            text_b = tokenization.convert_to_unicode(line[3])
+            if set_type == "test":
+                label = self.get_labels()[-1]
+            else:
+                label = tokenization.convert_to_unicode(line[0])
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer):
+    """Converts a single `InputExample` into a single `InputFeatures`."""
+
+    if isinstance(example, PaddingInputExample):
+        return InputFeatures(
+            input_ids=[0] * max_seq_length,
+            input_mask=[0] * max_seq_length,
+            segment_ids=[0] * max_seq_length,
+            label_id=0,
+            is_real_example=False)
+
+    label_map = {}
+    for (i, label) in enumerate(label_list):
+        label_map[label] = i
+
+    tokens_a = tokenizer.tokenize(example.text_a)
+    tokens_b = None
+    if example.text_b:
+        tokens_b = tokenizer.tokenize(example.text_b)
+
+    if tokens_b:
+        # Modifies `tokens_a` and `tokens_b` in place so that the total
+        # length is less than the specified length.
+        # Account for [CLS], [SEP], [SEP] with "- 3"
+        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+    else:
+        # Account for [CLS] and [SEP] with "- 2"
+        if len(tokens_a) > max_seq_length - 2:
+            tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+    # The convention in BERT is:
+    # (a) For sequence pairs:
+    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+    # (b) For single sequences:
+    #  tokens:   [CLS] the dog is hairy . [SEP]
+    #  type_ids: 0     0   0   0  0     0 0
+    #
+    # Where "type_ids" are used to indicate whether this is the first
+    # sequence or the second sequence. The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    segment_ids = []
+    tokens.append("[CLS]")
+    segment_ids.append(0)
+    for token in tokens_a:
+        tokens.append(token)
+        segment_ids.append(0)
+    tokens.append("[SEP]")
+    segment_ids.append(0)
+
+    if tokens_b:
+        for token in tokens_b:
+            tokens.append(token)
+            segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < max_seq_length:
+        input_ids.append(0)
+        input_mask.append(0)
+        segment_ids.append(0)
+
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+
+    label_id = label_map[example.label]
+    if ex_index < 5:
+        tf.logging.info("*** Example ***")
+        tf.logging.info("guid: %s" % (example.guid))
+        tf.logging.info("tokens: %s" % " ".join(
+            [tokenization.printable_text(x) for x in tokens]))
+        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+        tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
+
+    feature = InputFeatures(
+        input_ids=input_ids,
+        input_mask=input_mask,
+        segment_ids=segment_ids,
+        label_id=label_id,
+        is_real_example=True)
+    return feature
+
+
+def file_based_convert_examples_to_features(
+        examples, label_list, max_seq_length, tokenizer, output_file):
+    """Convert a set of `InputExample`s to a TFRecord file."""
+
+    writer = tf.python_io.TFRecordWriter(output_file)
+
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+        feature = convert_single_example(ex_index, example, label_list,
+                                         max_seq_length, tokenizer)
+
+        def create_int_feature(values):
+            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+            return f
+
+        features = collections.OrderedDict()
+        features["input_ids"] = create_int_feature(feature.input_ids)
+        features["input_mask"] = create_int_feature(feature.input_mask)
+        features["segment_ids"] = create_int_feature(feature.segment_ids)
+        features["label_ids"] = create_int_feature([feature.label_id])
+        features["is_real_example"] = create_int_feature(
+            [int(feature.is_real_example)])
+
+        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+        writer.write(tf_example.SerializeToString())
+    writer.close()
+
+
+def file_based_input_fn_builder(input_file, batch_size, seq_length, is_training,
+                                drop_remainder, hvd=None):
+    """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+    name_to_features = {
+        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
+        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
+        "label_ids": tf.FixedLenFeature([], tf.int64),
+        "is_real_example": tf.FixedLenFeature([], tf.int64),
+    }
+
+    def _decode_record(record, name_to_features):
+        """Decodes a record to a TensorFlow example."""
+        example = tf.parse_single_example(record, name_to_features)
+
+        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+        # So cast all int64 to int32.
+        for name in list(example.keys()):
+            t = example[name]
+            if t.dtype == tf.int64:
+                t = tf.to_int32(t)
+            example[name] = t
+
+        return example
+
+    def input_fn(params):
+        """The actual input function."""
+        #batch_size = params["batch_size"]
+
+        # For training, we want a lot of parallel reading and shuffling.
+        # For eval, we want no shuffling and parallel reading doesn't matter.
+        d = tf.data.TFRecordDataset(input_file)
+        if is_training:
+            if hvd is not None: d = d.shard(hvd.size(), hvd.rank())
+            d = d.repeat()
+            d = d.shuffle(buffer_size=100)
+
+        d = d.apply(
+            tf.contrib.data.map_and_batch(
+                lambda record: _decode_record(record, name_to_features),
+                batch_size=batch_size,
+                drop_remainder=drop_remainder))
+
+        return d
+
+    return input_fn
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
+                 labels, num_labels, use_one_hot_embeddings):
+    """Creates a classification model."""
+    model = modeling.BertModel(
+        config=bert_config,
+        is_training=is_training,
+        input_ids=input_ids,
+        input_mask=input_mask,
+        token_type_ids=segment_ids,
+        use_one_hot_embeddings=use_one_hot_embeddings)
+
+    # In the demo, we are doing a simple classification task on the entire
+    # segment.
+    #
+    # If you want to use the token-level output, use model.get_sequence_output()
+    # instead.
+    output_layer = model.get_pooled_output()
+
+    hidden_size = output_layer.shape[-1].value
+
+    output_weights = tf.get_variable(
+        "output_weights", [num_labels, hidden_size],
+        initializer=tf.truncated_normal_initializer(stddev=0.02))
+
+    output_bias = tf.get_variable(
+        "output_bias", [num_labels], initializer=tf.zeros_initializer())
+
+    with tf.variable_scope("loss"):
+        if is_training:
+            # I.e., 0.1 dropout
+            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+
+        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
+        logits = tf.nn.bias_add(logits, output_bias)
+        probabilities = tf.nn.softmax(logits, axis=-1)
+        log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+
+        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+        loss = tf.reduce_mean(per_example_loss)
+
+        return (loss, per_example_loss, logits, probabilities)
+
+
+def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate=None,
+                     num_train_steps=None, num_warmup_steps=None,
+                     use_one_hot_embeddings=False, hvd=None, use_fp16=False):
+    """Returns `model_fn` closure for TPUEstimator."""
+
+    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+        """The `model_fn` for TPUEstimator."""
+
+        tf.logging.info("*** Features ***")
+        for name in sorted(features.keys()):
+            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+        input_ids = features["input_ids"]
+        input_mask = features["input_mask"]
+        segment_ids = features["segment_ids"]
+        label_ids = features["label_ids"]
+        is_real_example = None
+        if "is_real_example" in features:
+            is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
+        else:
+            is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
+
+        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+        (total_loss, per_example_loss, logits, probabilities) = create_model(
+            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
+            num_labels, use_one_hot_embeddings)
+
+        tvars = tf.trainable_variables()
+        initialized_variable_names = {}
+        scaffold_fn = None
+        if init_checkpoint and (hvd is None or hvd.rank() == 0):
+            (assignment_map, initialized_variable_names
+             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+        tf.logging.info("**** Trainable Variables ****")
+        for var in tvars:
+            init_string = ""
+            if var.name in initialized_variable_names:
+                init_string = ", *INIT_FROM_CKPT*"
+            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                            init_string)
+
+        output_spec = None
+        if mode == tf.estimator.ModeKeys.TRAIN:
+
+            train_op = optimization.create_optimizer(
+                total_loss, learning_rate, num_train_steps, num_warmup_steps, hvd, False, use_fp16)
+
+            output_spec = tf.estimator.EstimatorSpec(
+              mode=mode,
+              loss=total_loss,
+              train_op=train_op)
+        elif mode == tf.estimator.ModeKeys.EVAL:
+
+            def metric_fn(per_example_loss, label_ids, logits, is_real_example):
+                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+                accuracy = tf.metrics.accuracy(
+                    labels=label_ids, predictions=predictions, weights=is_real_example)
+                loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
+                return {
+                    "eval_accuracy": accuracy,
+                    "eval_loss": loss,
+                }
+
+            eval_metric_ops = metric_fn(per_example_loss, label_ids, logits, is_real_example)
+            output_spec = tf.estimator.EstimatorSpec(
+              mode=mode,
+              loss=total_loss,
+              eval_metric_ops=eval_metric_ops)
+        else:
+            output_spec = tf.estimator.EstimatorSpec(
+                    mode=mode, predictions={"probabilities": probabilities})#predicts)#probabilities)
+        return output_spec
+
+    return model_fn
+
+
+# This function is not used by this file but is still used by the Colab and
+# people who depend on it.
+def input_fn_builder(features, seq_length, is_training, drop_remainder):
+    """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+    all_input_ids = []
+    all_input_mask = []
+    all_segment_ids = []
+    all_label_ids = []
+
+    for feature in features:
+        all_input_ids.append(feature.input_ids)
+        all_input_mask.append(feature.input_mask)
+        all_segment_ids.append(feature.segment_ids)
+        all_label_ids.append(feature.label_id)
+
+    def input_fn(params):
+        """The actual input function."""
+        batch_size = params["batch_size"]
+
+        num_examples = len(features)
+
+        # This is for demo purposes and does NOT scale to large data sets. We do
+        # not use Dataset.from_generator() because that uses tf.py_func which is
+        # not TPU compatible. The right way to load data is with TFRecordReader.
+        d = tf.data.Dataset.from_tensor_slices({
+            "input_ids":
+                tf.constant(
+                    all_input_ids, shape=[num_examples, seq_length],
+                    dtype=tf.int32),
+            "input_mask":
+                tf.constant(
+                    all_input_mask,
+                    shape=[num_examples, seq_length],
+                    dtype=tf.int32),
+            "segment_ids":
+                tf.constant(
+                    all_segment_ids,
+                    shape=[num_examples, seq_length],
+                    dtype=tf.int32),
+            "label_ids":
+                tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
+        })
+
+        if is_training:
+            d = d.repeat()
+            d = d.shuffle(buffer_size=100)
+
+        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
+        return d
+
+    return input_fn
+
+
+# This function is not used by this file but is still used by the Colab and
+# people who depend on it.
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer):
+    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+        feature = convert_single_example(ex_index, example, label_list,
+                                         max_seq_length, tokenizer)
+
+        features.append(feature)
+    return features
+
+
+def main(_):
+    tf.logging.set_verbosity(tf.logging.INFO)
+
+    if FLAGS.horovod:
+      hvd.init()
+    if FLAGS.use_fp16:
+        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
+
+    processors = {
+        "chemprot": ChemProtProcessor,
+        'mednli': MedNLIProcessor,
+    }
+
+    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                  FLAGS.init_checkpoint)
+
+    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+        raise ValueError(
+            "At least one of `do_train`, `do_eval` or `do_predict' must be True.")
+
+    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+        raise ValueError(
+            "Cannot use sequence length %d because the BERT model "
+            "was only trained up to sequence length %d" %
+            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+    task_name = FLAGS.task_name.lower()
+
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+
+    processor = processors[task_name]()
+
+    label_list = processor.get_labels()
+
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+
+    master_process = True
+    training_hooks = []
+    global_batch_size = FLAGS.train_batch_size
+    hvd_rank = 0
+
+    config = tf.ConfigProto()
+    if FLAGS.horovod:
+      global_batch_size = FLAGS.train_batch_size * hvd.size()
+      master_process = (hvd.rank() == 0)
+      hvd_rank = hvd.rank()
+      config.gpu_options.visible_device_list = str(hvd.local_rank())
+      if hvd.size() > 1:
+        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
+
+    if FLAGS.use_xla:
+        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+    run_config = tf.estimator.RunConfig(
+      model_dir=FLAGS.output_dir if master_process else None,
+      session_config=config,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None,
+      keep_checkpoint_max=1)
+
+    if master_process:
+      tf.logging.info("***** Configuaration *****")
+      for key in FLAGS.__flags.keys():
+          tf.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
+      tf.logging.info("**************************")
+
+    train_examples = None
+    num_train_steps = None
+    num_warmup_steps = None
+
+    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))
+
+    if FLAGS.do_train:
+        train_examples = processor.get_train_examples(FLAGS.data_dir)
+        num_train_steps = int(
+            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
+        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+        start_index = 0
+        end_index = len(train_examples)
+        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]
+
+        if FLAGS.horovod:
+          tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i)) for i in range(hvd.size())]
+          num_examples_per_rank = len(train_examples) // hvd.size()
+          remainder = len(train_examples) % hvd.size()
+          if hvd.rank() < remainder:
+            start_index = hvd.rank() * (num_examples_per_rank+1)
+            end_index = start_index + num_examples_per_rank + 1
+          else:
+            start_index = hvd.rank() * num_examples_per_rank + remainder
+            end_index = start_index + (num_examples_per_rank)
+
+
+    model_fn = model_fn_builder(
+        bert_config=bert_config,
+        num_labels=len(label_list),
+        init_checkpoint=FLAGS.init_checkpoint,
+        learning_rate=FLAGS.learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(),
+        num_train_steps=num_train_steps,
+        num_warmup_steps=num_warmup_steps,
+        use_one_hot_embeddings=False,
+        hvd=None if not FLAGS.horovod else hvd,
+        use_fp16=FLAGS.use_fp16)
+
+    estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      config=run_config)
+
+
+    if FLAGS.do_train:
+        file_based_convert_examples_to_features(
+          train_examples[start_index:end_index], label_list, FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
+        tf.logging.info("***** Running training *****")
+        tf.logging.info("  Num examples = %d", len(train_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+        tf.logging.info("  Num steps = %d", num_train_steps)
+        train_input_fn = file_based_input_fn_builder(
+            input_file=tmp_filenames,
+            batch_size=FLAGS.train_batch_size,
+            seq_length=FLAGS.max_seq_length,
+            is_training=True,
+            drop_remainder=True,
+            hvd=None if not FLAGS.horovod else hvd)
+
+        train_start_time = time.time()
+        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=training_hooks)
+        train_time_elapsed = time.time() - train_start_time
+        train_time_wo_overhead = training_hooks[-1].total_time
+        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
+        ss_sentences_per_second = (num_train_steps - training_hooks[-1].skipped) * global_batch_size * 1.0 / train_time_wo_overhead
+
+        if master_process:
+          tf.logging.info("-----------------------------")
+          tf.logging.info("Total Training Time = %0.2f for Sentences = %d", train_time_elapsed,
+                        num_train_steps * global_batch_size)
+          tf.logging.info("Total Training Time W/O Overhead = %0.2f for Sentences = %d", train_time_wo_overhead,
+                        (num_train_steps - training_hooks[-1].skipped) * global_batch_size)
+          tf.logging.info("Throughput Average (sentences/sec) with overhead = %0.2f", avg_sentences_per_second)
+          tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+          tf.logging.info("-----------------------------")
+
+
+    if FLAGS.do_eval and master_process:
+        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+        num_actual_eval_examples = len(eval_examples)
+
+
+        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
+        file_based_convert_examples_to_features(
+            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+        tf.logging.info("***** Running evaluation *****")
+        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                        len(eval_examples), num_actual_eval_examples,
+                        len(eval_examples) - num_actual_eval_examples)
+        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+        # This tells the estimator to run through the entire set.
+        eval_steps = None
+
+        eval_drop_remainder = False
+        eval_input_fn = file_based_input_fn_builder(
+            input_file=eval_file,
+            batch_size=FLAGS.eval_batch_size,
+            seq_length=FLAGS.max_seq_length,
+            is_training=False,
+            drop_remainder=eval_drop_remainder)
+
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+
+        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+        with tf.gfile.GFile(output_eval_file, "w") as writer:
+            tf.logging.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                tf.logging.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+    if FLAGS.do_predict and master_process:
+        predict_examples = processor.get_test_examples(FLAGS.data_dir)
+        num_actual_predict_examples = len(predict_examples)
+
+        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+        file_based_convert_examples_to_features(predict_examples, label_list,
+                                                FLAGS.max_seq_length, tokenizer,
+                                                predict_file)
+
+        tf.logging.info("***** Running prediction*****")
+        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                        len(predict_examples), num_actual_predict_examples,
+                        len(predict_examples) - num_actual_predict_examples)
+        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+        predict_drop_remainder = False
+        predict_input_fn = file_based_input_fn_builder(
+            input_file=predict_file,
+            batch_size=FLAGS.predict_batch_size,
+            seq_length=FLAGS.max_seq_length,
+            is_training=False,
+            drop_remainder=predict_drop_remainder)
+
+        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
+        eval_start_time = time.time()
+
+
+        output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+        with tf.gfile.GFile(output_predict_file, "w") as writer:
+            num_written_lines = 0
+            tf.logging.info("***** Predict results *****")
+            for prediction in estimator.predict(input_fn=predict_input_fn, hooks=eval_hooks,
+                                                     yield_single_examples=True):
+                probabilities = prediction["probabilities"]
+                output_line = "\t".join(
+                    str(class_probability)
+                    for class_probability in probabilities) + "\n"
+                writer.write(output_line)
+                num_written_lines += 1
+        assert num_written_lines == num_actual_predict_examples
+
+        eval_time_elapsed = time.time() - eval_start_time
+        eval_time_wo_overhead = eval_hooks[-1].total_time
+
+        time_list = eval_hooks[-1].time_list
+        time_list.sort()
+        num_sentences = (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size
+
+        avg = np.mean(time_list)
+        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
+        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
+        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
+        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
+        cf_100 = max(time_list[:int(len(time_list) * 1)])
+        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead
+
+        tf.logging.info("-----------------------------")
+        tf.logging.info("Total Inference Time = %0.2f for Sentences = %d", eval_time_elapsed,
+                        eval_hooks[-1].count * FLAGS.predict_batch_size)
+        tf.logging.info("Total Inference Time W/O Overhead = %0.2f for Sentences = %d", eval_time_wo_overhead,
+                        (eval_hooks[-1].count - eval_hooks[-1].skipped) * FLAGS.predict_batch_size)
+        tf.logging.info("Summary Inference Statistics")
+        tf.logging.info("Batch size = %d", FLAGS.predict_batch_size)
+        tf.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
+        tf.logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
+        tf.logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
+        tf.logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
+        tf.logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
+        tf.logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
+        tf.logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
+        tf.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
+        tf.logging.info("Throughput Average (sentences/sec) = %0.2f", ss_sentences_per_second)
+        tf.logging.info("-----------------------------")
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("data_dir")
+    flags.mark_flag_as_required("task_name")
+    flags.mark_flag_as_required("vocab_file")
+    flags.mark_flag_as_required("bert_config_file")
+    flags.mark_flag_as_required("output_dir")
+    tf.app.run()
--- a/Show More
+++ b/Show More