[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,46 @@
+# ResNext50_tensorflow训练说明
+
+### 1. 模型训练参数配置
+
+在train/yaml/ResNext50.yaml中修改相应配置， 配置项含义:
+
+```
+tensorflow_config:
+    # 基本参数
+    max_steps: 1000
+    data_url: /home/imagenet_TF/
+    epoches: 1
+    epochs_between_evals: 1
+    batch_size: 32
+
+    # 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
+    mpirun_ip: 90.90.176.152:8,90.90.176.154:8
+
+    # docker 镜像名称:版本号
+    docker_image: mpirun3:latest
+
+
+    # 1. 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
+    # 2. 仅在小于 8p 时生效
+    # 3. 若不使用该配置, 请使用在行首添加'#'注释的方法将其关闭
+    # device_group: 0 1 2 3
+    device_group_1p: 0
+    device_group_2p: 0 1
+    device_group_4p: 0 1 2 3
+    
+    profiling_mode: false
+    profiling_options: training_trace
+    fp_point: fp32_vars/conv2d/Conv2Dfp32_vars/BatchNorm/FusedBatchNormV3_Reduce
+    bp_point: loss_scale/gradients/AddN_70
+    aicpu_profiling_mode: false
+```
+
+------
+
+
+
+
+
+
+
+    
@@ -0,0 +1 @@
+13650
@@ -0,0 +1,115 @@
+import tensorflow as tf
+
+import os
+log_dir = '../result/'+os.path.basename(__file__).split('.')[0]
+
+#256
+config = {
+    # ============ for testing =====================
+    'accelerator': '1980',    # 'gpu', '1980' 
+    'shuffle_enable': 'yes',
+    'shuffle_buffer_size': 10000,
+    'rank_size': 1, 
+    'shard': False,
+
+    # ======= basic config ======= # 
+    'mode':'train',                                         # "train","evaluate","train_and_evaluate"
+    'epochs_between_evals': 4,                              #used if mode is "train_and_evaluate"
+    'stop_threshold': 80.0,                                 #used if mode is "train_and_evaluate"
+    #'data_dir':'/opt/npu/resnet_data_new',
+    'data_url': '/home/imagenet_TF',  #data
+    'data_type': 'TFRECORD',
+    'model_name': 'resnet50', 
+    'num_classes': 1001,
+    'num_epochs': 1,
+    'height':224,
+    'width':224, 
+    'dtype': tf.float32,
+    'data_format': 'channels_last',
+    'use_nesterov': True,
+    'eval_interval': 1,
+    'num_evaluating_samples': 50000, 
+    'loss_scale': 1024,                                #could be float or string. If float, static loss scaling is applied. 
+                                                            #If string, the corresponding automatic loss scaling algorithm is used.
+                                                            #Must be one of 'Backoff' or 'LogMax' (case insensitive).
+    'use_lars': False,
+    'label_smoothing':0.1,                                  #If greater than 0 then smooth the labels.
+    'weight_decay': 0.0001,
+    'batch_size':32,                                        #minibatch size per node, total batchsize = batch_size*hvd.size()*itersize
+                               
+    'momentum': [0.9],
+
+    #=======  data processing config =======
+    'min_object_covered': 0.1,                              #used for random crop
+    'aspect_ratio_range':[3. / 4., 4. / 3.],
+    'area_range':[0.16, 1.0],
+    'max_attempts': 100,
+
+    #=======  data augment config ======= 
+    'increased_aug': False,
+    'brightness':0.3,
+    'saturation': 0.6,
+    'contrast': 0.6,
+    'hue': 0.13,
+    'num_preproc_threads': 22,
+
+    #=======  initialization config ======= 
+    'conv_init': tf.variance_scaling_initializer(),
+    'bn_init_mode': 'adv_bn_init',                         # "conv_bn_init" or "adv_bn_init",initializer the gamma in bn in different modes
+                                                            # "adv_bn_init" means initialize gamma to 0 in each residual block's last bn, and initialize other gamma to 1
+                                                            # "conv_bn_init" means initialize all the gamma to a constant, defined by "bn_gamma_initial_value"
+    'bn_gamma_initial_value': 1.0,
+
+    #======== model architecture ==========
+    #'resnet_version': 'v1.5',  
+    'resnet_version': 'resnext',
+    'arch_type': 'original',                                   # ------ input -------
+                                                            # C1,C2,C3: input block, stride in different layer
+                                                            # ------ shortcut ------
+                                                            # D1: average_pooling + conv1*1 in shortcut  in downsample block
+                                                            # D2: conv3*3,stride=2 in shortcut in downsample block
+                                                            # D3: conv1*1 +average_pooling in shortcut  in downsample block
+                                                            # ------ mainstream ----
+                                                            # E1: average_pooling + conv3*3 in mainstream in downsample block  
+                                                            # E2: conv3*3 + average_pooling in mainstream in downsample block 
+
+    #=======  logger config ======= 
+    'display_every': 1,
+    'log_name': 'resnet50.log',
+    'log_dir': log_dir,
+    #'ckpt_dir': '/data/resnext50/opp2/ckpt0',   
+    'ckpt_dir': os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../result/ckpt0'),
+
+    #=======  Learning Rate Config ======= 
+    'lr_warmup_mode': 'linear',                             # "linear" or "cosine"
+    'warmup_lr': 0.0,
+    'warmup_epochs': 10,
+    'learning_rate_maximum': 0.1,                    
+
+    'lr_decay_mode': 'steps',                              # "steps", "poly", "poly_cycle", "cosine", "linear_cosine", "linear_twice", "constant" for 1980 only
+    'learning_rate_end': 0.00001,
+
+    'decay_steps': '10,20,30',                              #for "steps"
+    'lr_decay_steps': '6.4,0.64,0.064',
+
+    'ploy_power': 2.0,                                      #for "poly" and "poly_cycle"
+
+    'cdr_first_decay_ratio': 0.33,                          #for "cosine_decay_restarts"
+    'cdr_t_mul':2.0,
+    'cdr_m_mul':0.1,
+
+    'lc_periods':0.47,                                      #for "linear_consine"
+    'lc_beta':0.00001, 
+    
+    'lr_mid': 0.5,                                          #for "linear_twice"
+    'epoch_mid': 80,
+    
+    'bn_lr_scale':1.0,
+
+  }
+
+def res50_config():
+    config['global_batch_size'] = config['batch_size'] * config['rank_size']
+    config['do_checkpoint'] = True
+
+    return config
@@ -0,0 +1,115 @@
+import tensorflow as tf
+
+import os
+log_dir = '../result/'+os.path.basename(__file__).split('.')[0]
+
+#256
+config = {
+    # ============ for testing =====================
+    'accelerator': '1980',    # 'gpu', '1980' 
+    'shuffle_enable': 'yes',
+    'shuffle_buffer_size': 10000,
+    'rank_size': 8, 
+    'shard': True,
+
+    # ======= basic config ======= # 
+    'mode':'train',                                         # "train","evaluate","train_and_evaluate"
+    'epochs_between_evals': 4,                              #used if mode is "train_and_evaluate"
+    'stop_threshold': 80.0,                                 #used if mode is "train_and_evaluate"
+    'data_dir':'/opt/npu/resnet_data_new',
+    'data_url': '/home/imagenet_TF',
+    'data_type': 'TFRECORD',
+    'model_name': 'resnet50', 
+    'num_classes': 1001,
+    'num_epochs': 120,                #None
+    'height':224,
+    'width':224, 
+    'dtype': tf.float32,
+    'data_format': 'channels_last',
+    'use_nesterov': True,
+    'eval_interval': 1,
+    'loss_scale': 1024,                                #could be float or string. If float, static loss scaling is applied. 
+                                                            #If string, the corresponding automatic loss scaling algorithm is used.
+                                                            #Must be one of 'Backoff' or 'LogMax' (case insensitive).
+    'use_lars': False,
+    'label_smoothing':0.1,                                  #If greater than 0 then smooth the labels.
+    'weight_decay': 0.0001,
+    'batch_size':32,                                        #minibatch size per node, total batchsize = batch_size*hvd.size()*itersize
+                               
+    'momentum': [0.9],
+
+    #=======  data processing config =======
+    'min_object_covered': 0.1,                              #used for random crop
+    'aspect_ratio_range':[3. / 4., 4. / 3.],
+    'area_range':[0.16, 1.0],
+    'max_attempts': 100,
+
+    #=======  data augment config ======= 
+    'increased_aug': False,
+    'brightness':0.3,
+    'saturation': 0.6,
+    'contrast': 0.6,
+    'hue': 0.13,
+    'num_preproc_threads': 22,
+
+    #=======  initialization config ======= 
+    'conv_init': tf.variance_scaling_initializer(),
+    'bn_init_mode': 'adv_bn_init',                         # "conv_bn_init" or "adv_bn_init",initializer the gamma in bn in different modes
+                                                            # "adv_bn_init" means initialize gamma to 0 in each residual block's last bn, and initialize other gamma to 1
+                                                            # "conv_bn_init" means initialize all the gamma to a constant, defined by "bn_gamma_initial_value"
+    'bn_gamma_initial_value': 1.0,
+
+    #======== model architecture ==========
+    #'resnet_version': 'v1.5',  
+    'resnet_version': 'resnext',  
+    'arch_type': 'original',                                   # ------ input -------
+                                                            # C1,C2,C3: input block, stride in different layer
+                                                            # ------ shortcut ------
+                                                            # D1: average_pooling + conv1*1 in shortcut  in downsample block
+                                                            # D2: conv3*3,stride=2 in shortcut in downsample block
+                                                            # D3: conv1*1 +average_pooling in shortcut  in downsample block
+                                                            # ------ mainstream ----
+                                                            # E1: average_pooling + conv3*3 in mainstream in downsample block  
+                                                            # E2: conv3*3 + average_pooling in mainstream in downsample block 
+
+    #=======  logger config ======= 
+    'display_every': 1,
+    'log_name': 'resnet50.log',
+    #'ckpt_dir': '/data/resnext50_10w/ckpt0',                   #log_dir
+    #'ckpt_dir': '/d_solution/ckpt0',
+    'ckpt_dir': os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../result/ckpt0'),
+    'log_dir': log_dir,
+
+    #=======  Learning Rate Config ======= 
+    'lr_warmup_mode': 'linear',                             # "linear" or "cosine"
+    'warmup_lr': 0.0,
+    'warmup_epochs': 5,
+    'learning_rate_maximum': 0.1,                    
+
+    'lr_decay_mode': 'cosine',                              # "steps", "poly", "poly_cycle", "cosine", "linear_cosine", "linear_twice", "constant" for 1980 only
+    'learning_rate_end': 0.000001,
+
+    'decay_steps': '10,20,30',                              #for "steps"
+    'lr_decay_steps': '6.4,0.64,0.064',
+
+    'ploy_power': 2.0,                                      #for "poly" and "poly_cycle"
+
+    'cdr_first_decay_ratio': 0.33,                          #for "cosine_decay_restarts"
+    'cdr_t_mul':2.0,
+    'cdr_m_mul':0.1,
+
+    'lc_periods':0.47,                                      #for "linear_consine"
+    'lc_beta':0.00001, 
+    
+    'lr_mid': 0.5,                                          #for "linear_twice"
+    'epoch_mid': 80,
+    
+    'bn_lr_scale':1.0,
+
+  }
+
+def res50_config():
+    config['global_batch_size'] = config['batch_size'] * config['rank_size']
+    config['do_checkpoint'] = True
+
+    return config
@@ -0,0 +1,236 @@
+import numpy as np
+from . import preprocessing
+import tensorflow as tf
+from tensorflow.python.util import nest
+import os,sys
+import numpy as np 
+sys.path.append("..")
+from trainers.train_helper import stage
+
+class DataLoader:
+
+    def __init__(self, config):
+        self.config = config   
+
+        # dataset info
+        num_training_samples = 1281167
+        self.config['num_evaluating_samples'] = 50000
+        #num_evaluating_samples = get_num_records(self.eval_filenames)
+        self.config['num_training_samples'] = num_training_samples
+        print( 'total num_training_sampels: %d' %  num_training_samples )
+        
+        self.training_samples_per_rank = num_training_samples
+
+
+    def get_train_input_fn_synthetic(self):
+        batch_size = self.config['batch_size']
+        input_shape = [self.config['height'], self.config['width'], 3]
+        input_element = nest.map_structure(lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape))
+        label_element = nest.map_structure(lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1]))
+        element = (input_element, label_element)
+        ds = tf.data.Dataset.from_tensors(element).repeat()
+        ds = ds.batch(batch_size)
+        return ds
+        
+    def get_train_input_fn(self):
+        # filenames = self.train_filenames
+        filenames = None
+        take_count = self.training_samples_per_rank
+        batch_size = self.config['batch_size']
+        height = self.config['height']
+        width = self.config['width']
+        brightness = self.config['brightness']
+        contrast = self.config['contrast']
+        saturation = self.config['saturation']
+        hue = self.config['hue']
+        num_threads = self.config['num_preproc_threads']
+        increased_aug = self.config['increased_aug']
+        shard = self.config['shard']
+
+        return make_dataset(self.config, filenames, take_count, batch_size, height, width,
+                 brightness, contrast, saturation, hue,
+                 training=True, num_threads=num_threads, nsummary=10, shard=shard, synthetic=False,
+                 increased_aug=increased_aug )
+
+    def get_eval_input_fn(self):
+        # filenames = self.eval_filenames
+        filenames = None
+        # take_count = get_num_records(self.eval_filenames)
+        take_count = 50000
+        batch_size = self.config['batch_size']
+        height = self.config['height']
+        width = self.config['width']
+        brightness = self.config['brightness']
+        contrast = self.config['contrast']
+        saturation = self.config['saturation']
+        hue = self.config['hue'] 
+        num_threads = self.config['num_preproc_threads']
+        shard = self.config['shard']
+
+        return make_dataset(self.config, filenames, take_count, batch_size, height, width,
+                 brightness, contrast, saturation, hue,
+                 training=False, num_threads=num_threads, nsummary=10, shard=shard, synthetic=False,
+                 increased_aug=False)
+
+    def get_input_pipeline_op(self, inputs, labels, mode):
+        with tf.device('/cpu:0'):
+            preload_op, (inputs, labels) = stage([inputs, labels])
+
+        with tf.device('/gpu:0'):
+            gpucopy_op, (inputs, labels) = stage([inputs, labels])
+        return preload_op, gpucopy_op, inputs, labels
+
+    def normalize_and_format(self, inputs, data_format):
+
+        dataset_mean = np.array([121, 115, 100], dtype=np.float32)
+        dataset_std = np.array([70, 68, 71], dtype=np.float32)
+        inputs = tf.subtract(inputs, dataset_mean)
+        inputs = tf.multiply(inputs, 1. / dataset_std)
+        if data_format == 'channels_first':
+            inputs = tf.transpose(inputs, [0, 3, 1, 2])
+        return inputs
+
+
+
+
+#-------------------------------- Funcs -----------------------------------
+def get_num_records(filenames):
+    def count_records(tf_record_filename):
+        count = 0
+        for _ in tf.python_io.tf_record_iterator(tf_record_filename):
+            count += 1
+        return count
+
+    nfile = len(filenames)
+    return (count_records(filenames[0]) * (nfile - 1) +
+            count_records(filenames[-1]))
+
+def _parse_example_proto(example_serialized):
+  feature_map = {
+      'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
+                                          default_value=''),
+      'image/class/label': tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
+      'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
+                                             default_value=''),
+  }
+  sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
+  # Sparse features in Example proto.
+  feature_map.update(
+      {k: sparse_float32 for k in ['image/object/bbox/xmin',
+                                   'image/object/bbox/ymin',
+                                   'image/object/bbox/xmax',
+                                   'image/object/bbox/ymax']})
+
+  features = tf.parse_single_example(example_serialized, feature_map)
+  label = tf.cast(features['image/class/label'], dtype=tf.int32)
+
+  xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
+  ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
+  xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
+  ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
+
+  # Note that we impose an ordering of (y, x) just to make life difficult.
+  bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
+
+  # Force the variable number of bounding boxes into the shape
+  # [1, num_boxes, coords].
+  bbox = tf.expand_dims(bbox, 0)
+  bbox = tf.transpose(bbox, [0, 2, 1])
+
+  return features['image/encoded'], label, bbox
+
+def parse_record(raw_record,is_training):
+  image_buffer, label, bbox = _parse_example_proto(raw_record)
+  # for 1980 only
+  config={'min_object_covered': 0.1, 'aspect_ratio_range': [3. / 4., 4. / 3.], 'area_range': [0.08, 1.0], 'max_attempts': 100}
+  image = preprocessing.parse_and_preprocess_image_record(
+    config, image_buffer, height=224, width=224,
+    brightness=0.4, contrast=0.4, saturation=0.4, hue=0.13,
+    distort=is_training, nsummary=10, increased_aug=True, random_search_aug=False)
+  return image, label
+
+def read_rawdata(file_path_tensor):
+    def _read_file(file_path):
+        image = tf.gfile.GFile(file_path, 'rb').read()
+        return image
+    return tf.py_func(_read_file, inp=[file_path_tensor], Tout=tf.string)
+
+def parse_function(filename, label):
+    image = read_rawdata(filename)
+    image_decoded = tf.image.decode_jpeg(image, channels=3)
+    image_resized = tf.image.resize_images(image_decoded, [224, 224])
+    # 7.3，raw默认格式为int64，目前resnet50只支持int32，下沉前不影响，下沉后，没有增加该转换算子，影响性能考虑。    
+    label = tf.cast(label, dtype=tf.int32)
+    return image_resized, label
+
+
+def make_dataset(config, filenames, take_count, batch_size, height, width,
+                 brightness, contrast, saturation, hue,
+                 training=False, num_threads=10, nsummary=10, shard=False, synthetic=False,
+                 increased_aug=False, random_search_aug=False):
+    if synthetic and training:
+        input_shape = [height, width, 3]
+        input_element = nest.map_structure(lambda s: tf.constant(0.5, tf.float32, s), tf.TensorShape(input_shape))
+        label_element = nest.map_structure(lambda s: tf.constant(1, tf.int32, s), tf.TensorShape([1]))
+        element = (input_element, label_element)
+        ds = tf.data.Dataset.from_tensors(element).repeat()
+        ds = ds.batch(batch_size) 
+        return ds
+    else:
+        shuffle_buffer_size = 10000
+        num_readers = 10
+        rank_size = int(os.getenv('RANK_SIZE'))
+        rank_id = int(os.getenv('DEVICE_INDEX'))
+
+        if config['data_type'] == 'RAW DATA':
+            images = []
+            labels = []
+            with tf.gfile.GFile(config['label_index_url'], 'r') as f:
+                for line in f.readlines():
+                    tmp_list = line.strip().split(" ")
+                    image_file = os.path.join(config['data_url'], tmp_list[0])
+                    #image_raw = tf.gfile.GFile(image_file, 'rb').read()
+                    #images.append(image_raw)
+                    images.append(image_file)
+                    labels.append(int(tmp_list[-1]))
+
+            #images = tf.convert_to_tensor(images, dtype=tf.string)
+            #labels = tf.convert_to_tensor(labels, dtype=tf.int32)
+            ds = tf.data.Dataset.from_tensor_slices((images, labels))
+        else:
+            if training:
+                filename_pattern = os.path.join(config['data_url'], '%s-*')
+                filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))
+            else:
+                filename_pattern = os.path.join(config['data_url'], '%s-*')
+                filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
+
+            ds = tf.data.Dataset.from_tensor_slices(filenames)
+
+        if shard:
+            # split the dataset into parts for each GPU
+            ds = ds.shard(rank_size, rank_id)
+
+        if not training:
+            ds = ds.take(take_count)  # make sure all ranks have the same amount
+
+        if training:
+            ds = ds.shuffle(1000, seed=7 * (1 + rank_id))
+
+        if config['data_type'] == 'TFRECORD':
+            ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
+            counter = tf.data.Dataset.range(sys.maxsize)
+            ds = tf.data.Dataset.zip((ds, counter))
+
+        if training:
+            ds = ds.apply(tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size, seed=5*(1+rank_id)))
+
+        if config['data_type'] == 'RAW DATA':
+            ds = ds.map(lambda image, label: parse_function(image, label), num_parallel_calls=14)
+        else:
+            ds = ds.map(lambda image, label: parse_record(image, training), num_parallel_calls=14)
+        #ds = ds.prefetch(10)
+        ds = ds.batch(batch_size, drop_remainder=True)
+        return ds
+
+
@@ -0,0 +1,152 @@
+import tensorflow as tf
+#import horovod.tensorflow as hvd
+from tensorflow.contrib.image.python.ops import distort_image_ops
+import math
+#from .data_aug_search import random_aug_search
+
+
+
+def deserialize_image_record(record): 
+    feature_map = {
+        'image/encoded': tf.FixedLenFeature([], tf.string, ''),
+        'image/class/label': tf.FixedLenFeature([1], tf.int64, -1),
+        'image/class/text': tf.FixedLenFeature([], tf.string, ''),
+        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32)
+    }
+    with tf.name_scope('deserialize_image_record'):
+        obj = tf.parse_single_example(record, feature_map)
+        imgdata = obj['image/encoded']
+        label = tf.cast(obj['image/class/label'], tf.int32)
+        bbox = tf.stack([obj['image/object/bbox/%s' % x].values
+                         for x in ['ymin', 'xmin', 'ymax', 'xmax']])
+        bbox = tf.transpose(tf.expand_dims(bbox, 0), [0, 2, 1])
+        text = obj['image/class/text']
+        return imgdata, label, bbox, text
+
+def decode_jpeg(imgdata, channels=3):
+    return tf.image.decode_jpeg(imgdata, channels=channels,
+                                fancy_upscaling=False,
+                                dct_method='INTEGER_FAST')
+
+
+def crop_and_resize_image(config, image, height, width, 
+                          distort=False, nsummary=10):
+    with tf.name_scope('crop_and_resize'):
+        # Evaluation is done on a center-crop of this ratio
+        eval_crop_ratio = 0.8
+        if distort:
+            initial_shape = [int(round(height / eval_crop_ratio)),
+                             int(round(width / eval_crop_ratio)),
+                             3]
+            jpeg_shape = tf.image.extract_jpeg_shape( image )
+
+            bbox_begin, bbox_size, bbox = \
+                tf.image.sample_distorted_bounding_box(
+                    initial_shape,
+                    bounding_boxes=tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]),
+                    # tf.zeros(shape=[1,0,4]), # No bounding boxes
+                    min_object_covered=config['min_object_covered'],
+                    aspect_ratio_range=config['aspect_ratio_range'],
+                    area_range=config['area_range'],
+                    max_attempts=config['max_attempts'],
+                 #   seed=11 ,  # Need to set for deterministic results
+                    use_image_if_no_bounding_boxes=True)
+            bbox = bbox[0, 0]  # Remove batch, box_idx dims
+
+     #       offset_y, offset_x, _ = tf.unstack(bbox_begin)
+     #       target_height, target_width, _ = tf.unstack( bbox_size )
+     #       
+
+
+
+
+     #       offset_y = tf.minimum( offset_y, jpeg_shape[0] - 1 )
+     #       offset_x = tf.minimum( offset_x, jpeg_shape[1] - 1 )
+
+     #       target_height, target_width, _ = tf.unstack( bbox_size )
+     #       new_height = tf.maximum( tf.minimum( offset_y + target_height, jpeg_shape[0] ) - offset_y, 0 )
+     #       new_width = tf.maximum( tf.minimum( offset_x + target_width, jpeg_shape[1] ) - offset_x, 0 )
+
+            y_min = tf.cast( bbox[0] * (tf.cast( jpeg_shape[0], tf.float32) ), tf.int32)
+            x_min = tf.cast( bbox[1] * (tf.cast(jpeg_shape[1], tf.float32) ), tf.int32) 
+            y_max = tf.cast( bbox[2] * (tf.cast(jpeg_shape[0], tf.float32) ), tf.int32)
+            x_max = tf.cast( bbox[3] * (tf.cast(jpeg_shape[1], tf.float32) ), tf.int32)
+
+            crop_height = y_max - y_min
+            crop_width = x_max - x_min
+     #       crop_window = tf.stack( [offset_y, offset_x, new_height, new_width] )
+            crop_window = tf.stack( [y_min, x_min, crop_height, crop_width] )
+            image = tf.image.decode_and_crop_jpeg( image, crop_window, channels=3 )
+            image = tf.image.resize_images( image, [height, width] )
+            
+            
+       #     def func_decode_and_crop(image):
+       #         image = tf.image.decode_and_crop_jpeg( image, crop_window, channels=3 )
+       #         image = tf.image.resize_images( image, [height, width] )
+       #         return image
+
+       #     def func_crop_and_resize(image):
+       #         image = decode_jpeg(image, channels=3)
+       #         image = tf.image.crop_and_resize(
+       #            image[None, :, :, :], bbox[None, :], [0], [height, width])[0]
+       #         return image
+
+
+       #     condtion_1 = tf.logical_and( tf.less(target_height, jpeg_shape[0]), tf.less( target_width, jpeg_shape[1] ) )
+       #     condtion_2 = tf.logical_and( tf.less(target_height + offset_y, jpeg_shape[0]), tf.less( target_width + offset_x, jpeg_shape[1] ) )
+
+       #     image = tf.cond( tf.logical_and( condtion_1, condtion_2 ),  lambda:func_decode_and_crop(image), lambda:func_crop_and_resize(image)   )
+
+
+        else:
+            # Central crop
+
+            image = decode_jpeg(image, channels=3)
+            ratio_y = ratio_x = eval_crop_ratio
+            bbox = tf.constant([0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x),
+                                0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)])
+            image = tf.image.crop_and_resize(
+               image[None, :, :, :], bbox[None, :], [0], [height, width])[0]
+        
+        return image
+
+
+def parse_and_preprocess_image_record(config, record, height, width,
+                                      brightness, contrast, saturation, hue,
+                                      distort, nsummary=10, increased_aug=False, random_search_aug=False):
+    #imgdata, label, bbox, text = deserialize_image_record(record)
+    #label -= 1  # Change to 0-based (don't use background class)
+    with tf.name_scope('preprocess_train'):
+            image = crop_and_resize_image(config, record, height, width, distort)
+            if distort:
+                image = tf.image.random_flip_left_right(image)
+                if increased_aug:
+                    image = tf.image.random_brightness(image, max_delta=brightness)  
+                    #image = distort_image_ops.random_hsv_in_yiq(image, 
+                    #                                            lower_saturation=saturation, 
+                    #                                            upper_saturation=2.0 - saturation, 
+                    #                                            max_delta_hue=hue * math.pi)
+                    image = tf.image.random_contrast(image, lower=contrast, upper=2.0 - contrast)
+                    image = tf.image.random_saturation(image, lower=saturation, upper=2.0-saturation)
+               #     tf.summary.image('distorted_color_image', tf.expand_dims(image, 0))
+
+            image = tf.clip_by_value(image, 0., 255.)
+            #image = tf.cast(image, tf.uint8)
+  #          if random_search_aug:
+  #              image = random_aug_search(image)
+    image = normalize(image)
+    image = tf.cast(image, tf.float16)
+    return image
+def normalize(inputs):
+     imagenet_mean = [121.0, 115.0, 100.0]             #np.array([121, 115, 100], dtype=np.float32)
+     imagenet_std =  [70.0, 68.0, 71.0]                #np.array([70, 68, 71], dtype=np.float32)
+     imagenet_mean = tf.expand_dims(tf.expand_dims(imagenet_mean, 0), 0)
+     imagenet_std = tf.expand_dims(tf.expand_dims(imagenet_std, 0), 0)
+     inputs = inputs - imagenet_mean          #tf.subtract(inputs, imagenet_mean)
+     inputs = inputs * (1.0 / imagenet_std)
+     #inputs = tf.multiply(inputs, 1. / imagenet_std)
+
+     return inputs
@@ -0,0 +1,50 @@
+import tensorflow as tf
+from .lr_schedule import warmup_decay, get_lr, get_1980_lr
+
+
+class HyperParams:
+    def __init__(self, config):
+        self.config=config
+        nsteps_per_epoch = self.config['num_training_samples'] // self.config['global_batch_size']
+        self.config['nsteps_per_epoch'] = nsteps_per_epoch
+        # nstep = self.config['num_training_samples'] * self.config['num_epochs'] // self.config['global_batch_size']
+        if self.config['num_epochs']:
+            nstep = nsteps_per_epoch * self.config['num_epochs']   #------calculate nsteps in a different way------
+        else:
+            nstep = self.config['max_train_steps']
+        self.config['nstep'] = nstep
+        
+        self.config['total_steps_include_iterations'] = int( self.config['nstep'] + self.config['iterations_per_loop'])
+        self.config['save_summary_steps'] = nsteps_per_epoch
+        self.config['save_checkpoints_steps'] = nsteps_per_epoch
+
+
+    def get_hyper_params(self):
+        hyper_params = {}
+        hyper_params['learning_rate'] = self.get_learning_rate()
+
+        return hyper_params
+ 
+
+    def get_learning_rate(self): 
+        global_step = tf.train.get_global_step()
+        nsteps_per_epoch = self.config['nsteps_per_epoch']
+
+        warmup_lr = self.config['warmup_lr']
+        lr = self.config['learning_rate_maximum']
+        lr_end = self.config['learning_rate_end']
+        lr_decay_mode = self.config['lr_decay_mode']
+
+
+       
+        with tf.device('/cpu:0'):  # Allow fallback to CPU if no GPU support for these ops
+
+            if lr_decay_mode == 'constant' or self.config['num_epochs'] == None:
+                learning_rate = tf.constant(lr, tf.float32)
+            else:
+                learning_rate = get_1980_lr(self.config, global_step, warmup_lr, lr_end, lr, self.config['warmup_epochs'], nsteps_per_epoch, self.config['nstep'], lr_decay_mode )
+
+            learning_rate = tf.identity(learning_rate, 'learning_rate')
+        return learning_rate
+
+
@@ -0,0 +1,172 @@
+import tensorflow as tf
+import numpy as np
+
+def get_lr(lr, lr_end, lr_decay_mode, warmup_it, decay_steps, global_step, steps, lr_steps, ploy_power,
+           cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, cd_alpha, lc_periods, lc_alpha, lc_beta, lr_mid, it_mid):
+    if lr_decay_mode == 'steps':
+        learning_rate = tf.train.piecewise_constant(global_step,
+                                                    steps, lr_steps)
+    elif lr_decay_mode == 'poly' or lr_decay_mode == 'poly_cycle':
+        cycle = lr_decay_mode == 'poly_cycle'
+        learning_rate = tf.train.polynomial_decay(lr,
+                                                  global_step - warmup_it,
+                                                  decay_steps=decay_steps - warmup_it,
+                                                  end_learning_rate=lr_end,
+                                                  power=ploy_power,
+                                                  cycle=cycle)
+    elif lr_decay_mode == 'cosine_decay_restarts':
+        learning_rate = tf.train.cosine_decay_restarts(lr, 
+                                                       global_step - warmup_it,
+                                                       (decay_steps - warmup_it) * cdr_first_decay_ratio,
+                                                       t_mul=cdr_t_mul, 
+                                                       m_mul=cdr_m_mul,
+                                                       alpha=cdr_alpha)
+    elif lr_decay_mode == 'cosine':
+        learning_rate = tf.train.cosine_decay(lr,
+                                              global_step - warmup_it,
+                                              decay_steps=decay_steps - warmup_it,
+                                              alpha=cd_alpha) 
+    elif lr_decay_mode == 'linear_cosine':
+        learning_rate = tf.train.linear_cosine_decay(lr,
+                                                     global_step - warmup_it,
+                                                     decay_steps=decay_steps - warmup_it,
+                                                     num_periods=lc_periods,#0.47,
+                                                     alpha=lc_alpha,#0.0,
+                                                     beta=lc_beta)#0.00001)
+    elif lr_decay_mode == 'linear_twice':
+        learning_rate = decay_linear_twice(lr, lr_mid, lr_end, warmup_it, it_mid, decay_steps, global_step )
+
+    else:
+        raise ValueError('Invalid type of lr_decay_mode')
+    return learning_rate
+
+
+def cos_warmup_1980(  global_step, warmup_steps, max_lr ):
+    PI = 3.14159265359
+    ang = PI +  PI * ( float(global_step+1) / float(warmup_steps) )
+    offset  = max_lr * 0.5*( 1.0 + np.cos( ang ) )
+    return offset
+
+def cos_decay_1980(  global_step, warmup_steps, total_steps, max_lr,end_lr ):
+    PI = 3.14159265359
+    ang =  PI * ( float(global_step - warmup_steps+1) / float(total_steps - warmup_steps) )
+    #offset  = max_lr * 0.5*( 1.0 + np.cos( ang ) )
+    
+    #zp-cosine
+    cosine_decay_tmp=0.5*( 1.0 + np.cos( ang ) )
+    decayed_tmp = (1 - end_lr) * cosine_decay_tmp + end_lr
+    offset = max_lr * decayed_tmp
+    return offset                    
+
+def get_1980_lr(config, global_step, lr_init, lr_end, lr_max, warmup_epochs, steps_per_epoch, nsteps, lr_decay_mode):
+    lr_each_step = []
+
+    if lr_decay_mode == 'steps':
+        decay_epoch_index = [30 * steps_per_epoch,60 * steps_per_epoch,80 * steps_per_epoch]
+        total_steps = int(nsteps)
+        for i in range(total_steps):
+            if i < decay_epoch_index[0]:
+                lr = lr_max
+            elif i < decay_epoch_index[1]:
+                lr = lr_max * 0.1
+            elif i < decay_epoch_index[2]:
+                lr = lr_max * 0.01
+            else:
+                lr = lr_max * 0.001
+            lr_each_step.append(lr)
+    elif lr_decay_mode == 'poly':
+        total_steps = int(nsteps)
+        warmup_steps = steps_per_epoch * warmup_epochs
+        inc_each_step = ( float(lr_max) - float(lr_init) ) / float(warmup_steps)
+        for i in range( config['total_steps_include_iterations'] ):
+          if i < warmup_steps:
+            lr = float(lr_init) + inc_each_step * float(i) 
+          elif i <= total_steps:
+            base =  ( 1.0 - (float(i)-float(warmup_steps))/(float(total_steps)-float(warmup_steps)) ) 
+            lr = float(lr_max) * base 
+          else:
+            lr = 0.0
+          lr_each_step.append(lr)
+
+    elif lr_decay_mode == 'cosine':
+        total_steps = int(nsteps)
+        
+        warmup_steps = steps_per_epoch * warmup_epochs
+        for i in range( config['total_steps_include_iterations'] ):
+          if i < warmup_steps:
+            lr = cos_warmup_1980( i, warmup_steps, lr_max )
+          elif i <= total_steps:
+            lr = cos_decay_1980( i, warmup_steps, total_steps, lr_max ,lr_end)
+          else:
+            lr = lr_end * 0.01
+          lr_each_step.append(lr)
+    elif lr_decay_mode == 'linear_cosine':
+        total_steps = int(nsteps)
+        warmup_steps = steps_per_epoch * warmup_epochs
+        inc_each_step = ( float(lr_max) - float(lr_init) ) / float(warmup_steps)
+        for i in range( config['total_steps_include_iterations'] ):
+          if i < warmup_steps:
+            lr = float(lr_init) + inc_each_step * float(i) 
+          elif i <= total_steps:
+            lr = cos_decay_1980( i, warmup_steps, total_steps, lr_max )
+          else:
+            lr = 0.0
+          lr_each_step.append(lr)
+    else:
+        total_steps = int(nsteps)
+        warmup_steps = steps_per_epoch * warmup_epochs
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
+            else: 
+                lr = lr_max - ( lr_max - lr_end ) * (i - warmup_steps) / (total_steps - warmup_steps)
+            lr_each_step.append( lr )
+
+   # current_step = tf.to_int32( tf.cast(global_step,tf.float32) / float(steps_per_epoch) )
+    current_step = global_step
+    lr_each_step = tf.convert_to_tensor( lr_each_step )
+    print (lr_each_step)
+    learning_rate = tf.gather( lr_each_step, current_step )
+
+    return learning_rate
+
+def warmup_decay(lr_warmup_mode, warmup_lr, global_step, warmup_steps, warmup_end_lr):
+    if lr_warmup_mode == 'linear':
+        learning_rate = linear_warmup(warmup_lr, global_step, warmup_steps, warmup_end_lr)
+    elif lr_warmup_mode == 'cosine':
+        learning_rate = cos_warmup(warmup_lr, global_step, warmup_steps, warmup_end_lr)
+    else:
+        raise ValueError('Invalid type of lr_warmup_mode')
+    return learning_rate
+
+
+def linear_warmup(warmup_lr, global_step, warmup_steps, warmup_end_lr):
+    from tensorflow.python.ops import math_ops
+    p = tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32)
+    diff = math_ops.subtract(warmup_end_lr, warmup_lr)
+    res = math_ops.add(warmup_lr, math_ops.multiply(diff, p))
+    return res
+
+def cos_warmup( warmup_lr, global_step, warmup_steps, warmup_end_lr ):
+    PI = 3.14159265359
+    diff = tf.subtract( warmup_end_lr, warmup_lr )
+    ang = PI +  PI * ( tf.cast( global_step, tf.float32 ) / tf.cast( warmup_steps,tf.float32 ))
+    offset = diff * 0.5 * ( 1.0 + tf.math.cos( ang ) )
+    res =  tf.add( warmup_lr, offset )
+    return res
+
+
+def decay_linear( lr_start, lr_end, it_start, it_end, global_step ):
+    down_steps = it_end - it_start
+    down_range = lr_start - lr_end 
+    down_per_step = float( down_range ) / float( down_steps )
+    res = tf.subtract( tf.cast(lr_start, tf.float32),  tf.multiply( tf.cast(down_per_step, tf.float32), tf.subtract(tf.cast(global_step, tf.float32), tf.cast(it_start, tf.float32) )) )
+    return res
+
+def decay_linear_twice(lr_start, lr_mid, lr_end, it_start, it_mid, it_end, global_step ):
+    learning_rate = tf.cond( global_step < it_start, lambda: tf.cast(lr_start, tf.float32), lambda: decay_linear(lr_start, lr_mid, it_start, it_mid, global_step))
+    learning_rate = tf.cond( global_step > it_mid, lambda: decay_linear(lr_mid, lr_end, it_mid, it_end, global_step) , lambda: learning_rate )
+    return learning_rate
+
+
+
@@ -0,0 +1,23 @@
+import tensorflow as tf
+#from tensorflow.contrib.hccl.python.ops import hccl_ops
+from npu_bridge.hccl import hccl_ops
+
+class Layers:
+ 
+    def get_accuracy(self, labels, predicted_classes, logits, config):
+        accuracy = tf.metrics.accuracy(
+            labels=labels, predictions=predicted_classes) 
+        top5acc = tf.metrics.mean(
+            tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32))
+        if config['rank_size'] == 1:
+            newaccuracy = (accuracy[0], accuracy[1])
+            newtop5acc = (top5acc[0], top5acc[1])
+        else:
+            newaccuracy = (hccl_ops.allreduce(accuracy[0],"sum")/config['rank_size'], accuracy[1])
+            newtop5acc = (hccl_ops.allreduce(top5acc[0],"sum")/config['rank_size'], top5acc[1])
+        metrics = {'val-top1acc': newaccuracy, 'val-top5acc': newtop5acc}
+        return metrics
+
+
+
+
@@ -0,0 +1,36 @@
+import tensorflow as tf
+
+class Loss:
+    def __init__(self,config):
+        self.config = config 
+
+    def get_loss(self, logits, labels):
+        labels_one_hot = tf.one_hot(labels, self.config['num_classes'])
+        loss = tf.losses.softmax_cross_entropy(
+            logits=logits, onehot_labels=labels_one_hot,label_smoothing=self.config['label_smoothing'])
+        loss = tf.identity(loss, name='loss')
+        return loss
+
+    def get_total_loss(self, loss):
+        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+        total_loss = tf.add_n([loss] + reg_losses, name='total_loss')
+        return total_loss
+ 
+
+    def optimize_loss(self, total_loss, opt):
+        gate_gradients = (tf.train.Optimizer.GATE_NONE)
+        # grads_and_vars = opt.compute_gradients(total_loss, colocate_gradients_with_ops=True, gate_gradients=gate_gradients)
+        grads_and_vars = opt.compute_gradients(total_loss, gate_gradients=gate_gradients)
+
+        # train_op = opt.apply_gradients( grads_and_vars, global_step=None )
+        train_op = opt.apply_gradients( grads_and_vars)
+
+        return train_op
+
+   
+
+
+        
+
+
+
@@ -0,0 +1,7 @@
+ ps -ef | grep TdtMain | awk '{print $2}' | xargs kill -9
+rm -rf *.pbtxt
+rm -rf /var/log/npu/slog/*.log
+rm ckpt* -rf
+find ./ -name "*.pyc" | xargs rm -rf
+find ./ -name __pycache__ | xargs rm -rf
+rm /var/log/npu/dataset/* -rf
@@ -0,0 +1,141 @@
+import tensorflow as tf
+import sys
+import ast
+#sys.path.append("..")
+#sys.path.append("../models")
+#sys.path.append("./resnet50_train/")
+#sys.path.append("./resnet50_train/models")
+import os
+# Extend sys.path before the project imports below. Entries are appended, so
+# earlier entries win when a module name exists in more than one location.
+# First: the `utils` and `utils/atlasboost` directories six levels above this file.
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../../../utils'))
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../../../utils/atlasboost'))
+# Directory containing this script, with symlinks resolved.
+base_path=os.path.split(os.path.realpath(__file__))[0]
+print ("#########base_path:", base_path)
+# path_1..path_3 are only printed for diagnostics; the values actually appended
+# to sys.path are rebuilt from base_path further down.
+path_1 = base_path + "/.."
+print (path_1)
+path_2 = base_path + "/../models"
+print (path_2)
+path_3 = base_path + "/../../"
+print (path_3)
+
+
+# Parent and grandparent directories of this script, plus their `models` subdirectories.
+sys.path.append(base_path + "/..")
+sys.path.append(base_path + "/../models")
+sys.path.append(base_path + "/../../")
+sys.path.append(base_path + "/../../models")
+
+from utils import create_session as cs
+from utils import logger as lg
+from data_loader.resnet50 import data_loader as dl
+from models.resnet50 import res50_model as ml
+from optimizers import optimizer as op
+from losses import res50_loss as ls
+from trainers import gpu_base_trainer as tr
+# from configs import res50_config as cfg
+from hyper_param import hyper_param as hp
+from layers import layers as ly
+from datetime import datetime
+# from utils import hwlog
+import argparse
+
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+# import hwlog
+# remark_logger = hwlog.get_logger(__file__, "hw_Resnext50.log")
+# initinal_data={"base_lr": 0.128, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512, "batchsize": 32}
+# hwlog.add_additional_info(remark_logger, "Resnext50", "tensorflow", initinal_data) # logger_obj, model_name, framework, initinal_data
+
+
+def main():
+    #-------------------choose the config file in .sh file-----------
+    cmdline = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    cmdline.add_argument('--config_file', default="",
+                         help="""config file used.""")
+    cmdline.add_argument('--iterations_per_loop', default=1,
+                         help="""config file used.""")
+    cmdline.add_argument('--max_train_steps', default=200,
+                         help="""config file used.""")
+    cmdline.add_argument('--debug', default=True, type=ast.literal_eval,
+                         help="""config file used.""")
+    cmdline.add_argument('--eval', default=False, type=ast.literal_eval,
+                         help="""config file used.""")
+    cmdline.add_argument('--model_dir', default="./model_dir",
+                         help="""config file used.""")
+    FLAGS, unknown_args = cmdline.parse_known_args()
+    if len(unknown_args) > 0:
+        for bad_arg in unknown_args:
+            print("ERROR: Unknown command line arg: %s" % bad_arg)
+        raise ValueError("Invalid command line arg(s)")
+
+    cfg_file = FLAGS.config_file
+    configs = 'configs'
+    cfg = getattr(__import__(configs, fromlist=[cfg_file]), cfg_file)
+    #------------------------------------------------------------------
+
+    config = cfg.res50_config()
+    config['iterations_per_loop'] = int(FLAGS.iterations_per_loop)
+    config['max_train_steps'] = int(FLAGS.max_train_steps)
+    config['debug'] = FLAGS.debug
+    config['eval'] = FLAGS.eval
+    config['model_dir'] = FLAGS.model_dir
+    print("iterations_per_loop:%d" %(config['iterations_per_loop']))
+    print("max_train_steps    :%d" %(config['max_train_steps']))
+    print("debug              :%s" %(config['debug']))
+    print("eval               :%s" %(config['eval']))
+    print("model_dir          :%s" %(config['model_dir']))
+    Session = cs.CreateSession(config)
+    data = dl.DataLoader(config)
+    hyper_param = hp.HyperParams(config)
+    layers = ly.Layers() 
+    optimizer = op.Optimizer(config)
+    loss = ls.Loss(config)
+    logger = lg.LogSessionRunHook(config)   # add tensorboard summary
+
+    model = ml.Model(config, data, hyper_param,layers, optimizer, loss, logger)   # get the model 
+    trainer = tr.GPUBaseTrain(Session, config, data, model, logger)   # use Estimator to build training process
+
+    # work_num = "device " + str(os.environ.get("DEVICE_INDEX"))
+    # date_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
+    # try:
+    if config['mode'] =='train':
+        trainer.train()
+        if config['eval'] :
+            trainer.evaluate()
+    elif config['mode'] =='evaluate':
+        trainer.evaluate()
+    elif config['mode'] =='train_and_evaluate':
+        trainer.train_and_evaluate()
+    else:
+        raise ValueError('Invalid type of mode')
+        # hwlog.vlogger.info("namespace:%s,time_ts:%s,event_type:benchmark_stop" % (work_num, date_time))
+        # hwlog.vlogger.info("atlas benchmark train success")
+        # remark_logger.info("ABK train success")
+    # except:
+    #     # hwlog.vlogger.info("namespace:%s,time_ts:%s,event_type:benchmark_stop" % (work_num, date_time))
+    #     # hwlog.vlogger.info("atlas benchmark train failed")
+    #     remark_logger.info("ABK train failed")
+
+    # add by zwx5326390
+
+
+
+if __name__ == '__main__':
+    # add zwx5326390 日志打点
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+    config_info = get_model_parameter("tensorflow_config")
+    initinal_data = {"base_lr": 0.01, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512,
+                     "batchsize": 32}
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    hwlog.remark_print(key=hwlog.INPUT_BATCH_SIZE, value=initinal_data.get("batchsize"))
+    main()
@@ -0,0 +1,21 @@
+#!/bin/bash
+#export CUDA_VISIBLE_DEVICES=0
+dir=`pwd`
+
+#cp -rf ./config /tmp/
+export JOB_ID=10086
+#export PROFILING_DIR=/var/log/npu/profiling/container/0
+export DEVICE_ID=0
+#export PROFILING_MODE=true
+export PRINT_MODEL=1
+#export ENABLE_DATA_PRE_PROC=1
+export RANK_ID=0
+export RANK_SIZE=1
+export RANK_TABLE_FILE=/home/lxh/config/new_rank_table_1p.json
+export FUSION_TENSOR_SIZE=1000000000
+export PYTHONPATH=${dir}
+export LD_LIBRARY_PATH=/usr/local/HiAI/runtime/lib64/
+/usr/local/HiAI/runtime/bin/TdtMain --configfile=/home/lxh/test/config/job_tdt_2p_$DEVICE_ID.json  &
+sleep 5
+
+python3.6 res50.py --config_file res50_baseline 
@@ -0,0 +1,4 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=7
+
+python3.5 res50.py --config_file res50_baseline_gpu
@@ -0,0 +1,24 @@
+import tensorflow as tf
+
+def _fp32_trainvar_getter(getter, name, shape=None, dtype=None,
+                          trainable=True, regularizer=None,
+                          *args, **kwargs):
+    storage_dtype = dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      trainable=trainable,
+                      regularizer=regularizer if trainable and 'BatchNorm' not in name and 'batchnorm' not in name and 'batch_norm' not in name and 'Batch_Norm' not in name else None,
+                      *args, **kwargs)
+
+    return variable
+
+
+def fp32_trainable_vars(name='fp32_vars', *args, **kwargs):
+    """A varible scope with custom variable getter to convert fp16 trainable
+    variables with fp32 storage followed by fp16 cast.
+    """
+    return tf.variable_scope(
+        name, custom_getter=_fp32_trainvar_getter, *args, **kwargs)
+
+def custom_getter_with_fp16_and_weight_decay(dtype, weight_decay):
+    return fp32_trainable_vars(dtype=dtype, regularizer=tf.contrib.layers.l2_regularizer(weight_decay))
+
@@ -0,0 +1,222 @@
+
+import tensorflow as tf
+from . import resnet, res50_helper
+from trainers.train_helper import stage
+#from tensorflow.contrib.offline_train.python.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+#from tensorflow.contrib.offline_train.python import npu_ops
+from npu_bridge.estimator import npu_ops
+_NUM_EXAMPLES_NAME="num_examples"
+
+
+class Model(object):
+    """Estimator model function for the ResNet family, built from injected collaborators."""
+
+    def __init__(self, config, data, hyper_param, layers, optimizer, loss, logger):
+        """Store the run config and the collaborator objects; nothing is built here."""
+        self.config = config
+        self.data = data
+        self.hyper_param = hyper_param
+        self.layers = layers
+        self.optimizer = optimizer
+        self.loss = loss
+        self.logger = logger  
+
+    def get_estimator_model_func(self, features, labels, mode, params=None):
+        """Build the graph for ``mode`` and return a ``tf.estimator.EstimatorSpec``.
+
+        EVAL returns the cross-entropy loss plus the accuracy metrics from
+        ``self.layers``; TRAIN returns ``total_loss`` and the train op. Any
+        other mode fails the assert below. ``params`` is not used.
+        """
+        labels = tf.reshape(labels, (-1,))  # Squash unnecessary unary dim         #----------------not use when use onehot label
+    
+        model_func = self.get_model_func()
+        inputs = features  # TODO: Should be using feature columns?
+        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+        # NOTE(review): the '/gpu:0' device scope is used regardless of config['accelerator'].
+        with tf.device('/gpu:0'):
+            # NOTE(review): the unconditional cast below repeats this one, so the
+            # gpu-only cast looks redundant.
+            if self.config['accelerator'] == 'gpu':
+                inputs = tf.cast(inputs, self.config['dtype'])
+
+            inputs = tf.cast(inputs, self.config['dtype'])
+            with res50_helper.custom_getter_with_fp16_and_weight_decay(dtype=self.config['dtype'], weight_decay=self.config['weight_decay']):   # no BN decay
+
+                top_layer = model_func(
+                    inputs, data_format=self.config['data_format'], training=is_training,
+                    conv_initializer=self.config['conv_init'],
+                    bn_init_mode=self.config['bn_init_mode'], bn_gamma_initial_value=self.config['bn_gamma_initial_value'])
+                
+
+            logits = top_layer
+            predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
+            # The loss is computed on float32 logits.
+            logits = tf.cast(logits, tf.float32)
+
+            #loss = self.loss.get_loss(logits, labels)  
+            #loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
+
+            # NOTE(review): depth is hard-coded to 1001; presumably it should
+            # track config['num_classes'] - confirm against the network output size.
+            labels_one_hot = tf.one_hot(labels, depth=1001)
+            loss = tf.losses.softmax_cross_entropy(
+                logits=logits, onehot_labels=labels_one_hot, label_smoothing=self.config['label_smoothing'])
+
+
+            base_loss = tf.identity(loss, name='loss')  # For access by logger (TODO: Better way to access it?)
+     #       base_loss = tf.add_n([loss])                                    
+
+            # Variables whose name contains 'BatchNorm' are left out of the L2 term.
+            def exclude_batch_norm(name):
+              #return 'batch_normalization' not in name
+              return 'BatchNorm' not in name
+            loss_filter_fn = exclude_batch_norm
+          
+            # Add weight decay to the loss.
+            l2_loss = self.config['weight_decay'] * tf.add_n(
+                # loss is computed using fp32 for numerical stability.
+                [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()
+                 if loss_filter_fn(v.name)])
+            #tf.summary.scalar('l2_loss', l2_loss)
+     #       total_loss = base_loss + l2_loss
+            # With LARS the L2 term is not added to the loss; the LARS code
+            # below receives weight_decay directly.
+            if self.config['use_lars']:
+                total_loss = base_loss
+            else:
+                total_loss = base_loss + l2_loss
+   
+            total_loss = tf.identity(total_loss, name = 'total_loss')
+
+
+            if mode == tf.estimator.ModeKeys.EVAL:
+                with tf.device(None):
+                    metrics = self.layers.get_accuracy( labels, predicted_classes, logits, self.config)
+
+                # Evaluation reports the plain cross-entropy, not total_loss.
+                return tf.estimator.EstimatorSpec(
+                    mode, loss=loss, eval_metric_ops=metrics)
+
+            assert (mode == tf.estimator.ModeKeys.TRAIN)
+
+            #reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+            #total_loss = tf.add_n([tf.saturate_cast(loss, self.config['dtype']) ] + reg_losses, name='total_loss')
+            #total_loss = tf.add_n([loss], name='total_loss')
+    
+            # NOTE(review): batch_size is not used anywhere below.
+            batch_size = tf.shape(inputs)[0]
+    
+            global_step = tf.train.get_global_step()
+            with tf.device('/cpu:0'):
+                learning_rate = self.hyper_param.get_learning_rate()
+
+            #-----------------------batchsize scaling----------------------------------
+            # Only the first entry of config['momentum'] is used.
+            momentum = self.config['momentum'][0]
+            #------------------------------end------------------------------------------
+ 
+            opt = tf.train.MomentumOptimizer(
+                learning_rate, momentum, use_nesterov=self.config['use_nesterov'])
+            # NOTE(review): the NPU distributed wrapper is applied for every
+            # accelerator, including 'gpu' - confirm this is intended.
+            opt=NPUDistributedOptimizer(opt) 
+            if self.config['accelerator'] == 'gpu':
+                opt = self.optimizer.get_lbs_optimizer(opt)           
+            # Run the collected UPDATE_OPS before the gradient ops created below.
+            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
+            with tf.control_dependencies(update_ops):
+                if self.config['accelerator'] == 'gpu':
+                    gate_gradients = (tf.train.Optimizer.GATE_NONE)
+                    grads_and_vars = opt.compute_gradients(total_loss, gate_gradients=gate_gradients)
+                    train_op = opt.apply_gradients( grads_and_vars,global_step = global_step)
+                else:
+                    # Static loss scaling: gradients are taken of the scaled
+                    # loss and divided by the same factor afterwards.
+                    with tf.name_scope('loss_scale'):
+                        loss_scale = float( self.config['loss_scale'] )
+                        scaled_grads_and_vars = opt.compute_gradients( total_loss * loss_scale )
+                        unscaled_grads_and_vars = [ (g/loss_scale, v)  for g,v in scaled_grads_and_vars ]
+
+
+            #-----------------------------------------Lars------------------------------------------
+                        with tf.name_scope('LARS'):
+                            fp32_grads_and_vars = [ (tf.cast(g, tf.float32), v)  for g,v in unscaled_grads_and_vars ]
+                            grad_var_list = []
+                            
+                            if self.config['use_lars']:
+                                # NOTE(review): this 'gpu' sub-branch sits inside the
+                                # non-gpu branch above, so it cannot be reached.
+                                if self.config['accelerator'] == 'gpu':
+                                    for g, var in  fp32_grads_and_vars: 
+    
+                                        if 'BatchNorm' not in var.name and 'bias' not in var.name:
+                                            grad_norm = tf.norm(g,ord='euclidean') 
+                                            weight_norm = tf.norm(var,ord='euclidean')
+                                            grad_norm_wd = tf.add( grad_norm,  tf.multiply( self.config['weight_decay'] , weight_norm ) )
+                                            rescale_factor = tf.div( tf.multiply(0.001, weight_norm), tf.add(grad_norm_wd, tf.constant(1e-5, tf.float32)) )
+                                            decayed_g = tf.add( g, tf.multiply(self.config['weight_decay'], var ) )
+    
+                                            with tf.name_scope('lars_grad'):
+                                                g = tf.multiply(rescale_factor, decayed_g)
+    
+                                        g_and_v = ( g, var )
+                                        grad_var_list.append( g_and_v )
+    
+                                elif self.config['accelerator'] == '1980':
+                                    print('lars9999999999999999999999')
+                                    # Split variables: BatchNorm/bias gradients pass
+                                    # through unchanged, the rest go through the LARS op.
+                                    g_list_bn_bias = []
+                                    var_list_bn_bias = []
+                                    g_list_else = []
+                                    var_list_else = []
+                                    for g, var in fp32_grads_and_vars: 
+                                        if 'BatchNorm' not in var.name and 'bias' not in var.name:
+                                            g_list_else.append(g)
+                                            var_list_else.append(var)
+                                        else:
+                                            g_list_bn_bias.append(g)
+                                            var_list_bn_bias.append(var)
+    
+    
+                                    g_list_else_lars = npu_ops.LARS(inputs_w=var_list_else, 
+                                                    inputs_g=g_list_else, 
+                                                    weight_decay=self.config['weight_decay'],
+                                                    hyperpara=0.001,
+                                                    epsilon=1e-5)
+    
+                                    # Both lists are concatenated in the same order so
+                                    # that zip pairs each gradient with its variable.
+                                    g_list_lars = g_list_bn_bias + g_list_else_lars
+                                    var_list = var_list_bn_bias + var_list_else
+    
+                                    for (g, var) in zip(g_list_lars,var_list):
+                                        g_and_v = ( g, var )
+                                        grad_var_list.append( g_and_v )
+    
+    
+                            else:
+                                print('do not use lars111111111111111111')
+                                for g, var in  fp32_grads_and_vars:
+                                    #if 'BatchNorm' not in var.name and 'bias' not in var.name:
+                                    #    decayed_g = tf.add( g, tf.multiply( self.config['weight_decay'], var ) )
+                                    #    g = decayed_g
+                                    g_and_v = ( g, var )
+                                    grad_var_list.append( g_and_v )
+            #-----------------------------------------end Lars------------------------------------------
+
+
+
+
+                        # NOTE(review): with use_lars set and an accelerator other
+                        # than '1980', grad_var_list is still empty here - confirm.
+                        train_op = opt.apply_gradients( grad_var_list, global_step = global_step )
+
+            train_op = tf.group(train_op)
+
+            #with tf.device('/cpu:0'):
+                #tf.summary.scalar('total_loss', total_loss)
+                #tf.summary.scalar('base_loss', base_loss)
+                #tf.summary.scalar('learning_rate', learning_rate)
+                #tf.contrib.summary.flush()
+#                if self.config['do_checkpoint']:
+#                    summary_hook = tf.train.SummarySaverHook( save_steps=20, 
+#                                                        output_dir=self.config['log_dir']+'/train_summary',
+#                                                        summary_op = tf.summary.merge_all() ) 
+
+            #return  tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op, training_hooks=[summary_hook] )\
+            #                   if self.config['do_checkpoint'] else tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op )
+            return   tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op )
+          
+            # return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
+
+
+
+    def get_model_func(self): 
+        """Return a callable that builds the network named by ``config['model_name']``.
+
+        Only names of the form ``'resnet<N>'`` are accepted; ``N`` is parsed as
+        the layer count and passed to ``resnet.inference_resnet_v1``.
+
+        Raises:
+            ValueError: if the name does not start with 'resnet', or if the
+                suffix is not an integer (raised by ``int``).
+        """
+        model_name = self.config['model_name']
+        if model_name.startswith('resnet'):
+            nlayer = int(model_name[len('resnet'):])
+            return lambda images, *args, **kwargs: \
+                resnet.inference_resnet_v1(self.config,images, nlayer, *args, **kwargs)
+        else:
+            raise ValueError("Invalid model type: %s" % model_name)
+
+
+
+
+
+
+
+        
+
+
+
@@ -0,0 +1,545 @@
+import tensorflow as tf
+
+_BATCH_NORM_EPSILON = 1e-4
+_BATCH_NORM_DECAY = 0.9
+
+_Cardi = 32
+
+class LayerBuilder(object):
+    def __init__(self, activation=None, data_format='channels_last',
+                 training=False, use_batch_norm=False, batch_norm_config=None,
+                 conv_initializer=None, bn_init_mode='adv_bn_init', bn_gamma_initial_value=1.0 ):
+        self.activation = activation
+        self.data_format = data_format
+        self.training = training
+        self.use_batch_norm = use_batch_norm
+        self.batch_norm_config = batch_norm_config
+        self.conv_initializer = conv_initializer
+        self.bn_init_mode = bn_init_mode
+        self.bn_gamma_initial_value = bn_gamma_initial_value
+        if self.batch_norm_config is None:
+            self.batch_norm_config = {
+                'decay': _BATCH_NORM_DECAY,
+                'epsilon': _BATCH_NORM_EPSILON,
+                'scale': True,
+                'zero_debias_moving_mean': False,
+            }
+
+    def _conv2d(self, inputs, activation, *args, **kwargs):
+        x = tf.layers.conv2d(
+            inputs, data_format=self.data_format,
+          #  use_bias=not self.use_batch_norm,
+            use_bias=False,
+            kernel_initializer=self.conv_initializer,
+            activation=None if self.use_batch_norm else activation,
+            *args, **kwargs)
+        if self.use_batch_norm:
+            param_initializers = {
+                'moving_mean': tf.zeros_initializer(),
+                'moving_variance': tf.ones_initializer(),
+                'beta': tf.zeros_initializer(),
+            }
+            if self.bn_init_mode == 'adv_bn_init':
+                param_initializers['gamma'] = tf.ones_initializer()
+            elif self.bn_init_mode == 'conv_bn_init':
+                param_initializers['gamma'] = tf.constant_initializer(self.bn_gamma_initial_value)
+            else:
+                raise ValueError("--bn_init_mode must be 'conv_bn_init' or 'adv_bn_init' ")
+
+            x = self.batch_norm(x)
+            x = activation(x) if activation is not None else x
+        return x
+
+    def conv2d_linear_last_bn(self, inputs, *args, **kwargs):
+        x = tf.layers.conv2d(
+            inputs, data_format=self.data_format,
+            use_bias=False,
+            kernel_initializer=self.conv_initializer,
+            activation=None, *args, **kwargs)
+        param_initializers = {
+            'moving_mean': tf.zeros_initializer(),
+            'moving_variance': tf.ones_initializer(),
+            'beta': tf.zeros_initializer(),
+        }
+        if self.bn_init_mode == 'adv_bn_init':
+            param_initializers['gamma'] = tf.zeros_initializer()
+        elif self.bn_init_mode == 'conv_bn_init':
+            param_initializers['gamma'] = tf.constant_initializer(self.bn_gamma_initial_value)
+        else:
+            raise ValueError("--bn_init_mode must be 'conv_bn_init' or 'adv_bn_init' ")    
+
+        x = self.batch_norm(x, param_initializers=param_initializers)
+        return x
+
+    def conv2d_no_act_no_bn(self, inputs, *args, **kwargs):
+        x = tf.layers.conv2d(
+            inputs, data_format=self.data_format,
+            use_bias=False,
+            kernel_initializer=self.conv_initializer,
+            activation=None, *args, **kwargs)
+        return x
+
+    def conv2d_linear(self, inputs, *args, **kwargs):
+        return self._conv2d(inputs, None, *args, **kwargs)
+
+    def conv2d(self, inputs, *args, **kwargs):
+        return self._conv2d(inputs, self.activation, *args, **kwargs)
+
+    def pad2d(self, inputs, begin, end=None):
+        if end is None:
+            end = begin
+        try:
+            _ = begin[1]
+        except TypeError:
+            begin = [begin, begin]
+        try:
+            _ = end[1]
+        except TypeError:
+            end = [end, end]
+        if self.data_format == 'channels_last':
+            padding = [[0, 0], [begin[0], end[0]], [begin[1], end[1]], [0, 0]]
+        else:
+            padding = [[0, 0], [0, 0], [begin[0], end[0]], [begin[1], end[1]]]
+        return tf.pad(inputs, padding)
+
+    def max_pooling2d(self, inputs, *args, **kwargs):
+        return tf.layers.max_pooling2d(
+            inputs, data_format=self.data_format, *args, **kwargs)
+
+    def average_pooling2d_stride_1(self, inputs, *args, **kwargs):
+     #   inputs = tf.nn.avg_pool(inputs, ksize=[1,1,1,1],strides=[1,1,1,1], padding="VALID", data_format="NHWC" )
+        return inputs
+
+    def average_pooling2d(self, inputs, *args, **kwargs):
+        inputs = tf.nn.avg_pool(inputs, ksize=[1,2,2,1],strides=[1,2,2,1], padding="VALID", data_format="NHWC" )
+        return inputs
+
+ #       return tf.layers.average_pooling2d(
+ #           inputs, data_format=self.data_format, *args, **kwargs)
+
+    def dense_linear(self, inputs, units, **kwargs):
+        return tf.layers.dense(inputs, units, activation=None)
+
+    def dense(self, inputs, units, **kwargs):
+        return tf.layers.dense(inputs, units, activation=self.activation)
+
+    def activate(self, inputs, activation=None):
+        activation = activation or self.activation
+        return activation(inputs) if activation is not None else inputs
+
+    def batch_norm(self, inputs, **kwargs):
+        all_kwargs = dict(self.batch_norm_config)
+        all_kwargs.update(kwargs)
+        data_format = 'NHWC' if self.data_format == 'channels_last' else 'NCHW'
+        bn_inputs = inputs
+        outputs = tf.contrib.layers.batch_norm(
+            inputs, is_training=self.training, data_format=data_format,
+            fused=True, **all_kwargs)
+
+        return outputs
+
+    def spatial_average2d(self, inputs):
+        shape = inputs.get_shape().as_list()
+        if self.data_format == 'channels_last':
+            n, h, w, c = shape
+        else:
+            n, c, h, w = shape
+        n = -1 if n is None else n
+        x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1),
+                                        data_format=self.data_format)
+        return tf.reshape(x, [n, c])
+
+    def flatten2d(self, inputs):
+        x = inputs
+        if self.data_format != 'channel_last':
+            # Note: This ensures the output order matches that of NHWC networks
+            x = tf.transpose(x, [0, 2, 3, 1])
+        input_shape = x.get_shape().as_list()
+        num_inputs = 1
+        for dim in input_shape[1:]:
+            num_inputs *= dim
+        return tf.reshape(x, [-1, num_inputs], name='flatten')
+
+    def residual2d(self, inputs, network, units=None, scale=1.0, activate=False):
+        outputs = network(inputs)
+        c_axis = -1 if self.data_format == 'channels_last' else 1
+        h_axis = 1 if self.data_format == 'channels_last' else 2
+        w_axis = h_axis + 1
+        ishape, oshape = [y.get_shape().as_list() for y in [inputs, outputs]]
+        ichans, ochans = ishape[c_axis], oshape[c_axis]
+        strides = ((ishape[h_axis] - 1) // oshape[h_axis] + 1,
+                   (ishape[w_axis] - 1) // oshape[w_axis] + 1)
+        with tf.name_scope('residual'):
+            if (ochans != ichans or strides[0] != 1 or strides[1] != 1):
+                inputs = self.conv2d_linear(inputs, units, 1, strides, 'SAME')
+            x = inputs + scale * outputs
+            if activate:
+                x = self.activate(x)
+        return x
+
+
+def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, filters, arch_type,
+                         basic=False):
+    num_inputs = inputs.get_shape().as_list()[3]
+    x = inputs
+    #with tf.name_scope('resnet_model'):
+    if depth == num_inputs:
+        if stride == 1:#v1.5
+            shortcut = x
+        else:#v1
+            shortcut = builder.max_pooling2d(x, 1, stride)
+    else: # the downsample(first) block in each layer
+        if 'D1' in arch_type:
+            if stride == 1:
+              shortcut = builder.average_pooling2d_stride_1(x, stride, stride)             #--------------------Resnet-D------------
+            else:
+              shortcut = builder.average_pooling2d(x, stride, stride)             #--------------------Resnet-D------------
+            shortcut = builder.conv2d_linear(shortcut, depth, 1, 1, 'SAME')
+        elif 'D2' in arch_type:
+            shortcut = builder.conv2d_linear(x, depth, 3, stride, 'SAME')
+        elif 'D3' in arch_type:
+            shortcut = builder.conv2d_linear(x, depth, 1, 1, 'SAME')
+            shortcut = builder.average_pooling2d(shortcut, stride, stride)             #--------------------Resnet-D------------
+        else:
+            shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')
+        conv_input = x
+
+    if basic:
+        x = builder.pad2d(x, 1)
+        x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
+        x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
+    else:
+        conv_input = x
+        x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME')
+        conv_input = x
+        if stride == 1:
+            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
+        else:
+            if 'E1' in arch_type:
+                x = builder.average_pooling2d( x, stride, stride )
+                x = builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME')
+            elif 'E2' in arch_type:
+                x = builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME')
+                if stride == 1:
+                  x = builder.average_pooling2d_stride_1( x, stride, stride )
+                else:
+                  x = builder.average_pooling2d( x, stride, stride )
+            else:  # E0
+                x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
+            
+        # x = builder.conv2d_linear(x, depth,            1, 1,      'SAME')
+        conv_input = x
+        x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME')
+
+    x = tf.nn.relu(x + shortcut)
+    return x
+
+
+
+
+def resnext_bottleneck(builder, inputs, depth, depth_bottleneck, stride, filters, arch_type,
+                         basic=False):
+    num_inputs = inputs.get_shape().as_list()[3]
+    x = inputs
+    with tf.name_scope('resnet_v1'):
+        if depth == num_inputs:
+            if stride == 1:#v1.5
+                shortcut = x
+            else:#v1
+                shortcut = builder.max_pooling2d(x, 1, stride)
+        else: # the downsample(first) block in each layer
+            shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')
+        if basic:
+            x = builder.pad2d(x, 1)
+            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
+            x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
+        else:
+
+            #----- split layer ------
+            x = builder.conv2d( x, depth_bottleneck, 1, 1, 'SAME' )   
+
+            group_inputs = tf.split( x, _Cardi, axis=3 )
+
+            layers_split=[]
+            tmp = x
+            for i in range(_Cardi):
+              with tf.name_scope('cardi_'+str(i)):
+                split = builder.conv2d_no_act_no_bn( group_inputs[i], depth_bottleneck/_Cardi, 3, stride, 'SAME' )
+                layers_split.append(split)
+
+            x = tf.concat(layers_split, axis=3)
+            x = builder.batch_norm(x)
+            x = tf.nn.relu(x)
+
+            x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME')
+        x = tf.nn.relu(x + shortcut)
+        return x
+
+
+
+
+
+
+def resnet_bottleneck_v2(builder, inputs, depth, depth_bottleneck, stride, filters, arch_type,
+                         basic=False):
+    num_inputs = inputs.get_shape().as_list()[1]
+    x = inputs
+    with tf.name_scope('resnet_v1'):
+        # ------- shortcut ---------------
+        if depth == num_inputs:
+            if stride == 1:#v1.5
+                shortcut = x
+                x = builder.batch_norm(x)
+                x = tf.nn.relu(x)
+            else:#v1
+                shortcut = builder.max_pooling2d(x, 1, stride)
+        else: # the downsample(first) block in each layer
+            x = builder.batch_norm(x)
+            x = tf.nn.relu(x)
+
+            if 'D1' in arch_type:
+                shortcut = builder.average_pooling2d(x, stride, stride)             #--------------------Resnet-D------------
+                shortcut = builder.conv2d_linear(shortcut, depth, 1, 1, 'SAME')
+            elif 'D2' in arch_type:
+                shortcut = builder.conv2d_linear(x, depth, 3, stride, 'SAME')
+            elif 'D3' in arch_type:
+                shortcut = builder.conv2d_linear(x, depth, 1, 1, 'SAME')
+                shortcut = builder.average_pooling2d(shortcut, stride, stride)             #--------------------Resnet-D------------
+            else:
+                shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')
+
+        # -------- mainstream ----------------
+        if basic:
+            x = builder.pad2d(x, 1)
+            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
+            x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
+        else:
+            x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME')
+            x = builder.batch_norm(x)
+            x = tf.nn.relu(x)
+
+            if stride == 1:
+                x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
+                x = builder.batch_norm(x)
+                x = tf.nn.relu(x)
+            else:
+                if 'E1' in arch_type:
+                    x = builder.average_pooling2d( x, stride, stride )
+                    x = builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME')
+                    x = builder.batch_norm(x)
+                    x = tf.nn.relu(x)
+                elif 'E2' in arch_type:
+                    x = builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME')
+                    x = builder.batch_norm(x)
+                    x = tf.nn.relu(x)
+                    x = builder.average_pooling2d( x, stride, stride )
+                else:  # E0
+                    x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
+                    x = builder.batch_norm(x)
+                    x = tf.nn.relu(x)
+            
+            x = builder.conv2d_linear(x, depth, 1, 1, 'SAME')
+
+
+        x = x + shortcut
+        return x
+        
+def inference_resnext_impl(builder, inputs, layer_counts, arch_type='C1+D', resnet_version='v1.5', basic=False):
+    x = inputs
+    #x = builder.batch_norm(x)
+    x = builder.pad2d(x, 3)
+    x = builder.conv2d(x, 64, 7, 2, 'VALID')
+    #x = builder.conv2d(x, 64, 7, 2, 'SAME')
+    
+
+    num_filters=64
+    x = builder.max_pooling2d(x, 3, 2, 'SAME')
+    #x, argmax = tf.nn.max_pool_with_argmax(input=x, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
+
+    for i in range(layer_counts[0]):
+        x = resnext_bottleneck(builder, x, 256, 128, 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[1]):
+        num_filters=num_filters*2
+        x = resnext_bottleneck(builder, x, 512, 256, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[2]):
+        num_filters=num_filters*2
+        x = resnext_bottleneck(builder, x, 1024, 512, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[3]):
+        num_filters=num_filters*2
+        x = resnext_bottleneck(builder, x, 2048, 1024, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    print ('====================Final x:', x)
+       
+
+    axes = [1,2]
+    x = tf.reduce_mean( x, axes, keepdims=True )		
+    x = tf.identity(x, 'final_reduce_mean')
+    x = tf.reshape( x, [-1, 2048] )
+    x = tf.layers.dense(inputs=x, units=1001,kernel_initializer= tf.variance_scaling_initializer() )
+    x = tf.identity( x, 'final_dense' )
+    return x       
+        
+
+def inference_resnet_v1_impl(builder, inputs, layer_counts, arch_type='C1+D', resnet_version='v1.5', basic=False):
+    x = inputs
+    #x = builder.pad2d(x, 1)
+
+    if 'C1' in arch_type:  # --- Resnet C -----
+        x = builder.conv2d(x, 32, 3, 2, 'SAME')
+        x = builder.conv2d(x, 32, 3, 1, 'SAME')
+        x = builder.conv2d(x, 64, 3, 1, 'SAME')
+    elif 'C2' in arch_type:  
+        x = builder.conv2d(x, 32, 3, 1, 'SAME')
+        x = builder.conv2d(x, 32, 3, 2, 'VALID')
+        x = builder.conv2d(x, 64, 3, 1, 'VALID')
+    elif 'C3' in arch_type:  
+        x = builder.conv2d(x, 32, 3, 1, 'VALID')
+        x = builder.conv2d(x, 32, 3, 1, 'VALID')
+        x = builder.conv2d(x, 64, 3, 2, 'VALID')
+    else:
+        x = builder.conv2d(x, 64, 7, 2, 'SAME')
+
+    num_filters=64
+
+    pooled_inputs = x
+    #x = builder.max_pooling2d(x, 3, 2, 'SAME')
+    x, argmax = tf.nn.max_pool_with_argmax(input=x, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')
+
+    for i in range(layer_counts[0]):
+        x = resnet_bottleneck_v1(builder, x, 256, 64, 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[1]):
+        num_filters=num_filters*2
+        x = resnet_bottleneck_v1(builder, x, 512, 128, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[2]):
+        num_filters=num_filters*2
+        x = resnet_bottleneck_v1(builder, x, 1024, 256, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[3]):
+        num_filters=num_filters*2
+        x = resnet_bottleneck_v1(builder, x, 2048, 512, 2 if i == 0 else 1, num_filters, arch_type, basic)
+
+    axes = [1,2]
+    x = tf.reduce_mean( x, axes, keepdims=True )		
+    x = tf.identity(x, 'final_reduce_mean')
+    x = tf.reshape( x, [-1, 2048] )
+    x = tf.layers.dense(inputs=x, units=1001,kernel_initializer=tf.random_normal_initializer(stddev=0.01))
+    x = tf.identity( x, 'final_dense' )
+    return x
+
+def inference_resnet_v2_impl(builder, inputs, layer_counts, arch_type='C1+D', basic=False):
+    x = inputs
+    x = builder.pad2d(x, 3)
+
+    if 'C1' in arch_type:  # --- Resnet C -----
+        x = builder.conv2d(x, 32, 3, 2, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+        x = builder.conv2d(x, 32, 3, 1, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+        x = builder.conv2d(x, 64, 3, 1, 'SAME')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+    elif 'C2' in arch_type:  
+        x = builder.conv2d(x, 32, 3, 1, 'SAME')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+        x = builder.conv2d(x, 32, 3, 2, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+        x = builder.conv2d(x, 64, 3, 1, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+    elif 'C3' in arch_type:  
+        x = builder.conv2d(x, 32, 3, 1, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+        x = builder.conv2d(x, 32, 3, 1, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+        x = builder.conv2d(x, 64, 3, 2, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+    else:
+        x = builder.conv2d(x, 64, 7, 2, 'VALID')
+        x = builder.batch_norm(x)
+        x = tf.nn.relu(x)
+
+    num_filters=64
+
+    pooled_inputs = x
+    x = builder.max_pooling2d(x, 3, 2, 'SAME')
+
+    for i in range(layer_counts[0]):
+        x = resnet_bottleneck_v2(builder, x, 256, 64, 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[1]):
+        num_filters=num_filters*2
+        x = resnet_bottleneck_v2(builder, x, 512, 128, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[2]):
+        num_filters=num_filters*2
+        x = resnet_bottleneck_v2(builder, x, 1024, 256, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    for i in range(layer_counts[3]):
+        num_filters=num_filters*2
+        x = resnet_bottleneck_v2(builder, x, 2048, 512, 2 if i == 0 else 1, num_filters, arch_type, basic)
+    return builder.spatial_average2d(x)
+
+def inference_resnet_v1(config, inputs, nlayer, data_format='channels_last',
+                        training=False, conv_initializer=None, bn_init_mode='adv_bn_init', bn_gamma_initial_value=1.0 ):
+    """Deep Residual Networks family of models
+    https://arxiv.org/abs/1512.03385
+    """
+    if config['resnet_version'] == 'v1.5':
+        builder = LayerBuilder(tf.nn.relu, data_format, training, use_batch_norm=True,
+                               conv_initializer=conv_initializer, bn_init_mode=bn_init_mode, bn_gamma_initial_value=bn_gamma_initial_value)
+        if nlayer == 18:
+            return inference_resnet_v1_impl(builder, inputs, [2, 2, 2, 2], config['arch_type'], config['resnet_version'], basic=True)
+        elif nlayer == 34:
+            return inference_resnet_v1_impl(builder, inputs, [3, 4, 6, 3], config['arch_type'], config['resnet_version'], basic=True)
+        elif nlayer == 50:
+            return inference_resnet_v1_impl(builder, inputs, [3, 4, 6, 3], config['arch_type'], config['resnet_version'])
+        elif nlayer == 101:
+            return inference_resnet_v1_impl(builder, inputs, [3, 4, 23, 3], config['arch_type'], config['resnet_version'])
+        elif nlayer == 152:
+            return inference_resnet_v1_impl(builder, inputs, [3, 8, 36, 3], config['arch_type'], config['resnet_version'])
+        else:
+            raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" %
+                             nlayer)
+
+    elif config['resnet_version'] == 'v2':
+        builder = LayerBuilder( None, data_format, training, use_batch_norm=False,
+                               conv_initializer=conv_initializer, bn_init_mode=bn_init_mode, bn_gamma_initial_value=bn_gamma_initial_value)
+        if nlayer == 18:
+            return inference_resnet_v2_impl(builder, inputs, [2, 2, 2, 2], config['arch_type'], basic=True)
+        elif nlayer == 34:
+            return inference_resnet_v2_impl(builder, inputs, [3, 4, 6, 3], config['arch_type'], basic=True)
+        elif nlayer == 50:
+            return inference_resnet_v2_impl(builder, inputs, [3, 4, 6, 3], config['arch_type'])
+        elif nlayer == 101:
+            return inference_resnet_v2_impl(builder, inputs, [3, 4, 23, 3], config['arch_type'])
+        elif nlayer == 152:
+            return inference_resnet_v2_impl(builder, inputs, [3, 8, 36, 3], config['arch_type'])
+        else:
+            raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" %
+                             nlayer)
+                             
+    elif config['resnet_version'] == 'resnext':
+        builder = LayerBuilder( tf.nn.relu, data_format, training, use_batch_norm=True,
+                               conv_initializer=conv_initializer, bn_init_mode=bn_init_mode, bn_gamma_initial_value=bn_gamma_initial_value)
+        if nlayer == 18:
+            return inference_resnext_impl(builder, inputs, [2, 2, 2, 2], config['arch_type'], basic=True)
+        elif nlayer == 34:
+            return inference_resnext_impl(builder, inputs, [3, 4, 6, 3], config['arch_type'], basic=True)
+        elif nlayer == 50:
+            return inference_resnext_impl(builder, inputs, [3, 4, 6, 3], config['arch_type'])
+        elif nlayer == 101:
+            return inference_resnext_impl(builder, inputs, [3, 4, 23, 3], config['arch_type'])
+        elif nlayer == 152:
+            return inference_resnext_impl(builder, inputs, [3, 8, 36, 3], config['arch_type'])
+        else:
+            raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" %
+                             nlayer)                             
+                             
+                             
+    else:
+        raise ValueError("Invalid resnet version")
+   
+
+
@@ -0,0 +1,228 @@
+import six
+import tensorflow as tf
+
+class Optimizer: 
+    def __init__(self, config):
+        self.config = config 
+
+    def get_lbs_optimizer(self, opt):  #TODO input is ( self, hyper_param )
+
+        # opt = LargeBatchSizeOptimizer(opt, weight_decay=self.config['weight_decay'], 
+        #                                    accum_dtype = self.config['dtype'],
+        #                                    use_lars = self.config['use_lars'],
+        #                                    bn_lr_scale = self.config.get('bn_lr_scale', 1.0)
+        #                                 )
+        opt = MixedPrecisionOptimizer(opt, self.config) 
+
+        return opt
+
+class MixedPrecisionOptimizer(tf.train.Optimizer):
+    """An optimizer that updates trainable variables in fp32."""
+
+    def __init__(self, optimizer, config):
+        super(MixedPrecisionOptimizer, self).__init__(
+            optimizer._use_locking,
+            optimizer._name + '-MP',
+        )
+        self._optimizer = optimizer
+        self._config = config
+        loss_scale=self._config['loss_scale']
+        self._loss_scale = float(loss_scale)
+        self._fp32_to_fp16 = {}
+
+        var_list = (
+                tf.trainable_variables() +
+                tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+        with tf.device('/gpu:0'):
+            self.var_fp32_copy = [ tf.Variable( tf.cast(v.initialized_value(), tf.float32), 
+                                    dtype=tf.float32, trainable=False, 
+                                    collections=[tf.GraphKeys.GLOBAL_VARIABLES, "FP32_MASTER_COPIES"] ) for v in var_list ]
+
+    def compute_gradients(self, loss, var_list=None,
+                            gate_gradients=tf.train.Optimizer.GATE_OP,
+                            aggregation_method=None,
+                            colocate_gradients_with_ops=False,
+                            grad_loss=None):
+        if var_list is None:
+            var_list = (
+                    tf.trainable_variables() +
+                    tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+
+        if self._loss_scale != 1.0:
+            loss = tf.scalar_mul(self._loss_scale, loss)
+
+        grads_and_vars_fp16 = self._optimizer.compute_gradients(
+            loss, var_list=var_list,
+            gate_gradients=gate_gradients,
+            aggregation_method=aggregation_method,
+            colocate_gradients_with_ops=colocate_gradients_with_ops,
+            grad_loss=grad_loss,
+        )
+        # creating FP-32 variables and filling the fp32 dict
+        grads_and_vars_fp32 = []
+
+        with tf.variable_scope('FP32-master-copy'):
+            for i, (grad, var) in enumerate(grads_and_vars_fp16):
+                if grad is not None:
+                    if var.dtype.base_dtype == tf.float16:
+                        fp32_var = self.var_fp32_copy[i]
+                        self._fp32_to_fp16[fp32_var.name] = var
+                        fp32_grad = tf.cast(grad, tf.float32)
+                        grads_and_vars_fp32.append((fp32_grad, fp32_var))
+                    else:
+                        grads_and_vars_fp32.append((grad, var))
+                else:
+                    grads_and_vars_fp32.append((None, var))
+
+        grads_and_vars_fp32_rescaled = [ (g/self._loss_scale, v)  for g,v in grads_and_vars_fp32 ]
+
+
+        return grads_and_vars_fp32_rescaled
+
+    def apply_gradients(self, grads_and_vars, *args, **kwargs):
+        update_op = self._optimizer.apply_gradients(grads_and_vars, *args, **kwargs)
+        apply_ops = []
+        with tf.control_dependencies([update_op]):
+            for grad, var in grads_and_vars:
+                if var.name in self._fp32_to_fp16:
+                    dst_var = self._fp32_to_fp16[var.name]
+                    apply_ops.append(
+                        tf.assign(dst_var, tf.saturate_cast(var, tf.float16)))
+        if apply_ops:
+            return tf.group(apply_ops)
+        return update_op
+
+
+class LargeBatchSizeOptimizer(tf.train.Optimizer):
+    """ LARC implementation
+        -------------------
+        Parameters:
+          - optimizer:     initial optimizer that you wanna apply
+                           example: tf.train.MomentumOptimizer
+          - learning_rate: initial learning_rate from initial optimizer
+          - clip:          if True apply LARC otherwise LARS
+          - epsilon:       default value is weights or grads are 0.
+          - name
+          - use_locking
+    """
+
+    def __init__(self, optimizer, weight_decay, clip=True, epsilon=1., accum_dtype=tf.float16, use_lars=True, bn_lr_scale=1.0,
+                 name="LarcOptimizer", use_locking=False):
+        super(LargeBatchSizeOptimizer, self).__init__(
+            name=name, use_locking=use_locking)
+        self._optimizer = optimizer
+      #  self._learning_rate = learning_rate
+        self._weight_decay = weight_decay
+        self._clip = clip
+        self._epsilon = float(epsilon)
+        self._accum_dtype=accum_dtype
+        self._use_lars=use_lars
+        self._bn_lr_scale=bn_lr_scale 
+
+        var_list = (
+                tf.trainable_variables() +
+                tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+        with tf.device('/gpu:0'):
+            self._grads_accum = [ tf.Variable( tf.cast(tf.zeros_like(v.initialized_value()), self._accum_dtype), dtype=self._accum_dtype, trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES] ) for v in var_list ]
+
+        
+    def compute_gradients(self, *args, **kwargs):
+        return self._optimizer.compute_gradients(*args, **kwargs)
+
+
+    def apply_gradients(self, gradvars, loss_scale, *args, **kwargs):
+
+        global_step = tf.train.get_global_step()
+
+        grads_and_vars_clean = []
+        for grad, var in gradvars:
+            if grad is not None:
+                grads_and_vars_clean.append( (grad, var) )
+
+        processed_grads_and_vars = self.post_process_grads(grads_and_vars_clean, loss_scale) # post_process_grads includes Lars
+
+        def apply():
+            red_grad_updates = self._optimizer.apply_gradients( processed_grads_and_vars, global_step=tf.train.get_global_step() ) 
+            return tf.group(red_grad_updates)
+
+        update_weight_op_1 = apply()
+        return update_weight_op_1 
+
+        apply_gradients_op = update_weight_op_1
+
+        with tf.device('/cpu:0'):
+            #tf.summary.scalar('loss_scale', loss_scale)
+            for grad, var in gradvars:
+                g = grad / loss_scale
+                v_norm_2 = tf.norm(var, ord='euclidean')
+                g_norm_2 = tf.norm(g, ord='euclidean')
+                v_g_norm2_ratio = v_norm_2 / (
+                        g_norm_2 + self._weight_decay * v_norm_2)
+                if grad is not None:
+                    if 'BatchNorm' in var.name:
+                        with tf.name_scope('bn_norm2/'):
+                            tf.summary.scalar(var.name + '/norm2',
+                                              v_norm_2)
+                        with tf.name_scope('grad_bn_norm2/'):
+                            tf.summary.scalar(var.name + '/grad_norm2',
+                                              g_norm_2)
+                        with tf.name_scope('bn_ratio_var_grad/'):
+                            tf.summary.scalar(var.name + '/ratio_var_grad',
+                                              v_g_norm2_ratio)
+                    else:
+                        with tf.name_scope('conv_norm2/'):
+                            tf.summary.scalar(var.name + '/norm2',
+                                              v_norm_2)
+                        with tf.name_scope('grad_conv_norm2/'):
+                            tf.summary.scalar(var.name + '/grad_norm2',
+                                              g_norm_2)
+                        with tf.name_scope('conv_ratio_var_grad/'):
+                            tf.summary.scalar(var.name + '/ratio_var_grad',
+                                              v_g_norm2_ratio)
+
+        return apply_gradients_op
+
+    def post_process_grads(self, grads_and_vars, loss_scale):
+
+        g_and_v_scaled = []
+        for g, v in grads_and_vars:
+            g = g / loss_scale
+            g_and_v_scaled.append((g,v))
+
+        # Lars
+        if self._use_lars:
+            grad_var_list = []
+            #-----------------------------------------------LARS and weight decay-----------------------------------
+            for g, var in  g_and_v_scaled:
+                if 'BatchNorm' not in var.name and 'bias' not in var.name:
+                    grad_norm = tf.norm(g,ord='euclidean') 
+                    weight_norm = tf.norm(var,ord='euclidean')
+                    
+                    grad_norm_wd = tf.add( grad_norm,  tf.multiply( self._weight_decay, weight_norm ) )
+                    rescale_factor = tf.div( tf.multiply(0.001, weight_norm), tf.add(grad_norm_wd, tf.constant(1e-5, tf.float32)) )
+
+                    coeffi = tf.clip_by_value( rescale_factor, 0.001, 50.0 )
+                    decayed_g = tf.add( g, tf.multiply( self._weight_decay, var ) )
+
+                    g = tf.multiply(coeffi, decayed_g) 
+                else:
+                    g = self._bn_lr_scale * g
+
+                g_and_v = ( g, var )
+                grad_var_list.append( g_and_v )
+            #-------------------------------------------LARS end---------------------------------
+            return grad_var_list
+        else:
+            grad_var_list_without_lars = []
+            #----------------------------------------weight decay-----------------------------------
+            for g, var in  g_and_v_scaled:
+                if 'BatchNorm' not in var.name and 'bias' not in var.name:
+                    decayed_g = tf.add( g, tf.multiply( self._weight_decay, var ) )
+                    g = decayed_g
+                else:
+                    g = self._bn_lr_scale * g
+
+                g_and_v = ( g, var )
+                grad_var_list_without_lars.append( g_and_v )
+
+            return grad_var_list_without_lars
@@ -0,0 +1,172 @@
+import tensorflow as tf
+import math
+import time
+from . import train_helper
+from .train_helper import stage
+from utils.logger import rank0log
+
+from tensorflow.contrib.offline_train.python.npu.npu_config import NPURunConfig
+from tensorflow.contrib.offline_train.python.npu.npu_estimator import NPUEstimator
+from tensorflow.contrib.offline_train.python.npu.npu_optimizer import NPUDistributedOptimizer
+
+class GPUBaseTrain(object):
+    """Drives training and evaluation through a TensorFlow Estimator.
+
+    A ``tf.estimator.Estimator`` is built when ``config['accelerator']`` is
+    ``'gpu'``; any other value selects an ``NPUEstimator``.
+
+    Args:
+        session: object exposing ``get_config()``; used for the GPU estimator.
+        config: dict-like run configuration (read with ``[]`` and ``.get``).
+        data: object exposing ``get_train_input_fn()`` / ``get_eval_input_fn()``.
+        model: object exposing ``get_estimator_model_func``.
+        logger: session hook that also carries a ``.logger`` attribute.
+    """
+
+    # Header and row layout shared by both evaluation reports.
+    _REPORT_HEADER = ' step  epoch  top1    top5     loss   checkpoint_time(UTC)'
+    _REPORT_ROW = '{:5d}  {:5.1f}  {:5.3f}  {:6.2f}  {:6.2f}  {time}'
+
+    def __init__(self, session, config, data, model, logger):
+        self.sess = session
+        self.config = config
+        self.data = data
+        self.model = model
+        self.logger = logger
+        self.print_logger = self.logger.logger
+        self.all_preds = []
+        self.all_targets = []
+        # The NPU classes are already imported at module level, so no
+        # function-local import is needed here.
+        if self.config['accelerator'] == 'gpu':
+            self.classifier, self.training_hook = self.get_classifier()
+        else:
+            self.classifier, self.training_hook = self.get_npu_classifier()
+
+    def get_classifier(self):
+        """Return ``(estimator, hooks)`` for the plain TensorFlow path."""
+        checkpointing = self.config['do_checkpoint']
+        run_config = tf.estimator.RunConfig(
+            session_config=self.sess.get_config(),
+            save_summary_steps=self.config['save_summary_steps'] if checkpointing else None,
+            save_checkpoints_steps=self.config['save_checkpoints_steps'] if checkpointing else None,
+            keep_checkpoint_max=None)
+        classifier = tf.estimator.Estimator(
+            model_fn=self.model.get_estimator_model_func,
+            model_dir=self.config['log_dir'],
+            config=run_config)
+
+        training_hooks = [train_helper.PrefillStagingAreasHook(), self.logger]
+        return classifier, training_hooks
+
+    def get_npu_classifier(self):
+        """Return ``(estimator, hooks)`` for the NPU path.
+
+        Hooks are attached only when ``config['debug']`` is truthy.
+        """
+        session_config = tf.ConfigProto(
+            inter_op_parallelism_threads=10,
+            intra_op_parallelism_threads=10,
+            allow_soft_placement=True)
+
+        # Options shared by the debug and the non-debug run configuration.
+        run_config_kwargs = dict(
+            enable_auto_mix_precision=True,
+            enable_data_pre_proc=True,
+            session_config=session_config,
+            model_dir=self.config['model_dir'],
+            iterations_per_loop=self.config['iterations_per_loop'])
+        if self.config['debug']:
+            run_config_kwargs.update(
+                save_checkpoints_steps=112590,
+                keep_checkpoint_max=5)
+        else:
+            run_config_kwargs.update(
+                save_summary_steps=0,
+                log_step_count_steps=None,
+                save_checkpoints_secs=1e9)
+        run_config = NPURunConfig(**run_config_kwargs)
+
+        classifier = NPUEstimator(
+            model_fn=self.model.get_estimator_model_func,
+            config=run_config)
+
+        training_hooks = []
+        if self.config['debug']:
+            training_hooks = [train_helper.PrefillStagingAreasHook(), self.logger]
+
+        return classifier, training_hooks
+
+    def _epoch_of(self, step):
+        """Return the epoch (rounded up) that global step ``step`` belongs to.
+
+        One epoch is ``num_training_samples / (batch_size * rank_size)`` steps;
+        ``rank_size`` defaults to 1 when the configuration does not set it.
+        """
+        global_batch = self.config['batch_size'] * self.config.get('rank_size', 1)
+        return math.ceil(step / (self.config['num_training_samples'] / global_batch))
+
+    def _store_eval_result(self, ckpt, eval_result):
+        """Copy epoch and evaluation metrics into the checkpoint record."""
+        ckpt['epoch'] = self._epoch_of(ckpt['step'])
+        ckpt['top1'] = eval_result['val-top1acc']
+        ckpt['top5'] = eval_result['val-top5acc']
+        ckpt['loss'] = eval_result['loss']
+
+    def _log_report_row(self, ckpt):
+        """Log one formatted report line for an evaluated checkpoint."""
+        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ckpt['mtime']))
+        rank0log(self.print_logger,
+                 self._REPORT_ROW.format(ckpt['step'],
+                                         ckpt['epoch'],
+                                         ckpt['top1'] * 100,
+                                         ckpt['top5'] * 100,
+                                         ckpt['loss'],
+                                         time=stamp))
+
+    def train(self):
+        """Train until the global step reaches ``config['nstep']``."""
+        print ('training steps: %d' % self.config['nstep'])
+        self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
+                              max_steps=self.config['nstep'],
+                              hooks=self.training_hook)
+
+    def evaluate(self):
+        """Evaluate saved checkpoints and log a summary table.
+
+        Every ``config['eval_interval']``-th checkpoint found in
+        ``config['log_dir']`` is evaluated, and the last one always is.
+        A ``KeyboardInterrupt`` is logged and swallowed.
+        """
+        rank0log(self.print_logger, "Evaluating")
+        rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
+        time.sleep(5)  # a little extra margin...
+        try:
+            ckpts = train_helper.sort_and_load_ckpts(self.config['log_dir'])
+            last_index = len(ckpts) - 1
+            for i, c in enumerate(ckpts):
+                if i < last_index and i % self.config['eval_interval'] != 0:
+                    continue
+                eval_result = self.classifier.evaluate(
+                    input_fn=lambda: self.data.get_eval_input_fn(),
+                    checkpoint_path=c['path'])
+                self._store_eval_result(c, eval_result)
+
+            rank0log(self.print_logger, self._REPORT_HEADER)
+            for c in ckpts:
+                if 'top1' in c:  # skipped checkpoints carry no metrics
+                    self._log_report_row(c)
+            rank0log(self.print_logger, "Finished evaluation")
+        except KeyboardInterrupt:
+            self.print_logger.error("Keyboard interrupt")
+
+    def train_and_evaluate(self):
+        """Alternate training and evaluation until the accuracy target is met.
+
+        Runs ``num_epochs // epochs_between_evals`` cycles. Each cycle trains
+        for ``epochs_between_evals`` epochs and evaluates the newest checkpoint
+        in ``config['log_dir']``.
+
+        Returns:
+            True when top-1 accuracy (in percent) exceeded
+            ``config['stop_threshold']`` (default 74.9), otherwise False.
+        """
+        success = False
+        epochs_between_evals = self.config.get('epochs_between_evals', 4)
+
+        for _ in range(self.config['num_epochs'] // epochs_between_evals):
+
+            rank0log(self.print_logger, "Starting a training cycle")
+
+            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
+                                  steps=self.config['nsteps_per_epoch'] * epochs_between_evals,
+                                  hooks=self.training_hook)
+
+            rank0log(self.print_logger, "Starting to evaluate")
+            rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
+            time.sleep(5)  # a little extra margin...
+
+            # NOTE(review): the NPU estimator is built with config['model_dir']
+            # while checkpoints are looked up in config['log_dir'] - confirm
+            # both point at the same directory.
+            ckpts = train_helper.sort_and_load_ckpts(self.config['log_dir'])
+            c = ckpts[-1]
+            eval_result = self.classifier.evaluate(
+                input_fn=lambda: self.data.get_eval_input_fn(),
+                checkpoint_path=c['path'])
+
+            # The device count comes from the configuration: no Horovod
+            # module is imported in this file, so hvd.size() cannot be used.
+            self._store_eval_result(c, eval_result)
+
+            rank0log(self.print_logger, self._REPORT_HEADER)
+            self._log_report_row(c)
+
+            if eval_result['val-top1acc']*100 > self.config.get('stop_threshold', 74.9):
+                success = True
+                break
+
+        return success
+
+
+
@@ -0,0 +1,253 @@
+import os
+
+import sys
+import tensorflow as tf
+import math
+import time
+from . import train_helper
+from .train_helper import stage
+from utils.logger import rank0log
+# add by zwx5326390
+from datetime import datetime
+# import hwlog
+from benchmark_log import hwlog
+
+#from tensorflow.contrib.offline_train.python.npu.npu_config import NPURunConfig
+#from tensorflow.contrib.offline_train.python.npu.npu_estimator import NPUEstimator
+#from tensorflow.contrib.offline_train.python.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+
+# remark_logger = hwlog.get_logger(__file__, "hw_Resnext50.log")
+# file_name = hwlog.get_file_name(__file__)
+
+class GPUBaseTrain(object):
+    """Drives training and evaluation through a TensorFlow Estimator.
+
+    A ``tf.estimator.Estimator`` is built when ``config['accelerator']`` is
+    ``'gpu'``; any other value selects an ``NPUEstimator``. Epoch and accuracy
+    marks are additionally emitted through ``hwlog.remark_print``.
+
+    Args:
+        session: object exposing ``get_config()``; used for the GPU estimator.
+        config: dict-like run configuration (read with ``[]`` and ``.get``).
+        data: object exposing ``get_train_input_fn()`` / ``get_eval_input_fn()``.
+        model: object exposing ``get_estimator_model_func``.
+        logger: session hook that also carries a ``.logger`` attribute.
+    """
+
+    # Header and row layout shared by both evaluation reports.
+    _REPORT_HEADER = ' step  epoch  top1    top5     loss   checkpoint_time(UTC)'
+    _REPORT_ROW = '{:5d}  {:5.1f}  {:5.3f}  {:6.2f}  {:6.2f}  {time}'
+
+    def __init__(self, session, config, data, model, logger):
+        self.sess = session
+        self.config = config
+        self.data = data
+        self.model = model
+        self.logger = logger
+        self.print_logger = self.logger.logger
+        self.all_preds = []
+        self.all_targets = []
+
+        # The npu_bridge classes are already imported at module level, so no
+        # function-local import is needed here.
+        if self.config['accelerator'] == 'gpu':
+            self.classifier, self.training_hook = self.get_classifier()
+        else:
+            self.classifier, self.training_hook = self.get_npu_classifier()
+
+    def get_classifier(self):
+        """Return ``(estimator, hooks)`` for the plain TensorFlow path."""
+        checkpointing = self.config['do_checkpoint']
+        run_config = tf.estimator.RunConfig(
+            session_config=self.sess.get_config(),
+            save_summary_steps=self.config['save_summary_steps'] if checkpointing else None,
+            save_checkpoints_steps=self.config['save_checkpoints_steps'] if checkpointing else None,
+            keep_checkpoint_max=None)
+        classifier = tf.estimator.Estimator(
+            model_fn=self.model.get_estimator_model_func,
+            model_dir=self.config['log_dir'],
+            config=run_config)
+
+        training_hooks = [train_helper.PrefillStagingAreasHook(), self.logger]
+        return classifier, training_hooks
+
+    def get_npu_classifier(self):
+        """Return ``(estimator, hooks)`` for the NPU path.
+
+        Hooks are attached only when ``config['debug']`` is truthy.
+        """
+        session_config = tf.ConfigProto(
+            inter_op_parallelism_threads=10,
+            intra_op_parallelism_threads=10,
+            allow_soft_placement=True)
+        print (" config.debug:")
+        print ( self.config['debug'])
+        print (self.config['log_dir'])
+
+        # Options shared by the debug and the non-debug run configuration.
+        run_config_kwargs = dict(
+            hcom_parallel=True,
+            precision_mode='allow_mix_precision',
+            enable_data_pre_proc=True,
+            session_config=session_config,
+            model_dir=self.config['model_dir'],
+            iterations_per_loop=self.config['iterations_per_loop'])
+        if self.config['debug']:
+            run_config_kwargs.update(
+                save_checkpoints_steps=112590,
+                keep_checkpoint_max=5)
+        else:
+            run_config_kwargs.update(
+                save_summary_steps=0,
+                log_step_count_steps=None,
+                save_checkpoints_secs=1e9)
+        run_config = NPURunConfig(**run_config_kwargs)
+
+        classifier = NPUEstimator(
+            model_fn=self.model.get_estimator_model_func,
+            config=run_config)
+
+        training_hooks = []
+        if self.config['debug']:
+            training_hooks = [train_helper.PrefillStagingAreasHook(), self.logger]
+
+        return classifier, training_hooks
+
+    def _epoch_of(self, step):
+        """Return the epoch (rounded up) that global step ``step`` belongs to.
+
+        One epoch is ``num_training_samples / (batch_size * rank_size)`` steps.
+        """
+        global_batch = self.config['batch_size'] * self.config['rank_size']
+        return math.ceil(step / (self.config['num_training_samples'] / global_batch))
+
+    def _store_eval_result(self, ckpt, eval_result):
+        """Emit the accuracy marks, then copy the metrics into ``ckpt``."""
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=float(eval_result.get("val-top1acc")))
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value=float(eval_result.get("val-top5acc")))
+
+        ckpt['epoch'] = self._epoch_of(ckpt['step'])
+        ckpt['top1'] = eval_result['val-top1acc']
+        ckpt['top5'] = eval_result['val-top5acc']
+        ckpt['loss'] = eval_result['loss']
+
+    def _log_report_row(self, ckpt):
+        """Log one formatted report line for an evaluated checkpoint."""
+        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ckpt['mtime']))
+        rank0log(self.print_logger,
+                 self._REPORT_ROW.format(ckpt['step'],
+                                         ckpt['epoch'],
+                                         ckpt['top1'] * 100,
+                                         ckpt['top5'] * 100,
+                                         ckpt['loss'],
+                                         time=stamp))
+
+    def train(self):
+        """Train until the global step reaches ``config['nstep']``."""
+        # NOTE(review): the value logged under CURRENT_EPOCH is the configured
+        # total config['num_epochs'] - confirm that is what consumers expect.
+        hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=self.config['num_epochs'])
+
+        print ('training steps: %d' % self.config['nstep'])
+        self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
+                              max_steps=self.config['nstep'],
+                              hooks=self.training_hook)
+
+    def evaluate(self):
+        """Evaluate saved checkpoints and log a summary table.
+
+        Every ``config['eval_interval']``-th checkpoint found in
+        ``config['ckpt_dir']`` is evaluated, and the last one always is.
+        A ``KeyboardInterrupt`` is logged and swallowed.
+        """
+        rank0log(self.print_logger, "Evaluating")
+        rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
+        time.sleep(5)  # a little extra margin...
+        try:
+            ckpts = train_helper.sort_and_load_ckpts(self.config['ckpt_dir'])
+            last_index = len(ckpts) - 1
+            for i, c in enumerate(ckpts):
+                if i < last_index and i % self.config['eval_interval'] != 0:
+                    continue
+                eval_result = self.classifier.evaluate(
+                    input_fn=lambda: self.data.get_eval_input_fn(),
+                    checkpoint_path=c['path'])
+                self._store_eval_result(c, eval_result)
+
+            rank0log(self.print_logger, self._REPORT_HEADER)
+            for c in ckpts:
+                if 'top1' in c:  # skipped checkpoints carry no metrics
+                    self._log_report_row(c)
+            rank0log(self.print_logger, "Finished evaluation")
+        except KeyboardInterrupt:
+            self.print_logger.error("Keyboard interrupt")
+
+    def train_and_evaluate(self):
+        """Alternate training and evaluation until the accuracy target is met.
+
+        Runs ``num_epochs // epochs_between_evals`` cycles. Each cycle trains
+        for ``epochs_between_evals`` epochs and evaluates the newest checkpoint
+        in ``config['log_dir']``.
+
+        Returns:
+            True when top-1 accuracy (in percent) exceeded
+            ``config['stop_threshold']`` (default 74.9), otherwise False.
+        """
+        success = False
+        epochs_between_evals = self.config.get('epochs_between_evals', 4)
+
+        for _ in range(self.config['num_epochs'] // epochs_between_evals):
+
+            rank0log(self.print_logger, "Starting a training cycle")
+
+            # NOTE(review): logs the configured total on every cycle rather
+            # than the running epoch - confirm this is intended.
+            hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=self.config['num_epochs'])
+
+            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
+                                  steps=self.config['nsteps_per_epoch'] * epochs_between_evals,
+                                  hooks=self.training_hook)
+
+            rank0log(self.print_logger, "Starting to evaluate")
+            rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
+            time.sleep(5)  # a little extra margin...
+
+            # NOTE(review): evaluate() reads config['ckpt_dir'] whereas this
+            # method reads config['log_dir'] - confirm both point at the
+            # directory the estimator writes checkpoints to.
+            ckpts = train_helper.sort_and_load_ckpts(self.config['log_dir'])
+            c = ckpts[-1]
+            eval_result = self.classifier.evaluate(
+                input_fn=lambda: self.data.get_eval_input_fn(),
+                checkpoint_path=c['path'])
+
+            self._store_eval_result(c, eval_result)
+
+            rank0log(self.print_logger, self._REPORT_HEADER)
+            self._log_report_row(c)
+
+            if eval_result['val-top1acc']*100 > self.config.get('stop_threshold', 74.9):
+                success = True
+                break
+
+        return success
+
+
+
@@ -0,0 +1,39 @@
+import tensorflow as tf
+from tensorflow.python.ops import data_flow_ops
+import re
+import os
+from operator import itemgetter
+
+class PrefillStagingAreasHook(tf.train.SessionRunHook):
+    """Session hook that runs the 'STAGING_AREA_PUTS' ops once at start-up."""
+
+    def after_create_session(self, session, coord):
+        """Run growing prefixes of the put ops: first one op, then two, ..."""
+        put_ops = tf.get_collection('STAGING_AREA_PUTS')
+        for depth in range(1, len(put_ops) + 1):
+            # The k-th call runs the first k put ops together.
+            session.run(put_ops[:depth])
+
+def stage(tensors):
+    """Route ``tensors`` through a StagingArea.
+
+    The put op is also added to the 'STAGING_AREA_PUTS' graph collection.
+
+    Returns:
+        ``(put_op, get_tensors)``: the op that pushes ``tensors`` into the
+        staging area and the tensors read back out of it.
+    """
+    dtypes = [t.dtype for t in tensors]
+    shapes = [t.get_shape() for t in tensors]
+    area = data_flow_ops.StagingArea(dtypes=dtypes, shapes=shapes)
+
+    put_op = area.put(tensors)
+    staged = area.get()
+    tf.add_to_collection('STAGING_AREA_PUTS', put_op)
+    return put_op, staged
+
+
+def sort_and_load_ckpts(log_dir):
+    """List the checkpoints stored in ``log_dir``, ordered by global step.
+
+    Only files named exactly ``model.ckpt-<step>.index`` are considered.
+
+    Args:
+        log_dir: directory to scan (not recursive).
+
+    Returns:
+        A list of dicts sorted by ascending ``'step'``, each holding
+        ``'step'`` (int), ``'path'`` (the index file path without its
+        ``.index`` suffix) and ``'mtime'`` (modification time of the index
+        file as returned by ``os.stat``).
+    """
+    # Dots are escaped and the end is anchored so that look-alike names such
+    # as 'model.ckpt-5.index.bak' or 'modelXckpt-5.index' are not mistaken
+    # for checkpoints (which would also yield a wrong 'path').
+    index_name = re.compile(r'model\.ckpt-([0-9]+)\.index$')
+
+    ckpts = []
+    for f in os.listdir(log_dir):
+        m = index_name.match(f)
+        if m is None:
+            continue
+        fullpath = os.path.join(log_dir, f)
+        ckpts.append({'step': int(m.group(1)),
+                      'path': os.path.splitext(fullpath)[0],
+                      'mtime': os.stat(fullpath).st_mtime,
+                      })
+    ckpts.sort(key=itemgetter('step'))
+    return ckpts
+
+
@@ -0,0 +1,48 @@
+import tensorflow as tf
+import os,sys
+
+class CreateSession():
+    """Builds the ``tf.ConfigProto`` that is handed to the estimator.
+
+    When ``config['accelerator']`` is ``'1980'`` the proto enables soft
+    placement and registers the "NpuOptimizer" custom graph optimizer;
+    otherwise soft placement is disabled. ``gpu_options.allow_growth`` is
+    switched on in both cases.
+    """
+
+    def __init__(self, config):
+        self.config = config
+        on_npu = self.config['accelerator'] == '1980'
+
+        if on_npu:
+            from tensorflow.python.client import device_lib
+            # Not referenced below; presumably imported for its side effects.
+            from npu_bridge.estimator import npu_ops
+            proto = tf.ConfigProto(allow_soft_placement=True)
+            npu_optimizer = proto.graph_options.rewrite_options.custom_optimizers.add()
+            npu_optimizer.name = "NpuOptimizer"
+            npu_optimizer.parameter_map["use_off_line"].b = True
+            npu_optimizer.parameter_map["min_group_size"].b = 20
+        else:
+            proto = tf.ConfigProto(allow_soft_placement=False)
+
+        proto.gpu_options.allow_growth = True
+        self.estimator_config = proto
+
+        if on_npu:
+            # The returned device list is not used.
+            device_lib.list_local_devices(self.estimator_config)
+
+        self.set_env()
+
+    def set_env(self):
+        """Export the TensorFlow GPU tuning environment variables."""
+        # TODO, get env from config file
+        gpu_thread_count = 2
+        settings = (
+            ('TF_GPU_THREAD_MODE', 'gpu_private'),
+            ('TF_GPU_THREAD_COUNT', str(gpu_thread_count)),
+            ('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', '1'),
+            ('TF_ENABLE_WINOGRAD_NONFUSED', '1'),
+        )
+        for key, value in settings:
+            os.environ[key] = value
+
+    def get_config(self):
+        """Pin the visible device and thread pool sizes, then return the proto.
+
+        The stored proto is modified in place on every call.
+        """
+        proto = self.estimator_config
+        proto.gpu_options.visible_device_list = '0'
+        proto.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
+        proto.inter_op_parallelism_threads = 5
+        return proto
+
+
@@ -0,0 +1,103 @@
+from __future__ import print_function
+import tensorflow as tf
+import logging
+import numpy as np
+import time
+import sys,os
+from datetime import datetime
+# import hwlog
+# remark_logger = hwlog.get_logger(__file__, "hw_Resnext50.log")
+# file_name = hwlog.get_file_name(__file__)
+from benchmark_log import hwlog
+
+class LogSessionRunHook(tf.train.SessionRunHook):
+    """Session hook that logs step, epoch, throughput, losses and learning rate.
+
+    Args:
+        config: mapping providing ``global_batch_size``,
+            ``iterations_per_loop``, ``num_training_samples``,
+            ``display_every``, ``log_name`` and ``log_dir``.
+        warmup_steps: number of leading ``run`` calls that
+            ``get_average_speed`` leaves out of its average.
+    """
+
+    def __init__(self, config, warmup_steps=5):
+        self.global_batch_size = config['global_batch_size']
+        self.iterations_per_loop = config['iterations_per_loop']
+        self.warmup_steps = warmup_steps
+        self.iter_times = []
+        self.num_records = config['num_training_samples']
+        self.display_every = config['display_every']
+        self.logger = get_logger(config['log_name'], config['log_dir'])
+        rank0log(self.logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))
+
+    def after_create_session(self, session, coord):
+        """Log the column header and reset the running timing window."""
+        rank0log(self.logger, 'Step   Epoch   Speed   Loss   FinLoss   LR')
+        self.elapsed_secs = 0.
+        self.count = 0
+
+    def before_run(self, run_context):
+        """Start the timer and request the tensors that ``after_run`` logs."""
+        self.t0 = time.time()
+        return tf.train.SessionRunArgs(
+            fetches=[tf.train.get_global_step(), 'loss:0', 'total_loss:0', 'learning_rate:0'])
+
+    def after_run(self, run_context, run_values):
+        """Record the call duration and periodically log throughput.
+
+        A line is logged at global step 1 and whenever the global step is a
+        multiple of ``display_every``; the timing window is reset afterwards.
+        """
+        batch_time = time.time() - self.t0
+        self.iter_times.append(batch_time)
+        self.elapsed_secs += batch_time
+        self.count += 1
+        global_step, loss, total_loss, lr = run_values.results
+        if global_step == 1 or global_step % self.display_every == 0:
+            # Mean duration of one run call within the current window.
+            dt = self.elapsed_secs / self.count
+            img_per_sec = self.global_batch_size * self.iterations_per_loop / dt
+            epoch = global_step * self.global_batch_size / self.num_records
+            self.logger.info('step:%6i  epoch:%5.1f  FPS:%7.1f  loss:%6.3f  total_loss:%6.3f  lr:%7.7f' %
+                             (global_step, epoch, img_per_sec, loss, total_loss, lr))
+            self.elapsed_secs = 0.
+            self.count = 0
+            hwlog.remark_print(key=hwlog.FPS, value='%7.1f'%img_per_sec)
+
+    def get_average_speed(self):
+        """Return the mean images/second over all calls after the warm-up.
+
+        Uses the same images-per-call figure as ``after_run``
+        (``global_batch_size * iterations_per_loop``). Returns ``0.0`` when no
+        call has been timed beyond the warm-up window.
+        """
+        timed = self.iter_times[self.warmup_steps:]
+        if not timed:
+            return 0.0
+        images_per_call = self.global_batch_size * self.iterations_per_loop
+        return images_per_call / np.mean(timed)
+
+
+
+def rank0log(logger, *args, **kwargs):
+    """Send a message to ``logger`` if one is given, otherwise print it.
+
+    With a truthy ``logger`` the positional arguments are converted with
+    ``str`` and concatenated without a separator, then logged at INFO level;
+    keyword arguments are ignored. Without one, all arguments are forwarded
+    to ``print``.
+    """
+    if not logger:
+        print(*args, **kwargs)
+        return
+    message = ''.join(str(piece) for piece in args)
+    logger.info(message)
+
+
+def get_logger(log_name, log_dir):
+    """Return the logger ``log_name`` writing to the console and to a file.
+
+    Messages are formatted as the bare message text. The file is
+    ``<log_dir>/<log_name>``; ``log_dir`` is created when missing.
+
+    Calling this function again with the same arguments returns the same
+    logger without attaching further handlers, so log lines are not
+    duplicated.
+
+    Args:
+        log_name: logger name, also used as the log file name.
+        log_dir: directory that holds the log file.
+
+    Returns:
+        The configured ``logging.Logger`` (level INFO).
+    """
+    logger = logging.getLogger(log_name)
+    logger.setLevel(logging.INFO)  # INFO, ERROR
+
+    # exist_ok tolerates another rank creating the shared directory first.
+    os.makedirs(log_dir, exist_ok=True)
+
+    formatter = logging.Formatter('%(message)s')
+    log_path = os.path.abspath(os.path.join(log_dir, log_name))
+
+    # FileHandler subclasses StreamHandler, hence the exact type comparison.
+    has_console = any(type(h) is logging.StreamHandler for h in logger.handlers)
+    has_file = any(isinstance(h, logging.FileHandler) and h.baseFilename == log_path
+                   for h in logger.handlers)
+
+    if not has_console:
+        ch = logging.StreamHandler()
+        ch.setLevel(logging.INFO)
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    if not has_file:
+        fh = logging.FileHandler(log_path)
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(formatter)
+        logger.addHandler(fh)
+
+    return logger
+
@@ -0,0 +1,6 @@
+{
+    "server_count": "1",
+    "server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,18 @@
+#!/bin/sh
+# Start one train.sh per device id passed on the command line, run them in
+# parallel, and wait until all of them have exited. Progress is appended to
+# main.log next to this script.
+
+# Abort if the script directory cannot be resolved or entered.
+currentDir=$(cd "$(dirname "$0")" && pwd) || exit 1
+cd "${currentDir}" || exit 1
+
+mainLog="${currentDir}/main.log"
+touch "${mainLog}"
+
+# Iterate "$@" directly so every device id stays a single word.
+for device_phy_id in "$@"
+do
+    echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] start: train.sh ${device_phy_id} & " >> "${mainLog}"
+    "${currentDir}/train.sh" "${device_phy_id}" &
+done
+
+wait
+
+echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] all train.sh exit " >> "${mainLog}"
@@ -0,0 +1,41 @@
+# Environment setup for training on Ascend devices (SOC_VERSION is set to
+# Ascend910 below).
+# Tokens written in curly braces are placeholders that this file does not
+# define; presumably an external tool substitutes them before the file is
+# used -- TODO confirm against the caller.
+#
+# main env
+# Use the nnae installation when its directory exists; otherwise fall back to
+# the ascend-toolkit paths.
+if [ -d /usr/local/Ascend/nnae/latest ];then
+
+	# NOTE: LD_LIBRARY_PATH is overwritten in both branches, while PYTHONPATH and PATH are appended to.
+	export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
+	export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+else
+	export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
+	# NOTE(review): $projectDir is not set in this file and is only appended in
+	# this branch; it has to come from the environment of the sourcing script.
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
+	export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+	
+fi
+export SOC_VERSION=Ascend910
+export HCCL_CONNECT_TIMEOUT=600
+
+# user env
+# Job id and rank table path are placeholders; the rank size/index/id exports
+# are disabled here.
+export JOB_ID={JOB_ID}
+export RANK_TABLE_FILE={RANK_TABLE_FILE}
+#export RANK_SIZE={RANK_SIZE}
+#export RANK_INDEX={RANK_INDEX}
+#export RANK_ID={RANK_ID}
+
+# profiling env
+# All profiling switches are placeholders as well.
+export PROFILING_MODE={PROFILING_MODE}
+export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
+export PROFILING_OPTIONS={PROFILING_OPTIONS}
+export FP_POINT={FP_POINT}
+export BP_POINT={BP_POINT}
+
+
+# debug env
+# Optional debugging switches; uncomment to enable.
+#export DUMP_GE_GRAPH=2
+#export DUMP_OP=1
+#export DUMP_OP_LESS=1
+#export PRINT_MODEL=1
+#export TE_PARALLEL_COMPILER=0
+
+# system env
+# Remove the core dump size limit for processes started from this shell.
+ulimit -c unlimited
@@ -0,0 +1,33 @@
+#!/bin/sh
+currentDir=$(cd "$(dirname "$0")"; pwd)
+cd ${currentDir}
+
+PWD=${currentDir}
+
+device_id=$1
+if  [ x"${device_id}" = x ] ;
+then
+    echo "turing train fail" >> ${currentDir}/train_${device_id}.log
+    exit
+else
+    export DEVICE_ID=${device_id}
+fi
+
+DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
+export DEVICE_INDEX=${DEVICE_INDEX}
+
+env > ${currentDir}/env_${device_id}.log
+
+#mkdir exec path
+mkdir -p ${currentDir}/${device_id}
+rm -rf ${currentDir}/${device_id}/*
+cd ${currentDir}/${device_id}
+
+#start exec
+python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > ${currentDir}/train_${device_id}.log 2>&1
+if [ $? -eq 0 ] ;
+then
+    echo "turing train success" >> ${currentDir}/train_${device_id}.log
+else
+    echo "turing train fail" >> ${currentDir}/train_${device_id}.log
+fi
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Entry script for ResNeXt50 training: patches the Python config file with
+# values read from the yaml file, creates a time-stamped job directory and
+# starts scripts/train.sh either through mpirun (cluster) or once per device.
+#
+# Positional arguments:
+#   $1  rank size
+#   $2  path of the yaml configuration file
+#   $3  directory containing get_params_for_yaml.sh
+#   $4  cluster flag ("True" enables the mpirun path); only read inside docker
+#   $5  list of host IPs the yaml/config files are copied to; only read inside docker
+
+rank_size=$1
+yamlPath=$2
+toolsPath=$3
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+# The cluster arguments are only taken from the command line when running in a
+# docker container (detected through /.dockerenv).
+if [ -f /.dockerenv ];then
+        CLUSTER=$4
+		MPIRUN_ALL_IP="$5"
+        export CLUSTER=${CLUSTER}
+fi
+#export RANK_ID=npu${rank_size}p
+
+# Load settings from the yaml file: the helper's output is evaluated as shell
+# code. Presumably it defines one variable per key of the "tensorflow_config"
+# section (data_url, max_steps, epoches, batch_size, ... are used below) --
+# TODO confirm against get_params_for_yaml.sh.
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+# Escape every "/" in data_url so the value can be used inside the sed
+# replacement further down.
+data_url_new=`echo ${data_url//\//\\\\/}`
+echo ${data_url}
+echo ${max_steps}
+echo ${epoches}
+# Choose the config file to patch: the 8p config for cluster runs and for
+# rank_size >= 8, the 1p config for fewer than 8 devices. The
+# "0,/regex/s//.../" form (GNU sed) rewrites only the first matching line.
+if [ x"${CLUSTER}" == x"True" ];then
+    jsonFilePath=${currentDir}/code/resnext50_train/configs/res50_32bs_8p.py
+elif [ ${rank_size} -lt 8 ];then
+    jsonFilePath=${currentDir}/code/resnext50_train/configs/res50_32bs_1p.py
+    # All three branches below run the same substitution.
+    if [ ${rank_size} -eq 1 ];then
+        sed -i "0,/rank_size.*$/s//rank_size\': ${rank_size},/" ${jsonFilePath}
+
+    elif [ ${rank_size} -eq 2 ];then
+        sed -i "0,/rank_size.*$/s//rank_size\': ${rank_size},/" ${jsonFilePath}
+    else
+        sed -i "0,/rank_size.*$/s//rank_size\': ${rank_size},/" ${jsonFilePath}
+    fi
+        
+else
+    jsonFilePath=${currentDir}/code/resnext50_train/configs/res50_32bs_8p.py
+    if [ ${rank_size} -eq 8 ];then
+        sed -i "0,/rank_size.*$/s//rank_size\': ${rank_size},/" ${jsonFilePath}
+    else
+        # Any rank_size above 8 is treated as 16.
+        rank_size=16
+        sed -i "0,/rank_size.*$/s//rank_size\': ${rank_size},/" ${jsonFilePath}
+    fi
+fi
+
+# Write the remaining yaml values into the selected config file and strip
+# carriage returns from it.
+#echo "jsonfilepath is "${jsonFilePath}
+sed -i "s/data_url.*$/data_url\': \'${data_url_new}\',/g" ${jsonFilePath}
+#sed -i "s/max_train_steps.*$/max_train_steps\': ${max_steps},/g" ${jsonFilePath}
+sed -i "s/num_epochs.*$/num_epochs\': ${epoches},/g" ${jsonFilePath}
+sed -i "0,/batch_size.*$/s//batch_size\': ${batch_size},/" ${jsonFilePath}
+sed -i "s/epochs_between_evals.*$/epochs_between_evals\': ${epochs_between_evals},/g" ${jsonFilePath}
+sed -i 's/\r//g' ${jsonFilePath}
+
+
+# Create the job directory, named after the current time stamp.
+currtime=`date +%Y%m%d%H%M%S`
+mkdir -p ${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
+train_job_dir=${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
+# Device list: taken from the variable device_group_<rank_size>p; when that is
+# empty, or when rank_size >= 8, the ids 0..rank_size-1 are used in order.
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# Take the first character of the space-stripped device list as the first
+# device id; the hw log symlink created below points into the directory named
+# after it.
+device_group_str=`echo ${device_group} | sed 's/ //g'`
+first_device_id=`echo ${device_group_str: 0:1}`
+echo ${device_group_str}
+echo ${first_device_id}
+
+rank_id=0
+
+if [ x"${CLUSTER}" == x"True" ];then
+    # ln hw log
+    ln -snf ${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/0/hw_resnext50.log ${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
+    # Copy the yaml file and the patched config to every other host in the list.
+    this_ip=$(hostname -I |awk '{print $1}')
+    for ip in $MPIRUN_ALL_IP;do
+        if [ x"$this_ip" != x"$ip" ];then
+           scp $yamlPath root@$ip:$yamlPath
+           scp $jsonFilePath root@$ip:$jsonFilePath
+        fi
+    done
+    export PATH=$PATH:/usr/local/mpirun4.0/bin
+    # Start train.sh through mpirun on the hosts given by mpirun_ip; the last
+    # argument is the cluster flag.
+    mpirun -H ${mpirun_ip} \
+    --bind-to none -map-by slot\
+    --allow-run-as-root \
+    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
+    --prefix /usr/local/mpirun4.0/ \
+    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
+else
+    # ln hw log
+    ln -snf ${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/${first_device_id}/hw_resnext50.log ${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
+    # Start one background train.sh per device; the last argument is the rank
+    # id, counted up from 0.
+    for device_id in $device_group;do
+      #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ${currentDir}/result/main.log
+      ${currentDir}/scripts/train.sh $device_id $rank_size $yamlPath $currtime ${toolsPath} $rank_id&
+      let rank_id++
+    done
+fi
+# Wait for all background train.sh processes.
+wait
+
+
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+
+device_id=$1
+rank_size=$2
+yamlPath=$3
+currtime=$4
+toolsPath=$5
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+mkdir -p ${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
+export train_job_dir=${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
+
+source ${currentDir}/config/npu_set_env.sh
+
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+# 声明变量
+export REMARK_LOG_FILE=hw_resnext50.log  # 打点日志文件名称， 必须hw_后跟模型名称小写
+# 添加日志打点模块路径
+benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
+export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
+
+
+
+# user env
+export HCCL_CONNECT_TIMEOUT=600
+export JOB_ID=9999001
+export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+export RANK_INDEX=0
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=$1
+DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
+export DEVICE_INDEX=${DEVICE_INDEX}
+export YAML_PATH=$3
+export MODEL_CKPT_PATH=${currentDir}/result/ckpt${device_id}
+
+if [ ${profiling_mode} == True ];
+then
+	export PROFILING_MODE=true
+else
+	export PROFILING_MODE=false
+fi
+
+if [ ${aicpu_profiling_mode} == True ];
+then
+	export AICPU_PROFILING_MODE=true
+else
+    export AICPU_PROFILING_MODE=false
+fi
+
+export PROFILING_OPTIONS=${profiling_options}
+export FP_POINT=${fp_point}
+export BP_POINT=${bp_point}
+
+cd ${train_job_dir}
+curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
+export PYTHONPATH=$PYTHONPATH:${curd_dir}
+
+if [ x"$6" != x"True" ];then
+        rank_id=$6
+        export RANK_ID=$6
+else
+        device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
+                device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
+                atlasboost.set_device_id(device_id);print(atlasboost.rank())")
+        device_id_mo=`echo $device_id_mo`
+        rank_id=${device_id_mo##* }
+        export RANK_ID=${rank_id}
+        device=${device_id_mo##*deviceid = }
+        device_id=${device%% phyid=*}
+        export DEVICE_ID=${device_id}
+        hccljson=${train_job_dir}/*.json
+        cp ${hccljson} ${currentDir}/config/${rank_size}p.json
+fi
+
+#mkdir exec path
+mkdir -p ${train_job_dir}/${device_id}
+cd ${train_job_dir}/${device_id}
+
+startTime=`date +%Y%m%d-%H:%M:%S`
+startTime_s=`date +%s`
+#cd ${currentDir}/code
+# 根据单卡/多卡区分调用参数
+if [ x"$6" == x"True" ];then
+    export CLUSTER=True
+	# 多卡多机
+	rm -rf ${currentDir}/result/*.log
+	rm -rf ${currentDir}/code/core.*
+    python3.7 ${currentDir}/code/resnext50_train/mains/res50.py --config_file=res50_32bs_8p --max_train_steps=${max_steps} --iterations_per_loop=1000 --debug=True --eval=True --model_dir=${currentDir}/result/ckpt${device_id} > ${train_job_dir}/train_${device_id}.log 2>&1
+elif [ ${rank_size} -le 4 ];then
+    # 单卡
+    python3.7 ${currentDir}/code/resnext50_train/mains/res50.py --config_file=res50_32bs_1p --max_train_steps=${max_steps} --iterations_per_loop=1000 --debug=True --eval=False --model_dir=${currentDir}/result/ckpt${device_id} > ${train_job_dir}/train_${device_id}.log 2>&1
+elif [ ${rank_size} -le 8 ];then
+    # 多卡单机
+    python3.7 ${currentDir}/code/resnext50_train/mains/res50.py --config_file=res50_32bs_8p --max_train_steps=${max_steps} --iterations_per_loop=1000 --debug=True --eval=True --model_dir=${currentDir}/result/ckpt${device_id} > ${train_job_dir}/train_${device_id}.log 2>&1
+fi
+
+if [ $? -eq 0 ] ;then
+    echo ":::ABK 1.0.0 resnext50 train success"
+    echo ":::ABK 1.0.0 resnext50 train success" >> ${train_job_dir}/train_${device_id}.log
+    echo ":::ABK 1.0.0 resnext50 train success" >> ${train_job_dir}/${device_id}/hw_resnext50.log
+else
+    echo ":::ABK 1.0.0 resnext50 train failed"
+    echo ":::ABK 1.0.0 resnext50 train failed" >> ${train_job_dir}/train_${device_id}.log
+    echo ":::ABK 1.0.0 resnext50 train failed" >> ${train_job_dir}/${device_id}/hw_resnext50.log
+fi
+
+endTime=`date +%Y%m%d-%H:%M:%S`
+endTime_s=`date +%s`
+sumTime=$[ $endTime_s - $startTime_s ]
+hour=$(( $sumTime/3600 ))
+min=$(( ($sumTime-${hour}*3600)/60 ))
+sec=$(( $sumTime-${hour}*3600-${min}*60 ))
+echo ${hour}:${min}:${sec}
+echo ":::ABK 1.0.0 resnext50 train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_resnext50.log
+
+