[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,46 @@
# ResNext50_tensorflow训练说明
### 1. 模型训练参数配置
在train/yaml/ResNext50.yaml中修改相应配置, 配置项含义:
```
tensorflow_config:
# 基本参数
max_steps: 1000
data_url: /home/imagenet_TF/
epoches: 1
epochs_between_evals: 1
batch_size: 32
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
# docker 镜像名称:版本号
docker_image: mpirun3:latest
# 1. 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
# 2. 仅在小于 8p 时生效
# 3. 若不使用该配置, 请使用在行首添加'#'注释的方法将其关闭
# device_group: 0 1 2 3
device_group_1p: 0
device_group_2p: 0 1
device_group_4p: 0 1 2 3
profiling_mode: false
profiling_options: training_trace
fp_point: fp32_vars/conv2d/Conv2Dfp32_vars/BatchNorm/FusedBatchNormV3_Reduce
bp_point: loss_scale/gradients/AddN_70
aicpu_profiling_mode: false
```
------
@@ -0,0 +1,115 @@
import tensorflow as tf
import os
# Log directory: '../result/' plus this file's base name up to the first '.'.
# The path is relative, so it resolves against the process working directory.
log_dir = '../result/'+os.path.basename(__file__).split('.')[0]
#256
# Settings for this run. res50_config() returns this same dict after adding
# 'global_batch_size' and 'do_checkpoint' to it.
config = {
# ============ for testing =====================
'accelerator': '1980', # 'gpu', '1980'
'shuffle_enable': 'yes',
'shuffle_buffer_size': 10000,
'rank_size': 1, # multiplied with 'batch_size' in res50_config()
'shard': False,
# ======= basic config ======= #
'mode':'train', # "train","evaluate","train_and_evaluate"
'epochs_between_evals': 4, #used if mode is "train_and_evaluate"
'stop_threshold': 80.0, #used if mode is "train_and_evaluate"
#'data_dir':'/opt/npu/resnet_data_new',
'data_url': '/home/imagenet_TF', #data
'data_type': 'TFRECORD',
'model_name': 'resnet50',
'num_classes': 1001,
'num_epochs': 1,
'height':224,
'width':224,
'dtype': tf.float32,
'data_format': 'channels_last',
'use_nesterov': True,
'eval_interval': 1,
'num_evaluating_samples': 50000,
'loss_scale': 1024, #could be float or string. If float, static loss scaling is applied.
#If string, the corresponding automatic loss scaling algorithm is used.
#Must be one of 'Backoff' or 'LogMax' (case insensitive).
'use_lars': False,
'label_smoothing':0.1, #If greater than 0 then smooth the labels.
'weight_decay': 0.0001,
'batch_size':32, #minibatch size per device; res50_config() sets global_batch_size = batch_size * rank_size
'momentum': [0.9],
#======= data processing config =======
'min_object_covered': 0.1, #used for random crop
'aspect_ratio_range':[3. / 4., 4. / 3.],
'area_range':[0.16, 1.0],
'max_attempts': 100,
#======= data augment config =======
'increased_aug': False,
'brightness':0.3,
'saturation': 0.6,
'contrast': 0.6,
'hue': 0.13,
'num_preproc_threads': 22,
#======= initialization config =======
'conv_init': tf.variance_scaling_initializer(),
'bn_init_mode': 'adv_bn_init', # "conv_bn_init" or "adv_bn_init": selects how the gamma of each BN layer is initialized
# "adv_bn_init" means initialize gamma to 0 in each residual block's last bn, and initialize other gamma to 1
# "conv_bn_init" means initialize all the gamma to a constant, defined by "bn_gamma_initial_value"
'bn_gamma_initial_value': 1.0,
#======== model architecture ==========
#'resnet_version': 'v1.5',
'resnet_version': 'resnext',
'arch_type': 'original', # ------ input -------
# C1,C2,C3: input block, stride in different layer
# ------ shortcut ------
# D1: average_pooling + conv1*1 in shortcut in downsample block
# D2: conv3*3,stride=2 in shortcut in downsample block
# D3: conv1*1 +average_pooling in shortcut in downsample block
# ------ mainstream ----
# E1: average_pooling + conv3*3 in mainstream in downsample block
# E2: conv3*3 + average_pooling in mainstream in downsample block
#======= logger config =======
'display_every': 1,
'log_name': 'resnet50.log',
'log_dir': log_dir,
#'ckpt_dir': '/data/resnext50/opp2/ckpt0',
# Checkpoint directory, anchored at this file's own directory (absolute path).
'ckpt_dir': os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../result/ckpt0'),
#======= Learning Rate Config =======
'lr_warmup_mode': 'linear', # "linear" or "cosine"
'warmup_lr': 0.0,
'warmup_epochs': 10,
'learning_rate_maximum': 0.1,
'lr_decay_mode': 'steps', # "steps", "poly", "poly_cycle", "cosine", "linear_cosine", "linear_twice", "constant" for 1980 only
'learning_rate_end': 0.00001,
'decay_steps': '10,20,30', #for "steps"
'lr_decay_steps': '6.4,0.64,0.064',
'ploy_power': 2.0, #for "poly" and "poly_cycle" (key spelling kept as is; it is looked up by name)
'cdr_first_decay_ratio': 0.33, #for "cosine_decay_restarts"
'cdr_t_mul':2.0,
'cdr_m_mul':0.1,
'lc_periods':0.47, #for "linear_cosine"
'lc_beta':0.00001,
'lr_mid': 0.5, #for "linear_twice"
'epoch_mid': 80,
'bn_lr_scale':1.0,
}
def res50_config():
    """Return the module-level ``config`` with its derived entries filled in.

    ``global_batch_size`` is set to the per-device batch size times
    ``rank_size`` and ``do_checkpoint`` is switched on. The shared dictionary
    is updated in place and that same object is returned.
    """
    per_device_batch = config['batch_size']
    num_ranks = config['rank_size']
    config.update(
        global_batch_size=per_device_batch * num_ranks,
        do_checkpoint=True,
    )
    return config
@@ -0,0 +1,115 @@
import tensorflow as tf
import os
# Log directory: '../result/' plus this file's base name up to the first '.'.
# The path is relative, so it resolves against the process working directory.
log_dir = '../result/'+os.path.basename(__file__).split('.')[0]
#256
# Settings for this run. res50_config() returns this same dict after adding
# 'global_batch_size' and 'do_checkpoint' to it.
config = {
# ============ for testing =====================
'accelerator': '1980', # 'gpu', '1980'
'shuffle_enable': 'yes',
'shuffle_buffer_size': 10000,
'rank_size': 8, # multiplied with 'batch_size' in res50_config()
'shard': True,
# ======= basic config ======= #
'mode':'train', # "train","evaluate","train_and_evaluate"
'epochs_between_evals': 4, #used if mode is "train_and_evaluate"
'stop_threshold': 80.0, #used if mode is "train_and_evaluate"
'data_dir':'/opt/npu/resnet_data_new',
'data_url': '/home/imagenet_TF',
'data_type': 'TFRECORD',
'model_name': 'resnet50',
'num_classes': 1001,
'num_epochs': 120, #None
'height':224,
'width':224,
'dtype': tf.float32,
'data_format': 'channels_last',
'use_nesterov': True,
'eval_interval': 1,
'loss_scale': 1024, #could be float or string. If float, static loss scaling is applied.
#If string, the corresponding automatic loss scaling algorithm is used.
#Must be one of 'Backoff' or 'LogMax' (case insensitive).
'use_lars': False,
'label_smoothing':0.1, #If greater than 0 then smooth the labels.
'weight_decay': 0.0001,
'batch_size':32, #minibatch size per device; res50_config() sets global_batch_size = batch_size * rank_size
'momentum': [0.9],
#======= data processing config =======
'min_object_covered': 0.1, #used for random crop
'aspect_ratio_range':[3. / 4., 4. / 3.],
'area_range':[0.16, 1.0],
'max_attempts': 100,
#======= data augment config =======
'increased_aug': False,
'brightness':0.3,
'saturation': 0.6,
'contrast': 0.6,
'hue': 0.13,
'num_preproc_threads': 22,
#======= initialization config =======
'conv_init': tf.variance_scaling_initializer(),
'bn_init_mode': 'adv_bn_init', # "conv_bn_init" or "adv_bn_init": selects how the gamma of each BN layer is initialized
# "adv_bn_init" means initialize gamma to 0 in each residual block's last bn, and initialize other gamma to 1
# "conv_bn_init" means initialize all the gamma to a constant, defined by "bn_gamma_initial_value"
'bn_gamma_initial_value': 1.0,
#======== model architecture ==========
#'resnet_version': 'v1.5',
'resnet_version': 'resnext',
'arch_type': 'original', # ------ input -------
# C1,C2,C3: input block, stride in different layer
# ------ shortcut ------
# D1: average_pooling + conv1*1 in shortcut in downsample block
# D2: conv3*3,stride=2 in shortcut in downsample block
# D3: conv1*1 +average_pooling in shortcut in downsample block
# ------ mainstream ----
# E1: average_pooling + conv3*3 in mainstream in downsample block
# E2: conv3*3 + average_pooling in mainstream in downsample block
#======= logger config =======
'display_every': 1,
'log_name': 'resnet50.log',
#'ckpt_dir': '/data/resnext50_10w/ckpt0', #log_dir
#'ckpt_dir': '/d_solution/ckpt0',
# Checkpoint directory, anchored at this file's own directory (absolute path).
'ckpt_dir': os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../result/ckpt0'),
'log_dir': log_dir,
#======= Learning Rate Config =======
'lr_warmup_mode': 'linear', # "linear" or "cosine"
'warmup_lr': 0.0,
'warmup_epochs': 5,
'learning_rate_maximum': 0.1,
'lr_decay_mode': 'cosine', # "steps", "poly", "poly_cycle", "cosine", "linear_cosine", "linear_twice", "constant" for 1980 only
'learning_rate_end': 0.000001,
'decay_steps': '10,20,30', #for "steps"
'lr_decay_steps': '6.4,0.64,0.064',
'ploy_power': 2.0, #for "poly" and "poly_cycle" (key spelling kept as is; it is looked up by name)
'cdr_first_decay_ratio': 0.33, #for "cosine_decay_restarts"
'cdr_t_mul':2.0,
'cdr_m_mul':0.1,
'lc_periods':0.47, #for "linear_cosine"
'lc_beta':0.00001,
'lr_mid': 0.5, #for "linear_twice"
'epoch_mid': 80,
'bn_lr_scale':1.0,
}
def res50_config():
    """Return the module-level ``config`` with its derived entries filled in.

    ``global_batch_size`` is set to the per-device batch size times
    ``rank_size`` and ``do_checkpoint`` is switched on. The shared dictionary
    is updated in place and that same object is returned.
    """
    per_device_batch = config['batch_size']
    num_ranks = config['rank_size']
    config.update(
        global_batch_size=per_device_batch * num_ranks,
        do_checkpoint=True,
    )
    return config
@@ -0,0 +1,236 @@
import numpy as np
from . import preprocessing
import tensorflow as tf
from tensorflow.python.util import nest
import os,sys
import numpy as np
sys.path.append("..")
from trainers.train_helper import stage
class DataLoader:
    """Creates the training and evaluation input pipelines described by a config dict."""

    def __init__(self, config):
        """Keep a reference to ``config`` and write the dataset sizes into it.

        ``num_evaluating_samples`` (50000) and ``num_training_samples``
        (1281167) are hard-coded here and stored in ``config`` in place.
        """
        self.config = config
        # dataset info
        train_sample_count = 1281167
        self.config['num_evaluating_samples'] = 50000
        self.config['num_training_samples'] = train_sample_count
        print('total num_training_sampels: %d' % train_sample_count)
        # The full sample count is stored as is; it is not divided between ranks.
        self.training_samples_per_rank = train_sample_count

    def get_train_input_fn_synthetic(self):
        """Return an endlessly repeating, batched dataset of one constant image/label pair."""
        batch_size = self.config['batch_size']
        image_shape = tf.TensorShape([self.config['height'], self.config['width'], 3])
        label_shape = tf.TensorShape([1])
        fake_image = nest.map_structure(
            lambda shape: tf.constant(0.5, tf.float32, shape), image_shape)
        fake_label = nest.map_structure(
            lambda shape: tf.constant(1, tf.int32, shape), label_shape)
        dataset = tf.data.Dataset.from_tensors((fake_image, fake_label))
        return dataset.repeat().batch(batch_size)

    def _build_dataset(self, take_count, training, increased_aug):
        """Call make_dataset() with the settings the train and eval pipelines share."""
        cfg = self.config
        return make_dataset(
            cfg, None, take_count, cfg['batch_size'], cfg['height'], cfg['width'],
            cfg['brightness'], cfg['contrast'], cfg['saturation'], cfg['hue'],
            training=training, num_threads=cfg['num_preproc_threads'], nsummary=10,
            shard=cfg['shard'], synthetic=False, increased_aug=increased_aug)

    def get_train_input_fn(self):
        """Return the training dataset built by make_dataset()."""
        return self._build_dataset(
            self.training_samples_per_rank,
            training=True,
            increased_aug=self.config['increased_aug'])

    def get_eval_input_fn(self):
        """Return the evaluation dataset built by make_dataset() (take count fixed at 50000)."""
        return self._build_dataset(50000, training=False, increased_aug=False)

    def get_input_pipeline_op(self, inputs, labels, mode):
        """Stage the batch on '/cpu:0' and then on '/gpu:0'.

        Returns the two staging ops followed by the staged inputs and labels.
        ``mode`` is accepted but not used.
        """
        with tf.device('/cpu:0'):
            preload_op, (cpu_inputs, cpu_labels) = stage([inputs, labels])
        with tf.device('/gpu:0'):
            gpucopy_op, (gpu_inputs, gpu_labels) = stage([cpu_inputs, cpu_labels])
        return preload_op, gpucopy_op, gpu_inputs, gpu_labels

    def normalize_and_format(self, inputs, data_format):
        """Subtract the per-channel mean, scale by 1/std and optionally switch to channels-first."""
        channel_mean = np.array([121, 115, 100], dtype=np.float32)
        channel_std = np.array([70, 68, 71], dtype=np.float32)
        normalized = tf.multiply(tf.subtract(inputs, channel_mean), 1. / channel_std)
        if data_format != 'channels_first':
            return normalized
        return tf.transpose(normalized, [0, 3, 1, 2])
#-------------------------------- Funcs -----------------------------------
def get_num_records(filenames):
    """Estimate the total number of records in a list of TFRecord files.

    Only the first and the last file are read: every file except the last is
    taken to hold as many records as the first one, and the last file is
    counted separately.

    Args:
        filenames: sequence of TFRecord file paths.

    Returns:
        The estimated record count, or 0 when ``filenames`` is empty (the
        previous code raised IndexError on ``filenames[0]`` in that case).
    """
    nfile = len(filenames)
    if nfile == 0:
        return 0

    def count_records(tf_record_filename):
        # Walk the file once; only the number of records matters.
        return sum(1 for _ in tf.python_io.tf_record_iterator(tf_record_filename))

    return (count_records(filenames[0]) * (nfile - 1) +
            count_records(filenames[-1]))
def _parse_example_proto(example_serialized):
    """Parse one serialized Example into (encoded image, int32 label, bbox).

    The returned ``bbox`` has shape [1, num_boxes, 4] with the coordinates of
    each box ordered (ymin, xmin, ymax, xmax).
    """
    bbox_keys = ['image/object/bbox/xmin',
                 'image/object/bbox/ymin',
                 'image/object/bbox/xmax',
                 'image/object/bbox/ymax']
    feature_map = {
        'image/encoded': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
        'image/class/label': tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
        'image/class/text': tf.FixedLenFeature([], dtype=tf.string, default_value=''),
    }
    # Bounding-box coordinates are stored as sparse features in the Example proto.
    sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
    for key in bbox_keys:
        feature_map[key] = sparse_float32

    parsed = tf.parse_single_example(example_serialized, feature_map)
    label = tf.cast(parsed['image/class/label'], dtype=tf.int32)

    # One [1, num_boxes] row per coordinate, created in the xmin/ymin/xmax/ymax order.
    xmin, ymin, xmax, ymax = [
        tf.expand_dims(parsed[key].values, 0) for key in bbox_keys]

    # Rows are stacked in (y, x) order, then reshaped to [1, num_boxes, coords].
    stacked = tf.concat([ymin, xmin, ymax, xmax], 0)
    bbox = tf.transpose(tf.expand_dims(stacked, 0), [0, 2, 1])
    return parsed['image/encoded'], label, bbox
def parse_record(raw_record, is_training):
    """Turn one serialized Example into a preprocessed (image, label) pair.

    The crop settings and augmentation strengths are fixed here (marked
    "for 1980 only" by the original author); the bounding box parsed from the
    record is not used. Distortion is applied only when ``is_training`` is set.
    """
    image_buffer, label, _unused_bbox = _parse_example_proto(raw_record)
    # for 1980 only
    crop_settings = {
        'min_object_covered': 0.1,
        'aspect_ratio_range': [3. / 4., 4. / 3.],
        'area_range': [0.08, 1.0],
        'max_attempts': 100,
    }
    image = preprocessing.parse_and_preprocess_image_record(
        crop_settings,
        image_buffer,
        height=224,
        width=224,
        brightness=0.4,
        contrast=0.4,
        saturation=0.4,
        hue=0.13,
        distort=is_training,
        nsummary=10,
        increased_aug=True,
        random_search_aug=False)
    return image, label
def read_rawdata(file_path_tensor):
    """Return a string tensor holding the raw bytes of the file named by ``file_path_tensor``.

    The read runs in Python through ``tf.py_func``.
    """
    def _read_file(file_path):
        # Open through a context manager so the handle is closed after the read
        # (the previous code never closed it).
        with tf.gfile.GFile(file_path, 'rb') as image_file:
            return image_file.read()

    return tf.py_func(_read_file, inp=[file_path_tensor], Tout=tf.string)
def parse_function(filename, label):
    """Load the JPEG at ``filename``, resize it to 224x224 and cast ``label`` to int32."""
    raw_bytes = read_rawdata(filename)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize_images(decoded, [224, 224])
    # Raw labels default to int64 while resnet50 currently supports only int32,
    # so cast here. Original author's note: this makes no difference before the
    # graph is offloaded to the device; after offloading no such cast op is
    # added there, for performance reasons.
    int32_label = tf.cast(label, dtype=tf.int32)
    return resized, int32_label
def make_dataset(config, filenames, take_count, batch_size, height, width,
                 brightness, contrast, saturation, hue,
                 training=False, num_threads=10, nsummary=10, shard=False, synthetic=False,
                 increased_aug=False, random_search_aug=False):
    """Build the batched ``tf.data`` dataset for training or evaluation.

    With ``synthetic`` and ``training`` both set, a constant image/label pair
    is repeated forever. Otherwise the data comes from ``config['data_url']``:
    a label index file when ``config['data_type']`` is 'RAW DATA', else files
    named 'train-*' / 'validation-*'.

    The environment variables RANK_SIZE and DEVICE_INDEX must be set for the
    non-synthetic path. The value passed as ``filenames`` is never read, and
    ``brightness``, ``contrast``, ``saturation``, ``hue``, ``num_threads``,
    ``nsummary``, ``increased_aug`` and ``random_search_aug`` are not used
    in this function.
    """
    if synthetic and training:
        fake_image = nest.map_structure(
            lambda shape: tf.constant(0.5, tf.float32, shape),
            tf.TensorShape([height, width, 3]))
        fake_label = nest.map_structure(
            lambda shape: tf.constant(1, tf.int32, shape),
            tf.TensorShape([1]))
        synthetic_ds = tf.data.Dataset.from_tensors((fake_image, fake_label)).repeat()
        return synthetic_ds.batch(batch_size)

    shuffle_buffer_size = 10000
    num_readers = 10
    rank_size = int(os.getenv('RANK_SIZE'))
    rank_id = int(os.getenv('DEVICE_INDEX'))
    is_raw_data = config['data_type'] == 'RAW DATA'

    if is_raw_data:
        # Each index line names an image file first and its integer label last.
        images = []
        labels = []
        with tf.gfile.GFile(config['label_index_url'], 'r') as index_file:
            for line in index_file.readlines():
                fields = line.strip().split(" ")
                images.append(os.path.join(config['data_url'], fields[0]))
                labels.append(int(fields[-1]))
        ds = tf.data.Dataset.from_tensor_slices((images, labels))
    else:
        split_name = 'train' if training else 'validation'
        filename_pattern = os.path.join(config['data_url'], '%s-*')
        filenames = sorted(tf.gfile.Glob(filename_pattern % split_name))
        ds = tf.data.Dataset.from_tensor_slices(filenames)

    if shard:
        # split the dataset into parts for each device
        ds = ds.shard(rank_size, rank_id)

    if training:
        ds = ds.shuffle(1000, seed=7 * (1 + rank_id))
    else:
        ds = ds.take(take_count)  # make sure all ranks have the same amount

    if config['data_type'] == 'TFRECORD':
        ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
        # Pair every record with a running index so elements are 2-tuples,
        # matching the (image, label) pairs of the RAW DATA path.
        record_index = tf.data.Dataset.range(sys.maxsize)
        ds = tf.data.Dataset.zip((ds, record_index))

    if training:
        ds = ds.apply(tf.data.experimental.shuffle_and_repeat(
            shuffle_buffer_size, seed=5 * (1 + rank_id)))

    if is_raw_data:
        def to_example(image, label):
            return parse_function(image, label)
    else:
        def to_example(image, label):
            # The second element (the running index) is ignored.
            return parse_record(image, training)

    ds = ds.map(to_example, num_parallel_calls=14)
    return ds.batch(batch_size, drop_remainder=True)
@@ -0,0 +1,152 @@
import tensorflow as tf
#import horovod.tensorflow as hvd
from tensorflow.contrib.image.python.ops import distort_image_ops
import math
#from .data_aug_search import random_aug_search
def deserialize_image_record(record):
    """Parse a serialized Example into (encoded image, int32 label, bbox, class text).

    ``bbox`` has shape [1, num_boxes, 4] with coordinates ordered
    (ymin, xmin, ymax, xmax).
    """
    bbox_prefix = 'image/object/bbox/'
    feature_map = {
        'image/encoded': tf.FixedLenFeature([], tf.string, ''),
        'image/class/label': tf.FixedLenFeature([1], tf.int64, -1),
        'image/class/text': tf.FixedLenFeature([], tf.string, ''),
    }
    for coord in ('xmin', 'ymin', 'xmax', 'ymax'):
        feature_map[bbox_prefix + coord] = tf.VarLenFeature(dtype=tf.float32)

    with tf.name_scope('deserialize_image_record'):
        parsed = tf.parse_single_example(record, feature_map)
        imgdata = parsed['image/encoded']
        label = tf.cast(parsed['image/class/label'], tf.int32)
        coord_rows = [parsed[bbox_prefix + coord].values
                      for coord in ['ymin', 'xmin', 'ymax', 'xmax']]
        bbox = tf.stack(coord_rows)
        bbox = tf.transpose(tf.expand_dims(bbox, 0), [0, 2, 1])
        text = parsed['image/class/text']
        return imgdata, label, bbox, text
def decode_jpeg(imgdata, channels=3):
    """Decode JPEG bytes with fancy upscaling off and the 'INTEGER_FAST' DCT method."""
    decode_options = {
        'channels': channels,
        'fancy_upscaling': False,
        'dct_method': 'INTEGER_FAST',
    }
    return tf.image.decode_jpeg(imgdata, **decode_options)
def crop_and_resize_image(config, image, height, width,
                          distort=False, nsummary=10):
    """Decode an encoded JPEG, crop it and resize the crop to (height, width).

    With ``distort`` set, a random crop window is sampled using the
    'min_object_covered', 'aspect_ratio_range', 'area_range' and
    'max_attempts' entries of ``config``; otherwise a central crop covering
    0.8 of each side is taken. ``nsummary`` is not used.
    """
    with tf.name_scope('crop_and_resize'):
        # Evaluation is done on a center-crop of this ratio
        eval_crop_ratio = 0.8

        if not distort:
            # Central crop
            decoded = decode_jpeg(image, channels=3)
            ratio_y = ratio_x = eval_crop_ratio
            central_box = tf.constant([0.5 * (1 - ratio_y), 0.5 * (1 - ratio_x),
                                       0.5 * (1 + ratio_y), 0.5 * (1 + ratio_x)])
            return tf.image.crop_and_resize(
                decoded[None, :, :, :], central_box[None, :], [0], [height, width])[0]

        initial_shape = [int(round(height / eval_crop_ratio)),
                         int(round(width / eval_crop_ratio)),
                         3]
        jpeg_shape = tf.image.extract_jpeg_shape(image)
        whole_image_box = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
        _begin, _size, sampled = tf.image.sample_distorted_bounding_box(
            initial_shape,
            bounding_boxes=whole_image_box,
            min_object_covered=config['min_object_covered'],
            aspect_ratio_range=config['aspect_ratio_range'],
            area_range=config['area_range'],
            max_attempts=config['max_attempts'],
            use_image_if_no_bounding_boxes=True)
        box = sampled[0, 0]  # Remove batch, box_idx dims

        def to_pixels(fraction, axis):
            # Scale a fractional coordinate by the JPEG's size along ``axis``.
            return tf.cast(fraction * (tf.cast(jpeg_shape[axis], tf.float32)), tf.int32)

        y_min = to_pixels(box[0], 0)
        x_min = to_pixels(box[1], 1)
        y_max = to_pixels(box[2], 0)
        x_max = to_pixels(box[3], 1)
        crop_window = tf.stack([y_min, x_min, y_max - y_min, x_max - x_min])

        cropped = tf.image.decode_and_crop_jpeg(image, crop_window, channels=3)
        return tf.image.resize_images(cropped, [height, width])
def parse_and_preprocess_image_record(config, record, height, width,
                                      brightness, contrast, saturation, hue,
                                      distort, nsummary=10, increased_aug=False, random_search_aug=False):
    """Crop, optionally augment, normalize and cast an encoded image to float16.

    ``record`` is handed to crop_and_resize_image() as the encoded image.
    A random horizontal flip is applied when ``distort`` is set, and random
    brightness / contrast / saturation changes on top of that when
    ``increased_aug`` is set as well. ``hue``, ``nsummary`` and
    ``random_search_aug`` are not used.
    """
    with tf.name_scope('preprocess_train'):
        image = crop_and_resize_image(config, record, height, width, distort)
        if distort:
            image = tf.image.random_flip_left_right(image)
        if distort and increased_aug:
            image = tf.image.random_brightness(image, max_delta=brightness)
            image = tf.image.random_contrast(
                image, lower=contrast, upper=2.0 - contrast)
            image = tf.image.random_saturation(
                image, lower=saturation, upper=2.0-saturation)
        # Keep pixel values inside [0, 255] before normalizing.
        clipped = tf.clip_by_value(image, 0., 255.)
        normalized = normalize(clipped)
        return tf.cast(normalized, tf.float16)
def normalize(inputs):
    """Subtract the per-channel mean and multiply by the reciprocal of the per-channel std."""
    def per_channel(values):
        # Add two leading axes so the three values broadcast over height and width.
        return tf.expand_dims(tf.expand_dims(values, 0), 0)

    imagenet_mean = per_channel([121.0, 115.0, 100.0])
    imagenet_std = per_channel([70.0, 68.0, 71.0])
    centered = inputs - imagenet_mean
    return centered * (1.0 / imagenet_std)
@@ -0,0 +1,50 @@
import tensorflow as tf
from .lr_schedule import warmup_decay, get_lr, get_1980_lr
class HyperParams:
    """Derives step counts from the config and builds the learning-rate tensor."""

    def __init__(self, config):
        """Store ``config`` and add the derived step counts to it in place.

        Reads 'num_training_samples', 'global_batch_size', 'num_epochs',
        'iterations_per_loop' and, only when 'num_epochs' is falsy,
        'max_train_steps'. Writes 'nsteps_per_epoch', 'nstep',
        'total_steps_include_iterations', 'save_summary_steps' and
        'save_checkpoints_steps'.
        """
        self.config = config
        nsteps_per_epoch = self.config['num_training_samples'] // self.config['global_batch_size']
        self.config['nsteps_per_epoch'] = nsteps_per_epoch

        if self.config['num_epochs']:
            # Whole epochs only: steps per epoch is floored before multiplying.
            nstep = nsteps_per_epoch * self.config['num_epochs']
        else:
            nstep = self.config['max_train_steps']
        self.config['nstep'] = nstep

        self.config['total_steps_include_iterations'] = int(
            self.config['nstep'] + self.config['iterations_per_loop'])
        # Summaries and checkpoints are both written once per epoch.
        self.config['save_summary_steps'] = nsteps_per_epoch
        self.config['save_checkpoints_steps'] = nsteps_per_epoch

    def get_hyper_params(self):
        """Return a dict whose only entry, 'learning_rate', is the learning-rate tensor."""
        hyper_params = {}
        hyper_params['learning_rate'] = self.get_learning_rate()
        return hyper_params

    def get_learning_rate(self):
        """Build the learning-rate tensor, named 'learning_rate'.

        A constant 'learning_rate_maximum' is used when 'lr_decay_mode' is
        'constant' or 'num_epochs' is None; otherwise the schedule comes from
        get_1980_lr().
        """
        global_step = tf.train.get_global_step()
        nsteps_per_epoch = self.config['nsteps_per_epoch']
        warmup_lr = self.config['warmup_lr']
        lr = self.config['learning_rate_maximum']
        lr_end = self.config['learning_rate_end']
        lr_decay_mode = self.config['lr_decay_mode']

        with tf.device('/cpu:0'):  # Allow fallback to CPU if no GPU support for these ops
            # Identity test for None instead of the former ``== None`` comparison.
            if lr_decay_mode == 'constant' or self.config['num_epochs'] is None:
                learning_rate = tf.constant(lr, tf.float32)
            else:
                learning_rate = get_1980_lr(
                    self.config, global_step, warmup_lr, lr_end, lr,
                    self.config['warmup_epochs'], nsteps_per_epoch,
                    self.config['nstep'], lr_decay_mode)
            learning_rate = tf.identity(learning_rate, 'learning_rate')
        return learning_rate
@@ -0,0 +1,172 @@
import tensorflow as tf
import numpy as np
def get_lr(lr, lr_end, lr_decay_mode, warmup_it, decay_steps, global_step, steps, lr_steps, ploy_power,
           cdr_first_decay_ratio, cdr_t_mul, cdr_m_mul, cdr_alpha, cd_alpha, lc_periods, lc_alpha, lc_beta, lr_mid, it_mid):
    """Return the decayed learning rate for the schedule named by ``lr_decay_mode``.

    Supported modes: 'steps', 'poly', 'poly_cycle', 'cosine_decay_restarts',
    'cosine', 'linear_cosine' and 'linear_twice'.

    Raises:
        ValueError: if ``lr_decay_mode`` is none of the above.
    """
    if lr_decay_mode == 'steps':
        return tf.train.piecewise_constant(global_step, steps, lr_steps)

    if lr_decay_mode == 'linear_twice':
        return decay_linear_twice(lr, lr_mid, lr_end, warmup_it, it_mid, decay_steps, global_step)

    if lr_decay_mode in ('poly', 'poly_cycle'):
        return tf.train.polynomial_decay(
            lr,
            global_step - warmup_it,
            decay_steps=decay_steps - warmup_it,
            end_learning_rate=lr_end,
            power=ploy_power,
            cycle=(lr_decay_mode == 'poly_cycle'))

    if lr_decay_mode == 'cosine_decay_restarts':
        first_decay_steps = (decay_steps - warmup_it) * cdr_first_decay_ratio
        return tf.train.cosine_decay_restarts(
            lr,
            global_step - warmup_it,
            first_decay_steps,
            t_mul=cdr_t_mul,
            m_mul=cdr_m_mul,
            alpha=cdr_alpha)

    if lr_decay_mode == 'cosine':
        return tf.train.cosine_decay(
            lr,
            global_step - warmup_it,
            decay_steps=decay_steps - warmup_it,
            alpha=cd_alpha)

    if lr_decay_mode == 'linear_cosine':
        return tf.train.linear_cosine_decay(
            lr,
            global_step - warmup_it,
            decay_steps=decay_steps - warmup_it,
            num_periods=lc_periods,
            alpha=lc_alpha,
            beta=lc_beta)

    raise ValueError('Invalid type of lr_decay_mode')
def cos_warmup_1980(global_step, warmup_steps, max_lr):
    """Return the cosine warm-up learning rate for one step (plain Python / NumPy).

    The value rises along half a cosine period and reaches ``max_lr`` when
    ``global_step + 1`` equals ``warmup_steps``.
    """
    PI = 3.14159265359
    progress = float(global_step + 1) / float(warmup_steps)
    angle = PI + PI * progress
    return max_lr * 0.5 * (1.0 + np.cos(angle))
def cos_decay_1980(global_step, warmup_steps, total_steps, max_lr, end_lr=0.0):
    """Return the cosine-decayed learning rate for one step (plain Python / NumPy).

    Args:
        global_step: current step index.
        warmup_steps: number of warm-up steps preceding the decay.
        total_steps: step count at which the decay ends.
        max_lr: learning rate at the start of the decay.
        end_lr: floor of the decay, expressed as a fraction of ``max_lr``
            (the result approaches ``max_lr * end_lr``). Defaults to 0.0,
            which gives ``max_lr * 0.5 * (1 + cos(angle))`` and lets callers
            that pass only four arguments keep working.

    Returns:
        The learning rate for ``global_step``.
    """
    PI = 3.14159265359
    progress = float(global_step - warmup_steps + 1) / float(total_steps - warmup_steps)
    cosine_factor = 0.5 * (1.0 + np.cos(PI * progress))
    # Blend between 1.0 (start of decay) and end_lr (end of decay).
    decayed_fraction = (1 - end_lr) * cosine_factor + end_lr
    return max_lr * decayed_fraction
def get_1980_lr(config, global_step, lr_init, lr_end, lr_max, warmup_epochs, steps_per_epoch, nsteps, lr_decay_mode):
    """Build a per-step learning-rate table and look up the entry for ``global_step``.

    The table is computed in Python, converted to a tensor and indexed with
    ``tf.gather``.

    Args:
        config: settings dict; 'total_steps_include_iterations' gives the table
            length for the 'poly', 'cosine' and 'linear_cosine' modes.
        global_step: step tensor used as the lookup index.
        lr_init: learning rate at the start of a linear warm-up.
        lr_end: end value handed to the cosine decay / used by the fallback mode.
        lr_max: peak learning rate.
        warmup_epochs: number of warm-up epochs (ignored by 'steps').
        steps_per_epoch: training steps per epoch.
        nsteps: total number of training steps.
        lr_decay_mode: 'steps', 'poly', 'cosine' or 'linear_cosine'; any other
            value selects linear warm-up followed by linear decay to ``lr_end``.

    Returns:
        A scalar tensor holding the learning rate for ``global_step``.
    """
    lr_each_step = []
    total_steps = int(nsteps)

    if lr_decay_mode == 'steps':
        # Drop the rate by a factor of 10 after epochs 30, 60 and 80.
        decay_epoch_index = [30 * steps_per_epoch, 60 * steps_per_epoch, 80 * steps_per_epoch]
        for i in range(total_steps):
            if i < decay_epoch_index[0]:
                lr = lr_max
            elif i < decay_epoch_index[1]:
                lr = lr_max * 0.1
            elif i < decay_epoch_index[2]:
                lr = lr_max * 0.01
            else:
                lr = lr_max * 0.001
            lr_each_step.append(lr)

    elif lr_decay_mode in ('poly', 'linear_cosine'):
        warmup_steps = steps_per_epoch * warmup_epochs
        # The increment is only used while i < warmup_steps, so with no warm-up
        # it is never read; skip the division instead of dividing by zero.
        if warmup_steps > 0:
            inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
        else:
            inc_each_step = 0.0
        for i in range(config['total_steps_include_iterations']):
            if i < warmup_steps:
                lr = float(lr_init) + inc_each_step * float(i)
            elif i <= total_steps:
                if lr_decay_mode == 'poly':
                    base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
                    lr = float(lr_max) * base
                else:
                    # Decay towards zero, matching the 0.0 used past total_steps.
                    # The end value is passed explicitly; the former call
                    # omitted it and failed with a TypeError.
                    lr = cos_decay_1980(i, warmup_steps, total_steps, lr_max, 0.0)
            else:
                lr = 0.0
            lr_each_step.append(lr)

    elif lr_decay_mode == 'cosine':
        warmup_steps = steps_per_epoch * warmup_epochs
        for i in range(config['total_steps_include_iterations']):
            if i < warmup_steps:
                lr = cos_warmup_1980(i, warmup_steps, lr_max)
            elif i <= total_steps:
                lr = cos_decay_1980(i, warmup_steps, total_steps, lr_max, lr_end)
            else:
                lr = lr_end * 0.01
            lr_each_step.append(lr)

    else:
        # Fallback: linear warm-up to lr_max, then linear decay to lr_end.
        warmup_steps = steps_per_epoch * warmup_epochs
        for i in range(total_steps):
            if i < warmup_steps:
                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
            else:
                lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
            lr_each_step.append(lr)

    current_step = global_step
    lr_each_step = tf.convert_to_tensor(lr_each_step)
    print(lr_each_step)
    learning_rate = tf.gather(lr_each_step, current_step)
    return learning_rate
def warmup_decay(lr_warmup_mode, warmup_lr, global_step, warmup_steps, warmup_end_lr):
    """Return the warm-up learning rate for the 'linear' or 'cosine' mode.

    Raises:
        ValueError: if ``lr_warmup_mode`` is neither 'linear' nor 'cosine'.
    """
    if lr_warmup_mode == 'linear':
        schedule = linear_warmup
    elif lr_warmup_mode == 'cosine':
        schedule = cos_warmup
    else:
        raise ValueError('Invalid type of lr_warmup_mode')
    return schedule(warmup_lr, global_step, warmup_steps, warmup_end_lr)
def linear_warmup(warmup_lr, global_step, warmup_steps, warmup_end_lr):
    """Interpolate linearly from ``warmup_lr`` to ``warmup_end_lr`` over ``warmup_steps``."""
    from tensorflow.python.ops import math_ops

    fraction_done = tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32)
    lr_span = math_ops.subtract(warmup_end_lr, warmup_lr)
    return math_ops.add(warmup_lr, math_ops.multiply(lr_span, fraction_done))
def cos_warmup(warmup_lr, global_step, warmup_steps, warmup_end_lr):
    """Rise from ``warmup_lr`` to ``warmup_end_lr`` along half a cosine period (tensor ops)."""
    PI = 3.14159265359
    lr_span = tf.subtract(warmup_end_lr, warmup_lr)
    fraction_done = tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32)
    angle = PI + PI * (fraction_done)
    raised_by = lr_span * 0.5 * (1.0 + tf.math.cos(angle))
    return tf.add(warmup_lr, raised_by)
def decay_linear(lr_start, lr_end, it_start, it_end, global_step):
    """Decrease linearly from ``lr_start`` at ``it_start`` to ``lr_end`` at ``it_end``.

    The per-step decrement is computed in Python; the result is a float32 tensor.
    """
    down_per_step = float(lr_start - lr_end) / float(it_end - it_start)
    start_lr = tf.cast(lr_start, tf.float32)
    slope = tf.cast(down_per_step, tf.float32)
    steps_elapsed = tf.subtract(tf.cast(global_step, tf.float32), tf.cast(it_start, tf.float32))
    return tf.subtract(start_lr, tf.multiply(slope, steps_elapsed))
def decay_linear_twice(lr_start, lr_mid, lr_end, it_start, it_mid, it_end, global_step):
    """Two-segment linear decay: ``lr_start`` to ``lr_mid`` by ``it_mid``, then on to ``lr_end`` by ``it_end``.

    Before ``it_start`` the rate stays at ``lr_start``.
    """
    def hold_start():
        return tf.cast(lr_start, tf.float32)

    def first_segment():
        return decay_linear(lr_start, lr_mid, it_start, it_mid, global_step)

    def second_segment():
        return decay_linear(lr_mid, lr_end, it_mid, it_end, global_step)

    up_to_mid = tf.cond(global_step < it_start, hold_start, first_segment)
    return tf.cond(global_step > it_mid, second_segment, lambda: up_to_mid)
@@ -0,0 +1,23 @@
import tensorflow as tf
#from tensorflow.contrib.hccl.python.ops import hccl_ops
from npu_bridge.hccl import hccl_ops
class Layers:
    """Metric helpers for the model function."""

    def get_accuracy(self, labels, predicted_classes, logits, config):
        """Return the top-1 and top-5 accuracy metrics as a dict.

        Each entry is a (value, update_op) pair. When ``config['rank_size']``
        is not 1, the value is summed over ranks with an HCCL allreduce and
        divided by the rank count.
        """
        top1 = tf.metrics.accuracy(
            labels=labels, predictions=predicted_classes)
        top5 = tf.metrics.mean(
            tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32))

        def across_ranks(metric):
            value, update_op = metric[0], metric[1]
            if config['rank_size'] == 1:
                return (value, update_op)
            averaged = hccl_ops.allreduce(value, "sum") / config['rank_size']
            return (averaged, update_op)

        return {
            'val-top1acc': across_ranks(top1),
            'val-top5acc': across_ranks(top5),
        }
@@ -0,0 +1,36 @@
import tensorflow as tf
class Loss:
    """Loss construction and gradient application helpers."""

    def __init__(self, config):
        """Keep a reference to the settings dict."""
        self.config = config

    def get_loss(self, logits, labels):
        """Return the label-smoothed softmax cross-entropy, named 'loss'."""
        num_classes = self.config['num_classes']
        onehot = tf.one_hot(labels, num_classes)
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits,
            onehot_labels=onehot,
            label_smoothing=self.config['label_smoothing'])
        return tf.identity(cross_entropy, name='loss')

    def get_total_loss(self, loss):
        """Return ``loss`` plus every collected regularization loss, named 'total_loss'."""
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        return tf.add_n([loss] + reg_losses, name='total_loss')

    def optimize_loss(self, total_loss, opt):
        """Compute ungated gradients of ``total_loss`` with ``opt`` and return the apply op."""
        grads_and_vars = opt.compute_gradients(
            total_loss, gate_gradients=tf.train.Optimizer.GATE_NONE)
        return opt.apply_gradients(grads_and_vars)
@@ -0,0 +1,7 @@
# Remove the artefacts of a previous training run.

# Kill leftover TdtMain processes. The '[T]dtMain' pattern keeps grep from
# matching its own command line, and 'xargs -r' skips kill when nothing matched.
ps -ef | grep '[T]dtMain' | awk '{print $2}' | xargs -r kill -9

# Dumped graphs, NPU system logs and checkpoints.
rm -rf -- *.pbtxt
rm -rf /var/log/npu/slog/*.log
rm -rf -- ckpt*

# Python bytecode caches; '-exec ... +' is safe for paths containing whitespace,
# and '-prune' stops find from descending into directories it is deleting.
find ./ -name "*.pyc" -exec rm -rf {} +
find ./ -name __pycache__ -prune -exec rm -rf {} +

# NPU dataset logs.
rm -rf /var/log/npu/dataset/*
@@ -0,0 +1,141 @@
import tensorflow as tf
import sys
import ast
#sys.path.append("..")
#sys.path.append("../models")
#sys.path.append("./resnet50_train/")
#sys.path.append("./resnet50_train/models")
import os
# Expose the shared benchmark utilities, located six directory levels above this file.
_script_dir = os.path.abspath(os.path.dirname(__file__))
for _shared_dir in ('../../../../../../utils', '../../../../../../utils/atlasboost'):
    sys.path.append(os.path.join(_script_dir, _shared_dir))

# Directory of this script with symlinks resolved; the project packages are
# located relative to it.
base_path = os.path.dirname(os.path.realpath(__file__))
print ("#########base_path:", base_path)
path_1, path_2, path_3 = base_path + "/..", base_path + "/../models", base_path + "/../../"
for _shown_path in (path_1, path_2, path_3):
    print (_shown_path)

# Register the parent and grandparent directories plus their `models` folders.
for _relative in ("/..", "/../models", "/../../", "/../../models"):
    sys.path.append(base_path + _relative)
from utils import create_session as cs
from utils import logger as lg
from data_loader.resnet50 import data_loader as dl
from models.resnet50 import res50_model as ml
from optimizers import optimizer as op
from losses import res50_loss as ls
from trainers import gpu_base_trainer as tr
# from configs import res50_config as cfg
from hyper_param import hyper_param as hp
from layers import layers as ly
from datetime import datetime
# from utils import hwlog
import argparse
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# import hwlog
# remark_logger = hwlog.get_logger(__file__, "hw_Resnext50.log")
# initinal_data={"base_lr": 0.128, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512, "batchsize": 32}
# hwlog.add_additional_info(remark_logger, "Resnext50", "tensorflow", initinal_data) # logger_obj, model_name, framework, initinal_data
def main():
    """Parse the command line, load the chosen config and run the requested mode.

    The config module is looked up by name inside the `configs` package and
    must provide `res50_config()`. Command-line values override the
    `iterations_per_loop`, `max_train_steps`, `debug`, `eval` and `model_dir`
    entries of the returned config.

    Raises:
        ValueError: on unknown command-line arguments, a missing
            `--config_file`, or an unsupported `config['mode']`.
    """
    # -------------------choose the config file in .sh file-----------
    cmdline = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cmdline.add_argument('--config_file', default="",
                         help="""name of the module in the 'configs' package that provides res50_config().""")
    cmdline.add_argument('--iterations_per_loop', default=1,
                         help="""integer stored in config['iterations_per_loop'].""")
    cmdline.add_argument('--max_train_steps', default=200,
                         help="""integer stored in config['max_train_steps'].""")
    cmdline.add_argument('--debug', default=True, type=ast.literal_eval,
                         help="""Python literal (True/False) stored in config['debug'].""")
    cmdline.add_argument('--eval', default=False, type=ast.literal_eval,
                         help="""Python literal (True/False); when true, evaluate after training in 'train' mode.""")
    cmdline.add_argument('--model_dir', default="./model_dir",
                         help="""value stored in config['model_dir'].""")

    FLAGS, unknown_args = cmdline.parse_known_args()
    if len(unknown_args) > 0:
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)
        raise ValueError("Invalid command line arg(s)")

    cfg_file = FLAGS.config_file
    if not cfg_file:
        # Fail early with a clear message instead of an obscure import/attribute error.
        raise ValueError("--config_file is required: name of a module in the 'configs' package")
    configs = 'configs'
    # Import configs.<cfg_file> and fetch the submodule object.
    cfg = getattr(__import__(configs, fromlist=[cfg_file]), cfg_file)
    # ------------------------------------------------------------------

    config = cfg.res50_config()
    config['iterations_per_loop'] = int(FLAGS.iterations_per_loop)
    config['max_train_steps'] = int(FLAGS.max_train_steps)
    config['debug'] = FLAGS.debug
    config['eval'] = FLAGS.eval
    config['model_dir'] = FLAGS.model_dir
    print("iterations_per_loop:%d" %(config['iterations_per_loop']))
    print("max_train_steps :%d" %(config['max_train_steps']))
    print("debug :%s" %(config['debug']))
    print("eval :%s" %(config['eval']))
    print("model_dir :%s" %(config['model_dir']))

    Session = cs.CreateSession(config)
    data = dl.DataLoader(config)
    hyper_param = hp.HyperParams(config)
    layers = ly.Layers()
    optimizer = op.Optimizer(config)
    loss = ls.Loss(config)
    logger = lg.LogSessionRunHook(config)  # add tensorboard summary
    model = ml.Model(config, data, hyper_param, layers, optimizer, loss, logger)  # get the model
    trainer = tr.GPUBaseTrain(Session, config, data, model, logger)  # use Estimator to build training process

    if config['mode'] == 'train':
        trainer.train()
        # Optional evaluation pass once training has finished.
        if config['eval']:
            trainer.evaluate()
    elif config['mode'] == 'evaluate':
        trainer.evaluate()
    elif config['mode'] == 'train_and_evaluate':
        trainer.train_and_evaluate()
    else:
        raise ValueError('Invalid type of mode')
# add by zwx5326390
if __name__ == '__main__':
    # Benchmark log instrumentation (added by zwx5326390): record the
    # environment and the run parameters before training starts.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")
    initinal_data = {
        "base_lr": 0.01,
        "dataset": "imagenet1024",
        "optimizer": "SGD",
        "loss_scale": 512,
        "batchsize": 32,
    }

    # (log key, value) pairs, emitted in this order.
    remarks = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initinal_data.get("base_lr")),
        (hwlog.DATASET, initinal_data.get("dataset")),
        (hwlog.OPT_NAME, initinal_data.get("optimizer")),
        (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
        (hwlog.INPUT_BATCH_SIZE, initinal_data.get("batchsize")),
    )
    for remark_key, remark_value in remarks:
        hwlog.remark_print(key=remark_key, value=remark_value)

    main()
@@ -0,0 +1,21 @@
#!/bin/bash
# Launch a single-device training run: export the job/rank environment,
# start the TdtMain helper in the background, then run res50.py.
#export CUDA_VISIBLE_DEVICES=0
dir="$(pwd)"
#cp -rf ./config /tmp/

# Job and device identity.
export JOB_ID=10086
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
export RANK_TABLE_FILE=/home/lxh/config/new_rank_table_1p.json

# Runtime switches.
#export PROFILING_DIR=/var/log/npu/profiling/container/0
#export PROFILING_MODE=true
#export ENABLE_DATA_PRE_PROC=1
export PRINT_MODEL=1
export FUSION_TENSOR_SIZE=1000000000

# Search paths for Python modules and the HiAI runtime libraries.
export PYTHONPATH="${dir}"
export LD_LIBRARY_PATH=/usr/local/HiAI/runtime/lib64/

# Start TdtMain in the background and give it five seconds before training starts.
/usr/local/HiAI/runtime/bin/TdtMain --configfile="/home/lxh/test/config/job_tdt_2p_${DEVICE_ID}.json" &
sleep 5

python3.6 res50.py --config_file res50_baseline
@@ -0,0 +1,4 @@
#!/bin/bash
# GPU baseline run with only GPU 7 visible to the training process.
CUDA_VISIBLE_DEVICES=7 python3.5 res50.py --config_file res50_baseline_gpu
@@ -0,0 +1,24 @@
import tensorflow as tf
def _fp32_trainvar_getter(getter, name, shape=None, dtype=None,
                          trainable=True, regularizer=None,
                          *args, **kwargs):
    """Custom variable getter that withholds the regularizer from some variables.

    The variable is created through `getter` with the requested dtype. The
    regularizer is forwarded only when the variable is trainable and its name
    contains none of the batch-norm spellings below; otherwise `None` is
    passed instead.
    """
    batch_norm_tags = ('BatchNorm', 'batchnorm', 'batch_norm', 'Batch_Norm')
    if trainable and not any(tag in name for tag in batch_norm_tags):
        chosen_regularizer = regularizer
    else:
        chosen_regularizer = None
    return getter(name, shape, *args,
                  dtype=dtype,
                  trainable=trainable,
                  regularizer=chosen_regularizer,
                  **kwargs)
def fp32_trainable_vars(name='fp32_vars', *args, **kwargs):
    """Open a variable scope whose variables go through `_fp32_trainvar_getter`.

    Extra positional and keyword arguments are forwarded to
    `tf.variable_scope`. The getter keeps the requested dtype and only
    decides whether the scope's regularizer is applied to each variable.
    """
    scope = tf.variable_scope(
        name, *args, custom_getter=_fp32_trainvar_getter, **kwargs)
    return scope
def custom_getter_with_fp16_and_weight_decay(dtype, weight_decay):
    """Return the `fp32_trainable_vars` scope with an L2 regularizer attached.

    Args:
        dtype: dtype forwarded to the variable scope.
        weight_decay: scale passed to `tf.contrib.layers.l2_regularizer`.
    """
    l2_regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
    return fp32_trainable_vars(dtype=dtype, regularizer=l2_regularizer)
@@ -0,0 +1,222 @@
import tensorflow as tf
from . import resnet, res50_helper
from trainers.train_helper import stage
#from tensorflow.contrib.offline_train.python.npu.npu_optimizer import NPUDistributedOptimizer
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
#from tensorflow.contrib.offline_train.python import npu_ops
from npu_bridge.estimator import npu_ops
_NUM_EXAMPLES_NAME="num_examples"
class Model(object):
def __init__(self, config, data, hyper_param, layers, optimizer, loss, logger):
self.config = config
self.data = data
self.hyper_param = hyper_param
self.layers = layers
self.optimizer = optimizer
self.loss = loss
self.logger = logger
def get_estimator_model_func(self, features, labels, mode, params=None):
# Estimator model function: builds the network on `features`, computes the
# label-smoothed cross-entropy plus an explicit L2 term, and returns an
# EstimatorSpec with either eval metrics (EVAL) or a train op (TRAIN).
# `params` is accepted but not read in this method.
labels = tf.reshape(labels, (-1,)) # Squash unnecessary unary dim #----------------not use when use onehot label
model_func = self.get_model_func()
inputs = features # TODO: Should be using feature columns?
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
with tf.device('/gpu:0'):
if self.config['accelerator'] == 'gpu':
inputs = tf.cast(inputs, self.config['dtype'])
# NOTE(review): the same cast is repeated on the next line.
inputs = tf.cast(inputs, self.config['dtype'])
with res50_helper.custom_getter_with_fp16_and_weight_decay(dtype=self.config['dtype'], weight_decay=self.config['weight_decay']): # no BN decay
top_layer = model_func(
inputs, data_format=self.config['data_format'], training=is_training,
conv_initializer=self.config['conv_init'],
bn_init_mode=self.config['bn_init_mode'], bn_gamma_initial_value=self.config['bn_gamma_initial_value'])
logits = top_layer
predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
# Loss is always computed on float32 logits.
logits = tf.cast(logits, tf.float32)
#loss = self.loss.get_loss(logits, labels)
#loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
# NOTE(review): depth is hard-coded to 1001 here instead of being read from the config.
labels_one_hot = tf.one_hot(labels, depth=1001)
loss = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=labels_one_hot, label_smoothing=self.config['label_smoothing'])
base_loss = tf.identity(loss, name='loss') # For access by logger (TODO: Better way to access it?)
# base_loss = tf.add_n([loss])
# Name filter: variables whose name contains 'BatchNorm' are left out of the L2 term.
def exclude_batch_norm(name):
#return 'batch_normalization' not in name
return 'BatchNorm' not in name
loss_filter_fn = exclude_batch_norm
# Add weight decay to the loss.
l2_loss = self.config['weight_decay'] * tf.add_n(
# loss is computed using fp32 for numerical stability.
[tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()
if loss_filter_fn(v.name)])
#tf.summary.scalar('l2_loss', l2_loss)
# total_loss = base_loss + l2_loss
# With LARS enabled the L2 term is not added to the loss; the weight decay
# value is instead handed to the LARS step further down.
if self.config['use_lars']:
total_loss = base_loss
else:
total_loss = base_loss + l2_loss
total_loss = tf.identity(total_loss, name = 'total_loss')
# Evaluation returns the plain cross-entropy `loss` (without the L2 term)
# together with the accuracy metrics.
if mode == tf.estimator.ModeKeys.EVAL:
with tf.device(None):
metrics = self.layers.get_accuracy( labels, predicted_classes, logits, self.config)
return tf.estimator.EstimatorSpec(
mode, loss=loss, eval_metric_ops=metrics)
# Only TRAIN is supported past this point.
assert (mode == tf.estimator.ModeKeys.TRAIN)
#reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
#total_loss = tf.add_n([tf.saturate_cast(loss, self.config['dtype']) ] + reg_losses, name='total_loss')
#total_loss = tf.add_n([loss], name='total_loss')
# NOTE(review): batch_size is not referenced again in this method.
batch_size = tf.shape(inputs)[0]
global_step = tf.train.get_global_step()
with tf.device('/cpu:0'):
learning_rate = self.hyper_param.get_learning_rate()
#-----------------------batchsize scaling----------------------------------
# Only the first entry of the configured momentum sequence is used.
momentum = self.config['momentum'][0]
#------------------------------end------------------------------------------
opt = tf.train.MomentumOptimizer(
learning_rate, momentum, use_nesterov=self.config['use_nesterov'])
# The optimizer is wrapped in NPUDistributedOptimizer before the accelerator check below.
opt=NPUDistributedOptimizer(opt)
if self.config['accelerator'] == 'gpu':
opt = self.optimizer.get_lbs_optimizer(opt)
# The train op is made to depend on the ops collected in UPDATE_OPS.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
with tf.control_dependencies(update_ops):
if self.config['accelerator'] == 'gpu':
gate_gradients = (tf.train.Optimizer.GATE_NONE)
grads_and_vars = opt.compute_gradients(total_loss, gate_gradients=gate_gradients)
train_op = opt.apply_gradients( grads_and_vars,global_step = global_step)
else:
# Static loss scaling: scale the loss before differentiation, then divide
# every gradient by the same factor.
with tf.name_scope('loss_scale'):
loss_scale = float( self.config['loss_scale'] )
scaled_grads_and_vars = opt.compute_gradients( total_loss * loss_scale )
unscaled_grads_and_vars = [ (g/loss_scale, v) for g,v in scaled_grads_and_vars ]
#-----------------------------------------Lars------------------------------------------
with tf.name_scope('LARS'):
fp32_grads_and_vars = [ (tf.cast(g, tf.float32), v) for g,v in unscaled_grads_and_vars ]
grad_var_list = []
if self.config['use_lars']:
if self.config['accelerator'] == 'gpu':
# Hand-written LARS: gradients of variables that are neither BatchNorm nor
# bias get weight decay added and are rescaled by
# 0.001 * ||w|| / (||g|| + weight_decay * ||w|| + 1e-5).
for g, var in fp32_grads_and_vars:
if 'BatchNorm' not in var.name and 'bias' not in var.name:
grad_norm = tf.norm(g,ord='euclidean')
weight_norm = tf.norm(var,ord='euclidean')
grad_norm_wd = tf.add( grad_norm, tf.multiply( self.config['weight_decay'] , weight_norm ) )
rescale_factor = tf.div( tf.multiply(0.001, weight_norm), tf.add(grad_norm_wd, tf.constant(1e-5, tf.float32)) )
decayed_g = tf.add( g, tf.multiply(self.config['weight_decay'], var ) )
with tf.name_scope('lars_grad'):
g = tf.multiply(rescale_factor, decayed_g)
g_and_v = ( g, var )
grad_var_list.append( g_and_v )
elif self.config['accelerator'] == '1980':
print('lars9999999999999999999999')
# Split the gradients: BatchNorm/bias variables bypass LARS, all others are
# passed to the npu_ops.LARS op in one call.
g_list_bn_bias = []
var_list_bn_bias = []
g_list_else = []
var_list_else = []
for g, var in fp32_grads_and_vars:
if 'BatchNorm' not in var.name and 'bias' not in var.name:
g_list_else.append(g)
var_list_else.append(var)
else:
g_list_bn_bias.append(g)
var_list_bn_bias.append(var)
g_list_else_lars = npu_ops.LARS(inputs_w=var_list_else,
inputs_g=g_list_else,
weight_decay=self.config['weight_decay'],
hyperpara=0.001,
epsilon=1e-5)
# Recombine with the BatchNorm/bias entries first so gradients and
# variables stay paired by position.
g_list_lars = g_list_bn_bias + g_list_else_lars
var_list = var_list_bn_bias + var_list_else
for (g, var) in zip(g_list_lars,var_list):
g_and_v = ( g, var )
grad_var_list.append( g_and_v )
else:
print('do not use lars111111111111111111')
# Without LARS the float32 gradients are applied unchanged.
for g, var in fp32_grads_and_vars:
#if 'BatchNorm' not in var.name and 'bias' not in var.name:
# decayed_g = tf.add( g, tf.multiply( self.config['weight_decay'], var ) )
# g = decayed_g
g_and_v = ( g, var )
grad_var_list.append( g_and_v )
#-----------------------------------------end Lars------------------------------------------
train_op = opt.apply_gradients( grad_var_list, global_step = global_step )
train_op = tf.group(train_op)
#with tf.device('/cpu:0'):
#tf.summary.scalar('total_loss', total_loss)
#tf.summary.scalar('base_loss', base_loss)
#tf.summary.scalar('learning_rate', learning_rate)
#tf.contrib.summary.flush()
# if self.config['do_checkpoint']:
# summary_hook = tf.train.SummarySaverHook( save_steps=20,
# output_dir=self.config['log_dir']+'/train_summary',
# summary_op = tf.summary.merge_all() )
#return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op, training_hooks=[summary_hook] )\
# if self.config['do_checkpoint'] else tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op )
# Training reports total_loss (cross-entropy, plus L2 when LARS is off).
return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op )
# return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
def get_model_func(self):
model_name = self.config['model_name']
if model_name.startswith('resnet'):
nlayer = int(model_name[len('resnet'):])
return lambda images, *args, **kwargs: \
resnet.inference_resnet_v1(self.config,images, nlayer, *args, **kwargs)
else:
raise ValueError("Invalid model type: %s" % model_name)
@@ -0,0 +1,545 @@
import tensorflow as tf
_BATCH_NORM_EPSILON = 1e-4
_BATCH_NORM_DECAY = 0.9
_Cardi = 32
class LayerBuilder(object):
    """Factory for the conv / pooling / norm layers used by the network builders.

    Holds the settings shared by every layer (activation, data format,
    training flag, batch-norm options, initializers) so that network
    definitions only pass per-layer arguments.
    """

    def __init__(self, activation=None, data_format='channels_last',
                 training=False, use_batch_norm=False, batch_norm_config=None,
                 conv_initializer=None, bn_init_mode='adv_bn_init', bn_gamma_initial_value=1.0 ):
        """Store the shared layer settings.

        When `batch_norm_config` is None a default dict is built from the
        module-level decay and epsilon constants.
        """
        self.activation = activation
        self.data_format = data_format
        self.training = training
        self.use_batch_norm = use_batch_norm
        self.batch_norm_config = batch_norm_config
        self.conv_initializer = conv_initializer
        self.bn_init_mode = bn_init_mode
        self.bn_gamma_initial_value = bn_gamma_initial_value
        if self.batch_norm_config is None:
            self.batch_norm_config = {
                'decay': _BATCH_NORM_DECAY,
                'epsilon': _BATCH_NORM_EPSILON,
                'scale': True,
                'zero_debias_moving_mean': False,
            }

    def _conv2d(self, inputs, activation, *args, **kwargs):
        """Bias-free convolution, followed by batch norm + activation when enabled.

        Raises:
            ValueError: if batch norm is enabled and `bn_init_mode` is not
                'adv_bn_init' or 'conv_bn_init'.
        """
        x = tf.layers.conv2d(
            inputs, data_format=self.data_format,
            # use_bias=not self.use_batch_norm,
            use_bias=False,
            kernel_initializer=self.conv_initializer,
            # With batch norm the activation is applied after normalization.
            activation=None if self.use_batch_norm else activation,
            *args, **kwargs)
        if self.use_batch_norm:
            param_initializers = {
                'moving_mean': tf.zeros_initializer(),
                'moving_variance': tf.ones_initializer(),
                'beta': tf.zeros_initializer(),
            }
            if self.bn_init_mode == 'adv_bn_init':
                param_initializers['gamma'] = tf.ones_initializer()
            elif self.bn_init_mode == 'conv_bn_init':
                param_initializers['gamma'] = tf.constant_initializer(self.bn_gamma_initial_value)
            else:
                raise ValueError("--bn_init_mode must be 'conv_bn_init' or 'adv_bn_init' ")
            # NOTE(review): param_initializers is built and validated but not
            # passed to batch_norm here, unlike in conv2d_linear_last_bn.
            # Left unchanged because forwarding it would alter the gamma
            # initialization for 'conv_bn_init'; confirm the intent.
            x = self.batch_norm(x)
            x = activation(x) if activation is not None else x
        return x

    def conv2d_linear_last_bn(self, inputs, *args, **kwargs):
        """Bias-free convolution + batch norm with no activation.

        In 'adv_bn_init' mode gamma starts at zero; in 'conv_bn_init' mode it
        starts at `bn_gamma_initial_value`.

        Raises:
            ValueError: if `bn_init_mode` is neither of those two values.
        """
        x = tf.layers.conv2d(
            inputs, data_format=self.data_format,
            use_bias=False,
            kernel_initializer=self.conv_initializer,
            activation=None, *args, **kwargs)
        param_initializers = {
            'moving_mean': tf.zeros_initializer(),
            'moving_variance': tf.ones_initializer(),
            'beta': tf.zeros_initializer(),
        }
        if self.bn_init_mode == 'adv_bn_init':
            param_initializers['gamma'] = tf.zeros_initializer()
        elif self.bn_init_mode == 'conv_bn_init':
            param_initializers['gamma'] = tf.constant_initializer(self.bn_gamma_initial_value)
        else:
            raise ValueError("--bn_init_mode must be 'conv_bn_init' or 'adv_bn_init' ")
        x = self.batch_norm(x, param_initializers=param_initializers)
        return x

    def conv2d_no_act_no_bn(self, inputs, *args, **kwargs):
        """Bias-free convolution with neither activation nor batch norm."""
        x = tf.layers.conv2d(
            inputs, data_format=self.data_format,
            use_bias=False,
            kernel_initializer=self.conv_initializer,
            activation=None, *args, **kwargs)
        return x

    def conv2d_linear(self, inputs, *args, **kwargs):
        """`_conv2d` without an activation."""
        return self._conv2d(inputs, None, *args, **kwargs)

    def conv2d(self, inputs, *args, **kwargs):
        """`_conv2d` with the builder's default activation."""
        return self._conv2d(inputs, self.activation, *args, **kwargs)

    def pad2d(self, inputs, begin, end=None):
        """Zero-pad the two spatial dimensions.

        `begin` / `end` may each be a scalar (used for both spatial axes) or
        an indexable pair; `end` defaults to `begin`.
        """
        if end is None:
            end = begin
        # A scalar is not subscriptable: expand it to one value per spatial axis.
        try:
            _ = begin[1]
        except TypeError:
            begin = [begin, begin]
        try:
            _ = end[1]
        except TypeError:
            end = [end, end]
        if self.data_format == 'channels_last':
            padding = [[0, 0], [begin[0], end[0]], [begin[1], end[1]], [0, 0]]
        else:
            padding = [[0, 0], [0, 0], [begin[0], end[0]], [begin[1], end[1]]]
        return tf.pad(inputs, padding)

    def max_pooling2d(self, inputs, *args, **kwargs):
        """Max pooling in the builder's data format."""
        return tf.layers.max_pooling2d(
            inputs, data_format=self.data_format, *args, **kwargs)

    def average_pooling2d_stride_1(self, inputs, *args, **kwargs):
        """Return `inputs` unchanged; the extra arguments are ignored."""
        # inputs = tf.nn.avg_pool(inputs, ksize=[1,1,1,1],strides=[1,1,1,1], padding="VALID", data_format="NHWC" )
        return inputs

    def average_pooling2d(self, inputs, *args, **kwargs):
        """2x2, stride-2, VALID average pooling in NHWC.

        The extra arguments and the builder's data format are ignored.
        """
        inputs = tf.nn.avg_pool(inputs, ksize=[1,2,2,1],strides=[1,2,2,1], padding="VALID", data_format="NHWC" )
        return inputs
        # return tf.layers.average_pooling2d(
        #     inputs, data_format=self.data_format, *args, **kwargs)

    def dense_linear(self, inputs, units, **kwargs):
        """Dense layer without activation. NOTE(review): `kwargs` is accepted but not forwarded."""
        return tf.layers.dense(inputs, units, activation=None)

    def dense(self, inputs, units, **kwargs):
        """Dense layer with the default activation. NOTE(review): `kwargs` is accepted but not forwarded."""
        return tf.layers.dense(inputs, units, activation=self.activation)

    def activate(self, inputs, activation=None):
        """Apply `activation`, or the builder's default; pass through when both are unset."""
        activation = activation or self.activation
        return activation(inputs) if activation is not None else inputs

    def batch_norm(self, inputs, **kwargs):
        """Fused batch normalization; `kwargs` override the stored batch-norm config."""
        all_kwargs = dict(self.batch_norm_config)
        all_kwargs.update(kwargs)
        data_format = 'NHWC' if self.data_format == 'channels_last' else 'NCHW'
        outputs = tf.contrib.layers.batch_norm(
            inputs, is_training=self.training, data_format=data_format,
            fused=True, **all_kwargs)
        return outputs

    def spatial_average2d(self, inputs):
        """Average over the full spatial extent and return a [batch, channels] tensor."""
        shape = inputs.get_shape().as_list()
        if self.data_format == 'channels_last':
            n, h, w, c = shape
        else:
            n, c, h, w = shape
        # An unknown batch dimension becomes -1 for the reshape.
        n = -1 if n is None else n
        x = tf.layers.average_pooling2d(inputs, (h, w), (1, 1),
                                        data_format=self.data_format)
        return tf.reshape(x, [n, c])

    def flatten2d(self, inputs):
        """Flatten all non-batch dimensions, in NHWC element order."""
        x = inputs
        # Fixed: the comparison used the misspelled 'channel_last', which never
        # matched, so channels-last inputs were transposed as well.
        if self.data_format != 'channels_last':
            # Note: This ensures the output order matches that of NHWC networks
            x = tf.transpose(x, [0, 2, 3, 1])
        input_shape = x.get_shape().as_list()
        num_inputs = 1
        for dim in input_shape[1:]:
            num_inputs *= dim
        return tf.reshape(x, [-1, num_inputs], name='flatten')

    def residual2d(self, inputs, network, units=None, scale=1.0, activate=False):
        """Return `inputs + scale * network(inputs)`, projecting `inputs` when shapes differ.

        A 1x1 linear convolution with `units` filters is applied to `inputs`
        when the channel count changes or the spatial size shrinks.
        """
        outputs = network(inputs)
        c_axis = -1 if self.data_format == 'channels_last' else 1
        h_axis = 1 if self.data_format == 'channels_last' else 2
        w_axis = h_axis + 1
        ishape, oshape = [y.get_shape().as_list() for y in [inputs, outputs]]
        ichans, ochans = ishape[c_axis], oshape[c_axis]
        # Ceiling division of input size by output size, per spatial axis.
        strides = ((ishape[h_axis] - 1) // oshape[h_axis] + 1,
                   (ishape[w_axis] - 1) // oshape[w_axis] + 1)
        with tf.name_scope('residual'):
            if (ochans != ichans or strides[0] != 1 or strides[1] != 1):
                inputs = self.conv2d_linear(inputs, units, 1, strides, 'SAME')
            x = inputs + scale * outputs
            if activate:
                x = self.activate(x)
        return x
def resnet_bottleneck_v1(builder, inputs, depth, depth_bottleneck, stride, filters, arch_type,
                         basic=False):
    """Build one post-activation residual block and return relu(main + shortcut).

    Args:
        builder: object providing conv2d / conv2d_linear / pooling helpers.
        inputs: input tensor; its channel count is read from axis 3.
        depth: number of output channels of the block.
        depth_bottleneck: channel count of the inner convolutions.
        stride: stride of the block.
        filters: unused; kept so existing callers keep working.
        arch_type: string; 'D1'/'D2'/'D3' select the projection-shortcut
            variant and 'E1'/'E2' select how the main path downsamples.
        basic: build the two-convolution basic block instead of the bottleneck.
    """
    num_inputs = inputs.get_shape().as_list()[3]
    x = inputs

    # ------- shortcut -------
    if depth == num_inputs:
        if stride == 1:  # v1.5
            shortcut = x
        else:  # v1
            shortcut = builder.max_pooling2d(x, 1, stride)
    else:  # the downsample (first) block in each layer
        if 'D1' in arch_type:
            # ResNet-D: pool first, then a stride-1 1x1 projection.
            if stride == 1:
                shortcut = builder.average_pooling2d_stride_1(x, stride, stride)
            else:
                shortcut = builder.average_pooling2d(x, stride, stride)
            shortcut = builder.conv2d_linear(shortcut, depth, 1, 1, 'SAME')
        elif 'D2' in arch_type:
            shortcut = builder.conv2d_linear(x, depth, 3, stride, 'SAME')
        elif 'D3' in arch_type:
            # ResNet-D: stride-1 1x1 projection first, then pool.
            shortcut = builder.conv2d_linear(x, depth, 1, 1, 'SAME')
            shortcut = builder.average_pooling2d(shortcut, stride, stride)
        else:
            shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')

    # ------- main path -------
    if basic:
        x = builder.pad2d(x, 1)
        x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
        x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
    else:
        x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME')
        if stride == 1:
            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
        elif 'E1' in arch_type:
            x = builder.average_pooling2d(x, stride, stride)
            x = builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME')
        elif 'E2' in arch_type:
            x = builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME')
            # stride != 1 on this path, so the former stride-1 special case
            # was unreachable and has been removed.
            x = builder.average_pooling2d(x, stride, stride)
        else:  # E0
            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME')
        # Final 1x1 convolution uses the "last BN" gamma initialization.
        x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME')

    x = tf.nn.relu(x + shortcut)
    return x
def resnext_bottleneck(builder, inputs, depth, depth_bottleneck, stride, filters, arch_type,
                       basic=False):
    """Build one ResNeXt block whose 3x3 stage is split into `_Cardi` groups.

    Args:
        builder: object providing conv2d / batch_norm / pooling helpers.
        inputs: input tensor; its channel count is read from axis 3.
        depth: number of output channels of the block.
        depth_bottleneck: channel count of the grouped stage; expected to be
            a multiple of `_Cardi`.
        stride: stride applied by the 3x3 convolutions (and the shortcut).
        filters: unused; kept so existing callers keep working.
        arch_type: unused; kept so existing callers keep working.
        basic: build the two-convolution basic block instead.
    """
    num_inputs = inputs.get_shape().as_list()[3]
    x = inputs
    with tf.name_scope('resnet_v1'):
        if depth == num_inputs:
            if stride == 1:  # v1.5
                shortcut = x
            else:  # v1
                shortcut = builder.max_pooling2d(x, 1, stride)
        else:  # the downsample (first) block in each layer
            shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')

        if basic:
            x = builder.pad2d(x, 1)
            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
            x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
        else:
            # ----- split layer ------
            x = builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME')
            group_inputs = tf.split(x, _Cardi, axis=3)
            # Integer division: the filter count must be an int, and "/"
            # yields a float under Python 3.
            group_width = depth_bottleneck // _Cardi
            layers_split = []
            for i in range(_Cardi):
                with tf.name_scope('cardi_' + str(i)):
                    split = builder.conv2d_no_act_no_bn(group_inputs[i], group_width, 3, stride, 'SAME')
                layers_split.append(split)
            # Merge the groups again, then normalize and activate once.
            x = tf.concat(layers_split, axis=3)
            x = builder.batch_norm(x)
            x = tf.nn.relu(x)
            x = builder.conv2d_linear_last_bn(x, depth, 1, 1, 'SAME')

        x = tf.nn.relu(x + shortcut)
    return x
def resnet_bottleneck_v2(builder, inputs, depth, depth_bottleneck, stride, filters, arch_type,
                         basic=False):
    """Build one pre-activation residual block and return main + shortcut.

    Args:
        builder: object providing conv2d / batch_norm / pooling helpers.
        inputs: input tensor.
        depth: number of output channels of the block.
        depth_bottleneck: channel count of the inner convolutions.
        stride: stride of the block.
        filters: not read by this function.
        arch_type: string; 'D1'/'D2'/'D3' select the projection-shortcut
            variant and 'E1'/'E2' select how the main path downsamples.
        basic: build the two-convolution basic block instead of the bottleneck.
    """
    def bn_relu(tensor):
        # Batch norm followed by ReLU, the unit repeated throughout this block.
        return tf.nn.relu(builder.batch_norm(tensor))

    # NOTE(review): the channel count is read from axis 1 here, whereas
    # resnet_bottleneck_v1 reads axis 3 — confirm the intended data layout.
    num_inputs = inputs.get_shape().as_list()[1]
    x = inputs
    with tf.name_scope('resnet_v1'):
        # ------- shortcut ---------------
        same_width = (depth == num_inputs)
        if same_width and stride == 1:  # v1.5
            shortcut = x
            x = bn_relu(x)
        elif same_width:  # v1
            shortcut = builder.max_pooling2d(x, 1, stride)
        else:  # the downsample (first) block in each layer
            x = bn_relu(x)
            if 'D1' in arch_type:
                pooled = builder.average_pooling2d(x, stride, stride)  # ResNet-D
                shortcut = builder.conv2d_linear(pooled, depth, 1, 1, 'SAME')
            elif 'D2' in arch_type:
                shortcut = builder.conv2d_linear(x, depth, 3, stride, 'SAME')
            elif 'D3' in arch_type:
                projected = builder.conv2d_linear(x, depth, 1, 1, 'SAME')
                shortcut = builder.average_pooling2d(projected, stride, stride)  # ResNet-D
            else:
                shortcut = builder.conv2d_linear(x, depth, 1, stride, 'SAME')

        # -------- mainstream ----------------
        if basic:
            x = builder.pad2d(x, 1)
            x = builder.conv2d(x, depth_bottleneck, 3, stride, 'VALID')
            x = builder.conv2d_linear(x, depth, 3, 1, 'SAME')
        else:
            x = bn_relu(builder.conv2d(x, depth_bottleneck, 1, 1, 'SAME'))
            if stride == 1:
                x = bn_relu(builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME'))
            elif 'E1' in arch_type:
                x = builder.average_pooling2d(x, stride, stride)
                x = bn_relu(builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME'))
            elif 'E2' in arch_type:
                x = bn_relu(builder.conv2d(x, depth_bottleneck, 3, 1, 'SAME'))
                x = builder.average_pooling2d(x, stride, stride)
            else:  # E0
                x = bn_relu(builder.conv2d(x, depth_bottleneck, 3, stride, 'SAME'))
            x = builder.conv2d_linear(x, depth, 1, 1, 'SAME')

        return x + shortcut
def inference_resnext_impl(builder, inputs, layer_counts, arch_type='C1+D', resnet_version='v1.5', basic=False):
    """Assemble the ResNeXt network and return the 1001-way logits.

    Args:
        builder: layer factory used for every convolution and pooling op.
        inputs: image batch.
        layer_counts: number of blocks in each of the four stages.
        arch_type: forwarded to every block.
        resnet_version: not read by this function.
        basic: forwarded to every block.
    """
    # Stem: explicit 3-pixel padding, 7x7/2 convolution, 3x3/2 max pooling.
    x = builder.pad2d(inputs, 3)
    x = builder.conv2d(x, 64, 7, 2, 'VALID')
    num_filters = 64
    x = builder.max_pooling2d(x, 3, 2, 'SAME')

    # (block depth, bottleneck depth, stride of the first block) per stage.
    stage_table = [(256, 128, 1), (512, 256, 2), (1024, 512, 2), (2048, 1024, 2)]
    for stage_no, (block_depth, inner_depth, lead_stride) in enumerate(stage_table):
        for block_no in range(layer_counts[stage_no]):
            if stage_no > 0:
                # Doubled for every block after the first stage.
                num_filters = num_filters * 2
            block_stride = lead_stride if block_no == 0 else 1
            x = resnext_bottleneck(builder, x, block_depth, inner_depth, block_stride,
                                   num_filters, arch_type, basic)
    print ('====================Final x:', x)

    # Head: global average over axes 1 and 2, then the classifier.
    x = tf.reduce_mean(x, [1, 2], keepdims=True)
    x = tf.identity(x, 'final_reduce_mean')
    x = tf.reshape(x, [-1, 2048])
    x = tf.layers.dense(inputs=x, units=1001, kernel_initializer=tf.variance_scaling_initializer())
    return tf.identity(x, 'final_dense')
def inference_resnet_v1_impl(builder, inputs, layer_counts, arch_type='C1+D', resnet_version='v1.5', basic=False):
    """Assemble the ResNet v1 / v1.5 network and return the 1001-way logits.

    Args:
        builder: layer factory used for every convolution.
        inputs: image batch.
        layer_counts: number of blocks in each of the four stages.
        arch_type: 'C1'/'C2'/'C3' select a three-convolution stem (checked in
            that order); anything else selects the single 7x7 stem. The
            string is also forwarded to every block.
        resnet_version: not read by this function.
        basic: forwarded to every block.
    """
    # Stem variants as (filters, kernel, stride, padding) per convolution.
    stem_variants = (
        ('C1', ((32, 3, 2, 'SAME'), (32, 3, 1, 'SAME'), (64, 3, 1, 'SAME'))),  # ResNet-C
        ('C2', ((32, 3, 1, 'SAME'), (32, 3, 2, 'VALID'), (64, 3, 1, 'VALID'))),
        ('C3', ((32, 3, 1, 'VALID'), (32, 3, 1, 'VALID'), (64, 3, 2, 'VALID'))),
    )
    plain_stem = ((64, 7, 2, 'SAME'),)
    stem = next((convs for tag, convs in stem_variants if tag in arch_type), plain_stem)

    x = inputs
    for out_channels, kernel, conv_stride, pad_mode in stem:
        x = builder.conv2d(x, out_channels, kernel, conv_stride, pad_mode)
    num_filters = 64
    # Max pooling via the argmax variant; the argmax output is not used.
    x, _ = tf.nn.max_pool_with_argmax(input=x, ksize=(1,3,3,1), strides=(1,2,2,1), padding='SAME')

    # (block depth, bottleneck depth, stride of the first block) per stage.
    stages = ((256, 64, 1), (512, 128, 2), (1024, 256, 2), (2048, 512, 2))
    for stage_index, (depth, depth_bottleneck, first_stride) in enumerate(stages):
        for block_index in range(layer_counts[stage_index]):
            if stage_index:
                # Doubled for every block after the first stage.
                num_filters = num_filters * 2
            stride = first_stride if block_index == 0 else 1
            x = resnet_bottleneck_v1(builder, x, depth, depth_bottleneck, stride,
                                     num_filters, arch_type, basic)

    # Head: global average over axes 1 and 2, then the classifier.
    x = tf.reduce_mean(x, [1, 2], keepdims=True)
    x = tf.identity(x, 'final_reduce_mean')
    x = tf.reshape(x, [-1, 2048])
    x = tf.layers.dense(inputs=x, units=1001, kernel_initializer=tf.random_normal_initializer(stddev=0.01))
    return tf.identity(x, 'final_dense')
def inference_resnet_v2_impl(builder, inputs, layer_counts, arch_type='C1+D', basic=False):
    """Assemble the pre-activation ResNet v2 trunk and return spatially averaged features.

    Args:
        builder: layer factory used for every convolution, norm and pooling op.
        inputs: image batch.
        layer_counts: number of blocks in each of the four stages.
        arch_type: 'C1'/'C2'/'C3' select a three-convolution stem (checked in
            that order); anything else selects the single 7x7 stem. The
            string is also forwarded to every block.
        basic: forwarded to every block.
    """
    def conv_bn_relu(tensor, out_channels, kernel, conv_stride, pad_mode):
        # Convolution followed by explicit batch norm and ReLU.
        tensor = builder.conv2d(tensor, out_channels, kernel, conv_stride, pad_mode)
        tensor = builder.batch_norm(tensor)
        return tf.nn.relu(tensor)

    # Stem variants as (filters, kernel, stride, padding) per convolution.
    stem_variants = (
        ('C1', ((32, 3, 2, 'VALID'), (32, 3, 1, 'VALID'), (64, 3, 1, 'SAME'))),  # ResNet-C
        ('C2', ((32, 3, 1, 'SAME'), (32, 3, 2, 'VALID'), (64, 3, 1, 'VALID'))),
        ('C3', ((32, 3, 1, 'VALID'), (32, 3, 1, 'VALID'), (64, 3, 2, 'VALID'))),
    )
    stem = ((64, 7, 2, 'VALID'),)
    for tag, convs in stem_variants:
        if tag in arch_type:
            stem = convs
            break

    x = builder.pad2d(inputs, 3)
    for conv_spec in stem:
        x = conv_bn_relu(x, *conv_spec)
    num_filters = 64
    x = builder.max_pooling2d(x, 3, 2, 'SAME')

    # (block depth, bottleneck depth, stride of the first block) per stage.
    stages = ((256, 64, 1), (512, 128, 2), (1024, 256, 2), (2048, 512, 2))
    for stage_index, (depth, depth_bottleneck, first_stride) in enumerate(stages):
        for block_index in range(layer_counts[stage_index]):
            if stage_index != 0:
                # Doubled for every block after the first stage.
                num_filters = num_filters * 2
            stride = first_stride if block_index == 0 else 1
            x = resnet_bottleneck_v2(builder, x, depth, depth_bottleneck, stride,
                                     num_filters, arch_type, basic)
    return builder.spatial_average2d(x)
def inference_resnet_v1(config, inputs, nlayer, data_format='channels_last',
                        training=False, conv_initializer=None, bn_init_mode='adv_bn_init', bn_gamma_initial_value=1.0 ):
    """Build the residual network chosen by ``config['resnet_version']`` and ``nlayer``.

    Deep Residual Networks family of models, https://arxiv.org/abs/1512.03385

    Args:
        config: mapping. ``'resnet_version'`` must be ``'v1.5'``, ``'v2'`` or
            ``'resnext'``; ``'arch_type'`` is forwarded to the selected
            ``inference_*_impl`` function.
        inputs: input tensor, forwarded unchanged.
        nlayer: network depth; one of 18, 34, 50, 101, 152.
        data_format: forwarded to ``LayerBuilder``.
        training: forwarded to ``LayerBuilder``.
        conv_initializer: forwarded to ``LayerBuilder``.
        bn_init_mode: forwarded to ``LayerBuilder``.
        bn_gamma_initial_value: forwarded to ``LayerBuilder``.

    Returns:
        Whatever the selected ``inference_*_impl`` function returns.

    Raises:
        ValueError: if the version or the depth is not one of the supported values.
    """
    version = config['resnet_version']

    # First positional argument and use_batch_norm flag given to LayerBuilder.
    if version == 'v1.5':
        activation, use_batch_norm = tf.nn.relu, True
    elif version == 'v2':
        activation, use_batch_norm = None, False
    elif version == 'resnext':
        activation, use_batch_norm = tf.nn.relu, True
    else:
        raise ValueError("Invalid resnet version")

    builder = LayerBuilder(activation, data_format, training, use_batch_norm=use_batch_norm,
                           conv_initializer=conv_initializer, bn_init_mode=bn_init_mode,
                           bn_gamma_initial_value=bn_gamma_initial_value)

    # depth -> (blocks per stage, whether basic=True is passed explicitly)
    depth_table = {
        18: ([2, 2, 2, 2], True),
        34: ([3, 4, 6, 3], True),
        50: ([3, 4, 6, 3], False),
        101: ([3, 4, 23, 3], False),
        152: ([3, 8, 36, 3], False),
    }
    if nlayer not in depth_table:
        raise ValueError("Invalid nlayer (%i); must be one of: 18,34,50,101,152" %
                         nlayer)
    layer_counts, use_basic = depth_table[nlayer]

    # The deeper variants rely on the callee's own default for `basic`.
    extra = {'basic': True} if use_basic else {}

    if version == 'v1.5':
        return inference_resnet_v1_impl(builder, inputs, layer_counts, config['arch_type'],
                                        config['resnet_version'], **extra)
    if version == 'v2':
        return inference_resnet_v2_impl(builder, inputs, layer_counts, config['arch_type'], **extra)
    return inference_resnext_impl(builder, inputs, layer_counts, config['arch_type'], **extra)
@@ -0,0 +1,228 @@
import six
import tensorflow as tf
class Optimizer:
    """Small factory that decorates a base optimizer using the run configuration."""

    def __init__(self, config):
        # Handed on to the wrapper optimizer, which reads its settings from it.
        self.config = config

    def get_lbs_optimizer(self, opt):  # TODO input is ( self, hyper_param )
        """Return ``opt`` wrapped in a ``MixedPrecisionOptimizer``.

        Only the mixed-precision wrapper is applied; no
        ``LargeBatchSizeOptimizer`` is put around ``opt`` here.
        """
        return MixedPrecisionOptimizer(opt, self.config)
class MixedPrecisionOptimizer(tf.train.Optimizer):
    """An optimizer that updates trainable variables in fp32.

    Wraps another optimizer. The loss is multiplied by a static loss scale
    before differentiation and the gradients are divided by it afterwards.
    For float16 variables the update is applied to a float32 copy, and the
    float16 variable is then overwritten with the (saturating) cast of that
    copy.
    """

    def __init__(self, optimizer, config):
        """Create the wrapper and one float32 copy per trainable variable.

        Args:
            optimizer: the ``tf.train.Optimizer`` to wrap.
            config: mapping; ``config['loss_scale']`` must be convertible
                with ``float()``.
        """
        super(MixedPrecisionOptimizer, self).__init__(
            optimizer._use_locking,
            optimizer._name + '-MP',
        )
        self._optimizer = optimizer
        self._config = config
        self._loss_scale = float(self._config['loss_scale'])
        # fp32 copy name -> fp16 variable it shadows; filled in compute_gradients.
        self._fp32_to_fp16 = {}
        var_list = (
            tf.trainable_variables() +
            tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        with tf.device('/gpu:0'):
            self.var_fp32_copy = [
                tf.Variable(tf.cast(v.initialized_value(), tf.float32),
                            dtype=tf.float32, trainable=False,
                            collections=[tf.GraphKeys.GLOBAL_VARIABLES, "FP32_MASTER_COPIES"])
                for v in var_list
            ]

    def compute_gradients(self, loss, var_list=None,
                          gate_gradients=tf.train.Optimizer.GATE_OP,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          grad_loss=None):
        """Compute loss-scaled gradients and return them unscaled.

        Arguments are those of ``tf.train.Optimizer.compute_gradients`` and
        are forwarded to the wrapped optimizer.

        Returns:
            A list of ``(gradient, variable)`` pairs. For float16 variables
            the pair holds the float32 gradient and the float32 copy of the
            variable. Variables without a gradient are returned as
            ``(None, variable)``.
        """
        if var_list is None:
            var_list = (
                tf.trainable_variables() +
                tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        if self._loss_scale != 1.0:
            loss = tf.scalar_mul(self._loss_scale, loss)
        grads_and_vars_fp16 = self._optimizer.compute_gradients(
            loss, var_list=var_list,
            gate_gradients=gate_gradients,
            aggregation_method=aggregation_method,
            colocate_gradients_with_ops=colocate_gradients_with_ops,
            grad_loss=grad_loss,
        )

        # Swap fp16 variables for their fp32 copies and remember the mapping.
        grads_and_vars_fp32 = []
        with tf.variable_scope('FP32-master-copy'):
            for i, (grad, var) in enumerate(grads_and_vars_fp16):
                if grad is None:
                    grads_and_vars_fp32.append((None, var))
                elif var.dtype.base_dtype == tf.float16:
                    # assumes position i here matches the variable order seen
                    # in __init__ -- TODO confirm for custom var_list values
                    fp32_var = self.var_fp32_copy[i]
                    self._fp32_to_fp16[fp32_var.name] = var
                    fp32_grad = tf.cast(grad, tf.float32)
                    grads_and_vars_fp32.append((fp32_grad, fp32_var))
                else:
                    grads_and_vars_fp32.append((grad, var))

        # Undo the loss scaling. A missing gradient stays None: dividing it
        # would raise TypeError.
        return [
            (g / self._loss_scale if g is not None else None, v)
            for g, v in grads_and_vars_fp32
        ]

    def apply_gradients(self, grads_and_vars, *args, **kwargs):
        """Apply the gradients, then refresh the fp16 variables from their fp32 copies.

        Returns:
            The grouped fp16 assignment ops (each of which runs after the
            wrapped optimizer's update) when any fp16 variable is tracked,
            otherwise the wrapped optimizer's update op.
        """
        update_op = self._optimizer.apply_gradients(grads_and_vars, *args, **kwargs)
        apply_ops = []
        with tf.control_dependencies([update_op]):
            for grad, var in grads_and_vars:
                if var.name in self._fp32_to_fp16:
                    dst_var = self._fp32_to_fp16[var.name]
                    apply_ops.append(
                        tf.assign(dst_var, tf.saturate_cast(var, tf.float16)))
        if apply_ops:
            return tf.group(apply_ops)
        return update_op
class LargeBatchSizeOptimizer(tf.train.Optimizer):
    """Optimizer wrapper adding loss-scale removal, weight decay and optional LARS.

    Parameters:
        - optimizer: the optimizer to wrap, e.g. ``tf.train.MomentumOptimizer``
        - weight_decay: coefficient of the ``weight_decay * var`` term added
          to the gradient of every variable whose name contains neither
          ``'BatchNorm'`` nor ``'bias'``
        - clip, epsilon: stored on the instance; not read by any method of
          this class
        - accum_dtype: dtype of the per-variable accumulators created in the
          constructor
        - use_lars: when true, decayed gradients are additionally multiplied
          by a clipped layer-wise coefficient
        - bn_lr_scale: factor applied to the gradients of ``'BatchNorm'`` and
          ``'bias'`` variables
        - name
        - use_locking
    """

    def __init__(self, optimizer, weight_decay, clip=True, epsilon=1., accum_dtype=tf.float16, use_lars=True, bn_lr_scale=1.0,
                 name="LarcOptimizer", use_locking=False):
        super(LargeBatchSizeOptimizer, self).__init__(
            name=name, use_locking=use_locking)
        self._optimizer = optimizer
        self._weight_decay = weight_decay
        self._clip = clip
        self._epsilon = float(epsilon)
        self._accum_dtype = accum_dtype
        self._use_lars = use_lars
        self._bn_lr_scale = bn_lr_scale
        var_list = (
            tf.trainable_variables() +
            tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        with tf.device('/gpu:0'):
            # One zero-initialised local variable per trainable variable.
            self._grads_accum = [
                tf.Variable(tf.cast(tf.zeros_like(v.initialized_value()), self._accum_dtype),
                            dtype=self._accum_dtype, trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES])
                for v in var_list
            ]

    def compute_gradients(self, *args, **kwargs):
        """Delegate to the wrapped optimizer unchanged."""
        return self._optimizer.compute_gradients(*args, **kwargs)

    def apply_gradients(self, gradvars, loss_scale, *args, **kwargs):
        """Post-process the gradients and apply them with the wrapped optimizer.

        Args:
            gradvars: iterable of ``(gradient, variable)`` pairs; pairs whose
                gradient is ``None`` are skipped.
            loss_scale: value every gradient is divided by before use.
            *args, **kwargs: accepted for signature compatibility; not used.

        Returns:
            A grouped op wrapping the wrapped optimizer's update, which is
            run against ``tf.train.get_global_step()``.
        """
        grads_and_vars_clean = [
            (grad, var) for grad, var in gradvars if grad is not None
        ]
        # post_process_grads includes Lars
        processed_grads_and_vars = self.post_process_grads(grads_and_vars_clean, loss_scale)
        red_grad_updates = self._optimizer.apply_gradients(
            processed_grads_and_vars, global_step=tf.train.get_global_step())
        return tf.group(red_grad_updates)

    def _lars_coefficient(self, grad, var):
        """Return 0.001 * |var| / (|grad| + wd * |var| + 1e-5), clipped to [0.001, 50]."""
        grad_norm = tf.norm(grad, ord='euclidean')
        weight_norm = tf.norm(var, ord='euclidean')
        grad_norm_wd = tf.add(grad_norm, tf.multiply(self._weight_decay, weight_norm))
        rescale_factor = tf.div(tf.multiply(0.001, weight_norm),
                                tf.add(grad_norm_wd, tf.constant(1e-5, tf.float32)))
        return tf.clip_by_value(rescale_factor, 0.001, 50.0)

    def post_process_grads(self, grads_and_vars, loss_scale):
        """Unscale the gradients, then apply weight decay and (optionally) LARS.

        Args:
            grads_and_vars: iterable of ``(gradient, variable)`` pairs with
                non-``None`` gradients.
            loss_scale: value every gradient is divided by first.

        Returns:
            A list of ``(gradient, variable)`` pairs in input order.
        """
        g_and_v_scaled = [(g / loss_scale, v) for g, v in grads_and_vars]

        processed = []
        for g, var in g_and_v_scaled:
            if 'BatchNorm' in var.name or 'bias' in var.name:
                # No decay and no LARS for these variables; only the
                # dedicated scale factor is applied.
                g = self._bn_lr_scale * g
            else:
                # The LARS coefficient is computed from the undecayed gradient.
                coeffi = self._lars_coefficient(g, var) if self._use_lars else None
                decayed_g = tf.add(g, tf.multiply(self._weight_decay, var))
                g = decayed_g if coeffi is None else tf.multiply(coeffi, decayed_g)
            processed.append((g, var))
        return processed
@@ -0,0 +1,172 @@
import tensorflow as tf
import math
import time
from . import train_helper
from .train_helper import stage
from utils.logger import rank0log
from tensorflow.contrib.offline_train.python.npu.npu_config import NPURunConfig
from tensorflow.contrib.offline_train.python.npu.npu_estimator import NPUEstimator
from tensorflow.contrib.offline_train.python.npu.npu_optimizer import NPUDistributedOptimizer
class GPUBaseTrain(object):
    """Estimator-based driver offering train, evaluate and train-and-evaluate runs.

    A plain ``tf.estimator.Estimator`` is built when
    ``config['accelerator'] == 'gpu'``; any other value builds an
    ``NPUEstimator``.
    """

    # Column header and row layout shared by both evaluation reports.
    _REPORT_HEADER = ' step epoch top1 top5 loss checkpoint_time(UTC)'
    _REPORT_ROW = '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'

    def __init__(self, session, config, data, model, logger):
        """Store the collaborators and build the estimator plus its training hooks.

        Args:
            session: object whose ``get_config()`` supplies the session
                config for the GPU estimator.
            config: mapping with the run settings read by the methods below.
            data: object providing ``get_train_input_fn`` / ``get_eval_input_fn``.
            model: object providing ``get_estimator_model_func``.
            logger: hook appended to the training hooks; its ``.logger``
                attribute is used for text logging.
        """
        self.sess = session
        self.config = config
        self.data = data
        self.model = model
        self.logger = logger
        self.print_logger = self.logger.logger
        self.all_preds = []
        self.all_targets = []
        if self.config['accelerator'] == 'gpu':
            self.classifier, self.training_hook = self.get_classifier()
        else:
            self.classifier, self.training_hook = self.get_npu_classifier()

    def get_classifier(self):
        """Return ``(estimator, hooks)`` for a GPU run."""
        checkpointing = self.config['do_checkpoint']
        classifier = tf.estimator.Estimator(
            model_fn=self.model.get_estimator_model_func,
            model_dir=self.config['log_dir'],
            config=tf.estimator.RunConfig(
                session_config=self.sess.get_config(),
                save_summary_steps=self.config['save_summary_steps'] if checkpointing else None,
                save_checkpoints_steps=self.config['save_checkpoints_steps'] if checkpointing else None,
                keep_checkpoint_max=None
            )
        )
        training_hooks = [train_helper.PrefillStagingAreasHook()]
        training_hooks.append(self.logger)
        return classifier, training_hooks

    def get_npu_classifier(self):
        """Return ``(estimator, hooks)`` for an NPU run.

        With ``config['debug']`` set, checkpoints are written every 112590
        steps (at most 5 kept) and the staging-area prefill hook is added.
        Otherwise summaries and step-count logging are switched off and
        checkpoints are saved on a 1e9-second timer.
        """
        session_config = tf.ConfigProto(
            inter_op_parallelism_threads=10,
            intra_op_parallelism_threads=10,
            allow_soft_placement=True)
        if self.config['debug']:
            run_config = NPURunConfig(
                enable_auto_mix_precision=True,
                enable_data_pre_proc=True,
                save_checkpoints_steps=112590,
                session_config=session_config,
                model_dir=self.config['model_dir'],
                iterations_per_loop=self.config['iterations_per_loop'],
                keep_checkpoint_max=5)
        else:
            run_config = NPURunConfig(
                enable_auto_mix_precision=True,
                save_summary_steps=0,
                log_step_count_steps=None,
                enable_data_pre_proc=True,
                save_checkpoints_secs=1e9,
                session_config=session_config,
                model_dir=self.config['model_dir'],
                iterations_per_loop=self.config['iterations_per_loop'])
        classifier = NPUEstimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config
        )
        training_hooks = []
        if self.config['debug']:
            training_hooks = [train_helper.PrefillStagingAreasHook()]
        training_hooks.append(self.logger)
        return classifier, training_hooks

    def train(self):
        """Train until the global step reaches ``config['nstep']``."""
        print ('training steps: %d' % self.config['nstep'])
        self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                              max_steps=self.config['nstep'],
                              hooks=self.training_hook)

    def _epoch_of_step(self, step, num_ranks=1):
        """Return the epoch number (rounded up) that global step ``step`` falls in.

        One epoch is ``num_training_samples / (batch_size * num_ranks)`` steps.
        """
        steps_per_epoch = self.config['num_training_samples'] / (
            self.config['batch_size'] * num_ranks)
        return math.ceil(step / steps_per_epoch)

    def _log_report_row(self, c):
        """Log one evaluated checkpoint record as a table row."""
        # NOTE(review): the column header says UTC but the timestamp is
        # formatted with time.localtime -- confirm which one is intended.
        rank0log(self.print_logger, self._REPORT_ROW
                 .format(c['step'],
                         c['epoch'],
                         c['top1'] * 100,
                         c['top5'] * 100,
                         c['loss'],
                         time=time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(c['mtime']))))

    def evaluate(self):
        """Evaluate the checkpoints found in ``config['log_dir']`` and log a table.

        Every ``config['eval_interval']``-th checkpoint is evaluated, and the
        newest one always is. A ``KeyboardInterrupt`` is logged and swallowed.
        """
        rank0log(self.print_logger, "Evaluating")
        rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
        time.sleep(5)  # a little extra margin...
        try:
            ckpts = train_helper.sort_and_load_ckpts(self.config['log_dir'])
            for i, c in enumerate(ckpts):
                if i < len(ckpts) - 1 and i % self.config['eval_interval'] != 0:
                    continue
                eval_result = self.classifier.evaluate(
                    input_fn=lambda: self.data.get_eval_input_fn(),
                    checkpoint_path=c['path'])
                c['epoch'] = self._epoch_of_step(c['step'])
                c['top1'] = eval_result['val-top1acc']
                c['top5'] = eval_result['val-top5acc']
                c['loss'] = eval_result['loss']
            rank0log(self.print_logger, self._REPORT_HEADER)
            for c in ckpts:
                if 'top1' not in c:
                    continue
                self._log_report_row(c)
            rank0log(self.print_logger, "Finished evaluation")
        except KeyboardInterrupt:
            self.print_logger.error("Keyboard interrupt")

    def train_and_evaluate(self):
        """Alternate training cycles with an evaluation of the newest checkpoint.

        Runs ``num_epochs // epochs_between_evals`` cycles (default interval
        4), each training for ``nsteps_per_epoch * epochs_between_evals``
        steps. Stops early once top-1 accuracy in percent exceeds
        ``config['stop_threshold']`` (default 74.9).
        """
        epochs_between_evals = self.config.get('epochs_between_evals', 4)
        for _ in range(self.config['num_epochs'] // epochs_between_evals):
            rank0log(self.print_logger, "Starting a training cycle")
            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                                  steps=self.config['nsteps_per_epoch'] * epochs_between_evals,
                                  hooks=self.training_hook)
            rank0log(self.print_logger, "Starting to evaluate")
            rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
            time.sleep(5)  # a little extra margin...
            ckpts = train_helper.sort_and_load_ckpts(self.config['log_dir'])
            c = ckpts[-1]
            eval_result = self.classifier.evaluate(
                input_fn=lambda: self.data.get_eval_input_fn(),
                checkpoint_path=c['path'])
            # The worker count comes from the configuration; this module
            # never imports Horovod, so hvd.size() is not available here.
            c['epoch'] = self._epoch_of_step(c['step'], self.config['rank_size'])
            c['top1'] = eval_result['val-top1acc']
            c['top5'] = eval_result['val-top5acc']
            c['loss'] = eval_result['loss']
            rank0log(self.print_logger, self._REPORT_HEADER)
            self._log_report_row(c)
            if eval_result['val-top1acc'] * 100 > self.config.get('stop_threshold', 74.9):
                break
@@ -0,0 +1,253 @@
import os
import sys
import tensorflow as tf
import math
import time
from . import train_helper
from .train_helper import stage
from utils.logger import rank0log
# add by zwx5326390
from datetime import datetime
# import hwlog
from benchmark_log import hwlog
#from tensorflow.contrib.offline_train.python.npu.npu_config import NPURunConfig
#from tensorflow.contrib.offline_train.python.npu.npu_estimator import NPUEstimator
#from tensorflow.contrib.offline_train.python.npu.npu_optimizer import NPUDistributedOptimizer
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
# remark_logger = hwlog.get_logger(__file__, "hw_Resnext50.log")
# file_name = hwlog.get_file_name(__file__)
class GPUBaseTrain(object):
    """Train / evaluate driver built on a GPU ``Estimator`` or an ``NPUEstimator``.

    Accuracy figures and the epoch marker are additionally reported through
    ``hwlog.remark_print``.

    NOTE(review): ``evaluate`` lists checkpoints from ``config['ckpt_dir']``,
    ``train_and_evaluate`` from ``config['log_dir']``, and the NPU estimator
    is given ``config['model_dir']`` -- confirm these are meant to differ.
    """

    def __init__(self, session, config, data, model, logger):
        """Keep the collaborators and create the estimator with its hooks.

        ``config['accelerator'] == 'gpu'`` selects the plain estimator; any
        other value selects the NPU estimator.
        """
        self.sess = session
        self.config = config
        self.data = data
        self.model = model
        self.logger = logger
        self.print_logger = self.logger.logger
        self.all_preds = []
        self.all_targets = []
        if self.config['accelerator'] == 'gpu':
            built = self.get_classifier()
        else:
            from npu_bridge.estimator.npu.npu_config import NPURunConfig
            from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
            from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
            built = self.get_npu_classifier()
        self.classifier, self.training_hook = built

    def get_classifier(self):
        """Build the GPU estimator; returns ``(estimator, hooks)``."""
        run_config = tf.estimator.RunConfig(
            session_config=self.sess.get_config(),
            save_summary_steps=self.config['save_summary_steps'] if self.config['do_checkpoint'] else None,
            save_checkpoints_steps=self.config['save_checkpoints_steps'] if self.config['do_checkpoint'] else None,
            keep_checkpoint_max=None
        )
        estimator = tf.estimator.Estimator(
            model_fn=self.model.get_estimator_model_func,
            model_dir=self.config['log_dir'],
            config=run_config
        )
        return estimator, [train_helper.PrefillStagingAreasHook(), self.logger]

    def get_npu_classifier(self):
        """Build the NPU estimator; returns ``(estimator, hooks)``.

        Debug runs checkpoint every 112590 steps (keeping at most 5) and add
        the staging-area prefill hook. Other runs disable summaries and
        step-count logging and checkpoint on a 1e9-second timer.
        """
        session_config = tf.ConfigProto(
            inter_op_parallelism_threads=10,
            intra_op_parallelism_threads=10,
            allow_soft_placement=True)
        print (" config.debug:")
        print ( self.config['debug'])
        print (self.config['log_dir'])

        debug = self.config['debug']
        if debug:
            mode_kwargs = dict(save_checkpoints_steps=112590, keep_checkpoint_max=5)
        else:
            mode_kwargs = dict(save_summary_steps=0, log_step_count_steps=None,
                               save_checkpoints_secs=1e9)
        run_config = NPURunConfig(
            hcom_parallel=True,
            precision_mode='allow_mix_precision',
            enable_data_pre_proc=True,
            session_config=session_config,
            model_dir=self.config['model_dir'],
            iterations_per_loop=self.config['iterations_per_loop'],
            **mode_kwargs)

        estimator = NPUEstimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config
        )
        hooks = [train_helper.PrefillStagingAreasHook()] if self.config['debug'] else []
        hooks.append(self.logger)
        return estimator, hooks

    def train(self):
        """Report the epoch marker, then train up to global step ``config['nstep']``."""
        hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=self.config['num_epochs'])
        print ('training steps: %d' % self.config['nstep'])

        def train_input():
            return self.data.get_train_input_fn()

        self.classifier.train(input_fn=train_input,
                              max_steps=self.config['nstep'],
                              hooks=self.training_hook)

    def _evaluate_checkpoint(self, c):
        """Evaluate checkpoint record ``c``, report accuracy, store metrics in ``c``.

        Returns the raw evaluation result mapping.
        """
        def eval_input():
            return self.data.get_eval_input_fn()

        eval_result = self.classifier.evaluate(
            input_fn=eval_input,
            checkpoint_path=c['path'])
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=float(eval_result.get("val-top1acc")))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value=float(eval_result.get("val-top5acc")))
        # Steps per epoch account for all ranks: samples / (batch * ranks).
        c['epoch'] = math.ceil(c['step'] / (self.config['num_training_samples']/ (self.config['batch_size'] * self.config['rank_size'])))
        c['top1'] = eval_result['val-top1acc']
        c['top5'] = eval_result['val-top5acc']
        c['loss'] = eval_result['loss']
        return eval_result

    def _log_row(self, c):
        """Log one evaluated checkpoint record as a table row."""
        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['mtime']))
        rank0log(self.print_logger, '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'
                 .format(c['step'],
                         c['epoch'],
                         c['top1'] * 100,
                         c['top5'] * 100,
                         c['loss'],
                         time=stamp))

    def evaluate(self):
        """Evaluate the checkpoints in ``config['ckpt_dir']`` and log a result table.

        Every ``config['eval_interval']``-th checkpoint is evaluated and the
        newest one always is. ``KeyboardInterrupt`` is logged, not re-raised.
        """
        rank0log(self.print_logger, "Evaluating")
        rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
        time.sleep(5)  # a little extra margin...
        try:
            ckpts = train_helper.sort_and_load_ckpts(self.config['ckpt_dir'])
            last = len(ckpts) - 1
            for idx, record in enumerate(ckpts):
                if idx < last and idx % self.config['eval_interval'] != 0:
                    continue
                self._evaluate_checkpoint(record)
            rank0log(self.print_logger, ' step epoch top1 top5 loss checkpoint_time(UTC)')
            for record in ckpts:
                if 'top1' in record:
                    self._log_row(record)
            rank0log(self.print_logger, "Finished evaluation")
        except KeyboardInterrupt:
            self.print_logger.error("Keyboard interrupt")

    def train_and_evaluate(self):
        """Alternate training cycles with an evaluation of the newest checkpoint.

        Runs ``num_epochs // epochs_between_evals`` cycles (interval defaults
        to 4) and stops early once top-1 accuracy in percent exceeds
        ``config['stop_threshold']`` (default 74.9).
        """
        success = False
        epochs_between_evals = self.config.get('epochs_between_evals', 4)
        cycles = self.config['num_epochs'] // epochs_between_evals
        for _ in range(cycles):
            rank0log(self.print_logger, "Starting a training cycle")
            hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=self.config['num_epochs'])
            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                                  steps=self.config['nsteps_per_epoch'] * epochs_between_evals,
                                  hooks=self.training_hook)
            rank0log(self.print_logger, "Starting to evaluate")
            rank0log(self.print_logger, "Validation dataset size: {}".format(self.config['num_evaluating_samples'] ))
            time.sleep(5)  # a little extra margin...
            newest = train_helper.sort_and_load_ckpts(self.config['log_dir'])[-1]
            eval_result = self._evaluate_checkpoint(newest)
            rank0log(self.print_logger, ' step epoch top1 top5 loss checkpoint_time(UTC)')
            self._log_row(newest)
            if eval_result['val-top1acc']*100 > self.config.get('stop_threshold', 74.9):
                success = True
                break
@@ -0,0 +1,39 @@
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
import re
import os
from operator import itemgetter
class PrefillStagingAreasHook(tf.train.SessionRunHook):
    """Session hook that runs the registered staging-area put ops at session start."""

    def after_create_session(self, session, coord):
        """Run growing prefixes of the ``'STAGING_AREA_PUTS'`` collection.

        The first ``session.run`` executes only the first put op, the second
        executes the first two, and so on until all of them run together.
        """
        put_ops = tf.get_collection('STAGING_AREA_PUTS')
        for prefix_len in range(1, len(put_ops) + 1):
            session.run(put_ops[:prefix_len])
def stage(tensors):
    """Route ``tensors`` through a new ``StagingArea``.

    The put op is also added to the ``'STAGING_AREA_PUTS'`` graph collection.

    Args:
        tensors: sequence of tensors; their dtypes and static shapes define
            the staging area.

    Returns:
        ``(put_op, get_tensors)``: the op that stores ``tensors`` in the
        staging area and the result of the staging area's ``get()``.
    """
    dtypes = [t.dtype for t in tensors]
    shapes = [t.get_shape() for t in tensors]
    area = data_flow_ops.StagingArea(dtypes=dtypes, shapes=shapes)
    put_op = area.put(tensors)
    staged = area.get()
    tf.add_to_collection('STAGING_AREA_PUTS', put_op)
    return put_op, staged
def sort_and_load_ckpts(log_dir):
    """List the checkpoints in ``log_dir``, ordered by training step.

    A checkpoint is recognised by its index file, whose name must be exactly
    ``model.ckpt-<step>.index``.

    Args:
        log_dir: directory to scan (not recursive).

    Returns:
        A list of dicts sorted by ascending ``'step'``, each with:
        ``'step'`` (int), ``'path'`` (index file path without the ``.index``
        suffix) and ``'mtime'`` (modification time of the index file).

    Raises:
        OSError: if ``log_dir`` cannot be listed or a file cannot be stat'ed.
    """
    # Dots are escaped and the end is anchored so that names such as
    # 'model.ckpt-5.index.bak' or 'modelXckpt-5Yindex' are not mistaken for
    # checkpoints.
    index_name = re.compile(r'model\.ckpt-([0-9]+)\.index\Z')
    ckpts = []
    for f in os.listdir(log_dir):
        m = index_name.match(f)
        if m is None:
            continue
        fullpath = os.path.join(log_dir, f)
        ckpts.append({'step': int(m.group(1)),
                      'path': os.path.splitext(fullpath)[0],
                      'mtime': os.stat(fullpath).st_mtime,
                      })
    ckpts.sort(key=itemgetter('step'))
    return ckpts
@@ -0,0 +1,48 @@
import tensorflow as tf
import os,sys
class CreateSession():
    """Builds the ``tf.ConfigProto`` used for the run and exports GPU tuning variables."""

    def __init__(self, config):
        """Create the session config for ``config['accelerator']`` and set the environment.

        For accelerator ``'1980'`` an ``NpuOptimizer`` custom graph optimizer
        is registered and the local devices are listed once with the
        resulting config. Any other accelerator gets a config with soft
        placement disabled.
        """
        self.config = config
        on_npu = self.config['accelerator'] == '1980'

        if on_npu:
            from tensorflow.python.client import device_lib
            from npu_bridge.estimator import npu_ops
            self.estimator_config = tf.ConfigProto(allow_soft_placement=True)
            custom_op = self.estimator_config.graph_options.rewrite_options.custom_optimizers.add()
            custom_op.name = "NpuOptimizer"
            params = custom_op.parameter_map
            params["use_off_line"].b = True
            # NOTE(review): 20 is written to the boolean field `.b` -- confirm
            # that this is the intended field for min_group_size.
            params["min_group_size"].b = 20
        else:
            self.estimator_config = tf.ConfigProto(allow_soft_placement=False)

        self.estimator_config.gpu_options.allow_growth = True

        if on_npu:
            # Result is not used; only the call itself matters here.
            device_lib.list_local_devices(self.estimator_config)

        self.set_env()

    def set_env(self):
        """Export the TensorFlow GPU tuning environment variables."""
        # TODO, get env from config file
        gpu_thread_count = 2
        os.environ.update({
            'TF_GPU_THREAD_MODE': 'gpu_private',
            'TF_GPU_THREAD_COUNT': str(gpu_thread_count),
            'TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT': '1',
            'TF_ENABLE_WINOGRAD_NONFUSED': '1',
        })

    def get_config(self):
        """Pin device ``'0'``, set the thread-pool sizes and return the session config."""
        cfg = self.estimator_config
        cfg.gpu_options.visible_device_list = str(0)
        cfg.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
        cfg.inter_op_parallelism_threads = 5
        return cfg
@@ -0,0 +1,103 @@
from __future__ import print_function
import tensorflow as tf
import logging
import numpy as np
import time
import sys,os
from datetime import datetime
# import hwlog
# remark_logger = hwlog.get_logger(__file__, "hw_Resnext50.log")
# file_name = hwlog.get_file_name(__file__)
from benchmark_log import hwlog
class LogSessionRunHook(tf.train.SessionRunHook):
def __init__(self, config, warmup_steps=5):
# def __init__(self, global_batch_size, num_records, display_every=10, logger=None):
self.global_batch_size = config['global_batch_size']
self.iterations_per_loop = config['iterations_per_loop']
self.warmup_steps = warmup_steps
self.iter_times = []
self.num_records = config['num_training_samples']
self.display_every = config['display_every']
self.logger = get_logger(config['log_name'], config['log_dir'])
rank0log(self.logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))
def after_create_session(self, session, coord):
rank0log(self.logger, 'Step Epoch Speed Loss FinLoss LR')
self.elapsed_secs = 0.
self.count = 0
def before_run(self, run_context):
self.t0 = time.time()
return tf.train.SessionRunArgs(
fetches=[tf.train.get_global_step(), 'loss:0', 'total_loss:0', 'learning_rate:0'])
# 'loss:0', 'loss:0', 'learning_rate:0'])
def after_run(self, run_context, run_values):
batch_time = time.time() - self.t0
self.iter_times.append(batch_time)
self.elapsed_secs += batch_time
self.count += 1
global_step, loss, total_loss, lr = run_values.results
if global_step == 1 or global_step % self.display_every == 0:
dt = self.elapsed_secs / self.count
img_per_sec = self.global_batch_size * self.iterations_per_loop / dt
epoch = global_step * self.global_batch_size / self.num_records
self.logger.info('step:%6i epoch:%5.1f FPS:%7.1f loss:%6.3f total_loss:%6.3f lr:%7.7f' %
(global_step, epoch, img_per_sec, loss, total_loss, lr))
self.elapsed_secs = 0.
self.count = 0
# add by zwx5326390
# work_num = "device " + str(os.environ.get("DEVICE_INDEX"))
# date_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
# hwlog.vlogger.info("namespace:%s, time_ts:%s, FPS:%f, steps: %s" % (work_num, date_time,
# img_per_sec,
# global_step))
# date_time = hwlog.get_time()
# remark_logger.info("ABK time_ts: %s, fps: %f, steps: %s, file: %s, lineno: %s" % \
# (date_time, img_per_sec, global_step, file_name, \
# sys._getframe().f_lineno))
hwlog.remark_print(key=hwlog.FPS, value='%7.1f'%img_per_sec)
def get_average_speed(self):
    """Return the average throughput in images per second, excluding warm-up.

    The first ``self.warmup_steps`` entries of ``self.iter_times`` are
    dropped and the remaining per-run-call durations are averaged. The
    result uses the same formula as the FPS figure logged by after_run():
    ``global_batch_size * iterations_per_loop / mean_time``, so the two
    numbers agree when a run call executes several iterations.

    Returns:
        The throughput as a float, or ``0.0`` when no run call has been
        timed beyond the warm-up window (instead of ``nan`` plus a NumPy
        "mean of empty slice" warning).
    """
    steady_times = self.iter_times[self.warmup_steps:]
    if not steady_times:
        return 0.0
    avg_time = np.mean(steady_times)
    return self.global_batch_size * self.iterations_per_loop / avg_time
def rank0log(logger, *args, **kwargs):
    """Emit a message through ``logger`` if one is given, else through print().

    With a logger, the positional arguments are converted with ``str`` and
    concatenated without a separator into a single INFO record; ``kwargs``
    are ignored. Without a logger, ``args`` and ``kwargs`` are passed to
    ``print`` unchanged. No rank check is performed here despite the name.
    """
    if not logger:
        print(*args, **kwargs)
        return
    message = ''.join(map(str, args))
    logger.info(message)
def get_logger(log_name, log_dir):
    """Return the logger named ``log_name`` writing to the console and a file.

    The logger level is set to INFO. Messages are formatted as the bare
    message text and sent to a console ``StreamHandler`` (INFO and above)
    and to a ``FileHandler`` on ``<log_dir>/<log_name>`` (DEBUG and above).
    ``log_dir`` is created if it does not exist.

    Calling this function again for the same name and directory reuses the
    handlers that are already attached, so records are not duplicated.

    Args:
        log_name: logger name, also used as the log file name.
        log_dir: directory that receives the log file.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.INFO)  # INFO, ERROR

    # exist_ok tolerates several ranks creating a shared directory (e.g. on
    # NFS) at the same time.
    os.makedirs(log_dir, exist_ok=True)

    formatter = logging.Formatter('%(message)s')
    # FileHandler stores its target as an absolute path in baseFilename.
    log_path = os.path.abspath(os.path.join(log_dir, log_name))

    # FileHandler derives from StreamHandler, so exclude it explicitly when
    # looking for an existing console handler.
    has_console = any(
        isinstance(h, logging.StreamHandler) and not isinstance(h, logging.FileHandler)
        for h in logger.handlers
    )
    if not has_console:
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    has_file = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )
    if not has_file:
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    return logger
@@ -0,0 +1,6 @@
{
"server_count": "1",
"server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,18 @@
#!/bin/sh
# Start one train.sh worker per device id and wait for all of them.
#
# Usage: <this script> <device_id> [<device_id> ...]
#
# Each worker is started in the background from the directory that holds
# this script; start and completion are recorded in main.log next to it.

# Resolve the script directory; abort if it cannot be entered, because every
# path below is derived from it.
currentDir=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "${currentDir}" || exit 1

device_group=$@
main_log="${currentDir}/main.log"

# Append one timestamped INFO line to main.log.
log_info() {
    echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] $1" >> "${main_log}"
}

touch "${main_log}"

# device_group is expanded unquoted on purpose: an argument such as "0 1 2 3"
# passed as a single word is still split into individual device ids.
for device_phy_id in ${device_group}
do
    log_info "start: train.sh ${device_phy_id} & "
    "${currentDir}/train.sh" "${device_phy_id}" &
done

wait
log_info "all train.sh exit "
@@ -0,0 +1,41 @@
# Environment set-up for Ascend NPU training.
# The `{NAME}` tokens below look like template placeholders -- presumably a
# launcher substitutes them before this file is used; TODO confirm.

# main env
# Use the nnae installation when /usr/local/Ascend/nnae/latest exists,
# otherwise fall back to the ascend-toolkit installation.
# LD_LIBRARY_PATH is overwritten (its previous value is not kept), whereas
# PYTHONPATH and PATH are extended.
if [ -d /usr/local/Ascend/nnae/latest ];then
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
# NOTE(review): $projectDir is not assigned in this file; it must come from
# the calling environment, otherwise an empty entry is appended -- confirm.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
# Settings applied regardless of which installation was selected.
export SOC_VERSION=Ascend910
export HCCL_CONNECT_TIMEOUT=600
# user env
# Job identity and rank table location; RANK_SIZE / RANK_INDEX / RANK_ID are
# left commented out and are therefore not set by this file.
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
#export RANK_SIZE={RANK_SIZE}
#export RANK_INDEX={RANK_INDEX}
#export RANK_ID={RANK_ID}
# profiling env
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}
# debug env
# All debug switches are disabled; uncomment a line to export it.
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0
# system env
# Remove the core-dump size limit for this shell and the processes it starts.
ulimit -c unlimited
@@ -0,0 +1,33 @@
#!/bin/sh
# Run one training process bound to a single device.
#
# Usage: train.sh <device_id>
#
# The training command is given by the {RUN_ALGORITHM_CMD} and
# {CHECKPOINT_DIR} tokens, which look like template placeholders --
# presumably substituted before this script is executed; TODO confirm.
#
# Output files, all next to this script:
#   train_<device_id>.log  training stdout/stderr plus a final
#                          "turing train success" / "turing train fail" line
#   env_<device_id>.log    snapshot of the environment used for the run
#   <device_id>/           working directory of the run (emptied first)
#
# Exit status: 0 when training succeeded; non-zero when the device id is
# missing, the working directory cannot be entered, or training failed
# (the training command's own status is propagated).

currentDir=$(cd "$(dirname "$0")"; pwd)
cd "${currentDir}" || exit 1
PWD=${currentDir}

device_id=$1
if [ x"${device_id}" = x ] ;
then
    # No device id: record the failure and report it to the caller as well.
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
    exit 1
fi
export DEVICE_ID=${device_id}

# Global device index across servers, assuming 8 devices per server.
# An unset RANK_INDEX evaluates to 0 in the arithmetic expansion.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}

train_log="${currentDir}/train_${device_id}.log"
work_dir="${currentDir}/${device_id}"

env > "${currentDir}/env_${device_id}.log"

# Create the execution directory and start from a clean state.
mkdir -p "${work_dir}"
rm -rf "${work_dir}"/*
if ! cd "${work_dir}"; then
    echo "turing train fail" >> "${train_log}"
    exit 1
fi

# Start training; keep the status so it survives the logging below.
python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > "${train_log}" 2>&1
train_status=$?

if [ ${train_status} -eq 0 ] ;
then
    echo "turing train success" >> "${train_log}"
else
    echo "turing train fail" >> "${train_log}"
fi
exit ${train_status}
@@ -0,0 +1,99 @@
#!/bin/bash
# Entry point for ResNeXt50 TensorFlow training.
#
# Usage: run.sh <rank_size> <yaml_path> <tools_path> [<cluster> <all_ips>]
#   rank_size   number of devices to train on
#   yaml_path   yaml file whose "tensorflow_config" section holds the settings
#   tools_path  directory containing get_params_for_yaml.sh
#   cluster     "True" for multi-server training through mpirun
#   all_ips     whitespace-separated server IPs that receive the config files
# The last two arguments are only read when /.dockerenv exists.
#
# Steps: load the yaml settings, patch them into the Python config file,
# create the job result directory, then start scripts/train.sh once per
# device (or once through mpirun in cluster mode) and wait for completion.

rank_size=$1
yamlPath=$2
toolsPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)

if [ -f /.dockerenv ];then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi

# Load the yaml settings as shell variables.
# The eval line is kept exactly as written because its word splitting
# determines how the helper's output is interpreted.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
# Escape every "/" in data_url so it can be used inside a sed replacement.
# Backticks are required here: converting to $(...) would change how the
# backslashes are processed.
data_url_new=`echo ${data_url//\//\\\\/}`
echo ${data_url}
echo ${max_steps}
echo ${epoches}

# Select the config file to patch. Outside cluster mode the first
# "rank_size" entry of that file is rewritten; any rank_size that is not
# below 8 and not exactly 8 is treated as 16.
configDir=${currentDir}/code/resnext50_train/configs
if [ x"${CLUSTER}" == x"True" ];then
    jsonFilePath=${configDir}/res50_32bs_8p.py
else
    if [ "${rank_size}" -lt 8 ];then
        jsonFilePath=${configDir}/res50_32bs_1p.py
    else
        jsonFilePath=${configDir}/res50_32bs_8p.py
        [ "${rank_size}" -eq 8 ] || rank_size=16
    fi
    sed -i "0,/rank_size.*$/s//rank_size\': ${rank_size},/" "${jsonFilePath}"
fi

# Patch the remaining settings; the last command strips carriage returns.
sed -i "s/data_url.*$/data_url\': \'${data_url_new}\',/g" "${jsonFilePath}"
sed -i "s/num_epochs.*$/num_epochs\': ${epoches},/g" "${jsonFilePath}"
sed -i "0,/batch_size.*$/s//batch_size\': ${batch_size},/" "${jsonFilePath}"
sed -i "s/epochs_between_evals.*$/epochs_between_evals\': ${epochs_between_evals},/g" "${jsonFilePath}"
sed -i 's/\r//g' "${jsonFilePath}"

# Result directory of this job, named after the start time.
currtime=$(date +%Y%m%d%H%M%S)
train_job_dir=${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
mkdir -p "${train_job_dir}"
echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] ${train_job_dir} &"

# Device list: use device_group_<rank_size>p from the yaml when it is set
# and rank_size is below 8, otherwise the ids 0 .. rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ "${rank_size}" -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# The hw log is linked from the directory of the FIRST device in the group.
# Take the first whitespace-separated id rather than the first character so
# that multi-digit device ids are handled too.
device_group_str=$(echo ${device_group} | sed 's/ //g')
first_device_id=$(echo ${device_group} | awk '{print $1}')
echo ${device_group_str}
echo ${first_device_id}

rank_id=0
if [ x"${CLUSTER}" == x"True" ];then
    # Link the hw log of device 0 into the job directory.
    ln -snf "${train_job_dir}0/hw_resnext50.log" "${train_job_dir}"

    # Copy the yaml and the patched config to every other server.
    this_ip=$(hostname -I |awk '{print $1}')
    for ip in $MPIRUN_ALL_IP;do
        if [ x"$this_ip" != x"$ip" ];then
            scp "$yamlPath" "root@$ip:$yamlPath"
            scp "$jsonFilePath" "root@$ip:$jsonFilePath"
        fi
    done

    export PATH=$PATH:/usr/local/mpirun4.0/bin
    # Each continuation keeps a space before the backslash so that adjacent
    # options can never be joined into one word.
    mpirun -H "${mpirun_ip}" \
        --bind-to none -map-by slot \
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
        --prefix /usr/local/mpirun4.0/ \
        "${currentDir}/scripts/train.sh" 0 "$rank_size" "$yamlPath" "$currtime" "${toolsPath}" "${CLUSTER}"
else
    # Link the hw log of the first device into the job directory.
    ln -snf "${train_job_dir}${first_device_id}/hw_resnext50.log" "${train_job_dir}"

    # One background worker per device; rank ids are assigned in list order.
    for device_id in $device_group;do
        "${currentDir}/scripts/train.sh" "$device_id" "$rank_size" "$yamlPath" "$currtime" "${toolsPath}" "$rank_id" &
        rank_id=$((rank_id + 1))
    done
fi

wait
@@ -0,0 +1,117 @@
#!/usr/bin/env bash
# Train ResNeXt50 on one device and record the result.
#
# Usage: train.sh <device_id> <rank_size> <yaml_path> <currtime> <tools_path> <rank_id|True>
#   device_id   device to bind to (replaced by the mpi-reported id in cluster mode)
#   rank_size   total number of devices taking part in the job
#   yaml_path   yaml file whose "tensorflow_config" section holds the settings
#   currtime    timestamp naming the job result directory
#   tools_path  directory containing get_params_for_yaml.sh
#   $6          the rank id of this process, or the literal "True" to select
#               cluster mode, in which rank and device id come from atlasboost
#
# Results are written to <job_dir>/train_<device_id>.log and
# <job_dir>/<device_id>/hw_resnext50.log. The script itself does not
# propagate the training status as its exit status.

device_id=$1
rank_size=$2
yamlPath=$3
currtime=$4
toolsPath=$5

currentDir=$(cd "$(dirname "$0")/.."; pwd)
export train_job_dir=${currentDir%train*}/train/result/tf_resnext50/training_job_${currtime}/
mkdir -p "${train_job_dir}"

source "${currentDir}/config/npu_set_env.sh"

# Load the yaml settings as shell variables (kept exactly as written; its
# word splitting determines how the helper's output is interpreted).
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")

# Name of the remark log file; must be "hw_" followed by the lower-case
# model name.
export REMARK_LOG_FILE=hw_resnext50.log

# Make the benchmark logging module importable.
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export RANK_INDEX=0
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=$1
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}
export YAML_PATH=$3
export MODEL_CKPT_PATH=${currentDir}/result/ckpt${device_id}

# Profiling switches: only the exact value "True" enables them. The yaml
# variables are quoted so that an unset value does not break the test.
if [ "${profiling_mode}" == True ];
then
    export PROFILING_MODE=true
else
    export PROFILING_MODE=false
fi

if [ "${aicpu_profiling_mode}" == True ];
then
    export AICPU_PROFILING_MODE=true
else
    export AICPU_PROFILING_MODE=false
fi

export PROFILING_OPTIONS=${profiling_options}
export FP_POINT=${fp_point}
export BP_POINT=${bp_point}

cd "${train_job_dir}"

curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}

if [ x"$6" != x"True" ];then
    rank_id=$6
    export RANK_ID=$6
else
    # Cluster mode: initialise atlasboost and parse its output. The rank is
    # the last word printed; the device id is the text between
    # "deviceid = " and " phyid=".
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    # Collapse all whitespace, including newlines, to single spaces.
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    # Install the *.json found in the job directory as the rank table.
    cp "${train_job_dir}"/*.json "${currentDir}/config/${rank_size}p.json"
fi

# Create and enter the execution directory of this device.
mkdir -p "${train_job_dir}/${device_id}"
cd "${train_job_dir}/${device_id}"

train_log=${train_job_dir}/train_${device_id}.log
hw_log=${train_job_dir}/${device_id}/hw_resnext50.log

startTime_s=`date +%s`

# Choose the config file and evaluation flag for the run mode.
if [ x"$6" == x"True" ];then
    # Multiple servers.
    export CLUSTER=True
    rm -rf "${currentDir}"/result/*.log
    rm -rf "${currentDir}"/code/core.*
    config_file=res50_32bs_8p
    eval_flag=True
elif [ "${rank_size}" -le 4 ];then
    # Up to four devices on one server.
    config_file=res50_32bs_1p
    eval_flag=False
elif [ "${rank_size}" -le 8 ];then
    # Up to eight devices on one server.
    config_file=res50_32bs_8p
    eval_flag=True
else
    # No launch mode is defined for this rank_size.
    config_file=""
fi

if [ -n "${config_file}" ];then
    python3.7 "${currentDir}/code/resnext50_train/mains/res50.py" --config_file=${config_file} --max_train_steps="${max_steps}" --iterations_per_loop=1000 --debug=True --eval=${eval_flag} --model_dir="${currentDir}/result/ckpt${device_id}" > "${train_log}" 2>&1
    train_status=$?
else
    # Without this branch nothing would run and the status check below would
    # still see 0, reporting a success for a training that never started.
    echo "unsupported rank_size '${rank_size}' outside cluster mode (expected a value up to 8)" > "${train_log}"
    train_status=1
fi

if [ ${train_status} -eq 0 ] ;then
    result_msg=":::ABK 1.0.0 resnext50 train success"
else
    result_msg=":::ABK 1.0.0 resnext50 train failed"
fi
echo "${result_msg}"
echo "${result_msg}" >> "${train_log}"
echo "${result_msg}" >> "${hw_log}"

# Report the elapsed wall-clock time as hours:minutes:seconds (not padded).
endTime_s=`date +%s`
sumTime=$(( endTime_s - startTime_s ))
hour=$(( sumTime / 3600 ))
min=$(( (sumTime - hour * 3600) / 60 ))
sec=$(( sumTime - hour * 3600 - min * 60 ))
echo ${hour}:${min}:${sec}
echo ":::ABK 1.0.0 resnext50 train total time ${hour}:${min}:${sec}" >> "${hw_log}"