[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
tensorflow_config:
|
||||
# Two layer counts are available (6 and 12): use bert_base_layer6_cn.json / bert_base_layer12_cn.json for Chinese datasets, and bert_base_layer6_en.json / bert_base_layer12_en.json for English datasets
|
||||
bert_config_file: bert_base_layer6_cn.json
|
||||
#数据集句子长度是256时 设置为 256,40,句子长度是128时设置为128,20
|
||||
max_seq_length: 128
|
||||
max_predictions_per_seq: 20
|
||||
|
||||
# 最佳性能train_batch_size为160
|
||||
train_batch_size: 160
|
||||
learning_rate: 1e-4
|
||||
num_warmup_steps: 100
|
||||
num_train_steps: 1000
|
||||
optimizer_type: adam
|
||||
manual_fp16: True
|
||||
use_fp16_cls: True
|
||||
input_files_dir: /home/BertData/cn-wiki-128/
|
||||
eval_files_dir: /home/BertData/cn-wiki-128/
|
||||
npu_bert_debug: False
|
||||
npu_bert_use_tdt: True
|
||||
distributed: True
|
||||
do_train: True
|
||||
do_eval: False
|
||||
num_accumulation_steps: 1
|
||||
iterations_per_loop: 100
|
||||
npu_bert_loss_scale: 0
|
||||
save_checkpoints_steps: 1000
|
||||
npu_bert_clip_by_global_norm: False
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.140.199:8,90.90.140.229:8
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 6
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
#profiling 配置
|
||||
PROFILING_MODE: false
|
||||
# AI CPU profiling switch. Key spelling corrected from AICPU_PROFILING_MOD so it
# follows the same *_MODE naming as PROFILING_MODE above.
# NOTE(review): confirm the launcher reads AICPU_PROFILING_MODE.
AICPU_PROFILING_MODE: false
|
||||
PROFILING_OPTIONS: training_trace
|
||||
FP_POINT: bert/embeddings/GatherV2
|
||||
BP_POINT: gradients/bert/embeddings/IdentityN_1_grad/UnsortedSegmentSum
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
tensorflow_config:
|
||||
#中文数据用 bert_config_large_cn.json 英文用bert_config_large_en.json
|
||||
bert_config_file: bert_config_large_cn.json
|
||||
#数据集句子长度是256时 设置为 256,40,句子长度是128时设置为128,20
|
||||
max_seq_length: 128
|
||||
max_predictions_per_seq: 20
|
||||
|
||||
# 最佳性能train_batch_size为96,如果超显存,可调小至32
|
||||
train_batch_size: 96
|
||||
learning_rate: 3.125e-5
|
||||
num_warmup_steps: 100
|
||||
num_train_steps: 1000
|
||||
optimizer_type: adam
|
||||
manual_fp16: True
|
||||
use_fp16_cls: True
|
||||
input_files_dir: /home/data/bert_nv/dataset/cn-wiki-128/
|
||||
eval_files_dir: /home/data/bert_nv/dataset/cn-wiki-128/
|
||||
do_train: True
|
||||
do_eval: False
|
||||
num_accumulation_steps: 1
|
||||
iterations_per_loop: 100
|
||||
npu_bert_loss_scale: 0
|
||||
distributed: True
|
||||
graph_memory_max_size: 27917287424
|
||||
variable_memory_max_size: 5368709120
|
||||
npu_bert_tail_optimize: True
|
||||
save_checkpoints_steps: 1000
|
||||
npu_bert_clip_by_global_norm: False
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c75:b031
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.140.199:8,90.90.140.229:8
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 1 4
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
#profiling 配置
|
||||
PROFILING_MODE: false
|
||||
AICPU_PROFILING_MODE: false
|
||||
PROFILING_OPTIONS: training_trace
|
||||
FP_POINT: bert/embeddings/GatherV2
|
||||
BP_POINT: gradients/bert/embeddings/IdentityN_1_grad/UnsortedSegmentSum
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
pytorch_config:
|
||||
# 基本参数
|
||||
data_url: /home/zhusiyi/dataset/peta/
|
||||
|
||||
epoches: 150
|
||||
# 1p 参数为256 2p 512 4p 1024 8p为2048
|
||||
batch_size: 256
|
||||
|
||||
seed: 49
|
||||
|
||||
# 默认参数1p 0.01 2p 0.016 4p 0.016 8p 0.016
|
||||
lr: 0.016
|
||||
|
||||
#单P指定 卡 默认7卡
|
||||
device: 7
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
@@ -0,0 +1,52 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet_TF/
|
||||
epoches: 1
|
||||
epochs_between_evals: 1
|
||||
batch_size: 32
|
||||
log_dir: ./ckpt
|
||||
|
||||
# 1p参数
|
||||
mode_1p: train # train、evaluate、train_and_evaluate三种模式
|
||||
max_train_steps_1p: 100
|
||||
iterations_per_loop_1p: 10
|
||||
display_every: 10
|
||||
log_name_1p: densenet121_1p.log
|
||||
|
||||
# 8p参数
|
||||
mode_8p: train_and_evaluate # train、evaluate、train_and_evaluate三种模式
|
||||
iterations_per_loop_8p: 1000
|
||||
lr: 0.1
|
||||
log_name_8p: densenet121_8p.log
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.154:8,90.90.176.54:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b02
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
pytorch_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet/
|
||||
|
||||
epoches: 90
|
||||
# 1p 参数为256 2p 512 4p 1024 8p为2048
|
||||
batch_size: 256
|
||||
|
||||
# 默认参数1p 0.1 2p 0.2 4p 0.4 8p 0.8
|
||||
lr: 0.1
|
||||
|
||||
seed: 49
|
||||
|
||||
# 单P指定卡 默认0卡
|
||||
device_single: 0
|
||||
|
||||
# 2p 4p 8p 修改 2p可选 1,2或2,3 两两相连 4p 可选'0,1,2,3'或'1,2,3,4'规则同2p 8p默认为'0,1,2,3,4,5,6,7'
|
||||
device_group_multi: '0,1,2,3,4,5,6,7'
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
@@ -0,0 +1,20 @@
|
||||
pytorch_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet/
|
||||
|
||||
epoches: 90
|
||||
# 1p 参数为512 2p 1024 4p 2048 8p为4096
|
||||
batch_size: 512
|
||||
|
||||
seed: 49
|
||||
|
||||
# 默认参数1p 0.4 2p 0.8 4p 1.6 8p 3.2
|
||||
lr: 0.4
|
||||
|
||||
#单P指定 卡 默认0卡
|
||||
device: 0
|
||||
|
||||
# 2p 4p 8p 修改 2p可选 1,2或2,3 两两相连 4p 可选'0,1,2,3'或'1,2,3,4'规则同2p 8p默认为'0,1,2,3,4,5,6,7'
|
||||
device_group: '0,1,2,3,4,5,6,7'
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
@@ -0,0 +1,49 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
max_steps: 1000
|
||||
data_url: /home/imagenet_TF/
|
||||
epoches: 1
|
||||
|
||||
# 训练(train) 或 评测(evaluate)
|
||||
mode: train
|
||||
batch_size: 256
|
||||
#仅在 mode 为 evaluate 时用到
|
||||
ckpt_path: /opt/0908/benchmark-benchmark_Alpha/train/result/tf_mobilenet/trainingJob_20200905171017/0/results/model.ckpt-123125
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
profiling_mode: false
|
||||
profiling_options: training_trace
|
||||
fp_point: L2Loss
|
||||
bp_point: gradients/AddN_30
|
||||
aicpu_profiling_mode: false
|
||||
|
||||
|
||||
pytorch_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet/
|
||||
|
||||
epoches: 600
|
||||
|
||||
# 单p默认512 2p 1024 4p 2048 8p默认4096
|
||||
batch_size: 512
|
||||
|
||||
# 默认参数1p 0.05 2p 0.1 4p 0.2 8p 0.4
|
||||
lr: 0.05
|
||||
|
||||
seed: 123456
|
||||
device_single: 'npu:0'
|
||||
# 指定2p 4p 8p卡 2p可选'0,1' 4p可选'0,1,2,3' 8p为'0,1,2,3,4,5,6,7'
|
||||
# NOTE(review): key renamed from the misspelled 'device_group_mutli';
# confirm the launcher script reads 'device_group_multi' before merging.
device_group_multi: '0,1,2,3'
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b021
|
||||
@@ -0,0 +1,19 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet_TF/
|
||||
# 1p/8p,epoches设为150
|
||||
epoches: 1
|
||||
epochs_between_evals: 1
|
||||
max_train_steps: 1000
|
||||
batch_size: 128
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b02
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
@@ -0,0 +1,66 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet_TF/
|
||||
batch_size: 32
|
||||
# 1p/8p, epoches设为90
|
||||
epoches: 1
|
||||
# 跑精度时max_train_steps设为None
|
||||
max_train_steps: 1000
|
||||
epochs_between_evals: 1
|
||||
iterations_per_loop: 100
|
||||
save_checkpoints_steps: 115200
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b02
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
pytorch_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet/
|
||||
#跑1p时batch_size为512;2p时为1024;4p时为2048;跑8p时batch_size为4096
|
||||
batch_size: 512
|
||||
epoches: 90
|
||||
# train_and_evaluate、evaluate两种模式
|
||||
mode: train_and_evaluate
|
||||
ckpt_path: /home/train/result/pt_resnet50/training_job_20200916042624/7/checkpoint_npu7model_best.pth.tar
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b02
|
||||
# 默认参数1p时为0.2,2p/4p/8p时为2.048
|
||||
lr: 0.2
|
||||
# 指定 device id, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
mindspore_config:
|
||||
# 基本参数
|
||||
# Dataset path: /home/data/imagenet/train for training, /home/data/imagenet/val for evaluation
|
||||
data_url: /home/data/imagenet/train
|
||||
#跑1p/2p/4p/8p时batch_size均为256
|
||||
batch_size: 256
|
||||
epoches: 5
|
||||
pre_trained: None
|
||||
save_checkpoint_epochs: 5
|
||||
loss_scale: 1024
|
||||
# mode:train or evaluate
|
||||
mode: train
|
||||
# 将训练后生成的ckpt的路径配置在此处
|
||||
checkpoint_path: /home/wx933135/benchmark_20200924-benchmark_Alpha/benchmark_20200924-benchmark_Alpha/train/result/ms_resnet50/training_job_20200928154504/2/ckpt_2/resnet-5_625.ckpt
|
||||
|
||||
# eval_device_id: the device id to use when running evaluation
|
||||
eval_device_id: 4
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b02
|
||||
|
||||
# Device ids to use for each run size; the number of ids must equal rank_size.
# Each run size needs its own key: repeating device_group_1p would leave only
# the last value in effect under most YAML parsers.
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
max_steps: 1000
|
||||
data_url: /home/imagenet_TF
|
||||
epoches: 1
|
||||
epochs_between_evals: 1
|
||||
mode: train
|
||||
batch_size: 32
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.154:8,90.90.176.54:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: mpirun3:latest
|
||||
|
||||
|
||||
# 1. 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
# 2. 仅在小于 8p 时生效
|
||||
# 3. 若不使用该配置, 请使用在行首添加'#'注释的方法将其关闭
|
||||
# device_group: 0 1 2 3
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
@@ -0,0 +1,28 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
max_steps: 1000
|
||||
data_url: /home/imagenet_TF/
|
||||
epoches: 1
|
||||
epochs_between_evals: 1
|
||||
batch_size: 32
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: mpirun3:latest
|
||||
|
||||
|
||||
# 1. 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
# 2. 仅在小于 8p 时生效
|
||||
# 3. 若不使用该配置, 请使用在行首添加'#'注释的方法将其关闭
|
||||
# device_group: 0 1 2 3
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
profiling_mode: false
|
||||
profiling_options: training_trace
|
||||
fp_point: fp32_vars/conv2d/Conv2Dfp32_vars/BatchNorm/FusedBatchNormV3_Reduce
|
||||
bp_point: loss_scale/gradients/AddN_70
|
||||
aicpu_profiling_mode: false
|
||||
@@ -0,0 +1,25 @@
|
||||
tensorflow_config:
|
||||
|
||||
train_batch_size: 32
|
||||
training_file_pattern: /home/data/raw_data/tfrecord/train2017*
|
||||
resnet_checkpoint: /home/data/raw_data/resnet34_pretrain_model/model.ckpt-28152
|
||||
validation_file_pattern: /home/data/raw_data/tfrecord/val2017*
|
||||
val_json_file: /home/data/raw_data/annotations/instances_val2017.json
|
||||
eval_batch_size: 32
|
||||
num_epochs: 1
|
||||
model_dir: result_npu
|
||||
max_steps: 432000
|
||||
|
||||
# train_and_eval, train, eval
|
||||
runmode: train_and_eval
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: mpirun3:latest
|
||||
@@ -0,0 +1,18 @@
|
||||
pytorch_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet/
|
||||
# batch_size per run size: 1536 for 1p, 2048 for 2p, 4096 for 4p, 8192 for 8p
|
||||
batch_size: 1536
|
||||
epoches: 10
|
||||
warm_up_epochs: 5
|
||||
epochs_between_evals: 5
|
||||
# 默认参数1p 0.75 2p 1 4p 2 8p 4
|
||||
lr: 0.75
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: c73:b02
|
||||
|
||||
# 指定 device id, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_multi: 0,1,2,3,4,5,6,7
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
tensorflow_config:
|
||||
|
||||
#mode支持single或者 multi
|
||||
mode: single
|
||||
|
||||
data_url: /opt/npu/dataset
|
||||
|
||||
# train 或者 evaluate
|
||||
runmode: train
|
||||
|
||||
#仅在evaluate时用到
|
||||
ckpt_path: /train/benchmark-master720/train/atlas_benchmark-master/object_detection/yolov3/tensorflow/result/TrainingJob-20200724115042
|
||||
|
||||
total_epoches: 1
|
||||
|
||||
save_epoch: 3
|
||||
|
||||
#single 默认32 multi默认16
|
||||
batch_size: 32
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.152:8,90.90.176.154:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: mpirun3:latest
|
||||
|
||||
profiling_mode: false
|
||||
profiling_options: training_trace:task_trace
|
||||
fp_point: yolov3/darknet53_body/Conv/Conv2D
|
||||
bp_point: cond_1/Momentum/update_yolov3/yolov3_head/Conv_9/weights/ApplyMomentum
|
||||
aicpu_profiling_mode: false
|
||||
@@ -0,0 +1,17 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
imagenet_data: /home/imagenet_TF/
|
||||
# yolo数据集
|
||||
yolo_data: /opt/npu/dataset/
|
||||
# bert数据集
|
||||
bert_data: /home/BertData/
|
||||
# ssd数据集
|
||||
ssd_data: /home/data/raw_data
|
||||
# 镜像名称
|
||||
docker_images: ubuntu:b020
|
||||
|
||||
# 指定容器ip
|
||||
ip: 90.90.176.110
|
||||
|
||||
# Netmask prefix length: use 24 for 255.255.255.0, or 23 for 255.255.254.0
|
||||
epcount: 24
|
||||
@@ -0,0 +1,34 @@
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
data_url: /home/imagenet_TF/
|
||||
max_epoches: 100
|
||||
epochs_between_evals: 1
|
||||
batch_size: 32
|
||||
log_dir: ckpt
|
||||
max_train_steps: 1000
|
||||
|
||||
# 1p参数
|
||||
mode_1p: train # train、evaluate、train_and_evaluate三种模式,训练(train) 或训练评测(train_and_evaluate)
|
||||
iterations_per_loop_1p: 10
|
||||
display_every: 10
|
||||
log_name_1p: vgg16_1p.log
|
||||
|
||||
# 8p参数
|
||||
mode_8p: train_and_evaluate # train、evaluate、train_and_evaluate三种模式,训练(train) 或训练评测(train_and_evaluate)
|
||||
iterations_per_loop_8p: 5004
|
||||
lr: 0.01
|
||||
log_name_8p: vgg16_8p.log
|
||||
|
||||
# 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
mpirun_ip: 90.90.176.154:8,90.90.176.54:8
|
||||
|
||||
# docker 镜像名称:版本号
|
||||
docker_image: mpirun3:latest
|
||||
|
||||
# 1. 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
# 2. 仅在小于 8p 时生效
|
||||
# 3. 若不使用该配置, 请使用在行首添加'#'注释的方法将其关闭
|
||||
# device_group: 0 1 2 3
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
Reference in New Issue
Block a user