Files
ascend-tools/train/yaml/ResNet50.yaml
T
2020-10-19 20:22:23 +08:00

67 lines
2.1 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
tensorflow_config:
  # Basic parameters
  data_url: /home/imagenet_TF/
  batch_size: 32
  # For 1p/8p runs, set epoches to 90
  epoches: 1
  # For accuracy runs, set max_train_steps to None
  # NOTE(review): a bare None in YAML is the string "None", not null --
  # confirm how the consumer interprets it.
  max_train_steps: 1000
  epochs_between_evals: 1
  iterations_per_loop: 100
  save_checkpoints_steps: 115200
  # Only needed for multi-node runs; format: ip1:card_count1,ip2:card_count2
  mpirun_ip: '90.90.176.152:8,90.90.176.154:8'
  # Docker image, written as name:version
  docker_image: 'c73:b02'
  # Device ids to use, separated by spaces; the count must equal rank_size.
  # A single id parses as an integer, a space-separated list as a string.
  device_group_1p: 0
  device_group_2p: 0 1
  device_group_4p: 0 1 2 3
pytorch_config:
  # Basic parameters
  data_url: /home/imagenet/
  # batch_size per run size: 512 for 1p, 1024 for 2p, 2048 for 4p, 4096 for 8p
  batch_size: 512
  epoches: 90
  # Two modes are available: train_and_evaluate, evaluate
  mode: train_and_evaluate
  ckpt_path: /home/train/result/pt_resnet50/training_job_20200916042624/7/checkpoint_npu7model_best.pth.tar
  # Docker image, written as name:version
  docker_image: 'c73:b02'
  # Default learning rate: 0.2 for 1p; 2.048 for 2p/4p/8p
  lr: 0.2
  # Device ids to use; the count must equal rank_size.
  # A single id parses as an integer, a space-separated list as a string.
  device_group_1p: 0
  device_group_2p: 0 1
  device_group_4p: 0 1 2 3
mindspore_config:
  # Basic parameters
  # Training dataset: /home/data/imagenet/train
  # Evaluation dataset: /home/data/imagenet/val
  data_url: /home/data/imagenet/train
  # batch_size is 256 for 1p/2p/4p/8p runs alike
  batch_size: 256
  epoches: 5
  # NOTE(review): YAML reads a bare None as the string "None", not null --
  # value kept as-is; confirm how the consumer interprets it.
  pre_trained: None
  save_checkpoint_epochs: 5
  loss_scale: 1024
  # mode: train or evaluate
  mode: train
  # Set this to the path of the ckpt file produced by training
  checkpoint_path: /home/wx933135/benchmark_20200924-benchmark_Alpha/benchmark_20200924-benchmark_Alpha/train/result/ms_resnet50/training_job_20200928154504/2/ckpt_2/resnet-5_625.ckpt
  # eval_device_id: the device id used during evaluation
  eval_device_id: 4
  # Docker image, written as name:version
  docker_image: 'c73:b02'
  # Device ids to use; the count must equal rank_size.
  # One key per run size (1p/2p/4p), matching the other framework sections;
  # repeating device_group_1p would leave only the last value in effect.
  device_group_1p: 0
  device_group_2p: 0 1
  device_group_4p: 0 1 2 3