# Training / evaluation configuration.
#
# NOTE(review): this file was restored from a version in which every key had
# been collapsed onto one line. The nesting below (training keys under
# `tensorflow_config`, device / mpirun / docker keys at top level) is a
# reconstruction -- confirm it against the code that reads this file.
# Scalar values are intentionally left unquoted and unchanged.

tensorflow_config:
  # Batch sizes for the training and evaluation passes.
  train_batch_size: 32
  eval_batch_size: 32

  # Input locations. The *_file_pattern values end in `*`, i.e. they are
  # patterns matching multiple TFRecord files rather than single paths.
  training_file_pattern: /home/data/raw_data/tfrecord/train2017*
  validation_file_pattern: /home/data/raw_data/tfrecord/val2017*
  val_json_file: /home/data/raw_data/annotations/instances_val2017.json

  # Pretrained ResNet checkpoint prefix.
  resnet_checkpoint: /home/data/raw_data/resnet34_pretrain_model/model.ckpt-28152

  # Training length and output directory.
  num_epochs: 1
  max_steps: 432000
  model_dir: result_npu

  # Run mode; listed options: train_and_eval, train, eval
  runmode: train_and_eval

# Device ids to use. Separate multiple ids with spaces; the number of ids
# must equal rank_size.
device_group_1p: 0
device_group_2p: 0 1
device_group_4p: 0 1 2 3

# Only required for multi-machine runs.
# Format: ip1:<number of cards 1>,ip2:<number of cards 2>
mpirun_ip: 90.90.176.152:8,90.90.176.154:8

# Docker image, as <image name>:<version tag>
docker_image: mpirun3:latest