[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
# DenseNet121_pytorch训练说明
|
||||
|
||||
### 1. 模型训练参数配置
|
||||
|
||||
在train/yaml/DenseNet121.yaml中修改相应配置, 配置项含义:
|
||||
|
||||
```
|
||||
pytorch_config:
|
||||
data_url: 数据集路径
|
||||
epoches: 跑多少个epoch
|
||||
batch_size: 1p 为 256,2p 为 512,4p 为 1024,8p 为 2048
|
||||
lr: 默认值,1p 为 0.1,2p 为 0.2,4p 为 0.4,8p 为 0.8
|
||||
seed: 49
|
||||
docker_image: docker 镜像名称:版本号
|
||||
```
|
||||
|
||||
------
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
+515
@@ -0,0 +1,515 @@
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import time
|
||||
import warnings
|
||||
import sys
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.multiprocessing as mp
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.models as models
|
||||
import torch.npu
|
||||
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
from densenet_0_2_2 import densenet121
|
||||
|
||||
import numpy as np
|
||||
from apex import amp
|
||||
|
||||
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
|
||||
|
||||
# Silence every warning category for the remainder of the process.
warnings.filterwarnings('ignore')

# Alphabetically sorted names of the lowercase, non-dunder callables found in
# torchvision.models; used as the accepted values of the --arch option.
model_names = sorted(
    name
    for name, member in vars(models).items()
    if name.islower() and not name.startswith("__") and callable(member)
)
|
||||
|
||||
# Command-line interface of the training script.
# Help texts use argparse's %(default)s placeholder so the documented default
# can never drift away from the value that is actually configured.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='densenet121',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: %(default)s)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: %(default)s)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=1, type=int,
                    metavar='N', help='print frequency (default: %(default)s)')
parser.add_argument('-ef', '--eval-freq', default=5, type=int,
                    metavar='N', help='evaluate frequency (default: %(default)s)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--npu', default=None, type=int,
                    help='NPU id to use.')

# apex mixed-precision options; both values are forwarded to amp.initialize.
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--loss-scale', default=1024., type=float,
                    help='loss scale passed to amp.initialize (default: %(default)s)')
parser.add_argument('--opt-level', default='O2', type=str,
                    help='apex amp optimization level passed to amp.initialize '
                         '(default: %(default)s)')
|
||||
|
||||
|
||||
|
||||
def main():
    """Parse the command line, bind the NPU, seed the RNGs and launch training.

    Sets the module globals ``CALCULATE_DEVICE`` (``"npu:<id>"``, id defaults
    to 0 when ``--npu`` is not given) and ``best_acc1`` (reset to 0), then
    either spawns one ``main_worker`` process per visible NPU
    (``--multiprocessing-distributed``) or calls ``main_worker`` directly.
    """
    global CALCULATE_DEVICE
    global best_acc1

    args = parser.parse_args()
    print(args)

    if args.npu is None:
        args.npu = 0

    best_acc1 = 0
    CALCULATE_DEVICE = "npu:{}".format(args.npu)
    torch.npu.set_device(CALCULATE_DEVICE)

    if args.seed is not None:
        # The seed lives on the parsed namespace; the bare name ``seed`` is not
        # defined anywhere and used to raise NameError here.
        random.seed(args.seed)
        os.environ['PYTHONHASHSEED'] = str(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.npu.device_count()
    print('{} node found.'.format(ngpus_per_node))
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
|
||||
|
||||
|
||||
def main_worker(gpu, ngpus_per_node, args):
    """Build the model, data loaders and optimizer, then train and evaluate.

    Args:
        gpu: device index handed over by ``main`` / ``mp.spawn``; stored in
            ``args.gpu`` and used for the global-rank computation.
        ngpus_per_node: number of devices per node, used to derive the global
            rank and to split batch size / workers in the distributed branch.
        args: parsed command-line namespace; mutated in place (``gpu``,
            ``rank``, ``batch_size``, ``workers``, ``start_epoch``).

    Updates the module global ``best_acc1`` and writes checkpoints through
    ``save_checkpoint`` after each evaluation.
    """
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        # NOTE: without --pretrained the local densenet121 is always built,
        # whatever --arch says.
        model = densenet121()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith(('alexnet', 'vgg')):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = model.to(CALCULATE_DEVICE)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
                                          loss_scale=args.loss_scale)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=CALCULATE_DEVICE)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    writer = SummaryWriter('runs/densenet121')
    # try/finally so the event file is flushed and closed even when an epoch
    # raises; previously the writer leaked on any training error.
    try:
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, args, writer)

            if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
                # evaluate on validation set
                acc1 = validate(val_loader, model, criterion, args, epoch, writer)

                # remember best acc@1 and save checkpoint
                is_best = acc1 > best_acc1
                best_acc1 = max(acc1, best_acc1)

                # Saving stays inside the evaluation branch: ``is_best`` only
                # exists for epochs that were actually evaluated.
                if not args.multiprocessing_distributed or (
                        args.multiprocessing_distributed
                        and args.rank % ngpus_per_node == 0
                        and epoch == args.epochs - 1):
                    checkpoint_state = {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_acc1': best_acc1,
                        'optimizer': optimizer.state_dict(),
                    }
                    if args.amp:
                        # amp state is only stored when mixed precision is on,
                        # matching what the resume path expects.
                        checkpoint_state['amp'] = amp.state_dict()
                    save_checkpoint(checkpoint_state, is_best)
    finally:
        writer.close()
|
||||
|
||||
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    """Run one training epoch and log per-step metrics.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to optimize; switched to train mode here.
        criterion: loss callable applied to ``(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index, used for the progress prefix and for
            the TensorBoard global step.
        args: parsed options; reads ``amp``, ``print_freq`` and ``batch_size``.
        writer: object with ``add_scalar(tag, value, step)``.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    steps_per_epoch = len(train_loader)
    progress = ProgressMeter(
        steps_per_epoch,
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # Step index shared by every scalar logged for this batch.
        global_step = epoch * steps_per_epoch + i

        target = target.to(torch.int32)
        images = images.to(CALCULATE_DEVICE, non_blocking=False)
        target = target.to(CALCULATE_DEVICE, non_blocking=False)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # add tensorboard
        writer.add_scalar('Train/Loss', losses.val, global_step)
        writer.add_scalar('Train/Acc@1', top1.val, global_step)
        writer.add_scalar('Train/Acc@5', top5.val, global_step)
        writer.add_scalar('Train/LR', optimizer.param_groups[0]['lr'], global_step)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        writer.add_scalar('Train/Time', batch_time.val, global_step)
        writer.add_scalar('Train/Time_Data', data_time.val, global_step)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

    # AverageMeter leaves ``avg`` at 0 until its warm-up updates have passed,
    # so a short epoch would otherwise divide by zero here.
    if batch_time.avg:
        fps_text = ' * FPS@all {:.3f}'.format(args.batch_size / batch_time.avg)
        print(fps_text)
        hwlog.remark_print(key=hwlog.FPS, value=fps_text)
|
||||
|
||||
def validate(val_loader, model, criterion, args, epoch=0, writer=None):
    """Evaluate ``model`` on ``val_loader`` and return the averaged top-1 accuracy.

    Runs under ``torch.no_grad()`` with the model in eval mode, prints and
    logs the averaged top-1/top-5 accuracy, and, when ``writer`` is truthy,
    records time, loss and accuracies against ``epoch``.
    """
    timer = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    acc1_meter = AverageMeter('Acc@1', ':6.2f')
    acc5_meter = AverageMeter('Acc@5', ':6.2f')
    reporter = ProgressMeter(
        len(val_loader),
        [timer, loss_meter, acc1_meter, acc5_meter],
        prefix='Test: ')

    # Evaluation mode for the whole pass.
    model.eval()

    with torch.no_grad():
        end = time.time()
        for step, (images, target) in enumerate(val_loader):
            # Labels are cast to int32 before being moved to the device.
            labels = target.to(torch.int32)
            images = images.to(CALCULATE_DEVICE, non_blocking=False)
            labels = labels.to(CALCULATE_DEVICE, non_blocking=False)

            # Forward pass and loss.
            output = model(images)
            loss = criterion(output, labels)

            # Accumulate loss and accuracies weighted by the batch size.
            acc1, acc5 = accuracy(output, labels, topk=(1, 5))
            n_samples = images.size(0)
            loss_meter.update(loss.item(), n_samples)
            acc1_meter.update(acc1[0], n_samples)
            acc5_meter.update(acc5[0], n_samples)

            # Per-batch wall-clock time.
            timer.update(time.time() - end)
            end = time.time()

            if step % args.print_freq == 0:
                reporter.display(step)

        # TODO: this should also be done with the ProgressMeter
        summary = ' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
            top1=acc1_meter, top5=acc5_meter)
        print(summary)
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1,
                           value="{top1.avg:.3f}".format(top1=acc1_meter))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5,
                           value="{top5.avg:.3f}".format(top5=acc5_meter))

    if not writer:
        return acc1_meter.avg

    for tag, meter in (('Val/Time', timer),
                       ('Val/Loss', loss_meter),
                       ('Val/Acc@1', acc1_meter),
                       ('Val/Acc@5', acc5_meter)):
        writer.add_scalar(tag, meter.avg, epoch)

    return acc1_meter.avg
|
||||
|
||||
|
||||
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename``; duplicate it when it is the best so far.

    The duplicate is named after ``state['best_acc1']`` and ``state['epoch']``
    and is written relative to the current working directory.
    """
    torch.save(state, filename)
    if not is_best:
        return
    best_name = 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])
    shutil.copyfile(filename, best_name)
|
||||
|
||||
|
||||
class AverageMeter(object):
    """Keeps the latest value plus a running average that skips warm-up updates."""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()
        # Number of leading updates excluded from the average.
        self.start_count_index = 10

    def reset(self):
        """Zero the latest value, average, sum and sample count."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples.

        The value only contributes to ``sum``/``avg`` once ``count`` exceeds
        ``start_count_index * n``.
        """
        self.val = val
        self.count += n
        warmup = self.start_count_index * n
        if self.count <= warmup:
            return
        self.sum += val * n
        self.avg = self.sum / (self.count - warmup)

    def __str__(self):
        """Render as ``"<name> <val> (<avg>)"`` using the configured format spec."""
        template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**vars(self))
|
||||
|
||||
|
||||
class ProgressMeter(object):
    """Prints one progress line per call and forwards accuracies to hwlog."""

    def __init__(self, num_batches, meters, prefix=""):
        """Store the meters to render and precompute the ``[cur/total]`` format.

        Args:
            num_batches: total number of batches shown in the counter.
            meters: objects whose ``str()`` is appended to each progress line.
            prefix: text placed in front of the batch counter.
        """
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the line for ``batch`` and log the Acc@1 / Acc@5 values found in it.

        A meter set without ``Acc@1`` or ``Acc@5`` is simply not logged;
        previously the unguarded split raised IndexError in that case.
        """
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

        # Benchmark log points: the first token after each tag in the rendered
        # entries, i.e. the meter's current formatted value.
        rendered = str(entries)
        train_acc1 = self._extract_metric(rendered, "Acc@1")
        train_acc5 = self._extract_metric(rendered, "Acc@5")
        if train_acc1 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        if train_acc5 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    @staticmethod
    def _extract_metric(rendered, tag):
        """Return the first space-delimited token after ``tag``, or None if absent."""
        parts = rendered.split(tag)
        if len(parts) < 2:
            return None
        return parts[1].strip().split(" ")[0]

    def _get_batch_fmtstr(self, num_batches):
        """Build ``'[{:Nd}/<num_batches>]'`` with N = digit count of ``num_batches``."""
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
|
||||
|
||||
|
||||
def adjust_learning_rate(optimizer, epoch, args):
    """Step schedule: scale ``args.lr`` by 0.1 for every 30 completed epochs.

    The resulting rate is written into every entry of
    ``optimizer.param_groups``.
    """
    decay_steps = epoch // 30
    new_lr = args.lr * (0.1 ** decay_steps)
    for group in optimizer.param_groups:
        group['lr'] = new_lr
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Args:
        output: score tensor; the top-k is taken along dimension 1.
        target: label tensor whose first dimension is the batch size.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per k, holding the percentage
        of samples whose label is among the k highest-scoring predictions.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # ``correct`` derives from a transposed tensor, so its slice may be
            # non-contiguous; reshape handles that where view(-1) raises.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Benchmark logging is rooted at the directory that contains this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Emit the environment and hyper-parameter records, in this fixed order,
    # before training starts.
    for log_key, log_value in (
            (hwlog.CPU_INFO, cpu_info),
            (hwlog.NPU_INFO, npu_info),
            (hwlog.OS_INFO, os_info),
            (hwlog.FRAMEWORK_INFO, framework_info),
            (hwlog.BENCHMARK_VERSION, benchmark_version),
            (hwlog.CONFIG_INFO, config_info),
            (hwlog.BASE_LR, initinal_data.get("base_lr")),
            (hwlog.DATASET, initinal_data.get("dataset")),
            (hwlog.OPT_NAME, initinal_data.get("optimizer")),
            (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
    ):
        hwlog.remark_print(key=log_key, value=log_value)

    main()
|
||||
+538
@@ -0,0 +1,538 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import time
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.multiprocessing as mp
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.models as models
|
||||
from densenet_0_2_2 import densenet121
|
||||
|
||||
from apex import amp
|
||||
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
|
||||
# Default value of --batch-size.
BATCH_SIZE = 512
OPTIMIZER_BATCH_SIZE = 2048

# Alphabetically sorted names of the lowercase, non-dunder callables found in
# torchvision.models; used as the accepted values of the --arch option.
model_names = sorted(
    name
    for name, member in vars(models).items()
    if name.islower() and not name.startswith("__") and callable(member)
)
|
||||
|
||||
# Command-line interface of the multi-device training script.
# Help texts use argparse's %(default)s placeholder so the documented default
# can never drift away from the value that is actually configured.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: %(default)s)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: %(default)s)',
                    dest='weight_decay')
parser.add_argument('--workspace', type=str, default='./', metavar='DIR',
                    help='path to directory where checkpoints will be stored')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: %(default)s)')
parser.add_argument('-ef', '--eval-freq', default=5, type=int,
                    metavar='N', help='evaluate frequency (default: %(default)s)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('-bm', '--benchmark', default=0, type=int,
                    metavar='N',
                    help='set benchmark status, 1 runs the benchmark '
                         '(default: %(default)s)')
parser.add_argument('--device', default='npu', type=str,
                    help='npu or gpu')
parser.add_argument('--addr', default='10.136.181.115', type=str,
                    help='master addr')
parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str,
                    help='checkpoint-nameprefix')
# The help fragments below are joined by implicit string concatenation, so
# each one carries its own separator.
parser.add_argument('--checkpoint-freq', default=0, type=int,
                    metavar='N', help='checkpoint frequency (default: %(default)s); '
                                      '0: save only one file which per epoch; '
                                      'n: save diff file per n epoch; '
                                      '-1: no checkpoint, not support')
parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
# apex mixed-precision options
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--loss-scale', default=1024., type=float,
                    help='loss scale used by amp (default: %(default)s)')
parser.add_argument('--opt-level', default='O2', type=str,
                    help='apex amp optimization level (default: %(default)s)')

# Silence every warning category for the remainder of the process.
warnings.filterwarnings('ignore')
# Best top-1 accuracy seen so far; declared global by main_worker.
best_acc1 = 0
|
||||
def device_id_to_process_device_map(device_list):
    """Map process indices 0..N-1 to the device ids in ``device_list``.

    ``device_list`` is a comma-separated string of integer device ids; the
    ids are assigned to process indices in ascending numeric order.
    """
    ordered_ids = sorted(int(token) for token in device_list.split(","))
    return dict(enumerate(ordered_ids))
|
||||
|
||||
def main():
    """Parse the command line, prepare the process environment and start the workers.

    Exports ``KERNEL_NAME_ID``, ``MASTER_ADDR`` and ``MASTER_PORT``, seeds the
    RNGs when ``--seed`` is given, derives the process-to-device map from
    ``--device-list`` and then either spawns one ``main_worker`` per device
    or calls ``main_worker`` once in this process.
    """
    args = parser.parse_args()
    banner = "===============main()================="
    print(banner)
    print(args)
    print(banner)

    os.environ['KERNEL_NAME_ID'] = '0'
    print("+++++++++++++++++++++++++++KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])

    seed = args.seed
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Rendezvous address comes from --addr; the port is fixed.
    os.environ['MASTER_ADDR'] = args.addr
    os.environ['MASTER_PORT'] = '29688'

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    args.process_device_map = device_id_to_process_device_map(args.device_list)

    # On NPU the device count is taken from --device-list; otherwise CUDA is queried.
    ngpus_per_node = (len(args.process_device_map) if args.device == 'npu'
                      else torch.cuda.device_count())

    if not args.multiprocessing_distributed:
        # Single-process path: run the worker directly.
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # One process per device on every node, so the world size is scaled up.
    args.world_size = ngpus_per_node * args.world_size
    # Children inherit this process's environment; each worker sets its own
    # KERNEL_NAME_ID.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
|
||||
|
||||
|
||||
def main_worker(gpu, ngpus_per_node, args):
    """Train (or only evaluate) DenseNet-121 on one NPU.

    Args:
        gpu: Worker index; used as the key into ``args.process_device_map``
            and, for multiprocessing runs, as the local rank.
        ngpus_per_node: Number of devices used on this node.
        args: Parsed command-line namespace. It is mutated in place
            (``gpu``, ``rank``, ``batch_size``, ``workers``, ``start_epoch``).
    """
    global best_acc1
    args.gpu = args.process_device_map[gpu]
    print("[npu id:",args.gpu,"]","+++++++++++++++++++++++++++ before set KERNEL_NAME_ID:",os.environ['KERNEL_NAME_ID'])
    # Every spawned process inherits the parent's environment, so the id has
    # to be overwritten per worker.
    os.environ['KERNEL_NAME_ID'] = str(gpu)
    print("[npu id:",args.gpu,"]","+++++++++++++++++++++++++++KERNEL_NAME_ID:",os.environ['KERNEL_NAME_ID'])

    if args.gpu is not None:
        print("[npu id:",args.gpu,"]","Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu

        if args.device == 'npu':
            # No init_method is passed on NPU, so the backend default is used.
            dist.init_process_group(backend=args.dist_backend,
                                    world_size=args.world_size, rank=args.rank)
        else:
            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                    world_size=args.world_size, rank=args.rank)

    loc = 'npu:{}'.format(args.gpu)
    torch.npu.set_device(loc)

    # Split the global batch size and the loader workers across the devices.
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    print("[npu id:",args.gpu,"]","===============main_worker()=================")
    print("[npu id:",args.gpu,"]",args)
    print("[npu id:",args.gpu,"]","===============main_worker()=================")

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    # create model
    print("[npu id:",args.gpu,"]","=> creating model '{}'".format(args.arch))
    model = densenet121()
    model = model.to(loc)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(loc)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)
    if args.distributed:
        # DistributedDataParallel needs an initialised process group, which
        # only exists on the distributed path above.
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        # validate() requires ngpus_per_node; omitting it raised a TypeError.
        validate(val_loader, model, criterion, args, ngpus_per_node)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node)

        is_last_epoch = epoch == args.epochs - 1
        if (epoch + 1) % args.eval_freq == 0 or is_last_epoch:
            # evaluate on validation set
            acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)

            # remember best acc@1 and save checkpoint
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            # Multiprocessing runs save only once: on the first rank of each
            # node, after the final epoch.
            if not args.multiprocessing_distributed or (
                    args.rank % ngpus_per_node == 0 and is_last_epoch):
                state = {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                if args.amp:
                    state['amp'] = amp.state_dict()
                save_checkpoint(state, is_best)
|
||||
|
||||
def train(train_loader, model, criterion, optimizer, epoch, args,ngpus_per_node):
    """Run one training epoch and report throughput.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer stepping the model parameters.
        epoch: Current epoch index (used only for the progress prefix).
        args: Parsed command-line namespace (``gpu``, ``benchmark``, ``amp``,
            ``batch_size``, ``print_freq``, ``rank``, ...).
        ngpus_per_node: Number of devices per node.

    ``args.benchmark == 0`` steps the optimizer every batch.
    ``args.benchmark == 1`` accumulates gradients over
    ``OPTIMIZER_BATCH_SIZE / args.batch_size`` batches before stepping
    (``OPTIMIZER_BATCH_SIZE`` is a module-level name defined outside this
    function).
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # Loop invariants.
    loc = 'npu:{}'.format(args.gpu)
    is_log_rank = (not args.multiprocessing_distributed
                   or args.rank % ngpus_per_node == 0)
    accum_steps = 1
    if args.benchmark == 1:
        # Clamp to 1: a per-device batch larger than OPTIMIZER_BATCH_SIZE
        # would otherwise give 0 and a modulo-by-zero below.
        accum_steps = max(1, int(OPTIMIZER_BATCH_SIZE / args.batch_size))

    # switch to train mode
    model.train()
    end = time.time()
    if args.benchmark == 1:
        optimizer.zero_grad()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.to(torch.int32)
        images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        if args.benchmark == 0:
            optimizer.zero_grad()

        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.benchmark == 0:
            optimizer.step()
        elif args.benchmark == 1:
            if (i + 1) % accum_steps == 0:
                # Average the accumulated gradients before the update.
                for param_group in optimizer.param_groups:
                    for param in param_group['params']:
                        # Parameters that received no gradient have grad None.
                        if param.grad is not None:
                            param.grad /= accum_steps
                optimizer.step()
                optimizer.zero_grad()

        if i % args.print_freq == 0 and is_log_rank:
            progress.display(i)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    if is_log_rank:
        if batch_time.avg > 0:
            fps = ngpus_per_node * args.batch_size / batch_time.avg
            print("[npu id:",args.gpu,"]",'* FPS@all {:.3f}'.format(fps))
            hwlog.remark_print(key=hwlog.FPS, value=' * FPS@all {:.3f}'.format(fps))
        else:
            # The meter average is still 0 while it is in its warm-up phase
            # (or when the loader was empty); dividing would raise.
            print("[npu id:",args.gpu,"]",'* FPS@all unavailable: too few batches were timed')
|
||||
|
||||
def validate(val_loader, model, criterion, args,ngpus_per_node):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss function called as ``criterion(output, target)``.
        args: Parsed command-line namespace (``gpu``, ``print_freq``,
            ``multiprocessing_distributed``, ``rank``).
        ngpus_per_node: Number of devices per node.

    Returns:
        ``top1.avg`` of the top-1 accuracy meter.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    meters = [batch_time, losses, top1, top5]
    progress = ProgressMeter(len(val_loader), meters, prefix='Test: ')

    def should_report():
        # Single-process runs always report; multiprocessing runs report
        # only from the first rank of each node.
        if not args.multiprocessing_distributed:
            return True
        return args.rank % ngpus_per_node == 0

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for step, (images, target) in enumerate(val_loader):
            device = 'npu:{}'.format(args.gpu)
            target = target.to(torch.int32)
            images = images.to(device, non_blocking=False)
            target = target.to(device, non_blocking=False)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            n_samples = images.size(0)
            losses.update(loss.item(), n_samples)
            top1.update(acc1[0], n_samples)
            top5.update(acc5[0], n_samples)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if step % args.print_freq == 0 and should_report():
                progress.display(step)

        # TODO: this should also be done with the ProgressMeter
        if should_report():
            print("[npu id:",args.gpu,"]",'[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
                  .format(top1=top1, top5=top5))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    return top1.avg
|
||||
|
||||
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename`` and keep a copy when it is the best so far.

    Args:
        state: Checkpoint dict; ``best_acc1`` and ``epoch`` are read to name
            the best-model copy.
        is_best: When true, the saved file is also copied to
            ``model_best_acc<acc>_epoch<epoch>.pth.tar`` in the working directory.
        filename: Destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    best_name = 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])
    shutil.copyfile(filename, best_name)
|
||||
|
||||
class AverageMeter(object):
    """Store the latest value and a running average that skips a warm-up phase.

    The first ``start_count_index`` calls to :meth:`update` only record
    ``val`` and ``count``; they contribute nothing to ``sum`` or ``avg``, so
    start-up iterations do not distort the reported average.
    """

    def __init__(self, name, fmt=':f'):
        """Create a meter labelled ``name`` whose values render with ``fmt``."""
        self.name = name
        self.fmt = fmt
        # Number of initial update() calls excluded from the average.
        self.start_count_index = 10
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0      # total weight seen, warm-up included
        self.updates = 0    # number of update() calls
        self.measured = 0   # total weight seen after the warm-up

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` (e.g. the batch size).

        The warm-up is counted in update calls rather than derived from
        ``count`` and the current ``n``, so the average stays correct when
        ``n`` differs between calls.
        """
        self.val = val
        self.count += n
        self.updates += 1
        if self.updates > self.start_count_index:
            self.sum += val * n
            self.measured += n
            if self.measured > 0:
                self.avg = self.sum / self.measured

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
|
||||
|
||||
|
||||
class ProgressMeter(object):
    """Print a one-line progress report built from a list of meters."""

    def __init__(self, num_batches, meters, prefix=""):
        """Remember the meters and build the ``[ i/N]`` batch counter format."""
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the report for ``batch`` and log the current accuracies.

        The accuracy values are read back from the rendered text. A meter
        set that renders no ``Acc@1`` / ``Acc@5`` entry is simply not logged
        instead of raising ``IndexError``.
        """
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        # KERNEL_NAME_ID is set by the training entry points; fall back to a
        # placeholder so the meter also works when it is absent.
        print("[npu id:", os.environ.get('KERNEL_NAME_ID', 'unknown'), "]", '\t'.join(entries))

        # benchmark log instrumentation
        rendered = str(entries)
        train_acc1 = self._first_token_after(rendered, "Acc@1")
        train_acc5 = self._first_token_after(rendered, "Acc@5")
        if train_acc1 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        if train_acc5 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    @staticmethod
    def _first_token_after(text, tag):
        """Return the first space-delimited token following ``tag``, or None."""
        pieces = text.split(tag)
        if len(pieces) < 2:
            return None
        return pieces[1].strip().split(" ")[0]

    def _get_batch_fmtstr(self, num_batches):
        """Return e.g. ``'[{:3d}/100]'`` for ``num_batches == 100``."""
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
|
||||
|
||||
|
||||
def adjust_learning_rate(optimizer, epoch, args):
    """Apply step decay: the base LR is multiplied by 0.1 once per 30 epochs.

    Args:
        optimizer: Object exposing ``param_groups`` (a list of dicts).
        epoch: Zero-based epoch index.
        args: Namespace providing the base learning rate as ``args.lr``.
    """
    decay_steps = epoch // 30
    lr = args.lr * (0.1 ** decay_steps)
    for group in optimizer.param_groups:
        group['lr'] = lr
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k

    Args:
        output: Score tensor indexed as ``[sample, class]``.
        target: Tensor of class indices, one per sample.
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list with one single-element tensor per ``k``, holding the
        percentage of samples whose target is among the ``k`` highest scores.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape, not view: ``correct`` comes from a transposed tensor
            # and its slice may be non-contiguous, which view() rejects.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Benchmark logs are rooted in the directory that holds this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Emit the environment and configuration records, in this fixed order,
    # before training starts.
    startup_records = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initinal_data.get("base_lr")),
        (hwlog.DATASET, initinal_data.get("dataset")),
        (hwlog.OPT_NAME, initinal_data.get("optimizer")),
        (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
    )
    for record_key, record_value in startup_records:
        hwlog.remark_print(key=record_key, value=record_value)

    main()
|
||||
|
||||
+225
@@ -0,0 +1,225 @@
|
||||
import re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from collections import OrderedDict
|
||||
|
||||
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
|
||||
|
||||
|
||||
# Download locations of the ImageNet-pretrained weights, keyed by architecture.
model_urls = dict(
    densenet121='https://download.pytorch.org/models/densenet121-a639ec97.pth',
    densenet169='https://download.pytorch.org/models/densenet169-b2777c0a.pth',
    densenet201='https://download.pytorch.org/models/densenet201-c1103571.pth',
    densenet161='https://download.pytorch.org/models/densenet161-8d451a50.pth',
)
|
||||
|
||||
|
||||
class _DenseLayer(nn.Sequential):
|
||||
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
|
||||
super(_DenseLayer, self).__init__()
|
||||
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
|
||||
self.add_module('relu1', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
|
||||
growth_rate, kernel_size=1, stride=1, bias=False)),
|
||||
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
|
||||
self.add_module('relu2', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
|
||||
kernel_size=3, stride=1, padding=1, bias=False)),
|
||||
self.drop_rate = drop_rate
|
||||
|
||||
def forward(self, x):
|
||||
new_features = super(_DenseLayer, self).forward(x)
|
||||
if self.drop_rate > 0:
|
||||
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
|
||||
return torch.cat([x, new_features], 1)
|
||||
|
||||
|
||||
class _DenseBlock(nn.Sequential):
    """A chain of ``num_layers`` dense layers.

    Layer ``i`` (zero-based) is built for ``num_input_features + i * growth_rate``
    input channels and registered as ``denselayer<i + 1>``.
    """

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super().__init__()
        in_channels = num_input_features
        for position in range(1, num_layers + 1):
            self.add_module(
                'denselayer%d' % position,
                _DenseLayer(in_channels, growth_rate, bn_size, drop_rate),
            )
            in_channels += growth_rate
|
||||
|
||||
|
||||
class _Transition(nn.Sequential):
|
||||
def __init__(self, num_input_features, num_output_features):
|
||||
super(_Transition, self).__init__()
|
||||
self.add_module('norm', nn.BatchNorm2d(num_input_features))
|
||||
self.add_module('relu', nn.ReLU(inplace=True))
|
||||
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
|
||||
kernel_size=1, stride=1, bias=False))
|
||||
self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
|
||||
|
||||
|
||||
class DenseNet(nn.Module):
    r"""Densenet-BC model class, based on
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        growth_rate (int) - how many filters to add each layer (`k` in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
        super().__init__()

        # Stem: 7x7 strided convolution followed by max pooling.
        stem = OrderedDict()
        stem['conv0'] = nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)
        stem['norm0'] = nn.BatchNorm2d(num_init_features)
        stem['relu0'] = nn.ReLU(inplace=True)
        stem['pool0'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.features = nn.Sequential(stem)

        # Dense blocks; every block except the last is followed by a
        # transition that halves the channel count.
        channels = num_init_features
        final_index = len(block_config) - 1
        for index, depth in enumerate(block_config):
            stage = index + 1
            self.features.add_module(
                'denseblock%d' % stage,
                _DenseBlock(num_layers=depth, num_input_features=channels,
                            bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate))
            channels = channels + depth * growth_rate
            if index != final_index:
                reduced = channels // 2
                self.features.add_module(
                    'transition%d' % stage,
                    _Transition(num_input_features=channels, num_output_features=reduced))
                channels = reduced

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm2d(channels))

        # Linear layer
        self.classifier = nn.Linear(channels, num_classes)

        # Official init from torch repo.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        feature_maps = self.features(x)
        activated = F.relu(feature_maps, inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flattened = pooled.view(feature_maps.size(0), -1)
        return self.classifier(flattened)
|
||||
|
||||
|
||||
# '.'s are no longer allowed in module names, but the previous _DenseLayer
# has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
# They are also in the checkpoints in model_urls. This pattern is used
# to find such keys.
_LEGACY_DENSELAYER_KEY = re.compile(
    r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')


def _load_pretrained_weights(model, arch):
    """Download the ``arch`` checkpoint, rename legacy keys and load it into ``model``."""
    state_dict = model_zoo.load_url(model_urls[arch])
    for key in list(state_dict.keys()):
        res = _LEGACY_DENSELAYER_KEY.match(key)
        if res:
            # e.g. '...denselayer1.norm.1.weight' -> '...denselayer1.norm1.weight'
            state_dict[res.group(1) + res.group(2)] = state_dict.pop(key)
    model.load_state_dict(state_dict)


def densenet121(pretrained=False, **kwargs):
    r"""Densenet-121 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet121')
    return model


def densenet169(pretrained=False, **kwargs):
    r"""Densenet-169 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet169')
    return model


def densenet201(pretrained=False, **kwargs):
    r"""Densenet-201 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet201')
    return model


def densenet161(pretrained=False, **kwargs):
    r"""Densenet-161 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet161')
    return model
|
||||
+279
@@ -0,0 +1,279 @@
|
||||
import re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint as cp
|
||||
from collections import OrderedDict
|
||||
#from .utils import load_state_dict_from_url
|
||||
from torch import Tensor
|
||||
from torch.jit.annotations import List
|
||||
|
||||
|
||||
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
|
||||
|
||||
# Download locations of the ImageNet-pretrained weights, keyed by architecture.
model_urls = dict(
    densenet121='https://download.pytorch.org/models/densenet121-a639ec97.pth',
    densenet169='https://download.pytorch.org/models/densenet169-b2777c0a.pth',
    densenet201='https://download.pytorch.org/models/densenet201-c1103571.pth',
    densenet161='https://download.pytorch.org/models/densenet161-8d451a50.pth',
)
|
||||
|
||||
|
||||
class _DenseLayer(nn.Module):
|
||||
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient=False):
|
||||
super(_DenseLayer, self).__init__()
|
||||
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
|
||||
self.add_module('relu1', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
|
||||
growth_rate, kernel_size=1, stride=1,
|
||||
bias=False)),
|
||||
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
|
||||
self.add_module('relu2', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
|
||||
kernel_size=3, stride=1, padding=1,
|
||||
bias=False)),
|
||||
self.drop_rate = float(drop_rate)
|
||||
self.memory_efficient = memory_efficient
|
||||
|
||||
def bn_function(self, inputs):
|
||||
# type: (List[Tensor]) -> Tensor
|
||||
concated_features = torch.cat(inputs, 1)
|
||||
bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features))) # noqa: T484
|
||||
return bottleneck_output
|
||||
|
||||
# todo: rewrite when torchscript supports any
|
||||
def any_requires_grad(self, input):
|
||||
# type: (List[Tensor]) -> bool
|
||||
for tensor in input:
|
||||
if tensor.requires_grad:
|
||||
return True
|
||||
return False
|
||||
|
||||
@torch.jit.unused # noqa: T484
|
||||
def call_checkpoint_bottleneck(self, input):
|
||||
# type: (List[Tensor]) -> Tensor
|
||||
def closure(*inputs):
|
||||
return self.bn_function(*inputs)
|
||||
|
||||
return cp.checkpoint(closure, input)
|
||||
|
||||
@torch.jit._overload_method # noqa: F811
|
||||
def forward(self, input):
|
||||
# type: (List[Tensor]) -> (Tensor)
|
||||
pass
|
||||
|
||||
@torch.jit._overload_method # noqa: F811
|
||||
def forward(self, input):
|
||||
# type: (Tensor) -> (Tensor)
|
||||
pass
|
||||
|
||||
# torchscript does not yet support *args, so we overload method
|
||||
# allowing it to take either a List[Tensor] or single Tensor
|
||||
def forward(self, input): # noqa: F811
|
||||
if isinstance(input, Tensor):
|
||||
prev_features = [input]
|
||||
else:
|
||||
prev_features = input
|
||||
|
||||
if self.memory_efficient and self.any_requires_grad(prev_features):
|
||||
if torch.jit.is_scripting():
|
||||
raise Exception("Memory Efficient not supported in JIT")
|
||||
|
||||
bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
|
||||
else:
|
||||
bottleneck_output = self.bn_function(prev_features)
|
||||
|
||||
new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
|
||||
if self.drop_rate > 0:
|
||||
new_features = F.dropout(new_features, p=self.drop_rate,
|
||||
training=self.training)
|
||||
return new_features
|
||||
|
||||
|
||||
class _DenseBlock(nn.ModuleDict):
    """A block of dense layers in which each layer sees all earlier features.

    Layers are registered as ``denselayer1`` ... ``denselayer<num_layers>``.
    The forward pass returns the input concatenated with every layer's
    output along dimension 1.
    """

    _version = 2

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, memory_efficient=False):
        super().__init__()
        in_channels = num_input_features
        for position in range(1, num_layers + 1):
            dense_layer = _DenseLayer(
                in_channels,
                growth_rate=growth_rate,
                bn_size=bn_size,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient,
            )
            self.add_module('denselayer%d' % position, dense_layer)
            in_channels += growth_rate

    def forward(self, init_features):
        collected = [init_features]
        for dense_layer in self.values():
            # Each layer receives the whole list gathered so far.
            collected.append(dense_layer(collected))
        return torch.cat(collected, 1)
|
||||
|
||||
|
||||
class _Transition(nn.Sequential):
|
||||
def __init__(self, num_input_features, num_output_features):
|
||||
super(_Transition, self).__init__()
|
||||
self.add_module('norm', nn.BatchNorm2d(num_input_features))
|
||||
self.add_module('relu', nn.ReLU(inplace=True))
|
||||
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
|
||||
kernel_size=1, stride=1, bias=False))
|
||||
self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
|
||||
|
||||
|
||||
class DenseNet(nn.Module):
    r"""Densenet-BC model class, based on
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        growth_rate (int) - how many filters to add each layer (`k` in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, memory_efficient=False):
        super().__init__()

        # Stem: 7x7 strided convolution followed by max pooling.
        stem = OrderedDict()
        stem['conv0'] = nn.Conv2d(3, num_init_features, kernel_size=7, stride=2,
                                  padding=3, bias=False)
        stem['norm0'] = nn.BatchNorm2d(num_init_features)
        stem['relu0'] = nn.ReLU(inplace=True)
        stem['pool0'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.features = nn.Sequential(stem)

        # Dense blocks; every block except the last is followed by a
        # transition that halves the channel count.
        channels = num_init_features
        final_index = len(block_config) - 1
        for index, depth in enumerate(block_config):
            stage = index + 1
            dense_block = _DenseBlock(
                num_layers=depth,
                num_input_features=channels,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient
            )
            self.features.add_module('denseblock%d' % stage, dense_block)
            channels = channels + depth * growth_rate
            if index != final_index:
                reduced = channels // 2
                self.features.add_module(
                    'transition%d' % stage,
                    _Transition(num_input_features=channels, num_output_features=reduced))
                channels = reduced

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm2d(channels))

        # Linear layer
        self.classifier = nn.Linear(channels, num_classes)

        # Official init from torch repo.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        feature_maps = self.features(x)
        activated = F.relu(feature_maps, inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flattened = torch.flatten(pooled, 1)
        return self.classifier(flattened)
|
||||
|
||||
|
||||
def _load_state_dict(model, model_url, progress):
    """Download a checkpoint and load it into ``model``.

    Keys written by the older ``_DenseLayer`` naming ('norm.1', 'relu.1',
    'conv.1', 'norm.2', 'relu.2', 'conv.2') contain a '.' that is no longer
    allowed in module names. Such keys are renamed by dropping that dot
    (e.g. 'norm.1.weight' -> 'norm1.weight') before loading.

    Args:
        model: module whose ``load_state_dict`` receives the renamed weights.
        model_url: URL handed to ``load_state_dict_from_url``.
        progress: forwarded to ``load_state_dict_from_url`` as ``progress``.
    """
    legacy_key = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')

    weights = load_state_dict_from_url(model_url, progress=progress)
    # Iterate over a snapshot of the keys because the mapping is edited in place.
    for old_name in list(weights.keys()):
        found = legacy_key.match(old_name)
        if found is None:
            continue
        weights[found.group(1) + found.group(2)] = weights.pop(old_name)
    model.load_state_dict(weights)
|
||||
|
||||
|
||||
def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,
              **kwargs):
    """Build a ``DenseNet`` and optionally load the published weights for ``arch``.

    Args:
        arch: key into ``model_urls`` selecting the checkpoint.
        growth_rate, block_config, num_init_features: passed positionally
            to ``DenseNet``.
        pretrained: when truthy, the checkpoint is loaded into the model.
        progress: forwarded to ``_load_state_dict``.
        **kwargs: forwarded to ``DenseNet``.

    Returns:
        The constructed model.
    """
    net = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
    if not pretrained:
        return net
    _load_state_dict(net, model_urls[arch], progress)
    return net
|
||||
|
||||
|
||||
def densenet121(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-121 model described in
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 24, 16), 64
    return _densenet('densenet121', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
|
||||
|
||||
|
||||
def densenet161(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-161 model described in
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 48, (6, 12, 36, 24), 96
    return _densenet('densenet161', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
|
||||
|
||||
|
||||
def densenet169(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-169 model described in
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 32, 32), 64
    return _densenet('densenet169', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
|
||||
|
||||
|
||||
def densenet201(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-201 model described in
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 48, 32), 64
    return _densenet('densenet201', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
|
||||
@@ -0,0 +1,22 @@
|
||||
# Environment for the Ascend toolkit, followed by a single-device DenseNet-121 run.
export ASCEND_HOME=/usr/local/Ascend
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/

# currentDir is not set anywhere in this script. Left unset, it expanded to an
# empty PYTHONPATH entry, which Python treats as the working directory. Make
# that explicit while still honouring a value supplied by the caller.
currentDir=${currentDir:-$(pwd)}
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

export SLOG_PRINT_TO_STDOUT=0
# NOTE(review): presumably sets the log level of device 7 to "error" - confirm against the adc tool.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 7"

export TASK_QUEUE_ENABLE=0
# Pin the process to CPU cores 111-150 (40 cores, matching --workers 40) and
# run on NPU 7 with --evaluate, resuming from checkpoint.pth.tar.
taskset -c 111-150 python3 densenet121_1p_main.py \
    --workers 40 \
    --arch densenet121 \
    --npu 7 \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --batch-size 256 \
    --epoch 90 \
    --evaluate \
    --resume checkpoint.pth.tar \
    --data /opt/npu/dataset/imagenet
|
||||
+275
@@ -0,0 +1,275 @@
|
||||
import re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from collections import OrderedDict
|
||||
|
||||
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
|
||||
|
||||
|
||||
# Download URL of the published checkpoint for each architecture name.
model_urls = {
    arch: 'https://download.pytorch.org/models/%s-%s.pth' % (arch, digest)
    for arch, digest in (
        ('densenet121', 'a639ec97'),
        ('densenet169', 'b2777c0a'),
        ('densenet201', 'c1103571'),
        ('densenet161', '8d451a50'),
    )
}
|
||||
|
||||
|
||||
class _DenseLayer(nn.Sequential):
|
||||
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
|
||||
super(_DenseLayer, self).__init__()
|
||||
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
|
||||
self.add_module('relu1', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
|
||||
growth_rate, kernel_size=1, stride=1, bias=False)),
|
||||
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
|
||||
self.add_module('relu2', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
|
||||
kernel_size=3, stride=1, padding=1, bias=False)),
|
||||
self.drop_rate = drop_rate
|
||||
|
||||
def forward(self, x):
|
||||
new_features = super(_DenseLayer, self).forward(x)
|
||||
if self.drop_rate > 0:
|
||||
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
|
||||
return torch.cat([x, new_features], 1)
|
||||
|
||||
|
||||
class _DenseBlock(nn.Sequential):
|
||||
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
|
||||
super(_DenseBlock, self).__init__()
|
||||
for i in range(num_layers):
|
||||
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
|
||||
self.add_module('denselayer%d' % (i + 1), layer)
|
||||
|
||||
|
||||
class _Transition(nn.Sequential):
|
||||
def __init__(self, num_input_features, num_output_features):
|
||||
super(_Transition, self).__init__()
|
||||
self.add_module('norm', nn.BatchNorm2d(num_input_features))
|
||||
self.add_module('relu', nn.ReLU(inplace=True))
|
||||
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
|
||||
kernel_size=1, stride=1, bias=False))
|
||||
#self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2)) ######### xupeng add ##########
|
||||
|
||||
|
||||
class DenseNet(nn.Module):
    r"""Densenet-BC model class, based on
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    The feature extractor is registered as four sequential stages,
    ``features0`` .. ``features3``. ``forward`` applies a shared 2x2 average
    pooling after each of the first three stages.

    Args:
        growth_rate (int) - how many filters to add each layer (`k` in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):

        super(DenseNet, self).__init__()

        self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)

        final_stage = 3
        width = num_init_features
        for stage in range(final_stage + 1):
            depth = block_config[stage]
            # Modules are created in the order dense block, transition, stem
            # so that random-number consumption is unchanged.
            dense = _DenseBlock(num_layers=depth, num_input_features=width, bn_size=bn_size,
                                growth_rate=growth_rate, drop_rate=drop_rate)
            width = width + depth * growth_rate
            tail = [('denseblock%d' % (stage + 1), dense)]
            if stage < final_stage:
                tail.append(('transition%d' % (stage + 1),
                             _Transition(num_input_features=width, num_output_features=width // 2)))
            else:
                # Final batch norm
                tail.append(('norm5', nn.BatchNorm2d(width)))
            head = []
            if stage == 0:
                # First convolution
                head = [
                    ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                    ('norm0', nn.BatchNorm2d(num_init_features)),
                    ('relu0', nn.ReLU(inplace=True)),
                    ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
                ]
            setattr(self, 'features%d' % stage, nn.Sequential(OrderedDict(head + tail)))
            if stage < final_stage:
                width = width // 2

        # Linear layer
        self.classifier = nn.Linear(width, num_classes)

        # Official init from torch repo.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the classifier output for the batch ``x``."""
        out = x
        for stage in (self.features0, self.features1, self.features2):
            out = self.avg_pool(stage(out))
        features3 = self.features3(out)

        out = F.relu(features3, inplace=True)
        out = F.adaptive_avg_pool2d(out, (1, 1)).view(features3.size(0), -1)
        return self.classifier(out)
|
||||
|
||||
|
||||
def densenet121(pretrained=False, **kwargs):
    r"""Densenet-121 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet121'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
|
||||
|
||||
def densenet169(pretrained=False, **kwargs):
    r"""Densenet-169 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet169'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
|
||||
|
||||
def densenet201(pretrained=False, **kwargs):
    r"""Densenet-201 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet201'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
|
||||
|
||||
def densenet161(pretrained=False, **kwargs):
    r"""Densenet-161 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet161'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
+275
@@ -0,0 +1,275 @@
|
||||
import re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
from collections import OrderedDict
|
||||
|
||||
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
|
||||
|
||||
|
||||
# Download URL of the published checkpoint for each architecture name.
model_urls = {
    arch: 'https://download.pytorch.org/models/%s-%s.pth' % (arch, digest)
    for arch, digest in (
        ('densenet121', 'a639ec97'),
        ('densenet169', 'b2777c0a'),
        ('densenet201', 'c1103571'),
        ('densenet161', '8d451a50'),
    )
}
|
||||
|
||||
|
||||
class _DenseLayer(nn.Sequential):
|
||||
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
|
||||
super(_DenseLayer, self).__init__()
|
||||
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
|
||||
self.add_module('relu1', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
|
||||
growth_rate, kernel_size=1, stride=1, bias=False)),
|
||||
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
|
||||
self.add_module('relu2', nn.ReLU(inplace=True)),
|
||||
self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
|
||||
kernel_size=3, stride=1, padding=1, bias=False)),
|
||||
self.drop_rate = drop_rate
|
||||
|
||||
def forward(self, x):
|
||||
new_features = super(_DenseLayer, self).forward(x)
|
||||
if self.drop_rate > 0:
|
||||
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
|
||||
return torch.cat([x, new_features], 1)
|
||||
|
||||
|
||||
class _DenseBlock(nn.Sequential):
|
||||
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
|
||||
super(_DenseBlock, self).__init__()
|
||||
for i in range(num_layers):
|
||||
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
|
||||
self.add_module('denselayer%d' % (i + 1), layer)
|
||||
|
||||
|
||||
class _Transition(nn.Sequential):
|
||||
def __init__(self, num_input_features, num_output_features):
|
||||
super(_Transition, self).__init__()
|
||||
self.add_module('norm', nn.BatchNorm2d(num_input_features))
|
||||
self.add_module('relu', nn.ReLU(inplace=True))
|
||||
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
|
||||
kernel_size=1, stride=1, bias=False))
|
||||
#self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2)) ######### xupeng add ##########
|
||||
|
||||
|
||||
class DenseNet(nn.Module):
    r"""Densenet-BC model class, based on
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    The feature extractor is registered as four sequential stages,
    ``features0`` .. ``features3``. After each of the first three stages
    ``forward`` copies the activations to the CPU, applies a shared 2x2
    average pooling there and copies the result back to the input's device.

    Args:
        growth_rate (int) - how many filters to add each layer (`k` in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):

        super(DenseNet, self).__init__()

        self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)

        final_stage = 3
        width = num_init_features
        for stage in range(final_stage + 1):
            depth = block_config[stage]
            # Modules are created in the order dense block, transition, stem
            # so that random-number consumption matches the unrolled original.
            dense = _DenseBlock(num_layers=depth, num_input_features=width, bn_size=bn_size,
                                growth_rate=growth_rate, drop_rate=drop_rate)
            width = width + depth * growth_rate
            tail = [('denseblock%d' % (stage + 1), dense)]
            if stage < final_stage:
                tail.append(('transition%d' % (stage + 1),
                             _Transition(num_input_features=width, num_output_features=width // 2)))
            else:
                # Final batch norm
                tail.append(('norm5', nn.BatchNorm2d(width)))
            head = []
            if stage == 0:
                # First convolution
                head = [
                    ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
                    ('norm0', nn.BatchNorm2d(num_init_features)),
                    ('relu0', nn.ReLU(inplace=True)),
                    ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
                ]
            setattr(self, 'features%d' % stage, nn.Sequential(OrderedDict(head + tail)))
            if stage < final_stage:
                width = width // 2

        # Linear layer
        self.classifier = nn.Linear(width, num_classes)

        # Official init from torch repo.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the classifier output for the batch ``x``.

        The pooled activations are sent back to ``x.device`` rather than a
        hard-coded "npu:0", so the model also works when it lives on another
        device. AvgPool2d has no parameters or buffers, so the pooling
        module itself needs no device move.
        """
        # NOTE(review): pooling is presumably done on the CPU to work around
        # an NPU average-pool limitation - confirm before removing the hop.
        compute_device = x.device
        out = x
        for stage in (self.features0, self.features1, self.features2):
            out = self.avg_pool(stage(out).cpu()).to(compute_device)
        features3 = self.features3(out)

        out = F.relu(features3, inplace=True)
        out = F.adaptive_avg_pool2d(out, (1, 1)).view(features3.size(0), -1)
        return self.classifier(out)
|
||||
|
||||
|
||||
def densenet121(pretrained=False, **kwargs):
    r"""Densenet-121 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet121'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
|
||||
|
||||
def densenet169(pretrained=False, **kwargs):
    r"""Densenet-169 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet169'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
|
||||
|
||||
def densenet201(pretrained=False, **kwargs):
    r"""Densenet-201 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet201'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
|
||||
|
||||
def densenet161(pretrained=False, **kwargs):
    r"""Densenet-161 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
                     **kwargs)
    if pretrained:
        # '.'s are no longer allowed in module names, but previous _DenseLayer
        # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
        # They are also in the checkpoints in model_urls. This pattern is used
        # to find such keys.
        pattern = re.compile(
            r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
        # The checkpoint stores every feature module under one 'features.'
        # prefix, while this DenseNet registers them under features0..features3.
        # Look up which stage owns each top-level child so keys can be moved
        # to the matching prefix; otherwise load_state_dict rejects them.
        stage_of = {child: stage
                    for stage in ('features0', 'features1', 'features2', 'features3')
                    for child, _ in getattr(model, stage).named_children()}
        state_dict = model_zoo.load_url(model_urls['densenet161'])
        for key in list(state_dict.keys()):
            new_key = key
            res = pattern.match(new_key)
            if res:
                new_key = res.group(1) + res.group(2)
            head, _, rest = new_key.partition('.')
            child = rest.partition('.')[0]
            if head == 'features' and child in stage_of:
                new_key = stage_of[child] + '.' + rest
            if new_key != key:
                state_dict[new_key] = state_dict.pop(key)
        model.load_state_dict(state_dict)
    return model
|
||||
+300
@@ -0,0 +1,300 @@
|
||||
import re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint as cp
|
||||
from collections import OrderedDict
|
||||
#from .utils import load_state_dict_from_url
|
||||
from torch import Tensor
|
||||
from torch.jit.annotations import List
|
||||
|
||||
|
||||
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
|
||||
|
||||
# Download URL of the published checkpoint for each architecture name.
model_urls = {
    arch: 'https://download.pytorch.org/models/%s-%s.pth' % (arch, digest)
    for arch, digest in (
        ('densenet121', 'a639ec97'),
        ('densenet169', 'b2777c0a'),
        ('densenet201', 'c1103571'),
        ('densenet161', '8d451a50'),
    )
}
|
||||
|
||||
|
||||
class _DenseLayer(nn.Module):
    """One dense layer: BN-ReLU-Conv(1x1) bottleneck followed by BN-ReLU-Conv(3x3).

    The layer receives every feature map produced so far in its block (as a
    list, or a single tensor), concatenates them along the channel axis and
    emits ``growth_rate`` new channels.

    Args:
        num_input_features: channel count of the concatenated inputs.
        growth_rate: number of channels produced by this layer.
        bn_size: the bottleneck produces ``bn_size * growth_rate`` channels.
        drop_rate: dropout probability applied to the output (0 disables it).
        memory_efficient: when true, the bottleneck is recomputed during
            backward through ``torch.utils.checkpoint`` instead of being stored.
    """

    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient=False):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
        self.add_module('relu1', nn.ReLU(inplace=True))
        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
                                           growth_rate, kernel_size=1, stride=1,
                                           bias=False))
        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
        self.add_module('relu2', nn.ReLU(inplace=True))
        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
                                           kernel_size=3, stride=1, padding=1,
                                           bias=False))
        self.drop_rate = float(drop_rate)
        self.memory_efficient = memory_efficient

    def bn_function(self, inputs):
        # type: (List[Tensor]) -> Tensor
        # Concatenate all previous feature maps on the channel axis, then run
        # the 1x1 bottleneck (norm1 -> relu1 -> conv1).
        concated_features = torch.cat(inputs, 1)
        bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))  # noqa: T484
        return bottleneck_output

    # todo: rewrite when torchscript supports any
    def any_requires_grad(self, input):
        # type: (List[Tensor]) -> bool
        # Return True as soon as one of the input tensors tracks gradients.
        for tensor in input:
            if tensor.requires_grad:
                return True
        return False

    @torch.jit.unused  # noqa: T484
    def call_checkpoint_bottleneck(self, input):
        # type: (List[Tensor]) -> Tensor
        # The tensors must reach cp.checkpoint as individual positional
        # arguments: checkpoint only tracks tensors it receives directly, so
        # handing it the list as one opaque argument breaks gradient flow
        # through the bottleneck.
        def closure(*inputs):
            return self.bn_function(inputs)

        return cp.checkpoint(closure, *input)

    @torch.jit._overload_method  # noqa: F811
    def forward(self, input):
        # type: (List[Tensor]) -> (Tensor)
        pass

    @torch.jit._overload_method  # noqa: F811
    def forward(self, input):
        # type: (Tensor) -> (Tensor)
        pass

    # torchscript does not yet support *args, so we overload method
    # allowing it to take either a List[Tensor] or single Tensor
    def forward(self, input):  # noqa: F811
        """Compute the new feature maps from a tensor or a list of tensors."""
        if isinstance(input, Tensor):
            prev_features = [input]
        else:
            prev_features = input

        if self.memory_efficient and self.any_requires_grad(prev_features):
            if torch.jit.is_scripting():
                raise Exception("Memory Efficient not supported in JIT")

            bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
        else:
            bottleneck_output = self.bn_function(prev_features)

        new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate,
                                     training=self.training)
        return new_features
|
||||
|
||||
|
||||
class _DenseBlock(nn.ModuleDict):
    """Stack of ``num_layers`` dense layers registered as ``denselayer1..N``.

    Layer ``k`` (1-based) is built for ``num_input_features + (k - 1) *
    growth_rate`` input channels, i.e. the block input plus the outputs of all
    earlier layers in the block.
    """
    _version = 2

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, memory_efficient=False):
        super(_DenseBlock, self).__init__()
        in_channels = num_input_features
        for position in range(1, num_layers + 1):
            dense_layer = _DenseLayer(
                in_channels,
                growth_rate=growth_rate,
                bn_size=bn_size,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient,
            )
            self.add_module('denselayer%d' % position, dense_layer)
            # Each layer contributes growth_rate channels to the next one.
            in_channels = in_channels + growth_rate

    def forward(self, init_features):
        """Run every layer on the growing feature list and return the channel-wise concatenation."""
        collected = [init_features]
        for _layer_name, dense_layer in self.items():
            produced = dense_layer(collected)
            collected.append(produced)
        return torch.cat(collected, 1)
|
||||
|
||||
|
||||
class _Transition(nn.Sequential):
    """Transition between dense blocks: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    The convolution maps ``num_input_features`` channels to
    ``num_output_features`` channels; the pooling uses kernel 2 and stride 2.
    """

    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        stages = (
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=False)),
            ('pool', nn.AvgPool2d(kernel_size=2, stride=2)),
        )
        # Register in declaration order so the sequential pipeline is unchanged.
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
|
||||
|
||||
class PrintLayer(nn.Module):
    """Pass-through debug module that prints the mean of whatever flows through it.

    Args:
        name: label placed at the start of the printed line.
    """

    def __init__(self, name):
        super(PrintLayer, self).__init__()
        self.name = name

    def forward(self, x):
        """Print ``"<name> mean data: <mean>"`` and return ``x`` unchanged."""
        mean_value = x.mean().item()
        message = "{} mean data: {}".format(self.name, mean_value)
        print(message)  # print(x.shape)
        return x
|
||||
|
||||
class DenseNet(nn.Module):
    r"""DenseNet-BC network from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_,
    instrumented with :class:`PrintLayer` probes and ``print`` calls that report
    the mean activation at several points of every forward pass.

    Args:
        growth_rate (int) - channels added by each dense layer (`k` in the paper)
        block_config (sequence of ints) - number of dense layers in each dense block
        num_init_features (int) - output channels of the first convolution
        bn_size (int) - bottleneck width multiplier
          (each bottleneck has bn_size * growth_rate channels)
        drop_rate (float) - dropout rate applied after each dense layer
        num_classes (int) - size of the classifier output
        memory_efficient (bool) - If True, dense layers use checkpointing: less memory,
          more compute. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, memory_efficient=False):

        super(DenseNet, self).__init__()

        # Stem: 7x7 stride-2 convolution, batch norm, ReLU and 3x3 stride-2 max
        # pooling, with a debug probe after the conv, the norm and the pool.
        stem = OrderedDict()
        stem['conv0'] = nn.Conv2d(3, num_init_features, kernel_size=7, stride=2,
                                  padding=3, bias=False)
        stem['conv0_p'] = PrintLayer('conv0_p')
        stem['norm0'] = nn.BatchNorm2d(num_init_features)
        stem['norm0_p'] = PrintLayer('norm0_p')
        stem['relu0'] = nn.ReLU(inplace=True)
        stem['pool0'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        stem['pool0_p'] = PrintLayer('pool0_p')
        self.features = nn.Sequential(stem)

        # Dense blocks, each followed by a probe; every block except the last
        # is also followed by a channel-halving transition and its probe.
        channels = num_init_features
        stage_count = len(block_config)
        for stage, depth in enumerate(block_config, start=1):
            dense_block = _DenseBlock(
                num_layers=depth,
                num_input_features=channels,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient
            )
            self.features.add_module('denseblock%d' % stage, dense_block)
            self.features.add_module('denseblock%d_p' % stage, PrintLayer('denseblock%d_p' % stage))
            channels = channels + depth * growth_rate
            if stage == stage_count:
                continue
            halved = channels // 2
            transition = _Transition(num_input_features=channels,
                                     num_output_features=halved)
            self.features.add_module('transition%d' % stage, transition)
            self.features.add_module('transition%d_p' % stage, PrintLayer('transition%d_p' % stage))
            channels = halved

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm2d(channels))

        # Linear layer
        self.classifier = nn.Linear(channels, num_classes)

        # Weight initialisation: Kaiming-normal conv weights, unit/zero batch
        # norm affine parameters, zero linear bias.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the classifier output for ``x``, printing two intermediate means on the way."""
        feature_maps = self.features(x)

        print('the features mean: {}'.format(feature_maps.mean().item()))

        activated = F.relu(feature_maps, inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flat = torch.flatten(pooled, 1)
        print('the flatten mean: {}'.format(flat.mean().item()))
        return self.classifier(flat)
|
||||
|
||||
|
||||
def _load_state_dict(model, model_url, progress):
    """Download a checkpoint and load it into ``model`` after renaming legacy keys.

    Older checkpoints name dense-layer entries ``norm.1``, ``relu.1``,
    ``conv.1``, ``norm.2``, ``relu.2`` and ``conv.2``. Dots are no longer
    allowed in module names, so the dot before the ``1``/``2`` is removed
    (``norm.1.weight`` becomes ``norm1.weight``) before loading.

    Args:
        model: module that receives the weights through ``load_state_dict``.
        model_url: checkpoint location handed to ``load_state_dict_from_url``.
        progress: forwarded to ``load_state_dict_from_url``.
    """
    legacy_key = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')

    weights = load_state_dict_from_url(model_url, progress=progress)
    # Iterate over a snapshot of the keys because entries are renamed in place.
    for old_name in tuple(weights.keys()):
        hit = legacy_key.match(old_name)
        if hit is None:
            continue
        prefix, suffix = hit.groups()
        weights[prefix + suffix] = weights.pop(old_name)
    model.load_state_dict(weights)
|
||||
|
||||
|
||||
def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,
              **kwargs):
    """Build a ``DenseNet`` and, when ``pretrained`` is truthy, load the checkpoint listed for ``arch``.

    ``arch`` is only used as the key into ``model_urls``; extra keyword
    arguments are passed to the ``DenseNet`` constructor.
    """
    network = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
    if not pretrained:
        return network
    _load_state_dict(network, model_urls[arch], progress)
    return network
|
||||
|
||||
|
||||
def densenet121(pretrained=False, progress=True, **kwargs):
    r"""Build DenseNet-121 (growth rate 32, blocks (6, 12, 24, 16), 64 initial features), from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, the checkpoint listed under ``'densenet121'`` is loaded
        progress (bool): forwarded to the checkpoint download helper
        **kwargs: forwarded to the ``DenseNet`` constructor, e.g. ``memory_efficient``
          (see `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_)
    """
    return _densenet(
        arch='densenet121',
        growth_rate=32,
        block_config=(6, 12, 24, 16),
        num_init_features=64,
        pretrained=pretrained,
        progress=progress,
        **kwargs
    )
|
||||
|
||||
|
||||
def densenet161(pretrained=False, progress=True, **kwargs):
    r"""Build DenseNet-161 (growth rate 48, blocks (6, 12, 36, 24), 96 initial features), from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, the checkpoint listed under ``'densenet161'`` is loaded
        progress (bool): forwarded to the checkpoint download helper
        **kwargs: forwarded to the ``DenseNet`` constructor, e.g. ``memory_efficient``
          (see `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_)
    """
    return _densenet(
        arch='densenet161',
        growth_rate=48,
        block_config=(6, 12, 36, 24),
        num_init_features=96,
        pretrained=pretrained,
        progress=progress,
        **kwargs
    )
|
||||
|
||||
|
||||
def densenet169(pretrained=False, progress=True, **kwargs):
    r"""Build DenseNet-169 (growth rate 32, blocks (6, 12, 32, 32), 64 initial features), from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, the checkpoint listed under ``'densenet169'`` is loaded
        progress (bool): forwarded to the checkpoint download helper
        **kwargs: forwarded to the ``DenseNet`` constructor, e.g. ``memory_efficient``
          (see `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_)
    """
    return _densenet(
        arch='densenet169',
        growth_rate=32,
        block_config=(6, 12, 32, 32),
        num_init_features=64,
        pretrained=pretrained,
        progress=progress,
        **kwargs
    )
|
||||
|
||||
|
||||
def densenet201(pretrained=False, progress=True, **kwargs):
    r"""Build DenseNet-201 (growth rate 32, blocks (6, 12, 48, 32), 64 initial features), from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, the checkpoint listed under ``'densenet201'`` is loaded
        progress (bool): forwarded to the checkpoint download helper
        **kwargs: forwarded to the ``DenseNet`` constructor, e.g. ``memory_efficient``
          (see `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_)
    """
    return _densenet(
        arch='densenet201',
        growth_rate=32,
        block_config=(6, 12, 48, 32),
        num_init_features=64,
        pretrained=pretrained,
        progress=progress,
        **kwargs
    )
|
||||
+32
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"board_id": "0x0000",
|
||||
"chip_info": "910",
|
||||
"deploy_mode": "lab",
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"device_num": "1",
|
||||
"server_num": "1",
|
||||
"group_name": "",
|
||||
"instance_count": "1",
|
||||
"instance_list": [
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.100.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "0",
|
||||
"server_id": "10.246.246.76"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"para_plane_nic_location": "device",
|
||||
"para_plane_nic_name": [
|
||||
"eth0"
|
||||
],
|
||||
"para_plane_nic_num": "1",
|
||||
"status": "completed"
|
||||
}
|
||||
+44
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"board_id": "0x0000",
|
||||
"chip_info": "910",
|
||||
"deploy_mode": "lab",
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"device_num": "2",
|
||||
"server_num": "1",
|
||||
"group_name": "",
|
||||
"instance_count": "2",
|
||||
"instance_list": [
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.100.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "0",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "1",
|
||||
"device_ip": "192.168.101.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "1",
|
||||
"server_id": "10.246.246.76"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"para_plane_nic_location": "device",
|
||||
"para_plane_nic_name": [
|
||||
"eth0",
|
||||
"eth1"
|
||||
],
|
||||
"para_plane_nic_num": "2",
|
||||
"status": "completed"
|
||||
}
|
||||
+65
@@ -0,0 +1,65 @@
|
||||
{
|
||||
"board_id": "0x0000",
|
||||
"chip_info": "910",
|
||||
"deploy_mode": "lab",
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"device_num": "4",
|
||||
"server_num": "1",
|
||||
"group_name": "",
|
||||
"instance_count": "4",
|
||||
"instance_list": [
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.190.102"
|
||||
}
|
||||
],
|
||||
"rank_id": "0",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "1",
|
||||
"device_ip": "192.168.191.102"
|
||||
}
|
||||
],
|
||||
"rank_id": "1",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "2",
|
||||
"device_ip": "192.168.192.102"
|
||||
}
|
||||
],
|
||||
"rank_id": "2",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "3",
|
||||
"device_ip": "192.168.193.102"
|
||||
}
|
||||
],
|
||||
"rank_id": "3",
|
||||
"server_id": "10.246.246.76"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"para_plane_nic_location": "device",
|
||||
"para_plane_nic_name": [
|
||||
"eth0",
|
||||
"eth1",
|
||||
"eth2",
|
||||
"eth3"
|
||||
],
|
||||
"para_plane_nic_num": "4",
|
||||
"status": "completed"
|
||||
}
|
||||
+109
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"board_id": "0x002f",
|
||||
"chip_info": "910",
|
||||
"deploy_mode": "lab",
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"device_num": "8",
|
||||
"server_num": "1",
|
||||
"group_name": "",
|
||||
"instance_count": "8",
|
||||
"instance_list": [
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.100.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "0",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "1",
|
||||
"device_ip": "192.168.101.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "1",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "2",
|
||||
"device_ip": "192.168.102.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "2",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "3",
|
||||
"device_ip": "192.168.103.101"
|
||||
}
|
||||
],
|
||||
"rank_id": "3",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "4",
|
||||
"device_ip": "192.168.100.100"
|
||||
}
|
||||
],
|
||||
"rank_id": "4",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "5",
|
||||
"device_ip": "192.168.101.100"
|
||||
}
|
||||
],
|
||||
"rank_id": "5",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "6",
|
||||
"device_ip": "192.168.102.100"
|
||||
}
|
||||
],
|
||||
"rank_id": "6",
|
||||
"server_id": "10.246.246.76"
|
||||
},
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "7",
|
||||
"device_ip": "192.168.103.100"
|
||||
}
|
||||
],
|
||||
"rank_id": "7",
|
||||
"server_id": "10.246.246.76"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"para_plane_nic_location": "device",
|
||||
"para_plane_nic_name": [
|
||||
"eth0",
|
||||
"eth1",
|
||||
"eth2",
|
||||
"eth3",
|
||||
"eth4",
|
||||
"eth5",
|
||||
"eth6",
|
||||
"eth7"
|
||||
],
|
||||
"para_plane_nic_num": "8",
|
||||
"status": "completed"
|
||||
}
|
||||
+40
@@ -0,0 +1,40 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
from torchvision import transforms
|
||||
import torchvision.models as models
|
||||
|
||||
"""
|
||||
alexnet | densenet121 |
|
||||
densenet161 | densenet169 | densenet201 |
|
||||
resnet101 | resnet152 | resnet18 | resnet34 |
|
||||
resnet50 | squeezenet1_0 | squeezenet1_1 | vgg11 |
|
||||
vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19 |
|
||||
mobilenet_v2 | shufflenet_v2_x0_5 |
|
||||
vgg19_bn (default: resnet18)
|
||||
"""
|
||||
model_name='densenet121'
|
||||
model = models.__dict__[model_name]()
|
||||
|
||||
img = torch.rand(size=(1,3,224,224))
|
||||
|
||||
#print(model(img))
|
||||
|
||||
labels = torch.rand(size=(1,))
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
with torch.autograd.profiler.profile(record_shapes=True) as prof:
|
||||
outputs = model(img)
|
||||
loss = criterion(outputs, labels)
|
||||
with torch.autograd.profiler.record_function("label-bp"):
|
||||
loss.backward()
|
||||
|
||||
#print(prof.key_averages().table())
|
||||
print(prof)
|
||||
prof.export_chrome_trace(model_name + ".prof")
|
||||
|
||||
|
||||
with SummaryWriter(os.path.join('runs',model_name)) as w:
|
||||
w.add_graph(model, img)
|
||||
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
# Export the Ascend library, compiler, operator and plugin paths, then run the
# CPU model-inspection script (net_show_cpu.py).
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/opp
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
# NOTE(review): this PYTHONPATH replaces any inherited value and its last
# entry is the bare /usr/local -- confirm the line was not truncated.
export PYTHONPATH=/usr/local/Ascend/atc/python/site-packages/te.egg:/usr/local/Ascend/atc/python/site-packages/topi.egg:/usr/local/Ascend/atc/python/site-packages/auto_tune.egg:/usr/local/Ascend/atc/python/site-packages/schedule_search.egg:/usr/local
export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ops/framework/built-in/tensorflow
export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
export PLUGIN_LOAD_PATH=/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/librts_engine.so

#export DEVICE_ID=0
#export SLOG_PRINT_TO_STDOUT=1

#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"

# Only the CPU inspection script is enabled; the other entry points are kept
# commented out.
#python3 pytorch-benchmark-resnet50.py
python3 net_show_cpu.py
#python3 pytorch-resnet50-profiling.py
|
||||
#python3 pytorch-resnet50-profiling.py
|
||||
|
||||
|
||||
+51
@@ -0,0 +1,51 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
from torchvision import transforms
|
||||
import torchvision.models as models
|
||||
|
||||
CALCULATE_DEVICE = "npu:0"
|
||||
torch.npu.set_device(CALCULATE_DEVICE)
|
||||
|
||||
"""
|
||||
alexnet | densenet121 |
|
||||
densenet161 | densenet169 | densenet201 |
|
||||
resnet101 | resnet152 | resnet18 | resnet34 |
|
||||
resnet50 | squeezenet1_0 | squeezenet1_1 | vgg11 |
|
||||
vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19 |
|
||||
mobilenet_v2 | shufflenet_v2_x0_5 |
|
||||
vgg19_bn (default: resnet18)
|
||||
"""
|
||||
|
||||
img = torch.rand(size=(1,3,224,224),dtype=torch.float32).to(CALCULATE_DEVICE, non_blocking=True)
|
||||
print("img prepared")
|
||||
|
||||
model_name='densenet121'
|
||||
model = models.__dict__[model_name]().to(CALCULATE_DEVICE)
|
||||
model.train()
|
||||
print("model prepared")
|
||||
|
||||
outputs = model(img)
|
||||
print("cal done, results is {}".format(outputs))
|
||||
|
||||
labels=torch.rand(size=(1,)).to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
|
||||
criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE)
|
||||
with torch.autograd.profiler.profile(record_shapes=True,use_npu=True) as prof:
|
||||
outputs = model(img)
|
||||
print("output ok")
|
||||
loss = criterion(outputs, labels)
|
||||
print("loss ok")
|
||||
with torch.autograd.profiler.record_function("label-bp"):
|
||||
loss.backward()
|
||||
|
||||
#print(prof.key_averages().table())
|
||||
print(prof)
|
||||
prof.export_chrome_trace(model_name + ".prof")
|
||||
|
||||
|
||||
# with SummaryWriter(os.path.join('runs',model_name)) as w:
|
||||
# w.add_graph(model, img)
|
||||
# print("tenorboard add graph ok")
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{
|
||||
"device": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.10.103",
|
||||
"rank_id": "0"
|
||||
}],
|
||||
"server_id": "127.0.0.1"
|
||||
}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+9
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{
|
||||
"device": [{devices}],
|
||||
"server_id": "127.0.0.1"
|
||||
}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+52
@@ -0,0 +1,52 @@
|
||||
# main env
# Exports the Python, library, compiler, operator and plugin paths from the
# ascend-toolkit "latest" install tree, plus runtime switches.
export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
export PLUGIN_LOAD_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/librts_engine.so
export TASK_QUEUE_ENABLE=0
# NOTE(review): unlike the "latest" paths above, this one is pinned to toolkit
# version 20.10.0.B022 and the arm64 build -- confirm it exists on the target host.
export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ascend-toolkit/20.10.0.B022/arm64-linux_gcc7.3.0/opp/framework/built-in/tensorflow/


export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export GEN_TO_SOURCE=1
|
||||
|
||||
|
||||
|
||||
|
||||
#export LD_LIBRARY_PATH=/usr/local/OpenBLAS/lib/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu/
|
||||
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
|
||||
#export ASCEND_OPP_PATH=/usr/local/Ascend/opp
|
||||
|
||||
|
||||
#export DDK_VERSION_FLAG=1.60.T17.B830
|
||||
#export NEW_GE_FE_ID=1
|
||||
#export GE_AICPU_FLAG=1
|
||||
#export SOC_VERSION=Ascend910
|
||||
|
||||
#export DUMP_GE_GRAPH=2
|
||||
|
||||
|
||||
#export DEVICE_ID=0
|
||||
#export DEVICE_INDEX=0
|
||||
|
||||
#export PRINT_MODEL=0
|
||||
#export ENABLE_DATA_PRE_PROC=1
|
||||
#export RANK_ID=0
|
||||
#export RANK_SIZE=1
|
||||
#export JOB_ID=10087
|
||||
#export FUSION_TENSOR_SIZE=1000000000
|
||||
#PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/atc/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
|
||||
|
||||
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
|
||||
|
||||
|
||||
#export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ascend-toolkit/20.10.0.B023/arm64-linux_gcc7.3.0/opp/framework/built-in/tensorflow/
|
||||
#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
|
||||
#export PLUGIN_LOAD_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/librts_engine.so
|
||||
#export WHICH_OP=GEOP
|
||||
#export NEW_GE_FE_ID=1
|
||||
#export GE_AICPU_FLAG=1
|
||||
|
||||
+9
@@ -0,0 +1,9 @@
|
||||
# Ascend toolkit environment: install root, library/Python/compiler paths and
# runtime switches.
export ASCEND_HOME=/usr/local/Ascend
# This LD_LIBRARY_PATH replaces any inherited value (no $LD_LIBRARY_PATH suffix).
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# NOTE(review): $currentDir is not set in this script -- presumably provided by
# the script that sources it; if unset, the trailing entry is empty.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

export SLOG_PRINT_TO_STDOUT=0

export TASK_QUEUE_ENABLE=0
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
############## toolkit situation ################
|
||||
#export ASCEND_HOME=/usr/local/Ascend
|
||||
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
|
||||
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl
|
||||
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
|
||||
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
|
||||
|
||||
############## nnae situation ################
# Active configuration: every Ascend path points into the nnae "latest"
# install tree.
export ASCEND_HOME=/usr/local/Ascend
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/local/python3.7.5/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# This PYTHONPATH replaces any inherited value (no $PYTHONPATH reference).
export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/

# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/topi-0.4.0-py3-none-any.whl
# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/te-0.4.0-py3-none-any.whl

export SLOG_PRINT_TO_STDOUT=0
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"

export TASK_QUEUE_ENABLE=0
|
||||
+31
@@ -0,0 +1,31 @@
|
||||
############## toolkit situation ################
|
||||
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
|
||||
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
|
||||
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
|
||||
#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
|
||||
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
|
||||
|
||||
|
||||
############## nnae situation ################
|
||||
|
||||
|
||||
# Use the nnae install tree when it is present, otherwise fall back to the
# ascend-toolkit tree. Both branches export the same set of variables.
if [ -d /usr/local/Ascend/nnae/latest ];then
    # Multiarch directory is /usr/lib/aarch64-linux-gnu, matching the else
    # branch (previously misspelled as aarch64_64-linux-gnu).
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
else
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
fi

# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/

export SLOG_PRINT_TO_STDOUT=0
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"

export TASK_QUEUE_ENABLE=1
|
||||
@@ -0,0 +1,22 @@
|
||||
# Environment setup and launch command for a single-device DenseNet121
# evaluation run (the python call below passes --evaluate and --resume).

export ASCEND_HOME=/usr/local/Ascend
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# NOTE(review): $currentDir is never assigned in this script, so the last
# PYTHONPATH entry expands to an empty string unless the caller exports it.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

export SLOG_PRINT_TO_STDOUT=0
# Runs the `adc` tool as user HwHiAiUser against device 7.
# NOTE(review): presumably this sets the device log level to "error" - confirm.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 7"

export TASK_QUEUE_ENABLE=0
# Pin the process to CPU cores 111-150 and evaluate on NPU 7 using the
# weights in checkpoint.pth.tar; all arguments here are hard-coded.
taskset -c 111-150 python3 densenet121_1p_main.py \
    --workers 40 \
    --arch densenet121 \
    --npu 7 \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --batch-size 256 \
    --epoch 90 \
    --evaluate \
    --resume checkpoint.pth.tar \
    --data /opt/npu/dataset/imagenet
|
||||
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash
# Launcher for a DenseNet121 PyTorch benchmark job.
#
# Positional arguments:
#   $1  rank_size      number of devices to use
#   $2  yamlPath       YAML config file; its "pytorch_config" section is read
#   $3  toolsPath      directory that contains get_params_for_yaml.sh
#   $4  CLUSTER        read only inside docker; "True" selects the mpirun path
#   $5  MPIRUN_ALL_IP  read only inside docker; whitespace-separated node IPs

rank_size=$1
yamlPath=$2
toolsPath=$3

# Parent directory of the directory that holds this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)
# NOTE(review): model_name is not referenced again in this script.
model_name=$(cd $currentDir/..;basename `pwd`)
if [ -f /.dockerenv ];then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi
# Load configuration from the YAML file: the helper's output is eval'ed, so it
# is expected to print shell variable assignments.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")

# Remove old logs
rm -rf /var/log/npu/slog/host-0/*
rm -rf ${currentDir}/result/*.log

# Create the timestamped directory for this training job
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
# Device list: when no device_group_<rank_size>p variable is set (or
# rank_size >= 8), select devices 0..rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# Take the first character of the space-stripped device list as the first
# device id; the hw log symlink below points into the directory named after it.
# NOTE(review): this only yields a full id when device ids are single digits.
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`

if [ x"${CLUSTER}" == x"True" ];then
    this_ip=$(hostname -I |awk '{print $1}')
    # Link the log written under device directory "0" into the job directory.
    ln -snf ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/0/hw_densenet121.log ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
    # Copy the config files to every other node.
    # NOTE(review): jsonFilePath and mpirun_ip are not assigned in this script;
    # presumably they come from the YAML eval above - confirm.
    for ip in $MPIRUN_ALL_IP;do
        if [ x"$ip" != x"$this_ip" ];then
            scp $yamlPath root@$ip:$yamlPath
            scp ${jsonFilePath} root@$ip:${jsonFilePath}
        fi
    done
    export PATH=$PATH:/usr/local/mpirun4.0/bin
    mpirun -H ${mpirun_ip} \
        --bind-to none -map-by slot\
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
        --prefix /usr/local/mpirun4.0/ \
        ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
    # Single-node path: one background train.sh invocation with rank id 0.
    rank_id=0
    #for device_id in $device_group;do
    ln -snf ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/${first_device_id}/hw_densenet121.log ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
    # let rank_id++
    # done
fi
# Block until the background job (if any) has finished.
wait
|
||||
|
||||
|
||||
+141
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env bash
# Per-process training/evaluation driver for DenseNet121 (PyTorch).
#
# Positional arguments:
#   $1  device_id   device id exported as DEVICE_ID
#   $2  rank_size   number of devices
#   $3  yamlPath    YAML config file; its "pytorch_config" section is read
#   $4  currtime    timestamp naming the job directory
#   $5  toolsPath   directory that contains get_params_for_yaml.sh
#   $6  either the literal "True" (cluster run) or the rank id

device_id=$1
rank_size=$2
yamlPath=$3

currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
export YAML_PATH=$3
mkdir -p ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/

# Load configuration from the YAML file (helper output is eval'ed).
# NOTE(review): mode, data_url, batch_size, epoches, lr, device_single and
# device_group_multi are used below but never assigned here; presumably they
# come from this eval - confirm.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")

export REMARK_LOG_FILE=hw_densenet121.log # name of the benchmark marker log; must be "hw_" followed by the lowercase model name
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}


#source ${currentDir}/config/npu_set_env.sh
source ${currentDir}/config/set_env_b023.sh
# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export HCCL_RANK_TABLE_PATH=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# NOTE(review): RANK_INDEX is not assigned in this script; an unset variable
# evaluates to 0 in arithmetic expansion, giving DEVICE_INDEX == DEVICE_ID.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}

cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}

if [ x"$6" != x"True" ];then
    # Non-cluster run: the sixth argument is the rank id.
    rank_id=$6
    export RANK_ID=$6
else
    # Cluster run: query atlasboost for the rank; the last word of the captured
    # output is taken as the rank id.
    # NOTE(review): the device id is parsed from text of the form
    # "deviceid = <N> phyid=", presumably printed by atlasboost - confirm.
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    # Copy the *.json found in the job directory over the rank table config.
    hccljson=${train_job_dir}/*.json
    cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi

# Create and enter the per-device execution directory
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}

startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`

# Choose launch arguments for single-device vs multi-device runs
if [ x"$6" == x"True" ];then
    # multi-node, multi-device
    export CLUSTER=True
fi

if [ x"${mode}" == x"evaluate" ];then
    # Evaluation uses fixed arguments (NPU 7, CPU cores 111-150, batch 256)
    # instead of the YAML values; only the dataset path comes from data_url.
    taskset -c 111-150 python3.7 ${currentDir}/code/densenet121_1p_main.py \
        --workers 40 \
        --arch densenet121 \
        --npu 7 \
        --lr 0.1 \
        --momentum 0.9 \
        --amp \
        --batch-size 256 \
        --epoch 90 \
        --evaluate \
        --resume checkpoint.pth.tar \
        --data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1


elif [ x"${rank_size}" == x"1" ];then
    # single device
    #source ${currentDir}/config/set_env_b023.sh

    taskset -c 1-40 python3.7 ${currentDir}/code/densenet121_1p_main.py \
        --workers 40 \
        --arch densenet121 \
        --npu ${device_single} \
        --lr 0.1 \
        --momentum 0.9 \
        --amp \
        --batch-size ${batch_size} \
        --epoch ${epoches} \
        --data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1

elif [ ${rank_size} -le 8 ];then
    # single node, multiple devices
    #source ${currentDir}/config/set_env_b023.sh
    python3.7 ${currentDir}/code/densenet121_8p_main.py \
        --addr=$(hostname -I |awk '{print $1}') \
        --seed 49 \
        --workers 160 \
        --lr ${lr} \
        --print-freq 1 \
        --eval-freq 5\
        --arch densenet121 \
        --dist-url 'tcp://127.0.0.1:50000' \
        --dist-backend 'hccl' \
        --multiprocessing-distributed \
        --world-size 1 \
        --batch-size ${batch_size} \
        --epochs ${epoches} \
        --rank 0 \
        --amp \
        --benchmark 0 \
        --device-list ${device_group_multi} \
        --data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1

fi

#taskset -c 0-20 python3.7 ${currentDir}/code/densenet121.py > ./train.log 2>&1

# $? is the exit status of the if/elif chain above, i.e. of the python command
# that ran; it is also 0 when no branch matched.
if [ $? -eq 0 ];then
    echo ":::ABK 1.0.0 densenet121 train success"
    echo ":::ABK 1.0.0 densenet121 train success" >> ${train_job_dir}/train_${rank_size}p.log
    echo ":::ABK 1.0.0 densenet121 train success" >> ./hw_densenet121.log
else
    echo ":::ABK 1.0.0 densenet121 train failed"
    echo ":::ABK 1.0.0 densenet121 train failed" >> ${train_job_dir}/train_${rank_size}p.log
    echo ":::ABK 1.0.0 densenet121 train failed" >> ./hw_densenet121.log
fi

# Record the wall-clock duration as hours:minutes:seconds.
endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
sumTime=$[ $endTime_s - $startTime_s ]
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 densenet121 train total time: ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_densenet121.log
|
||||
@@ -0,0 +1,46 @@
|
||||
# DenseNet121_tensorflow训练说明
|
||||
|
||||
### 1. 模型训练参数配置
|
||||
|
||||
在train/yaml/DenseNet121.yaml中修改相应配置, 配置项含义:
|
||||
|
||||
```
|
||||
tensorflow_config:
|
||||
# 基本参数
|
||||
data_url: 数据集路径
|
||||
epoches: 跑多少个epoch
|
||||
epochs_between_evals: 1
|
||||
batch_size: 32
|
||||
log_dir: ./ckpt
|
||||
|
||||
# 1p参数
|
||||
mode_1p: train # train、evaluate、train_and_evaluate三种模式
|
||||
max_train_steps_1p: 100
|
||||
iterations_per_loop_1p: 10
|
||||
display_every: 10
|
||||
log_name_1p: densenet121_1p.log
|
||||
|
||||
# 8p参数
|
||||
mode_8p: train_and_evaluate # train、evaluate、train_and_evaluate三种模式
|
||||
iterations_per_loop_8p: 5004
|
||||
lr: 0.1
|
||||
log_name_8p: densenet121_8p.log
|
||||
|
||||
mpirun_ip: 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
|
||||
docker_image: docker 镜像名称:版本号
|
||||
|
||||
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
|
||||
device_group_1p: 0
|
||||
device_group_2p: 0 1
|
||||
device_group_4p: 0 1 2 3
|
||||
```
|
||||
|
||||
------
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
import tensorflow as tf
|
||||
import os,sys
|
||||
|
||||
|
||||
class CreateSession():
    """Build the session ``ConfigProto`` and export TF tuning variables.

    After construction ``estimator_config`` holds the config object and the
    environment variables written by :meth:`set_env` are in place.
    """

    def __init__(self):
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            intra_op_parallelism_threads=10,
            inter_op_parallelism_threads=10,
        )
        session_config.gpu_options.allow_growth = True
        self.estimator_config = session_config

        self.set_env()

    def set_env(self):
        """Write the four ``TF_*`` tuning variables into ``os.environ``."""
        tuning_vars = (
            ('TF_GPU_THREAD_MODE', 'gpu_private'),
            ('TF_GPU_THREAD_COUNT', str(2)),
            ('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', '1'),
            ('TF_ENABLE_WINOGRAD_NONFUSED', '1'),
        )
        for key, value in tuning_vars:
            os.environ[key] = value
|
||||
|
||||
+133
@@ -0,0 +1,133 @@
|
||||
import numpy as np
|
||||
import preprocessing
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.util import nest
|
||||
import os,sys
|
||||
import numpy as np
|
||||
|
||||
|
||||
class DataLoader:
    """Locate the train/validation record shards and build input datasets.

    On construction the shard lists are globbed from ``args.data_dir`` and
    the counts returned by ``get_num_records`` are stored both on the loader
    and on ``args`` (``num_training_samples`` / ``num_evaluating_samples``).
    """

    def __init__(self, args):
        self.args = args
        shard_pattern = os.path.join(args.data_dir, '%s-*')

        train_shards = sorted(tf.gfile.Glob(shard_pattern % 'train'))
        n_train = get_num_records(train_shards)
        self.num_training_samples = n_train
        self.args.num_training_samples = n_train

        val_shards = sorted(tf.gfile.Glob(shard_pattern % 'validation'))
        n_val = get_num_records(val_shards)
        self.num_evaluating_samples = n_val
        self.args.num_evaluating_samples = n_val

        print( 'total num_training_sampels: %d' % n_train )
        print( 'total num_evaluating_sampels: %d' % n_val )

        # No per-rank split is applied here: every rank gets the full count.
        self.training_samples_per_rank = n_train

    def get_train_input_fn(self):
        """Return the training dataset built by ``make_dataset``."""
        return make_dataset(self.args, self.training_samples_per_rank,
                            self.args.batch_size, training=True)

    def get_eval_input_fn(self):
        """Return the validation dataset built by ``make_dataset``."""
        return make_dataset(self.args, self.num_evaluating_samples,
                            self.args.batch_size, training=False)
|
||||
|
||||
|
||||
def get_num_records(filenames):
    """Estimate the total number of records in a list of TFRecord files.

    Only the first and the last file are actually read: the result is
    ``count(first) * (len(filenames) - 1) + count(last)``, i.e. every file
    except the last is assumed to hold as many records as the first one.

    Args:
        filenames: sequence of TFRecord file paths.

    Returns:
        The estimated record count, or 0 when ``filenames`` is empty (for
        example when the glob for a split matched nothing) instead of
        failing with ``IndexError``.
    """
    if not filenames:
        return 0

    def count_records(tf_record_filename):
        # Full pass over one file; the iterator yields one item per record.
        return sum(1 for _ in tf.python_io.tf_record_iterator(tf_record_filename))

    nfile = len(filenames)
    return (count_records(filenames[0]) * (nfile - 1) +
            count_records(filenames[-1]))
|
||||
|
||||
|
||||
def _parse_example_proto(example_serialized):
    """Parse one serialized Example into (encoded image, label, bbox).

    Returns:
        A tuple of the ``image/encoded`` string feature, the class label
        cast to int32, and the bounding boxes as a tensor laid out as
        ``[1, num_boxes, 4]`` with coordinates ordered (ymin, xmin, ymax, xmax).
    """
    bbox_prefix = 'image/object/bbox/'

    def _scalar(dtype, default):
        return tf.FixedLenFeature([], dtype=dtype, default_value=default)

    feature_map = {
        'image/encoded': _scalar(tf.string, ''),
        'image/class/label': _scalar(tf.int64, -1),
        'image/class/text': _scalar(tf.string, ''),
    }
    # Sparse features in Example proto: one shared spec for all four
    # coordinate lists.
    sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
    for coord in ('xmin', 'ymin', 'xmax', 'ymax'):
        feature_map[bbox_prefix + coord] = sparse_float32

    features = tf.parse_single_example(example_serialized, feature_map)
    label = tf.cast(features['image/class/label'], dtype=tf.int32)

    # Each coordinate list becomes a [1, num_boxes] row.
    rows = {}
    for coord in ('xmin', 'ymin', 'xmax', 'ymax'):
        rows[coord] = tf.expand_dims(features[bbox_prefix + coord].values, 0)

    # Rows are stacked in (y, x) order, then reshaped from
    # [4, num_boxes] to [1, num_boxes, 4].
    stacked = tf.concat([rows['ymin'], rows['xmin'], rows['ymax'], rows['xmax']], 0)
    bbox = tf.transpose(tf.expand_dims(stacked, 0), [0, 2, 1])

    return features['image/encoded'], label, bbox
|
||||
|
||||
|
||||
# since the preprocessing is done here, we add args file
|
||||
def parse_record(raw_record, is_training):
    """Turn one serialized record into an ``(image, label)`` pair.

    The record is parsed with ``_parse_example_proto`` and the encoded image
    is run through ``preprocessing.parse_and_preprocess_image_record``.

    Returns:
        The preprocessed image and the stored label minus one.
        NOTE(review): the shift presumably maps 1-based stored labels onto
        0-based class indices - confirm against the dataset.
    """
    encoded_image, label, bbox = _parse_example_proto(raw_record)
    image = preprocessing.parse_and_preprocess_image_record(
        encoded_image, bbox, training=is_training)
    shifted_label = label - 1
    return image, shifted_label
|
||||
|
||||
|
||||
def make_dataset(args, take_count, batch_size,
                 training=False, shard=False):
    """Build the batched ``tf.data`` pipeline for one split.

    Args:
        args: object exposing ``data_dir``, the directory with the
            ``train-*`` / ``validation-*`` record files.
        take_count: for evaluation only, passed to ``take`` on the dataset
            of *file names* (before records are read).
            NOTE(review): callers pass a sample count, so this limits files,
            not samples - confirm the intent.
        batch_size: batch size; incomplete final batches are dropped.
        training: selects the train split plus shuffling and repetition.
        shard: accepted for interface compatibility; not used.

    Returns:
        A dataset of ``(image, label)`` batches produced by ``parse_record``.

    The shuffle seeds depend on the ``DEVICE_INDEX`` environment variable;
    it defaults to 0 when unset. The previous unused ``RANK_SIZE`` read was
    removed because it raised ``TypeError`` whenever that variable was unset.
    """
    shuffle_buffer_size = 10000
    num_readers = 10

    rank_id = int(os.getenv('DEVICE_INDEX', '0'))

    split = 'train' if training else 'validation'
    filename_pattern = os.path.join(args.data_dir, '%s-*')
    filenames = sorted(tf.gfile.Glob(filename_pattern % split))

    ds = tf.data.Dataset.from_tensor_slices(filenames)

    if not training:
        ds = ds.take(take_count)

    if training:
        # File-level shuffle with a rank-dependent seed.
        ds = ds.shuffle(1000, seed=7*(1+rank_id))

    # Read num_readers files at a time, one record from each in turn.
    ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)
    counter = tf.data.Dataset.range(sys.maxsize)
    ds = tf.data.Dataset.zip((ds, counter))

    if training:
        # Record-level shuffle, repeated indefinitely.
        ds = ds.apply(tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size, seed=5*(1+rank_id)))

    # The counter component is dropped here; only the record is parsed.
    ds = ds.map(lambda image, counter: parse_record(image, training), num_parallel_calls=14)

    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
|
||||
|
||||
|
||||
+158
@@ -0,0 +1,158 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.layers import batch_norm, flatten
|
||||
from tensorflow.contrib.framework import arg_scope
|
||||
import numpy as np
|
||||
|
||||
# Network hyper-parameters shared by the layer builders in this module.
class_num = 1000  # number of output units of the final dense layer (see Linear)
nb_blocks = 4  # number of dense blocks built by Dense_net
nb_blocks_layers = (6, 12, 24, 16)  # bottleneck layers per dense block
bn_size = 4  # the 1x1 bottleneck conv uses growth_rate * bn_size filters
growth_rate = 32  # filters produced by the 3x3 conv of each bottleneck layer
init_layers = 64  # filters of the initial 7x7 convolution
|
||||
|
||||
|
||||
'''
|
||||
denseNet:121,169,201,264
|
||||
return _densenet('densenet121', 32, (6, 12, 24, 16), 64, pretrained, progress,
|
||||
**kwargs)
|
||||
return _densenet('densenet161', 48, (6, 12, 36, 24), 96, pretrained, progress,
|
||||
**kwargs)
|
||||
return _densenet('densenet169', 32, (6, 12, 32, 32), 64, pretrained, progress,
|
||||
**kwargs)
|
||||
return _densenet('densenet201', 32, (6, 12, 48, 32), 64, pretrained, progress,
|
||||
**kwargs)
|
||||
'''
|
||||
|
||||
|
||||
|
||||
def conv_layer(input, filter, kernel, stride=1, layer_name="conv"):
    """Apply a bias-free, SAME-padded 2-D convolution under ``layer_name``.

    Weights use variance-scaling initialization (scale=5.0, mode='fan_out').
    """
    weight_init = tf.initializers.variance_scaling(scale=5.0, mode='fan_out')
    with tf.name_scope(layer_name):
        return tf.layers.conv2d(
            inputs=input,
            filters=filter,
            kernel_size=kernel,
            strides=stride,
            padding='SAME',
            use_bias=False,
            kernel_initializer=weight_init,
        )
|
||||
|
||||
def Global_Average_Pooling(x, stride=1):
    """Average-pool ``x`` with a window equal to its dimensions 1 and 2."""
    dims = np.shape(x)
    window = [dims[1], dims[2]]
    # The window covers the whole map, so the stride value does not matter.
    return tf.layers.average_pooling2d(inputs=x, pool_size=window, strides=stride)
|
||||
#It is global average pooling without tflearn
|
||||
|
||||
|
||||
#return global_avg_pool(x, name='Global_avg_pooling')
|
||||
# But maybe you need to install h5py and curses or not
|
||||
|
||||
|
||||
def Batch_Normalization(x, training, scope):
    """Batch-normalize ``x`` under ``scope``, branching on ``training``.

    ``training`` is cast to a boolean tensor and drives a ``tf.cond``: the
    first branch calls ``batch_norm`` with ``reuse=None``, the second with
    ``reuse=True``.
    """
    bn_defaults = dict(
        scope=scope,
        updates_collections=None,
        decay=0.9,
        center=True,
        scale=True,
        zero_debias_moving_mean=True,
    )
    with arg_scope([batch_norm], **bn_defaults):
        training = tf.cast(training, tf.bool)

        def _when_training():
            return batch_norm(inputs=x, is_training=training, reuse=None)

        def _when_not_training():
            return batch_norm(inputs=x, is_training=training, reuse=True)

        return tf.cond(training, _when_training, _when_not_training)
|
||||
|
||||
def Drop_out(x, rate, training) :
    """Apply ``tf.layers.dropout`` to ``x`` with the given rate and mode."""
    dropped = tf.layers.dropout(inputs=x, rate=rate, training=training)
    return dropped
|
||||
|
||||
def Relu(x):
    """Return the element-wise ReLU of ``x``."""
    activated = tf.nn.relu(x)
    return activated
|
||||
|
||||
def Average_pooling(x, pool_size=(2, 2), stride=2, padding='VALID'):
    """Apply 2-D average pooling to ``x``.

    The default window is an immutable tuple rather than a list, so the
    default cannot be mutated and shared across calls; callers may still
    pass a list.
    """
    return tf.layers.average_pooling2d(inputs=x, pool_size=pool_size, strides=stride, padding=padding)
|
||||
|
||||
|
||||
def Max_Pooling(x, pool_size=(3, 3), stride=2, padding='VALID'):
    """Apply 2-D max pooling to ``x``.

    The default window is an immutable tuple rather than a list, so the
    default cannot be mutated and shared across calls; callers may still
    pass a list.
    """
    return tf.layers.max_pooling2d(inputs=x, pool_size=pool_size, strides=stride, padding=padding)
|
||||
|
||||
def Concatenation(layers):
    """Concatenate the tensors in ``layers`` along axis 3."""
    merged = tf.concat(layers, axis=3)
    return merged
|
||||
|
||||
def Linear(x):
    """Project ``x`` to ``class_num`` units with a dense layer named 'linear'."""
    logits = tf.layers.dense(inputs=x, units=class_num, name='linear')
    return logits
|
||||
|
||||
|
||||
def bottleneck_layer(x, is_training, scope):
    """Run two BN -> ReLU -> conv stages on ``x``.

    Stage 1 is a 1x1 convolution with ``growth_rate * bn_size`` filters,
    stage 2 a 3x3 convolution with ``growth_rate`` filters.
    """
    stages = (
        ('_batch1', '_conv1', growth_rate * bn_size, [1, 1]),
        ('_batch2', '_conv2', growth_rate, [3, 3]),
    )
    with tf.name_scope(scope):
        for bn_suffix, conv_suffix, n_filters, kernel in stages:
            x = Batch_Normalization(x, training=is_training, scope=scope + bn_suffix)
            x = Relu(x)
            x = conv_layer(x, filter=n_filters, kernel=kernel, layer_name=scope + conv_suffix)
        return x
|
||||
|
||||
def transition_layer(x, is_training, scope):
    """Halve the channel count and the spatial size between dense blocks.

    Applies BN -> ReLU -> 1x1 convolution with half the input channels,
    followed by 2x2 average pooling with stride 2.
    """
    with tf.name_scope(scope):
        x = Batch_Normalization(x, training=is_training, scope=scope+'_batch1')
        x = Relu(x)

        # https://github.com/taki0112/Densenet-Tensorflow/issues/10
        in_channel = int(x.shape[-1])
        # Integer division keeps the filter count an int; the former
        # `in_channel*0.5` handed a float to conv2d. Results are identical
        # for even channel counts.
        x = conv_layer(x, filter=in_channel // 2, kernel=[1,1], layer_name=scope+'_conv1')
        x = Average_pooling(x, pool_size=[2,2], stride=2)

        return x
|
||||
|
||||
def dense_block(input_x, nb_layers, is_training, layer_name):
    """Stack ``nb_layers`` bottleneck layers with dense connectivity.

    The first bottleneck reads ``input_x`` directly; each later one reads
    the concatenation of the input and all earlier bottleneck outputs. The
    concatenation of everything is returned.
    """
    with tf.name_scope(layer_name):
        features = [input_x]
        features.append(
            bottleneck_layer(input_x, is_training, scope=layer_name + '_bottleN_' + str(0)))

        for layer_idx in range(1, nb_layers):
            stacked = Concatenation(features)
            features.append(
                bottleneck_layer(stacked, is_training,
                                 scope=layer_name + '_bottleN_' + str(layer_idx)))

        return Concatenation(features)
|
||||
|
||||
def Dense_net(input_x, is_training):
    """Build the DenseNet graph and return the ``Linear`` layer output.

    Layout: 7x7/2 convolution and 3x3/2 max pooling, then
    ``nb_blocks - 1`` (dense block, transition) pairs, a final dense block,
    and a BN -> ReLU -> global average pool -> flatten -> dense head.
    """
    net = conv_layer(input_x, filter=init_layers , kernel=[7,7], stride=2, layer_name='conv0')
    net = Max_Pooling(net, pool_size=[3,3], stride=2)

    last_block = nb_blocks - 1
    for block_idx in range(last_block):
        depth = nb_blocks_layers[block_idx]
        net = dense_block(input_x=net, nb_layers=depth, is_training=is_training,
                          layer_name='dense_' + str(block_idx))
        net = transition_layer(net, is_training, scope='trans_' + str(block_idx))

    # The last block has no transition after it.
    net = dense_block(input_x=net, nb_layers=nb_blocks_layers[last_block],
                      is_training=is_training, layer_name='dense_final')

    # Classification head
    net = Batch_Normalization(net, training=is_training, scope='linear_batch')
    net = Relu(net)
    net = Global_Average_Pooling(net)
    net = flatten(net)
    return Linear(net)
|
||||
+44
@@ -0,0 +1,44 @@
|
||||
import tensorflow as tf
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
def _linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
    """Return the rate at ``current_step`` of a linear ramp init_lr -> base_lr."""
    lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps)
    return float(init_lr) + lr_inc * current_step


def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch, T_max, eta_min=0):
    """Build a per-step learning-rate table: linear warm-up, then cosine decay.

    The module previously called ``linear_warmup_lr`` without defining it,
    so any schedule with ``warmup_epochs > 0`` raised ``NameError``; the
    ramp is now provided by the private helper above.

    Args:
        lr: base learning rate reached at the end of warm-up.
        steps_per_epoch: number of optimizer steps per epoch.
        warmup_epochs: epochs spent ramping linearly from 0 to ``lr``.
        max_epoch: total number of epochs covered by the table.
        T_max: cosine period in epochs.
        eta_min: lower bound of the cosine curve.

    Returns:
        A float32 NumPy array of length ``int(max_epoch * steps_per_epoch)``.
        After warm-up the rate changes once per epoch, not once per step.
    """
    base_lr = lr
    warmup_init_lr = 0
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)

    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            step_lr = _linear_warmup_lr(i + 1, warmup_steps, base_lr, warmup_init_lr)
        else:
            # The cosine term uses the epoch index, so it is piecewise constant.
            last_epoch = i // steps_per_epoch
            step_lr = eta_min + (base_lr - eta_min) * (1. + math.cos(math.pi*last_epoch / T_max)) / 2
        lr_each_step.append(step_lr)

    return np.array(lr_each_step).astype(np.float32)
|
||||
|
||||
|
||||
class HyperParams:
    """Derive step counts from ``args`` and expose the learning-rate tensor.

    Construction writes ``nsteps_per_epoch`` and ``nstep`` back onto
    ``args`` and precomputes a cosine schedule spanning ``args.T_max``
    epochs with no warm-up.
    """

    def __init__(self, args):
        self.args = args

        steps_per_epoch = args.num_training_samples // args.global_batch_size
        args.nsteps_per_epoch = steps_per_epoch

        # Epoch budget wins when set; otherwise fall back to the step budget.
        args.nstep = (steps_per_epoch * args.max_epochs
                      if args.max_epochs else args.max_train_steps)

        self.cos_lr = warmup_cosine_annealing_lr(
            args.lr, steps_per_epoch, 0, args.T_max, args.T_max, 0.0)

    def get_learning_rate(self):
        """Return the rate for the current global step, named 'learning_rate'."""
        step = tf.train.get_global_step()
        schedule = tf.convert_to_tensor(self.cos_lr)
        return tf.identity(tf.gather(schedule, step), 'learning_rate')
|
||||
|
||||
+25
@@ -0,0 +1,25 @@
|
||||
import tensorflow as tf
|
||||
#from tensorflow.contrib.hccl.python.ops import hccl_ops
|
||||
#from npu_bridge.hccl import hccl_ops
|
||||
from benchmark_log import hwlog
|
||||
|
||||
class Layers:
    """Evaluation metric helpers."""

    def get_accuracy(self, labels, predicted_classes, logits, args):
        """Return top-1 and top-5 accuracy metric ops.

        When ``args.rank_size`` is not 1 the metric value tensors are summed
        across ranks with ``hccl_ops.allreduce`` and divided by
        ``args.rank_size``; the update ops are passed through unchanged.

        Returns:
            A dict with keys 'val-top1acc' and 'val-top5acc', each mapping
            to a (value, update_op) pair.
        """
        top1 = tf.metrics.accuracy(
            labels=labels, predictions=predicted_classes)
        top5 = tf.metrics.mean(
            tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32))

        if args.rank_size != 1:
            from npu_bridge.hccl import hccl_ops
            top1_pair = (hccl_ops.allreduce(top1[0],"sum")/args.rank_size, top1[1])
            top5_pair = (hccl_ops.allreduce(top5[0],"sum")/args.rank_size, top5[1])
        else:
            top1_pair = (top1[0], top1[1])
            top5_pair = (top5[0], top5[1])

        return {'val-top1acc': top1_pair, 'val-top5acc': top5_pair}
|
||||
|
||||
|
||||
|
||||
|
||||
+92
@@ -0,0 +1,92 @@
|
||||
from __future__ import print_function
|
||||
import tensorflow as tf
|
||||
from benchmark_log import hwlog
|
||||
import logging
|
||||
import numpy as np
|
||||
import time
|
||||
import sys,os
|
||||
|
||||
class LogSessionRunHook(tf.train.SessionRunHook):
    """Session hook that logs step, epoch, throughput, losses and LR.

    A line is emitted at global step 1 and whenever the global step is a
    multiple of ``args.display_every``; the same values are also forwarded
    to ``hwlog.remark_print``.
    """

    def __init__(self, args, warmup_steps=5):
        self.global_batch_size = args.global_batch_size
        loop_iters = args.iterations_per_loop
        # Fall back to one epoch worth of steps when no loop size is given.
        self.iterations_per_loop = (loop_iters if loop_iters is not None
                                    else args.nsteps_per_epoch)
        self.warmup_steps = warmup_steps
        self.iter_times = []
        self.num_records = args.num_training_samples
        self.display_every = args.display_every
        self.logger = get_logger(args.log_name, args.log_dir)
        rank0log(self.logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))

    def after_create_session(self, session, coord):
        """Log the column header and reset the timing accumulators."""
        rank0log(self.logger, 'Step Epoch Speed Loss FinLoss LR')
        self.elapsed_secs = 0.
        self.count = 0

    def before_run(self, run_context):
        """Start the timer and request step, both losses and the LR."""
        self.t0 = time.time()
        wanted = [tf.train.get_global_step(), 'loss:0', 'total_loss:0', 'learning_rate:0']
        return tf.train.SessionRunArgs(fetches=wanted)

    def after_run(self, run_context, run_values):
        """Accumulate the run time and, on display steps, log and reset."""
        run_secs = time.time() - self.t0
        self.iter_times.append(run_secs)
        self.elapsed_secs += run_secs
        self.count += 1

        global_step, loss, total_loss, lr = run_values.results
        if global_step != 1 and global_step % self.display_every != 0:
            return

        # Mean seconds per run call since the last report.
        dt = self.elapsed_secs / self.count
        img_per_sec = self.global_batch_size * self.iterations_per_loop / dt
        epoch = global_step * self.global_batch_size / self.num_records
        self.logger.info('step:%6i epoch:%5.1f FPS:%7.1f loss:%6.3f total_loss:%6.3f lr:%7.5f' %
                         (global_step, epoch, img_per_sec, loss, total_loss, lr))
        self.elapsed_secs = 0.
        self.count = 0

        hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=int(global_step))
        hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=epoch)
        hwlog.remark_print(key=hwlog.FPS, value=img_per_sec)

    def get_average_speed(self):
        """Return batch size over mean run time, skipping warm-up entries."""
        timed = self.iter_times[self.warmup_steps:]
        return self.global_batch_size / np.mean(timed)
|
||||
|
||||
|
||||
|
||||
def rank0log(logger, *args, **kwargs):
    """Log ``args`` through ``logger`` or, without one, print them.

    With a truthy ``logger`` the positional arguments are stringified and
    joined without separator into a single ``info`` message (``kwargs`` are
    ignored); otherwise everything is forwarded to ``print``.
    """
    if not logger:
        print(*args, **kwargs)
        return
    logger.info(''.join(map(str, args)))
|
||||
|
||||
|
||||
def get_logger(log_name, log_dir):
    """Return the INFO-level logger ``log_name`` writing to console and file.

    Messages go to a stream handler and to ``<log_dir>/<log_name>``, both
    formatted as the bare message. ``logging.getLogger`` returns the same
    object for the same name, so handlers are only attached when an
    equivalent one is not already present; previously every call added
    another pair and each message was emitted once per call.

    Args:
        log_name: logger name, also used as the log file name.
        log_dir: directory for the log file; created if missing.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.INFO)  # INFO, ERROR

    # exist_ok covers several ranks creating a shared directory (e.g. on NFS).
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)

    formatter = logging.Formatter('%(message)s')
    log_path = os.path.abspath(os.path.join(log_dir, log_name))

    # FileHandler subclasses StreamHandler, so exclude it when looking for an
    # existing console handler.
    has_console = any(
        isinstance(h, logging.StreamHandler) and not isinstance(h, logging.FileHandler)
        for h in logger.handlers)
    has_file = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == log_path
        for h in logger.handlers)

    if not has_console:
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    if not has_file:
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    return logger
|
||||
|
||||
+72
@@ -0,0 +1,72 @@
|
||||
import tensorflow as tf
|
||||
from densenet import Dense_net
|
||||
|
||||
|
||||
class Model(object):
    """Estimator model function wrapper around ``Dense_net``."""

    def __init__(self, args, data, hyper_param, layers, logger):
        """Store the collaborators; nothing is built until the model_fn runs."""
        self.args = args
        self.data = data
        self.hyper_param = hyper_param
        self.layers = layers
        self.logger = logger

    def get_estimator_model_func(self, features, labels, mode, params=None):
        """Build the graph for ``mode`` and return its ``EstimatorSpec``.

        EVAL returns the cross-entropy loss together with the metrics from
        ``self.layers.get_accuracy``. TRAIN returns the total loss
        (cross-entropy plus weighted L2) and the train op. Any other mode
        fails the assertion below.
        """
        labels = tf.reshape(labels, (-1,))  # flatten labels to 1-D before one-hot encoding

        inputs = features  # TODO: Should be using feature columns?
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        inputs = tf.cast(inputs, self.args.dtype)

        top_layer = Dense_net(inputs, is_training)

        logits = top_layer
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        # The loss is always computed on float32 logits.
        logits = tf.cast(logits, tf.float32)

        labels_one_hot = tf.one_hot(labels, depth=1000)
        loss = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels_one_hot, label_smoothing=self.args.label_smoothing)


        # Named so LogSessionRunHook can fetch it as 'loss:0'.
        base_loss = tf.identity(loss, name='loss')  # For access by logger (TODO: Better way to access it?)

        # L2 penalty over every trainable variable, scaled by weight_decay.
        l2_loss = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()])
        l2_loss = tf.multiply(l2_loss, self.args.weight_decay)
        total_loss = base_loss + l2_loss

        # Named so LogSessionRunHook can fetch it as 'total_loss:0'.
        total_loss = tf.identity(total_loss, name = 'total_loss')

        if mode == tf.estimator.ModeKeys.EVAL:
            with tf.device(None):
                metrics = self.layers.get_accuracy( labels, predicted_classes, logits, self.args)

            # Evaluation reports the cross-entropy loss without the L2 term.
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        assert (mode == tf.estimator.ModeKeys.TRAIN)

        # NOTE(review): batch_size is not used after this line.
        batch_size = tf.shape(inputs)[0]

        global_step = tf.train.get_global_step()
        learning_rate = self.hyper_param.get_learning_rate()

        momentum = self.args.momentum

        opt = tf.train.MomentumOptimizer(
            learning_rate, momentum, use_nesterov=self.args.use_nesterov)

        # Wrap the optimizer in the NPU distributed optimizer.
        from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
        opt = NPUDistributedOptimizer(opt)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []

        # Ops from the UPDATE_OPS collection run before the gradient step.
        with tf.control_dependencies(update_ops):
            gate_gradients = tf.train.Optimizer.GATE_NONE
            grads_and_vars = opt.compute_gradients(total_loss, gate_gradients=gate_gradients)
            train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

        train_op = tf.group(train_op)

        return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
|
||||
|
||||
+72
@@ -0,0 +1,72 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.image.python.ops import distort_image_ops
|
||||
import math
|
||||
import random
|
||||
|
||||
def decode_jpeg(imgdata, channels=3):
    """Decode JPEG bytes using the 'INTEGER_FAST' DCT without fancy upscaling."""
    decode_options = dict(fancy_upscaling=False, dct_method='INTEGER_FAST')
    return tf.image.decode_jpeg(imgdata, channels=channels, **decode_options)
|
||||
|
||||
|
||||
def random_horizontal_flip(image, prob):
    """Flip ``image`` left-to-right with probability ``prob``, per element.

    The decision is drawn inside the graph with ``tf.random_uniform``. The
    previous Python-level ``random.random()`` was evaluated only once, when
    the graph was built, so within a ``Dataset.map`` pipeline every image
    shared the same flip decision for the whole run.

    Args:
        image: image tensor accepted by ``tf.image.flip_left_right``.
        prob: flip probability in [0, 1].

    Returns:
        Either the flipped image or ``image`` itself.
    """
    should_flip = tf.less(tf.random_uniform([], 0.0, 1.0), prob)
    return tf.cond(should_flip,
                   lambda: tf.image.flip_left_right(image),
                   lambda: image)
|
||||
|
||||
|
||||
def decode_crop_and_resize(record, bbox, size, scale, ratio):
    """Decode a random crop of a JPEG record and resize it.

    A crop window is sampled with ``tf.image.sample_distorted_bounding_box``
    from the JPEG header only; just that window is then decoded and resized.

    Args:
        record: JPEG-encoded image contents.
        bbox: bounding boxes handed to the crop sampler.
        size: output size. An int gives a square ``size x size`` image, a
            ``(height, width)`` pair gives a rectangular one, and ``None``
            keeps the historical 224 x 224 output.
        scale: ``area_range`` passed to the crop sampler.
        ratio: ``aspect_ratio_range`` passed to the crop sampler.

    Returns:
        The cropped and resized image tensor.
    """
    # Previously `size` was ignored and 224 was hard-coded; honour the
    # argument while keeping 224 as the fallback.
    if size is None:
        height = width = 224
    elif isinstance(size, (tuple, list)):
        height, width = size
    else:
        height = width = size

    with tf.name_scope('decode_crop_and_resize'):
        bbox_begin, bbox_size, _ = tf.image.sample_distorted_bounding_box(
            tf.image.extract_jpeg_shape(record),
            bounding_boxes=bbox,
            min_object_covered=0.1,
            aspect_ratio_range=ratio,
            area_range=scale,
            max_attempts=10,
            use_image_if_no_bounding_boxes=True)

        # Reassemble the bounding box in the format the crop op requires.
        offset_y, offset_x, _ = tf.unstack(bbox_begin)
        target_height, target_width, _ = tf.unstack(bbox_size)
        crop_window = tf.stack([offset_y, offset_x, target_height, target_width])

        image = tf.image.decode_and_crop_jpeg(record, crop_window, channels=3)
        image = tf.image.resize_images(image, [height, width])

        return image
|
||||
|
||||
|
||||
def parse_and_preprocess_image_record(record, bbox, training):
    """Turn an encoded JPEG record into a normalized 224x224 image.

    Training: random crop and resize, then a random horizontal flip.
    Evaluation: full decode, resize to 256x256, then a central crop with
    fraction 224/256.  Both paths finish with ``normalize``.

    Args:
        record: JPEG-encoded image contents.
        bbox: bounding boxes for the training crop sampler (not used on the
            evaluation path).
        training: truthy to select the training pipeline.

    Returns:
        The preprocessed image tensor.
    """
    with tf.name_scope('preprocess'):
        if not training:
            decoded = decode_jpeg(record, channels=3)
            resized = tf.image.resize_images(decoded, [256, 256])
            image = tf.image.central_crop(resized, 224.0/256)
        else:
            cropped = decode_crop_and_resize(record, bbox, 224, (0.08, 1.0), (0.75, 1.333))
            image = random_horizontal_flip(cropped, 0.5)

        return normalize(image)
|
||||
|
||||
|
||||
def normalize(inputs):
    """Subtract the per-channel mean and divide by the per-channel std.

    The constants are the usual ImageNet statistics scaled by 255.  They are
    expanded to shape ``[1, 1, 3]`` so they broadcast against ``inputs``.

    Args:
        inputs: image tensor whose last dimension holds the 3 channels
            (assumed; the constants carry exactly three values).

    Returns:
        The normalized tensor.
    """
    mean_rgb = [c * 255 for c in (0.485, 0.456, 0.406)]
    std_rgb = [c * 255 for c in (0.229, 0.224, 0.225)]

    mean = tf.expand_dims(tf.expand_dims(mean_rgb, 0), 0)
    std = tf.expand_dims(tf.expand_dims(std_rgb, 0), 0)

    centered = inputs - mean
    return centered * (1.0 / std)
|
||||
|
||||
+140
@@ -0,0 +1,140 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import ast
|
||||
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../')))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../config')))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../../../utils')))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../../../utils/atlasboost')))
|
||||
|
||||
import data_loader as dl
|
||||
|
||||
import model as ml
|
||||
import hyper_param as hp
|
||||
import layers as ly
|
||||
import logger as lg
|
||||
import trainer as tr
|
||||
import create_session as cs
|
||||
import argparse
|
||||
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command-line options of the training script.

    Returns:
        The ``argparse.Namespace`` holding every recognised option.

    Raises:
        ValueError: if any unrecognised argument is present on the command
            line (each offending argument is printed first).
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--rank_size', default=1,type=int,
                        help="""number of NPUs to use.""")

    # mode and parameters related
    parser.add_argument('--mode', default='train_and_evaluate',
                        help="""mode to run the program e.g. train, evaluate, and
                        train_and_evaluate""")
    parser.add_argument('--max_train_steps', default=100,type=int,
                        help="""train steps for one NPU""")
    parser.add_argument('--iterations_per_loop', default=10, type=int,
                        help="""the number of steps in devices for each iteration""")
    parser.add_argument('--max_epochs', default=None, type=int,
                        help="""total epochs for training""")
    parser.add_argument('--epochs_between_evals', default=5, type=int,
                        help="""the interval between train and evaluation , only meaningful
                        when the mode is train_and_evaluate""")

    # dataset
    parser.add_argument('--data_dir', default='path/data',
                        help="""directory to data.""")

    # path for evaluation
    parser.add_argument('--eval_dir', default='path/eval',
                        help="""directory to evaluate.""")

    parser.add_argument('--dtype', default=tf.float32,
                        help="""data type of inputs.""")
    parser.add_argument('--use_nesterov', default=True, type=ast.literal_eval,
                        help=""" used in optimizer""")
    parser.add_argument('--label_smoothing', default=0.1, type=float,
                        help="""label smoothing factor""")
    # type=float is required: without it a value given on the command line
    # stays a string and cannot be used to scale the L2 loss.
    parser.add_argument('--weight_decay', default=0.0001, type=float,
                        help="""weight decay""")
    parser.add_argument('--batch_size', default=32, type=int,
                        help="""batch size for one NPU""")

    # learning rate and momentum
    parser.add_argument('--lr', default=0.1, type=float,
                        help="""learning rate""")
    parser.add_argument('--T_max', default=150, type=int,
                        help="""T_max for cosing_annealing learning rate""")
    parser.add_argument('--momentum', default=0.9, type=float,
                        help="""momentum used in optimizer.""")

    # display frequency
    parser.add_argument('--display_every', default=1, type=int,
                        help="""the frequency to display info""")

    # log file
    parser.add_argument('--log_name', default='densenet121_training.log',
                        help="""name of log file""")
    parser.add_argument('--log_dir', default='./model_1p',
                        help="""log directory""")

    args, unknown_args = parser.parse_known_args()
    # Example of what unknown_args may look like:
    # ['--config_file', 'densenet_config_1p_npu']

    print(args, unknown_args)
    if len(unknown_args) > 0:
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)
        raise ValueError("Invalid command line arg(s)")

    return args
|
||||
|
||||
|
||||
def main():
    """Build every training component and run the requested mode.

    The components are created in a fixed order (session, data loader,
    hyper-parameters, layers, logger, model, trainer) before the mode is
    dispatched.

    Raises:
        ValueError: if ``--mode`` is not ``train``, ``evaluate`` or
            ``train_and_evaluate``.
    """
    args = parse_args()
    args.global_batch_size = args.batch_size * args.rank_size

    session = cs.CreateSession()
    data = dl.DataLoader(args)
    hyper_param = hp.HyperParams(args)
    layers = ly.Layers()
    logger = lg.LogSessionRunHook(args)
    model = ml.Model(args, data, hyper_param, layers, logger)

    trainer = tr.Trainer(session, args, data, model, logger)

    # Each supported mode is the name of the Trainer method that runs it.
    supported_modes = ('train', 'evaluate', 'train_and_evaluate')
    if args.mode not in supported_modes:
        raise ValueError("Invalid mode.")
    getattr(trainer, args.mode)()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Anchor the benchmark log next to this script, emit the environment and
    # configuration remarks in a fixed order, then start the run.
    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")
    initinal_data = {"base_lr": 0.128, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512,
                     "batchsize": 32}

    # (hwlog key attribute, value) pairs; the key is looked up right before
    # each remark is printed.
    remarks = (
        ("CPU_INFO", cpu_info),
        ("NPU_INFO", npu_info),
        ("OS_INFO", os_info),
        ("FRAMEWORK_INFO", framework_info),
        ("BENCHMARK_VERSION", benchmark_version),
        ("CONFIG_INFO", config_info),
        ("BASE_LR", initinal_data.get("base_lr")),
        ("DATASET", initinal_data.get("dataset")),
        ("OPT_NAME", initinal_data.get("optimizer")),
        ("LOSS_SCALE", initinal_data.get("loss_scale")),
        ("INPUT_BATCH_SIZE", initinal_data.get("batchsize")),
    )
    for key_name, remark_value in remarks:
        hwlog.remark_print(key=getattr(hwlog, key_name), value=remark_value)

    main()
|
||||
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops import data_flow_ops
|
||||
import re
|
||||
import os
|
||||
from operator import itemgetter
|
||||
|
||||
|
||||
def sort_and_load_ckpts(log_dir):
    """List the checkpoints found in ``log_dir``, ordered by training step.

    Only files named exactly ``model.ckpt-<step>.index`` are considered.
    The dots are matched literally and the whole file name must match, so
    names such as ``model.ckpt-3.index.bak`` are ignored.

    Args:
        log_dir: directory to scan.

    Returns:
        A list of dicts sorted by ascending ``step``, each with the keys
        ``step`` (int), ``path`` (the index file path without its ``.index``
        suffix) and ``mtime`` (modification time of the index file).
    """
    index_name = re.compile(r'model\.ckpt-([0-9]+)\.index')

    ckpts = []
    for fname in os.listdir(log_dir):
        m = index_name.fullmatch(fname)
        if m is None:
            continue
        fullpath = os.path.join(log_dir, fname)
        ckpts.append({'step': int(m.group(1)),
                      'path': os.path.splitext(fullpath)[0],
                      'mtime': os.stat(fullpath).st_mtime,
                      })
    ckpts.sort(key=itemgetter('step'))
    return ckpts
|
||||
|
||||
|
||||
+128
@@ -0,0 +1,128 @@
|
||||
import tensorflow as tf
|
||||
import math
|
||||
import time
|
||||
import os
|
||||
import train_helper
|
||||
from logger import rank0log
|
||||
from benchmark_log import hwlog
|
||||
|
||||
class Trainer(object):
    """Run training and evaluation of the model through an NPUEstimator.

    Args:
        session: object exposing ``estimator_config`` (used as the session
            config of the run configuration).
        args: parsed options; the attributes read here include
            ``nsteps_per_epoch``, ``nstep``, ``log_dir``, ``eval_dir``,
            ``iterations_per_loop``, ``batch_size``, ``rank_size``,
            ``max_epochs``, ``epochs_between_evals``,
            ``num_training_samples`` and ``num_evaluating_samples``.
        data: object providing ``get_train_input_fn`` / ``get_eval_input_fn``.
        model: object providing ``get_estimator_model_func``.
        logger: training hook that also exposes a ``logger`` attribute used
            for text logging.
    """

    def __init__(self, session, args, data, model, logger):
        self.sess = session
        self.args = args
        self.data = data
        self.model = model
        self.logger = logger
        self.print_logger = self.logger.logger
        self.all_preds = []
        self.all_targets = []

        self.classifier, self.training_hook = self.get_npu_classifier()

    def get_npu_classifier(self):
        """Create the NPUEstimator and the list of training hooks.

        Returns:
            A ``(classifier, training_hooks)`` tuple; the hook list contains
            the logger passed to the constructor.
        """
        from npu_bridge.estimator.npu.npu_config import NPURunConfig
        from npu_bridge.estimator.npu.npu_estimator import NPUEstimator

        run_config = NPURunConfig(
            hcom_parallel=True,
            precision_mode="allow_mix_precision",
            enable_data_pre_proc=True,
            save_checkpoints_steps=self.args.nsteps_per_epoch,
            session_config=self.sess.estimator_config,
            model_dir=self.args.log_dir,
            iterations_per_loop=self.args.iterations_per_loop,
            keep_checkpoint_max=5)

        classifier = NPUEstimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config
        )

        training_hooks = [self.logger]

        return classifier, training_hooks

    def _epoch_for_step(self, step):
        """Return the (rounded-up) epoch that global step ``step`` falls in."""
        # One epoch is num_training_samples / global batch size steps, where
        # the global batch is batch_size on each of rank_size devices.
        steps_per_epoch = self.args.num_training_samples / (self.args.batch_size * self.args.rank_size)
        return math.ceil(step / steps_per_epoch)

    def _log_checkpoint_row(self, c):
        """Log one evaluation result row for checkpoint record ``c``."""
        # gmtime so the timestamp matches the "(UTC)" column header.
        rank0log(self.print_logger, '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'
                 .format(c['step'],
                         c['epoch'],
                         c['top1'] * 100,
                         c['top5'] * 100,
                         c['loss'],
                         time=time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.gmtime(c['mtime']))))

    def train(self):
        """Train until the global step reaches ``args.nstep``."""
        print ('training steps: %d' % self.args.nstep)
        self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                              max_steps=self.args.nstep,
                              hooks=self.training_hook
                              )

    def evaluate(self):
        """Evaluate every checkpoint found in ``args.eval_dir`` and log a table.

        A keyboard interrupt is caught and logged instead of propagating.
        """
        rank0log(self.print_logger, "Evaluating")
        rank0log(self.print_logger, "Validation dataset size: {}".format(self.args.num_evaluating_samples))
        time.sleep(5)  # a little extra margin...
        try:
            ckpts = train_helper.sort_and_load_ckpts(self.args.eval_dir)
            print("=========ckpt==========")
            print(ckpts)
            print("=========ckpt==========")
            for c in ckpts:
                eval_result = self.classifier.evaluate(
                    input_fn=lambda: self.data.get_eval_input_fn(),
                    checkpoint_path=c['path'])
                # Same epoch formula as train_and_evaluate (global batch size).
                c['epoch'] = self._epoch_for_step(c['step'])
                c['top1'] = eval_result['val-top1acc']
                c['top5'] = eval_result['val-top5acc']
                c['loss'] = eval_result['loss']

            rank0log(self.print_logger, ' step epoch top1 top5 loss checkpoint_time(UTC)')
            for c in ckpts:
                if 'top1' not in c:
                    continue
                self._log_checkpoint_row(c)
            rank0log(self.print_logger, "Finished evaluation")
        except KeyboardInterrupt:
            self.print_logger.error("Keyboard interrupt")

    def train_and_evaluate(self):
        """Alternate training cycles with evaluation of the newest checkpoint.

        Runs ``max_epochs // epochs_between_evals`` cycles; each cycle trains
        for ``nsteps_per_epoch * epochs_between_evals`` steps and then
        evaluates the latest checkpoint found in ``args.log_dir``.
        """
        epochs_between_evals = self.args.epochs_between_evals

        # NOTE(review): assumes hwlog defines EVAL_ACCURACY_TOP5; if it does
        # not, the previous behaviour (reusing the TOP1 key) is kept.
        top5_key = getattr(hwlog, 'EVAL_ACCURACY_TOP5', hwlog.EVAL_ACCURACY_TOP1)

        for _ in range(self.args.max_epochs // epochs_between_evals):

            rank0log(self.print_logger, "Starting a training cycle")

            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                                  steps=self.args.nsteps_per_epoch*epochs_between_evals,
                                  hooks=self.training_hook)

            rank0log(self.print_logger, "Starting to evaluate")
            rank0log(self.print_logger, "Validation dataset size: {}".format(self.args.num_evaluating_samples))
            time.sleep(5)  # a little extra margin...

            ckpts = train_helper.sort_and_load_ckpts(self.args.log_dir)
            c = ckpts[-1]
            eval_result = self.classifier.evaluate(
                input_fn=lambda: self.data.get_eval_input_fn(),
                checkpoint_path=c['path'])

            # top1 / top5 benchmark log remarks
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=float(eval_result.get("val-top1acc")))
            hwlog.remark_print(key=top5_key, value=float(eval_result.get("val-top5acc")))

            c['epoch'] = self._epoch_for_step(c['step'])
            c['top1'] = eval_result['val-top1acc']
            c['top5'] = eval_result['val-top5acc']
            c['loss'] = eval_result['loss']

            rank0log(self.print_logger, ' step epoch top1 top5 loss checkpoint_time(UTC)')

            self._log_checkpoint_row(c)
|
||||
|
||||
+23
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"group_name": "worker",
|
||||
"device_count": "1",
|
||||
"instance_count": "1",
|
||||
"instance_list": [
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "7",
|
||||
"device_ip": "192.168.193.103"
|
||||
}
|
||||
],
|
||||
"pod_name": "npu1p",
|
||||
"server_id": "127.0.0.1"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"status": "completed"
|
||||
}
|
||||
+51
@@ -0,0 +1,51 @@
|
||||
{
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"group_name": "worker",
|
||||
"device_count": "8",
|
||||
"instance_count": "1",
|
||||
"instance_list": [
|
||||
{
|
||||
"devices": [
|
||||
{
|
||||
"device_id": "0",
|
||||
"device_ip": "192.168.190.102"
|
||||
},
|
||||
{
|
||||
"device_id": "1",
|
||||
"device_ip": "192.168.191.102"
|
||||
},
|
||||
{
|
||||
"device_id": "2",
|
||||
"device_ip": "192.168.192.102"
|
||||
},
|
||||
{
|
||||
"device_id": "3",
|
||||
"device_ip": "192.168.193.102"
|
||||
},
|
||||
{
|
||||
"device_id": "4",
|
||||
"device_ip": "192.168.190.103"
|
||||
},
|
||||
{
|
||||
"device_id": "5",
|
||||
"device_ip": "192.168.191.103"
|
||||
},
|
||||
{
|
||||
"device_id": "6",
|
||||
"device_ip": "192.168.192.103"
|
||||
},
|
||||
{
|
||||
"device_id": "7",
|
||||
"device_ip": "192.168.193.103"
|
||||
}
|
||||
],
|
||||
"pod_name": "npu8p",
|
||||
"server_id": "127.0.0.1"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"status": "completed"
|
||||
}
|
||||
+9
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{
|
||||
"device": [{devices}],
|
||||
"server_id": "127.0.0.1"
|
||||
}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+36
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
# Environment setup for running on Ascend NPUs; meant to be sourced.
# Clears the host slog directory, then exports the library / Python / PATH
# settings for whichever install is present: the nnae layout when
# /usr/local/Ascend/nnae/latest exists, otherwise the ascend-toolkit layout.

# Remove everything under the host slog directory.
rm -rf /var/log/npu/slog/host-0/*
# Settings for a toolkit install (kept for reference)
#export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

# Settings for an nnae (and related packages) install (kept for reference)
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=/home/train/resnet50_tf/code:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/opp/op_impl/built-in/ai_core/tbe/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/python/site-packages/te/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/python/site-packages/topi/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/python/site-packages/hccl/:/usr/local/Ascend/tfplugin/latest/x86_64-linux_gcc7.3.0/tfplugin/python/site-packages/:/usr/local/Ascend/tfplugin/latest/x86_64-linux_gcc7.3.0/tfplugin/python/site-packages/npu_bridge:/code
#export PATH=$PATH:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/ccec_compiler/bin/
#export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/opp/

# Pick the install layout by the presence of the nnae directory.
if [ -d /usr/local/Ascend/nnae/latest ];then

    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
    # NOTE(review): the tfplugin site-packages entry appears twice (first and last) in this value.
    export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
    export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
    # NOTE(review): $projectDir is appended only in this branch and is not set in this script.
    export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

fi

# Fixed job-level settings shared by both layouts.
export DDK_VERSION_FLAG=1.60.T17.B830
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001

export SLOG_PRINT_TO_STDOUT=0
|
||||
|
||||
+70
@@ -0,0 +1,70 @@
|
||||
#!/bin/bash
# Launch DenseNet-121 TensorFlow training on one or more NPU devices.
#
# Arguments:
#   $1  rank_size  - number of devices to train on
#   $2  yamlPath   - yaml file holding the "tensorflow_config" section
#   $3  toolsPath  - directory containing get_params_for_yaml.sh
#   $4  CLUSTER    - "True" for a multi-server run (read only inside docker)
#   $5  all IPs    - space-separated server IPs (read only inside docker)
#
# In cluster mode train.sh is started through mpirun; otherwise one
# background train.sh is started per device in the device group.

rank_size=$1
yamlPath=$2
toolsPath=$3

currentDir=$(cd "$(dirname "$0")/.."; pwd)
model_name=$(cd $currentDir/..;basename `pwd`)
if [ -f /.dockerenv ];then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi

# Read the configuration from the yaml file.  The helper's output is captured
# first so that its own exit status is checked: the status of
# `eval $(helper)` is that of eval, which is 0 when the helper fails without
# printing anything.
yaml_params=$(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config") && eval ${yaml_params}

if [ $? -eq 0 ] ;
then
    echo "modify inner config file success"
else
    echo "modify inner config file fail"
    exit 1
fi

# Create the directory of this training job (named after the start time).
currtime=`date +%Y%m%d%H%M%S`

train_job_dir=${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
mkdir -p ${train_job_dir}
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"

# Device list: use device_group_<rank_size>p when it is set, otherwise (or
# when rank_size >= 8) take devices 0 .. rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# First device id of the group; the job-level hw log is linked from the
# directory named after it.  The whole first field is taken so that ids with
# more than one digit are handled as well.
first_device_id=$(echo ${device_group} | awk '{print $1}')

if [ x"${CLUSTER}" == x"True" ];then
    # Link the hw log written under device directory 0 into the job directory.
    ln -snf ${train_job_dir}0/hw_densenet121.log ${train_job_dir}
    this_ip=$(hostname -I |awk '{print $1}')
    # Copy the yaml and the json file to every other server.
    for ip in $MPIRUN_ALL_IP;do
        if [ x"$ip" != x"$this_ip" ];then
            scp $yamlPath root@$ip:$yamlPath
            scp ${jsonFilePath} root@$ip:${jsonFilePath}
        fi
    done
    export PATH=$PATH:/usr/local/mpirun4.0/bin
    mpirun -H ${mpirun_ip} \
        --bind-to none -map-by slot \
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
        --prefix /usr/local/mpirun4.0/ \
        ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
    # Link the hw log written under the first device's directory.
    ln -snf ${train_job_dir}${first_device_id}/hw_densenet121.log ${train_job_dir}
    rank_id=0
    for device_id in $device_group;do
        ${currentDir}/scripts/train.sh $device_id $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
        let rank_id++
    done
fi
# Wait for every background train.sh to finish.
wait

#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train exit " >> ${currentDir}/result/main.log
|
||||
|
||||
+97
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env bash
# Run DenseNet-121 training for a single device.
#
# Arguments:
#   $1  device_id  - device to run on
#   $2  rank_size  - total number of devices
#   $3  yamlPath   - yaml file holding the "tensorflow_config" section
#   $4  currtime   - timestamp naming the training job directory
#   $5  toolsPath  - directory containing get_params_for_yaml.sh
#   $6  either the rank id of this process, or "True" for a cluster run
#       (rank and device ids are then obtained from atlasboost)

device_id=$1
rank_size=$2
yamlPath=$3

currentDir=$(cd "$(dirname "$0")/.."; pwd)
model_name="densenet121"
currtime=$4
toolsPath=$5

export YAML_PATH=$3

mkdir -p ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/

# Read the configuration from the yaml file.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")

# Name of the remark log file; it must be "hw_" followed by the lowercase model name.
export REMARK_LOG_FILE=hw_densenet121.log
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

source ${currentDir}/config/npu_set_env.sh

# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# RANK_INDEX is not set in this script; an unset name counts as 0 here.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}

cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}

if [ x"$6" != x"True" ];then
    rank_id=$6
    export RANK_ID=$6
else
    # Cluster run: ask atlasboost for the rank, then parse the rank id (last
    # word) and the device id (between "deviceid = " and " phyid=") out of
    # the captured output.
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
        device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
        atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    hccljson=${train_job_dir}/*.json
    cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi

# Create and enter the per-device working directory.
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}

startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`

# Choose the command line by run type.  The exit status is stored right after
# each python call: `$?` after the whole if-chain would be 0 when no branch
# ran, which used to report success for an unsupported rank_size.
if [ x"$6" == x"True" ];then
    # multiple devices, multiple servers
    export CLUSTER=True
    python3.7 ${currentDir}/code/train.py --rank_size=${rank_size} --mode=${mode_8p} --max_epochs=${epoches} --iterations_per_loop=${iterations_per_loop_8p} --epochs_between_evals=${epochs_between_evals} --data_dir=${data_url} --lr=${lr} --log_dir=${log_dir} --log_name=${log_name_8p} > ${train_job_dir}/train_${device_id}.log 2>&1
    train_status=$?
elif [ x"${rank_size}" == x"1" ];then
    # single device
    python3.7 ${currentDir}/code/train.py --rank_size=${rank_size} --mode=${mode_1p} --max_train_steps=${max_train_steps_1p} --iterations_per_loop=${iterations_per_loop_1p} --data_dir=${data_url} --display_every=${display_every} --log_dir=${log_dir} --log_name=${log_name_1p} > ${train_job_dir}/train_${device_id}.log 2>&1
    train_status=$?
elif [ ${rank_size} -le 8 ];then
    # multiple devices, single server
    python3.7 ${currentDir}/code/train.py --rank_size=${rank_size} --mode=${mode_8p} --max_epochs=${epoches} --iterations_per_loop=${iterations_per_loop_8p} --epochs_between_evals=${epochs_between_evals} --data_dir=${data_url} --lr=${lr} --log_dir=${log_dir} --log_name=${log_name_8p} > ${train_job_dir}/train_${device_id}.log 2>&1
    train_status=$?
else
    # Nothing was started: more than 8 devices requires a cluster run.
    echo "unsupported rank_size ${rank_size} for a non-cluster run" >> ${train_job_dir}/train_${device_id}.log
    train_status=1
fi

if [ ${train_status} -eq 0 ];then
    echo ":::ABK 1.0.0 densenet121 train success"
    echo ":::ABK 1.0.0 densenet121 train success" >> ${train_job_dir}/train_${device_id}.log
    echo ":::ABK 1.0.0 densenet121 train success" >> ./hw_densenet121.log
else
    echo ":::ABK 1.0.0 densenet121 train failed"
    echo ":::ABK 1.0.0 densenet121 train failed" >> ${train_job_dir}/train_${device_id}.log
    echo ":::ABK 1.0.0 densenet121 train failed" >> ./hw_densenet121.log
fi

# Report the elapsed wall-clock time as hours:minutes:seconds.
endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
sumTime=$(( endTime_s - startTime_s ))
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 densenet121 train total time: ${hour}:${min}:${sec}"
echo ":::ABK 1.0.0 densenet121 train total time: ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_densenet121.log
|
||||
Reference in New Issue
Block a user