[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# MobileNetV2 NPU训练
|
||||
@@ -0,0 +1,29 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
g_feat_in = []
|
||||
g_feat_out = []
|
||||
g_grad_in = []
|
||||
g_grad_out = []
|
||||
|
||||
|
||||
def forward_hook_fn(module, input, output):
    """Forward hook that records and prints a module's inputs and outputs.

    Intended to be passed to ``nn.Module.register_forward_hook``.

    Args:
        module: The module whose forward pass just ran.
        input: The positional inputs the module received.
        output: The value the module returned.

    Note:
        Every call appends to the module-level lists ``g_feat_in`` and
        ``g_feat_out`` and nothing ever clears them, so the recorded objects
        stay alive for the whole run. Use for short debugging sessions only.
    """
    g_feat_in.append(input)
    g_feat_out.append(output)
    print(module)
    print(input)
    print(output)
|
||||
|
||||
|
||||
def backward_hook_fn(module, grad_input, grad_output):
    """Backward hook that records and prints a module's gradients.

    Intended to be passed to ``nn.Module.register_backward_hook``.

    Args:
        module: The module whose backward pass just ran.
        grad_input: Gradients with respect to the module's inputs.
        grad_output: Gradients with respect to the module's outputs.

    Note:
        Every call appends to the module-level lists ``g_grad_in`` and
        ``g_grad_out``; they are never cleared, so recorded gradients stay
        alive for the whole run.
    """
    g_grad_in.append(grad_input)
    g_grad_out.append(grad_output)
    print(module)
    print(grad_input)
    # The original printed grad_input twice and never showed grad_output.
    print(grad_output)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
+498
@@ -0,0 +1,498 @@
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import time
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.multiprocessing as mp
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.models as models
|
||||
from mobilenet import mobilenet_v2
|
||||
import torch.npu
|
||||
|
||||
# from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from apex import amp
|
||||
import numpy as np
|
||||
|
||||
from hook import *
|
||||
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
|
||||
|
||||
# model_names = sorted(name for name in models.__dict__
|
||||
# if name.islower() and not name.startswith("__")
|
||||
# and callable(models.__dict__[name]))
|
||||
|
||||
# Command-line interface of the single-device training script.
# The network is always built with mobilenet_v2(), so there is no --arch option.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# Dataset and input pipeline.
parser.add_argument('--data', metavar='DIR', default='/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')

# Schedule and optimizer hyper-parameters.
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')

# Logging, checkpointing and evaluation.
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')

# Distributed-training options.
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# Mixed precision (apex amp).
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--opt-level', default=None, type=str, help='apex optimize level')
# The default is a real int now; it used to be the string '1024'.
parser.add_argument('--loss-scale-value', default=1024, type=int, help='static loss scale value')

# Benchmark / debugging controls.
parser.add_argument('--summary-path', default=None, type=str, help='event file path')
parser.add_argument('--stop-step-num', default=None, type=int, help='after the stop-step, killing the training task')
parser.add_argument('--device', default='npu:0', type=str, help='device type, cpu or npu:x or cuda:x')
parser.add_argument('--eval-freq', default=10, type=int, help='test interval')
parser.add_argument('--hook', default=False, action='store_true', help='pytorch hook')

# Best top-1 validation accuracy seen so far (updated by main_worker).
best_acc1 = 0
# Global step of the most recent training iteration (updated by train).
cur_step = 0
|
||||
|
||||
|
||||
def seed_everything(seed, device):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed (int): Seed value applied to every generator.
        device (str): Device string; when it contains ``'cuda'`` the CUDA
            generators are seeded too and cuDNN is put in deterministic mode.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if 'cuda' in device:
        # manual_seed_all seeds every CUDA device, including the current one.
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
|
||||
|
||||
|
||||
def main():
    """Parse the command line, optionally seed the RNGs, and start training."""
    args = parser.parse_args()

    if args.seed is not None:
        seed_everything(args.seed, args.device)

        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    main_worker(args)
|
||||
|
||||
|
||||
def _build_data_loaders(args):
    """Create the training and validation loaders from ``args.data``.

    Expects ``<args.data>/train`` and ``<args.data>/val`` in the layout
    ``torchvision.datasets.ImageFolder`` reads.

    Returns:
        tuple: ``(train_loader, val_loader)``.
    """
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))

    # No sampler is used on a single device, so the training set is shuffled.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, sampler=None, drop_last=True)

    # NOTE(review): drop_last=True also discards the final partial validation
    # batch, so accuracy is computed on slightly fewer images - confirm this
    # is intended.
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    return train_loader, val_loader


def main_worker(args):
    """Build the model and run training and/or evaluation on one device.

    Args:
        args: Parsed command-line namespace produced by ``parser``.

    Side effects:
        Updates the module-level ``best_acc1`` and writes checkpoint files
        through ``save_checkpoint`` after every evaluation.
    """
    global best_acc1
    global cur_step

    global_step = -1

    if 'npu' in args.device:
        torch.npu.set_device(args.device)
    if 'cuda' in args.device:
        torch.cuda.set_device(args.device)

    model = mobilenet_v2()

    # Optional debugging hooks on every sub-module.
    if args.hook:
        for module in model.modules():
            module.register_forward_hook(forward_hook_fn)
            module.register_backward_hook(backward_hook_fn)

    criterion = nn.CrossEntropyLoss()

    if 'npu' in args.device or 'cuda' in args.device:
        model = model.to(args.device)
        criterion = criterion.to(args.device)

    # Build the optimizer only after the model is on its final device, as the
    # torch.optim documentation recommends.
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale_value)

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=args.device)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    train_loader, val_loader = _build_data_loaders(args)

    if args.evaluate:
        validate(val_loader, model, criterion, args, global_step)
        return

    for epoch in range(args.start_epoch, args.epochs):
        # Train for one epoch.
        global_step = train(train_loader, model, criterion, optimizer, epoch, args, global_step)

        if (epoch + 1) % (args.eval_freq) == 0 or epoch == args.epochs - 1:
            # Evaluate on the validation set.
            acc1 = validate(val_loader, model, criterion, args, global_step)

            # Remember the best acc@1 and save a checkpoint.
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            state = {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            if args.amp:
                # The loss-scaler state is needed to resume amp training.
                state['amp'] = amp.state_dict()
            save_checkpoint(state, is_best)

        # train() keeps cur_step current; stop once the step budget is used.
        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
            break
|
||||
|
||||
# sum_writer.close()
|
||||
|
||||
|
||||
def train(train_loader, model, criterion, optimizer, epoch, args, global_step, sum_writer=None):
    """Train ``model`` for one epoch.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer whose learning rate is reset every step.
        epoch (int): Zero-based index of the current epoch.
        args: Parsed command-line namespace.
        global_step (int): Step counter from the caller; returned unchanged
            when the loader yields no batch.
        sum_writer: Unused; kept for interface compatibility.

    Returns:
        int: Global step of the last processed batch.

    Side effects:
        Updates the module-level ``cur_step`` on every iteration.
    """
    global cur_step

    if args.seed is not None:
        # A different but reproducible seed for every epoch.
        seed_everything(args.seed + epoch, args.device)

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    learning_rate = AverageMeter('LR', ':2.8f')
    losses = AverageMeter('Loss', ':6.8f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, learning_rate, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # Switch to train mode.
    model.train()

    end = time.time()
    steps_per_epoch = len(train_loader)
    for i, (images, target) in enumerate(train_loader):
        global_step = epoch * steps_per_epoch + i
        cur_step = global_step

        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
        learning_rate.update(lr)

        # Measure data loading time.
        data_time.update(time.time() - end)

        if 'npu' in args.device:
            # Presumably the NPU loss kernel expects int32 labels - TODO confirm.
            target = target.to(torch.int32)

        if 'npu' in args.device or 'cuda' in args.device:
            images = images.to(args.device, non_blocking=True)
            target = target.to(args.device, non_blocking=True)

        # Compute output.
        output = model(images)
        loss = criterion(output, target)

        # Measure accuracy and record loss.
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # Compute gradient and do the SGD step.
        optimizer.zero_grad()
        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()

        # Measure elapsed time.
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
            break

    # batch_time.avg stays 0 when no batch ran; skip the throughput report
    # instead of dividing by zero.
    if batch_time.avg > 0:
        fps_message = ' * FPS@all {:.3f}'.format(args.batch_size / batch_time.avg)
        print(fps_message)
        hwlog.remark_print(key=hwlog.FPS, value=fps_message)
    return global_step
|
||||
|
||||
|
||||
def validate(val_loader, model, criterion, args, global_step, sum_writer=None):
    """Evaluate ``model`` on the validation loader.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to evaluate; switched to eval mode.
        criterion: Loss function called as ``criterion(output, target)``.
        args: Parsed command-line namespace.
        global_step: Unused; kept for interface compatibility.
        sum_writer: Unused; kept for interface compatibility.

    Returns:
        Running average of the top-1 accuracy over all processed batches.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # Switch to evaluate mode.
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if 'npu' in args.device:
                # Presumably the NPU loss kernel expects int32 labels - TODO confirm.
                target = target.to(torch.int32)

            if 'npu' in args.device or 'cuda' in args.device:
                images = images.to(args.device, non_blocking=True)
                target = target.to(args.device, non_blocking=True)

            # Compute output.
            output = model(images)
            loss = criterion(output, target)

            # Measure accuracy and record loss.
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # Measure elapsed time.
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    return top1.avg
|
||||
|
||||
|
||||
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename`` and optionally keep a best copy.

    Args:
        state: Object passed to ``torch.save``.
        is_best (bool): When true, the saved file is also copied to
            ``model_best.pth.tar`` in the current working directory.
        filename (str): Destination path of the checkpoint.
    """
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')
|
||||
|
||||
|
||||
class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a metric.

    Args:
        name (str): Label used when the meter is printed.
        fmt (str): Format spec (including the leading ``:``) applied to both
            the latest value and the average, e.g. ``':6.2f'``.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight would otherwise raise ZeroDivisionError; the
        # average is left untouched in that case.
        if self.count:
            self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
|
||||
|
||||
|
||||
class ProgressMeter(object):
    """Prints a one-line progress report built from a list of meters.

    Args:
        num_batches (int): Total number of batches, used to size the counter.
        meters (list): Meter objects exposing ``name``, ``fmt`` and ``val``
            and a ``__str__`` (e.g. ``AverageMeter``).
        prefix (str): Text placed in front of the batch counter.
    """

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the report for ``batch`` and log the current accuracies."""
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

        # Benchmark log points. The values are read from the meters directly
        # instead of being parsed back out of the printed text, and a missing
        # accuracy meter is skipped rather than raising IndexError.
        # NOTE(review): validate() also reports through this method, so its
        # progress lines are logged under the TRAIN_* keys too - confirm.
        train_acc1 = self._current_value("Acc@1")
        if train_acc1 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        train_acc5 = self._current_value("Acc@5")
        if train_acc5 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    def _current_value(self, meter_name):
        """Return the formatted latest value of the named meter, or None."""
        for meter in self.meters:
            if meter.name == meter_name:
                return ('{' + meter.fmt + '}').format(meter.val).strip()
        return None

    def _get_batch_fmtstr(self, num_batches):
        """Build a ``[ cur/total]`` template padded to the width of the total."""
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
|
||||
|
||||
|
||||
def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
    """Apply a stepwise exponential learning-rate decay.

    The rate is ``args.lr * 0.98 ** k`` where ``k`` is the number of complete
    2.5-epoch intervals contained in ``global_step``.

    Args:
        optimizer: Object with a ``param_groups`` list of dicts; the ``'lr'``
            entry of every group is overwritten.
        global_step (int): Zero-based index of the current training step.
        steps_per_epoch (int): Number of steps in one epoch.
        args: Namespace providing the initial rate as ``args.lr``.

    Returns:
        float: The learning rate that was written to the optimizer.
    """
    # At least one step per interval, so a tiny or empty epoch cannot cause a
    # division by zero.
    decay_interval = max(1, int(steps_per_epoch * 2.5))
    lr = args.lr * (0.98 ** (global_step // decay_interval))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy (in percent) for each requested k.

    Args:
        output: Score tensor; the top-k search runs along dimension 1.
        target: Label tensor with one entry per row of ``output``.
        topk (tuple): The k values to evaluate.

    Returns:
        list: One single-element tensor per entry of ``topk`` holding the
        percentage of rows whose label is among the k highest scores.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # ``correct`` comes from a transposed tensor and may be
            # non-contiguous; reshape() handles that where view() raises.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Benchmark logging is rooted at the directory that contains this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    # NOTE(review): these values are fixed here and do not follow the
    # command-line arguments (--lr, --loss-scale-value); confirm that the
    # benchmark log is meant to report constants.
    initial_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Environment description.
    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)

    # Training configuration.
    hwlog.remark_print(key=hwlog.BASE_LR, value=initial_data.get("base_lr"))
    hwlog.remark_print(key=hwlog.DATASET, value=initial_data.get("dataset"))
    hwlog.remark_print(key=hwlog.OPT_NAME, value=initial_data.get("optimizer"))
    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initial_data.get("loss_scale"))

    main()
|
||||
+179
@@ -0,0 +1,179 @@
|
||||
from torch import nn
|
||||
# from .utils import load_state_dict_from_url
|
||||
|
||||
|
||||
__all__ = ['MobileNetV2', 'mobilenet_v2']
|
||||
|
||||
|
||||
model_urls = {
|
||||
'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
|
||||
}
|
||||
|
||||
|
||||
def _make_divisible(v, divisor, min_value=None):
|
||||
"""
|
||||
This function is taken from the original tf repo.
|
||||
It ensures that all layers have a channel number that is divisible by 8
|
||||
It can be seen here:
|
||||
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
||||
:param v:
|
||||
:param divisor:
|
||||
:param min_value:
|
||||
:return:
|
||||
"""
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
|
||||
|
||||
class ConvBNReLU(nn.Sequential):
    """Bias-free Conv2d followed by BatchNorm2d and an in-place ReLU6.

    Args:
        in_planes (int): Number of input channels.
        out_planes (int): Number of output channels.
        kernel_size (int): Convolution kernel size.
        stride (int): Convolution stride.
        groups (int): Number of convolution groups.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
        # Padding of (kernel_size - 1) // 2 keeps the spatial size unchanged
        # for odd kernels at stride 1.
        padding = (kernel_size - 1) // 2
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            nn.BatchNorm2d(out_planes),
            nn.ReLU6(inplace=True)
        )
|
||||
|
||||
|
||||
class InvertedResidual(nn.Module):
    """MobileNetV2 building block: optional expansion, depthwise, projection.

    Args:
        inp (int): Number of input channels.
        oup (int): Number of output channels.
        stride (int): Stride of the depthwise convolution; must be 1 or 2.
        expand_ratio: Multiplier for the hidden width; the expansion layer is
            omitted when it equals 1.

    Raises:
        ValueError: If ``stride`` is not 1 or 2.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        # An explicit check instead of ``assert`` so it still runs under -O.
        if stride not in [1, 2]:
            raise ValueError("stride should be 1 or 2, got {}".format(stride))
        self.stride = stride

        hidden_dim = int(round(inp * expand_ratio))
        # The skip connection is only possible when the shape is preserved.
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw: pointwise expansion
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
        layers.extend([
            # dw: depthwise convolution
            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
            # pw-linear: pointwise projection without activation
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the block, adding the input back when the skip is enabled."""
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
|
||||
|
||||
|
||||
class MobileNetV2(nn.Module):
    """MobileNet V2 image classifier."""

    def __init__(self,
                 num_classes=1000,
                 width_mult=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=None):
        """
        MobileNet V2 main class

        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure, a list of
                ``[t, c, n, s]`` rows (expand ratio, output channels,
                number of repeats, stride of the first repeat)
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
                Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet

        Raises:
            ValueError: If ``inverted_residual_setting`` is empty or its first
                row does not have four elements.
        """
        super(MobileNetV2, self).__init__()

        if block is None:
            block = InvertedResidual
        input_channel = 32
        last_channel = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        # The final width is never shrunk below its base value.
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2)]
        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                # Only the first repeat of a stage uses the stage's stride.
                stride = s if i == 0 else 1
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x):
        # This exists since TorchScript doesn't support inheritance, so the superclass method
        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
        x = self.features(x)
        # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]
        x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
        x = self.classifier(x)
        return x

    def forward(self, x):
        """Return the class scores for the input batch ``x``."""
        return self._forward_impl(x)
|
||||
|
||||
|
||||
def mobilenet_v2(pretrained=False, progress=True, **kwargs):
    """
    Constructs a MobileNetV2 architecture from
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.

    Args:
        pretrained (bool): Accepted for interface compatibility. Weight
            loading is disabled in this file, so the model is always randomly
            initialised; a warning is issued when True is passed.
        progress (bool): Unused while weight loading is disabled.
        **kwargs: Forwarded to the ``MobileNetV2`` constructor.
    """
    if pretrained:
        # Tell the caller instead of silently returning random weights.
        import warnings
        warnings.warn("pretrained=True is ignored: loading pre-trained weights "
                      "is disabled, the model is randomly initialized")
    model = MobileNetV2(**kwargs)
    return model
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"startCfg":
|
||||
[
|
||||
{
|
||||
"jobID": "123456789",
|
||||
"deviceID": ["0"],
|
||||
"features":
|
||||
[
|
||||
{
|
||||
"name": "task_trace"
|
||||
},
|
||||
{
|
||||
"name": "training_trace"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
# MobileNetV2 NPU训练
|
||||
@@ -0,0 +1,22 @@
|
||||
# Directory appended to PYTHONPATH below. The original used $currentDir
# without defining it; a value inherited from the environment is kept,
# otherwise the directory of this script is used.
currentDir=${currentDir:-$(cd "$(dirname "$0")" && pwd)}

# Ascend toolkit environment.
export ASCEND_HOME=/usr/local/Ascend
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

# Device logging: do not mirror slog to stdout, set the log level on device 7.
export SLOG_PRINT_TO_STDOUT=0
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 7"

export TASK_QUEUE_ENABLE=0
# NOTE(review): this launcher sits next to the MobileNetV2 sources but starts
# densenet121_1p_main.py with --arch and --npu, options the MobileNetV2
# training script's parser does not define - confirm the intended script and
# flags.
taskset -c 111-150 python3 densenet121_1p_main.py \
    --workers 40 \
    --arch densenet121 \
    --npu 7 \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --batch-size 256 \
    --epoch 90 \
    --evaluate \
    --resume checkpoint.pth.tar \
    --data /opt/npu/dataset/imagenet
|
||||
@@ -0,0 +1,29 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
g_feat_in = []
|
||||
g_feat_out = []
|
||||
g_grad_in = []
|
||||
g_grad_out = []
|
||||
|
||||
|
||||
def forward_hook_fn(module, input, output):
    """Forward hook that records and prints a module's inputs and outputs.

    Intended to be passed to ``nn.Module.register_forward_hook``.

    Args:
        module: The module whose forward pass just ran.
        input: The positional inputs the module received.
        output: The value the module returned.

    Note:
        Every call appends to the module-level lists ``g_feat_in`` and
        ``g_feat_out`` and nothing ever clears them, so the recorded objects
        stay alive for the whole run. Use for short debugging sessions only.
    """
    g_feat_in.append(input)
    g_feat_out.append(output)
    print(module)
    print(input)
    print(output)
|
||||
|
||||
|
||||
def backward_hook_fn(module, grad_input, grad_output):
    """Backward hook that records and prints a module's gradients.

    Intended to be passed to ``nn.Module.register_backward_hook``.

    Args:
        module: The module whose backward pass just ran.
        grad_input: Gradients with respect to the module's inputs.
        grad_output: Gradients with respect to the module's outputs.

    Note:
        Every call appends to the module-level lists ``g_grad_in`` and
        ``g_grad_out``; they are never cleared, so recorded gradients stay
        alive for the whole run.
    """
    g_grad_in.append(grad_input)
    g_grad_out.append(grad_output)
    print(module)
    print(grad_input)
    # The original printed grad_input twice and never showed grad_output.
    print(grad_output)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
+556
@@ -0,0 +1,556 @@
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import time
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.multiprocessing as mp
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.models as models
|
||||
from mobilenet import mobilenet_v2
|
||||
import torch.npu
|
||||
import torch.cuda
|
||||
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from apex import amp
|
||||
import numpy as np
|
||||
|
||||
from hook import *
|
||||
|
||||
|
||||
# model_names = sorted(name for name in models.__dict__
|
||||
# if name.islower() and not name.startswith("__")
|
||||
# and callable(models.__dict__[name]))
|
||||
|
||||
# Command-line interface of the multi-device training script.
# The network is imported as mobilenet_v2, so there is no --arch option.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# Dataset and input pipeline.
parser.add_argument('--data', metavar='DIR', default='/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')

# Schedule and optimizer hyper-parameters.
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')

# Logging, checkpointing and evaluation.
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')

# Distributed-training options. There is no --world-size option here:
# main() computes args.world_size from the device and node counts.
parser.add_argument('--node-nums', default=1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=0, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

parser.add_argument('--addr', default='10.136.181.115', type=str,
                    help='master addr')
parser.add_argument('--device-id', default=None, type=int,
                    help='GPU id to use.')

# Mixed precision (apex amp).
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--opt-level', default=None, type=str, help='apex optimize level')
# The default is a real int now; it used to be the string '1024'.
parser.add_argument('--loss-scale-value', default=1024, type=int, help='static loss scale value')

# Benchmark / debugging controls.
parser.add_argument('--summary-path', default=None, type=str, help='event file path')
parser.add_argument('--stop-step-num', default=None, type=int, help='after the stop-step, killing the training task')
parser.add_argument('--device', default='npu', type=str, help='device type, cpu or npu:x or cuda')
parser.add_argument('--eval-freq', default=10, type=int, help='test interval')
parser.add_argument('--hook', default=False, action='store_true', help='pytorch hook')

# Module-level training state; both counters start at zero.
best_acc1 = 0
cur_step = 0
|
||||
|
||||
|
||||
def seed_everything(seed, device):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator and exported as
            ``PYTHONHASHSEED``.
        device: Device string; when it contains ``'cuda'`` the CUDA
            generators are seeded too and cuDNN is put into deterministic,
            non-benchmark mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if 'cuda' not in device:
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False
|
||||
|
||||
|
||||
def main():
    """Parse the command line and spawn one training process per device.

    Only the multi-process distributed mode is supported: any other
    combination of options prints a diagnostic and returns without training.
    """
    args = parser.parse_args()

    if args.seed is not None:
        seed_everything(args.seed, args.device)
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Rendezvous address read by torch.distributed in the spawned workers.
    # NOTE(review): 90000 is above the usual 16-bit TCP port range; confirm
    # the backend in use really accepts this value.
    os.environ.update({'MASTER_ADDR': args.addr, 'MASTER_PORT': '90000'})

    args.distributed = args.node_nums > 1 or args.multiprocessing_distributed
    if not args.distributed:
        print('dist param is not correct!')
        return

    if args.device == 'npu':
        # The NPU device count is fixed here rather than queried.
        device_nums_per_node = 2
    elif args.device == 'cuda':
        device_nums_per_node = torch.cuda.device_count()
    else:
        print('unknown device type[npu/cuda]!')
        return

    if not args.multiprocessing_distributed:
        print('dist param is not correct!')
        return

    # world_size is the total number of devices, i.e. of worker processes.
    args.world_size = device_nums_per_node * args.node_nums

    # The workers are spawned here, so no outer script has to start one
    # process per device. NPU and CUDA are launched the same way.
    mp.spawn(main_worker, nprocs=device_nums_per_node,
             args=(device_nums_per_node, args))
|
||||
|
||||
|
||||
# first param must be the index of PID
|
||||
def main_worker(pid_idx, device_nums_per_node, args):
    """Train or evaluate MobileNetV2 in one spawned process bound to one device.

    Args:
        pid_idx: Process index supplied by ``mp.spawn``; used as the local
            device index and as the distributed rank.
        device_nums_per_node: Number of worker processes on this node.
        args: Parsed command-line namespace. ``rank``, ``pid_idx``,
            ``batch_size``, ``workers`` and (when resuming) ``start_epoch``
            are updated in place.
    """
    global best_acc1
    global cur_step

    # The file-level import of SummaryWriter is commented out, so the name is
    # imported here; without it the next statement raises NameError.
    from torch.utils.tensorboard import SummaryWriter

    sum_writer = SummaryWriter(args.summary_path)
    global_step = -1

    if args.distributed:
        if args.multiprocessing_distributed:
            # NOTE(review): the rank is the local process index only. The
            # node-aware formula (args.rank * device_nums_per_node + pid_idx)
            # was disabled in the original code; confirm for multi-node runs.
            args.rank = pid_idx
            args.pid_idx = pid_idx

        if args.device == 'npu':
            dist.init_process_group(backend=args.dist_backend,
                                    world_size=args.world_size, rank=args.rank)
        else:
            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                    world_size=args.world_size, rank=args.rank)

        # Bind this process to a single device before any model is built.
        if args.device == 'npu':
            torch.npu.set_device('npu:{}'.format(pid_idx))
        else:
            torch.cuda.set_device(pid_idx)

        # Split the configured batch size and loader workers across the
        # processes of this node (workers are rounded up).
        args.batch_size = int(args.batch_size / device_nums_per_node)
        args.workers = int((args.workers + device_nums_per_node - 1) / device_nums_per_node)

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    # define model and train
    model = mobilenet_v2()
    criterion = nn.CrossEntropyLoss()

    loc = None
    if args.device == 'npu':
        loc = 'npu:{}'.format(pid_idx)
    elif args.device == 'cuda':
        loc = 'cuda:{}'.format(pid_idx)
    model = model.to(loc)
    criterion = criterion.to(loc)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
                                          loss_scale=args.loss_scale_value)

    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[pid_idx],
                                                      broadcast_buffers=False)

    # set hook: the forward/backward hooks are attached to every sub-module.
    if args.hook:
        for _name, module in model.named_modules():
            module.register_forward_hook(forward_hook_fn)
            module.register_backward_hook(backward_hook_fn)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=args.device)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        # validate() requires device_nums_per_node; omitting it raised
        # TypeError on this path.
        validate(val_loader, model, criterion, args, global_step, sum_writer,
                 device_nums_per_node)
        sum_writer.close()
        return

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        global_step = train(train_loader, model, criterion, optimizer, epoch, args,
                            global_step, sum_writer, device_nums_per_node)

        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            # evaluate on validation set
            acc1 = validate(val_loader, model, criterion, args, global_step, sum_writer,
                            device_nums_per_node)

            # remember best acc@1 and save checkpoint
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            state = {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            if args.amp:
                # Store the amp state so the loss scaler can be restored.
                state['amp'] = amp.state_dict()
            save_checkpoint(state, is_best)

        # train() publishes its progress through the cur_step global.
        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
            break

    sum_writer.close()
|
||||
|
||||
|
||||
def train(train_loader, model, criterion, optimizer, epoch, args, global_step, sum_writer, device_nums_per_node):
    """Train ``model`` for one epoch and return the last global step reached.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise; switched to train mode here.
        criterion: Loss callable applied to ``(output, target)``.
        optimizer: Optimizer whose learning rate is reset before every step.
        epoch: Zero-based epoch index, used for seeding and the step count.
        args: Parsed options (``seed``, ``device``, ``pid_idx``, ``amp``,
            ``print_freq``, ``batch_size``, ``stop_step_num``, ``rank``, ...).
        global_step: Step counter returned unchanged when the loader yields
            no batch.
        sum_writer: Summary writer receiving the per-step scalars.
        device_nums_per_node: Number of worker processes on this node.

    Returns:
        The global step of the last processed batch. It is also stored in the
        module-level ``cur_step``.
    """
    global cur_step

    if args.seed is not None:
        seed_everything(args.seed + epoch, args.device)

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    learning_rate = AverageMeter('LR', ':2.8f')
    losses = AverageMeter('Loss', ':6.8f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, learning_rate, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    # The target device and the reporting rank do not change inside the
    # loop, so both are computed once.
    on_npu = 'npu' in args.device
    loc = None
    if on_npu:
        loc = 'npu:{}'.format(args.pid_idx)
    elif 'cuda' in args.device:
        loc = 'cuda:{}'.format(args.pid_idx)
    is_reporting_rank = (not args.multiprocessing_distributed
                         or args.rank % device_nums_per_node == 0)

    end = time.time()
    steps_per_epoch = len(train_loader)
    for i, (images, target) in enumerate(train_loader):
        global_step = epoch * steps_per_epoch + i
        cur_step = global_step

        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
        learning_rate.update(lr)
        sum_writer.add_scalar('learning rate', lr, global_step)

        # measure data loading time
        data_time.update(time.time() - end)

        if on_npu:
            # Labels are converted to int32 before being moved to the NPU.
            target = target.to(torch.int32)
        images = images.to(loc, non_blocking=True)
        target = target.to(loc, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        sum_writer.add_scalar('Accuary/train/top1', acc1, global_step)
        sum_writer.add_scalar('Accuary/train/top5', acc5, global_step)
        sum_writer.add_scalar('Loss/train/loss', loss, global_step)

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if is_reporting_rank:
            if i % args.print_freq == 0:
                progress.display(i)
            # args.batch_size is the per-process batch, so the node-level
            # throughput scales with the real device count, not a fixed 8.
            if batch_time.avg > 0:
                print('FPS@all: {:.3f}'.format(
                    device_nums_per_node * args.batch_size / batch_time.avg))

        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
            break

    return global_step
|
||||
|
||||
|
||||
def validate(val_loader, model, criterion, args, global_step, sum_writer, device_nums_per_node):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss callable applied to ``(output, target)``.
        args: Parsed options (``device``, ``pid_idx``, ``print_freq``,
            ``evaluate``, ``rank``, ``gpu``, ...).
        global_step: Step under which the validation scalars are logged.
        sum_writer: Summary writer; only used when ``args.evaluate`` is false.
        device_nums_per_node: Number of worker processes on this node.

    Returns:
        ``top1.avg``, the running mean of the per-batch top-1 accuracy.
    """
    def reports_progress():
        # Only one process per node prints in multi-process mode.
        if not args.multiprocessing_distributed:
            return True
        return args.rank % device_nums_per_node == 0

    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    meters = [batch_time, losses, top1, top5]
    progress = ProgressMeter(len(val_loader), meters, prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            if 'npu' in args.device:
                # Labels are converted to int32 before being moved to the NPU.
                target = target.to(torch.int32)
                device = 'npu:{}'.format(args.pid_idx)
            elif 'cuda' in args.device:
                device = 'cuda:{}'.format(args.pid_idx)
            else:
                device = None

            images = images.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            n_samples = images.size(0)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), n_samples)
            top1.update(acc1[0], n_samples)
            top5.update(acc5[0], n_samples)

            # measure elapsed time
            batch_time.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq == 0 and reports_progress():
                progress.display(step)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))
        if reports_progress():
            print("[device id:", args.gpu, "]",
                  '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    if not args.evaluate:
        for tag, meter in (('Accuary/validation/top1', top1),
                           ('Accuary/validation/top5', top5)):
            sum_writer.add_scalar(tag, meter.avg, global_step)

    return top1.avg
|
||||
|
||||
|
||||
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and keep a copy of the best checkpoint.

    Args:
        state: Object serialised with ``torch.save``.
        is_best: When true, the saved file is also copied to
            ``'model_best.pth.tar'`` in the current working directory.
        filename: Destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')
|
||||
|
||||
|
||||
class AverageMeter(object):
    """Keeps the latest value of a metric together with its weighted running mean."""

    def __init__(self, name, fmt=':f'):
        # ``fmt`` is a format-spec suffix (including the leading ':') applied
        # to both the current value and the average in __str__.
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Clear the current value, the sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))
|
||||
|
||||
|
||||
class ProgressMeter(object):
    """Prints a batch counter followed by the string form of each meter."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print one tab-separated progress line for batch index ``batch``."""
        header = self.prefix + self.batch_fmtstr.format(batch)
        columns = [header]
        columns.extend(str(meter) for meter in self.meters)
        print('\t'.join(columns))

    def _get_batch_fmtstr(self, num_batches):
        # Pad the batch index to the width of the total,
        # e.g. '[{:3d}/100]' for 100 batches.
        width = len(str(num_batches // 1))
        index_spec = '{:' + str(width) + 'd}'
        return '[{}/{}]'.format(index_spec, index_spec.format(num_batches))
|
||||
|
||||
|
||||
def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
    """Apply a stepwise exponential decay and return the learning rate in effect.

    The rate is ``args.lr * 0.98 ** k``, where ``k`` is the number of
    completed decay periods and one period lasts
    ``int(steps_per_epoch * 2.5)`` steps (2.5 epochs).

    Args:
        optimizer: Object exposing ``param_groups``; the ``'lr'`` entry of
            every group is overwritten.
        global_step: Zero-based index of the current training step.
        steps_per_epoch: Number of steps in one epoch.
        args: Namespace providing the initial rate as ``args.lr``.

    Returns:
        The learning rate written to every parameter group.
    """
    # Guard against a zero-length period (steps_per_epoch == 0), which would
    # otherwise raise ZeroDivisionError in the floor division below.
    decay_period = max(1, int(steps_per_epoch * 2.5))
    lr = args.lr * (0.98 ** (global_step // decay_period))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy, in percent, for each requested ``k``.

    Args:
        output: Score tensor; the top-k classes are taken along dimension 1.
        target: Tensor holding one class index per sample.
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk``.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # ``correct`` derives from a transposed tensor, so the slice is
            # not contiguous for k > 1 and ``view`` would raise; ``reshape``
            # copies when needed.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
|
||||
|
||||
|
||||
# Script entry point: call main() only when this file is executed directly.
if __name__ == '__main__':
|
||||
main()
|
||||
+179
@@ -0,0 +1,179 @@
|
||||
from torch import nn
|
||||
# from .utils import load_state_dict_from_url
|
||||
|
||||
|
||||
# Names exported by ``from mobilenet import *``.
__all__ = ['MobileNetV2', 'mobilenet_v2']
|
||||
|
||||
|
||||
# Download URL of pretrained weights, keyed by model name. The code in
# mobilenet_v2() that would read this mapping is currently commented out.
model_urls = {
|
||||
'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
|
||||
}
|
||||
|
||||
|
||||
def _make_divisible(v, divisor, min_value=None):
|
||||
"""
|
||||
This function is taken from the original tf repo.
|
||||
It ensures that all layers have a channel number that is divisible by 8
|
||||
It can be seen here:
|
||||
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
|
||||
:param v:
|
||||
:param divisor:
|
||||
:param min_value:
|
||||
:return:
|
||||
"""
|
||||
if min_value is None:
|
||||
min_value = divisor
|
||||
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
||||
# Make sure that round down does not go down by more than 10%.
|
||||
if new_v < 0.9 * v:
|
||||
new_v += divisor
|
||||
return new_v
|
||||
|
||||
|
||||
class ConvBNReLU(nn.Sequential):
    """Bias-free Conv2d followed by BatchNorm2d and an in-place ReLU6.

    The padding is ``(kernel_size - 1) // 2``.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
        pad = (kernel_size - 1) // 2
        conv = nn.Conv2d(in_planes, out_planes, kernel_size,
                         stride=stride, padding=pad, groups=groups, bias=False)
        norm = nn.BatchNorm2d(out_planes)
        activation = nn.ReLU6(inplace=True)
        super().__init__(conv, norm, activation)
|
||||
|
||||
|
||||
class InvertedResidual(nn.Module):
    """MobileNetV2 block: optional 1x1 expansion, 3x3 depthwise conv, linear 1x1 projection.

    The input is added to the output when ``stride == 1`` and ``inp == oup``.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(round(inp * expand_ratio))
        self.use_res_connect = self.stride == 1 and inp == oup

        # pw: the expansion is skipped when the ratio is exactly 1.
        stages = [ConvBNReLU(inp, hidden_dim, kernel_size=1)] if expand_ratio != 1 else []
        # dw
        stages.append(ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim))
        # pw-linear: no activation after the projection.
        stages.append(nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False))
        stages.append(nn.BatchNorm2d(oup))
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        if not self.use_res_connect:
            return self.conv(x)
        return x + self.conv(x)
|
||||
|
||||
|
||||
class MobileNetV2(nn.Module):
    """MobileNet V2 classifier: a convolutional feature stack and a linear head."""

    def __init__(self,
                 num_classes=1000,
                 width_mult=1.0,
                 inverted_residual_setting=None,
                 round_nearest=8,
                 block=None):
        """
        MobileNet V2 main class

        Args:
            num_classes (int): Number of classes
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure, one ``[t, c, n, s]`` row per stage
                (expand ratio, output channels, number of blocks, stride of the first block)
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
                Set to 1 to turn off rounding
            block: Module specifying inverted residual building block for mobilenet
        """
        super().__init__()

        block = InvertedResidual if block is None else block
        stem_channels = 32
        head_channels = 1280

        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        in_channels = _make_divisible(stem_channels * width_mult, round_nearest)
        self.last_channel = _make_divisible(head_channels * max(1.0, width_mult), round_nearest)
        layers = [ConvBNReLU(3, in_channels, stride=2)]

        # building inverted residual blocks; only the first block of a stage
        # uses the stage stride
        for expand, channels, repeats, first_stride in inverted_residual_setting:
            out_channels = _make_divisible(channels * width_mult, round_nearest)
            for index in range(repeats):
                block_stride = 1 if index else first_stride
                layers.append(block(in_channels, out_channels, block_stride, expand_ratio=expand))
                in_channels = out_channels

        # building last several layers
        layers.append(ConvBNReLU(in_channels, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*layers)

        # building classifier (dropout p=0.2)
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        # weight initialization
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def _forward_impl(self, x):
        # This exists since TorchScript doesn't support inheritance, so the superclass method
        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
        feature_map = self.features(x)
        # Cannot use "squeeze" as batch-size can be 1 => must use reshape with the batch dimension
        pooled = nn.functional.adaptive_avg_pool2d(feature_map, 1)
        flat = pooled.reshape(feature_map.shape[0], -1)
        return self.classifier(flat)

    def forward(self, x):
        return self._forward_impl(x)
|
||||
|
||||
|
||||
def mobilenet_v2(pretrained=False, progress=True, **kwargs):
    """
    Constructs a MobileNetV2 architecture from
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.

    Args:
        pretrained (bool): Accepted for interface compatibility. Loading of
            pretrained weights is disabled in this file, so the returned model
            is always freshly initialised; a warning is emitted when True.
        progress (bool): Unused while weight loading is disabled.
        **kwargs: Forwarded unchanged to ``MobileNetV2``.

    Returns:
        A newly constructed ``MobileNetV2`` instance.
    """
    if pretrained:
        # Make the ignored request visible instead of silently returning a
        # randomly initialised network.
        import warnings
        warnings.warn('mobilenet_v2(pretrained=True) was requested, but loading '
                      'pretrained weights is disabled; returning a randomly '
                      'initialised model.')
    return MobileNetV2(**kwargs)
|
||||
+638
@@ -0,0 +1,638 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import time
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.multiprocessing as mp
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.models as models
|
||||
|
||||
from mobilenet import mobilenet_v2
|
||||
from apex import amp
|
||||
from multi_epochs_dataloader import MultiEpochsDataLoader
|
||||
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
|
||||
# Default total mini-batch size on one node (see --batch-size).
BATCH_SIZE = 4096
OPTIMIZER_BATCH_SIZE = 4096

# Command-line interface of the training script. Each help text states the
# default that is actually configured.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 32)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
                    metavar='N',
                    help='mini-batch size (default: {}), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel'.format(BATCH_SIZE))
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('--workspace', type=str, default='./', metavar='DIR',
                    help='path to directory where checkpoints will be stored')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('-ef', '--eval-freq', default=5, type=int,
                    metavar='N', help='evaluate frequency (default: 5)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('-bm', '--benchmark', default=0, type=int,
                    metavar='N', help='set benchmark status (default: 0; 1: run benchmark)')
parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
parser.add_argument('--addr', default='10.136.181.115', type=str, help='master addr')
parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str,
                    help='checkpoint-nameprefix')
parser.add_argument('--checkpoint-freq', default=0, type=int,
                    metavar='N', help='checkpoint frequency (default: 0); '
                                      '0: save only one file per epoch; '
                                      'n: save diff file per n epoch; '
                                      '-1: no checkpoint, not supported')

# apex
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--loss-scale', default=64., type=float,
                    help='loss scale using in amp (default: 64), -1 means dynamic')
parser.add_argument('--opt-level', default='O2', type=str,
                    help='apex amp optimization level (default: O2)')

# All warnings are silenced for the whole process.
warnings.filterwarnings('ignore')
# Best top-1 validation accuracy seen so far.
best_acc1 = 0
|
||||
|
||||
|
||||
def main():
    """Parse the command line, prepare the environment and launch the worker(s).

    In multi-process mode one ``main_worker`` is spawned per local device;
    otherwise ``main_worker`` is called directly in this process.
    """
    args = parser.parse_args()
    banner = "===============main()================="
    print(banner)
    print(args)
    print(banner)

    os.environ['KERNEL_NAME_ID'] = str(0)
    print("++++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Rendezvous endpoint read by torch.distributed in the workers.
    os.environ.update({'MASTER_ADDR': args.addr, 'MASTER_PORT': '59629'})

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    device_module = torch.npu if args.device == 'npu' else torch.cuda
    ngpus_per_node = device_module.device_count()

    if not args.multiprocessing_distributed:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # Since we have ngpus_per_node processes per node, the total world_size
    # needs to be adjusted accordingly
    args.world_size = ngpus_per_node * args.world_size
    # Use torch.multiprocessing.spawn to launch distributed processes: the
    # main_worker process function. The child process uses the environment
    # variables of the parent process, so KERNEL_NAME_ID has to be set again
    # in every proc. NPU and CUDA are launched the same way.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
|
||||
|
||||
|
||||
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point: set up the device, data and model, then train.

    Args:
        gpu: Device index for this process (also used as the local rank when
            computing the global rank).
        ngpus_per_node: Number of devices per node; the batch size and worker
            count in ``args`` are divided by it.
        args: Parsed command-line namespace. ``gpu``, ``rank``, ``batch_size``
            and ``workers`` are overwritten in place, and ``start_epoch`` is
            overwritten when a checkpoint is resumed.

    NOTE(review): the source this was restored from had lost its indentation;
    the nesting below (in particular checkpointing living inside the
    evaluation branch) was reconstructed from the data flow -- confirm.
    """
    global best_acc1
    args.gpu = gpu

    def log(*parts):
        # Every message carries the device id so interleaved output stays attributable.
        print("[npu id:", args.gpu, "]", *parts)

    log("++++++++++++++++ before set KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
    # Child processes inherit the parent's environment, so each one sets its own id.
    os.environ['KERNEL_NAME_ID'] = str(gpu)
    log("++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])

    if args.gpu is not None:
        log("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # Turn the node rank into a global rank across all processes.
            args.rank = args.rank * ngpus_per_node + gpu

        group_options = dict(backend=args.dist_backend,
                             world_size=args.world_size,
                             rank=args.rank)
        if args.device != 'npu':
            # Only the non-NPU path passes an explicit init_method.
            group_options['init_method'] = args.dist_url
        dist.init_process_group(**group_options)

    loc = 'npu:{}'.format(args.gpu)
    torch.npu.set_device(loc)

    # Split the node-wide batch size and loader workers across the devices.
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    banner = "===============main_worker()================="
    log(banner)
    log(args)
    log(banner)

    # Data loading.
    train_loader, train_loader_len, train_sampler = get_pytorch_train_loader(
        args.data,
        args.batch_size,
        workers=args.workers,
        distributed=args.distributed)
    val_loader = get_pytorch_val_loader(args.data, args.batch_size, args.workers, distributed=False)

    # Model, loss and optimizer.
    log("=> creating model '{}'".format('mobilenetv2'))
    model = mobilenet_v2().to(loc)
    criterion = nn.CrossEntropyLoss().to(loc)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)

    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        validate(val_loader, model, criterion, args, ngpus_per_node)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node)

        # Evaluate every eval_freq epochs and always after the last epoch.
        if (epoch + 1) % args.eval_freq != 0 and epoch != args.epochs - 1:
            continue

        acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # With multiprocessing, only the first process of each node saves,
        # and only after the final epoch.
        if args.multiprocessing_distributed and \
                not (args.rank % ngpus_per_node == 0 and epoch == args.epochs - 1):
            continue

        state = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
        }
        if args.amp:
            state['amp'] = amp.state_dict()
        save_checkpoint(state, is_best)
|
||||
|
||||
|
||||
def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node):
    """Train ``model`` for one epoch and report the measured throughput.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        train_loader_len: Number of steps per epoch; drives the learning-rate
            schedule and the progress display.
        model: Network to train.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer whose learning rate is updated every step.
        epoch: Zero-based index of the current epoch.
        args: Parsed command-line namespace (``gpu``, ``benchmark``, ``amp``,
            ``batch_size``, ``print_freq``, ... are read).
        ngpus_per_node: Number of devices per node.

    With ``args.benchmark == 0`` the optimizer steps on every batch. With
    ``args.benchmark == 1`` gradients are accumulated and the optimizer steps
    once every ``OPTIMIZER_BATCH_SIZE / args.batch_size`` batches.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        train_loader_len,
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    loc = 'npu:{}'.format(args.gpu)

    # Per-channel statistics on a 0-255 scale; normalisation happens on the
    # device after the batch has been cast to float.
    mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
    std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
    mean = mean.to(loc, non_blocking=True)
    std = std.to(loc, non_blocking=True)

    # switch to train mode
    model.train()
    end = time.time()
    if args.benchmark == 1:
        # Accumulation mode: start the epoch from clean gradients.
        optimizer.zero_grad()

    steps_per_epoch = train_loader_len
    print('==========step per epoch======================', steps_per_epoch)

    # Only the first process of each node reports when multiprocessing is on.
    is_reporter = not args.multiprocessing_distributed or args.rank % ngpus_per_node == 0

    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        global_step = epoch * steps_per_epoch + i
        adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)

        target = target.to(torch.int32)
        images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
        target = target.to(loc, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        if args.benchmark == 0:
            optimizer.zero_grad()

        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.benchmark == 0:
            optimizer.step()
        elif args.benchmark == 1:
            # Clamp to 1 so a per-device batch larger than OPTIMIZER_BATCH_SIZE
            # steps every iteration instead of failing on a modulo by zero.
            accumulation_steps = max(1, int(OPTIMIZER_BATCH_SIZE / args.batch_size))
            if (i + 1) % accumulation_steps == 0:
                for param_group in optimizer.param_groups:
                    for param in param_group['params']:
                        # Parameters that received no gradient have grad None.
                        if param.grad is not None:
                            param.grad /= accumulation_steps
                optimizer.step()
                optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and is_reporter:
            progress.display(i)

    if is_reporter:
        # AverageMeter keeps avg at 0 until its warm-up updates have passed, so
        # a very short epoch has no timing yet; report 0 instead of dividing by zero.
        if batch_time.avg:
            fps = ngpus_per_node * args.batch_size / batch_time.avg
        else:
            fps = 0.0
        fps_text = '* FPS@all {:.3f}'.format(fps)
        print("[npu id:", args.gpu, "]", fps_text)
        hwlog.remark_print(key=hwlog.FPS, value=' ' + fps_text)
|
||||
|
||||
|
||||
def validate(val_loader, model, criterion, args, ngpus_per_node):
    """Evaluate ``model`` on ``val_loader`` and return the averaged top-1 accuracy.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches; must
            support ``len()``.
        model: Network to evaluate (switched to eval mode here).
        criterion: Loss function called as ``criterion(output, target)``.
        args: Parsed command-line namespace (``gpu``, ``print_freq``,
            ``multiprocessing_distributed`` and ``rank`` are read).
        ngpus_per_node: Number of devices per node.

    Returns:
        ``top1.avg`` of the Acc@1 meter.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        device = 'npu:{}'.format(args.gpu)
        # Per-channel statistics on a 0-255 scale, applied on the device.
        channel_mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
        channel_std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
        channel_mean = channel_mean.to(device, non_blocking=True)
        channel_std = channel_std.to(device, non_blocking=True)

        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            target = target.to(torch.int32)
            images = images.to(device, non_blocking=True).to(torch.float)
            images = images.sub(channel_mean).div(channel_std)
            target = target.to(device, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            n_samples = images.size(0)
            losses.update(loss.item(), n_samples)
            top1.update(acc1[0], n_samples)
            top5.update(acc5[0], n_samples)

            # measure elapsed time
            batch_time.update(time.time() - tick)
            tick = time.time()

            # Only the first process of each node prints when multiprocessing is on.
            if step % args.print_freq == 0 and \
                    (not args.multiprocessing_distributed or args.rank % ngpus_per_node == 0):
                progress.display(step)

    # TODO: this should also be done with the ProgressMeter
    if not args.multiprocessing_distributed or args.rank % ngpus_per_node == 0:
        summary = '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)
        print("[npu id:", args.gpu, "]", summary)
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    return top1.avg
|
||||
|
||||
|
||||
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and keep a named copy when it is the best.

    Args:
        state: Checkpoint dictionary; ``state['best_acc1']`` and
            ``state['epoch']`` are used to name the best-model copy.
        is_best: When true, ``filename`` is additionally copied to
            ``model_best_acc<acc>_epoch<epoch>.pth.tar`` in the working directory.
        filename: Destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    best_name = 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])
    shutil.copyfile(filename, best_name)
|
||||
|
||||
|
||||
class AverageMeter(object):
    """Stores the latest value and a running average that skips warm-up updates.

    The size ``n`` of the first update after a reset is remembered as
    ``batchsize``. Updates only contribute to ``sum``/``avg`` once ``count``
    exceeds ``start_count_index * batchsize``; until then ``avg`` stays 0.

    Args:
        name: Label used when the meter is rendered.
        fmt: Format spec (including the leading ``:``) applied to ``val``
            and ``avg`` in ``__str__``.
        start_count_index: Number of warm-up updates excluded from the
            average (default 10, the previously hard-coded value).
    """

    def __init__(self, name, fmt=':f', start_count_index=10):
        self.name = name
        self.fmt = fmt
        self.start_count_index = start_count_index
        self.reset()

    def reset(self):
        """Clear all statistics, including the remembered batch size."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        # Defined here so the attribute exists before the first update.
        self.batchsize = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples."""
        if self.count == 0:
            self.batchsize = n

        self.val = val
        self.count += n
        warmup = self.start_count_index * self.batchsize
        if self.count > warmup:
            self.sum += val * n
            # count > warmup here, so the denominator is always positive.
            self.avg = self.sum / (self.count - warmup)

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
|
||||
|
||||
|
||||
class ProgressMeter(object):
    """Prints a progress line built from a prefix, a batch counter and meters.

    Args:
        num_batches: Total number of batches, used to size the counter field.
        meters: Objects whose ``str()`` is appended to each progress line.
        prefix: Text placed before the batch counter.
    """

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the progress line for ``batch`` and log the accuracy values.

        The current "Acc@1"/"Acc@5" values are extracted from the rendered
        meters and sent to ``hwlog`` under the TRAIN_ACCURACY keys. A value is
        only logged when its meter is present, so a ProgressMeter without
        accuracy meters no longer raises IndexError.
        """
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print("[npu id:", os.environ['KERNEL_NAME_ID'], "]", '\t'.join(entries))

        rendered = str(entries)
        for marker, log_key in (("Acc@1", hwlog.TRAIN_ACCURACY_TOP1),
                                ("Acc@5", hwlog.TRAIN_ACCURACY_TOP5)):
            if marker not in rendered:
                continue
            # The first token after the marker is the meter's current value.
            value = rendered.split(marker)[1].strip().split(" ")[0]
            hwlog.remark_print(key=log_key, value=value)

    def _get_batch_fmtstr(self, num_batches):
        """Return a ``'[{:Nd}/total]'`` template sized to ``num_batches``."""
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
|
||||
|
||||
|
||||
# def adjust_learning_rate(optimizer, epoch, args):
|
||||
# """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
|
||||
# lr = args.lr * (0.1 ** (epoch // 30))
|
||||
# for param_group in optimizer.param_groups:
|
||||
# param_group['lr'] = lr
|
||||
|
||||
def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
    """Set a stepwise exponentially decayed learning rate on ``optimizer``.

    The rate is ``args.lr * 0.98 ** k``, where ``k`` is the number of completed
    decay intervals and one interval is ``int(steps_per_epoch * 2.5)`` steps.

    Args:
        optimizer: Object exposing ``param_groups``; each group's ``'lr'``
            entry is overwritten.
        global_step: Zero-based step counter across all epochs.
        steps_per_epoch: Number of optimizer steps in one epoch.
        args: Namespace providing the base learning rate ``args.lr``.

    Returns:
        The learning rate that was applied.
    """
    # Clamp to one step so an empty or tiny epoch cannot cause a division by zero.
    decay_interval = max(1, int(steps_per_epoch * 2.5))
    lr = args.lr * (0.98 ** (global_step // decay_interval))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy, in percent, for each ``k`` in ``topk``.

    Args:
        output: Score tensor; the k highest entries along dimension 1 are
            taken as the predictions for each row.
        target: Tensor of true class indices, one per row of ``output``.
        topk: Values of k to evaluate.

    Returns:
        A list with one single-element tensor per k, holding the percentage of
        rows whose target is among the k highest-scoring classes.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # `correct` is a transposed (non-contiguous) tensor, so view(-1)
            # can fail on the k-row slice; reshape copies when it has to.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
|
||||
|
||||
|
||||
def fast_collate(batch):
    """Collate ``(image, label)`` samples into a uint8 image tensor and int64 labels.

    Args:
        batch: Sequence of ``(image, label)`` pairs. Each image exposes
            ``size`` as ``(width, height)`` and converts via ``np.asarray``;
            the output is sized from the first image.

    Returns:
        ``(images, labels)``: ``images`` has shape ``(len(batch), 3, h, w)``
        and dtype uint8; ``labels`` has dtype int64.
    """
    pictures = [sample[0] for sample in batch]
    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)

    first_size = pictures[0].size
    width = first_size[0]
    height = first_size[1]
    stacked = torch.zeros((len(pictures), 3, height, width), dtype=torch.uint8)

    for index, picture in enumerate(pictures):
        pixels = np.asarray(picture, dtype=np.uint8)
        if pixels.ndim < 3:
            # Single-channel image: add a trailing channel axis.
            pixels = np.expand_dims(pixels, axis=-1)
        # Move the channel axis to the front (H, W, C -> C, H, W).
        pixels = np.rollaxis(pixels, 2)

        # The destination starts at zero, so adding stores the pixels; a
        # single-channel array broadcasts across all three channels.
        stacked[index] += torch.from_numpy(pixels)

    return stacked, labels
|
||||
|
||||
|
||||
def get_pytorch_train_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
    """Build the training data loader over ``<data_path>/train``.

    Args:
        data_path: Dataset root containing a ``train`` sub-directory.
        batch_size: Batch size handed to the loader.
        workers: Number of loader worker processes.
        _worker_init_fn: Optional per-worker initialisation callback.
        distributed: When true, a ``DistributedSampler`` is used and loader
            shuffling is turned off.

    Returns:
        ``(train_loader, len(train_loader), train_sampler)``; the sampler is
        ``None`` when ``distributed`` is false.
    """
    # Only PIL-level augmentation here; the collate function builds the tensors.
    augmentation = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
    ])
    train_dataset = datasets.ImageFolder(os.path.join(data_path, 'train'), augmentation)

    train_sampler = None
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

    train_loader = MultiEpochsDataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        sampler=train_sampler,
        collate_fn=fast_collate,
        drop_last=True)
    return train_loader, len(train_loader), train_sampler
|
||||
|
||||
|
||||
def get_pytorch_val_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
    """Build the validation data loader over ``<data_path>/val``.

    Args:
        data_path: Dataset root containing a ``val`` sub-directory.
        batch_size: Batch size handed to the loader.
        workers: Number of loader worker processes.
        _worker_init_fn: Optional per-worker initialisation callback.
        distributed: When true, a ``DistributedSampler`` is used and loader
            shuffling is turned off.

    Returns:
        The validation loader.
    """
    preprocessing = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
    ])
    val_dataset = datasets.ImageFolder(os.path.join(data_path, 'val'), preprocessing)

    val_sampler = None
    if distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    # NOTE(review): shuffle is enabled whenever no sampler is used, so the
    # validation order is randomised in the non-distributed case -- confirm intended.
    return MultiEpochsDataLoader(
        val_dataset,
        sampler=val_sampler,
        batch_size=batch_size,
        shuffle=(val_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        collate_fn=fast_collate)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Write the environment and run configuration to the benchmark log, then train.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    run_settings = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    log_records = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, run_settings.get("base_lr")),
        (hwlog.DATASET, run_settings.get("dataset")),
        (hwlog.OPT_NAME, run_settings.get("optimizer")),
        (hwlog.LOSS_SCALE, run_settings.get("loss_scale")),
    )
    for record_key, record_value in log_records:
        hwlog.remark_print(key=record_key, value=record_value)

    main()
|
||||
+663
@@ -0,0 +1,663 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import time
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.multiprocessing as mp
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.models as models
|
||||
|
||||
from mobilenet import mobilenet_v2
|
||||
from apex import amp
|
||||
from multi_epochs_dataloader import MultiEpochsDataLoader
|
||||
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
# Default node-wide batch size, and the batch size at which the optimizer steps.
BATCH_SIZE = 6144
OPTIMIZER_BATCH_SIZE = 6144

# Command-line interface. Help texts use argparse's %(default)s placeholder so
# the documented default can never drift from the real one.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('--workspace', type=str, default='./', metavar='DIR',
                    help='path to directory where checkpoints will be stored')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('-ef', '--eval-freq', default=5, type=int,
                    metavar='N', help='evaluate frequency (default: 5)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('-bm', '--benchmark', default=0, type=int,
                    metavar='N', help='set benchmark status (default: %(default)s)')
parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
parser.add_argument('--addr', default='10.136.181.115', type=str, help='master addr')
parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str, help='checkpoint-nameprefix')
parser.add_argument('--checkpoint-freq', default=0, type=int,
                    metavar='N', help='checkpoint frequency (default: %(default)s); '
                                      '0: save a single file each epoch; '
                                      'n: save a different file every n epochs; '
                                      '-1: no checkpoint (not supported)')

parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')

# apex
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--loss-scale', default=64., type=float,
                    help='loss scale used by amp; -1 means dynamic (default: %(default)s)')
parser.add_argument('--opt-level', default='O2', type=str,
                    help='opt level passed to amp.initialize (default: %(default)s)')

# Silence all warnings for the whole process.
warnings.filterwarnings('ignore')
# Best top-1 accuracy seen so far; updated by main_worker.
best_acc1 = 0
|
||||
|
||||
|
||||
def device_id_to_process_device_map(device_list):
    """Map process indices 0..n-1 to the device ids in ``device_list``, ascending.

    Args:
        device_list: Comma-separated device ids, e.g. ``"0,1,2,3"``.

    Returns:
        A dict whose key ``i`` maps to the i-th smallest device id.
    """
    ordered_ids = sorted(int(token) for token in device_list.split(","))
    return dict(enumerate(ordered_ids))
|
||||
|
||||
|
||||
def main():
    """Parse the command line, prepare the environment and launch the worker(s).

    With ``--multiprocessing-distributed`` one ``main_worker`` process is
    spawned per device; otherwise ``main_worker`` runs in this process.
    """
    args = parser.parse_args()
    banner = "===============main()================="
    print(banner)
    print(args)
    print(banner)

    # Spawned children inherit this value and overwrite it in main_worker.
    os.environ['KERNEL_NAME_ID'] = str(0)
    print("++++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Master address and port for the process group.
    os.environ['MASTER_ADDR'] = args.addr
    os.environ['MASTER_PORT'] = '59629'

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    args.process_device_map = device_id_to_process_device_map(args.device_list)

    # On NPU the device count comes from --device-list rather than a runtime query.
    if args.device == 'npu':
        ngpus_per_node = len(args.process_device_map)
    else:
        ngpus_per_node = torch.cuda.device_count()

    if not args.multiprocessing_distributed:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # One process per device, so the total world size is scaled accordingly.
    args.world_size = ngpus_per_node * args.world_size
    # The launch call is the same for NPU and non-NPU devices.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
|
||||
|
||||
|
||||
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point: set up the device, data and model, then train.

    Args:
        gpu: Process index on this node. It is translated to a device id via
            ``args.process_device_map`` and used as the local rank when
            computing the global rank.
        ngpus_per_node: Number of devices per node; the batch size and worker
            count in ``args`` are divided by it.
        args: Parsed command-line namespace. ``gpu``, ``rank``, ``batch_size``
            and ``workers`` are overwritten in place, and ``start_epoch`` is
            overwritten when a checkpoint is resumed.

    NOTE(review): the source this was restored from had lost its indentation;
    the nesting below (in particular checkpointing living inside the
    evaluation branch) was reconstructed from the data flow -- confirm.
    """
    global best_acc1
    # Translate the process index into the device id it should drive.
    args.gpu = args.process_device_map[gpu]

    def log(*parts):
        # Every message carries the device id so interleaved output stays attributable.
        print("[npu id:", args.gpu, "]", *parts)

    log("++++++++++++++++ before set KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
    # Child processes inherit the parent's environment, so each one sets its own id.
    os.environ['KERNEL_NAME_ID'] = str(gpu)
    log("++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])

    if args.gpu is not None:
        log("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # Turn the node rank into a global rank across all processes.
            args.rank = args.rank * ngpus_per_node + gpu

        group_options = dict(backend=args.dist_backend,
                             world_size=args.world_size,
                             rank=args.rank)
        if args.device != 'npu':
            # Only the non-NPU path passes an explicit init_method.
            group_options['init_method'] = args.dist_url
        dist.init_process_group(**group_options)

    loc = 'npu:{}'.format(args.gpu)
    torch.npu.set_device(loc)

    # Split the node-wide batch size and loader workers across the devices.
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    banner = "===============main_worker()================="
    log(banner)
    log(args)
    log(banner)

    # Data loading.
    train_loader, train_loader_len, train_sampler = get_pytorch_train_loader(
        args.data,
        args.batch_size,
        workers=args.workers,
        distributed=args.distributed)
    val_loader = get_pytorch_val_loader(args.data, args.batch_size, args.workers, distributed=False)

    # Model, loss and optimizer.
    log("=> creating model '{}'".format('mobilenetv2'))
    model = mobilenet_v2().to(loc)
    criterion = nn.CrossEntropyLoss().to(loc)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)

    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        validate(val_loader, model, criterion, args, ngpus_per_node)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node)

        # Evaluate every eval_freq epochs and always after the last epoch.
        if (epoch + 1) % args.eval_freq != 0 and epoch != args.epochs - 1:
            continue

        acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # With multiprocessing, only the first process of each node saves,
        # and only after the final epoch.
        if args.multiprocessing_distributed and \
                not (args.rank % ngpus_per_node == 0 and epoch == args.epochs - 1):
            continue

        state = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
        }
        if args.amp:
            state['amp'] = amp.state_dict()
        save_checkpoint(state, is_best)
|
||||
|
||||
|
||||
def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node):
    """Run one training epoch on the NPU selected by ``args.gpu``.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        train_loader_len: Number of steps per epoch; used for the progress
            display and to derive the global step fed to the LR schedule.
        model: Network to train; it is switched to train mode here.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer that is stepped and whose learning rate is
            updated every iteration via ``adjust_learning_rate``.
        epoch: Index of the current epoch.
        args: Parsed options. Reads ``gpu``, ``benchmark``, ``amp``,
            ``batch_size``, ``print_freq``, ``multiprocessing_distributed``
            and ``rank``.
        ngpus_per_node: Devices per node; used for rank filtering of the
            log output and for the FPS figure.

    With ``args.benchmark == 0`` gradients are zeroed and applied on every
    iteration. With ``args.benchmark == 1`` gradients are accumulated over
    ``int(OPTIMIZER_BATCH_SIZE / args.batch_size)`` iterations, averaged and
    then applied (``OPTIMIZER_BATCH_SIZE`` is a module-level name that is not
    defined in this function).
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        train_loader_len,
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    loc = 'npu:{}'.format(args.gpu)

    # Per-channel normalisation constants scaled by 255: the loader delivers
    # raw uint8 pixels, so normalisation is done on the device.
    mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
    std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
    mean = mean.to(loc, non_blocking=True)
    std = std.to(loc, non_blocking=True)

    # Only one process per node reports when running multiprocessing-distributed.
    is_log_rank = (not args.multiprocessing_distributed
                   or args.rank % ngpus_per_node == 0)

    # switch to train mode
    model.train()
    end = time.time()
    if args.benchmark == 1:
        optimizer.zero_grad()

    steps_per_epoch = train_loader_len
    print('==========step per epoch======================', steps_per_epoch)
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # The learning rate is scheduled per step, not per epoch.
        global_step = epoch * steps_per_epoch + i
        adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)

        target = target.to(torch.int32)
        images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
        target = target.to(loc, non_blocking=True)

        # compute output
        output = model(images)
        torch.npu.current_stream().synchronize()

        loss = criterion(output, target)
        torch.npu.current_stream().synchronize()

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        if args.benchmark == 0:
            optimizer.zero_grad()

        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        torch.npu.current_stream().synchronize()

        if args.benchmark == 0:
            optimizer.step()
        elif args.benchmark == 1:
            BATCH_SIZE_multiplier = int(OPTIMIZER_BATCH_SIZE / args.batch_size)
            BM_optimizer_step = ((i + 1) % BATCH_SIZE_multiplier) == 0
            if BM_optimizer_step:
                # Average the accumulated gradients before applying them.
                for param_group in optimizer.param_groups:
                    for param in param_group['params']:
                        # Parameters that received no gradient have grad=None.
                        if param.grad is not None:
                            param.grad /= BATCH_SIZE_multiplier
                optimizer.step()
                optimizer.zero_grad()
        torch.npu.current_stream().synchronize()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0 and is_log_rank:
            progress.display(i)

    if is_log_rank:
        # AverageMeter keeps avg at 0 until its warm-up updates have passed,
        # so a very short epoch would otherwise divide by zero here.
        avg_step_time = batch_time.avg
        fps = ngpus_per_node * args.batch_size / avg_step_time if avg_step_time else 0.0
        print("[npu id:", args.gpu, "]", '* FPS@all {:.3f}'.format(fps))
        hwlog.remark_print(key=hwlog.FPS, value=' * FPS@all {:.3f}'.format(fps))
|
||||
|
||||
|
||||
def validate(val_loader, model, criterion, args, ngpus_per_node):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches; its
            ``len`` sizes the progress display.
        model: Network to evaluate; it is switched to eval mode here.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        args: Parsed options. Reads ``gpu``, ``print_freq``,
            ``multiprocessing_distributed`` and ``rank``.
        ngpus_per_node: Devices per node, used for rank filtering of output.

    Returns:
        ``top1.avg`` of the Acc@1 meter accumulated over the loader.
    """
    timer = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [timer, loss_meter, top1, top5],
        prefix='Test: ')

    def _should_report():
        # Evaluated lazily at each use, exactly like the inline condition.
        return not args.multiprocessing_distributed or \
            (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0)

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        loc = 'npu:{}'.format(args.gpu)
        # Normalisation constants for raw 0-255 pixel values, moved to the device.
        mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
        std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
        mean = mean.to(loc, non_blocking=True)
        std = std.to(loc, non_blocking=True)

        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            target = target.to(torch.int32)
            images = images.to(loc, non_blocking=True)
            images = images.to(torch.float).sub(mean).div(std)
            target = target.to(loc, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            batch_n = images.size(0)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            loss_meter.update(loss.item(), batch_n)
            top1.update(acc1[0], batch_n)
            top5.update(acc5[0], batch_n)

            # measure elapsed time
            now = time.time()
            timer.update(now - tick)
            tick = time.time()

            if step % args.print_freq == 0 and _should_report():
                progress.display(step)

        # TODO: this should also be done with the ProgressMeter
        if _should_report():
            summary = '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)
            print("[npu id:", args.gpu, "]", summary)
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    return top1.avg
|
||||
|
||||
|
||||
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename`` and keep a named copy of the best one.

    Args:
        state: Checkpoint mapping passed to ``torch.save``; when ``is_best``
            is true its ``'best_acc1'`` and ``'epoch'`` entries name the copy.
        is_best: Whether this checkpoint should also be copied to a
            ``model_best_acc..._epoch....pth.tar`` file.
        filename: Destination path of the regular checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    best_name = 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])
    shutil.copyfile(filename, best_name)
|
||||
|
||||
|
||||
class AverageMeter(object):
    """Track the latest value and a running average that skips a warm-up phase.

    The first ``start_count_index`` batches (measured in units of the ``n``
    given to the first ``update`` call) are excluded from ``sum`` and ``avg``.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        # Number of leading batches left out of the average.
        self.start_count_index = 10
        self.reset()

    def reset(self):
        """Clear the latest value, average, sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples."""
        if self.count == 0:
            # The first call fixes the batch size used for the warm-up threshold.
            self.batchsize = n

        self.val = val
        self.count += n

        skipped = self.start_count_index * self.batchsize
        if self.count <= skipped:
            return
        self.sum += val * n
        self.avg = self.sum / (self.count - skipped)

    def __str__(self):
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))
|
||||
|
||||
|
||||
class ProgressMeter(object):
    """Print a one-line progress report assembled from a list of meters."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the report for step ``batch`` and log the Acc@1/Acc@5 values.

        Reads the ``KERNEL_NAME_ID`` environment variable for the printed id,
        and expects the meters to include ones rendered as ``Acc@1`` and
        ``Acc@5``: their current values are cut out of the rendered text.
        """
        parts = [self.prefix + self.batch_fmtstr.format(batch)]
        parts.extend(str(meter) for meter in self.meters)
        print("[npu id:", os.environ['KERNEL_NAME_ID'], "]", '\t'.join(parts))

        # The first space-delimited token after each label is the meter's
        # current value.
        rendered = str(parts)
        train_acc1 = rendered.split("Acc@1")[1].strip().split(" ")[0]
        train_acc5 = rendered.split("Acc@5")[1].strip().split(" ")[0]
        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    def _get_batch_fmtstr(self, num_batches):
        """Build a ``[  i/total]`` template padded to the width of ``num_batches``."""
        width = len(str(num_batches // 1))
        slot = '{{:{}d}}'.format(width)
        return ''.join(('[', slot, '/', slot.format(num_batches), ']'))
|
||||
|
||||
|
||||
# def adjust_learning_rate(optimizer, epoch, args):
|
||||
# """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
|
||||
# lr = args.lr * (0.1 ** (epoch // 30))
|
||||
# for param_group in optimizer.param_groups:
|
||||
# param_group['lr'] = lr
|
||||
|
||||
# def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
|
||||
# """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
|
||||
# # lr = args.lr * (0.98 ** (epoch / 2.5))
|
||||
# lr = args.lr * (0.98 ** (global_step // int(steps_per_epoch * 2.5)))
|
||||
# for param_group in optimizer.param_groups:
|
||||
# param_group['lr'] = lr
|
||||
# return lr
|
||||
|
||||
def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
    """Apply a stepwise exponential decay to the optimizer's learning rate.

    The rate is ``args.lr * 0.98 ** k`` where ``k`` is the number of completed
    intervals of ``int(steps_per_epoch * 2.5)`` steps, i.e. it is multiplied
    by 0.98 every 2.5 epochs.

    Args:
        optimizer: Object exposing ``param_groups``; each group's ``'lr'``
            entry is overwritten.
        global_step: Number of training steps taken so far across all epochs.
        steps_per_epoch: Number of steps in one epoch.
        args: Options object providing the base rate ``lr``.

    Returns:
        The learning rate that was written to every parameter group.
    """
    # Clamp to one step so a tiny (or empty) epoch cannot cause a division by zero.
    decay_interval = max(1, int(steps_per_epoch * 2.5))
    lr = args.lr * (0.98 ** (global_step // decay_interval))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy percentages for each requested ``k``.

    Args:
        output: Score tensor indexed as ``(sample, class)``; ``topk`` is
            taken along dimension 1.
        target: Tensor of class indices with one entry per sample.
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list with one single-element float tensor per ``k``, holding the
        percentage of samples whose target is among the ``k`` highest scores.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # `correct` comes from a transposed tensor and may be
            # non-contiguous, in which case view(-1) raises; reshape copies
            # only when it has to.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
|
||||
|
||||
|
||||
def fast_collate(batch):
    """Collate ``(image, label)`` pairs into a uint8 image tensor and int64 labels.

    Each image must expose ``size`` as ``(width, height)`` and be convertible
    with ``np.asarray`` (presumably PIL images — confirm against the dataset).
    The output is allocated from the first image's size, so all images in the
    batch are expected to share it. Images with fewer than three dimensions
    gain a trailing channel axis and are broadcast over the three channels.

    Returns:
        ``(tensor, targets)`` where ``tensor`` has shape
        ``(len(batch), 3, height, width)`` and dtype ``torch.uint8``.
    """
    pictures = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)

    width, height = pictures[0].size[0], pictures[0].size[1]
    tensor = torch.zeros((len(pictures), 3, height, width), dtype=torch.uint8)

    for idx, picture in enumerate(pictures):
        pixels = np.asarray(picture, dtype=np.uint8)
        if pixels.ndim < 3:
            pixels = pixels[..., np.newaxis]
        # Channel-last -> channel-first.
        pixels = np.moveaxis(pixels, 2, 0)

        tensor[idx] += torch.from_numpy(pixels)

    return tensor, targets
|
||||
|
||||
|
||||
def get_pytorch_train_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
    """Build the training loader over ``<data_path>/train``.

    Images are randomly resized/cropped to 224 and randomly flipped; no
    tensor conversion happens in the transform because ``fast_collate``
    assembles the batch tensor.

    Args:
        data_path: Dataset root containing a ``train`` sub-directory.
        batch_size: Batch size handed to the loader.
        workers: Number of loader worker processes.
        _worker_init_fn: Optional per-worker initialisation callable.
        distributed: When true, a ``DistributedSampler`` is used instead of
            loader-side shuffling.

    Returns:
        ``(train_loader, len(train_loader), train_sampler)``; the sampler is
        ``None`` when ``distributed`` is false.
    """
    augmentations = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
    ])
    train_dataset = datasets.ImageFolder(os.path.join(data_path, 'train'), augmentations)

    train_sampler = None
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

    # MultiEpochsDataLoader stands in for torch.utils.data.DataLoader.
    train_loader = MultiEpochsDataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        sampler=train_sampler,
        collate_fn=fast_collate,
        drop_last=True,
    )
    return train_loader, len(train_loader), train_sampler
|
||||
|
||||
|
||||
def get_pytorch_val_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
    """Build the validation loader over ``<data_path>/val``.

    Images are resized to 256 and centre-cropped to 224; batches are
    assembled by ``fast_collate``.

    Args:
        data_path: Dataset root containing a ``val`` sub-directory.
        batch_size: Batch size handed to the loader.
        workers: Number of loader worker processes.
        _worker_init_fn: Optional per-worker initialisation callable.
        distributed: When true, a ``DistributedSampler`` is used; otherwise
            the loader is created with ``shuffle=True``.

    Returns:
        The validation data loader.
    """
    preprocessing = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
    ])
    val_dataset = datasets.ImageFolder(os.path.join(data_path, 'val'), preprocessing)

    val_sampler = None
    if distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    # MultiEpochsDataLoader stands in for torch.utils.data.DataLoader.
    loader_kwargs = dict(
        sampler=val_sampler,
        batch_size=batch_size,
        shuffle=val_sampler is None,
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        collate_fn=fast_collate,
    )
    val_loader = MultiEpochsDataLoader(val_dataset, **loader_kwargs)

    return val_loader
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Point the benchmark logger at the directory containing this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Emit the environment and hyper-parameter remarks, in this order,
    # before training starts.
    startup_remarks = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initinal_data.get("base_lr")),
        (hwlog.DATASET, initinal_data.get("dataset")),
        (hwlog.OPT_NAME, initinal_data.get("optimizer")),
        (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
    )
    for remark_key, remark_value in startup_remarks:
        hwlog.remark_print(key=remark_key, value=remark_value)

    main()
|
||||
+31
@@ -0,0 +1,31 @@
|
||||
import torch
|
||||
|
||||
|
||||
class MultiEpochsDataLoader(torch.utils.data.DataLoader):
    """DataLoader that creates its iterator once and reuses it on every pass.

    The batch sampler is wrapped in ``_RepeatSampler`` so the underlying
    iterator never runs dry; ``__iter__`` hands out ``len(self)`` batches
    from that shared iterator each time the loader is iterated.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): the name-mangled flag is cleared around the
        # assignment, presumably to get past DataLoader's guard against
        # changing batch_sampler after construction — confirm against the
        # installed torch version.
        self._DataLoader__initialized = False
        self.batch_sampler = _RepeatSampler(self.batch_sampler)
        self._DataLoader__initialized = True
        # Single long-lived iterator shared by all passes over this loader.
        self.iterator = super().__iter__()

    def __len__(self):
        # batch_sampler is the _RepeatSampler wrapper; its .sampler is the
        # batch sampler it wraps, so this is the length of one pass.
        return len(self.batch_sampler.sampler)

    def __iter__(self):
        # Draw exactly one pass worth of batches from the shared iterator.
        for _ in range(len(self)):
            yield next(self.iterator)
|
||||
|
||||
|
||||
class _RepeatSampler(object):
|
||||
""" Sampler that repeats forever.
|
||||
Args:
|
||||
sampler (Sampler)
|
||||
"""
|
||||
|
||||
def __init__(self, sampler):
|
||||
self.sampler = sampler
|
||||
|
||||
def __iter__(self):
|
||||
while True:
|
||||
yield from iter(self.sampler)
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"startCfg":
|
||||
[
|
||||
{
|
||||
"jobID": "123456789",
|
||||
"deviceID": ["0"],
|
||||
"features":
|
||||
[
|
||||
{
|
||||
"name": "task_trace"
|
||||
},
|
||||
{
|
||||
"name": "training_trace"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
+19
@@ -0,0 +1,19 @@
|
||||
# Resume single-device (1p) training on NPU 0 from checkpoint.pth.tar.
# NOTE(review): the entry point and --arch name densenet121 although the
# surrounding commit is for MobileNetV2 — confirm this script belongs here.
source set_env_b023.sh

# Presumably sets the log level of device 0 to "error" through adc — TODO confirm.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"

export SLOG_PRINT_TO_STDOUT=0
export TASK_QUEUE_ENABLE=0
# Pin the job to CPUs 1-40, detach it from the terminal and write stdout to
# output_1p.log. Every continuation backslash is preceded by a space so the
# options stay separate words regardless of the next line's indentation.
nohup taskset -c 1-40 python3.7 densenet121_1p_main.py \
    --workers 40 \
    --arch densenet121 \
    --npu 0 \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --print-freq 1 \
    --eval-freq 5 \
    --batch-size 256 \
    --epoch 45 \
    --resume checkpoint.pth.tar \
    --data /home/dataset/imagenet > output_1p.log &
|
||||
+27
@@ -0,0 +1,27 @@
|
||||
# Resume multi-process (8p) training from checkpoint.pth.tar.
# NOTE(review): the entry point and --arch name densenet121 although the
# surrounding commit is for MobileNetV2 — confirm this script belongs here.
source set_env_b023.sh

# Presumably sets the log level of devices 0 and 4 to "error" through adc — TODO confirm.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 4"

export SLOG_PRINT_TO_STDOUT=0
export TASK_QUEUE_ENABLE=0
# Run detached and write stdout to resume_8p.log. --addr is a hard-coded host
# address and must match the machine the job runs on. Every continuation
# backslash is preceded by a space so the options stay separate words.
nohup python3.7 ./densenet121_8p_main.py \
    --addr='10.246.246.57' \
    --seed 49 \
    --workers 80 \
    --lr 0.8 \
    --print-freq 1 \
    --eval-freq 5 \
    --arch densenet121 \
    --dist-url 'tcp://127.0.0.1:50000' \
    --dist-backend 'hccl' \
    --multiprocessing-distributed \
    --world-size 1 \
    --batch-size 2048 \
    --epochs 45 \
    --rank 0 \
    --amp \
    --benchmark 0 \
    --resume checkpoint.pth.tar \
    --data /train/imagenet > resume_8p.log &
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
# Start single-device (1p) training on NPU 0 from scratch.
# NOTE(review): the entry point and --arch name densenet121 although the
# surrounding commit is for MobileNetV2 — confirm this script belongs here.
source set_env_b023.sh

# Presumably sets the log level of device 0 to "error" through adc — TODO confirm.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"

export SLOG_PRINT_TO_STDOUT=0
export TASK_QUEUE_ENABLE=0
# Pin the job to CPUs 1-40, detach it from the terminal and write stdout to
# output_1p.log. Every continuation backslash is preceded by a space so the
# options stay separate words regardless of the next line's indentation.
nohup taskset -c 1-40 python3.7 densenet121_1p_main.py \
    --workers 40 \
    --arch densenet121 \
    --npu 0 \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --print-freq 1 \
    --eval-freq 5 \
    --batch-size 256 \
    --epoch 90 \
    --data /opt/npu/dataset/imagenet > output_1p.log &
|
||||
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
# Start multi-process (8p) MobileNetV2 training.
source set_env_b023.sh

# Presumably sets the log level of devices 0 and 4 to "error" through adc — TODO confirm.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 4"

export SLOG_PRINT_TO_STDOUT=0
export TASK_QUEUE_ENABLE=0
# Run detached and write stdout to output_8p.log. --addr is a hard-coded host
# address and must match the machine the job runs on. Every continuation
# backslash is preceded by a space so the options stay separate words.
nohup python3.7 ./mobilenetv2_8p_main.py \
    --addr='10.246.246.76' \
    --seed 49 \
    --workers 80 \
    --lr 0.24 \
    --print-freq 1 \
    --eval-freq 5 \
    --dist-url 'tcp://127.0.0.1:50002' \
    --dist-backend 'hccl' \
    --multiprocessing-distributed \
    --world-size 1 \
    --batch-size 6144 \
    --epochs 600 \
    --rank 0 \
    --amp \
    --benchmark 0 \
    --data /opt/npu/dataset/imagenet > output_8p.log &
|
||||
|
||||
+17
@@ -0,0 +1,17 @@
|
||||
# Environment setup for running on Ascend NPUs; meant to be sourced.
# Two alternative layouts are listed; only the nnae one is active, the
# toolkit one is kept commented out for reference.

############## toolkit situation ################
#export ASCEND_HOME=/usr/local/Ascend
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

############## nnae situation ################
export ASCEND_HOME=/usr/local/Ascend
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/local/python3.7.5/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# NOTE: PYTHONPATH is overwritten here, not appended to; any value set by the caller is dropped.
export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/

# Optional one-off installation of the topi/te wheels shipped with nnae.
# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/topi-0.4.0-py3-none-any.whl
# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/te-0.4.0-py3-none-any.whl
|
||||
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
# Environment setup for running on Ascend NPUs; meant to be sourced.
# Two alternative layouts are listed; only the toolkit one is active, the
# nnae one is kept commented out for reference.

############## toolkit situation ################
export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH

############## nnae situation ################
# export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
# export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
# export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
# export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
# export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH

# Optional one-off step: expose the toolkit's adc binary on PATH (the launch scripts invoke `adc`).
# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/
|
||||
Reference in New Issue
Block a user