[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#from . import logger
#from . import dataloaders
#from . import training
#from . import utils
#from . import mixup
#from . import resnet
#from . import smoothing
@@ -0,0 +1,369 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
# Names of the data backends this module offers. The DALI entries would only be
# appended by the optional import block that is currently commented out below.
# NOTE: 'syntetic' (sic) is the literal runtime value and matches the naming of
# the synthetic loader helpers in this file; it is not corrected here.
DATA_BACKEND_CHOICES = ['pytorch', 'syntetic']
# try:
# from nvidia.dali.plugin.pytorch import DALIClassificationIterator
# from nvidia.dali.pipeline import Pipeline
# import nvidia.dali.ops as ops
# import nvidia.dali.types as types
# DATA_BACKEND_CHOICES.append('dali-gpu')
# DATA_BACKEND_CHOICES.append('dali-cpu')
# except ImportError:
# print("Please install DALI from https://www.github.com/NVIDIA/DALI to run this example.")
def load_jpeg_from_file(path, cuda=True, fp16=False):
    """Load one image file and return it as a normalized single-image batch.

    The image goes through ``Resize(256)``, ``CenterCrop(224)`` and
    ``ToTensor()``, then the per-channel mean is subtracted and the result is
    divided by the per-channel std.

    Args:
        path: Path of the image file to open with PIL.
        cuda: If true, move the image and the normalization constants to the
            current CUDA device before normalizing.
        fp16: If true, normalize and return in half precision, otherwise in
            single precision.

    Returns:
        A tensor with a leading batch dimension of size 1.
    """
    img_transforms = transforms.Compose(
        [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
    )

    # Use a context manager so the file handle is released deterministically,
    # and force three channels: grayscale/RGBA/palette inputs would otherwise
    # fail in the in-place normalization against the 3-channel mean/std.
    with Image.open(path) as image_file:
        img = img_transforms(image_file.convert("RGB"))

    with torch.no_grad():
        # mean and std are not multiplied by 255 as they are in training script
        # torch dataloader reads data into bytes whereas loading directly
        # through PIL creates a tensor with floats in [0,1] range
        mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
        std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

        if cuda:
            mean = mean.cuda()
            std = std.cuda()
            img = img.cuda()

        if fp16:
            mean = mean.half()
            std = std.half()
            img = img.half()
        else:
            img = img.float()

        batch = img.unsqueeze(0).sub_(mean).div_(std)

    return batch
# class HybridTrainPipe(Pipeline):
# def __init__(self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False):
# super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id)
# if torch.distributed.is_initialized():
# rank = torch.distributed.get_rank()
# world_size = torch.distributed.get_world_size()
# else:
# rank = 0
# world_size = 1
# self.input = ops.FileReader(
# file_root = data_dir,
# shard_id = rank,
# num_shards = world_size,
# random_shuffle = True)
# if dali_cpu:
# dali_device = "cpu"
# self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
# else:
# dali_device = "gpu"
# # This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
# # without additional reallocations
# self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB, device_memory_padding=211025920, host_memory_padding=140544512)
# self.res = ops.RandomResizedCrop(
# device=dali_device,
# size=[crop, crop],
# interp_type=types.INTERP_LINEAR,
# random_aspect_ratio=[0.75, 4./3.],
# random_area=[0.08, 1.0],
# num_attempts=100)
# self.cmnp = ops.CropMirrorNormalize(device = "gpu",
# output_dtype = types.FLOAT,
# output_layout = types.NCHW,
# crop = (crop, crop),
# image_type = types.RGB,
# mean = [0.485 * 255,0.456 * 255,0.406 * 255],
# std = [0.229 * 255,0.224 * 255,0.225 * 255])
# self.coin = ops.CoinFlip(probability = 0.5)
# def define_graph(self):
# rng = self.coin()
# self.jpegs, self.labels = self.input(name = "Reader")
# images = self.decode(self.jpegs)
# images = self.res(images)
# output = self.cmnp(images.gpu(), mirror = rng)
# return [output, self.labels]
# class HybridValPipe(Pipeline):
# def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size):
# super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id)
# if torch.distributed.is_initialized():
# rank = torch.distributed.get_rank()
# world_size = torch.distributed.get_world_size()
# else:
# rank = 0
# world_size = 1
# self.input = ops.FileReader(
# file_root = data_dir,
# shard_id = rank,
# num_shards = world_size,
# random_shuffle = False)
# self.decode = ops.ImageDecoder(device = "mixed", output_type = types.RGB)
# self.res = ops.Resize(device = "gpu", resize_shorter = size)
# self.cmnp = ops.CropMirrorNormalize(device = "gpu",
# output_dtype = types.FLOAT,
# output_layout = types.NCHW,
# crop = (crop, crop),
# image_type = types.RGB,
# mean = [0.485 * 255,0.456 * 255,0.406 * 255],
# std = [0.229 * 255,0.224 * 255,0.225 * 255])
# def define_graph(self):
# self.jpegs, self.labels = self.input(name = "Reader")
# images = self.decode(self.jpegs)
# images = self.res(images)
# output = self.cmnp(images)
# return [output, self.labels]
class DALIWrapper(object):
    """Iterable adapter that turns a DALI iterator into (input, target) pairs."""

    def gen_wrapper(dalipipeline, num_classes, one_hot):
        """Yield (input, target) for every batch, then reset the pipeline.

        Called through the class (``DALIWrapper.gen_wrapper(...)``), so it
        takes no ``self``.
        """
        for batch in dalipipeline:
            images = batch[0]["data"]
            # Flatten the labels, move them to the GPU and make them int64.
            labels = torch.reshape(batch[0]["label"], [-1]).cuda().long()
            if one_hot:
                labels = expand(num_classes, torch.float, labels)
            yield images, labels
        # Runs once the pipeline is exhausted so it can be iterated again.
        dalipipeline.reset()

    def __init__(self, dalipipeline, num_classes, one_hot):
        """Store the wrapped iterator and the target-encoding options."""
        self.dalipipeline = dalipipeline
        self.num_classes = num_classes
        self.one_hot = one_hot

    def __iter__(self):
        """Return a fresh generator over the wrapped pipeline."""
        return DALIWrapper.gen_wrapper(
            self.dalipipeline, self.num_classes, self.one_hot
        )
def get_dali_train_loader(dali_cpu=False):
    """Return the factory function for the DALI training loader.

    The real DALI implementation is disabled (kept below as a comment for
    reference); the returned stub ``gdtl`` accepts the usual loader arguments
    and always returns ``False``.

    Args:
        dali_cpu: Unused by the stub; only the disabled implementation read it.

    Returns:
        The ``gdtl`` callable.
    """
    # def gdtl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
    #     if torch.distributed.is_initialized():
    #         rank = torch.distributed.get_rank()
    #         world_size = torch.distributed.get_world_size()
    #     else:
    #         rank = 0
    #         world_size = 1
    #     traindir = os.path.join(data_path, 'train')
    #     pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
    #                            device_id = rank % torch.cuda.device_count(),
    #                            data_dir = traindir, crop = 224, dali_cpu=dali_cpu)
    #     pipe.build()
    #     train_loader = DALIClassificationIterator(pipe, size = int(pipe.epoch_size("Reader") / world_size))
    #     return DALIWrapper(train_loader, num_classes, one_hot), int(pipe.epoch_size("Reader") / (world_size * batch_size))
    # return gdtl
    def gdtl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
        return False

    # The original returned ``gdvl``, a name that does not exist in this scope,
    # so every call raised NameError. Return the stub defined just above.
    return gdtl
def get_dali_val_loader():
    """Return the factory function for the DALI validation loader.

    The DALI-backed implementation is disabled and kept below as a comment;
    the stub that is returned ignores its arguments and yields ``False``.
    """
    # def gdvl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
    #     if torch.distributed.is_initialized():
    #         rank = torch.distributed.get_rank()
    #         world_size = torch.distributed.get_world_size()
    #     else:
    #         rank = 0
    #         world_size = 1
    #     valdir = os.path.join(data_path, 'val')
    #     pipe = HybridValPipe(batch_size=batch_size, num_threads=workers,
    #                          device_id = rank % torch.cuda.device_count(),
    #                          data_dir = valdir,
    #                          crop = 224, size = 256)
    #     pipe.build()
    #     val_loader = DALIClassificationIterator(pipe, size = int(pipe.epoch_size("Reader") / world_size))
    #     return DALIWrapper(val_loader, num_classes, one_hot), int(pipe.epoch_size("Reader") / (world_size * batch_size))
    # return gdvl

    def gdvl(
        data_path,
        batch_size,
        num_classes,
        one_hot,
        workers=5,
        _worker_init_fn=None,
        fp16=False,
    ):
        # Placeholder result while the DALI backend is switched off.
        return False

    return gdvl
def fast_collate(batch):
    """Collate (image, label) samples into a uint8 image batch and int64 labels.

    Args:
        batch: Sequence of ``(image, label)`` pairs. Each image must expose a
            ``size`` whose first two entries are (width, height) and be
            convertible with ``np.asarray``; the batch is sized from the first
            image.

    Returns:
        ``(tensor, targets)`` where ``tensor`` is a uint8 tensor of shape
        ``(len(batch), 3, height, width)`` and ``targets`` is an int64 tensor
        of the labels.
    """
    images = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
    width = images[0].size[0]
    height = images[0].size[1]
    tensor = torch.zeros((len(images), 3, height, width), dtype=torch.uint8)
    for i, img in enumerate(images):
        pixels = np.asarray(img, dtype=np.uint8)
        if pixels.ndim < 3:
            # Two-dimensional (single-channel) image: add a trailing channel
            # axis; the addition below broadcasts it over the three channels.
            pixels = np.expand_dims(pixels, axis=-1)
        # Move the channel axis to the front (H, W, C) -> (C, H, W).
        pixels = np.rollaxis(pixels, 2)
        tensor[i] += torch.from_numpy(pixels)
    return tensor, targets
def expand(num_classes, dtype, tensor):
    """One-hot encode a 1-D tensor of class indices.

    Args:
        num_classes: Width of the one-hot encoding.
        dtype: dtype of the returned tensor.
        tensor: 1-D integer tensor of class indices.

    Returns:
        A ``(tensor.size(0), num_classes)`` tensor of ``dtype`` with 1.0 at
        each row's class index and 0 elsewhere, on the same device as
        ``tensor``.
    """
    # Allocate on the index tensor's own device instead of a hard-coded 'cuda':
    # scatter needs both operands on one device, so this is identical for the
    # existing CUDA callers and additionally works on other devices.
    one_hot = torch.zeros(
        tensor.size(0), num_classes, dtype=dtype, device=tensor.device
    )
    return one_hot.scatter(1, tensor.unsqueeze(1), 1.0)
class PrefetchedWrapper(object):
    """Iterable that moves and normalizes batches on a side CUDA stream."""

    def prefetched_loader(loader, num_classes, fp16, one_hot):
        """Yield normalized (input, target) batches, one step behind ``loader``.

        Each batch is copied to the GPU, cast, optionally one-hot encoded and
        normalized inside a dedicated CUDA stream; the previously prepared
        batch is yielded before the current stream waits on that work.
        Called through the class, so it takes no ``self``.
        """
        # Constants are scaled by 255 (presumably because the collated inputs
        # are raw byte images -- confirm against the collate function in use).
        mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1, 3, 1, 1)
        std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1, 3, 1, 1)
        if fp16:
            mean = mean.half()
            std = std.half()
        target_dtype = torch.half if fp16 else torch.float

        side_stream = torch.cuda.Stream()
        have_pending = False

        for next_input, next_target in loader:
            with torch.cuda.stream(side_stream):
                next_input = next_input.cuda(non_blocking=True)
                next_target = next_target.cuda(non_blocking=True)
                next_input = next_input.half() if fp16 else next_input.float()
                if one_hot:
                    next_target = expand(num_classes, target_dtype, next_target)
                next_input = next_input.sub_(mean).div_(std)

            # Hand out the batch prepared during the previous iteration.
            if have_pending:
                yield pending_input, pending_target
            have_pending = True

            torch.cuda.current_stream().wait_stream(side_stream)
            pending_input = next_input
            pending_target = next_target

        # Flush the last prepared batch.
        yield pending_input, pending_target

    def __init__(self, dataloader, num_classes, fp16, one_hot):
        """Remember the wrapped loader and the conversion options."""
        self.dataloader = dataloader
        self.num_classes = num_classes
        self.fp16 = fp16
        self.one_hot = one_hot
        self.epoch = 0

    def __iter__(self):
        """Start a new pass; reseeds a DistributedSampler with the epoch index."""
        sampler = self.dataloader.sampler
        is_distributed = sampler is not None and isinstance(
            sampler, torch.utils.data.distributed.DistributedSampler
        )
        if is_distributed:
            sampler.set_epoch(self.epoch)
        self.epoch += 1
        return PrefetchedWrapper.prefetched_loader(
            self.dataloader, self.num_classes, self.fp16, self.one_hot
        )
def get_pytorch_train_loader(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
    """Build the training loader over ``<data_path>/train``.

    Images get a random resized crop to 224 and a random horizontal flip, are
    collated with ``fast_collate`` and wrapped in a ``PrefetchedWrapper``.
    A ``DistributedSampler`` is used when torch.distributed is initialized;
    otherwise the loader shuffles itself.

    Returns:
        ``(PrefetchedWrapper, len(train_loader))``.
    """
    augmentations = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
    ])
    train_dataset = datasets.ImageFolder(os.path.join(data_path, 'train'), augmentations)

    train_sampler = None
    if torch.distributed.is_initialized():
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        # Shuffling is delegated to the sampler whenever one is supplied.
        shuffle=(train_sampler is None),
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        sampler=train_sampler,
        collate_fn=fast_collate,
        drop_last=True,
    )

    wrapped = PrefetchedWrapper(train_loader, num_classes, fp16, one_hot)
    return wrapped, len(train_loader)
def get_pytorch_val_loader(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
    """Build the validation loader over ``<data_path>/val``.

    Images are resized with ``Resize(256)`` and center-cropped to 224,
    collated with ``fast_collate`` and wrapped in a ``PrefetchedWrapper``.
    A ``DistributedSampler`` is used when torch.distributed is initialized.

    Returns:
        ``(PrefetchedWrapper, len(val_loader))``.
    """
    preprocessing = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
    ])
    val_dataset = datasets.ImageFolder(os.path.join(data_path, 'val'), preprocessing)

    val_sampler = None
    if torch.distributed.is_initialized():
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        sampler=val_sampler,
        batch_size=batch_size,
        shuffle=False,
        num_workers=workers,
        worker_init_fn=_worker_init_fn,
        pin_memory=True,
        collate_fn=fast_collate,
    )

    wrapped = PrefetchedWrapper(val_loader, num_classes, fp16, one_hot)
    return wrapped, len(val_loader)
class SynteticDataLoader(object):
    """Endless loader that repeats one pre-generated random batch."""

    def __init__(self, fp16, batch_size, num_classes, num_channels, height, width, one_hot, device="cuda"):
        """Generate the single batch that every iteration will return.

        Args:
            fp16: If true, the input batch is stored in half precision.
            batch_size: Number of samples in the batch.
            num_classes: Number of classes for the targets.
            num_channels: Channel count of the input batch.
            height: Height of the input batch.
            width: Width of the input batch.
            one_hot: If true, targets are a float ``(batch_size, num_classes)``
                matrix with class 0 set for every sample; otherwise random
                integer class indices of shape ``(batch_size,)``.
            device: Device holding the batch. Defaults to ``"cuda"``, which
                matches the previous hard-coded behaviour.
        """
        device = torch.device(device)
        input_data = torch.empty(
            batch_size, num_channels, height, width, device=device
        ).normal_(0, 1.0)
        if one_hot:
            # Must start from zeros: torch.empty would leave every column
            # except column 0 holding uninitialised memory.
            input_target = torch.zeros(batch_size, num_classes, device=device)
            input_target[:, 0] = 1.0
        else:
            input_target = torch.randint(0, num_classes, (batch_size,))
        input_target = input_target.to(device)
        if fp16:
            input_data = input_data.half()

        self.input_data = input_data
        self.input_target = input_target

    def __iter__(self):
        """Yield the same (input, target) pair forever."""
        while True:
            yield self.input_data, self.input_target
def get_syntetic_loader(data_path, batch_size, num_classes, one_hot, workers=None, _worker_init_fn=None, fp16=False):
    """Build the synthetic loader with 3x224x224 inputs.

    ``data_path``, ``workers`` and ``_worker_init_fn`` are accepted only for
    signature parity with the other loader factories and are not used.

    Returns:
        ``(SynteticDataLoader, -1)``; the ``-1`` takes the place of the batch
        count the other factories return, since this loader never ends.
    """
    # Honour the caller's num_classes; the class count used to be hard-coded
    # to 1000, giving wrongly sized targets for any other value.
    return SynteticDataLoader(fp16, batch_size, num_classes, 3, 224, 224, one_hot), -1
@@ -0,0 +1,310 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
import dllogger
import numpy as np
def format_step(step):
    """Render a logging step as a human-readable prefix.

    Strings are returned unchanged. An empty sequence becomes ``"Summary:"``.
    Otherwise the first three entries are labelled, in order, as epoch,
    iteration and validation iteration, each followed by a space.
    """
    if isinstance(step, str):
        return step
    if len(step) == 0:
        return "Summary:"
    labels = ("Epoch", "Iteration", "Validation Iteration")
    # zip stops at the shorter input, so only the first three entries are used.
    return "".join(
        "{}: {} ".format(label, value) for label, value in zip(labels, step)
    )
# Meter factories. Each call builds a fresh Meter whose three arguments are, in
# order, the iteration-, epoch- and run-level aggregators.
# Averages at every level.
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
# Averages per iteration/epoch, MinMeter across the run.
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
# Averages per iteration/epoch, MaxMeter across the run.
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
# Keeps only the most recently recorded value at every level.
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
# Quantile meters with q = 1, 0.99 and 0.95 respectively (the LAT_ prefix
# presumably stands for latency -- confirm with callers).
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
class Meter(object):
    """Three chained aggregators: iteration -> epoch -> run.

    Resetting a level passes its accumulated data up to the next level before
    clearing it.
    """

    def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
        self.iteration_aggregator = iteration_aggregator
        self.epoch_aggregator = epoch_aggregator
        self.run_aggregator = run_aggregator

    @staticmethod
    def _roll_up(source, sink):
        """Clear ``source`` and record what it held (if anything) into ``sink``."""
        value, count = source.get_data()
        source.reset()
        if value is not None:
            sink.record(value, n=count)

    def record(self, val, n=1):
        """Record ``val`` with weight ``n`` at the iteration level."""
        self.iteration_aggregator.record(val, n=n)

    def get_iteration(self):
        """Return the current iteration-level value."""
        value, _ = self.iteration_aggregator.get_val()
        return value

    def reset_iteration(self):
        """Fold the iteration data into the epoch level and clear it."""
        self._roll_up(self.iteration_aggregator, self.epoch_aggregator)

    def get_epoch(self):
        """Return the current epoch-level value."""
        value, _ = self.epoch_aggregator.get_val()
        return value

    def reset_epoch(self):
        """Fold the epoch data into the run level and clear it."""
        self._roll_up(self.epoch_aggregator, self.run_aggregator)

    def get_run(self):
        """Return the current run-level value."""
        value, _ = self.run_aggregator.get_val()
        return value

    def reset_run(self):
        """Clear the run-level aggregator."""
        self.run_aggregator.reset()
class QuantileMeter(object):
    """Keeps every recorded value and reports their ``q``-quantile."""

    def __init__(self, q):
        """Create an empty meter for quantile ``q`` (as passed to ``np.quantile``)."""
        self.q = q
        self.reset()

    def reset(self):
        """Drop all recorded values."""
        self.vals = []
        self.n = 0

    def record(self, val, n=1):
        """Add a list of values, or a single value repeated ``n`` times.

        When ``val`` is a list, ``n`` is ignored and the list length is used.
        """
        if isinstance(val, list):
            self.vals.extend(val)
            self.n += len(val)
        else:
            self.vals.extend([val] * n)
            self.n += n

    def get_val(self):
        """Return ``(quantile, count)``, or ``(None, count)`` when empty."""
        if not self.vals:
            return None, self.n
        try:
            # 'method' is the current spelling of this NumPy option.
            quantile = np.quantile(self.vals, self.q, method='nearest')
        except TypeError:
            # Older NumPy releases only know the deprecated 'interpolation'.
            quantile = np.quantile(self.vals, self.q, interpolation='nearest')
        return quantile, self.n

    def get_data(self):
        """Return the raw ``(values, count)`` pair."""
        return self.vals, self.n
class MaxMeter(object):
    """Tracks the largest value recorded since the last reset."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget the maximum and zero the count."""
        self.max = None
        self.n = 0

    def record(self, val, n=1):
        """Update the maximum; ``n`` replaces (does not add to) the stored count."""
        self.max = val if self.max is None else max(self.max, val)
        self.n = n

    def get_val(self):
        """Return ``(maximum, n)``."""
        return self.max, self.n

    def get_data(self):
        """Return the same pair as ``get_val``."""
        return self.max, self.n
class MinMeter(object):
    """Tracks the smallest value recorded since the last reset."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget the minimum and zero the count."""
        self.min = None
        self.n = 0

    def record(self, val, n=1):
        """Update the minimum; ``n`` replaces (does not add to) the stored count."""
        if self.min is None:
            self.min = val
        else:
            # Was max(...), which made this meter report the maximum instead.
            self.min = min(self.min, val)
        self.n = n

    def get_val(self):
        """Return ``(minimum, n)``."""
        return self.min, self.n

    def get_data(self):
        """Return the same pair as ``get_val``."""
        return self.min, self.n
class LastMeter(object):
    """Remembers only the most recently recorded value and its ``n``."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the stored value and count."""
        self.last, self.n = None, 0

    def record(self, val, n=1):
        """Overwrite the stored value and count."""
        self.last, self.n = val, n

    def get_val(self):
        """Return ``(last value, n)``."""
        return self.last, self.n

    def get_data(self):
        """Return the same pair as ``get_val``."""
        return self.get_val()
class AverageMeter(object):
    """Weighted running average of the recorded values."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the weighted sum and the total weight."""
        self.n = 0
        self.val = 0

    def record(self, val, n=1):
        """Add ``val`` with weight ``n``."""
        self.val += val * n
        self.n += n

    def get_val(self):
        """Return ``(average, total weight)``, or ``(None, 0)`` when empty."""
        if self.n != 0:
            return self.val / self.n, self.n
        return None, 0

    def get_data(self):
        """Return the same pair as ``get_val``."""
        return self.get_val()
class Logger(object):
    """Collects named metrics in meters and reports them through dllogger."""

    def __init__(self, print_interval, backends, verbose=False):
        """Initialise counters and hand ``backends`` to ``dllogger.init``."""
        self.epoch = -1
        self.iteration = -1
        self.val_iteration = -1
        self.metrics = OrderedDict()
        self.backends = backends
        self.print_interval = print_interval
        self.verbose = verbose
        dllogger.init(backends)

    def log_parameter(self, data, verbosity=0):
        """Log ``data`` under the ``"PARAMETER"`` step."""
        dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)

    def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
        """Attach ``meter`` to ``metric_name`` and forward its metadata."""
        if self.verbose:
            print("Registering metric: {}".format(metric_name))
        self.metrics[metric_name] = {'meter': meter, 'level': verbosity}
        dllogger.metadata(metric_name, metadata)

    def log_metric(self, metric_name, val, n=1):
        """Record ``val`` (weight ``n``) into the meter of ``metric_name``."""
        entry = self.metrics[metric_name]
        entry['meter'].record(val, n=n)

    def start_iteration(self, val=False):
        """Advance the validation or the training iteration counter."""
        if val:
            self.val_iteration += 1
        else:
            self.iteration += 1

    def end_iteration(self, val=False):
        """Every ``print_interval`` iterations, log and roll up the metrics.

        Only metrics whose name starts with ``'val'`` are handled when ``val``
        is true, and only the others when it is false.
        """
        it = self.val_iteration if val else self.iteration
        if it % self.print_interval != 0:
            return

        selected = {
            name: entry
            for name, entry in self.metrics.items()
            if name.startswith('val') == val
        }
        if val:
            step = (self.epoch, self.iteration, self.val_iteration)
        else:
            step = (self.epoch, self.iteration)

        # One dllogger record per distinct verbosity level.
        levels = {entry['level'] for entry in selected.values()}
        for level in levels:
            data = {
                name: entry['meter'].get_iteration()
                for name, entry in selected.items()
                if entry['level'] == level
            }
            dllogger.log(step=step, data=data, verbosity=level)

        for entry in selected.values():
            entry['meter'].reset_iteration()

        dllogger.flush()

    def start_epoch(self):
        """Advance the epoch, zero both iteration counters, roll up epoch data."""
        self.epoch += 1
        self.iteration = 0
        self.val_iteration = 0
        for entry in self.metrics.values():
            entry['meter'].reset_epoch()

    def end_epoch(self):
        """Roll up pending iteration data and log the epoch-level values."""
        for entry in self.metrics.values():
            entry['meter'].reset_iteration()

        levels = {entry['level'] for entry in self.metrics.values()}
        for level in levels:
            data = {
                name: entry['meter'].get_epoch()
                for name, entry in self.metrics.items()
                if entry['level'] == level
            }
            dllogger.log(step=(self.epoch, ), data=data)

    def end(self):
        """Roll up the last epoch, log the run-level values and flush."""
        for entry in self.metrics.values():
            entry['meter'].reset_epoch()

        levels = {entry['level'] for entry in self.metrics.values()}
        for level in levels:
            data = {
                name: entry['meter'].get_run()
                for name, entry in self.metrics.items()
                if entry['level'] == level
            }
            dllogger.log(step=tuple(), data=data)

        for entry in self.metrics.values():
            entry['meter'].reset_epoch()

        dllogger.flush()

    def iteration_generator_wrapper(self, gen, val=False):
        """Yield from ``gen``, bracketing each item with start/end_iteration."""
        for item in gen:
            self.start_iteration(val=val)
            yield item
            self.end_iteration(val=val)

    def epoch_generator_wrapper(self, gen):
        """Yield from ``gen``, bracketing each item with start/end_epoch."""
        for item in gen:
            self.start_epoch()
            yield item
            self.end_epoch()
@@ -0,0 +1,67 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import numpy as np
def mixup(alpha, num_classes, data, target):
    """Blend each sample of a batch with another sample of the same batch.

    A single coefficient ``c`` is drawn from ``Beta(alpha, alpha)`` and one
    random permutation of the batch is applied to both ``data`` and ``target``.

    Args:
        alpha: Parameter of the Beta distribution.
        num_classes: Unused; kept for interface compatibility.
        data: Input batch, indexed along dimension 0.
        target: Target batch with at least two dimensions (it is indexed as
            ``target[perm, :]``), e.g. one-hot labels.

    Returns:
        ``(mixed_data, mixed_target)`` computed as
        ``c * x + (1 - c) * x[perm]`` for both tensors.
    """
    with torch.no_grad():
        bs = data.size(0)
        c = np.random.beta(alpha, alpha)
        # Draw the permutation on the CPU (same RNG stream as before) and move
        # it to wherever the batch lives instead of forcing CUDA, so batches on
        # other devices can be indexed too.
        perm = torch.randperm(bs).to(data.device)

        md = c * data + (1 - c) * data[perm, :]
        mt = c * target + (1 - c) * target[perm, :]
        return md, mt
class MixUpWrapper(object):
    """Iterable that applies ``mixup`` to every batch of a wrapped loader."""

    def __init__(self, alpha, num_classes, dataloader):
        self.dataloader = dataloader
        self.num_classes = num_classes
        self.alpha = alpha

    def mixup_loader(self, loader):
        """Yield the mixed (input, target) pair for each batch of ``loader``."""
        for batch_input, batch_target in loader:
            mixed_input, mixed_target = mixup(
                self.alpha, self.num_classes, batch_input, batch_target
            )
            yield mixed_input, mixed_target

    def __iter__(self):
        """Return a generator over the wrapped dataloader."""
        return self.mixup_loader(self.dataloader)
class NLLMultiLabelSmooth(nn.Module):
    """Label-smoothed negative log-likelihood for dense (e.g. mixed) targets."""

    def __init__(self, smoothing = 0.0):
        super(NLLMultiLabelSmooth, self).__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        """Return the mean loss over the batch.

        In training mode ``target`` is multiplied element-wise with the
        log-probabilities, so it must broadcast against ``x``. In eval mode
        the call is delegated to ``cross_entropy`` unchanged.
        """
        if not self.training:
            return torch.nn.functional.cross_entropy(x, target)

        log_probs = torch.nn.functional.log_softmax(x.float(), dim = -1)
        # Target-weighted term and mean-over-classes term, both per sample.
        weighted_nll = (-log_probs * target.float()).sum(-1)
        uniform_nll = -log_probs.mean(dim=-1)
        per_sample = self.confidence * weighted_nll + self.smoothing * uniform_nll
        return per_sample.mean()
@@ -0,0 +1,370 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import torch
import torch.nn as nn
import numpy as np
__all__ = ['ResNet', 'build_resnet', 'resnet_versions', 'resnet_configs']
# ResNetBuilder {{{
class ResNetBuilder(object):
    """Factory for the conv / batch-norm / activation layers of a ResNet."""

    def __init__(self, version, config):
        """Read the 3x3 group count from ``version`` and keep ``config``."""
        self.conv3x3_cardinality = version.get('cardinality', 1) if isinstance(version, dict) else (
            1 if 'cardinality' not in version.keys() else version['cardinality'])
        self.config = config

    def conv(self, kernel_size, in_planes, out_planes, groups=1, stride=1):
        """Create a bias-free Conv2d padded with ``(kernel_size - 1) / 2``."""
        layer = nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            groups=groups,
            stride=stride,
            padding=int((kernel_size - 1) / 2),
            bias=False,
        )

        if self.config['nonlinearity'] == 'relu':
            # torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
            # Fills the tensor with values drawn from the normal distribution
            # described in "Delving deep into rectifiers: Surpassing
            # human-level performance on ImageNet classification" - He, K. et
            # al. (2015): N(0, std^2) with
            #     std = sqrt(2 / ((1 + a^2) * fan_in))
            # Also known as He initialization.
            # Parameters:
            #   tensor        an n-dimensional torch.Tensor
            #   a             negative slope of the rectifier used after this
            #                 layer (default 0, i.e. plain ReLU)
            #   mode          'fan_in' (default) or 'fan_out'. 'fan_in' keeps
            #                 the variance of the weights unchanged in the
            #                 forward pass; 'fan_out' keeps it unchanged in
            #                 the backward pass.
            #   nonlinearity  name of the non-linear function (as in
            #                 nn.functional); only 'relu' or 'leaky_relu'
            #                 (default) are recommended.
            # Example:
            #   >>> w = torch.empty(3, 5)
            #   >>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
            nn.init.kaiming_normal_(
                layer.weight,
                mode=self.config['conv_init'],
                nonlinearity=self.config['nonlinearity'],
            )

        return layer

    def conv3x3(self, in_planes, out_planes, stride=1):
        """3x3 convolution with padding"""
        return self.conv(
            3, in_planes, out_planes,
            groups=self.conv3x3_cardinality, stride=stride,
        )

    def conv1x1(self, in_planes, out_planes, stride=1):
        """1x1 convolution with padding"""
        return self.conv(1, in_planes, out_planes, stride=stride)

    def conv7x7(self, in_planes, out_planes, stride=1):
        """7x7 convolution with padding"""
        return self.conv(7, in_planes, out_planes, stride=stride)

    def conv5x5(self, in_planes, out_planes, stride=1):
        """5x5 convolution with padding"""
        return self.conv(5, in_planes, out_planes, stride=stride)

    def batchnorm(self, planes, last_bn=False):
        """Create a BatchNorm2d with zero bias.

        The scale starts at 0 only when ``last_bn`` is set and the config
        enables ``'last_bn_0_init'``; otherwise it starts at 1.
        """
        layer = nn.BatchNorm2d(planes)
        zero_gamma = last_bn and self.config['last_bn_0_init']
        nn.init.constant_(layer.weight, 0 if zero_gamma else 1)
        nn.init.constant_(layer.bias, 0)
        return layer

    def activation(self):
        """Instantiate the activation configured under ``'activation'``."""
        make_activation = self.config['activation']
        return make_activation()
# ResNetBuilder }}}
# BasicBlock {{{
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions."""

    def __init__(self, builder, inplanes, planes, expansion, stride=1, downsample=None):
        """Build the layers with ``builder``; ``downsample`` reshapes the skip path."""
        super(BasicBlock, self).__init__()
        out_planes = planes * expansion
        self.conv1 = builder.conv3x3(inplanes, planes, stride)
        self.bn1 = builder.batchnorm(planes)
        self.relu = builder.activation()
        self.conv2 = builder.conv3x3(planes, out_planes)
        self.bn2 = builder.batchnorm(out_planes, last_bn=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main(x) + skip(x))``."""
        out = self.conv1(x)
        if self.bn1 is not None:
            out = self.bn1(out)
        out = self.conv2(self.relu(out))
        if self.bn2 is not None:
            out = self.bn2(out)

        # The skip path is the input itself unless a projection was supplied.
        residual = x if self.downsample is None else self.downsample(x)

        out += residual
        return self.relu(out)
# BasicBlock }}}
# SqueezeAndExcitation {{{
class SqueezeAndExcitation(nn.Module):
    """Computes per-channel gates from spatially averaged features."""

    def __init__(self, planes, squeeze):
        super(SqueezeAndExcitation, self).__init__()
        self.squeeze = nn.Linear(planes, squeeze)
        self.expand = nn.Linear(squeeze, planes)
        self.relu = nn.ReLU(inplace=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid gates with two trailing singleton dimensions."""
        batch, channels = x.size(0), x.size(1)
        # Average over everything after the channel dimension.
        pooled = x.view(batch, channels, -1).mean(2)
        hidden = self.relu(self.squeeze(pooled))
        gates = self.sigmoid(self.expand(hidden))
        return gates.unsqueeze(2).unsqueeze(3)
# }}}
# Bottleneck {{{
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual block with optional squeeze-and-excitation."""

    def __init__(self, builder, inplanes, planes, expansion, stride=1, se=False, se_squeeze=16, downsample=None):
        """Build the block.

        Args:
            builder: Layer factory providing ``conv1x1``, ``conv3x3``,
                ``batchnorm`` and ``activation``.
            inplanes: Input channel count.
            planes: Channel count of the two inner convolutions.
            expansion: Multiplier giving the output channels
                (``planes * expansion``).
            stride: Stride of the 3x3 convolution.
            se: If true, add a ``SqueezeAndExcitation`` gate on the output.
            se_squeeze: Hidden width of that gate.
            downsample: Optional module applied to the input for the skip path.
        """
        super(Bottleneck, self).__init__()
        self.conv1 = builder.conv1x1(inplanes, planes)
        self.bn1 = builder.batchnorm(planes)
        self.conv2 = builder.conv3x3(planes, planes, stride=stride)
        self.bn2 = builder.batchnorm(planes)
        self.conv3 = builder.conv1x1(planes, planes * expansion)
        self.bn3 = builder.batchnorm(planes * expansion, last_bn=True)
        self.relu = builder.activation()
        self.downsample = downsample
        self.stride = stride
        self.squeeze = SqueezeAndExcitation(planes*expansion, se_squeeze) if se else None

    def forward(self, x):
        """Return ``relu(skip + main)``, with ``main`` gated when SE is enabled."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        if self.squeeze is None:
            out += residual
        else:
            # residual + 1.0 * out * gate. The scalar is passed by keyword:
            # the positional form addcmul(input, value, t1, t2) is a
            # deprecated overload.
            out = torch.addcmul(residual, out, self.squeeze(out), value=1.0)

        out = self.relu(out)

        return out
def SEBottleneck(builder, inplanes, planes, expansion, stride=1, downsample=None):
    """Create a ``Bottleneck`` with squeeze-and-excitation on (squeeze width 16)."""
    return Bottleneck(
        builder,
        inplanes,
        planes,
        expansion,
        stride=stride,
        se=True,
        se_squeeze=16,
        downsample=downsample,
    )
# Bottleneck }}}
# ResNet {{{
class ResNet(nn.Module):
    """Stem, four residual stages, global average pool and a linear head."""

    def __init__(self, builder, block, expansion, layers, widths, num_classes=1000):
        """Assemble the network.

        Args:
            builder: Layer factory (``conv7x7``, ``conv1x1``, ``batchnorm``,
                ``activation``).
            block: Callable building one residual block.
            expansion: Output-channel multiplier of ``block``.
            layers: Number of blocks in each of the four stages.
            widths: Base channel count of each of the four stages.
            num_classes: Size of the final linear layer's output.
        """
        super(ResNet, self).__init__()
        # Channels entering the next stage; updated by _make_layer.
        self.inplanes = 64
        self.conv1 = builder.conv7x7(3, 64, stride=2)
        self.bn1 = builder.batchnorm(64)
        self.relu = builder.activation()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(builder, block, expansion, widths[0], layers[0])
        self.layer2 = self._make_layer(builder, block, expansion, widths[1], layers[1], stride=2)
        self.layer3 = self._make_layer(builder, block, expansion, widths[2], layers[2], stride=2)
        self.layer4 = self._make_layer(builder, block, expansion, widths[3], layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(widths[3] * expansion, num_classes)

    def _make_layer(self, builder, block, expansion, planes, blocks, stride=1):
        """Build one stage of ``blocks`` blocks and update ``self.inplanes``.

        The first block gets ``stride`` and, when the stride or the channel
        count changes, a 1x1-conv (+ batch norm) projection for its skip path.
        """
        out_planes = planes * expansion

        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            dconv = builder.conv1x1(self.inplanes, out_planes, stride=stride)
            dbn = builder.batchnorm(out_planes)
            downsample = dconv if dbn is None else nn.Sequential(dconv, dbn)

        stage = [
            block(builder, self.inplanes, planes, expansion,
                  stride=stride, downsample=downsample)
        ]
        self.inplanes = out_planes
        for _ in range(1, blocks):
            stage.append(block(builder, self.inplanes, planes, expansion))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the network and return the output of the linear head."""
        x = self.conv1(x)
        if self.bn1 is not None:
            x = self.bn1(x)
        x = self.maxpool(self.relu(x))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        # Flatten everything but the batch dimension for the linear layer.
        return self.fc(x.view(x.size(0), -1))
# ResNet }}}
# Named initialization/activation presets, looked up by name in build_resnet.
# Keys read elsewhere in this file: 'conv_init' (mode for kaiming_normal_),
# 'nonlinearity', 'last_bn_0_init' (zero-init the scale of a block's last batch
# norm) and 'activation' (zero-argument factory for the activation module).
# NOTE(review): as written, the 'grp-*' presets hold the same values as
# 'fanin' / 'classic' respectively.
resnet_configs = {
    'classic' : {
        'conv' : nn.Conv2d,
        'conv_init' : 'fan_out',
        'nonlinearity' : 'relu',
        'last_bn_0_init' : False,
        'activation' : lambda: nn.ReLU(inplace=True),
    },
    'fanin' : {
        'conv' : nn.Conv2d,
        'conv_init' : 'fan_in',
        'nonlinearity' : 'relu',
        'last_bn_0_init' : False,
        'activation' : lambda: nn.ReLU(inplace=True),
    },
    'grp-fanin' : {
        'conv' : nn.Conv2d,
        'conv_init' : 'fan_in',
        'nonlinearity' : 'relu',
        'last_bn_0_init' : False,
        'activation' : lambda: nn.ReLU(inplace=True),
    },
    'grp-fanout' : {
        'conv' : nn.Conv2d,
        'conv_init' : 'fan_out',
        'nonlinearity' : 'relu',
        'last_bn_0_init' : False,
        'activation' : lambda: nn.ReLU(inplace=True),
    },
}
# Architecture table, looked up by name in build_resnet. Per entry:
#   'net'          class that is instantiated
#   'block'        residual block constructor
#   'layers'       number of blocks in each of the four stages
#   'widths'       base channel count of each stage
#   'expansion'    output-channel multiplier of the block
#   'num_classes'  size of the classifier output
#   'cardinality'  (optional) group count for the 3x3 convolutions
resnet_versions = {
    'resnet18' : {
        'net' : ResNet,
        'block' : BasicBlock,
        'layers' : [2, 2, 2, 2],
        'widths' : [64, 128, 256, 512],
        'expansion' : 1,
        'num_classes' : 1000,
    },
    'resnet34' : {
        'net' : ResNet,
        'block' : BasicBlock,
        'layers' : [3, 4, 6, 3],
        'widths' : [64, 128, 256, 512],
        'expansion' : 1,
        'num_classes' : 1000,
    },
    'resnet50' : {
        'net' : ResNet,
        'block' : Bottleneck,
        'layers' : [3, 4, 6, 3],
        'widths' : [64, 128, 256, 512],
        'expansion' : 4,
        'num_classes' : 1000,
    },
    'resnet101' : {
        'net' : ResNet,
        'block' : Bottleneck,
        'layers' : [3, 4, 23, 3],
        'widths' : [64, 128, 256, 512],
        'expansion' : 4,
        'num_classes' : 1000,
    },
    'resnet152' : {
        'net' : ResNet,
        'block' : Bottleneck,
        'layers' : [3, 8, 36, 3],
        'widths' : [64, 128, 256, 512],
        'expansion' : 4,
        'num_classes' : 1000,
    },
    'resnext101-32x4d' : {
        'net' : ResNet,
        'block' : Bottleneck,
        'cardinality' : 32,
        'layers' : [3, 4, 23, 3],
        'widths' : [128, 256, 512, 1024],
        'expansion' : 2,
        'num_classes' : 1000,
    },
    'se-resnext101-32x4d' : {
        'net' : ResNet,
        'block' : SEBottleneck,
        'cardinality' : 32,
        'layers' : [3, 4, 23, 3],
        'widths' : [128, 256, 512, 1024],
        'expansion' : 2,
        'num_classes' : 1000,
    },
}
def build_resnet(version, config, verbose=True):
    """Instantiate the network described by a version name and a config name.

    :param version: key into ``resnet_versions`` (KeyError if unknown)
    :param config: key into ``resnet_configs`` (KeyError if unknown)
    :param verbose: when true, print the resolved version and config dicts
    :return: the constructed model
    """
    version_spec = resnet_versions[version]
    config_spec = resnet_configs[config]
    builder = ResNetBuilder(version_spec, config_spec)

    if verbose:
        print("Version: {}".format(version_spec))
        print("Config: {}".format(config_spec))

    net_cls = version_spec['net']
    return net_cls(builder,
                   version_spec['block'],
                   version_spec['expansion'],
                   version_spec['layers'],
                   version_spec['widths'],
                   version_spec['num_classes'])
@@ -0,0 +1,91 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class CrossEntropy(nn.CrossEntropyLoss):
    """
    Label-smoothed cross entropy computed with the ``torch.npu_*`` operators.
    """

    def __init__(self, smooth_factor=0., num_classes=1000):
        """
        Constructor for the CrossEntropy module.
        :param smooth_factor: label smoothing factor
        :param num_classes: class count used to spread the smoothing mass
        """
        super().__init__()
        # Label value at the target class / at every other class.
        self.on_value = 1.0 - smooth_factor
        self.off_value = 1.0 * smooth_factor / (num_classes - 1)

    def forward(self, input, target):
        # The label width is taken from the logits' second dimension.
        label_width = input.size(1)
        smoothed_labels = torch.npu_one_hot(
            target, -1, label_width, self.on_value, self.off_value
        ).to(torch.float16)
        # Logits and labels are both handed to the NPU op as float16.
        per_sample = torch.npu_softmax_cross_entropy_with_logits(
            input.to(torch.float16), smoothed_labels)
        # Mean over dim 0, accumulated in float32.
        return torch.mean(per_sample, [0], keepdim=False, dtype=torch.float32)
class LabelSmoothingNpu(nn.Module):
    """
    NLL loss with label smoothing, with the target construction done on CPU.

    The smoothing factor passed to the constructor is the one actually
    applied, and the number of classes is read from the logits instead of
    being fixed at 1000.
    """

    def __init__(self, smoothing=0.0):
        """
        Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super(LabelSmoothingNpu, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        # Kept as an alias of `smoothing` for code that reads `epsilon`.
        self.epsilon = smoothing

    def forward(self, x, target):
        """
        :param x: logits; classes are on the last dimension
        :param target: integer class indices, one per row of ``x``
        :return: scalar loss placed on the device ``x`` came from
        """
        calculate_device = x.device
        # log_softmax runs on the input device; the rest runs on CPU.
        logprobs = torch.nn.functional.log_softmax(x, dim=-1).to("cpu")
        num_classes = logprobs.size(-1)
        # scatter_ needs the index on the same device as the destination.
        index = target.to("cpu").unsqueeze(1)
        one_hot = torch.zeros_like(logprobs).scatter_(1, index, 1)
        targets = (1 - self.smoothing) * one_hot + self.smoothing / num_classes
        # Sum over classes, mean over the batch.
        loss = (-targets * logprobs).mean(0).sum()
        return loss.to(calculate_device)
class LabelSmoothingGpu(nn.Module):
    """
    NLL loss with label smoothing, computed on the device of the inputs.
    """

    def __init__(self, smoothing=0.0):
        """
        Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        """
        :param x: logits; classes are on the last dimension
        :param target: integer class indices, one per row of ``x``
        :return: scalar mean loss
        """
        log_probs = torch.nn.functional.log_softmax(x, dim=-1)
        # Negative log-likelihood of the labelled class for each sample.
        picked = log_probs.gather(dim=-1, index=target.unsqueeze(1))
        nll_term = (-picked).squeeze(1)
        # Cross entropy against the uniform distribution over classes.
        uniform_term = -log_probs.mean(dim=-1)
        per_sample = self.confidence * nll_term + self.smoothing * uniform_term
        return per_sample.mean()
@@ -0,0 +1,51 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class LabelSmoothing(nn.Module):
    """
    NLL loss with label smoothing, evaluated on the CPU.
    """

    def __init__(self, smoothing=0.0):
        """
        Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        """
        :param x: logits; classes are on the last dimension
        :param target: integer class indices, one per row of ``x``
        :return: scalar mean loss (a CPU tensor, as before)
        """
        # Both operands are moved to the CPU before the loss is computed.
        # The previous version also copied them back to their devices after
        # the loss was built and then discarded the copies; that dead
        # round trip is removed.
        x = x.to("cpu")
        target = target.to("cpu")
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
@@ -0,0 +1,534 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from . import logger as log
from . import resnet as nvmodels
from . import utils
import dllogger
try:
#from apex.parallel import DistributedDataParallel as DDP  # PyTorch's torch.distributed can be used instead
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
# Metadata passed along when metrics are registered: an optional unit label
# and the format spec for the value.
ACC_METADATA = dict(unit='%', format=':.2f')
IPS_METADATA = dict(unit='img/s', format=':.2f')
TIME_METADATA = dict(unit='s', format=':.5f')
LOSS_METADATA = dict(format=':.5f')
class ModelAndLoss(nn.Module):
    """Bundle a ResNet built by ``nvmodels.build_resnet`` with its criterion.

    Calling the module runs the model and the loss in one step.
    """

    def __init__(self,
                 arch,
                 loss,
                 pretrained_weights=None,
                 cuda=True,
                 fp16=False):
        """
        :param arch: pair whose two items are passed to ``build_resnet``
        :param loss: zero-argument callable returning the criterion module
        :param pretrained_weights: optional state dict loaded into the model
        :param cuda: move the model and the criterion with ``.cuda()``
        :param fp16: wrap the model with ``network_to_half``
        """
        super().__init__()
        self.arch = arch

        print("=> creating model '{}'".format(arch))
        network = nvmodels.build_resnet(arch[0], arch[1])
        if pretrained_weights is not None:
            print("=> using pre-trained model from a file '{}'".format(arch))
            network.load_state_dict(pretrained_weights)

        if cuda:
            network = network.cuda()
        if fp16:
            network = network_to_half(network)

        # define loss function (criterion)
        criterion = loss()
        if cuda:
            criterion = criterion.cuda()

        self.model = network
        self.loss = criterion

    def forward(self, data, target):
        """Return ``(loss, output)`` for one batch."""
        output = self.model(data)
        return self.loss(output, target), output

    def distributed(self):
        """No-op: the DDP wrapping is currently disabled."""
        #self.model = DDP(self.model)
        return None

    def load_model_state(self, state):
        """Load ``state`` into the wrapped model; ``None`` is ignored."""
        if state is None:
            return
        self.model.load_state_dict(state)
def get_optimizer(parameters,
                  fp16,
                  lr,
                  momentum,
                  weight_decay,
                  nesterov=False,
                  state=None,
                  static_loss_scale=1.,
                  dynamic_loss_scale=False,
                  bn_weight_decay=False):
    """Build the SGD optimizer, optionally exempting BN params from decay.

    :param parameters: iterable of ``(name, parameter)`` pairs; it may be a
        one-shot generator such as ``model.named_parameters()``
    :param fp16: wrap the optimizer in ``FP16_Optimizer``
    :param lr: learning rate
    :param momentum: SGD momentum
    :param weight_decay: weight decay for the decayed parameters
    :param nesterov: enable Nesterov momentum
    :param state: optional optimizer state dict to restore
    :param static_loss_scale: forwarded to ``FP16_Optimizer``
    :param dynamic_loss_scale: forwarded to ``FP16_Optimizer``
    :param bn_weight_decay: when false, parameters whose name contains
        ``'bn'`` go into a separate group with ``weight_decay=0``
    :return: the optimizer
    """
    # Materialize once: the pairs are traversed twice below, and a generator
    # would be exhausted after the first pass, leaving the second group empty.
    named_params = list(parameters)

    if bn_weight_decay:
        print(" ! Weight decay applied to BN parameters ")
        optimizer = torch.optim.SGD([v for n, v in named_params],
                                    lr,
                                    momentum=momentum,
                                    weight_decay=weight_decay,
                                    nesterov=nesterov)
    else:
        print(" ! Weight decay NOT applied to BN parameters ")
        bn_params = [v for n, v in named_params if 'bn' in n]
        rest_params = [v for n, v in named_params if 'bn' not in n]
        print(len(bn_params))
        print(len(rest_params))
        optimizer = torch.optim.SGD([{
            'params': bn_params,
            'weight_decay': 0
        }, {
            'params': rest_params,
            'weight_decay': weight_decay
        }],
                                    lr,
                                    momentum=momentum,
                                    weight_decay=weight_decay,
                                    nesterov=nesterov)
    if fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=static_loss_scale,
                                   dynamic_loss_scale=dynamic_loss_scale,
                                   verbose=False)
    if state is not None:
        optimizer.load_state_dict(state)
    return optimizer
def lr_policy(lr_fn, logger=None):
    """Turn a ``(iteration, epoch) -> lr`` function into an optimizer updater.

    When a logger is given, an 'lr' metric is registered once here and the
    computed rate is logged on every call of the returned function.

    :return: ``f(optimizer, iteration, epoch)`` that writes the rate into
        every entry of ``optimizer.param_groups``
    """
    has_logger = logger is not None
    if has_logger:
        logger.register_metric('lr',
                               log.LR_METER(),
                               verbosity=dllogger.Verbosity.VERBOSE)

    def _apply_lr(optimizer, iteration, epoch):
        new_lr = lr_fn(iteration, epoch)
        if has_logger:
            logger.log_metric('lr', new_lr)
        for group in optimizer.param_groups:
            group['lr'] = new_lr

    return _apply_lr
def lr_step_policy(base_lr, steps, decay_factor, warmup_length, logger=None):
    """Linear warmup followed by step decay.

    During the first ``warmup_length`` epochs the rate ramps linearly up to
    ``base_lr``; afterwards ``base_lr`` is multiplied by ``decay_factor``
    once for every entry of ``steps`` that the epoch has reached.
    """
    def _lr_fn(iteration, epoch):
        if epoch < warmup_length:
            return base_lr * (epoch + 1) / warmup_length
        lr = base_lr
        for boundary in steps:
            if epoch >= boundary:
                lr *= decay_factor
        return lr

    return lr_policy(_lr_fn, logger=logger)
def lr_linear_policy(base_lr, warmup_length, epochs, logger=None):
    """Linear warmup followed by a linear decay.

    After warmup the rate falls linearly from ``base_lr`` and would reach
    zero at epoch ``epochs``.
    """
    def _lr_fn(iteration, epoch):
        if epoch < warmup_length:
            return base_lr * (epoch + 1) / warmup_length
        e = epoch - warmup_length
        es = epochs - warmup_length
        return base_lr * (1 - (e / es))

    return lr_policy(_lr_fn, logger=logger)
def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
    """Linear warmup followed by a half-cosine decay.

    After warmup the rate follows ``0.5 * (1 + cos(pi * e / es)) * base_lr``
    where ``e`` counts epochs since the end of warmup and ``es`` is the
    number of post-warmup epochs.
    """
    def _lr_fn(iteration, epoch):
        if epoch < warmup_length:
            return base_lr * (epoch + 1) / warmup_length
        e = epoch - warmup_length
        es = epochs - warmup_length
        return 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr

    return lr_policy(_lr_fn, logger=logger)
def lr_exponential_policy(base_lr,
                          warmup_length,
                          epochs,
                          final_multiplier=0.001,
                          logger=None):
    """Linear warmup followed by an exponential decay.

    The per-epoch decay factor is chosen so that, over the post-warmup
    epochs, the multiplier applied to ``base_lr`` reaches
    ``final_multiplier``.
    """
    post_warmup_epochs = epochs - warmup_length
    epoch_decay = np.power(2, np.log2(final_multiplier) / post_warmup_epochs)

    def _lr_fn(iteration, epoch):
        if epoch < warmup_length:
            return base_lr * (epoch + 1) / warmup_length
        e = epoch - warmup_length
        return base_lr * (epoch_decay**e)

    return lr_policy(_lr_fn, logger=logger)
def get_train_step(model_and_loss,
                   optimizer,
                   fp16,
                   use_amp=False,
                   batch_size_multiplier=1):
    """Build the closure that runs one forward/backward (and maybe update).

    :param model_and_loss: callable returning ``(loss, output)``
    :param optimizer: optimizer, possibly wrapped in ``FP16_Optimizer``
    :param fp16: run backward through ``optimizer.backward(loss)``
    :param use_amp: run backward through ``amp.scale_loss``
    :param batch_size_multiplier: number of accumulated batches per update;
        gradients are divided by it right before the optimizer step
    :return: ``_step(input, target, optimizer_step=True)`` returning the
        (all-reduced when distributed) loss
    """
    def _step(input, target, optimizer_step=True):
        input_var = Variable(input)
        target_var = Variable(target)
        loss, output = model_and_loss(input_var, target_var)
        if torch.distributed.is_initialized():
            print('utils.reduce_tensor(loss.data)')
            reduced_loss = utils.reduce_tensor(loss.data)
        else:
            reduced_loss = loss.data

        if fp16:
            optimizer.backward(loss)
        elif use_amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if optimizer_step:
            opt = optimizer.optimizer if isinstance(
                optimizer, FP16_Optimizer) else optimizer
            for param_group in opt.param_groups:
                for param in param_group['params']:
                    # Parameters that received no gradient (frozen or unused
                    # in the graph) have grad None; dividing None would raise.
                    if param.grad is not None:
                        param.grad /= batch_size_multiplier
            optimizer.step()
            optimizer.zero_grad()

        torch.cuda.synchronize()

        return reduced_loss

    return _step
def train(train_loader,
          model_and_loss,
          optimizer,
          lr_scheduler,
          fp16,
          logger,
          epoch,
          use_amp=False,
          prof=-1,
          batch_size_multiplier=1,
          register_metrics=True):
    """Run one training epoch and log loss, throughput and timing metrics.

    :param train_loader: iterable of ``(input, target)`` batches
    :param lr_scheduler: called as ``lr_scheduler(optimizer, i, epoch)``
        before every batch
    :param logger: metric logger, or ``None`` to disable logging
    :param prof: when positive, only the first ``prof`` batches are used
    :param batch_size_multiplier: the optimizer steps once every this many
        batches
    :param register_metrics: register the ``train.*`` metrics on the logger
    """
    if register_metrics and logger is not None:
        metric_specs = (
            ('train.loss', log.LOSS_METER,
             dllogger.Verbosity.DEFAULT, LOSS_METADATA),
            ('train.compute_ips', log.PERF_METER,
             dllogger.Verbosity.VERBOSE, IPS_METADATA),
            ('train.total_ips', log.PERF_METER,
             dllogger.Verbosity.DEFAULT, IPS_METADATA),
            ('train.data_time', log.PERF_METER,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
            ('train.compute_time', log.PERF_METER,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
        )
        for metric_name, meter_factory, verbosity, metadata in metric_specs:
            logger.register_metric(metric_name,
                                   meter_factory(),
                                   verbosity=verbosity,
                                   metadata=metadata)

    step = get_train_step(model_and_loss,
                          optimizer,
                          fp16,
                          use_amp=use_amp,
                          batch_size_multiplier=batch_size_multiplier)

    model_and_loss.train()
    end = time.time()

    optimizer.zero_grad()

    batches = enumerate(train_loader)
    if logger is not None:
        batches = logger.iteration_generator_wrapper(batches)
    if prof > 0:
        batches = utils.first_n(prof, batches)

    for i, (input, target) in batches:
        bs = input.size(0)
        lr_scheduler(optimizer, i, epoch)
        # Time spent waiting for the batch since the previous iteration ended.
        data_time = time.time() - end

        # Step only on the last batch of each accumulation window.
        do_update = ((i + 1) % batch_size_multiplier) == 0
        loss = step(input, target, optimizer_step=do_update)

        it_time = time.time() - end

        if logger is not None:
            logger.log_metric('train.loss', to_python_float(loss), bs)
            logger.log_metric('train.compute_ips',
                              calc_ips(bs, it_time - data_time))
            logger.log_metric('train.total_ips', calc_ips(bs, it_time))
            logger.log_metric('train.data_time', data_time)
            logger.log_metric('train.compute_time', it_time - data_time)

        end = time.time()
def get_val_step(model_and_loss):
    """Build the closure that evaluates one validation batch.

    :return: ``_step(input, target)`` returning ``(loss, prec1, prec5)``;
        all three are all-reduced when torch.distributed is initialized
    """
    def _step(input, target):
        input_var = Variable(input)
        target_var = Variable(target)

        with torch.no_grad():
            loss, output = model_and_loss(input_var, target_var)

            prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))

            if not torch.distributed.is_initialized():
                reduced_loss = loss.data
            else:
                reduced_loss = utils.reduce_tensor(loss.data)
                prec1 = utils.reduce_tensor(prec1)
                prec5 = utils.reduce_tensor(prec5)

        torch.cuda.synchronize()

        return reduced_loss, prec1, prec5

    return _step
def validate(val_loader,
             model_and_loss,
             fp16,
             logger,
             epoch,
             prof=-1,
             register_metrics=True):
    """Evaluate the model over ``val_loader`` and log accuracy/latency.

    :param val_loader: iterable of ``(input, target)`` batches
    :param logger: metric logger, or ``None`` to disable logging
    :param prof: when positive, only the first ``prof`` batches are used
    :param register_metrics: register the ``val.*`` metrics on the logger
    :return: ``get_val()`` of the local top-1 ``log.AverageMeter``
    """
    if register_metrics and logger is not None:
        metric_specs = (
            ('val.top1', log.ACC_METER,
             dllogger.Verbosity.DEFAULT, ACC_METADATA),
            ('val.top5', log.ACC_METER,
             dllogger.Verbosity.DEFAULT, ACC_METADATA),
            ('val.loss', log.LOSS_METER,
             dllogger.Verbosity.DEFAULT, LOSS_METADATA),
            ('val.compute_ips', log.PERF_METER,
             dllogger.Verbosity.VERBOSE, IPS_METADATA),
            ('val.total_ips', log.PERF_METER,
             dllogger.Verbosity.DEFAULT, IPS_METADATA),
            ('val.data_time', log.PERF_METER,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
            ('val.compute_latency', log.PERF_METER,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
            ('val.compute_latency_at100', log.LAT_100,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
            ('val.compute_latency_at99', log.LAT_99,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
            ('val.compute_latency_at95', log.LAT_95,
             dllogger.Verbosity.VERBOSE, TIME_METADATA),
        )
        for metric_name, meter_factory, verbosity, metadata in metric_specs:
            logger.register_metric(metric_name,
                                   meter_factory(),
                                   verbosity=verbosity,
                                   metadata=metadata)

    step = get_val_step(model_and_loss)

    top1 = log.AverageMeter()
    # switch to evaluate mode
    model_and_loss.eval()

    end = time.time()

    batches = enumerate(val_loader)
    if logger is not None:
        batches = logger.iteration_generator_wrapper(batches, val=True)
    if prof > 0:
        batches = utils.first_n(prof, batches)

    for i, (input, target) in batches:
        bs = input.size(0)
        data_time = time.time() - end

        loss, prec1, prec5 = step(input, target)

        it_time = time.time() - end

        top1.record(to_python_float(prec1), bs)
        if logger is not None:
            logger.log_metric('val.top1', to_python_float(prec1), bs)
            logger.log_metric('val.top5', to_python_float(prec5), bs)
            logger.log_metric('val.loss', to_python_float(loss), bs)
            logger.log_metric('val.compute_ips',
                              calc_ips(bs, it_time - data_time))
            logger.log_metric('val.total_ips', calc_ips(bs, it_time))
            logger.log_metric('val.data_time', data_time)
            # The same compute latency feeds the plain meter and each of the
            # three latency meters.
            for latency_metric in ('val.compute_latency',
                                   'val.compute_latency_at95',
                                   'val.compute_latency_at99',
                                   'val.compute_latency_at100'):
                logger.log_metric(latency_metric, it_time - data_time)

        end = time.time()

    return top1.get_val()
# Train loop {{{
def calc_ips(batch_size, time):
    """Return images per second across all ranks.

    The per-rank ``batch_size`` is multiplied by the world size when
    torch.distributed is initialized (otherwise by 1) and divided by
    ``time``.
    """
    if torch.distributed.is_initialized():
        world_size = torch.distributed.get_world_size()
    else:
        world_size = 1
    total_batch_size = world_size * batch_size
    return total_batch_size / time
def train_loop(model_and_loss,
               optimizer,
               lr_scheduler,
               train_loader,
               val_loader,
               epochs,
               fp16,
               logger,
               should_backup_checkpoint,
               use_amp=False,
               batch_size_multiplier=1,
               best_prec1=0,
               start_epoch=0,
               prof=-1,
               skip_training=False,
               skip_validation=False,
               save_checkpoints=True,
               checkpoint_dir='./'):
    """Run training, validation and checkpointing for each epoch.

    Epochs run from ``start_epoch`` up to (not including) ``epochs``.
    Metrics are registered only during the first epoch of the run. A
    checkpoint is written after every epoch when ``save_checkpoints`` is set
    and this process is rank 0 (or torch.distributed is not initialized).

    :param should_backup_checkpoint: ``f(epoch) -> bool`` deciding whether an
        extra ``checkpoint-<epoch+1>.pth.tar`` copy is kept
    """
    # NOTE(review): the checkpoint branch reads logger.metrics without a None
    # check, unlike the other logger uses here; confirm logger is never None
    # when checkpoints are saved after validation.
    prec1 = -1

    for epoch in range(start_epoch, epochs):
        first_epoch = epoch == start_epoch

        if logger is not None:
            logger.start_epoch()

        if not skip_training:
            train(train_loader,
                  model_and_loss,
                  optimizer,
                  lr_scheduler,
                  fp16,
                  logger,
                  epoch,
                  use_amp=use_amp,
                  prof=prof,
                  register_metrics=first_epoch,
                  batch_size_multiplier=batch_size_multiplier)

        if not skip_validation:
            prec1, nimg = validate(val_loader,
                                   model_and_loss,
                                   fp16,
                                   logger,
                                   epoch,
                                   prof=prof,
                                   register_metrics=first_epoch)

        if logger is not None:
            logger.end_epoch()

        # Only rank 0 (or a non-distributed run) writes checkpoints.
        if not save_checkpoints:
            continue
        if torch.distributed.is_initialized() \
                and torch.distributed.get_rank() != 0:
            continue

        if skip_validation:
            is_best = False
            best_prec1 = 0
        else:
            is_best = logger.metrics['val.top1']['meter'].get_epoch() > best_prec1
            best_prec1 = max(logger.metrics['val.top1']['meter'].get_epoch(),
                             best_prec1)

        backup_filename = None
        if should_backup_checkpoint(epoch):
            backup_filename = 'checkpoint-{}.pth.tar'.format(epoch + 1)

        checkpoint_state = {
            'epoch': epoch + 1,
            'arch': model_and_loss.arch,
            'state_dict': model_and_loss.model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }
        utils.save_checkpoint(checkpoint_state,
                              is_best,
                              checkpoint_dir=checkpoint_dir,
                              backup_filename=backup_filename)
# }}}
@@ -0,0 +1,106 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import numpy as np
import torch
import shutil
import torch.distributed as dist
def should_backup_checkpoint(args):
    """Return a predicate telling whether an epoch's checkpoint is kept.

    When ``args.gather_checkpoints`` is falsy the predicate returns that
    falsy value; otherwise it is true for the first ten epochs and for every
    epoch that is a multiple of ten.
    """
    def _is_backup_epoch(epoch):
        enabled = args.gather_checkpoints
        if not enabled:
            return enabled
        return epoch < 10 or epoch % 10 == 0

    return _is_backup_epoch
def save_checkpoint(state,
                    is_best,
                    filename='checkpoint.pth.tar',
                    checkpoint_dir='./',
                    backup_filename=None):
    """Write ``state`` with ``torch.save`` and optionally copy the file.

    Nothing happens on ranks other than 0 of an initialized process group.

    :param state: object handed to ``torch.save``
    :param is_best: also copy the file to ``model_best.pth.tar``
    :param filename: checkpoint file name inside ``checkpoint_dir``
    :param checkpoint_dir: directory receiving all files
    :param backup_filename: when not None, also copy the file to this name
    """
    if torch.distributed.is_initialized() \
            and torch.distributed.get_rank() != 0:
        return

    checkpoint_path = os.path.join(checkpoint_dir, filename)
    print("SAVING {}".format(checkpoint_path))
    torch.save(state, checkpoint_path)

    if is_best:
        best_path = os.path.join(checkpoint_dir, 'model_best.pth.tar')
        shutil.copyfile(checkpoint_path, best_path)
    if backup_filename is not None:
        backup_path = os.path.join(checkpoint_dir, backup_filename)
        shutil.copyfile(checkpoint_path, backup_path)
def timed_generator(gen):
    """Yield ``(item, seconds)`` pairs for every item of ``gen``.

    ``seconds`` is the wall-clock time spent waiting for that item; the time
    the consumer spends between two pulls is not counted.
    """
    # Local import: this module's import block does not import `time`, so the
    # bare name used to raise NameError on the first call.
    import time

    start = time.time()
    for g in gen:
        end = time.time()
        t = end - start
        yield g, t
        start = time.time()
def timed_function(f):
    """Wrap ``f`` so that calls return ``(result, elapsed_seconds)``."""
    # Local import: this module's import block does not import `time`, so the
    # bare name used to raise NameError when the wrapper was called.
    import time

    def _timed_function(*args, **kwargs):
        start = time.time()
        ret = f(*args, **kwargs)
        return ret, time.time() - start

    return _timed_function
def accuracy(output, target, topk=(1, )):
    """Computes the precision@k for the specified values of k

    :param output: 2-D score tensor, one row per sample
    :param target: 1-D tensor of class indices, one per row of ``output``
    :param topk: iterable of k values
    :return: list with one single-element tensor per k holding the
        percentage of samples whose target is among the k highest scores
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` derives from a transposed tensor and may be
        # non-contiguous, in which case .view(-1) raises; reshape copies
        # only when it has to.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
def reduce_tensor(tensor):
    """Return the mean of ``tensor`` across all ranks.

    The input is never modified. When torch.distributed is not initialized
    an unchanged copy is returned. Previously only the division was guarded,
    so the unconditional ``all_reduce`` raised in that case.
    """
    rt = tensor.clone()
    if not dist.is_initialized():
        return rt
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt
def first_n(n, generator):
    """Yield at most the first ``n`` items of ``generator``.

    No item beyond the n-th is pulled from the underlying iterator.
    """
    source = iter(generator)
    for _ in range(n):
        try:
            item = next(source)
        except StopIteration:
            return
        yield item
@@ -0,0 +1,609 @@
import argparse
import os
import random
import shutil
import time
import warnings
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.npu
from apex import amp
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
'''
python3.7 pytorch-resnet50-apex.py --data /opt/npu/dataset/imagenet --npu 7 -j64 -b512 --lr 0.2 --warmup 5 --epochs 90 --label-smoothing 0.1 --optimizer-batch-size 1024 > batch1024-lr0.2-wd.txt &
'''
# Script-level defaults; BATCH_SIZE, EPOCHS_SIZE and SOURCE_DIR feed the
# argparse defaults below.
BATCH_SIZE = 512
EPOCHS_SIZE = 100
TRAIN_STEP = 8000
LOG_STEP = 1

# Device string used for computation; main() overwrites it from --npu.
CALCULATE_DEVICE = "npu:7"
PRINT_DEVICE = "cpu"
SOURCE_DIR = "/data/imagenet"

# Names of the lowercase, non-dunder callables exposed by torchvision.models.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

# Command-line interface. The "(default: ...)" notes in the help strings now
# match the defaults that are actually configured.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default=SOURCE_DIR,
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet50)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 32)')
parser.add_argument('--epochs', default=EPOCHS_SIZE, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
                    metavar='N',
                    help='mini-batch size (default: {}), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel'
                         .format(BATCH_SIZE))
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--npu', default=None, type=int,
                    help='NPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--warmup',
                    default=0,
                    type=int,
                    metavar='E',
                    help='number of warmup epochs')
parser.add_argument('--label-smoothing',
                    default=0.0,
                    type=float,
                    metavar='S',
                    help='label smoothing')
parser.add_argument('--optimizer-batch-size',
                    default=-1,
                    type=int,
                    metavar='N',
                    help=
                    'size of a total batch size, for simulating bigger batches using gradient accumulation')
parser.add_argument(
    '--static-loss-scale',
    type=float,
    default=1,
    help=
    'Static loss scale, positive power of 2 values can improve fp16 convergence.')

# Best top-1 accuracy seen so far; updated by main_worker.
best_acc1 = 0
def main():
    """Parse the command line, select the NPU and launch the worker(s).

    ``--npu`` defaults to device 0. With ``--multiprocessing-distributed``
    one worker per device reported by ``torch.cuda.device_count()`` is
    spawned; otherwise ``main_worker`` runs in this process.
    """
    global CALCULATE_DEVICE

    args = parser.parse_args()

    if args.npu is None:
        args.npu = 0
    CALCULATE_DEVICE = "npu:{}".format(args.npu)
    torch.npu.set_device(CALCULATE_DEVICE)
    print("use ", CALCULATE_DEVICE)

    seed = args.seed
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()

    if not args.multiprocessing_distributed:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # Since we have ngpus_per_node processes per node, the total world_size
    # needs to be adjusted accordingly
    args.world_size = ngpus_per_node * args.world_size
    # Use torch.multiprocessing.spawn to launch distributed processes: the
    # main_worker process function
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
def main_worker(gpu, ngpus_per_node, args):
    """Build the model, loss, optimizer and loaders, then train or evaluate.

    :param gpu: device index handed in by ``main`` or ``mp.spawn``; stored
        on ``args.gpu``
    :param ngpus_per_node: device count used to derive the global rank and
        to split batch size and workers in distributed mode
    :param args: parsed command-line namespace (mutated in place)

    Updates the module-level ``best_acc1`` and writes a checkpoint after
    every epoch through ``save_checkpoint``.
    """
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch](zero_init_residual=True)
    # Re-initialize the weight of every nn.Linear layer.
    # NOTE(review): the original indentation was lost; this loop is placed at
    # function level, so it would also re-initialize the Linear layers of a
    # pretrained model. Confirm whether it belongs in the `else` branch.
    for layer in model.modules():
        if isinstance(layer, nn.Linear):
            torch.nn.init.kaiming_normal_(layer.weight, a=math.sqrt(5), )
    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            # Single-device path: place the model on the device chosen in main().
            #model = torch.nn.DataParallel(model).cuda()
            model = model.to(CALCULATE_DEVICE)
    # Callable invoked once per epoch below to set the optimizer's learning rate.
    lr_policy = lr_cosine_policy(args.lr,
                                 args.warmup,
                                 args.epochs)
    # define loss function (criterion) and optimizer
    #criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    loss = nn.CrossEntropyLoss
    if args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)
    criterion = loss().to(CALCULATE_DEVICE)
    # Two parameter groups: names ending in 'bias' get no weight decay, all
    # other parameters use args.weight_decay.
    optimizer = torch.optim.SGD([
        {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'], 'weight_decay': 0.0},
        {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'], 'weight_decay': args.weight_decay}],
        args.lr,
        momentum=args.momentum)
    # Mixed precision through apex amp at opt level O2 with a fixed loss scale.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.npu is not None:
                checkpoint = torch.load(args.resume)
            elif args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.npu is not None:
                best_acc1 = best_acc1.to("npu:{}".format(args.npu))
            elif args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            # The optimizer state is not restored (checkpoints do not store it).
            #optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    # Shuffle in the loader only when no sampler is supplied.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    # NOTE(review): the validation loader is created with shuffle=True;
    # confirm that is intentional.
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # The schedule is applied once per epoch, with the iteration fixed at 0.
        #adjust_learning_rate(optimizer, epoch, args)
        lr_policy(optimizer, 0, epoch)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint
        # NOTE(review): the .to("cpu") calls below assume acc1, and therefore
        # is_best and best_acc1, are tensors — confirm against validate().
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        file_name = "checkpoint_npu{}".format(args.npu)
        # Move the model to the CPU for serialization, then back to the device.
        modeltmp = model.cpu()
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': modeltmp.state_dict(),
            # 'state_dict': model,
            'best_acc1': best_acc1.to("cpu"),
            # 'optimizer' : optimizer.state_dict(),
        }, is_best.to("cpu"), file_name)
        modeltmp.to(CALCULATE_DEVICE)
def train(train_loader, model, criterion, optimizer, epoch, args):
    """Train ``model`` for one epoch, with optional gradient accumulation.

    Gradients are accumulated over ``batch_size_multiplier`` consecutive
    batches and averaged before each optimizer step, which simulates an
    optimizer batch of ``args.optimizer_batch_size`` samples.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to train; switched to train mode here.
        criterion: callable ``criterion(output, target)`` returning the loss.
        optimizer: optimizer whose ``param_groups`` hold the parameters.
        epoch: epoch index, used only for the progress prefix.
        args: namespace providing ``optimizer_batch_size``, ``batch_size``,
            ``gpu`` and ``label_smoothing``.

    The loop stops early once the batch index equals ``TRAIN_STEP``.
    """
    if args.optimizer_batch_size < 0:
        batch_size_multiplier = 1
    else:
        tbs = args.batch_size
        if args.optimizer_batch_size % tbs != 0:
            print(
                "Warning: simulated batch size {} is not divisible by actual batch size {}"
                .format(args.optimizer_batch_size, tbs))
        # Clamp to 1: a simulated batch smaller than the real batch would
        # otherwise yield 0 and make the modulo in the loop below raise
        # ZeroDivisionError.
        batch_size_multiplier = max(1, args.optimizer_batch_size // tbs)
        print("BSM: {}".format(batch_size_multiplier))

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    optimizer.zero_grad()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        images = images.to(CALCULATE_DEVICE, non_blocking=True)
        # Without label smoothing the target goes to the device before the
        # loss is computed.
        if args.label_smoothing == 0.0:
            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)
        # With label smoothing the target is only moved after the loss,
        # presumably because the smoothing criterion works on CPU tensors
        # -- TODO confirm against where ``criterion`` is built.
        if args.label_smoothing > 0.0:
            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # Backward pass goes through amp's loss-scaling context.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        optimizer_step = ((i + 1) % batch_size_multiplier) == 0
        if optimizer_step:
            if batch_size_multiplier != 1:
                # Average the accumulated gradients over the micro-batches.
                for param_group in optimizer.param_groups:
                    for param in param_group['params']:
                        # Parameters that received no gradient have
                        # grad None and must be skipped.
                        if param.grad is not None:
                            param.grad /= batch_size_multiplier
            optimizer.step()
            optimizer.zero_grad()

        # Display runs before the timer update, so the printed Time value
        # is the one recorded for the previous iteration.
        if i % LOG_STEP == 0:
            progress.display(i)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i == TRAIN_STEP:
            break
def validate(val_loader, model, criterion, args):
    """Evaluate ``model`` on ``val_loader`` and return the mean top-1 accuracy.

    Runs under ``torch.no_grad()`` with the model in eval mode, prints
    progress every ``LOG_STEP`` batches, prints the final Acc@1 / Acc@5 and
    reports both through ``hwlog.remark_print``.

    Args:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate.
        criterion: callable ``criterion(output, target)`` returning the loss.
        args: namespace providing ``gpu`` and ``label_smoothing``.

    Returns:
        The running average stored in the Acc@1 meter.
    """
    timer = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    acc1_meter = AverageMeter('Acc@1', ':6.2f')
    acc5_meter = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [timer, loss_meter, acc1_meter, acc5_meter],
        prefix='Test: ')

    def _target_to_device(labels):
        # Labels are cast to int32 before being sent to the compute device.
        return labels.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            images = images.to(CALCULATE_DEVICE, non_blocking=True)

            # The target is moved before the loss when smoothing is off ...
            if args.label_smoothing == 0.0:
                target = _target_to_device(target)

            output = model(images)
            loss = criterion(output, target)

            # ... and only after the loss when smoothing is on.
            if args.label_smoothing > 0.0:
                target = _target_to_device(target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            batch_len = images.size(0)
            loss_meter.update(loss.item(), batch_len)
            acc1_meter.update(acc1[0], batch_len)
            acc5_meter.update(acc5[0], batch_len)

            # measure elapsed time
            timer.update(time.time() - tick)
            tick = time.time()

            if step % LOG_STEP == 0:
                progress.display(step)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=acc1_meter, top5=acc5_meter))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1,
                           value=format(acc1_meter.avg, '.3f'))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5,
                           value=format(acc5_meter.avg, '.3f'))

    return acc1_meter.avg
def save_checkpoint(state, is_best, filename='checkpoint'):
    """Write ``state`` to ``<filename>.pth.tar`` via ``torch.save``.

    When ``is_best`` is truthy the saved file is additionally copied to
    ``<filename>model_best.pth.tar`` (the suffix is appended directly to
    ``filename`` with no separator).
    """
    checkpoint_path = filename + ".pth.tar"
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    best_path = filename + 'model_best.pth.tar'
    shutil.copyfile(checkpoint_path, best_path)
class AverageMeter(object):
    """Keep the most recent value of a metric and its running weighted mean.

    ``name`` labels the metric in the printed output; ``fmt`` is the format
    spec (including the leading colon) applied to both the latest value and
    the average when the meter is converted to a string.
    """

    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Reset latest value, average, weighted sum and count to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        # Builds e.g. '{name} {val:6.3f} ({avg:6.3f})' and fills it from the
        # instance attributes.
        template = '{{name}} {{val{0}}} ({{avg{0}}})'.format(self.fmt)
        return template.format(**vars(self))
class ProgressMeter(object):
    """Print a tab-separated progress line and report training throughput.

    Args:
        num_batches: total number of batches, used to size the
            ``[current/total]`` counter.
        meters: objects rendered with ``str()`` on each display. A meter
            whose ``name`` attribute is ``'Time'`` supplies the step time
            used for the FPS report.
        prefix: text printed before the counter. FPS is only reported when
            the prefix contains ``"Epoch"``.
        batch_size: optional batch size for the FPS computation. When
            omitted it is read once from the module-level argument parser
            the first time it is needed.
    """

    def __init__(self, num_batches, meters, prefix="", batch_size=None):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix
        self._batch_size = batch_size

    def display(self, batch):
        """Print the progress line for ``batch`` and, for training, log FPS."""
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

        # Throughput is only reported for training progress lines.
        if "Epoch" not in self.prefix:
            return
        step_time = self._latest_step_time()
        # No Time meter, or nothing measured yet: nothing to report.
        if step_time is None or step_time <= 0:
            return
        fps = int(self._resolve_batch_size()) / step_time
        hwlog.remark_print(key=hwlog.FPS, value=float(fps))

    def _latest_step_time(self):
        """Return the latest value of the 'Time' meter, or None if absent."""
        # Read the meter directly instead of re-parsing the printed text, so
        # a missing Time meter no longer raises IndexError. The value used
        # is the meter's unrounded ``val``.
        for meter in self.meters:
            if getattr(meter, 'name', None) == 'Time':
                return float(meter.val)
        return None

    def _resolve_batch_size(self):
        """Return the batch size, parsing the command line at most once."""
        if self._batch_size is None:
            self._batch_size = parser.parse_args().batch_size
        return self._batch_size

    def _get_batch_fmtstr(self, num_batches):
        """Build the '[{:Nd}/total]' counter template for ``num_batches``."""
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
    """Apply step decay: ``args.lr`` scaled by 0.1 once per 30 full epochs.

    The resulting rate is written to the ``'lr'`` entry of every group in
    ``optimizer.param_groups``.
    """
    completed_steps = epoch // 30
    decayed_lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each k in ``topk``.

    Args:
        output: score tensor; the top ``max(topk)`` indices are taken along
            dimension 1.
        target: tensor of class indices; its first dimension is the batch
            size.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element float tensor per entry of ``topk``,
        holding the percentage of samples whose target is among the k
        highest-scoring predictions.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.reshape(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # ``correct`` comes from a transposed tensor, so its slices are
            # generally non-contiguous; ``view(-1)`` raises on those, while
            # ``reshape(-1)`` copies when needed.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
class LabelSmoothing(nn.Module):
    """
    NLL loss with label smoothing.

    The log-probabilities are moved to the CPU for the loss arithmetic and
    the final scalar is moved to ``CALCULATE_DEVICE``.
    """

    def __init__(self, smoothing=0.0):
        """
        Constructor for the LabelSmoothing module.

        :param smoothing: label smoothing factor; the true class receives
            weight ``1.0 - smoothing``.
        """
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        """Return the mean smoothed loss of logits ``x`` against ``target``."""
        log_probs = torch.nn.functional.log_softmax(x, dim=-1).to("cpu")
        # Negative log-probability of the target class for each sample.
        target_index = target.unsqueeze(1)
        per_sample_nll = (-log_probs.gather(dim=-1, index=target_index)).squeeze(1)
        # Negative mean log-probability over all classes for each sample.
        per_sample_smooth = -log_probs.mean(dim=-1)
        blended = self.confidence * per_sample_nll + self.smoothing * per_sample_smooth
        return blended.mean().to(CALCULATE_DEVICE)
def lr_policy(lr_fn, logger=None):
    """Wrap a schedule function into a callable that updates an optimizer.

    Args:
        lr_fn: callable ``lr_fn(iteration, epoch)`` returning the learning
            rate.
        logger: optional logger; when given, an ``'lr'`` metric is
            registered on it and every computed rate is logged.

    Returns:
        A function ``(optimizer, iteration, epoch)`` that writes the
        computed rate into the ``'lr'`` entry of every parameter group.
    """
    log_enabled = logger is not None
    if log_enabled:
        logger.register_metric('lr',
                               log.LR_METER(),
                               verbosity=dllogger.Verbosity.VERBOSE)

    def _apply_lr(optimizer, iteration, epoch):
        new_lr = lr_fn(iteration, epoch)
        if log_enabled:
            logger.log_metric('lr', new_lr)
        for group in optimizer.param_groups:
            group['lr'] = new_lr

    return _apply_lr
def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
    """Build a linear-warmup + cosine-decay learning-rate policy.

    For ``epoch < warmup_length`` the rate is
    ``base_lr * (epoch + 1) / warmup_length``; afterwards it follows half a
    cosine period over the remaining ``epochs - warmup_length`` epochs.
    The ``iteration`` argument of the schedule is not used.

    Returns:
        The optimizer-updating callable produced by ``lr_policy``.
    """
    def _schedule(iteration, epoch):
        if epoch < warmup_length:
            return base_lr * (epoch + 1) / warmup_length
        elapsed = epoch - warmup_length
        span = epochs - warmup_length
        return 0.5 * (1 + np.cos(np.pi * elapsed / span)) * base_lr

    return lr_policy(_schedule, logger=logger)
if __name__ == '__main__':
    # Point hwlog at the directory that contains this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

    (cpu_info, npu_info, framework_info,
     os_info, benchmark_version) = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Emit the run header: environment details first, then the fixed
    # hyper-parameters, in this order. Keys are looked up on hwlog one at a
    # time, immediately before each call.
    for _key_name, _value in (
            ("CPU_INFO", cpu_info),
            ("NPU_INFO", npu_info),
            ("OS_INFO", os_info),
            ("FRAMEWORK_INFO", framework_info),
            ("BENCHMARK_VERSION", benchmark_version),
            ("CONFIG_INFO", config_info),
            ("BASE_LR", initinal_data.get("base_lr")),
            ("DATASET", initinal_data.get("dataset")),
            ("OPT_NAME", initinal_data.get("optimizer")),
            ("LOSS_SCALE", initinal_data.get("loss_scale")),
    ):
        hwlog.remark_print(key=getattr(hwlog, _key_name), value=_value)

    main()