[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,20 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#from . import logger
+#from . import dataloaders
+#from . import training
+#from . import utils
+#from . import mixup
+#from . import resnet
+#from . import smoothing
@@ -0,0 +1,369 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION
+# Copyright (c) 2017-      Facebook, Inc
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import torch
+import numpy as np
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from PIL import Image
+
+# Names of the data backends this module offers by default. The commented-out
+# block below used to append 'dali-gpu' / 'dali-cpu' when NVIDIA DALI imported
+# successfully. NOTE(review): 'syntetic' is misspelt but is a runtime string --
+# presumably callers pass exactly this spelling, so do not "fix" it here.
+DATA_BACKEND_CHOICES = ['pytorch', 'syntetic']
+# try:
+#     from nvidia.dali.plugin.pytorch import DALIClassificationIterator
+#     from nvidia.dali.pipeline import Pipeline
+#     import nvidia.dali.ops as ops
+#     import nvidia.dali.types as types
+#     DATA_BACKEND_CHOICES.append('dali-gpu')
+#     DATA_BACKEND_CHOICES.append('dali-cpu')
+# except ImportError:
+#     print("Please install DALI from https://www.github.com/NVIDIA/DALI to run this example.")
+
+
+def load_jpeg_from_file(path, cuda=True, fp16=False):
+    """Load a single image file as a normalized batch of one.
+
+    The image is resized to 256, center-cropped to 224, converted to a
+    float tensor in [0, 1] and normalized per channel, giving a tensor of
+    shape (1, 3, 224, 224).
+
+    Args:
+        path: Path (or file object) accepted by ``PIL.Image.open``.
+        cuda: If true, the image and the statistics are moved to the GPU
+            before normalization.
+        fp16: If true, the result is half precision, otherwise float32.
+
+    Returns:
+        The normalized image tensor with a leading batch dimension.
+    """
+    img_transforms = transforms.Compose(
+        [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
+    )
+
+    # Use a context manager so the underlying file handle is released, and
+    # force three channels: the (1, 3, 1, 1) mean/std below cannot be applied
+    # in place to grayscale, palette, RGBA or CMYK images.
+    with Image.open(path) as pil_image:
+        img = img_transforms(pil_image.convert("RGB"))
+
+    with torch.no_grad():
+        # mean and std are not multiplied by 255 as they are in training script
+        # torch dataloader reads data into bytes whereas loading directly
+        # through PIL creates a tensor with floats in [0,1] range
+        mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
+        std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
+
+        if cuda:
+            mean = mean.cuda()
+            std = std.cuda()
+            img = img.cuda()
+        if fp16:
+            mean = mean.half()
+            std = std.half()
+            img = img.half()
+        else:
+            img = img.float()
+
+        # Local renamed from ``input`` to avoid shadowing the builtin.
+        batch = img.unsqueeze(0).sub_(mean).div_(std)
+
+    return batch
+
+
+# class HybridTrainPipe(Pipeline):
+#     def __init__(self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False):
+#         super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id)
+#         if torch.distributed.is_initialized():
+#             rank = torch.distributed.get_rank()
+#             world_size = torch.distributed.get_world_size()
+#         else:
+#             rank = 0
+#             world_size = 1
+
+#         self.input = ops.FileReader(
+#                 file_root = data_dir,
+#                 shard_id = rank,
+#                 num_shards = world_size,
+#                 random_shuffle = True)
+
+#         if dali_cpu:
+#             dali_device = "cpu"
+#             self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
+#         else:
+#             dali_device = "gpu"
+#             # This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
+#             # without additional reallocations
+#             self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB, device_memory_padding=211025920, host_memory_padding=140544512)
+
+#         self.res = ops.RandomResizedCrop(
+#                 device=dali_device,
+#                 size=[crop, crop],
+#                 interp_type=types.INTERP_LINEAR,
+#                 random_aspect_ratio=[0.75, 4./3.],
+#                 random_area=[0.08, 1.0],
+#                 num_attempts=100)
+
+#         self.cmnp = ops.CropMirrorNormalize(device = "gpu",
+#                                             output_dtype = types.FLOAT,
+#                                             output_layout = types.NCHW,
+#                                             crop = (crop, crop),
+#                                             image_type = types.RGB,
+#                                             mean = [0.485 * 255,0.456 * 255,0.406 * 255],
+#                                             std = [0.229 * 255,0.224 * 255,0.225 * 255])
+#         self.coin = ops.CoinFlip(probability = 0.5)
+
+#     def define_graph(self):
+#         rng = self.coin()
+#         self.jpegs, self.labels = self.input(name = "Reader")
+#         images = self.decode(self.jpegs)
+#         images = self.res(images)
+#         output = self.cmnp(images.gpu(), mirror = rng)
+#         return [output, self.labels]
+
+
+# class HybridValPipe(Pipeline):
+#     def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size):
+#         super(HybridValPipe, self).__init__(batch_size, num_threads, device_id, seed = 12 + device_id)
+#         if torch.distributed.is_initialized():
+#             rank = torch.distributed.get_rank()
+#             world_size = torch.distributed.get_world_size()
+#         else:
+#             rank = 0
+#             world_size = 1
+
+#         self.input = ops.FileReader(
+#                 file_root = data_dir,
+#                 shard_id = rank,
+#                 num_shards = world_size,
+#                 random_shuffle = False)
+
+#         self.decode = ops.ImageDecoder(device = "mixed", output_type = types.RGB)
+#         self.res = ops.Resize(device = "gpu", resize_shorter = size)
+#         self.cmnp = ops.CropMirrorNormalize(device = "gpu",
+#                 output_dtype = types.FLOAT,
+#                 output_layout = types.NCHW,
+#                 crop = (crop, crop),
+#                 image_type = types.RGB,
+#                 mean = [0.485 * 255,0.456 * 255,0.406 * 255],
+#                 std = [0.229 * 255,0.224 * 255,0.225 * 255])
+
+#     def define_graph(self):
+#         self.jpegs, self.labels = self.input(name = "Reader")
+#         images = self.decode(self.jpegs)
+#         images = self.res(images)
+#         output = self.cmnp(images)
+#         return [output, self.labels]
+
+
+class DALIWrapper(object):
+    """Iterable adapter turning a DALI iterator into (input, target) pairs.
+
+    Each iteration pass walks the wrapped pipeline once and then resets it.
+    """
+
+    def __init__(self, dalipipeline, num_classes, one_hot):
+        self.one_hot = one_hot
+        self.num_classes = num_classes
+        self.dalipipeline = dalipipeline
+
+    def gen_wrapper(dalipipeline, num_classes, one_hot):
+        # Deliberately takes no ``self``: __iter__ calls it through the class.
+        for batch in dalipipeline:
+            images = batch[0]["data"]
+            # Flatten the labels to 1-D int64 on the GPU.
+            labels = torch.reshape(batch[0]["label"], [-1]).cuda().long()
+            if not one_hot:
+                yield images, labels
+            else:
+                yield images, expand(num_classes, torch.float, labels)
+        # Rewind the pipeline once it is exhausted.
+        dalipipeline.reset()
+
+    def __iter__(self):
+        pipeline, classes, as_one_hot = self.dalipipeline, self.num_classes, self.one_hot
+        return DALIWrapper.gen_wrapper(pipeline, classes, as_one_hot)
+
+
+def get_dali_train_loader(dali_cpu=False):
+    """Return the (currently stubbed-out) DALI training-loader factory.
+
+    The real DALI implementation is disabled and kept below as a comment
+    for reference. The returned function accepts the same arguments as the
+    other loader factories in this module but always returns ``False``.
+
+    Args:
+        dali_cpu: Unused while the DALI implementation is disabled.
+
+    Returns:
+        A callable ``gdtl(data_path, batch_size, num_classes, one_hot,
+        workers=5, _worker_init_fn=None, fp16=False)`` returning ``False``.
+    """
+    # def gdtl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
+    #     if torch.distributed.is_initialized():
+    #         rank = torch.distributed.get_rank()
+    #         world_size = torch.distributed.get_world_size()
+    #     else:
+    #         rank = 0
+    #         world_size = 1
+
+    #     traindir = os.path.join(data_path, 'train')
+
+    #     pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
+    #             device_id = rank % torch.cuda.device_count(),
+    #             data_dir = traindir, crop = 224, dali_cpu=dali_cpu)
+
+    #     pipe.build()
+    #     train_loader = DALIClassificationIterator(pipe, size = int(pipe.epoch_size("Reader") / world_size))
+
+    #     return DALIWrapper(train_loader, num_classes, one_hot), int(pipe.epoch_size("Reader") / (world_size * batch_size))
+
+    # return gdtl
+    def gdtl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
+        # Placeholder while DALI support is disabled.
+        return False
+    # The original returned ``gdvl``, a name that does not exist in this
+    # scope, so every call raised NameError.
+    return gdtl
+
+def get_dali_val_loader():
+    """Build the validation-loader factory for the DALI backend.
+
+    DALI support is disabled; the original implementation is preserved in
+    the comment below and the factory handed back is a stub that always
+    returns ``False``.
+    """
+    # def gdvl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
+    #     if torch.distributed.is_initialized():
+    #         rank = torch.distributed.get_rank()
+    #         world_size = torch.distributed.get_world_size()
+    #     else:
+    #         rank = 0
+    #         world_size = 1
+
+    #     valdir = os.path.join(data_path, 'val')
+
+    #     pipe = HybridValPipe(batch_size=batch_size, num_threads=workers,
+    #             device_id = rank % torch.cuda.device_count(),
+    #             data_dir = valdir,
+    #             crop = 224, size = 256)
+
+    #     pipe.build()
+    #     val_loader = DALIClassificationIterator(pipe, size = int(pipe.epoch_size("Reader") / world_size))
+
+    #     return DALIWrapper(val_loader, num_classes, one_hot), int(pipe.epoch_size("Reader") / (world_size * batch_size))
+    # return gdvl
+
+    def gdvl(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
+        """Stub loader factory: ignores every argument."""
+        disabled = False
+        return disabled
+
+    stub = gdvl
+    return stub
+
+def fast_collate(batch):
+    """Collate (image, label) samples into a uint8 NCHW tensor and int64 labels.
+
+    Args:
+        batch: Sequence of ``(image, label)`` pairs. Each image must expose a
+            ``size`` of ``(width, height)`` and be convertible with
+            ``np.asarray``; all images are expected to share the size of
+            the first one.
+
+    Returns:
+        ``(tensor, targets)`` where ``tensor`` is a ``torch.uint8`` tensor
+        of shape ``(len(batch), 3, height, width)`` and ``targets`` is a
+        1-D ``torch.int64`` tensor of the labels.
+    """
+    imgs = [sample[0] for sample in batch]
+    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
+    # ``size`` is (width, height); the first image fixes the batch geometry.
+    w = imgs[0].size[0]
+    h = imgs[0].size[1]
+    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
+    for i, img in enumerate(imgs):
+        nump_array = np.asarray(img, dtype=np.uint8)
+        # The original also built an unused tensor from ``nump_array`` here;
+        # that dead conversion has been removed.
+        if nump_array.ndim < 3:
+            # Single-channel image: add a channel axis so it broadcasts over
+            # the three output channels below.
+            nump_array = np.expand_dims(nump_array, axis=-1)
+        # HWC -> CHW
+        nump_array = np.rollaxis(nump_array, 2)
+
+        tensor[i] += torch.from_numpy(nump_array)
+
+    return tensor, targets
+
+
+
+def expand(num_classes, dtype, tensor):
+    """One-hot encode a 1-D tensor of class indices.
+
+    Args:
+        num_classes: Width of the one-hot encoding.
+        dtype: dtype of the returned tensor.
+        tensor: 1-D integer tensor of class indices.
+
+    Returns:
+        A ``(tensor.size(0), num_classes)`` tensor of ``dtype`` with a 1 in
+        each row at the column given by the corresponding index.
+    """
+    # Allocate on the index tensor's own device rather than a hard-coded
+    # 'cuda': scatter needs both operands on the same device, so this is
+    # unchanged for CUDA inputs and additionally works on other devices.
+    e = torch.zeros(tensor.size(0), num_classes, dtype=dtype, device=tensor.device)
+    e = e.scatter(1, tensor.unsqueeze(1), 1.0)
+    return e
+
+
+class PrefetchedWrapper(object):
+    """Iterable that prefetches and normalizes batches on a side CUDA stream.
+
+    While the consumer works on batch k, batch k+1 is copied to the GPU,
+    cast, optionally one-hot encoded and normalized on a separate stream.
+    """
+
+    def prefetched_loader(loader, num_classes, fp16, one_hot):
+        """Generate normalized ``(input, target)`` CUDA batches from ``loader``.
+
+        Called through the class (it takes no ``self``).
+
+        Args:
+            loader: Iterable of ``(input, target)`` batches.
+            num_classes: Width of the one-hot targets when ``one_hot`` is set.
+            fp16: Cast inputs (and one-hot targets) to half precision.
+            one_hot: Replace index targets by one-hot rows via ``expand``.
+        """
+        # Statistics are scaled by 255; presumably the inputs are raw 0-255
+        # pixel values as produced by ``fast_collate`` -- confirm for other
+        # collate functions.
+        mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).cuda().view(1,3,1,1)
+        std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).cuda().view(1,3,1,1)
+        if fp16:
+            mean = mean.half()
+            std = std.half()
+
+        stream = torch.cuda.Stream()
+        first = True
+        ready_input = None
+        ready_target = None
+
+        for next_input, next_target in loader:
+            # Enqueue the copy and preprocessing of the next batch on the
+            # side stream.
+            with torch.cuda.stream(stream):
+                next_input = next_input.cuda(non_blocking=True)
+                next_target = next_target.cuda(non_blocking=True)
+                if fp16:
+                    next_input = next_input.half()
+                    if one_hot:
+                        next_target = expand(num_classes, torch.half, next_target)
+                else:
+                    next_input = next_input.float()
+                    if one_hot:
+                        next_target = expand(num_classes, torch.float, next_target)
+
+                next_input = next_input.sub_(mean).div_(std)
+
+            # Hand out the previously prepared batch (nothing on the first
+            # pass) while the next one is still in flight.
+            if not first:
+                yield ready_input, ready_target
+            else:
+                first = False
+
+            # Make the current stream wait for the side stream before the
+            # freshly prepared batch is handed out.
+            torch.cuda.current_stream().wait_stream(stream)
+            ready_input = next_input
+            ready_target = next_target
+
+        # Emit the last prepared batch. The guard keeps an empty loader from
+        # raising UnboundLocalError (the original yielded unconditionally).
+        if not first:
+            yield ready_input, ready_target
+
+    def __init__(self, dataloader, num_classes, fp16, one_hot):
+        self.dataloader = dataloader
+        self.fp16 = fp16
+        self.epoch = 0
+        self.one_hot = one_hot
+        self.num_classes = num_classes
+
+    def __iter__(self):
+        # Give a DistributedSampler the current epoch number before each
+        # pass, then advance the counter.
+        if (self.dataloader.sampler is not None and
+            isinstance(self.dataloader.sampler,
+                       torch.utils.data.distributed.DistributedSampler)):
+
+            self.dataloader.sampler.set_epoch(self.epoch)
+        self.epoch += 1
+        return PrefetchedWrapper.prefetched_loader(self.dataloader, self.num_classes, self.fp16, self.one_hot)
+
+
+def get_pytorch_train_loader(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
+    """Create the training loader over ``<data_path>/train``.
+
+    Images get a random resized crop to 224 and a random horizontal flip,
+    are collated with ``fast_collate`` and wrapped in ``PrefetchedWrapper``.
+
+    Returns:
+        ``(wrapped_loader, number_of_batches)``.
+    """
+    augmentations = transforms.Compose([
+        transforms.RandomResizedCrop(224),
+        transforms.RandomHorizontalFlip(),
+    ])
+    train_dataset = datasets.ImageFolder(os.path.join(data_path, 'train'), augmentations)
+
+    # Shard across processes only when torch.distributed is initialised.
+    train_sampler = None
+    if torch.distributed.is_initialized():
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=batch_size,
+        # DataLoader shuffles itself only when no sampler is supplied.
+        shuffle=(train_sampler is None),
+        num_workers=workers,
+        worker_init_fn=_worker_init_fn,
+        pin_memory=True,
+        sampler=train_sampler,
+        collate_fn=fast_collate,
+        drop_last=True,
+    )
+
+    prefetching = PrefetchedWrapper(train_loader, num_classes, fp16, one_hot)
+    return prefetching, len(train_loader)
+
+
+def get_pytorch_val_loader(data_path, batch_size, num_classes, one_hot, workers=5, _worker_init_fn=None, fp16=False):
+    """Create the validation loader over ``<data_path>/val``.
+
+    Images are resized to 256 and center-cropped to 224, collated with
+    ``fast_collate`` and wrapped in ``PrefetchedWrapper``; no shuffling.
+
+    Returns:
+        ``(wrapped_loader, number_of_batches)``.
+    """
+    preprocessing = transforms.Compose([
+        transforms.Resize(256),
+        transforms.CenterCrop(224),
+    ])
+    val_dataset = datasets.ImageFolder(os.path.join(data_path, 'val'), preprocessing)
+
+    # Shard across processes only when torch.distributed is initialised.
+    val_sampler = None
+    if torch.distributed.is_initialized():
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+
+    val_loader = torch.utils.data.DataLoader(
+        val_dataset,
+        sampler=val_sampler,
+        batch_size=batch_size,
+        shuffle=False,
+        num_workers=workers,
+        worker_init_fn=_worker_init_fn,
+        pin_memory=True,
+        collate_fn=fast_collate,
+    )
+
+    prefetching = PrefetchedWrapper(val_loader, num_classes, fp16, one_hot)
+    return prefetching, len(val_loader)
+
+
+class SynteticDataLoader(object):
+    """Endless iterable that yields one fixed synthetic batch held on the GPU.
+
+    Args:
+        fp16: Store the input batch in half precision.
+        batch_size: Number of samples in the batch.
+        num_classes: Number of classes for the targets.
+        num_channels, height, width: Shape of each synthetic image.
+        one_hot: Use one-hot float targets instead of random class indices.
+    """
+
+    def __init__(self, fp16, batch_size, num_classes, num_channels, height, width, one_hot):
+        # Inputs are drawn once from a standard normal distribution.
+        input_data = torch.empty(batch_size, num_channels, height, width).cuda().normal_(0, 1.0)
+        if one_hot:
+            # Start from zeros: the original used torch.empty, which left
+            # every column except column 0 as uninitialised memory.
+            input_target = torch.zeros(batch_size, num_classes)
+            input_target[:, 0] = 1.0
+        else:
+            input_target = torch.randint(0, num_classes, (batch_size,))
+        input_target = input_target.cuda()
+        if fp16:
+            input_data = input_data.half()
+
+        self.input_data = input_data
+        self.input_target = input_target
+
+    def __iter__(self):
+        # Never terminates: the consumer must stop iterating on its own.
+        while True:
+            yield self.input_data, self.input_target
+
+
+def get_syntetic_loader(data_path, batch_size, num_classes, one_hot, workers=None, _worker_init_fn=None, fp16=False):
+    """Create a synthetic loader of 3x224x224 inputs.
+
+    ``data_path``, ``workers`` and ``_worker_init_fn`` are accepted for
+    signature parity with the other loader factories and are ignored.
+
+    Returns:
+        ``(loader, -1)``; -1 stands in for the batch count because the
+        synthetic loader never ends.
+    """
+    # Honour the caller's num_classes; the original hard-coded 1000, which
+    # gave wrong targets for any other class count.
+    return SynteticDataLoader(fp16, batch_size, num_classes, 3, 224, 224, one_hot), -1
@@ -0,0 +1,310 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION
+# Copyright (c) 2017-      Facebook, Inc
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+from collections import OrderedDict
+import dllogger
+import numpy as np
+
+
+def format_step(step):
+    """Render a logging step as human-readable text.
+
+    Strings pass through untouched. An empty sequence becomes "Summary:".
+    Otherwise up to the first three entries are labelled as epoch,
+    iteration and validation iteration, in that order.
+    """
+    if isinstance(step, str):
+        return step
+
+    depth = len(step)
+    if depth == 0:
+        return "Summary:"
+
+    templates = (
+        "Epoch: {} ",
+        "Iteration: {} ",
+        "Validation Iteration: {} ",
+    )
+    pieces = [templates[i].format(step[i]) for i in range(min(depth, len(templates)))]
+    return "".join(pieces)
+
+
+# Meter factories. Each call builds a fresh Meter whose three aggregators are
+# given in the order (iteration, epoch, run).
+
+def PERF_METER():
+    """Average at every level."""
+    return Meter(AverageMeter(), AverageMeter(), AverageMeter())
+
+
+def LOSS_METER():
+    """Average per iteration and epoch; MinMeter over the run."""
+    return Meter(AverageMeter(), AverageMeter(), MinMeter())
+
+
+def ACC_METER():
+    """Average per iteration and epoch; MaxMeter over the run."""
+    return Meter(AverageMeter(), AverageMeter(), MaxMeter())
+
+
+def LR_METER():
+    """Most recent value at every level."""
+    return Meter(LastMeter(), LastMeter(), LastMeter())
+
+
+def LAT_100():
+    """Quantile 1 at every level."""
+    return Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
+
+
+def LAT_99():
+    """Quantile 0.99 at every level."""
+    return Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
+
+
+def LAT_95():
+    """Quantile 0.95 at every level."""
+    return Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
+
+class Meter(object):
+    """Three-level metric: iteration -> epoch -> run.
+
+    Values are recorded into the iteration aggregator. Resetting a level
+    hands that level's ``get_data()`` result to the next level up, unless
+    the value is ``None``.
+    """
+
+    def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
+        self.iteration_aggregator = iteration_aggregator
+        self.epoch_aggregator = epoch_aggregator
+        self.run_aggregator = run_aggregator
+
+    def record(self, val, n=1):
+        self.iteration_aggregator.record(val, n=n)
+
+    def get_iteration(self):
+        value, _count = self.iteration_aggregator.get_val()
+        return value
+
+    def reset_iteration(self):
+        # Read before clearing, then forward to the epoch level.
+        value, count = self.iteration_aggregator.get_data()
+        self.iteration_aggregator.reset()
+        if value is None:
+            return
+        self.epoch_aggregator.record(value, n=count)
+
+    def get_epoch(self):
+        value, _count = self.epoch_aggregator.get_val()
+        return value
+
+    def reset_epoch(self):
+        # Read before clearing, then forward to the run level.
+        value, count = self.epoch_aggregator.get_data()
+        self.epoch_aggregator.reset()
+        if value is None:
+            return
+        self.run_aggregator.record(value, n=count)
+
+    def get_run(self):
+        value, _count = self.run_aggregator.get_val()
+        return value
+
+    def reset_run(self):
+        self.run_aggregator.reset()
+
+
+
+class QuantileMeter(object):
+    """Keeps every recorded sample and reports their ``q``-quantile."""
+
+    def __init__(self, q):
+        self.q = q
+        self.reset()
+
+    def reset(self):
+        self.n = 0
+        self.vals = []
+
+    def record(self, val, n=1):
+        # A list is taken as a batch of samples (``n`` is then ignored);
+        # anything else is one sample repeated ``n`` times.
+        if isinstance(val, list):
+            samples, count = val, len(val)
+        else:
+            samples, count = [val] * n, n
+        self.vals += samples
+        self.n += count
+
+    def get_val(self):
+        if self.vals:
+            # 'nearest' returns one of the recorded samples, not a blend.
+            return np.quantile(self.vals, self.q, interpolation='nearest'), self.n
+        return None, self.n
+
+    def get_data(self):
+        # Raw samples, so a higher-level meter can merge them.
+        return self.vals, self.n
+
+
+
+class MaxMeter(object):
+    """Tracks the largest value recorded since the last reset."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.n = 0
+        self.max = None
+
+    def record(self, val, n=1):
+        self.max = val if self.max is None else max(self.max, val)
+        # The count is overwritten, not accumulated.
+        self.n = n
+
+    def get_val(self):
+        return self.max, self.n
+
+    def get_data(self):
+        return self.max, self.n
+
+
+
+class MinMeter(object):
+    """Tracks the smallest value recorded since the last reset."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Forget the current minimum and count."""
+        self.min = None
+        self.n = 0
+
+    def record(self, val, n=1):
+        """Fold ``val`` into the running minimum; ``n`` replaces the count."""
+        if self.min is None:
+            self.min = val
+        else:
+            # The original called max() here, so the meter reported the
+            # largest value instead of the smallest.
+            self.min = min(self.min, val)
+        self.n = n
+
+    def get_val(self):
+        """Return ``(minimum, n)``; the minimum is None if nothing was recorded."""
+        return self.min, self.n
+
+    def get_data(self):
+        """Return ``(minimum, n)`` for a higher-level meter to merge."""
+        return self.min, self.n
+
+
+
+class LastMeter(object):
+    """Remembers only the most recently recorded value and its count."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.last, self.n = None, 0
+
+    def record(self, val, n=1):
+        self.last, self.n = val, n
+
+    def get_val(self):
+        latest = (self.last, self.n)
+        return latest
+
+    def get_data(self):
+        latest = (self.last, self.n)
+        return latest
+
+
+
+class AverageMeter(object):
+    """Weighted running mean: accumulates ``val * n`` and the total ``n``."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.n = 0
+
+    def record(self, val, n=1):
+        self.n += n
+        self.val += val * n
+
+    def get_val(self):
+        # Nothing recorded yet: report None rather than divide by zero.
+        if self.n != 0:
+            return self.val / self.n, self.n
+        return None, 0
+
+    def get_data(self):
+        # Same payload as get_val; consumed by the next aggregation level.
+        if self.n != 0:
+            return self.val / self.n, self.n
+        return None, 0
+
+
+
+class Logger(object):
+    """Tracks epoch/iteration counters and registered metrics, logging via dllogger.
+
+    Metrics are stored by name together with a verbosity level. Entries are
+    emitted with ``dllogger.log`` at iteration, epoch and run granularity.
+    """
+
+    def __init__(self, print_interval, backends, verbose=False):
+        """Store the settings and initialise dllogger with ``backends``."""
+        # Counters start at -1; start_epoch()/start_iteration() advance them.
+        self.epoch = -1
+        self.iteration = -1
+        self.val_iteration = -1
+        # metric name -> {'meter': <meter>, 'level': <verbosity>}
+        self.metrics = OrderedDict()
+        self.backends = backends
+        self.print_interval = print_interval
+        self.verbose = verbose
+        dllogger.init(backends)
+
+    def log_parameter(self, data, verbosity=0):
+        """Log ``data`` under the fixed step name "PARAMETER"."""
+        dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
+
+    # NOTE(review): ``metadata={}`` is a mutable default shared across calls.
+    # It is only forwarded here, but would leak state if dllogger.metadata
+    # mutated it -- confirm.
+    def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
+        """Register ``meter`` under ``metric_name`` and pass ``metadata`` to dllogger."""
+        if self.verbose:
+            print("Registering metric: {}".format(metric_name))
+        self.metrics[metric_name] = {'meter': meter, 'level': verbosity}
+        dllogger.metadata(metric_name, metadata)
+
+    def log_metric(self, metric_name, val, n=1):
+        """Record ``val`` (with weight ``n``) into a registered metric.
+
+        Raises KeyError if ``metric_name`` was never registered.
+        """
+        self.metrics[metric_name]['meter'].record(val, n=n)
+
+    def start_iteration(self, val=False):
+        """Advance the validation counter if ``val``, else the training counter."""
+        if val:
+            self.val_iteration += 1
+        else:
+            self.iteration += 1
+
+    def end_iteration(self, val=False):
+        """Log and reset iteration-level values every ``print_interval`` iterations.
+
+        Nothing happens (no logging, no meter reset) on other iterations.
+        """
+        it = self.val_iteration if val else self.iteration
+        if (it % self.print_interval == 0):
+            # Validation metrics are those whose name starts with 'val';
+            # keep only the group matching this call.
+            metrics = {
+                n: m
+                for n, m in self.metrics.items() if n.startswith('val') == val
+            }
+            # Step is (epoch, iteration), plus the validation iteration
+            # when validating.
+            step = (self.epoch,
+                    self.iteration) if not val else (self.epoch,
+                                                     self.iteration,
+                                                     self.val_iteration)
+
+            # One dllogger entry per distinct verbosity level.
+            verbositys = {m['level'] for _, m in metrics.items()}
+            for ll in verbositys:
+                llm = {n: m for n, m in metrics.items() if m['level'] == ll}
+
+                dllogger.log(step=step,
+                         data={
+                             n: m['meter'].get_iteration()
+                             for n, m in llm.items()
+                         },
+                         verbosity=ll)
+
+            for n, m in metrics.items():
+                m['meter'].reset_iteration()
+
+            dllogger.flush()
+
+    def start_epoch(self):
+        """Advance the epoch, zero both iteration counters, reset epoch meters."""
+        self.epoch += 1
+        # Reset to 0 (not -1), so the first start_iteration() of an epoch
+        # makes the counter 1.
+        self.iteration = 0
+        self.val_iteration = 0
+
+        for n, m in self.metrics.items():
+            m['meter'].reset_epoch()
+
+    def end_epoch(self):
+        """Reset every meter's iteration level, then log the epoch values."""
+        for n, m in self.metrics.items():
+            m['meter'].reset_iteration()
+
+        verbositys = {m['level'] for _, m in self.metrics.items()}
+        for ll in verbositys:
+            llm = {n: m for n, m in self.metrics.items() if m['level'] == ll}
+            # NOTE(review): unlike end_iteration, no ``verbosity=ll`` is
+            # passed, so dllogger's default applies -- confirm intended.
+            dllogger.log(step=(self.epoch, ),
+                     data={n: m['meter'].get_epoch()
+                           for n, m in llm.items()})
+
+    def end(self):
+        """Reset every meter's epoch level, log the run values and flush."""
+        for n, m in self.metrics.items():
+            m['meter'].reset_epoch()
+
+        verbositys = {m['level'] for _, m in self.metrics.items()}
+        for ll in verbositys:
+            llm = {n: m for n, m in self.metrics.items() if m['level'] == ll}
+            # An empty tuple is used as the step for the run-level entry;
+            # as in end_epoch, no verbosity is passed.
+            dllogger.log(step=tuple(),
+                     data={n: m['meter'].get_run()
+                           for n, m in llm.items()})
+
+        # Second reset_epoch() pass, after logging.
+        for n, m in self.metrics.items():
+            m['meter'].reset_epoch()
+
+        dllogger.flush()
+
+    def iteration_generator_wrapper(self, gen, val=False):
+        """Yield items of ``gen``, calling start/end_iteration around each."""
+        for g in gen:
+            self.start_iteration(val=val)
+            yield g
+            # Runs only when the consumer requests the next item.
+            self.end_iteration(val=val)
+
+    def epoch_generator_wrapper(self, gen):
+        """Yield items of ``gen``, calling start/end_epoch around each."""
+        for g in gen:
+            self.start_epoch()
+            yield g
+            # Runs only when the consumer requests the next item.
+            self.end_epoch()
@@ -0,0 +1,67 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+def mixup(alpha, num_classes, data, target):
+    """Blend each sample (and its target) with another sample of the batch.
+
+    A single coefficient ``c`` is drawn from Beta(alpha, alpha) for the
+    whole batch. Each sample is combined with a randomly permuted partner
+    as ``c * x + (1 - c) * x[perm]``.
+
+    Args:
+        alpha: Parameter of the symmetric Beta distribution.
+        num_classes: Unused; kept for interface compatibility.
+        data: Input batch, indexed along dimension 0.
+        target: Target batch with at least two dimensions (it is indexed as
+            ``target[perm, :]``); presumably one-hot rows -- confirm.
+
+    Returns:
+        ``(mixed_data, mixed_target)``.
+    """
+    with torch.no_grad():
+        bs = data.size(0)
+        # np.float64 subclasses float; convert explicitly so the blend
+        # below is plainly tensor-by-Python-scalar arithmetic.
+        c = float(np.random.beta(alpha, alpha))
+
+        # Draw the permutation on the CPU as before, then move it to the
+        # batch's own device instead of a hard-coded .cuda().
+        perm = torch.randperm(bs).to(data.device)
+
+        md = c * data + (1-c) * data[perm, :]
+        mt = c * target + (1-c) * target[perm, :]
+        return md, mt
+
+
+class MixUpWrapper(object):
+    """Iterable that applies ``mixup`` to every batch of a wrapped loader."""
+
+    def __init__(self, alpha, num_classes, dataloader):
+        self.dataloader = dataloader
+        self.num_classes = num_classes
+        self.alpha = alpha
+
+    def mixup_loader(self, loader):
+        # Lazily transform batches one at a time.
+        for batch_input, batch_target in loader:
+            mixed_input, mixed_target = mixup(
+                self.alpha, self.num_classes, batch_input, batch_target
+            )
+            yield mixed_input, mixed_target
+
+    def __iter__(self):
+        wrapped = self.dataloader
+        return self.mixup_loader(wrapped)
+
+
+
+class NLLMultiLabelSmooth(nn.Module):
+    """Label-smoothed negative log-likelihood for soft (per-class) targets.
+
+    In training mode the loss blends the target-weighted NLL with the mean
+    negative log-probability over classes, weighted ``1 - smoothing`` and
+    ``smoothing``. In eval mode it is plain ``cross_entropy``.
+    """
+
+    def __init__(self, smoothing = 0.0):
+        super(NLLMultiLabelSmooth, self).__init__()
+        self.smoothing = smoothing
+        self.confidence = 1.0 - smoothing
+
+    def forward(self, x, target):
+        if not self.training:
+            return torch.nn.functional.cross_entropy(x, target)
+
+        # Compute in float32 whatever the input precision.
+        log_probs = torch.nn.functional.log_softmax(x.float(), dim=-1)
+
+        per_sample_nll = (-log_probs * target.float()).sum(-1)
+        per_sample_uniform = -log_probs.mean(dim=-1)
+
+        blended = self.confidence * per_sample_nll + self.smoothing * per_sample_uniform
+        return blended.mean()
@@ -0,0 +1,370 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION
+# Copyright (c) 2017-      Facebook, Inc
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import math
+import torch
+import torch.nn as nn
+import numpy as np
+
+__all__ = ['ResNet', 'build_resnet', 'resnet_versions', 'resnet_configs']
+
+# ResNetBuilder {{{
+
+class ResNetBuilder(object):
+    def __init__(self, version, config):
+        self.conv3x3_cardinality = 1 if 'cardinality' not in version.keys() else version['cardinality']
+        self.config = config
+
+    def conv(self, kernel_size, in_planes, out_planes, groups=1, stride=1):
+        conv = nn.Conv2d(
+                in_planes, out_planes,
+                kernel_size=kernel_size, groups=groups,
+                stride=stride, padding=int((kernel_size - 1)/2),
+                bias=False)
+
+        if self.config['nonlinearity'] == 'relu': 
+            # torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
+            # Copy
+            # 用论文 “Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification” - He, K. et al. (2015) 中提及的正态分布初始化输入 Tensor。初始化后的张量中的值采样 ) 且
+            # %20%5Ctimes%20%5Ctext%7Bfan%5C_in%7D%7D%7D%0D%0A%0D%0A)
+            # 也被称作 He initialization。
+            # 参数：
+            # tensor – n 维 torch.Tensor
+            # a – 该层后面一层的整流函数中负的斜率 (默认为 0，此时为 Relu)
+            # mode – 'fan_in' (default) 或者 'fan_out'。使用fan_in保持weights的方差在前向传播中不变；使用fan_out保持weights的方差在反向传播中不变。
+            # nonlinearity – 非线性函数 (nn.functional 中的名字)，推荐只使用 'relu' 或 'leaky_relu' (default)。
+            # 例子
+            # >>> w = torch.empty(3, 5)
+            # >>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
+            nn.init.kaiming_normal_(conv.weight,
+                    mode=self.config['conv_init'],
+                    nonlinearity=self.config['nonlinearity'])
+
+
+
+
+        return conv
+
+    def conv3x3(self, in_planes, out_planes, stride=1):
+        """3x3 convolution with padding"""
+        c = self.conv(3, in_planes, out_planes, groups=self.conv3x3_cardinality, stride=stride)
+        return c
+
+    def conv1x1(self, in_planes, out_planes, stride=1):
+        """1x1 convolution with padding"""
+        c = self.conv(1, in_planes, out_planes, stride=stride)
+        return c
+
+    def conv7x7(self, in_planes, out_planes, stride=1):
+        """7x7 convolution with padding"""
+        c = self.conv(7, in_planes, out_planes, stride=stride)
+        return c
+
+    def conv5x5(self, in_planes, out_planes, stride=1):
+        """5x5 convolution with padding"""
+        c = self.conv(5, in_planes, out_planes, stride=stride)
+        return c
+
+    def batchnorm(self, planes, last_bn=False):
+        bn = nn.BatchNorm2d(planes)
+        gamma_init_val = 0 if last_bn and self.config['last_bn_0_init'] else 1
+        nn.init.constant_(bn.weight, gamma_init_val)
+        nn.init.constant_(bn.bias, 0)
+
+        return bn
+
+    def activation(self):
+        return self.config['activation']()
+
+# ResNetBuilder }}}
+
+# BasicBlock {{{
+class BasicBlock(nn.Module):
+    def __init__(self, builder, inplanes, planes, expansion, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = builder.conv3x3(inplanes, planes, stride)
+        self.bn1 = builder.batchnorm(planes)
+        self.relu = builder.activation()
+        self.conv2 = builder.conv3x3(planes, planes*expansion)
+        self.bn2 = builder.batchnorm(planes*expansion, last_bn=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        if self.bn1 is not None:
+            out = self.bn1(out)
+
+        out = self.relu(out)
+
+        out = self.conv2(out)
+
+        if self.bn2 is not None:
+            out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+# BasicBlock }}}
+
+# SqueezeAndExcitation {{{
+class SqueezeAndExcitation(nn.Module):
+    def __init__(self, planes, squeeze):
+        super(SqueezeAndExcitation, self).__init__()
+        self.squeeze = nn.Linear(planes, squeeze)
+        self.expand = nn.Linear(squeeze, planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        out = torch.mean(x.view(x.size(0), x.size(1), -1), 2)
+        out = self.squeeze(out)
+        out = self.relu(out)
+        out = self.expand(out)
+        out = self.sigmoid(out)
+        out = out.unsqueeze(2).unsqueeze(3)
+
+        return out
+
+# }}}
+
+# Bottleneck {{{
+class Bottleneck(nn.Module):
+    def __init__(self, builder, inplanes, planes, expansion, stride=1, se=False, se_squeeze=16, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = builder.conv1x1(inplanes, planes)
+        self.bn1 = builder.batchnorm(planes)
+        self.conv2 = builder.conv3x3(planes, planes, stride=stride)
+        self.bn2 = builder.batchnorm(planes)
+        self.conv3 = builder.conv1x1(planes, planes * expansion)
+        self.bn3 = builder.batchnorm(planes * expansion, last_bn=True)
+        self.relu = builder.activation()
+        self.downsample = downsample
+        self.stride = stride
+        self.squeeze = SqueezeAndExcitation(planes*expansion, se_squeeze) if se else None
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        if self.squeeze is None:
+            out += residual
+        else:
+            out = torch.addcmul(residual, 1.0, out, self.squeeze(out))
+
+        out = self.relu(out)
+
+        return out
+
+def SEBottleneck(builder, inplanes, planes, expansion, stride=1, downsample=None):
+    return Bottleneck(builder, inplanes, planes, expansion, stride=stride, se=True, se_squeeze=16, downsample=downsample)
+# Bottleneck }}}
+
+# ResNet {{{
+class ResNet(nn.Module):
+    def __init__(self, builder, block, expansion, layers, widths, num_classes=1000):
+        self.inplanes = 64
+        super(ResNet, self).__init__()
+        self.conv1 = builder.conv7x7(3, 64, stride=2)
+        self.bn1 = builder.batchnorm(64)
+        self.relu = builder.activation()
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(builder, block, expansion, widths[0], layers[0])
+        self.layer2 = self._make_layer(builder, block, expansion, widths[1], layers[1], stride=2)
+        self.layer3 = self._make_layer(builder, block, expansion, widths[2], layers[2], stride=2)
+        self.layer4 = self._make_layer(builder, block, expansion, widths[3], layers[3], stride=2)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Linear(widths[3] * expansion, num_classes)
+
+    def _make_layer(self, builder, block, expansion, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * expansion:
+            dconv = builder.conv1x1(self.inplanes, planes * expansion,
+                                    stride=stride)
+            dbn = builder.batchnorm(planes * expansion)
+            if dbn is not None:
+                downsample = nn.Sequential(dconv, dbn)
+            else:
+                downsample = dconv
+
+        layers = []
+        layers.append(block(builder, self.inplanes, planes, expansion, stride=stride, downsample=downsample))
+        self.inplanes = planes * expansion
+        for i in range(1, blocks):
+            layers.append(block(builder, self.inplanes, planes, expansion))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        if self.bn1 is not None:
+            x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+# ResNet }}}
+
+resnet_configs = {
+        'classic' : {
+            'conv' : nn.Conv2d,
+            'conv_init' : 'fan_out',
+            'nonlinearity' : 'relu',
+            'last_bn_0_init' : False,
+            'activation' : lambda: nn.ReLU(inplace=True),
+            },
+        'fanin' : {
+            'conv' : nn.Conv2d,
+            'conv_init' : 'fan_in',
+            'nonlinearity' : 'relu',
+            'last_bn_0_init' : False,
+            'activation' : lambda: nn.ReLU(inplace=True),
+            },
+        'grp-fanin' : {
+            'conv' : nn.Conv2d,
+            'conv_init' : 'fan_in',
+            'nonlinearity' : 'relu',
+            'last_bn_0_init' : False,
+            'activation' : lambda: nn.ReLU(inplace=True),
+            },
+        'grp-fanout' : {
+            'conv' : nn.Conv2d,
+            'conv_init' : 'fan_out',
+            'nonlinearity' : 'relu',
+            'last_bn_0_init' : False,
+            'activation' : lambda: nn.ReLU(inplace=True),
+            },
+        }
+
+resnet_versions = {
+        'resnet18' : {
+            'net' : ResNet,
+            'block' : BasicBlock,
+            'layers' : [2, 2, 2, 2],
+            'widths' : [64, 128, 256, 512],
+            'expansion' : 1,
+            'num_classes' : 1000,
+            },
+         'resnet34' : {
+            'net' : ResNet,
+            'block' : BasicBlock,
+            'layers' : [3, 4, 6, 3],
+            'widths' : [64, 128, 256, 512],
+            'expansion' : 1,
+            'num_classes' : 1000,
+            },
+         'resnet50' : {
+            'net' : ResNet,
+            'block' : Bottleneck,
+            'layers' : [3, 4, 6, 3],
+            'widths' : [64, 128, 256, 512],
+            'expansion' : 4,
+            'num_classes' : 1000,
+            },
+        'resnet101' : {
+            'net' : ResNet,
+            'block' : Bottleneck,
+            'layers' : [3, 4, 23, 3],
+            'widths' : [64, 128, 256, 512],
+            'expansion' : 4,
+            'num_classes' : 1000,
+            },
+        'resnet152' : {
+            'net' : ResNet,
+            'block' : Bottleneck,
+            'layers' : [3, 8, 36, 3],
+            'widths' : [64, 128, 256, 512],
+            'expansion' : 4,
+            'num_classes' : 1000,
+            },
+        'resnext101-32x4d' : {
+            'net' : ResNet,
+            'block' : Bottleneck,
+            'cardinality' : 32,
+            'layers' : [3, 4, 23, 3],
+            'widths' : [128, 256, 512, 1024],
+            'expansion' : 2,
+            'num_classes' : 1000,
+            },
+        'se-resnext101-32x4d' : {
+            'net' : ResNet,
+            'block' : SEBottleneck,
+            'cardinality' : 32,
+            'layers' : [3, 4, 23, 3],
+            'widths' : [128, 256, 512, 1024],
+            'expansion' : 2,
+            'num_classes' : 1000,
+            },
+        }
+
+
+def build_resnet(version, config, verbose=True):
+    version = resnet_versions[version]
+    config = resnet_configs[config]
+
+    builder = ResNetBuilder(version, config)
+    if verbose:
+        print("Version: {}".format(version))
+        print("Config: {}".format(config))
+    model = version['net'](builder,
+                           version['block'],
+                           version['expansion'],
+                           version['layers'],
+                           version['widths'],
+                           version['num_classes'])
+
+    return model
@@ -0,0 +1,91 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+
+
+class CrossEntropy(nn.CrossEntropyLoss):
+    def __init__(self, smooth_factor=0., num_classes=1000):
+        super(CrossEntropy, self).__init__()
+        self.on_value = 1.0 - smooth_factor
+        self.off_value = 1.0 * smooth_factor / (num_classes - 1)
+
+    def forward(self, input, target):
+        one_hot_label = torch.npu_one_hot(target, -1, input.size(1), self.on_value, self.off_value)
+        one_hot_label = one_hot_label.to(torch.float16)
+        loss = torch.npu_softmax_cross_entropy_with_logits(input.to(torch.float16), one_hot_label)
+
+        loss = torch.mean(loss, [0], keepdim=False, dtype=torch.float32)
+        return loss
+
+class LabelSmoothingNpu(nn.Module):
+    """
+    NLL loss with label smoothing.
+    """
+    def __init__(self, smoothing=0.0):
+        """
+        Constructor for the LabelSmoothing module.
+
+        :param smoothing: label smoothing factor
+        """
+        super(LabelSmoothingNpu, self).__init__()
+        self.confidence = 1.0 - smoothing
+        self.smoothing = smoothing
+
+        self.epsilon = 0.1
+        self.num_classes = 1000
+
+    def forward(self, x, target):
+        CALCULATE_DEVICE = x.device
+        logprobs = torch.nn.functional.log_softmax(x, dim=-1).to("cpu")
+
+        targets = torch.zeros_like(logprobs).scatter_(1, target.unsqueeze(1), 1)
+        targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes
+        loss = (-targets * logprobs).mean(0).sum()
+
+        # nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
+        # nll_loss = nll_loss.squeeze(1)
+        # smooth_loss = -logprobs.mean(dim=-1)
+        # loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+        return loss.to(CALCULATE_DEVICE)
+
+
+class LabelSmoothingGpu(nn.Module):
+    """
+    NLL loss with label smoothing.
+    """
+    def __init__(self, smoothing=0.0):
+        """
+        Constructor for the LabelSmoothing module.
+
+        :param smoothing: label smoothing factor
+        """
+        super(LabelSmoothingGpu, self).__init__()
+        self.confidence = 1.0 - smoothing
+        self.smoothing = smoothing
+    #     print("----------------------LabelSooothing.__init__")
+    # def __call__(self,x,target):
+    #     print("----------------------LabelSooothing.__call__")
+    #     return self.forward(self,x,target)
+
+    def forward(self, x, target):
+        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
+
+        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
+        nll_loss = nll_loss.squeeze(1)
+        smooth_loss = -logprobs.mean(dim=-1)
+        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+        #print("================",type(x),x.size())
+        #print("------------------",type(target),target.size(),target)
+        return loss.mean()
@@ -0,0 +1,51 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+
+class LabelSmoothing(nn.Module):
+    """
+    NLL loss with label smoothing.
+    """
+    def __init__(self, smoothing=0.0):
+        """
+        Constructor for the LabelSmoothing module.
+
+        :param smoothing: label smoothing factor
+        """
+        super(LabelSmoothing, self).__init__()
+        self.confidence = 1.0 - smoothing
+        self.smoothing = smoothing
+    #     print("----------------------LabelSooothing.__init__")
+    # def __call__(self,x,target):
+    #     print("----------------------LabelSooothing.__call__")
+    #     return self.forward(self,x,target)
+
+    def forward(self, x, target):
+        device_x = x.device
+        device_target = target.device
+        x = x.to("cpu")
+        target = target.to("cpu")
+        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
+
+        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
+        nll_loss = nll_loss.squeeze(1)
+        smooth_loss = -logprobs.mean(dim=-1)
+        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
+        #print("================",type(x),x.size())
+        #print("------------------",type(target),target.size(),target)
+        
+        x = x.to(device_x)
+        target = target.to(device_target)
+        return loss.mean()
@@ -0,0 +1,534 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION
+# Copyright (c) 2017-      Facebook, Inc
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import time
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from . import logger as log
+from . import resnet as nvmodels
+from . import utils
+import dllogger
+try:
+    #from apex.parallel import DistributedDataParallel as DDP  #可以采用pytorch torch.distributed
+    from apex.fp16_utils import *
+    from apex import amp
+except ImportError:
+    raise ImportError(
+        "Please install apex from https://www.github.com/nvidia/apex to run this example."
+    )
+
+ACC_METADATA = {'unit': '%','format': ':.2f'}
+IPS_METADATA = {'unit': 'img/s', 'format': ':.2f'}
+TIME_METADATA = {'unit': 's', 'format': ':.5f'}
+LOSS_METADATA = {'format': ':.5f'}
+
+
+class ModelAndLoss(nn.Module):
+    def __init__(self,
+                 arch,
+                 loss,
+                 pretrained_weights=None,
+                 cuda=True,
+                 fp16=False):
+        super(ModelAndLoss, self).__init__()
+        self.arch = arch
+
+        print("=> creating model '{}'".format(arch))
+        model = nvmodels.build_resnet(arch[0], arch[1])
+        if pretrained_weights is not None:
+            print("=> using pre-trained model from a file '{}'".format(arch))
+            model.load_state_dict(pretrained_weights)
+
+        if cuda:
+            model = model.cuda()
+        if fp16:
+            model = network_to_half(model)
+
+        # define loss function (criterion) and optimizer
+        criterion = loss()
+
+        if cuda:
+            criterion = criterion.cuda()
+
+        self.model = model
+        self.loss = criterion
+
+    def forward(self, data, target):
+        output = self.model(data)
+        loss = self.loss(output, target)
+
+        return loss, output
+
+    def distributed(self):
+        #self.model = DDP(self.model)
+        return
+
+    def load_model_state(self, state):
+        if not state is None:
+            self.model.load_state_dict(state)
+
+
+def get_optimizer(parameters,
+                  fp16,
+                  lr,
+                  momentum,
+                  weight_decay,
+                  nesterov=False,
+                  state=None,
+                  static_loss_scale=1.,
+                  dynamic_loss_scale=False,
+                  bn_weight_decay=False):
+
+    if bn_weight_decay:
+        print(" ! Weight decay applied to BN parameters ")
+        optimizer = torch.optim.SGD([v for n, v in parameters],
+                                    lr,
+                                    momentum=momentum,
+                                    weight_decay=weight_decay,
+                                    nesterov=nesterov)
+    else:
+        print(" ! Weight decay NOT applied to BN parameters ")
+        bn_params = [v for n, v in parameters if 'bn' in n]
+        rest_params = [v for n, v in parameters if not 'bn' in n]
+        print(len(bn_params))
+        print(len(rest_params))
+        optimizer = torch.optim.SGD([{
+            'params': bn_params,
+            'weight_decay': 0
+        }, {
+            'params': rest_params,
+            'weight_decay': weight_decay
+        }],
+                                    lr,
+                                    momentum=momentum,
+                                    weight_decay=weight_decay,
+                                    nesterov=nesterov)
+    if fp16:
+        optimizer = FP16_Optimizer(optimizer,
+                                   static_loss_scale=static_loss_scale,
+                                   dynamic_loss_scale=dynamic_loss_scale,
+                                   verbose=False)
+
+    if not state is None:
+        optimizer.load_state_dict(state)
+
+    return optimizer
+
+
+def lr_policy(lr_fn, logger=None):
+    if logger is not None:
+        logger.register_metric('lr',
+                               log.LR_METER(),
+                               verbosity=dllogger.Verbosity.VERBOSE)
+
+    def _alr(optimizer, iteration, epoch):
+        lr = lr_fn(iteration, epoch)
+
+        if logger is not None:
+            logger.log_metric('lr', lr)
+        for param_group in optimizer.param_groups:
+            param_group['lr'] = lr
+
+    return _alr
+
+
+def lr_step_policy(base_lr, steps, decay_factor, warmup_length, logger=None):
+    def _lr_fn(iteration, epoch):
+        if epoch < warmup_length:
+            lr = base_lr * (epoch + 1) / warmup_length
+        else:
+            lr = base_lr
+            for s in steps:
+                if epoch >= s:
+                    lr *= decay_factor
+        return lr
+
+    return lr_policy(_lr_fn, logger=logger)
+
+
+def lr_linear_policy(base_lr, warmup_length, epochs, logger=None):
+    def _lr_fn(iteration, epoch):
+        if epoch < warmup_length:
+            lr = base_lr * (epoch + 1) / warmup_length
+        else:
+            e = epoch - warmup_length
+            es = epochs - warmup_length
+            lr = base_lr * (1 - (e / es))
+        return lr
+
+    return lr_policy(_lr_fn, logger=logger)
+
+
+def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
+    def _lr_fn(iteration, epoch):
+        if epoch < warmup_length:
+            lr = base_lr * (epoch + 1) / warmup_length
+        else:
+            e = epoch - warmup_length
+            es = epochs - warmup_length
+            lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
+        return lr
+
+    return lr_policy(_lr_fn, logger=logger)
+
+
+def lr_exponential_policy(base_lr,
+                          warmup_length,
+                          epochs,
+                          final_multiplier=0.001,
+                          logger=None):
+    es = epochs - warmup_length
+    epoch_decay = np.power(2, np.log2(final_multiplier) / es)
+
+    def _lr_fn(iteration, epoch):
+        if epoch < warmup_length:
+            lr = base_lr * (epoch + 1) / warmup_length
+        else:
+            e = epoch - warmup_length
+            lr = base_lr * (epoch_decay**e)
+        return lr
+
+    return lr_policy(_lr_fn, logger=logger)
+
+
+def get_train_step(model_and_loss,
+                   optimizer,
+                   fp16,
+                   use_amp=False,
+                   batch_size_multiplier=1):
+    def _step(input, target, optimizer_step=True):
+        input_var = Variable(input)
+        target_var = Variable(target)
+        loss, output = model_and_loss(input_var, target_var)
+        if torch.distributed.is_initialized():
+            print('utils.reduce_tensor(loss.data)')
+            reduced_loss = utils.reduce_tensor(loss.data)
+        else:
+            reduced_loss = loss.data
+
+        if fp16:
+            optimizer.backward(loss)
+        elif use_amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+
+        if optimizer_step:
+            opt = optimizer.optimizer if isinstance(
+                optimizer, FP16_Optimizer) else optimizer
+            for param_group in opt.param_groups:
+                for param in param_group['params']:
+                    param.grad /= batch_size_multiplier
+
+            optimizer.step()
+            optimizer.zero_grad()
+
+        torch.cuda.synchronize()
+
+        return reduced_loss
+
+    return _step
+
+
+def train(train_loader,
+          model_and_loss,
+          optimizer,
+          lr_scheduler,
+          fp16,
+          logger,
+          epoch,
+          use_amp=False,
+          prof=-1,
+          batch_size_multiplier=1,
+          register_metrics=True):
+
+    if register_metrics and logger is not None:
+        logger.register_metric('train.loss',
+                               log.LOSS_METER(),
+                               verbosity=dllogger.Verbosity.DEFAULT,
+                               metadata=LOSS_METADATA)
+        logger.register_metric('train.compute_ips',
+                               log.PERF_METER(),
+                               verbosity=dllogger.Verbosity.VERBOSE,
+                               metadata=IPS_METADATA)
+        logger.register_metric('train.total_ips',
+                               log.PERF_METER(),
+                               verbosity=dllogger.Verbosity.DEFAULT,
+                               metadata=IPS_METADATA)
+        logger.register_metric('train.data_time',
+                               log.PERF_METER(),
+                               verbosity=dllogger.Verbosity.VERBOSE,
+                               metadata=TIME_METADATA)
+        logger.register_metric('train.compute_time',
+                               log.PERF_METER(),
+                               verbosity=dllogger.Verbosity.VERBOSE,
+                               metadata=TIME_METADATA)
+
+    step = get_train_step(model_and_loss,
+                          optimizer,
+                          fp16,
+                          use_amp=use_amp,
+                          batch_size_multiplier=batch_size_multiplier)
+
+    model_and_loss.train()
+    end = time.time()
+
+    optimizer.zero_grad()
+
+    data_iter = enumerate(train_loader)
+    if logger is not None:
+        data_iter = logger.iteration_generator_wrapper(data_iter)
+    if prof > 0:
+        data_iter = utils.first_n(prof, data_iter)
+
+    for i, (input, target) in data_iter:
+        bs = input.size(0)
+        lr_scheduler(optimizer, i, epoch)
+        data_time = time.time() - end
+
+        optimizer_step = ((i + 1) % batch_size_multiplier) == 0
+        loss = step(input, target, optimizer_step=optimizer_step)
+
+        it_time = time.time() - end
+
+        if logger is not None:
+            logger.log_metric('train.loss', to_python_float(loss), bs)
+            logger.log_metric('train.compute_ips',
+                              calc_ips(bs, it_time - data_time))
+            logger.log_metric('train.total_ips', calc_ips(bs, it_time))
+            logger.log_metric('train.data_time', data_time)
+            logger.log_metric('train.compute_time', it_time - data_time)
+
+        end = time.time()
+
+
+def get_val_step(model_and_loss):
+    def _step(input, target):
+        input_var = Variable(input)
+        target_var = Variable(target)
+
+        with torch.no_grad():
+            loss, output = model_and_loss(input_var, target_var)
+
+        prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
+
+        if torch.distributed.is_initialized():
+            reduced_loss = utils.reduce_tensor(loss.data)
+            prec1 = utils.reduce_tensor(prec1)
+            prec5 = utils.reduce_tensor(prec5)
+        else:
+            reduced_loss = loss.data
+
+        torch.cuda.synchronize()
+
+        return reduced_loss, prec1, prec5
+
+    return _step
+
+
+def validate(val_loader,
+             model_and_loss,
+             fp16,
+             logger,
+             epoch,
+             prof=-1,
+             register_metrics=True):
+    """Evaluate ``model_and_loss`` over ``val_loader`` and log batch metrics.
+
+    Args:
+        val_loader: iterable yielding ``(input, target)`` batches.
+        model_and_loss: module put into eval mode and driven through the
+            closure returned by ``get_val_step``.
+        fp16: not read in this function.
+        logger: optional metric logger; ``None`` disables registration and
+            logging.
+        epoch: not read in this function.
+        prof: when greater than zero, only the first ``prof`` batches run.
+        register_metrics: register the ``val.*`` metrics on ``logger`` first.
+
+    Returns:
+        The result of ``get_val()`` on the top-1 ``log.AverageMeter``.
+    """
+    if register_metrics and logger is not None:
+        default = dllogger.Verbosity.DEFAULT
+        verbose = dllogger.Verbosity.VERBOSE
+        # (metric name, meter factory, verbosity, metadata), in the order
+        # in which they are registered.
+        metric_specs = (
+            ('val.top1', log.ACC_METER, default, ACC_METADATA),
+            ('val.top5', log.ACC_METER, default, ACC_METADATA),
+            ('val.loss', log.LOSS_METER, default, LOSS_METADATA),
+            ('val.compute_ips', log.PERF_METER, verbose, IPS_METADATA),
+            ('val.total_ips', log.PERF_METER, default, IPS_METADATA),
+            ('val.data_time', log.PERF_METER, verbose, TIME_METADATA),
+            ('val.compute_latency', log.PERF_METER, verbose, TIME_METADATA),
+            ('val.compute_latency_at100', log.LAT_100, verbose,
+             TIME_METADATA),
+            ('val.compute_latency_at99', log.LAT_99, verbose, TIME_METADATA),
+            ('val.compute_latency_at95', log.LAT_95, verbose, TIME_METADATA),
+        )
+        for metric_name, make_meter, verbosity, metadata in metric_specs:
+            logger.register_metric(metric_name,
+                                   make_meter(),
+                                   verbosity=verbosity,
+                                   metadata=metadata)
+
+    step = get_val_step(model_and_loss)
+
+    top1 = log.AverageMeter()
+    # switch to evaluate mode
+    model_and_loss.eval()
+
+    end = time.time()
+
+    batches = enumerate(val_loader)
+    if logger is not None:
+        batches = logger.iteration_generator_wrapper(batches, val=True)
+    if prof > 0:
+        batches = utils.first_n(prof, batches)
+
+    for _, (batch, target) in batches:
+        bs = batch.size(0)
+        data_time = time.time() - end
+
+        loss, prec1, prec5 = step(batch, target)
+
+        it_time = time.time() - end
+        compute_time = it_time - data_time
+
+        top1.record(to_python_float(prec1), bs)
+        if logger is not None:
+            logger.log_metric('val.top1', to_python_float(prec1), bs)
+            logger.log_metric('val.top5', to_python_float(prec5), bs)
+            logger.log_metric('val.loss', to_python_float(loss), bs)
+            logger.log_metric('val.compute_ips', calc_ips(bs, compute_time))
+            logger.log_metric('val.total_ips', calc_ips(bs, it_time))
+            logger.log_metric('val.data_time', data_time)
+            # The same compute time feeds the plain latency metric and
+            # each percentile meter.
+            for latency_metric in ('val.compute_latency',
+                                   'val.compute_latency_at95',
+                                   'val.compute_latency_at99',
+                                   'val.compute_latency_at100'):
+                logger.log_metric(latency_metric, compute_time)
+
+        end = time.time()
+
+    return top1.get_val()
+
+
+# Train loop {{{
+def calc_ips(batch_size, time):
+    """Return items per second for one step, summed over all ranks.
+
+    ``batch_size`` is the per-process batch size and ``time`` the elapsed
+    seconds; the world size is 1 unless ``torch.distributed`` is initialized.
+    """
+    if torch.distributed.is_initialized():
+        world_size = torch.distributed.get_world_size()
+    else:
+        world_size = 1
+    return (world_size * batch_size) / time
+
+
+def train_loop(model_and_loss,
+               optimizer,
+               lr_scheduler,
+               train_loader,
+               val_loader,
+               epochs,
+               fp16,
+               logger,
+               should_backup_checkpoint,
+               use_amp=False,
+               batch_size_multiplier=1,
+               best_prec1=0,
+               start_epoch=0,
+               prof=-1,
+               skip_training=False,
+               skip_validation=False,
+               save_checkpoints=True,
+               checkpoint_dir='./'):
+    """Run the epoch loop: train, validate and checkpoint once per epoch.
+
+    Args:
+        model_and_loss: module exposing ``.arch`` and ``.model``; passed to
+            ``train`` and ``validate``.
+        optimizer: optimizer whose ``state_dict()`` is stored in checkpoints.
+        lr_scheduler: forwarded to ``train``.
+        train_loader, val_loader: data sources for ``train`` / ``validate``.
+        epochs: exclusive upper bound of the epoch range.
+        fp16: forwarded to ``train`` and ``validate``.
+        logger: optional metric logger; may be ``None``.
+        should_backup_checkpoint: callable ``epoch -> bool`` deciding whether
+            an extra ``checkpoint-<epoch+1>.pth.tar`` copy is written.
+        use_amp, batch_size_multiplier, prof: forwarded to ``train``
+            (``prof`` also to ``validate``).
+        best_prec1: best top-1 seen so far (e.g. from a resumed checkpoint).
+        start_epoch: first epoch index to run.
+        skip_training, skip_validation: disable the respective phase.
+        save_checkpoints: write a checkpoint after each epoch (rank 0 only
+            when ``torch.distributed`` is initialized).
+        checkpoint_dir: directory passed to ``utils.save_checkpoint``.
+    """
+    prec1 = -1
+
+    for epoch in range(start_epoch, epochs):
+        if logger is not None:
+            logger.start_epoch()
+        if not skip_training:
+            train(train_loader,
+                  model_and_loss,
+                  optimizer,
+                  lr_scheduler,
+                  fp16,
+                  logger,
+                  epoch,
+                  use_amp=use_amp,
+                  prof=prof,
+                  register_metrics=epoch == start_epoch,
+                  batch_size_multiplier=batch_size_multiplier)
+
+        if not skip_validation:
+            # NOTE(review): this unpacking expects validate() to return a
+            # 2-tuple -- confirm against log.AverageMeter.get_val().
+            prec1, nimg = validate(val_loader,
+                                   model_and_loss,
+                                   fp16,
+                                   logger,
+                                   epoch,
+                                   prof=prof,
+                                   register_metrics=epoch == start_epoch)
+        if logger is not None:
+            logger.end_epoch()
+
+        is_main_process = (not torch.distributed.is_initialized()
+                           or torch.distributed.get_rank() == 0)
+        if not (save_checkpoints and is_main_process):
+            continue
+
+        if not skip_validation:
+            if logger is not None:
+                epoch_prec1 = logger.metrics['val.top1']['meter'].get_epoch()
+            else:
+                # Without a logger there is no epoch meter to read; use the
+                # value returned by validate() instead of dereferencing None.
+                epoch_prec1 = prec1
+            is_best = epoch_prec1 > best_prec1
+            best_prec1 = max(epoch_prec1, best_prec1)
+        else:
+            is_best = False
+            # NOTE(review): this discards a best_prec1 passed in by the
+            # caller when validation is skipped -- kept as in the original.
+            best_prec1 = 0
+
+        if should_backup_checkpoint(epoch):
+            backup_filename = 'checkpoint-{}.pth.tar'.format(epoch + 1)
+        else:
+            backup_filename = None
+        utils.save_checkpoint(
+            {
+                'epoch': epoch + 1,
+                'arch': model_and_loss.arch,
+                'state_dict': model_and_loss.model.state_dict(),
+                'best_prec1': best_prec1,
+                'optimizer': optimizer.state_dict(),
+            },
+            is_best,
+            checkpoint_dir=checkpoint_dir,
+            backup_filename=backup_filename)
+
+
+# }}}
@@ -0,0 +1,106 @@
+# Copyright (c) 2018-2019, NVIDIA CORPORATION
+# Copyright (c) 2017-      Facebook, Inc
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import shutil
+import time
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def should_backup_checkpoint(args):
+    """Return a predicate ``epoch -> bool`` for keeping a backup checkpoint.
+
+    With ``args.gather_checkpoints`` set, every epoch below 10 and every
+    epoch divisible by 10 is backed up; otherwise the falsy flag itself is
+    returned.
+    """
+    def _sbc(epoch):
+        enabled = args.gather_checkpoints
+        if not enabled:
+            return enabled
+        return epoch < 10 or epoch % 10 == 0
+
+    return _sbc
+
+
+def save_checkpoint(state,
+                    is_best,
+                    filename='checkpoint.pth.tar',
+                    checkpoint_dir='./',
+                    backup_filename=None):
+    """Write ``state`` to ``checkpoint_dir/filename`` on the main process.
+
+    The file is additionally copied to ``model_best.pth.tar`` when
+    ``is_best`` is true and to ``backup_filename`` when one is given.
+    Nothing happens on ranks other than 0 of an initialized process group.
+    """
+    if torch.distributed.is_initialized() and \
+            torch.distributed.get_rank() != 0:
+        return
+
+    filename = os.path.join(checkpoint_dir, filename)
+    print("SAVING {}".format(filename))
+    torch.save(state, filename)
+
+    copy_names = []
+    if is_best:
+        copy_names.append('model_best.pth.tar')
+    if backup_filename is not None:
+        copy_names.append(backup_filename)
+    for copy_name in copy_names:
+        shutil.copyfile(filename, os.path.join(checkpoint_dir, copy_name))
+
+
+def timed_generator(gen):
+    """Yield ``(item, seconds)`` for every item of ``gen``.
+
+    ``seconds`` is the time spent waiting for that item, i.e. the time
+    between resuming this generator and ``gen`` producing the item. Time
+    the consumer spends between iterations is not counted.
+
+    Relies on the module-level ``import time``, which the module
+    previously lacked (the original raised NameError on first use).
+    """
+    # perf_counter is monotonic, so intervals cannot go negative when the
+    # wall clock is adjusted.
+    start = time.perf_counter()
+    for g in gen:
+        elapsed = time.perf_counter() - start
+        yield g, elapsed
+        start = time.perf_counter()
+
+
+def timed_function(f):
+    """Wrap ``f`` so that calling it returns ``(result, seconds)``.
+
+    ``seconds`` is the elapsed time of the call to ``f``.
+
+    Relies on the module-level ``import time``, which the module
+    previously lacked (the original raised NameError on first use).
+    """
+    def _timed_function(*args, **kwargs):
+        # Monotonic clock: safe for measuring intervals.
+        start = time.perf_counter()
+        ret = f(*args, **kwargs)
+        return ret, time.perf_counter() - start
+
+    return _timed_function
+
+
+def accuracy(output, target, topk=(1, )):
+    """Computes the precision@k for the specified values of k.
+
+    Args:
+        output: 2-D score tensor; the top-k is taken along dimension 1.
+        target: 1-D tensor of class indices, one per row of ``output``.
+        topk: iterable of k values to evaluate.
+
+    Returns:
+        A list with one single-element tensor per k holding the percentage
+        of rows whose target is among the k highest scores.
+    """
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        # `correct` comes from a transposed tensor, so its row slice is not
+        # contiguous; .view(-1) raises for such tensors, .reshape(-1) copies
+        # only when needed.
+        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+
+def reduce_tensor(tensor):
+    """Return the element-wise mean of ``tensor`` over all ranks.
+
+    The input is cloned and never modified. When ``torch.distributed`` is
+    not initialized there is nothing to reduce and the clone is returned
+    unchanged (the original guarded only the divisor and still issued the
+    collective, which raises without a process group).
+    """
+    rt = tensor.clone()
+    if not dist.is_initialized():
+        return rt
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
+    rt /= dist.get_world_size()
+    return rt
+
+
+def first_n(n, generator):
+    """Yield at most the first ``n`` items of ``generator``.
+
+    No item beyond the ``n``-th is pulled from ``generator``.
+    """
+    if n <= 0:
+        return
+    for position, item in enumerate(generator, start=1):
+        yield item
+        if position >= n:
+            return
@@ -0,0 +1,609 @@
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.multiprocessing as mp
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+import torch.npu
+
+from apex import amp
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+'''
+python3.7 pytorch-resnet50-apex.py --data /opt/npu/dataset/imagenet --npu 7 -j64 -b512 --lr 0.2 --warmup 5 --epochs 90 --label-smoothing 0.1 --optimizer-batch-size 1024 > batch1024-lr0.2-wd.txt &
+'''
+# Default for -b/--batch-size.
+BATCH_SIZE = 512
+# Default for --epochs.
+EPOCHS_SIZE = 100
+# train() leaves its batch loop once the batch index equals this value.
+TRAIN_STEP = 8000
+# train()/validate() display progress whenever the batch index is a
+# multiple of this value.
+LOG_STEP = 1
+
+# Device string used to place the model, loss and batches; main() overwrites
+# it with "npu:<--npu>".
+CALCULATE_DEVICE = "npu:7"
+# Not referenced in the visible part of this script.
+PRINT_DEVICE = "cpu"
+# Default for --data.
+SOURCE_DIR = "/data/imagenet"
+
+# Sorted names of the lower-case, non-dunder callables in torchvision.models;
+# used as the allowed values of -a/--arch.
+model_names = sorted(name for name in models.__dict__
+                     if name.islower() and not name.startswith("__")
+                     and callable(models.__dict__[name]))
+
+# Command-line interface. Help texts take their defaults from argparse's
+# %(default)s placeholder so they cannot drift from the real default values
+# (they previously advertised resnet18 / 8 workers / batch size 256).
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser.add_argument('--data', metavar='DIR', default=SOURCE_DIR,
+                    help='path to dataset')
+parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
+                    choices=model_names,
+                    help='model architecture: ' +
+                         ' | '.join(model_names) +
+                         ' (default: %(default)s)')
+parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
+                    help='number of data loading workers '
+                         '(default: %(default)s)')
+parser.add_argument('--epochs', default=EPOCHS_SIZE, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
+                    metavar='N',
+                    help='mini-batch size (default: %(default)s), this is '
+                         'the total batch size of all GPUs on the current '
+                         'node when using Data Parallel or Distributed Data '
+                         'Parallel')
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='initial learning rate', dest='lr')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)',
+                    dest='weight_decay')
+parser.add_argument('-p', '--print-freq', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--world-size', default=-1, type=int,
+                    help='number of nodes for distributed training')
+parser.add_argument('--rank', default=-1, type=int,
+                    help='node rank for distributed training')
+parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
+                    help='url used to set up distributed training')
+parser.add_argument('--dist-backend', default='nccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--seed', default=None, type=int,
+                    help='seed for initializing training. ')
+parser.add_argument('--gpu', default=None, type=int,
+                    help='GPU id to use.')
+parser.add_argument('--npu', default=None, type=int,
+                    help='NPU id to use.')
+parser.add_argument('--multiprocessing-distributed', action='store_true',
+                    help='Use multi-processing distributed training to launch '
+                         'N processes per node, which has N GPUs. This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+parser.add_argument('--warmup',
+                    default=0,
+                    type=int,
+                    metavar='E',
+                    help='number of warmup epochs')
+parser.add_argument('--label-smoothing',
+                    default=0.0,
+                    type=float,
+                    metavar='S',
+                    help='label smoothing')
+parser.add_argument('--optimizer-batch-size',
+                    default=-1,
+                    type=int,
+                    metavar='N',
+                    help=
+                    'size of a total batch size, for simulating bigger batches using gradient accumulation')
+
+# NOTE(review): main_worker passes a fixed loss_scale=1024 to amp.initialize
+# and does not appear to read this option -- confirm whether it should.
+parser.add_argument(
+                    '--static-loss-scale',
+                    type=float,
+                    default=1,
+                    help=
+                    'Static loss scale, positive power of 2 values can improve fp16 convergence.')
+
+# Best top-1 accuracy seen so far; updated by main_worker.
+best_acc1 = 0
+
+
+def main():
+    """Parse the command line, select the NPU and start ``main_worker``.
+
+    ``--npu`` defaults to device 0. With ``--multiprocessing-distributed``
+    one worker per device reported by ``torch.cuda.device_count()`` is
+    spawned; otherwise ``main_worker`` runs in this process.
+    """
+    global CALCULATE_DEVICE
+
+    args = parser.parse_args()
+    args.npu = 0 if args.npu is None else args.npu
+    CALCULATE_DEVICE = "npu:{}".format(args.npu)
+    torch.npu.set_device(CALCULATE_DEVICE)
+    print("use ", CALCULATE_DEVICE)
+
+    seed = args.seed
+    if seed is not None:
+        random.seed(seed)
+        torch.manual_seed(seed)
+        cudnn.deterministic = True
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    if args.gpu is not None:
+        warnings.warn('You have chosen a specific GPU. This will completely '
+                      'disable data parallelism.')
+
+    # With env:// initialisation the world size comes from the environment
+    # unless it was given explicitly.
+    if args.world_size == -1 and args.dist_url == "env://":
+        args.world_size = int(os.environ["WORLD_SIZE"])
+
+    args.distributed = args.multiprocessing_distributed or args.world_size > 1
+
+    ngpus_per_node = torch.cuda.device_count()
+    if not args.multiprocessing_distributed:
+        # Simply call main_worker function
+        main_worker(args.gpu, ngpus_per_node, args)
+        return
+
+    # Since we have ngpus_per_node processes per node, the total world_size
+    # needs to be adjusted accordingly
+    args.world_size = ngpus_per_node * args.world_size
+    # Use torch.multiprocessing.spawn to launch distributed processes: the
+    # main_worker process function
+    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+
+
+# Per-process entry point: builds the model, loss, optimizer and data
+# loaders, optionally resumes from a checkpoint, then either evaluates once
+# (--evaluate) or runs the train/validate/checkpoint loop.
+#
+# gpu            -- device index for this process (None when not pinned);
+#                   stored into args.gpu.
+# ngpus_per_node -- device count used to derive the global rank and to split
+#                   batch size / workers in the distributed single-GPU path.
+# args           -- parsed command-line namespace; mutated in place.
+def main_worker(gpu, ngpus_per_node, args):
+    global best_acc1
+    args.gpu = gpu
+
+    if args.gpu is not None:
+        print("Use GPU: {} for training".format(args.gpu))
+
+    if args.distributed:
+        if args.dist_url == "env://" and args.rank == -1:
+            args.rank = int(os.environ["RANK"])
+        if args.multiprocessing_distributed:
+            # For multiprocessing distributed training, rank needs to be the
+            # global rank among all the processes
+            args.rank = args.rank * ngpus_per_node + gpu
+        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                world_size=args.world_size, rank=args.rank)
+    # create model
+    if args.pretrained:
+        print("=> using pre-trained model '{}'".format(args.arch))
+        model = models.__dict__[args.arch](pretrained=True)
+    else:
+        print("=> creating model '{}'".format(args.arch))
+        model = models.__dict__[args.arch](zero_init_residual=True)
+    # Re-initialise the weights of every nn.Linear layer. This also runs when
+    # --pretrained is set, overwriting the loaded linear weights.
+    for layer in model.modules():
+        if isinstance(layer, nn.Linear):
+            torch.nn.init.kaiming_normal_(layer.weight, a=math.sqrt(5), )
+    if args.distributed:
+        # For multiprocessing distributed, DistributedDataParallel constructor
+        # should always set the single device scope, otherwise,
+        # DistributedDataParallel will use all available devices.
+        if args.gpu is not None:
+            torch.cuda.set_device(args.gpu)
+            model.cuda(args.gpu)
+            # When using a single GPU per process and per
+            # DistributedDataParallel, we need to divide the batch size
+            # ourselves based on the total number of GPUs we have
+            args.batch_size = int(args.batch_size / ngpus_per_node)
+            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
+            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+        else:
+            model.cuda()
+            # DistributedDataParallel will divide and allocate batch_size to all
+            # available GPUs if device_ids are not set
+            model = torch.nn.parallel.DistributedDataParallel(model)
+    elif args.gpu is not None:
+        torch.cuda.set_device(args.gpu)
+        model = model.cuda(args.gpu)
+    else:
+        # DataParallel will divide and allocate batch_size to all available GPUs
+        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
+            model.features = torch.nn.DataParallel(model.features)
+            model.cuda()
+        else:
+            #model = torch.nn.DataParallel(model).cuda()
+            # Default path: place the model on the device chosen in main().
+            model = model.to(CALCULATE_DEVICE)
+
+    # Note: this local name shadows the module-level lr_policy() function
+    # for the rest of this function.
+    lr_policy = lr_cosine_policy(args.lr,
+                                 args.warmup,
+                                 args.epochs)
+
+
+    # define loss function (criterion) and optimizer
+    #criterion = nn.CrossEntropyLoss().cuda(args.gpu)
+    loss = nn.CrossEntropyLoss
+    if args.label_smoothing > 0.0:
+        loss = lambda: LabelSmoothing(args.label_smoothing)
+    criterion = loss().to(CALCULATE_DEVICE)
+    # Two parameter groups: parameters whose name ends in 'bias' get no
+    # weight decay, all others use args.weight_decay.
+    optimizer = torch.optim.SGD([
+        {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'], 'weight_decay': 0.0},
+        {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'], 'weight_decay': args.weight_decay}],
+                                args.lr,
+                                momentum=args.momentum)
+    
+    # Mixed precision via apex amp, opt level O2 with a fixed loss scale of
+    # 1024. NOTE(review): args.static_loss_scale is not read here -- confirm.
+    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1)
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            if args.npu is not None:
+                checkpoint = torch.load(args.resume)
+            elif args.gpu is None:
+                checkpoint = torch.load(args.resume)
+            else:
+                # Map model to be loaded to specified single gpu.
+                loc = 'cuda:{}'.format(args.gpu)
+                checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            if args.npu is not None:
+                best_acc1 = best_acc1.to("npu:{}".format(args.npu))
+            elif args.gpu is not None:
+                # best_acc1 may be from a checkpoint from a different GPU
+                best_acc1 = best_acc1.to(args.gpu)
+            model.load_state_dict(checkpoint['state_dict'])
+            # Optimizer state is neither saved (see below) nor restored.
+            #optimizer.load_state_dict(checkpoint['optimizer'])
+            print("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    cudnn.benchmark = True
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
+
+    # NOTE(review): the validation loader is shuffled; this should not change
+    # the averaged metrics, but confirm it is intended.
+    val_loader = torch.utils.data.DataLoader(
+        datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ])),
+        batch_size=args.batch_size, shuffle=True,
+        num_workers=args.workers, pin_memory=True)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+        #adjust_learning_rate(optimizer, epoch, args)
+        # The learning rate is set once per epoch (iteration argument is 0).
+        lr_policy(optimizer, 0, epoch)
+        # train for one epoch
+        train(train_loader, model, criterion, optimizer, epoch, args)
+
+        # evaluate on validation set
+        acc1 = validate(val_loader, model, criterion, args)
+
+        # remember best acc@1 and save checkpoint
+        is_best = acc1 > best_acc1
+        best_acc1 = max(acc1, best_acc1)
+        file_name = "checkpoint_npu{}".format(args.npu)
+        # Move the model to the CPU for serialisation, then back to the
+        # compute device afterwards.
+        modeltmp = model.cpu()
+        # The .to("cpu") calls below require acc1/best_acc1 to be tensors.
+        save_checkpoint({
+            'epoch': epoch + 1,
+            'arch': args.arch,
+            'state_dict': modeltmp.state_dict(),
+            # 'state_dict': model,
+            'best_acc1': best_acc1.to("cpu"),
+            # 'optimizer' : optimizer.state_dict(),
+        }, is_best.to("cpu"), file_name)
+        modeltmp.to(CALCULATE_DEVICE)
+
+def train(train_loader, model, criterion, optimizer, epoch, args):
+    """Train ``model`` for one epoch, optionally accumulating gradients.
+
+    Args:
+        train_loader: iterable of ``(images, target)`` batches.
+        model: network to train; put into train mode here.
+        criterion: loss callable taking ``(output, target)``.
+        optimizer: amp-initialised optimizer.
+        epoch: epoch index, used only in the progress prefix.
+        args: parsed options; reads ``optimizer_batch_size``, ``batch_size``,
+            ``gpu`` and ``label_smoothing``.
+
+    When ``args.optimizer_batch_size`` is non-negative, gradients of
+    ``optimizer_batch_size // batch_size`` consecutive batches (at least
+    one) are averaged before each optimizer step. The loop stops early
+    once the batch index reaches ``TRAIN_STEP``.
+    """
+    if args.optimizer_batch_size < 0:
+        batch_size_multiplier = 1
+    else:
+        tbs = 1 * args.batch_size
+        if args.optimizer_batch_size % tbs != 0:
+            print(
+                "Warning: simulated batch size {} is not divisible by actual batch size {}"
+                    .format(args.optimizer_batch_size, tbs))
+        # Clamp to 1: a simulated batch smaller than the real batch used to
+        # give 0 here and crash on the modulo in the loop below.
+        batch_size_multiplier = max(1, args.optimizer_batch_size // tbs)
+        print("BSM: {}".format(batch_size_multiplier))
+
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(train_loader),
+        [batch_time, data_time, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    # switch to train mode
+    model.train()
+    optimizer.zero_grad()
+    end = time.time()
+    for i, (images, target) in enumerate(train_loader):
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        if args.gpu is not None:
+            images = images.cuda(args.gpu, non_blocking=True)
+        images = images.to(CALCULATE_DEVICE, non_blocking=True)
+        # Without label smoothing the target goes to the device (as int32)
+        # before the loss; with label smoothing the loss receives the
+        # original target and the transfer happens afterwards, for the
+        # accuracy computation only.
+        if args.label_smoothing == 0.0:
+            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+        # compute output
+        output = model(images)
+        loss = criterion(output, target)
+
+        if args.label_smoothing > 0.0:
+            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        optimizer_step = ((i + 1) % batch_size_multiplier) == 0
+        if optimizer_step:
+            if batch_size_multiplier != 1:
+                # Average the accumulated gradients over the micro-batches.
+                for param_group in optimizer.param_groups:
+                    for param in param_group['params']:
+                        # Parameters that received no gradient have
+                        # grad None and must be skipped.
+                        if param.grad is not None:
+                            param.grad /= batch_size_multiplier
+            optimizer.step()
+            optimizer.zero_grad()
+
+        if i % LOG_STEP == 0:
+            progress.display(i)
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+        if i == TRAIN_STEP:
+            break
+
+
+def validate(val_loader, model, criterion, args):
+    """Evaluate ``model`` on ``val_loader`` and return the mean top-1 accuracy.
+
+    Loss, top-1 and top-5 are averaged over all batches, progress is shown
+    every ``LOG_STEP`` batches, and the final accuracies are printed and
+    reported through ``hwlog``.
+    """
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    meters = [batch_time, losses, top1, top5]
+    progress = ProgressMeter(len(val_loader), meters, prefix='Test: ')
+
+    def _target_on_device(labels):
+        # Labels are cast to int32 before being moved to the compute device.
+        return labels.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for step_idx, (images, target) in enumerate(val_loader):
+            if args.gpu is not None:
+                images = images.cuda(args.gpu, non_blocking=True)
+            images = images.to(CALCULATE_DEVICE, non_blocking=True)
+
+            # Without label smoothing the target is moved before the loss;
+            # with label smoothing it is moved only afterwards.
+            if args.label_smoothing == 0.0:
+                target = _target_on_device(target)
+
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+
+            if args.label_smoothing > 0.0:
+                target = _target_on_device(target)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            n_samples = images.size(0)
+            losses.update(loss.item(), n_samples)
+            top1.update(acc1[0], n_samples)
+            top5.update(acc5[0], n_samples)
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if step_idx % LOG_STEP == 0:
+                progress.display(step_idx)
+
+        # TODO: this should also be done with the ProgressMeter
+        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+              .format(top1=top1, top5=top5))
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1,
+                           value="{top1.avg:.3f}".format(top1=top1))
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5,
+                           value="{top5.avg:.3f}".format(top5=top5))
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint'):
+    """Save ``state`` to ``<filename>.pth.tar``.
+
+    When ``is_best`` is true the file is also copied to
+    ``<filename>model_best.pth.tar``.
+    """
+    checkpoint_path = filename + ".pth.tar"
+    torch.save(state, checkpoint_path)
+    if not is_best:
+        return
+    shutil.copyfile(checkpoint_path, filename+'model_best.pth.tar')
+
+
+class AverageMeter(object):
+    """Tracks the latest value and the running weighted average of a metric.
+
+    ``name`` labels the meter and ``fmt`` is the format spec (including the
+    leading colon) applied to both numbers by ``str()``.
+    """
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        """Clear the latest value, average, weighted sum and sample count."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record ``val`` as the mean of ``n`` new samples."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        spec = self.fmt
+        template = '{name} {val' + spec + '} ({avg' + spec + '})'
+        return template.format(**vars(self))
+
+
+class ProgressMeter(object):
+    """Prints a progress line for a set of meters and reports training FPS.
+
+    Args:
+        num_batches: total number of batches, used to size the counter.
+        meters: objects whose ``str()`` is appended to the progress line.
+        prefix: text printed before the batch counter; FPS is reported only
+            when it contains "Epoch".
+        batch_size: samples per step used for the FPS figure. When omitted,
+            it is read once from the command line (``--batch-size``).
+    """
+    def __init__(self, num_batches, meters, prefix="", batch_size=None):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+        self._batch_size = batch_size
+
+    def display(self, batch):
+        """Print the progress line for ``batch`` and, for training, the FPS."""
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+        if "Epoch" in self.prefix:
+            self._report_fps()
+
+    def _report_fps(self):
+        """Report batch_size / last step time through hwlog, if available."""
+        # Read the step time from the meter named 'Time' instead of parsing
+        # it back out of the printed line, which raised IndexError whenever
+        # no such meter was present and rounded the value to 3 decimals.
+        step_time = None
+        for meter in self.meters:
+            if getattr(meter, 'name', None) == 'Time':
+                step_time = float(meter.val)
+                break
+        if step_time is None or step_time <= 0:
+            return
+        if self._batch_size is None:
+            # Parse the command line once rather than on every display call.
+            self._batch_size = parser.parse_args().batch_size
+        fps = int(self._batch_size) / step_time
+        hwlog.remark_print(key=hwlog.FPS, value=float(fps))
+
+    def _get_batch_fmtstr(self, num_batches):
+        """Return a '[{cur}/total]' template padded to the width of the total."""
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+
+def adjust_learning_rate(optimizer, epoch, args):
+    """Apply step decay: ``args.lr`` scaled by 0.1 for every 30 full epochs.
+
+    The resulting rate is written to every parameter group of ``optimizer``.
+    """
+    decay_steps = epoch // 30
+    new_lr = args.lr * (0.1 ** decay_steps)
+    for group in optimizer.param_groups:
+        group['lr'] = new_lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Compute top-k accuracy, in percent, for each requested ``k``.
+
+    :param output: score tensor; the top ``max(topk)`` entries are taken along
+        dimension 1. Assumed to be (batch, classes) — TODO confirm with callers.
+    :param target: tensor of ground-truth class indices; its first dimension
+        is used as the batch size.
+    :param topk: iterable of ``k`` values to evaluate.
+    :return: list with one single-element tensor per ``k``, holding the
+        percentage of samples whose target is among the top ``k`` predictions.
+    """
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        # Indices of the maxk highest scores, transposed to (maxk, batch) so
+        # that row i holds every sample's i-th best guess.
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            # reshape() instead of view(): the slice derives from a transposed
+            # tensor and may be non-contiguous, in which case view() raises.
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+class LabelSmoothing(nn.Module):
+    """
+    Negative log-likelihood loss blended with a uniform label-smoothing term.
+    """
+    def __init__(self, smoothing=0.0):
+        """
+        Build the loss module.
+
+        :param smoothing: weight of the smoothing term; the target class
+            term receives the complementary weight ``1.0 - smoothing``
+        """
+        super().__init__()
+        self.smoothing = smoothing
+        self.confidence = 1.0 - smoothing
+
+    def forward(self, x, target):
+        """
+        Return the batch-mean smoothed loss, placed on ``CALCULATE_DEVICE``.
+
+        :param x: raw scores; log-softmax is taken over the last dimension
+        :param target: class indices, one per row of ``x``
+            (assumes ``x`` is 2-D and ``target`` is 1-D — TODO confirm)
+        """
+        # The log-probabilities are moved to the CPU before the remaining
+        # arithmetic; presumably a device workaround — confirm before changing.
+        log_probs = torch.nn.functional.log_softmax(x, dim=-1).to("cpu")
+        target_column = target.unsqueeze(1)
+        # Negative log-probability of the labelled class for each sample.
+        target_term = (-log_probs.gather(dim=-1, index=target_column)).squeeze(1)
+        # Negative mean log-probability across all classes for each sample.
+        uniform_term = -log_probs.mean(dim=-1)
+        per_sample = self.confidence * target_term + self.smoothing * uniform_term
+        return per_sample.mean().to(CALCULATE_DEVICE)
+
+def lr_policy(lr_fn, logger=None):
+    """Wrap ``lr_fn`` into a callable that writes its result into an optimizer.
+
+    :param lr_fn: function ``(iteration, epoch) -> lr``.
+    :param logger: optional logger; when given, an ``'lr'`` metric is
+        registered on it once and every computed rate is logged to it.
+    :return: function ``(optimizer, iteration, epoch)`` that sets ``'lr'`` on
+        every entry of ``optimizer.param_groups`` and returns ``None``.
+    """
+    log_enabled = logger is not None
+    if log_enabled:
+        logger.register_metric('lr',
+                               log.LR_METER(),
+                               verbosity=dllogger.Verbosity.VERBOSE)
+
+    def _alr(optimizer, iteration, epoch):
+        new_lr = lr_fn(iteration, epoch)
+        if log_enabled:
+            logger.log_metric('lr', new_lr)
+        # Every parameter group receives the same learning rate.
+        for group in optimizer.param_groups:
+            group['lr'] = new_lr
+
+    return _alr
+
+def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
+    """Build an LR updater with linear warm-up followed by cosine decay.
+
+    :param base_lr: rate reached at the end of warm-up and used as the
+        amplitude of the cosine phase.
+    :param warmup_length: number of epochs of linear warm-up.
+    :param epochs: epoch count used, minus the warm-up, as the cosine span.
+    :param logger: forwarded unchanged to ``lr_policy``.
+    :return: the updater produced by ``lr_policy`` for this schedule.
+    """
+    def _lr_fn(iteration, epoch):
+        # Warm-up: scale the base rate linearly with the epoch number.
+        if epoch < warmup_length:
+            return base_lr * (epoch + 1) / warmup_length
+        # Cosine phase: progress is measured from the end of warm-up.
+        elapsed = epoch - warmup_length
+        span = epochs - warmup_length
+        return 0.5 * (1 + np.cos(np.pi * elapsed / span)) * base_lr
+
+    return lr_policy(_lr_fn, logger=logger)
+
+# Script entry point: record run metadata through hwlog, then start main().
+if __name__ == '__main__':
+    # Point hwlog at the directory that contains this script file.
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    # Gather the host/accelerator/framework description and the model
+    # configuration from the helper functions.
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
+    config_info = get_model_parameter("pytorch_config")
+    # Static values reported in the log header below.
+    # NOTE(review): these are hard-coded here rather than read from the parsed
+    # arguments; confirm they match what main() actually trains with.
+    # NOTE(review): "initinal_data" looks like a typo for "initial_data"; the
+    # name is left unchanged because it is bound at module level.
+    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}
+    # Emit one remark per metadata item.
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    # Hand control to the training entry point.
+    main()