!10 [add] 增加benchmark工具代码

Merge pull request !10 from 梁朝明/master
This commit is contained in:
zhutian
2020-10-19 20:55:51 +08:00
committed by Gitee
1225 changed files with 345421 additions and 0 deletions
+50
View File
@@ -0,0 +1,50 @@
# 训练benchmark
## 支持的产品
Atlas 800 (Model 9000)
## 操作系统
centos7.6 & ubuntu 18.04
## 训练方法
1. 根据实际情况修改 ./yaml/ 目录下的对应的 yaml 文件,建议备份原文件,且保持 yaml 文件名与模型名称相同。
2. 在当前目录(train)下,执行:`./benchmark.sh --help` 查看帮助信息。
3. 根据 **帮助信息** 或本文件中的 **运行参数说明** 选择配置运行参数后,执行:`./benchmark.sh`
## 示例
- 示例1:docker 环境下启动 MobileNet 多卡(8p)训练:`./benchmark.sh -e MobileNet -hw 8p -y ./yaml/MobileNet.yaml -docker`
- 示例2:host 环境下启动 MobileNet 单卡(1p)训练,yaml 使用默认文件:`./benchmark.sh -e MobileNet`
- 示例3:host 环境下启动 ResNet50 集群(cluster)训练,yaml 使用默认文件:`./benchmark.sh -e ResNet50 -hw ct`
- 示例4:host 环境下启动 pytorch 模型 DeepMar 单卡(1p)训练,yaml 使用默认文件:`./benchmark.sh -e DeepMar -hw 1p -f pytorch`
- 示例5:host 环境下启动 pytorch 模型 DeepMar 多卡(8p)训练,yaml 使用默认文件:`./benchmark.sh -e DeepMar -hw 8p -f pytorch`
- 示例6:docker 环境下启动 pytorch 模型 DeepMar 多卡(8p)训练,yaml 使用默认文件:`./benchmark.sh -e DeepMar -hw 8p -f pytorch -docker`
## 运行参数说明
| 参数 | 是否必填 | 参数说明 | 默认值 |
| --------------- | -------- | -------------------- |------------------------ |
| --execmodel, -e | 选填 | 需要执行的模型名称 | ResNet50 |
| --hardware, -hw | 选填 | 选择 1p, 2p, 4p, 8p, cluster/ct | 1p |
| --yamlpath, -y | 选填 | yaml 文件的路径 | ./yaml/{execmodel}.yaml |
| --framework, -f | 选填 | 模型训练框架 | tensorflow |
| -docker, -host | 选填 | 选择 docker 或 host | host |
| --help, -h | 选填 | 显示帮助信息 | NA |
| --list, -l | 选填 | 显示当前支持的模型与框架 | NA |
## 查看日志
- 可在 train/result/ 目录下查看各个模型最后生成的含性能与精度数据的日志。
- 中间结果ckpt或其他文件存放在 *device id* 下。
- train_x.log 为模型训练过程日志,内容较为详细;以 hw 开头的日志为打点日志,仅记录数据。
## 注意事项
- yaml 文件中的值可以参考注释,根据实际情况自行修改。键不可随意修改,否则可能导致训练失败或训练结果偏离实际。
- 集群(cluster)执行时,请保证各节点环境配置相同,且包括**配置文件、数据集、代码**绝对路径相同。
## Benchmark工具资料参考
https://support.huawei.com/enterprise/zh/ascend-computing/atlas-data-center-solution-pid-251167910/software/251732401?idAbsPath=fixnode01%7C23710424%7C251366513%7C22892968%7C251167910
@@ -0,0 +1,40 @@
# DeepMar_pytorch训练说明
### 1. 数据集处理
#### 1.1. 下载并准备数据集:
百度云盘:https://pan.baidu.com/s/1q8nsydT7xkDjZJOxvPcoEw
passwd: 5vep
或者:https://drive.google.com/open?id=1q4cux17K3zNBgIrDV4FtcHJPLzXNKfYG
存放地址
./dataset/peta/images/*.png
./dataset/peta/PETA.mat
#### 1.2 运行以下命令,分割训练集、测试集(路径修改成自己存放数据集路径)
python script/dataset/transform_peta.py
生成 peta_dataset.pkl、peta_partition.pkl 文件
### 2. 模型训练参数配置
在train/yaml/DeepMar.yaml中修改相应配置, 配置项含义:
```
pytorch_config:
data_url: 数据集路径
epoches: 跑多少个epoch
batch_size:1p 参数为256 2p 512 4p 1024 8p为2048
seed: 49
lr: 默认参数1p 0.01 2p 0.016 4p 0.016 8p 0.016
docker_image: docker 镜像名称:版本号
```
------
@@ -0,0 +1,81 @@
import torch.utils.data as data
import os
from PIL import Image
import numpy as np
import pickle
import copy
class AttDataset(data.Dataset):
    """Person-attribute dataset read from two pickle files.

    Args:
        dataset: path to the pickled dataset description. The keys read here
            are 'root', 'image', 'att', 'att_name' and 'selected_attribute'.
        partition: path to the pickled partition dict;
            ``partition[split][partition_idx]`` is iterated as sample indices.
        split: key of the partition dict to use.
        partition_idx: which partition of ``split`` to use.
        transform: optional callable applied to the loaded image.
        target_transform: optional callable applied to the label vector.

    Raises:
        ValueError: if a pickle path does not exist, ``split`` is not a key
            of the partition, or ``partition_idx`` is past the last partition.
    """

    def __init__(
            self,
            dataset,
            partition,
            split='train',
            partition_idx=0,
            transform=None,
            target_transform=None,
            **kwargs):
        self.dataset = self._load_pickle(dataset)
        self.partition = self._load_pickle(partition)
        if split not in self.partition:
            print(split + ' does not exist in dataset.')
            raise ValueError(split + ' does not exist in dataset.')
        if partition_idx > len(self.partition[split]) - 1:
            print('partition_idx is out of range in partition.')
            raise ValueError('partition_idx is out of range in partition.')

        self.transform = transform
        self.target_transform = target_transform

        # Build the image / label lists for the selected partition and split.
        self.root_path = self.dataset['root']
        selected = self.dataset['selected_attribute']
        self.att_name = [self.dataset['att_name'][i] for i in selected]
        self.image = []
        self.label = []
        for idx in self.partition[split][partition_idx]:
            self.image.append(self.dataset['image'][idx])
            label_tmp = np.array(self.dataset['att'][idx])[selected].tolist()
            self.label.append(label_tmp)

    @staticmethod
    def _load_pickle(path):
        """Return the unpickled content of ``path``; ValueError if it is missing."""
        if not os.path.exists(path):
            print(path + ' does not exist in dataset.')
            raise ValueError(path + ' does not exist in dataset.')
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at trusted dataset files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def __getitem__(self, index):
        """Return ``(image, target)`` for sample ``index``.

        ``target`` is a float32 attribute vector in which raw label 0 is
        mapped to -1 and raw label 2 is mapped to 0; other values are kept.
        """
        imgname, target = self.image[index], self.label[index]
        # load image and labels
        imgname = os.path.join(self.dataset['root'], imgname)
        img = Image.open(imgname)
        if self.transform is not None:
            img = self.transform(img)

        # Remap order matters: zeros become -1 before twos become 0.
        target = np.array(target).astype(np.float32)
        target[target == 0] = -1
        target[target == 2] = 0
        if self.target_transform is not None:
            # The label must go through target_transform, not the image
            # transform (the original code called self.transform here).
            target = self.target_transform(target)
        return img, target

    # useless for personal batch sampler
    def __len__(self):
        """Number of samples in the selected partition."""
        return len(self.image)
@@ -0,0 +1,65 @@
import torch
import numpy as np
import numbers
__all__ = ["AddPad", "AddCrop"]
class AddCrop(object):
    """Randomly crop a ``C x H x W`` image to a fixed ``(height, width)``.

    Args:
        size: two-element sequence ``(crop_height, crop_width)``.
    """

    def __init__(self, size):
        self.size = size  # (crop_height, crop_width)
        assert len(self.size) == 2

    def __repr__(self):
        return self.__class__.__name__ + '(size={0})'.format(self.size)

    def __call__(self, img):
        """Return a random ``size`` window of ``img`` (indexed as C x H x W).

        Raises:
            ValueError: (from ``np.random.randint``) if the image is smaller
                than the crop size in either spatial dimension.
        """
        crop_h, crop_w = self.size[0], self.size[1]
        max_top = img.shape[1] - crop_h
        max_left = img.shape[2] - crop_w
        # randint's upper bound is exclusive: use max + 1 so that the last
        # valid offset can be drawn and an image that is exactly the crop
        # size (max offset 0) does not raise.
        top = np.random.randint(low=0, high=max_top + 1)
        left = np.random.randint(low=0, high=max_left + 1)
        return img[:, top: top + crop_h, left: left + crop_w]
class AddPad(object):
    """Pad a ``C x H x W`` tensor with a constant border.

    Args:
        padding: a single number (same padding on all sides) or a 4-element
            list/tuple ``(left, right, up, down)``.
        fill: a number, or a list/tuple of per-channel fill values; channel
            ``i`` uses ``fill[i % len(fill)]``.

    Raises:
        ValueError: if ``padding`` has an unsupported type/length, any padding
            is negative, or ``fill`` is neither a number nor a list/tuple.
    """

    def __init__(self, padding, fill=0):
        self.padding = padding
        self.fill = fill
        if isinstance(self.padding, numbers.Number):
            self.pad_l = self.pad_r = self.pad_u = self.pad_d = int(self.padding)
        elif isinstance(self.padding, (list, tuple)) and len(self.padding) == 4:
            self.pad_l = int(self.padding[0])
            self.pad_r = int(self.padding[1])
            self.pad_u = int(self.padding[2])
            self.pad_d = int(self.padding[3])
        else:
            print("The type of padding is not right.")
            raise ValueError("The type of padding is not right.")
        if self.pad_l < 0 or self.pad_r < 0 or self.pad_u < 0 or self.pad_d < 0:
            raise ValueError("padding must be non-negative")
        if isinstance(self.fill, numbers.Number):
            self.fill_value = [self.fill]
        elif isinstance(self.fill, (list, tuple)):
            self.fill_value = self.fill
        else:
            # Previously fill_value was silently left undefined here and the
            # failure only surfaced as an AttributeError inside __call__.
            raise ValueError("fill must be a number or a list/tuple of numbers")

    def __repr__(self):
        return self.__class__.__name__ + '(padding={0})'.format(self.padding)

    def __call__(self, img):
        """
        Args:
            img: a 3-dimensional torch tensor with shape [R,G,B]*H*W
        Returns:
            img: a 3-dimensional padded tensor with shape [R,G,B]*H'*W';
                the input itself is returned when all paddings are zero.
        """
        if not (self.pad_l or self.pad_r or self.pad_u or self.pad_d):
            return img
        channels, height, width = img.shape[0], img.shape[1], img.shape[2]
        padded = torch.empty(channels,
                             height + self.pad_u + self.pad_d,
                             width + self.pad_l + self.pad_r)
        # Fill each whole channel first, then paste the image over the centre.
        # The old code started from torch.rand and filled the borders with
        # slices ending at -1, which left the last row and column as noise.
        for c in range(channels):
            padded[c].fill_(self.fill_value[c % len(self.fill_value)])
        padded[:, self.pad_u:self.pad_u + height, self.pad_l:self.pad_l + width] = img
        return padded
@@ -0,0 +1,81 @@
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from .resnet import resnet50
class DeepMAR_ResNet50(nn.Module):
    """ResNet-50 backbone followed by a linear attribute classifier.

    Keyword Args:
        num_att: number of output attributes (default 35).
        last_conv_stride: stride passed to the backbone's last stage (default 2).
        drop_pool5: whether to apply dropout to the pooled feature (default True).
        drop_pool5_rate: dropout probability (default 0.5).
        pretrained: forwarded to ``resnet50`` (default True).
    """

    def __init__(self, **kwargs):
        super(DeepMAR_ResNet50, self).__init__()

        # network hyper-parameters, each falling back to its default
        self.num_att = kwargs.get('num_att', 35)
        self.last_conv_stride = kwargs.get('last_conv_stride', 2)
        self.drop_pool5 = kwargs.get('drop_pool5', True)
        self.drop_pool5_rate = kwargs.get('drop_pool5_rate', 0.5)
        self.pretrained = kwargs.get('pretrained', True)

        self.base = resnet50(pretrained=self.pretrained,
                             last_conv_stride=self.last_conv_stride)

        # 2048 input features feed the attribute classifier
        self.classifier = nn.Linear(2048, self.num_att)
        init.normal_(self.classifier.weight, std=0.001)
        init.constant_(self.classifier.bias, 0)

    def forward(self, x):
        """Return one score per attribute for each image in the batch."""
        feature_map = self.base(x)
        # average-pool over the full spatial extent, then flatten per sample
        pooled = F.avg_pool2d(feature_map, feature_map.shape[2:])
        pooled = torch.flatten(pooled, 1)
        if self.drop_pool5:
            pooled = F.dropout(pooled, p=self.drop_pool5_rate,
                               training=self.training)
        return self.classifier(pooled)
class DeepMAR_ResNet50_ExtractFeature(object):
    """
    A feature extraction function: runs the wrapped model in eval mode and
    returns its output as a numpy array.
    """

    def __init__(self, model, **kwargs):
        self.model = model

    def __call__(self, imgs):
        """Return ``model(imgs)`` as a numpy array.

        The model's train/eval mode is restored afterwards, including when the
        input check or the forward pass raises (previously the model was left
        in eval mode in those cases).

        Raises:
            ValueError: if ``imgs`` is not a ``Variable``.
        """
        # Validate before touching the model so a bad input has no side effect.
        if not isinstance(imgs, Variable):
            print('imgs should be type: Variable')
            raise ValueError('imgs should be type: Variable')
        was_training = self.model.training
        self.model.eval()
        try:
            score = self.model(imgs)
            score = score.data.cpu().numpy()
        finally:
            # put the model back into the mode it was in
            self.model.train(was_training)
        return score
@@ -0,0 +1,217 @@
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import ssl
# NOTE(review): this replaces the process-wide default HTTPS context with an
# unverified one, i.e. TLS certificate verification is disabled for every
# HTTPS request made through the default context after this module is
# imported - presumably to let the weight downloads below succeed on hosts
# without a usable CA bundle; confirm whether this is still required.
ssl._create_default_https_context = ssl._create_unverified_context
# Public names exported by this module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
# Download URL of the weight file for each architecture; looked up by the
# factory functions below when ``pretrained`` is true.
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Args:
        inplanes: number of input channels.
        planes: number of output channels.
        stride: stride of the first convolution.
        downsample: optional module applied to the input before it is added
            to the convolution path.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return relu(conv path + shortcut)."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # identity shortcut unless a projection was supplied
        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions.

    The block outputs ``planes * expansion`` channels.

    Args:
        inplanes: number of input channels.
        planes: width of the two inner convolutions.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input before it is added
            to the convolution path.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        # Use the class constant rather than a literal 4 so the block's output
        # width always agrees with ``block.expansion`` as used by
        # ResNet._make_layer.
        out_planes = planes * self.expansion
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return relu(conv path + shortcut)."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # identity shortcut unless a projection was supplied
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out
class ResNet(nn.Module):
    """ResNet feature extractor (stem plus four residual stages, no classifier).

    Args:
        block: residual block class; must expose ``expansion`` and accept
            ``(inplanes, planes, stride, downsample)``.
        layers: four integers, the number of blocks in each stage.
        last_conv_stride: stride of the fourth stage.
    """

    def __init__(self, block, layers, last_conv_stride=2):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3],
                                       stride=last_conv_stride)

        # Re-initialise convolutions with a normal distribution whose std
        # depends on kernel area * out_channels, and reset batch-norm layers
        # to weight 1 / bias 0.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and update ``self.inplanes``."""
        out_channels = planes * block.expansion
        projection = None
        # A 1x1 conv + BN projection is needed whenever the shortcut's shape
        # would not match the block output.
        if stride != 1 or self.inplanes != out_channels:
            projection = nn.Sequential(
                nn.Conv2d(self.inplanes, out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [block(self.inplanes, planes, stride, projection)]
        self.inplanes = out_channels
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the stem and the four stages; return the last feature map."""
        pipeline = (self.conv1, self.bn1, self.relu, self.maxpool,
                    self.layer1, self.layer2, self.layer3, self.layer4)
        for step in pipeline:
            x = step(x)
        return x
def remove_fc(state_dict):
    """Delete every entry whose key starts with 'fc.' and return the same dict."""
    # collect the keys first so the dict is not resized while being iterated
    fc_keys = [name for name in state_dict if name.startswith('fc.')]
    for name in fc_keys:
        del state_dict[name]
    return state_dict
def _build_resnet(arch, block, layers, pretrained, **kwargs):
    """Create a ResNet and, if ``pretrained``, load the weights for ``arch``.

    The weights come from ``model_urls[arch]`` with the 'fc.' entries removed,
    since this ResNet has no classifier layer.
    """
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        model.load_state_dict(remove_fc(model_zoo.load_url(model_urls[arch])))
    return model


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, loads the weights from model_urls['resnet18']
        **kwargs: forwarded to the ResNet constructor
    """
    return _build_resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, **kwargs)


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, loads the weights from model_urls['resnet34']
        **kwargs: forwarded to the ResNet constructor
    """
    return _build_resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, **kwargs)


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, loads the weights from model_urls['resnet50']
        **kwargs: forwarded to the ResNet constructor
    """
    return _build_resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, **kwargs)


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, loads the weights from model_urls['resnet101']
        **kwargs: forwarded to the ResNet constructor
    """
    return _build_resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, **kwargs)


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, loads the weights from model_urls['resnet152']
        **kwargs: forwarded to the ResNet constructor
    """
    return _build_resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, **kwargs)
@@ -0,0 +1,105 @@
import os
import torch
from torch.autograd import Variable
import numpy as np
import copy
import time
import sys
def extract_feat(feat_func, dataset, device_id, **kwargs):
    """
    extract feature for images

    Args:
        feat_func: callable taking a batch of images (moved to ``device_id``)
            and returning a numpy array whose first axis is the batch.
        dataset: dataset exposing ``image`` (its length sets the number of
            output rows) and usable with ``torch.utils.data.DataLoader``.
        device_id: target passed to ``.to()`` for each image batch.
        **kwargs: optional ``batch_size`` (default 32), ``num_workers``
            (default 32) and ``drop_last`` (default True) for the DataLoader.

    Returns:
        A ``(len(dataset.image), feature_dim)`` float array.

    Raises:
        ValueError: if the loader yields no batch at all.
    """
    loader_batch_size = kwargs.get('batch_size', 32)
    num_workers = kwargs.get('num_workers', 32)
    # NOTE(review): with drop_last=True the trailing incomplete batch is
    # skipped, so the last len(dataset) % batch_size rows of the result stay
    # zero. The default is kept as in the original; pass drop_last=False to
    # cover every sample.
    drop_last = kwargs.get('drop_last', True)
    test_loader = torch.utils.data.DataLoader(
        dataset=dataset, batch_size=loader_batch_size,
        num_workers=num_workers, pin_memory=True,
        drop_last=drop_last)
    # extract feature for all the images of test/val identities
    start_time = time.time()
    total_eps = len(test_loader)
    N = len(dataset.image)
    feat = None
    start = 0
    with torch.no_grad():
        for imgs, labels in test_loader:
            imgs_var = Variable(imgs).to(device_id)
            feat_tmp = feat_func(imgs_var)
            batch_size = feat_tmp.shape[0]
            if feat is None:
                # the feature width is only known once the first batch is back
                feat = np.zeros((N, int(feat_tmp.size/batch_size)))
            feat[start:start+batch_size, :] = feat_tmp.reshape((batch_size, -1))
            start += batch_size
    if feat is None:
        # Previously this surfaced as an UnboundLocalError on `return feat`.
        raise ValueError('no batch was produced; the dataset is empty or '
                         'smaller than batch_size while drop_last is set')
    end_time = time.time()
    print('{} batches done, total {:.2f}s'.format(total_eps, end_time-start_time))
    return feat
# attribute recognition evaluation
def attribute_evaluate(feat_func, dataset, device_id, **kwargs):
    """Extract scores for ``dataset`` and evaluate them against its labels.

    Scores >= 0 count as predicted positive (1), scores < 0 as negative (0).
    Returns the dictionary produced by ``attribute_evaluate_lidw``.
    """
    print ("extracting features for attribute recognition")
    scores = extract_feat(feat_func, dataset, device_id)

    print ("computing attribute recognition result")
    # ground truth: one row per entry of dataset.label
    truth = np.zeros(scores.shape)
    for row, attributes in enumerate(dataset.label):
        truth[row, :] = attributes

    # binarise the scores in place
    is_negative = scores < 0
    scores[scores >= 0] = 1
    scores[is_negative] = 0
    return attribute_evaluate_lidw(truth, scores)
def attribute_evaluate_lidw(gt_result, pt_result):
    """
    Input:
        gt_result, pt_result, N*L, with 0/1
    Output:
        result
        a dictionary, including label-based and instance-based evaluation
        label-based: label_pos_acc, label_neg_acc, label_acc
        instance-based: instance_acc, instance_precision, instance_recall, instance_F1
    Raises:
        ValueError: if the two inputs do not have the same shape (previously
            this was only printed and the computation continued).
    """
    if gt_result.shape != pt_result.shape:
        print ('Shape beteen groundtruth and predicted results are different')
        raise ValueError('groundtruth shape {} != predicted shape {}'.format(
            gt_result.shape, pt_result.shape))

    gt_is_pos = (gt_result == 1).astype(float)
    gt_is_neg = (gt_result == 0).astype(float)
    pt_is_pos = (pt_result == 1).astype(float)
    pt_is_neg = (pt_result == 0).astype(float)

    # label-based accuracy (per attribute, i.e. reduced over samples).
    # A label without positives (or negatives) yields NaN, as before.
    result = {}
    gt_pos = np.sum(gt_is_pos, axis=0)
    gt_neg = np.sum(gt_is_neg, axis=0)
    true_pos = np.sum(gt_is_pos * pt_is_pos, axis=0)
    true_neg = np.sum(gt_is_neg * pt_is_neg, axis=0)
    label_pos_acc = 1.0*true_pos/gt_pos
    label_neg_acc = 1.0*true_neg/gt_neg
    label_acc = (label_pos_acc + label_neg_acc)/2
    result['label_pos_acc'] = label_pos_acc
    result['label_neg_acc'] = label_neg_acc
    result['label_acc'] = label_acc

    # instance-based metrics (per sample, i.e. reduced over attributes)
    gt_pos = np.sum(gt_is_pos, axis=1)
    pt_pos = np.sum(pt_is_pos, axis=1)
    intersect_pos = np.sum(gt_is_pos * pt_is_pos, axis=1)
    union_pos = np.sum(np.logical_or(gt_result == 1, pt_result == 1).astype(float), axis=1)

    # Samples without any positive ground-truth label are excluded from the
    # averages: their denominators are set to 1 (their intersection is 0) and
    # they are removed from the effective count.
    no_gt = gt_pos == 0
    union_pos[no_gt] = 1
    pt_pos[no_gt] = 1
    gt_pos[no_gt] = 1
    cnt_eff = float(gt_result.shape[0]) - np.count_nonzero(no_gt)
    # avoid dividing by zero for samples with no predicted positive
    pt_pos[pt_pos == 0] = 1

    instance_acc = np.sum(intersect_pos/union_pos)/cnt_eff
    instance_precision = np.sum(intersect_pos/pt_pos)/cnt_eff
    instance_recall = np.sum(intersect_pos/gt_pos)/cnt_eff
    instance_F1 = 2*instance_precision*instance_recall/(instance_precision+instance_recall)
    result['instance_acc'] = instance_acc
    result['instance_precision'] = instance_precision
    result['instance_recall'] = instance_recall
    result['instance_F1'] = instance_F1
    return result
@@ -0,0 +1,347 @@
import os
import pickle
import datetime
import time
# from contextlib import contextmanger
import torch
from torch.autograd import Variable
import random
import numpy as np
import torch.backends.cudnn as cudnn
def time_str(fmt=None):
    """Format the current local time; ``fmt`` defaults to '%Y-%m-%d_%H:%M:%S'."""
    pattern = '%Y-%m-%d_%H:%M:%S' if fmt is None else fmt
    now = datetime.datetime.today()
    return now.strftime(pattern)
def str2bool(v):
    """Return True when ``v`` is 'yes', 'true' or '1' (case-insensitive)."""
    truthy = ("yes", "true", "1")
    normalized = v.lower()
    return normalized in truthy
def is_iterable(obj):
    """Return True when ``obj`` exposes a ``__len__`` attribute."""
    try:
        obj.__len__
    except AttributeError:
        return False
    return True
def to_scalar(vt):
    """
    Return the first element of a pytorch Variable or Tensor as a numpy scalar.

    Raises:
        TypeError: if ``vt`` is neither a Variable nor a tensor.
    """
    if isinstance(vt, Variable):
        tensor = vt.data
    elif torch.is_tensor(vt):
        tensor = vt
    else:
        raise TypeError('Input should be a variable or tensor')
    return tensor.cpu().numpy().flatten()[0]
def set_seed(rand_seed):
    """Seed the numpy, ``random`` and torch (CPU and CUDA) generators and enable cudnn."""
    torch.backends.cudnn.enabled = True
    for seed_fn in (np.random.seed, random.seed,
                    torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(rand_seed)
def seed_everything(seed):
    """Seed every random generator used here and make cudnn deterministic.

    Seeds ``random``, numpy and torch (CPU, current CUDA device and all CUDA
    devices), exports PYTHONHASHSEED, and turns cudnn's ``deterministic`` flag
    on and its ``benchmark`` flag off.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    for torch_seed_fn in (torch.manual_seed,
                          torch.cuda.manual_seed,
                          torch.cuda.manual_seed_all):
        torch_seed_fn(seed)
    # `cudnn` (imported at module level) is the same module object as
    # torch.backends.cudnn, so a single assignment covers both names.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
def may_mkdir(fname):
    """Make sure the parent directory of ``fname`` exists.

    ``exist_ok=True`` removes the window between checking for the directory
    and creating it, so another process creating it first is not an error.
    """
    parent = os.path.dirname(os.path.abspath(fname))
    os.makedirs(parent, exist_ok=True)
class AverageMeter(object):
    """
    Tracks the latest value, the weighted sum, the count and their average.
    """

    def __init__(self):
        self.val = self.avg = self.sum = self.count = 0

    def reset(self):
        """Return every statistic to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # the small epsilon keeps the division defined while count is still 0
        self.avg = float(self.sum) / (self.count + 1e-10)
class RunningAverageMeter(object):
    """
    Keeps the latest value and an exponential running average of all values.

    ``hist`` is the weight given to the previous average on each update.
    """

    def __init__(self, hist=0.99):
        self.val = None
        self.avg = None
        self.hist = hist

    def reset(self):
        """Forget the latest value and the running average."""
        self.val, self.avg = None, None

    def update(self, val):
        """Blend ``val`` into the running average (the first value seeds it)."""
        if self.avg is not None:
            self.avg = self.avg * self.hist + val * (1 - self.hist)
        else:
            self.avg = val
        self.val = val
class RecentAverageMeter(object):
    """
    Keeps the last ``hist_size`` values and averages them on demand.
    """

    def __init__(self, hist_size=100):
        self.hist_size = hist_size
        self.fifo = []
        self.val = 0

    def reset(self):
        """Drop the stored history and zero the latest value."""
        self.fifo = []
        self.val = 0

    def update(self, val):
        """Append ``val``; evict the oldest entry once the window is exceeded."""
        self.val = val
        self.fifo.append(val)
        overflow = len(self.fifo) > self.hist_size
        if overflow:
            self.fifo.pop(0)

    @property
    def avg(self):
        """Mean of the stored values; asserts that at least one was recorded."""
        assert len(self.fifo) > 0
        total = float(sum(self.fifo))
        return total / len(self.fifo)
class ReDirectSTD(object):
    """
    Replaces sys.stdout or sys.stderr with an object that forwards every
    write to the original stream and, if a path is given, also to a log file.

    Args:
        fpath: log file path; None forwards to the original stream only
        console: one of ['stdout', 'stderr']
        immediately_visiable: if True the log file is reopened in append mode
            for every write; if False it is opened once and kept open
    Usage example:
        ReDirectSTD('stdout.txt', 'stdout', False)
        ReDirectSTD('stderr.txt', 'stderr', False)
        with ReDirectSTD('stdout.txt', 'stdout', True):
            print('also written to stdout.txt')
    """

    def __init__(self, fpath=None, console='stdout', immediately_visiable=False):
        import sys
        import os
        assert console in ['stdout', 'stderr']
        self._stream_name = console
        self.console = sys.stdout if console == "stdout" else sys.stderr
        self.file = fpath
        self.f = None
        self.immediately_visiable = immediately_visiable
        # Remove existing log file
        if fpath is not None and os.path.exists(fpath):
            os.remove(fpath)
        # install this object as the selected stream
        setattr(sys, console, self)

    def __del__(self):
        # Best effort only: during interpreter shutdown, or after a failed
        # __init__, attributes and modules may already be gone.
        try:
            self.close()
        except Exception:
            pass

    def __enter__(self):
        # Return the redirector so `with ... as r` binds something useful.
        return self

    def __exit__(self, *args):
        # The protocol passes (exc_type, exc_value, traceback) positionally;
        # the former `**args` signature raised TypeError on every `with` exit.
        self.close()

    def write(self, msg):
        """Forward ``msg`` to the original stream and to the log file, if any."""
        import os
        self.console.write(msg)
        if self.file is None:
            return
        # makedirs (not mkdir) so a nested log directory can be created
        os.makedirs(os.path.dirname(os.path.abspath(self.file)), exist_ok=True)
        if self.immediately_visiable:
            with open(self.file, 'a') as f:
                f.write(msg)
        else:
            if self.f is None:
                self.f = open(self.file, 'w')
            self.f.write(msg)

    def flush(self):
        """Flush the original stream and force the log file to disk."""
        self.console.flush()
        if self.f is not None:
            self.f.flush()
            import os
            os.fsync(self.f.fileno())

    def close(self):
        """Close the log file and put the original stream back in place.

        The original stream itself is left open: it is the process's real
        stdout/stderr, and closing it made every later print fail.
        Safe to call more than once.
        """
        import sys
        if self.f is not None:
            self.f.close()
            self.f = None
        if getattr(sys, self._stream_name, None) is self:
            setattr(sys, self._stream_name, self.console)
def find_index(seq, item):
    """Return the position of the first element equal to ``item``, or -1."""
    matches = (pos for pos, element in enumerate(seq) if item == element)
    return next(matches, -1)
def set_devices(sys_device_ids):
    """
    Export the given device ids through the CUDA_VISIBLE_DEVICES variable.

    Args:
        sys_device_ids: a tuple; which GPUs to use
            e.g. sys_device_ids = (), only use cpu
            sys_device_ids = (3,), use the 4-th gpu
            sys_device_ids = (0, 1, 2, 3,), use the first 4 gpus
            sys_device_ids = (0, 2, 4,), use the 1, 3 and 5 gpus

    Returns:
        None. Each id is written followed by ', ' (for example '0, 2, ').
    """
    import os
    # The former local `device_id` was computed but never used or returned,
    # so it has been dropped.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''.join(
        '{}, '.format(i) for i in sys_device_ids)
def transfer_optims(optims, device_id=-1):
    """Move the state of every ``torch.optim.Optimizer`` in ``optims``.

    Entries that are not optimizers are skipped.
    """
    optimizers = (o for o in optims if isinstance(o, torch.optim.Optimizer))
    for optimizer in optimizers:
        transfer_optim_state(optimizer.state, device_id=device_id)
def transfer_optim_state(state, device_id=-1):
    """Move every value of an optimizer state dict to a device, in place.

    Nested dicts are handled recursively. ``device_id == -1`` moves values to
    the CPU, any other id moves them to that NPU.

    Raises:
        RuntimeError: if a state value is a ``torch.nn.Parameter``.
    """
    for key, val in state.items():
        if isinstance(val, dict):
            transfer_optim_state(val, device_id=device_id)
        elif isinstance(val, torch.nn.Parameter):
            raise RuntimeError("Oops, state[{}] is a Parameter!".format(key))
        else:
            # The former `isinstance(val, Variable)` guard was removed: since
            # the Tensor/Variable merge (torch >= 0.4) it is true for every
            # tensor, so any tensor state (e.g. a momentum buffer) raised
            # instead of being moved.
            try:
                if device_id == -1:
                    state[key] = val.cpu()
                else:
                    state[key] = val.npu(device=device_id)
            except Exception:
                # Deliberately best effort: values that cannot be moved
                # (e.g. plain ints) stay as they are. Unlike the former bare
                # `except:`, KeyboardInterrupt/SystemExit are not swallowed.
                pass
def load_state_dict(model, src_state_dict):
    """
    Copy matching entries of ``src_state_dict`` into ``model`` and report the rest.

    Arguments:
        model: A torch.nn.Module object
        src_state_dict: a dict containing parameters and persistent buffers

    Entries whose name the model does not know are skipped; a failed copy
    only prints a warning. Names present on one side only are printed.
    """
    from torch.nn import Parameter

    target = model.state_dict()
    for name, value in src_state_dict.items():
        if name in target:
            tensor = value.data if isinstance(value, Parameter) else value
            try:
                target[name].copy_(tensor)
            except Exception:
                print("Warning: Error occurs when copying '{}'".format(name))

    only_in_model = set(target.keys()) - set(src_state_dict.keys())
    if len(only_in_model) > 0:
        print ("Keys not found in source state_dict: ")
        for key in only_in_model:
            print('\t', key)

    only_in_source = set(src_state_dict.keys()) - set(target.keys())
    if len(only_in_source):
        print ("Keys not found in destination state_dict: ")
        for key in only_in_source:
            print('\t', key)
def load_ckpt(modules_optims, ckpt_file, load_to_cpu=True, verbose=True):
    """
    Restore modules/optimizers from a checkpoint written by ``save_ckpt``.

    Args:
        modules_optims: A two-element list which contains module and optimizer
        ckpt_file: the check point file
        load_to_cpu: Boolean, whether to transform tensors in model & optimizer to cpu type
        verbose: print the checkpoint's epoch and scores after loading

    Returns:
        ``(epoch, scores)`` as stored in the checkpoint.
    """
    if load_to_cpu:
        map_location = lambda storage, loc: storage
    else:
        map_location = None
    ckpt = torch.load(ckpt_file, map_location=map_location)
    epoch, scores = ckpt['ep'], ckpt['scores']
    for target, saved_state in zip(modules_optims, ckpt['state_dicts']):
        target.load_state_dict(saved_state)
    if verbose:
        print("Resume from ckpt {}, \nepoch: {}, scores: {}".format(
            ckpt_file, epoch, scores))
    return epoch, scores
def save_ckpt(modules_optims, ep, scores, ckpt_file):
    """
    save state_dict of modules/optimizers to file
    Args:
        modules_optims: a two-element list which contains a module and a optimizer
        ep: the current epoch number
        scores: the performance of current module
        ckpt_file: the check point file path
    Note:
        torch.save() reserves device type and id of tensors to save.
        So when loading ckpt, you have to inform torch.load() to load these tensors
        to cpu or your desired gpu, if you change devices.
    """
    state_dicts = [m.state_dict() for m in modules_optims]
    ckpt = dict(state_dicts=state_dicts,
                ep=ep,
                scores=scores)
    # makedirs with exist_ok creates nested parents (os.mkdir failed when more
    # than one level was missing) and tolerates the directory appearing
    # between the check and the creation.
    os.makedirs(os.path.dirname(os.path.abspath(ckpt_file)), exist_ok=True)
    torch.save(ckpt, ckpt_file)
def adjust_lr_staircase(param_groups, base_lrs, ep, decay_at_epochs, factor):
    """Apply a staircase decay to each param group at the listed epochs.

    When ``ep`` is the k-th entry (1-based) of ``decay_at_epochs``, every
    group's learning rate becomes ``base_lr * factor ** k``; at any other
    epoch nothing is changed.

    Args:
        param_groups: a list of params
        base_lrs: starting learning rate, len(base_lrs) = len(params_groups)
        ep: current epoch, ep >= 1
        decay_at_epochs: a list or tuple; learning rates are multiplied by a factor
            at the begining of these epochs
        factor: a number in range (0, 1)
    Example:
        base_lrs = [0.1, 0.01]
        decay_at_epochs = [51, 101]
        factor = 0.1
    Note:
        It is meant to be called at the begining of an epoch
    """
    assert len(base_lrs) == len(param_groups), \
        'You should specify base lr for each param group.'
    assert ep >= 1, "Current epoch number should be >= 1"

    if ep in decay_at_epochs:
        exponent = find_index(decay_at_epochs, ep) + 1
        for group_no, (group, start_lr) in enumerate(zip(param_groups, base_lrs)):
            group['lr'] = start_lr * factor ** exponent
            message = '=====> Param group {}: lr adjusted to {:.10f}'.format(
                group_no, group['lr'])
            print(message.rstrip('0'))
def adjust_lr(optimizer, ep, finetuned_params_lr):
    """Set every param group's lr to ``finetuned_params_lr * 0.96 ** (ep // 8)``.

    In other words the learning rate is reduced by 4% every 8 epochs.
    """
    decayed_lr = finetuned_params_lr * (0.96 ** (ep // 8))
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr
def may_set_mode(maybe_modules, mode):
    """
    Put every ``torch.nn.Module`` in ``maybe_modules`` into train or eval mode.

    ``maybe_modules`` is an object or a list of objects; entries that are not
    modules are ignored. ``mode`` must be 'train' or 'eval'.
    """
    assert mode in ['train', 'eval']
    candidates = maybe_modules if is_iterable(maybe_modules) else [maybe_modules]
    for candidate in candidates:
        if not isinstance(candidate, torch.nn.Module):
            continue
        if mode == 'train':
            candidate.train()
        else:
            candidate.eval()
Binary file not shown.

After

Width:  |  Height:  |  Size: 8.4 KiB

@@ -0,0 +1,141 @@
import sys
import os
import numpy as np
import random
import math
import torch
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.nn.parallel import DataParallel
import pickle
import time
import argparse
from PIL import Image, ImageFont, ImageDraw
from baseline.model.DeepMAR import DeepMAR_ResNet50
from baseline.utils.utils import str2bool
from baseline.utils.utils import save_ckpt, load_ckpt
from baseline.utils.utils import load_state_dict
from baseline.utils.utils import set_devices
from baseline.utils.utils import set_seed
class Config(object):
    """Command-line configuration of the demo script.

    Parses ``sys.argv`` and loads the attribute names of the selected dataset
    from its pickle file.

    Raises:
        ValueError: if weights should be loaded but ``--model_weight_file``
            is empty, or the dataset name has no known pickle path.
    """

    def __init__(self):
        parser = argparse.ArgumentParser()
        # NOTE(review): `type=eval` evaluates the command-line text as Python
        # code. Kept for compatibility with existing invocations; consider
        # ast.literal_eval if these arguments can come from untrusted input.
        parser.add_argument('-d', '--sys_device_ids', type=eval, default=(0,))
        parser.add_argument('--set_seed', type=str2bool, default=False)
        # model
        parser.add_argument('--resize', type=eval, default=(224, 224))
        parser.add_argument('--last_conv_stride', type=int, default=2, choices=[1, 2])
        # demo image
        parser.add_argument('--demo_image', type=str, default='./dataset/demo/demo_image.png')
        ## dataset parameter
        parser.add_argument('--dataset', type=str, default='peta',
                            choices=['peta', 'rap', 'pa100k'])
        # utils
        parser.add_argument('--load_model_weight', type=str2bool, default=True)
        parser.add_argument('--model_weight_file', type=str, default='./exp/deepmar_resnet50/peta/partition0/run1/model/ckpt_epoch150.pth')
        args = parser.parse_args()

        # gpu ids
        self.sys_device_ids = args.sys_device_ids
        # random
        self.set_seed = args.set_seed
        self.rand_seed = 0 if self.set_seed else None
        self.resize = args.resize
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        # utils
        self.load_model_weight = args.load_model_weight
        self.model_weight_file = args.model_weight_file
        if self.load_model_weight and self.model_weight_file == '':
            print ('Please input the model_weight_file if you want to load model weight')
            raise ValueError('model_weight_file is empty')
        # dataset
        datasets = dict()
        datasets['peta'] = './dataset/peta/peta_dataset.pkl'
        datasets['rap'] = './dataset/rap/rap_dataset.pkl'
        datasets['pa100k'] = './dataset/pa100k/pa100k_dataset.pkl'
        if args.dataset not in datasets:
            print ('%s does not exist.'%(args.dataset))
            raise ValueError('%s does not exist.' % (args.dataset))
        # Pickles must be read in binary mode (text mode fails on Python 3);
        # the context manager also closes the file, which was leaked before.
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # trusted dataset files.
        with open(datasets[args.dataset], 'rb') as dataset_file:
            dataset = pickle.load(dataset_file)
        self.att_list = [dataset['att_name'][i] for i in dataset['selected_attribute']]
        # demo image
        self.demo_image = args.demo_image
        # model
        model_kwargs = dict()
        model_kwargs['num_att'] = len(self.att_list)
        model_kwargs['last_conv_stride'] = args.last_conv_stride
        self.model_kwargs = model_kwargs
### main function ###
cfg = Config()

# dump the configuration to log.
import pprint
print('-' * 60)
print('cfg.__dict__')
pprint.pprint(cfg.__dict__)
print('-' * 60)

# set the random seed
if cfg.set_seed:
    set_seed( cfg.rand_seed )
# init the gpu ids
set_devices(cfg.sys_device_ids)

# dataset: resize, convert to tensor, normalise with the configured mean/std
normalize = transforms.Normalize(mean=cfg.mean, std=cfg.std)
test_transform = transforms.Compose([
    transforms.Resize(cfg.resize),
    transforms.ToTensor(),
    normalize,])

### Att model ###
model = DeepMAR_ResNet50(**cfg.model_kwargs)

# load model weight if necessary
if cfg.load_model_weight:
    map_location = (lambda storage, loc:storage)
    ckpt = torch.load(cfg.model_weight_file, map_location=map_location)
    model.load_state_dict(ckpt['state_dicts'][0])

# NOTE(review): this demo still targets CUDA although the training code in
# this repository runs on NPU - confirm the intended device.
model.cuda()
model.eval()

# load one image
# Force 3 channels: the normalisation above has 3 mean/std entries and the
# red text drawn below needs an RGB image, so a grayscale, palette or RGBA
# file would otherwise fail.
img = Image.open(cfg.demo_image).convert('RGB')
img_trans = test_transform( img )
img_trans = torch.unsqueeze(img_trans, dim=0)
img_var = Variable(img_trans).cuda()
# inference only: no autograd graph is needed
with torch.no_grad():
    score = model(img_var).data.cpu().numpy()

# attributes with a non-negative score are the predicted positives; collect
# them once and reuse the list for the console and for the image overlay
positive_lines = ['%s: %.2f'%(att_name, score[0, idx])
                  for idx, att_name in enumerate(cfg.att_list)
                  if score[0, idx] >= 0]

# show the score in command line
for txt in positive_lines:
    print (txt)

# show the score in the image, one text line every 10 pixels
img = img.resize(size=(256, 512), resample=Image.BILINEAR)
draw = ImageDraw.Draw(img)
for positive_cnt, txt in enumerate(positive_lines):
    draw.text((10, 10 + 10*positive_cnt), txt, (255, 0, 0))
img.save('./dataset/demo/demo_image_result.png')
@@ -0,0 +1,483 @@
import sys
import os
import numpy as np
import random
import math
import torch
import torch.optim as optim
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.nn.parallel import DataParallel
import pickle
import time
import argparse
import pdb
import sys
from torch.utils.tensorboard import SummaryWriter
from baseline.dataset import add_transforms
from baseline.dataset.Dataset import AttDataset
from baseline.model.DeepMAR import DeepMAR_ResNet50
from baseline.model.DeepMAR import DeepMAR_ResNet50_ExtractFeature
from baseline.utils.evaluate import attribute_evaluate
from baseline.utils.utils import str2bool
from baseline.utils.utils import transfer_optim_state
from baseline.utils.utils import time_str
from baseline.utils.utils import save_ckpt, load_ckpt
from baseline.utils.utils import load_state_dict
from baseline.utils.utils import ReDirectSTD
from baseline.utils.utils import adjust_lr_staircase
from baseline.utils.utils import adjust_lr
from baseline.utils.utils import set_devices
from baseline.utils.utils import AverageMeter
from baseline.utils.utils import to_scalar
from baseline.utils.utils import may_set_mode
from baseline.utils.utils import may_mkdir
from baseline.utils.utils import set_seed
from baseline.utils.utils import seed_everything
# Apex
import numpy as np
from apex import amp
import torch.npu
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# Benchmark log bootstrap: point hwlog at this script's directory, collect the
# environment / model descriptions and emit them as remark lines.
hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
(cpu_info, npu_info, framework_info,
 os_info, benchmark_version) = get_environment_info("pytorch")
config_info = get_model_parameter("pytorch_config")
# NOTE(review): these values are fixed literals, not read from the parsed
# command line; confirm they describe the run actually being benchmarked.
initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

# (key, value) pairs are emitted strictly in this order.
for _remark_key, _remark_value in (
    (hwlog.CPU_INFO, cpu_info),
    (hwlog.NPU_INFO, npu_info),
    (hwlog.OS_INFO, os_info),
    (hwlog.FRAMEWORK_INFO, framework_info),
    (hwlog.BENCHMARK_VERSION, benchmark_version),
    (hwlog.CONFIG_INFO, config_info),
    (hwlog.BASE_LR, initinal_data.get("base_lr")),
    (hwlog.DATASET, initinal_data.get("dataset")),
    (hwlog.OPT_NAME, initinal_data.get("optimizer")),
    (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
):
    hwlog.remark_print(key=_remark_key, value=_remark_value)

# Placeholder device name; the script reassigns it from ``cfg.npu`` after the
# command line has been parsed.
CALCULATE_DEVICE = "npu:7"
PRINT_DEVICE = "cpu"
class Config(object):
    """Command-line configuration for the single-NPU DeepMAR training script.

    The constructor parses ``sys.argv`` with argparse and copies the parsed
    values onto the instance as plain attributes.  It also derives the
    dataset / partition pickle paths from ``--save_dir``, fills in a default
    experiment directory when ``--exp_dir`` is empty, and calls ``may_mkdir``
    on the stdout log path.

    Raises:
        ValueError: if the selected dataset has no registered pickle files,
            if ``--resume`` is given without ``--ckpt_file``, or if
            ``--load_model_weight`` is given without ``--model_weight_file``.
    """

    def __init__(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('--npu', type=int, default=0, help='NPU id to use.')
        parser.add_argument('--set_seed', type=str2bool, default=False)
        ## dataset parameter
        parser.add_argument('--dataset', type=str, default='peta',
                            choices=['peta','rap', 'pa100k', 'rap2'])
        parser.add_argument('--save_dir', type=str, default='/home/zhusiyi/dataset/peta/')
        parser.add_argument('--split', type=str, default='trainval',
                            choices=['trainval', 'train'])
        parser.add_argument('--test_split', type=str, default='test')
        parser.add_argument('--partition_idx', type=int, default=0)
        # NOTE(security): ``type=eval`` executes arbitrary Python taken from
        # the command line (also used for --staircase_decay_at_epochs below).
        # Kept for compatibility with existing launch scripts; only pass
        # trusted values.
        parser.add_argument('--resize', type=eval, default=(224, 224))
        parser.add_argument('--mirror', type=str2bool, default=True)
        parser.add_argument('--batch_size', type=int, default=32)
        parser.add_argument('--workers', type=int, default=2)
        # model
        parser.add_argument('--num_att', type=int, default=35)
        parser.add_argument('--pretrained', type=str2bool, default=True)
        parser.add_argument('--last_conv_stride', type=int, default=2, choices=[1,2])
        parser.add_argument('--drop_pool5', type=str2bool, default=True)
        parser.add_argument('--drop_pool5_rate', type=float, default=0.5)
        parser.add_argument('--sgd_weight_decay', type=float, default=0.0005)
        parser.add_argument('--sgd_momentum', type=float, default=0.9)
        parser.add_argument('--new_params_lr', type=float, default=0.001)
        parser.add_argument('--finetuned_params_lr', type=float, default=0.001)
        parser.add_argument('--staircase_decay_at_epochs', type=eval,
                            default=(51, ))
        parser.add_argument('--staircase_decay_multiple_factor', type=float,
                            default=0.1)
        parser.add_argument('--total_epochs', type=int, default=150)
        parser.add_argument('--weighted_entropy', type=str2bool, default=True)
        # utils
        parser.add_argument('--resume', type=str2bool, default=False)
        parser.add_argument('--ckpt_file', type=str, default='')
        parser.add_argument('--load_model_weight', type=str2bool, default=False)
        parser.add_argument('--model_weight_file', type=str, default='')
        parser.add_argument('--test_only', type=str2bool, default=False)
        parser.add_argument('--exp_dir', type=str, default='')
        parser.add_argument('--exp_subpath', type=str, default='deepmar_resnet50')
        parser.add_argument('--log_to_file', type=str2bool, default=True)
        parser.add_argument('--steps_per_log', type=int, default=20)
        parser.add_argument('--epochs_per_val', type=int, default=10)
        parser.add_argument('--epochs_per_save', type=int, default=50)
        parser.add_argument('--run', type=int, default=1)
        # apex
        parser.add_argument('--amp', default=False, action='store_true',
                            help='use amp to train the model')
        parser.add_argument('--loss_scale', default=-1., type=float,
                            help='loss scale using in amp, default -1 means dynamic')
        parser.add_argument('--opt_level', default='O1', type=str,
                            help='opt level using in amp, default O1 means FP16')
        args = parser.parse_args()

        # device id
        self.npu = args.npu

        # random seed: 0 when seeding is requested, otherwise None
        self.set_seed = args.set_seed
        self.seed = 0 if self.set_seed else None

        # amp
        self.amp = args.amp
        self.loss_scale = args.loss_scale
        self.opt_level = args.opt_level

        # run time index (part of the default experiment directory)
        self.run = args.run

        # Dataset: only the PETA pickles are registered here, although
        # ``--dataset`` also accepts 'rap', 'pa100k' and 'rap2'; selecting one
        # of those raises below.
        datasets = {'peta': os.path.join(args.save_dir, 'peta_dataset.pkl')}
        partitions = {'peta': os.path.join(args.save_dir, 'peta_partition.pkl')}
        self.dataset_name = args.dataset
        if args.dataset not in datasets or args.dataset not in partitions:
            raise ValueError(
                "Please select the right dataset name. "
                "Got {!r}, registered: {}".format(args.dataset, sorted(datasets)))
        self.dataset = datasets[args.dataset]
        self.partition = partitions[args.dataset]
        self.partition_idx = args.partition_idx
        self.split = args.split
        self.test_split = args.test_split
        self.resize = args.resize
        self.mirror = args.mirror
        # fixed per-channel normalisation constants
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        self.batch_size = args.batch_size
        self.workers = args.workers

        # optimization
        self.sgd_momentum = args.sgd_momentum
        self.sgd_weight_decay = args.sgd_weight_decay
        self.new_params_lr = args.new_params_lr
        self.finetuned_params_lr = args.finetuned_params_lr
        self.staircase_decay_at_epochs = args.staircase_decay_at_epochs
        self.staircase_decay_multiple_factor = args.staircase_decay_multiple_factor
        self.total_epochs = args.total_epochs
        self.weighted_entropy = args.weighted_entropy

        # utils
        self.resume = args.resume
        self.ckpt_file = args.ckpt_file
        if self.resume and self.ckpt_file == '':
            raise ValueError(
                'Please input the ckpt_file if you want to resume training')
        self.load_model_weight = args.load_model_weight
        self.model_weight_file = args.model_weight_file
        if self.load_model_weight and self.model_weight_file == '':
            raise ValueError(
                'Please input the model_weight_file if you want to load model weight')
        self.test_only = args.test_only
        self.exp_dir = args.exp_dir
        self.exp_subpath = args.exp_subpath
        self.log_to_file = args.log_to_file
        self.steps_per_log = args.steps_per_log
        self.epochs_per_val = args.epochs_per_val
        self.epochs_per_save = args.epochs_per_save

        # keyword arguments forwarded to the model constructor
        self.model_kwargs = {
            'num_att': args.num_att,
            'last_conv_stride': args.last_conv_stride,
            'drop_pool5': args.drop_pool5,
            'drop_pool5_rate': args.drop_pool5_rate,
        }

        # extra keyword arguments for evaluation (none by default)
        self.test_kwargs = dict()

        # default experiment directory:
        # exp/<exp_subpath>/<dataset>/partition<idx>/run<run>
        if self.exp_dir == '':
            self.exp_dir = os.path.join('exp',
                                        '{}'.format(self.exp_subpath),
                                        '{}'.format(self.dataset_name),
                                        'partition{}'.format(self.partition_idx),
                                        'run{}'.format(self.run))

        # One timestamp for both files so the stdout/stderr logs of a run
        # always carry matching names.
        timestamp = time_str()
        log_dir = os.path.join(self.exp_dir, 'log')
        self.stdout_file = os.path.join(log_dir, 'stdout_{}.txt'.format(timestamp))
        self.stderr_file = os.path.join(log_dir, 'stderr_{}.txt'.format(timestamp))
        may_mkdir(self.stdout_file)
### main function ###
# Script-level setup for single-NPU training: parse the configuration, pick
# the device, build the datasets, model, optimizer and loss weights, then
# optionally restore weights or a checkpoint.
cfg = Config()
# Redirect stdout/stderr to the per-run log files when requested.
if cfg.log_to_file:
    ReDirectSTD(cfg.stdout_file, 'stdout', False)
    ReDirectSTD(cfg.stderr_file, 'stderr', False)
# dump the configuration to log.
import pprint
print('-' * 60)
print('cfg.__dict__')
pprint.pprint(cfg.__dict__)
print('-' * 60)
# set the random seed (cfg.seed is None unless --set_seed was given)
print(cfg.seed)
if cfg.set_seed:
    set_seed(cfg.seed)
    seed_everything(cfg.seed)
# Select the NPU given by --npu; this replaces the placeholder device name
# assigned near the top of the file.
CALCULATE_DEVICE = "npu:{}".format(cfg.npu)
torch.npu.set_device(CALCULATE_DEVICE)
# Training-time preprocessing. The horizontal flip is always applied here;
# cfg.mirror is not consulted in this block.
normalize = transforms.Normalize(mean=cfg.mean, std=cfg.std)
transform = transforms.Compose([
    transforms.Resize(cfg.resize),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(), # 3*H*W, [0, 1]
    normalize,]) # normalize with mean/std
# by a subset of attributes
train_set = AttDataset(
    dataset = cfg.dataset,
    partition = cfg.partition,
    split = cfg.split,
    partition_idx= cfg.partition_idx,
    transform = transform)
# The number of attributes comes from the dataset itself and overrides the
# --num_att value stored in the model keyword arguments.
num_att = len(train_set.dataset['selected_attribute'])
cfg.model_kwargs['num_att'] = num_att
train_loader = torch.utils.data.DataLoader(
    dataset = train_set,
    batch_size = cfg.batch_size,
    shuffle = True,
    num_workers = cfg.workers,
    pin_memory = True,
    drop_last = True)
# Evaluation-time preprocessing: same as training but without the flip.
test_transform = transforms.Compose([
    transforms.Resize(cfg.resize),
    transforms.ToTensor(),
    normalize,])
test_set = AttDataset(
    dataset = cfg.dataset,
    partition = cfg.partition,
    split = cfg.test_split,
    partition_idx = cfg.partition_idx,
    transform = test_transform)
### Att model ###
model = DeepMAR_ResNet50(**cfg.model_kwargs)
# Optimizer: parameters whose name contains 'classifier' go into their own
# group with cfg.new_params_lr; all others use cfg.finetuned_params_lr.
finetuned_params = []
new_params = []
for n, p in model.named_parameters():
    if n.find('classifier') >=0:
        new_params.append(p)
    else:
        finetuned_params.append(p)
param_groups = [{'params': finetuned_params, 'lr': cfg.finetuned_params_lr},
                {'params': new_params, 'lr': cfg.new_params_lr}]
optimizer = optim.SGD(
    param_groups,
    momentum = cfg.sgd_momentum,
    weight_decay = cfg.sgd_weight_decay)
# Move the model to the NPU before amp.initialize sees it.
model = model.to(CALCULATE_DEVICE)
# apex mixed precision (only with --amp)
if cfg.amp:
    # Initialization
    model, optimizer = amp.initialize(model, optimizer, opt_level=cfg.opt_level, loss_scale=cfg.loss_scale)
    print("=> Using amp mode.")
# using the weighted cross entropy loss: read the per-attribute rate stored
# in the partition file for the selected attributes
if cfg.weighted_entropy:
    rate = np.array(train_set.partition['weight_' + cfg.split][cfg.partition_idx])
    rate = rate[train_set.dataset['selected_attribute']].tolist()
else:
    rate = None
# compute the weight of positive and negative labels:
# exp(1 - rate) for positives and exp(rate) for negatives, or 1 for both when
# weighting is disabled
if rate is None:
    weight_pos = [1 for i in range(num_att)]
    weight_neg = [1 for i in range(num_att)]
else:
    if len(rate) != num_att:
        print ("the length of rate should be equal to %d" % (num_att))
        raise ValueError
    weight_pos = []
    weight_neg = []
    for idx, v in enumerate(rate):
        weight_pos.append(math.exp(1.0 - v))
        weight_neg.append(math.exp(v))
# bind the model and optimizer (this list holds the model as it is *before*
# the DataParallel wrapping below)
modules_optims = [model, optimizer]
# load model weight if necessary; the checkpoint is mapped to CPU storage and
# loaded non-strictly, so missing/unexpected keys do not raise
if cfg.load_model_weight:
    map_location = (lambda storage, loc:storage)
    ckpt = torch.load(cfg.model_weight_file, map_location=map_location)
    model.load_state_dict(ckpt['state_dicts'][0], strict=False)
### Resume or not ###
if cfg.resume:
    # restore the model, optimizer and epoch from the checkpoint file
    start_epoch, scores = load_ckpt(modules_optims, cfg.ckpt_file)
else:
    start_epoch = 0
# Wrap the model only after weights/checkpoint were loaded into the bare model.
model = torch.nn.DataParallel(model)
transfer_optim_state(state=optimizer.state, device_id=cfg.npu)
# for evaluation
feat_func_att = DeepMAR_ResNet50_ExtractFeature(model=model)
def attribute_evaluate_subfunc(feat_func, test_set, device_id, **test_kwargs):
    """Evaluate attribute recognition on ``test_set`` and report the metrics.

    Runs ``attribute_evaluate``, prints the label-based mean accuracy and the
    instance-based accuracy / precision / recall / F1, and emits the four
    instance-based metrics through ``hwlog``.

    Args:
        feat_func: feature/score extractor forwarded to ``attribute_evaluate``.
        test_set: dataset forwarded to ``attribute_evaluate``.
        device_id: device forwarded to ``attribute_evaluate``.
        **test_kwargs: extra keyword arguments forwarded unchanged.

    Returns:
        The instance-based accuracy (``result['instance_acc']``).
    """
    result = attribute_evaluate(feat_func, test_set, device_id, **test_kwargs)
    print ('-' * 60)
    print ('Evaluation on %s set:' % (cfg.test_split))
    print ('Label-based evaluation: \n mA: %.4f'%(np.mean(result['label_acc'])))
    print ('Instance-based evaluation: \n Acc: %.4f, Prec: %.4f, Rec: %.4f, F1: %.4f' \
           %(result['instance_acc'], result['instance_precision'], result['instance_recall'], result['instance_F1']))
    print ('-' * 60)
    hwlog.remark_print(key=hwlog.ACC, value="{:.4f}".format(result['instance_acc']))
    hwlog.remark_print(key=hwlog.PREC, value="{:.4f}".format(result['instance_precision']))
    hwlog.remark_print(key=hwlog.REC, value="{:.4f}".format(result['instance_recall']))
    # Log the real F1 score; the recall value used to be logged under the F1
    # key by mistake.
    hwlog.remark_print(key=hwlog.F1, value="{:.4f}".format(result['instance_F1']))
    return result['instance_acc']
# test only: evaluate once and exit without training
if cfg.test_only:
    print ('test with feat_func_att')
    # The evaluation helper takes the device as its third positional argument;
    # omitting it made this branch fail with a TypeError.
    attribute_evaluate_subfunc(feat_func_att, test_set, CALCULATE_DEVICE, **cfg.test_kwargs)
    sys.exit(0)

# Per-attribute loss weights as tensors, built once so that the per-batch
# weight matrix can be computed with broadcasting instead of Python loops.
pos_weight_vec = torch.tensor(weight_pos, dtype=torch.float)
neg_weight_vec = torch.tensor(weight_neg, dtype=torch.float)

# training
for epoch in range(start_epoch, cfg.total_epochs):
    # When seeding is enabled, use a different (but reproducible) seed per epoch.
    if cfg.seed is not None:
        cfg.seed += 1
        seed_everything(cfg.seed)
    # adjust the learning rate (staircase schedule, epochs counted from 1)
    adjust_lr_staircase(
        optimizer.param_groups,
        [cfg.finetuned_params_lr, cfg.new_params_lr],
        epoch + 1,
        cfg.staircase_decay_at_epochs,
        cfg.staircase_decay_multiple_factor)
    may_set_mode(modules_optims, 'train')
    # recording loss
    loss_meter = AverageMeter()
    dataset_L = len(train_loader)  # number of batches per epoch
    ep_st = time.time()            # reset after every step, see below
    ep_st_mark = ep_st             # start of the epoch, for the epoch log

    # running every batch
    for step, (imgs, targets) in enumerate(train_loader):
        step_st = time.time()
        # time spent waiting for the data loader since the previous step ended
        data_time = step_st - ep_st
        imgs_var = Variable(imgs)
        targets_var = Variable(targets)

        # Element-wise loss weight: weight_pos[j] where the label is 1,
        # weight_neg[j] where it is -1, and 0 for any other label value.
        weights = ((targets_var == 1).float() * pos_weight_vec
                   + (targets_var == -1).float() * neg_weight_vec)

        # Negative labels are encoded as -1 in the data; the loss expects 0.
        targets_var[targets_var == -1] = 0
        targets_var = targets_var.to(CALCULATE_DEVICE)
        imgs_var = imgs_var.to(CALCULATE_DEVICE)
        weights = weights.to(CALCULATE_DEVICE)

        score = model(imgs_var)
        criterion = torch.nn.BCEWithLogitsLoss(weight=Variable(weights)).to(CALCULATE_DEVICE)
        loss = criterion(score, targets_var) * num_att

        optimizer.zero_grad()
        if cfg.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()

        ############
        # step log #
        ############
        loss_meter.update(to_scalar(loss))
        # wall time of the whole step, including data loading
        batch_time = time.time() - ep_st
        ep_st = time.time()
        fps = cfg.batch_size / batch_time
        if (step + 1) % cfg.steps_per_log == 0 or (step + 1) % len(train_loader) == 0:
            log = '{}, Step {}/{} in Ep {}, {:.2f}s, datatime:{:.6f}, batchtime:{:.6f}, FPS:{:.2f}, loss:{:.4f}'.format( \
                time_str(), step + 1, dataset_L, epoch + 1, time.time() - step_st, data_time, batch_time, fps, loss_meter.val)
            print(log)
            hwlog.remark_print(key=hwlog.FPS, value='{:.2f}'.format(fps))

    ##############
    # epoch log #
    ##############
    epoch_time = time.time() - ep_st_mark
    log = 'Ep{}, {:.2f}s, loss {:.4f}'.format(
        epoch+1, epoch_time, loss_meter.avg)
    print(log)

    # model ckpt: every cfg.epochs_per_save epochs and after the last epoch
    if (epoch + 1) % cfg.epochs_per_save == 0 or epoch+1 == cfg.total_epochs:
        ckpt_file = os.path.join(cfg.exp_dir, 'model', 'ckpt_epoch%d.pth'%(epoch+1))
        save_ckpt(modules_optims, epoch+1, 0, ckpt_file)

    ##########################
    # test on validation set #
    ##########################
    if (epoch + 1) % cfg.epochs_per_val == 0 or epoch+1 == cfg.total_epochs:
        print ('att test with feat_func_att')
        res = attribute_evaluate_subfunc(feat_func_att, test_set, CALCULATE_DEVICE, **cfg.test_kwargs)
@@ -0,0 +1,587 @@
import os
import random
import math
import torch
import torch.optim as optim
import torchvision.transforms as transforms
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from torch.nn.parallel import DataParallel
import pickle
import time
import argparse
import pdb
import sys
from torch.utils.tensorboard import SummaryWriter
from baseline.dataset import add_transforms
from baseline.dataset.Dataset import AttDataset
from baseline.model.DeepMAR import DeepMAR_ResNet50
from baseline.model.DeepMAR import DeepMAR_ResNet50_ExtractFeature
from baseline.utils.evaluate import attribute_evaluate
from baseline.utils.utils import str2bool
from baseline.utils.utils import transfer_optim_state
from baseline.utils.utils import time_str
from baseline.utils.utils import save_ckpt, load_ckpt
from baseline.utils.utils import load_state_dict
from baseline.utils.utils import ReDirectSTD
from baseline.utils.utils import adjust_lr_staircase
from baseline.utils.utils import adjust_lr
from baseline.utils.utils import set_devices
from baseline.utils.utils import AverageMeter
from baseline.utils.utils import to_scalar
from baseline.utils.utils import may_set_mode
from baseline.utils.utils import may_mkdir
from baseline.utils.utils import set_seed
from baseline.utils.utils import seed_everything
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.utils.data.distributed
# Apex
import numpy as np
from apex import amp
import torch.npu
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
class Config(object):
    """Command-line configuration for the distributed DeepMAR training script.

    The constructor parses ``sys.argv`` with argparse and copies the parsed
    values onto the instance as plain attributes.  It also derives the
    dataset / partition pickle paths from ``--save_dir``, fills in a default
    experiment directory when ``--exp_dir`` is empty, and calls ``may_mkdir``
    on the stdout log path.

    Raises:
        ValueError: if the selected dataset has no registered pickle files,
            if ``--resume`` is given without ``--ckpt_file``, or if
            ``--load_model_weight`` is given without ``--model_weight_file``.
    """

    def __init__(self):
        parser = argparse.ArgumentParser()
        # NOTE(security): ``type=eval`` executes arbitrary Python taken from
        # the command line (also used for --resize and
        # --staircase_decay_at_epochs below).  Kept for compatibility with
        # existing launch scripts; only pass trusted values.
        # --sys_device_ids is still accepted but no longer stored on the
        # instance.
        parser.add_argument('--sys_device_ids', type=eval, default=6)
        parser.add_argument('--npu', default=None, type=int, help='NPU id to use.')
        parser.add_argument('--set_seed', type=str2bool, default=False)
        ## dataset parameter
        parser.add_argument('--dataset', type=str, default='peta',
                            choices=['peta','rap', 'pa100k', 'rap2'])
        parser.add_argument('--save_dir', type=str, default='/home/zhusiyi/dataset/peta/')
        parser.add_argument('--split', type=str, default='trainval',
                            choices=['trainval', 'train'])
        parser.add_argument('--test_split', type=str, default='test')
        parser.add_argument('--partition_idx', type=int, default=0)
        parser.add_argument('--resize', type=eval, default=(224, 224))
        parser.add_argument('--mirror', type=str2bool, default=True)
        parser.add_argument('--batch_size', type=int, default=32)
        parser.add_argument('--workers', type=int, default=2)
        # model
        parser.add_argument('--num_att', type=int, default=35)
        parser.add_argument('--pretrained', type=str2bool, default=True)
        parser.add_argument('--last_conv_stride', type=int, default=2, choices=[1,2])
        parser.add_argument('--drop_pool5', type=str2bool, default=True)
        parser.add_argument('--drop_pool5_rate', type=float, default=0.5)
        parser.add_argument('--sgd_weight_decay', type=float, default=0.0005)
        parser.add_argument('--sgd_momentum', type=float, default=0.9)
        parser.add_argument('--new_params_lr', type=float, default=0.001)
        parser.add_argument('--finetuned_params_lr', type=float, default=0.001)
        parser.add_argument('--staircase_decay_at_epochs', type=eval,
                            default=(51, ))
        parser.add_argument('--staircase_decay_multiple_factor', type=float,
                            default=0.1)
        parser.add_argument('--total_epochs', type=int, default=150)
        parser.add_argument('--weighted_entropy', type=str2bool, default=True)
        # utils
        parser.add_argument('--resume', type=str2bool, default=False)
        parser.add_argument('--ckpt_file', type=str, default='')
        parser.add_argument('--load_model_weight', type=str2bool, default=False)
        parser.add_argument('--model_weight_file', type=str, default='')
        parser.add_argument('--test_only', type=str2bool, default=False)
        parser.add_argument('--exp_dir', type=str, default='')
        parser.add_argument('--exp_subpath', type=str, default='deepmar_resnet50')
        parser.add_argument('--log_to_file', type=str2bool, default=True)
        parser.add_argument('--steps_per_log', type=int, default=20)
        parser.add_argument('--epochs_per_val', type=int, default=10)
        parser.add_argument('--epochs_per_save', type=int, default=50)
        parser.add_argument('--run', type=int, default=1)
        # apex
        parser.add_argument('--amp', default=False, action='store_true',
                            help='use amp to train the model')
        parser.add_argument('--loss_scale', default=-1., type=float,
                            help='loss scale using in amp, default -1 means dynamic')
        parser.add_argument('--opt_level', default='O1', type=str,
                            help='opt level using in amp, default O1 means FP16')
        # distributed
        parser.add_argument('--addr', default='90.90.176.152', type=str,
                            help='master addr')
        parser.add_argument('--world_size', default=-1, type=int,
                            help='number of nodes for distributed training')
        parser.add_argument('--rank', default=-1, type=int,
                            help='node rank for distributed training')
        parser.add_argument('--dist_url', default='tcp://224.66.41.62:23456', type=str,
                            help='url used to set up distributed training')
        parser.add_argument('--dist_backend', default='nccl', type=str,
                            help='distributed backend')
        parser.add_argument('--multiprocessing_distributed', action='store_true',
                            help='Use multi-processing distributed training to launch '
                                 'N processes per node, which has N NPUs. This is the '
                                 'fastest way to use PyTorch for either single node or '
                                 'multi node data parallel training')
        parser.add_argument('--npus_per_node', default=None, type=int,
                            help='number of npus to use for distributed train on each node')
        args = parser.parse_args()

        # devices
        self.npus_per_node = args.npus_per_node
        self.npu = args.npu

        # random seed: 0 when seeding is requested, otherwise None
        self.set_seed = args.set_seed
        self.seed = 0 if self.set_seed else None

        # amp
        self.amp = args.amp
        self.loss_scale = args.loss_scale
        self.opt_level = args.opt_level

        # run time index (part of the default experiment directory)
        self.run = args.run

        # Dataset: only the PETA pickles are registered here, although
        # ``--dataset`` also accepts 'rap', 'pa100k' and 'rap2'; selecting one
        # of those raises below.
        datasets = {'peta': os.path.join(args.save_dir, 'peta_dataset.pkl')}
        partitions = {'peta': os.path.join(args.save_dir, 'peta_partition.pkl')}
        self.dataset_name = args.dataset
        if args.dataset not in datasets or args.dataset not in partitions:
            raise ValueError(
                "Please select the right dataset name. "
                "Got {!r}, registered: {}".format(args.dataset, sorted(datasets)))
        self.dataset = datasets[args.dataset]
        self.partition = partitions[args.dataset]
        self.partition_idx = args.partition_idx
        self.split = args.split
        self.test_split = args.test_split
        self.resize = args.resize
        self.mirror = args.mirror
        # fixed per-channel normalisation constants
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]
        self.batch_size = args.batch_size
        self.workers = args.workers

        # optimization
        self.sgd_momentum = args.sgd_momentum
        self.sgd_weight_decay = args.sgd_weight_decay
        self.new_params_lr = args.new_params_lr
        self.finetuned_params_lr = args.finetuned_params_lr
        self.staircase_decay_at_epochs = args.staircase_decay_at_epochs
        self.staircase_decay_multiple_factor = args.staircase_decay_multiple_factor
        self.total_epochs = args.total_epochs
        self.weighted_entropy = args.weighted_entropy

        # distributed
        self.addr = args.addr
        self.world_size = args.world_size
        self.rank = args.rank
        self.dist_url = args.dist_url
        self.dist_backend = args.dist_backend
        self.multiprocessing_distributed = args.multiprocessing_distributed

        # utils
        self.resume = args.resume
        self.ckpt_file = args.ckpt_file
        if self.resume and self.ckpt_file == '':
            raise ValueError(
                'Please input the ckpt_file if you want to resume training')
        self.load_model_weight = args.load_model_weight
        self.model_weight_file = args.model_weight_file
        if self.load_model_weight and self.model_weight_file == '':
            raise ValueError(
                'Please input the model_weight_file if you want to load model weight')
        self.test_only = args.test_only
        self.exp_dir = args.exp_dir
        self.exp_subpath = args.exp_subpath
        self.log_to_file = args.log_to_file
        self.steps_per_log = args.steps_per_log
        self.epochs_per_val = args.epochs_per_val
        self.epochs_per_save = args.epochs_per_save

        # keyword arguments forwarded to the model constructor
        self.model_kwargs = {
            'num_att': args.num_att,
            'last_conv_stride': args.last_conv_stride,
            'drop_pool5': args.drop_pool5,
            'drop_pool5_rate': args.drop_pool5_rate,
        }

        # extra keyword arguments for evaluation (none by default)
        self.test_kwargs = dict()

        # default experiment directory:
        # exp/<exp_subpath>/<dataset>/partition<idx>/run<run>
        if self.exp_dir == '':
            self.exp_dir = os.path.join('exp',
                                        '{}'.format(self.exp_subpath),
                                        '{}'.format(self.dataset_name),
                                        'partition{}'.format(self.partition_idx),
                                        'run{}'.format(self.run))

        # One timestamp for both files so the stdout/stderr logs of a run
        # always carry matching names.
        timestamp = time_str()
        log_dir = os.path.join(self.exp_dir, 'log')
        self.stdout_file = os.path.join(log_dir, 'stdout_{}.txt'.format(timestamp))
        self.stderr_file = os.path.join(log_dir, 'stderr_{}.txt'.format(timestamp))
        may_mkdir(self.stdout_file)
def main():
    """Parse the configuration and spawn one training process per NPU.

    Sets up log redirection, seeds, and the ``KERNEL_NAME_ID`` /
    ``MASTER_ADDR`` / ``MASTER_PORT`` environment variables, then starts
    ``main_worker`` through ``mp.spawn`` when ``--multiprocessing_distributed``
    is given.

    NOTE(review): without ``--multiprocessing_distributed`` this function
    returns after the setup and no training is started; confirm that this is
    intended.

    Raises:
        ValueError: if ``--multiprocessing_distributed`` is given without a
            positive ``--npus_per_node``.
    """
    cfg = Config()

    # Redirect stdout/stderr to the per-run log files when requested.
    if cfg.log_to_file:
        ReDirectSTD(cfg.stdout_file, 'stdout', False)
        ReDirectSTD(cfg.stderr_file, 'stderr', False)

    # dump the configuration to log.
    import pprint
    print('-' * 60)
    print('cfg.__dict__')
    pprint.pprint(cfg.__dict__)
    print('-' * 60)

    # Default value for the parent process; each worker overwrites it with
    # its own NPU index.
    os.environ['KERNEL_NAME_ID'] = str(0)
    print("+++++++++++++++++++++++++++KERNEL_NAME_ID:",os.environ['KERNEL_NAME_ID'])

    # set the random seed (cfg.seed is None unless --set_seed was given)
    print(cfg.seed)
    if cfg.set_seed:
        set_seed(cfg.seed)
        seed_everything(cfg.seed)

    # Rendezvous address for the process group; the port is fixed.
    os.environ['MASTER_ADDR'] = cfg.addr
    os.environ['MASTER_PORT'] = '29501'

    if cfg.dist_url == "env://" and cfg.world_size == -1:
        cfg.world_size = int(os.environ["WORLD_SIZE"])

    npus_per_node = cfg.npus_per_node
    if cfg.multiprocessing_distributed:
        # --npus_per_node defaults to None; fail with a clear message instead
        # of an opaque TypeError from ``None * world_size``.
        if npus_per_node is None or npus_per_node < 1:
            raise ValueError(
                '--npus_per_node must be a positive integer when '
                '--multiprocessing_distributed is set, got {!r}'.format(npus_per_node))
        # There is one process per NPU, so the total world size is the number
        # of nodes times the number of NPUs per node.
        cfg.world_size = npus_per_node * cfg.world_size
        mp.spawn(main_worker, nprocs=npus_per_node, args=(npus_per_node, cfg))
def main_worker(npu, npus_per_node, cfg):
    """Train DeepMAR on one NPU as one process of a distributed job.

    Started through ``mp.spawn`` from ``main``, which supplies the process
    index as ``npu``.

    Args:
        npu: index of the NPU used by this process; also used as the
            distributed rank when ``cfg.multiprocessing_distributed`` is set.
        npus_per_node: number of worker processes per node; the configured
            batch size and data-loader worker count are divided by it.
        cfg: parsed ``Config``.  It is mutated in place (``npu``, ``rank``,
            ``batch_size``, ``workers``, ``seed`` and
            ``model_kwargs['num_att']``).
    """
    cfg.npu = npu
    print("[npu id:", npu, "]", "+++++++++++++++++++++++++++ before set KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
    os.environ['KERNEL_NAME_ID'] = str(npu)
    print("[npu id:", npu, "]", "+++++++++++++++++++++++++++KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
    if npu is not None:
        print("[npu id:", npu, "]", "Use NPU: {} for training".format(npu))

    if cfg.dist_url == "env://" and cfg.rank == -1:
        cfg.rank = int(os.environ["RANK"])
    if cfg.multiprocessing_distributed:
        # NOTE(review): the rank is the local NPU index only, so the node rank
        # is ignored -- confirm before running on more than one node.
        cfg.rank = npu
    print("rank:",cfg.rank)
    # No init_method is passed; the MASTER_ADDR / MASTER_PORT environment
    # variables set in ``main`` are expected to provide the rendezvous.
    dist.init_process_group(backend=cfg.dist_backend,
                            world_size=cfg.world_size, rank=cfg.rank)

    CALCULATE_DEVICE = 'npu:{}'.format(npu)
    print(CALCULATE_DEVICE)
    torch.npu.set_device(CALCULATE_DEVICE)

    # With DistributedDataParallel the batch size has to be divided by hand
    # across the processes of a node; loader workers are split as well
    # (rounded up).
    cfg.batch_size = int(cfg.batch_size / npus_per_node)
    cfg.workers = int((cfg.workers + npus_per_node - 1) / npus_per_node)
    print("batchsize:", cfg.batch_size)
    print("workers", cfg.workers)

    # Training-time preprocessing.  The horizontal flip is always applied;
    # cfg.mirror is not consulted here.
    normalize = transforms.Normalize(mean=cfg.mean, std=cfg.std)
    transform = transforms.Compose([
        transforms.Resize(cfg.resize),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # 3*H*W, [0, 1]
        normalize,])            # normalize with mean/std
    train_set = AttDataset(
        dataset = cfg.dataset,
        partition = cfg.partition,
        split = cfg.split,
        partition_idx= cfg.partition_idx,
        transform = transform)
    # The number of attributes comes from the dataset itself and overrides
    # the --num_att value stored in the model keyword arguments.
    num_att = len(train_set.dataset['selected_attribute'])
    cfg.model_kwargs['num_att'] = num_att

    distributed = cfg.world_size > 1 or cfg.multiprocessing_distributed
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        dataset = train_set,
        batch_size = cfg.batch_size,
        shuffle=(train_sampler is None),
        num_workers = cfg.workers,
        pin_memory = True,
        sampler=train_sampler,
        drop_last = True)

    # Evaluation-time preprocessing: same as training but without the flip.
    test_transform = transforms.Compose([
        transforms.Resize(cfg.resize),
        transforms.ToTensor(),
        normalize,])
    test_set = AttDataset(
        dataset = cfg.dataset,
        partition = cfg.partition,
        split = cfg.test_split,
        partition_idx = cfg.partition_idx,
        transform = test_transform)

    ### Att model ###
    model = DeepMAR_ResNet50(**cfg.model_kwargs)

    # Optimizer: parameters whose name contains 'classifier' go into their
    # own group with cfg.new_params_lr; all others use cfg.finetuned_params_lr.
    finetuned_params = []
    new_params = []
    for n, p in model.named_parameters():
        if n.find('classifier') >= 0:
            new_params.append(p)
        else:
            finetuned_params.append(p)
    param_groups = [{'params': finetuned_params, 'lr': cfg.finetuned_params_lr},
                    {'params': new_params, 'lr': cfg.new_params_lr}]
    optimizer = optim.SGD(
        param_groups,
        momentum = cfg.sgd_momentum,
        weight_decay = cfg.sgd_weight_decay)

    # Move the model to the NPU before amp.initialize sees it.
    model = model.to(CALCULATE_DEVICE)
    # apex mixed precision (only with --amp)
    if cfg.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=cfg.opt_level, loss_scale=cfg.loss_scale)
        print("=> Using amp mode.")
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[cfg.npu], broadcast_buffers=False)

    # using the weighted cross entropy loss: read the per-attribute rate
    # stored in the partition file for the selected attributes
    if cfg.weighted_entropy:
        rate = np.array(train_set.partition['weight_' + cfg.split][cfg.partition_idx])
        rate = rate[train_set.dataset['selected_attribute']].tolist()
    else:
        rate = None
    # weight of positive and negative labels: exp(1 - rate) for positives and
    # exp(rate) for negatives, or 1 for both when weighting is disabled
    if rate is None:
        weight_pos = [1 for i in range(num_att)]
        weight_neg = [1 for i in range(num_att)]
    else:
        if len(rate) != num_att:
            print ("the length of rate should be equal to %d" % (num_att))
            raise ValueError
        weight_pos = []
        weight_neg = []
        for idx, v in enumerate(rate):
            weight_pos.append(math.exp(1.0 - v))
            weight_neg.append(math.exp(v))
    # Tensor form of the weights, built once so that the per-batch weight
    # matrix can be computed with broadcasting instead of Python loops.
    pos_weight_vec = torch.tensor(weight_pos, dtype=torch.float)
    neg_weight_vec = torch.tensor(weight_neg, dtype=torch.float)

    # bind the model and optimizer
    modules_optims = [model, optimizer]

    # load model weight if necessary; the checkpoint is mapped to CPU storage
    # and loaded non-strictly, so missing/unexpected keys do not raise
    if cfg.load_model_weight:
        map_location = (lambda storage, loc:storage)
        ckpt = torch.load(cfg.model_weight_file, map_location=map_location)
        model.load_state_dict(ckpt['state_dicts'][0], strict=False)

    ### Resume or not ###
    if cfg.resume:
        # restore the model, optimizer and epoch from the checkpoint file
        start_epoch, scores = load_ckpt(modules_optims, cfg.ckpt_file)
    else:
        start_epoch = 0

    # for evaluation
    feat_func_att = DeepMAR_ResNet50_ExtractFeature(model=model)

    # test only: evaluate once and exit without training
    if cfg.test_only:
        print ('test with feat_func_att')
        # The evaluation helper takes the device and the config as its third
        # and fourth positional arguments; omitting them made this branch
        # fail with a TypeError.
        attribute_evaluate_subfunc(feat_func_att, test_set, CALCULATE_DEVICE, cfg, **cfg.test_kwargs)
        sys.exit(0)

    # training
    for epoch in range(start_epoch, cfg.total_epochs):
        # When seeding is enabled, use a different (but reproducible) seed
        # per epoch.
        if cfg.seed is not None:
            cfg.seed += 1
            seed_everything(cfg.seed)
        if distributed:
            train_sampler.set_epoch(epoch)
        # adjust the learning rate (staircase schedule, epochs counted from 1)
        adjust_lr_staircase(
            optimizer.param_groups,
            [cfg.finetuned_params_lr, cfg.new_params_lr],
            epoch + 1,
            cfg.staircase_decay_at_epochs,
            cfg.staircase_decay_multiple_factor)
        may_set_mode(modules_optims, 'train')
        # recording loss
        loss_meter = AverageMeter()
        dataset_L = len(train_loader)  # number of batches per epoch
        ep_st = time.time()            # reset after every step, see below
        ep_st_mark = ep_st             # start of the epoch, for the epoch log

        # running every batch
        for step, (imgs, targets) in enumerate(train_loader):
            step_st = time.time()
            # time spent waiting for the data loader since the previous step
            data_time = step_st - ep_st
            imgs_var = Variable(imgs)
            targets_var = Variable(targets)

            # Element-wise loss weight: weight_pos[j] where the label is 1,
            # weight_neg[j] where it is -1, and 0 for any other label value.
            weights = ((targets_var == 1).float() * pos_weight_vec
                       + (targets_var == -1).float() * neg_weight_vec)

            # Negative labels are encoded as -1 in the data; the loss
            # expects 0.
            targets_var[targets_var == -1] = 0
            targets_var = targets_var.to(CALCULATE_DEVICE)
            imgs_var = imgs_var.to(CALCULATE_DEVICE)
            weights = weights.to(CALCULATE_DEVICE)

            score = model(imgs_var)
            criterion = torch.nn.BCEWithLogitsLoss(weight=Variable(weights)).to(CALCULATE_DEVICE)
            loss = criterion(score, targets_var) * num_att

            optimizer.zero_grad()
            if cfg.amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            ############
            # step log #
            ############
            loss_meter.update(to_scalar(loss))
            # wall time of the whole step, including data loading
            batch_time = time.time() - ep_st
            ep_st = time.time()
            # throughput of the node: per-process batch times processes
            fps = npus_per_node*cfg.batch_size / batch_time
            if (step + 1) % cfg.steps_per_log == 0 or (step + 1) % len(train_loader) == 0:
                log = '{}, Step {}/{} in Ep {}, {:.2f}s, datatime:{:.6f}, batchtime:{:.6f}, FPS:{:.2f}, loss:{:.4f}'.format( \
                    time_str(), step + 1, dataset_L, epoch + 1, time.time() - step_st, data_time, batch_time, fps, loss_meter.val)
                print(log)
                hwlog.remark_print(key=hwlog.FPS, value='{:.2f}'.format(fps))

        ##############
        # epoch log #
        ##############
        epoch_time = time.time() - ep_st_mark
        log = 'Ep{}, {:.2f}s, loss {:.4f}'.format(
            epoch+1, epoch_time, loss_meter.avg)
        print(log)

        # model ckpt: every cfg.epochs_per_save epochs and after the last one
        if (epoch + 1) % cfg.epochs_per_save == 0 or epoch+1 == cfg.total_epochs:
            ckpt_file = os.path.join(cfg.exp_dir, 'model', 'ckpt_epoch%d.pth'%(epoch+1))
            save_ckpt(modules_optims, epoch+1, 0, ckpt_file)

        ##########################
        # test on validation set #
        ##########################
        if (epoch + 1) % cfg.epochs_per_val == 0 or epoch+1 == cfg.total_epochs:
            print ('att test with feat_func_att')
            res = attribute_evaluate_subfunc(feat_func_att, test_set, CALCULATE_DEVICE, cfg, **cfg.test_kwargs)
def attribute_evaluate_subfunc(feat_func, test_set, device_id, cfg, **test_kwargs):
    """Evaluate attribute recognition and report the resulting metrics.

    Runs ``attribute_evaluate`` and prints the label-based mean accuracy and
    the instance-based accuracy / precision / recall / F1. The four
    instance-based metrics are also recorded through ``hwlog.remark_print``.

    Args:
        feat_func: feature function forwarded to ``attribute_evaluate``.
        test_set: dataset forwarded to ``attribute_evaluate``.
        device_id: device identifier forwarded to ``attribute_evaluate``.
        cfg: configuration object; only ``cfg.test_split`` is read here.
        **test_kwargs: extra keyword arguments forwarded to ``attribute_evaluate``.

    Returns:
        The instance-based accuracy, ``result['instance_acc']``.
    """
    result = attribute_evaluate(feat_func, test_set, device_id, **test_kwargs)
    print ('-' * 60)
    print ('Evaluation on %s set:' % (cfg.test_split))
    print ('Label-based evaluation: \n mA: %.4f'%(np.mean(result['label_acc'])))
    print ('Instance-based evaluation: \n Acc: %.4f, Prec: %.4f, Rec: %.4f, F1: %.4f' \
        %(result['instance_acc'], result['instance_precision'], result['instance_recall'], result['instance_F1']))
    print ('-' * 60)
    hwlog.remark_print(key=hwlog.ACC, value="{:.4f}".format(result['instance_acc']))
    hwlog.remark_print(key=hwlog.PREC, value="{:.4f}".format(result['instance_precision']))
    hwlog.remark_print(key=hwlog.REC, value="{:.4f}".format(result['instance_recall']))
    # The F1 remark must carry the F1 score; it previously repeated the recall.
    hwlog.remark_print(key=hwlog.F1, value="{:.4f}".format(result['instance_F1']))
    return result['instance_acc']
# Module-level stores filled by the hooks created in make_hook():
# name -> captured hook input / captured hook output.
inter_feature = {}
inter_gradient = {}


def make_hook(name, flag):
    """Build a hook callable that records intermediate values under ``name``.

    Args:
        name: key used in ``inter_feature`` / ``inter_gradient``.
        flag: ``'forward'`` returns a hook that stores its ``input`` argument
            in ``inter_feature``; ``'backward'`` returns a hook that stores
            its ``output`` argument in ``inter_gradient``.

    Returns:
        A function taking ``(m, input, output)``.
    """
    def _store_input(m, input, output):
        inter_feature[name] = input

    def _store_output(m, input, output):
        inter_gradient[name] = output

    if flag == 'forward':
        return _store_input
    if flag == 'backward':
        return _store_output
    # Any other flag is a programming error.
    assert False
if __name__ == '__main__':
    # Record the benchmark environment and the nominal training settings
    # through hwlog, then start training.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}
    # (key, value) pairs, emitted in this exact order.
    remarks = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initinal_data.get("base_lr")),
        (hwlog.DATASET, initinal_data.get("dataset")),
        (hwlog.OPT_NAME, initinal_data.get("optimizer")),
        (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
    )
    for remark_key, remark_value in remarks:
        hwlog.remark_print(key=remark_key, value=remark_value)
    main()
@@ -0,0 +1,96 @@
import os
import numpy as np
import random
import cPickle as pickle
from scipy.io import loadmat
np.random.seed(0)
random.seed(0)
def make_dir(path):
    """Create the directory ``path`` unless something already exists there."""
    if not os.path.exists(path):
        os.mkdir(path)
def generate_data_description(save_dir):
    """
    Create the PA-100K dataset description file, which consists of images, labels.

    Reads ``./dataset/pa100k/annotation.mat`` and pickles a dict with the
    image names, attribute labels and attribute names to
    ``<save_dir>/pa100k_dataset.pkl``. Images and labels are stored in the
    order train (80000), val (10000), test (10000).

    Args:
        save_dir: directory that receives ``pa100k_dataset.pkl``.
    """
    dataset = dict()
    dataset['description'] = 'pa100k'
    dataset['root'] = './dataset/pa100k/data/'
    dataset['image'] = []
    dataset['att'] = []
    dataset['att_name'] = []
    # list(...) keeps this a plain list on Python 3 as well as Python 2.
    dataset['selected_attribute'] = list(range(26))
    # load ANNOTATION.MAT; passing the path lets scipy open the file in
    # binary mode and close it again (no leaked text-mode handle).
    data = loadmat('./dataset/pa100k/annotation.mat')
    for idx in range(26):
        dataset['att_name'].append(data['attributes'][idx][0][0])
    for split, count in (('train', 80000), ('val', 10000), ('test', 10000)):
        names = data[split + '_images_name']
        labels = data[split + '_label']
        for idx in range(count):
            dataset['image'].append(names[idx][0][0])
            dataset['att'].append(labels[idx, :].tolist())
    # Pickle data is binary: write it in 'wb' mode.
    with open(os.path.join(save_dir, 'pa100k_dataset.pkl'), 'wb') as f:
        pickle.dump(dataset, f)
def create_trainvaltest_split(traintest_split_file):
    """
    Create the PA-100K split file, which consists of index of the train/val/test splits.

    The split is fixed: indices 0-79999 are train, 80000-89999 are val and
    90000-99999 are test. Each entry of the pickled dict is a one-element
    list. The ``weight_*`` entries hold, per attribute, the fraction of
    samples whose label equals 1.

    Args:
        traintest_split_file: path of the pickle file to write.
    """
    # load ANNOTATION.MAT; passing the path lets scipy open and close the
    # file itself (no leaked text-mode handle).
    data = loadmat('./dataset/pa100k/annotation.mat')
    # Explicit lists: on Python 3 a ``range`` cannot be concatenated with ``+``.
    train = list(range(80000))
    val = list(range(80000, 90000))
    test = list(range(90000, 100000))
    trainval = train + val
    # weight
    train_label = data['train_label'].astype('float32')
    trainval_label = np.concatenate((data['train_label'], data['val_label']), axis=0).astype('float32')
    weight_train = np.mean(train_label == 1, axis=0).tolist()
    weight_trainval = np.mean(trainval_label == 1, axis=0).tolist()
    partition = dict()
    partition['trainval'] = [trainval]
    partition['train'] = [train]
    partition['val'] = [val]
    partition['test'] = [test]
    partition['weight_trainval'] = [weight_trainval]
    partition['weight_train'] = [weight_train]
    # Pickle data is binary: write it in 'wb' mode.
    with open(traintest_split_file, 'wb') as f:
        pickle.dump(partition, f)
if __name__ == "__main__":
    # Command-line entry point: write the dataset description, then the split file.
    import argparse
    arg_parser = argparse.ArgumentParser(description="pa100k dataset")
    arg_parser.add_argument('--save_dir', type=str, default='./dataset/pa100k/')
    arg_parser.add_argument('--traintest_split_file', type=str,
                            default="./dataset/pa100k/pa100k_partition.pkl")
    args = arg_parser.parse_args()
    save_dir, traintest_split_file = args.save_dir, args.traintest_split_file
    generate_data_description(save_dir)
    create_trainvaltest_split(traintest_split_file)
@@ -0,0 +1,84 @@
import os
import numpy as np
import random
import pickle
from scipy.io import loadmat
np.random.seed(0)
random.seed(0)
def make_dir(path):
    """Make directory ``path``; do nothing when the path already exists."""
    if os.path.exists(path):
        return
    os.mkdir(path)
def generate_data_description(save_dir):
    """
    Write the PETA dataset description file (image names, labels, attribute names).

    Reads ``<save_dir>/PETA.mat`` and pickles the description dict to
    ``<save_dir>/peta_dataset.pkl``.
    """
    # load PETA.MAT
    data = loadmat(save_dir + '/PETA.mat')
    peta = data['peta'][0][0]
    attribute_names = [peta[1][idx, 0][0] for idx in range(105)]
    # Images are named 00001.png ... 19000.png; labels skip the first 4 columns.
    image_names = ['%05d.png' % (idx + 1) for idx in range(19000)]
    labels = [peta[0][idx, 4:].tolist() for idx in range(19000)]
    dataset = {
        'description': 'peta',
        'root': save_dir + '/images/',
        'image': image_names,
        'att': labels,
        'att_name': attribute_names,
        'selected_attribute': range(35),
    }
    output_path = os.path.join(save_dir, 'peta_dataset.pkl')
    with open(output_path, 'wb') as f:
        pickle.dump(dataset, f)
def create_trainvaltest_split(traintest_split_file, data_dir=None):
    """
    Create the PETA split file, which consists of index of the train/val/test splits.

    For each of the 5 partitions stored in ``PETA.mat`` the (0-based)
    train/val/test indices are collected. The ``weight_*`` entries hold, per
    attribute, the fraction of samples whose label equals 1.

    Args:
        traintest_split_file: path of the pickle file to write.
        data_dir: directory containing ``PETA.mat``. When omitted, the
            module-level ``save_dir`` set by the script entry point is used
            if present; otherwise the directory of ``traintest_split_file``.
    """
    if data_dir is None:
        # The function used to read the global ``save_dir`` unconditionally,
        # which raised NameError unless the module was run as a script.
        data_dir = globals().get('save_dir')
    if data_dir is None:
        data_dir = os.path.dirname(os.path.abspath(traintest_split_file))
    partition = dict()
    partition['trainval'] = []
    partition['train'] = []
    partition['val'] = []
    partition['test'] = []
    partition['weight_trainval'] = []
    partition['weight_train'] = []
    # load PETA.MAT
    data = loadmat(data_dir + '/PETA.mat')
    for idx in range(5):
        # Indices in the .mat file are 1-based; shift them to 0-based.
        train = (data['peta'][0][0][3][idx][0][0][0][0][:,0]-1).tolist()
        val = (data['peta'][0][0][3][idx][0][0][0][1][:,0]-1).tolist()
        test = (data['peta'][0][0][3][idx][0][0][0][2][:,0]-1).tolist()
        trainval = train + val
        partition['train'].append(train)
        partition['val'].append(val)
        partition['trainval'].append(trainval)
        partition['test'].append(test)
        # weight
        weight_trainval = np.mean(data['peta'][0][0][0][trainval, 4:].astype('float32')==1, axis=0).tolist()
        weight_train = np.mean(data['peta'][0][0][0][train, 4:].astype('float32')==1, axis=0).tolist()
        partition['weight_trainval'].append(weight_trainval)
        partition['weight_train'].append(weight_train)
    with open(traintest_split_file, 'wb') as f:
        pickle.dump(partition, f)
if __name__ == "__main__":
    # Command-line entry point: write the dataset description, then the split file.
    # NOTE(review): both defaults point into one developer's home directory;
    # callers are expected to pass --save_dir / --traintest_split_file.
    import argparse
    arg_parser = argparse.ArgumentParser(description="peta dataset")
    arg_parser.add_argument('--save_dir', type=str,
                            default='/home/zhusiyi/dataset/peta/')
    arg_parser.add_argument('--traintest_split_file', type=str,
                            default="/home/zhusiyi/dataset/peta/peta_partition.pkl")
    args = arg_parser.parse_args()
    # ``save_dir`` must stay a module-level name: create_trainvaltest_split reads it.
    save_dir, traintest_split_file = args.save_dir, args.traintest_split_file
    generate_data_description(save_dir)
    create_trainvaltest_split(traintest_split_file)
@@ -0,0 +1,76 @@
import os
import numpy as np
import random
import cPickle as pickle
from scipy.io import loadmat
np.random.seed(0)
random.seed(0)
def make_dir(path):
    """Create ``path`` as a directory when nothing exists at that location yet."""
    already_present = os.path.exists(path)
    if not already_present:
        os.mkdir(path)
def generate_data_description(save_dir):
    """
    Create the RAP dataset description file, which consists of images, labels.

    Reads ``./dataset/rap/RAP_annotation/RAP_annotation.mat`` and pickles a
    dict with the image names, attribute labels and attribute names to
    ``<save_dir>/rap_dataset.pkl``.

    Args:
        save_dir: directory that receives ``rap_dataset.pkl``.
    """
    dataset = dict()
    dataset['description'] = 'rap'
    dataset['root'] = './dataset/rap/RAP_dataset/'
    dataset['image'] = []
    dataset['att'] = []
    dataset['att_name'] = []
    # list(...) keeps this a plain list on Python 3 as well as Python 2.
    dataset['selected_attribute'] = list(range(51))
    # load Rap_annotation.mat; passing the path lets scipy open the file in
    # binary mode and close it again (no leaked text-mode handle).
    data = loadmat('./dataset/rap/RAP_annotation/RAP_annotation.mat')
    for idx in range(51):
        dataset['att_name'].append(data['RAP_annotation'][0][0][6][idx][0][0])
    for idx in range(41585):
        dataset['image'].append(data['RAP_annotation'][0][0][5][idx][0][0])
        dataset['att'].append(data['RAP_annotation'][0][0][1][idx, :].tolist())
    # Pickle data is binary: write it in 'wb' mode.
    with open(os.path.join(save_dir, 'rap_dataset.pkl'), 'wb') as f:
        pickle.dump(dataset, f)
def create_trainvaltest_split(traintest_split_file):
    """
    Create the RAP split file, which consists of index of the trainval/test splits.

    For each of the 5 partitions stored in the annotation file the (0-based)
    trainval/test indices are collected. ``weight_trainval`` holds, per
    attribute, the fraction of trainval samples whose label equals 1.

    Args:
        traintest_split_file: path of the pickle file to write.
    """
    partition = dict()
    partition['trainval'] = []
    partition['test'] = []
    partition['weight_trainval'] = []
    # load RAP_annotation.mat; passing the path lets scipy open and close the
    # file itself (no leaked text-mode handle).
    data = loadmat('./dataset/rap/RAP_annotation/RAP_annotation.mat')
    for idx in range(5):
        # Indices in the .mat file are 1-based; shift them to 0-based.
        trainval = (data['RAP_annotation'][0][0][0][idx][0][0][0][0][0,:]-1).tolist()
        test = (data['RAP_annotation'][0][0][0][idx][0][0][0][1][0,:]-1).tolist()
        partition['trainval'].append(trainval)
        partition['test'].append(test)
        # weight
        weight_trainval = np.mean(data['RAP_annotation'][0][0][1][trainval, :].astype('float32')==1, axis=0).tolist()
        partition['weight_trainval'].append(weight_trainval)
    # Pickle data is binary: write it in 'wb' mode.
    with open(traintest_split_file, 'wb') as f:
        pickle.dump(partition, f)
if __name__ == "__main__":
    # Command-line entry point: write the dataset description, then the split file.
    import argparse
    arg_parser = argparse.ArgumentParser(description="rap dataset")
    arg_parser.add_argument('--save_dir', type=str, default='./dataset/rap/')
    arg_parser.add_argument('--traintest_split_file', type=str,
                            default="./dataset/rap/rap_partition.pkl")
    args = arg_parser.parse_args()
    save_dir, traintest_split_file = args.save_dir, args.traintest_split_file
    generate_data_description(save_dir)
    create_trainvaltest_split(traintest_split_file)
@@ -0,0 +1,86 @@
import os
import numpy as np
import random
import cPickle as pickle
from scipy.io import loadmat
np.random.seed(0)
random.seed(0)
def make_dir(path):
    """Ensure ``path`` exists by creating it as a directory when it is missing."""
    missing = not os.path.exists(path)
    if missing:
        os.mkdir(path)
def generate_data_description(save_dir):
    """
    Create the RAP2 dataset description file, which consists of images, labels.

    Reads ``./dataset/rap2/RAP_annotation/RAP_annotation.mat`` and pickles a
    dict with the image names, attribute labels, attribute names and the
    (0-based) selected attribute indices to ``<save_dir>/rap2_dataset.pkl``.

    Args:
        save_dir: directory that receives ``rap2_dataset.pkl``.
    """
    dataset = dict()
    dataset['description'] = 'rap2'
    dataset['root'] = './dataset/rap2/RAP_dataset/'
    dataset['image'] = []
    dataset['att'] = []
    dataset['att_name'] = []
    # load RAP_annotation.mat; passing the path lets scipy open the file in
    # binary mode and close it again (no leaked text-mode handle).
    data = loadmat('./dataset/rap2/RAP_annotation/RAP_annotation.mat')
    # Indices in the .mat file are 1-based; shift them to 0-based.
    dataset['selected_attribute'] = (data['RAP_annotation'][0][0][3][0,:]-1).tolist()
    for idx in range(152):
        dataset['att_name'].append(data['RAP_annotation'][0][0][2][idx][0][0])
    for idx in range(84928):
        dataset['image'].append(data['RAP_annotation'][0][0][0][idx][0][0])
        dataset['att'].append(data['RAP_annotation'][0][0][1][idx, :].tolist())
    # Pickle data is binary: write it in 'wb' mode.
    with open(os.path.join(save_dir, 'rap2_dataset.pkl'), 'wb') as f:
        pickle.dump(dataset, f)
def create_trainvaltest_split(traintest_split_file):
    """
    Create the RAP2 split file, which consists of index of the train/val/test splits.

    For each of the 5 partitions stored in the annotation file the (0-based)
    train/val/test indices are collected. The ``weight_*`` entries hold, per
    attribute, the fraction of samples whose label equals 1.

    Args:
        traintest_split_file: path of the pickle file to write.
    """
    partition = dict()
    partition['train'] = []
    partition['val'] = []
    partition['trainval'] = []
    partition['test'] = []
    partition['weight_train'] = []
    partition['weight_trainval'] = []
    # load RAP_annotation.mat; passing the path lets scipy open and close the
    # file itself (no leaked text-mode handle).
    data = loadmat('./dataset/rap2/RAP_annotation/RAP_annotation.mat')
    for idx in range(5):
        # Indices in the .mat file are 1-based; shift them to 0-based.
        train = (data['RAP_annotation'][0][0][4][0, idx][0][0][0][0,:]-1).tolist()
        val = (data['RAP_annotation'][0][0][4][0, idx][0][0][1][0,:]-1).tolist()
        test = (data['RAP_annotation'][0][0][4][0, idx][0][0][2][0,:]-1).tolist()
        trainval = train + val
        partition['trainval'].append(trainval)
        partition['train'].append(train)
        partition['val'].append(val)
        partition['test'].append(test)
        # weight
        weight_train = np.mean(data['RAP_annotation'][0][0][1][train, :].astype('float32')==1, axis=0).tolist()
        weight_trainval = np.mean(data['RAP_annotation'][0][0][1][trainval, :].astype('float32')==1, axis=0).tolist()
        partition['weight_train'].append(weight_train)
        partition['weight_trainval'].append(weight_trainval)
    # Pickle data is binary: write it in 'wb' mode.
    with open(traintest_split_file, 'wb') as f:
        pickle.dump(partition, f)
if __name__ == "__main__":
    # Command-line entry point: write the dataset description, then the split file.
    import argparse
    arg_parser = argparse.ArgumentParser(description="rap2 dataset")
    arg_parser.add_argument('--save_dir', type=str, default='./dataset/rap2/')
    arg_parser.add_argument('--traintest_split_file', type=str,
                            default="./dataset/rap2/rap2_partition.pkl")
    args = arg_parser.parse_args()
    save_dir, traintest_split_file = args.save_dir, args.traintest_split_file
    generate_data_description(save_dir)
    create_trainvaltest_split(traintest_split_file)
@@ -0,0 +1,31 @@
# Environment setup for the Ascend software stack.
# Exports LD_LIBRARY_PATH, PATH, ASCEND_OPP_PATH, OPTION_EXEC_EXTERN_PLUGIN_PATH
# and PYTHONPATH for the nnae installation when /usr/local/Ascend/nnae/latest
# exists, otherwise for the ascend-toolkit installation.
############## toolkit situation ################
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
############## nnae situation ################
if [ -d /usr/local/Ascend/nnae/latest ];then
    # The multiarch directory is /usr/lib/aarch64-linux-gnu (it was misspelled
    # "aarch64_64-linux-gnu" here, unlike in the toolkit branch below).
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
else
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
fi
# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/
export SLOG_PRINT_TO_STDOUT=0
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
export TASK_QUEUE_ENABLE=1
@@ -0,0 +1,62 @@
#!/bin/bash
# Launcher for DeepMar (PyTorch) training.
# Positional arguments: $1 rank size, $2 yaml config path, $3 tools directory.
# Inside a docker container additionally: $4 cluster flag, $5 list of node IPs.
rank_size=$1
yamlPath=$2
toolsPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
model_name=$(cd $currentDir/..;basename `pwd`)
if [ -f /.dockerenv ];then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi

# Load the settings from the yaml file
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")

# Remove old logs
rm -rf /var/log/npu/slog/host-0/*
rm -rf ${currentDir}/result/*.log

# Create the directory of this training job
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"

# Device list; when none is configured (or rank_size >= 8) use ids 0..rank_size-1 in order
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# First device id of device_group (first character once blanks are removed);
# the hw log inside the directory named after it is linked into the job directory
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`

if [ x"${CLUSTER}" == x"True" ];then
    this_ip=$(hostname -I |awk '{print $1}')
    ln -snf ${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/0/hw_deepmar.log ${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/
    # Copy the configuration files to every other node
    for ip in $MPIRUN_ALL_IP;do
        if [ x"$ip" != x"$this_ip" ];then
            scp $yamlPath root@$ip:$yamlPath
            scp ${jsonFilePath} root@$ip:${jsonFilePath}
        fi
    done
    export PATH=$PATH:/usr/local/mpirun4.0/bin
    # NOTE(review): mpirun_ip is not set in this script; presumably it comes
    # from the yaml settings evaluated above - confirm.
    # Every continuation backslash is preceded by a space so that arguments
    # cannot be glued together regardless of the next line's indentation.
    mpirun -H ${mpirun_ip} \
        --bind-to none -map-by slot \
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
        --prefix /usr/local/mpirun4.0/ \
        ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
    rank_id=0
    #for device_id in $device_group;do
    ln -snf ${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/${first_device_id}/hw_deepmar.log ${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/
    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
    # let rank_id++
    # done
fi
wait
@@ -0,0 +1,173 @@
#!/usr/bin/env bash
# Runs one DeepMar (PyTorch) training job.
# Positional arguments:
#   $1 device id, $2 rank size, $3 yaml config path, $4 job timestamp,
#   $5 tools directory (contains get_params_for_yaml.sh),
#   $6 rank id, or the literal "True" for a cluster run.
device_id=$1
rank_size=$2
yamlPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
export YAML_PATH=$3
mkdir -p ${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_deepmar/training_job_${currtime}/

# Load the settings from the yaml file
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")
export REMARK_LOG_FILE=hw_deepmar.log # remark log file name; must be "hw_" followed by the lower-case model name
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
#source ${currentDir}/config/npu_set_env.sh
source ${currentDir}/config/set_env_b023.sh

# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export HCCL_RANK_TABLE_PATH=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}
cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}

if [ x"$6" != x"True" ];then
    rank_id=$6
    export RANK_ID=$6
else
    # Cluster run: rank and device id are parsed from the output of atlasboost
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
        device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
        atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    hccljson=${train_job_dir}/*.json
    cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi

#mkdir exec path
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}
startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`

# Dataset preprocessing
python3.7 ${currentDir}/code/transform_peta.py \
    --save_dir=${data_url} \
    --traintest_split_file=${data_url}/peta_partition.pkl

# Choose the call arguments depending on single-card / multi-card
if [ x"$6" == x"True" ];then
    # multiple cards on multiple machines
    export CLUSTER=True
fi

if [ x"${mode}" == x"evaluate" ];then
    # This branch contained the Python keyword `pass`, which is not a shell
    # command ("command not found", non-zero status). Keep the non-zero
    # outcome but fail with an explicit message instead.
    echo "[ERROR] evaluate mode is not supported by the deepmar training script" >&2
    false
elif [ x"${rank_size}" == x"1" ];then
    # single card
    # NOTE(review): ${device} is only assigned in the cluster branch above;
    # presumably it otherwise comes from the yaml settings - confirm.
    python3.7 ${currentDir}/code/train_deepmar_resnet50.py \
        --dataset=peta \
        --save_dir=${data_url} \
        --workers=32 \
        --npu=${device} \
        --partition_idx=0 \
        --split=trainval \
        --test_split=test \
        --batch_size=${batch_size} \
        --resize="(224,224)" \
        --exp_subpath=deepmar_resnet50 \
        --new_params_lr=0.01 \
        --finetuned_params_lr=0.01 \
        --staircase_decay_at_epochs="(50,100)" \
        --total_epochs=${epoches} \
        --epochs_per_val=10 \
        --epochs_per_save=50 \
        --steps_per_log=10 \
        --drop_pool5=True \
        --drop_pool5_rate=0.5 \
        --run=1 \
        --resume=False \
        --ckpt_file= \
        --load_model_weight=False \
        --model_weight_file= \
        --amp \
        --opt_level O2 \
        --loss_scale 512 \
        --set_seed True \
        --pretrained True \
        --test_only=False > ${train_job_dir}/train_${rank_size}p.log 2>&1
elif [ ${rank_size} -le 8 ];then
    # multiple cards on a single machine
    #source ${currentDir}/config/set_env_b023.sh
    python3.7 ${currentDir}/code/train_deepmar_resnet50_8p.py \
        --addr=$(hostname -I |awk '{print $1}') \
        --save_dir=${data_url} \
        --dataset=peta \
        --workers=80 \
        --partition_idx=0 \
        --split=trainval \
        --test_split=test \
        --batch_size=${batch_size} \
        --resize="(224,224)" \
        --exp_subpath=deepmar_resnet50 \
        --new_params_lr=${lr} \
        --finetuned_params_lr=${lr} \
        --staircase_decay_at_epochs="(50,100)" \
        --total_epochs=${epoches} \
        --epochs_per_val=10 \
        --epochs_per_save=50 \
        --steps_per_log=10 \
        --drop_pool5=True \
        --drop_pool5_rate=0.5 \
        --run=1 \
        --resume=False \
        --ckpt_file= \
        --load_model_weight=False \
        --model_weight_file=ckpt_epoch101.pth \
        --amp \
        --opt_level O2 \
        --loss_scale 512.0 \
        --set_seed True \
        --pretrained True \
        --test_only=False \
        --dist_url 'tcp://127.0.0.1:50000' \
        --dist_backend 'hccl' \
        --multiprocessing_distributed \
        --world_size 1 \
        --npus_per_node=${rank_size} \
        --rank 0 > ${train_job_dir}/train_${rank_size}p.log 2>&1
fi
#taskset -c 0-20 python3.7 ${currentDir}/code/densenet121.py > ./train.log 2>&1
# $? is the exit status of the branch executed above
if [ $? -eq 0 ];then
    echo ":::ABK 1.0.0 deepmar train success"
    echo ":::ABK 1.0.0 deepmar train success" >> ${train_job_dir}/train_${rank_size}p.log
    echo ":::ABK 1.0.0 deepmar train success" >> ./hw_deepmar.log
else
    # The console message now matches the log files (it used to say "success").
    echo ":::ABK 1.0.0 deepmar train failed"
    echo ":::ABK 1.0.0 deepmar train failed" >> ${train_job_dir}/train_${rank_size}p.log
    echo ":::ABK 1.0.0 deepmar train failed" >> ./hw_deepmar.log
fi

# Total wall-clock time of the job as hours:minutes:seconds
endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
sumTime=$(( endTime_s - startTime_s ))
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 deepmar train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_deepmar.log
@@ -0,0 +1,25 @@
# DenseNet121_pytorch训练说明
### 1. 模型训练参数配置
在train/yaml/DenseNet121.yaml中修改相应配置, 配置项含义:
```
pytorch_config:
data_url: 数据集路径
epoches: 训练的 epoch 总数
batch_size: 1p 为 256,2p 为 512,4p 为 1024,8p 为 2048
lr: 默认值:1p 为 0.1,2p 为 0.2,4p 为 0.4,8p 为 0.8
seed: 49
docker_image: docker 镜像名称:版本号
```
------
@@ -0,0 +1,515 @@
import argparse
import os
import random
import shutil
import time
import warnings
import sys
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.npu
from torch.utils.tensorboard import SummaryWriter
from densenet_0_2_2 import densenet121
import numpy as np
from apex import amp
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
warnings.filterwarnings('ignore')
# Names of the public, lower-case, callable entries of torchvision.models;
# used as the allowed values of --arch.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

# Command-line interface of the training script. Help texts print the real
# default through argparse's ``%(default)s`` so they cannot drift from the
# ``default=`` values (several hand-written defaults were wrong).
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='densenet121',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: %(default)s)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: %(default)s)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=1, type=int,
                    metavar='N', help='print frequency (default: %(default)s)')
parser.add_argument('-ef', '--eval-freq', default=5, type=int,
                    metavar='N', help='evaluate frequency (default: %(default)s)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--npu', default=None, type=int,
                    help='NPU id to use.')
# apex
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--loss-scale', default=1024., type=float,
                    help='loss scale passed to amp.initialize (default: %(default)s)')
parser.add_argument('--opt-level', default='O2', type=str,
                    help='amp optimization level passed to amp.initialize '
                         '(default: %(default)s)')
def main():
    """Parse the command line, select the NPU, seed the RNGs and start training.

    Sets the module globals ``CALCULATE_DEVICE`` (``"npu:<id>"``, id 0 when
    ``--npu`` is not given) and ``best_acc1`` (reset to 0). With
    ``--multiprocessing-distributed`` one ``main_worker`` process is spawned
    per device reported by ``torch.npu.device_count()``; otherwise
    ``main_worker`` is called directly in this process.
    """
    global CALCULATE_DEVICE
    global best_acc1
    args = parser.parse_args()
    print(args)
    if args.npu is None:
        args.npu = 0
    best_acc1 = 0
    CALCULATE_DEVICE = "npu:{}".format(args.npu)
    torch.npu.set_device(CALCULATE_DEVICE)
    if args.seed is not None:
        # Use the parsed value: the bare name ``seed`` was never defined, so
        # passing --seed raised NameError.
        random.seed(args.seed)
        os.environ['PYTHONHASHSEED'] = str(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    ngpus_per_node = torch.npu.device_count()
    print('{} node found.'.format(ngpus_per_node))
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    """Set up one training process (model, optimizer, data) and train/evaluate.

    Args:
        gpu: Device index assigned to this process, or ``None``; copied to
            ``args.gpu``.
        ngpus_per_node: Number of devices per node. Used to compute the global
            rank and to split ``batch_size`` / ``workers`` between processes.
        args: Parsed command-line namespace. ``gpu``, ``rank``, ``batch_size``,
            ``workers`` and ``start_epoch`` may be overwritten in place.

    Side effects:
        Updates the module-level ``best_acc1``, writes TensorBoard events
        under ``runs/densenet121`` and stores checkpoints through
        ``save_checkpoint``.
    """
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        # The non-pretrained path always builds densenet121, whatever --arch says.
        model = densenet121()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = model.to(CALCULATE_DEVICE)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
                                          loss_scale=args.loss_scale)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=CALCULATE_DEVICE)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True)

    # NOTE(review): the validation loader also shuffles and drops the last
    # partial batch - presumably to keep batch shapes fixed; confirm.
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    writer = SummaryWriter(os.path.join('runs/densenet121'))
    try:
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch, args)

            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, args, writer)

            # Evaluate every eval_freq epochs and always after the final one.
            is_last_epoch = epoch == args.epochs - 1
            if (epoch + 1) % args.eval_freq != 0 and not is_last_epoch:
                continue

            acc1 = validate(val_loader, model, criterion, args, epoch, writer)

            # remember best acc@1 and save checkpoint
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            # Single-process runs checkpoint after every evaluation; spawned
            # runs only on the first rank of a node, after the final epoch.
            should_save = (not args.multiprocessing_distributed
                           or (args.rank % ngpus_per_node == 0 and is_last_epoch))
            if should_save:
                state = {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }
                if args.amp:
                    # Needed by amp.load_state_dict() on resume.
                    state['amp'] = amp.state_dict()
                save_checkpoint(state, is_best)
    finally:
        # Close the event file even when training aborts with an exception.
        writer.close()
def train(train_loader, model, criterion, optimizer, epoch, args, writer):
    """Run one training epoch, logging per-step metrics to TensorBoard.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise; switched to train mode here.
        criterion: Loss callable applied to ``(output, target)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Current epoch index, used for the progress prefix and the
            TensorBoard step.
        args: Namespace providing ``amp``, ``print_freq`` and ``batch_size``.
        writer: Object with ``add_scalar(tag, value, step)``.

    At the end of the epoch the throughput (``batch_size / average batch
    time``) is printed and recorded through ``hwlog``.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    steps_per_epoch = len(train_loader)
    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # Step index shared by every scalar logged for this batch.
        global_step = epoch * steps_per_epoch + i

        # measure data loading time
        data_time.update(time.time() - end)

        # Labels are cast to int32 before the transfer
        # (presumably a device requirement - TODO confirm).
        target = target.to(torch.int32)
        images, target = (images.to(CALCULATE_DEVICE, non_blocking=False),
                          target.to(CALCULATE_DEVICE, non_blocking=False))

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # add tensorboard
        writer.add_scalar('Train/Loss', losses.val, global_step)
        writer.add_scalar('Train/Acc@1', top1.val, global_step)
        writer.add_scalar('Train/Acc@5', top5.val, global_step)
        writer.add_scalar('Train/LR', optimizer.param_groups[0]['lr'], global_step)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        writer.add_scalar('Train/Time', batch_time.val, global_step)
        writer.add_scalar('Train/Time_Data', data_time.val, global_step)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

    # AverageMeter ignores its first warm-up updates, so ``avg`` is still 0
    # after a very short epoch; report 0 FPS instead of dividing by zero.
    fps = args.batch_size / batch_time.avg if batch_time.avg else 0.0
    fps_text = ' * FPS@all {:.3f}'.format(fps)
    print(fps_text)
    hwlog.remark_print(key=hwlog.FPS, value=fps_text)
def validate(val_loader, model, criterion, args, epoch=0, writer=None):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Progress is printed every ``args.print_freq`` batches. The final
    top-1/top-5 averages are printed and recorded through ``hwlog``; when
    ``writer`` is given, the epoch-level scalars are added under the ``Val/``
    tags at step ``epoch``.
    """
    time_meter = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    meters = [time_meter, loss_meter, top1, top5]
    progress = ProgressMeter(len(val_loader), meters, prefix='Test: ')

    # Evaluation mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            target = target.to(torch.int32)
            images = images.to(CALCULATE_DEVICE, non_blocking=False)
            target = target.to(CALCULATE_DEVICE, non_blocking=False)

            # Forward pass and loss.
            output = model(images)
            loss = criterion(output, target)

            # Accumulate loss and accuracy, weighted by batch size.
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            samples = images.size(0)
            loss_meter.update(loss.item(), samples)
            top1.update(acc1[0], samples)
            top5.update(acc5[0], samples)

            # Wall-clock time for this batch.
            time_meter.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq == 0:
                progress.display(step)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
    hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    if writer:
        for tag, meter in (('Val/Time', time_meter), ('Val/Loss', loss_meter),
                           ('Val/Acc@1', top1), ('Val/Acc@5', top5)):
            writer.add_scalar(tag, meter.avg, epoch)

    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize ``state`` to ``filename``; keep a named copy when it is the best.

    The copy is written to the current working directory and is named after
    ``state['best_acc1']`` and ``state['epoch']``.
    """
    torch.save(state, filename)
    if not is_best:
        return
    best_name = 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch'])
    shutil.copyfile(filename, best_name)
class AverageMeter(object):
    """Track the latest value and a running average that skips warm-up updates.

    The first ``start_count_index`` calls to :meth:`update` only refresh
    ``val`` and ``count``; they are excluded from ``sum`` and ``avg``.
    ``avg`` therefore stays 0 until more than ``start_count_index`` updates
    have been recorded.

    The excluded samples are tracked explicitly, so the average stays correct
    when ``n`` differs between calls. For a constant ``n`` the results match
    the earlier ``count - start_count_index * n`` formula.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()
        self.start_count_index = 10

    def reset(self):
        """Clear all statistics, including the warm-up bookkeeping."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        # Number of update() calls so far, and samples that entered ``sum``.
        self._updates = 0
        self._averaged = 0

    def update(self, val, n=1):
        """Record ``val`` as observed over ``n`` samples."""
        self.val = val
        self.count += n
        self._updates += 1
        if self._updates <= self.start_count_index:
            # Warm-up call: visible through ``val`` but not averaged.
            return
        self.sum += val * n
        self._averaged += n
        if self._averaged > 0:
            self.avg = self.sum / self._averaged

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
    """Print one progress line per call and mirror the accuracy values to hwlog.

    Args:
        num_batches: Total number of batches; fixes the width of the counter.
        meters: Objects whose ``str()`` is ``"<name> <value> (<average>)"``.
        prefix: Text placed in front of the batch counter.
    """

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the line for ``batch`` and log current Acc@1 / Acc@5 if present."""
        meter_entries = [str(meter) for meter in self.meters]
        entries = [self.prefix + self.batch_fmtstr.format(batch)] + meter_entries
        print('\t'.join(entries))

        # Log instrumentation point. The values go to the TRAIN_ACCURACY keys
        # whichever loop calls display(). Meters without an Acc@1 / Acc@5
        # entry are skipped instead of raising IndexError.
        train_acc1 = self._current_value(meter_entries, "Acc@1")
        train_acc5 = self._current_value(meter_entries, "Acc@5")
        if train_acc1 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        if train_acc5 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    @staticmethod
    def _current_value(entries, label):
        """Return the first token after ``label`` in ``entries``, or ``None``."""
        for entry in entries:
            _, found, rest = entry.partition(label)
            if found:
                return rest.strip().split(" ")[0]
        return None

    def _get_batch_fmtstr(self, num_batches):
        # Pad the running index to the width of the total, e.g. "[{:3d}/100]".
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
    """Apply step decay: ``args.lr`` scaled by 0.1 once per completed 30 epochs."""
    completed_decays = epoch // 30
    lr = args.lr * (0.1 ** completed_decays)
    # Every parameter group gets the same rate.
    for group in optimizer.param_groups:
        group['lr'] = lr
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each ``k`` in ``topk``.

    Args:
        output: Score tensor; the top-k search runs along dimension 1.
        target: Label tensor with one entry per row of ``output``.
        topk: Values of ``k`` to evaluate.

    Returns:
        A list with one single-element tensor per ``k``, holding the
        percentage of rows whose label is among the ``k`` best scores.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # ``correct`` inherits the transposed (non-contiguous) layout of
            # ``pred``; view(-1) raises on such a slice, reshape copies if needed.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
if __name__ == '__main__':
    # Point hwlog.ROOT_DIR at the directory that contains this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Record environment and hyper-parameter facts, in this order, before training.
    for log_key, log_value in (
            (hwlog.CPU_INFO, cpu_info),
            (hwlog.NPU_INFO, npu_info),
            (hwlog.OS_INFO, os_info),
            (hwlog.FRAMEWORK_INFO, framework_info),
            (hwlog.BENCHMARK_VERSION, benchmark_version),
            (hwlog.CONFIG_INFO, config_info),
            (hwlog.BASE_LR, initinal_data.get("base_lr")),
            (hwlog.DATASET, initinal_data.get("dataset")),
            (hwlog.OPT_NAME, initinal_data.get("optimizer")),
            (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
    ):
        hwlog.remark_print(key=log_key, value=log_value)

    main()
@@ -0,0 +1,538 @@
# -*- coding: utf-8 -*-
import argparse
import os
import random
import shutil
import time
import warnings
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from densenet_0_2_2 import densenet121
from apex import amp
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# Default value of --batch-size, and the sample count used to derive the
# gradient-accumulation factor when --benchmark is 1 (see train()).
BATCH_SIZE = 512
OPTIMIZER_BATCH_SIZE = 2048

# Lower-case callables exported by torchvision.models, used as --arch choices.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

# Help texts use argparse's %(default)s placeholder so the documented default
# always matches the real one.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
                    help='path to dataset')
# NOTE: main_worker() always instantiates densenet121(); --arch only shows up
# in log messages and in the checkpoint's 'arch' field.
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: %(default)s)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
                    metavar='N',
                    help='mini-batch size (default: %(default)s), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: %(default)s)',
                    dest='weight_decay')
parser.add_argument('--workspace', type=str, default='./', metavar='DIR',
                    help='path to directory where checkpoints will be stored')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: %(default)s)')
parser.add_argument('-ef', '--eval-freq', default=5, type=int,
                    metavar='N', help='evaluate frequency (default: %(default)s)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('-bm', '--benchmark', default=0, type=int,
                    metavar='N',
                    help='benchmark status: 0 steps the optimizer on every batch, '
                         '1 accumulates gradients over several batches before '
                         'each optimizer step (default: %(default)s)')
parser.add_argument('--device', default='npu', type=str,
                    help='npu or gpu')
parser.add_argument('--addr', default='10.136.181.115', type=str,
                    help='master addr')
parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str,
                    help='checkpoint-nameprefix')
parser.add_argument('--checkpoint-freq', default=0, type=int,
                    metavar='N',
                    help='checkpoint frequency (default: %(default)s). '
                         '0: save only one file per epoch; '
                         'n: save a different file every n epochs; '
                         '-1: no checkpoint (not supported)')
parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
# apex
parser.add_argument('--amp', default=False, action='store_true',
                    help='use amp to train the model')
parser.add_argument('--loss-scale', default=1024., type=float,
                    help='loss scale used by amp, -1 means dynamic (default: %(default)s)')
parser.add_argument('--opt-level', default='O2', type=str,
                    help='amp optimization level passed to amp.initialize '
                         '(default: %(default)s)')

# Silence every Python warning for the whole process.
warnings.filterwarnings('ignore')

# Best top-1 validation accuracy seen so far; updated by main_worker().
best_acc1 = 0
def device_id_to_process_device_map(device_list):
    """Map process index to device id from a comma-separated id string.

    The ids are parsed as integers and sorted ascending, so process 0 is
    paired with the smallest device id.
    """
    ordered_ids = sorted(int(token) for token in device_list.split(","))
    return dict(enumerate(ordered_ids))
def main():
    """Parse the command line, prepare the environment and start the worker(s).

    With ``--multiprocessing-distributed`` one ``main_worker`` process is
    spawned per device of this node; otherwise ``main_worker`` runs directly
    in the current process.
    """
    args = parser.parse_args()
    banner = "===============main()================="
    print(banner)
    print(args)
    print(banner)

    os.environ['KERNEL_NAME_ID'] = str(0)
    print("+++++++++++++++++++++++++++KERNEL_NAME_ID:",os.environ['KERNEL_NAME_ID'])

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # Rendezvous address for the process group; the port is fixed.
    os.environ.update({'MASTER_ADDR': args.addr, 'MASTER_PORT': '29688'})

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    args.process_device_map = device_id_to_process_device_map(args.device_list)

    # On NPU the device count comes from --device-list, otherwise from CUDA.
    ngpus_per_node = (len(args.process_device_map) if args.device == 'npu'
                      else torch.cuda.device_count())

    if not args.multiprocessing_distributed:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
        return

    # Since we have ngpus_per_node processes per node, the total world_size
    # needs to be adjusted accordingly
    args.world_size = ngpus_per_node * args.world_size
    # Use torch.multiprocessing.spawn to launch distributed processes: the
    # main_worker process function. Children inherit the parent's environment
    # variables, so each worker sets KERNEL_NAME_ID for itself.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
def main_worker(gpu, ngpus_per_node, args):
    """Per-process entry point: bind an NPU, build data/model, train and evaluate.

    Args:
        gpu: Process index on this node; looked up in
            ``args.process_device_map`` to obtain the device id.
        ngpus_per_node: Number of processes/devices on this node. Used for
            the global rank and to split ``batch_size`` / ``workers``.
        args: Parsed command-line namespace. ``gpu``, ``rank``, ``batch_size``,
            ``workers`` and ``start_epoch`` may be overwritten in place.

    Side effects:
        Sets ``KERNEL_NAME_ID`` in the environment, updates the module-level
        ``best_acc1`` and stores checkpoints through ``save_checkpoint``.
    """
    global best_acc1
    # NOTE(review): in the non-spawn path main() passes args.gpu (default
    # None), which is not a key of the map - confirm that path is unused.
    args.gpu = args.process_device_map[gpu]
    tag = ("[npu id:", args.gpu, "]")

    print(*tag, "+++++++++++++++++++++++++++ before set KERNEL_NAME_ID:",os.environ['KERNEL_NAME_ID'])
    os.environ['KERNEL_NAME_ID'] = str(gpu)
    print(*tag, "+++++++++++++++++++++++++++KERNEL_NAME_ID:",os.environ['KERNEL_NAME_ID'])

    if args.gpu is not None:
        print(*tag, "Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu

        if args.device == 'npu':
            # No init_method here: presumably relies on the MASTER_ADDR /
            # MASTER_PORT variables exported by main() - TODO confirm.
            dist.init_process_group(backend=args.dist_backend,
                                    world_size=args.world_size, rank=args.rank)
        else:
            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                    world_size=args.world_size, rank=args.rank)

    loc = 'npu:{}'.format(args.gpu)
    torch.npu.set_device(loc)

    # One process per device: split the node-level batch size and workers.
    args.batch_size = int(args.batch_size / ngpus_per_node)
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    print(*tag, "===============main_worker()=================")
    print(*tag, args)
    print(*tag, "===============main_worker()=================")

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    # create model (always densenet121, whatever --arch says)
    print(*tag, "=> creating model '{}'".format(args.arch))
    model = densenet121()
    model = model.to(loc)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(loc)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
                                          loss_scale=args.loss_scale)

    # NOTE(review): the model is wrapped unconditionally, so this entry point
    # presumably requires the process group initialised above - confirm.
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu],
                                                      broadcast_buffers=False)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    if args.evaluate:
        # validate() requires ngpus_per_node; omitting it raised TypeError.
        validate(val_loader, model, criterion, args, ngpus_per_node)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, ngpus_per_node)

        # Evaluate every eval_freq epochs and always after the final one.
        is_last_epoch = epoch == args.epochs - 1
        if (epoch + 1) % args.eval_freq != 0 and not is_last_epoch:
            continue

        acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Single-process runs checkpoint after every evaluation; spawned runs
        # only on the first rank of a node, after the final epoch.
        should_save = (not args.multiprocessing_distributed
                       or (args.rank % ngpus_per_node == 0 and is_last_epoch))
        if should_save:
            state = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            if args.amp:
                # Needed by amp.load_state_dict() on resume.
                state['amp'] = amp.state_dict()
            save_checkpoint(state, is_best)
def train(train_loader, model, criterion, optimizer, epoch, args,ngpus_per_node):
    """Run one training epoch on the NPU bound to this process.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise; switched to train mode here.
        criterion: Loss callable applied to ``(output, target)``.
        optimizer: Optimizer to step.
        epoch: Current epoch index (progress prefix only).
        args: Namespace providing ``gpu``, ``amp``, ``benchmark``,
            ``batch_size``, ``print_freq``, ``rank`` and
            ``multiprocessing_distributed``.
        ngpus_per_node: Devices per node; selects the logging rank and scales
            the reported throughput.

    With ``args.benchmark == 0`` the optimizer steps on every batch. With
    ``args.benchmark == 1`` gradients are accumulated over
    ``OPTIMIZER_BATCH_SIZE / args.batch_size`` batches, averaged and applied
    in one step.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    # Loop invariants.
    loc = 'npu:{}'.format(args.gpu)
    is_log_rank = (not args.multiprocessing_distributed
                   or args.rank % ngpus_per_node == 0)
    accumulate = args.benchmark == 1
    if accumulate:
        # At least 1, so a per-process batch larger than OPTIMIZER_BATCH_SIZE
        # cannot turn the modulo below into a division by zero.
        accumulation_steps = max(1, int(OPTIMIZER_BATCH_SIZE / args.batch_size))

    end = time.time()
    if accumulate:
        optimizer.zero_grad()

    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        target = target.to(torch.int32)
        images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        if args.benchmark == 0:
            optimizer.zero_grad()
        if args.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        if args.benchmark == 0:
            optimizer.step()
        elif accumulate and (i + 1) % accumulation_steps == 0:
            # Average the accumulated gradients before the step.
            for param_group in optimizer.param_groups:
                for param in param_group['params']:
                    if param.grad is not None:
                        param.grad /= accumulation_steps
            optimizer.step()
            optimizer.zero_grad()

        # measure elapsed time before printing, so the displayed "Time"
        # belongs to this batch rather than the previous one
        batch_time.update(time.time() - end)
        end = time.time()

        if is_log_rank and i % args.print_freq == 0:
            progress.display(i)

    if is_log_rank:
        # AverageMeter ignores its first warm-up updates, so ``avg`` is still
        # 0 after a very short epoch; report 0 FPS instead of dividing by zero.
        if batch_time.avg:
            fps = ngpus_per_node * args.batch_size / batch_time.avg
        else:
            fps = 0.0
        print("[npu id:",args.gpu,"]",'* FPS@all {:.3f}'.format(fps))
        hwlog.remark_print(key=hwlog.FPS, value=' * FPS@all {:.3f}'.format(fps))
def validate(val_loader, model, criterion, args,ngpus_per_node):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Only the logging rank (single-process runs, or the first rank of a node
    under ``--multiprocessing-distributed``) prints progress and records the
    final top-1/top-5 averages through ``hwlog``.
    """
    time_meter = AverageMeter('Time', ':6.3f')
    loss_meter = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader),
                             [time_meter, loss_meter, top1, top5],
                             prefix='Test: ')

    device = 'npu:{}'.format(args.gpu)
    reports = (not args.multiprocessing_distributed
               or args.rank % ngpus_per_node == 0)

    # Evaluation mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(val_loader):
            target = target.to(torch.int32)
            images = images.to(device, non_blocking=False)
            target = target.to(device, non_blocking=False)

            # Forward pass and loss.
            output = model(images)
            loss = criterion(output, target)

            # Accumulate loss and accuracy, weighted by batch size.
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            samples = images.size(0)
            loss_meter.update(loss.item(), samples)
            top1.update(acc1[0], samples)
            top5.update(acc5[0], samples)

            # Wall-clock time for this batch.
            time_meter.update(time.time() - tick)
            tick = time.time()

            if reports and step % args.print_freq == 0:
                progress.display(step)

        # TODO: this should also be done with the ProgressMeter
        if reports:
            print("[npu id:",args.gpu,"]",'[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
                  .format(top1=top1, top5=top5))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and duplicate it under a "best" name if asked.

    The duplicate lands in the current working directory; its name embeds
    ``state['best_acc1']`` and ``state['epoch']``.
    """
    torch.save(state, filename)
    if is_best:
        best_acc, epoch = state['best_acc1'], state['epoch']
        shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (best_acc, epoch))
class AverageMeter(object):
    """Keep the most recent value plus an average that excludes warm-up updates.

    The first ``start_count_index`` :meth:`update` calls change ``val`` and
    ``count`` only; ``sum`` and ``avg`` start accumulating afterwards, so
    ``avg`` is 0 until then.

    Averaged samples are counted separately from ``count``, which keeps the
    result correct when ``n`` is not the same on every call. With a constant
    ``n`` the values equal the former ``count - start_count_index * n``
    computation.
    """

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()
        self.start_count_index = 10

    def reset(self):
        """Reset every statistic and the warm-up bookkeeping."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        self._calls = 0            # update() calls seen so far
        self._averaged_samples = 0  # samples that contributed to ``sum``

    def update(self, val, n=1):
        """Record ``val`` as observed over ``n`` samples."""
        self.val = val
        self.count += n
        self._calls += 1
        in_warm_up = self._calls <= self.start_count_index
        if not in_warm_up:
            self.sum += val * n
            self._averaged_samples += n
            if self._averaged_samples > 0:
                self.avg = self.sum / self._averaged_samples

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
    """Print one progress line per call and mirror the accuracy values to hwlog.

    Args:
        num_batches: Total number of batches; fixes the width of the counter.
        meters: Objects whose ``str()`` is ``"<name> <value> (<average>)"``.
        prefix: Text placed in front of the batch counter.
    """

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print the line for ``batch`` and log current Acc@1 / Acc@5 if present.

        The line is tagged with the ``KERNEL_NAME_ID`` environment variable,
        which must be set.
        """
        meter_entries = [str(meter) for meter in self.meters]
        entries = [self.prefix + self.batch_fmtstr.format(batch)] + meter_entries
        print("[npu id:",os.environ['KERNEL_NAME_ID'],"]",'\t'.join(entries))

        # Log instrumentation point. The values go to the TRAIN_ACCURACY keys
        # whichever loop calls display(). Meters without an Acc@1 / Acc@5
        # entry are skipped instead of raising IndexError.
        train_acc1 = self._current_value(meter_entries, "Acc@1")
        train_acc5 = self._current_value(meter_entries, "Acc@5")
        if train_acc1 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        if train_acc5 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    @staticmethod
    def _current_value(entries, label):
        """Return the first token after ``label`` in ``entries``, or ``None``."""
        for entry in entries:
            _, found, rest = entry.partition(label)
            if found:
                return rest.strip().split(" ")[0]
        return None

    def _get_batch_fmtstr(self, num_batches):
        # Pad the running index to the width of the total, e.g. "[{:3d}/100]".
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
    """Set every parameter group's LR to ``args.lr`` decayed tenfold per 30 epochs."""
    new_lr = args.lr * (0.1 ** (epoch // 30))
    for group in optimizer.param_groups:
        group.update(lr=new_lr)
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (in percent) for each ``k`` in ``topk``.

    Args:
        output: Score tensor; the top-k search runs along dimension 1.
        target: Label tensor with one entry per row of ``output``.
        topk: Values of ``k`` to evaluate.

    Returns:
        A list with one single-element tensor per ``k``, holding the
        percentage of rows whose label is among the ``k`` best scores.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape, not view: the slice of the transposed comparison is
            # non-contiguous, which view(-1) rejects.
            hits = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(hits.mul_(100.0 / batch_size))
        return res
if __name__ == '__main__':
    # Use the directory holding this script as hwlog.ROOT_DIR.
    script_dir, _ = os.path.split(os.path.abspath(__file__))
    hwlog.ROOT_DIR = script_dir

    environment = get_environment_info("pytorch")
    cpu_info, npu_info, framework_info, os_info, benchmark_version = environment
    config_info = get_model_parameter("pytorch_config")
    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    # Environment facts first, then the fixed hyper-parameters.
    hwlog.remark_print(value=cpu_info, key=hwlog.CPU_INFO)
    hwlog.remark_print(value=npu_info, key=hwlog.NPU_INFO)
    hwlog.remark_print(value=os_info, key=hwlog.OS_INFO)
    hwlog.remark_print(value=framework_info, key=hwlog.FRAMEWORK_INFO)
    hwlog.remark_print(value=benchmark_version, key=hwlog.BENCHMARK_VERSION)
    hwlog.remark_print(value=config_info, key=hwlog.CONFIG_INFO)
    hwlog.remark_print(value=initinal_data.get("base_lr"), key=hwlog.BASE_LR)
    hwlog.remark_print(value=initinal_data.get("dataset"), key=hwlog.DATASET)
    hwlog.remark_print(value=initinal_data.get("optimizer"), key=hwlog.OPT_NAME)
    hwlog.remark_print(value=initinal_data.get("loss_scale"), key=hwlog.LOSS_SCALE)

    main()
@@ -0,0 +1,225 @@
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
# Names exported by a star import of this module.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']

# Checkpoint URLs keyed by architecture name (presumably consumed by the
# ``pretrained=True`` path of the factory functions - TODO confirm).
model_urls = {
    'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
    'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
    'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
    'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
class _DenseLayer(nn.Sequential):
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
super(_DenseLayer, self).__init__()
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu1', nn.ReLU(inplace=True)),
self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu2', nn.ReLU(inplace=True)),
self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
kernel_size=3, stride=1, padding=1, bias=False)),
self.drop_rate = drop_rate
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
class _DenseBlock(nn.Sequential):
    """Sequence of ``num_layers`` dense layers named ``denselayer1..N``.

    Each layer receives ``growth_rate`` more input channels than the one
    before it.
    """

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super().__init__()
        for position in range(1, num_layers + 1):
            in_channels = num_input_features + (position - 1) * growth_rate
            self.add_module(
                'denselayer%d' % position,
                _DenseLayer(in_channels, growth_rate, bn_size, drop_rate))
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
class DenseNet(nn.Module):
    r"""Densenet-BC network as described in
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        growth_rate (int) - number of filters each dense layer adds (`k` in the paper)
        block_config (list of 4 ints) - number of layers in each dense block
        num_init_features (int) - number of filters learned by the first convolution
        bn_size (int) - multiplier for the bottleneck width
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate applied after each dense layer
        num_classes (int) - number of output classes
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
        super().__init__()

        # Stem: 7x7 stride-2 convolution, batch norm, ReLU, 3x3 stride-2 max pool.
        stem = OrderedDict()
        stem['conv0'] = nn.Conv2d(3, num_init_features, kernel_size=7, stride=2,
                                  padding=3, bias=False)
        stem['norm0'] = nn.BatchNorm2d(num_init_features)
        stem['relu0'] = nn.ReLU(inplace=True)
        stem['pool0'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.features = nn.Sequential(stem)

        # Dense blocks, with a channel-halving transition after all but the last.
        channels = num_init_features
        final_index = len(block_config) - 1
        for index, depth in enumerate(block_config):
            stage = index + 1
            dense_block = _DenseBlock(num_layers=depth, num_input_features=channels,
                                      bn_size=bn_size, growth_rate=growth_rate,
                                      drop_rate=drop_rate)
            self.features.add_module('denseblock%d' % stage, dense_block)
            channels += depth * growth_rate
            if index == final_index:
                continue
            transition = _Transition(num_input_features=channels,
                                     num_output_features=channels // 2)
            self.features.add_module('transition%d' % stage, transition)
            channels //= 2

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm2d(channels))

        # Linear layer
        self.classifier = nn.Linear(channels, num_classes)

        # Official init from torch repo: Kaiming-normal conv weights, unit
        # batch-norm scale, zero batch-norm and linear biases.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        feature_map = self.features(x)
        activated = F.relu(feature_map, inplace=True)
        # Global average pool to 1x1, then flatten to (batch, channels).
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flattened = pooled.view(feature_map.size(0), -1)
        return self.classifier(flattened)
def _load_pretrained_weights(model, arch):
    """Download the ImageNet checkpoint for ``arch`` and load it into ``model``.

    Args:
        model (DenseNet): freshly built network that receives the weights.
        arch (str): key into ``model_urls`` selecting the checkpoint.
    """
    # '.'s are no longer allowed in module names, but the previous _DenseLayer
    # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
    # They are also in the checkpoints in model_urls. This pattern is used
    # to find such keys.
    pattern = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
    state_dict = model_zoo.load_url(model_urls[arch])
    for key in list(state_dict.keys()):
        res = pattern.match(key)
        if res:
            # Drop the dot between the module kind and its index: 'norm.1' -> 'norm1'.
            state_dict[res.group(1) + res.group(2)] = state_dict.pop(key)
    model.load_state_dict(state_dict)


def densenet121(pretrained=False, **kwargs):
    r"""Densenet-121 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet121')
    return model


def densenet169(pretrained=False, **kwargs):
    r"""Densenet-169 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet169')
    return model


def densenet201(pretrained=False, **kwargs):
    r"""Densenet-201 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet201')
    return model


def densenet161(pretrained=False, **kwargs):
    r"""Densenet-161 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet161')
    return model
@@ -0,0 +1,279 @@
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from collections import OrderedDict
#from .utils import load_state_dict_from_url
from torch import Tensor
from torch.jit.annotations import List
# Names exported by a star-import of this module.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
# Checkpoint download URL for each architecture; looked up by architecture
# name in ``_densenet`` when ``pretrained`` is true.
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
class _DenseLayer(nn.Module):
    """Bottleneck dense layer: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.

    ``forward`` accepts either one tensor or a list of tensors; the list is
    concatenated along dimension 1 before the bottleneck. Only the newly
    produced features are returned (the caller concatenates them).

    Args:
        num_input_features (int): channels of the concatenated input.
        growth_rate (int): channels produced by the layer.
        bn_size (int): the bottleneck has ``bn_size * growth_rate`` channels.
        drop_rate (float): dropout probability applied to the new features.
        memory_efficient (bool): if True, the bottleneck is run through
            ``torch.utils.checkpoint`` whenever an input requires grad.
    """

    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient=False):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
        self.add_module('relu1', nn.ReLU(inplace=True))
        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
                                           growth_rate, kernel_size=1, stride=1,
                                           bias=False))
        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
        self.add_module('relu2', nn.ReLU(inplace=True))
        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
                                           kernel_size=3, stride=1, padding=1,
                                           bias=False))
        self.drop_rate = float(drop_rate)
        self.memory_efficient = memory_efficient

    # Concatenate the incoming feature maps and run the 1x1 bottleneck.
    def bn_function(self, inputs):
        # type: (List[Tensor]) -> Tensor
        concated_features = torch.cat(inputs, 1)
        bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))  # noqa: T484
        return bottleneck_output

    # todo: rewrite when torchscript supports any
    def any_requires_grad(self, input):
        # type: (List[Tensor]) -> bool
        for tensor in input:
            if tensor.requires_grad:
                return True
        return False

    # The tensors are handed to checkpoint() as separate positional arguments.
    # Passing the list itself would hide them from autograd: checkpoint only
    # tracks top-level tensor arguments, so the bottleneck output would carry
    # no gradient and norm1/conv1 (and everything upstream) would not train.
    @torch.jit.unused  # noqa: T484
    def call_checkpoint_bottleneck(self, input):
        # type: (List[Tensor]) -> Tensor
        def closure(*inputs):
            return self.bn_function(inputs)

        return cp.checkpoint(closure, *input)

    @torch.jit._overload_method  # noqa: F811
    def forward(self, input):
        # type: (List[Tensor]) -> (Tensor)
        pass

    @torch.jit._overload_method  # noqa: F811
    def forward(self, input):
        # type: (Tensor) -> (Tensor)
        pass

    # torchscript does not yet support *args, so we overload method
    # allowing it to take either a List[Tensor] or single Tensor
    def forward(self, input):  # noqa: F811
        if isinstance(input, Tensor):
            prev_features = [input]
        else:
            prev_features = input

        if self.memory_efficient and self.any_requires_grad(prev_features):
            if torch.jit.is_scripting():
                raise Exception("Memory Efficient not supported in JIT")

            bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
        else:
            bottleneck_output = self.bn_function(prev_features)

        new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate,
                                     training=self.training)
        return new_features
class _DenseBlock(nn.ModuleDict):
    """Dense block: each layer receives the list of all earlier feature maps.

    Layers are registered as ``denselayer1`` .. ``denselayer<num_layers>``;
    ``forward`` returns the input and every layer output concatenated along
    dimension 1.
    """
    _version = 2

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, memory_efficient=False):
        super(_DenseBlock, self).__init__()
        for offset in range(num_layers):
            dense_layer = _DenseLayer(
                num_input_features + offset * growth_rate,
                growth_rate=growth_rate,
                bn_size=bn_size,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient,
            )
            self.add_module('denselayer%d' % (offset + 1), dense_layer)

    def forward(self, init_features):
        collected = [init_features]
        for _name, layer in self.items():
            # Each layer sees everything produced so far and adds its output.
            collected.append(layer(collected))
        return torch.cat(collected, 1)
class _Transition(nn.Sequential):
    """Transition stage: BatchNorm -> ReLU -> 1x1 convolution -> 2x2 average pool.

    Args:
        num_input_features (int): channels entering the transition.
        num_output_features (int): channels produced by the 1x1 convolution.
    """

    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        stages = (
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=False)),
            ('pool', nn.AvgPool2d(kernel_size=2, stride=2)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
class DenseNet(nn.Module):
    r"""DenseNet-BC classifier, following
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        growth_rate (int) - number of feature maps every dense layer adds (`k` in the paper)
        block_config (list of 4 ints) - number of dense layers in each dense block
        num_init_features (int) - output channels of the first convolution
        bn_size (int) - bottleneck multiplier
          (the bottleneck of each dense layer has bn_size * k channels)
        drop_rate (float) - dropout rate applied after each dense layer
        num_classes (int) - number of outputs of the final linear classifier
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, memory_efficient=False):
        super(DenseNet, self).__init__()

        # Stem: 7x7 stride-2 convolution, BatchNorm, ReLU, 3x3 stride-2 max pool.
        stem = OrderedDict()
        stem['conv0'] = nn.Conv2d(3, num_init_features, kernel_size=7, stride=2,
                                  padding=3, bias=False)
        stem['norm0'] = nn.BatchNorm2d(num_init_features)
        stem['relu0'] = nn.ReLU(inplace=True)
        stem['pool0'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.features = nn.Sequential(stem)

        # Dense blocks; every block except the last is followed by a
        # transition that halves the channel count.
        channels = num_init_features
        for index, depth in enumerate(block_config):
            dense = _DenseBlock(
                num_layers=depth,
                num_input_features=channels,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient,
            )
            self.features.add_module('denseblock%d' % (index + 1), dense)
            channels = channels + depth * growth_rate
            if index == len(block_config) - 1:
                continue
            halved = channels // 2
            shrink = _Transition(num_input_features=channels,
                                 num_output_features=halved)
            self.features.add_module('transition%d' % (index + 1), shrink)
            channels = halved

        # Final batch norm, then the linear classifier.
        self.features.add_module('norm5', nn.BatchNorm2d(channels))
        self.classifier = nn.Linear(channels, num_classes)

        # Weight initialisation: Kaiming-normal convolutions, unit/zero batch
        # norm affine parameters, zero classifier bias.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        feature_map = self.features(x)
        activated = F.relu(feature_map, inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)
def _load_state_dict(model, model_url, progress):
    """Download the checkpoint at ``model_url`` and load it into ``model``.

    Args:
        model (nn.Module): network that receives the weights.
        model_url (str): URL of the serialized state dict.
        progress (bool): passed to the downloader as its ``progress`` flag.
    """
    # The package-relative import of this helper is commented out at the top
    # of the file, which left the name undefined here; torch.hub provides the
    # same function, so import it locally.
    from torch.hub import load_state_dict_from_url

    # '.'s are no longer allowed in module names, but previous _DenseLayer
    # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
    # They are also in the checkpoints in model_urls. This pattern is used
    # to find such keys.
    pattern = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')

    state_dict = load_state_dict_from_url(model_url, progress=progress)
    for key in list(state_dict.keys()):
        res = pattern.match(key)
        if res:
            # Drop the dot between the module kind and its index: 'norm.1' -> 'norm1'.
            new_key = res.group(1) + res.group(2)
            state_dict[new_key] = state_dict[key]
            del state_dict[key]
    model.load_state_dict(state_dict)
def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,
              **kwargs):
    """Build a ``DenseNet`` and, when ``pretrained`` is true, load the
    checkpoint registered for ``arch`` in ``model_urls``.

    Extra keyword arguments are forwarded to the ``DenseNet`` constructor.
    """
    net = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
    if not pretrained:
        return net
    _load_state_dict(net, model_urls[arch], progress)
    return net
def densenet121(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-121 model of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 24, 16), 64
    return _densenet('densenet121', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
def densenet161(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-161 model of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 48, (6, 12, 36, 24), 96
    return _densenet('densenet161', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
def densenet169(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-169 model of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 32, 32), 64
    return _densenet('densenet169', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
def densenet201(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-201 model of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 48, 32), 64
    return _densenet('densenet201', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
@@ -0,0 +1,22 @@
# Launch script: sets up the Ascend toolkit environment, then runs
# densenet121_1p_main.py on NPU 7 with --evaluate and --resume checkpoint.pth.tar.

# Root of the Ascend installation.
export ASCEND_HOME=/usr/local/Ascend
# Library search path for the toolkit and driver. This assignment replaces
# (does not extend) any LD_LIBRARY_PATH inherited from the caller.
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# Append the toolkit's Python package directories to PYTHONPATH.
# NOTE(review): $currentDir is not assigned anywhere in the visible script; if
# it is unset the final entry is empty -- confirm whether the caller is
# expected to export it.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
# Make the toolkit's ccec_compiler binaries reachable.
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
# Location of the operator package (opp).
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
# Presumably keeps the Ascend system log off stdout -- TODO confirm.
export SLOG_PRINT_TO_STDOUT=0
# Runs the adc tool as HwHiAiUser against device 7; looks like it sets that
# device's log level to "error" -- confirm against the adc documentation.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 7"
# Presumably disables the framework's task queue -- TODO confirm.
export TASK_QUEUE_ENABLE=0
# Pin the process to CPU cores 111-150 (40 cores, matching --workers 40) and
# start the run.
taskset -c 111-150 python3 densenet121_1p_main.py \
--workers 40 \
--arch densenet121 \
--npu 7 \
--lr 0.1 \
--momentum 0.9 \
--amp \
--batch-size 256 \
--epoch 90 \
--evaluate \
--resume checkpoint.pth.tar \
--data /opt/npu/dataset/imagenet
@@ -0,0 +1,275 @@
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
# Names exported by a star-import of this module.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
# Checkpoint download URL for each architecture; the factory functions look
# up their own entry when ``pretrained`` is true.
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
class _DenseLayer(nn.Sequential):
    """Bottleneck dense layer: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.

    ``forward`` returns the input concatenated (along dimension 1) with the
    ``growth_rate`` new feature maps, with optional dropout on the new maps.
    """

    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super(_DenseLayer, self).__init__()
        bottleneck_width = bn_size * growth_rate
        stages = (
            ('norm1', nn.BatchNorm2d(num_input_features)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv1', nn.Conv2d(num_input_features, bottleneck_width,
                                kernel_size=1, stride=1, bias=False)),
            ('norm2', nn.BatchNorm2d(bottleneck_width)),
            ('relu2', nn.ReLU(inplace=True)),
            ('conv2', nn.Conv2d(bottleneck_width, growth_rate,
                                kernel_size=3, stride=1, padding=1, bias=False)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
        self.drop_rate = drop_rate

    def forward(self, x):
        grown = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            grown = F.dropout(grown, p=self.drop_rate, training=self.training)
        return torch.cat([x, grown], 1)
class _DenseBlock(nn.Sequential):
    """Sequential stack of ``num_layers`` dense layers.

    The k-th layer (1-based) is registered as ``denselayer<k>`` and is built
    for ``num_input_features + (k - 1) * growth_rate`` input channels.
    """

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super(_DenseBlock, self).__init__()
        in_channels = num_input_features
        for position in range(1, num_layers + 1):
            dense_layer = _DenseLayer(in_channels, growth_rate, bn_size, drop_rate)
            self.add_module('denselayer%d' % position, dense_layer)
            # Every layer's input is widened by the previous layer's output.
            in_channels = in_channels + growth_rate
class _Transition(nn.Sequential):
    """Transition stage without pooling: BatchNorm -> ReLU -> 1x1 convolution.

    The 2x2 average pool is not part of this module; ``DenseNet.forward``
    applies its own ``avg_pool`` to the output of each stage.
    """

    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        stages = (
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=False)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
class DenseNet(nn.Module):
    r"""DenseNet-BC classifier, following
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    The feature extractor is split into four sub-networks (``features0`` ..
    ``features3``); a shared ``avg_pool`` is applied between them in
    ``forward`` instead of inside the transitions.

    Args:
        growth_rate (int) - number of feature maps every dense layer adds (`k` in the paper)
        block_config (list of 4 ints) - number of dense layers in each dense block
        num_init_features (int) - output channels of the first convolution
        bn_size (int) - bottleneck multiplier
          (the bottleneck of each dense layer has bn_size * k channels)
        drop_rate (float) - dropout rate applied after each dense layer
        num_classes (int) - number of outputs of the final linear classifier
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
        super(DenseNet, self).__init__()
        self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)

        def dense_block(stage, in_channels):
            # Named dense block for ``stage`` plus its output channel count.
            depth = block_config[stage]
            module = _DenseBlock(num_layers=depth, num_input_features=in_channels,
                                 bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
            return ('denseblock%d' % (stage + 1), module), in_channels + depth * growth_rate

        def transition(stage, in_channels):
            # Named transition for ``stage``; it halves the channel count.
            module = _Transition(num_input_features=in_channels,
                                 num_output_features=in_channels // 2)
            return ('transition%d' % (stage + 1), module), in_channels // 2

        # Stage 0: stem + first dense block + transition.
        block_entry, channels = dense_block(0, num_init_features)
        trans_entry, channels = transition(0, channels)
        self.features0 = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
            ('norm0', nn.BatchNorm2d(num_init_features)),
            ('relu0', nn.ReLU(inplace=True)),
            ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
            block_entry,
            trans_entry,
        ]))

        # Stage 1.
        block_entry, channels = dense_block(1, channels)
        trans_entry, channels = transition(1, channels)
        self.features1 = nn.Sequential(OrderedDict([block_entry, trans_entry]))

        # Stage 2.
        block_entry, channels = dense_block(2, channels)
        trans_entry, channels = transition(2, channels)
        self.features2 = nn.Sequential(OrderedDict([block_entry, trans_entry]))

        # Stage 3: last dense block followed by the final batch norm.
        block_entry, channels = dense_block(3, channels)
        self.features3 = nn.Sequential(OrderedDict([
            block_entry,
            ('norm5', nn.BatchNorm2d(channels)),
        ]))

        # Linear layer
        self.classifier = nn.Linear(channels, num_classes)

        # Weight initialisation: Kaiming-normal convolutions, unit/zero batch
        # norm affine parameters, zero classifier bias.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return the classifier output for the image batch ``x``."""
        stage0 = self.avg_pool(self.features0(x))
        stage1 = self.avg_pool(self.features1(stage0))
        stage2 = self.avg_pool(self.features2(stage1))
        final_features = self.features3(stage2)
        out = F.relu(final_features, inplace=True)
        out = F.adaptive_avg_pool2d(out, (1, 1)).view(final_features.size(0), -1)
        return self.classifier(out)
# Sub-network of the split DenseNet (features0 .. features3) that owns each
# top-level entry of the single ``features`` container used by the published
# checkpoints.
_FEATURE_STAGE = {
    'conv0': 'features0', 'norm0': 'features0', 'relu0': 'features0', 'pool0': 'features0',
    'denseblock1': 'features0', 'transition1': 'features0',
    'denseblock2': 'features1', 'transition2': 'features1',
    'denseblock3': 'features2', 'transition3': 'features2',
    'denseblock4': 'features3', 'norm5': 'features3',
}

# '.'s are no longer allowed in module names, but the previous _DenseLayer
# has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
# They are also in the checkpoints in model_urls. This pattern is used
# to find such keys.
_LEGACY_LAYER_KEY = re.compile(
    r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')


def _load_pretrained_weights(model, arch):
    """Download the ImageNet checkpoint for ``arch`` and load it into ``model``.

    Checkpoint keys are rewritten in two ways before loading:

    * legacy ``norm.1``-style names become ``norm1``;
    * the ``features.`` prefix is replaced by the ``featuresN.`` sub-network
      that holds the module in this file's ``DenseNet``. Without this every
      key would be reported as missing/unexpected by ``load_state_dict``.

    Args:
        model (DenseNet): freshly built network that receives the weights.
        arch (str): key into ``model_urls`` selecting the checkpoint.
    """
    state_dict = model_zoo.load_url(model_urls[arch])
    for key in list(state_dict.keys()):
        new_key = key
        res = _LEGACY_LAYER_KEY.match(new_key)
        if res:
            new_key = res.group(1) + res.group(2)
        parts = new_key.split('.', 2)
        if len(parts) == 3 and parts[0] == 'features' and parts[1] in _FEATURE_STAGE:
            new_key = '.'.join((_FEATURE_STAGE[parts[1]], parts[1], parts[2]))
        if new_key != key:
            state_dict[new_key] = state_dict.pop(key)
    model.load_state_dict(state_dict)


def densenet121(pretrained=False, **kwargs):
    r"""Densenet-121 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet121')
    return model


def densenet169(pretrained=False, **kwargs):
    r"""Densenet-169 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet169')
    return model


def densenet201(pretrained=False, **kwargs):
    r"""Densenet-201 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet201')
    return model


def densenet161(pretrained=False, **kwargs):
    r"""Densenet-161 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: forwarded to the ``DenseNet`` constructor
    """
    model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
                     **kwargs)
    if pretrained:
        _load_pretrained_weights(model, 'densenet161')
    return model
@@ -0,0 +1,275 @@
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from collections import OrderedDict
# Names exported by a star-import of this module.
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
# Checkpoint download URL for each architecture; the factory functions look
# up their own entry when ``pretrained`` is true.
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
class _DenseLayer(nn.Sequential):
    """Bottleneck dense layer: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.

    ``forward`` returns the input concatenated (along dimension 1) with the
    ``growth_rate`` new feature maps, with optional dropout on the new maps.
    """

    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super(_DenseLayer, self).__init__()
        bottleneck_width = bn_size * growth_rate
        stages = (
            ('norm1', nn.BatchNorm2d(num_input_features)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv1', nn.Conv2d(num_input_features, bottleneck_width,
                                kernel_size=1, stride=1, bias=False)),
            ('norm2', nn.BatchNorm2d(bottleneck_width)),
            ('relu2', nn.ReLU(inplace=True)),
            ('conv2', nn.Conv2d(bottleneck_width, growth_rate,
                                kernel_size=3, stride=1, padding=1, bias=False)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
        self.drop_rate = drop_rate

    def forward(self, x):
        grown = super(_DenseLayer, self).forward(x)
        if self.drop_rate > 0:
            grown = F.dropout(grown, p=self.drop_rate, training=self.training)
        return torch.cat([x, grown], 1)
class _DenseBlock(nn.Sequential):
    """Sequential stack of ``num_layers`` dense layers.

    The k-th layer (1-based) is registered as ``denselayer<k>`` and is built
    for ``num_input_features + (k - 1) * growth_rate`` input channels.
    """

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
        super(_DenseBlock, self).__init__()
        in_channels = num_input_features
        for position in range(1, num_layers + 1):
            dense_layer = _DenseLayer(in_channels, growth_rate, bn_size, drop_rate)
            self.add_module('denselayer%d' % position, dense_layer)
            # Every layer's input is widened by the previous layer's output.
            in_channels = in_channels + growth_rate
class _Transition(nn.Sequential):
    """Transition stage without pooling: BatchNorm -> ReLU -> 1x1 convolution.

    The 2x2 average pool is not part of this module; ``DenseNet.forward``
    applies its own ``avg_pool`` to the output of each stage.
    """

    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        stages = (
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=False)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
class DenseNet(nn.Module):
r"""Densenet-BC model class, based on
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for number of bottle neck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
super(DenseNet, self).__init__()
self.avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
################ block 0 ################
num_features = num_init_features
i=0
num_layers=block_config[i]
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
num_features = num_features + num_layers * growth_rate
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
self.features0 = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
('denseblock%d' % (i + 1), block),
('transition%d' % (i + 1), trans)
]))
################ block 1 ##############
num_features = num_features // 2
i=1
num_layers=block_config[i]
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
num_features = num_features + num_layers * growth_rate
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
self.features1 = nn.Sequential(OrderedDict([
('denseblock%d' % (i + 1), block),
('transition%d' % (i + 1), trans),
]))
################ block 2 ##############
num_features = num_features // 2
i=2
num_layers=block_config[i]
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
num_features = num_features + num_layers * growth_rate
trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
self.features2 = nn.Sequential(OrderedDict([
('denseblock%d' % (i + 1), block),
('transition%d' % (i + 1), trans),
]))
################ block 3 ##############
num_features = num_features // 2
i=3
num_layers=block_config[i]
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
num_features = num_features + num_layers * growth_rate
self.features3 = nn.Sequential(OrderedDict([
('denseblock%d' % (i + 1), block),
('norm5', nn.BatchNorm2d(num_features)),
]))
# Linear layer
self.classifier = nn.Linear(num_features, num_classes)
# Official init from torch repo.
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.constant_(m.bias, 0)
def forward(self, x):
    """Run the network, doing the inter-stage average pooling on the host.

    The first three feature stages run on the device that holds ``x``.
    After each of them the activations are copied to the CPU, passed
    through ``self.avg_pool`` there, and copied back for the next stage.

    Args:
        x: Input batch, already placed on the device the layers run on.

    Returns:
        The output of ``self.classifier`` for the batch.
    """
    # Follow the input's device instead of a hard-coded "npu:0", so the
    # CPU round trip returns to the right card in multi-card runs (and
    # the model can also be exercised on a plain CPU).
    device = x.device
    # The pooling layer is executed on the host, so keep it there.
    self.avg_pool = self.avg_pool.cpu()

    out = x
    for stage in (self.features0, self.features1, self.features2):
        out = stage(out)
        out = self.avg_pool(out.cpu()).to(device)

    features3 = self.features3(out)
    out = F.relu(features3, inplace=True)
    out = F.adaptive_avg_pool2d(out, (1, 1)).view(features3.size(0), -1)
    return self.classifier(out)
def _load_pretrained_densenet(model, arch):
    """Load the checkpoint registered for ``arch`` in ``model_urls`` into ``model``."""
    # '.'s are no longer allowed in module names, but previous _DenseLayer
    # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
    # They are also in the checkpoints in model_urls. This pattern is used
    # to find such keys.
    pattern = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
    state_dict = model_zoo.load_url(model_urls[arch])
    # Iterate over a snapshot of the keys because entries are renamed in place.
    for key in list(state_dict.keys()):
        res = pattern.match(key)
        if res:
            new_key = res.group(1) + res.group(2)
            state_dict[new_key] = state_dict.pop(key)
    model.load_state_dict(state_dict)


def densenet121(pretrained=False, **kwargs):
    r"""Densenet-121 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: Forwarded unchanged to the ``DenseNet`` constructor.
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
                     **kwargs)
    if pretrained:
        _load_pretrained_densenet(model, 'densenet121')
    return model


def densenet169(pretrained=False, **kwargs):
    r"""Densenet-169 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: Forwarded unchanged to the ``DenseNet`` constructor.
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_densenet(model, 'densenet169')
    return model


def densenet201(pretrained=False, **kwargs):
    r"""Densenet-201 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: Forwarded unchanged to the ``DenseNet`` constructor.
    """
    model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
                     **kwargs)
    if pretrained:
        _load_pretrained_densenet(model, 'densenet201')
    return model


def densenet161(pretrained=False, **kwargs):
    r"""Densenet-161 model from
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        **kwargs: Forwarded unchanged to the ``DenseNet`` constructor.
    """
    model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
                     **kwargs)
    if pretrained:
        _load_pretrained_densenet(model, 'densenet161')
    return model
@@ -0,0 +1,300 @@
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from collections import OrderedDict
#from .utils import load_state_dict_from_url
from torch import Tensor
from torch.jit.annotations import List
__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
model_urls = {
'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
}
class _DenseLayer(nn.Module):
    """One dense layer: BN-ReLU-Conv(1x1) bottleneck, then BN-ReLU-Conv(3x3).

    The layer accepts either a single tensor or a list of tensors; a list is
    concatenated along dimension 1 before the bottleneck.

    Args:
        num_input_features: Channel count of the (concatenated) input.
        growth_rate: Number of channels the layer outputs.
        bn_size: The bottleneck produces ``bn_size * growth_rate`` channels.
        drop_rate: Dropout probability applied to the output (0 disables it).
        memory_efficient: If True, the bottleneck is run through
            ``torch.utils.checkpoint`` whenever an input requires grad.
    """

    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient=False):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm2d(num_input_features))
        self.add_module('relu1', nn.ReLU(inplace=True))
        self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
                                           growth_rate, kernel_size=1, stride=1,
                                           bias=False))
        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
        self.add_module('relu2', nn.ReLU(inplace=True))
        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
                                           kernel_size=3, stride=1, padding=1,
                                           bias=False))
        self.drop_rate = float(drop_rate)
        self.memory_efficient = memory_efficient

    def bn_function(self, inputs):
        # type: (List[Tensor]) -> Tensor
        """Concatenate ``inputs`` on dim 1 and apply norm1 -> relu1 -> conv1."""
        concated_features = torch.cat(inputs, 1)
        bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))  # noqa: T484
        return bottleneck_output

    # todo: rewrite when torchscript supports any
    def any_requires_grad(self, input):
        # type: (List[Tensor]) -> bool
        """Return True if at least one tensor in ``input`` requires grad."""
        for tensor in input:
            if tensor.requires_grad:
                return True
        return False

    @torch.jit.unused  # noqa: T484
    def call_checkpoint_bottleneck(self, input):
        # type: (List[Tensor]) -> Tensor
        """Run ``bn_function`` under activation checkpointing."""
        def closure(*inputs):
            return self.bn_function(inputs)

        # Hand the tensors to checkpoint individually: it only looks at
        # tensor arguments when deciding whether the output needs grad, so
        # passing the whole list as one argument hides them from it.
        return cp.checkpoint(closure, *input)

    @torch.jit._overload_method  # noqa: F811
    def forward(self, input):
        # type: (List[Tensor]) -> (Tensor)
        pass

    @torch.jit._overload_method  # noqa: F811
    def forward(self, input):
        # type: (Tensor) -> (Tensor)
        pass

    # torchscript does not yet support *args, so we overload method
    # allowing it to take either a List[Tensor] or single Tensor
    def forward(self, input):  # noqa: F811
        if isinstance(input, Tensor):
            prev_features = [input]
        else:
            prev_features = input

        if self.memory_efficient and self.any_requires_grad(prev_features):
            if torch.jit.is_scripting():
                raise Exception("Memory Efficient not supported in JIT")

            bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
        else:
            bottleneck_output = self.bn_function(prev_features)

        new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))
        if self.drop_rate > 0:
            new_features = F.dropout(new_features, p=self.drop_rate,
                                     training=self.training)
        return new_features
class _DenseBlock(nn.ModuleDict):
    """A stack of ``_DenseLayer`` modules registered as ``denselayer1..N``.

    Layer ``k`` (0-based) is built for
    ``num_input_features + k * growth_rate`` input channels.
    """
    _version = 2

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate, memory_efficient=False):
        super(_DenseBlock, self).__init__()
        for index in range(num_layers):
            in_channels = num_input_features + index * growth_rate
            self.add_module(
                'denselayer%d' % (index + 1),
                _DenseLayer(
                    in_channels,
                    growth_rate=growth_rate,
                    bn_size=bn_size,
                    drop_rate=drop_rate,
                    memory_efficient=memory_efficient,
                ),
            )

    def forward(self, init_features):
        """Feed the growing feature list through every layer and concatenate it on dim 1."""
        collected = [init_features]
        for dense_layer in self.values():
            # Each layer receives everything produced so far.
            collected.append(dense_layer(collected))
        return torch.cat(collected, 1)
class _Transition(nn.Sequential):
    """Sequential of BatchNorm, ReLU, a 1x1 convolution and a 2x2 average pool."""

    def __init__(self, num_input_features, num_output_features):
        super(_Transition, self).__init__()
        stages = (
            ('norm', nn.BatchNorm2d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv2d(num_input_features, num_output_features,
                               kernel_size=1, stride=1, bias=False)),
            ('pool', nn.AvgPool2d(kernel_size=2, stride=2)),
        )
        # Registration order is the execution order of the Sequential.
        for label, stage in stages:
            self.add_module(label, stage)
class PrintLayer(nn.Module):
    """Pass-through layer that prints the mean of the tensor flowing through it.

    Args:
        name: Label printed in front of the mean value.
    """

    def __init__(self, name):
        super(PrintLayer, self).__init__()
        self.name = name

    def forward(self, x):
        """Print ``x.mean()`` tagged with the layer name and return ``x`` unchanged."""
        mean_value = x.mean().item()
        message = "{} mean data: {}".format(self.name, mean_value)
        print(message)
        return x
class DenseNet(nn.Module):
    r"""Densenet-BC model, following
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_,
    with ``PrintLayer`` probes inserted after the stem layers, every dense
    block and every transition so intermediate means are printed on each
    forward pass.

    Args:
        growth_rate (int) - how many filters to add each layer (`k` in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """

    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
                 num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, memory_efficient=False):
        super(DenseNet, self).__init__()

        # Stem: 7x7 convolution, batch norm, ReLU and max pool, with probes.
        stem = [
            ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2,
                                padding=3, bias=False)),
            ('conv0_p', PrintLayer('conv0_p')),
            ('norm0', nn.BatchNorm2d(num_init_features)),
            ('norm0_p', PrintLayer('norm0_p')),
            ('relu0', nn.ReLU(inplace=True)),
            ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
            ('pool0_p', PrintLayer('pool0_p')),
        ]
        self.features = nn.Sequential(OrderedDict(stem))

        # Dense blocks, each followed by a transition except the last one.
        channels = num_init_features
        final_stage = len(block_config) - 1
        for stage, depth in enumerate(block_config):
            stage_no = stage + 1
            dense_block = _DenseBlock(
                num_layers=depth,
                num_input_features=channels,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient
            )
            self.features.add_module('denseblock%d' % stage_no, dense_block)
            self.features.add_module('denseblock%d_p' % stage_no,
                                     PrintLayer('denseblock%d_p' % stage_no))
            channels = channels + depth * growth_rate
            if stage == final_stage:
                continue
            # A transition halves the channel count.
            transition = _Transition(num_input_features=channels,
                                     num_output_features=channels // 2)
            self.features.add_module('transition%d' % stage_no, transition)
            self.features.add_module('transition%d_p' % stage_no,
                                     PrintLayer('transition%d_p' % stage_no))
            channels = channels // 2

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm2d(channels))

        # Linear layer
        self.classifier = nn.Linear(channels, num_classes)

        # Official init from torch repo.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Linear):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Compute the classifier output for ``x``, printing two intermediate means."""
        features = self.features(x)
        # features_p = features.to('cpu')'
        features_mean = features.mean().item()
        print('the features mean: {}'.format(features_mean))
        activated = F.relu(features, inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flat = torch.flatten(pooled, 1)
        print('the flatten mean: {}'.format(flat.mean().item()))
        return self.classifier(flat)
def _load_state_dict(model, model_url, progress):
    """Download a checkpoint, rename its legacy keys and load it into ``model``.

    Args:
        model: Module whose ``load_state_dict`` receives the remapped dict.
        model_url: Location of the checkpoint to fetch.
        progress: Forwarded to ``load_state_dict_from_url`` as ``progress``.
    """
    # Imported here because this file's own import of the helper
    # (``from .utils import load_state_dict_from_url``) is commented out,
    # which left the name undefined and made every pretrained load fail.
    from torch.hub import load_state_dict_from_url

    # '.'s are no longer allowed in module names, but previous _DenseLayer
    # has keys 'norm.1', 'relu.1', 'conv.1', 'norm.2', 'relu.2', 'conv.2'.
    # They are also in the checkpoints in model_urls. This pattern is used
    # to find such keys.
    pattern = re.compile(
        r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')

    state_dict = load_state_dict_from_url(model_url, progress=progress)
    # Iterate over a snapshot of the keys because entries are renamed in place.
    for key in list(state_dict.keys()):
        res = pattern.match(key)
        if res:
            new_key = res.group(1) + res.group(2)
            state_dict[new_key] = state_dict[key]
            del state_dict[key]
    model.load_state_dict(state_dict)
def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,
              **kwargs):
    """Build a ``DenseNet`` and, if ``pretrained``, load the weights listed for ``arch``."""
    net = DenseNet(growth_rate, block_config, num_init_features, **kwargs)
    if not pretrained:
        return net
    checkpoint_url = model_urls[arch]
    _load_state_dict(net, checkpoint_url, progress)
    return net
def densenet121(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-121 variant of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 24, 16), 64
    return _densenet('densenet121', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
def densenet161(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-161 variant of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 48, (6, 12, 36, 24), 96
    return _densenet('densenet161', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
def densenet169(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-169 variant of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 32, 32), 64
    return _densenet('densenet169', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
def densenet201(pretrained=False, progress=True, **kwargs):
    r"""Build the Densenet-201 variant of
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
          but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
    """
    growth_rate, block_config, num_init_features = 32, (6, 12, 48, 32), 64
    return _densenet('densenet201', growth_rate, block_config, num_init_features,
                     pretrained, progress, **kwargs)
@@ -0,0 +1,32 @@
{
"board_id": "0x0000",
"chip_info": "910",
"deploy_mode": "lab",
"group_count": "1",
"group_list": [
{
"device_num": "1",
"server_num": "1",
"group_name": "",
"instance_count": "1",
"instance_list": [
{
"devices": [
{
"device_id": "0",
"device_ip": "192.168.100.101"
}
],
"rank_id": "0",
"server_id": "10.246.246.76"
}
]
}
],
"para_plane_nic_location": "device",
"para_plane_nic_name": [
"eth0"
],
"para_plane_nic_num": "1",
"status": "completed"
}
@@ -0,0 +1,44 @@
{
"board_id": "0x0000",
"chip_info": "910",
"deploy_mode": "lab",
"group_count": "1",
"group_list": [
{
"device_num": "2",
"server_num": "1",
"group_name": "",
"instance_count": "2",
"instance_list": [
{
"devices": [
{
"device_id": "0",
"device_ip": "192.168.100.101"
}
],
"rank_id": "0",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "1",
"device_ip": "192.168.101.101"
}
],
"rank_id": "1",
"server_id": "10.246.246.76"
}
]
}
],
"para_plane_nic_location": "device",
"para_plane_nic_name": [
"eth0",
"eth1"
],
"para_plane_nic_num": "2",
"status": "completed"
}
@@ -0,0 +1,65 @@
{
"board_id": "0x0000",
"chip_info": "910",
"deploy_mode": "lab",
"group_count": "1",
"group_list": [
{
"device_num": "4",
"server_num": "1",
"group_name": "",
"instance_count": "4",
"instance_list": [
{
"devices": [
{
"device_id": "0",
"device_ip": "192.168.190.102"
}
],
"rank_id": "0",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "1",
"device_ip": "192.168.191.102"
}
],
"rank_id": "1",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "2",
"device_ip": "192.168.192.102"
}
],
"rank_id": "2",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "3",
"device_ip": "192.168.193.102"
}
],
"rank_id": "3",
"server_id": "10.246.246.76"
}
]
}
],
"para_plane_nic_location": "device",
"para_plane_nic_name": [
"eth0",
"eth1",
"eth2",
"eth3"
],
"para_plane_nic_num": "4",
"status": "completed"
}
@@ -0,0 +1,109 @@
{
"board_id": "0x002f",
"chip_info": "910",
"deploy_mode": "lab",
"group_count": "1",
"group_list": [
{
"device_num": "8",
"server_num": "1",
"group_name": "",
"instance_count": "8",
"instance_list": [
{
"devices": [
{
"device_id": "0",
"device_ip": "192.168.100.101"
}
],
"rank_id": "0",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "1",
"device_ip": "192.168.101.101"
}
],
"rank_id": "1",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "2",
"device_ip": "192.168.102.101"
}
],
"rank_id": "2",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "3",
"device_ip": "192.168.103.101"
}
],
"rank_id": "3",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "4",
"device_ip": "192.168.100.100"
}
],
"rank_id": "4",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "5",
"device_ip": "192.168.101.100"
}
],
"rank_id": "5",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "6",
"device_ip": "192.168.102.100"
}
],
"rank_id": "6",
"server_id": "10.246.246.76"
},
{
"devices": [
{
"device_id": "7",
"device_ip": "192.168.103.100"
}
],
"rank_id": "7",
"server_id": "10.246.246.76"
}
]
}
],
"para_plane_nic_location": "device",
"para_plane_nic_name": [
"eth0",
"eth1",
"eth2",
"eth3",
"eth4",
"eth5",
"eth6",
"eth7"
],
"para_plane_nic_num": "8",
"status": "completed"
}
@@ -0,0 +1,40 @@
import os
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
import torchvision.models as models
"""
alexnet | densenet121 |
densenet161 | densenet169 | densenet201 |
resnet101 | resnet152 | resnet18 | resnet34 |
resnet50 | squeezenet1_0 | squeezenet1_1 | vgg11 |
vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19 |
mobilenet_v2 | shufflenet_v2_x0_5 |
vgg19_bn (default: resnet18)
"""
# Profile one forward/backward pass of a torchvision model on the CPU,
# export the trace, and write the model graph for TensorBoard.
model_name = 'densenet121'
model = models.__dict__[model_name]()
img = torch.rand(size=(1, 3, 224, 224))
#print(model(img))
# CrossEntropyLoss expects integer class indices of shape (batch,); the
# previous float tensor from torch.rand was rejected by the loss. Class 0
# is a valid index for any number of output classes.
labels = torch.zeros(1, dtype=torch.long)
criterion = nn.CrossEntropyLoss()

with torch.autograd.profiler.profile(record_shapes=True) as prof:
    outputs = model(img)
    loss = criterion(outputs, labels)
    # Tag the backward pass so it shows up as its own range in the trace.
    with torch.autograd.profiler.record_function("label-bp"):
        loss.backward()
#print(prof.key_averages().table())
print(prof)
prof.export_chrome_trace(model_name + ".prof")

with SummaryWriter(os.path.join('runs', model_name)) as w:
    w.add_graph(model, img)
@@ -0,0 +1,20 @@
# Set up the Ascend runtime environment, then run the CPU profiling script.

# Shared-library search path: Ascend framework and driver libraries are
# placed in front of whatever LD_LIBRARY_PATH already holds.
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
# Make the ccec compiler binaries reachable.
export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/opp
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
# NOTE(review): this value ends in a bare ":/usr/local", which looks truncated — confirm against the original script.
export PYTHONPATH=/usr/local/Ascend/atc/python/site-packages/te.egg:/usr/local/Ascend/atc/python/site-packages/topi.egg:/usr/local/Ascend/atc/python/site-packages/auto_tune.egg:/usr/local/Ascend/atc/python/site-packages/schedule_search.egg:/usr/local
export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ops/framework/built-in/tensorflow
# Operator-kernel plugin libraries; PLUGIN_LOAD_PATH lists the same three plus librts_engine.so.
export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
export PLUGIN_LOAD_PATH=/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/librts_engine.so
#export DEVICE_ID=0
#export SLOG_PRINT_TO_STDOUT=1
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
# Only net_show_cpu.py is run; the other entry points are left disabled.
#python3 pytorch-benchmark-resnet50.py
python3 net_show_cpu.py
#python3 pytorch-resnet50-profiling.py
@@ -0,0 +1,51 @@
import os
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
import torchvision.models as models
# Profile one forward/backward pass of a torchvision model on an NPU device.

# Device every tensor and module below is moved to.
CALCULATE_DEVICE = "npu:0"
torch.npu.set_device(CALCULATE_DEVICE)
"""
alexnet | densenet121 |
densenet161 | densenet169 | densenet201 |
resnet101 | resnet152 | resnet18 | resnet34 |
resnet50 | squeezenet1_0 | squeezenet1_1 | vgg11 |
vgg11_bn | vgg13 | vgg13_bn | vgg16 | vgg16_bn | vgg19 |
mobilenet_v2 | shufflenet_v2_x0_5 |
vgg19_bn (default: resnet18)
"""
# One random 3x224x224 image, created on the host and copied to the device.
img = torch.rand(size=(1,3,224,224),dtype=torch.float32).to(CALCULATE_DEVICE, non_blocking=True)
print("img prepared")
model_name='densenet121'
model = models.__dict__[model_name]().to(CALCULATE_DEVICE)
model.train()
print("model prepared")
# Un-profiled forward pass; its output is printed before profiling starts.
outputs = model(img)
print("cal done, results is {}".format(outputs))
# torch.rand yields values in [0, 1), so the int32 cast always produces label 0.
labels=torch.rand(size=(1,)).to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE)
with torch.autograd.profiler.profile(record_shapes=True,use_npu=True) as prof:
    outputs = model(img)
    print("output ok")
    loss = criterion(outputs, labels)
    print("loss ok")
    # Record the backward pass under its own "label-bp" range.
    with torch.autograd.profiler.record_function("label-bp"):
        loss.backward()
#print(prof.key_averages().table())
print(prof)
prof.export_chrome_trace(model_name + ".prof")
# with SummaryWriter(os.path.join('runs',model_name)) as w:
# w.add_graph(model, img)
# print("tenorboard add graph ok")
@@ -0,0 +1,14 @@
{
"server_count": "1",
"server_list": [{
"device": [
{
"device_id": "0",
"device_ip": "192.168.10.103",
"rank_id": "0"
}],
"server_id": "127.0.0.1"
}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,9 @@
{
"server_count": "1",
"server_list": [{
"device": [{devices}],
"server_id": "127.0.0.1"
}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,52 @@
# main env
# All paths below point into /usr/local/Ascend/ascend-toolkit/latest unless noted.
# PYTHONPATH is overwritten (any previous value is dropped).
export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
# Toolkit and driver libraries are placed in front of the existing LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
# Operator-kernel plugin libraries; PLUGIN_LOAD_PATH lists the same three plus librts_engine.so.
export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
export PLUGIN_LOAD_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/librts_engine.so
export TASK_QUEUE_ENABLE=0
# NOTE(review): pinned to toolkit build 20.10.0.B022 and arm64, unlike the "latest" paths above — confirm it matches the installed version.
export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ascend-toolkit/20.10.0.B022/arm64-linux_gcc7.3.0/opp/framework/built-in/tensorflow/
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export GEN_TO_SOURCE=1
#export LD_LIBRARY_PATH=/usr/local/OpenBLAS/lib/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu/
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/opp
#export DDK_VERSION_FLAG=1.60.T17.B830
#export NEW_GE_FE_ID=1
#export GE_AICPU_FLAG=1
#export SOC_VERSION=Ascend910
#export DUMP_GE_GRAPH=2
#export DEVICE_ID=0
#export DEVICE_INDEX=0
#export PRINT_MODEL=0
#export ENABLE_DATA_PRE_PROC=1
#export RANK_ID=0
#export RANK_SIZE=1
#export JOB_ID=10087
#export FUSION_TENSOR_SIZE=1000000000
#PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/atc/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
#export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ascend-toolkit/20.10.0.B023/arm64-linux_gcc7.3.0/opp/framework/built-in/tensorflow/
#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
#export PLUGIN_LOAD_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/librts_engine.so
#export WHICH_OP=GEOP
#export NEW_GE_FE_ID=1
#export GE_AICPU_FLAG=1
@@ -0,0 +1,9 @@
# Environment for running against the Ascend toolkit install under /usr/local/Ascend.
export ASCEND_HOME=/usr/local/Ascend
# LD_LIBRARY_PATH is overwritten here (any previous value is dropped).
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# Appends the toolkit Python packages to the existing PYTHONPATH.
# NOTE(review): $currentDir is not set in this script — presumably the caller defines it; confirm.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
export SLOG_PRINT_TO_STDOUT=0
export TASK_QUEUE_ENABLE=0
@@ -0,0 +1,21 @@
############## toolkit situation ################
#export ASCEND_HOME=/usr/local/Ascend
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
############## nnae situation ################
# Active configuration: every Ascend path below points into /usr/local/Ascend/nnae/latest.
export ASCEND_HOME=/usr/local/Ascend
# Library directories are appended after the existing LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/local/python3.7.5/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# PYTHONPATH is overwritten (any previous value is dropped).
export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/topi-0.4.0-py3-none-any.whl
# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/te-0.4.0-py3-none-any.whl
export SLOG_PRINT_TO_STDOUT=0
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
export TASK_QUEUE_ENABLE=0
@@ -0,0 +1,31 @@
############## toolkit situation ################
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
############## nnae situation ################
# Use the nnae install when it is present, otherwise fall back to ascend-toolkit.
# Both branches export the same five variables with the matching install prefix.
if [ -d /usr/local/Ascend/nnae/latest ];then
    # The multiarch library directory is aarch64-linux-gnu (previously misspelled aarch64_64-linux-gnu).
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
else
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
fi
# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/
export SLOG_PRINT_TO_STDOUT=0
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
export TASK_QUEUE_ENABLE=1
@@ -0,0 +1,22 @@
# Environment setup followed by a single DenseNet121 run with the --evaluate flag.
# Root of the Ascend installation.
export ASCEND_HOME=/usr/local/Ascend
# Library search path: system directories plus the ascend-toolkit and driver
# libraries. Any inherited LD_LIBRARY_PATH value is replaced, not extended.
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
# Append the toolkit's Python package directories to PYTHONPATH.
# NOTE(review): $currentDir is not assigned anywhere in the visible part of this
# script; unless the calling environment exports it, the last entry is empty — confirm.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
export SLOG_PRINT_TO_STDOUT=0
# Run the adc tool as user HwHiAiUser against device 7 with a SetLogLevel(0)[error] request.
su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 7"
export TASK_QUEUE_ENABLE=0
# Pin the process to CPU cores 111-150 and start densenet121_1p_main.py on NPU 7,
# passing --evaluate and resuming from checkpoint.pth.tar in the working directory.
# The device id, core range and dataset path are hard-coded here.
taskset -c 111-150 python3 densenet121_1p_main.py \
--workers 40 \
--arch densenet121 \
--npu 7 \
--lr 0.1 \
--momentum 0.9 \
--amp \
--batch-size 256 \
--epoch 90 \
--evaluate \
--resume checkpoint.pth.tar \
--data /opt/npu/dataset/imagenet
@@ -0,0 +1,62 @@
#!/bin/bash
# Launcher for the PyTorch DenseNet121 benchmark job.
# Positional arguments: <rank_size> <yaml path> <tools path> [<CLUSTER> <MPIRUN_ALL_IP>]
# The last two are read only when /.dockerenv exists (see below).
rank_size=$1
yamlPath=$2
toolsPath=$3
# Parent of the directory that contains this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)
# Base name of currentDir's parent directory.
model_name=$(cd $currentDir/..;basename `pwd`)
# When /.dockerenv exists, the cluster flag and the node IP list are taken from
# positional arguments 4 and 5.
if [ -f /.dockerenv ];then
CLUSTER=$4
MPIRUN_ALL_IP="$5"
export CLUSTER=${CLUSTER}
fi
# Load configuration from the yaml file: the output of get_params_for_yaml.sh
# (called with "pytorch_config") is eval-ed into this shell.
# NOTE(review): variables used below but never assigned here (mpirun_ip, jsonFilePath,
# device_group_<N>p) presumably come from this eval — confirm against the helper.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")
# Remove old logs
rm -rf /var/log/npu/slog/host-0/*
rm -rf ${currentDir}/result/*.log
# Create the per-run output directory, named after the current timestamp.
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
# Device list: use device_group_<rank_size>p when it is set; when it is empty, or
# rank_size is 8 or more, fall back to the sequence 0 .. rank_size-1.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi
# Strip spaces from the device list and take its FIRST character as first_device_id
# (so only single-digit device ids are handled). The hw log symlink created in the
# non-cluster branch points into the directory named after this id.
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`
if [ x"${CLUSTER}" == x"True" ];then
# Cluster run: link the hw log of directory 0 into the job directory, copy the yaml
# and json files to every other node, then start train.sh through mpirun.
this_ip=$(hostname -I |awk '{print $1}')
ln -snf ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/0/hw_densenet121.log ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
for ip in $MPIRUN_ALL_IP;do
if [ x"$ip" != x"$this_ip" ];then
scp $yamlPath root@$ip:$yamlPath
scp ${jsonFilePath} root@$ip:${jsonFilePath}
fi
done
export PATH=$PATH:/usr/local/mpirun4.0/bin
mpirun -H ${mpirun_ip} \
--bind-to none -map-by slot\
--allow-run-as-root \
--mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
--prefix /usr/local/mpirun4.0/ \
${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
# Single-node run: one background train.sh invocation with device id 0 and rank id 0.
# The per-device loop is commented out.
rank_id=0
#for device_id in $device_group;do
ln -snf ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/${first_device_id}/hw_densenet121.log ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
# let rank_id++
# done
fi
# Wait for the background job(s) started above.
wait
@@ -0,0 +1,141 @@
#!/usr/bin/env bash
# Per-process training script for the PyTorch DenseNet121 benchmark.
# Positional arguments:
#   $1 device_id   $2 rank_size   $3 yaml path   $4 job timestamp   $5 tools path
#   $6 either the rank id, or the literal string "True" for a cluster run
device_id=$1
rank_size=$2
yamlPath=$3
# Parent of the directory that contains this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
export YAML_PATH=$3
mkdir -p ${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
# Load configuration from the yaml file by eval-ing the output of get_params_for_yaml.sh.
# NOTE(review): mode, data_url, batch_size, epoches, lr, device_single and
# device_group_multi are used below without being assigned here; presumably they
# come from this eval — confirm against the helper.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")
export REMARK_LOG_FILE=hw_densenet121.log # remark log file name; must be hw_ followed by the lowercase model name
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
#source ${currentDir}/config/npu_set_env.sh
source ${currentDir}/config/set_env_b023.sh
# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export HCCL_RANK_TABLE_PATH=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# RANK_INDEX is not assigned in this script; an unset variable counts as 0 in the arithmetic.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}
cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}
if [ x"$6" != x"True" ];then
# Non-cluster run: $6 is the rank id.
rank_id=$6
export RANK_ID=$6
else
# Cluster run: ask atlasboost for the rank and device id. From the captured output,
# rank_id is the last space-separated token, and device_id is the text between
# "deviceid = " and " phyid=".
device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
device_id_mo=`echo $device_id_mo`
rank_id=${device_id_mo##* }
export RANK_ID=${rank_id}
device=${device_id_mo##*deviceid = }
device_id=${device%% phyid=*}
export DEVICE_ID=${device_id}
# Copy the json file(s) found in the job directory over the rank table file.
hccljson=${train_job_dir}/*.json
cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi
# Create the per-device working directory and run from there.
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}
startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`
# Choose the invocation arguments depending on single-device vs multi-device.
if [ x"$6" == x"True" ];then
# multi-device, multi-node
export CLUSTER=True
fi
if [ x"${mode}" == x"evaluate" ];then
# Evaluate mode: NPU id 7, CPU cores 111-150, batch size 256 and epoch 90 are
# hard-coded here rather than taken from the yaml values.
taskset -c 111-150 python3.7 ${currentDir}/code/densenet121_1p_main.py \
--workers 40 \
--arch densenet121 \
--npu 7 \
--lr 0.1 \
--momentum 0.9 \
--amp \
--batch-size 256 \
--epoch 90 \
--evaluate \
--resume checkpoint.pth.tar \
--data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1
elif [ x"${rank_size}" == x"1" ];then
# single device
#source ${currentDir}/config/set_env_b023.sh
taskset -c 1-40 python3.7 ${currentDir}/code/densenet121_1p_main.py \
--workers 40 \
--arch densenet121 \
--npu ${device_single} \
--lr 0.1 \
--momentum 0.9 \
--amp \
--batch-size ${batch_size} \
--epoch ${epoches} \
--data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1
elif [ ${rank_size} -le 8 ];then
# single node, multiple devices
#source ${currentDir}/config/set_env_b023.sh
python3.7 ${currentDir}/code/densenet121_8p_main.py \
--addr=$(hostname -I |awk '{print $1}') \
--seed 49 \
--workers 160 \
--lr ${lr} \
--print-freq 1 \
--eval-freq 5\
--arch densenet121 \
--dist-url 'tcp://127.0.0.1:50000' \
--dist-backend 'hccl' \
--multiprocessing-distributed \
--world-size 1 \
--batch-size ${batch_size} \
--epochs ${epoches} \
--rank 0 \
--amp \
--benchmark 0 \
--device-list ${device_group_multi} \
--data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1
fi
#taskset -c 0-20 python3.7 ${currentDir}/code/densenet121.py > ./train.log 2>&1
# $? here is the status of the if/elif chain above: the exit status of the python
# command that ran, or 0 when no branch matched (e.g. rank_size greater than 8
# outside evaluate mode), in which case "success" is reported without a run.
if [ $? -eq 0 ];then
echo ":::ABK 1.0.0 densenet121 train success"
echo ":::ABK 1.0.0 densenet121 train success" >> ${train_job_dir}/train_${rank_size}p.log
echo ":::ABK 1.0.0 densenet121 train success" >> ./hw_densenet121.log
else
echo ":::ABK 1.0.0 densenet121 train failed"
echo ":::ABK 1.0.0 densenet121 train failed" >> ${train_job_dir}/train_${rank_size}p.log
echo ":::ABK 1.0.0 densenet121 train failed" >> ./hw_densenet121.log
fi
# Record the elapsed wall-clock time as hours:minutes:seconds in the hw log.
endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
sumTime=$[ $endTime_s - $startTime_s ]
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 densenet121 train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_densenet121.log
@@ -0,0 +1,46 @@
# DenseNet121_tensorflow训练说明
### 1. 模型训练参数配置
在train/yaml/DenseNet121.yaml中修改相应配置, 配置项含义:
```
tensorflow_config:
# 基本参数
data_url: 数据集路径
epoches: 跑多少个epoch
epochs_between_evals: 1
batch_size: 32
log_dir: ./ckpt
# 1p参数
mode_1p: train # train、evaluate、train_and_evaluate三种模式
max_train_steps_1p: 100
iterations_per_loop_1p: 10
display_every: 10
log_name_1p: densenet121_1p.log
# 8p参数
mode_8p: train_and_evaluate # train、evaluate、train_and_evaluate三种模式
iterations_per_loop_8p: 5004
lr: 0.1
log_name_8p: densenet121_8p.log
mpirun_ip: 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
docker_image: docker 镜像名称:版本号
# 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
device_group_1p: 0
device_group_2p: 0 1
device_group_4p: 0 1 2 3
```
------
@@ -0,0 +1,22 @@
import tensorflow as tf
import os,sys
class CreateSession():
    """Build the TensorFlow session configuration and export TF tuning variables.

    The resulting ``tf.ConfigProto`` is stored as ``self.estimator_config``.
    """

    def __init__(self):
        config = tf.ConfigProto(
            inter_op_parallelism_threads=10,
            intra_op_parallelism_threads=10,
            allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.estimator_config = config
        self.set_env()

    def set_env(self):
        """Write the TF_* environment variables into ``os.environ``."""
        gpu_thread_count = 2
        settings = (
            ('TF_GPU_THREAD_MODE', 'gpu_private'),
            ('TF_GPU_THREAD_COUNT', str(gpu_thread_count)),
            ('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', '1'),
            ('TF_ENABLE_WINOGRAD_NONFUSED', '1'),
        )
        for name, value in settings:
            os.environ[name] = value
@@ -0,0 +1,133 @@
import numpy as np
import preprocessing
import tensorflow as tf
from tensorflow.python.util import nest
import os,sys
import numpy as np
class DataLoader:
    """Locate the train/validation record files and expose dataset factories.

    On construction the sample counts are computed with ``get_num_records`` and
    also written back onto ``args`` as ``num_training_samples`` and
    ``num_evaluating_samples``.
    """

    def __init__(self, args):
        self.args = args

        # Files are matched as "<data_dir>/train-*" and "<data_dir>/validation-*".
        train_pattern = os.path.join(args.data_dir, '%s-*')
        train_files = sorted(tf.gfile.Glob(train_pattern % 'train'))
        self.num_training_samples = get_num_records(train_files)
        self.args.num_training_samples = self.num_training_samples

        val_pattern = os.path.join(args.data_dir, '%s-*')
        val_files = sorted(tf.gfile.Glob(val_pattern % 'validation'))
        self.num_evaluating_samples = get_num_records(val_files)
        self.args.num_evaluating_samples = self.num_evaluating_samples

        print( 'total num_training_sampels: %d' % self.num_training_samples )
        print( 'total num_evaluating_sampels: %d' % self.num_evaluating_samples )

        self.training_samples_per_rank = self.num_training_samples

    def get_train_input_fn(self):
        """Return the training dataset built by ``make_dataset``."""
        return make_dataset(self.args, self.training_samples_per_rank,
                            self.args.batch_size, training=True)

    def get_eval_input_fn(self):
        """Return the evaluation dataset built by ``make_dataset``."""
        return make_dataset(self.args, self.num_evaluating_samples,
                            self.args.batch_size, training=False)
def get_num_records(filenames):
    """Estimate the total number of records in a list of TFRecord files.

    Only the first and the last file are read: every file except the last one
    is assumed to contain as many records as the first file.

    Args:
        filenames: sequence of TFRecord file paths.

    Returns:
        The estimated record count, or 0 when ``filenames`` is empty (the
        original indexed ``filenames[0]`` unconditionally and raised IndexError).
    """
    if not filenames:
        return 0

    def count_records(tf_record_filename):
        # Iterate the whole file once; only the number of records matters.
        return sum(1 for _ in tf.python_io.tf_record_iterator(tf_record_filename))

    nfile = len(filenames)
    return (count_records(filenames[0]) * (nfile - 1) +
            count_records(filenames[-1]))
def _parse_example_proto(example_serialized):
    """Parse one serialized Example into (encoded image, label, bbox).

    Returns:
        A tuple of the ``image/encoded`` string tensor, the label cast to int32,
        and the bounding boxes arranged as [1, num_boxes, coords] with the
        coordinate order (ymin, xmin, ymax, xmax).
    """
    bbox_keys = ['image/object/bbox/xmin',
                 'image/object/bbox/ymin',
                 'image/object/bbox/xmax',
                 'image/object/bbox/ymax']

    feature_map = {
        'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
                                            default_value=''),
        'image/class/label': tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
        'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
                                               default_value=''),
    }
    # Sparse features in Example proto: all four keys share one VarLenFeature spec.
    sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
    for key in bbox_keys:
        feature_map[key] = sparse_float32

    features = tf.parse_single_example(example_serialized, feature_map)
    label = tf.cast(features['image/class/label'], dtype=tf.int32)

    # One [1, num_boxes] row per coordinate, created in the key order above.
    xmin, ymin, xmax, ymax = [
        tf.expand_dims(features[key].values, 0) for key in bbox_keys]

    # Stack in (y, x) order, then reshape [4, num_boxes] -> [1, num_boxes, 4].
    bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
    bbox = tf.expand_dims(bbox, 0)
    bbox = tf.transpose(bbox, [0, 2, 1])

    return features['image/encoded'], label, bbox
# since the preprocessing is done here, we add args file
def parse_record(raw_record, is_training):
    """Turn one serialized record into an (image, label) pair.

    The record is parsed with ``_parse_example_proto`` and the image is decoded
    and preprocessed by ``preprocessing.parse_and_preprocess_image_record``.
    """
    encoded_image, label, bbox = _parse_example_proto(raw_record)
    image = preprocessing.parse_and_preprocess_image_record(
        encoded_image, bbox, training=is_training)
    # The label is shifted down by one (original note: "label-1 for VGG16").
    # NOTE(review): presumably converts 1-based dataset labels to 0-based — confirm.
    shifted_label = label-1
    return image, shifted_label
def make_dataset(args, take_count, batch_size,
                 training=False, shard=False):
    """Build the batched tf.data pipeline for training or validation.

    Reads the ``RANK_SIZE`` and ``DEVICE_INDEX`` environment variables (both
    must be set to integers); ``DEVICE_INDEX`` seeds the shuffles. ``shard`` is
    accepted but not used.
    """
    shuffle_buffer_size = 10000
    num_readers = 10

    # rank_size is not used afterwards, but the lookup still fails when RANK_SIZE is unset.
    rank_size = int(os.getenv('RANK_SIZE'))
    rank_id = int(os.getenv('DEVICE_INDEX'))

    split = 'train' if training else 'validation'
    filename_pattern = os.path.join(args.data_dir, '%s-*')
    filenames = sorted(tf.gfile.Glob(filename_pattern % split))

    ds = tf.data.Dataset.from_tensor_slices(filenames)
    if training:
        # Shuffle the file order with a per-rank seed.
        ds = ds.shuffle(1000, seed=7*(1+rank_id))
    else:
        # Note: take() is applied to the dataset of file names here.
        ds = ds.take(take_count)

    ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1)

    # Pair every record with a running counter; the counter is dropped again in map().
    counter = tf.data.Dataset.range(sys.maxsize)
    ds = tf.data.Dataset.zip((ds, counter))

    if training:
        ds = ds.apply(tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size, seed=5*(1+rank_id)))

    ds = ds.map(lambda image, counter: parse_record(image, training), num_parallel_calls=14)
    return ds.batch(batch_size, drop_remainder=True)
@@ -0,0 +1,158 @@
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm, flatten
from tensorflow.contrib.framework import arg_scope
import numpy as np
# Number of output units of the final dense layer (see Linear).
class_num = 1000
# Number of dense blocks built by Dense_net.
nb_blocks = 4
# Number of bottleneck layers in each dense block, indexed by block.
nb_blocks_layers = (6, 12, 24, 16)
# The 1x1 bottleneck convolution uses growth_rate * bn_size filters.
bn_size = 4
# The 3x3 convolution of every bottleneck layer uses growth_rate filters.
growth_rate = 32
# Filter count of the first 7x7 convolution in Dense_net.
init_layers = 64
'''
denseNet121169201264
return _densenet('densenet121', 32, (6, 12, 24, 16), 64, pretrained, progress,
**kwargs)
return _densenet('densenet161', 48, (6, 12, 36, 24), 96, pretrained, progress,
**kwargs)
return _densenet('densenet169', 32, (6, 12, 32, 32), 64, pretrained, progress,
**kwargs)
return _densenet('densenet201', 32, (6, 12, 48, 32), 64, pretrained, progress,
**kwargs)
'''
def conv_layer(input, filter, kernel, stride=1, layer_name="conv"):
    """Apply a bias-free, SAME-padded 2-D convolution under name scope ``layer_name``.

    The kernel uses a variance-scaling initializer (scale=5.0, mode='fan_out').
    """
    with tf.name_scope(layer_name):
        initializer = tf.initializers.variance_scaling(scale=5.0, mode='fan_out')
        return tf.layers.conv2d(
            inputs=input,
            filters=filter,
            kernel_size=kernel,
            strides=stride,
            padding='SAME',
            use_bias=False,
            kernel_initializer=initializer)
def Global_Average_Pooling(x, stride=1):
    """Average-pool ``x`` with a window covering dimensions 1 and 2 entirely.

    Because the window spans the whole extent of both dimensions, the stride
    value does not matter.
    """
    dims = np.shape(x)
    window = [dims[1], dims[2]]
    return tf.layers.average_pooling2d(inputs=x, pool_size=window, strides=stride)
def Batch_Normalization(x, training, scope):
    """Apply contrib ``batch_norm`` to ``x`` under variable scope ``scope``.

    ``training`` is cast to a bool tensor and selects, via ``tf.cond``, between
    a branch that may create the variables (reuse=None) and one that reuses
    them (reuse=True).
    """
    bn_defaults = dict(
        scope=scope,
        updates_collections=None,
        decay=0.9,
        center=True,
        scale=True,
        zero_debias_moving_mean=True)
    with arg_scope([batch_norm], **bn_defaults):
        training = tf.cast(training, tf.bool)

        def _creating_branch():
            return batch_norm(inputs=x, is_training=training, reuse=None)

        def _reusing_branch():
            return batch_norm(inputs=x, is_training=training, reuse=True)

        return tf.cond(training, _creating_branch, _reusing_branch)
def Drop_out(x, rate, training):
    """Thin wrapper around ``tf.layers.dropout``."""
    dropped = tf.layers.dropout(inputs=x, rate=rate, training=training)
    return dropped
def Relu(x):
    """Thin wrapper around ``tf.nn.relu``."""
    activated = tf.nn.relu(x)
    return activated
def Average_pooling(x, pool_size=[2,2], stride=2, padding='VALID'):
    """Thin wrapper around ``tf.layers.average_pooling2d``."""
    pooled = tf.layers.average_pooling2d(
        inputs=x, pool_size=pool_size, strides=stride, padding=padding)
    return pooled
def Max_Pooling(x, pool_size=[3,3], stride=2, padding='VALID'):
    """Thin wrapper around ``tf.layers.max_pooling2d``."""
    pooled = tf.layers.max_pooling2d(
        inputs=x, pool_size=pool_size, strides=stride, padding=padding)
    return pooled
def Concatenation(layers):
    """Concatenate the given tensors along axis 3."""
    merged = tf.concat(layers, axis=3)
    return merged
def Linear(x):
    """Dense layer named 'linear' with ``class_num`` output units."""
    logits = tf.layers.dense(inputs=x, units=class_num, name='linear')
    return logits
def bottleneck_layer(x, is_training, scope):
    """Bottleneck unit: BN -> ReLU -> 1x1 conv, then BN -> ReLU -> 3x3 conv.

    The 1x1 convolution has ``growth_rate * bn_size`` filters and the 3x3
    convolution has ``growth_rate`` filters. No dropout is applied (the
    dropout calls were commented out in the original).
    """
    with tf.name_scope(scope):
        out = Batch_Normalization(x, training=is_training, scope=scope+'_batch1')
        out = Relu(out)
        out = conv_layer(out, filter= growth_rate*bn_size, kernel=[1,1], layer_name=scope+'_conv1')

        out = Batch_Normalization(out, training=is_training, scope=scope+'_batch2')
        out = Relu(out)
        out = conv_layer(out, filter= growth_rate, kernel=[3,3], layer_name=scope+'_conv2')

        return out
def transition_layer(x, is_training, scope):
    """Transition unit: BN -> ReLU -> 1x1 conv halving the channels -> 2x2 average pool.

    Args:
        x: input tensor; its last dimension must be statically known.
        is_training: training flag forwarded to ``Batch_Normalization``.
        scope: name prefix for the name scope and the contained layers.

    Returns:
        The pooled output tensor.
    """
    with tf.name_scope(scope):
        x = Batch_Normalization(x, training=is_training, scope=scope+'_batch1')
        x = Relu(x)

        # https://github.com/taki0112/Densenet-Tensorflow/issues/10
        in_channel = int(x.shape[-1])
        # The filter count must be an integer; the original passed the float
        # in_channel*0.5. Floor division gives the same value for even channel counts.
        x = conv_layer(x, filter=in_channel // 2, kernel=[1,1], layer_name=scope+'_conv1')
        x = Average_pooling(x, pool_size=[2,2], stride=2)

        return x
def dense_block(input_x, nb_layers, is_training, layer_name):
    """Chain bottleneck layers, each fed the concatenation of all earlier outputs.

    The first bottleneck receives ``input_x`` directly. At least one bottleneck
    is always built, even when ``nb_layers`` is less than 1. The block returns
    the concatenation of the input and every bottleneck output.
    """
    with tf.name_scope(layer_name):
        collected = [input_x]

        for index in range(max(nb_layers, 1)):
            if index == 0:
                layer_input = input_x
            else:
                layer_input = Concatenation(collected)
            scope = layer_name + '_bottleN_' + str(index)
            collected.append(bottleneck_layer(layer_input, is_training, scope=scope))

        return Concatenation(collected)
def Dense_net(input_x, is_training):
    """Assemble the DenseNet graph and return the output of the final dense layer.

    Layout: 7x7 stride-2 convolution, 3x3 max pool, then ``nb_blocks - 1``
    dense blocks each followed by a transition layer, a final dense block,
    batch norm, ReLU, global average pooling, flatten and the linear layer.
    """
    net = conv_layer(input_x, filter=init_layers , kernel=[7,7], stride=2, layer_name='conv0')
    net = Max_Pooling(net, pool_size=[3,3], stride=2)

    last_block = nb_blocks - 1
    for block_idx in range(last_block):
        suffix = str(block_idx)
        net = dense_block(input_x=net, nb_layers=nb_blocks_layers[block_idx],
                          is_training=is_training, layer_name='dense_'+suffix)
        net = transition_layer(net, is_training, scope='trans_'+suffix)

    # The last dense block is not followed by a transition layer.
    net = dense_block(input_x=net, nb_layers=nb_blocks_layers[last_block],
                      is_training=is_training, layer_name='dense_final')

    net = Batch_Normalization(net, training=is_training, scope='linear_batch')
    net = Relu(net)
    net = Global_Average_Pooling(net)
    net = flatten(net)
    return Linear(net)
@@ -0,0 +1,44 @@
import tensorflow as tf
import math
import numpy as np
def _linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
    """Return the learning rate at ``current_step`` of a linear ramp from init_lr to base_lr."""
    lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps)
    return float(init_lr) + lr_inc * current_step


def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch, T_max, eta_min=0):
    """Build a per-step learning-rate table: linear warmup, then cosine annealing.

    The original called ``linear_warmup_lr``, which is not defined or imported
    in this module, so any ``warmup_epochs > 0`` raised NameError; the warmup
    is now computed by the private helper above.

    Args:
        lr: base learning rate reached at the end of warmup.
        steps_per_epoch: number of steps in one epoch.
        warmup_epochs: number of epochs of linear warmup starting from 0.
        max_epoch: number of epochs covered by the table.
        T_max: period (in epochs) of the cosine schedule.
        eta_min: minimum learning rate of the cosine schedule.

    Returns:
        float32 numpy array of length ``int(max_epoch * steps_per_epoch)``.
        After warmup the value is constant within an epoch, because the cosine
        uses the epoch index rather than the step index.
    """
    base_lr = lr
    warmup_init_lr = 0
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)

    lr_each_step = []
    for i in range(total_steps):
        last_epoch = i // steps_per_epoch
        if i < warmup_steps:
            step_lr = _linear_warmup_lr(i + 1, warmup_steps, base_lr, warmup_init_lr)
        else:
            step_lr = eta_min + (base_lr - eta_min) * (1. + math.cos(math.pi*last_epoch / T_max)) / 2
        lr_each_step.append(step_lr)

    return np.array(lr_each_step).astype(np.float32)
class HyperParams:
    """Derive step counts from ``args`` and provide the learning-rate tensor.

    Writes ``nsteps_per_epoch`` and ``nstep`` back onto ``args`` and
    precomputes a cosine learning-rate table covering ``args.T_max`` epochs
    with no warmup.
    """

    def __init__(self, args):
        self.args = args

        steps_per_epoch = args.num_training_samples // args.global_batch_size
        args.nsteps_per_epoch = steps_per_epoch

        # An epoch budget takes precedence over the explicit step budget.
        if args.max_epochs:
            args.nstep = steps_per_epoch * args.max_epochs
        else:
            args.nstep = args.max_train_steps

        self.cos_lr = warmup_cosine_annealing_lr(
            args.lr, steps_per_epoch, 0, args.T_max, args.T_max, 0.0)

    def get_learning_rate(self):
        """Return a tensor named 'learning_rate' looked up from the table by global step."""
        lr_table = tf.convert_to_tensor(self.cos_lr)
        current_lr = tf.gather(lr_table, tf.train.get_global_step())
        return tf.identity(current_lr, 'learning_rate')
@@ -0,0 +1,25 @@
import tensorflow as tf
#from tensorflow.contrib.hccl.python.ops import hccl_ops
#from npu_bridge.hccl import hccl_ops
from benchmark_log import hwlog
class Layers:
    """Metric helpers for the evaluation graph."""

    def get_accuracy(self, labels, predicted_classes, logits, args):
        """Return the eval metric dict with 'val-top1acc' and 'val-top5acc'.

        Each entry is a (value, update_op) pair. When ``args.rank_size`` is not
        1, the value tensors are sum-allreduced through HCCL and divided by
        ``rank_size``; the update ops are left as they are.
        """
        top1_value, top1_update = tf.metrics.accuracy(
            labels=labels, predictions=predicted_classes)
        in_top5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
        top5_value, top5_update = tf.metrics.mean(in_top5)

        if args.rank_size != 1:
            # Imported lazily so single-device runs do not need the HCCL bridge.
            from npu_bridge.hccl import hccl_ops
            top1_value = hccl_ops.allreduce(top1_value,"sum")/args.rank_size
            top5_value = hccl_ops.allreduce(top5_value,"sum")/args.rank_size

        return {'val-top1acc': (top1_value, top1_update),
                'val-top5acc': (top5_value, top5_update)}
@@ -0,0 +1,92 @@
from __future__ import print_function
import tensorflow as tf
from benchmark_log import hwlog
import logging
import numpy as np
import time
import sys,os
class LogSessionRunHook(tf.train.SessionRunHook):
    """Session hook that times every run call and logs throughput, loss and LR."""

    def __init__(self, args, warmup_steps=5):
        self.global_batch_size = args.global_batch_size
        # Fall back to one epoch worth of steps when iterations_per_loop is None.
        self.iterations_per_loop = (
            args.iterations_per_loop
            if args.iterations_per_loop is not None
            else args.nsteps_per_epoch)
        self.warmup_steps = warmup_steps
        self.iter_times = []
        self.num_records = args.num_training_samples
        self.display_every = args.display_every
        self.logger = get_logger(args.log_name, args.log_dir)
        rank0log(self.logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))

    def after_create_session(self, session, coord):
        """Log the column header and reset the interval counters."""
        rank0log(self.logger, 'Step Epoch Speed Loss FinLoss LR')
        self.elapsed_secs = 0.
        self.count = 0

    def before_run(self, run_context):
        """Start the timer and request global step, loss, total loss and learning rate."""
        self.t0 = time.time()
        wanted = [tf.train.get_global_step(), 'loss:0', 'total_loss:0', 'learning_rate:0']
        return tf.train.SessionRunArgs(fetches=wanted)

    def after_run(self, run_context, run_values):
        """Accumulate the run time and, on display steps, log and remark the stats."""
        batch_time = time.time() - self.t0
        self.iter_times.append(batch_time)
        self.elapsed_secs += batch_time
        self.count += 1

        global_step, loss, total_loss, lr = run_values.results
        # Only report on step 1 and on every display_every-th step.
        if global_step != 1 and global_step % self.display_every != 0:
            return

        dt = self.elapsed_secs / self.count
        img_per_sec = self.global_batch_size * self.iterations_per_loop / dt
        epoch = global_step * self.global_batch_size / self.num_records
        self.logger.info('step:%6i epoch:%5.1f FPS:%7.1f loss:%6.3f total_loss:%6.3f lr:%7.5f' %
                         (global_step, epoch, img_per_sec, loss, total_loss, lr))
        self.elapsed_secs = 0.
        self.count = 0

        # Mirror the key figures into the benchmark remark log.
        hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=int(global_step))
        hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=epoch)
        hwlog.remark_print(key=hwlog.FPS, value=img_per_sec)

    def get_average_speed(self):
        """Return global_batch_size divided by the mean run time after the warmup runs."""
        timed_runs = self.iter_times[self.warmup_steps:]
        return self.global_batch_size / np.mean(timed_runs)
def rank0log(logger, *args, **kwargs):
    """Log the arguments through ``logger`` or, when it is falsy, print them.

    With a logger, the positional arguments are converted to ``str`` and joined
    without a separator; keyword arguments are only used by the ``print`` path.
    """
    if not logger:
        print(*args, **kwargs)
        return
    logger.info(''.join(map(str, args)))
def get_logger(log_name, log_dir):
    """Return the logger ``log_name`` writing to the console and to ``log_dir/log_name``.

    ``logging.getLogger`` returns the same object for the same name, so the
    original attached a fresh pair of handlers on every call and each message
    was then emitted several times. Handlers are now attached only when an
    equivalent one is not already present.

    Args:
        log_name: logger name, also used as the log file name.
        log_dir: directory for the log file; created when missing.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.INFO)  # INFO, ERROR

    if not os.path.isdir(log_dir):
        # exist_ok covers several ranks creating a shared directory (e.g. on NFS) at once.
        os.makedirs(log_dir, exist_ok=True)

    log_path = os.path.abspath(os.path.join(log_dir, log_name))

    # FileHandler subclasses StreamHandler, so it has to be tested first.
    has_console = False
    has_file = False
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler):
            has_file = has_file or handler.baseFilename == log_path
        elif isinstance(handler, logging.StreamHandler):
            has_console = True

    formatter = logging.Formatter('%(message)s')

    if not has_console:
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    if not has_file:
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    return logger
@@ -0,0 +1,72 @@
import tensorflow as tf
from densenet import Dense_net
class Model(object):
    """Provides the Estimator model function for the DenseNet network."""

    def __init__(self, args, data, hyper_param, layers, logger):
        self.args = args
        self.data = data
        self.hyper_param = hyper_param
        self.layers = layers
        self.logger = logger

    def get_estimator_model_func(self, features, labels, mode, params=None):
        """Build the graph for ``mode`` and return the matching EstimatorSpec.

        Only EVAL and TRAIN are supported; any other mode fails the assertion.
        The tensors named 'loss', 'total_loss' (and 'learning_rate', created by
        the hyper-parameter object) are fetched by name by the logging hook.
        """
        cfg = self.args

        labels = tf.reshape(labels, (-1,))  # Squash unnecessary unary dim
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        inputs = tf.cast(features, cfg.dtype)

        logits = Dense_net(inputs, is_training)
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        logits = tf.cast(logits, tf.float32)

        # Cross entropy on one-hot labels with label smoothing.
        onehot = tf.one_hot(labels, depth=1000)
        loss = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=onehot, label_smoothing=cfg.label_smoothing)
        base_loss = tf.identity(loss, name='loss')  # For access by logger

        # L2 penalty over all trainable variables, scaled by weight_decay.
        l2_terms = [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()]
        l2_loss = tf.multiply(tf.add_n(l2_terms), cfg.weight_decay)
        total_loss = tf.identity(base_loss + l2_loss, name = 'total_loss')

        if mode == tf.estimator.ModeKeys.EVAL:
            with tf.device(None):
                metrics = self.layers.get_accuracy( labels, predicted_classes, logits, cfg)
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        assert (mode == tf.estimator.ModeKeys.TRAIN)

        # Not used further; kept so the graph contains the same ops as before.
        batch_size = tf.shape(inputs)[0]

        global_step = tf.train.get_global_step()
        learning_rate = self.hyper_param.get_learning_rate()

        opt = tf.train.MomentumOptimizer(
            learning_rate, cfg.momentum, use_nesterov=cfg.use_nesterov)

        from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
        opt = NPUDistributedOptimizer(opt)

        # Run the collected update ops before the gradient step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
        with tf.control_dependencies(update_ops):
            grads_and_vars = opt.compute_gradients(
                total_loss, gate_gradients=tf.train.Optimizer.GATE_NONE)
            train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

        train_op = tf.group(train_op)
        return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
@@ -0,0 +1,72 @@
import tensorflow as tf
from tensorflow.contrib.image.python.ops import distort_image_ops
import math
import random
def decode_jpeg(imgdata, channels=3):
    """Decode a JPEG with fancy upscaling off and the 'INTEGER_FAST' DCT method."""
    decode_options = dict(channels=channels,
                          fancy_upscaling=False,
                          dct_method='INTEGER_FAST')
    return tf.image.decode_jpeg(imgdata, **decode_options)
def random_horizontal_flip(image, prob):
    """Flip ``image`` left-to-right with probability ``prob``, decided per image.

    The original compared ``prob`` with Python's ``random.random()``. That runs
    once, while the graph is being built, so a single decision was baked into
    the graph and applied to every image. The draw is now a graph op, so it is
    re-evaluated each time the op executes.

    Args:
        image: image tensor accepted by ``tf.image.flip_left_right``.
        prob: flip probability in [0, 1].

    Returns:
        The flipped or the unchanged image tensor.
    """
    should_flip = tf.less(tf.random_uniform([], 0.0, 1.0), prob)
    return tf.cond(should_flip,
                   lambda: tf.image.flip_left_right(image),
                   lambda: image)
def decode_crop_and_resize(record, bbox, size, scale, ratio):
    """Sample a distorted crop window, decode only that window, resize to 224x224.

    Cleaned up relative to the original: the unused ``crop_ratio`` /
    ``initial_shape`` locals are gone, and the JPEG shape op is created once
    and reused instead of being built twice.

    Args:
        record: encoded JPEG string tensor.
        bbox: bounding boxes passed to ``sample_distorted_bounding_box``.
        size: accepted for interface compatibility; not used (output is fixed at 224x224).
        scale: area range for the sampled crop.
        ratio: aspect-ratio range for the sampled crop.

    Returns:
        The cropped and resized image tensor.
    """
    with tf.name_scope('decode_crop_and_resize'):
        height = 224
        width = 224

        jpeg_shape = tf.image.extract_jpeg_shape( record )
        bbox_begin, bbox_size, _ = \
            tf.image.sample_distorted_bounding_box(
                jpeg_shape,
                bounding_boxes=bbox,
                min_object_covered=0.1,
                aspect_ratio_range=ratio,
                area_range=scale,
                max_attempts=10,
                use_image_if_no_bounding_boxes=True)

        # Reassemble the bounding box in the format the crop op requires.
        offset_y, offset_x, _ = tf.unstack(bbox_begin)
        target_height, target_width, _ = tf.unstack(bbox_size)
        crop_window = tf.stack([offset_y, offset_x, target_height, target_width])

        image = tf.image.decode_and_crop_jpeg( record, crop_window, channels=3 )
        image = tf.image.resize_images( image, [height, width] )
        return image
def parse_and_preprocess_image_record(record, bbox, training):
    """Decode and preprocess one encoded image for training or evaluation.

    Training: random crop resized to 224x224, random horizontal flip, normalize.
    Evaluation: full decode, resize to 256x256, central crop by 224/256, normalize.
    """
    with tf.name_scope('preprocess'):
        if not training:
            image = decode_jpeg(record, channels=3)
            image = tf.image.resize_images(image, [256, 256])
            image = tf.image.central_crop(image, 224.0/256)
            return normalize(image)

        image = decode_crop_and_resize(record, bbox, 224, (0.08, 1.0), (0.75, 1.333))
        image = random_horizontal_flip(image, 0.5)
        return normalize(image)
def normalize(inputs):
    """Subtract the per-channel mean and divide by the per-channel std.

    The mean and std constants are given on a 0-255 scale (fraction * 255).
    """
    mean_rgb = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std_rgb = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # Add two leading axes so the constants broadcast over the image.
    mean = tf.expand_dims(tf.expand_dims(mean_rgb, 0), 0)
    std = tf.expand_dims(tf.expand_dims(std_rgb, 0), 0)

    centered = inputs - mean
    return centered * (1.0 / std)
@@ -0,0 +1,140 @@
import tensorflow as tf
import numpy as np
import os
import sys
import ast
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../config')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../../../utils')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../../../../utils/atlasboost')))
import data_loader as dl
import model as ml
import hyper_param as hp
import layers as ly
import logger as lg
import trainer as tr
import create_session as cs
import argparse
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
def parse_args():
    """Parse the command-line options of the training entry point.

    Fix: ``--weight_decay`` now declares ``type=float``. Without it, a value
    given on the command line was stored as a string while the default stayed a
    float, and the string then reached the loss computation.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        ValueError: when unknown command-line arguments are present.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--rank_size', default=1,type=int,
                        help="""number of NPUs to use.""")

    # mode and parameters related
    parser.add_argument('--mode', default='train_and_evaluate',
                        help="mode to run the program e.g. train, evaluate, and\n"
                             "train_and_evaluate")
    parser.add_argument('--max_train_steps', default=100,type=int,
                        help="""train steps for one NPU""")
    parser.add_argument('--iterations_per_loop', default=10, type=int,
                        help="""the number of steps in devices for each iteration""")
    parser.add_argument('--max_epochs', default=None, type=int,
                        help="""total epochs for training""")
    parser.add_argument('--epochs_between_evals', default=5, type=int,
                        help="the interval between train and evaluation , only meaningful\n"
                             "when the mode is train_and_evaluate")

    # dataset
    parser.add_argument('--data_dir', default='path/data',
                        help="""directory to data.""")

    # path for evaluation
    parser.add_argument('--eval_dir', default='path/eval',
                        help="""directory to evaluate.""")

    parser.add_argument('--dtype', default=tf.float32,
                        help="""data type of inputs.""")
    parser.add_argument('--use_nesterov', default=True, type=ast.literal_eval,
                        help=""" used in optimizer""")
    parser.add_argument('--label_smoothing', default=0.1, type=float,
                        help="""label smoothing factor""")
    parser.add_argument('--weight_decay', default=0.0001, type=float,
                        help="""weight decay""")
    parser.add_argument('--batch_size', default=32, type=int,
                        help="""batch size for one NPU""")

    # learning rate and momentum
    parser.add_argument('--lr', default=0.1, type=float,
                        help="""learning rate""")
    parser.add_argument('--T_max', default=150, type=int,
                        help="""T_max for cosing_annealing learning rate""")
    parser.add_argument('--momentum', default=0.9, type=float,
                        help="""momentum used in optimizer.""")

    # display frequency
    parser.add_argument('--display_every', default=1, type=int,
                        help="""the frequency to display info""")

    # log file
    parser.add_argument('--log_name', default='densenet121_training.log',
                        help="""name of log file""")
    parser.add_argument('--log_dir', default='./model_1p',
                        help="""log directory""")

    args, unknown_args = parser.parse_known_args()
    print(args, unknown_args)

    # Unknown arguments are reported one by one and then rejected.
    if len(unknown_args) > 0:
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)
        raise ValueError("Invalid command line arg(s)")

    return args
def main():
    """Build the training pipeline from the CLI arguments and run the requested mode.

    The components are created in a fixed order (session, data loader,
    hyper-parameters, layers, logging hook, model, trainer) and the trainer
    method matching ``args.mode`` is then invoked.

    Raises:
        ValueError: if ``args.mode`` is not 'train', 'evaluate' or
            'train_and_evaluate'.
    """
    args = parse_args()
    # Batch size summed over all participating devices.
    args.global_batch_size = args.batch_size * args.rank_size

    session = cs.CreateSession()
    data = dl.DataLoader(args)
    hyper_param = hp.HyperParams(args)
    layers = ly.Layers()
    logger = lg.LogSessionRunHook(args)
    model = ml.Model(args, data, hyper_param, layers, logger)
    trainer = tr.Trainer(session, args, data, model, logger)

    # Map each supported mode onto the trainer entry point that implements it.
    runners = {
        'train': trainer.train,
        'evaluate': trainer.evaluate,
        'train_and_evaluate': trainer.train_and_evaluate,
    }
    run = runners.get(args.mode)
    if run is None:
        raise ValueError("Invalid mode.")
    run()
if __name__ == '__main__':
    # Point the benchmark logger at the directory that contains this script.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")
    # Hard-coded run description that is written to the remark log below.
    initinal_data = {
        "base_lr": 0.128,
        "dataset": "imagenet1024",
        "optimizer": "SGD",
        "loss_scale": 512,
        "batchsize": 32,
    }

    # Emit the remark records in a fixed order: environment first, then the
    # run description, before training starts.
    for _remark_key, _remark_value in (
            (hwlog.CPU_INFO, cpu_info),
            (hwlog.NPU_INFO, npu_info),
            (hwlog.OS_INFO, os_info),
            (hwlog.FRAMEWORK_INFO, framework_info),
            (hwlog.BENCHMARK_VERSION, benchmark_version),
            (hwlog.CONFIG_INFO, config_info),
            (hwlog.BASE_LR, initinal_data["base_lr"]),
            (hwlog.DATASET, initinal_data["dataset"]),
            (hwlog.OPT_NAME, initinal_data["optimizer"]),
            (hwlog.LOSS_SCALE, initinal_data["loss_scale"]),
            (hwlog.INPUT_BATCH_SIZE, initinal_data["batchsize"]),
    ):
        hwlog.remark_print(key=_remark_key, value=_remark_value)

    main()
@@ -0,0 +1,22 @@
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
import re
import os
from operator import itemgetter
def sort_and_load_ckpts(log_dir):
    """Collect the checkpoints found in ``log_dir``, ordered by training step.

    A checkpoint is recognised by its index file, which must be named exactly
    ``model.ckpt-<step>.index``. The whole file name has to match, so stray
    files such as ``model.ckpt-5.index.tmp`` are ignored instead of being
    reported with a truncated, non-existent checkpoint prefix.

    Args:
        log_dir: directory to scan (not recursive).

    Returns:
        A list of dicts sorted by ascending ``step``. Each dict holds:
        ``step`` (int parsed from the file name), ``path`` (the index file
        path with the ``.index`` extension removed) and ``mtime`` (the
        ``st_mtime`` of the index file).
    """
    ckpts = []
    for f in os.listdir(log_dir):
        # Dots are escaped and the full name is matched: the original
        # unanchored pattern also accepted names with trailing junk.
        m = re.fullmatch(r'model\.ckpt-([0-9]+)\.index', f)
        if m is None:
            continue
        fullpath = os.path.join(log_dir, f)
        ckpts.append({'step': int(m.group(1)),
                      'path': os.path.splitext(fullpath)[0],
                      'mtime': os.stat(fullpath).st_mtime,
                      })
    ckpts.sort(key=itemgetter('step'))
    return ckpts
@@ -0,0 +1,128 @@
import tensorflow as tf
import math
import time
import os
import train_helper
from logger import rank0log
from benchmark_log import hwlog
class Trainer(object):
    """Run training and/or evaluation of a model through an ``NPUEstimator``.

    The estimator is built once in ``__init__``. ``train``, ``evaluate`` and
    ``train_and_evaluate`` are the entry points the caller chooses between.

    Values are read from ``args`` as attributes: ``nsteps_per_epoch``,
    ``log_dir``, ``iterations_per_loop``, ``nstep``, ``eval_dir``,
    ``num_training_samples``, ``num_evaluating_samples``, ``batch_size``,
    ``rank_size``, ``max_epochs`` and ``epochs_between_evals``.
    """

    # Header line and row layout of the per-checkpoint evaluation report.
    _REPORT_HEADER = ' step epoch top1 top5 loss checkpoint_time(UTC)'
    _REPORT_ROW = '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'

    def __init__(self, session, args, data, model, logger):
        """Store the collaborators and build the estimator.

        Args:
            session: object exposing ``estimator_config`` (used as the
                estimator's session config).
            args: parsed run arguments (see the class docstring).
            data: object providing ``get_train_input_fn`` and
                ``get_eval_input_fn``.
            model: object providing ``get_estimator_model_func``.
            logger: training hook; its ``logger`` attribute is used for
                text logging.
        """
        self.sess = session
        self.args = args
        self.data = data
        self.model = model
        self.logger = logger
        self.print_logger = self.logger.logger
        self.all_preds = []
        self.all_targets = []
        self.classifier, self.training_hook = self.get_npu_classifier()

    def get_npu_classifier(self):
        """Create the ``NPUEstimator`` and the list of training hooks.

        Returns:
            A ``(classifier, training_hooks)`` tuple; the hook list contains
            ``self.logger`` only.
        """
        # npu_bridge is imported here, inside the method, rather than at
        # module import time.
        from npu_bridge.estimator.npu.npu_config import NPURunConfig
        from npu_bridge.estimator.npu.npu_estimator import NPUEstimator

        run_config = NPURunConfig(
            hcom_parallel=True,
            precision_mode="allow_mix_precision",
            enable_data_pre_proc=True,
            # One checkpoint per epoch's worth of steps.
            save_checkpoints_steps=self.args.nsteps_per_epoch,
            session_config=self.sess.estimator_config,
            model_dir=self.args.log_dir,
            iterations_per_loop=self.args.iterations_per_loop,
            keep_checkpoint_max=5)

        classifier = NPUEstimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config
        )

        training_hooks = []
        training_hooks.append(self.logger)

        return classifier, training_hooks

    @staticmethod
    def _format_ckpt_row(c):
        """Render one evaluated checkpoint record as a report row.

        Accuracies are shown as percentages. The checkpoint time is rendered
        in UTC so that it agrees with the ``checkpoint_time(UTC)`` column
        header (the original code used local time under that header).
        """
        return Trainer._REPORT_ROW.format(
            c['step'],
            c['epoch'],
            c['top1'] * 100,
            c['top5'] * 100,
            c['loss'],
            time=time.strftime('%Y-%m-%d %H:%M:%S',
                               time.gmtime(c['mtime'])))

    def train(self):
        """Train until the estimator's global step reaches ``args.nstep``."""
        print ('training steps: %d' % self.args.nstep)
        self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                              max_steps=self.args.nstep,
                              hooks=self.training_hook
                              )

    def evaluate(self):
        """Evaluate every checkpoint in ``args.eval_dir`` and log a report.

        A ``KeyboardInterrupt`` during evaluation is logged and swallowed.
        """
        rank0log(self.print_logger, "Evaluating")
        rank0log(self.print_logger, "Validation dataset size: {}".format(self.args.num_evaluating_samples))
        time.sleep(5)  # a little extra margin...
        try:
            ckpts = train_helper.sort_and_load_ckpts(self.args.eval_dir)
            print("=========ckpt==========")
            print(ckpts)
            print("=========ckpt==========")
            for c in ckpts:
                eval_result = self.classifier.evaluate(
                    input_fn=lambda: self.data.get_eval_input_fn(),
                    checkpoint_path=c['path'])
                # NOTE(review): unlike train_and_evaluate, this epoch estimate
                # divides by batch_size only (rank_size is not applied) --
                # confirm whether that is intended for multi-device runs.
                c['epoch'] = math.ceil(c['step'] / (self.args.num_training_samples / (self.args.batch_size)))
                c['top1'] = eval_result['val-top1acc']
                c['top5'] = eval_result['val-top5acc']
                c['loss'] = eval_result['loss']
            rank0log(self.print_logger, self._REPORT_HEADER)
            for c in ckpts:
                # Only report checkpoints that were actually evaluated.
                if 'top1' not in c:
                    continue
                rank0log(self.print_logger, self._format_ckpt_row(c))
            rank0log(self.print_logger, "Finished evaluation")
        except KeyboardInterrupt:
            self.print_logger.error("Keyboard interrupt")

    def train_and_evaluate(self):
        """Alternate training and evaluation of the newest checkpoint.

        Runs ``args.max_epochs // args.epochs_between_evals`` cycles. Each
        cycle trains for ``epochs_between_evals`` epochs' worth of steps,
        evaluates the latest checkpoint in ``args.log_dir``, writes the
        accuracies to the benchmark remark log and logs a one-row report.
        """
        epochs_between_evals = self.args.epochs_between_evals

        for _ in range(self.args.max_epochs // epochs_between_evals):

            rank0log(self.print_logger, "Starting a training cycle")

            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                                  steps=self.args.nsteps_per_epoch * epochs_between_evals,
                                  hooks=self.training_hook)

            rank0log(self.print_logger, "Starting to evaluate")
            rank0log(self.print_logger, "Validation dataset size: {}".format(self.args.num_evaluating_samples))
            time.sleep(5)  # a little extra margin...

            ckpts = train_helper.sort_and_load_ckpts(self.args.log_dir)
            c = ckpts[-1]  # newest checkpoint (highest step)
            eval_result = self.classifier.evaluate(
                input_fn=lambda: self.data.get_eval_input_fn(),
                checkpoint_path=c['path'])

            # top1 top5 Log dotting
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=float(eval_result.get("val-top1acc")))
            # The original logged top-5 under the TOP1 key (copy/paste slip).
            # NOTE(review): uses EVAL_ACCURACY_TOP5 when hwlog defines it and
            # falls back to the previous key otherwise -- confirm hwlog
            # exposes EVAL_ACCURACY_TOP5.
            top5_key = getattr(hwlog, 'EVAL_ACCURACY_TOP5', hwlog.EVAL_ACCURACY_TOP1)
            hwlog.remark_print(key=top5_key, value=float(eval_result.get("val-top5acc")))

            c['epoch'] = math.ceil(c['step'] / (self.args.num_training_samples / (self.args.batch_size * self.args.rank_size)))
            c['top1'] = eval_result['val-top1acc']
            c['top5'] = eval_result['val-top5acc']
            c['loss'] = eval_result['loss']

            rank0log(self.print_logger, self._REPORT_HEADER)
            rank0log(self.print_logger, self._format_ckpt_row(c))
@@ -0,0 +1,23 @@
{
"group_count": "1",
"group_list": [
{
"group_name": "worker",
"device_count": "1",
"instance_count": "1",
"instance_list": [
{
"devices": [
{
"device_id": "7",
"device_ip": "192.168.193.103"
}
],
"pod_name": "npu1p",
"server_id": "127.0.0.1"
}
]
}
],
"status": "completed"
}
@@ -0,0 +1,51 @@
{
"group_count": "1",
"group_list": [
{
"group_name": "worker",
"device_count": "8",
"instance_count": "1",
"instance_list": [
{
"devices": [
{
"device_id": "0",
"device_ip": "192.168.190.102"
},
{
"device_id": "1",
"device_ip": "192.168.191.102"
},
{
"device_id": "2",
"device_ip": "192.168.192.102"
},
{
"device_id": "3",
"device_ip": "192.168.193.102"
},
{
"device_id": "4",
"device_ip": "192.168.190.103"
},
{
"device_id": "5",
"device_ip": "192.168.191.103"
},
{
"device_id": "6",
"device_ip": "192.168.192.103"
},
{
"device_id": "7",
"device_ip": "192.168.193.103"
}
],
"pod_name": "npu8p",
"server_id": "127.0.0.1"
}
]
}
],
"status": "completed"
}
@@ -0,0 +1,9 @@
{
"server_count": "1",
"server_list": [{
"device": [{devices}],
"server_id": "127.0.0.1"
}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,36 @@
#!/bin/bash
# Environment setup for NPU training runs (train.sh sources this file).
# Clears the host-0 slog directory, then exports library/python/tool paths for
# whichever Ascend software layout is installed, plus a few job-level settings.

# Remove everything under the host-0 slog directory before the run.
rm -rf /var/log/npu/slog/host-0/*
# Paths for a toolkit installation (commented out, kept for reference)
#export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
# Paths for an nnae (and related packages) installation (commented out, kept for reference)
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
#export PYTHONPATH=/home/train/resnet50_tf/code:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/opp/op_impl/built-in/ai_core/tbe/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/python/site-packages/te/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/python/site-packages/topi/:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/python/site-packages/hccl/:/usr/local/Ascend/tfplugin/latest/x86_64-linux_gcc7.3.0/tfplugin/python/site-packages/:/usr/local/Ascend/tfplugin/latest/x86_64-linux_gcc7.3.0/tfplugin/python/site-packages/npu_bridge:/code
#export PATH=$PATH:/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/fwkacllib/ccec_compiler/bin/
#export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/x86_64-linux_gcc7.3.0/opp/
# Use the nnae layout when /usr/local/Ascend/nnae/latest exists; otherwise
# fall back to the ascend-toolkit layout.
if [ -d /usr/local/Ascend/nnae/latest ];then
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
# NOTE(review): $projectDir is not set in this file and is only appended in
# this branch -- presumably provided by the sourcing script; confirm.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
# Job-level settings exported for every run.
export DDK_VERSION_FLAG=1.60.T17.B830
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export SLOG_PRINT_TO_STDOUT=0
@@ -0,0 +1,70 @@
#!/bin/bash
# Launcher for DenseNet121 TensorFlow training.
# Positional arguments:
#   $1  rank_size   number of devices taking part in the run
#   $2  yamlPath    path of the yaml configuration file
#   $3  toolsPath   directory containing get_params_for_yaml.sh
#   $4  CLUSTER     read only inside docker (/.dockerenv present)
#   $5  list of node IPs (MPIRUN_ALL_IP), read only inside docker
# Creates a timestamped job directory under train/result/tf_densenet121/ and
# starts scripts/train.sh either through mpirun (cluster) or once per device.
rank_size=$1
yamlPath=$2
toolsPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
model_name=$(cd $currentDir/..;basename `pwd`)
# CLUSTER and MPIRUN_ALL_IP are taken from the command line only when running
# inside a docker container.
if [ -f /.dockerenv ];then
CLUSTER=$4
MPIRUN_ALL_IP="$5"
export CLUSTER=${CLUSTER}
fi
# Load the configuration from the yaml file: the helper's output is eval'd so
# the settings become shell variables of this script.
# NOTE(review): the $? tested below is the status of the eval'd commands, not
# necessarily the helper script's own exit status -- confirm that a failing
# helper is actually detected.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
if [ $? -eq 0 ] ;
then
echo "modify inner config file success"
else
echo "modify inner config file fail"
exit 1
fi
# Create the directory for this training job (named after the current time)
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
train_job_dir=${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
# Device list: use device_group_<rank_size>p when it is set; if it is empty,
# or rank_size is 8 or more, select devices 0..rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi
# Take the first character of the space-stripped device list as the first
# device id; the hw log of that device's directory is linked into the job dir.
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`
if [ x"${CLUSTER}" == x"True" ];then
# Cluster run: link the hw log of directory 0 into the job directory
ln -snf ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/0/hw_densenet121.log ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
this_ip=$(hostname -I |awk '{print $1}')
# Copy the yaml and json files to every other node at the same absolute path.
# NOTE(review): jsonFilePath and mpirun_ip are not assigned in this script --
# presumably they come from the eval'd yaml settings; confirm.
for ip in $MPIRUN_ALL_IP;do
if [ x"$ip" != x"$this_ip" ];then
scp $yamlPath root@$ip:$yamlPath
scp ${jsonFilePath} root@$ip:${jsonFilePath}
fi
done
export PATH=$PATH:/usr/local/mpirun4.0/bin
mpirun -H ${mpirun_ip} \
--bind-to none -map-by slot\
--allow-run-as-root \
--mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
--prefix /usr/local/mpirun4.0/ \
${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
# Single-server run: link the hw log of the first device into the job directory
ln -snf ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/${first_device_id}/hw_densenet121.log ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
# Start one background train.sh per device, numbering ranks from 0.
rank_id=0
for device_id in $device_group;do
${currentDir}/scripts/train.sh $device_id $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
let rank_id++
done
fi
# Wait for all background training processes to finish.
wait
#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train exit " >> ${currentDir}/result/main.log
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# Per-device DenseNet121 training script (started by the launcher script).
# Positional arguments:
#   $1  device_id   device to train on
#   $2  rank_size   number of devices taking part in the run
#   $3  yamlPath    path of the yaml configuration file
#   $4  currtime    timestamp naming the job directory
#   $5  toolsPath   directory containing get_params_for_yaml.sh
#   $6  "True" for a cluster run; otherwise the rank id of this process
device_id=$1
rank_size=$2
yamlPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
model_name="densenet121"
currtime=$4
toolsPath=$5
export YAML_PATH=$3
mkdir -p ${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/tf_densenet121/training_job_${currtime}/
# Load the configuration from the yaml file (the helper's output is eval'd
# into shell variables used by the python invocations below).
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
export REMARK_LOG_FILE=hw_densenet121.log # remark-log file name; must be hw_ followed by the lowercase model name
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
source ${currentDir}/config/npu_set_env.sh
# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# NOTE(review): RANK_INDEX is not assigned in this script; if it is unset in
# the environment the arithmetic treats it as 0 -- confirm where it comes from.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}
cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}
if [ x"$6" != x"True" ];then
# Non-cluster run: $6 is the rank id handed over by the launcher.
rank_id=$6
export RANK_ID=$6
else
# Cluster run: obtain rank and device id by running atlasboost and parsing
# what it prints (the rank is the last word of the output; the device id is
# the text between "deviceid = " and " phyid=").
device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
device_id_mo=`echo $device_id_mo`
rank_id=${device_id_mo##* }
export RANK_ID=${rank_id}
device=${device_id_mo##*deviceid = }
device_id=${device%% phyid=*}
export DEVICE_ID=${device_id}
# Copy the json found in the job directory over the rank table for this rank_size.
hccljson=${train_job_dir}/*.json
cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi
# Create the per-device working directory and run from inside it
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}
startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`
# Choose the invocation arguments for single-device vs multi-device runs
if [ x"$6" == x"True" ];then
# multiple devices, multiple servers (cluster)
export CLUSTER=True
python3.7 ${currentDir}/code/train.py --rank_size=${rank_size} --mode=${mode_8p} --max_epochs=${epoches} --iterations_per_loop=${iterations_per_loop_8p} --epochs_between_evals=${epochs_between_evals} --data_dir=${data_url} --lr=${lr} --log_dir=${log_dir} --log_name=${log_name_8p} > ${train_job_dir}/train_${device_id}.log 2>&1
elif [ x"${rank_size}" == x"1" ];then
# single device
python3.7 ${currentDir}/code/train.py --rank_size=${rank_size} --mode=${mode_1p} --max_train_steps=${max_train_steps_1p} --iterations_per_loop=${iterations_per_loop_1p} --data_dir=${data_url} --display_every=${display_every} --log_dir=${log_dir} --log_name=${log_name_1p} > ${train_job_dir}/train_${device_id}.log 2>&1
elif [ ${rank_size} -le 8 ];then
# multiple devices, single server
python3.7 ${currentDir}/code/train.py --rank_size=${rank_size} --mode=${mode_8p} --max_epochs=${epoches} --iterations_per_loop=${iterations_per_loop_8p} --epochs_between_evals=${epochs_between_evals} --data_dir=${data_url} --lr=${lr} --log_dir=${log_dir} --log_name=${log_name_8p} > ${train_job_dir}/train_${device_id}.log 2>&1
fi
# Report the outcome to stdout, the training log and the hw log.
# NOTE(review): when none of the branches above runs (non-cluster with
# rank_size > 8), $? is 0 and "train success" is reported without any
# training having been started -- confirm this case cannot occur.
if [ $? -eq 0 ];then
echo ":::ABK 1.0.0 densenet121 train success"
echo ":::ABK 1.0.0 densenet121 train success" >> ${train_job_dir}/train_${device_id}.log
echo ":::ABK 1.0.0 densenet121 train success" >> ./hw_densenet121.log
else
echo ":::ABK 1.0.0 densenet121 train failed"
echo ":::ABK 1.0.0 densenet121 train failed" >> ${train_job_dir}/train_${device_id}.log
echo ":::ABK 1.0.0 densenet121 train failed" >> ./hw_densenet121.log
fi
# Compute the elapsed wall-clock time and record it as hours:minutes:seconds.
endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
sumTime=$[ $endTime_s - $startTime_s ]
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 densenet121 train total time ${hour}:${min}:${sec}"
echo ":::ABK 1.0.0 densenet121 train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_densenet121.log
@@ -0,0 +1,25 @@
# EfficientNet_pytorch训练说明
### 1. 模型训练参数配置
在train/yaml/EfficientNet.yaml中修改相应配置, 配置项含义:
```
pytorch_config:
data_url: 数据集路径
epoches: 跑多少个epoch
batch_size: 1p 参数为256 2p 512 4p 1024 8p为2048
seed: 49
lr: 默认参数1p 0.2 2p 0.4 4p 0.8 8p 1.6
docker_image: docker 镜像名称:版本号
```
------
@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@@ -0,0 +1,253 @@
# EfficientNet PyTorch
### Quickstart
Install with `pip install efficientnet_pytorch` and load a pretrained EfficientNet with:
```python
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b0')
```
### Updates
#### Update (May 14, 2020)
This update adds comprehensive comments and documentation (thanks to @workingcoder).
#### Update (January 23, 2020)
This update adds a new category of pre-trained model based on adversarial training, called _advprop_. It is important to note that the preprocessing required for the advprop pretrained models is slightly different from normal ImageNet preprocessing. As a result, by default, advprop models are not used. To load a model with advprop, use:
```
model = EfficientNet.from_pretrained("efficientnet-b0", advprop=True)
```
There is also a new, large `efficientnet-b8` pretrained model that is only available in advprop form. When using these models, replace ImageNet preprocessing code as follows:
```
if advprop: # for models using advprop pretrained weights
normalize = transforms.Lambda(lambda img: img * 2.0 - 1.0)
else:
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
```
This update also addresses multiple other issues ([#115](https://github.com/lukemelas/EfficientNet-PyTorch/issues/115), [#128](https://github.com/lukemelas/EfficientNet-PyTorch/issues/128)).
#### Update (October 15, 2019)
This update allows you to choose whether to use a memory-efficient Swish activation. The memory-efficient version is chosen by default, but it cannot be used when exporting using PyTorch JIT. For this purpose, we have also included a standard (export-friendly) swish activation function. To switch to the export-friendly version, simply call `model.set_swish(memory_efficient=False)` after loading your desired model. This update addresses issues [#88](https://github.com/lukemelas/EfficientNet-PyTorch/pull/88) and [#89](https://github.com/lukemelas/EfficientNet-PyTorch/pull/89).
#### Update (October 12, 2019)
This update makes the Swish activation function more memory-efficient. It also addresses pull requests [#72](https://github.com/lukemelas/EfficientNet-PyTorch/pull/72), [#73](https://github.com/lukemelas/EfficientNet-PyTorch/pull/73), [#85](https://github.com/lukemelas/EfficientNet-PyTorch/pull/85), and [#86](https://github.com/lukemelas/EfficientNet-PyTorch/pull/86). Thanks to the authors of all the pull requests!
#### Update (July 31, 2019)
_Upgrade the pip package with_ `pip install --upgrade efficientnet-pytorch`
The B6 and B7 models are now available. Additionally, _all_ pretrained models have been updated to use AutoAugment preprocessing, which translates to better performance across the board. Usage is the same as before:
```python
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b7')
```
#### Update (June 29, 2019)
This update adds easy model exporting ([#20](https://github.com/lukemelas/EfficientNet-PyTorch/issues/20)) and feature extraction ([#38](https://github.com/lukemelas/EfficientNet-PyTorch/issues/38)).
* [Example: Export to ONNX](#example-export)
* [Example: Extract features](#example-feature-extraction)
* Also: fixed a CUDA/CPU bug ([#32](https://github.com/lukemelas/EfficientNet-PyTorch/issues/32))
It is also now incredibly simple to load a pretrained model with a new number of classes for transfer learning:
```python
model = EfficientNet.from_pretrained('efficientnet-b1', num_classes=23)
```
#### Update (June 23, 2019)
The B4 and B5 models are now available. Their usage is identical to the other models:
```python
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b4')
```
### Overview
This repository contains an op-for-op PyTorch reimplementation of [EfficientNet](https://arxiv.org/abs/1905.11946), along with pre-trained models and examples.
The goal of this implementation is to be simple, highly extensible, and easy to integrate into your own projects. This implementation is a work in progress -- new features are currently being implemented.
At the moment, you can easily:
* Load pretrained EfficientNet models
* Use EfficientNet models for classification or feature extraction
* Evaluate EfficientNet models on ImageNet or your own images
_Upcoming features_: In the next few days, you will be able to:
* Train new models from scratch on ImageNet with a simple command
* Quickly finetune an EfficientNet on your own dataset
* Export EfficientNet models for production
### Table of contents
1. [About EfficientNet](#about-efficientnet)
2. [About EfficientNet-PyTorch](#about-efficientnet-pytorch)
3. [Installation](#installation)
4. [Usage](#usage)
* [Load pretrained models](#loading-pretrained-models)
* [Example: Classify](#example-classification)
* [Example: Extract features](#example-feature-extraction)
* [Example: Export to ONNX](#example-export)
5. [Contributing](#contributing)
### About EfficientNet
If you're new to EfficientNets, here is an explanation straight from the official TensorFlow implementation:
EfficientNets are a family of image classification models, which achieve state-of-the-art accuracy, yet being an order-of-magnitude smaller and faster than previous models. We develop EfficientNets based on AutoML and Compound Scaling. In particular, we first use [AutoML Mobile framework](https://ai.googleblog.com/2018/08/mnasnet-towards-automating-design-of.html) to develop a mobile-size baseline network, named as EfficientNet-B0; Then, we use the compound scaling method to scale up this baseline to obtain EfficientNet-B1 to B7.
<table border="0">
<tr>
<td>
<img src="https://raw.githubusercontent.com/tensorflow/tpu/master/models/official/efficientnet/g3doc/params.png" width="100%" />
</td>
<td>
<img src="https://raw.githubusercontent.com/tensorflow/tpu/master/models/official/efficientnet/g3doc/flops.png" width="90%" />
</td>
</tr>
</table>
EfficientNets achieve state-of-the-art accuracy on ImageNet with an order of magnitude better efficiency:
* In high-accuracy regime, our EfficientNet-B7 achieves state-of-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet with 66M parameters and 37B FLOPS, being 8.4x smaller and 6.1x faster on CPU inference than previous best [Gpipe](https://arxiv.org/abs/1811.06965).
* In middle-accuracy regime, our EfficientNet-B1 is 7.6x smaller and 5.7x faster on CPU inference than [ResNet-152](https://arxiv.org/abs/1512.03385), with similar ImageNet accuracy.
* Compared with the widely used [ResNet-50](https://arxiv.org/abs/1512.03385), our EfficientNet-B4 improves the top-1 accuracy from 76.3% of ResNet-50 to 82.6% (+6.3%), under similar FLOPS constraint.
### About EfficientNet PyTorch
EfficientNet PyTorch is a PyTorch re-implementation of EfficientNet. It is consistent with the [original TensorFlow implementation](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet), such that it is easy to load weights from a TensorFlow checkpoint. At the same time, we aim to make our PyTorch implementation as simple, flexible, and extensible as possible.
If you have any feature requests or questions, feel free to leave them as GitHub issues!
### Installation
Install via pip:
```bash
pip install efficientnet_pytorch
```
Or install from source:
```bash
git clone https://github.com/lukemelas/EfficientNet-PyTorch
cd EfficientNet-PyTorch
pip install -e .
```
### Usage
#### Loading pretrained models
Load an EfficientNet:
```python
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_name('efficientnet-b0')
```
Load a pretrained EfficientNet:
```python
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b0')
```
Note that pretrained models have been released for `N=0,1,2,3,4,5,6,7`, so `.from_pretrained` supports `'efficientnet-b{N}'` for `N=0,1,2,3,4,5,6,7`.
Details about the models are below:
| *Name* |*# Params*|*Top-1 Acc.*|*Pretrained?*|
|:-----------------:|:--------:|:----------:|:-----------:|
| `efficientnet-b0` | 5.3M | 76.3 | ✓ |
| `efficientnet-b1` | 7.8M | 78.8 | ✓ |
| `efficientnet-b2` | 9.2M | 79.8 | ✓ |
| `efficientnet-b3` | 12M | 81.1 | ✓ |
| `efficientnet-b4` | 19M | 82.6 | ✓ |
| `efficientnet-b5` | 30M | 83.3 | ✓ |
| `efficientnet-b6` | 43M | 84.0 | ✓ |
| `efficientnet-b7` | 66M | 84.4 | ✓ |
#### Example: Classification
Below is a simple, complete example. It may also be found as a jupyter notebook in `examples/simple` or as a [Colab Notebook](https://colab.research.google.com/drive/1Jw28xZ1NJq4Cja4jLe6tJ6_F5lCzElb4).
We assume that in your current directory, there is a `img.jpg` file and a `labels_map.txt` file (ImageNet class names). These are both included in `examples/simple`.
```python
import json
from PIL import Image
import torch
from torchvision import transforms
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b0')
# Preprocess image
tfms = transforms.Compose([transforms.Resize(224), transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])
img = tfms(Image.open('img.jpg')).unsqueeze(0)
print(img.shape) # torch.Size([1, 3, 224, 224])
# Load ImageNet class names
labels_map = json.load(open('labels_map.txt'))
labels_map = [labels_map[str(i)] for i in range(1000)]
# Classify
model.eval()
with torch.no_grad():
outputs = model(img)
# Print predictions
print('-----')
for idx in torch.topk(outputs, k=5).indices.squeeze(0).tolist():
prob = torch.softmax(outputs, dim=1)[0, idx].item()
print('{label:<75} ({p:.2f}%)'.format(label=labels_map[idx], p=prob*100))
```
#### Example: Feature Extraction
You can easily extract features with `model.extract_features`:
```python
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b0')
# ... image preprocessing as in the classification example ...
print(img.shape) # torch.Size([1, 3, 224, 224])
features = model.extract_features(img)
print(features.shape) # torch.Size([1, 1280, 7, 7])
```
#### Example: Export to ONNX
Exporting to ONNX for deploying to production is now simple:
```python
import torch
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b1')
dummy_input = torch.randn(10, 3, 240, 240)
torch.onnx.export(model, dummy_input, "test-b1.onnx", verbose=True)
```
[Here](https://colab.research.google.com/drive/1rOAEXeXHaA8uo3aG2YcFDHItlRJMV0VP) is a Colab example.
#### ImageNet
See `examples/imagenet` for details about evaluating on ImageNet.
### Contributing
If you find a bug, create a GitHub issue, or even better, submit a pull request. Similarly, if you have questions, simply post them as GitHub issues.
I look forward to seeing what the community does with these models!
@@ -0,0 +1,45 @@
# EfficientNet PyTorch
## About EfficientNet
If you're new to EfficientNets, here is an explanation straight from the official TensorFlow implementation:
EfficientNets are a family of image classification models, which achieve state-of-the-art accuracy, yet being an order-of-magnitude smaller and faster than previous models. We develop EfficientNets based on AutoML and Compound Scaling. In particular, we first use [AutoML Mobile framework](https://ai.googleblog.com/2018/08/mnasnet-towards-automating-design-of.html) to develop a mobile-size baseline network, named as EfficientNet-B0; Then, we use the compound scaling method to scale up this baseline to obtain EfficientNet-B1 to B7.
EfficientNets achieve state-of-the-art accuracy on ImageNet with an order of magnitude better efficiency:
* In high-accuracy regime, our EfficientNet-B7 achieves state-of-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet with 66M parameters and 37B FLOPS, being 8.4x smaller and 6.1x faster on CPU inference than previous best [Gpipe](https://arxiv.org/abs/1811.06965).
* In middle-accuracy regime, our EfficientNet-B1 is 7.6x smaller and 5.7x faster on CPU inference than [ResNet-152](https://arxiv.org/abs/1512.03385), with similar ImageNet accuracy.
* Compared with the widely used [ResNet-50](https://arxiv.org/abs/1512.03385), our EfficientNet-B4 improves the top-1 accuracy from 76.3% of ResNet-50 to 82.6% (+6.3%), under similar FLOPS constraint.
## About EfficientNet PyTorch NPU
The source code is based on the open-source project https://github.com/lukemelas/EfficientNet-PyTorch, with as few modifications as possible.
## Quick Start
### Train on 1 NPU:
(1) Modify the last line of npu_1p.sh with the desired parameters:
* fp32: taskset -c 0-64 python3.7 examples/imagenet/main.py --data=/data/imagenet --arch=efficientnet-b0 --batch-size=256 --lr=0.2 --epochs=200 --autoaug --npu=0
* O1: taskset -c 0-64 python3.7 examples/imagenet/main.py --data=/data/imagenet --arch=efficientnet-b0 --batch-size=256 --lr=0.2 --epochs=200 --autoaug --npu=0 --amp --pm=O1 --loss_scale=1024
* O2: taskset -c 0-64 python3.7 examples/imagenet/main.py --data=/data/imagenet --arch=efficientnet-b0 --batch-size=256 --lr=0.2 --epochs=200 --autoaug --npu=0 --amp --pm=O2 --loss_scale=128
(2) Execute run.sh. All training logs will be recorded in nohup.out.
## Known issues:
* Distributed training is NOT available.
* Top-1/top-5 accuracy is about 2% lower than on GPU with the same settings (dropout).
* O2 performance is about 50 fps lower than on GPU with the same settings (dropout, depthwiseconv2d).
* torch.rand is replaced with a numpy implementation due to the lack of an AICPU operator (aicpu).
* momentum has to be set to 0 due to logsoftmax precision (logsoftmax).
@@ -0,0 +1,12 @@
__version__ = "0.7.0"
from .model import EfficientNet
from .utils import (
GlobalParams,
BlockArgs,
BlockDecoder,
efficientnet,
get_model_params,
)
from .auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform
from .rmsprop_tf import RMSpropTF
@@ -0,0 +1,817 @@
""" AutoAugment, RandAugment, and AugMix for PyTorch
This code implements the searched ImageNet policies with various tweaks and improvements and
does not include any of the search code.
AA and RA Implementation adapted from:
https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py
AugMix adapted from:
https://github.com/google-research/augmix
Papers:
AutoAugment: Learning Augmentation Policies from Data - https://arxiv.org/abs/1805.09501
Learning Data Augmentation Strategies for Object Detection - https://arxiv.org/abs/1906.11172
RandAugment: Practical automated data augmentation... - https://arxiv.org/abs/1909.13719
AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781
Hacked together by Ross Wightman
"""
import random
import math
import re
from PIL import Image, ImageOps, ImageEnhance, ImageChops
import PIL
import numpy as np
# First two components of the installed PIL version as an int tuple; compared
# against (5, 0) / (5, 2) below to gate version-dependent keyword arguments.
_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]])
# Default fill colour, used as `fillcolor` when hparams carries no 'img_mean'.
_FILL = (128, 128, 128)
# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.
# Hyper-parameters used whenever a caller supplies none (or an empty dict).
_HPARAMS_DEFAULT = dict(
translate_const=250,
img_mean=_FILL,
)
# Default `resample` candidates; one is drawn at random each time an op runs.
_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
def _interpolation(kwargs):
    """Pop ``'resample'`` from *kwargs* and resolve it to a single value.

    A list or tuple is treated as a set of candidates and one element is
    drawn at random; any other value is returned unchanged. When the key is
    absent, ``Image.BILINEAR`` is used.
    """
    candidate = kwargs.pop('resample', Image.BILINEAR)
    return random.choice(candidate) if isinstance(candidate, (list, tuple)) else candidate
def _check_args_tf(kwargs):
    """Adjust transform *kwargs* in place before they are passed to PIL.

    ``fillcolor`` is removed when the installed PIL is older than 5.0, and
    ``resample`` is replaced by a single concrete interpolation value.
    """
    if _PIL_VER < (5, 0):
        kwargs.pop('fillcolor', None)
    kwargs['resample'] = _interpolation(kwargs)
def shear_x(img, factor, **kwargs):
    """Apply an affine shear to *img* with *factor* as the x-from-y coefficient."""
    _check_args_tf(kwargs)
    affine = (1, factor, 0, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)
def shear_y(img, factor, **kwargs):
    """Apply an affine shear to *img* with *factor* as the y-from-x coefficient."""
    _check_args_tf(kwargs)
    affine = (1, 0, 0, factor, 1, 0)
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)
def translate_x_rel(img, pct, **kwargs):
    """Translate *img* along x by *pct* times ``img.size[0]``."""
    offset = pct * img.size[0]
    _check_args_tf(kwargs)
    affine = (1, 0, offset, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)
def translate_y_rel(img, pct, **kwargs):
    """Translate *img* along y by *pct* times ``img.size[1]``."""
    offset = pct * img.size[1]
    _check_args_tf(kwargs)
    affine = (1, 0, 0, 0, 1, offset)
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)
def translate_x_abs(img, pixels, **kwargs):
    """Translate *img* along x using *pixels* as the affine x offset."""
    _check_args_tf(kwargs)
    affine = (1, 0, pixels, 0, 1, 0)
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)
def translate_y_abs(img, pixels, **kwargs):
    """Translate *img* along y using *pixels* as the affine y offset."""
    _check_args_tf(kwargs)
    affine = (1, 0, 0, 0, 1, pixels)
    return img.transform(img.size, Image.AFFINE, affine, **kwargs)
def rotate(img, degrees, **kwargs):
    """Rotate *img* by *degrees*, picking an implementation by PIL version.

    PIL >= 5.2 forwards every kwarg to ``img.rotate``. PIL 5.0 / 5.1 builds
    the affine matrix by hand and goes through ``img.transform``. Older
    versions call ``img.rotate`` with only the ``resample`` kwarg.
    """
    _check_args_tf(kwargs)
    if _PIL_VER >= (5, 2):
        return img.rotate(degrees, **kwargs)
    if _PIL_VER < (5, 0):
        return img.rotate(degrees, resample=kwargs['resample'])

    # PIL 5.0 / 5.1: rotation about the image centre via an explicit matrix.
    width, height = img.size
    centre_x, centre_y = width / 2.0, height / 2.0
    theta = -math.radians(degrees)
    cos_t = round(math.cos(theta), 15)
    sin_t = round(math.sin(theta), 15)
    neg_sin_t = round(-math.sin(theta), 15)
    # Apply the linear part to the negated centre, then add the centre back,
    # so that the centre point maps onto itself.
    shift_x = cos_t * -centre_x + sin_t * -centre_y + 0.0 + centre_x
    shift_y = neg_sin_t * -centre_x + cos_t * -centre_y + 0.0 + centre_y
    matrix = [cos_t, sin_t, shift_x, neg_sin_t, cos_t, shift_y]
    return img.transform(img.size, Image.AFFINE, matrix, **kwargs)
def auto_contrast(img, **__):
    """Return ``ImageOps.autocontrast(img)``; extra keyword args are ignored."""
    result = ImageOps.autocontrast(img)
    return result
def invert(img, **__):
    """Return ``ImageOps.invert(img)``; extra keyword args are ignored."""
    result = ImageOps.invert(img)
    return result
def equalize(img, **__):
    """Return ``ImageOps.equalize(img)``; extra keyword args are ignored."""
    result = ImageOps.equalize(img)
    return result
def solarize(img, thresh, **__):
    """Return ``ImageOps.solarize(img, thresh)``; extra keyword args are ignored."""
    result = ImageOps.solarize(img, thresh)
    return result
def solarize_add(img, add, thresh=128, **__):
    """Add *add* to every pixel value below *thresh*, saturating at 255.

    Only images whose mode is ``"L"`` or ``"RGB"`` are processed; any other
    mode is returned unchanged. Extra keyword args are ignored.
    """
    # 256-entry lookup table: values under the threshold are brightened.
    table = [min(255, value + add) if value < thresh else value for value in range(256)]
    if img.mode not in ("L", "RGB"):
        return img
    if img.mode == "RGB":
        # The table is repeated three times for RGB images.
        table = table * 3
    return img.point(table)
def posterize(img, bits_to_keep, **__):
    """Posterize *img* to *bits_to_keep* bits; return it unchanged for 8 or more."""
    if bits_to_keep < 8:
        return ImageOps.posterize(img, bits_to_keep)
    return img
def contrast(img, factor, **__):
    """Return *img* enhanced by ``ImageEnhance.Contrast`` with *factor*."""
    enhancer = ImageEnhance.Contrast(img)
    return enhancer.enhance(factor)
def color(img, factor, **__):
    """Return *img* enhanced by ``ImageEnhance.Color`` with *factor*."""
    enhancer = ImageEnhance.Color(img)
    return enhancer.enhance(factor)
def brightness(img, factor, **__):
    """Return *img* enhanced by ``ImageEnhance.Brightness`` with *factor*."""
    enhancer = ImageEnhance.Brightness(img)
    return enhancer.enhance(factor)
def sharpness(img, factor, **__):
    """Return *img* enhanced by ``ImageEnhance.Sharpness`` with *factor*."""
    enhancer = ImageEnhance.Sharpness(img)
    return enhancer.enhance(factor)
def _randomly_negate(v):
"""With 50% prob, negate the value"""
return -v if random.random() > 0.5 else v
def _rotate_level_to_arg(level, _hparams):
    """Scale *level* to a randomly signed angle of at most 30 at ``_MAX_LEVEL``.

    Returns a 1-tuple so the result can be splatted into the op call.
    """
    # range [-30, 30]
    angle = _randomly_negate((level / _MAX_LEVEL) * 30.)
    return (angle,)
def _enhance_level_to_arg(level, _hparams):
    """Scale *level* linearly to an enhancement factor, returned as a 1-tuple."""
    # range [0.1, 1.9]
    factor = (level / _MAX_LEVEL) * 1.8 + 0.1
    return (factor,)
def _enhance_increasing_level_to_arg(level, _hparams):
    """Map *level* to an enhancement factor that moves away from 1.0 as it grows.

    1.0 is the 'no change' factor; the offset from it has a random sign.
    Returns a 1-tuple.
    """
    # range [0.1, 1.9]
    offset = _randomly_negate((level / _MAX_LEVEL) * .9)
    return (1.0 + offset,)
def _shear_level_to_arg(level, _hparams):
    """Scale *level* to a randomly signed shear factor, returned as a 1-tuple."""
    # range [-0.3, 0.3]
    shear = _randomly_negate((level / _MAX_LEVEL) * 0.3)
    return (shear,)
def _translate_abs_level_to_arg(level, hparams):
    """Scale *level* by ``hparams['translate_const']`` with a random sign.

    Returns a 1-tuple. Raises ``KeyError`` if the hparam is missing.
    """
    max_shift = float(hparams['translate_const'])
    shift = _randomly_negate((level / _MAX_LEVEL) * max_shift)
    return (shift,)
def _translate_rel_level_to_arg(level, hparams):
    """Scale *level* by ``hparams['translate_pct']`` (default 0.45), random sign.

    Returns a 1-tuple.
    """
    # default range [-0.45, 0.45]
    max_pct = hparams.get('translate_pct', 0.45)
    pct = _randomly_negate((level / _MAX_LEVEL) * max_pct)
    return (pct,)
def _posterize_level_to_arg(level, _hparams):
    """Map *level* to a truncated bit count for posterize, as a 1-tuple."""
    # As per Tensorflow TPU EfficientNet impl
    # range [0, 4], 'keep 0 up to 4 MSB of original image'
    # intensity/severity of augmentation decreases with level
    bits = int((level / _MAX_LEVEL) * 4)
    return (bits,)
def _posterize_increasing_level_to_arg(level, hparams):
    """Return 4 minus the ``_posterize_level_to_arg`` bit count, as a 1-tuple."""
    # As per Tensorflow models research and UDA impl
    # range [4, 0], 'keep 4 down to 0 MSB of original image',
    # intensity/severity of augmentation increases with level
    base_bits = _posterize_level_to_arg(level, hparams)[0]
    return (4 - base_bits,)
def _posterize_original_level_to_arg(level, _hparams):
    """Map *level* to a truncated bit count offset by 4, as a 1-tuple."""
    # As per original AutoAugment paper description
    # range [4, 8], 'keep 4 up to 8 MSB of image'
    # intensity/severity of augmentation decreases with level
    bits = int((level / _MAX_LEVEL) * 4) + 4
    return (bits,)
def _solarize_level_to_arg(level, _hparams):
    """Map *level* to a truncated solarize threshold, as a 1-tuple."""
    # range [0, 256]
    # intensity/severity of augmentation decreases with level
    threshold = int((level / _MAX_LEVEL) * 256)
    return (threshold,)
def _solarize_increasing_level_to_arg(level, _hparams):
    """Return 256 minus the ``_solarize_level_to_arg`` threshold, as a 1-tuple."""
    # range [0, 256]
    # intensity/severity of augmentation increases with level
    base_threshold = _solarize_level_to_arg(level, _hparams)[0]
    return (256 - base_threshold,)
def _solarize_add_level_to_arg(level, _hparams):
    """Map *level* to the truncated amount added by ``solarize_add``, as a 1-tuple."""
    # range [0, 110]
    amount = int((level / _MAX_LEVEL) * 110)
    return (amount,)
# Maps each op name to the function that converts a magnitude level into the
# op's positional-argument tuple. None means the op is called with no level
# argument (see AugmentOp.__call__).
LEVEL_TO_ARG = {
'AutoContrast': None,
'Equalize': None,
'Invert': None,
'Rotate': _rotate_level_to_arg,
# There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers
'Posterize': _posterize_level_to_arg,
'PosterizeIncreasing': _posterize_increasing_level_to_arg,
'PosterizeOriginal': _posterize_original_level_to_arg,
'Solarize': _solarize_level_to_arg,
'SolarizeIncreasing': _solarize_increasing_level_to_arg,
'SolarizeAdd': _solarize_add_level_to_arg,
'Color': _enhance_level_to_arg,
'ColorIncreasing': _enhance_increasing_level_to_arg,
'Contrast': _enhance_level_to_arg,
'ContrastIncreasing': _enhance_increasing_level_to_arg,
'Brightness': _enhance_level_to_arg,
'BrightnessIncreasing': _enhance_increasing_level_to_arg,
'Sharpness': _enhance_level_to_arg,
'SharpnessIncreasing': _enhance_increasing_level_to_arg,
'ShearX': _shear_level_to_arg,
'ShearY': _shear_level_to_arg,
'TranslateX': _translate_abs_level_to_arg,
'TranslateY': _translate_abs_level_to_arg,
'TranslateXRel': _translate_rel_level_to_arg,
'TranslateYRel': _translate_rel_level_to_arg,
}
# Maps each op name to the function that applies it to an image. The
# '*Increasing' and '*Original' names reuse the base op's function and differ
# only in their LEVEL_TO_ARG entry.
NAME_TO_OP = {
'AutoContrast': auto_contrast,
'Equalize': equalize,
'Invert': invert,
'Rotate': rotate,
'Posterize': posterize,
'PosterizeIncreasing': posterize,
'PosterizeOriginal': posterize,
'Solarize': solarize,
'SolarizeIncreasing': solarize,
'SolarizeAdd': solarize_add,
'Color': color,
'ColorIncreasing': color,
'Contrast': contrast,
'ContrastIncreasing': contrast,
'Brightness': brightness,
'BrightnessIncreasing': brightness,
'Sharpness': sharpness,
'SharpnessIncreasing': sharpness,
'ShearX': shear_x,
'ShearY': shear_y,
'TranslateX': translate_x_abs,
'TranslateY': translate_y_abs,
'TranslateXRel': translate_x_rel,
'TranslateYRel': translate_y_rel,
}
class AugmentOp:
    """One named augmentation, applied with a probability and a magnitude.

    The op function and its level-mapping function are looked up by *name* in
    ``NAME_TO_OP`` and ``LEVEL_TO_ARG`` (``KeyError`` for unknown names).
    """

    def __init__(self, name, prob=0.5, magnitude=10, hparams=None):
        if not hparams:
            hparams = _HPARAMS_DEFAULT
        self.aug_fn = NAME_TO_OP[name]
        self.level_fn = LEVEL_TO_ARG[name]
        self.prob = prob
        self.magnitude = magnitude
        # Keep a private copy so later caller-side changes do not leak in.
        self.hparams = hparams.copy()
        # Keyword args forwarded to every op call.
        self.kwargs = {
            'fillcolor': hparams.get('img_mean', _FILL),
            'resample': hparams.get('interpolation', _RANDOM_INTERPOLATION),
        }

        # If magnitude_std is > 0, we introduce some randomness
        # in the usually fixed policy and sample magnitude from a normal distribution
        # with mean `magnitude` and std-dev of `magnitude_std`.
        # NOTE This is my own hack, being tested, not in papers or reference impls.
        self.magnitude_std = self.hparams.get('magnitude_std', 0)

    def __call__(self, img):
        """Apply the op to *img*, or return *img* untouched when skipped."""
        if self.prob < 1.0 and random.random() > self.prob:
            return img
        level = self.magnitude
        if self.magnitude_std and self.magnitude_std > 0:
            level = random.gauss(level, self.magnitude_std)
        # clip to valid range
        level = min(_MAX_LEVEL, max(0, level))
        if self.level_fn is None:
            op_args = tuple()
        else:
            op_args = self.level_fn(level, self.hparams)
        return self.aug_fn(img, *op_args, **self.kwargs)
def auto_augment_policy_v0(hparams):
    """Build the 'v0' policy as a list of sub-policies of ``AugmentOp`` objects.

    Each ``(name, prob, magnitude)`` triple below becomes one ``AugmentOp``
    constructed with *hparams*.
    """
    # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference.
    sub_policies = [
        [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
        [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
        [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)],
        [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
        [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
        [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
        [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],  # This results in black image with Tpu posterize
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
        [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
    ]
    built = []
    for sub_policy in sub_policies:
        built.append([AugmentOp(*spec, hparams=hparams) for spec in sub_policy])
    return built
def auto_augment_policy_v0r(hparams):
    """Build the 'v0r' policy as a list of sub-policies of ``AugmentOp`` objects.

    Same table as 'v0' except that 'PosterizeIncreasing' replaces 'Posterize'.
    """
    # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used
    # in Google research implementation (number of bits discarded increases with magnitude)
    sub_policies = [
        [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
        [('Color', 0.4, 9), ('Equalize', 0.6, 3)],
        [('Color', 0.4, 1), ('Rotate', 0.6, 8)],
        [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
        [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
        [('Color', 0.2, 0), ('Equalize', 0.8, 8)],
        [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
        [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
        [('Color', 0.6, 1), ('Equalize', 1.0, 2)],
        [('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
        [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
        [('Color', 0.4, 7), ('Equalize', 0.6, 0)],
        [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)],
        [('Solarize', 0.6, 8), ('Color', 0.6, 9)],
        [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
        [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)],
        [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
        [('ShearY', 0.8, 0), ('Color', 0.6, 4)],
        [('Color', 1.0, 0), ('Rotate', 0.6, 2)],
        [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
        [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
        [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
        [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)],
        [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
        [('Color', 0.8, 6), ('Rotate', 0.4, 5)],
    ]
    built = []
    for sub_policy in sub_policies:
        built.append([AugmentOp(*spec, hparams=hparams) for spec in sub_policy])
    return built
def auto_augment_policy_original(hparams):
    """Build the 'original' policy as a list of sub-policies of ``AugmentOp`` objects.

    Each ``(name, prob, magnitude)`` triple below becomes one ``AugmentOp``
    constructed with *hparams*.
    """
    # ImageNet policy from https://arxiv.org/abs/1805.09501
    sub_policies = [
        [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
        [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
        [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
        [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)],
        [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
        [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)],
        [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
        [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
        [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Rotate', 0.8, 8), ('Color', 1.0, 2)],
        [('Color', 0.8, 8), ('Solarize', 0.8, 7)],
        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
        [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
        [('Color', 0.4, 0), ('Equalize', 0.6, 3)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
    ]
    built = []
    for sub_policy in sub_policies:
        built.append([AugmentOp(*spec, hparams=hparams) for spec in sub_policy])
    return built
def auto_augment_policy_originalr(hparams):
    """Build the 'originalr' policy as a list of sub-policies of ``AugmentOp`` objects.

    Same table as 'original' except that 'PosterizeIncreasing' replaces
    'PosterizeOriginal'.
    """
    # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation
    sub_policies = [
        [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
        [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)],
        [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)],
        [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)],
        [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)],
        [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)],
        [('Rotate', 0.8, 8), ('Color', 0.4, 0)],
        [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)],
        [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Rotate', 0.8, 8), ('Color', 1.0, 2)],
        [('Color', 0.8, 8), ('Solarize', 0.8, 7)],
        [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)],
        [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)],
        [('Color', 0.4, 0), ('Equalize', 0.6, 3)],
        [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)],
        [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)],
        [('Invert', 0.6, 4), ('Equalize', 1.0, 8)],
        [('Color', 0.6, 4), ('Contrast', 1.0, 8)],
        [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)],
    ]
    built = []
    for sub_policy in sub_policies:
        built.append([AugmentOp(*spec, hparams=hparams) for spec in sub_policy])
    return built
def auto_augment_policy(name='v0', hparams=None):
    """Return the AutoAugment policy called *name*.

    :param name: One of 'original', 'originalr', 'v0', 'v0r'.
    :param hparams: Hyper-parameters passed to the policy builder; falls back
        to ``_HPARAMS_DEFAULT`` when None or empty.
    :return: A list of sub-policies, each a list of ``AugmentOp`` objects.
    :raises AssertionError: If *name* is not a known policy.
    """
    hparams = hparams or _HPARAMS_DEFAULT
    if name == 'original':
        return auto_augment_policy_original(hparams)
    if name == 'originalr':
        return auto_augment_policy_originalr(hparams)
    if name == 'v0':
        return auto_augment_policy_v0(hparams)
    if name == 'v0r':
        return auto_augment_policy_v0r(hparams)
    # Raised explicitly rather than via `assert False`: assert statements are
    # stripped under `python -O`, which would make this silently return None.
    raise AssertionError('Unknown AA policy (%s)' % name)
class AutoAugment:
    """Callable transform that applies one randomly chosen sub-policy per image."""

    def __init__(self, policy):
        # Sequence of sub-policies; each sub-policy is a sequence of callables.
        self.policy = policy

    def __call__(self, img):
        """Pick a sub-policy at random and run its ops on *img* in order."""
        chosen = random.choice(self.policy)
        result = img
        for step in chosen:
            result = step(result)
        return result
def auto_augment_transform(config_str, hparams):
    """
    Create a AutoAugment transform

    :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr').
    The remaining sections, not order specific, determine
        'mstd' -  float std deviation of magnitude noise applied
    Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5

    :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme. NOTE: this dict is
    modified in place -- 'magnitude_std' is added when an 'mstd' section is present and the key is
    not already set.

    :return: A PyTorch compatible Transform
    :raises AssertionError: If a config section has an unknown key
    """
    policy_name, *sections = config_str.split('-')
    for section in sections:
        # Split 'key<number>' at the first digit; sections without a digit are skipped.
        parts = re.split(r'(\d.*)', section)
        if len(parts) < 2:
            continue
        key, val = parts[:2]
        if key == 'mstd':
            # noise param injected via hparams for now
            hparams.setdefault('magnitude_std', float(val))
        else:
            # Raised explicitly: `assert False` is stripped under `python -O`,
            # which would silently accept unknown sections.
            raise AssertionError('Unknown AutoAugment config section')
    aa_policy = auto_augment_policy(policy_name, hparams=hparams)
    return AutoAugment(aa_policy)
# Default op names for RandAugment (used when no transform list is supplied).
_RAND_TRANSFORMS = [
'AutoContrast',
'Equalize',
'Invert',
'Rotate',
'Posterize',
'Solarize',
'SolarizeAdd',
'Color',
'Contrast',
'Brightness',
'Sharpness',
'ShearX',
'ShearY',
'TranslateXRel',
'TranslateYRel',
#'Cutout' # NOTE I've implemented this as random erasing separately
]
# Op names selected by the 'inc' config section of rand_augment_transform;
# several entries are the '*Increasing' variants of the default list.
_RAND_INCREASING_TRANSFORMS = [
'AutoContrast',
'Equalize',
'Invert',
'Rotate',
'PosterizeIncreasing',
'SolarizeIncreasing',
'SolarizeAdd',
'ColorIncreasing',
'ContrastIncreasing',
'BrightnessIncreasing',
'SharpnessIncreasing',
'ShearX',
'ShearY',
'TranslateXRel',
'TranslateYRel',
#'Cutout' # NOTE I've implemented this as random erasing separately
]
# These experimental weights are based loosely on the relative improvements mentioned in paper.
# They may not result in increased performance, but could likely be tuned to so.
# Keys are the names in _RAND_TRANSFORMS; _select_rand_weights normalises the
# values into selection probabilities.
_RAND_CHOICE_WEIGHTS_0 = {
'Rotate': 0.3,
'ShearX': 0.2,
'ShearY': 0.2,
'TranslateXRel': 0.1,
'TranslateYRel': 0.1,
'Color': .025,
'Sharpness': 0.025,
'AutoContrast': 0.025,
'Solarize': .005,
'SolarizeAdd': .005,
'Contrast': .005,
'Brightness': .005,
'Equalize': .005,
'Posterize': 0,
'Invert': 0,
}
def _select_rand_weights(weight_idx=0, transforms=None):
    """Return normalised selection probabilities for *transforms*.

    Weights are looked up by name in ``_RAND_CHOICE_WEIGHTS_0`` (``KeyError``
    for names without an entry) and divided by their sum. *transforms*
    defaults to ``_RAND_TRANSFORMS``.
    """
    names = transforms or _RAND_TRANSFORMS
    assert weight_idx == 0  # only one set of weights currently
    table = _RAND_CHOICE_WEIGHTS_0
    raw = np.array([table[name] for name in names])
    return raw / np.sum(raw)
def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
    """Build one ``AugmentOp`` (prob 0.5) per name in *transforms*.

    *hparams* defaults to ``_HPARAMS_DEFAULT`` and *transforms* to
    ``_RAND_TRANSFORMS`` when None or empty.
    """
    hparams = hparams or _HPARAMS_DEFAULT
    transforms = transforms or _RAND_TRANSFORMS
    ops = []
    for op_name in transforms:
        ops.append(AugmentOp(op_name, prob=0.5, magnitude=magnitude, hparams=hparams))
    return ops
class RandAugment:
    """Callable transform that applies ``num_layers`` randomly chosen ops per image."""

    def __init__(self, ops, num_layers=2, choice_weights=None):
        self.ops = ops
        self.num_layers = num_layers
        # Optional per-op selection probabilities passed to np.random.choice.
        self.choice_weights = choice_weights

    def __call__(self, img):
        """Draw ``num_layers`` ops and apply them to *img* in order."""
        # no replacement when using weighted choice
        with_replacement = self.choice_weights is None
        selected = np.random.choice(
            self.ops, self.num_layers, replace=with_replacement, p=self.choice_weights)
        result = img
        for step in selected:
            result = step(result)
        return result
def rand_augment_transform(config_str, hparams):
    """
    Create a RandAugment transform

    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
    dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining
    sections, not order specific, determine
        'm' - integer magnitude of rand augment
        'n' - integer num layers (number of transform ops selected per image)
        'w' - integer probability weight index (index of a set of weights to influence choice of op)
        'mstd' -  float std deviation of magnitude noise applied
        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
    Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
    'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2

    :param hparams: Other hparams (kwargs) for the RandAugmentation scheme. NOTE: this dict is
    modified in place -- 'magnitude_std' is added when an 'mstd' section is present and the key is
    not already set.

    :return: A PyTorch compatible Transform
    :raises AssertionError: If the first section is not 'rand' or a section has an unknown key
    """
    magnitude = _MAX_LEVEL  # default to _MAX_LEVEL for magnitude (currently 10)
    num_layers = 2  # default to 2 ops per image
    weight_idx = None  # default to no probability weights for op choice
    transforms = _RAND_TRANSFORMS
    variant, *sections = config_str.split('-')
    # Explicit raises instead of `assert`: asserts are stripped under `python -O`.
    if variant != 'rand':
        raise AssertionError("RandAugment config must start with 'rand'")
    for section in sections:
        # Split 'key<number>' at the first digit; sections without a digit are skipped.
        parts = re.split(r'(\d.*)', section)
        if len(parts) < 2:
            continue
        key, val = parts[:2]
        if key == 'mstd':
            # noise param injected via hparams for now
            hparams.setdefault('magnitude_std', float(val))
        elif key == 'inc':
            # `val` is a string, so bool(val) would be True even for 'inc0';
            # parse it as the documented integer flag instead.
            if int(val):
                transforms = _RAND_INCREASING_TRANSFORMS
        elif key == 'm':
            magnitude = int(val)
        elif key == 'n':
            num_layers = int(val)
        elif key == 'w':
            weight_idx = int(val)
        else:
            raise AssertionError('Unknown RandAugment config section')
    ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms)
    choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx)
    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)
# Default op names for AugMix (used by augmix_ops when no list is supplied).
_AUGMIX_TRANSFORMS = [
'AutoContrast',
'ColorIncreasing', # not in paper
'ContrastIncreasing', # not in paper
'BrightnessIncreasing', # not in paper
'SharpnessIncreasing', # not in paper
'Equalize',
'Rotate',
'PosterizeIncreasing',
'SolarizeIncreasing',
'ShearX',
'ShearY',
'TranslateXRel',
'TranslateYRel',
]
def augmix_ops(magnitude=10, hparams=None, transforms=None):
    """Build one always-applied ``AugmentOp`` (prob 1.0) per name in *transforms*.

    *hparams* defaults to ``_HPARAMS_DEFAULT`` and *transforms* to
    ``_AUGMIX_TRANSFORMS`` when None or empty.
    """
    hparams = hparams or _HPARAMS_DEFAULT
    transforms = transforms or _AUGMIX_TRANSFORMS
    ops = []
    for op_name in transforms:
        ops.append(AugmentOp(op_name, prob=1.0, magnitude=magnitude, hparams=hparams))
    return ops
class AugMixAugment:
    """ AugMix Transform
    Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py
    From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty -
    https://arxiv.org/abs/1912.02781

    :param ops: sequence of callables, each taking an image and returning an augmented image
    :param alpha: concentration used for both the Dirichlet draw (per-chain weights) and the
        Beta draw (final mixing factor)
    :param width: number of augmentation chains that get mixed
    :param depth: number of ops per chain; a value <= 0 draws a random depth in [1, 3] per chain
    :param blended: use the sequential PIL-blend variant instead of the Numpy accumulation
    """

    def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False):
        self.ops = ops
        self.alpha = alpha
        self.width = width
        self.depth = depth
        self.blended = blended  # blended mode is faster but not well tested

    def _calc_blended_weights(self, ws, m):
        """Turn chain weights `ws` (scaled by `m`) into per-step blend factors.

        Walks the scaled weights from last to first, dividing each by the share
        that is still unassigned, and returns the factors in the original order
        as a float32 array.
        """
        ws = ws * m
        cump = 1.
        rws = []
        for w in ws[::-1]:
            alpha = w / cump
            cump *= (1 - alpha)
            rws.append(alpha)
        return np.array(rws[::-1], dtype=np.float32)

    def _apply_blended(self, img, mixing_weights, m):
        """Mix the chains with one `Image.blend` call per chain.

        Instead of accumulating the mix for each chain in a Numpy array and then
        blending with the original, this recomputes the blending coefficients and
        applies one image blend per chain.
        TODO the results appear in the right ballpark but they differ by more than rounding.
        """
        img_orig = img.copy()
        ws = self._calc_blended_weights(mixing_weights, m)
        for w in ws:
            depth = self.depth if self.depth > 0 else np.random.randint(1, 4)
            ops = np.random.choice(self.ops, depth, replace=True)
            img_aug = img_orig  # no ops are in-place, deep copy not necessary
            for op in ops:
                img_aug = op(img_aug)
            img = Image.blend(img, img_aug, w)
        return img

    def _apply_basic(self, img, mixing_weights, m):
        """Mix the chains in a float32 Numpy accumulator, then blend with the input.

        This is a literal adaptation of the paper/official implementation without
        normalizations and without image <-> Numpy conversions between every op.
        """
        # The image reports its size as (width, height) while the array obtained from
        # np.asarray(img) is laid out as (height, width, bands). The accumulator has to
        # follow the array layout, otherwise the in-place add below fails for any
        # non-square image.
        width, height = img.size[0], img.size[1]
        img_shape = height, width, len(img.getbands())
        mixed = np.zeros(img_shape, dtype=np.float32)
        for mw in mixing_weights:
            depth = self.depth if self.depth > 0 else np.random.randint(1, 4)
            ops = np.random.choice(self.ops, depth, replace=True)
            img_aug = img  # no ops are in-place, deep copy not necessary
            for op in ops:
                img_aug = op(img_aug)
            mixed += mw * np.asarray(img_aug, dtype=np.float32)
        np.clip(mixed, 0, 255., out=mixed)
        mixed = Image.fromarray(mixed.astype(np.uint8))
        return Image.blend(img, mixed, m)

    def __call__(self, img):
        """Apply AugMix to `img` and return the mixed image."""
        # One weight per chain, plus a single factor for the final mix with the input.
        mixing_weights = np.float32(np.random.dirichlet([self.alpha] * self.width))
        m = np.float32(np.random.beta(self.alpha, self.alpha))
        if self.blended:
            mixed = self._apply_blended(img, mixing_weights, m)
        else:
            mixed = self._apply_basic(img, mixing_weights, m)
        return mixed
def augment_and_mix_transform(config_str, hparams):
    """ Create AugMix PyTorch transform

    :param config_str: String defining configuration of the AugMix augmentation. Consists of multiple
    sections separated by dashes ('-'). The first section must be 'augmix'. The remaining sections,
    not order specific, determine
        'm' - integer magnitude (severity) of augmentation mix (default: 3)
        'w' - integer width of augmentation chain (default: 3)
        'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1)
        'a' - float alpha for the Dirichlet / Beta mixing distributions (default: 1.0)
        'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0)
        'mstd' - float std deviation of magnitude noise applied (default: 0)
    Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2
    Sections that contain no digit are skipped; an unrecognised key fails an assertion.

    :param hparams: Other hparams (kwargs) for the Augmentation transforms. When an 'mstd' section is
    present, 'magnitude_std' is added to this dict in place unless the key already exists.

    :return: A PyTorch compatible Transform
    """
    magnitude = 3
    width = 3
    depth = -1
    alpha = 1.
    blended = False
    config = config_str.split('-')
    assert config[0] == 'augmix'
    config = config[1:]
    for c in config:
        # Split each section at its first digit: 'm5' -> ['m', '5', ''].
        cs = re.split(r'(\d.*)', c)
        if len(cs) < 2:
            continue
        key, val = cs[:2]
        if key == 'mstd':
            # noise param injected via hparams for now
            hparams.setdefault('magnitude_std', float(val))
        elif key == 'm':
            magnitude = int(val)
        elif key == 'w':
            width = int(val)
        elif key == 'd':
            depth = int(val)
        elif key == 'a':
            alpha = float(val)
        elif key == 'b':
            # `val` is a non-empty string, so bool(val) would be True even for 'b0';
            # parse the integer first so that 'b0' really disables blending.
            blended = bool(int(val))
        else:
            assert False, 'Unknown AugMix config section'
    ops = augmix_ops(magnitude=magnitude, hparams=hparams)
    return AugMixAugment(ops, alpha=alpha, width=width, depth=depth, blended=blended)
@@ -0,0 +1,432 @@
"""model.py - Model and module class for EfficientNet.
They are built to mirror those in the official TensorFlow implementation.
"""
# Author: lukemelas (github username)
# Github repo: https://github.com/lukemelas/EfficientNet-PyTorch
# With adjustments and added comments by workingcoder (github username).
import torch
from torch import nn
from torch.nn import functional as F
from .utils import (
round_filters,
round_repeats,
drop_connect,
get_same_padding_conv2d,
get_model_params,
efficientnet_params,
load_pretrained_weights,
Swish,
MemoryEfficientSwish,
calculate_output_image_size
)
class MBConvBlock(nn.Module):
    """Mobile Inverted Residual Bottleneck Block.

    Layers, in the order they are created in __init__ and applied in forward:
    optional 1x1 expansion conv -> depthwise conv -> optional squeeze-and-excitation
    -> 1x1 projection conv, followed by an identity skip connection when the block
    keeps both its stride at 1 and its channel count.

    Args:
        block_args (namedtuple): BlockArgs, defined in utils.py.
        global_params (namedtuple): GlobalParam, defined in utils.py.
        image_size (tuple or list): [image_height, image_width].

    References:
        [1] https://arxiv.org/abs/1704.04861 (MobileNet v1)
        [2] https://arxiv.org/abs/1801.04381 (MobileNet v2)
        [3] https://arxiv.org/abs/1905.02244 (MobileNet v3)
    """

    def __init__(self, block_args, global_params, image_size=None):
        super().__init__()
        self._block_args = block_args
        self._bn_mom = 1 - global_params.batch_norm_momentum  # pytorch's difference from tensorflow
        self._bn_eps = global_params.batch_norm_epsilon
        # Squeeze-and-excitation is enabled only for an se_ratio in (0, 1].
        self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
        self.id_skip = block_args.id_skip  # whether to use skip connection and drop connect

        # Expansion phase (Inverted Bottleneck)
        inp = self._block_args.input_filters  # number of input channels
        oup = self._block_args.input_filters * self._block_args.expand_ratio  # number of output channels
        if self._block_args.expand_ratio != 1:
            Conv2d = get_same_padding_conv2d(image_size=image_size)
            self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
            self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
            # A 1x1 conv with stride 1 leaves image_size unchanged, so it is not recomputed here.

        # Depthwise convolution phase
        k = self._block_args.kernel_size
        s = self._block_args.stride
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._depthwise_conv = Conv2d(
            in_channels=oup, out_channels=oup, groups=oup,  # groups makes it depthwise
            kernel_size=k, stride=s, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
        # Track the spatial size after the (possibly strided) depthwise conv.
        image_size = calculate_output_image_size(image_size, s)

        # Squeeze and Excitation layer, if desired
        if self.has_se:
            # The SE convs run on the globally pooled 1x1 feature map.
            Conv2d = get_same_padding_conv2d(image_size=(1,1))
            # The squeeze width is derived from the block's input filters, not the expanded width.
            num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
            self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
            self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)

        # Pointwise convolution phase
        final_oup = self._block_args.output_filters
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
        self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
        self._swish = MemoryEfficientSwish()

    def forward(self, inputs, drop_connect_rate=None):
        """MBConvBlock's forward function.

        Args:
            inputs (tensor): Input tensor.
            drop_connect_rate (float or None): Drop connect rate (between 0 and 1).
                A falsy value (None or 0) disables drop connect.

        Returns:
            Output of this block after processing.
        """
        # Expansion and Depthwise Convolution
        x = inputs
        if self._block_args.expand_ratio != 1:
            x = self._expand_conv(inputs)
            x = self._bn0(x)
            x = self._swish(x)

        x = self._depthwise_conv(x)
        x = self._bn1(x)
        x = self._swish(x)

        # Squeeze and Excitation: gate every channel with a sigmoid computed
        # from the globally average-pooled features.
        if self.has_se:
            x_squeezed = F.adaptive_avg_pool2d(x, 1)
            x_squeezed = self._se_reduce(x_squeezed)
            x_squeezed = self._swish(x_squeezed)
            x_squeezed = self._se_expand(x_squeezed)
            x = torch.sigmoid(x_squeezed) * x

        # Pointwise Convolution (no activation after the projection)
        x = self._project_conv(x)
        x = self._bn2(x)

        # Skip connection and drop connect
        input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
        if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
            # The combination of skip connection and drop connect brings about stochastic depth.
            if drop_connect_rate:
                x = drop_connect(x, p=drop_connect_rate, training=self.training)
            x = x + inputs  # skip connection
        return x

    def set_swish(self, memory_efficient=True):
        """Sets swish function as memory efficient (for training) or standard (for export).

        Args:
            memory_efficient (bool): Whether to use memory-efficient version of swish.
        """
        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
class EfficientNet(nn.Module):
    """EfficientNet model.
       Most easily loaded with the .from_name or .from_pretrained methods.

    Args:
        blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
        global_params (namedtuple): A set of GlobalParams shared between blocks.

    References:
        [1] https://arxiv.org/abs/1905.11946 (EfficientNet)

    Example:
        >>> import torch
        >>> from efficientnet.model import EfficientNet
        >>> inputs = torch.rand(1, 3, 224, 224)
        >>> model = EfficientNet.from_pretrained('efficientnet-b0')
        >>> model.eval()
        >>> outputs = model(inputs)
    """

    def __init__(self, blocks_args=None, global_params=None):
        super().__init__()
        assert isinstance(blocks_args, list), 'blocks_args should be a list'
        assert len(blocks_args) > 0, 'block args must be greater than 0'
        self._global_params = global_params
        self._blocks_args = blocks_args

        # Batch norm parameters
        bn_mom = 1 - self._global_params.batch_norm_momentum
        bn_eps = self._global_params.batch_norm_epsilon

        # Get stem static or dynamic convolution depending on image size
        image_size = global_params.image_size
        Conv2d = get_same_padding_conv2d(image_size=image_size)

        # Stem
        in_channels = 3  # rgb
        out_channels = round_filters(32, self._global_params)  # number of output channels
        self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
        image_size = calculate_output_image_size(image_size, 2)

        # Build blocks
        self._blocks = nn.ModuleList([])
        for block_args in self._blocks_args:
            # Update block input and output filters based on depth multiplier.
            block_args = block_args._replace(
                input_filters=round_filters(block_args.input_filters, self._global_params),
                output_filters=round_filters(block_args.output_filters, self._global_params),
                num_repeat=round_repeats(block_args.num_repeat, self._global_params)
            )

            # The first block needs to take care of stride and filter size increase.
            self._blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size))
            image_size = calculate_output_image_size(image_size, block_args.stride)
            if block_args.num_repeat > 1:  # modify block_args to keep same output size
                block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
            # The repeated blocks use stride 1, so image_size stays unchanged for them.
            for _ in range(block_args.num_repeat - 1):
                self._blocks.append(MBConvBlock(block_args, self._global_params, image_size=image_size))

        # Head
        in_channels = block_args.output_filters  # output of final block
        out_channels = round_filters(1280, self._global_params)
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Final linear layer
        self._avg_pooling = nn.AdaptiveAvgPool2d(1)
        self._dropout = nn.Dropout(self._global_params.dropout_rate)
        self._fc = nn.Linear(out_channels, self._global_params.num_classes)
        self._swish = MemoryEfficientSwish()

    def set_swish(self, memory_efficient=True):
        """Sets swish function as memory efficient (for training) or standard (for export).

        Args:
            memory_efficient (bool): Whether to use memory-efficient version of swish.
        """
        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
        for block in self._blocks:
            block.set_swish(memory_efficient)

    def extract_endpoints(self, inputs):
        """Use convolution layer to extract features
        from reduction levels i in [1, 2, 3, 4, 5].

        An endpoint is recorded each time a block reduces the spatial size (the
        feature map from just before the reduction is stored); the output of the
        head is stored as the last endpoint.

        Args:
            inputs (tensor): Input tensor.

        Returns:
            Dictionary of last intermediate features
            with reduction levels i in [1, 2, 3, 4, 5].
            Example:
                >>> import torch
                >>> from efficientnet.model import EfficientNet
                >>> inputs = torch.rand(1, 3, 224, 224)
                >>> model = EfficientNet.from_pretrained('efficientnet-b0')
                >>> endpoints = model.extract_endpoints(inputs)
                >>> print(endpoints['reduction_1'].shape)  # torch.Size([1, 16, 112, 112])
                >>> print(endpoints['reduction_2'].shape)  # torch.Size([1, 24, 56, 56])
                >>> print(endpoints['reduction_3'].shape)  # torch.Size([1, 40, 28, 28])
                >>> print(endpoints['reduction_4'].shape)  # torch.Size([1, 112, 14, 14])
                >>> print(endpoints['reduction_5'].shape)  # torch.Size([1, 1280, 7, 7])
        """
        endpoints = dict()

        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))
        prev_x = x

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)
            # The height shrank in this block: keep the features from before the reduction.
            if prev_x.size(2) > x.size(2):
                endpoints[f'reduction_{len(endpoints)+1}'] = prev_x
            prev_x = x

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))
        endpoints[f'reduction_{len(endpoints)+1}'] = x

        return endpoints

    def extract_features(self, inputs):
        """use convolution layer to extract feature .

        Args:
            inputs (tensor): Input tensor.

        Returns:
            Output of the final convolution
            layer in the efficientnet model.
        """
        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            drop_connect_rate = self._global_params.drop_connect_rate
            if drop_connect_rate:
                # The rate grows linearly with the block index (0 for the first block).
                drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))

        return x

    def forward(self, inputs):
        """EfficientNet's forward function.
           Calls extract_features to extract features, applies final linear layer, and returns logits.

        Args:
            inputs (tensor): Input tensor.

        Returns:
            Output of this model after processing.
        """
        # Convolution layers
        x = self.extract_features(inputs)

        # Pooling and final linear layer
        x = self._avg_pooling(x)
        x = torch.flatten(x, start_dim=1)
        x = self._dropout(x)
        x = self._fc(x)

        return x

    @classmethod
    def from_name(cls, model_name, in_channels=3, **override_params):
        """create an efficientnet model according to name.

        Args:
            model_name (str): Name for efficientnet.
            in_channels (int): Input data's channel number.
            override_params (other key word params):
                Params to override model's global_params.
                Optional key:
                    'width_coefficient', 'depth_coefficient',
                    'image_size', 'dropout_rate',
                    'num_classes', 'batch_norm_momentum',
                    'batch_norm_epsilon', 'drop_connect_rate',
                    'depth_divisor', 'min_depth'

        Returns:
            An efficientnet model.

        Raises:
            ValueError: If model_name is not a supported efficientnet name.
        """
        cls._check_model_name_is_valid(model_name)
        blocks_args, global_params = get_model_params(model_name, override_params)
        model = cls(blocks_args, global_params)
        model._change_in_channels(in_channels)
        return model

    @classmethod
    def from_pretrained(cls, model_name, weights_path=None, advprop=False,
                        in_channels=3, num_classes=1000, **override_params):
        """create an efficientnet model according to name.

        Args:
            model_name (str): Name for efficientnet.
            weights_path (None or str):
                str: path to pretrained weights file on the local disk.
                None: use pretrained weights downloaded from the Internet.
            advprop (bool):
                Whether to load pretrained weights
                trained with advprop (valid when weights_path is None).
            in_channels (int): Input data's channel number.
            num_classes (int):
                Number of categories for classification.
                It controls the output size for final linear layer.
            override_params (other key word params):
                Params to override model's global_params.
                Optional key:
                    'width_coefficient', 'depth_coefficient',
                    'image_size', 'dropout_rate',
                    'num_classes', 'batch_norm_momentum',
                    'batch_norm_epsilon', 'drop_connect_rate',
                    'depth_divisor', 'min_depth'

        Returns:
            A pretrained efficientnet model.
        """
        model = cls.from_name(model_name, num_classes=num_classes, **override_params)
        # The classifier weights are only loaded when the head keeps the default 1000 classes.
        load_pretrained_weights(model, model_name, weights_path=weights_path,
                                load_fc=(num_classes == 1000), advprop=advprop)
        # The stem is replaced only after the weights are loaded, so a non-RGB stem
        # starts from a fresh initialisation.
        model._change_in_channels(in_channels)
        return model

    @classmethod
    def get_image_size(cls, model_name):
        """Get the input image size for a given efficientnet model.

        Args:
            model_name (str): Name for efficientnet.

        Returns:
            Input image size (resolution).

        Raises:
            ValueError: If model_name is not a supported efficientnet name.
        """
        cls._check_model_name_is_valid(model_name)
        _, _, res, _ = efficientnet_params(model_name)
        return res

    @classmethod
    def _check_model_name_is_valid(cls, model_name):
        """Validates model name.

        Args:
            model_name (str): Name for efficientnet.

        Returns:
            None when the name is supported.

        Raises:
            ValueError: If model_name is not one of 'efficientnet-b0' ... 'efficientnet-b8'
                or 'efficientnet-l2'.
        """
        valid_models = ['efficientnet-b' + str(i) for i in range(9)]

        # Support the construction of 'efficientnet-l2' without pretrained weights
        valid_models += ['efficientnet-l2']

        if model_name not in valid_models:
            raise ValueError('model_name should be one of: ' + ', '.join(valid_models))

    def _change_in_channels(self, in_channels):
        """Adjust model's first convolution layer to in_channels, if in_channels not equals 3.

        Args:
            in_channels (int): Input data's channel number.
        """
        if in_channels != 3:
            Conv2d = get_same_padding_conv2d(image_size=self._global_params.image_size)
            out_channels = round_filters(32, self._global_params)
            self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
@@ -0,0 +1,7 @@
def set_value(value):
    """Remember `value` as the module-wide device id and report it on stdout."""
    global _npu_id
    _npu_id = value
    message = 'set device id %s success' % _npu_id
    print(message)


def get_value():
    """Return the device id most recently stored by set_value.

    `_npu_id` only exists once set_value has run, so calling this first
    raises NameError.
    """
    return _npu_id
@@ -0,0 +1,122 @@
import torch
from torch.optim import Optimizer
class RMSpropTF(Optimizer):
    """Implements RMSprop algorithm (TensorFlow style epsilon)

    NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
    to closer match Tensorflow for matching hyper-params.

    Proposed by G. Hinton in his
    `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

    The centered version first appears in `Generating Sequences
    With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        momentum (float, optional): momentum factor (default: 0)
        alpha (float, optional): smoothing (decay) constant (default: 0.9)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        centered (bool, optional) : if ``True``, compute the centered RMSProp,
            the gradient is normalized by an estimation of its variance
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101
        lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer
            update as per defaults in Tensorflow

    Raises:
        ValueError: if lr, eps, momentum, weight_decay or alpha is negative.
    """

    def __init__(self, params, lr=1e-2, alpha=0.9, eps=1e-10, weight_decay=0, momentum=0., centered=False,
                 decoupled_decay=False, lr_in_momentum=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= momentum:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= alpha:
            raise ValueError("Invalid alpha value: {}".format(alpha))

        defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay,
                        decoupled_decay=decoupled_decay, lr_in_momentum=lr_in_momentum)
        super(RMSpropTF, self).__init__(params, defaults)

    def __setstate__(self, state):
        """Restore optimizer state, filling in options that older states may lack."""
        super(RMSpropTF, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('momentum', 0)
            group.setdefault('centered', False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by `closure`, or None when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('RMSprop does not support sparse gradients')
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = torch.ones_like(p.data)  # PyTorch inits to zero
                    if group['momentum'] > 0:
                        state['momentum_buffer'] = torch.zeros_like(p.data)
                    if group['centered']:
                        state['grad_avg'] = torch.zeros_like(p.data)

                square_avg = state['square_avg']
                one_minus_alpha = 1. - group['alpha']

                state['step'] += 1

                # All scaled tensor ops below use the keyword `alpha=` / `value=` forms;
                # the positional-scalar overloads (e.g. add_(scalar, tensor)) are deprecated.
                if group['weight_decay'] != 0:
                    if 'decoupled_decay' in group and group['decoupled_decay']:
                        # Decoupled decay shrinks the weights directly: p <- p - wd * p.
                        p.data.add_(p.data, alpha=-group['weight_decay'])
                    else:
                        # L2 penalty: fold wd * p into the gradient (out of place, p.grad is untouched).
                        grad = grad.add(p.data, alpha=group['weight_decay'])

                # Tensorflow order of ops for updating squared avg:
                # square_avg <- square_avg + (1 - alpha) * (grad^2 - square_avg)
                square_avg.add_(grad.pow(2) - square_avg, alpha=one_minus_alpha)

                if group['centered']:
                    grad_avg = state['grad_avg']
                    grad_avg.add_(grad - grad_avg, alpha=one_minus_alpha)
                    # avg = sqrt(square_avg - grad_avg^2 + eps), eps moved in sqrt
                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).add(group['eps']).sqrt_()
                else:
                    avg = square_avg.add(group['eps']).sqrt_()  # eps moved in sqrt

                if group['momentum'] > 0:
                    buf = state['momentum_buffer']
                    # Tensorflow accumulates the LR scaling in the momentum buffer
                    if 'lr_in_momentum' in group and group['lr_in_momentum']:
                        buf.mul_(group['momentum']).addcdiv_(grad, avg, value=group['lr'])
                        p.data.add_(-buf)
                    else:
                        # PyTorch scales the param update by LR
                        buf.mul_(group['momentum']).addcdiv_(grad, avg)
                        p.data.add_(buf, alpha=-group['lr'])
                else:
                    p.data.addcdiv_(grad, avg, value=-group['lr'])

        return loss
@@ -0,0 +1,624 @@
"""utils.py - Helper functions for building the model and for loading model parameters.
These helper functions are built to mirror those in the official TensorFlow implementation.
"""
# Author: lukemelas (github username)
# Github repo: https://github.com/lukemelas/EfficientNet-PyTorch
# With adjustments and added comments by workingcoder (github username).
import re
import math
import collections
from functools import partial
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import model_zoo
from . import npu_info
################################################################################
### Helper functions for model architecture
################################################################################
# GlobalParams and BlockArgs: Two namedtuples
# Swish and MemoryEfficientSwish: Two implementations of the method
# round_filters and round_repeats:
# Functions to calculate params for scaling model width and depth ! ! !
# get_width_and_height_from_size and calculate_output_image_size
# drop_connect: A structural design
# get_same_padding_conv2d:
# Conv2dDynamicSamePadding
# Conv2dStaticSamePadding
# get_same_padding_maxPool2d:
# MaxPool2dDynamicSamePadding
# MaxPool2dStaticSamePadding
# It's an additional function, not used in EfficientNet,
# but can be used in other model (such as EfficientDet).
# Identity: An implementation of identical mapping
# Hyper-parameters that apply to the entire model (stem, all blocks, and head).
GlobalParams = collections.namedtuple(
    'GlobalParams',
    [
        'width_coefficient',
        'depth_coefficient',
        'image_size',
        'dropout_rate',
        'num_classes',
        'batch_norm_momentum',
        'batch_norm_epsilon',
        'drop_connect_rate',
        'depth_divisor',
        'min_depth',
    ],
)

# Hyper-parameters for an individual model block.
BlockArgs = collections.namedtuple(
    'BlockArgs',
    [
        'num_repeat',
        'kernel_size',
        'stride',
        'expand_ratio',
        'input_filters',
        'output_filters',
        'se_ratio',
        'id_skip',
    ],
)

# Make every field optional: any field that is not supplied defaults to None.
GlobalParams.__new__.__defaults__ = tuple(None for _ in GlobalParams._fields)
BlockArgs.__new__.__defaults__ = tuple(None for _ in BlockArgs._fields)
class Swish(nn.Module):
    """Plain Swish activation: the input multiplied by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
class SwishImplementation(torch.autograd.Function):
    """Swish as a custom autograd function, used by the memory-efficient variant.

    Only the input tensor is saved for the backward pass; the sigmoid is
    recomputed there.
    """

    @staticmethod
    def forward(ctx, i):
        ctx.save_for_backward(i)
        return i * torch.sigmoid(i)

    @staticmethod
    def backward(ctx, grad_output):
        (i,) = ctx.saved_tensors
        sigmoid_i = torch.sigmoid(i)
        # d/di [i * sigmoid(i)] = sigmoid(i) * (1 + i * (1 - sigmoid(i)))
        local_grad = sigmoid_i * (1 + i * (1 - sigmoid_i))
        return grad_output * local_grad
class MemoryEfficientSwish(nn.Module):
    """Swish activation module that delegates to SwishImplementation."""

    def forward(self, x):
        activated = SwishImplementation.apply(x)
        return activated
def round_filters(filters, global_params):
    """Scale a filter count by the width multiplier and snap it to the divisor.

    Reads width_coefficient, depth_divisor and min_depth from global_params.

    Args:
        filters (int): Base number of filters.
        global_params (namedtuple): Global params of the model.

    Returns:
        new_filters: `filters` unchanged when no width multiplier is set,
            otherwise the scaled count rounded to a multiple of the divisor (int).
    """
    width_mult = global_params.width_coefficient
    if not width_mult:
        return filters

    # NOTE: despite their names, depth_divisor / min_depth act on the channel width here.
    step = global_params.depth_divisor
    lower_bound = global_params.min_depth or step  # the divisor doubles as the minimum
    scaled = filters * width_mult

    # Round to the nearest multiple of `step`, never below `lower_bound`
    # (formula carried over from the official TensorFlow implementation).
    rounded = max(lower_bound, int(scaled + step / 2) // step * step)
    if rounded < 0.9 * scaled:  # prevent rounding by more than 10%
        rounded += step
    return int(rounded)
def round_repeats(repeats, global_params):
    """Scale a block's repeat count by the depth multiplier, rounding up.

    Reads depth_coefficient from global_params.

    Args:
        repeats (int): Base num_repeat of the block.
        global_params (namedtuple): Global params of the model.

    Returns:
        new repeat: `repeats` unchanged when no depth multiplier is set,
            otherwise ceil(depth_coefficient * repeats) as an int.
    """
    depth_mult = global_params.depth_coefficient
    if depth_mult:
        # same formula as the official TensorFlow implementation
        scaled = math.ceil(depth_mult * repeats)
        return int(scaled)
    return repeats
def drop_connect(inputs, p, training):
    """Drop connect.

    In training mode each sample of the batch is either zeroed entirely or
    scaled by 1 / (1 - p); outside training the input is returned untouched.

    Args:
        inputs (tensor): Input of this structure; the mask is created with
            shape [batch, 1, 1, 1] and broadcast against it.
        p (float: 0.0~1.0): Probability of drop connection.
        training (bool): The running mode.

    Returns:
        output: Output after drop connection.
    """
    assert 0 <= p <= 1, 'p must be in range of [0,1]'

    if not training:
        return inputs

    survival = 1 - p
    # One uniform draw per sample; floor(survival + u) is 1 with probability
    # `survival` and 0 otherwise.
    noise = torch.rand([inputs.shape[0], 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
    mask = torch.floor(survival + noise) / survival
    return inputs * mask
def get_width_and_height_from_size(x):
    """Normalise a size given as an int or as a pair.

    Args:
        x (int, tuple or list): Data size.

    Returns:
        size: `(x, x)` for an int; a list or tuple is returned as-is.

    Raises:
        TypeError: If x is neither an int, a list nor a tuple.
    """
    if isinstance(x, int):
        return x, x
    if isinstance(x, (list, tuple)):
        return x
    raise TypeError()
def calculate_output_image_size(input_image_size, stride):
    """Compute the spatial size produced by a 'same'-padded conv with a stride.

    Needed to configure static padding: each dimension becomes ceil(size / stride).

    Args:
        input_image_size (int, tuple or list): Size of input image; None is passed through.
        stride (int, tuple or list): Conv2d operation's stride; for a sequence
            only its first element is used.

    Returns:
        output_image_size: A list [H,W], or None when input_image_size is None.
    """
    if input_image_size is None:
        return None
    height, width = get_width_and_height_from_size(input_image_size)
    step = stride if isinstance(stride, int) else stride[0]
    out_height = int(math.ceil(height / step))
    out_width = int(math.ceil(width / step))
    return [out_height, out_width]
# Note:
# The following 'SamePadding' functions make output size equal ceil(input size/stride).
# Only when stride equals 1, can the output size be the same as input size.
# Don't be confused by their function names ! ! !
def get_same_padding_conv2d(image_size=None):
    """Pick the conv class to build layers with, depending on whether a size is known.

    Without an image size the dynamic-padding class is returned; with one, the
    static-padding class is returned with `image_size` already bound.
    Static padding is necessary for ONNX exporting of models.

    Args:
        image_size (int or tuple): Size of the image.

    Returns:
        Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
    """
    if image_size is not None:
        return partial(Conv2dStaticSamePadding, image_size=image_size)
    return Conv2dDynamicSamePadding
class Conv2dDynamicSamePadding(nn.Conv2d):
    """2D convolution with TensorFlow-like 'SAME' padding for inputs of any size.

    The amount of padding is worked out from the actual input on every forward
    call, so the output size is ceil(input size / stride).
    """

    # How the padding is derived, per spatial dimension, with
    #     i: input size, s: stride, k: kernel size, d: dilation, p: total padding
    # A convolution yields
    #     o = floor((i + p - ((k - 1) * d + 1)) / s + 1)
    # Requiring o = ceil(i / s) gives
    #     p = (o - 1) * s + ((k - 1) * d + 1) - i      (clamped at 0)

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
        # The built-in padding is fixed to 0; all padding is applied in forward.
        super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
        if len(self.stride) != 2:
            self.stride = [self.stride[0]] * 2

    def forward(self, x):
        in_h, in_w = x.size()[-2:]
        k_h, k_w = self.weight.size()[-2:]
        stride_h, stride_w = self.stride
        # the target output size follows the stride
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)
        total_h = max((out_h - 1) * stride_h + (k_h - 1) * self.dilation[0] + 1 - in_h, 0)
        total_w = max((out_w - 1) * stride_w + (k_w - 1) * self.dilation[1] + 1 - in_w, 0)
        if total_h > 0 or total_w > 0:
            # An odd total puts the extra row/column at the bottom/right.
            left, top = total_w // 2, total_h // 2
            x = F.pad(x, [left, total_w - left, top, total_h - top])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class Conv2dStaticSamePadding(nn.Conv2d):
    """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
       The padding module is calculated in the constructor from the fixed image size.

    NOTE(review): in this version `forward` never calls `self.static_padding`.
    Padding is applied only through the convolution's own symmetric
    `self.padding`, which __init__ overrides from the kernel height whenever any
    padding is required. This differs from the asymmetric TF 'SAME' padding
    when the total pad is odd (e.g. kernel 3, stride 2, even input size).
    Presumably a deliberate workaround for the target device - confirm before
    changing, because it affects parity with weights trained using true 'SAME'
    padding.
    """

    # With the same calculation as Conv2dDynamicSamePadding

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, image_size=None, **kwargs):
        super().__init__(in_channels, out_channels, kernel_size, stride, **kwargs)
        self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

        # Calculate padding based on image size and save it
        assert image_size is not None
        ih, iw = (image_size, image_size) if isinstance(image_size, int) else image_size
        kh, kw = self.weight.size()[-2:]
        sh, sw = self.stride
        # 'SAME' output size and the total padding needed to reach it
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
        if pad_h > 0 or pad_w > 0:
            # Stored as a sub-module, but not applied by forward (see class note).
            self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
            # Symmetric padding derived from the kernel height only; it is used for
            # both spatial dimensions because a single int is stored.
            if kh % 2 != 0:
                self.padding = (kh - 1) // 2
            else:
                self.padding = kh // 2
        else:
            # `Identity` is not defined in the visible part of this file - presumably
            # the pass-through module declared further down; verify.
            self.static_padding = Identity()

    def forward(self, x):
        # Plain convolution: relies on self.padding set in __init__, static_padding is not used.
        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
        return x
def get_same_padding_maxPool2d(image_size=None):
    """Pick the max-pool class to build layers with, depending on whether a size is known.

    Without an image size the dynamic-padding class is returned; with one, the
    static-padding class is returned with `image_size` already bound.
    Static padding is necessary for ONNX exporting of models.

    Args:
        image_size (int or tuple): Size of the image.

    Returns:
        MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
    """
    if image_size is not None:
        return partial(MaxPool2dStaticSamePadding, image_size=image_size)
    return MaxPool2dDynamicSamePadding
class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
    """Max pooling with TensorFlow-style 'SAME' padding computed per call.

    The padding is derived in ``forward`` from the spatial size of each input,
    so inputs of any resolution are accepted.
    """

    def __init__(self, kernel_size, stride, padding=0, dilation=1, return_indices=False, ceil_mode=False):
        super().__init__(kernel_size, stride, padding, dilation, return_indices, ceil_mode)
        # Integer settings are expanded into (height, width) pairs.
        for attr in ('stride', 'kernel_size', 'dilation'):
            value = getattr(self, attr)
            if isinstance(value, int):
                setattr(self, attr, [value] * 2)

    def forward(self, x):
        in_h, in_w = x.size()[-2:]
        k_h, k_w = self.kernel_size
        stride_h, stride_w = self.stride
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)
        pad_h = max((out_h - 1) * self.stride[0] + (k_h - 1) * self.dilation[0] + 1 - in_h, 0)
        pad_w = max((out_w - 1) * self.stride[1] + (k_w - 1) * self.dilation[1] + 1 - in_w, 0)
        if pad_h > 0 or pad_w > 0:
            # Any odd remainder goes to the right / bottom edge.
            left, top = pad_w // 2, pad_h // 2
            x = F.pad(x, [left, pad_w - left, top, pad_h - top])
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
                            self.dilation, self.ceil_mode, self.return_indices)
class MaxPool2dStaticSamePadding(nn.MaxPool2d):
    """Max pooling with TensorFlow-style 'SAME' padding for a fixed input size.

    The padding module is built once in the constructor from ``image_size``
    and applied to the input at the start of ``forward``.
    """

    def __init__(self, kernel_size, stride, image_size=None, **kwargs):
        super().__init__(kernel_size, stride, **kwargs)
        # Integer settings are expanded into (height, width) pairs.
        for attr in ('stride', 'kernel_size', 'dilation'):
            value = getattr(self, attr)
            if isinstance(value, int):
                setattr(self, attr, [value] * 2)

        # The padding depends on the (fixed) input resolution.
        assert image_size is not None
        if isinstance(image_size, int):
            in_h, in_w = image_size, image_size
        else:
            in_h, in_w = image_size
        k_h, k_w = self.kernel_size
        stride_h, stride_w = self.stride
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)
        pad_h = max((out_h - 1) * self.stride[0] + (k_h - 1) * self.dilation[0] + 1 - in_h, 0)
        pad_w = max((out_w - 1) * self.stride[1] + (k_w - 1) * self.dilation[1] + 1 - in_w, 0)

        if not (pad_h > 0 or pad_w > 0):
            self.static_padding = Identity()
            return
        # Any odd remainder goes to the right / bottom edge.
        left, top = pad_w // 2, pad_h // 2
        self.static_padding = nn.ZeroPad2d((left, pad_w - left, top, pad_h - top))

    def forward(self, x):
        padded = self.static_padding(x)
        return F.max_pool2d(padded, self.kernel_size, self.stride, self.padding,
                            self.dilation, self.ceil_mode, self.return_indices)
class Identity(nn.Module):
    """No-op module: ``forward`` returns its argument unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # The very same object is handed back; nothing is copied.
        return input
################################################################################
### Helper functions for loading model params
################################################################################
# BlockDecoder: A Class for encoding and decoding BlockArgs
# efficientnet_params: A function to query compound coefficient
# get_model_params and efficientnet:
# Functions to get BlockArgs and GlobalParams for efficientnet
# url_map and url_map_advprop: Dicts of url_map for pretrained weights
# load_pretrained_weights: A function to load pretrained weights
class BlockDecoder(object):
    """Convert between BlockArgs tuples and their compact string notation.

    A block string is a '_'-separated list of options, each a short key
    followed by its value, e.g. 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
    """

    @staticmethod
    def _decode_block_string(block_string):
        """Build a BlockArgs from one block string.

        Args:
            block_string (str): A string notation of arguments.
                Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.

        Returns:
            BlockArgs: The namedtuple defined at the top of this file.

        Raises:
            AssertionError: If ``block_string`` is not a str, or the stride
                option 's' is missing or is not one digit / two equal digits.
        """
        assert isinstance(block_string, str)

        options = {}
        for op in block_string.split('_'):
            # Split each option at its first digit: 'se0.25' -> ('se', '0.25').
            # Options without a digit (e.g. 'noskip') yield a single piece and
            # are handled separately below.
            splits = re.split(r'(\d.*)', op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        # Check stride: must be present, and either a single digit or two
        # identical digits (same stride in both directions).
        stride = options.get('s', '')
        assert (len(stride) == 1 or
                (len(stride) == 2 and stride[0] == stride[1])), \
            'invalid or missing stride in block string: %r' % block_string

        return BlockArgs(
            num_repeat=int(options['r']),
            kernel_size=int(options['k']),
            stride=[int(stride[0])],
            expand_ratio=int(options['e']),
            input_filters=int(options['i']),
            output_filters=int(options['o']),
            se_ratio=float(options['se']) if 'se' in options else None,
            id_skip=('noskip' not in block_string))

    @staticmethod
    def _encode_block_string(block):
        """Encode a block to a string.

        Args:
            block (namedtuple): A BlockArgs type argument.  The stride is read
                from ``block.stride`` (the field written by the decoder); an
                object exposing ``strides`` instead is also accepted.  The
                stride may be an int or a sequence of one or two ints.

        Returns:
            block_string: A String form of BlockArgs.
        """
        stride = getattr(block, 'stride', None)
        if stride is None:
            stride = block.strides
        if isinstance(stride, int):
            stride = [stride]
        stride_h = stride[0]
        # The decoder stores a single value; reuse it for the second digit.
        stride_w = stride[1] if len(stride) > 1 else stride[0]

        args = [
            'r%d' % block.num_repeat,
            'k%d' % block.kernel_size,
            's%d%d' % (stride_h, stride_w),
            'e%s' % block.expand_ratio,
            'i%d' % block.input_filters,
            'o%d' % block.output_filters
        ]
        # se_ratio is None when the block string carried no 'se' option.
        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
            args.append('se%s' % block.se_ratio)
        if block.id_skip is False:
            args.append('noskip')
        return '_'.join(args)

    @staticmethod
    def decode(string_list):
        """Decode a list of string notations to specify blocks inside the network.

        Args:
            string_list (list[str]): A list of strings, each string is a notation of block.

        Returns:
            blocks_args: A list of BlockArgs namedtuples of block args.
        """
        assert isinstance(string_list, list)
        return [BlockDecoder._decode_block_string(block_string)
                for block_string in string_list]

    @staticmethod
    def encode(blocks_args):
        """Encode a list of BlockArgs to a list of strings.

        Args:
            blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.

        Returns:
            block_strings: A list of strings, each string is a notation of block.
        """
        return [BlockDecoder._encode_block_string(block) for block in blocks_args]
def efficientnet_params(model_name):
    """Look up the compound-scaling coefficients of an EfficientNet variant.

    Args:
        model_name (str): Model name to be queried, e.g. 'efficientnet-b0'.

    Returns:
        tuple: ``(width, depth, resolution, dropout)`` for ``model_name``.

    Raises:
        KeyError: If ``model_name`` is not one of the known variants.
    """
    # name -> (width coefficient, depth coefficient, resolution, dropout rate)
    coefficients = {
        'efficientnet-b0': (1.0, 1.0, 224, 0.2),
        'efficientnet-b1': (1.0, 1.1, 240, 0.2),
        'efficientnet-b2': (1.1, 1.2, 260, 0.3),
        'efficientnet-b3': (1.2, 1.4, 300, 0.3),
        'efficientnet-b4': (1.4, 1.8, 380, 0.4),
        'efficientnet-b5': (1.6, 2.2, 456, 0.4),
        'efficientnet-b6': (1.8, 2.6, 528, 0.5),
        'efficientnet-b7': (2.0, 3.1, 600, 0.5),
        'efficientnet-b8': (2.2, 3.6, 672, 0.5),
        'efficientnet-l2': (4.3, 5.3, 800, 0.5),
    }
    selected = coefficients[model_name]
    return selected
def efficientnet(width_coefficient=None, depth_coefficient=None, image_size=None,
                 dropout_rate=0.2, drop_connect_rate=0.2, num_classes=1000):
    """Assemble the block list and global settings of an EfficientNet.

    Args:
        width_coefficient (float)
        depth_coefficient (float)
        image_size (int)
        dropout_rate (float)
        drop_connect_rate (float)
        num_classes (int)
            All six are stored unchanged in the returned GlobalParams.

    Returns:
        blocks_args, global_params: the decoded list of BlockArgs and a
        GlobalParams namedtuple.
    """
    # Stage layout in block-string notation (the efficientnet-b0 layout).
    # The same list is returned for every variant; the coefficients that
    # distinguish variants travel in global_params.
    stage_specs = [
        'r1_k3_s11_e1_i32_o16_se0.25',
        'r2_k3_s22_e6_i16_o24_se0.25',
        'r2_k5_s22_e6_i24_o40_se0.25',
        'r3_k3_s22_e6_i40_o80_se0.25',
        'r3_k5_s11_e6_i80_o112_se0.25',
        'r4_k5_s22_e6_i112_o192_se0.25',
        'r1_k3_s11_e6_i192_o320_se0.25',
    ]
    decoded_blocks = BlockDecoder.decode(stage_specs)

    settings = dict(
        width_coefficient=width_coefficient,
        depth_coefficient=depth_coefficient,
        image_size=image_size,
        dropout_rate=dropout_rate,
        num_classes=num_classes,
        batch_norm_momentum=0.99,
        batch_norm_epsilon=1e-3,
        drop_connect_rate=drop_connect_rate,
        depth_divisor=8,
        min_depth=None,
    )
    return decoded_blocks, GlobalParams(**settings)
def get_model_params(model_name, override_params):
    """Return the block args and global params for a named model.

    Args:
        model_name (str): Model's name; must start with 'efficientnet'.
        override_params (dict): Fields of global_params to replace; may be
            empty or None to keep the defaults.

    Returns:
        blocks_args, global_params

    Raises:
        NotImplementedError: If ``model_name`` does not start with
            'efficientnet'.
    """
    if not model_name.startswith('efficientnet'):
        raise NotImplementedError('model name is not pre-defined: %s' % model_name)

    width, depth, resolution, dropout = efficientnet_params(model_name)
    # drop_connect_rate is left at efficientnet()'s default of 0.2 for all models.
    blocks_args, global_params = efficientnet(
        width_coefficient=width, depth_coefficient=depth,
        dropout_rate=dropout, image_size=resolution)

    if override_params:
        # _replace raises ValueError for names that are not GlobalParams fields.
        global_params = global_params._replace(**override_params)
    return blocks_args, global_params
# Download URLs of pretrained weights, keyed by model name, for the models
# trained with the standard procedure.
# See the paper "EfficientNet: Rethinking Model Scaling for Convolutional
# Neural Networks" for details.
# load_pretrained_weights uses this map when advprop is False.
url_map = {
    'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth',
    'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b1-f1951068.pth',
    'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth',
    'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b3-5fb5a3c3.pth',
    'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth',
    'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b5-b6417697.pth',
    'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b6-c76e70fd.pth',
    'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth',
}
# Download URLs of pretrained weights, keyed by model name, for the models
# trained with adversarial examples (AdvProp).
# See the paper "Adversarial Examples Improve Image Recognition" for details.
# load_pretrained_weights uses this map when advprop is True.
url_map_advprop = {
    'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b0-b64d5a18.pth',
    'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b1-0f3ce85a.pth',
    'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b2-6e9d97e5.pth',
    'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b3-cdd7c0f4.pth',
    'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b4-44fb3a87.pth',
    'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b5-86493f6b.pth',
    'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b6-ac80338e.pth',
    'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b7-4652b6dd.pth',
    'efficientnet-b8': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b8-22a8fe65.pth',
}
# TODO: add the pretrained weights url map of 'efficientnet-l2'
def load_pretrained_weights(model, model_name, weights_path=None, load_fc=True, advprop=False):
    """Loads pretrained weights from weights path or download using url.

    Args:
        model (Module): The whole model of efficientnet.
        model_name (str): Model name of efficientnet.
        weights_path (None or str):
            str: path to pretrained weights file on the local disk.
            None: use pretrained weights downloaded from the Internet.
        load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model.
        advprop (bool): Whether to load pretrained weights
                        trained with advprop (valid when weights_path is None).

    Raises:
        AssertionError: If the loaded weights leave model parameters
            uninitialised (other than the fc layer when ``load_fc`` is False)
            or contain keys the model does not have.
        KeyError: If no download URL is known for ``model_name``.
    """
    if isinstance(weights_path, str):
        # NOTE: torch.load unpickles the file; only load weights from a
        # trusted source.
        state_dict = torch.load(weights_path)
    else:
        # AutoAugment or Advprop (different preprocessing)
        url_map_ = url_map_advprop if advprop else url_map
        state_dict = model_zoo.load_url(url_map_[model_name])

    if load_fc:
        ret = model.load_state_dict(state_dict, strict=False)
        assert not ret.missing_keys, f'Missing keys when loading pretrained weights: {ret.missing_keys}'
    else:
        # Drop the classifier weights so the model keeps its own fc layer.
        # A default is passed so that weight files saved without the fc
        # layer are accepted as well.
        state_dict.pop('_fc.weight', None)
        state_dict.pop('_fc.bias', None)
        ret = model.load_state_dict(state_dict, strict=False)
        assert set(ret.missing_keys) == {'_fc.weight', '_fc.bias'}, \
            f'Missing keys when loading pretrained weights: {ret.missing_keys}'
    assert not ret.unexpected_keys, f'Unexpected keys when loading pretrained weights: {ret.unexpected_keys}'

    print('Loaded pretrained weights for {}'.format(model_name))
@@ -0,0 +1,23 @@
### Imagenet
This is a preliminary directory for evaluating the model on ImageNet. It is adapted from the standard PyTorch Imagenet script.
For now, only evaluation is supported, but I am currently building scripts to assist with training new models on Imagenet.
The evaluation results are slightly different from the original TensorFlow repository, due to differences in data preprocessing. For example, with the current preprocessing, `efficientnet-b3` gives a top-1 accuracy of `80.8`, rather than `81.1` in the paper. I am working on porting the TensorFlow preprocessing into PyTorch to address this issue.
To run on Imagenet, place your `train` and `val` directories in `data`.
Example commands:
```bash
# Evaluate small EfficientNet on NPU 0
python main.py --data data -e -a 'efficientnet-b0' --pretrained --npu 0
```
```bash
# Evaluate medium EfficientNet on NPU 0 with a batch size of 128
python main.py --data data -e -a 'efficientnet-b3' --pretrained --npu 0 --batch-size 128
```
```bash
# Evaluate ResNet-50 for comparison
python main.py --data data -e -a 'resnet50' --pretrained --npu 0
```
@@ -0,0 +1,5 @@
### ImageNet
Download ImageNet and place it into `train` and `val` folders here.
More details may be found with the official PyTorch ImageNet example [here](https://github.com/pytorch/examples/blob/master/imagenet).
@@ -0,0 +1,531 @@
"""
Train and evaluate image classification models on ImageNet using Ascend NPUs.
Pass -e/--evaluate to run a single pass over the validation set instead of training.
"""
import argparse
import os
import sys
import random
import shutil
import time
import warnings
import PIL
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from apex import amp
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../'))
from efficientnet_pytorch import EfficientNet
from efficientnet_pytorch import rand_augment_transform, augment_and_mix_transform, auto_augment_transform
from efficientnet_pytorch import RMSpropTF
from efficientnet_pytorch import npu_info
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# Command-line interface of the ImageNet training / evaluation script.
# Help texts that quote a default use argparse's %(default)s placeholder so
# the printed value always matches the configured one.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('--data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                    help='model architecture (default: resnet18)')
parser.add_argument('-j', '--workers', default=128, type=int, metavar='N',
                    help='number of data loading workers (default: %(default)s)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-5, type=float,
                    metavar='W', help='weight decay (default: %(default)s)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='hccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--npu', default=None, type=str,
                    help='npu id to use.')
parser.add_argument('--image_size', default=224, type=int,
                    help='image size')
parser.add_argument('--advprop', default=False, action='store_true',
                    help='use advprop or not')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--autoaug', action='store_true', help='use auto augment')
parser.add_argument('--amp', action='store_true', help='use apex')
parser.add_argument('--pm', '--precision-mode', default='O1', type=str,
                    help='precision mode to use for mix precision, only support O1, O2')
parser.add_argument('--loss_scale', default=1024, type=int, help='loss_scale for amp')
# main() exports this value as MASTER_ADDR before spawning the workers.
parser.add_argument('--addr', default='127.0.0.1', type=str,
                    help='master node address for distributed training '
                         '(default: %(default)s)')
parser.add_argument('--nnpus_per_node', default=None, type=int,
                    help='number of npus to use for distributed train on each node')
parser.add_argument('--val_feq', default=10, type=int,
                    help='validation frequency')
parser.add_argument('--device_list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
def device_id_to_process_device_map(device_list):
    """Map local process indices to device ids.

    Args:
        device_list (str): Comma-separated device ids, e.g. '0,1,2,3'.

    Returns:
        dict: ``{process_index: device_id}`` where the device ids are sorted in
        ascending numeric order and process indices count up from 0.
    """
    ordered_ids = sorted(int(token) for token in device_list.split(","))
    return dict(enumerate(ordered_ids))
def main():
    """Parse the command line and start one or several worker processes."""
    args = parser.parse_args()

    world_size_from_env = args.dist_url == "env://" and args.world_size == -1
    if world_size_from_env:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    args.process_device_map = device_id_to_process_device_map(args.device_list)
    nnpus_per_node = len(args.process_device_map)

    if not args.multiprocessing_distributed:
        # Single process: run the worker directly in this process.
        main_worker(args.npu, nnpus_per_node, args)
        return

    # One process per device on this node, so the world size counts
    # processes rather than nodes from here on.
    args.world_size = nnpus_per_node * args.world_size
    # Rendezvous settings inherited by the spawned workers.
    os.environ['MASTER_ADDR'] = args.addr
    os.environ['MASTER_PORT'] = '29688'
    # mp.spawn passes the process index as the first argument of main_worker.
    mp.spawn(main_worker, nprocs=nnpus_per_node, args=(nnpus_per_node, args))
def main_worker(npu, nnpus_per_node, args):
    """Build the model and data pipeline on one NPU, then train or evaluate.

    Args:
        npu: In distributed runs, the key into ``args.process_device_map``
            selecting this process's device (``mp.spawn`` supplies the process
            index); otherwise the NPU id itself.
        nnpus_per_node (int): Number of devices (worker processes) on this node.
        args (argparse.Namespace): Parsed command-line options.  ``args.npu``,
            ``args.rank``, ``args.batch_size``, ``args.workers`` and
            ``args.start_epoch`` may be overwritten here.

    Side effects:
        Sets the module-level ``total_batch_size`` (read by ``train``), writes
        ``res.txt`` in evaluate mode and saves a checkpoint while training.
    """
    global total_batch_size

    args.npu = npu
    if args.distributed:
        args.npu = args.process_device_map[npu]

    device = 'npu:' + str(args.npu)
    # NOTE(review): the source indentation was lost; set_device is assumed to
    # belong to this `if` together with the print -- confirm.
    if args.npu is not None:
        print("Use npu: {} for training".format(args.npu))
        torch.npu.set_device(device)

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * nnpus_per_node + int(npu)
        dist.init_process_group(backend=args.dist_backend,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if 'efficientnet' in args.arch:
        if args.pretrained:
            model = EfficientNet.from_pretrained(args.arch, advprop=args.advprop)
            print("=> using pre-trained model '{}'".format(args.arch))
        else:
            print("=> creating model '{}'".format(args.arch))
            model = EfficientNet.from_name(args.arch)
    else:
        if args.pretrained:
            print("=> using pre-trained model '{}'".format(args.arch))
            model = models.__dict__[args.arch](pretrained=True)
        else:
            print("=> creating model '{}'".format(args.arch))
            model = models.__dict__[args.arch]()

    criterion = nn.CrossEntropyLoss().to(device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    model = model.to(device)
    if args.amp:
        print("=> use amp...")
        if args.pm not in ['O1', 'O2']:
            print('=>unsupported precision mode!')
            # Report the configuration error through a non-zero exit status.
            sys.exit(1)
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.pm,
                                          loss_scale=args.loss_scale)

    # Remember the per-node batch size before it is split across processes;
    # train() uses it for its throughput figure.
    total_batch_size = args.batch_size
    if args.distributed:
        args.batch_size = int(args.batch_size / nnpus_per_node)
        args.workers = int(args.workers / nnpus_per_node)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.npu],
                                                          broadcast_buffers=False)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch']
            if args.amp:
                amp.load_state_dict(checkpoint['amp'])
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    if args.advprop:
        # AdvProp weights expect inputs rescaled from [0, 1] to [-1, 1].
        normalize = transforms.Lambda(lambda img: img * 2.0 - 1.0)
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    if 'efficientnet' in args.arch:
        image_size = EfficientNet.get_image_size(args.arch)
    else:
        image_size = args.image_size

    # The two training pipelines differ only in the augmentation step.
    if args.autoaug:
        print("=> use auto augment...")
        augmentation = auto_augment_wrapper(image_size)
    else:
        augmentation = transforms.RandomHorizontalFlip()
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(image_size),
            augmentation,
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    val_transforms = transforms.Compose([
        transforms.Resize(image_size, interpolation=PIL.Image.BICUBIC),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])
    print(device, ' optimizer params:', optimizer)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, val_transforms),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        # validate() requires the epoch and the per-node device count as well.
        res = validate(val_loader, model, criterion, args, args.start_epoch, nnpus_per_node)
        with open('res.txt', 'w') as f:
            print(res, file=f)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, nnpus_per_node)

        # evaluate on validation set
        if epoch % args.val_feq == 0 or epoch == args.epochs - 1:
            validate(val_loader, model, criterion, args, epoch, nnpus_per_node)

        # NOTE(review): the source indentation was lost; the checkpoint is
        # assumed to be written after every epoch rather than only after a
        # validation pass -- confirm.
        # Only the first process of each node writes the checkpoint.
        if not args.multiprocessing_distributed or args.rank % nnpus_per_node == 0:
            state = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            if args.amp:
                state['amp'] = amp.state_dict()
            save_checkpoint(state)
def train(train_loader, model, criterion, optimizer, epoch, args, nnpus_per_node):
    """Run one training epoch over ``train_loader``.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network being trained; switched to train mode here.
        criterion: Loss called as ``criterion(output, target)``.
        optimizer: Optimizer whose learning rate is rewritten every step.
        epoch (int): Current epoch, used for the learning-rate schedule and
            the progress prefix.
        args: Parsed options; ``npu``, ``amp``, ``rank`` and
            ``multiprocessing_distributed`` are read here.
        nnpus_per_node (int): Devices per node, used to pick the printing rank.

    Reads the module-level ``total_batch_size`` set by ``main_worker``.

    NOTE(review): the source indentation was lost.  The progress print is
    assumed to run for every batch inside the loop and the FPS log once after
    the loop -- confirm.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':6.4f')
    lr = AverageMeter('LR', ':6.4f')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    fps_time = AverageMeter('FPS', ':6.1f')
    progress = ProgressMeter(len(train_loader), fps_time, batch_time, data_time, losses, lr, top1,
                             top5, prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # The learning rate is a function of the global step, so it is
        # recomputed before every batch.
        adjust_learning_rate_fraction_epoch(optimizer, epoch, i, len(train_loader), args)
        # measure data loading time
        data_time.update(time.time() - end)

        optimizer.zero_grad()

        # Labels are converted to 32-bit integers before being moved to the device.
        target = target.int()
        images, target = images.to('npu:' + str(args.npu), non_blocking=True), target.to('npu:' + str(args.npu), non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        lr.update(optimizer.param_groups[0]['lr'], images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        if args.amp:
            # apex scales the loss before the backward pass.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()

        # measure elapsed time
        # Throughput is based on total_batch_size, the batch size recorded
        # before it was divided among the processes of the node.
        fps_time.update(total_batch_size / (time.time() - end))
        batch_time.update(time.time() - end)
        end = time.time()

        # Only the first process of each node prints progress.
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % nnpus_per_node == 0):
            progress.print(i)

    # print(' * FPS@all {:.3f}'.format(nnpus_per_node*args.batch_size / batch_time.avg))
    # Log the FPS meter (latest value and running average) for this epoch.
    hwlog.remark_print(key=hwlog.FPS, value=('{}'.format(fps_time)))
def validate(val_loader, model, criterion, args, epoch, nnpus_per_node):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss called as ``criterion(output, target)``.
        args: Parsed options; ``npu``, ``rank`` and
            ``multiprocessing_distributed`` are read here.
        epoch: Not used in the body.
        nnpus_per_node (int): Devices per node, used to pick the printing rank.

    Returns:
        ``top1.avg``, the running average kept by the Acc@1 meter.

    NOTE(review): the source indentation was lost.  The progress print is
    assumed to run for every batch, and the two accuracy log calls are assumed
    to sit inside the rank check together with the summary print -- confirm.
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), batch_time, losses, top1, top5,
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            # Labels are converted to 32-bit integers before being moved to the device.
            target = target.int()
            images, target = images.to('npu:' + str(args.npu), non_blocking=True), target.to('npu:' + str(args.npu), non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # Only the first process of each node prints progress.  Note that
            # ProgressMeter.print logs its values under the TRAIN_ACCURACY keys.
            if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                        and args.rank % nnpus_per_node == 0):
                progress.print(i)

        # TODO: this should also be done with the ProgressMeter
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % nnpus_per_node == 0):
            print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
                  .format(top1=top1, top5=top5))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))

    return top1.avg
def save_checkpoint(state, filename='checkpoint.pth'):
    """Serialise ``state`` to ``filename`` with ``torch.save``."""
    destination = filename
    torch.save(state, destination)
class AverageMeter(object):
    """Keep the latest value of a metric together with a running average.

    The first four calls to ``update`` after a reset only set ``val``; values
    are accumulated into ``sum``/``count``/``avg`` from the fifth call on.
    """

    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()
        self.skip = 0

    def reset(self):
        """Clear the latest value, the accumulated statistics and the warm-up counter."""
        self.val = self.avg = self.sum = self.count = self.skip = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` in the average."""
        self.val = val
        self.skip += 1
        # Warm-up: calls one to four are not accumulated.
        if self.skip >= 5:
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count

    def __str__(self):
        template = '{name} {val%s} ({avg%s})' % (self.fmt, self.fmt)
        return template.format(**vars(self))
class ProgressMeter(object):
    """Print a set of meters for a batch and log the current accuracies.

    Args:
        num_batches (int): Total number of batches; fixes the width of the
            ``[batch/total]`` counter.
        *meters: Objects with a ``name`` attribute whose ``str()`` starts with
            that name followed by the current value (e.g. ``AverageMeter``).
        prefix (str): Text printed before the batch counter.
    """

    def __init__(self, num_batches, *meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def print(self, batch):
        """Print one tab-separated progress line and log Acc@1 / Acc@5.

        The accuracies are logged only for meters that are actually present,
        so a ProgressMeter without accuracy meters can be printed as well.
        """
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

        train_acc1 = self._meter_value('Acc@1')
        train_acc5 = self._meter_value('Acc@5')
        if train_acc1 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
        if train_acc5 is not None:
            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)

    def _meter_value(self, name):
        """Return the current value printed by the meter called ``name``, or None."""
        for meter in self.meters:
            if getattr(meter, 'name', None) != name:
                continue
            # str(meter) looks like '<name> <value> (<average>)'; keep the
            # first whitespace-delimited token after the name.
            _, found, rest = str(meter).partition(name)
            if found:
                return rest.strip().split(" ")[0]
        return None

    def _get_batch_fmtstr(self, num_batches):
        """Build the '[{:Nd}/total]' template, N being the digit count of ``num_batches``."""
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
    """Step schedule: ``args.lr`` multiplied by 0.1 for every 30 completed epochs.

    The resulting rate is written into every parameter group of ``optimizer``.
    """
    completed_decays = epoch // 30
    new_lr = args.lr * (0.1 ** completed_decays)
    for group in optimizer.param_groups:
        group['lr'] = new_lr
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Args:
        output (Tensor): Scores of shape (batch, classes).
        target (Tensor): Class indices of shape (batch,).
        topk (tuple[int]): The k values to evaluate.

    Returns:
        list[Tensor]: One single-element tensor per entry of ``topk`` holding
        the percentage of samples whose target is among the k best scores.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk best classes per sample, transposed to
        # (maxk, batch) so that row j holds the j-th best guess.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # `correct` comes from a transposed tensor and is not contiguous,
            # so its first k rows cannot always be flattened with view();
            # reshape() copies when necessary.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
def auto_augment_wrapper(img_size, auto_augment='original-mstd0.5'):
    """Create the augmentation transform selected by the ``auto_augment`` string.

    Args:
        img_size (int): Image size; 45% of it is used as the translation constant.
        auto_augment (str): Policy string.  A 'rand' prefix selects
            ``rand_augment_transform``, an 'augmix' prefix selects
            ``augment_and_mix_transform`` and anything else selects
            ``auto_augment_transform``.

    Returns:
        The transform returned by the selected factory.
    """
    assert isinstance(auto_augment, str)
    mean_rgb = [0.485, 0.456, 0.406]
    # Mean colour expressed on the 0-255 scale.
    mean_255 = tuple([min(255, round(255 * channel)) for channel in mean_rgb])
    aa_params = {
        'translate_const': int(img_size * 0.45),
        'img_mean': mean_255,
    }

    if auto_augment.startswith('rand'):
        return rand_augment_transform(auto_augment, aa_params)
    if auto_augment.startswith('augmix'):
        aa_params['translate_pct'] = 0.3
        return augment_and_mix_transform(auto_augment, aa_params)
    return auto_augment_transform(auto_augment, aa_params)
def adjust_learning_rate_fraction_epoch(optimizer, epoch, step, steps_per_epoch, args):
    """Set the learning rate to ``args.lr`` decayed by 0.97 every 5.0 epochs.

    The decay count is derived from the global step
    (``step + epoch * steps_per_epoch``); the resulting rate is written into
    every parameter group of ``optimizer``.
    """
    global_step = step + epoch * steps_per_epoch
    steps_per_decay = int(steps_per_epoch * 5.0)
    new_lr = args.lr * (0.97 ** (global_step // steps_per_decay))
    for group in optimizer.param_groups:
        group['lr'] = new_lr
if __name__ == '__main__':
    # Log the environment and the fixed run settings, then start the run.
    cpu_info, npu_infos, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
    config_info = get_model_parameter("pytorch_config")
    initial_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}

    startup_records = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_infos),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initial_data["base_lr"]),
        (hwlog.DATASET, initial_data["dataset"]),
        (hwlog.OPT_NAME, initial_data["optimizer"]),
        (hwlog.LOSS_SCALE, initial_data["loss_scale"]),
    )
    for record_key, record_value in startup_records:
        hwlog.remark_print(key=record_key, value=record_value)

    main()
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,43 @@
from efficientnet_pytorch import EfficientNet as _EfficientNet
dependencies = ['torch']
def _create_model_fn(model_name):
    """Return a constructor function for the EfficientNet variant ``model_name``."""

    def _model_fn(num_classes=1000, in_channels=3, pretrained='imagenet'):
        """Create Efficient Net.

        Described in detail here: https://arxiv.org/abs/1905.11946

        Args:
            num_classes (int, optional): Number of classes, default is 1000.
            in_channels (int, optional): Number of input channels, default
                is 3.
            pretrained (str, optional): One of [None, 'imagenet', 'advprop'].
                None builds the model without loading pretrained weights.
                'advprop' requests the weights trained with adversarial
                examples; note that those models need a slightly different
                preprocessing from normal ImageNet preprocessing.
                Any other value requests the standard pretrained weights.
        """
        # Entry points use underscores; model names use hyphens.
        hyphenated = model_name.replace('_', '-')

        if pretrained is None:
            net = _EfficientNet.from_name(
                model_name=hyphenated,
                override_params={'num_classes': num_classes},
            )
            net._change_in_channels(in_channels)
            return net

        use_advprop = (pretrained == 'advprop')
        return _EfficientNet.from_pretrained(
            model_name=hyphenated,
            advprop=use_advprop,
            num_classes=num_classes,
            in_channels=in_channels)

    return _model_fn
# Bind one constructor per variant, efficientnet_b0 through efficientnet_b8,
# under its own name in the current namespace.
for model_name in ('efficientnet_b' + str(variant) for variant in range(9)):
    locals()[model_name] = _create_model_fn(model_name)
@@ -0,0 +1,9 @@
# Set up the Ascend (nnae) environment and start a single-device
# EfficientNet-B0 ImageNet training run on NPU 0.
export ASCEND_HOME=/usr/local/Ascend
# Append the Ascend runtime, driver and Python 3.7.5 library directories to
# the existing library search path.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/python3.7.5/lib/
# Append the Ascend Python packages (site-packages, auto_tune, schedule_search,
# tbe operator implementations, hccl) to the module search path.
export PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
# Also make the directory this script is started from importable.
export PYTHONPATH=$PYTHONPATH:${PWD}
export SLOG_PRINT_TO_STDOUT=0
export TASK_QUEUE_ENABLE=1
# Restrict the process to CPUs 0-64; the relative script path means this must
# be run from the directory that contains examples/.
taskset -c 0-64 python3.7 examples/imagenet/main.py --data=/data/imagenet --arch=efficientnet-b0 --batch-size=256 --lr=0.2 --epochs=200 --autoaug --npu=0 --amp --pm=O1 --loss_scale=1024
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Note: To use the 'upload' functionality of this file, you must:
# $ pipenv install twine --dev
import io
import os
import sys
from shutil import rmtree
from setuptools import find_packages, setup, Command
# Package meta-data.
# These constants are passed to the setup() call at the bottom of this file.
NAME = 'efficientnet_pytorch'
DESCRIPTION = 'EfficientNet implemented in PyTorch.'
URL = 'https://github.com/lukemelas/EfficientNet-PyTorch'
EMAIL = 'lmelaskyriazi@college.harvard.edu'
AUTHOR = 'Luke'
REQUIRES_PYTHON = '>=3.5.0'
# If VERSION is falsy (e.g. empty), the version is read from
# <package>/__version__.py instead.
VERSION = '0.7.0'
# What packages are required for this module to be executed?
REQUIRED = [
'torch'
]
# What packages are optional?
EXTRAS = {
# 'fancy feature': ['django'],
}
# The rest you shouldn't have to touch too much :)
# ------------------------------------------------
# Except, perhaps the License and Trove Classifiers!
# If you do change the License, remember to change the Trove Classifier for that!
here = os.path.abspath(os.path.dirname(__file__))

# Use README.md (prefixed with a newline) as the long description and fall
# back to the short DESCRIPTION when the file does not exist.
# Note: README.md must be listed in MANIFEST.in for this to work from an sdist.
readme_path = os.path.join(here, 'README.md')
try:
    with io.open(readme_path, encoding='utf-8') as f:
        long_description = '\n' + f.read()
except FileNotFoundError:
    long_description = DESCRIPTION

# Resolve the version: an explicit VERSION wins; otherwise execute the
# package's __version__.py and collect its globals in `about`.
about = {}
if VERSION:
    about['__version__'] = VERSION
else:
    project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
    with open(os.path.join(here, project_slug, '__version__.py')) as f:
        exec(f.read(), about)
class UploadCommand(Command):
    """Support setup.py upload."""

    description = 'Build and publish the package.'
    user_options = []

    @staticmethod
    def status(s):
        """Print ``s`` wrapped in the ANSI bold escape sequence."""
        bold_template = '\033[1m{0}\033[0m'
        print(bold_template.format(s))

    def initialize_options(self):
        """This command defines no options, so there is nothing to set up."""

    def finalize_options(self):
        """This command defines no options, so there is nothing to validate."""

    def run(self):
        """Rebuild the distributions, upload them, tag the release and exit."""
        stale_dist = os.path.join(here, 'dist')
        try:
            self.status('Removing previous builds…')
            rmtree(stale_dist)
        except OSError:
            # Best effort: a missing dist/ directory is not an error.
            pass

        build_command = '{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)
        tag_command = 'git tag v{0}'.format(about['__version__'])

        # Each stage is announced once and then its shell commands are run in
        # order; return codes are not checked.
        stages = (
            ('Building Source and Wheel (universal) distribution…', (build_command,)),
            ('Uploading the package to PyPI via Twine…', ('twine upload dist/*',)),
            ('Pushing git tags…', (tag_command, 'git push --tags')),
        )
        for announcement, shell_commands in stages:
            self.status(announcement)
            for shell_command in shell_commands:
                os.system(shell_command)

        sys.exit()
# Where the magic happens:
# Register the package using the metadata constants defined at the top of
# this file and the version/long description resolved above.
setup(
name=NAME,
version=about['__version__'],
description=DESCRIPTION,
long_description=long_description,
long_description_content_type='text/markdown',
author=AUTHOR,
author_email=EMAIL,
python_requires=REQUIRES_PYTHON,
url=URL,
# Ship every discovered package except the test packages.
packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]),
# py_modules=['model'], # If your package is a single module, use this instead of 'packages'
install_requires=REQUIRED,
extras_require=EXTRAS,
include_package_data=True,
license='Apache',
classifiers=[
# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6',
],
# $ setup.py publish support.
# The command is registered under the name 'upload'.
cmdclass={
'upload': UploadCommand,
},
)
@@ -0,0 +1,124 @@
from collections import OrderedDict
import pytest
import torch
import torch.nn as nn
from efficientnet_pytorch import EfficientNet
# -- fixtures -------------------------------------------------------------------------------------
@pytest.fixture(scope='module', params=list(range(4)))
def model(request):
    """Model name for each parametrized index 0-3 (efficientnet-b0 to -b3)."""
    variant_index = request.param
    return 'efficientnet-b{}'.format(variant_index)
@pytest.fixture(scope='module', params=[True, False])
def pretrained(request):
    """Flag selecting whether the `net` fixture loads pretrained weights."""
    use_pretrained = request.param
    return use_pretrained
@pytest.fixture(scope='function')
def net(model, pretrained):
    """Build a new network for every test, via `from_pretrained` or `from_name`."""
    if pretrained:
        return EfficientNet.from_pretrained(model)
    return EfficientNet.from_name(model)
# -- tests ----------------------------------------------------------------------------------------
@pytest.mark.parametrize('img_size', [224, 256, 512])
def test_forward(net, img_size):
    """Test `.forward()` runs on an all-zero image and yields no NaN values"""
    input_shape = (1, 3, img_size, img_size)
    logits = net(torch.zeros(input_shape))
    assert not torch.isnan(logits).any()
def test_dropout_training(net):
    """Test dropout `.training` is set by `.train()` on parent `nn.module`"""
    net.train()
    # Rely on truthiness instead of comparing a boolean with `== True`.
    assert net._dropout.training
def test_dropout_eval(net):
    """Test dropout `.training` is cleared by `.eval()` on parent `nn.module`"""
    net.eval()
    # Rely on truthiness instead of comparing a boolean with `== False`.
    assert not net._dropout.training
def test_dropout_update(net):
    """Test dropout `.training` is updated by `.train()` and `.eval()` on parent `nn.module`"""
    # Toggle twice so that a switch back from eval to train is covered too.
    for _ in range(2):
        net.train()
        assert net._dropout.training
        net.eval()
        assert not net._dropout.training
@pytest.mark.parametrize('img_size', [224, 256, 512])
def test_modify_dropout(net, img_size):
    """Test that the dropout and fc modules can be swapped for custom ones"""
    feature_dim = net._bn1.num_features
    drop_rate = net._global_params.dropout_rate
    hidden_dim = 512

    # Replacement for the dropout stage: a small named stack that projects
    # the features down to `hidden_dim`.
    stages = OrderedDict()
    stages['_bn2'] = nn.BatchNorm1d(feature_dim)
    stages['_drop1'] = nn.Dropout(p=drop_rate)
    stages['_linear1'] = nn.Linear(feature_dim, hidden_dim)
    stages['_relu'] = nn.ReLU()
    stages['_bn3'] = nn.BatchNorm1d(hidden_dim)
    stages['_drop2'] = nn.Dropout(p=drop_rate / 2)
    replacement_dropout = nn.Sequential(stages)
    replacement_fc = nn.Linear(hidden_dim, net._global_params.num_classes)

    net._dropout = replacement_dropout
    net._fc = replacement_fc

    # Batch of two all-zero images.
    batch = torch.zeros((2, 3, img_size, img_size))
    result = net(batch)
    assert not torch.isnan(result).any()
@pytest.mark.parametrize('img_size', [224, 256, 512])
def test_modify_pool(net, img_size):
    """Test that the pooling module can be swapped for a custom one"""

    class AdaptiveMaxAvgPool(nn.Module):
        """Concatenate adaptive average and max pooling along dim 1."""

        def __init__(self):
            super().__init__()
            self.ada_avgpool = nn.AdaptiveAvgPool2d(1)
            self.ada_maxpool = nn.AdaptiveMaxPool2d(1)

        def forward(self, x):
            pooled = (self.ada_avgpool(x), self.ada_maxpool(x))
            return torch.cat(pooled, dim=1)

    replacement_pool = AdaptiveMaxAvgPool()
    # The concatenation doubles the feature count, so the classifier input
    # is twice the original `in_features`.
    doubled_features = net._fc.in_features * 2
    replacement_fc = nn.Linear(doubled_features, net._global_params.num_classes)

    net._avg_pooling = replacement_pool
    net._fc = replacement_fc

    batch = torch.zeros((2, 3, img_size, img_size))
    result = net(batch)
    assert not torch.isnan(result).any()
@pytest.mark.parametrize('img_size', [224, 256, 512])
def test_extract_endpoints(net, img_size):
    """Test `.extract_endpoints()` returns NaN-free maps at the expected sizes"""
    # Endpoint key paired with the factor by which size(2) must be reduced.
    expected_reductions = (
        ('reduction_1', 2),
        ('reduction_2', 4),
        ('reduction_3', 8),
        ('reduction_4', 16),
        ('reduction_5', 32),
    )
    endpoints = net.extract_endpoints(torch.zeros((1, 3, img_size, img_size)))

    # All NaN checks come first, then all size checks.
    for key, _ in expected_reductions:
        assert not torch.isnan(endpoints[key]).any()
    for key, factor in expected_reductions:
        assert endpoints[key].size(2) == img_size // factor
@@ -0,0 +1,31 @@
# Environment for running on Ascend NPUs. The nnae installation is used when
# /usr/local/Ascend/nnae/latest exists, otherwise the ascend-toolkit
# installation. train.sh loads this file with `source`, so the exports take
# effect in the calling shell.

############## toolkit situation ################
# Reference settings for an x86_64 toolkit installation (kept disabled).
#export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH

############## nnae situation ################
if [ -d /usr/local/Ascend/nnae/latest ];then
    # The last system entry is /usr/lib/aarch64-linux-gnu; it was previously
    # misspelled "aarch64_64-linux-gnu", which differs from the directory name
    # used by the toolkit branch below.
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
else
    # Fallback: ascend-toolkit installation.
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
    export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
fi

# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/
export SLOG_PRINT_TO_STDOUT=0
#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
export TASK_QUEUE_ENABLE=1
@@ -0,0 +1,62 @@
#!/bin/bash
# Launcher for EfficientNet training: prepares the job directory and starts
# scripts/train.sh either directly (single machine) or through mpirun
# (cluster).
#
# Positional arguments:
#   $1  rank_size   number of devices taking part in the job
#   $2  yamlPath    YAML configuration file
#   $3  toolsPath   directory containing get_params_for_yaml.sh
#   $4  CLUSTER     "True" selects cluster mode (read only inside docker)
#   $5  space-separated IP list of all nodes (read only inside docker)
#
# NOTE(review): outside docker CLUSTER and MPIRUN_ALL_IP are not assigned
# here, and mpirun_ip / jsonFilePath / device_group_<N>p are never assigned in
# this script; presumably they come from the environment or from the eval of
# get_params_for_yaml.sh below -- confirm.
rank_size=$1
yamlPath=$2
toolsPath=$3

currentDir=$(cd "$(dirname "$0")/.."; pwd)
# NOTE(review): model_name is not referenced again in this script.
model_name=$(cd $currentDir/..;basename `pwd`)

if [ -f /.dockerenv ];then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi

# Load the configuration from the yaml file
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")

# Remove old logs
rm -rf /var/log/npu/slog/host-0/*
rm -rf ${currentDir}/result/*.log

#mkdir train job path
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir}"

# Device list: when no device group is configured for this rank_size (or
# rank_size is 8 or more), use devices 0 .. rank_size-1 in order
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# Take the first device id in device_group (the first character once the
# spaces are removed); the hw log is linked from the directory named after it.
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`

if [ x"${CLUSTER}" == x"True" ];then
    this_ip=$(hostname -I |awk '{print $1}')
    # Link device 0's hw log into the job directory.
    ln -snf ${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/0/hw_efficientnet.log ${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/
    # Copy the yaml and json files to the same paths on every other node.
    for ip in $MPIRUN_ALL_IP;do
        if [ x"$ip" != x"$this_ip" ];then
            scp $yamlPath root@$ip:$yamlPath
            scp ${jsonFilePath} root@$ip:${jsonFilePath}
        fi
    done
    export PATH=$PATH:/usr/local/mpirun4.0/bin
    # Every continuation backslash is preceded by a space so that adjacent
    # arguments stay separate regardless of how the next line is indented.
    mpirun -H ${mpirun_ip} \
        --bind-to none -map-by slot \
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
        --prefix /usr/local/mpirun4.0/ \
        ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
    rank_id=0
    # The per-device loop below is disabled: a single train.sh process is
    # started in the background with device id 0 and rank id 0.
    #for device_id in $device_group;do
    ln -snf ${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/${first_device_id}/hw_efficientnet.log ${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/
    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
    # let rank_id++
    # done
fi

# Wait for the background training process to finish.
wait
@@ -0,0 +1,132 @@
#!/usr/bin/env bash
# Run one EfficientNet-B0 training process and record its result and elapsed
# time in the job's log files.
#
# Positional arguments:
#   $1  device_id   device id exported as DEVICE_ID (replaced in cluster mode)
#   $2  rank_size   number of devices taking part in the job
#   $3  yamlPath    YAML configuration file (also exported as YAML_PATH)
#   $4  currtime    timestamp suffix of the training_job_<currtime> directory
#   $5  toolsPath   directory containing get_params_for_yaml.sh
#   $6  rank id of this process, or the literal "True" to select cluster mode
#
# NOTE(review): mode, data_url, batch_size, epoches, lr, device_group and
# (outside cluster mode) device are not assigned in this script; presumably
# they are defined by the eval of get_params_for_yaml.sh below -- confirm.
device_id=$1
rank_size=$2
yamlPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
export YAML_PATH=$3

mkdir -p ${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/pt_efficientnet/training_job_${currtime}/

# Load the configuration from the yaml file
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")

export REMARK_LOG_FILE=hw_efficientnet.log # remark-log file name; must be hw_ followed by the lower-case model name
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

#source ${currentDir}/config/npu_set_env.sh
source ${currentDir}/config/set_env_b023.sh

# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export HCCL_RANK_TABLE_PATH=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# RANK_INDEX is not assigned in this script; an unset name counts as 0 inside
# the arithmetic expansion.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}

cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}

if [ x"$6" != x"True" ];then
    # Single machine: the sixth argument is the rank id.
    rank_id=$6
    export RANK_ID=$6
else
    # Cluster: take rank and device id from the output of the atlasboost
    # helper. The rank is the last word of that output; the device id is the
    # text between "deviceid = " and " phyid=".
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
        device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
        atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    # Re-echo unquoted to collapse the output onto one line.
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    # Copy the json found in the job directory to the rank table path.
    hccljson=${train_job_dir}/*.json
    cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi

#mkdir exec path
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}

startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`

# Choose the call arguments depending on single-device / multi-device mode
if [ x"$6" == x"True" ];then
    # multiple devices on multiple machines
    export CLUSTER=True
fi

if [ x"${mode}" == x"evaluate" ];then
    # Evaluation is not implemented here. The previous placeholder was the
    # Python keyword `pass`, which is not a shell command; report the case
    # explicitly and keep a non-zero status so that the run is still logged
    # as failed below.
    echo "[ERROR] mode=evaluate is not supported by this script" >&2
    false
elif [ x"${rank_size}" == x"1" ];then
    # single device
    # NOTE(review): in this branch ${device} is only set if the yaml eval
    # defines it; confirm that ${device_id} is not what was intended.
    taskset -c 0-128 python3.7 ${currentDir}/code/examples/imagenet/main.py \
        --data=${data_url} \
        --arch=efficientnet-b0 \
        --batch-size=${batch_size} \
        --lr=0.2 \
        --momentum=0 \
        --epochs=${epoches} \
        --autoaug \
        --amp \
        --pm=O1 \
        --loss_scale=128 \
        --val_feq=10 \
        --npu=${device} > ${train_job_dir}/train_${rank_size}p.log 2>&1
elif [ ${rank_size} -le 8 ];then
    # multiple devices on one machine
    taskset -c 0-128 python3.7 ${currentDir}/code/examples/imagenet/main.py \
        --data=${data_url} \
        --arch=efficientnet-b0 \
        --batch-size=${batch_size} \
        --lr=${lr} \
        --momentum=0 \
        --epochs=${epoches} \
        --autoaug \
        --amp \
        --pm=O1 \
        --loss_scale=128 \
        --val_feq=10 \
        --addr=$(hostname -I |awk '{print $1}') \
        --dist-backend=hccl \
        --multiprocessing-distributed \
        --world-size 1 \
        --rank 0 \
        --device_list ${device_group} > ${train_job_dir}/train_${rank_size}p.log 2>&1
fi
#taskset -c 0-20 python3.7 ${currentDir}/code/efficientnet.py > ./train.log 2>&1

# $? is the status of the if/elif chain above, i.e. of the command that ran
# in the selected branch (0 when no branch was selected).
if [ $? -eq 0 ];then
    echo ":::ABK 1.0.0 efficientnet train success"
    echo ":::ABK 1.0.0 efficientnet train success" >> ${train_job_dir}/train_${rank_size}p.log
    echo ":::ABK 1.0.0 efficientnet train success" >> ./hw_efficientnet.log
else
    echo ":::ABK 1.0.0 efficientnet train failed"
    echo ":::ABK 1.0.0 efficientnet train failed" >> ${train_job_dir}/train_${rank_size}p.log
    echo ":::ABK 1.0.0 efficientnet train failed" >> ./hw_efficientnet.log
fi

endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
# Elapsed wall time split into hours, minutes and seconds.
sumTime=$(( endTime_s - startTime_s ))
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 efficientnet train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_efficientnet.log
@@ -0,0 +1 @@
#!/bin/bash

Some files were not shown because too many files have changed in this diff Show More