[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1 @@
+#!/bin/bash
@@ -0,0 +1,25 @@
+# MobileNet_pytorch训练说明
+
+### 1. 模型训练参数配置
+
+在train/yaml/MobileNet.yaml中修改相应配置， 配置项含义:
+
+```
+pytorch_config:
+    data_url: 数据集路径
+    epoches: 跑多少个epoch
+    batch_size: 单p默认768 2p 1534 4p 3072  8p默认6144
+    lr: 默认参数1p 0.03 2p 0.06 4p 0.12 8p 0.24
+    seed: 123456
+    docker_image: docker 镜像名称:版本号
+```
+
+------
+
+
+
+
+
+
+
+    
@@ -0,0 +1 @@
+# MobileNetV2 NPU训练
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Module-level buffers filled by the hook callbacks defined in this module.
+# Nothing here ever clears them, so each hooked call appends more entries.
+g_feat_in = []  # forward inputs, appended by forward_hook_fn
+g_feat_out = []  # forward outputs, appended by forward_hook_fn
+g_grad_in = []  # grad_input values, appended by backward_hook_fn
+g_grad_out = []  # grad_output values, appended by backward_hook_fn
+
+
+def forward_hook_fn(module, input, output):
+    g_feat_in.append(input)
+    g_feat_out.append(output)
+    print(module)
+    print(input)
+    print(output)
+
+
+def backward_hook_fn(module, grad_input, grad_output):
+    g_grad_in.append(grad_input)
+    g_grad_out.append(grad_output)
+    print(module)
+    print(grad_input)
+    print(grad_input)
+
+
+
+
+
@@ -0,0 +1,498 @@
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.multiprocessing as mp
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+from mobilenet import mobilenet_v2
+import torch.npu
+
+# from torch.utils.tensorboard import SummaryWriter
+
+from apex import amp
+import numpy as np
+
+from hook import *
+
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+
+
+# model_names = sorted(name for name in models.__dict__
+#     if name.islower() and not name.startswith("__")
+#     and callable(models.__dict__[name]))
+
+# Command-line interface of the training script. Several options below
+# (--pretrained, --world-size, --rank, --dist-url, --dist-backend, --gpu,
+# --multiprocessing-distributed, --summary-path) are parsed but are not read
+# by the code visible in this script.
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser.add_argument('--data', metavar='DIR', default='/dataset/imagenet',
+                    help='path to dataset')
+# parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
+#                     choices=model_names,
+#                     help='model architecture: ' +
+#                         ' | '.join(model_names) +
+#                         ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+                    metavar='N',
+                    help='mini-batch size (default: 256), this is the total '
+                         'batch size of all GPUs on the current node when '
+                         'using Data Parallel or Distributed Data Parallel')
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='initial learning rate', dest='lr')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)',
+                    dest='weight_decay')
+parser.add_argument('-p', '--print-freq', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--world-size', default=-1, type=int,
+                    help='number of nodes for distributed training')
+parser.add_argument('--rank', default=-1, type=int,
+                    help='node rank for distributed training')
+parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
+                    help='url used to set up distributed training')
+parser.add_argument('--dist-backend', default='nccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--seed', default=None, type=int,
+                    help='seed for initializing training. ')
+parser.add_argument('--gpu', default=None, type=int,
+                    help='GPU id to use.')
+parser.add_argument('--multiprocessing-distributed', action='store_true',
+                    help='Use multi-processing distributed training to launch '
+                         'N processes per node, which has N GPUs. This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+
+# Mixed-precision (apex amp) options; only used when --amp is given.
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--opt-level', default=None, type=str, help='apex optimize level')
+# The default is the string '1024'; argparse passes string defaults through
+# the type= converter, so args.loss_scale_value is still an int.
+parser.add_argument('--loss-scale-value', default='1024', type=int, help='static loss scale value')
+
+# Benchmark / debugging options.
+parser.add_argument('--summary-path', default=None, type=str, help='event file path')
+parser.add_argument('--stop-step-num', default=None, type=int, help='after the stop-step, killing the training task')
+parser.add_argument('--device', default='npu:0', type=str, help='device type, cpu or npu:x or cuda:x')
+parser.add_argument('--eval-freq', default=10, type=int, help='test interval')
+parser.add_argument('--hook', default=False, action='store_true', help='pytorch hook')
+
+# Module-level state shared between main_worker() and train().
+best_acc1 = 0  # best validation top-1 so far; also restored from a checkpoint
+cur_step = 0  # latest global training step, compared against --stop-step-num
+
+
+def seed_everything(seed, device):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    if 'cuda' in device:
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        cudnn.deterministic = True
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+
+def main():
+    args = parser.parse_args()
+
+    if args.seed is not None:
+        seed_everything(args.seed, args.device)
+
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    main_worker(args)
+
+
+def main_worker(args):
+    """Build the model, optimizer and data loaders, then train or evaluate.
+
+    With ``args.evaluate`` set, runs one validation pass and returns.
+    Otherwise trains from ``args.start_epoch`` to ``args.epochs``, validating
+    and writing a checkpoint every ``args.eval_freq`` epochs and after the
+    last epoch. Updates the module-level ``best_acc1``.
+    """
+    global best_acc1
+    global cur_step
+
+    # sum_writer = SummaryWriter(args.summary_path)
+    global_step = -1
+
+    # Select the active device before anything is moved onto it.
+    if 'npu' in args.device:
+        torch.npu.set_device(args.device)
+    if 'cuda' in args.device:
+        torch.cuda.set_device(args.device)
+
+    model = mobilenet_v2()
+
+    # set hook: attach the debug hooks from hook.py to every sub-module
+    if args.hook:
+        modules = model.named_modules()
+        for name, module in modules:
+            module.register_forward_hook(forward_hook_fn)
+            module.register_backward_hook(backward_hook_fn)
+
+    optimizer = torch.optim.SGD(model.parameters(), args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
+
+    criterion = nn.CrossEntropyLoss()
+
+    if 'npu' in args.device or 'cuda' in args.device:
+        model = model.to(args.device)
+        criterion = criterion.to(args.device)
+
+    # NOTE(review): --opt-level defaults to None; confirm apex accepts that
+    # when --amp is passed without --opt-level.
+    if args.amp:
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale_value)
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            checkpoint = torch.load(args.resume, map_location=args.device)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            print("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            # A missing checkpoint is only reported; the run continues with
+            # the freshly initialised model.
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+
+    # No sampler is used, so the loader below always shuffles.
+    train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
+
+    # NOTE(review): drop_last=True also applies to validation, so a trailing
+    # partial batch of validation images is not scored.
+    val_loader = torch.utils.data.DataLoader(
+        datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ])),
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=True, drop_last=True)
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args, global_step)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+
+        # train for one epoch
+        global_step = train(train_loader, model, criterion, optimizer, epoch, args, global_step)
+
+        # Validation and checkpointing only happen on evaluation epochs.
+        if (epoch + 1) % (args.eval_freq) == 0 or epoch == args.epochs - 1:
+            # evaluate on validation set
+            acc1 = validate(val_loader, model, criterion, args, global_step)
+
+            # remember best acc@1 and save checkpoint
+            is_best = acc1 > best_acc1
+            best_acc1 = max(acc1, best_acc1)
+
+            # save checkpoint (the amp state is included only when amp is on)
+            if args.amp:
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'state_dict': model.state_dict(),
+                    'best_acc1': best_acc1,
+                    'optimizer': optimizer.state_dict(),
+                    'amp': amp.state_dict(),
+                }, is_best)
+            else:
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'state_dict': model.state_dict(),
+                    'best_acc1': best_acc1,
+                    'optimizer': optimizer.state_dict(),
+                }, is_best)
+
+        # train() stores the latest step in cur_step; stop once the limit is hit.
+        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
+            break
+
+    # sum_writer.close()
+
+
+def train(train_loader, model, criterion, optimizer, epoch, args, global_step, sum_writer=None):
+    """Train ``model`` for one epoch and return the last global step reached.
+
+    The learning rate is recomputed before every step by
+    ``adjust_learning_rate``. ``sum_writer`` is accepted but not used by the
+    active code. Updates the module-level ``cur_step`` on every iteration and
+    leaves the loop early once it reaches ``args.stop_step_num``.
+    """
+    global cur_step
+
+    # Re-seed with a per-epoch offset when a seed was requested.
+    if args.seed is not None:
+        seed_everything(args.seed + epoch, args.device)
+
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    learning_rate = AverageMeter('LR', ':2.8f')
+    losses = AverageMeter('Loss', ':6.8f')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(train_loader),
+        [batch_time, data_time, learning_rate, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    steps_per_epoch = len(train_loader)
+    for i, (images, target) in enumerate(train_loader):
+
+        # The step index is derived from the epoch, overwriting the argument.
+        global_step = epoch * steps_per_epoch + i
+        cur_step = global_step
+
+        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
+
+        learning_rate.update(lr)
+
+        # sum_writer.add_scalar('learning rate', lr, global_step)
+
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        # Labels are cast to int32 on NPU devices before the transfer.
+        if 'npu' in args.device:
+            target = target.to(torch.int32)
+
+        if 'npu' in args.device or 'cuda' in args.device:
+            images = images.to(args.device, non_blocking=True)
+            target = target.to(args.device, non_blocking=True)
+
+        # output = None
+        # loss = None
+        # with torch.autograd.profiler.profile(record_shapes=True, use_npu=True) as prof:
+
+        # compute output
+        output = model(images)
+        loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        if args.amp:
+            # With amp enabled, backward runs on the scaled loss.
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+
+        # sum_writer.add_scalar('Accuary/train/top1', acc1, global_step)
+        # sum_writer.add_scalar('Accuary/train/top5', acc5, global_step)
+        # sum_writer.add_scalar('Loss/train/loss', loss, global_step)
+
+        optimizer.step()
+        # for name, parms in model.named_parameters():
+        #     print('-->name:', name, ' -->grad_value_max:', torch.max(parms.grad), ' -->grad_value_min:', torch.min(parms.grad))
+
+        # print(prof.key_averages().table())
+        # prof.export_chrome_trace("mobilenetv2_{}_npu.prof".format(i))
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if i % args.print_freq == 0:
+            progress.display(i)
+
+        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
+            break
+
+        # NOTE(review): these two lines sit inside the loop, so the running
+        # FPS is printed and logged on every step, and is skipped on the step
+        # that triggers the break above. Confirm that a per-epoch summary
+        # after the loop was not what was intended.
+        print(' * FPS@all {:.3f}'.format(args.batch_size / batch_time.avg))
+        hwlog.remark_print(key=hwlog.FPS, value=' * FPS@all {:.3f}'.format(args.batch_size / batch_time.avg))
+    return global_step
+
+
+def validate(val_loader, model, criterion, args, global_step, sum_writer=None):
+    """Evaluate ``model`` on ``val_loader`` and return the average top-1 accuracy.
+
+    Runs in eval mode under ``torch.no_grad()``, prints progress every
+    ``args.print_freq`` batches, and logs the final top-1/top-5 averages via
+    ``hwlog``. ``global_step`` and ``sum_writer`` are only referenced by the
+    commented-out summary-writer code.
+    """
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(val_loader),
+        [batch_time, losses, top1, top5],
+        prefix='Test: ')
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for i, (images, target) in enumerate(val_loader):
+
+            # Labels are cast to int32 on NPU devices before the transfer.
+            if 'npu' in args.device:
+                target = target.to(torch.int32)
+
+            if 'npu' in args.device or 'cuda' in args.device:
+                images = images.to(args.device, non_blocking=True)
+                target = target.to(args.device, non_blocking=True)
+
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            # NOTE(review): ProgressMeter.display logs under the TRAIN_ACCURACY
+            # keys, so these validation values are logged under them as well.
+            if i % args.print_freq == 0:
+                progress.display(i)
+
+        # TODO: this should also be done with the ProgressMeter
+        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+              .format(top1=top1, top5=top5))
+
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))
+
+        #if not args.evaluate:
+        #    # sum_writer.add_scalar('Loss/validation/loss', losses, global_step)
+        #      sum_writer.add_scalar('Accuary/validation/top1', top1.avg, global_step)
+        #      sum_writer.add_scalar('Accuary/validation/top5', top5.avg, global_step)
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best.pth.tar')
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+        # 日志打点
+        train_acc1 = str(entries).split("Acc@1")[1].strip().split(" ")[0]
+        train_acc5 = str(entries).split("Acc@5")[1].strip().split(" ")[0]
+        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
+        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+
+def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
+    """Apply the step-wise exponential LR schedule and return the new LR.
+
+    The rate is ``args.lr * 0.98 ** k`` where ``k`` is the number of completed
+    ``int(steps_per_epoch * 2.5)``-step intervals, i.e. it is multiplied by
+    0.98 roughly every 2.5 epochs. The value is written to every param group.
+    """
+    # lr = args.lr * (0.98 ** (epoch / 2.5))
+    lr = args.lr * (0.98 ** (global_step // int(steps_per_epoch * 2.5)))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+    return lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+if __name__ == '__main__':
+    # Script entry point: log environment and configuration via hwlog, then train.
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
+    config_info = get_model_parameter("pytorch_config")
+    # NOTE(review): these values are hard-coded; they are logged as-is and do
+    # not follow --lr or --loss-scale-value from the command line.
+    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    main()
@@ -0,0 +1,179 @@
+from torch import nn
+# from .utils import load_state_dict_from_url
+
+
+# Public names exported by this module.
+__all__ = ['MobileNetV2', 'mobilenet_v2']
+
+
+# Pretrained-weight URL. Its only consumer in this file is the commented-out
+# loading code in mobilenet_v2(), so it is currently unused.
+model_urls = {
+    'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
+}
+
+
+def _make_divisible(v, divisor, min_value=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    :param v:
+    :param divisor:
+    :param min_value:
+    :return:
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+        padding = (kernel_size - 1) // 2
+        super(ConvBNReLU, self).__init__(
+            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+            nn.BatchNorm2d(out_planes),
+            nn.ReLU6(inplace=True)
+            # nn.ReLU(inplace=True)
+        )
+
+
+class InvertedResidual(nn.Module):
+    def __init__(self, inp, oup, stride, expand_ratio):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        layers = []
+        if expand_ratio != 1:
+            # pw
+            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+        layers.extend([
+            # dw
+            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+            # pw-linear
+            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(oup),
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV2(nn.Module):
+    # Feature extractor (self.features) followed by global average pooling
+    # and a Dropout + Linear classifier (self.classifier).
+    def __init__(self,
+                 num_classes=1000,
+                 width_mult=1.0,
+                 inverted_residual_setting=None,
+                 round_nearest=8,
+                 block=None):
+        """
+        MobileNet V2 main class
+
+        Args:
+            num_classes (int): Number of classes
+            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+            inverted_residual_setting: Network structure, a list of
+                [t, c, n, s] rows (expand ratio, output channels, repeats,
+                stride of the first repeat)
+            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+            Set to 1 to turn off rounding
+            block: Module specifying inverted residual building block for mobilenet
+
+        Raises:
+            ValueError: if inverted_residual_setting is empty or its first
+                row does not have 4 elements
+
+        """
+        super(MobileNetV2, self).__init__()
+
+        if block is None:
+            block = InvertedResidual
+        input_channel = 32
+        last_channel = 1280
+
+        if inverted_residual_setting is None:
+            inverted_residual_setting = [
+                # t, c, n, s
+                [1, 16, 1, 1],
+                [6, 24, 2, 2],
+                [6, 32, 3, 2],
+                [6, 64, 4, 2],
+                [6, 96, 3, 1],
+                [6, 160, 3, 2],
+                [6, 320, 1, 1],
+            ]
+
+        # only check the first element, assuming user knows t,c,n,s are required
+        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+            raise ValueError("inverted_residual_setting should be non-empty "
+                             "or a 4-element list, got {}".format(inverted_residual_setting))
+
+        # building first layer
+        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+        # The last channel count is never scaled below its base value.
+        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        features = [ConvBNReLU(3, input_channel, stride=2)]
+        # building inverted residual blocks
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * width_mult, round_nearest)
+            for i in range(n):
+                # Only the first repeat of each row uses the row's stride.
+                stride = s if i == 0 else 1
+                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+                input_channel = output_channel
+        # building last several layers
+        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
+        # make it nn.Sequential
+        self.features = nn.Sequential(*features)
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            # p=0.2
+            nn.Dropout(0.2),
+            nn.Linear(self.last_channel, num_classes),
+        )
+
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x):
+        # This exists since TorchScript doesn't support inheritance, so the superclass method
+        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
+        x = self.features(x)
+        # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]
+        x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
+        x = self.classifier(x)
+        return x
+
+    def forward(self, x):
+        # Thin wrapper; the actual computation lives in _forward_impl.
+        return self._forward_impl(x)
+
+
+def mobilenet_v2(pretrained=False, progress=True, **kwargs):
+    """
+    Constructs a MobileNetV2 architecture from
+    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    model = MobileNetV2(**kwargs)
+    # if pretrained:
+    #     state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'],
+    #                                           progress=progress)
+    #     model.load_state_dict(state_dict)
+    return model
@@ -0,0 +1,18 @@
+{
+  "startCfg":
+  [
+    {
+      "jobID": "123456789",
+      "deviceID": ["0"],
+      "features":
+      [
+        {
+          "name": "task_trace"
+        },
+        {
+          "name": "training_trace"
+        }
+      ]
+    }
+  ]
+}
@@ -0,0 +1 @@
+# MobileNetV2 NPU训练
@@ -0,0 +1,22 @@
+export ASCEND_HOME=/usr/local/Ascend
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
+export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+
+export SLOG_PRINT_TO_STDOUT=0
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 7"
+
+export TASK_QUEUE_ENABLE=0
+taskset -c 111-150 python3 densenet121_1p_main.py \
+	--workers 40 \
+	--arch densenet121 \
+	--npu 7 \
+	--lr 0.1 \
+	--momentum 0.9 \
+	--amp \
+	--batch-size 256 \
+	--epoch 90 \
+	--evaluate \
+	--resume checkpoint.pth.tar \
+	--data /opt/npu/dataset/imagenet
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Module-level buffers filled by the hook callbacks defined in this module.
+# Nothing here ever clears them, so each hooked call appends more entries.
+g_feat_in = []  # forward inputs, appended by forward_hook_fn
+g_feat_out = []  # forward outputs, appended by forward_hook_fn
+g_grad_in = []  # grad_input values, appended by backward_hook_fn
+g_grad_out = []  # grad_output values, appended by backward_hook_fn
+
+
+def forward_hook_fn(module, input, output):
+    g_feat_in.append(input)
+    g_feat_out.append(output)
+    print(module)
+    print(input)
+    print(output)
+
+
+def backward_hook_fn(module, grad_input, grad_output):
+    g_grad_in.append(grad_input)
+    g_grad_out.append(grad_output)
+    print(module)
+    print(grad_input)
+    print(grad_input)
+
+
+
+
+
@@ -0,0 +1,556 @@
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.multiprocessing as mp
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+from mobilenet import mobilenet_v2
+import torch.npu
+import torch.cuda
+
+from torch.utils.tensorboard import SummaryWriter
+
+from apex import amp
+import numpy as np
+
+from hook import *
+
+
+# model_names = sorted(name for name in models.__dict__
+#     if name.islower() and not name.startswith("__")
+#     and callable(models.__dict__[name]))
+
+# Command-line interface of the training script. main() parses it once and hands the resulting
+# namespace to every spawned worker process.
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+# Dataset root; main_worker() reads the 'train' and 'val' sub-directories below it.
+parser.add_argument('--data', metavar='DIR', default='/dataset/imagenet',
+                    help='path to dataset')
+# parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
+#                     choices=model_names,
+#                     help='model architecture: ' +
+#                         ' | '.join(model_names) +
+#                         ' (default: resnet18)')
+# Total worker count; main_worker() divides it by the number of processes per node.
+parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                    help='number of total epochs to run')
+# Overwritten with the checkpoint's epoch when --resume points to an existing file.
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+# Total batch size; main_worker() divides it by the number of processes per node.
+parser.add_argument('-b', '--batch-size', default=256, type=int,
+                    metavar='N',
+                    help='mini-batch size (default: 256), this is the total '
+                         'batch size of all GPUs on the current node when '
+                         'using Data Parallel or Distributed Data Parallel')
+# Base learning rate; adjust_learning_rate() scales it on every training step.
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='initial learning rate', dest='lr')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)',
+                    dest='weight_decay')
+parser.add_argument('-p', '--print-freq', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+# NOTE(review): not consumed by the visible code - main_worker() calls mobilenet_v2() without arguments.
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+# parser.add_argument('--world-size', default=-1, type=int,
+#                     help='number of nodes for distributed training')
+# main() multiplies this by the per-node device count to obtain args.world_size.
+parser.add_argument('--node-nums', default=1, type=int,
+                    help='number of nodes for distributed training')
+# NOTE(review): main_worker() overwrites args.rank with the spawned process index.
+parser.add_argument('--rank', default=0, type=int,
+                    help='node rank for distributed training')
+# Only passed to init_process_group() when --device is not 'npu'.
+parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
+                    help='url used to set up distributed training')
+# NOTE(review): the default backend is 'nccl' while --device defaults to 'npu'; confirm which
+# backend NPU runs are expected to pass.
+parser.add_argument('--dist-backend', default='nccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--seed', default=None, type=int,
+                    help='seed for initializing training. ')
+# Only read by validate() for its log line; device selection uses the process index instead.
+parser.add_argument('--gpu', default=None, type=int,
+                    help='GPU id to use.')
+# main() returns without training unless this flag is given.
+parser.add_argument('--multiprocessing-distributed', action='store_true',
+                    help='Use multi-processing distributed training to launch '
+                         'N processes per node, which has N GPUs. This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+
+# Copied into the MASTER_ADDR environment variable by main().
+parser.add_argument('--addr', default='10.136.181.115', type=str,
+                    help='master addr')
+# NOTE(review): only referenced from commented-out calls in main().
+parser.add_argument('--device-id', default=None, type=int,
+                    help='GPU id to use.')
+
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--opt-level', default=None, type=str, help='apex optimize level')
+# The string default is converted through type=int by argparse, so the parsed value is the int 1024.
+parser.add_argument('--loss-scale-value', default='1024', type=int, help='static loss scale value')
+
+# Passed to SummaryWriter() in every worker process.
+parser.add_argument('--summary-path', default=None, type=str, help='event file path')
+# Training stops once the global step counter reaches this value.
+parser.add_argument('--stop-step-num', default=None, type=int, help='after the stop-step, killing the training task')
+# main() accepts exactly 'npu' or 'cuda'; any other value makes it print an error and return.
+parser.add_argument('--device', default='npu', type=str, help='device type, cpu or npu:x or cuda')
+# Validation and checkpointing run every eval-freq epochs and after the last epoch.
+parser.add_argument('--eval-freq', default=10, type=int, help='test interval')
+# Registers the forward/backward hooks from hook.py on every sub-module.
+parser.add_argument('--hook', default=False, action='store_true', help='pytorch hook')
+
+# Best top-1 validation accuracy seen so far; updated by main_worker().
+best_acc1 = 0
+# Global step of the most recent training batch; written by train(), read for --stop-step-num.
+cur_step = 0
+
+
+def seed_everything(seed, device):
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    if 'cuda' in device:
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        cudnn.deterministic = True
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+
+def main():
+    args = parser.parse_args()
+
+    if args.seed is not None:
+        seed_everything(args.seed, args.device)
+
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    os.environ['MASTER_ADDR'] = args.addr
+    os.environ['MASTER_PORT'] = '90000'
+
+    args.distributed = args.node_nums > 1 or args.multiprocessing_distributed
+    if not args.distributed:
+        print('dist param is not correct!')
+        return
+
+    if args.device == 'npu':
+        # device_nums_per_node = torch.npu.device_count()
+        device_nums_per_node = 2
+    elif args.device == 'cuda':
+        device_nums_per_node = torch.cuda.device_count()
+    else:
+        print('unknown device type[npu/cuda]!')
+        return
+
+    if args.multiprocessing_distributed:
+        args.world_size = device_nums_per_node * args.node_nums  # world_size means nums of all devices or nums of processes
+        if args.device == 'npu':
+            # main_worker(args.device_id, ngpus_per_node, args)  # 需要外层脚本启多个进程
+            mp.spawn(main_worker, nprocs=device_nums_per_node, args=(device_nums_per_node, args))  # 这里起子进程，就不需要外层脚本启多个进程了
+        else:
+            mp.spawn(main_worker, nprocs=device_nums_per_node, args=(device_nums_per_node, args))
+    else:
+        print('dist param is not correct!')
+        return
+        # main_worker(args.device_id, device_nums_per_node, args)
+
+
+# first param must be the index of PID
+def main_worker(pid_idx, device_nums_per_node, args):
+    global best_acc1
+    global cur_step
+
+    # dist set
+    sum_writer = SummaryWriter(args.summary_path)
+    global_step = -1
+
+    if args.distributed:
+        if args.multiprocessing_distributed:
+            # For multiprocessing distributed training, rank needs to be the
+            # global rank among all the processes
+            args.rank = pid_idx  # args.rank * device_nums_per_node + pid_idx
+            args.pid_idx = pid_idx
+
+        if args.device == 'npu':
+            dist.init_process_group(backend=args.dist_backend, world_size=args.world_size, rank=args.rank)
+        else:
+            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                    world_size=args.world_size, rank=args.rank)
+
+    if args.distributed:
+        # For multiprocessing distributed, DistributedDataParallel constructor
+        # should always set the single device scope, otherwise,
+        # DistributedDataParallel will use all available devices.
+        if args.device == 'npu':
+            loc = 'npu:{}'.format(pid_idx)
+            torch.npu.set_device(loc)
+        else:
+            torch.cuda.set_device(pid_idx)
+
+        args.batch_size = int(args.batch_size / device_nums_per_node)
+        args.workers = int((args.workers + device_nums_per_node - 1) / device_nums_per_node)
+
+    # Data loading code
+    traindir = os.path.join(args.data, 'train')
+    valdir = os.path.join(args.data, 'val')
+    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                     std=[0.229, 0.224, 0.225])
+
+    train_dataset = datasets.ImageFolder(
+        traindir,
+        transforms.Compose([
+            transforms.RandomResizedCrop(224),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            normalize,
+        ]))
+
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
+
+    val_loader = torch.utils.data.DataLoader(
+        datasets.ImageFolder(valdir, transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ])),
+        batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=True, drop_last=True)
+
+    # define model and train
+    model = mobilenet_v2()
+
+    criterion = nn.CrossEntropyLoss()
+
+    loc = None
+    if 'npu' == args.device:
+        loc = 'npu:{}'.format(pid_idx)
+    elif 'cuda' == args.device:
+        loc = 'cuda:{}'.format(pid_idx)
+    model = model.to(loc)
+
+    criterion = criterion.to(loc)
+
+    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
+
+    if args.amp:
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale_value)
+
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[pid_idx], broadcast_buffers=False)
+    # model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
+
+    # set hook
+    if args.hook:
+        modules = model.named_modules()
+        for name, module in modules:
+            module.register_forward_hook(forward_hook_fn)
+            module.register_backward_hook(backward_hook_fn)
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            checkpoint = torch.load(args.resume, map_location=args.device)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            print("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args, global_step, sum_writer)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+
+        # train for one epoch
+        global_step = train(train_loader, model, criterion, optimizer, epoch, args, global_step, sum_writer, device_nums_per_node)
+
+        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
+            # evaluate on validation set
+            acc1 = validate(val_loader, model, criterion, args, global_step, sum_writer, device_nums_per_node)
+
+            # remember best acc@1 and save checkpoint
+            is_best = acc1 > best_acc1
+            best_acc1 = max(acc1, best_acc1)
+
+            # save checkpoint
+            if args.amp:
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'state_dict': model.state_dict(),
+                    'best_acc1': best_acc1,
+                    'optimizer': optimizer.state_dict(),
+                    'amp': amp.state_dict(),
+                }, is_best)
+            else:
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'state_dict': model.state_dict(),
+                    'best_acc1': best_acc1,
+                    'optimizer': optimizer.state_dict(),
+                }, is_best)
+
+        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
+            break
+
+    sum_writer.close()
+
+
+def train(train_loader, model, criterion, optimizer, epoch, args, global_step, sum_writer, device_nums_per_node):
+    global cur_step
+
+    if args.seed is not None:
+        seed_everything(args.seed + epoch, args.device)
+
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    learning_rate = AverageMeter('LR', ':2.8f')
+    losses = AverageMeter('Loss', ':6.8f')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(train_loader),
+        [batch_time, data_time, learning_rate, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    # switch to train mode
+    model.train()
+
+    end = time.time()
+    steps_per_epoch = len(train_loader)
+    for i, (images, target) in enumerate(train_loader):
+
+        global_step = epoch * steps_per_epoch + i
+        cur_step = global_step
+
+        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
+
+        learning_rate.update(lr)
+
+        sum_writer.add_scalar('learning rate', lr, global_step)
+
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        if 'npu' in args.device:
+            target = target.to(torch.int32)
+
+        loc = None
+        if 'npu' in args.device:
+            loc = 'npu:{}'.format(args.pid_idx)
+        elif 'cuda' in args.device:
+            loc = 'cuda:{}'.format(args.pid_idx)
+        images = images.to(loc, non_blocking=True)
+        target = target.to(loc, non_blocking=True)
+
+        # output = None
+        # loss = None
+        # with torch.autograd.profiler.profile(record_shapes=True, use_npu=True) as prof:
+
+        # compute output
+        output = model(images)
+        loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        optimizer.zero_grad()
+        if args.amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+
+        sum_writer.add_scalar('Accuary/train/top1', acc1, global_step)
+        sum_writer.add_scalar('Accuary/train/top5', acc5, global_step)
+        sum_writer.add_scalar('Loss/train/loss', loss, global_step)
+
+        optimizer.step()
+        # for name, parms in model.named_parameters():
+        #     print('-->name:', name, ' -->grad_value_max:', torch.max(parms.grad), ' -->grad_value_min:', torch.min(parms.grad))
+
+        # print(prof.key_averages().table())
+        # prof.export_chrome_trace("mobilenetv2_{}_npu.prof".format(i))
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if i % args.print_freq == 0:
+            if not args.multiprocessing_distributed or \
+                    (args.multiprocessing_distributed and args.rank % device_nums_per_node == 0):
+                progress.display(i)
+
+        if not args.multiprocessing_distributed or \
+                (args.multiprocessing_distributed and args.rank % device_nums_per_node == 0):
+            print('FPS@all: {:.3f}'.format(8 * args.batch_size / batch_time.avg))
+
+        if args.stop_step_num is not None and cur_step >= args.stop_step_num:
+            break
+
+    return global_step
+
+
+def validate(val_loader, model, criterion, args, global_step, sum_writer, device_nums_per_node):
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(val_loader),
+        [batch_time, losses, top1, top5],
+        prefix='Test: ')
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        end = time.time()
+        for i, (images, target) in enumerate(val_loader):
+
+            if 'npu' in args.device:
+                target = target.to(torch.int32)
+
+            loc = None
+            if 'npu' in args.device:
+                loc = 'npu:{}'.format(args.pid_idx)
+            elif 'cuda' in args.device:
+                loc = 'cuda:{}'.format(args.pid_idx)
+            images = images.to(loc, non_blocking=True)
+            target = target.to(loc, non_blocking=True)
+
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if i % args.print_freq == 0:
+                if not args.multiprocessing_distributed or \
+                        (args.multiprocessing_distributed and args.rank % device_nums_per_node == 0):
+                    progress.display(i)
+
+        # TODO: this should also be done with the ProgressMeter
+        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+              .format(top1=top1, top5=top5))
+        if not args.multiprocessing_distributed or \
+                (args.multiprocessing_distributed and args.rank % device_nums_per_node == 0):
+            print("[device id:", args.gpu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))
+
+        if not args.evaluate:
+            # sum_writer.add_scalar('Loss/validation/loss', losses, global_step)
+            sum_writer.add_scalar('Accuary/validation/top1', top1.avg, global_step)
+            sum_writer.add_scalar('Accuary/validation/top5', top5.avg, global_step)
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best.pth.tar')
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+
+def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    # lr = args.lr * (0.98 ** (epoch / 2.5))
+    lr = args.lr * (0.98 ** (global_step // int(steps_per_epoch * 2.5)))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+    return lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,179 @@
+from torch import nn
+# from .utils import load_state_dict_from_url
+
+
+# Names exported by ``from mobilenet import *``.
+__all__ = ['MobileNetV2', 'mobilenet_v2']
+
+
+# URL of a mobilenet_v2 weights file on download.pytorch.org. Nothing in the visible code
+# reads it: the download call in mobilenet_v2() is commented out.
+model_urls = {
+    'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
+}
+
+
+def _make_divisible(v, divisor, min_value=None):
+    """
+    This function is taken from the original tf repo.
+    It ensures that all layers have a channel number that is divisible by 8
+    It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    :param v:
+    :param divisor:
+    :param min_value:
+    :return:
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNReLU(nn.Sequential):
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+        padding = (kernel_size - 1) // 2
+        super(ConvBNReLU, self).__init__(
+            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
+            nn.BatchNorm2d(out_planes),
+            nn.ReLU6(inplace=True)
+            # nn.ReLU(inplace=True)
+        )
+
+
+class InvertedResidual(nn.Module):
+    def __init__(self, inp, oup, stride, expand_ratio):
+        super(InvertedResidual, self).__init__()
+        self.stride = stride
+        assert stride in [1, 2]
+
+        hidden_dim = int(round(inp * expand_ratio))
+        self.use_res_connect = self.stride == 1 and inp == oup
+
+        layers = []
+        if expand_ratio != 1:
+            # pw
+            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
+        layers.extend([
+            # dw
+            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
+            # pw-linear
+            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
+            nn.BatchNorm2d(oup),
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV2(nn.Module):
+    """MobileNetV2 classifier: a convolutional feature extractor followed by dropout and a linear layer."""
+
+    def __init__(self,
+                 num_classes=1000,
+                 width_mult=1.0,
+                 inverted_residual_setting=None,
+                 round_nearest=8,
+                 block=None):
+        """
+        MobileNet V2 main class
+
+        Args:
+            num_classes (int): Number of classes
+            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
+            inverted_residual_setting: Network structure, a list of [t, c, n, s] rows
+                (expand ratio, output channels, number of blocks, stride of the first block)
+            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
+            Set to 1 to turn off rounding
+            block: Module specifying inverted residual building block for mobilenet
+
+        Raises:
+            ValueError: if inverted_residual_setting is empty or its first row does not have 4 elements
+
+        """
+        super(MobileNetV2, self).__init__()
+
+        if block is None:
+            block = InvertedResidual
+        # Channel counts before the width multiplier is applied.
+        input_channel = 32
+        last_channel = 1280
+
+        if inverted_residual_setting is None:
+            inverted_residual_setting = [
+                # t, c, n, s
+                [1, 16, 1, 1],
+                [6, 24, 2, 2],
+                [6, 32, 3, 2],
+                [6, 64, 4, 2],
+                [6, 96, 3, 1],
+                [6, 160, 3, 2],
+                [6, 320, 1, 1],
+            ]
+
+        # only check the first element, assuming user knows t,c,n,s are required
+        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
+            raise ValueError("inverted_residual_setting should be non-empty "
+                             "or a 4-element list, got {}".format(inverted_residual_setting))
+
+        # building first layer
+        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
+        # The last layer is never narrowed: the multiplier is clamped to at least 1.0.
+        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
+        features = [ConvBNReLU(3, input_channel, stride=2)]
+        # building inverted residual blocks
+        for t, c, n, s in inverted_residual_setting:
+            output_channel = _make_divisible(c * width_mult, round_nearest)
+            for i in range(n):
+                # Only the first block of a group uses stride s; the others use stride 1.
+                stride = s if i == 0 else 1
+                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
+                input_channel = output_channel
+        # building last several layers
+        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
+        # make it nn.Sequential
+        self.features = nn.Sequential(*features)
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            # p=0.2
+            nn.Dropout(0.2),
+            nn.Linear(self.last_channel, num_classes),
+        )
+
+        # weight initialization: Kaiming-normal (fan_out) for convolutions, unit scale and
+        # zero shift for batch norm, N(0, 0.01) weights and zero bias for linear layers
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.zeros_(m.bias)
+
+    def _forward_impl(self, x):
+        """Run the feature extractor, average-pool to 1x1, flatten and classify."""
+        # This exists since TorchScript doesn't support inheritance, so the superclass method
+        # (this one) needs to have a name other than `forward` that can be accessed in a subclass
+        x = self.features(x)
+        # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]
+        x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)
+        x = self.classifier(x)
+        return x
+
+    def forward(self, x):
+        """Return the classifier output for the input batch ``x``."""
+        return self._forward_impl(x)
+
+
+def mobilenet_v2(pretrained=False, progress=True, **kwargs):
+    """
+    Constructs a MobileNetV2 architecture from
+    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    model = MobileNetV2(**kwargs)
+    # if pretrained:
+    #     state_dict = load_state_dict_from_url(model_urls['mobilenet_v2'],
+    #                                           progress=progress)
+    #     model.load_state_dict(state_dict)
+    return model
@@ -0,0 +1,638 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.multiprocessing as mp
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+
+from mobilenet import mobilenet_v2
+from apex import amp
+from multi_epochs_dataloader import MultiEpochsDataLoader
+
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+
+BATCH_SIZE = 4096
+OPTIMIZER_BATCH_SIZE = 4096
+# model_names = sorted(name for name in models.__dict__
+#    if name.islower() and not name.startswith("__")
+#    and callable(models.__dict__[name]))
+
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
+                    help='path to dataset')
+# parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
+#                    choices=model_names,
+#                    help='model architecture: ' +
+#                        ' | '.join(model_names) +
+#                        ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
+                    metavar='N',
+                    help='mini-batch size (default: 256), this is the total '
+                         'batch size of all GPUs on the current node when '
+                         'using Data Parallel or Distributed Data Parallel')
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='initial learning rate', dest='lr')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)',
+                    dest='weight_decay')
+parser.add_argument('--workspace', type=str, default='./', metavar='DIR',
+                    help='path to directory where checkpoints will be stored')
+parser.add_argument('-p', '--print-freq', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('-ef', '--eval-freq', default=5, type=int,
+                    metavar='N', help='evaluate frequency (default: 5)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--world-size', default=-1, type=int,
+                    help='number of nodes for distributed training')
+parser.add_argument('--rank', default=-1, type=int,
+                    help='node rank for distributed training')
+parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
+                    help='url used to set up distributed training')
+parser.add_argument('--dist-backend', default='nccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--seed', default=None, type=int,
+                    help='seed for initializing training. ')
+parser.add_argument('--gpu', default=None, type=int,
+                    help='GPU id to use.')
+parser.add_argument('--multiprocessing-distributed', action='store_true',
+                    help='Use multi-processing distributed training to launch '
+                         'N processes per node, which has N GPUs. This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+parser.add_argument('-bm', '--benchmark', default=0, type=int,
+                    metavar='N', help='set benchmark status (default: 1,run benchmark)')
+parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
+parser.add_argument('--addr', default='10.136.181.115', type=str, help='master addr')
+parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str, help='checkpoint-nameprefix')
+parser.add_argument('--checkpoint-freq', default=0, type=int,
+                    metavar='N', help='checkpoint frequency (default: 0)'
+                                      '0: save only one file whitch per epoch;'
+                                      'n: save diff file per n epoch'
+                                      '-1:no checkpoint,not support')
+
+# apex
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--loss-scale', default=64., type=float,
+                    help='loss scale using in amp, default -1 means dynamic')
+parser.add_argument('--opt-level', default='O2', type=str,
+                    help='loss scale using in amp, default -1 means dynamic')
+
+warnings.filterwarnings('ignore')
+best_acc1 = 0
+
+
+def main():
+    args = parser.parse_args()
+    print("===============main()=================")
+    print(args)
+    print("===============main()=================")
+
+    os.environ['KERNEL_NAME_ID'] = str(0)
+    print("++++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    os.environ['MASTER_ADDR'] = args.addr  # '10.136.181.51'
+    os.environ['MASTER_PORT'] = '59629'
+
+    if args.gpu is not None:
+        warnings.warn('You have chosen a specific GPU. This will completely '
+                      'disable data parallelism.')
+
+    if args.dist_url == "env://" and args.world_size == -1:
+        args.world_size = int(os.environ["WORLD_SIZE"])
+
+    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
+
+    if args.device == 'npu':
+        ngpus_per_node = torch.npu.device_count()
+    else:
+        ngpus_per_node = torch.cuda.device_count()
+    if args.multiprocessing_distributed:
+        # Since we have ngpus_per_node processes per node, the total world_size
+        # needs to be adjusted accordingly
+        args.world_size = ngpus_per_node * args.world_size
+        # Use torch.multiprocessing.spawn to launch distributed processes: the
+        # main_worker process function
+        # The child process uses the environment variables of the parent process,
+        # we have to set KERNEL_NAME_ID for every proc
+        if args.device == 'npu':
+            # main_worker(args.gpu, ngpus_per_node, args)
+            mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+        else:
+            mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+    else:
+        # Simply call main_worker function
+        main_worker(args.gpu, ngpus_per_node, args)
+
+
+def main_worker(gpu, ngpus_per_node, args):
+    global best_acc1
+    args.gpu = gpu
+
+    print("[npu id:", args.gpu, "]", "++++++++++++++++ before set KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
+    os.environ['KERNEL_NAME_ID'] = str(gpu)
+    print("[npu id:", args.gpu, "]", "++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
+
+    if args.gpu is not None:
+        print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu))
+
+    if args.distributed:
+        if args.dist_url == "env://" and args.rank == -1:
+            args.rank = int(os.environ["RANK"])
+        if args.multiprocessing_distributed:
+            # For multiprocessing distributed training, rank needs to be the
+            # global rank among all the processes
+            args.rank = args.rank * ngpus_per_node + gpu
+
+        if args.device == 'npu':
+            dist.init_process_group(backend=args.dist_backend,  # init_method=args.dist_url,
+                                    world_size=args.world_size, rank=args.rank)
+        else:
+            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                    world_size=args.world_size, rank=args.rank)
+
+    loc = 'npu:{}'.format(args.gpu)
+    torch.npu.set_device(loc)
+
+    args.batch_size = int(args.batch_size / ngpus_per_node)
+    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
+
+    print("[npu id:", args.gpu, "]", "===============main_worker()=================")
+    print("[npu id:", args.gpu, "]", args)
+    print("[npu id:", args.gpu, "]", "===============main_worker()=================")
+
+    # Data loading code
+    # traindir = os.path.join(args.data, 'train')
+    # valdir = os.path.join(args.data, 'val')
+    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+    #                                  std=[0.229, 0.224, 0.225])
+
+    # train_dataset = datasets.ImageFolder(
+    #     traindir,
+    #     transforms.Compose([
+    #         transforms.RandomResizedCrop(224),
+    #         transforms.RandomHorizontalFlip(),
+    #         transforms.ToTensor(),
+    #         normalize,
+    #     ]))
+    #
+    # if args.distributed:
+    #     train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    # else:
+    #     train_sampler = None
+    #
+    # train_loader = torch.utils.data.DataLoader(
+    #     train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+    #     num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)
+
+    train_loader, train_loader_len, train_sampler = get_pytorch_train_loader(args.data,
+                                                                       args.batch_size,
+                                                                       workers=args.workers,
+                                                                       distributed=args.distributed)
+
+    # val_loader = torch.utils.data.DataLoader(
+    #     datasets.ImageFolder(valdir, transforms.Compose([
+    #         transforms.Resize(256),
+    #         transforms.CenterCrop(224),
+    #         transforms.ToTensor(),
+    #         normalize,
+    #     ])),
+    #     batch_size=args.batch_size, shuffle=True,
+    #     num_workers=args.workers, pin_memory=True, drop_last=True)
+
+    val_loader = get_pytorch_val_loader(args.data, args.batch_size, args.workers, distributed=False)
+
+    # create model
+    print("[npu id:", args.gpu, "]", "=> creating model '{}'".format('mobilenetv2'))
+    # model = models.__dict__[args.arch]()
+    model = mobilenet_v2()
+    model = model.to(loc)
+
+    # define loss function (criterion) and optimizer
+    criterion = nn.CrossEntropyLoss().to(loc)
+    optimizer = torch.optim.SGD(model.parameters(), args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
+
+    if args.amp:
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)
+
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    cudnn.benchmark = True
+
+    if args.evaluate:
+        validate(val_loader, model, criterion, args, ngpus_per_node)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+        # adjust_learning_rate(optimizer, epoch, args)
+
+        # train for one epoch
+        train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node)
+
+        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
+            # evaluate on validation set
+            acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)
+
+            # remember best acc@1 and save checkpoint
+            is_best = acc1 > best_acc1
+            best_acc1 = max(acc1, best_acc1)
+
+            if not args.multiprocessing_distributed or \
+                    (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0 and epoch == args.epochs - 1):
+                if args.amp:
+                    save_checkpoint({
+                        'epoch': epoch + 1,
+                        'state_dict': model.state_dict(),
+                        'best_acc1': best_acc1,
+                        'optimizer': optimizer.state_dict(),
+                        'amp': amp.state_dict(),
+                    }, is_best)
+                else:
+                    save_checkpoint({
+                        'epoch': epoch + 1,
+                        'state_dict': model.state_dict(),
+                        'best_acc1': best_acc1,
+                        'optimizer': optimizer.state_dict(),
+                    }, is_best)
+
+
+def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node):
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        train_loader_len,
+        [batch_time, data_time, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    loc = 'npu:{}'.format(args.gpu)
+
+    mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
+    std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
+    mean = mean.to(loc, non_blocking=True)
+    std = std.to(loc, non_blocking=True)
+
+    # switch to train mode
+    model.train()
+    end = time.time()
+    if args.benchmark == 1:
+        optimizer.zero_grad()
+
+    # steps_per_epoch = len(train_loader)
+    steps_per_epoch = train_loader_len
+    print('==========step per epoch======================', steps_per_epoch)
+    for i, (images, target) in enumerate(train_loader):
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        global_step = epoch * steps_per_epoch + i
+        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
+
+        target = target.to(torch.int32)
+        images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+        target = target.to(loc, non_blocking=True)
+
+        # compute output
+        output = model(images)
+        # stream = torch.npu.current_stream()
+        # stream.synchronize()
+
+        loss = criterion(output, target)
+        # stream = torch.npu.current_stream()
+        # stream.synchronize()
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        if args.benchmark == 0:
+            optimizer.zero_grad()
+
+        if args.amp:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+
+        # stream = torch.npu.current_stream()
+        # stream.synchronize()
+
+        if args.benchmark == 0:
+            optimizer.step()
+        elif args.benchmark == 1:
+            BATCH_SIZE_multiplier = int(OPTIMIZER_BATCH_SIZE / args.batch_size)
+            BM_optimizer_step = ((i + 1) % BATCH_SIZE_multiplier) == 0
+            if BM_optimizer_step:
+                for param_group in optimizer.param_groups:
+                    for param in param_group['params']:
+                        param.grad /= BATCH_SIZE_multiplier
+                optimizer.step()
+                optimizer.zero_grad()
+        # stream = torch.npu.current_stream()
+        # stream.synchronize()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        if i % args.print_freq == 0:
+            if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                                                        and args.rank % ngpus_per_node == 0):
+                progress.display(i)
+
+    if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                                                and args.rank % ngpus_per_node == 0):
+        print("[npu id:", args.gpu, "]", '* FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg))
+        hwlog.remark_print(key=hwlog.FPS,
+                           value=' * FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg))
+
+
+def validate(val_loader, model, criterion, args, ngpus_per_node):
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(val_loader),
+        [batch_time, losses, top1, top5],
+        prefix='Test: ')
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        loc = 'npu:{}'.format(args.gpu)
+        mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
+        std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
+        mean = mean.to(loc, non_blocking=True)
+        std = std.to(loc, non_blocking=True)
+
+        end = time.time()
+        for i, (images, target) in enumerate(val_loader):
+
+            target = target.to(torch.int32)
+            images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+            target = target.to(loc, non_blocking=True)
+
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if i % args.print_freq == 0:
+                if not args.multiprocessing_distributed or \
+                        (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
+                    progress.display(i)
+
+        # TODO: this should also be done with the ProgressMeter
+        if not args.multiprocessing_distributed or \
+                (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
+            print("[npu id:", args.gpu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+                  .format(top1=top1, top5=top5))
+            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
+            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch']))
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+        self.start_count_index = 10
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        if self.count == 0:
+            self.batchsize = n
+
+        self.val = val
+        self.count += n
+        if self.count > (self.start_count_index * self.batchsize):
+            self.sum += val * n
+            self.avg = self.sum / (self.count - self.start_count_index * self.batchsize)
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print("[npu id:", os.environ['KERNEL_NAME_ID'], "]", '\t'.join(entries))
+        train_acc1 = str(entries).split("Acc@1")[1].strip().split(" ")[0]
+        train_acc5 = str(entries).split("Acc@5")[1].strip().split(" ")[0]
+        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
+        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+
+# def adjust_learning_rate(optimizer, epoch, args):
+#     """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+#     lr = args.lr * (0.1 ** (epoch // 30))
+#     for param_group in optimizer.param_groups:
+#         param_group['lr'] = lr
+
+def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    # lr = args.lr * (0.98 ** (epoch / 2.5))
+    lr = args.lr * (0.98 ** (global_step // int(steps_per_epoch * 2.5)))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+    return lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+def fast_collate(batch):
+    imgs = [img[0] for img in batch]
+    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
+    w = imgs[0].size[0]
+    h = imgs[0].size[1]
+    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
+    for i, img in enumerate(imgs):
+        nump_array = np.asarray(img, dtype=np.uint8)
+        if nump_array.ndim < 3:
+            nump_array = np.expand_dims(nump_array, axis=-1)
+        nump_array = np.rollaxis(nump_array, 2)
+
+        tensor[i] += torch.from_numpy(nump_array)
+
+    return tensor, targets
+
+
+def get_pytorch_train_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
+    traindir = os.path.join(data_path, 'train')
+    train_dataset = datasets.ImageFolder(
+            traindir,
+            transforms.Compose([
+                transforms.RandomResizedCrop(224),
+                transforms.RandomHorizontalFlip(),
+                ]))
+
+    if distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    else:
+        train_sampler = None
+
+    dataloader_fn = MultiEpochsDataLoader  # torch.utils.data.DataLoader
+    train_loader = dataloader_fn(
+            train_dataset, batch_size=batch_size, shuffle=(train_sampler is None),
+            num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate, drop_last=True)
+    return train_loader, len(train_loader), train_sampler
+
+
+def get_pytorch_val_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
+    valdir = os.path.join(data_path, 'val')
+    val_dataset = datasets.ImageFolder(
+            valdir, transforms.Compose([
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
+                ]))
+
+    if distributed:
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+    else:
+        val_sampler = None
+
+        dataloader_fn = MultiEpochsDataLoader  # torch.utils.data.DataLoader
+        val_loader = dataloader_fn(
+            val_dataset,
+            sampler=val_sampler,
+            batch_size=batch_size, shuffle=(val_sampler is None),
+            num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, collate_fn=fast_collate)
+
+    return val_loader
+
+
+if __name__ == '__main__':
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
+    config_info = get_model_parameter("pytorch_config")
+    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    main()
@@ -0,0 +1,663 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.parallel
+import torch.backends.cudnn as cudnn
+import torch.distributed as dist
+import torch.optim
+import torch.multiprocessing as mp
+import torch.utils.data
+import torch.utils.data.distributed
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+import torchvision.models as models
+
+from mobilenet import mobilenet_v2
+from apex import amp
+from multi_epochs_dataloader import MultiEpochsDataLoader
+
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+BATCH_SIZE = 6144
+OPTIMIZER_BATCH_SIZE = 6144
+# model_names = sorted(name for name in models.__dict__
+#    if name.islower() and not name.startswith("__")
+#    and callable(models.__dict__[name]))
+
+parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
+parser.add_argument('--data', metavar='DIR', default='/opt/npu/dataset/imagenet',
+                    help='path to dataset')
+# parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50',
+#                    choices=model_names,
+#                    help='model architecture: ' +
+#                        ' | '.join(model_names) +
+#                        ' (default: resnet18)')
+parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
+                    help='number of data loading workers (default: 4)')
+parser.add_argument('--epochs', default=90, type=int, metavar='N',
+                    help='number of total epochs to run')
+parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
+                    help='manual epoch number (useful on restarts)')
+parser.add_argument('-b', '--batch-size', default=BATCH_SIZE, type=int,
+                    metavar='N',
+                    help='mini-batch size (default: 256), this is the total '
+                         'batch size of all GPUs on the current node when '
+                         'using Data Parallel or Distributed Data Parallel')
+parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
+                    metavar='LR', help='initial learning rate', dest='lr')
+parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
+                    help='momentum')
+parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
+                    metavar='W', help='weight decay (default: 1e-4)',
+                    dest='weight_decay')
+parser.add_argument('--workspace', type=str, default='./', metavar='DIR',
+                    help='path to directory where checkpoints will be stored')
+parser.add_argument('-p', '--print-freq', default=10, type=int,
+                    metavar='N', help='print frequency (default: 10)')
+parser.add_argument('-ef', '--eval-freq', default=5, type=int,
+                    metavar='N', help='evaluate frequency (default: 5)')
+parser.add_argument('--resume', default='', type=str, metavar='PATH',
+                    help='path to latest checkpoint (default: none)')
+parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
+                    help='evaluate model on validation set')
+parser.add_argument('--pretrained', dest='pretrained', action='store_true',
+                    help='use pre-trained model')
+parser.add_argument('--world-size', default=-1, type=int,
+                    help='number of nodes for distributed training')
+parser.add_argument('--rank', default=-1, type=int,
+                    help='node rank for distributed training')
+parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
+                    help='url used to set up distributed training')
+parser.add_argument('--dist-backend', default='nccl', type=str,
+                    help='distributed backend')
+parser.add_argument('--seed', default=None, type=int,
+                    help='seed for initializing training. ')
+parser.add_argument('--gpu', default=None, type=int,
+                    help='GPU id to use.')
+parser.add_argument('--multiprocessing-distributed', action='store_true',
+                    help='Use multi-processing distributed training to launch '
+                         'N processes per node, which has N GPUs. This is the '
+                         'fastest way to use PyTorch for either single node or '
+                         'multi node data parallel training')
+parser.add_argument('-bm', '--benchmark', default=0, type=int,
+                    metavar='N', help='set benchmark status (default: 1,run benchmark)')
+parser.add_argument('--device', default='npu', type=str, help='npu or gpu')
+parser.add_argument('--addr', default='10.136.181.115', type=str, help='master addr')
+parser.add_argument('--checkpoint-nameprefix', default='checkpoint', type=str, help='checkpoint-nameprefix')
+parser.add_argument('--checkpoint-freq', default=0, type=int,
+                    metavar='N', help='checkpoint frequency (default: 0)'
+                                      '0: save only one file whitch per epoch;'
+                                      'n: save diff file per n epoch'
+                                      '-1:no checkpoint,not support')
+
+parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list')
+
+# apex
+parser.add_argument('--amp', default=False, action='store_true',
+                    help='use amp to train the model')
+parser.add_argument('--loss-scale', default=64., type=float,
+                    help='loss scale using in amp, default -1 means dynamic')
+parser.add_argument('--opt-level', default='O2', type=str,
+                    help='loss scale using in amp, default -1 means dynamic')
+
+warnings.filterwarnings('ignore')
+best_acc1 = 0
+
+
+def device_id_to_process_device_map(device_list):
+    devices = device_list.split(",")
+    devices = [int(x) for x in devices]
+    devices.sort()
+
+    process_device_map = dict()
+    for process_id, device_id in enumerate(devices):
+        process_device_map[process_id] = device_id
+
+    return process_device_map
+
+
+def main():
+    args = parser.parse_args()
+    print("===============main()=================")
+    print(args)
+    print("===============main()=================")
+
+    os.environ['KERNEL_NAME_ID'] = str(0)
+    print("++++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+
+    os.environ['MASTER_ADDR'] = args.addr  # '10.136.181.51'
+    os.environ['MASTER_PORT'] = '59629'
+
+    if args.gpu is not None:
+        warnings.warn('You have chosen a specific GPU. This will completely '
+                      'disable data parallelism.')
+
+    if args.dist_url == "env://" and args.world_size == -1:
+        args.world_size = int(os.environ["WORLD_SIZE"])
+
+    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
+
+    args.process_device_map = device_id_to_process_device_map(args.device_list)
+
+    if args.device == 'npu':
+        # ngpus_per_node = torch.npu.device_count()
+        ngpus_per_node = len(args.process_device_map)
+    else:
+        ngpus_per_node = torch.cuda.device_count()
+    if args.multiprocessing_distributed:
+        # Since we have ngpus_per_node processes per node, the total world_size
+        # needs to be adjusted accordingly
+        args.world_size = ngpus_per_node * args.world_size
+        # Use torch.multiprocessing.spawn to launch distributed processes: the
+        # main_worker process function
+        # The child process uses the environment variables of the parent process,
+        # we have to set KERNEL_NAME_ID for every proc
+        if args.device == 'npu':
+            # main_worker(args.gpu, ngpus_per_node, args)
+            mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+        else:
+            mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+    else:
+        # Simply call main_worker function
+        main_worker(args.gpu, ngpus_per_node, args)
+
+
+def main_worker(gpu, ngpus_per_node, args):
+    """Per-process entry point: set up the device, data, model and run training.
+
+    Args:
+        gpu: Process index used as the key into ``args.process_device_map``
+            (the index supplied by ``mp.spawn``, or ``args.gpu`` when called
+            directly from ``main``).
+        ngpus_per_node: Number of worker processes / devices on this node.
+        args: Parsed command-line namespace; several fields (``gpu``, ``rank``,
+            ``batch_size``, ``workers``, ``start_epoch``) are updated in place.
+
+    NOTE(review): when ``main`` calls this without multiprocessing and without
+    ``--gpu``, ``gpu`` is ``None`` and the map lookup below raises KeyError --
+    confirm that the single-process path is meant to be supported.
+    """
+    global best_acc1
+    # Translate the process index into the physical device id.
+    args.gpu = args.process_device_map[gpu]
+
+    print("[npu id:", args.gpu, "]", "++++++++++++++++ before set KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
+    # Each process overrides the value inherited from the parent with its own index.
+    os.environ['KERNEL_NAME_ID'] = str(gpu)
+    print("[npu id:", args.gpu, "]", "++++++++++++++++ KERNEL_NAME_ID:", os.environ['KERNEL_NAME_ID'])
+
+    if args.gpu is not None:
+        print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu))
+
+    if args.distributed:
+        if args.dist_url == "env://" and args.rank == -1:
+            args.rank = int(os.environ["RANK"])
+        if args.multiprocessing_distributed:
+            # For multiprocessing distributed training, rank needs to be the
+            # global rank among all the processes
+            args.rank = args.rank * ngpus_per_node + gpu
+
+        # On NPU no init_method is passed; presumably the MASTER_ADDR/MASTER_PORT
+        # variables set in main() are used for rendezvous -- TODO confirm.
+        if args.device == 'npu':
+            dist.init_process_group(backend=args.dist_backend,  # init_method=args.dist_url,
+                                    world_size=args.world_size, rank=args.rank)
+        else:
+            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                    world_size=args.world_size, rank=args.rank)
+
+    # NOTE(review): the NPU device is selected regardless of args.device.
+    loc = 'npu:{}'.format(args.gpu)
+    torch.npu.set_device(loc)
+
+    # --batch-size and --workers are per-node totals; split them across the
+    # processes of this node (workers are rounded up).
+    args.batch_size = int(args.batch_size / ngpus_per_node)
+    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
+
+    print("[npu id:", args.gpu, "]", "===============main_worker()=================")
+    print("[npu id:", args.gpu, "]", args)
+    print("[npu id:", args.gpu, "]", "===============main_worker()=================")
+
+    # Data loading: the loaders yield uint8 batches built by fast_collate;
+    # normalisation is applied on the device inside train()/validate().
+    train_loader, train_loader_len, train_sampler = get_pytorch_train_loader(args.data,
+                                                                       args.batch_size,
+                                                                       workers=args.workers,
+                                                                       distributed=args.distributed)
+
+    # Validation is not sharded: every process evaluates the full validation set.
+    val_loader = get_pytorch_val_loader(args.data, args.batch_size, args.workers, distributed=False)
+
+    # create model
+    print("[npu id:", args.gpu, "]", "=> creating model '{}'".format('mobilenetv2'))
+    # model = models.__dict__[args.arch]()
+    model = mobilenet_v2()
+    model = model.to(loc)
+
+    # define loss function (criterion) and optimizer
+    criterion = nn.CrossEntropyLoss().to(loc)
+    optimizer = torch.optim.SGD(model.parameters(), args.lr,
+                                momentum=args.momentum,
+                                weight_decay=args.weight_decay)
+
+    if args.amp:
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale)
+
+    # NOTE(review): the model is wrapped in DistributedDataParallel even when
+    # args.distributed is false (no process group initialised) -- confirm the
+    # script is only ever launched in distributed mode.
+    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False)
+
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            print("=> loading checkpoint '{}'".format(args.resume))
+            checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_acc1 = checkpoint['best_acc1']
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            if args.amp:
+                amp.load_state_dict(checkpoint['amp'])
+            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
+        else:
+            print("=> no checkpoint found at '{}'".format(args.resume))
+
+    cudnn.benchmark = True
+
+    # Evaluation-only mode: run one validation pass and stop.
+    if args.evaluate:
+        validate(val_loader, model, criterion, args, ngpus_per_node)
+        return
+
+    for epoch in range(args.start_epoch, args.epochs):
+        if args.distributed:
+            # Gives the distributed sampler a different shuffle each epoch.
+            train_sampler.set_epoch(epoch)
+        # The learning rate is adjusted per step inside train().
+
+        # train for one epoch
+        train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node)
+
+        # Evaluate every eval_freq epochs and always after the final epoch.
+        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
+            # evaluate on validation set
+            acc1 = validate(val_loader, model, criterion, args, ngpus_per_node)
+
+            # remember best acc@1 and save checkpoint
+            is_best = acc1 > best_acc1
+            best_acc1 = max(acc1, best_acc1)
+
+            # In multiprocessing mode only the first process of each node
+            # saves, and only after the final epoch; otherwise a checkpoint
+            # is written after every evaluation.
+            if not args.multiprocessing_distributed or \
+                    (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0 and epoch == args.epochs - 1):
+                if args.amp:
+                    # Also persist the amp state so it can be restored on resume.
+                    save_checkpoint({
+                        'epoch': epoch + 1,
+                        'state_dict': model.state_dict(),
+                        'best_acc1': best_acc1,
+                        'optimizer': optimizer.state_dict(),
+                        'amp': amp.state_dict(),
+                    }, is_best)
+                else:
+                    save_checkpoint({
+                        'epoch': epoch + 1,
+                        'state_dict': model.state_dict(),
+                        'best_acc1': best_acc1,
+                        'optimizer': optimizer.state_dict(),
+                    }, is_best)
+
+
+def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, args, ngpus_per_node):
+    """Run one training epoch and log throughput.
+
+    Args:
+        train_loader: Iterable yielding ``(images, target)`` batches.
+        train_loader_len: Number of batches per epoch.
+        model: Network being trained.
+        criterion: Loss function applied to ``(output, target)``.
+        optimizer: Optimizer whose learning rate is updated on every step.
+        epoch: Zero-based epoch index, used for logging and the LR schedule.
+        args: Parsed command-line namespace.
+        ngpus_per_node: Number of worker processes on this node.
+    """
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        train_loader_len,
+        [batch_time, data_time, losses, top1, top5],
+        prefix="Epoch: [{}]".format(epoch))
+
+    loc = 'npu:{}'.format(args.gpu)
+
+    # Per-channel statistics scaled by 255 and shaped to broadcast over a
+    # (N, 3, H, W) batch; fast_collate delivers images as uint8 tensors.
+    mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
+    std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
+    mean = mean.to(loc, non_blocking=True)
+    std = std.to(loc, non_blocking=True)
+
+    # switch to train mode
+    model.train()
+    end = time.time()
+    # In accumulation mode gradients are cleared once here and again after
+    # each optimizer step, not on every iteration.
+    if args.benchmark == 1:
+        optimizer.zero_grad()
+
+    # steps_per_epoch = len(train_loader)
+    steps_per_epoch = train_loader_len
+    print('==========step per epoch======================', steps_per_epoch)
+    for i, (images, target) in enumerate(train_loader):
+        # measure data loading time
+        data_time.update(time.time() - end)
+
+        # The learning rate schedule is driven by the global step count.
+        global_step = epoch * steps_per_epoch + i
+        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
+
+        # Cast labels to int32 on the host, then move both tensors to the
+        # device and normalise the images there.
+        target = target.to(torch.int32)
+        images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+        target = target.to(loc, non_blocking=True)
+
+        # compute output
+        output = model(images)
+        # Wait for the device stream after the forward pass.
+        stream = torch.npu.current_stream()
+        stream.synchronize()
+
+        loss = criterion(output, target)
+        stream = torch.npu.current_stream()
+        stream.synchronize()
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # compute gradient and do SGD step
+        if args.benchmark == 0:
+            optimizer.zero_grad()
+
+        if args.amp:
+            # apex scales the loss before the backward pass.
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+
+        stream = torch.npu.current_stream()
+        stream.synchronize()
+
+        if args.benchmark == 0:
+            optimizer.step()
+        elif args.benchmark == 1:
+            # Gradient accumulation: step only once every BATCH_SIZE_multiplier
+            # iterations, after averaging the accumulated gradients.
+            # NOTE(review): the multiplier is 0 when args.batch_size exceeds
+            # OPTIMIZER_BATCH_SIZE, which makes the modulo below raise.
+            BATCH_SIZE_multiplier = int(OPTIMIZER_BATCH_SIZE / args.batch_size)
+            BM_optimizer_step = ((i + 1) % BATCH_SIZE_multiplier) == 0
+            if BM_optimizer_step:
+                for param_group in optimizer.param_groups:
+                    for param in param_group['params']:
+                        param.grad /= BATCH_SIZE_multiplier
+                optimizer.step()
+                optimizer.zero_grad()
+        stream = torch.npu.current_stream()
+        stream.synchronize()
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        # Progress is printed only by single-process runs or by the first
+        # process of each node.
+        if i % args.print_freq == 0:
+            if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                                                        and args.rank % ngpus_per_node == 0):
+                progress.display(i)
+
+    # Throughput for the node: samples per step across all local processes
+    # divided by the average step time reported by the meter.
+    if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                                                and args.rank % ngpus_per_node == 0):
+        print("[npu id:", args.gpu, "]", '* FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg))
+        hwlog.remark_print(key=hwlog.FPS, value=' * FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg))
+
+
+def validate(val_loader, model, criterion, args, ngpus_per_node):
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(
+        len(val_loader),
+        [batch_time, losses, top1, top5],
+        prefix='Test: ')
+
+    # switch to evaluate mode
+    model.eval()
+
+    with torch.no_grad():
+        loc = 'npu:{}'.format(args.gpu)
+        mean = torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]).view(1, 3, 1, 1)
+        std = torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]).view(1, 3, 1, 1)
+        mean = mean.to(loc, non_blocking=True)
+        std = std.to(loc, non_blocking=True)
+
+        end = time.time()
+        for i, (images, target) in enumerate(val_loader):
+
+            target = target.to(torch.int32)
+            images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+            target = target.to(loc, non_blocking=True)
+
+            # compute output
+            output = model(images)
+            loss = criterion(output, target)
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
+
+            # measure elapsed time
+            batch_time.update(time.time() - end)
+            end = time.time()
+
+            if i % args.print_freq == 0:
+                if not args.multiprocessing_distributed or \
+                        (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
+                    progress.display(i)
+
+        # TODO: this should also be done with the ProgressMeter
+        if not args.multiprocessing_distributed or \
+                (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
+            print("[npu id:", args.gpu, "]", '[AVG-ACC] * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+                  .format(top1=top1, top5=top5))
+            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value="{top1.avg:.3f}".format(top1=top1))
+            hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value="{top5.avg:.3f}".format(top5=top5))
+
+    return top1.avg
+
+
+def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
+    torch.save(state, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best_acc%.4f_epoch%d.pth.tar' % (state['best_acc1'], state['epoch']))
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+        self.start_count_index = 10
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        if self.count == 0:
+            self.batchsize = n
+
+        self.val = val
+        self.count += n
+        if self.count > (self.start_count_index * self.batchsize):
+            self.sum += val * n
+            self.avg = self.sum / (self.count - self.start_count_index * self.batchsize)
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print("[npu id:", os.environ['KERNEL_NAME_ID'], "]", '\t'.join(entries))
+        train_acc1 = str(entries).split("Acc@1")[1].strip().split(" ")[0]
+        train_acc5 = str(entries).split("Acc@5")[1].strip().split(" ")[0]
+        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP1, value=train_acc1)
+        hwlog.remark_print(key=hwlog.TRAIN_ACCURACY_TOP5, value=train_acc5)
+
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+
+# def adjust_learning_rate(optimizer, epoch, args):
+#     """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+#     lr = args.lr * (0.1 ** (epoch // 30))
+#     for param_group in optimizer.param_groups:
+#         param_group['lr'] = lr
+
+# def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
+#     """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+#     # lr = args.lr * (0.98 ** (epoch / 2.5))
+#     lr = args.lr * (0.98 ** (global_step // int(steps_per_epoch * 2.5)))
+#     for param_group in optimizer.param_groups:
+#         param_group['lr'] = lr
+#     return lr
+
+def adjust_learning_rate(optimizer, global_step, steps_per_epoch, args):
+    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
+    # lr = args.lr * (0.98 ** (epoch / 2.5))
+    lr = args.lr * (0.98 ** (global_step // int(steps_per_epoch * 2.5)))
+    for param_group in optimizer.param_groups:
+        param_group['lr'] = lr
+    return lr
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+def fast_collate(batch):
+    imgs = [img[0] for img in batch]
+    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
+    w = imgs[0].size[0]
+    h = imgs[0].size[1]
+    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
+    for i, img in enumerate(imgs):
+        nump_array = np.asarray(img, dtype=np.uint8)
+        if nump_array.ndim < 3:
+            nump_array = np.expand_dims(nump_array, axis=-1)
+        nump_array = np.rollaxis(nump_array, 2)
+
+        tensor[i] += torch.from_numpy(nump_array)
+
+    return tensor, targets
+
+
+def get_pytorch_train_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
+    traindir = os.path.join(data_path, 'train')
+    train_dataset = datasets.ImageFolder(
+            traindir,
+            transforms.Compose([
+                transforms.RandomResizedCrop(224),
+                transforms.RandomHorizontalFlip(),
+                ]))
+
+    if distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+    else:
+        train_sampler = None
+
+    dataloader_fn = MultiEpochsDataLoader  # torch.utils.data.DataLoader
+    train_loader = dataloader_fn(
+            train_dataset, batch_size=batch_size, shuffle=(train_sampler is None),
+            num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate, drop_last=True)
+    return train_loader, len(train_loader), train_sampler
+
+
+def get_pytorch_val_loader(data_path, batch_size, workers=5, _worker_init_fn=None, distributed=False):
+    valdir = os.path.join(data_path, 'val')
+    val_dataset = datasets.ImageFolder(
+            valdir, transforms.Compose([
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
+                ]))
+
+    if distributed:
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
+    else:
+        val_sampler = None
+
+        dataloader_fn = MultiEpochsDataLoader  # torch.utils.data.DataLoader
+        val_loader = dataloader_fn(
+            val_dataset,
+            sampler=val_sampler,
+            batch_size=batch_size, shuffle=(val_sampler is None),
+            num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, collate_fn=fast_collate)
+
+    return val_loader
+
+
+if __name__ == '__main__':
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("pytorch")
+    config_info = get_model_parameter("pytorch_config")
+    initinal_data = {"base_lr": 0.1, "dataset": "imagenet", "optimizer": "SGD", "loss_scale": 1024}
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    main()
@@ -0,0 +1,31 @@
+import torch
+
+
+class MultiEpochsDataLoader(torch.utils.data.DataLoader):
+    """DataLoader that creates its underlying iterator once and reuses it.
+
+    The batch sampler is wrapped in ``_RepeatSampler`` so the single iterator
+    never runs dry; each pass over the loader draws ``len(self)`` batches
+    from it.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # The name-mangled flag is cleared around the reassignment; presumably
+        # DataLoader rejects setting ``batch_sampler`` once it is initialised
+        # -- TODO confirm against the torch version in use.
+        self._DataLoader__initialized = False
+        self.batch_sampler = _RepeatSampler(self.batch_sampler)
+        self._DataLoader__initialized = True
+        # One iterator, created here and shared by every later pass.
+        self.iterator = super().__iter__()
+
+    def __len__(self):
+        # Length of the original batch sampler held inside the repeat wrapper.
+        return len(self.batch_sampler.sampler)
+
+    def __iter__(self):
+        # Draw exactly one epoch's worth of batches from the shared iterator.
+        for _ in range(len(self)):
+            yield next(self.iterator)
+
+
+class _RepeatSampler(object):
+    """ Sampler that repeats forever.
+    Args:
+        sampler (Sampler)
+    """
+
+    def __init__(self, sampler):
+        self.sampler = sampler
+
+    def __iter__(self):
+        while True:
+            yield from iter(self.sampler)
@@ -0,0 +1,18 @@
+{
+  "startCfg":
+  [
+    {
+      "jobID": "123456789",
+      "deviceID": ["0"],
+      "features":
+      [
+        {
+          "name": "task_trace"
+        },
+        {
+          "name": "training_trace"
+        }
+      ]
+    }
+  ]
+}
@@ -0,0 +1,19 @@
+source set_env_b023.sh
+
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
+
+export SLOG_PRINT_TO_STDOUT=0
+export TASK_QUEUE_ENABLE=0
+nohup taskset -c 1-40 python3.7 densenet121_1p_main.py \
+	--workers 40 \
+	--arch densenet121 \
+	--npu 0 \
+	--lr 0.1 \
+	--momentum 0.9 \
+	--amp \
+	--print-freq 1 \
+	--eval-freq 5\
+	--batch-size 256 \
+	--epoch 45 \
+	--resume checkpoint.pth.tar \
+	--data /home/dataset/imagenet > output_1p.log &
@@ -0,0 +1,27 @@
+source set_env_b023.sh
+
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 4"
+
+export SLOG_PRINT_TO_STDOUT=0
+export TASK_QUEUE_ENABLE=0
+nohup python3.7 ./densenet121_8p_main.py \
+    --addr='10.246.246.57' \
+    --seed 49  \
+    --workers 80 \
+    --lr 0.8 \
+    --print-freq 1 \
+    --eval-freq 5\
+    --arch densenet121 \
+    --dist-url 'tcp://127.0.0.1:50000' \
+    --dist-backend 'hccl' \
+    --multiprocessing-distributed \
+    --world-size 1 \
+    --batch-size 2048 \
+    --epochs 45 \
+    --rank 0 \
+    --amp \
+    --benchmark 0 \
+    --resume checkpoint.pth.tar \
+    --data /train/imagenet > resume_8p.log &
+
@@ -0,0 +1,18 @@
+source set_env_b023.sh
+
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
+
+export SLOG_PRINT_TO_STDOUT=0
+export TASK_QUEUE_ENABLE=0
+nohup taskset -c 1-40 python3.7 densenet121_1p_main.py \
+	--workers 40 \
+	--arch densenet121 \
+	--npu 0 \
+	--lr 0.1 \
+	--momentum 0.9 \
+	--amp \
+	--print-freq 1 \
+	--eval-freq 5\
+	--batch-size 256 \
+	--epoch 90 \
+	--data /opt/npu/dataset/imagenet > output_1p.log &
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+source set_env_b023.sh
+
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 4"
+
+export SLOG_PRINT_TO_STDOUT=0
+export TASK_QUEUE_ENABLE=0
+nohup python3.7 ./mobilenetv2_8p_main.py \
+    --addr='10.246.246.76' \
+    --seed 49  \
+    --workers 80 \
+    --lr 0.24 \
+    --print-freq 1 \
+    --eval-freq 5\
+    --dist-url 'tcp://127.0.0.1:50002' \
+    --dist-backend 'hccl' \
+    --multiprocessing-distributed \
+    --world-size 1 \
+    --batch-size 6144 \
+    --epochs 600 \
+    --rank 0 \
+    --amp \
+    --benchmark 0 \
+    --data /opt/npu/dataset/imagenet > output_8p.log &
+
@@ -0,0 +1,17 @@
+############## toolkit situation ################
+#export ASCEND_HOME=/usr/local/Ascend
+#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl
+#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
+#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+
+############## nnae situation ################
+export ASCEND_HOME=/usr/local/Ascend
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/:/usr/local/python3.7.5/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/hccl
+export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
+
+# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/topi-0.4.0-py3-none-any.whl
+# pip3.7 install --upgrade /usr/local/Ascend/nnae/latest/fwkacllib/lib64/te-0.4.0-py3-none-any.whl
+
@@ -0,0 +1,18 @@
+############## toolkit situation ################
+export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
+export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+
+
+############## nnae situation ################
+# export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+# export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
+# export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
+# export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+# export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+
+
+
+# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/
@@ -0,0 +1,12 @@
+# main env
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export PYTHONPATH=/usr/local/Ascend/atc/python/site-packages/te.egg:/usr/local/Ascend/atc/python/site-packages/topi.egg:/usr/local/Ascend/atc/python/site-packages/auto_tune.egg:/usr/local/Ascend/atc/python/site-packages/schedule_search.egg:/usr/local
+export CUSTOM_OP_LIB_PATH=/usr/local/Ascend/ops/framework/built-in/tensorflow
+export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+export PLUGIN_LOAD_PATH=/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/librts_engine.so
+export SLOG_PRINT_TO_STDOUT=1
+
@@ -0,0 +1,31 @@
+############## toolkit situation ################
+#export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+#export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
+#export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+#export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+#export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+
+
+############## nnae situation ################
+
+
+# Select the Ascend software stack: use the nnae installation when it is
+# present, otherwise fall back to ascend-toolkit. Both branches export the
+# same five variables; only the installation root differs.
+if [ -d /usr/local/Ascend/nnae/latest ];then
+    # The multiarch library directory is aarch64-linux-gnu, as in the
+    # toolkit branch below (it was previously misspelled "aarch64_64-linux-gnu").
+    export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
+    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/
+    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/
+    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+    export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+else
+    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH
+    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/
+    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+    export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so
+    export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH
+fi
+
+# ln -s /usr/local/Ascend/ascend-toolkit/latest/toolkit/bin/adc /usr/local/bin/
+
+export SLOG_PRINT_TO_STDOUT=0
+#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device 0"
+
+export TASK_QUEUE_ENABLE=1
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Entry point of a PyTorch MobileNet benchmark run: creates a timestamped job
+# directory and starts scripts/train.sh, either through mpirun (cluster mode)
+# or directly on the local host.
+# Positional arguments:
+#   $1  rank size (number of devices)
+#   $2  path of the yaml configuration
+#   $3  directory containing get_params_for_yaml.sh
+#   $4  cluster flag ("True" enables cluster mode); read only inside docker
+#   $5  space-separated list of node IPs; read only inside docker
+rank_size=$1
+yamlPath=$2
+toolsPath=$3
+if [ -f /.dockerenv ];then
+        CLUSTER=$4
+		MPIRUN_ALL_IP="$5"
+        export CLUSTER=${CLUSTER}
+fi
+
+# Parent directory of the directory that contains this script.
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+
+source ${currentDir}/config/npu_set_env.sh
+
+# Evaluate the shell code printed by get_params_for_yaml.sh for the
+# "pytorch_config" section.
+# NOTE(review): presumably this defines the yaml keys as shell variables
+# (device_group_Np, mpirun_ip, jsonFilePath, ...) — confirm against the helper.
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")
+
+
+# Remove the existing host-side slog files before the run.
+rm -rf /var/log/npu/slog/host-0/*
+currtime=`date +%Y%m%d%H%M%S`
+mkdir -p ${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/
+train_job_dir=${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/
+
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir}"
+# Device list: when no device group is configured for this rank size, or the
+# rank size is 8 or more, use devices 0..rank_size-1 in order.
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# Strip the spaces and keep the first character as the first device id.
+device_group_str=`echo ${device_group} | sed 's/ //g'`
+first_device_id=`echo ${device_group_str: 0:1}`
+
+rank_id=0
+
+if [ x"${CLUSTER}" == x"True" ];then
+    # Link device 0's hw_mobilenet.log into the job directory.
+    ln -snf ${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/0/hw_mobilenet.log ${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/
+	this_ip=$(hostname -I |awk '{print $1}')
+    # Copy the yaml and json files to every other node, at the same paths.
+    for ip in $MPIRUN_ALL_IP;do
+        if [ x"$this_ip" != x"$ip" ];then
+           scp $yamlPath root@$ip:$yamlPath
+           scp $jsonFilePath root@$ip:$jsonFilePath
+        fi
+    done
+    export PATH=$PATH:/usr/local/mpirun4.0.2/bin
+    # Run train.sh on the hosts in ${mpirun_ip}; the last argument passes the
+    # cluster flag to train.sh.
+    mpirun -H ${mpirun_ip} \
+    --bind-to none -map-by slot\
+    --allow-run-as-root \
+    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
+    --prefix /usr/local/mpirun4.0.2/ \
+    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
+else
+    # Link the first device's hw_mobilenet.log into the job directory, then
+    # start one train.sh in the background with device id 0 and rank id 0.
+    ln -snf ${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/${first_device_id}/hw_mobilenet.log ${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/
+    #for device_id in $device_group;do
+      #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ${currentDir}/result/main.log
+    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} $rank_id&
+	  #let rank_id++
+    #done
+fi
+# Block until the background train.sh has exited.
+wait
@@ -0,0 +1,146 @@
+#!/usr/bin/env bash
+
+# Per-job training launcher for the PyTorch MobileNet benchmark.
+# Positional arguments:
+#   $1  device id
+#   $2  rank size (number of devices)
+#   $3  path of the yaml configuration
+#   $4  timestamp used in the job directory name
+#   $5  directory containing get_params_for_yaml.sh
+#   $6  "True" in cluster mode, otherwise the rank id
+device_id=$1
+rank_size=$2
+yamlPath=$3
+# Parent directory of the directory that contains this script.
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+currtime=$4
+toolsPath=$5
+
+export YAML_PATH=$3
+mkdir -p ${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/
+export train_job_dir=${currentDir%train*}/train/result/pt_mobilenet/training_job_${currtime}/
+
+
+# Load the configuration from the yaml file.
+# NOTE(review): presumably the helper prints shell assignments for the keys of
+# the "pytorch_config" section — confirm against get_params_for_yaml.sh.
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "pytorch_config")
+export REMARK_LOG_FILE=hw_mobilenet.log
+benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
+export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
+
+
+source ${currentDir}/config/set_env_b023.sh
+
+# user env
+export HCCL_CONNECT_TIMEOUT=600
+export JOB_ID=9999001
+export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=${device_id}
+# RANK_INDEX is not assigned in this script; an unset name counts as 0 in
+# arithmetic expansion, which makes DEVICE_INDEX equal to DEVICE_ID.
+DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
+export DEVICE_INDEX=${DEVICE_INDEX}
+
+cd ${train_job_dir}
+curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
+export PYTHONPATH=$PYTHONPATH:${curd_dir}
+
+if [ x"$6" != x"True" ];then
+        # Local mode: the sixth argument is the rank id.
+        rank_id=$6
+        export RANK_ID=$6
+else
+        # Cluster mode: initialise atlasboost and capture everything it prints.
+        device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
+                device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
+                atlasboost.set_device_id(device_id);print(atlasboost.rank())")
+        # Collapse the captured output onto one line; its last word is the rank.
+        device_id_mo=`echo $device_id_mo`
+        rank_id=${device_id_mo##* }
+        echo rank_id is $rank_id
+        export RANK_ID=${rank_id}
+        # The device id is the text between "deviceid = " and " phyid=".
+        # NOTE(review): presumably atlasboost prints such a line — confirm.
+        device=${device_id_mo##*deviceid = }
+        device_id=${device%% phyid=*}
+        export DEVICE_ID=${device_id}
+        echo device_id is $device_id
+        # Copy the json found in the job directory over the rank table file.
+        hccljson=${train_job_dir}/*.json
+        cp ${hccljson} ${currentDir}/config/${rank_size}p.json
+fi
+
+# Create the per-device working directory and run from inside it.
+mkdir -p ${train_job_dir}/${device_id}
+cd ${train_job_dir}/${device_id}
+
+# Start time, human-readable and as epoch seconds.
+startTime=`date +%Y%m%d-%H:%M:%S`
+startTime_s=`date +%s`
+
+
+# batch_size, epoches, data_url, lr, device_single and device_group_mutli are
+# not assigned in this script.
+# NOTE(review): presumably they come from the yaml configuration — confirm.
+# Every branch writes stdout and stderr to train_${rank_size}p.log.
+if [ x"$6" == x"True" ];then
+    # Cluster mode: 8p entry script with a fixed learning rate of 0.24.
+    python3.7 ${currentDir}/code/8p/mobilenetv2_8p_main.py \
+        --addr=$(hostname -I |awk '{print $1}') \
+        --seed 49  \
+        --workers 128 \
+        --lr 0.24 \
+        --print-freq 1 \
+        --eval-freq 5\
+        --dist-url 'tcp://127.0.0.1:50002' \
+        --dist-backend 'hccl' \
+        --multiprocessing-distributed \
+        --world-size 1 \
+        --batch-size ${batch_size} \
+        --epochs ${epoches} \
+        --rank 0 \
+        --amp \
+        --benchmark 0 \
+        --data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1
+elif [ x"${rank_size}" == x"1" ];then
+    # Single device
+    python3.7 ${currentDir}/code/1p/main_apex.py \
+		--workers 128 \
+		--seed 123456 \
+		--lr 0.03 \
+		--amp \
+		--opt-level 'O2' \
+		--loss-scale-value 64 \
+		--momentum 0.9 \
+		--batch-size ${batch_size} \
+		--weight-decay 1e-5 \
+		--epoch ${epoches} \
+		--print-freq 1 \
+		--device ${device_single}\
+		--eval-freq 1 \
+		--summary-path './runs/mobilenetv2/npu_O2_ls64_c75b150_0909' \
+		--data  ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1
+elif [ ${rank_size} -le 8 ];then
+    # Multiple devices on a single host
+	python3.7 ${currentDir}/code/8p/mobilenetv2_8p_main_anycard.py \
+        --addr=$(hostname -I |awk '{print $1}') \
+        --seed 49  \
+        --workers 128 \
+        --lr ${lr} \
+        --print-freq 1 \
+        --loss-scale 64 \
+        --eval-freq 1\
+        --dist-url 'tcp://127.0.0.1:50002' \
+        --dist-backend 'hccl' \
+        --multiprocessing-distributed \
+        --world-size 1 \
+        --batch-size ${batch_size} \
+        --epochs ${epoches} \
+        --rank 0 \
+        --amp \
+	--device-list ${device_group_mutli} \
+        --benchmark 0 \
+        --data ${data_url} > ${train_job_dir}/train_${rank_size}p.log 2>&1
+fi
+
+
+
+# Report the outcome of the training command that ran just above: $? still
+# holds its exit status here, so nothing may be inserted before this test.
+if [ $? -eq 0 ];then
+    train_result="success"
+else
+    train_result="failed"
+fi
+# The same line goes to the console, the training log and hw_mobilenet.log.
+result_msg=":::ABK 1.0.0 hw_mobilenet train ${train_result}"
+echo "${result_msg}"
+echo "${result_msg}" >> ${train_job_dir}/train_${rank_size}p.log
+echo "${result_msg}" >> ./hw_mobilenet.log
+
+endTime=$(date +%Y%m%d-%H:%M:%S)
+endTime_s=$(date +%s)
+
+# Elapsed wall-clock seconds, split into hours, minutes and seconds.
+sumTime=$(( endTime_s - startTime_s ))
+
+hour=$(( sumTime / 3600 ))
+min=$(( (sumTime % 3600) / 60 ))
+sec=$(( sumTime % 60 ))
+echo ":::ABK 1.0.0 mobilenet train total time：${hour}:${min}:${sec}"
+
+echo ":::ABK 1.0.0 mobilenet train total time： ${hour}:${min}:${sec}" >> ./hw_mobilenet.log
@@ -0,0 +1,47 @@
+# MobileNet_tensorflow训练说明
+
+### 1. 模型训练参数配置
+
+在train/yaml/MobileNet.yaml中修改相应配置， 配置项含义:
+
+```
+tensorflow_config:
+    # 基本参数
+    max_steps: 1000
+    data_url: 数据集路径
+    epoches: 跑多少个epoch
+
+    # 训练(train) 或 评测(evaluate)
+    mode: train
+    batch_size: 256
+    #仅在 mode 为 evaluate 时用到
+    ckpt_path: /opt/0908/benchmark-benchmark_Alpha/train/result/tf_mobilenet/trainingJob_20200905171017/0/results/model.ckpt-123125
+
+    # 仅多机执行需要配置: ip1:卡数量1,ip2:卡数量2
+    mpirun_ip: 90.90.176.152:8,90.90.176.154:8
+
+    # docker 镜像名称:版本号
+    docker_image: c73:b021
+
+    # 指定 device id, 多个 id 使用空格分隔, 数量需与 rank_size 相同
+    device_group_1p: 0
+    device_group_2p: 0 1
+    device_group_4p: 0 1 2 3    
+    
+    profiling_mode: false
+    profiling_options: training_trace
+    fp_point: L2Loss
+    bp_point: gradients/AddN_30
+    aicpu_profiling_mode: false
+
+```
+
+------
+
+
+
+
+
+
+
+    
@@ -0,0 +1,211 @@
+# MobileNetv2 for Tensorflow 
+
+This repository provides a script and recipe to train the MobileNetv2 model to achieve state-of-the-art accuracy.
+
+## Table Of Contents
+
+* [Model overview](#model-overview)
+  * [Model Architecture](#model-architecture)  
+  * [Default configuration](#default-configuration)
+* [Data augmentation](#data-augmentation)
+* [Setup](#setup)
+  * [Requirements](#requirements)
+* [Quick start guide](#quick-start-guide)
+* [Advanced](#advanced)
+  * [Command line arguments](#command-line-arguments)
+  * [Training process](#training-process)
+* [Performance](#performance)
+  * [Results](#results)
+    * [Training accuracy results](#training-accuracy-results)
+    * [Training performance results](#training-performance-results)
+
+
+    
+
+## Model overview
+
+In this repository, we implement MobileNetv2 from paper [Sandler, Mark, et al. "Mobilenetv2: Inverted residuals and linear bottlenecks." CVPR 2018.](https://arxiv.org/abs/1801.04381)
+
+MobileNetv2 is a mobile architecture. It is mainly constructed based on depthwise separable convolutions, linear bottlenecks and inverted residuals.
+
+### Model architecture
+
+The model architecture can be found from the reference paper.
+
+### Default configuration
+
+The following sections introduce the default configurations and hyperparameters for MobileNetv2 model.
+
+#### Optimizer
+
+This model uses Momentum optimizer from Tensorflow with the following hyperparameters:
+
+- Momentum : 0.9
+- Learning rate (LR) : 0.8
+- LR schedule: cosine_annealing
+- Warmup epoch: 5
+- Batch size : 256*8 
+- Weight decay :  0.00004 
+- Moving average decay: 0.9999
+- Label smoothing = 0.1
+- We train for:
+  - 300 epochs for a standard training process using ImageNet2012
+
+#### Data augmentation
+
+This model uses the data augmentation from InceptionV2:
+
+- For training:
+  - Convert DataType and RandomResizeCrop
+  - RandomHorizontalFlip, prob=0.5
+  - Subtract with 0.5 and multiply with 2.0
+- For inference:
+  - Convert DataType
+  - CenterCrop 87.5% of the original image and resize to (224, 224)
+  - Subtract with 0.5 and multiply with 2.0
+
+For more details, see the corresponding source code in Slim.
+
+## Setup
+The following section lists the requirements to start training the MobileNetv2 model.
+### Requirements
+
+Tensorflow 1.15.0
+
+## Quick Start Guide
+
+### 1. Clone the repository
+
+```shell
+git clone xxx
+cd ModelZoo_MobileNetv2_TF
+```
+
+### 2. Download and preprocess the dataset
+
+1. Download the ImageNet2012 dataset
+2. Generate tfrecord files following [Tensorflow-Slim](https://github.com/tensorflow/models/tree/master/research/slim).
+3. The train and validation tfrecord files are under the path/data directories.
+
+### 3. Train
+- train on a single NPU
+    - **edit** *train_1p.sh* (see example below)
+    - bash run_1p.sh
+- train on 8 NPUs
+    - **edit** *train_8p.sh* (see example below)
+    - bash run_8p.sh 
+
+Examples:
+- Case for single NPU
+    - In *train_1p.sh*, the Python command should look as follows. For details on the command line arguments, please refer to [Command line arguments](#command-line-arguments).
+        ```shell
+        python3.7 ${currentDir}/train.py \
+            --dataset_dir=/opt/npu/slimImagenet \
+            --max_train_steps=500 \
+            --iterations_per_loop=50 \
+            --model_name="mobilenet_v2" \
+            --moving_average_decay=0.9999 \
+            --label_smoothing=0.1 \
+            --preprocessing_name="inception_v2" \
+            --weight_decay='0.00004' \
+            --batch_size=256 \
+            --learning_rate_decay_type='cosine_annealing' \
+            --learning_rate=0.4 \
+            --optimizer='momentum' \
+            --momentum='0.9' \
+            --warmup_epochs=5
+        ```
+    - Run the program  
+        ```
+        bash run_1p.sh
+        ```
+- Case for 8 NPUs
+    - In *train_8p.sh*, the Python command should look as follows.
+        ```shell 
+        python3.7 ${currentDir}/train.py \
+            --dataset_dir=/opt/npu/slimImagenet \
+            --max_epoch=300 \
+            --model_name="mobilenet_v2" \
+            --moving_average_decay=0.9999 \
+            --label_smoothing=0.1 \
+            --preprocessing_name="inception_v2" \
+            --weight_decay='0.00004' \
+            --batch_size=256 \
+            --learning_rate_decay_type='cosine_annealing' \
+            --learning_rate=0.8 \
+            --optimizer='momentum' \
+            --momentum='0.9' \
+            --warmup_epochs=5
+        ```
+    - Run the program  
+        ```
+        bash run_8p.sh
+        ```
+
+### 4. Test
+- Evaluate the results with the following command:
+     ```shell 
+    python3.7 eval_image_classifier_mobilenet.py --dataset_dir=/opt/npu/slimImagenet \
+        --checkpoint_path=result/8p/0/results/model.ckpt-187500
+    ```
+    Remember to modify the dataset path and checkpoint path, then run the command.
+
+
+## Advanced
+### Command line arguments
+
+We list those important parameters to train this network here. For more details of all the parameters, please read *train.py* and other related files.
+
+```
+  --dataset_dir                     directory of dataset (default: /opt/npu/models/slimImagenet)
+  --max_epoch                       number of epochs to train the model (default: 200) 
+  --max_train_steps                 max number of training steps (default: 500)
+  --iterations_per_loop             number of steps to run in devices each iteration (default: None)
+  --model_name                      name of the model to train (default: mobilenet_v2_140)
+  --moving_average_decay            the decay to use for the moving average (default: None)
+  --label_smoothing                 use label smooth in cross entropy (default: 0.1)
+  --preprocessing_name              preprocessing method for training (default: inception_v2)
+  --weight_decay                    weight decay for regularization loss (default: 0)
+  --batch_size                      batch size per npu (default: 96)
+  --learning_rate_decay_type        learning rate decay type (default: fixed)
+  --learning_rate                   initial learning rate (default: 0.1)
+  --optimizer                       the name of optimizer (default: sgd)
+  --momentum                        momentum value used in optimizer (default: 0.9)
+  --warmup_epochs                   warmup epochs for learning rate (default: 5)
+```
+
+### Training process
+
+All the results of the training will be stored in the directory `result`.
+ 
+## Performance
+
+### Results
+
+Our results were obtained by running the applicable training script. To achieve the same results, follow the steps in the Quick Start Guide.
+
+#### Training accuracy results
+
+| **epochs** |      Top1      |
+| :--------: | :------------: |
+|    300     |     72.47%     |
+
+#### Training performance results
+| **NPUs** | train performance |
+| :------: | :---------------: |
+|    1     |     1400 img/s    |
+
+| **NPUs** | train performance |
+| :------: | :---------------: |
+|    8     |    11000 img/s    |
+
+
+
+
+
+
+
+
+
+
+
@@ -0,0 +1,240 @@
+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functions to read, decode and pre-process input data for the Model.
+"""
+import collections
+import sys
+import tensorflow as tf
+
+from tensorflow.python.data.experimental.ops import threadpool
+
+# from tensorflow.contrib import slim
+
+# Record type with four fields: images, images_orig, labels, labels_one_hot.
+# NOTE(review): presumably the tensors of one input batch — no user of this
+# type is visible in this module.
+InputEndpoints = collections.namedtuple(
+    'InputEndpoints', ['images', 'images_orig', 'labels', 'labels_one_hot'])
+# Record type holding the three settings of a shuffling batch queue.
+ShuffleBatchConfig = collections.namedtuple('ShuffleBatchConfig', [
+    'num_batching_threads', 'queue_capacity', 'min_after_dequeue'
+])
+
+# Default queue settings: 8 batching threads, capacity 3000, and at least
+# 1000 elements left after a dequeue.
+DEFAULT_SHUFFLE_CONFIG = ShuffleBatchConfig(
+    num_batching_threads=8, queue_capacity=3000, min_after_dequeue=1000)
+
+
+def get_data_files(data_sources):
+    from tensorflow.python.platform import gfile
+    if isinstance(data_sources, (list, tuple)):
+        data_files = []
+        for source in data_sources:
+            data_files += get_data_files(source)
+    else:
+        if '*' in data_sources or '?' in data_sources or '[' in data_sources:
+            data_files = gfile.Glob(data_sources)
+        else:
+            data_files = [data_sources]
+    if not data_files:
+        raise ValueError('No data files found in %s' % (data_sources,))
+    return data_files
+
+
+def preprocess_image(image, location, label_one_hot, height=224, width=224):
+    """Prepare one image for evaluation.
+    If height and width are specified it would output an image with that size by
+    applying resize_bilinear.
+    If central_fraction is specified it would cropt the central fraction of the
+    input image.
+    Args:
+    image: 3-D Tensor of image. If dtype is tf.float32 then the range should be
+      [0, 1], otherwise it would converted to tf.float32 assuming that the range
+      is [0, MAX], where MAX is largest positive representable number for
+      int(8/16/32) data type (see `tf.image.convert_image_dtype` for details)
+    height: integer
+    width: integer
+    central_fraction: Optional Float, fraction of the image to crop.
+    scope: Optional scope for name_scope.
+    Returns:
+    3-D float Tensor of prepared image.
+    """
+
+    # if image.dtype != tf.float32:
+    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
+    # Crop the central region of the image with an area containing 87.5% of
+    # the original image.
+    # if central_fraction:
+    #  image = tf.image.central_crop(image, central_fraction=central_fraction)
+
+    # if height and width:
+    # Resize the image to the specified height and width.
+    image = tf.expand_dims(image, 0)
+    image = tf.image.resize_bilinear(image, [height, width], align_corners=False)
+    image = tf.squeeze(image, [0])
+
+    # image = tf.cast(image, tf.float32)
+    # image = tf.multiply(image, 1/255.)
+    image = tf.subtract(image, 0.5)
+    image = tf.multiply(image, 2.0)
+
+    return image, location, label_one_hot
+
+
+def _int64_feature(value):
+    """Wrapper for inserting int64 features into Example proto."""
+    if not isinstance(value, list):
+        value = [value]
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+
+
+def parse_example_proto(example_serialized, num_classes, labels_offset, image_preprocessing_fn):
+    feature_map = {
+        'image/encoded': tf.FixedLenFeature([], tf.string, ''),
+        'image/class/label': tf.FixedLenFeature([1], tf.int64, -1),
+        'image/class/text': tf.FixedLenFeature([], tf.string, ''),
+        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32)
+    }
+    with tf.compat.v1.name_scope('deserialize_image_record'):
+        obj = tf.io.parse_single_example(serialized=example_serialized, features=feature_map)
+        image = tf.image.decode_jpeg(obj['image/encoded'], channels=3, fancy_upscaling=False,
+                                     dct_method='INTEGER_FAST')
+        if image_preprocessing_fn:
+            image = image_preprocessing_fn(image, 224, 224)
+        else:
+            image = tf.image.resize(image, [224, 224])
+
+        label = tf.cast(obj['image/class/label'], tf.int32)
+        label = tf.squeeze(label)
+        label -= labels_offset
+        label = tf.one_hot(label, num_classes - labels_offset)
+        return image, label
+
+
+def parse_example_decode(example_serialized):
+    feature_map = {
+        'image/encoded': tf.FixedLenFeature([], tf.string, ''),
+        'image/class/label': tf.FixedLenFeature([1], tf.int64, -1),
+        'image/class/text': tf.FixedLenFeature([], tf.string, ''),
+        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32)
+    }
+    with tf.compat.v1.name_scope('deserialize_image_record'):
+        obj = tf.io.parse_single_example(serialized=example_serialized, features=feature_map)
+        image = tf.image.decode_jpeg(obj['image/encoded'], channels=3, fancy_upscaling=False,
+                                     dct_method='INTEGER_FAST')
+
+    return image, obj['image/class/label']
+
+
+def parse_example(image, label, num_classes, labels_offset, image_preprocessing_fn):
+    with tf.compat.v1.name_scope('deserialize_image_record'):
+        if image_preprocessing_fn:
+            image = image_preprocessing_fn(image, 224, 224)
+        else:
+            image = tf.image.resize(image, [224, 224])
+
+        label = tf.cast(label, tf.int32)
+        label = tf.squeeze(label)
+        label -= labels_offset
+        label = tf.one_hot(label, num_classes - labels_offset)
+    return image, label
+
+
+def parse_example1(example_serialized, image_preprocessing_fn1):
+    feature_map = {
+        'image/encoded': tf.FixedLenFeature([], tf.string, ''),
+        'image/class/label': tf.FixedLenFeature([1], tf.int64, -1),
+        'image/class/text': tf.FixedLenFeature([], tf.string, ''),
+        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32)
+    }
+    with tf.compat.v1.name_scope('deserialize_image_record'):
+        obj = tf.io.parse_single_example(serialized=example_serialized, features=feature_map)
+        image = tf.image.decode_jpeg(obj['image/encoded'], channels=3, fancy_upscaling=False,
+                                     dct_method='INTEGER_FAST')
+
+        image = image_preprocessing_fn1(image, 224, 224)
+    return image, obj['image/class/label']
+
+
+def parse_example2(image, label, num_classes, labels_offset, image_preprocessing_fn2):
+    with tf.compat.v1.name_scope('deserialize_image_record'):
+        image = image_preprocessing_fn2(image, 224, 224)
+
+        label = tf.cast(label, tf.int32)
+        label = tf.squeeze(label)
+        label -= labels_offset
+        label = tf.one_hot(label, num_classes - labels_offset)
+    return image, label
+
+
+def get_data(dataset, batch_size, num_classes, labels_offset, is_training,
+             preprocessing_name=None, use_grayscale=None, add_image_summaries=False):
+    return get_data_united(dataset, batch_size, num_classes, labels_offset, is_training,
+                           preprocessing_name, use_grayscale, add_image_summaries)
+
+
+def create_ds(data_sources, is_training):
+    data_files = get_data_files(data_sources)
+    ds = tf.data.Dataset.from_tensor_slices(data_files)
+
+    if is_training:
+        ds = ds.shuffle(1000)
+    # add for eval
+    else:
+        ds = ds.take(50000)
+
+    ##### change #####
+    num_readers = 10
+    ds = ds.interleave(
+        tf.data.TFRecordDataset, cycle_length=num_readers, block_length=1,
+        num_parallel_calls=tf.data.experimental.AUTOTUNE)
+    counter = tf.data.Dataset.range(sys.maxsize)
+    ds = tf.data.Dataset.zip((ds, counter))
+    ##### change #####
+
+    if is_training:
+        ds = ds.repeat()
+
+    return ds
+
+
+def get_data_united(dataset, batch_size, num_classes, labels_offset, is_training,
+                    preprocessing_name=None, use_grayscale=None, add_image_summaries=False):
+    from preprocessing import preprocessing_factory
+    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
+        name='inception_v2',
+        is_training=is_training,
+        use_grayscale=use_grayscale,
+        add_image_summaries=add_image_summaries
+    )
+
+    ds = create_ds(dataset.data_sources, is_training)
+	
+    ds = ds.map(lambda example, counter: parse_example_proto(example, num_classes, labels_offset, image_preprocessing_fn), num_parallel_calls=24)
+
+    ds = ds.batch(batch_size, drop_remainder=True)
+
+    ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)
+
+    iterator = ds.make_initializable_iterator()
+
+    ds = threadpool.override_threadpool(ds,threadpool.PrivateThreadPool(128, display_name='input_pipeline_thread_pool'))
+
+    return iterator, ds
@@ -0,0 +1 @@
+
@@ -0,0 +1,705 @@
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts ImageNet data to TFRecords file format with Example protos.
+
+The raw ImageNet data set is expected to reside in JPEG files located in the
+following directory structure.
+
+  data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
+  data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
+  ...
+
+where 'n01440764' is the unique synset label associated with
+these images.
+
+The training data set consists of 1000 sub-directories (i.e. labels)
+each containing 1200 JPEG images for a total of 1.2M JPEG images.
+
+The evaluation data set consists of 1000 sub-directories (i.e. labels)
+each containing 50 JPEG images for a total of 50K JPEG images.
+
+This TensorFlow script converts the training and evaluation data into
+a sharded data set consisting of 1024 and 128 TFRecord files, respectively.
+
+  train_directory/train-00000-of-01024
+  train_directory/train-00001-of-01024
+  ...
+  train_directory/train-01023-of-01024
+
+and
+
+  validation_directory/validation-00000-of-00128
+  validation_directory/validation-00001-of-00128
+  ...
+  validation_directory/validation-00127-of-00128
+
+Each validation TFRecord file contains ~390 records. Each training TFRecord
+file contains ~1250 records. Each record within the TFRecord file is a
+serialized Example proto. The Example proto contains the following fields:
+
+  image/encoded: string containing JPEG encoded image in RGB colorspace
+  image/height: integer, image height in pixels
+  image/width: integer, image width in pixels
+  image/colorspace: string, specifying the colorspace, always 'RGB'
+  image/channels: integer, specifying the number of channels, always 3
+  image/format: string, specifying the format, always 'JPEG'
+
+  image/filename: string containing the basename of the image file
+            e.g. 'n01440764_10026.JPEG' or 'ILSVRC2012_val_00000293.JPEG'
+  image/class/label: integer specifying the index in a classification layer.
+    The label ranges from [1, 1000] where 0 is not used.
+  image/class/synset: string specifying the unique ID of the label,
+    e.g. 'n01440764'
+  image/class/text: string specifying the human-readable version of the label
+    e.g. 'red fox, Vulpes vulpes'
+
+  image/object/bbox/xmin: list of floats specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/xmax: list of floats specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/ymin: list of floats specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/ymax: list of floats specifying the 0+ human annotated
+    bounding boxes
+  image/object/bbox/label: integer specifying the index in a classification
+    layer. The label ranges from [1, 1000] where 0 is not used. Note this is
+    always identical to the image label.
+
+Note that the length of xmin is identical to the length of xmax, ymin and ymax
+for each example.
+
+Running this script using 16 threads may take around ~2.5 hours on a HP Z420.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from datetime import datetime
+import os
+import random
+import sys
+import threading
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+
+# Input and output locations.
+tf.app.flags.DEFINE_string('train_directory', '/tmp/',
+                           'Training data directory')
+tf.app.flags.DEFINE_string('validation_directory', '/tmp/',
+                           'Validation data directory')
+tf.app.flags.DEFINE_string('output_directory', '/tmp/',
+                           'Output data directory')
+
+# Number of output files per data set. main() asserts that each shard count
+# is a multiple of num_threads.
+tf.app.flags.DEFINE_integer('train_shards', 1024,
+                            'Number of shards in training TFRecord files.')
+tf.app.flags.DEFINE_integer('validation_shards', 128,
+                            'Number of shards in validation TFRecord files.')
+
+tf.app.flags.DEFINE_integer('num_threads', 8,
+                            'Number of threads to preprocess the images.')
+
+# The labels file holds the list of valid labels.
+# Assumes that the file contains entries as such:
+#   n01440764
+#   n01443537
+#   n01484850
+# where each line corresponds to a label expressed as a synset. We map
+# each synset contained in the file to an integer (based on the alphabetical
+# ordering). See below for details.
+tf.app.flags.DEFINE_string('labels_file',
+                           'imagenet_lsvrc_2015_synsets.txt',
+                           'Labels file')
+
+# This file contains the mapping from synset to human-readable label.
+# Assumes each line of the file looks like:
+#
+#   n02119247    black fox
+#   n02119359    silver fox
+#   n02119477    red fox, Vulpes fulva
+#
+# where each line corresponds to a unique mapping. Note that each line is
+# formatted as <synset>\t<human readable label>.
+tf.app.flags.DEFINE_string('imagenet_metadata_file',
+                           'imagenet_metadata.txt',
+                           'ImageNet metadata file')
+
+# This file is the output of process_bounding_boxes.py
+# Assumes each line of the file looks like:
+#
+#   n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
+#
+# where each line corresponds to one bounding box annotation associated
+# with an image. Each line can be parsed as:
+#
+#   <JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
+#
+# Note that there might exist multiple bounding box annotations associated
+# with an image file.
+tf.app.flags.DEFINE_string('bounding_box_file',
+                           './imagenet_2012_bounding_boxes.csv',
+                           'Bounding box file')
+
+# Parsed flag values, read by the functions below.
+FLAGS = tf.app.flags.FLAGS
+
+
+def _int64_feature(value):
+  """Builds an int64 Feature from a single value or a list of values."""
+  # A bare value is wrapped so Int64List always receives a list.
+  as_list = value if isinstance(value, list) else [value]
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=as_list))
+
+
+def _float_feature(value):
+  """Builds a float Feature from a single value or a list of values."""
+  # A bare value is wrapped so FloatList always receives a list.
+  as_list = value if isinstance(value, list) else [value]
+  return tf.train.Feature(float_list=tf.train.FloatList(value=as_list))
+
+
+def _bytes_feature(value):
+  """Wrapper for inserting bytes features into Example proto.
+
+  Args:
+    value: bytes, or a text string which is encoded as UTF-8 first.
+
+  Returns:
+    A tf.train.Feature holding `value` as a single-element BytesList.
+  """
+  # On Python 3, BytesList rejects text strings, and callers in this file
+  # pass literals such as 'RGB'. On Python 2, str is bytes, so this is a no-op.
+  if isinstance(value, str) and not isinstance(value, bytes):
+    value = value.encode('utf-8')
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+
+def _convert_to_example(filename, image_buffer, label, synset, human, bbox,
+                        height, width):
+  """Assemble the tf.train.Example describing a single image.
+
+  Args:
+    filename: string, path to an image file, e.g., '/path/to/example.JPG'
+    image_buffer: string, JPEG encoding of RGB image
+    label: integer, identifier for the ground truth for the network
+    synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
+    human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
+    bbox: list of bounding boxes; each box is a 4-element list ordered as
+      [xmin, ymin, xmax, ymax]. All boxes are assumed to belong to the same
+      label as the image label.
+    height: integer, image height in pixels
+    width: integer, image width in pixels
+  Returns:
+    Example proto
+  """
+  # One column per coordinate; each column receives one entry per box.
+  xmin, ymin, xmax, ymax = [], [], [], []
+  columns = (xmin, ymin, xmax, ymax)
+  for box in bbox:
+    assert len(box) == 4
+    for column, coordinate in zip(columns, box):
+      column.append(coordinate)
+
+  features = {
+      'image/height': _int64_feature(height),
+      'image/width': _int64_feature(width),
+      'image/colorspace': _bytes_feature('RGB'),
+      'image/channels': _int64_feature(3),
+      'image/class/label': _int64_feature(label),
+      'image/class/synset': _bytes_feature(synset),
+      'image/class/text': _bytes_feature(human),
+      'image/object/bbox/xmin': _float_feature(xmin),
+      'image/object/bbox/xmax': _float_feature(xmax),
+      'image/object/bbox/ymin': _float_feature(ymin),
+      'image/object/bbox/ymax': _float_feature(ymax),
+      # Every box carries the image-level label.
+      'image/object/bbox/label': _int64_feature([label] * len(xmin)),
+      'image/format': _bytes_feature('JPEG'),
+      'image/filename': _bytes_feature(os.path.basename(filename)),
+      'image/encoded': _bytes_feature(image_buffer),
+  }
+  return tf.train.Example(features=tf.train.Features(feature=features))
+
+
+class ImageCoder(object):
+  """Bundles a TensorFlow session with small image re-encoding graphs."""
+
+  def __init__(self):
+    # One shared Session serves every conversion and decoding request.
+    self._sess = tf.Session()
+
+    # PNG data -> RGB JPEG data.
+    self._png_data = tf.placeholder(dtype=tf.string)
+    png_pixels = tf.image.decode_png(self._png_data, channels=3)
+    self._png_to_jpeg = tf.image.encode_jpeg(
+        png_pixels, format='rgb', quality=100)
+
+    # CMYK JPEG data -> RGB JPEG data.
+    self._cmyk_data = tf.placeholder(dtype=tf.string)
+    cmyk_pixels = tf.image.decode_jpeg(self._cmyk_data, channels=0)
+    self._cmyk_to_rgb = tf.image.encode_jpeg(
+        cmyk_pixels, format='rgb', quality=100)
+
+    # RGB JPEG data -> decoded 3-channel image.
+    self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
+    self._decode_jpeg = tf.image.decode_jpeg(
+        self._decode_jpeg_data, channels=3)
+
+  def _run(self, fetch, placeholder, image_data):
+    """Evaluates `fetch` with `image_data` fed into `placeholder`."""
+    return self._sess.run(fetch, feed_dict={placeholder: image_data})
+
+  def png_to_jpeg(self, image_data):
+    """Re-encodes PNG `image_data` as JPEG and returns the encoded data."""
+    return self._run(self._png_to_jpeg, self._png_data, image_data)
+
+  def cmyk_to_rgb(self, image_data):
+    """Re-encodes CMYK JPEG `image_data` as RGB JPEG data."""
+    return self._run(self._cmyk_to_rgb, self._cmyk_data, image_data)
+
+  def decode_jpeg(self, image_data):
+    """Decodes JPEG `image_data`; asserts a rank-3 result with 3 channels."""
+    decoded = self._run(self._decode_jpeg, self._decode_jpeg_data, image_data)
+    assert len(decoded.shape) == 3
+    assert decoded.shape[2] == 3
+    return decoded
+
+
+def _is_png(filename):
+  """Report whether `filename` names the one known PNG-encoded image.
+
+  Args:
+    filename: string, path of the image file.
+
+  Returns:
+    boolean, True when the path contains the known PNG file's name.
+  """
+  # File list from:
+  # https://groups.google.com/forum/embed/?place=forum/torch7#!topic/torch7/fOSTXHIESSU
+  known_png = 'n02105855_2933.JPEG'
+  return known_png in filename
+
+
+def _is_cmyk(filename):
+  """Report whether `filename` is one of the known CMYK-encoded JPEGs.
+
+  Args:
+    filename: string, path of the image file.
+
+  Returns:
+    boolean, True when the last '/'-separated component of the path is in
+    the list of known CMYK images.
+  """
+  # File list from:
+  # https://github.com/cytsai/ilsvrc-cmyk-image-list
+  cmyk_names = (
+      'n01739381_1309.JPEG', 'n02077923_14822.JPEG',
+      'n02447366_23489.JPEG', 'n02492035_15739.JPEG',
+      'n02747177_10752.JPEG', 'n03018349_4028.JPEG',
+      'n03062245_4620.JPEG', 'n03347037_9675.JPEG',
+      'n03467068_12171.JPEG', 'n03529860_11437.JPEG',
+      'n03544143_17228.JPEG', 'n03633091_5218.JPEG',
+      'n03710637_5125.JPEG', 'n03961711_5286.JPEG',
+      'n04033995_2932.JPEG', 'n04258138_17003.JPEG',
+      'n04264628_27969.JPEG', 'n04336792_7448.JPEG',
+      'n04371774_5854.JPEG', 'n04596742_4225.JPEG',
+      'n07583066_647.JPEG', 'n13037406_4650.JPEG',
+  )
+  leaf = filename.rsplit('/', 1)[-1]
+  return leaf in cmyk_names
+
+
+def _process_image(filename, coder):
+  """Process a single image file.
+
+  Args:
+    filename: string, path to an image file e.g., '/path/to/example.JPG'.
+    coder: instance of ImageCoder to provide TensorFlow image coding utils.
+  Returns:
+    image_buffer: string, JPEG encoding of RGB image.
+    height: integer, image height in pixels.
+    width: integer, image width in pixels.
+  """
+  # Read the raw bytes. Binary mode is required: in text mode Python 3 tries
+  # to decode the JPEG data as text. The `with` block closes the handle.
+  with tf.gfile.GFile(filename, 'rb') as f:
+    image_data = f.read()
+
+  # Clean the dirty data.
+  if _is_png(filename):
+    # 1 image is a PNG.
+    print('Converting PNG to JPEG for %s' % filename)
+    image_data = coder.png_to_jpeg(image_data)
+  elif _is_cmyk(filename):
+    # 22 JPEG images are in CMYK colorspace.
+    print('Converting CMYK to RGB for %s' % filename)
+    image_data = coder.cmyk_to_rgb(image_data)
+
+  # Decode the RGB JPEG; only the shape of the decoded image is used here.
+  image = coder.decode_jpeg(image_data)
+
+  # Check that image converted to RGB
+  assert len(image.shape) == 3
+  height = image.shape[0]
+  width = image.shape[1]
+  assert image.shape[2] == 3
+
+  # The encoded bytes (not the decoded pixels) are what gets stored.
+  return image_data, height, width
+
+
+def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
+                               synsets, labels, humans, bboxes, num_shards):
+  """Processes and saves list of images as TFRecord in 1 thread.
+
+  Args:
+    coder: instance of ImageCoder to provide TensorFlow image coding utils.
+    thread_index: integer, unique batch to run index is within [0, len(ranges)).
+    ranges: list of pairs of integers specifying ranges of each batches to
+      analyze in parallel.
+    name: string, unique identifier specifying the data set
+    filenames: list of strings; each string is a path to an image file
+    synsets: list of strings; each string is a unique WordNet ID
+    labels: list of integer; each integer identifies the ground truth
+    humans: list of strings; each string is a human-readable label
+    bboxes: list of bounding boxes for each image. Note that each entry in this
+      list might contain from 0+ entries corresponding to the number of bounding
+      box annotations for the image.
+    num_shards: integer number of shards for this data set.
+  """
+  # Each thread produces N shards where N = int(num_shards / num_threads).
+  # For instance, if num_shards = 128, and the num_threads = 2, then the first
+  # thread would produce shards [0, 64).
+  num_threads = len(ranges)
+  assert not num_shards % num_threads
+  num_shards_per_batch = int(num_shards / num_threads)
+
+  # Split this thread's file range into num_shards_per_batch contiguous pieces.
+  shard_ranges = np.linspace(ranges[thread_index][0],
+                             ranges[thread_index][1],
+                             num_shards_per_batch + 1).astype(int)
+  num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
+
+  counter = 0
+  for s in xrange(num_shards_per_batch):
+    # Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
+    shard = thread_index * num_shards_per_batch + s
+    output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
+    output_file = os.path.join(FLAGS.output_directory, output_filename)
+    writer = tf.python_io.TFRecordWriter(output_file)
+
+    shard_counter = 0
+    files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
+    try:
+      for i in files_in_shard:
+        filename = filenames[i]
+        label = labels[i]
+        synset = synsets[i]
+        human = humans[i]
+        bbox = bboxes[i]
+
+        image_buffer, height, width = _process_image(filename, coder)
+
+        example = _convert_to_example(filename, image_buffer, label,
+                                      synset, human, bbox,
+                                      height, width)
+        writer.write(example.SerializeToString())
+        shard_counter += 1
+        counter += 1
+
+        if not counter % 1000:
+          print('%s [thread %d]: Processed %d of %d images in thread batch.' %
+                (datetime.now(), thread_index, counter, num_files_in_thread))
+          sys.stdout.flush()
+    finally:
+      # Close the shard even if an image fails, so the handle is not leaked.
+      writer.close()
+
+    print('%s [thread %d]: Wrote %d images to %s' %
+          (datetime.now(), thread_index, shard_counter, output_file))
+    sys.stdout.flush()
+  # Report the number of shards written, not the number of files.
+  print('%s [thread %d]: Wrote %d images to %d shards.' %
+        (datetime.now(), thread_index, counter, num_shards_per_batch))
+  sys.stdout.flush()
+
+
+def _process_image_files(name, filenames, synsets, labels, humans,
+                         bboxes, num_shards):
+  """Process and save list of images as TFRecord of Example protos.
+
+  Args:
+    name: string, unique identifier specifying the data set
+    filenames: list of strings; each string is a path to an image file
+    synsets: list of strings; each string is a unique WordNet ID
+    labels: list of integer; each integer identifies the ground truth
+    humans: list of strings; each string is a human-readable label
+    bboxes: list of bounding boxes for each image. Note that each entry in this
+      list might contain from 0+ entries corresponding to the number of bounding
+      box annotations for the image.
+    num_shards: integer number of shards for this data set.
+  """
+  assert len(filenames) == len(synsets)
+  assert len(filenames) == len(labels)
+  assert len(filenames) == len(humans)
+  assert len(filenames) == len(bboxes)
+
+  # Break all images into batches with a [ranges[i][0], ranges[i][1]].
+  # The builtin `int` is used because the `np.int` alias was removed from
+  # NumPy.
+  spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int)
+  ranges = [[spacing[i], spacing[i + 1]] for i in xrange(len(spacing) - 1)]
+
+  # Launch a thread for each batch.
+  print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
+  sys.stdout.flush()
+
+  # Create a mechanism for monitoring when all threads are finished.
+  coord = tf.train.Coordinator()
+
+  # Create a generic TensorFlow-based utility for converting all image codings.
+  # A single coder instance is passed to every worker thread.
+  coder = ImageCoder()
+
+  threads = []
+  for thread_index in xrange(len(ranges)):
+    args = (coder, thread_index, ranges, name, filenames,
+            synsets, labels, humans, bboxes, num_shards)
+    t = threading.Thread(target=_process_image_files_batch, args=args)
+    t.start()
+    threads.append(t)
+
+  # Wait for all the threads to terminate.
+  coord.join(threads)
+  print('%s: Finished writing all %d images in data set.' %
+        (datetime.now(), len(filenames)))
+  sys.stdout.flush()
+
+
+def _find_image_files(data_dir, labels_file):
+  """Build a list of all images files and labels in the data set.
+
+  Args:
+    data_dir: string, path to the root directory of images.
+
+      Assumes that the ImageNet data set resides in JPEG files located in
+      the following directory structure.
+
+        data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
+        data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
+
+      where 'n01440764' is the unique synset label associated with these images.
+
+    labels_file: string, path to the labels file.
+
+      The list of valid labels are held in this file. Assumes that the file
+      contains entries as such:
+        n01440764
+        n01443537
+        n01484850
+      where each line corresponds to a label expressed as a synset. We map
+      each synset contained in the file to an integer (based on the alphabetical
+      ordering) starting with the integer 1 corresponding to the synset
+      contained in the first line.
+
+      The reason we start the integer labels at 1 is to reserve label 0 as an
+      unused background class.
+
+  Returns:
+    filenames: list of strings; each string is a path to an image file.
+    synsets: list of strings; each string is a unique WordNet ID.
+    labels: list of integer; each integer identifies the ground truth.
+  """
+  print('Determining list of input files and labels from %s.' % data_dir)
+  # Read the synsets inside a `with` block so the file handle is closed.
+  with tf.gfile.GFile(labels_file, 'r') as f:
+    challenge_synsets = [l.strip() for l in f.readlines()]
+
+  labels = []
+  filenames = []
+  synsets = []
+
+  # Leave label index 0 empty as a background class.
+  label_index = 1
+
+  # Construct the list of JPEG files and labels.
+  for synset in challenge_synsets:
+    jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset)
+    matching_files = tf.gfile.Glob(jpeg_file_path)
+
+    labels.extend([label_index] * len(matching_files))
+    synsets.extend([synset] * len(matching_files))
+    filenames.extend(matching_files)
+
+    if not label_index % 100:
+      print('Finished finding files in %d of %d classes.' % (
+          label_index, len(challenge_synsets)))
+    label_index += 1
+
+  # Shuffle the ordering of all image files in order to guarantee
+  # random ordering of the images with respect to label in the
+  # saved TFRecord files. Make the randomization repeatable.
+  # The index must be a real list: random.shuffle mutates in place and a
+  # Python 3 `range` object is immutable.
+  shuffled_index = list(range(len(filenames)))
+  random.seed(12345)
+  random.shuffle(shuffled_index)
+
+  filenames = [filenames[i] for i in shuffled_index]
+  synsets = [synsets[i] for i in shuffled_index]
+  labels = [labels[i] for i in shuffled_index]
+
+  print('Found %d JPEG files across %d labels inside %s.' %
+        (len(filenames), len(challenge_synsets), data_dir))
+  return filenames, synsets, labels
+
+
+def _find_human_readable_labels(synsets, synset_to_human):
+  """Translate each synset ID into its human-readable label.
+
+  Args:
+    synsets: list of strings; each string is a unique WordNet ID.
+    synset_to_human: dict of synset to human labels, e.g.,
+      'n02119022' --> 'red fox, Vulpes vulpes'
+
+  Returns:
+    List of human-readable strings, one per entry of `synsets`, in order.
+  """
+  readable = []
+  for wnid in synsets:
+    # Every synset is expected to be present in the lookup table.
+    assert wnid in synset_to_human, ('Failed to find: %s' % wnid)
+    readable += [synset_to_human[wnid]]
+  return readable
+
+
+def _find_image_bounding_boxes(filenames, image_to_bboxes):
+  """Look up the bounding boxes recorded for each image file.
+
+  Args:
+    filenames: list of strings; each string is a path to an image file.
+    image_to_bboxes: dictionary mapping image file names to a list of
+      bounding boxes. This list contains 0+ bounding boxes.
+  Returns:
+    List with one entry per file name, in order: the boxes registered under
+    the file's base name, or an empty list when there are none.
+  """
+  matched = 0
+  per_image = []
+  for path in filenames:
+    # The lookup table is keyed by base name, not by full path.
+    key = os.path.basename(path)
+    has_boxes = key in image_to_bboxes
+    per_image.append(image_to_bboxes[key] if has_boxes else [])
+    matched += int(has_boxes)
+  print('Found %d images with bboxes out of %d images' % (
+      matched, len(filenames)))
+  return per_image
+
+
+def _process_dataset(name, directory, num_shards, synset_to_human,
+                     image_to_bboxes):
+  """Convert one complete data set into sharded TFRecord files.
+
+  Args:
+    name: string, unique identifier specifying the data set.
+    directory: string, root path to the data set.
+    num_shards: integer number of shards for this data set.
+    synset_to_human: dict of synset to human labels, e.g.,
+      'n02119022' --> 'red fox, Vulpes vulpes'
+    image_to_bboxes: dictionary mapping image file names to a list of
+      bounding boxes. This list contains 0+ bounding boxes.
+  """
+  # Discover the images, then attach per-image metadata before writing.
+  found = _find_image_files(directory, FLAGS.labels_file)
+  filenames, synsets, labels = found
+  humans = _find_human_readable_labels(synsets, synset_to_human)
+  bboxes = _find_image_bounding_boxes(filenames, image_to_bboxes)
+  _process_image_files(
+      name, filenames, synsets, labels, humans, bboxes, num_shards)
+
+
+def _build_synset_lookup(imagenet_metadata_file):
+  """Build lookup for synset to human-readable label.
+
+  Args:
+    imagenet_metadata_file: string, path to file containing mapping from
+      synset to human-readable label.
+
+      Assumes each line of the file looks like:
+
+        n02119247    black fox
+        n02119359    silver fox
+        n02119477    red fox, Vulpes fulva
+
+      where each line corresponds to a unique mapping. Note that each line is
+      formatted as <synset>\t<human readable label>. Blank lines are skipped.
+
+  Returns:
+    Dictionary of synset to human labels, such as:
+      'n02119022' --> 'red fox, Vulpes vulpes'
+  """
+  # Read inside a `with` block so the file handle is closed.
+  with tf.gfile.GFile(imagenet_metadata_file, 'r') as f:
+    lines = f.readlines()
+  synset_to_human = {}
+  for l in lines:
+    # readlines() keeps the trailing newline, so a blank line is not falsy
+    # until it has been stripped.
+    stripped = l.strip()
+    if not stripped:
+      continue
+    parts = stripped.split('\t')
+    assert len(parts) == 2
+    synset = parts[0]
+    human = parts[1]
+    synset_to_human[synset] = human
+  return synset_to_human
+
+
+def _build_bounding_box_lookup(bounding_box_file):
+  """Build a lookup from image file to bounding boxes.
+
+  Args:
+    bounding_box_file: string, path to file with bounding boxes annotations.
+
+      Assumes each line of the file looks like:
+
+        n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
+
+      where each line corresponds to one bounding box annotation associated
+      with an image. Each line can be parsed as:
+
+        <JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
+
+      Note that there might exist multiple bounding box annotations associated
+      with an image file. This file is the output of process_bounding_boxes.py.
+      Blank lines are skipped.
+
+  Returns:
+    Dictionary mapping image file names to a list of bounding boxes. This list
+    contains 0+ bounding boxes.
+  """
+  # Read inside a `with` block so the file handle is closed.
+  with tf.gfile.GFile(bounding_box_file, 'r') as f:
+    lines = f.readlines()
+  images_to_bboxes = {}
+  num_bbox = 0
+  num_image = 0
+  for l in lines:
+    # readlines() keeps the trailing newline, so a blank line is only
+    # detectable after stripping.
+    if not l.strip():
+      continue
+    parts = l.split(',')
+    assert len(parts) == 5, ('Failed to parse: %s' % l)
+    filename = parts[0]
+    xmin = float(parts[1])
+    ymin = float(parts[2])
+    xmax = float(parts[3])
+    ymax = float(parts[4])
+    box = [xmin, ymin, xmax, ymax]
+
+    if filename not in images_to_bboxes:
+      images_to_bboxes[filename] = []
+      num_image += 1
+    images_to_bboxes[filename].append(box)
+    num_bbox += 1
+
+  print('Successfully read %d bounding boxes '
+        'across %d images.' % (num_bbox, num_image))
+  return images_to_bboxes
+
+
+def main(unused_argv):
+  """Convert the validation set, then the training set, to TFRecords."""
+  # Each thread writes a whole number of shards, so both shard counts must
+  # be multiples of the thread count.
+  assert not FLAGS.train_shards % FLAGS.num_threads, (
+      'Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
+  assert not FLAGS.validation_shards % FLAGS.num_threads, (
+      'Please make the FLAGS.num_threads commensurate with '
+      'FLAGS.validation_shards')
+  print('Saving results to %s' % FLAGS.output_directory)
+
+  # Lookup tables shared by both data sets.
+  synset_to_human = _build_synset_lookup(FLAGS.imagenet_metadata_file)
+  image_to_bboxes = _build_bounding_box_lookup(FLAGS.bounding_box_file)
+
+  jobs = (
+      ('validation', FLAGS.validation_directory, FLAGS.validation_shards),
+      ('train', FLAGS.train_directory, FLAGS.train_shards),
+  )
+  for split_name, directory, shard_count in jobs:
+    _process_dataset(split_name, directory, shard_count,
+                     synset_to_human, image_to_bboxes)
+
+
+# Hand control to tf.app.run() only when this file is executed as a script.
+if __name__ == '__main__':
+  tf.app.run()
@@ -0,0 +1,100 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides data for the Cifar10 dataset.
+
+The dataset scripts used to create the dataset can be found at:
+tensorflow/models/research/slim/datasets/download_and_convert_cifar10.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from datasets import dataset_utils
+
+# Short alias for the contrib slim module.
+slim = contrib_slim
+
+# Default file name template; '%s' is replaced with the split name.
+_FILE_PATTERN = 'cifar10_%s.tfrecord'
+
+# Recognized split names and the sample count reported for each.
+SPLITS_TO_SIZES = {'train': 50000, 'test': 10000}
+
+# Class count reported on the returned Dataset.
+_NUM_CLASSES = 10
+
+# Descriptions of the items exposed by the decoder.
+_ITEMS_TO_DESCRIPTIONS = {
+    'image': 'A [32 x 32 x 3] color image.',
+    'label': 'A single integer between 0 and 9',
+}
+
+
+def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
+  """Describe how to read one split of cifar10 as a slim `Dataset`.
+
+  Args:
+    split_name: A train/test split name.
+    dataset_dir: The base directory of the dataset sources.
+    file_pattern: The file pattern to use when matching the dataset sources.
+      It is assumed that the pattern contains a '%s' string so that the split
+      name can be inserted. Falls back to the module default when empty.
+    reader: The TensorFlow reader type. Falls back to `tf.TFRecordReader`
+      when not given.
+
+  Returns:
+    A `Dataset` namedtuple.
+
+  Raises:
+    ValueError: if `split_name` is not a valid train/test split.
+  """
+  if split_name not in SPLITS_TO_SIZES:
+    raise ValueError('split name %s was not recognized.' % split_name)
+
+  pattern = file_pattern or _FILE_PATTERN
+  data_sources = os.path.join(dataset_dir, pattern % split_name)
+
+  # None is allowed in the signature so dataset_factory can rely on the
+  # default reader.
+  record_reader = reader or tf.TFRecordReader
+
+  label_default = tf.zeros([], dtype=tf.int64)
+  keys_to_features = {
+      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
+      'image/format': tf.FixedLenFeature((), tf.string, default_value='png'),
+      'image/class/label': tf.FixedLenFeature(
+          [], tf.int64, default_value=label_default),
+  }
+  items_to_handlers = {
+      'image': slim.tfexample_decoder.Image(shape=[32, 32, 3]),
+      'label': slim.tfexample_decoder.Tensor('image/class/label'),
+  }
+  decoder = slim.tfexample_decoder.TFExampleDecoder(
+      keys_to_features, items_to_handlers)
+
+  # The label-name mapping is optional; it is read only when the dataset
+  # directory has a labels file.
+  if dataset_utils.has_labels(dataset_dir):
+    labels_to_names = dataset_utils.read_label_file(dataset_dir)
+  else:
+    labels_to_names = None
+
+  return slim.dataset.Dataset(
+      data_sources=data_sources,
+      reader=record_reader,
+      decoder=decoder,
+      num_samples=SPLITS_TO_SIZES[split_name],
+      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
+      num_classes=_NUM_CLASSES,
+      labels_to_names=labels_to_names,
+  )
@@ -0,0 +1,59 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A factory-pattern class which returns classification image/label pairs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from datasets import cifar10
+from datasets import flowers
+from datasets import imagenet
+from datasets import mnist
+from datasets import visualwakewords
+
+# Maps each supported dataset name to the module implementing it; get_dataset
+# calls `get_split` on the selected module.
+datasets_map = {
+    'cifar10': cifar10,
+    'flowers': flowers,
+    'imagenet': imagenet,
+    'mnist': mnist,
+    'visualwakewords': visualwakewords,
+}
+
+
+def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
+  """Return the Dataset for the requested dataset name and split.
+
+  Args:
+    name: String, the name of the dataset.
+    split_name: A train/test split name.
+    dataset_dir: The directory where the dataset files are stored.
+    file_pattern: The file pattern to use for matching the dataset source files.
+    reader: The subclass of tf.ReaderBase. If left as `None`, then the default
+      reader defined by each dataset is used.
+
+  Returns:
+    A `Dataset` class.
+
+  Raises:
+    ValueError: If the dataset `name` is unknown.
+  """
+  if name in datasets_map:
+    dataset_module = datasets_map[name]
+    return dataset_module.get_split(
+        split_name, dataset_dir, file_pattern, reader)
+  raise ValueError('Name of dataset unknown %s' % name)
@@ -0,0 +1,240 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains utilities for downloading and converting datasets."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import tarfile
+import zipfile
+
+from six.moves import urllib
+import tensorflow as tf
+
+# Default file name used by write_label_file(), has_labels() and
+# read_label_file() for the label map stored inside a dataset directory.
+LABELS_FILENAME = 'labels.txt'
+
+
+def int64_feature(values):
+  """Returns a TF-Feature of int64s.
+
+  Args:
+    values: A scalar or list of values.
+
+  Returns:
+    A TF-Feature.
+  """
+  if not isinstance(values, (tuple, list)):
+    values = [values]
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+
+def bytes_list_feature(values):
+  """Returns a TF-Feature of list of bytes.
+
+  Args:
+    values: A string or list of strings.
+
+  Returns:
+    A TF-Feature.
+  """
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))
+
+
+def float_list_feature(values):
+  """Returns a TF-Feature of list of floats.
+
+  Args:
+    values: A float or list of floats.
+
+  Returns:
+    A TF-Feature.
+  """
+  return tf.train.Feature(float_list=tf.train.FloatList(value=values))
+
+
+def bytes_feature(values):
+  """Returns a TF-Feature of bytes.
+
+  Args:
+    values: A string.
+
+  Returns:
+    A TF-Feature.
+  """
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+
+
+def float_feature(values):
+  """Returns a TF-Feature of floats.
+
+  Args:
+    values: A scalar of list of values.
+
+  Returns:
+    A TF-Feature.
+  """
+  if not isinstance(values, (tuple, list)):
+    values = [values]
+  return tf.train.Feature(float_list=tf.train.FloatList(value=values))
+
+
+def image_to_tfexample(image_data, image_format, height, width, class_id):
+  return tf.train.Example(features=tf.train.Features(feature={
+      'image/encoded': bytes_feature(image_data),
+      'image/format': bytes_feature(image_format),
+      'image/class/label': int64_feature(class_id),
+      'image/height': int64_feature(height),
+      'image/width': int64_feature(width),
+  }))
+
+
+def download_url(url, dataset_dir):
+  """Downloads the tarball or zip file from url into filepath.
+
+  Args:
+    url: The URL of a tarball or zip file.
+    dataset_dir: The directory where the temporary files are stored.
+
+  Returns:
+    filepath: path where the file is downloaded.
+  """
+  filename = url.split('/')[-1]
+  filepath = os.path.join(dataset_dir, filename)
+
+  def _progress(count, block_size, total_size):
+    sys.stdout.write('\r>> Downloading %s %.1f%%' % (
+        filename, float(count * block_size) / float(total_size) * 100.0))
+    sys.stdout.flush()
+
+  filepath, _ = urllib.request.urlretrieve(url, filepath, _progress)
+  print()
+  statinfo = os.stat(filepath)
+  print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
+  return filepath
+
+
+def download_and_uncompress_tarball(tarball_url, dataset_dir):
+  """Downloads the `tarball_url` and uncompresses it locally.
+
+  Args:
+    tarball_url: The URL of a tarball file.
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  filepath = download_url(tarball_url, dataset_dir)
+  tarfile.open(filepath, 'r:gz').extractall(dataset_dir)
+
+
+def download_and_uncompress_zipfile(zip_url, dataset_dir):
+  """Downloads the `zip_url` and uncompresses it locally.
+
+  Args:
+    zip_url: The URL of a zip file.
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  filename = zip_url.split('/')[-1]
+  filepath = os.path.join(dataset_dir, filename)
+
+  if tf.gfile.Exists(filepath):
+    print('File {filename} has been already downloaded at {filepath}. '
+          'Unzipping it....'.format(filename=filename, filepath=filepath))
+  else:
+    filepath = download_url(zip_url, dataset_dir)
+
+  with zipfile.ZipFile(filepath, 'r') as zip_file:
+    for member in zip_file.namelist():
+      memberpath = os.path.join(dataset_dir, member)
+      # extract only if file doesn't exist
+      if not (os.path.exists(memberpath) or os.path.isfile(memberpath)):
+        zip_file.extract(member, dataset_dir)
+
+
+def write_label_file(labels_to_class_names,
+                     dataset_dir,
+                     filename=LABELS_FILENAME):
+  """Writes a file with the list of class names.
+
+  Args:
+    labels_to_class_names: A map of (integer) labels to class names.
+    dataset_dir: The directory in which the labels file should be written.
+    filename: The filename where the class names are written.
+  """
+  labels_filename = os.path.join(dataset_dir, filename)
+  with tf.gfile.Open(labels_filename, 'w') as f:
+    for label in labels_to_class_names:
+      class_name = labels_to_class_names[label]
+      f.write('%d:%s\n' % (label, class_name))
+
+
+def has_labels(dataset_dir, filename=LABELS_FILENAME):
+  """Specifies whether or not the dataset directory contains a label map file.
+
+  Args:
+    dataset_dir: The directory in which the labels file is found.
+    filename: The filename where the class names are written.
+
+  Returns:
+    `True` if the labels file exists and `False` otherwise.
+  """
+  return tf.gfile.Exists(os.path.join(dataset_dir, filename))
+
+
+def read_label_file(dataset_dir, filename=LABELS_FILENAME):
+  """Reads the labels file and returns a mapping from ID to class name.
+
+  Args:
+    dataset_dir: The directory in which the labels file is found.
+    filename: The filename where the class names are written.
+
+  Returns:
+    A map from a label (integer) to class name.
+  """
+  labels_filename = os.path.join(dataset_dir, filename)
+  with tf.gfile.Open(labels_filename, 'rb') as f:
+    lines = f.read().decode()
+  lines = lines.split('\n')
+  lines = filter(None, lines)
+
+  labels_to_class_names = {}
+  for line in lines:
+    index = line.index(':')
+    labels_to_class_names[int(line[:index])] = line[index+1:]
+  return labels_to_class_names
+
+
+def open_sharded_output_tfrecords(exit_stack, base_path, num_shards):
+  """Opens all TFRecord shards for writing and adds them to an exit stack.
+
+  Args:
+    exit_stack: A context2.ExitStack used to automatically closed the TFRecords
+      opened in this function.
+    base_path: The base path for all shards
+    num_shards: The number of shards
+
+  Returns:
+    The list of opened TFRecords. Position k in the list corresponds to shard k.
+  """
+  tf_record_output_filenames = [
+      '{}-{:05d}-of-{:05d}'.format(base_path, idx, num_shards)
+      for idx in range(num_shards)
+  ]
+
+  tfrecords = [
+      exit_stack.enter_context(tf.python_io.TFRecordWriter(file_name))
+      for file_name in tf_record_output_filenames
+  ]
+
+  return tfrecords
@@ -0,0 +1,198 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Downloads and converts cifar10 data to TFRecords of TF-Example protos.
+
+This module downloads the cifar10 data, uncompresses it, reads the files
+that make up the cifar10 data and creates two TFRecord datasets: one for train
+and one for test. Each TFRecord dataset is comprised of a set of TF-Example
+protocol buffers, each of which contain a single image and label.
+
+The script should take several minutes to run.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import tarfile
+
+import numpy as np
+from six.moves import cPickle
+from six.moves import urllib
+import tensorflow as tf
+
+from datasets import dataset_utils
+
+# The URL where the CIFAR data can be downloaded.
+_DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
+
+# The number of training files; run() reads data_batch_1 through
+# data_batch_<_NUM_TRAIN_FILES>.
+_NUM_TRAIN_FILES = 5
+
+# The height and width of each image, recorded in every written example.
+_IMAGE_SIZE = 32
+
+# The names of the classes. run() writes them to the labels file using the
+# list index as the integer label.
+_CLASS_NAMES = [
+    'airplane',
+    'automobile',
+    'bird',
+    'cat',
+    'deer',
+    'dog',
+    'frog',
+    'horse',
+    'ship',
+    'truck',
+]
+
+
+def _add_to_tfrecord(filename, tfrecord_writer, offset=0):
+  """Loads data from the cifar10 pickle files and writes files to a TFRecord.
+
+  Args:
+    filename: The filename of the cifar10 pickle file.
+    tfrecord_writer: The TFRecord writer to use for writing.
+    offset: An offset into the absolute number of images previously written.
+
+  Returns:
+    The new offset.
+  """
+  with tf.gfile.Open(filename, 'rb') as f:
+    if sys.version_info < (3,):
+      data = cPickle.load(f)
+    else:
+      data = cPickle.load(f, encoding='bytes')
+
+  images = data[b'data']
+  num_images = images.shape[0]
+
+  images = images.reshape((num_images, 3, 32, 32))
+  labels = data[b'labels']
+
+  with tf.Graph().as_default():
+    image_placeholder = tf.placeholder(dtype=tf.uint8)
+    encoded_image = tf.image.encode_png(image_placeholder)
+
+    with tf.Session('') as sess:
+
+      for j in range(num_images):
+        sys.stdout.write('\r>> Reading file [%s] image %d/%d' % (
+            filename, offset + j + 1, offset + num_images))
+        sys.stdout.flush()
+
+        image = np.squeeze(images[j]).transpose((1, 2, 0))
+        label = labels[j]
+
+        png_string = sess.run(encoded_image,
+                              feed_dict={image_placeholder: image})
+
+        example = dataset_utils.image_to_tfexample(
+            png_string, b'png', _IMAGE_SIZE, _IMAGE_SIZE, label)
+        tfrecord_writer.write(example.SerializeToString())
+
+  return offset + num_images
+
+
+def _get_output_filename(dataset_dir, split_name):
+  """Creates the output filename.
+
+  Args:
+    dataset_dir: The dataset directory where the dataset is stored.
+    split_name: The name of the train/test split.
+
+  Returns:
+    An absolute file path.
+  """
+  return '%s/cifar10_%s.tfrecord' % (dataset_dir, split_name)
+
+
+def _download_and_uncompress_dataset(dataset_dir):
+  """Downloads cifar10 and uncompresses it locally.
+
+  Args:
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  filename = _DATA_URL.split('/')[-1]
+  filepath = os.path.join(dataset_dir, filename)
+
+  if not os.path.exists(filepath):
+    def _progress(count, block_size, total_size):
+      sys.stdout.write('\r>> Downloading %s %.1f%%' % (
+          filename, float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.flush()
+    filepath, _ = urllib.request.urlretrieve(_DATA_URL, filepath, _progress)
+    print()
+    statinfo = os.stat(filepath)
+    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
+    tarfile.open(filepath, 'r:gz').extractall(dataset_dir)
+
+
+def _clean_up_temporary_files(dataset_dir):
+  """Removes temporary files used to create the dataset.
+
+  Args:
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  filename = _DATA_URL.split('/')[-1]
+  filepath = os.path.join(dataset_dir, filename)
+  tf.gfile.Remove(filepath)
+
+  tmp_dir = os.path.join(dataset_dir, 'cifar-10-batches-py')
+  tf.gfile.DeleteRecursively(tmp_dir)
+
+
+def run(dataset_dir):
+  """Runs the download and conversion operation.
+
+  Args:
+    dataset_dir: The dataset directory where the dataset is stored.
+  """
+  if not tf.gfile.Exists(dataset_dir):
+    tf.gfile.MakeDirs(dataset_dir)
+
+  training_filename = _get_output_filename(dataset_dir, 'train')
+  testing_filename = _get_output_filename(dataset_dir, 'test')
+
+  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
+    print('Dataset files already exist. Exiting without re-creating them.')
+    return
+
+  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
+
+  # First, process the training data:
+  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
+    offset = 0
+    for i in range(_NUM_TRAIN_FILES):
+      filename = os.path.join(dataset_dir,
+                              'cifar-10-batches-py',
+                              'data_batch_%d' % (i + 1))  # 1-indexed.
+      offset = _add_to_tfrecord(filename, tfrecord_writer, offset)
+
+  # Next, process the testing data:
+  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
+    filename = os.path.join(dataset_dir,
+                            'cifar-10-batches-py',
+                            'test_batch')
+    _add_to_tfrecord(filename, tfrecord_writer)
+
+  # Finally, write the labels file:
+  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
+  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
+
+  _clean_up_temporary_files(dataset_dir)
+  print('\nFinished converting the Cifar10 dataset!')
@@ -0,0 +1,211 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Downloads and converts Flowers data to TFRecords of TF-Example protos.
+
+This module downloads the Flowers data, uncompresses it, reads the files
+that make up the Flowers data and creates two TFRecord datasets: one for train
+and one for test. Each TFRecord dataset is comprised of a set of TF-Example
+protocol buffers, each of which contain a single image and label.
+
+The script should take about a minute to run.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import os
+import random
+import sys
+
+import tensorflow as tf
+
+from datasets import dataset_utils
+
+# The URL where the Flowers data can be downloaded.
+_DATA_URL = 'http://download.tensorflow.org/example_images/flower_photos.tgz'
+
+# The number of images in the validation set; run() takes this many files
+# from the front of the shuffled list and trains on the rest.
+_NUM_VALIDATION = 350
+
+# Seed for repeatability of the shuffle performed in run().
+_RANDOM_SEED = 0
+
+# The number of shards per dataset split.
+_NUM_SHARDS = 5
+
+
+class ImageReader(object):
+  """Helper class that provides TensorFlow image coding utilities."""
+
+  def __init__(self):
+    # Initializes function that decodes RGB JPEG data.
+    self._decode_jpeg_data = tf.placeholder(dtype=tf.string)
+    self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3)
+
+  def read_image_dims(self, sess, image_data):
+    image = self.decode_jpeg(sess, image_data)
+    return image.shape[0], image.shape[1]
+
+  def decode_jpeg(self, sess, image_data):
+    image = sess.run(self._decode_jpeg,
+                     feed_dict={self._decode_jpeg_data: image_data})
+    assert len(image.shape) == 3
+    assert image.shape[2] == 3
+    return image
+
+
+def _get_filenames_and_classes(dataset_dir):
+  """Returns a list of filenames and inferred class names.
+
+  Args:
+    dataset_dir: A directory containing a set of subdirectories representing
+      class names. Each subdirectory should contain PNG or JPG encoded images.
+
+  Returns:
+    A list of image file paths, relative to `dataset_dir` and the list of
+    subdirectories, representing class names.
+  """
+  flower_root = os.path.join(dataset_dir, 'flower_photos')
+  directories = []
+  class_names = []
+  for filename in os.listdir(flower_root):
+    path = os.path.join(flower_root, filename)
+    if os.path.isdir(path):
+      directories.append(path)
+      class_names.append(filename)
+
+  photo_filenames = []
+  for directory in directories:
+    for filename in os.listdir(directory):
+      path = os.path.join(directory, filename)
+      photo_filenames.append(path)
+
+  return photo_filenames, sorted(class_names)
+
+
+def _get_dataset_filename(dataset_dir, split_name, shard_id):
+  output_filename = 'flowers_%s_%05d-of-%05d.tfrecord' % (
+      split_name, shard_id, _NUM_SHARDS)
+  return os.path.join(dataset_dir, output_filename)
+
+
+def _convert_dataset(split_name, filenames, class_names_to_ids, dataset_dir):
+  """Converts the given filenames to a TFRecord dataset.
+
+  Args:
+    split_name: The name of the dataset, either 'train' or 'validation'.
+    filenames: A list of absolute paths to png or jpg images.
+    class_names_to_ids: A dictionary from class names (strings) to ids
+      (integers).
+    dataset_dir: The directory where the converted datasets are stored.
+  """
+  assert split_name in ['train', 'validation']
+
+  num_per_shard = int(math.ceil(len(filenames) / float(_NUM_SHARDS)))
+
+  with tf.Graph().as_default():
+    image_reader = ImageReader()
+
+    with tf.Session('') as sess:
+
+      for shard_id in range(_NUM_SHARDS):
+        output_filename = _get_dataset_filename(
+            dataset_dir, split_name, shard_id)
+
+        with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer:
+          start_ndx = shard_id * num_per_shard
+          end_ndx = min((shard_id+1) * num_per_shard, len(filenames))
+          for i in range(start_ndx, end_ndx):
+            sys.stdout.write('\r>> Converting image %d/%d shard %d' % (
+                i+1, len(filenames), shard_id))
+            sys.stdout.flush()
+
+            # Read the filename:
+            image_data = tf.gfile.GFile(filenames[i], 'rb').read()
+            height, width = image_reader.read_image_dims(sess, image_data)
+
+            class_name = os.path.basename(os.path.dirname(filenames[i]))
+            class_id = class_names_to_ids[class_name]
+
+            example = dataset_utils.image_to_tfexample(
+                image_data, b'jpg', height, width, class_id)
+            tfrecord_writer.write(example.SerializeToString())
+
+  sys.stdout.write('\n')
+  sys.stdout.flush()
+
+
+def _clean_up_temporary_files(dataset_dir):
+  """Removes temporary files used to create the dataset.
+
+  Args:
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  filename = _DATA_URL.split('/')[-1]
+  filepath = os.path.join(dataset_dir, filename)
+  tf.gfile.Remove(filepath)
+
+  tmp_dir = os.path.join(dataset_dir, 'flower_photos')
+  tf.gfile.DeleteRecursively(tmp_dir)
+
+
+def _dataset_exists(dataset_dir):
+  for split_name in ['train', 'validation']:
+    for shard_id in range(_NUM_SHARDS):
+      output_filename = _get_dataset_filename(
+          dataset_dir, split_name, shard_id)
+      if not tf.gfile.Exists(output_filename):
+        return False
+  return True
+
+
+def run(dataset_dir):
+  """Runs the download and conversion operation.
+
+  Args:
+    dataset_dir: The dataset directory where the dataset is stored.
+  """
+  if not tf.gfile.Exists(dataset_dir):
+    tf.gfile.MakeDirs(dataset_dir)
+
+  if _dataset_exists(dataset_dir):
+    print('Dataset files already exist. Exiting without re-creating them.')
+    return
+
+  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
+  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
+  class_names_to_ids = dict(zip(class_names, range(len(class_names))))
+
+  # Divide into train and test:
+  random.seed(_RANDOM_SEED)
+  random.shuffle(photo_filenames)
+  training_filenames = photo_filenames[_NUM_VALIDATION:]
+  validation_filenames = photo_filenames[:_NUM_VALIDATION]
+
+  # First, convert the training and validation sets.
+  _convert_dataset('train', training_filenames, class_names_to_ids,
+                   dataset_dir)
+  _convert_dataset('validation', validation_filenames, class_names_to_ids,
+                   dataset_dir)
+
+  # Finally, write the labels file:
+  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
+  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
+
+  _clean_up_temporary_files(dataset_dir)
+  print('\nFinished converting the Flowers dataset!')
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Script to download and preprocess ImageNet Challenge 2012
+# training and validation data set.
+#
+# The final output of this script are sharded TFRecord files containing
+# serialized Example protocol buffers. See build_imagenet_data.py for
+# details of how the Example protocol buffers contain the ImageNet data.
+#
+# The final output of this script appears as such:
+#
+#   data_dir/train-00000-of-01024
+#   data_dir/train-00001-of-01024
+#    ...
+#   data_dir/train-01023-of-01024
+#
+# and
+#
+#   data_dir/validation-00000-of-00128
+#   data_dir/validation-00001-of-00128
+#   ...
+#   data_dir/validation-00127-of-00128
+#
+# Note that this script may take several hours to run to completion. The
+# conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending
+# on the speed of your machine. Please be patient.
+#
+# **IMPORTANT**
+# To download the raw images, the user must create an account with image-net.org
+# and generate a username and access_key. The latter two are required for
+# downloading the raw images.
+#
+# usage:
+#  cd research/slim
+#  bazel build :download_and_convert_imagenet
+#  ./bazel-bin/download_and_convert_imagenet.sh [data-dir]
+set -e
+# Without pipefail a failure of the bounding-box script below would be hidden
+# by the exit status of `sort` at the end of the pipeline.
+set -o pipefail
+
+# The data directory argument is mandatory; report misuse as a failure.
+if [ -z "$1" ]; then
+  echo "usage download_and_convert_imagenet.sh [data dir]" >&2
+  exit 1
+fi
+
+# Create the output and temporary directories.
+# Strip one trailing slash so the paths built below do not double it.
+DATA_DIR="${1%/}"
+SCRATCH_DIR="${DATA_DIR}/raw-data/"
+mkdir -p "${DATA_DIR}"
+mkdir -p "${SCRATCH_DIR}"
+WORK_DIR="$0.runfiles/__main__"
+
+# Download the ImageNet data.
+LABELS_FILE="${WORK_DIR}/datasets/imagenet_lsvrc_2015_synsets.txt"
+DOWNLOAD_SCRIPT="${WORK_DIR}/datasets/download_imagenet.sh"
+"${DOWNLOAD_SCRIPT}" "${SCRATCH_DIR}" "${LABELS_FILE}"
+
+# Note the locations of the train and validation data.
+TRAIN_DIRECTORY="${SCRATCH_DIR}train/"
+VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/"
+
+# Preprocess the validation data by moving the images into the appropriate
+# sub-directory based on the label (synset) of the image.
+echo "Organizing the validation data into sub-directories."
+PREPROCESS_VAL_SCRIPT="${WORK_DIR}/datasets/preprocess_imagenet_validation_data.py"
+VAL_LABELS_FILE="${WORK_DIR}/datasets/imagenet_2012_validation_synset_labels.txt"
+
+"${PREPROCESS_VAL_SCRIPT}" "${VALIDATION_DIRECTORY}" "${VAL_LABELS_FILE}"
+
+# Convert the XML files for bounding box annotations into a single CSV.
+echo "Extracting bounding box information from XML."
+BOUNDING_BOX_SCRIPT="${WORK_DIR}/datasets/process_bounding_boxes.py"
+# SCRATCH_DIR already ends with a slash.
+BOUNDING_BOX_FILE="${SCRATCH_DIR}imagenet_2012_bounding_boxes.csv"
+BOUNDING_BOX_DIR="${SCRATCH_DIR}bounding_boxes/"
+
+"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
+ | sort >"${BOUNDING_BOX_FILE}"
+echo "Finished downloading and preprocessing the ImageNet data."
+
+# Build the TFRecords version of the ImageNet data.
+BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data"
+OUTPUT_DIRECTORY="${DATA_DIR}"
+IMAGENET_METADATA_FILE="${WORK_DIR}/datasets/imagenet_metadata.txt"
+
+"${BUILD_SCRIPT}" \
+  --train_directory="${TRAIN_DIRECTORY}" \
+  --validation_directory="${VALIDATION_DIRECTORY}" \
+  --output_directory="${OUTPUT_DIRECTORY}" \
+  --imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \
+  --labels_file="${LABELS_FILE}" \
+  --bounding_box_file="${BOUNDING_BOX_FILE}"
@@ -0,0 +1,221 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Downloads and converts MNIST data to TFRecords of TF-Example protos.
+
+This module downloads the MNIST data, uncompresses it, reads the files
+that make up the MNIST data and creates two TFRecord datasets: one for train
+and one for test. Each TFRecord dataset is comprised of a set of TF-Example
+protocol buffers, each of which contain a single image and label.
+
+The script should take about a minute to run.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import sys
+
+import numpy as np
+from six.moves import urllib
+import tensorflow as tf
+
+from datasets import dataset_utils
+
+# The URLs where the MNIST data can be downloaded: _DATA_URL is the base and
+# each file name below is appended to it by _download_dataset().
+_DATA_URL = 'http://yann.lecun.com/exdb/mnist/'
+_TRAIN_DATA_FILENAME = 'train-images-idx3-ubyte.gz'
+_TRAIN_LABELS_FILENAME = 'train-labels-idx1-ubyte.gz'
+_TEST_DATA_FILENAME = 't10k-images-idx3-ubyte.gz'
+_TEST_LABELS_FILENAME = 't10k-labels-idx1-ubyte.gz'
+
+# Height/width of each image and number of channels per image; they size the
+# raw pixel read and the reshape in _extract_images().
+_IMAGE_SIZE = 28
+_NUM_CHANNELS = 1
+
+# The names of the classes.
+_CLASS_NAMES = [
+    'zero',
+    'one',
+    'two',
+    'three',
+    'four',
+    'five',
+    'size',
+    'seven',
+    'eight',
+    'nine',
+]
+
+
+def _extract_images(filename, num_images):
+  """Extract the images into a numpy array.
+
+  Args:
+    filename: The path to an MNIST images file.
+    num_images: The number of images in the file.
+
+  Returns:
+    A numpy array of shape [number_of_images, height, width, channels].
+  """
+  print('Extracting images from: ', filename)
+  with gzip.open(filename) as bytestream:
+    bytestream.read(16)
+    buf = bytestream.read(
+        _IMAGE_SIZE * _IMAGE_SIZE * num_images * _NUM_CHANNELS)
+    data = np.frombuffer(buf, dtype=np.uint8)
+    data = data.reshape(num_images, _IMAGE_SIZE, _IMAGE_SIZE, _NUM_CHANNELS)
+  return data
+
+
+def _extract_labels(filename, num_labels):
+  """Extract the labels into a vector of int64 label IDs.
+
+  Args:
+    filename: The path to an MNIST labels file.
+    num_labels: The number of labels in the file.
+
+  Returns:
+    A numpy array of shape [number_of_labels]
+  """
+  print('Extracting labels from: ', filename)
+  with gzip.open(filename) as bytestream:
+    bytestream.read(8)
+    buf = bytestream.read(1 * num_labels)
+    labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
+  return labels
+
+
+def _add_to_tfrecord(data_filename, labels_filename, num_images,
+                     tfrecord_writer):
+  """Loads data from the binary MNIST files and writes files to a TFRecord.
+
+  Args:
+    data_filename: The filename of the MNIST images.
+    labels_filename: The filename of the MNIST labels.
+    num_images: The number of images in the dataset.
+    tfrecord_writer: The TFRecord writer to use for writing.
+  """
+  images = _extract_images(data_filename, num_images)
+  labels = _extract_labels(labels_filename, num_images)
+
+  shape = (_IMAGE_SIZE, _IMAGE_SIZE, _NUM_CHANNELS)
+  with tf.Graph().as_default():
+    image = tf.placeholder(dtype=tf.uint8, shape=shape)
+    encoded_png = tf.image.encode_png(image)
+
+    with tf.Session('') as sess:
+      for j in range(num_images):
+        sys.stdout.write('\r>> Converting image %d/%d' % (j + 1, num_images))
+        sys.stdout.flush()
+
+        png_string = sess.run(encoded_png, feed_dict={image: images[j]})
+
+        example = dataset_utils.image_to_tfexample(
+            png_string, 'png'.encode(), _IMAGE_SIZE, _IMAGE_SIZE, labels[j])
+        tfrecord_writer.write(example.SerializeToString())
+
+
+def _get_output_filename(dataset_dir, split_name):
+  """Creates the output filename.
+
+  Args:
+    dataset_dir: The directory where the temporary files are stored.
+    split_name: The name of the train/test split.
+
+  Returns:
+    An absolute file path.
+  """
+  return '%s/mnist_%s.tfrecord' % (dataset_dir, split_name)
+
+
+def _download_dataset(dataset_dir):
+  """Downloads MNIST locally.
+
+  Args:
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  for filename in [_TRAIN_DATA_FILENAME,
+                   _TRAIN_LABELS_FILENAME,
+                   _TEST_DATA_FILENAME,
+                   _TEST_LABELS_FILENAME]:
+    filepath = os.path.join(dataset_dir, filename)
+
+    if not os.path.exists(filepath):
+      print('Downloading file %s...' % filename)
+      def _progress(count, block_size, total_size):
+        sys.stdout.write('\r>> Downloading %.1f%%' % (
+            float(count * block_size) / float(total_size) * 100.0))
+        sys.stdout.flush()
+      filepath, _ = urllib.request.urlretrieve(_DATA_URL + filename,
+                                               filepath,
+                                               _progress)
+      print()
+      with tf.gfile.GFile(filepath) as f:
+        size = f.size()
+      print('Successfully downloaded', filename, size, 'bytes.')
+
+
+def _clean_up_temporary_files(dataset_dir):
+  """Removes temporary files used to create the dataset.
+
+  Args:
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  for filename in [_TRAIN_DATA_FILENAME,
+                   _TRAIN_LABELS_FILENAME,
+                   _TEST_DATA_FILENAME,
+                   _TEST_LABELS_FILENAME]:
+    filepath = os.path.join(dataset_dir, filename)
+    tf.gfile.Remove(filepath)
+
+
+def run(dataset_dir):
+  """Runs the download and conversion operation.
+
+  Args:
+    dataset_dir: The dataset directory where the dataset is stored.
+  """
+  if not tf.gfile.Exists(dataset_dir):
+    tf.gfile.MakeDirs(dataset_dir)
+
+  training_filename = _get_output_filename(dataset_dir, 'train')
+  testing_filename = _get_output_filename(dataset_dir, 'test')
+
+  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
+    print('Dataset files already exist. Exiting without re-creating them.')
+    return
+
+  _download_dataset(dataset_dir)
+
+  # First, process the training data:
+  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
+    data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME)
+    labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME)
+    _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer)
+
+  # Next, process the testing data:
+  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
+    data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME)
+    labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME)
+    _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer)
+
+  # Finally, write the labels file:
+  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
+  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
+
+  _clean_up_temporary_files(dataset_dir)
+  print('\nFinished converting the MNIST dataset!')
@@ -0,0 +1,158 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Downloads and converts VisualWakewords data to TFRecords of TF-Example protos.
+
+This module downloads the COCO dataset, uncompresses it, derives the
+VisualWakeWords dataset to create two TFRecord datasets: one for
+train and one for test. Each TFRecord dataset is comprised of a set of
+TF-Example protocol buffers, each of which contain a single image and label.
+
+The script should take several minutes to run.
+Please note that this tool creates sharded output files.
+
+VisualWakeWords dataset is used to design tiny models classifying two classes,
+such as person/not-person. The two steps to generate the VisualWakeWords
+dataset from the COCO dataset are given below:
+
+1. Use COCO annotations to create VisualWakeWords annotations:
+
+Note: A bounding box is 'valid' if it has the foreground_class_of_interest
+(e.g. person) and its area is greater than 0.5% of the image area.
+
+The resulting annotations file has the following fields, where 'images' are
+the same as COCO dataset. 'categories' only contains information about the
+foreground_class_of_interest (e.g. person) and 'annotations' maps an image to
+objects (a list of valid bounding boxes) and label (value is 1 if it has
+at least one valid bounding box, otherwise 0)
+
+  images[{
+  "id", "width", "height", "file_name", "flickr_url", "coco_url",
+  "license", "date_captured",
+  }]
+
+  categories{
+  "id": {"id", "name", "supercategory"}
+  }
+
+  annotations{
+  "image_id": {"objects":[{"area", "bbox" : [x,y,width,height]}], "label"}
+  }
+
+2. Use VisualWakeWords annotations to create TFRecords:
+
+The resulting TFRecord file contains the following features:
+{ image/height, image/width, image/source_id, image/encoded,
+  image/class/label_text, image/class/label,
+  image/object/class/text,
+  image/object/bbox/ymin, image/object/bbox/xmin, image/object/bbox/ymax,
+  image/object/bbox/xmax, image/object/area
+  image/filename, image/format, image/key/sha256}
+For classification models, you need the image/encoded and image/class/label.
+
+Example usage:
+Run download_and_convert_data.py in the parent directory as follows:
+
+    python download_and_convert_data.py --logtostderr \
+      --dataset_name=visualwakewords \
+      --dataset_dir="${DATASET_DIR}" \
+      --small_object_area_threshold=0.005 \
+      --foreground_class_of_interest='person'
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tensorflow as tf
+from datasets import download_and_convert_visualwakewords_lib
+
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+
+tf.compat.v1.app.flags.DEFINE_string(
+    'coco_dirname', 'coco_dataset',
+    'A subdirectory in visualwakewords dataset directory'
+    'containing the coco dataset')
+
+FLAGS = tf.compat.v1.app.flags.FLAGS
+
+
+def run(dataset_dir, small_object_area_threshold, foreground_class_of_interest):
+  """Runs the download and conversion operation.
+
+  Args:
+    dataset_dir: The dataset directory where the dataset is stored.
+    small_object_area_threshold: Threshold of fraction of image area below which
+      small objects are filtered
+    foreground_class_of_interest: Build a binary classifier based on the
+      presence or absence of this object in the image.
+  """
+  # 1. Download the coco dataset into a subdirectory under the visualwakewords
+  #    dataset directory
+  coco_dir = os.path.join(dataset_dir, FLAGS.coco_dirname)
+
+  if not tf.gfile.IsDirectory(coco_dir):
+    tf.gfile.MakeDirs(coco_dir)
+
+  download_and_convert_visualwakewords_lib.download_coco_dataset(coco_dir)
+
+  # Path to COCO annotations
+  train_annotations_file = os.path.join(coco_dir, 'annotations',
+                                        'instances_train2014.json')
+  val_annotations_file = os.path.join(coco_dir, 'annotations',
+                                      'instances_val2014.json')
+  train_image_dir = os.path.join(coco_dir, 'train2014')
+  val_image_dir = os.path.join(coco_dir, 'val2014')
+
+  # Path to VisualWakeWords annotations
+  visualwakewords_annotations_train = os.path.join(
+      dataset_dir, 'instances_visualwakewords_train2014.json')
+  visualwakewords_annotations_val = os.path.join(
+      dataset_dir, 'instances_visualwakewords_val2014.json')
+  visualwakewords_labels_filename = os.path.join(dataset_dir, 'labels.txt')
+  train_output_path = os.path.join(dataset_dir, 'train.record')
+  val_output_path = os.path.join(dataset_dir, 'val.record')
+
+  # 2. Create a labels file
+  tf.logging.info('Creating a labels file...')
+  download_and_convert_visualwakewords_lib.create_labels_file(
+      foreground_class_of_interest, visualwakewords_labels_filename)
+
+  # 3. Use COCO annotations to create VisualWakeWords annotations
+  tf.logging.info('Creating train VisualWakeWords annotations...')
+  download_and_convert_visualwakewords_lib.create_visual_wakeword_annotations(
+      train_annotations_file, visualwakewords_annotations_train,
+      small_object_area_threshold, foreground_class_of_interest)
+  tf.logging.info('Creating validation VisualWakeWords annotations...')
+  download_and_convert_visualwakewords_lib.create_visual_wakeword_annotations(
+      val_annotations_file, visualwakewords_annotations_val,
+      small_object_area_threshold, foreground_class_of_interest)
+
+  # 4. Use VisualWakeWords annotations to create the TFRecords
+  tf.logging.info('Creating train TFRecords for VisualWakeWords dataset...')
+  download_and_convert_visualwakewords_lib.create_tf_record_for_visualwakewords_dataset(
+      visualwakewords_annotations_train,
+      train_image_dir,
+      train_output_path,
+      num_shards=100)
+
+  tf.logging.info(
+      'Creating validation TFRecords for VisualWakeWords dataset...')
+  download_and_convert_visualwakewords_lib.create_tf_record_for_visualwakewords_dataset(
+      visualwakewords_annotations_val,
+      val_image_dir,
+      val_output_path,
+      num_shards=10)
@@ -0,0 +1,286 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Helper functions to generate the Visual WakeWords dataset.
+
+    It filters raw COCO annotations file to Visual WakeWords Dataset
+    annotations. The resulting annotations and COCO images are then converted
+    to TF records.
+    See download_and_convert_visualwakewords.py for the sample usage.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import hashlib
+import io
+import json
+import os
+import contextlib2
+
+import PIL.Image
+
+import tensorflow as tf
+
+from datasets import dataset_utils
+
+# Log at INFO level so the progress messages emitted below are visible.
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+
+# Command-line flags giving the locations of the three COCO 2014 zip
+# archives; download_coco_dataset() reads them in this order.
+tf.compat.v1.app.flags.DEFINE_string(
+    'coco_train_url',
+    'http://images.cocodataset.org/zips/train2014.zip',
+    'Link to zip file containing coco training data')
+tf.compat.v1.app.flags.DEFINE_string(
+    'coco_validation_url',
+    'http://images.cocodataset.org/zips/val2014.zip',
+    'Link to zip file containing coco validation data')
+tf.compat.v1.app.flags.DEFINE_string(
+    'coco_annotations_url',
+    'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
+    'Link to zip file containing coco annotation data')
+
+# Shared flag registry used to read the values defined above.
+FLAGS = tf.compat.v1.app.flags.FLAGS
+
+
+def download_coco_dataset(dataset_dir):
+  """Download the coco dataset.
+
+  Args:
+    dataset_dir: Path where coco dataset should be downloaded.
+  """
+  dataset_utils.download_and_uncompress_zipfile(FLAGS.coco_train_url,
+                                                dataset_dir)
+  dataset_utils.download_and_uncompress_zipfile(FLAGS.coco_validation_url,
+                                                dataset_dir)
+  dataset_utils.download_and_uncompress_zipfile(FLAGS.coco_annotations_url,
+                                                dataset_dir)
+
+
+def create_labels_file(foreground_class_of_interest,
+                       visualwakewords_labels_file):
+  """Generate visualwakewords labels file.
+
+  Args:
+    foreground_class_of_interest: category from COCO dataset that is filtered by
+      the visualwakewords dataset
+    visualwakewords_labels_file: output visualwakewords label file
+  """
+  labels_to_class_names = {0: 'background', 1: foreground_class_of_interest}
+  with open(visualwakewords_labels_file, 'w') as fp:
+    for label in labels_to_class_names:
+      fp.write(str(label) + ':' + str(labels_to_class_names[label]) + '\n')
+
+
+def create_visual_wakeword_annotations(annotations_file,
+                                       visualwakewords_annotations_file,
+                                       small_object_area_threshold,
+                                       foreground_class_of_interest):
+  """Generate visual wakewords annotations file.
+
+  Loads COCO annotation json files to generate visualwakewords annotations file.
+  The output JSON has three keys: 'images' (copied from the input),
+  'annotations' (image id -> filtered objects and label) and 'categories'
+  (only the category whose name matches `foreground_class_of_interest`).
+
+  Args:
+    annotations_file: JSON file containing COCO bounding box annotations
+    visualwakewords_annotations_file: path to output annotations file
+    small_object_area_threshold: threshold on fraction of image area below which
+      small object bounding boxes are filtered
+    foreground_class_of_interest: category from COCO dataset that is filtered by
+      the visual wakewords dataset
+  """
+  # default object of interest is person
+  # (id 1 is kept if no category name in the file matches).
+  foreground_class_of_interest_id = 1
+  with tf.gfile.GFile(annotations_file, 'r') as fid:
+    groundtruth_data = json.load(fid)
+    images = groundtruth_data['images']
+    # Create category index; only the matching category is recorded.
+    category_index = {}
+    for category in groundtruth_data['categories']:
+      if category['name'] == foreground_class_of_interest:
+        foreground_class_of_interest_id = category['id']
+        category_index[category['id']] = category
+    # Create annotations index, a map of image_id to its annotations
+    tf.logging.info('Building annotations index...')
+    annotations_index = collections.defaultdict(
+        lambda: collections.defaultdict(list))
+    # structure is { "image_id": {"objects" : [list of the image annotations]}}
+    for annotation in groundtruth_data['annotations']:
+      annotations_index[annotation['image_id']]['objects'].append(annotation)
+    # This count must be taken here: the lookup in the loop below inserts an
+    # empty entry into the defaultdict for every image without annotations.
+    missing_annotation_count = len(images) - len(annotations_index)
+    tf.logging.info('%d images are missing annotations.',
+                    missing_annotation_count)
+    # Create filtered annotations index
+    annotations_index_filtered = {}
+    for idx, image in enumerate(images):
+      # Log progress every 100 images.
+      if idx % 100 == 0:
+        tf.logging.info('On image %d of %d', idx, len(images))
+      annotations = annotations_index[image['id']]
+      annotations_filtered = _filter_annotations(
+          annotations, image, small_object_area_threshold,
+          foreground_class_of_interest_id)
+      annotations_index_filtered[image['id']] = annotations_filtered
+
+    # Write the result while the input file is still open.
+    with open(visualwakewords_annotations_file, 'w') as fp:
+      json.dump(
+          {
+              'images': images,
+              'annotations': annotations_index_filtered,
+              'categories': category_index
+          }, fp)
+
+
+def _filter_annotations(annotations, image, small_object_area_threshold,
+                        foreground_class_of_interest_id):
+  """Filters COCO annotations to visual wakewords annotations.
+
+  Args:
+    annotations: dicts with keys: {
+      u'objects': [{u'id', u'image_id', u'category_id', u'segmentation',
+                  u'area', u'bbox' : [x,y,width,height], u'iscrowd'}] } Notice
+                    that bounding box coordinates in the official COCO dataset
+                    are given as [x, y, width, height] tuples using absolute
+                    coordinates where x, y represent the top-left (0-indexed)
+                    corner.
+    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
+      u'width', u'date_captured', u'flickr_url', u'id']
+    small_object_area_threshold: threshold on fraction of image area below which
+      small objects are filtered
+    foreground_class_of_interest_id: category of COCO dataset which visual
+      wakewords filters
+
+  Returns:
+    annotations_filtered: dict with keys: {
+      u'objects': [{"area", "bbox" : [x,y,width,height]}],
+      u'label',
+      }
+  """
+  objects = []
+  image_area = image['height'] * image['width']
+  for annotation in annotations['objects']:
+    normalized_object_area = annotation['area'] / image_area
+    category_id = int(annotation['category_id'])
+    # Filter valid bounding boxes
+    if category_id == foreground_class_of_interest_id and \
+        normalized_object_area > small_object_area_threshold:
+      objects.append({
+          u'area': annotation['area'],
+          u'bbox': annotation['bbox'],
+      })
+  label = 1 if objects else 0
+  return {
+      'objects': objects,
+      'label': label,
+  }
+
+
+def create_tf_record_for_visualwakewords_dataset(annotations_file, image_dir,
+                                                 output_path, num_shards):
+  """Loads Visual WakeWords annotations/images and converts to tf.Record format.
+
+  Args:
+    annotations_file: JSON file containing bounding box annotations.
+    image_dir: Directory containing the image files.
+    output_path: Path to output tf.Record file.
+    num_shards: number of output file shards.
+  """
+  with contextlib2.ExitStack() as tf_record_close_stack, \
+      tf.gfile.GFile(annotations_file, 'r') as fid:
+    output_tfrecords = dataset_utils.open_sharded_output_tfrecords(
+        tf_record_close_stack, output_path, num_shards)
+    groundtruth_data = json.load(fid)
+    images = groundtruth_data['images']
+    annotations_index = groundtruth_data['annotations']
+    annotations_index = {int(k): v for k, v in annotations_index.iteritems()}
+    # convert 'unicode' key to 'int' key after we parse the json file
+
+    for idx, image in enumerate(images):
+      if idx % 100 == 0:
+        tf.logging.info('On image %d of %d', idx, len(images))
+      annotations = annotations_index[image['id']]
+      tf_example = _create_tf_example(image, annotations, image_dir)
+      shard_idx = idx % num_shards
+      output_tfrecords[shard_idx].write(tf_example.SerializeToString())
+
+
+def _create_tf_example(image, annotations, image_dir):
+  """Converts image and annotations to a tf.Example proto.
+
+  Args:
+    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
+      u'width', u'date_captured', u'flickr_url', u'id']
+    annotations: dict with objects (a list of image annotations) and a label.
+      {u'objects':[{"area", "bbox" : [x,y,width,height}], u'label'}. Notice
+      that bounding box coordinates in the COCO dataset are given as[x, y,
+      width, height] tuples using absolute coordinates where x, y represent
+      the top-left (0-indexed) corner. This function also converts to the format
+      that can be used by the Tensorflow Object Detection API (which is [ymin,
+      xmin, ymax, xmax] with coordinates normalized relative to image size).
+    image_dir: directory containing the image files.
+  Returns:
+    tf_example: The converted tf.Example
+
+  Raises:
+    ValueError: if the image pointed to by data['filename'] is not a valid JPEG
+  """
+  image_height = image['height']
+  image_width = image['width']
+  filename = image['file_name']
+  image_id = image['id']
+
+  full_path = os.path.join(image_dir, filename)
+  with tf.gfile.GFile(full_path, 'rb') as fid:
+    encoded_jpg = fid.read()
+  encoded_jpg_io = io.BytesIO(encoded_jpg)
+  image = PIL.Image.open(encoded_jpg_io)
+  key = hashlib.sha256(encoded_jpg).hexdigest()
+
+  xmin, xmax, ymin, ymax, area = [], [], [], [], []
+  for obj in annotations['objects']:
+    (x, y, width, height) = tuple(obj['bbox'])
+    xmin.append(float(x) / image_width)
+    xmax.append(float(x + width) / image_width)
+    ymin.append(float(y) / image_height)
+    ymax.append(float(y + height) / image_height)
+    area.append(obj['area'])
+
+  feature_dict = {
+      'image/height':
+          dataset_utils.int64_feature(image_height),
+      'image/width':
+          dataset_utils.int64_feature(image_width),
+      'image/filename':
+          dataset_utils.bytes_feature(filename.encode('utf8')),
+      'image/source_id':
+          dataset_utils.bytes_feature(str(image_id).encode('utf8')),
+      'image/key/sha256':
+          dataset_utils.bytes_feature(key.encode('utf8')),
+      'image/encoded':
+          dataset_utils.bytes_feature(encoded_jpg),
+      'image/format':
+          dataset_utils.bytes_feature('jpeg'.encode('utf8')),
+      'image/class/label':
+          dataset_utils.int64_feature(annotations['label']),
+      'image/object/bbox/xmin':
+          dataset_utils.float_list_feature(xmin),
+      'image/object/bbox/xmax':
+          dataset_utils.float_list_feature(xmax),
+      'image/object/bbox/ymin':
+          dataset_utils.float_list_feature(ymin),
+      'image/object/bbox/ymax':
+          dataset_utils.float_list_feature(ymax),
+      'image/object/area':
+          dataset_utils.float_list_feature(area),
+  }
+  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
+  return example
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Script to download ImageNet Challenge 2012 training and validation data set.
+#
+# Downloads and decompresses raw images and bounding boxes.
+#
+# **IMPORTANT**
+# To download the raw images, the user must create an account with image-net.org
+# and generate a username and access_key. The latter two are required for
+# downloading the raw images.
+#
+# usage:
+#  ./download_imagenet.sh [dirname]
+set -e
+
+# Prompt for credentials when either environment variable is unset or empty.
+if [ "x$IMAGENET_ACCESS_KEY" == x -o "x$IMAGENET_USERNAME" == x ]; then
+  cat <<END
+In order to download the imagenet data, you have to create an account with
+image-net.org. This will get you a username and an access key. You can set the
+IMAGENET_USERNAME and IMAGENET_ACCESS_KEY environment variables, or you can
+enter the credentials here.
+END
+  read -p "Username: " IMAGENET_USERNAME
+  read -p "Access key: " IMAGENET_ACCESS_KEY
+fi
+
+OUTDIR="${1:-./imagenet-data}"
+SYNSETS_FILE="${2:-./synsets.txt}"
+
+echo "Saving downloaded files to $OUTDIR"
+mkdir -p "${OUTDIR}"
+CURRENT_DIR=$(pwd)
+# Resolve OUTDIR to an absolute path: the script changes directory below, so
+# paths derived from a relative OUTDIR would otherwise point at the wrong
+# place after the first `cd`.
+OUTDIR="$(cd "${OUTDIR}" > /dev/null && pwd)"
+# Join with an explicit "/" so OUTDIR works with or without a trailing slash.
+BBOX_DIR="${OUTDIR}/bounding_boxes"
+mkdir -p "${BBOX_DIR}"
+cd "${OUTDIR}"
+
+# Download and process all of the ImageNet bounding boxes.
+BASE_URL="http://www.image-net.org/challenges/LSVRC/2012/nnoupb"
+
+# See here for details: http://www.image-net.org/download-bboxes
+BOUNDING_BOX_ANNOTATIONS="${BASE_URL}/ILSVRC2012_bbox_train_v2.tar.gz"
+BBOX_TAR_BALL="${BBOX_DIR}/annotations.tar.gz"
+echo "Downloading bounding box annotations."
+wget "${BOUNDING_BOX_ANNOTATIONS}" -O "${BBOX_TAR_BALL}"
+echo "Uncompressing bounding box annotations ..."
+tar xzf "${BBOX_TAR_BALL}" -C "${BBOX_DIR}"
+
+# Left unquoted on purpose in the `ls` call so the glob expands.
+LABELS_ANNOTATED="${BBOX_DIR}/*"
+NUM_XML=$(ls -1 ${LABELS_ANNOTATED} | wc -l)
+echo "Identified ${NUM_XML} bounding box annotations."
+
+# Download and uncompress all images from the ImageNet 2012 validation dataset.
+# The tarball itself is stored in the parent directory of OUTDIR.
+VALIDATION_TARBALL="ILSVRC2012_img_val.tar"
+OUTPUT_PATH="${OUTDIR}/validation/"
+mkdir -p "${OUTPUT_PATH}"
+cd "${OUTDIR}/.."
+echo "Downloading ${VALIDATION_TARBALL} to ${OUTPUT_PATH}."
+wget -nd -c "${BASE_URL}/${VALIDATION_TARBALL}"
+tar xf "${VALIDATION_TARBALL}" -C "${OUTPUT_PATH}"
+
+# Download all images from the ImageNet 2012 train dataset.
+TRAIN_TARBALL="ILSVRC2012_img_train.tar"
+OUTPUT_PATH="${OUTDIR}/train/"
+mkdir -p "${OUTPUT_PATH}"
+cd "${OUTDIR}/.."
+echo "Downloading ${TRAIN_TARBALL} to ${OUTPUT_PATH}."
+wget -nd -c "${BASE_URL}/${TRAIN_TARBALL}"
+
+# Un-compress the individual tar-files within the train tar-file.
+echo "Uncompressing individual train tar-balls in the training data."
+
+# NOTE: a relative SYNSETS_FILE is resolved against the current directory at
+# this point, which is the parent of OUTDIR because of the `cd` above.
+while read SYNSET; do
+  echo "Processing: ${SYNSET}"
+
+  # Create a directory and delete anything there.
+  # The glob must stay outside the quotes or it is never expanded; the `:?`
+  # guards abort instead of deleting from the wrong directory if either
+  # variable is empty.
+  mkdir -p "${OUTPUT_PATH}/${SYNSET}"
+  rm -rf "${OUTPUT_PATH:?}/${SYNSET:?}"/*
+
+  # Uncompress into the directory.
+  tar xf "${TRAIN_TARBALL}" "${SYNSET}.tar"
+  tar xf "${SYNSET}.tar" -C "${OUTPUT_PATH}/${SYNSET}/"
+  rm -f "${SYNSET}.tar"
+
+  echo "Finished processing: ${SYNSET}"
+done < "${SYNSETS_FILE}"
@@ -0,0 +1,99 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides data for the flowers dataset.
+
+The dataset scripts used to create the dataset can be found at:
+tensorflow/models/research/slim/datasets/download_and_convert_flowers.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from datasets import dataset_utils
+
+# Short alias for the contrib slim package imported above.
+slim = contrib_slim
+
+# Filename pattern of the dataset files; get_split() substitutes the split
+# name for '%s'.
+_FILE_PATTERN = 'flowers_%s_*.tfrecord'
+
+# Recognized split names and the sample count reported for each.
+SPLITS_TO_SIZES = {'train': 3320, 'validation': 350}
+
+# Number of classes reported on the returned Dataset.
+_NUM_CLASSES = 5
+
+# Human-readable description of each item the decoder produces.
+_ITEMS_TO_DESCRIPTIONS = {
+    'image': 'A color image of varying size.',
+    'label': 'A single integer between 0 and 4',
+}
+
+
+def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
+    """Gets a dataset tuple with instructions for reading flowers.
+
+    Args:
+      split_name: A train/validation split name.
+      dataset_dir: The base directory of the dataset sources.
+      file_pattern: The file pattern to use when matching the dataset sources.
+        It is assumed that the pattern contains a '%s' string so that the split
+        name can be inserted.
+      reader: The TensorFlow reader type.
+
+    Returns:
+      A `Dataset` namedtuple.
+
+    Raises:
+      ValueError: if `split_name` is not a valid train/validation split.
+    """
+    if split_name not in SPLITS_TO_SIZES:
+        raise ValueError('split name %s was not recognized.' % split_name)
+
+    if not file_pattern:
+        file_pattern = _FILE_PATTERN
+    file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
+
+    # Allowing None in the signature so that dataset_factory can use the default.
+    if reader is None:
+        reader = tf.TFRecordReader
+
+    keys_to_features = {
+        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/format': tf.FixedLenFeature((), tf.string, default_value='png'),
+        'image/class/label': tf.FixedLenFeature(
+            [], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
+    }
+
+    items_to_handlers = {
+        'image': slim.tfexample_decoder.Image(),
+        'label': slim.tfexample_decoder.Tensor('image/class/label'),
+    }
+
+    decoder = slim.tfexample_decoder.TFExampleDecoder(
+        keys_to_features, items_to_handlers)
+
+    labels_to_names = None
+    if dataset_utils.has_labels(dataset_dir):
+        labels_to_names = dataset_utils.read_label_file(dataset_dir)
+
+    return slim.dataset.Dataset(
+        data_sources=file_pattern,
+        reader=reader,
+        decoder=decoder,
+        num_samples=SPLITS_TO_SIZES[split_name],
+        items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
+        num_classes=_NUM_CLASSES,
+        labels_to_names=labels_to_names)
@@ -0,0 +1,199 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides data for the ImageNet ILSVRC 2012 Dataset plus some bounding boxes.
+
+Some images have one or more bounding boxes associated with the label of the
+image. See details here: http://image-net.org/download-bboxes
+
+ImageNet is based upon WordNet 3.0. To uniquely identify a synset, we use
+"WordNet ID" (wnid), which is a concatenation of POS ( i.e. part of speech )
+and SYNSET OFFSET of WordNet. For more information, please refer to the
+WordNet documentation[http://wordnet.princeton.edu/wordnet/documentation/].
+
+"There are bounding boxes for over 3000 popular synsets available.
+For each synset, there are on average 150 images with bounding boxes."
+
+WARNING: Don't use for object detection, in this case all the bounding boxes
+of the image belong to just one class.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from six.moves import urllib
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from datasets import dataset_utils
+
+# Short alias for the contrib slim package imported above.
+slim = contrib_slim
+
+# TODO(nsilberman): Add tfrecord file type once the script is updated.
+# Filename pattern of the dataset files; get_split() substitutes the split
+# name for '%s'.
+_FILE_PATTERN = '%s-*'
+
+# Recognized split names and the sample count reported for each.
+_SPLITS_TO_SIZES = {
+    'train': 1281167,
+    'validation': 50000,
+}
+
+# Human-readable description of each item the decoder produces.
+_ITEMS_TO_DESCRIPTIONS = {
+    'image': 'A color image of varying height and width.',
+    'label': 'The label id of the image, integer between 0 and 999',
+    'label_text': 'The text of the label.',
+    'object/bbox': 'A list of bounding boxes.',
+    'object/label': 'A list of labels, one per each object.',
+}
+
+# Number of classes reported on the returned Dataset.
+# NOTE(review): presumably the 1000 synsets plus the background label 0 that
+# create_readable_names_for_imagenet_labels() adds; confirm.
+_NUM_CLASSES = 1001
+
+# If set to false, will not try to set label_to_names in dataset
+# by reading them from labels.txt or github.
+LOAD_READABLE_NAMES = True
+
+
+def create_readable_names_for_imagenet_labels():
+  """Create a dict mapping label id to human readable string.
+
+  Returns:
+      labels_to_names: dictionary where keys are integers from to 1000
+      and values are human-readable names.
+
+  We retrieve a synset file, which contains a list of valid synset labels used
+  by ILSVRC competition. There is one synset one per line, eg.
+          #   n01440764
+          #   n01443537
+  We also retrieve a synset_to_human_file, which contains a mapping from synsets
+  to human-readable names for every synset in Imagenet. These are stored in a
+  tsv format, as follows:
+          #   n02119247    black fox
+          #   n02119359    silver fox
+  We assign each synset (in alphabetical order) an integer, starting from 1
+  (since 0 is reserved for the background class).
+
+  Code is based on
+  https://github.com/tensorflow/models/blob/master/research/inception/inception/data/build_imagenet_data.py#L463
+  """
+
+  # pylint: disable=g-line-too-long
+  base_url = 'https://raw.githubusercontent.com/tensorflow/models/master/research/inception/inception/data/'
+  synset_url = '{}/imagenet_lsvrc_2015_synsets.txt'.format(base_url)
+  synset_to_human_url = '{}/imagenet_metadata.txt'.format(base_url)
+
+  filename, _ = urllib.request.urlretrieve(synset_url)
+  synset_list = [s.strip() for s in open(filename).readlines()]
+  num_synsets_in_ilsvrc = len(synset_list)
+  assert num_synsets_in_ilsvrc == 1000
+
+  filename, _ = urllib.request.urlretrieve(synset_to_human_url)
+  synset_to_human_list = open(filename).readlines()
+  num_synsets_in_all_imagenet = len(synset_to_human_list)
+  assert num_synsets_in_all_imagenet == 21842
+
+  synset_to_human = {}
+  for s in synset_to_human_list:
+    parts = s.strip().split('\t')
+    assert len(parts) == 2
+    synset = parts[0]
+    human = parts[1]
+    synset_to_human[synset] = human
+
+  label_index = 1
+  labels_to_names = {0: 'background'}
+  for synset in synset_list:
+    name = synset_to_human[synset]
+    labels_to_names[label_index] = name
+    label_index += 1
+
+  return labels_to_names
+
+
+def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
+  """Gets a dataset tuple with instructions for reading ImageNet.
+
+  Args:
+    split_name: A train/test split name.
+    dataset_dir: The base directory of the dataset sources.
+    file_pattern: The file pattern to use when matching the dataset sources.
+      It is assumed that the pattern contains a '%s' string so that the split
+      name can be inserted.
+    reader: The TensorFlow reader type.
+
+  Returns:
+    A `Dataset` namedtuple.
+
+  Raises:
+    ValueError: if `split_name` is not a valid train/test split.
+  """
+  if split_name not in _SPLITS_TO_SIZES:
+    raise ValueError('split name %s was not recognized.' % split_name)
+
+  if not file_pattern:
+    file_pattern = _FILE_PATTERN
+  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
+
+  # Allowing None in the signature so that dataset_factory can use the default.
+  if reader is None:
+    reader = tf.TFRecordReader
+
+  keys_to_features = {
+      'image/encoded': tf.FixedLenFeature(
+          (), tf.string, default_value=''),
+      'image/format': tf.FixedLenFeature(
+          (), tf.string, default_value='jpeg'),
+      'image/class/label': tf.FixedLenFeature(
+          [], dtype=tf.int64, default_value=-1),
+      'image/class/text': tf.FixedLenFeature(
+          [], dtype=tf.string, default_value=''),
+      'image/object/bbox/xmin': tf.VarLenFeature(
+          dtype=tf.float32),
+      'image/object/bbox/ymin': tf.VarLenFeature(
+          dtype=tf.float32),
+      'image/object/bbox/xmax': tf.VarLenFeature(
+          dtype=tf.float32),
+      'image/object/bbox/ymax': tf.VarLenFeature(
+          dtype=tf.float32),
+      'image/object/class/label': tf.VarLenFeature(
+          dtype=tf.int64),
+  }
+
+  items_to_handlers = {
+      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
+      'label': slim.tfexample_decoder.Tensor('image/class/label'),
+      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
+      'object/bbox': slim.tfexample_decoder.BoundingBox(
+          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
+      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
+  }
+
+  decoder = slim.tfexample_decoder.TFExampleDecoder(
+      keys_to_features, items_to_handlers)
+
+  labels_to_names = None
+  if LOAD_READABLE_NAMES:
+    if dataset_utils.has_labels(dataset_dir):
+      labels_to_names = dataset_utils.read_label_file(dataset_dir)
+    else:
+      labels_to_names = create_readable_names_for_imagenet_labels()
+      dataset_utils.write_label_file(labels_to_names, dataset_dir)
+
+  return slim.dataset.Dataset(
+      data_sources=file_pattern,
+      reader=reader,
+      decoder=decoder,
+      num_samples=_SPLITS_TO_SIZES[split_name],
+      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
+      num_classes=_NUM_CLASSES,
+      labels_to_names=labels_to_names)
@@ -0,0 +1,99 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides data for the MNIST dataset.
+
+The dataset scripts used to create the dataset can be found at:
+tensorflow/models/research/slim/datasets/download_and_convert_mnist.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from datasets import dataset_utils
+
+# Short alias for the contrib slim package used throughout this module.
+slim = contrib_slim
+
+# File name template of the TFRecord files; '%s' is replaced by the split name.
+_FILE_PATTERN = 'mnist_%s.tfrecord'
+
+# Number of samples reported for each supported split.
+_SPLITS_TO_SIZES = {'train': 60000, 'test': 10000}
+
+# Number of classes reported to the Dataset.
+_NUM_CLASSES = 10
+
+# Human-readable description of each item exposed by the decoder.
+_ITEMS_TO_DESCRIPTIONS = {
+    'image': 'A [28 x 28 x 1] grayscale image.',
+    'label': 'A single integer between 0 and 9',
+}
+
+
+def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
+  """Gets a dataset tuple with instructions for reading MNIST.
+
+  Args:
+    split_name: A train/test split name.
+    dataset_dir: The base directory of the dataset sources.
+    file_pattern: The file pattern to use when matching the dataset sources.
+      It is assumed that the pattern contains a '%s' string so that the split
+      name can be inserted.
+    reader: The TensorFlow reader type.
+
+  Returns:
+    A `Dataset` namedtuple.
+
+  Raises:
+    ValueError: if `split_name` is not a valid train/test split.
+  """
+  if split_name not in _SPLITS_TO_SIZES:
+    raise ValueError('split name %s was not recognized.' % split_name)
+
+  if not file_pattern:
+    file_pattern = _FILE_PATTERN
+  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
+
+  # Allowing None in the signature so that dataset_factory can use the default.
+  if reader is None:
+    reader = tf.TFRecordReader
+
+  keys_to_features = {
+      'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
+      'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'),
+      'image/class/label': tf.FixedLenFeature(
+          [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)),
+  }
+
+  items_to_handlers = {
+      'image': slim.tfexample_decoder.Image(shape=[28, 28, 1], channels=1),
+      'label': slim.tfexample_decoder.Tensor('image/class/label', shape=[]),
+  }
+
+  decoder = slim.tfexample_decoder.TFExampleDecoder(
+      keys_to_features, items_to_handlers)
+
+  labels_to_names = None
+  if dataset_utils.has_labels(dataset_dir):
+    labels_to_names = dataset_utils.read_label_file(dataset_dir)
+
+  return slim.dataset.Dataset(
+      data_sources=file_pattern,
+      reader=reader,
+      decoder=decoder,
+      num_samples=_SPLITS_TO_SIZES[split_name],
+      num_classes=_NUM_CLASSES,
+      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
+      labels_to_names=labels_to_names)
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Sort the ImageNet 2012 Challenge validation images into label directories.
+
+Associate the ImageNet 2012 Challenge validation data set with labels.
+
+The raw ImageNet validation data set is expected to reside in JPEG files
+located in the following directory structure.
+
+ data_dir/ILSVRC2012_val_00000001.JPEG
+ data_dir/ILSVRC2012_val_00000002.JPEG
+ ...
+ data_dir/ILSVRC2012_val_00050000.JPEG
+
+This script moves the files into a directory structure like such:
+ data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
+ data_dir/n01440764/ILSVRC2012_val_00000543.JPEG
+ ...
+where 'n01440764' is the unique synset label associated with
+these images.
+
+This directory reorganization requires a mapping from validation image
+number (i.e. suffix of the original file) to the associated label. This
+is provided in the ImageNet development kit via a Matlab file.
+
+In order to make life easier and divorce ourselves from Matlab, we instead
+supply a custom text file that provides this mapping for us.
+
+Sample usage:
+  ./preprocess_imagenet_validation_data.py ILSVRC2012_img_val \
+  imagenet_2012_validation_synset_labels.txt
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+if __name__ == '__main__':
+  if len(sys.argv) < 3:
+    print('Invalid usage\n'
+          'usage: preprocess_imagenet_validation_data.py '
+          '<validation data dir> <validation labels file>')
+    sys.exit(-1)
+  data_dir = sys.argv[1]
+  validation_labels_file = sys.argv[2]
+
+  # Read in the 50000 synsets associated with the validation data set.
+  labels = [l.strip() for l in open(validation_labels_file).readlines()]
+  unique_labels = set(labels)
+
+  # Make all sub-directories in the validation data dir.
+  for label in unique_labels:
+    labeled_data_dir = os.path.join(data_dir, label)
+    os.makedirs(labeled_data_dir)
+
+  # Move all of the image to the appropriate sub-directory.
+  for i in xrange(len(labels)):
+    basename = 'ILSVRC2012_val_000%.5d.JPEG' % (i + 1)
+    original_filename = os.path.join(data_dir, basename)
+    if not os.path.exists(original_filename):
+      print('Failed to find: ', original_filename)
+      sys.exit(-1)
+    new_filename = os.path.join(data_dir, labels[i], basename)
+    os.rename(original_filename, new_filename)
@@ -0,0 +1,253 @@
+#!/usr/bin/python
+# Copyright 2016 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Process the ImageNet Challenge bounding boxes for TensorFlow model training.
+
+This script is called as
+
+process_bounding_boxes.py <dir> [synsets-file]
+
+Where <dir> is a directory containing the downloaded and unpacked bounding box
+data. If [synsets-file] is supplied, then only the bounding boxes whose
+synsets are contained within this file are returned. Note that the
+[synsets-file] file contains synset ids, one per line.
+
+The script dumps out a CSV text file in which each line contains an entry.
+  n00007846_64193.JPEG,0.0060,0.2620,0.7545,0.9940
+
+The entry can be read as:
+  <JPEG file name>, <xmin>, <ymin>, <xmax>, <ymax>
+
+The bounding box for <JPEG file name> contains two points (xmin, ymin) and
+(xmax, ymax) specifying the lower-left corner and upper-right corner of a
+bounding box in *relative* coordinates.
+
+The user supplies a directory where the XML files reside. The directory
+structure in the directory <dir> is assumed to look like this:
+
+<dir>/nXXXXXXXX/nXXXXXXXX_YYYY.xml
+
+Each XML file contains a bounding box annotation. The script:
+
+ (1) Parses the XML file and extracts the filename, label and bounding box info.
+
+ (2) The bounding box is specified in the XML files as integer (xmin, ymin) and
+    (xmax, ymax) *relative* to image size displayed to the human annotator. The
+    size of the image displayed to the human annotator is stored in the XML file
+    as integer (height, width).
+
+    Note that the displayed size will differ from the actual size of the image
+    downloaded from image-net.org. To make the bounding box annotation useable,
+    we convert bounding box to floating point numbers relative to displayed
+    height and width of the image.
+
+    Note that each XML file might contain N bounding box annotations.
+
+    Note that the points are all clamped at a range of [0.0, 1.0] because some
+    human annotations extend outside the range of the supplied image.
+
+    See details here: http://image-net.org/download-bboxes
+
+(3) By default, the script outputs all valid bounding boxes. If a
+    [synsets-file] is supplied, only the subset of bounding boxes associated
+    with those synsets are outputted. Importantly, one can supply a list of
+    synsets in the ImageNet Challenge and output the list of bounding boxes
+    associated with the training images of the ILSVRC.
+
+    We use these bounding boxes to inform the random distortion of images
+    supplied to the network.
+
+If you run this script successfully, you will see the following output
+to stderr:
+> Finished processing 544546 XML files.
+> Skipped 0 XML files not in ImageNet Challenge.
+> Skipped 0 bounding boxes not in ImageNet Challenge.
+> Wrote 615299 bounding boxes from 544546 annotated images.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import os.path
+import sys
+import xml.etree.ElementTree as ET
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+
+class BoundingBox(object):
+  pass
+
+
+def GetItem(name, root, index=0):
+  count = 0
+  for item in root.iter(name):
+    if count == index:
+      return item.text
+    count += 1
+  # Failed to find "index" occurrence of item.
+  return -1
+
+
+def GetInt(name, root, index=0):
+  return int(GetItem(name, root, index))
+
+
+def FindNumberBoundingBoxes(root):
+  index = 0
+  while True:
+    if GetInt('xmin', root, index) == -1:
+      break
+    index += 1
+  return index
+
+
+def ProcessXMLAnnotation(xml_file):
+  """Process a single XML file containing a bounding box."""
+  # pylint: disable=broad-except
+  try:
+    tree = ET.parse(xml_file)
+  except Exception:
+    print('Failed to parse: ' + xml_file, file=sys.stderr)
+    return None
+  # pylint: enable=broad-except
+  root = tree.getroot()
+
+  num_boxes = FindNumberBoundingBoxes(root)
+  boxes = []
+
+  for index in xrange(num_boxes):
+    box = BoundingBox()
+    # Grab the 'index' annotation.
+    box.xmin = GetInt('xmin', root, index)
+    box.ymin = GetInt('ymin', root, index)
+    box.xmax = GetInt('xmax', root, index)
+    box.ymax = GetInt('ymax', root, index)
+
+    box.width = GetInt('width', root)
+    box.height = GetInt('height', root)
+    box.filename = GetItem('filename', root) + '.JPEG'
+    box.label = GetItem('name', root)
+
+    xmin = float(box.xmin) / float(box.width)
+    xmax = float(box.xmax) / float(box.width)
+    ymin = float(box.ymin) / float(box.height)
+    ymax = float(box.ymax) / float(box.height)
+
+    # Some images contain bounding box annotations that
+    # extend outside of the supplied image. See, e.g.
+    # n03127925/n03127925_147.xml
+    # Additionally, for some bounding boxes, the min > max
+    # or the box is entirely outside of the image.
+    min_x = min(xmin, xmax)
+    max_x = max(xmin, xmax)
+    box.xmin_scaled = min(max(min_x, 0.0), 1.0)
+    box.xmax_scaled = min(max(max_x, 0.0), 1.0)
+
+    min_y = min(ymin, ymax)
+    max_y = max(ymin, ymax)
+    box.ymin_scaled = min(max(min_y, 0.0), 1.0)
+    box.ymax_scaled = min(max(max_y, 0.0), 1.0)
+
+    boxes.append(box)
+
+  return boxes
+
+if __name__ == '__main__':
+  if len(sys.argv) < 2 or len(sys.argv) > 3:
+    print('Invalid usage\n'
+          'usage: process_bounding_boxes.py <dir> [synsets-file]',
+          file=sys.stderr)
+    sys.exit(-1)
+
+  xml_files = glob.glob(sys.argv[1] + '/*/*.xml')
+  print('Identified %d XML files in %s' % (len(xml_files), sys.argv[1]),
+        file=sys.stderr)
+
+  if len(sys.argv) == 3:
+    labels = set([l.strip() for l in open(sys.argv[2]).readlines()])
+    print('Identified %d synset IDs in %s' % (len(labels), sys.argv[2]),
+          file=sys.stderr)
+  else:
+    labels = None
+
+  skipped_boxes = 0
+  skipped_files = 0
+  saved_boxes = 0
+  saved_files = 0
+  for file_index, one_file in enumerate(xml_files):
+    # Example: <...>/n06470073/n00141669_6790.xml
+    label = os.path.basename(os.path.dirname(one_file))
+
+    # Determine if the annotation is from an ImageNet Challenge label.
+    if labels is not None and label not in labels:
+      skipped_files += 1
+      continue
+
+    bboxes = ProcessXMLAnnotation(one_file)
+    assert bboxes is not None, 'No bounding boxes found in ' + one_file
+
+    found_box = False
+    for bbox in bboxes:
+      if labels is not None:
+        if bbox.label != label:
+          # Note: There is a slight bug in the bounding box annotation data.
+          # Many of the dog labels have the human label 'Scottish_deerhound'
+          # instead of the synset ID 'n02092002' in the bbox.label field. As a
+          # simple hack to overcome this issue, we only exclude bbox labels
+          # *which are synset ID's* that do not match original synset label for
+          # the XML file.
+          if bbox.label in labels:
+            skipped_boxes += 1
+            continue
+
+      # Guard against improperly specified boxes.
+      if (bbox.xmin_scaled >= bbox.xmax_scaled or
+          bbox.ymin_scaled >= bbox.ymax_scaled):
+        skipped_boxes += 1
+        continue
+
+      # Note bbox.filename occasionally contains '%s' in the name. This is
+      # data set noise that is fixed by just using the basename of the XML file.
+      image_filename = os.path.splitext(os.path.basename(one_file))[0]
+      print('%s.JPEG,%.4f,%.4f,%.4f,%.4f' %
+            (image_filename,
+             bbox.xmin_scaled, bbox.ymin_scaled,
+             bbox.xmax_scaled, bbox.ymax_scaled))
+
+      saved_boxes += 1
+      found_box = True
+    if found_box:
+      saved_files += 1
+    else:
+      skipped_files += 1
+
+    if not file_index % 5000:
+      print('--> processed %d of %d XML files.' %
+            (file_index + 1, len(xml_files)),
+            file=sys.stderr)
+      print('--> skipped %d boxes and %d XML files.' %
+            (skipped_boxes, skipped_files), file=sys.stderr)
+
+  print('Finished processing %d XML files.' % len(xml_files), file=sys.stderr)
+  print('Skipped %d XML files not in ImageNet Challenge.' % skipped_files,
+        file=sys.stderr)
+  print('Skipped %d bounding boxes not in ImageNet Challenge.' % skipped_boxes,
+        file=sys.stderr)
+  print('Wrote %d bounding boxes from %d annotated images.' %
+        (saved_boxes, saved_files),
+        file=sys.stderr)
+  print('Finished.', file=sys.stderr)
@@ -0,0 +1,129 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides data for Visual WakeWords Dataset with images+labels.
+
+Visual WakeWords Dataset derives from the COCO dataset to design tiny models
+classifying two classes, such as person/not-person. The COCO annotations
+are filtered to two classes: person and not-person (or another user-defined
+category). Bounding boxes for small objects with area less than 5% of the image
+area are filtered out.
+See build_visualwakewords_data.py which generates the Visual WakeWords dataset
+annotations from the raw COCO dataset and converts them to TFRecord.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from datasets import dataset_utils
+
+
+# Short alias for the contrib slim package used throughout this module.
+slim = contrib_slim
+
+# File name pattern of the record files; '%s' is replaced by the split name.
+_FILE_PATTERN = '%s.record-*'
+
+# Number of samples reported for each supported split.
+_SPLITS_TO_SIZES = {
+    'train': 82783,
+    'val': 40504,
+}
+
+
+# Human-readable description of each item exposed by the decoder.
+_ITEMS_TO_DESCRIPTIONS = {
+    'image': 'A color image of varying height and width.',
+    'label': 'The label id of the image, an integer in {0, 1}',
+    'object/bbox': 'A list of bounding boxes.',
+}
+
+# Number of classes reported to the Dataset.
+_NUM_CLASSES = 2
+
+# Name of the labels file looked up inside the dataset directory.
+LABELS_FILENAME = 'labels.txt'
+
+
+def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
+  """Gets a dataset tuple with instructions for reading ImageNet.
+
+  Args:
+    split_name: A train/test split name.
+    dataset_dir: The base directory of the dataset sources.
+    file_pattern: The file pattern to use when matching the dataset sources. It
+      is assumed that the pattern contains a '%s' string so that the split name
+      can be inserted.
+    reader: The TensorFlow reader type.
+
+  Returns:
+    A `Dataset` namedtuple.
+
+  Raises:
+    ValueError: if `split_name` is not a valid train/test split.
+  """
+  if split_name not in _SPLITS_TO_SIZES:
+    raise ValueError('split name %s was not recognized.' % split_name)
+
+  if not file_pattern:
+    file_pattern = _FILE_PATTERN
+  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)
+
+  # Allowing None in the signature so that dataset_factory can use the default.
+  if reader is None:
+    reader = tf.TFRecordReader
+
+  keys_to_features = {
+      'image/encoded':
+          tf.FixedLenFeature((), tf.string, default_value=''),
+      'image/format':
+          tf.FixedLenFeature((), tf.string, default_value='jpeg'),
+      'image/class/label':
+          tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
+      'image/object/bbox/xmin':
+          tf.VarLenFeature(dtype=tf.float32),
+      'image/object/bbox/ymin':
+          tf.VarLenFeature(dtype=tf.float32),
+      'image/object/bbox/xmax':
+          tf.VarLenFeature(dtype=tf.float32),
+      'image/object/bbox/ymax':
+          tf.VarLenFeature(dtype=tf.float32),
+  }
+
+  items_to_handlers = {
+      'image':
+          slim.tfexample_decoder.Image('image/encoded', 'image/format'),
+      'label':
+          slim.tfexample_decoder.Tensor('image/class/label'),
+      'object/bbox':
+          slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
+                                             'image/object/bbox/'),
+  }
+
+  decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
+                                                    items_to_handlers)
+
+  labels_to_names = None
+  labels_file = os.path.join(dataset_dir, LABELS_FILENAME)
+  if tf.gfile.Exists(labels_file):
+    labels_to_names = dataset_utils.read_label_file(dataset_dir)
+
+  return slim.dataset.Dataset(
+      data_sources=file_pattern,
+      reader=reader,
+      decoder=decoder,
+      num_samples=_SPLITS_TO_SIZES[split_name],
+      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
+      num_classes=_NUM_CLASSES,
+      labels_to_names=labels_to_names)
@@ -0,0 +1 @@
+
@@ -0,0 +1,677 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Deploy Slim models across multiple clones and replicas.
+
+# TODO(sguada) docstring paragraph by (a) motivating the need for the file and
+# (b) defining clones.
+
+# TODO(sguada) describe the high-level components of model deployment.
+# E.g. "each model deployment is composed of several parts: a DeploymentConfig,
+# which captures A, B and C, an input_fn which loads data.. etc
+
+To easily train a model on multiple GPUs or across multiple machines this
+module provides a set of helper functions: `create_clones`,
+`optimize_clones` and `deploy`.
+
+Usage:
+
+  g = tf.Graph()
+
+  # Set up DeploymentConfig
+  config = model_deploy.DeploymentConfig(num_clones=2, clone_on_cpu=True)
+
+  # Create the global step on the device storing the variables.
+  with tf.device(config.variables_device()):
+    global_step = slim.create_global_step()
+
+  # Define the inputs
+  with tf.device(config.inputs_device()):
+    images, labels = LoadData(...)
+    inputs_queue = slim.data.prefetch_queue((images, labels))
+
+  # Define the optimizer.
+  with tf.device(config.optimizer_device()):
+    optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum)
+
+  # Define the model including the loss.
+  def model_fn(inputs_queue):
+    images, labels = inputs_queue.dequeue()
+    predictions = CreateNetwork(images)
+    slim.losses.log_loss(predictions, labels)
+
+  model_dp = model_deploy.deploy(config, model_fn, [inputs_queue],
+                                 optimizer=optimizer)
+
+  # Run training.
+  slim.learning.train(model_dp.train_op, my_log_dir,
+                      summary_op=model_dp.summary_op)
+
+The Clone namedtuple holds together the values associated with each call to
+model_fn:
+  * outputs: The return values of the calls to `model_fn()`.
+  * scope: The scope used to create the clone.
+  * device: The device used to create the clone.
+
+DeployedModel namedtuple, holds together the values needed to train multiple
+clones:
+  * train_op: An operation that runs the optimizer training op and includes
+    all the update ops created by `model_fn`. Present only if an optimizer
+    was specified.
+  * summary_op: An operation that runs the summaries created by `model_fn`
+    and process_gradients.
+  * total_loss: A `Tensor` that contains the sum of all losses created by
+    `model_fn` plus the regularization losses.
+  * clones: List of `Clone` tuples returned by `create_clones()`.
+
+DeploymentConfig parameters:
+  * num_clones: Number of model clones to deploy in each replica.
+  * clone_on_cpu: True if clones should be placed on CPU.
+  * replica_id: Integer.  Index of the replica for which the model is
+      deployed.  Usually 0 for the chief replica.
+  * num_replicas: Number of replicas to use.
+  * num_ps_tasks: Number of tasks for the `ps` job. 0 to not use replicas.
+  * worker_job_name: A name for the worker job.
+  * ps_job_name: A name for the parameter server job.
+
+TODO(sguada):
+  - describe side effect to the graph.
+  - what happens to summaries and update_ops.
+  - which graph collections are altered.
+  - write a tutorial on how to use this.
+  - analyze the possibility of calling deploy more than once.
+
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+# Short alias for the contrib slim package used throughout this module.
+slim = contrib_slim
+
+# Names exported by `from <this module> import *`.
+__all__ = ['create_clones',
+           'deploy',
+           'optimize_clones',
+           'DeployedModel',
+           'DeploymentConfig',
+           'Clone',
+           ]
+
+# Namedtuple used to represent a clone during deployment.
+Clone = collections.namedtuple('Clone',
+                               ['outputs',  # Whatever model_fn() returned.
+                                'scope',  # The scope used to create it.
+                                'device',  # The device used to create.
+                                ])
+
+# Namedtuple used to represent a DeployedModel, returned by deploy().
+DeployedModel = collections.namedtuple('DeployedModel',
+                                       ['train_op',  # The `train_op`
+                                        'summary_op',  # The `summary_op`
+                                        'total_loss',  # The loss `Tensor`
+                                        'clones',  # A list of `Clones` tuples.
+                                        ])
+
+# Default parameters for DeploymentConfig
+_deployment_params = {'num_clones': 1,
+                      'clone_on_cpu': False,
+                      'replica_id': 0,
+                      'num_replicas': 1,
+                      'num_ps_tasks': 0,
+                      'worker_job_name': 'worker',
+                      'ps_job_name': 'ps'}
+
+
+def create_clones(config, model_fn, args=None, kwargs=None):
+    """Creates multiple clones according to config using a `model_fn`.
+
+    The returned values of `model_fn(*args, **kwargs)` are collected along with
+    the scope and device used to created it in a namedtuple
+    `Clone(outputs, scope, device)`
+
+    Note: it is assumed that any loss created by `model_fn` is collected at
+    the tf.GraphKeys.LOSSES collection.
+
+    To recover the losses, summaries or update_ops created by the clone use:
+    ```python
+      losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
+      summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, clone.scope)
+      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)
+    ```
+
+    The deployment options are specified by the config object and support
+    deploying one or several clones on different GPUs and one or several replicas
+    of such clones.
+
+    The argument `model_fn` is called `config.num_clones` times to create the
+    model clones as `model_fn(*args, **kwargs)`.
+
+    If `config` specifies deployment on multiple replicas then the default
+    tensorflow device is set appropriatly for each call to `model_fn` and for the
+    slim variable creation functions: model and global variables will be created
+    on the `ps` device, the clone operations will be on the `worker` device.
+
+    Args:
+      config: A DeploymentConfig object.
+      model_fn: A callable. Called as `model_fn(*args, **kwargs)`
+      args: Optional list of arguments to pass to `model_fn`.
+      kwargs: Optional list of keyword arguments to pass to `model_fn`.
+
+    Returns:
+      A list of namedtuples `Clone`.
+    """
+    clones = []
+    args = args or []
+    kwargs = kwargs or {}
+    with slim.arg_scope([slim.model_variable, slim.variable],
+                        device=config.variables_device()):
+        # Create clones.
+        for i in range(0, config.num_clones):
+            with tf.name_scope(config.clone_scope(i)) as clone_scope:
+                clone_device = config.clone_device(i)
+                with tf.device(clone_device):
+                    with tf.variable_scope(tf.get_variable_scope(),
+                                           reuse=True if i > 0 else None):
+                        outputs = model_fn(*args, **kwargs)
+                    clones.append(Clone(outputs, clone_scope, clone_device))
+    return clones
+
+
+def _gather_clone_loss(clone, num_clones, regularization_losses):
+    """Gather the loss for a single clone.
+
+    Args:
+      clone: A Clone namedtuple.
+      num_clones: The number of clones being deployed.
+      regularization_losses: Possibly empty list of regularization_losses
+        to add to the clone losses.
+
+    Returns:
+      A tensor for the total loss for the clone.  Can be None.
+    """
+    # The return value.
+    sum_loss = None
+    # Individual components of the loss that will need summaries.
+    clone_loss = None
+    regularization_loss = None
+    # Compute and aggregate losses on the clone device.
+    with tf.device(clone.device):
+        all_losses = []
+        clone_losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
+        if clone_losses:
+            clone_loss = tf.add_n(clone_losses, name='clone_loss')
+            if num_clones > 1:
+                clone_loss = tf.div(clone_loss, 1.0 * num_clones,
+                                    name='scaled_clone_loss')
+            all_losses.append(clone_loss)
+        if regularization_losses:
+            regularization_loss = tf.add_n(regularization_losses,
+                                           name='regularization_loss')
+            all_losses.append(regularization_loss)
+        if all_losses:
+            sum_loss = tf.add_n(all_losses)
+    # Add the summaries out of the clone device block.
+    if clone_loss is not None:
+        tf.summary.scalar('/'.join(filter(None,
+                                          ['Losses', clone.scope, 'clone_loss'])),
+                          clone_loss)
+    if regularization_loss is not None:
+        tf.summary.scalar('Losses/regularization_loss', regularization_loss)
+    return sum_loss
+
+
+def _optimize_clone(optimizer, clone, num_clones, regularization_losses,
+                    **kwargs):
+    """Compute losses and gradients for a single clone.
+
+    Args:
+      optimizer: A tf.Optimizer  object.
+      clone: A Clone namedtuple.
+      num_clones: The number of clones being deployed.
+      regularization_losses: Possibly empty list of regularization_losses
+        to add to the clone losses.
+      **kwargs: Dict of kwarg to pass to compute_gradients().
+
+    Returns:
+      A tuple (clone_loss, clone_grads_and_vars).
+        - clone_loss: A tensor for the total loss for the clone.  Can be None.
+        - clone_grads_and_vars: List of (gradient, variable) for the clone.
+          Can be empty.
+    """
+    sum_loss = _gather_clone_loss(clone, num_clones, regularization_losses)
+    clone_grad = None
+    if sum_loss is not None:
+        # with tf.device(clone.device):
+        #   clone_grad = optimizer.compute_gradients(sum_loss, **kwargs)
+        clone_grad = optimizer.compute_gradients(sum_loss, **kwargs)
+    return sum_loss, clone_grad
+
+
+def optimize_clones(clones, optimizer,
+                    regularization_losses=None,
+                    **kwargs):
+    """Compute clone losses and gradients for the given list of `Clones`.
+
+    Note: The regularization_losses are added to the first clone losses.
+
+    Args:
+     clones: List of `Clones` created by `create_clones()`.
+     optimizer: An `Optimizer` object.
+     regularization_losses: Optional list of regularization losses. If None it
+       will gather them from tf.GraphKeys.REGULARIZATION_LOSSES. Pass `[]` to
+       exclude them.
+     **kwargs: Optional list of keyword arguments to pass to `compute_gradients`.
+
+    Returns:
+     A tuple (total_loss, grads_and_vars).
+       - total_loss: A Tensor containing the average of the clone losses including
+         the regularization loss.
+       - grads_and_vars: A List of tuples (gradient, variable) containing the sum
+         of the gradients for each variable.
+
+    """
+    grads_and_vars = []
+    clones_losses = []
+    num_clones = len(clones)
+    if regularization_losses is None:
+        regularization_losses = tf.get_collection(
+            tf.GraphKeys.REGULARIZATION_LOSSES)
+    for clone in clones:
+        with tf.name_scope(clone.scope):
+            clone_loss, clone_grad = _optimize_clone(
+                optimizer, clone, num_clones, regularization_losses, **kwargs)
+            if clone_loss is not None:
+                clones_losses.append(clone_loss)
+                grads_and_vars.append(clone_grad)
+            # Only use regularization_losses for the first clone
+            regularization_losses = None
+    # Compute the total_loss summing all the clones_losses.
+    total_loss = tf.add_n(clones_losses, name='total_loss')
+    # Sum the gradients across clones.
+    grads_and_vars = _sum_clones_gradients(grads_and_vars)
+    return total_loss, grads_and_vars
+
+
+def deploy(config,
+           model_fn,
+           args=None,
+           kwargs=None,
+           optimizer=None,
+           summarize_gradients=False):
+    """Deploys a Slim-constructed model across multiple clones.
+
+    The deployment options are specified by the config object and support
+    deploying one or several clones on different GPUs and one or several replicas
+    of such clones.
+
+    The argument `model_fn` is called `config.num_clones` times to create the
+    model clones as `model_fn(*args, **kwargs)`.
+
+    The optional argument `optimizer` is an `Optimizer` object.  If not `None`,
+    the deployed model is configured for training with that optimizer.
+
+    If `config` specifies deployment on multiple replicas then the default
+    tensorflow device is set appropriately for each call to `model_fn` and for
+    the slim variable creation functions: model and global variables will be
+    created on the `ps` device, the clone operations will be on the `worker`
+    device.
+
+    Args:
+      config: A `DeploymentConfig` object.
+      model_fn: A callable. Called as `model_fn(*args, **kwargs)`
+      args: Optional list of arguments to pass to `model_fn`.
+      kwargs: Optional dict of keyword arguments to pass to `model_fn`.
+      optimizer: Optional `Optimizer` object.  If passed the model is deployed
+        for training with that optimizer.
+      summarize_gradients: Whether or not add summaries to the gradients.
+
+    Returns:
+      A `DeployedModel` namedtuple built from (train_op, summary_op,
+      total_loss, clones).  `train_op` stays None when no optimizer is given
+      or when there are no gradients to apply; `summary_op` is None when no
+      summaries were collected; `total_loss` is None when no optimizer is
+      given and no clone produced a loss.
+
+    """
+    # Gather initial summaries.
+    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
+
+    # Create Clones.
+    clones = create_clones(config, model_fn, args, kwargs)
+    first_clone = clones[0]
+
+    # Gather update_ops from the first clone. These contain, for example,
+    # the updates for the batch_norm variables created by model_fn.
+    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone.scope)
+
+    train_op = None
+    total_loss = None
+    with tf.device(config.optimizer_device()):
+        if optimizer:
+            # Place the global step on the device storing the variables.
+            with tf.device(config.variables_device()):
+                global_step = slim.get_or_create_global_step()
+
+            # Compute the gradients for the clones.
+            total_loss, clones_gradients = optimize_clones(clones, optimizer)
+
+            if clones_gradients:
+                if summarize_gradients:
+                    # Add summaries to the gradients.
+                    summaries |= set(_add_gradients_summaries(clones_gradients))
+
+                # Create gradient updates.
+                grad_updates = optimizer.apply_gradients(clones_gradients,
+                                                         global_step=global_step)
+                update_ops.append(grad_updates)
+
+                # train_op yields total_loss, but only after the first clone's
+                # update ops and the gradient update have run.
+                update_op = tf.group(*update_ops)
+                with tf.control_dependencies([update_op]):
+                    train_op = tf.identity(total_loss, name='train_op')
+        else:
+            # No optimizer: only build the summed loss, no gradients.
+            clones_losses = []
+            regularization_losses = tf.get_collection(
+                tf.GraphKeys.REGULARIZATION_LOSSES)
+            for clone in clones:
+                with tf.name_scope(clone.scope):
+                    clone_loss = _gather_clone_loss(clone, len(clones),
+                                                    regularization_losses)
+                    if clone_loss is not None:
+                        clones_losses.append(clone_loss)
+                    # Only use regularization_losses for the first clone
+                    regularization_losses = None
+            if clones_losses:
+                total_loss = tf.add_n(clones_losses, name='total_loss')
+
+        # Add the summaries from the first clone. These contain the summaries
+        # created by model_fn and either optimize_clones() or _gather_clone_loss().
+        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
+                                           first_clone.scope))
+
+        if total_loss is not None:
+            # Add total_loss to summary.
+            summaries.add(tf.summary.scalar('total_loss', total_loss))
+
+        if summaries:
+            # Merge all summaries together.
+            summary_op = tf.summary.merge(list(summaries), name='summary_op')
+        else:
+            summary_op = None
+
+    return DeployedModel(train_op, summary_op, total_loss, clones)
+
+
+def _sum_clones_gradients(clone_grads):
+    """Calculate the sum gradient for each shared variable across all clones.
+
+    This function assumes that the clone_grads has been scaled appropriately by
+    1 / num_clones.
+
+    Args:
+      clone_grads: A List of List of tuples (gradient, variable), one list per
+      `Clone`.
+
+    Returns:
+       List of tuples of (gradient, variable) where the gradient has been summed
+       across all clones.
+    """
+    sum_grads = []
+    for grad_and_vars in zip(*clone_grads):
+        # Note that each grad_and_vars looks like the following:
+        #   ((grad_var0_clone0, var0), ... (grad_varN_cloneN, varN))
+        grads = []
+        var = grad_and_vars[0][1]
+        for g, v in grad_and_vars:
+            assert v == var
+            if g is not None:
+                grads.append(g)
+        if grads:
+            if len(grads) > 1:
+                sum_grad = tf.add_n(grads, name=var.op.name + '/sum_grads')
+            else:
+                sum_grad = grads[0]
+            sum_grads.append((sum_grad, var))
+    return sum_grads
+
+
+def _add_gradients_summaries(grads_and_vars):
+    """Add histogram summaries to gradients.
+
+    Note: The summaries are also added to the SUMMARIES collection.
+
+    Args:
+      grads_and_vars: A list of gradient to variable pairs (tuples).
+
+    Returns:
+      The _list_ of the added summaries for grads_and_vars.
+    """
+    summaries = []
+    for grad, var in grads_and_vars:
+        if grad is not None:
+            if isinstance(grad, tf.IndexedSlices):
+                grad_values = grad.values
+            else:
+                grad_values = grad
+            summaries.append(tf.summary.histogram(var.op.name + ':gradient',
+                                                  grad_values))
+            summaries.append(tf.summary.histogram(var.op.name + ':gradient_norm',
+                                                  tf.global_norm([grad_values])))
+        else:
+            tf.logging.info('Var %s has no gradient', var.op.name)
+    return summaries
+
+
+class DeploymentConfig(object):
+    """Configuration for deploying a model with `deploy()`.
+
+    You can pass an instance of this class to `deploy()` to specify exactly
+    how to deploy the model to build.  If you do not pass one, an instance built
+    from the default deployment_hparams will be used.
+    """
+
+    def __init__(self,
+                 num_clones=1,
+                 clone_on_cpu=False,
+                 replica_id=0,
+                 num_replicas=1,
+                 num_ps_tasks=0,
+                 worker_job_name='worker',
+                 ps_job_name='ps'):
+        """Create a DeploymentConfig.
+
+        The config describes how to deploy a model across multiple clones and
+        replicas.  The model will be replicated `num_clones` times in each replica.
+        If `clone_on_cpu` is True, each clone will placed on CPU.
+
+        If `num_replicas` is 1, the model is deployed via a single process.  In that
+        case `worker_device`, `num_ps_tasks`, and `ps_device` are ignored.
+
+        If `num_replicas` is greater than 1, then `worker_device` and `ps_device`
+        must specify TensorFlow devices for the `worker` and `ps` jobs and
+        `num_ps_tasks` must be positive.
+
+        Args:
+          num_clones: Number of model clones to deploy in each replica.
+          clone_on_cpu: If True clones would be placed on CPU.
+          replica_id: Integer.  Index of the replica for which the model is
+            deployed.  Usually 0 for the chief replica.
+          num_replicas: Number of replicas to use.
+          num_ps_tasks: Number of tasks for the `ps` job. 0 to not use replicas.
+          worker_job_name: A name for the worker job.
+          ps_job_name: A name for the parameter server job.
+
+        Raises:
+          ValueError: If the arguments are invalid.
+        """
+        if num_replicas > 1:
+            if num_ps_tasks < 1:
+                raise ValueError('When using replicas num_ps_tasks must be positive')
+        if num_replicas > 1 or num_ps_tasks > 0:
+            if not worker_job_name:
+                raise ValueError('Must specify worker_job_name when using replicas')
+            if not ps_job_name:
+                raise ValueError('Must specify ps_job_name when using parameter server')
+        if replica_id >= num_replicas:
+            raise ValueError('replica_id must be less than num_replicas')
+        self._num_clones = num_clones
+        self._clone_on_cpu = clone_on_cpu
+        self._replica_id = replica_id
+        self._num_replicas = num_replicas
+        self._num_ps_tasks = num_ps_tasks
+        self._ps_device = '/job:' + ps_job_name if num_ps_tasks > 0 else ''
+        self._worker_device = '/job:' + worker_job_name if num_ps_tasks > 0 else ''
+
+    @property
+    def num_clones(self):
+        return self._num_clones
+
+    @property
+    def clone_on_cpu(self):
+        return self._clone_on_cpu
+
+    @property
+    def replica_id(self):
+        return self._replica_id
+
+    @property
+    def num_replicas(self):
+        return self._num_replicas
+
+    @property
+    def num_ps_tasks(self):
+        return self._num_ps_tasks
+
+    @property
+    def ps_device(self):
+        return self._ps_device
+
+    @property
+    def worker_device(self):
+        return self._worker_device
+
+    def caching_device(self):
+        """Returns the device to use for caching variables.
+
+        Variables are cached on the worker CPU when using replicas.
+
+        Returns:
+          A device string or None if the variables do not need to be cached.
+        """
+        if self._num_ps_tasks > 0:
+            return lambda op: op.device
+        else:
+            return None
+
+    def clone_device(self, clone_index):
+        """Device used to create the clone and all the ops inside the clone.
+
+        Args:
+          clone_index: Int, representing the clone_index.
+
+        Returns:
+          A value suitable for `tf.device()`.
+
+        Raises:
+          ValueError: if `clone_index` is greater or equal to the number of clones".
+        """
+        if clone_index >= self._num_clones:
+            raise ValueError('clone_index must be less than num_clones')
+        device = ''
+        if self._num_ps_tasks > 0:
+            device += self._worker_device
+        if self._clone_on_cpu:
+            device += '/device:CPU:0'
+        else:
+            device += '/device:GPU:%d' % clone_index
+        return device
+
+    def clone_scope(self, clone_index):
+        """Name scope to create the clone.
+
+        Args:
+          clone_index: Int, representing the clone_index.
+
+        Returns:
+          A name_scope suitable for `tf.name_scope()`.
+
+        Raises:
+          ValueError: if `clone_index` is greater or equal to the number of clones".
+        """
+        if clone_index >= self._num_clones:
+            raise ValueError('clone_index must be less than num_clones')
+        scope = ''
+        if self._num_clones > 1:
+            scope = 'clone_%d' % clone_index
+        return scope
+
+    def optimizer_device(self):
+        """Device to use with the optimizer.
+
+        Returns:
+          A value suitable for `tf.device()`.
+        """
+        if self._num_ps_tasks > 0 or self._num_clones > 0:
+            return self._worker_device + '/device:CPU:0'
+        else:
+            return ''
+
+    def inputs_device(self):
+        """Device to use to build the inputs.
+
+        Returns:
+          A value suitable for `tf.device()`.
+        """
+        device = ''
+        if self._num_ps_tasks > 0:
+            device += self._worker_device
+        device += '/device:CPU:0'
+        return device
+
+    def variables_device(self):
+        """Returns the device to use for variables created inside the clone.
+
+        Returns:
+          A value suitable for `tf.device()`.
+        """
+        device = ''
+        if self._num_ps_tasks > 0:
+            device += self._ps_device
+        device += '/device:CPU:0'
+
+        class _PSDeviceChooser(object):
+            """Slim device chooser for variables when using PS."""
+
+            def __init__(self, device, tasks):
+                self._device = device
+                self._tasks = tasks
+                self._task = 0
+
+            def choose(self, op):
+                if op.device:
+                    return op.device
+                node_def = op if isinstance(op, tf.NodeDef) else op.node_def
+                if node_def.op.startswith('Variable'):
+                    t = self._task
+                    self._task = (self._task + 1) % self._tasks
+                    d = '%s/task:%d' % (self._device, t)
+                    return d
+                else:
+                    return op.device
+
+        if not self._num_ps_tasks:
+            return device
+        else:
+            chooser = _PSDeviceChooser(device, self._num_ps_tasks)
+            return chooser.choose
@@ -0,0 +1,574 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for model_deploy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib import framework as contrib_framework
+from tensorflow.contrib import layers as contrib_layers
+from tensorflow.contrib import slim as contrib_slim
+
+from deployment import model_deploy
+
+slim = contrib_slim
+
+
+class DeploymentConfigTest(tf.test.TestCase):
+    """Checks the devices and scopes reported by `DeploymentConfig`."""
+
+    def testDefaults(self):
+        """Default config: clone on GPU:0, no scope, everything else on CPU:0."""
+        deploy_config = model_deploy.DeploymentConfig()
+
+        self.assertEqual(slim.get_variables(), [])
+        self.assertEqual(deploy_config.caching_device(), None)
+        self.assertDeviceEqual(deploy_config.clone_device(0), 'GPU:0')
+        self.assertEqual(deploy_config.clone_scope(0), '')
+        self.assertDeviceEqual(deploy_config.optimizer_device(), 'CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(), 'CPU:0')
+        self.assertDeviceEqual(deploy_config.variables_device(), 'CPU:0')
+
+    def testCPUonly(self):
+        """With clone_on_cpu=True the clone device is CPU:0 as well."""
+        deploy_config = model_deploy.DeploymentConfig(clone_on_cpu=True)
+
+        self.assertEqual(deploy_config.caching_device(), None)
+        self.assertDeviceEqual(deploy_config.clone_device(0), 'CPU:0')
+        self.assertEqual(deploy_config.clone_scope(0), '')
+        self.assertDeviceEqual(deploy_config.optimizer_device(), 'CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(), 'CPU:0')
+        self.assertDeviceEqual(deploy_config.variables_device(), 'CPU:0')
+
+    def testMultiGPU(self):
+        """Two clones get GPU:0/GPU:1 and the scopes clone_0/clone_1."""
+        deploy_config = model_deploy.DeploymentConfig(num_clones=2)
+
+        self.assertEqual(deploy_config.caching_device(), None)
+        self.assertDeviceEqual(deploy_config.clone_device(0), 'GPU:0')
+        self.assertDeviceEqual(deploy_config.clone_device(1), 'GPU:1')
+        self.assertEqual(deploy_config.clone_scope(0), 'clone_0')
+        self.assertEqual(deploy_config.clone_scope(1), 'clone_1')
+        self.assertDeviceEqual(deploy_config.optimizer_device(), 'CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(), 'CPU:0')
+        self.assertDeviceEqual(deploy_config.variables_device(), 'CPU:0')
+
+    def testPS(self):
+        """One clone with one ps task: worker job for ops, ps job for variables."""
+        deploy_config = model_deploy.DeploymentConfig(num_clones=1, num_ps_tasks=1)
+
+        self.assertDeviceEqual(deploy_config.clone_device(0),
+                               '/job:worker/device:GPU:0')
+        self.assertEqual(deploy_config.clone_scope(0), '')
+        self.assertDeviceEqual(deploy_config.optimizer_device(),
+                               '/job:worker/device:CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(),
+                               '/job:worker/device:CPU:0')
+        # Variables created under variables_device() land on ps task 0; the
+        # non-variable op `c` stays unplaced.
+        with tf.device(deploy_config.variables_device()):
+            a = tf.Variable(0)
+            b = tf.Variable(0)
+            c = tf.no_op()
+            d = slim.variable('a', [],
+                              caching_device=deploy_config.caching_device())
+        self.assertDeviceEqual(a.device, '/job:ps/task:0/device:CPU:0')
+        self.assertDeviceEqual(a.device, a.value().device)
+        self.assertDeviceEqual(b.device, '/job:ps/task:0/device:CPU:0')
+        self.assertDeviceEqual(b.device, b.value().device)
+        self.assertDeviceEqual(c.device, '')
+        self.assertDeviceEqual(d.device, '/job:ps/task:0/device:CPU:0')
+        # With a caching device the read value is not pinned to the ps device.
+        self.assertDeviceEqual(d.value().device, '')
+
+    def testMultiGPUPS(self):
+        """Two clones with one ps task: per-clone GPUs under the worker job."""
+        deploy_config = model_deploy.DeploymentConfig(num_clones=2, num_ps_tasks=1)
+
+        # caching_device() is a function here; an unplaced op maps to ''.
+        self.assertEqual(deploy_config.caching_device()(tf.no_op()), '')
+        self.assertDeviceEqual(deploy_config.clone_device(0),
+                               '/job:worker/device:GPU:0')
+        self.assertDeviceEqual(deploy_config.clone_device(1),
+                               '/job:worker/device:GPU:1')
+        self.assertEqual(deploy_config.clone_scope(0), 'clone_0')
+        self.assertEqual(deploy_config.clone_scope(1), 'clone_1')
+        self.assertDeviceEqual(deploy_config.optimizer_device(),
+                               '/job:worker/device:CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(),
+                               '/job:worker/device:CPU:0')
+
+    def testReplicasPS(self):
+        """Two replicas with two ps tasks and a single, unscoped clone."""
+        deploy_config = model_deploy.DeploymentConfig(num_replicas=2,
+                                                      num_ps_tasks=2)
+
+        self.assertDeviceEqual(deploy_config.clone_device(0),
+                               '/job:worker/device:GPU:0')
+        self.assertEqual(deploy_config.clone_scope(0), '')
+        self.assertDeviceEqual(deploy_config.optimizer_device(),
+                               '/job:worker/device:CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(),
+                               '/job:worker/device:CPU:0')
+
+    def testReplicasMultiGPUPS(self):
+        """Two replicas, two clones and two ps tasks combined."""
+        deploy_config = model_deploy.DeploymentConfig(num_replicas=2,
+                                                      num_clones=2,
+                                                      num_ps_tasks=2)
+        self.assertDeviceEqual(deploy_config.clone_device(0),
+                               '/job:worker/device:GPU:0')
+        self.assertDeviceEqual(deploy_config.clone_device(1),
+                               '/job:worker/device:GPU:1')
+        self.assertEqual(deploy_config.clone_scope(0), 'clone_0')
+        self.assertEqual(deploy_config.clone_scope(1), 'clone_1')
+        self.assertDeviceEqual(deploy_config.optimizer_device(),
+                               '/job:worker/device:CPU:0')
+        self.assertDeviceEqual(deploy_config.inputs_device(),
+                               '/job:worker/device:CPU:0')
+
+    def testVariablesPS(self):
+        """With two ps tasks, successive variables alternate between tasks."""
+        deploy_config = model_deploy.DeploymentConfig(num_ps_tasks=2)
+
+        # Creation order matters: a -> task 0, b -> task 1, d -> task 0 again.
+        with tf.device(deploy_config.variables_device()):
+            a = tf.Variable(0)
+            b = tf.Variable(0)
+            c = tf.no_op()
+            d = slim.variable('a', [],
+                              caching_device=deploy_config.caching_device())
+
+        self.assertDeviceEqual(a.device, '/job:ps/task:0/device:CPU:0')
+        self.assertDeviceEqual(a.device, a.value().device)
+        self.assertDeviceEqual(b.device, '/job:ps/task:1/device:CPU:0')
+        self.assertDeviceEqual(b.device, b.value().device)
+        self.assertDeviceEqual(c.device, '')
+        self.assertDeviceEqual(d.device, '/job:ps/task:0/device:CPU:0')
+        self.assertDeviceEqual(d.value().device, '')
+
+
+def LogisticClassifier(inputs, labels, scope=None, reuse=None):
+    with tf.variable_scope(scope, 'LogisticClassifier', [inputs, labels],
+                           reuse=reuse):
+        predictions = slim.fully_connected(inputs, 1, activation_fn=tf.sigmoid,
+                                           scope='fully_connected')
+        slim.losses.log_loss(predictions, labels)
+        return predictions
+
+
+def BatchNormClassifier(inputs, labels, scope=None, reuse=None):
+    with tf.variable_scope(scope, 'BatchNormClassifier', [inputs, labels],
+                           reuse=reuse):
+        inputs = slim.batch_norm(inputs, decay=0.1, fused=True)
+        predictions = slim.fully_connected(inputs, 1,
+                                           activation_fn=tf.sigmoid,
+                                           scope='fully_connected')
+        slim.losses.log_loss(predictions, labels)
+        return predictions
+
+
+class CreatecloneTest(tf.test.TestCase):
+
+    def setUp(self):
+        # Create an easy training set:
+        np.random.seed(0)
+
+        self._inputs = np.zeros((16, 4))
+        self._labels = np.random.randint(0, 2, size=(16, 1)).astype(np.float32)
+        self._logdir = self.get_temp_dir()
+
+        for i in range(16):
+            j = int(2 * self._labels[i] + np.random.randint(0, 2))
+            self._inputs[i, j] = 1
+
+    def testCreateLogisticClassifier(self):
+        g = tf.Graph()
+        with g.as_default():
+            tf.set_random_seed(0)
+            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+            tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+            model_fn = LogisticClassifier
+            clone_args = (tf_inputs, tf_labels)
+            deploy_config = model_deploy.DeploymentConfig(num_clones=1)
+
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
+            clone = clones[0]
+            self.assertEqual(len(slim.get_variables()), 2)
+            for v in slim.get_variables():
+                self.assertDeviceEqual(v.device, 'CPU:0')
+                self.assertDeviceEqual(v.value().device, 'CPU:0')
+            self.assertEqual(clone.outputs.op.name,
+                             'LogisticClassifier/fully_connected/Sigmoid')
+            self.assertEqual(clone.scope, '')
+            self.assertDeviceEqual(clone.device, 'GPU:0')
+            self.assertEqual(len(slim.losses.get_losses()), 1)
+            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+            self.assertEqual(update_ops, [])
+
+    def testCreateSingleclone(self):
+        g = tf.Graph()
+        with g.as_default():
+            tf.set_random_seed(0)
+            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+            tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+            model_fn = BatchNormClassifier
+            clone_args = (tf_inputs, tf_labels)
+            deploy_config = model_deploy.DeploymentConfig(num_clones=1)
+
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
+            clone = clones[0]
+            self.assertEqual(len(slim.get_variables()), 5)
+            for v in slim.get_variables():
+                self.assertDeviceEqual(v.device, 'CPU:0')
+                self.assertDeviceEqual(v.value().device, 'CPU:0')
+            self.assertEqual(clone.outputs.op.name,
+                             'BatchNormClassifier/fully_connected/Sigmoid')
+            self.assertEqual(clone.scope, '')
+            self.assertDeviceEqual(clone.device, 'GPU:0')
+            self.assertEqual(len(slim.losses.get_losses()), 1)
+            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+            self.assertEqual(len(update_ops), 2)
+
+    def testCreateMulticlone(self):
+        g = tf.Graph()
+        with g.as_default():
+            tf.set_random_seed(0)
+            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+            tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+            model_fn = BatchNormClassifier
+            clone_args = (tf_inputs, tf_labels)
+            num_clones = 4
+            deploy_config = model_deploy.DeploymentConfig(num_clones=num_clones)
+
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
+            self.assertEqual(len(slim.get_variables()), 5)
+            for v in slim.get_variables():
+                self.assertDeviceEqual(v.device, 'CPU:0')
+                self.assertDeviceEqual(v.value().device, 'CPU:0')
+            self.assertEqual(len(clones), num_clones)
+            for i, clone in enumerate(clones):
+                self.assertEqual(
+                    clone.outputs.op.name,
+                    'clone_%d/BatchNormClassifier/fully_connected/Sigmoid' % i)
+                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)
+                self.assertEqual(len(update_ops), 2)
+                self.assertEqual(clone.scope, 'clone_%d/' % i)
+                self.assertDeviceEqual(clone.device, 'GPU:%d' % i)
+
+    def testCreateOnecloneWithPS(self):
+        g = tf.Graph()
+        with g.as_default():
+            tf.set_random_seed(0)
+            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+            tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+            model_fn = BatchNormClassifier
+            clone_args = (tf_inputs, tf_labels)
+            deploy_config = model_deploy.DeploymentConfig(num_clones=1,
+                                                          num_ps_tasks=1)
+
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
+            self.assertEqual(len(clones), 1)
+            clone = clones[0]
+            self.assertEqual(clone.outputs.op.name,
+                             'BatchNormClassifier/fully_connected/Sigmoid')
+            self.assertDeviceEqual(clone.device, '/job:worker/device:GPU:0')
+            self.assertEqual(clone.scope, '')
+            self.assertEqual(len(slim.get_variables()), 5)
+            for v in slim.get_variables():
+                self.assertDeviceEqual(v.device, '/job:ps/task:0/CPU:0')
+                self.assertDeviceEqual(v.device, v.value().device)
+
+    def testCreateMulticloneWithPS(self):
+        g = tf.Graph()
+        with g.as_default():
+            tf.set_random_seed(0)
+            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+            tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+            model_fn = BatchNormClassifier
+            clone_args = (tf_inputs, tf_labels)
+            deploy_config = model_deploy.DeploymentConfig(num_clones=2,
+                                                          num_ps_tasks=2)
+
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
+            self.assertEqual(len(slim.get_variables()), 5)
+            for i, v in enumerate(slim.get_variables()):
+                t = i % 2
+                self.assertDeviceEqual(v.device, '/job:ps/task:%d/device:CPU:0' % t)
+                self.assertDeviceEqual(v.device, v.value().device)
+            self.assertEqual(len(clones), 2)
+            for i, clone in enumerate(clones):
+                self.assertEqual(
+                    clone.outputs.op.name,
+                    'clone_%d/BatchNormClassifier/fully_connected/Sigmoid' % i)
+                self.assertEqual(clone.scope, 'clone_%d/' % i)
+                self.assertDeviceEqual(clone.device, '/job:worker/device:GPU:%d' % i)
+
+
+class OptimizeclonesTest(tf.test.TestCase):
+
+    def setUp(self):
+        # Create an easy training set:
+        np.random.seed(0)
+
+        self._inputs = np.zeros((16, 4))
+        self._labels = np.random.randint(0, 2, size=(16, 1)).astype(np.float32)
+        self._logdir = self.get_temp_dir()
+
+        for i in range(16):
+            j = int(2 * self._labels[i] + np.random.randint(0, 2))
+            self._inputs[i, j] = 1
+
+    def testCreateLogisticClassifier(self):
+        g = tf.Graph()
+        with g.as_default():
+            tf.set_random_seed(0)
+            tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
+            tf_labels = tf.constant(self._labels, dtype=tf.float32)
+
+            model_fn = LogisticClassifier
+            clone_args = (tf_inputs, tf_labels)
+            deploy_config = model_deploy.DeploymentConfig(num_clones=1)
+
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(deploy_config, model_fn, clone_args)
+            self.assertEqual(len(slim.get_variables()), 2)
+            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+            self.assertEqual(update_ops, [])
+
+            optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+            total_loss, grads_and_vars = model_deploy.optimize_clones(clones,
+                                                                      optimizer)
+            self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
+            self.assertEqual(total_loss.op.name, 'total_loss')
+            for g, v in grads_and_vars:
+                self.assertDeviceEqual(g.device, 'GPU:0')
+                self.assertDeviceEqual(v.device, 'CPU:0')
+
+    def testCreateSingleclone(self):
+        # Builds one BatchNormClassifier clone, then checks the variable count,
+        # the number of update ops and the gradient/variable devices.
+        graph = tf.Graph()
+        with graph.as_default():
+            tf.set_random_seed(0)
+            inputs = tf.constant(self._inputs, dtype=tf.float32)
+            labels = tf.constant(self._labels, dtype=tf.float32)
+            config = model_deploy.DeploymentConfig(num_clones=1)
+
+            # The fresh graph holds no variables until the clones are created.
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(
+                config, BatchNormClassifier, (inputs, labels))
+            self.assertEqual(len(slim.get_variables()), 5)
+            self.assertEqual(len(tf.get_collection(tf.GraphKeys.UPDATE_OPS)), 2)
+
+            sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+            loss, gradient_pairs = model_deploy.optimize_clones(clones, sgd)
+            self.assertEqual(len(gradient_pairs), len(tf.trainable_variables()))
+            self.assertEqual(loss.op.name, 'total_loss')
+            for grad, var in gradient_pairs:
+                self.assertDeviceEqual(grad.device, 'GPU:0')
+                self.assertDeviceEqual(var.device, 'CPU:0')
+
+    def testCreateMulticlone(self):
+        # Builds four BatchNormClassifier clones: variables are counted once,
+        # while update ops are expected once per clone.
+        graph = tf.Graph()
+        with graph.as_default():
+            tf.set_random_seed(0)
+            inputs = tf.constant(self._inputs, dtype=tf.float32)
+            labels = tf.constant(self._labels, dtype=tf.float32)
+
+            clone_count = 4
+            config = model_deploy.DeploymentConfig(num_clones=clone_count)
+
+            # The fresh graph holds no variables until the clones are created.
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(
+                config, BatchNormClassifier, (inputs, labels))
+            self.assertEqual(len(slim.get_variables()), 5)
+            self.assertEqual(len(tf.get_collection(tf.GraphKeys.UPDATE_OPS)),
+                             clone_count * 2)
+
+            sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+            loss, gradient_pairs = model_deploy.optimize_clones(clones, sgd)
+            self.assertEqual(len(gradient_pairs), len(tf.trainable_variables()))
+            self.assertEqual(loss.op.name, 'total_loss')
+            for grad, var in gradient_pairs:
+                self.assertDeviceEqual(grad.device, '')
+                self.assertDeviceEqual(var.device, 'CPU:0')
+
+    def testCreateMulticloneCPU(self):
+        # Same checks as the multi-clone case, but with clone_on_cpu=True.
+        graph = tf.Graph()
+        with graph.as_default():
+            tf.set_random_seed(0)
+            inputs = tf.constant(self._inputs, dtype=tf.float32)
+            labels = tf.constant(self._labels, dtype=tf.float32)
+
+            clone_count = 4
+            config = model_deploy.DeploymentConfig(num_clones=clone_count,
+                                                   clone_on_cpu=True)
+
+            # The fresh graph holds no variables until the clones are created.
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(
+                config, BatchNormClassifier, (inputs, labels))
+            self.assertEqual(len(slim.get_variables()), 5)
+            self.assertEqual(len(tf.get_collection(tf.GraphKeys.UPDATE_OPS)),
+                             clone_count * 2)
+
+            sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+            loss, gradient_pairs = model_deploy.optimize_clones(clones, sgd)
+            self.assertEqual(len(gradient_pairs), len(tf.trainable_variables()))
+            self.assertEqual(loss.op.name, 'total_loss')
+            for grad, var in gradient_pairs:
+                self.assertDeviceEqual(grad.device, '')
+                self.assertDeviceEqual(var.device, 'CPU:0')
+
+    def testCreateOnecloneWithPS(self):
+        # One clone with one parameter-server task: gradients are expected on
+        # the worker GPU and variables on the ps task's CPU.
+        graph = tf.Graph()
+        with graph.as_default():
+            tf.set_random_seed(0)
+            inputs = tf.constant(self._inputs, dtype=tf.float32)
+            labels = tf.constant(self._labels, dtype=tf.float32)
+
+            config = model_deploy.DeploymentConfig(num_clones=1,
+                                                   num_ps_tasks=1)
+
+            # The fresh graph holds no variables until the clones are created.
+            self.assertEqual(slim.get_variables(), [])
+            clones = model_deploy.create_clones(
+                config, BatchNormClassifier, (inputs, labels))
+            self.assertEqual(len(slim.get_variables()), 5)
+            self.assertEqual(len(tf.get_collection(tf.GraphKeys.UPDATE_OPS)), 2)
+
+            sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+            loss, gradient_pairs = model_deploy.optimize_clones(clones, sgd)
+            self.assertEqual(len(gradient_pairs), len(tf.trainable_variables()))
+            self.assertEqual(loss.op.name, 'total_loss')
+            for grad, var in gradient_pairs:
+                self.assertDeviceEqual(grad.device, '/job:worker/device:GPU:0')
+                self.assertDeviceEqual(var.device, '/job:ps/task:0/CPU:0')
+
+
+class DeployTest(tf.test.TestCase):
+    # Exercises model_deploy.deploy() on a small synthetic data set.
+
+    def setUp(self):
+        # Seed NumPy so the synthetic data below is reproducible.
+        np.random.seed(0)
+
+        self._inputs = np.zeros((16, 4))
+        self._labels = np.random.randint(0, 2, size=(16, 1)).astype(np.float32)
+        self._logdir = self.get_temp_dir()
+
+        # Set exactly one input column per row: 2 * label plus a random 0/1.
+        for row in range(16):
+            column = int(2 * self._labels[row] + np.random.randint(0, 2))
+            self._inputs[row, column] = 1
+
+    def _addBesselsCorrection(self, sample_size, expected_var):
+        # Scales expected_var by n / (n - 1) in place and returns it.
+        expected_var *= sample_size / (sample_size - 1)
+        return expected_var
+
+    def testLocalTrainOp(self):
+        # Deploys two CPU clones, trains for ten steps, then checks the loss
+        # drop and the final moving_mean / moving_variance values.
+        graph = tf.Graph()
+        with graph.as_default():
+            tf.set_random_seed(0)
+            inputs = tf.constant(self._inputs, dtype=tf.float32)
+            labels = tf.constant(self._labels, dtype=tf.float32)
+
+            config = model_deploy.DeploymentConfig(num_clones=2,
+                                                   clone_on_cpu=True)
+            sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
+
+            self.assertEqual(slim.get_variables(), [])
+            model = model_deploy.deploy(config, BatchNormClassifier,
+                                        (inputs, labels), optimizer=sgd)
+
+            self.assertEqual(len(tf.get_collection(tf.GraphKeys.UPDATE_OPS)), 4)
+            self.assertEqual(len(model.clones), 2)
+            self.assertEqual(model.total_loss.op.name, 'total_loss')
+            self.assertEqual(model.summary_op.op.name, 'summary_op/summary_op')
+            self.assertEqual(model.train_op.op.name, 'train_op')
+
+            with tf.Session() as sess:
+                sess.run(tf.global_variables_initializer())
+                mean_var = contrib_framework.get_variables_by_name(
+                    'moving_mean')[0]
+                variance_var = contrib_framework.get_variables_by_name(
+                    'moving_variance')[0]
+                loss_before = sess.run(model.total_loss)
+                mean_before, variance_before = sess.run([mean_var, variance_var])
+                self.assertAllClose(mean_before, [0.0] * 4)
+                self.assertAllClose(variance_before, [1.0] * 4)
+                for _ in range(10):
+                    sess.run(model.train_op)
+                loss_after = sess.run(model.total_loss)
+                self.assertLess(loss_after, loss_before / 5.0)
+
+                mean_after, variance_after = sess.run([mean_var, variance_var])
+                expected_mean = np.array([0.125, 0.25, 0.375, 0.25])
+                expected_var = self._addBesselsCorrection(
+                    16, np.array([0.109375, 0.1875, 0.234375, 0.1875]))
+                self.assertAllClose(mean_after, expected_mean)
+                self.assertAllClose(variance_after, expected_var)
+
+    def testNoSummariesOnGPU(self):
+        with tf.Graph().as_default():
+            config = model_deploy.DeploymentConfig(num_clones=2)
+
+            # The clone function builds a fully_connected layer that carries
+            # an L2 regularizer loss.
+            def ModelFn():
+                features = tf.constant(1.0, shape=(10, 20), dtype=tf.float32)
+                contrib_layers.fully_connected(
+                    features, 30,
+                    weights_regularizer=contrib_layers.l2_regularizer(0.001))
+
+            model = model_deploy.deploy(
+                config, ModelFn,
+                optimizer=tf.train.GradientDescentOptimizer(1.0))
+            # The summary op must have inputs, and each one must sit on the CPU.
+            summary_inputs = model.summary_op.op.inputs
+            self.assertTrue(summary_inputs)
+            for summary_input in summary_inputs:
+                self.assertEqual('/device:CPU:0', summary_input.device)
+
+    def testNoSummariesOnGPUForEvals(self):
+        with tf.Graph().as_default():
+            config = model_deploy.DeploymentConfig(num_clones=2)
+
+            # The clone function builds a fully_connected layer that carries
+            # an L2 regularizer loss.
+            def ModelFn():
+                features = tf.constant(1.0, shape=(10, 20), dtype=tf.float32)
+                contrib_layers.fully_connected(
+                    features, 30,
+                    weights_regularizer=contrib_layers.l2_regularizer(0.001))
+
+            # Deployed without an optimizer: this is the evaluation path.
+            model = model_deploy.deploy(config, ModelFn)
+            # The summary op must have inputs, and each one must sit on the CPU.
+            summary_inputs = model.summary_op.op.inputs
+            self.assertTrue(summary_inputs)
+            for summary_input in summary_inputs:
+                self.assertEqual('/device:CPU:0', summary_input.device)
+
+
+# Hand control to tf.test.main() when this file is executed as a script.
+if __name__ == '__main__':
+    tf.test.main()
@@ -0,0 +1,94 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Downloads and converts a particular dataset.
+
+Usage:
+```shell
+
+$ python download_and_convert_data.py \
+    --dataset_name=flowers \
+    --dataset_dir=/tmp/flowers
+
+$ python download_and_convert_data.py \
+    --dataset_name=cifar10 \
+    --dataset_dir=/tmp/cifar10
+
+$ python download_and_convert_data.py \
+    --dataset_name=mnist \
+    --dataset_dir=/tmp/mnist
+
+$ python download_and_convert_data.py \
+    --dataset_name=visualwakewords \
+    --dataset_dir=/tmp/visualwakewords
+
+```
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from datasets import download_and_convert_cifar10
+from datasets import download_and_convert_flowers
+from datasets import download_and_convert_mnist
+from datasets import download_and_convert_visualwakewords
+
+# Flag values are read through this object inside main().
+FLAGS = tf.compat.v1.app.flags.FLAGS
+
+# All flags are declared through tf.compat.v1.app.flags (instead of mixing it
+# with the bare tf.flags alias) so the module uses one flags entry point.
+tf.compat.v1.app.flags.DEFINE_string(
+    'dataset_name',
+    None,
+    'The name of the dataset to convert, one of "flowers", "cifar10", "mnist", "visualwakewords"'
+    )
+
+tf.compat.v1.app.flags.DEFINE_string(
+    'dataset_dir',
+    None,
+    'The directory where the output TFRecords and temporary files are saved.')
+
+# The two flags below are only consumed by the visualwakewords converter.
+tf.compat.v1.app.flags.DEFINE_float(
+    'small_object_area_threshold', 0.005,
+    'For --dataset_name=visualwakewords only. Threshold of fraction of image '
+    'area below which small objects are filtered')
+
+tf.compat.v1.app.flags.DEFINE_string(
+    'foreground_class_of_interest', 'person',
+    'For --dataset_name=visualwakewords only. Build a binary classifier based '
+    'on the presence or absence of this object in the image.')
+
+
+def main(_):
+  """Validates the required flags and runs the converter for the dataset.
+
+  Raises:
+    ValueError: if --dataset_name or --dataset_dir is missing, or if the
+      dataset name is not one of the supported names.
+  """
+  name = FLAGS.dataset_name
+  if not name:
+    raise ValueError('You must supply the dataset name with --dataset_name')
+  if not FLAGS.dataset_dir:
+    raise ValueError('You must supply the dataset directory with --dataset_dir')
+
+  # visualwakewords is the only converter that takes extra arguments.
+  if name == 'visualwakewords':
+    download_and_convert_visualwakewords.run(
+        FLAGS.dataset_dir, FLAGS.small_object_area_threshold,
+        FLAGS.foreground_class_of_interest)
+    return
+
+  # Converters that only need the output directory.
+  converters = {
+      'flowers': download_and_convert_flowers,
+      'cifar10': download_and_convert_cifar10,
+      'mnist': download_and_convert_mnist,
+  }
+  if name not in converters:
+    raise ValueError(
+        'dataset_name [%s] was not recognized.' % FLAGS.dataset_name)
+  converters[name].run(FLAGS.dataset_dir)
+
+# Script entry point: tf.compat.v1.app.run() takes over when run directly.
+if __name__ == '__main__':
+  tf.compat.v1.app.run()
@@ -0,0 +1,182 @@
+import tensorflow as tf
+from time import gmtime, strftime
+from tensorflow.contrib import slim as contrib_slim
+from gpu_helper import get_custom_getter
+import random
+import numpy as np
+import os
+
+# Seed NumPy, Python's `random` and TensorFlow with 0 at import time.
+np.random.seed(0)
+random.seed(0)
+tf.set_random_seed(0)
+
+
+class Env:
+    """Helpers that build the optimizer, learning rate, loss and train op.
+
+    Every method reads its settings from the ``FLAGS`` object given to the
+    constructor.
+    """
+
+    def __init__(self, FLAGS):
+        """Stores the flags object and the constants shared by the helpers.
+
+        Args:
+            FLAGS: object exposing the training flags read by the methods of
+                this class (e.g. ``optimizer``, ``batch_size``,
+                ``learning_rate``).
+        """
+        self.FLAGS = FLAGS
+
+        self.slim = contrib_slim
+        # Samples per epoch; presumably the ImageNet-1k training-set size --
+        # TODO confirm against the dataset actually used.
+        self.num_samples = 1281167
+
+    @staticmethod
+    def _rank_size():
+        """Returns the RANK_SIZE environment variable as an int.
+
+        Falls back to 1 when RANK_SIZE is unset, so the helpers do not fail
+        with ``TypeError`` from ``int(None)``.
+        """
+        return int(os.getenv('RANK_SIZE', '1'))
+
+    def _configure_optimizer(self, learning_rate):
+        """Configures the optimizer used for training.
+
+        Args:
+            learning_rate: A scalar or `Tensor` learning rate.
+
+        Returns:
+            An instance of an optimizer.
+
+        Raises:
+            ValueError: if self.FLAGS.optimizer is not recognized.
+        """
+        if self.FLAGS.optimizer == 'adadelta':
+            optimizer = tf.train.AdadeltaOptimizer(
+                learning_rate,
+                rho=self.FLAGS.adadelta_rho,
+                epsilon=self.FLAGS.opt_epsilon)
+        elif self.FLAGS.optimizer == 'adagrad':
+            optimizer = tf.train.AdagradOptimizer(
+                learning_rate,
+                initial_accumulator_value=self.FLAGS.adagrad_initial_accumulator_value)
+        elif self.FLAGS.optimizer == 'adam':
+            optimizer = tf.train.AdamOptimizer(
+                learning_rate,
+                beta1=self.FLAGS.adam_beta1,
+                beta2=self.FLAGS.adam_beta2,
+                epsilon=self.FLAGS.opt_epsilon)
+        elif self.FLAGS.optimizer == 'ftrl':
+            optimizer = tf.train.FtrlOptimizer(
+                learning_rate,
+                learning_rate_power=self.FLAGS.ftrl_learning_rate_power,
+                initial_accumulator_value=self.FLAGS.ftrl_initial_accumulator_value,
+                l1_regularization_strength=self.FLAGS.ftrl_l1,
+                l2_regularization_strength=self.FLAGS.ftrl_l2)
+        elif self.FLAGS.optimizer == 'momentum':
+            optimizer = tf.train.MomentumOptimizer(
+                learning_rate,
+                momentum=self.FLAGS.momentum,
+                name='Momentum')
+        elif self.FLAGS.optimizer == 'rmsprop':
+            optimizer = tf.train.RMSPropOptimizer(
+                learning_rate,
+                decay=self.FLAGS.rmsprop_decay,
+                momentum=self.FLAGS.rmsprop_momentum,
+                epsilon=self.FLAGS.opt_epsilon)
+        elif self.FLAGS.optimizer == 'sgd':
+            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+        else:
+            raise ValueError('Optimizer [%s] was not recognized' % self.FLAGS.optimizer)
+
+        return optimizer
+
+    def create_logdir(self):
+        """Creates the ``results`` directory if needed and returns its path."""
+        logdir = "results"
+        os.makedirs(logdir, exist_ok=True)
+        return logdir
+
+    def calc_logits(self, network_fn, images):
+        """Runs ``network_fn`` on ``images`` and returns only the logits.
+
+        The network is called with ``reuse=tf.AUTO_REUSE``; the second value
+        it returns (the end points) is discarded.
+        """
+        logits, _ = network_fn(images, reuse=tf.AUTO_REUSE)
+        return logits
+
+    def calc_loss(self, logits_train, labels_train):
+        """Builds the cross-entropy loss and the regularized total loss.
+
+        Args:
+            logits_train: logits tensor produced by the network.
+            labels_train: labels passed to slim's softmax_cross_entropy.
+
+        Returns:
+            A ``(loss, total_loss)`` tuple. ``loss`` is the label-smoothed
+            softmax cross entropy (op name ``loss``). ``total_loss`` adds the
+            REGULARIZATION_LOSSES collection (op name ``total_loss``).
+        """
+        base_loss = self.slim.losses.softmax_cross_entropy(
+            logits_train, labels_train, label_smoothing=self.FLAGS.label_smoothing, weights=1.0)
+
+        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
+        total_loss = tf.add_n([base_loss] + reg_losses, name='total_loss')
+
+        loss = tf.add_n([base_loss])
+        loss = tf.identity(loss, name='loss')
+
+        return loss, total_loss
+
+    def calc_steps_per_epoch(self):
+        """Returns num_samples // (batch_size * RANK_SIZE), RANK_SIZE defaulting to 1."""
+        return self.num_samples // (self.FLAGS.batch_size * self._rank_size())
+
+    def _configure_learning_rate(self, global_step):
+        """Builds the learning-rate tensor selected by the flags.
+
+        Args:
+            global_step: the global step tensor driving the schedule.
+
+        Returns:
+            A tensor named ``learning_rate``. When ``FLAGS.warmup_epochs`` is
+            truthy it is the minimum of the schedule and a linear warm-up ramp.
+
+        Raises:
+            ValueError: if FLAGS.learning_rate_decay_type is not recognized.
+        """
+        steps_per_epoch = self.calc_steps_per_epoch()
+        decay_steps = int(steps_per_epoch * self.FLAGS.num_epochs_per_decay)
+
+        if self.FLAGS.learning_rate_decay_type == 'exponential':
+            learning_rate = tf.train.exponential_decay(
+                self.FLAGS.learning_rate,
+                global_step,
+                decay_steps,
+                self.FLAGS.learning_rate_decay_factor,
+                staircase=True,
+                name='exponential_decay_learning_rate')
+        elif self.FLAGS.learning_rate_decay_type == 'fixed':
+            learning_rate = tf.constant(self.FLAGS.learning_rate, name='fixed_learning_rate')
+        elif self.FLAGS.learning_rate_decay_type == 'cosine_annealing':
+            # Round the step down to the start of its epoch so the cosine
+            # schedule only changes once per epoch.
+            current_step_epoch = global_step // steps_per_epoch * steps_per_epoch
+            learning_rate = tf.train.cosine_decay(self.FLAGS.learning_rate, current_step_epoch,
+                                                  self.FLAGS.max_number_of_steps)
+        elif self.FLAGS.learning_rate_decay_type == 'polynomial':
+            learning_rate = tf.train.polynomial_decay(
+                self.FLAGS.learning_rate, global_step,
+                decay_steps,
+                self.FLAGS.end_learning_rate,
+                power=1.0,
+                cycle=False,
+                name='polynomial_decay_learning_rate')
+        else:
+            raise ValueError('learning_rate_decay_type [%s] was not recognized' %
+                             self.FLAGS.learning_rate_decay_type)
+
+        if self.FLAGS.warmup_epochs:
+            # Linear ramp from 0 to the base rate over the warm-up epochs.
+            warmup_lr = (
+                    self.FLAGS.learning_rate * tf.cast(global_step, tf.float32) /
+                    (steps_per_epoch * self.FLAGS.warmup_epochs))
+            learning_rate = tf.minimum(warmup_lr, learning_rate)
+
+        learning_rate = tf.identity(learning_rate, name='learning_rate')
+        # tf.Print(learning_rate, [learning_rate], '*****************')
+        return learning_rate
+
+    def create_train_op(self, global_step, summaries, loss):
+        """Builds the training op, wrapped with the NPU loss-scale optimizers.
+
+        Args:
+            global_step: the global step tensor, incremented by the train op.
+            summaries: a set; the learning-rate scalar summary is added to it.
+            loss: the loss tensor whose gradients are computed.
+
+        Returns:
+            The op that applies the gradients. It runs after the collected
+            update ops because of a control dependency.
+        """
+        # Ops registered in UPDATE_OPS (for example batch_norm updates created
+        # by network_fn) must run together with the training step.
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
+
+        #################################
+        # Configure the moving averages #
+        #################################
+        if self.FLAGS.moving_average_decay:
+            moving_average_variables = self.slim.get_model_variables()
+            variable_averages = tf.train.ExponentialMovingAverage(
+                self.FLAGS.moving_average_decay, global_step)
+        else:
+            moving_average_variables, variable_averages = None, None
+
+        #########################################
+        # Configure the optimization procedure. #
+        #########################################
+        learning_rate = self._configure_learning_rate(global_step)
+        summaries.add(tf.summary.scalar('learning_rate', learning_rate))
+
+        if self.FLAGS.moving_average_decay:
+            # Update ops executed locally by trainer.
+            update_ops.append(variable_averages.apply(moving_average_variables))
+
+        opt = self._configure_optimizer(learning_rate)
+
+        from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+        from npu_bridge.estimator.npu.npu_loss_scale_optimizer import NPULossScaleOptimizer
+        from npu_bridge.estimator.npu.npu_loss_scale_manager import FixedLossScaleManager
+        from npu_bridge.estimator.npu.npu_loss_scale_manager import ExponentialUpdateLossScaleManager
+        loss_scale_manager = FixedLossScaleManager(loss_scale=4096)
+        # loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=1024, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5)
+        # The loss-scale optimizer takes is_distributed=True only when more
+        # than one device takes part.
+        if self._rank_size() == 1:
+            opt = NPULossScaleOptimizer(opt, loss_scale_manager)
+        else:
+            opt = NPULossScaleOptimizer(opt, loss_scale_manager, is_distributed=True)
+        opt = NPUDistributedOptimizer(opt)
+
+        update_op = tf.group(*update_ops)
+        with tf.control_dependencies([update_op]):
+            grads_and_vars = opt.compute_gradients(loss)
+            train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
+
+        return train_op
@@ -0,0 +1,133 @@
+import tensorflow as tf
+from dataloader import data_provider
+from datasets import dataset_factory
+from nets import nets_factory
+import os
+
+
+class EstimatorImpl:
+    """Estimator-based training driver built on an ``env`` helper object."""
+
+    def __init__(self, env):
+        """Keeps the ``env`` object that supplies FLAGS and graph helpers."""
+        self.env = env
+
+    def model_fn(self, features, labels, mode, params):
+        """Builds the EstimatorSpec for training or evaluation.
+
+        Args:
+            features: input tensor passed to the network.
+            labels: label tensor; ``tf.argmax(labels, 1)`` is applied to it, so
+                it is presumably one-hot -- confirm against the data provider.
+            mode: a ``tf.estimator.ModeKeys`` value; TRAIN and EVAL only.
+            params: unused.
+
+        Returns:
+            A ``tf.estimator.EstimatorSpec`` for the requested mode.
+
+        Raises:
+            ValueError: if ``mode`` is neither TRAIN nor EVAL.
+        """
+        num_classes = 1001
+
+        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
+
+        if mode == tf.estimator.ModeKeys.TRAIN:
+            network_fn = nets_factory.get_network_fn(
+                self.env.FLAGS.model_name,
+                num_classes=(num_classes - self.env.FLAGS.labels_offset),
+                weight_decay=self.env.FLAGS.weight_decay,
+                is_training=True)
+
+            logits = self.env.calc_logits(network_fn, features)
+            loss, total_loss = self.env.calc_loss(logits, labels)
+
+            # Named identity ops expose the accuracy and loss tensors under
+            # stable names ('train_accuracy', 'train_loss').
+            predictions = tf.argmax(logits, 1)
+            accuracy_ops = tf.metrics.accuracy(tf.argmax(labels, 1), predictions)
+            tf.identity(accuracy_ops[1], name='train_accuracy')
+
+            tf.identity(total_loss, 'train_loss')
+
+            global_step = tf.train.get_or_create_global_step()
+            train_op = self.env.create_train_op(global_step, summaries, total_loss)
+
+            estimator_spec = tf.estimator.EstimatorSpec(
+                mode=tf.estimator.ModeKeys.TRAIN, loss=total_loss, train_op=train_op)
+
+        elif mode == tf.estimator.ModeKeys.EVAL:
+            network_fn = nets_factory.get_network_fn(
+                self.env.FLAGS.model_name,
+                num_classes=(num_classes - self.env.FLAGS.labels_offset),
+                weight_decay=self.env.FLAGS.weight_decay,
+                is_training=False)
+
+            logits = self.env.calc_logits(network_fn, features)
+            loss, total_loss = self.env.calc_loss(logits, labels)
+            predictions = tf.argmax(logits, 1)
+            accuracy_ops = tf.metrics.accuracy(tf.argmax(labels, 1), predictions)
+            tf.identity(accuracy_ops[1], name='eval_accuracy')
+            estimator_spec = tf.estimator.EstimatorSpec(
+                mode=tf.estimator.ModeKeys.EVAL,
+                loss=total_loss, eval_metric_ops={'accuracy': accuracy_ops})
+
+        else:
+            # Fail with a clear message instead of an UnboundLocalError at the
+            # return statement below.
+            raise ValueError('Unsupported estimator mode: %s' % mode)
+
+        return estimator_spec
+
+    def main(self):
+        """Configures the NPU estimator and runs training."""
+        logdir = self.env.create_logdir()
+
+        from logger import LogSessionRunHook
+
+        # RANK_SIZE defaults to 1 when unset, so int() never receives None.
+        rank_size = int(os.getenv('RANK_SIZE', '1'))
+        iterations_per_loop = self.env.FLAGS.iterations_per_loop
+        if iterations_per_loop is None:
+            iterations_per_loop = self.env.calc_steps_per_epoch()
+
+        config = {
+            'num_training_samples': self.env.num_samples,
+            # for 1p, just per loop print, for 8p, print each epoch
+            'display_every': 1,
+            'log_name': 'train_log.log',
+            'log_dir': logdir,
+            'global_batch_size': self.env.FLAGS.batch_size * rank_size,
+            'iterations_per_loop': iterations_per_loop
+        }
+
+        hooks = [LogSessionRunHook(config, warmup_steps=self.env.FLAGS.warmup_epochs * self.env.calc_steps_per_epoch())]
+
+        #################################################################
+        from npu_bridge.estimator.npu.npu_config import NPURunConfig
+        from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
+
+        self.estimator_config = tf.ConfigProto(
+            inter_op_parallelism_threads=10,
+            intra_op_parallelism_threads=10,
+            allow_soft_placement=True)
+
+        self.estimator_config.gpu_options.allow_growth = True
+
+        gpu_thread_count = 2
+        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
+        os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
+        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
+        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
+
+        # One checkpoint is saved per epoch; only the latest five are kept.
+        run_config = NPURunConfig(
+            hcom_parallel=True,
+            precision_mode="allow_mix_precision",
+            enable_data_pre_proc=True,
+            save_checkpoints_steps=self.env.calc_steps_per_epoch(),
+            session_config=self.estimator_config,
+            model_dir=logdir,
+            iterations_per_loop=config['iterations_per_loop'],
+            keep_checkpoint_max=5)
+
+        classifier = NPUEstimator(
+            model_fn=self.model_fn,
+            config=run_config
+        )
+        ###################################################################
+
+        classifier.train(
+            input_fn=self.train_data,
+            max_steps=self.env.FLAGS.max_number_of_steps,
+            hooks=hooks,
+        )
+
+    def _input_dataset(self, split_name, is_training):
+        """Returns the second value of data_provider.get_data for one split."""
+        flags = self.env.FLAGS
+        dataset = dataset_factory.get_dataset(flags.dataset_name, split_name, flags.dataset_dir)
+
+        # Fall back to the model name when no preprocessing name is given.
+        preprocessing_name = flags.preprocessing_name or flags.model_name
+        _, ds = data_provider.get_data(dataset, flags.batch_size,
+                                       dataset.num_classes, flags.labels_offset, is_training,
+                                       preprocessing_name, flags.use_grayscale)
+
+        return ds
+
+    def train_data(self):
+        """Input function for training: the 'train' split in training mode."""
+        return self._input_dataset('train', True)
+
+    def eval_data(self):
+        """Input function for evaluation: the 'validation' split."""
+        return self._input_dataset('validation', False)
@@ -0,0 +1,174 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generic evaluation script that evaluates a model using a given dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import tensorflow as tf
+from tensorflow.contrib import quantize as contrib_quantize
+from tensorflow.contrib import slim as contrib_slim
+from benchmark_log import hwlog
+from datasets import dataset_factory
+from nets import nets_factory
+import os
+# NOTE(review): GPU index 4 is hard-coded and overrides any value already set
+# in the environment.
+os.environ["CUDA_VISIBLE_DEVICES"] = '4'
+
+slim = contrib_slim
+
+tf.app.flags.DEFINE_integer(
+    'batch_size', 100, 'The number of samples in each batch.')
+
+tf.app.flags.DEFINE_integer(
+    'max_num_batches', None,
+    'Max number of batches to evaluate by default use all.')
+
+tf.app.flags.DEFINE_string(
+    'master', '', 'The address of the TensorFlow master to use.')
+
+# NOTE(review): this default points at one specific training run directory;
+# pass --checkpoint_path / --eval_dir to evaluate anything else.
+ckpt_path = './results/0526023335_train_hvdTrue_mnmobilenet_v2_augmentedTrue_mixedpFalse_lr0.4_optmomentum_me200_lrdtcosine_annealing_nepd0.3125_lrdf0.98_b256_me_param'
+# ckpt_path = './results/0523130615_train_hvdTrue_mnmobilenet_v2_augmentedTrue_mixedpFalse_lr0.4_optmomentum_me200_lrdtcosine_annealing_nepd0.3125_lrdf0.98_b256_me_param'
+tf.app.flags.DEFINE_string(
+    'checkpoint_path', ckpt_path,
+    'The directory where the model was written to or an absolute path to a '
+    'checkpoint file.')
+
+tf.app.flags.DEFINE_string(
+    'eval_dir', ckpt_path, 'Directory where the results are saved to.')
+
+tf.app.flags.DEFINE_integer(
+    'num_preprocessing_threads', 4,
+    'The number of threads used to create the batches.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_name', 'imagenet', 'The name of the dataset to load.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_split_name', 'validation', 'The name of the train/test split.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_dir', '/data/Datasets/imagenet_TF', 'The directory where the dataset files are stored.')
+
+tf.app.flags.DEFINE_integer(
+    'labels_offset', 0,
+    'An offset for the labels in the dataset. This flag is primarily used to '
+    'evaluate the VGG and ResNet architectures which do not use a background '
+    'class for the ImageNet dataset.')
+
+tf.app.flags.DEFINE_string(
+    'model_name', 'mobilenet_v2', 'The name of the architecture to evaluate.')
+
+tf.app.flags.DEFINE_string(
+    'preprocessing_name', None, 'The name of the preprocessing to use. If left '
+    'as `None`, then the model_name flag is used.')
+
+tf.app.flags.DEFINE_float(
+    'moving_average_decay', None,
+    'The decay to use for the moving average.'
+    'If left as None, then moving averages are not used.')
+
+# NOTE(review): main() in this file reshapes images to a fixed 224x224 and
+# does not read this flag.
+tf.app.flags.DEFINE_integer(
+    'eval_image_size', None, 'Eval image size')
+
+tf.app.flags.DEFINE_bool(
+    'quantize', False, 'whether to use quantized graph or not.')
+
+tf.app.flags.DEFINE_bool('use_grayscale', False,
+                         'Whether to convert input images to grayscale.')
+
+# Flag values are read through this object inside main().
+FLAGS = tf.app.flags.FLAGS
+
+
+def main(_):
+  """Evaluates a checkpoint on the selected dataset split and logs accuracy.
+
+  Builds the evaluation graph, restores the checkpoint named by
+  --checkpoint_path (the latest one when it is a directory), runs the
+  accuracy update op once per batch, then prints and logs the result.
+
+  Raises:
+    ValueError: if --dataset_dir is empty, or if no checkpoint can be found
+      under --checkpoint_path.
+  """
+  if not FLAGS.dataset_dir:
+    raise ValueError('You must supply the dataset directory with --dataset_dir')
+
+  tf.logging.set_verbosity(tf.logging.INFO)
+  with tf.Graph().as_default():
+    # The return value is not needed; the call only makes sure the global
+    # step exists in this graph.
+    slim.get_or_create_global_step()
+
+    ######################
+    # Select the dataset #
+    ######################
+    dataset = dataset_factory.get_dataset(
+        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
+
+    ####################
+    # Select the model #
+    ####################
+    network_fn = nets_factory.get_network_fn(
+        FLAGS.model_name,
+        num_classes=(dataset.num_classes - FLAGS.labels_offset),
+        is_training=False)
+
+    from dataloader import data_provider
+    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
+    iterator, _ = data_provider.get_data(dataset, FLAGS.batch_size,
+                                         dataset.num_classes, FLAGS.labels_offset, is_training=False,
+                                         preprocessing_name=preprocessing_name,
+                                         use_grayscale=FLAGS.use_grayscale,
+                                         hvd=None, enable_hvd=None)
+    images, labels = iterator.get_next()  # label: [100, 1001]
+    # NOTE(review): the input size is fixed at 224x224x3 here; the
+    # eval_image_size flag is not consulted.
+    images = tf.reshape(images, [FLAGS.batch_size, 224, 224, 3])  # (100, 224, 224, 3), float32
+    labels = tf.argmax(labels, axis=1)  # [100]
+    logits, _ = network_fn(images)
+
+    if FLAGS.quantize:
+      contrib_quantize.create_eval_graph()
+
+    predictions = tf.argmax(logits, 1)
+    labels = tf.squeeze(labels)
+    eval_accuracy, metric_update_op = tf.metrics.accuracy(labels, predictions)
+
+    # tf.summary.scalar('top1_acc', top1_accu)
+    # summaries_op = tf.summary.merge_all()
+
+    # TODO(sguada) use num_epochs=1
+    if FLAGS.max_num_batches:
+      num_batches = FLAGS.max_num_batches
+    else:
+      # This ensures that we make a single pass over all of the data.
+      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))
+    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
+      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
+    else:
+      checkpoint_path = FLAGS.checkpoint_path
+
+    # latest_checkpoint() returns None when the directory holds no
+    # checkpoint; stop here rather than try to restore the string 'None'.
+    if checkpoint_path is None:
+      raise ValueError(
+          'No checkpoint found in [%s]' % FLAGS.checkpoint_path)
+
+    ##### evaluate #####
+    tf.logging.info('Evaluating %s' % checkpoint_path)
+    saver = tf.train.Saver()
+    from time import gmtime, strftime
+    logdir = "results/%s" % strftime("%m%d%H%M%S_evel", gmtime())
+    # summary_writer = tf.summary.FileWriter(logdir=logdir, graph=tf.get_default_graph())
+    with tf.Session() as sess:
+      sess.run(iterator.initializer)
+      sess.run(tf.global_variables_initializer())
+      # Local variables hold the counters behind tf.metrics.accuracy.
+      sess.run(tf.local_variables_initializer())
+      saver.restore(sess, f'{checkpoint_path}')
+      tf.train.write_graph(sess.graph, logdir, 'graph.pbtxt')
+
+      for step in range(num_batches):
+        _metric_update_op = sess.run([metric_update_op])
+        print(f'{step}, _metric_update_op: {_metric_update_op}')
+
+      acc = sess.run([eval_accuracy])
+      print(f'acc: {acc}')
+      hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=f'{acc}')
+
+
+# Script entry point: tf.app.run() takes over when run directly.
+if __name__ == '__main__':
+  tf.app.run()
@@ -0,0 +1,181 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generic evaluation script that evaluates a model using a given dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import tensorflow as tf
+from tensorflow.contrib import quantize as contrib_quantize
+from tensorflow.contrib import slim as contrib_slim
+
+from datasets import dataset_factory
+from nets import nets_factory
+from benchmark_log import hwlog
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = '4'  # NOTE(review): unconditionally overwrites any CUDA_VISIBLE_DEVICES already set in the environment with '4' - confirm this pin is intended.
+
+slim = contrib_slim
+
+tf.app.flags.DEFINE_integer(
+    'batch_size', 256, 'The number of samples in each batch.')
+
+tf.app.flags.DEFINE_integer(
+    'max_num_batches', None,
+    'Max number of batches to evaluate by default use all.')
+
+tf.app.flags.DEFINE_string(
+    'master', '', 'The address of the TensorFlow master to use.')
+
+tf.app.flags.DEFINE_string(
+    'checkpoint_path', 'ckpt_path',
+    'The directory where the model was written to or an absolute path to a '
+    'checkpoint file.')
+
+tf.app.flags.DEFINE_string(
+    'eval_dir', 'ckpt_path', 'Directory where the results are saved to.')
+
+tf.app.flags.DEFINE_integer(
+    'num_preprocessing_threads', 4,
+    'The number of threads used to create the batches.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_name', 'imagenet', 'The name of the dataset to load.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_split_name', 'validation', 'The name of the train/test split.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_dir', '/opt/npu/slimImagenet', 'The directory where the dataset files are stored.')
+
+tf.app.flags.DEFINE_integer(
+    'labels_offset', 0,
+    'An offset for the labels in the dataset. This flag is primarily used to '
+    'evaluate the VGG and ResNet architectures which do not use a background '
+    'class for the ImageNet dataset.')
+
+tf.app.flags.DEFINE_string(
+    'model_name', 'mobilenet_v2', 'The name of the architecture to evaluate.')
+
+tf.app.flags.DEFINE_string(
+    'preprocessing_name', None, 'The name of the preprocessing to use. If left '
+    'as `None`, then the model_name flag is used.')
+
+tf.app.flags.DEFINE_float(
+    'moving_average_decay', None,
+    'The decay to use for the moving average.'
+    'If left as None, then moving averages are not used.')
+
+tf.app.flags.DEFINE_integer(
+    'eval_image_size', None, 'Eval image size')
+
+tf.app.flags.DEFINE_bool(
+    'quantize', False, 'whether to use quantized graph or not.')
+
+tf.app.flags.DEFINE_bool('use_grayscale', False,
+                         'Whether to convert input images to grayscale.')
+
+FLAGS = tf.app.flags.FLAGS
+
+
+def main(_):
+  if not FLAGS.dataset_dir:
+    raise ValueError('You must supply the dataset directory with --dataset_dir')
+
+  tf.logging.set_verbosity(tf.logging.INFO)
+  with tf.Graph().as_default():
+    tf_global_step = slim.get_or_create_global_step()
+
+    ######################
+    # Select the dataset #
+    ######################
+    dataset = dataset_factory.get_dataset(
+        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
+
+    ####################
+    # Select the model #
+    ####################
+    network_fn = nets_factory.get_network_fn(
+        FLAGS.model_name,
+        num_classes=(dataset.num_classes - FLAGS.labels_offset),
+        is_training=False)
+
+    from dataloader import data_provider
+    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
+
+    iterator, _ = data_provider.get_data(dataset, FLAGS.batch_size,
+                                         dataset.num_classes, FLAGS.labels_offset, is_training=False,
+                                         preprocessing_name=preprocessing_name,
+                                         use_grayscale=FLAGS.use_grayscale)
+    #tf.logging.info('iterator %s' % iterator)
+    images, labels = iterator.get_next()  # label: [100, 1001]
+    images = tf.reshape(images, [FLAGS.batch_size, 224, 224, 3])  # (100, 224, 224, 3), float32
+    labels = tf.argmax(labels, axis=1)  # [100]
+    logits, _ = network_fn(images)
+
+    if FLAGS.quantize:
+      contrib_quantize.create_eval_graph()
+
+    predictions = tf.argmax(logits, 1)
+    labels = tf.squeeze(labels)
+    eval_accuracy, metric_update_op = tf.metrics.accuracy(labels, predictions)
+    #hwlog.remark_print(key=hwlog.EVAL_ACCURACY, value="".format(eval_accuracy))
+
+    # tf.summary.scalar('top1_acc', top1_accu)
+    # summaries_op = tf.summary.merge_all()
+
+    # TODO(sguada) use num_epochs=1
+    if FLAGS.max_num_batches:
+      num_batches = FLAGS.max_num_batches
+    else:
+      # This ensures that we make a single pass over all of the data.
+      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size)) - 1
+
+    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
+      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
+    else:
+      checkpoint_path = FLAGS.checkpoint_path
+    # checkpoint_path = '/opt/npu/models/mobilenetv2_v0.1/ckpt/model.ckpt'
+    print(dataset.num_samples)
+    print(FLAGS.batch_size)
+    hwlog.remark_print(key=hwlog.GLOBAL_BATCH_SIZE, value=FLAGS.batch_size)
+    ##### evaluate #####
+    tf.logging.info('Evaluating %s' % checkpoint_path)
+    saver = tf.train.Saver()
+    from time import gmtime, strftime
+    logdir = "ckpt/%s" % strftime("%m%d%H%M%S_evel", gmtime())
+    # summary_writer = tf.summary.FileWriter(logdir=logdir, graph=tf.get_default_graph())
+    with tf.Session() as sess:
+      sess.run(iterator.initializer)
+      sess.run(tf.global_variables_initializer())
+      sess.run(tf.local_variables_initializer())
+      saver.restore(sess, f'{checkpoint_path}')
+      # saver.restore(sess, 'result/8p/2/results/model.ckpt-3750')
+      tf.train.write_graph(sess.graph, logdir, 'graph.pbtxt')
+
+      for step in range(num_batches):
+        _metric_update_op = sess.run([metric_update_op])
+        print(f'{step}, _metric_update_op: {_metric_update_op}')
+        hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=f'{step}')
+        hwlog.remark_print(key=hwlog.EVAL_ACCURACY, value=f'{_metric_update_op}')
+      acc = sess.run([eval_accuracy])
+      print(f'acc: {acc}')
+      hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=f'{acc}')
+
+
+if __name__ == '__main__':
+  tf.app.run()  # Parses the command-line flags, then calls main().
@@ -0,0 +1,164 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Saves out a GraphDef containing the architecture of the model.
+
+To use it, run something like this, with a model name defined by slim:
+
+bazel build tensorflow_models/research/slim:export_inference_graph
+bazel-bin/tensorflow_models/research/slim/export_inference_graph \
+--model_name=inception_v3 --output_file=/tmp/inception_v3_inf_graph.pb
+
+If you then want to use the resulting model with your own or pretrained
+checkpoints as part of a mobile model, you can run freeze_graph to get a graph
+def with the variables inlined as constants using:
+
+bazel build tensorflow/python/tools:freeze_graph
+bazel-bin/tensorflow/python/tools/freeze_graph \
+--input_graph=/tmp/inception_v3_inf_graph.pb \
+--input_checkpoint=/tmp/checkpoints/inception_v3.ckpt \
+--input_binary=true --output_graph=/tmp/frozen_inception_v3.pb \
+--output_node_names=InceptionV3/Predictions/Reshape_1
+
+The output node names will vary depending on the model, but you can inspect and
+estimate them using the summarize_graph tool:
+
+bazel build tensorflow/tools/graph_transforms:summarize_graph
+bazel-bin/tensorflow/tools/graph_transforms/summarize_graph \
+--in_graph=/tmp/inception_v3_inf_graph.pb
+
+To run the resulting graph in C++, you can look at the label_image sample code:
+
+bazel build tensorflow/examples/label_image:label_image
+bazel-bin/tensorflow/examples/label_image/label_image \
+--image=${HOME}/Pictures/flowers.jpg \
+--input_layer=input \
+--output_layer=InceptionV3/Predictions/Reshape_1 \
+--graph=/tmp/frozen_inception_v3.pb \
+--labels=/tmp/imagenet_slim_labels.txt \
+--input_mean=0 \
+--input_std=255
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+
+import tensorflow as tf
+from tensorflow.contrib import quantize as contrib_quantize
+from tensorflow.contrib import slim as contrib_slim
+
+from tensorflow.python.platform import gfile
+from datasets import dataset_factory
+from nets import nets_factory
+
+
+slim = contrib_slim
+
+tf.app.flags.DEFINE_string(
+    'model_name', 'inception_v3', 'The name of the architecture to save.')
+
+tf.app.flags.DEFINE_boolean(
+    'is_training', False,
+    'Whether to save out a training-focused version of the model.')
+
+tf.app.flags.DEFINE_integer(
+    'image_size', None,
+    'The image size to use, otherwise use the model default_image_size.')
+
+tf.app.flags.DEFINE_integer(
+    'batch_size', None,
+    'Batch size for the exported model. Defaulted to "None" so batch size can '
+    'be specified at model runtime.')
+
+tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
+                           'The name of the dataset to use with the model.')
+
+tf.app.flags.DEFINE_integer(
+    'labels_offset', 0,
+    'An offset for the labels in the dataset. This flag is primarily used to '
+    'evaluate the VGG and ResNet architectures which do not use a background '
+    'class for the ImageNet dataset.')
+
+tf.app.flags.DEFINE_string(
+    'output_file', '', 'Where to save the resulting file to.')
+
+tf.app.flags.DEFINE_string(
+    'dataset_dir', '', 'Directory to save intermediate dataset files to')
+
+tf.app.flags.DEFINE_bool(
+    'quantize', False, 'whether to use quantized graph or not.')
+
+tf.app.flags.DEFINE_bool(
+    'is_video_model', False, 'whether to use 5-D inputs for video model.')
+
+tf.app.flags.DEFINE_integer(
+    'num_frames', None,
+    'The number of frames to use. Only used if is_video_model is True.')
+
+tf.app.flags.DEFINE_bool('write_text_graphdef', False,
+                         'Whether to write a text version of graphdef.')
+
+tf.app.flags.DEFINE_bool('use_grayscale', False,
+                         'Whether to convert input images to grayscale.')
+
+FLAGS = tf.app.flags.FLAGS
+
+
+def main(_):
+  if not FLAGS.output_file:
+    raise ValueError('You must supply the path to save to with --output_file')
+  if FLAGS.is_video_model and not FLAGS.num_frames:
+    raise ValueError(
+        'Number of frames must be specified for video models with --num_frames')
+  tf.logging.set_verbosity(tf.logging.INFO)
+  with tf.Graph().as_default() as graph:
+    dataset = dataset_factory.get_dataset(FLAGS.dataset_name, 'train',
+                                          FLAGS.dataset_dir)
+    network_fn = nets_factory.get_network_fn(
+        FLAGS.model_name,
+        num_classes=(dataset.num_classes - FLAGS.labels_offset),
+        is_training=FLAGS.is_training)
+    image_size = FLAGS.image_size or network_fn.default_image_size
+    num_channels = 1 if FLAGS.use_grayscale else 3
+    if FLAGS.is_video_model:
+      input_shape = [
+          FLAGS.batch_size, FLAGS.num_frames, image_size, image_size,
+          num_channels
+      ]
+    else:
+      input_shape = [FLAGS.batch_size, image_size, image_size, num_channels]
+    placeholder = tf.placeholder(name='input', dtype=tf.float32,
+                                 shape=input_shape)
+    network_fn(placeholder)
+
+    if FLAGS.quantize:
+      contrib_quantize.create_eval_graph()
+
+    graph_def = graph.as_graph_def()
+    if FLAGS.write_text_graphdef:
+      tf.io.write_graph(
+          graph_def,
+          os.path.dirname(FLAGS.output_file),
+          os.path.basename(FLAGS.output_file),
+          as_text=True)
+    else:
+      with gfile.GFile(FLAGS.output_file, 'wb') as f:
+        f.write(graph_def.SerializeToString())
+
+
+if __name__ == '__main__':
+  tf.app.run()  # Parses the command-line flags, then calls main().
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for export_inference_graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+
+import tensorflow as tf
+
+from tensorflow.python.platform import gfile
+import export_inference_graph
+
+
+class ExportInferenceGraphTest(tf.test.TestCase):
+
+  def testExportInferenceGraph(self):
+    tmpdir = self.get_temp_dir()
+    output_file = os.path.join(tmpdir, 'inception_v3.pb')
+    flags = tf.app.flags.FLAGS
+    flags.output_file = output_file
+    flags.model_name = 'inception_v3'
+    flags.dataset_dir = tmpdir
+    export_inference_graph.main(None)
+    self.assertTrue(gfile.Exists(output_file))
+
+if __name__ == '__main__':
+  tf.test.main()  # Runs the test cases defined in this module.
@@ -0,0 +1,44 @@
+import tensorflow as tf
+import numpy as np
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+      float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
+def get_custom_getter(compute_type):
+    return float32_variable_storage_getter if compute_type == tf.float16 else None
+
+
+
+def float32_variable_storage_getter_1(getter, name, shape=None, dtype=None,
+                                      initializer=None, regularizer=None,
+                                      trainable=True,
+                                      *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+       float32 precision and then casts them to the training precision.
+    """
+    dtype = tf.float16
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
+def get_custom_getter_1(compute_type):
+    return float32_variable_storage_getter_1 if compute_type == tf.float16 else None
+
@@ -0,0 +1,86 @@
+from __future__ import print_function
+
+import datetime
+import logging
+import os
+import sys
+import time
+from benchmark_log import hwlog
+import numpy as np
+import tensorflow as tf
+
+
+class LogSessionRunHook(tf.train.SessionRunHook):
+    def __init__(self, config, warmup_steps=5):
+        self.global_batch_size = config['global_batch_size']
+        self.iterations_per_loop = config['iterations_per_loop']
+        self.warmup_steps = warmup_steps
+        self.iter_times = []
+        self.num_records = config['num_training_samples']
+        self.display_every = config['display_every']
+        self.logger = get_logger(config['log_name'], config['log_dir'])
+        rank0log(self.logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))
+
+    def after_create_session(self, session, coord):
+        rank0log(self.logger, 'Step   Epoch   Speed   Loss   FinLoss   LR')
+        self.elapsed_secs = 0.
+        self.count = 0
+
+    def before_run(self, run_context):
+        self.t0 = time.time()
+        return tf.train.SessionRunArgs(
+            fetches=[tf.train.get_global_step(), 'loss:0', 'total_loss:0', 'learning_rate:0',
+                     'train_accuracy:0'])
+
+    def after_run(self, run_context, run_values):
+        batch_time = time.time() - self.t0
+        self.iter_times.append(batch_time)
+        self.elapsed_secs += batch_time
+        self.count += 1
+        global_step, loss, total_loss, lr, train_accuracy = run_values.results
+        if global_step == 1 or global_step % self.display_every == 0:
+            dt = self.elapsed_secs / self.count
+            img_per_sec = self.global_batch_size * self.iterations_per_loop / dt
+            epoch = global_step * self.global_batch_size / self.num_records
+            self.logger.info(f'step:{global_step}  epoch:{epoch} ips:{img_per_sec} '
+                             f'loss:{loss}  total_loss:{total_loss}  lr:{lr}, '
+                             f'train_accuracy:{train_accuracy}')
+
+            hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=f"{global_step}")
+            hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=f"{epoch}")
+            hwlog.remark_print(key=hwlog.TRAIN_ACCURACY, value=f"{train_accuracy}")
+            hwlog.remark_print(key=hwlog.FPS, value=f"{img_per_sec}")
+            self.elapsed_secs = 0.
+            self.count = 0
+
+    def get_average_speed(self):
+        avg_time = np.mean(self.iter_times[self.warmup_steps:])
+        speed = self.global_batch_size / avg_time
+        return speed
+
+
+def rank0log(logger, *args, **kwargs):
+    if logger:
+        logger.info(''.join([str(x) for x in list(args)]))
+    else:
+        print(*args, **kwargs)
+
+
+def get_logger(log_name, log_dir):
+    logger = logging.getLogger(log_name)
+    logger.setLevel(logging.INFO)  # INFO, ERROR
+    if not os.path.isdir(log_dir):
+        try:
+            os.makedirs(log_dir)
+        except FileExistsError:
+            pass
+    ch = logging.StreamHandler()
+    ch.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(message)s')
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+    fh = logging.FileHandler(os.path.join(log_dir, log_name))
+    fh.setLevel(logging.DEBUG)
+    fh.setFormatter(formatter)
+    logger.addHandler(fh)
+    return logger
@@ -0,0 +1 @@
+
@@ -0,0 +1,148 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains a model definition for AlexNet.
+
+This work was first described in:
+  ImageNet Classification with Deep Convolutional Neural Networks
+  Alex Krizhevsky, Ilya Sutskever and Geoffrey E. Hinton
+
+and later refined in:
+  One weird trick for parallelizing convolutional neural networks
+  Alex Krizhevsky, 2014
+
+Here we provide the implementation proposed in "One weird trick" and not
+"ImageNet Classification", as per the paper, the LRN layers have been removed.
+
+Usage:
+  with slim.arg_scope(alexnet.alexnet_v2_arg_scope()):
+    outputs, end_points = alexnet.alexnet_v2(inputs)
+
+@@alexnet_v2
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+slim = contrib_slim
+
+# pylint: disable=g-long-lambda
+trunc_normal = lambda stddev: tf.compat.v1.truncated_normal_initializer(
+    0.0, stddev)
+
+
+def alexnet_v2_arg_scope(weight_decay=0.0005):
+    with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                        activation_fn=tf.nn.relu,
+                        biases_initializer=tf.compat.v1.constant_initializer(0.1),
+                        weights_regularizer=slim.l2_regularizer(weight_decay)):
+        with slim.arg_scope([slim.conv2d], padding='SAME'):
+            with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
+                return arg_sc
+
+
+def alexnet_v2(inputs,
+               num_classes=1000,
+               is_training=True,
+               dropout_keep_prob=0.5,
+               spatial_squeeze=True,
+               scope='alexnet_v2',
+               global_pool=False):
+    """AlexNet version 2.
+
+    Described in: http://arxiv.org/pdf/1404.5997v2.pdf
+    Parameters from:
+    github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
+    layers-imagenet-1gpu.cfg
+
+    Note: All the fully_connected layers have been transformed to conv2d layers.
+          To use in classification mode, resize input to 224x224 or set
+          global_pool=True. To use in fully convolutional mode, set
+          spatial_squeeze to false.
+          The LRN layers have been removed and change the initializers from
+          random_normal_initializer to xavier_initializer.
+
+    Args:
+      inputs: a tensor of size [batch_size, height, width, channels].
+      num_classes: the number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer are returned instead.
+      is_training: whether or not the model is being trained.
+      dropout_keep_prob: the probability that activations are kept in the dropout
+        layers during training.
+      spatial_squeeze: whether or not should squeeze the spatial dimensions of the
+        logits. Useful to remove unnecessary dimensions for classification.
+      scope: Optional scope for the variables.
+      global_pool: Optional boolean flag. If True, the input to the classification
+        layer is avgpooled to size 1x1, for any input size. (This is not part
+        of the original AlexNet.)
+
+    Returns:
+      net: the output of the logits layer (if num_classes is a non-zero integer),
+        or the non-dropped-out input to the logits layer (if num_classes is 0
+        or None).
+      end_points: a dict of tensors with intermediate activations.
+    """
+    with tf.compat.v1.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
+        end_points_collection = sc.original_name_scope + '_end_points'  # Name of the collection that gathers the layer outputs.
+        # Collect outputs for conv2d, fully_connected and max_pool2d.
+        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
+                            outputs_collections=[end_points_collection]):
+            net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID',
+                              scope='conv1')
+            net = slim.max_pool2d(net, [3, 3], 2, scope='pool1')
+            net = slim.conv2d(net, 192, [5, 5], scope='conv2')
+            net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')
+            net = slim.conv2d(net, 384, [3, 3], scope='conv3')
+            net = slim.conv2d(net, 384, [3, 3], scope='conv4')
+            net = slim.conv2d(net, 256, [3, 3], scope='conv5')
+            net = slim.max_pool2d(net, [3, 3], 2, scope='pool5')
+
+            # Use conv2d instead of fully_connected layers.
+            with slim.arg_scope(
+                    [slim.conv2d],
+                    weights_initializer=trunc_normal(0.005),
+                    biases_initializer=tf.compat.v1.constant_initializer(0.1)):
+                net = slim.conv2d(net, 4096, [5, 5], padding='VALID',  # 'fc6' is built as a 5x5 VALID convolution.
+                                  scope='fc6')
+                net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
+                                   scope='dropout6')
+                net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
+                # Convert end_points_collection into a end_point dict.
+                end_points = slim.utils.convert_collection_to_dict(  # Converted before global_pool/fc8 exist; both are added to the dict by hand below.
+                    end_points_collection)
+                if global_pool:
+                    net = tf.reduce_mean(
+                        input_tensor=net, axis=[1, 2], keepdims=True, name='global_pool')
+                    end_points['global_pool'] = net
+                if num_classes:  # Skipped for 0/None, so `net` stays the input to the logits layer.
+                    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
+                                       scope='dropout7')
+                    net = slim.conv2d(
+                        net,
+                        num_classes, [1, 1],
+                        activation_fn=None,
+                        normalizer_fn=None,
+                        biases_initializer=tf.compat.v1.zeros_initializer(),
+                        scope='fc8')
+                    if spatial_squeeze:
+                        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
+                    end_points[sc.name + '/fc8'] = net  # Stores the logits after the optional squeeze.
+            return net, end_points
+
+
+alexnet_v2.default_image_size = 224  # Default input size, attached to the function as an attribute.
@@ -0,0 +1,181 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slim.nets.alexnet."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import alexnet
+
+slim = contrib_slim
+
+
+class AlexnetV2Test(tf.test.TestCase):
+
+  def testBuild(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = alexnet.alexnet_v2(inputs, num_classes)
+      self.assertEquals(logits.op.name, 'alexnet_v2/fc8/squeezed')
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+
+  def testFullyConvolutional(self):
+    batch_size = 1
+    height, width = 300, 400
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = alexnet.alexnet_v2(inputs, num_classes, spatial_squeeze=False)
+      self.assertEquals(logits.op.name, 'alexnet_v2/fc8/BiasAdd')
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, 4, 7, num_classes])
+
+  def testGlobalPool(self):
+    batch_size = 1
+    height, width = 256, 256
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = alexnet.alexnet_v2(inputs, num_classes, spatial_squeeze=False,
+                                     global_pool=True)
+      self.assertEquals(logits.op.name, 'alexnet_v2/fc8/BiasAdd')
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, 1, 1, num_classes])
+
+  def testEndPoints(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      _, end_points = alexnet.alexnet_v2(inputs, num_classes)
+      expected_names = ['alexnet_v2/conv1',
+                        'alexnet_v2/pool1',
+                        'alexnet_v2/conv2',
+                        'alexnet_v2/pool2',
+                        'alexnet_v2/conv3',
+                        'alexnet_v2/conv4',
+                        'alexnet_v2/conv5',
+                        'alexnet_v2/pool5',
+                        'alexnet_v2/fc6',
+                        'alexnet_v2/fc7',
+                        'alexnet_v2/fc8'
+                       ]
+      self.assertSetEqual(set(end_points.keys()), set(expected_names))
+
+  def testNoClasses(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = None
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      net, end_points = alexnet.alexnet_v2(inputs, num_classes)
+      expected_names = ['alexnet_v2/conv1',
+                        'alexnet_v2/pool1',
+                        'alexnet_v2/conv2',
+                        'alexnet_v2/pool2',
+                        'alexnet_v2/conv3',
+                        'alexnet_v2/conv4',
+                        'alexnet_v2/conv5',
+                        'alexnet_v2/pool5',
+                        'alexnet_v2/fc6',
+                        'alexnet_v2/fc7'
+                       ]
+      self.assertSetEqual(set(end_points.keys()), set(expected_names))
+      self.assertTrue(net.op.name.startswith('alexnet_v2/fc7'))
+      self.assertListEqual(net.get_shape().as_list(),
+                           [batch_size, 1, 1, 4096])
+
+  def testModelVariables(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      alexnet.alexnet_v2(inputs, num_classes)
+      expected_names = ['alexnet_v2/conv1/weights',
+                        'alexnet_v2/conv1/biases',
+                        'alexnet_v2/conv2/weights',
+                        'alexnet_v2/conv2/biases',
+                        'alexnet_v2/conv3/weights',
+                        'alexnet_v2/conv3/biases',
+                        'alexnet_v2/conv4/weights',
+                        'alexnet_v2/conv4/biases',
+                        'alexnet_v2/conv5/weights',
+                        'alexnet_v2/conv5/biases',
+                        'alexnet_v2/fc6/weights',
+                        'alexnet_v2/fc6/biases',
+                        'alexnet_v2/fc7/weights',
+                        'alexnet_v2/fc7/biases',
+                        'alexnet_v2/fc8/weights',
+                        'alexnet_v2/fc8/biases',
+                       ]
+      model_variables = [v.op.name for v in slim.get_model_variables()]
+      self.assertSetEqual(set(model_variables), set(expected_names))
+
+  def testEvaluation(self):
+    batch_size = 2
+    height, width = 224, 224
+    num_classes = 1000
+    with self.test_session():
+      eval_inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False)
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      predictions = tf.argmax(input=logits, axis=1)
+      self.assertListEqual(predictions.get_shape().as_list(), [batch_size])
+
+  def testTrainEvalWithReuse(self):
+    train_batch_size = 2
+    eval_batch_size = 1
+    train_height, train_width = 224, 224
+    eval_height, eval_width = 300, 400
+    num_classes = 1000
+    with self.test_session():
+      train_inputs = tf.random.uniform(
+          (train_batch_size, train_height, train_width, 3))
+      logits, _ = alexnet.alexnet_v2(train_inputs)
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [train_batch_size, num_classes])
+      tf.compat.v1.get_variable_scope().reuse_variables()
+      eval_inputs = tf.random.uniform(
+          (eval_batch_size, eval_height, eval_width, 3))
+      logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False,
+                                     spatial_squeeze=False)
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [eval_batch_size, 4, 7, num_classes])
+      logits = tf.reduce_mean(input_tensor=logits, axis=[1, 2])
+      predictions = tf.argmax(input=logits, axis=1)
+      self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size])
+
+  def testForward(self):
+    batch_size = 1
+    height, width = 224, 224
+    with self.test_session() as sess:
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = alexnet.alexnet_v2(inputs)
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(logits)
+      self.assertTrue(output.any())
+
+if __name__ == '__main__':
+  tf.test.main()  # Runs the test cases defined in this module.
@@ -0,0 +1,123 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains a variant of the CIFAR-10 model definition."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+slim = contrib_slim
+
+# pylint: disable=g-long-lambda
+trunc_normal = lambda stddev: tf.compat.v1.truncated_normal_initializer(
+    stddev=stddev)
+
+
+def cifarnet(images, num_classes=10, is_training=False,
+             dropout_keep_prob=0.5,
+             prediction_fn=slim.softmax,
+             scope='CifarNet'):
+  """Creates a variant of the CifarNet model.
+
+  Note that since the output is a set of 'logits', the values fall in the
+  interval of (-infinity, infinity). Consequently, to convert the outputs to a
+  probability distribution over the characters, one will need to convert them
+  using the softmax function:
+
+        logits = cifarnet.cifarnet(images, is_training=False)
+        probabilities = tf.nn.softmax(logits)
+        predictions = tf.argmax(logits, 1)
+
+  Args:
+    images: A batch of `Tensors` of size [batch_size, height, width, channels].
+    num_classes: the number of classes in the dataset. If 0 or None, the logits
+      layer is omitted and the input features to the logits layer are returned
+      instead.
+    is_training: specifies whether or not we're currently training the model.
+      This variable will determine the behaviour of the dropout layer.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    prediction_fn: a function to get predictions out of logits.
+    scope: Optional variable_scope.
+
+  Returns:
+    net: a 2D Tensor with the logits (pre-softmax activations) if num_classes
+      is a non-zero integer, or the input to the logits layer if num_classes
+      is 0 or None.
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+  """
+  end_points = {}
+
+  with tf.compat.v1.variable_scope(scope, 'CifarNet', [images]):
+    net = slim.conv2d(images, 64, [5, 5], scope='conv1')
+    end_points['conv1'] = net
+    net = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
+    end_points['pool1'] = net
+    net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm1')
+    net = slim.conv2d(net, 64, [5, 5], scope='conv2')
+    end_points['conv2'] = net
+    net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm2')
+    net = slim.max_pool2d(net, [2, 2], 2, scope='pool2')
+    end_points['pool2'] = net
+    net = slim.flatten(net)
+    end_points['Flatten'] = net
+    net = slim.fully_connected(net, 384, scope='fc3')
+    end_points['fc3'] = net
+    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
+                       scope='dropout3')
+    net = slim.fully_connected(net, 192, scope='fc4')
+    end_points['fc4'] = net
+    if not num_classes:
+      return net, end_points
+    logits = slim.fully_connected(
+        net,
+        num_classes,
+        biases_initializer=tf.compat.v1.zeros_initializer(),
+        weights_initializer=trunc_normal(1 / 192.0),
+        weights_regularizer=None,
+        activation_fn=None,
+        scope='logits')
+
+    end_points['Logits'] = logits
+    end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+
+  return logits, end_points
+cifarnet.default_image_size = 32
+
+
+def cifarnet_arg_scope(weight_decay=0.004):
+  """Defines the default cifarnet argument scope.
+
+  Args:
+    weight_decay: The weight decay to use for regularizing the model.
+
+  Returns:
+    An `arg_scope` to use for the inception v3 model.
+  """
+  with slim.arg_scope(
+      [slim.conv2d],
+      weights_initializer=tf.compat.v1.truncated_normal_initializer(
+          stddev=5e-2),
+      activation_fn=tf.nn.relu):
+    with slim.arg_scope(
+        [slim.fully_connected],
+        biases_initializer=tf.compat.v1.constant_initializer(0.1),
+        weights_initializer=trunc_normal(0.04),
+        weights_regularizer=slim.l2_regularizer(weight_decay),
+        activation_fn=tf.nn.relu) as sc:
+      return sc
@@ -0,0 +1,280 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines the CycleGAN generator and discriminator networks."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+from tensorflow.contrib import framework as contrib_framework
+from tensorflow.contrib import layers as contrib_layers
+from tensorflow.contrib import util as contrib_util
+
+layers = contrib_layers
+
+
+def cyclegan_arg_scope(instance_norm_center=True,
+                       instance_norm_scale=True,
+                       instance_norm_epsilon=0.001,
+                       weights_init_stddev=0.02,
+                       weight_decay=0.0):
+  """Returns a default argument scope for all generators and discriminators.
+
+  Args:
+    instance_norm_center: Whether instance normalization applies centering.
+    instance_norm_scale: Whether instance normalization applies scaling.
+    instance_norm_epsilon: Small float added to the variance in the instance
+      normalization to avoid dividing by zero.
+    weights_init_stddev: Standard deviation of the random values to initialize
+      the convolution kernels with.
+    weight_decay: Magnitude of weight decay applied to all convolution kernel
+      variables of the generator.
+
+  Returns:
+    An arg-scope.
+  """
+  instance_norm_params = {
+      'center': instance_norm_center,
+      'scale': instance_norm_scale,
+      'epsilon': instance_norm_epsilon,
+  }
+
+  weights_regularizer = None
+  if weight_decay and weight_decay > 0.0:
+    weights_regularizer = layers.l2_regularizer(weight_decay)
+
+  with contrib_framework.arg_scope(
+      [layers.conv2d],
+      normalizer_fn=layers.instance_norm,
+      normalizer_params=instance_norm_params,
+      weights_initializer=tf.compat.v1.random_normal_initializer(
+          0, weights_init_stddev),
+      weights_regularizer=weights_regularizer) as sc:
+    return sc
+
+
+def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose',
+                      pad_mode='REFLECT', align_corners=False):
+  """Upsamples the given inputs.
+
+  Args:
+    net: A Tensor of size [batch_size, height, width, filters].
+    num_outputs: The number of output filters.
+    stride: A list of 2 scalars or a 1x2 Tensor indicating the scale,
+      relative to the inputs, of the output dimensions. For example, if kernel
+      size is [2, 3], then the output height and width will be twice and three
+      times the input size.
+    method: The upsampling method: 'nn_upsample_conv', 'bilinear_upsample_conv',
+      or 'conv2d_transpose'.
+    pad_mode: mode for tf.pad, one of "CONSTANT", "REFLECT", or "SYMMETRIC".
+    align_corners: option for method, 'bilinear_upsample_conv'. If true, the
+      centers of the 4 corner pixels of the input and output tensors are
+      aligned, preserving the values at the corner pixels.
+
+  Returns:
+    A Tensor which was upsampled using the specified method.
+
+  Raises:
+    ValueError: if `method` is not recognized.
+  """
+  with tf.compat.v1.variable_scope('upconv'):
+    net_shape = tf.shape(input=net)
+    height = net_shape[1]
+    width = net_shape[2]
+
+    # Reflection pad by 1 in spatial dimensions (axes 1, 2 = h, w) to make a 3x3
+    # 'valid' convolution produce an output with the same dimension as the
+    # input.
+    spatial_pad_1 = np.array([[0, 0], [1, 1], [1, 1], [0, 0]])
+
+    if method == 'nn_upsample_conv':
+      net = tf.image.resize(
+          net, [stride[0] * height, stride[1] * width],
+          method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
+      net = tf.pad(tensor=net, paddings=spatial_pad_1, mode=pad_mode)
+      net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid')
+    elif method == 'bilinear_upsample_conv':
+      net = tf.compat.v1.image.resize_bilinear(
+          net, [stride[0] * height, stride[1] * width],
+          align_corners=align_corners)
+      net = tf.pad(tensor=net, paddings=spatial_pad_1, mode=pad_mode)
+      net = layers.conv2d(net, num_outputs, kernel_size=[3, 3], padding='valid')
+    elif method == 'conv2d_transpose':
+      # This corrects 1 pixel offset for images with even width and height.
+      # conv2d is left aligned and conv2d_transpose is right aligned for even
+      # sized images (while doing 'SAME' padding).
+      # Note: This doesn't reflect actual model in paper.
+      net = layers.conv2d_transpose(
+          net, num_outputs, kernel_size=[3, 3], stride=stride, padding='valid')
+      net = net[:, 1:, 1:, :]
+    else:
+      raise ValueError('Unknown method: [%s]' % method)
+
+    return net
+
+
+def _dynamic_or_static_shape(tensor):
+  shape = tf.shape(input=tensor)
+  static_shape = contrib_util.constant_value(shape)
+  return static_shape if static_shape is not None else shape
+
+
+def cyclegan_generator_resnet(images,
+                              arg_scope_fn=cyclegan_arg_scope,
+                              num_resnet_blocks=6,
+                              num_filters=64,
+                              upsample_fn=cyclegan_upsample,
+                              kernel_size=3,
+                              tanh_linear_slope=0.0,
+                              is_training=False):
+  """Defines the cyclegan resnet network architecture.
+
+  As closely as possible following
+  https://github.com/junyanz/CycleGAN/blob/master/models/architectures.lua#L232
+
+  FYI: This network requires input height and width to be divisible by 4 in
+  order to generate an output with shape equal to input shape. Assertions will
+  catch this if input dimensions are known at graph construction time, but
+  there's no protection if unknown at graph construction time (you'll see an
+  error).
+
+  Args:
+    images: Input image tensor of shape [batch_size, h, w, 3].
+    arg_scope_fn: Function to create the global arg_scope for the network.
+    num_resnet_blocks: Number of ResNet blocks in the middle of the generator.
+    num_filters: Number of filters of the first hidden layer.
+    upsample_fn: Upsampling function for the decoder part of the generator.
+    kernel_size: Size w or list/tuple [h, w] of the filter kernels for all inner
+      layers.
+    tanh_linear_slope: Slope of the linear function to add to the tanh over the
+      logits.
+    is_training: Whether the network is created in training mode or inference
+      only mode. Not actually needed, just for compliance with other generator
+      network functions.
+
+  Returns:
+    A `Tensor` representing the model output and a dictionary of model end
+      points.
+
+  Raises:
+    ValueError: If the input height or width is known at graph construction time
+      and not a multiple of 4.
+  """
+  # Neither dropout nor batch norm -> dont need is_training
+  del is_training
+
+  end_points = {}
+
+  input_size = images.shape.as_list()
+  height, width = input_size[1], input_size[2]
+  # Unknown (None) dimensions are falsy and therefore skip these checks.
+  if height and height % 4 != 0:
+    raise ValueError('The input height must be a multiple of 4.')
+  if width and width % 4 != 0:
+    raise ValueError('The input width must be a multiple of 4.')
+  # The output layer produces as many channels as the input image has.
+  num_outputs = input_size[3]
+
+  if not isinstance(kernel_size, (list, tuple)):
+    kernel_size = [kernel_size, kernel_size]
+
+  # Explicit padding for the 'VALID' convolutions below. The total padding per
+  # axis is kernel - 1, split so that even kernel sizes get the extra pixel at
+  # the bottom/right.
+  kernel_height = kernel_size[0]
+  kernel_width = kernel_size[1]
+  pad_top = (kernel_height - 1) // 2
+  pad_bottom = kernel_height // 2
+  pad_left = (kernel_width - 1) // 2
+  pad_right = kernel_width // 2
+  paddings = np.array(
+      [[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]],
+      dtype=np.int32)
+  # 3-pixel border used before the 7x7 'VALID' convolutions of the input and
+  # output stages.
+  spatial_pad_3 = np.array([[0, 0], [3, 3], [3, 3], [0, 0]])
+
+  with contrib_framework.arg_scope(arg_scope_fn()):
+
+    ###########
+    # Encoder #
+    ###########
+    with tf.compat.v1.variable_scope('input'):
+      # 7x7 input stage
+      net = tf.pad(tensor=images, paddings=spatial_pad_3, mode='REFLECT')
+      net = layers.conv2d(net, num_filters, kernel_size=[7, 7], padding='VALID')
+      end_points['encoder_0'] = net
+
+    with tf.compat.v1.variable_scope('encoder'):
+      with contrib_framework.arg_scope([layers.conv2d],
+                                       kernel_size=kernel_size,
+                                       stride=2,
+                                       activation_fn=tf.nn.relu,
+                                       padding='VALID'):
+
+        # Two stride-2 convolutions; the filter count doubles at each one.
+        net = tf.pad(tensor=net, paddings=paddings, mode='REFLECT')
+        net = layers.conv2d(net, num_filters * 2)
+        end_points['encoder_1'] = net
+        net = tf.pad(tensor=net, paddings=paddings, mode='REFLECT')
+        net = layers.conv2d(net, num_filters * 4)
+        end_points['encoder_2'] = net
+
+    ###################
+    # Residual Blocks #
+    ###################
+    with tf.compat.v1.variable_scope('residual_blocks'):
+      with contrib_framework.arg_scope([layers.conv2d],
+                                       kernel_size=kernel_size,
+                                       stride=1,
+                                       activation_fn=tf.nn.relu,
+                                       padding='VALID'):
+        for block_id in xrange(num_resnet_blocks):
+          with tf.compat.v1.variable_scope('block_{}'.format(block_id)):
+            res_net = tf.pad(tensor=net, paddings=paddings, mode='REFLECT')
+            res_net = layers.conv2d(res_net, num_filters * 4)
+            res_net = tf.pad(tensor=res_net, paddings=paddings, mode='REFLECT')
+            # The second convolution of the block has no activation.
+            res_net = layers.conv2d(res_net, num_filters * 4,
+                                    activation_fn=None)
+            # Skip connection: add the block output to the block input.
+            net += res_net
+
+            end_points['resnet_block_%d' % block_id] = net
+
+    ###########
+    # Decoder #
+    ###########
+    with tf.compat.v1.variable_scope('decoder'):
+
+      with contrib_framework.arg_scope([layers.conv2d],
+                                       kernel_size=kernel_size,
+                                       stride=1,
+                                       activation_fn=tf.nn.relu):
+
+        with tf.compat.v1.variable_scope('decoder1'):
+          net = upsample_fn(net, num_outputs=num_filters * 2, stride=[2, 2])
+        end_points['decoder1'] = net
+
+        with tf.compat.v1.variable_scope('decoder2'):
+          net = upsample_fn(net, num_outputs=num_filters, stride=[2, 2])
+        end_points['decoder2'] = net
+
+    with tf.compat.v1.variable_scope('output'):
+      net = tf.pad(tensor=net, paddings=spatial_pad_3, mode='REFLECT')
+      # 7x7 output stage without normalization or activation.
+      logits = layers.conv2d(
+          net,
+          num_outputs, [7, 7],
+          activation_fn=None,
+          normalizer_fn=None,
+          padding='valid')
+      # Give the logits the shape of the input images (static when it is fully
+      # known, dynamic otherwise).
+      logits = tf.reshape(logits, _dynamic_or_static_shape(images))
+
+      end_points['logits'] = logits
+      end_points['predictions'] = tf.tanh(logits) + logits * tanh_linear_slope
+
+  return end_points['predictions'], end_points
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.contrib.slim.nets.cyclegan."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from nets import cyclegan
+
+
+# TODO(joelshor): Add a test to check generator endpoints.
+class CycleganTest(tf.test.TestCase):
+
+  def test_generator_inference(self):
+    """Check one inference step."""
+    img_batch = tf.zeros([2, 32, 32, 3])
+    model_output, _ = cyclegan.cyclegan_generator_resnet(img_batch)
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      sess.run(model_output)
+
+  def _test_generator_graph_helper(self, shape):
+    """Check that generator can take small and non-square inputs."""
+    output_imgs, _ = cyclegan.cyclegan_generator_resnet(tf.ones(shape))
+    self.assertAllEqual(shape, output_imgs.shape.as_list())
+
+  def test_generator_graph_small(self):
+    self._test_generator_graph_helper([4, 32, 32, 3])
+
+  def test_generator_graph_medium(self):
+    self._test_generator_graph_helper([3, 128, 128, 3])
+
+  def test_generator_graph_nonsquare(self):
+    self._test_generator_graph_helper([2, 80, 400, 3])
+
+  def test_generator_unknown_batch_dim(self):
+    """Check that generator can take unknown batch dimension inputs."""
+    img = tf.compat.v1.placeholder(tf.float32, shape=[None, 32, None, 3])
+    output_imgs, _ = cyclegan.cyclegan_generator_resnet(img)
+
+    self.assertAllEqual([None, 32, None, 3], output_imgs.shape.as_list())
+
+  def _input_and_output_same_shape_helper(self, kernel_size):
+    img_batch = tf.compat.v1.placeholder(tf.float32, shape=[None, 32, 32, 3])
+    output_img_batch, _ = cyclegan.cyclegan_generator_resnet(
+        img_batch, kernel_size=kernel_size)
+
+    self.assertAllEqual(img_batch.shape.as_list(),
+                        output_img_batch.shape.as_list())
+
+  def input_and_output_same_shape_kernel3(self):
+    self._input_and_output_same_shape_helper(3)
+
+  def input_and_output_same_shape_kernel4(self):
+    self._input_and_output_same_shape_helper(4)
+
+  def input_and_output_same_shape_kernel5(self):
+    self._input_and_output_same_shape_helper(5)
+
+  def input_and_output_same_shape_kernel6(self):
+    self._input_and_output_same_shape_helper(6)
+
+  def _error_if_height_not_multiple_of_four_helper(self, height):
+    self.assertRaisesRegexp(
+        ValueError, 'The input height must be a multiple of 4.',
+        cyclegan.cyclegan_generator_resnet,
+        tf.compat.v1.placeholder(tf.float32, shape=[None, height, 32, 3]))
+
+  def test_error_if_height_not_multiple_of_four_height29(self):
+    self._error_if_height_not_multiple_of_four_helper(29)
+
+  def test_error_if_height_not_multiple_of_four_height30(self):
+    self._error_if_height_not_multiple_of_four_helper(30)
+
+  def test_error_if_height_not_multiple_of_four_height31(self):
+    self._error_if_height_not_multiple_of_four_helper(31)
+
+  def _error_if_width_not_multiple_of_four_helper(self, width):
+    self.assertRaisesRegexp(
+        ValueError, 'The input width must be a multiple of 4.',
+        cyclegan.cyclegan_generator_resnet,
+        tf.compat.v1.placeholder(tf.float32, shape=[None, 32, width, 3]))
+
+  def test_error_if_width_not_multiple_of_four_width29(self):
+    self._error_if_width_not_multiple_of_four_helper(29)
+
+  def test_error_if_width_not_multiple_of_four_width30(self):
+    self._error_if_width_not_multiple_of_four_helper(30)
+
+  def test_error_if_width_not_multiple_of_four_width31(self):
+    self._error_if_width_not_multiple_of_four_helper(31)
+
+
+# Run the test cases of this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,205 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""DCGAN generator and discriminator from https://arxiv.org/abs/1511.06434."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from math import log
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+slim = contrib_slim
+
+
+def _validate_image_inputs(inputs):
+  inputs.get_shape().assert_has_rank(4)
+  inputs.get_shape()[1:3].assert_is_fully_defined()
+  if inputs.get_shape()[1] != inputs.get_shape()[2]:
+    raise ValueError('Input tensor does not have equal width and height: ',
+                     inputs.get_shape()[1:3])
+  width = inputs.get_shape().as_list()[1]
+  if log(width, 2) != int(log(width, 2)):
+    raise ValueError('Input tensor `width` is not a power of 2: ', width)
+
+
+# TODO(joelshor): Use fused batch norm by default. Investigate why some GAN
+# setups need the gradient of gradient FusedBatchNormGrad.
+def discriminator(inputs,
+                  depth=64,
+                  is_training=True,
+                  reuse=None,
+                  scope='Discriminator',
+                  fused_batch_norm=False):
+  """Discriminator network for DCGAN.
+
+  Construct discriminator network from inputs to the final endpoint.
+
+  Args:
+    inputs: A tensor of size [batch_size, height, width, channels]. Must be
+      floating point.
+    depth: Number of channels in first convolution layer.
+    is_training: Whether the network is for training or not.
+    reuse: Whether or not the network variables should be reused. `scope`
+      must be given to be reused.
+    scope: Optional variable_scope.
+    fused_batch_norm: If `True`, use a faster, fused implementation of
+      batch norm.
+
+  Returns:
+    logits: The pre-softmax activations, a tensor of size [batch_size, 1]
+    end_points: a dictionary from components of the network to their activation.
+
+  Raises:
+    ValueError: If the input image shape is not 4-dimensional, if the spatial
+      dimensions aren't defined at graph construction time, if the spatial
+      dimensions aren't square, or if the spatial dimensions aren't a power of
+      two.
+  """
+
+  normalizer_fn = slim.batch_norm
+  normalizer_fn_args = {
+      'is_training': is_training,
+      'zero_debias_moving_mean': True,
+      'fused': fused_batch_norm,
+  }
+
+  _validate_image_inputs(inputs)
+  inp_shape = inputs.get_shape().as_list()[1]
+
+  end_points = {}
+  with tf.compat.v1.variable_scope(
+      scope, values=[inputs], reuse=reuse) as scope:
+    with slim.arg_scope([normalizer_fn], **normalizer_fn_args):
+      with slim.arg_scope([slim.conv2d],
+                          stride=2,
+                          kernel_size=4,
+                          activation_fn=tf.nn.leaky_relu):
+        net = inputs
+        for i in xrange(int(log(inp_shape, 2))):
+          scope = 'conv%i' % (i + 1)
+          current_depth = depth * 2**i
+          normalizer_fn_ = None if i == 0 else normalizer_fn
+          net = slim.conv2d(
+              net, current_depth, normalizer_fn=normalizer_fn_, scope=scope)
+          end_points[scope] = net
+
+        logits = slim.conv2d(net, 1, kernel_size=1, stride=1, padding='VALID',
+                             normalizer_fn=None, activation_fn=None)
+        logits = tf.reshape(logits, [-1, 1])
+        end_points['logits'] = logits
+
+        return logits, end_points
+
+
+# TODO(joelshor): Use fused batch norm by default. Investigate why some GAN
+# setups need the gradient of gradient FusedBatchNormGrad.
+def generator(inputs,
+              depth=64,
+              final_size=32,
+              num_outputs=3,
+              is_training=True,
+              reuse=None,
+              scope='Generator',
+              fused_batch_norm=False):
+  """Generator network for DCGAN.
+
+  Construct generator network from inputs to the final endpoint.
+
+  Args:
+    inputs: A tensor with any size N. [batch_size, N]
+    depth: Number of channels in last deconvolution layer.
+    final_size: The shape of the final output.
+    num_outputs: Number of output features. For images, this is the number of
+      channels.
+    is_training: whether is training or not.
+    reuse: Whether or not the network has its variables should be reused. scope
+      must be given to be reused.
+    scope: Optional variable_scope.
+    fused_batch_norm: If `True`, use a faster, fused implementation of
+      batch norm.
+
+  Returns:
+    logits: the pre-softmax activations, a tensor of size
+      [batch_size, 32, 32, channels]
+    end_points: a dictionary from components of the network to their activation.
+
+  Raises:
+    ValueError: If `inputs` is not 2-dimensional.
+    ValueError: If `final_size` isn't a power of 2 or is less than 8.
+  """
+  normalizer_fn = slim.batch_norm
+  normalizer_fn_args = {
+      'is_training': is_training,
+      'zero_debias_moving_mean': True,
+      'fused': fused_batch_norm,
+  }
+
+  inputs.get_shape().assert_has_rank(2)
+  if log(final_size, 2) != int(log(final_size, 2)):
+    raise ValueError('`final_size` (%i) must be a power of 2.' % final_size)
+  if final_size < 8:
+    raise ValueError('`final_size` (%i) must be greater than 8.' % final_size)
+
+  end_points = {}
+  num_layers = int(log(final_size, 2)) - 1
+  with tf.compat.v1.variable_scope(
+      scope, values=[inputs], reuse=reuse) as scope:
+    with slim.arg_scope([normalizer_fn], **normalizer_fn_args):
+      with slim.arg_scope([slim.conv2d_transpose],
+                          normalizer_fn=normalizer_fn,
+                          stride=2,
+                          kernel_size=4):
+        net = tf.expand_dims(tf.expand_dims(inputs, 1), 1)
+
+        # First upscaling is different because it takes the input vector.
+        current_depth = depth * 2 ** (num_layers - 1)
+        scope = 'deconv1'
+        net = slim.conv2d_transpose(
+            net, current_depth, stride=1, padding='VALID', scope=scope)
+        end_points[scope] = net
+
+        for i in xrange(2, num_layers):
+          scope = 'deconv%i' % (i)
+          current_depth = depth * 2 ** (num_layers - i)
+          net = slim.conv2d_transpose(net, current_depth, scope=scope)
+          end_points[scope] = net
+
+        # Last layer has different normalizer and activation.
+        scope = 'deconv%i' % (num_layers)
+        net = slim.conv2d_transpose(
+            net, depth, normalizer_fn=None, activation_fn=None, scope=scope)
+        end_points[scope] = net
+
+        # Convert to proper channels.
+        scope = 'logits'
+        logits = slim.conv2d(
+            net,
+            num_outputs,
+            normalizer_fn=None,
+            activation_fn=None,
+            kernel_size=1,
+            stride=1,
+            padding='VALID',
+            scope=scope)
+        end_points[scope] = logits
+
+        logits.get_shape().assert_has_rank(4)
+        logits.get_shape().assert_is_compatible_with(
+            [None, final_size, final_size, num_outputs])
+
+        return logits, end_points
@@ -0,0 +1,121 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dcgan."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+from nets import dcgan
+
+
+class DCGANTest(tf.test.TestCase):
+
+  def test_generator_run(self):
+    tf.compat.v1.set_random_seed(1234)
+    noise = tf.random.normal([100, 64])
+    image, _ = dcgan.generator(noise)
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      image.eval()
+
+  def test_generator_graph(self):
+    tf.compat.v1.set_random_seed(1234)
+    # Check graph construction for a number of image size/depths and batch
+    # sizes.
+    for i, batch_size in zip(xrange(3, 7), xrange(3, 8)):
+      tf.compat.v1.reset_default_graph()
+      final_size = 2 ** i
+      noise = tf.random.normal([batch_size, 64])
+      image, end_points = dcgan.generator(
+          noise,
+          depth=32,
+          final_size=final_size)
+
+      self.assertAllEqual([batch_size, final_size, final_size, 3],
+                          image.shape.as_list())
+
+      expected_names = ['deconv%i' % j for j in xrange(1, i)] + ['logits']
+      self.assertSetEqual(set(expected_names), set(end_points.keys()))
+
+      # Check layer depths.
+      for j in range(1, i):
+        layer = end_points['deconv%i' % j]
+        self.assertEqual(32 * 2**(i-j-1), layer.get_shape().as_list()[-1])
+
+  def test_generator_invalid_input(self):
+    wrong_dim_input = tf.zeros([5, 32, 32])
+    with self.assertRaises(ValueError):
+      dcgan.generator(wrong_dim_input)
+
+    correct_input = tf.zeros([3, 2])
+    with self.assertRaisesRegexp(ValueError, 'must be a power of 2'):
+      dcgan.generator(correct_input, final_size=30)
+
+    with self.assertRaisesRegexp(ValueError, 'must be greater than 8'):
+      dcgan.generator(correct_input, final_size=4)
+
+  def test_discriminator_run(self):
+    image = tf.random.uniform([5, 32, 32, 3], -1, 1)
+    output, _ = dcgan.discriminator(image)
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output.eval()
+
+  def test_discriminator_graph(self):
+    # Check graph construction for a number of image size/depths and batch
+    # sizes.
+    for i, batch_size in zip(xrange(1, 6), xrange(3, 8)):
+      tf.compat.v1.reset_default_graph()
+      img_w = 2 ** i
+      image = tf.random.uniform([batch_size, img_w, img_w, 3], -1, 1)
+      output, end_points = dcgan.discriminator(
+          image,
+          depth=32)
+
+      self.assertAllEqual([batch_size, 1], output.get_shape().as_list())
+
+      expected_names = ['conv%i' % j for j in xrange(1, i+1)] + ['logits']
+      self.assertSetEqual(set(expected_names), set(end_points.keys()))
+
+      # Check layer depths.
+      for j in range(1, i+1):
+        layer = end_points['conv%i' % j]
+        self.assertEqual(32 * 2**(j-1), layer.get_shape().as_list()[-1])
+
+  def test_discriminator_invalid_input(self):
+    wrong_dim_img = tf.zeros([5, 32, 32])
+    with self.assertRaises(ValueError):
+      dcgan.discriminator(wrong_dim_img)
+
+    spatially_undefined_shape = tf.compat.v1.placeholder(
+        tf.float32, [5, 32, None, 3])
+    with self.assertRaises(ValueError):
+      dcgan.discriminator(spatially_undefined_shape)
+
+    not_square = tf.zeros([5, 32, 16, 3])
+    with self.assertRaisesRegexp(ValueError, 'not have equal width and height'):
+      dcgan.discriminator(not_square)
+
+    not_power_2 = tf.zeros([5, 30, 30, 3])
+    with self.assertRaisesRegexp(ValueError, 'not a power of 2'):
+      dcgan.discriminator(not_power_2)
+
+
+# Run the test cases of this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,181 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition for Inflated 3D Inception V1 (I3D).
+
+The network architecture is proposed by:
+  Joao Carreira and Andrew Zisserman,
+  Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset.
+  https://arxiv.org/abs/1705.07750
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import i3d_utils
+from nets import s3dg
+
+# Short alias for the TF-Slim module used throughout this file.
+slim = contrib_slim
+
+# pylint: disable=g-long-lambda
+# Builds a truncated-normal initializer from (0.0, stddev).
+trunc_normal = lambda stddev: tf.compat.v1.truncated_normal_initializer(
+    0.0, stddev)
+# Module-level alias so the arg scope below can target the spatiotemporal conv.
+conv3d_spatiotemporal = i3d_utils.conv3d_spatiotemporal
+
+
+def i3d_arg_scope(weight_decay=1e-7,
+                  batch_norm_decay=0.999,
+                  batch_norm_epsilon=0.001,
+                  use_renorm=False,
+                  separable_conv3d=False):
+  """Defines default arg_scope for I3D.
+
+  Args:
+    weight_decay: The weight decay to use for regularizing the model.
+    batch_norm_decay: Decay for batch norm moving average.
+    batch_norm_epsilon: Small float added to variance to avoid dividing by zero
+      in batch norm.
+    use_renorm: Whether to use batch renormalization or not.
+    separable_conv3d: Whether to use separable 3d Convs.
+
+  Returns:
+    sc: An arg_scope to use for the models.
+  """
+  batch_norm_params = {
+      # Decay for the moving averages.
+      'decay': batch_norm_decay,
+      # epsilon to prevent 0s in variance.
+      'epsilon': batch_norm_epsilon,
+      # Turns off fused batch norm.
+      'fused': False,
+      'renorm': use_renorm,
+      # collection containing the moving mean and moving variance.
+      'variables_collections': {
+          'beta': None,
+          'gamma': None,
+          'moving_mean': ['moving_vars'],
+          'moving_variance': ['moving_vars'],
+      }
+  }
+
+  with slim.arg_scope(
+      [slim.conv3d, conv3d_spatiotemporal],
+      weights_regularizer=slim.l2_regularizer(weight_decay),
+      activation_fn=tf.nn.relu,
+      normalizer_fn=slim.batch_norm,
+      normalizer_params=batch_norm_params):
+    with slim.arg_scope(
+        [conv3d_spatiotemporal], separable=separable_conv3d) as sc:
+      return sc
+
+
+def i3d_base(inputs, final_endpoint='Mixed_5c',
+             scope='InceptionV1'):
+  """Defines the I3D base architecture.
+
+  This is a thin wrapper that forwards to `s3dg.s3dg_base` with a fixed
+  configuration (first temporal kernel size 7, temporal convolutions starting
+  at 'Conv2d_2c_3x3', no gating, min depth 16, depth multiplier 1.0, 'NDHWC').
+
+  Note that we use the names as defined in Inception V1 to facilitate checkpoint
+  conversion from an image-trained Inception V1 checkpoint to I3D checkpoint.
+
+  Args:
+    inputs: A 5-D float tensor of size [batch_size, num_frames, height, width,
+      channels].
+    final_endpoint: Specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+      'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
+      'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e',
+      'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c']
+    scope: Optional variable_scope.
+
+  Returns:
+    The value returned by `s3dg.s3dg_base`, unchanged. `i3d()` in this module
+    unpacks it as a `(net, end_points)` pair, where `end_points` maps
+    components of the network to the corresponding activation.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values.
+      (Presumably raised inside `s3dg.s3dg_base`; nothing in this function
+      raises it directly.)
+  """
+
+  return s3dg.s3dg_base(
+      inputs,
+      first_temporal_kernel_size=7,
+      temporal_conv_startat='Conv2d_2c_3x3',
+      gating_startat=None,
+      final_endpoint=final_endpoint,
+      min_depth=16,
+      depth_multiplier=1.0,
+      data_format='NDHWC',
+      scope=scope)
+
+
+def i3d(inputs,
+        num_classes=1000,
+        dropout_keep_prob=0.8,
+        is_training=True,
+        prediction_fn=slim.softmax,
+        spatial_squeeze=True,
+        reuse=None,
+        scope='InceptionV1'):
+  """Defines the I3D architecture.
+
+  Builds the base network with `i3d_base`, then adds a 'Logits' head: average
+  pooling, dropout, a 1x1x1 convolution to `num_classes` channels, a mean over
+  axis 1 and (optionally) a squeeze of the remaining spatial axes.
+
+  The default image size used to train this network is 224x224.
+
+  Args:
+    inputs: A 5-D float tensor of size [batch_size, num_frames, height, width,
+      channels].
+    num_classes: number of predicted classes.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    is_training: whether is training or not.
+    prediction_fn: a function to get predictions out of logits.
+    spatial_squeeze: if True, logits is of shape is [B, C], if false logits is
+        of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    scope: Optional variable_scope.
+
+  Returns:
+    logits: the pre-softmax activations, a tensor of size
+      [batch_size, num_classes]
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+  """
+  # Final pooling and prediction
+  # NOTE: `as scope` rebinds the `scope` argument to the opened variable scope,
+  # which is then handed to i3d_base below.
+  with tf.compat.v1.variable_scope(
+      scope, 'InceptionV1', [inputs, num_classes], reuse=reuse) as scope:
+    # Batch norm and dropout both need to know the train/eval mode.
+    with slim.arg_scope(
+        [slim.batch_norm, slim.dropout], is_training=is_training):
+      net, end_points = i3d_base(inputs, scope=scope)
+      with tf.compat.v1.variable_scope('Logits'):
+        # Shrink the nominal [2, 7, 7] pooling window if the feature map is
+        # smaller than that along any statically known axis.
+        kernel_size = i3d_utils.reduced_kernel_size_3d(net, [2, 7, 7])
+        net = slim.avg_pool3d(
+            net, kernel_size, stride=1, scope='AvgPool_0a_7x7')
+        net = slim.dropout(net, dropout_keep_prob, scope='Dropout_0b')
+        # 1x1x1 convolution to class scores; no activation or normalizer so
+        # the output stays pre-softmax.
+        logits = slim.conv3d(
+            net,
+            num_classes, [1, 1, 1],
+            activation_fn=None,
+            normalizer_fn=None,
+            scope='Conv2d_0c_1x1')
+        # Temporal average pooling.
+        logits = tf.reduce_mean(input_tensor=logits, axis=1)
+        if spatial_squeeze:
+          # After the mean over axis 1, axes 1 and 2 are the spatial ones.
+          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
+
+        end_points['Logits'] = logits
+        end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+  return logits, end_points
+
+
+# Expose the default training image size (224x224) as a function attribute.
+i3d.default_image_size = 224
@@ -0,0 +1,149 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for networks.i3d."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from nets import i3d
+
+
+class I3DTest(tf.test.TestCase):
+
+  def testBuildClassificationNetwork(self):
+    batch_size = 5
+    num_frames = 64
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+    logits, end_points = i3d.i3d(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV1/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue('Predictions' in end_points)
+    self.assertListEqual(end_points['Predictions'].get_shape().as_list(),
+                         [batch_size, num_classes])
+
+  def testBuildBaseNetwork(self):
+    batch_size = 5
+    num_frames = 64
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+    mixed_6c, end_points = i3d.i3d_base(inputs)
+    self.assertTrue(mixed_6c.op.name.startswith('InceptionV1/Mixed_5c'))
+    self.assertListEqual(mixed_6c.get_shape().as_list(),
+                         [batch_size, 8, 7, 7, 1024])
+    expected_endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+                          'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b',
+                          'Mixed_3c', 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c',
+                          'Mixed_4d', 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2',
+                          'Mixed_5b', 'Mixed_5c']
+    self.assertItemsEqual(end_points.keys(), expected_endpoints)
+
+  def testBuildOnlyUptoFinalEndpoint(self):
+    batch_size = 5
+    num_frames = 64
+    height, width = 224, 224
+    endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+                 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
+                 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d',
+                 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b',
+                 'Mixed_5c']
+    for index, endpoint in enumerate(endpoints):
+      with tf.Graph().as_default():
+        inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+        out_tensor, end_points = i3d.i3d_base(
+            inputs, final_endpoint=endpoint)
+        self.assertTrue(out_tensor.op.name.startswith(
+            'InceptionV1/' + endpoint))
+        self.assertItemsEqual(endpoints[:index+1], end_points)
+
+  def testBuildAndCheckAllEndPointsUptoMixed5c(self):
+    batch_size = 5
+    num_frames = 64
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+    _, end_points = i3d.i3d_base(inputs,
+                                 final_endpoint='Mixed_5c')
+    endpoints_shapes = {'Conv2d_1a_7x7': [5, 32, 112, 112, 64],
+                        'MaxPool_2a_3x3': [5, 32, 56, 56, 64],
+                        'Conv2d_2b_1x1': [5, 32, 56, 56, 64],
+                        'Conv2d_2c_3x3': [5, 32, 56, 56, 192],
+                        'MaxPool_3a_3x3': [5, 32, 28, 28, 192],
+                        'Mixed_3b': [5, 32, 28, 28, 256],
+                        'Mixed_3c': [5, 32, 28, 28, 480],
+                        'MaxPool_4a_3x3': [5, 16, 14, 14, 480],
+                        'Mixed_4b': [5, 16, 14, 14, 512],
+                        'Mixed_4c': [5, 16, 14, 14, 512],
+                        'Mixed_4d': [5, 16, 14, 14, 512],
+                        'Mixed_4e': [5, 16, 14, 14, 528],
+                        'Mixed_4f': [5, 16, 14, 14, 832],
+                        'MaxPool_5a_2x2': [5, 8, 7, 7, 832],
+                        'Mixed_5b': [5, 8, 7, 7, 832],
+                        'Mixed_5c': [5, 8, 7, 7, 1024]}
+
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name, expected_shape in endpoints_shapes.iteritems():
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testHalfSizeImages(self):
+    batch_size = 5
+    num_frames = 64
+    height, width = 112, 112
+
+    inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+    mixed_5c, _ = i3d.i3d_base(inputs)
+    self.assertTrue(mixed_5c.op.name.startswith('InceptionV1/Mixed_5c'))
+    self.assertListEqual(mixed_5c.get_shape().as_list(),
+                         [batch_size, 8, 4, 4, 1024])
+
+  def testTenFrames(self):
+    batch_size = 5
+    num_frames = 10
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+    mixed_5c, _ = i3d.i3d_base(inputs)
+    self.assertTrue(mixed_5c.op.name.startswith('InceptionV1/Mixed_5c'))
+    self.assertListEqual(mixed_5c.get_shape().as_list(),
+                         [batch_size, 2, 7, 7, 1024])
+
+  def testEvaluation(self):
+    batch_size = 2
+    num_frames = 64
+    height, width = 224, 224
+    num_classes = 1000
+
+    eval_inputs = tf.random.uniform((batch_size, num_frames, height, width, 3))
+    logits, _ = i3d.i3d(eval_inputs, num_classes,
+                        is_training=False)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (batch_size,))
+
+
+# Run every test case in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,289 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for building I3D network models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib import framework as contrib_framework
+from tensorflow.contrib import layers as contrib_layers
+
+
+# Originally, add_arg_scope = slim.add_arg_scope and layers = slim; these now
+# point to the more up-to-date tf.contrib.* API.
+add_arg_scope = contrib_framework.add_arg_scope
+layers = contrib_layers
+
+
+def center_initializer():
+  """Centering Initializer for I3D.
+
+  This initializer allows identity mapping for temporal convolution at the
+  initialization, which is critical for a desired convergence behavior
+  for training a seprable I3D model.
+
+  The centering behavior of this initializer requires an odd-sized kernel,
+  typically set to 3.
+
+  Returns:
+    A weight initializer op used in temporal convolutional layers.
+
+  Raises:
+    ValueError: Input tensor data type has to be tf.float32.
+    ValueError: If input tensor is not a 5-D tensor.
+    ValueError: If input and output channel dimensions are different.
+    ValueError: If spatial kernel sizes are not 1.
+    ValueError: If temporal kernel size is even.
+  """
+
+  def _initializer(shape, dtype=tf.float32, partition_info=None):  # pylint: disable=unused-argument
+    """Initializer op."""
+
+    if dtype != tf.float32 and dtype != tf.bfloat16:
+      raise ValueError(
+          'Input tensor data type has to be tf.float32 or tf.bfloat16.')
+    if len(shape) != 5:
+      raise ValueError('Input tensor has to be 5-D.')
+    if shape[3] != shape[4]:
+      raise ValueError('Input and output channel dimensions must be the same.')
+    if shape[1] != 1 or shape[2] != 1:
+      raise ValueError('Spatial kernel sizes must be 1 (pointwise conv).')
+    if shape[0] % 2 == 0:
+      raise ValueError('Temporal kernel size has to be odd.')
+
+    center_pos = int(shape[0] / 2)
+    init_mat = np.zeros(
+        [shape[0], shape[1], shape[2], shape[3], shape[4]], dtype=np.float32)
+    for i in range(0, shape[3]):
+      init_mat[center_pos, 0, 0, i, i] = 1.0
+
+    init_op = tf.constant(init_mat, dtype=dtype)
+    return init_op
+
+  return _initializer
+
+
+@add_arg_scope
+def conv3d_spatiotemporal(inputs,
+                          num_outputs,
+                          kernel_size,
+                          stride=1,
+                          padding='SAME',
+                          activation_fn=None,
+                          normalizer_fn=None,
+                          normalizer_params=None,
+                          weights_regularizer=None,
+                          separable=False,
+                          data_format='NDHWC',
+                          scope=''):
+  """A wrapper for conv3d to model spatiotemporal representations.
+
+  This allows switching between original 3D convolution and separable 3D
+  convolutions for spatial and temporal features respectively. On Kinetics,
+  seprable 3D convolutions yields better classification performance.
+
+  Args:
+    inputs: a 5-D tensor  `[batch_size, depth, height, width, channels]`.
+    num_outputs: integer, the number of output filters.
+    kernel_size: a list of length 3
+      `[kernel_depth, kernel_height, kernel_width]` of the filters. Can be an
+      int if all values are the same.
+    stride: a list of length 3 `[stride_depth, stride_height, stride_width]`.
+      Can be an int if all strides are the same.
+    padding: one of `VALID` or `SAME`.
+    activation_fn: activation function.
+    normalizer_fn: normalization function to use instead of `biases`.
+    normalizer_params: dictionary of normalization function parameters.
+    weights_regularizer: Optional regularizer for the weights.
+    separable: If `True`, use separable spatiotemporal convolutions.
+    data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC".
+      The data format of the input and output data. With the default format
+      "NDHWC", the data is stored in the order of: [batch, in_depth, in_height,
+      in_width, in_channels]. Alternatively, the format could be "NCDHW", the
+      data storage order is:
+      [batch, in_channels, in_depth, in_height, in_width].
+    scope: scope for `variable_scope`.
+
+  Returns:
+    A tensor representing the output of the (separable) conv3d operation.
+
+  """
+  assert len(kernel_size) == 3
+  if separable and kernel_size[0] != 1:
+    spatial_kernel_size = [1, kernel_size[1], kernel_size[2]]
+    temporal_kernel_size = [kernel_size[0], 1, 1]
+    if isinstance(stride, list) and len(stride) == 3:
+      spatial_stride = [1, stride[1], stride[2]]
+      temporal_stride = [stride[0], 1, 1]
+    else:
+      spatial_stride = [1, stride, stride]
+      temporal_stride = [stride, 1, 1]
+    net = layers.conv3d(
+        inputs,
+        num_outputs,
+        spatial_kernel_size,
+        stride=spatial_stride,
+        padding=padding,
+        activation_fn=activation_fn,
+        normalizer_fn=normalizer_fn,
+        normalizer_params=normalizer_params,
+        weights_regularizer=weights_regularizer,
+        data_format=data_format,
+        scope=scope)
+    net = layers.conv3d(
+        net,
+        num_outputs,
+        temporal_kernel_size,
+        stride=temporal_stride,
+        padding=padding,
+        scope=scope + '/temporal',
+        activation_fn=activation_fn,
+        normalizer_fn=None,
+        data_format=data_format,
+        weights_initializer=center_initializer())
+    return net
+  else:
+    return layers.conv3d(
+        inputs,
+        num_outputs,
+        kernel_size,
+        stride=stride,
+        padding=padding,
+        activation_fn=activation_fn,
+        normalizer_fn=normalizer_fn,
+        normalizer_params=normalizer_params,
+        weights_regularizer=weights_regularizer,
+        data_format=data_format,
+        scope=scope)
+
+
+@add_arg_scope
+def inception_block_v1_3d(inputs,
+                          num_outputs_0_0a,
+                          num_outputs_1_0a,
+                          num_outputs_1_0b,
+                          num_outputs_2_0a,
+                          num_outputs_2_0b,
+                          num_outputs_3_0b,
+                          temporal_kernel_size=3,
+                          self_gating_fn=None,
+                          data_format='NDHWC',
+                          scope=''):
+  """A 3D Inception v1 block.
+
+  Builds four parallel branches from `inputs` and concatenates their outputs
+  along the channel axis of `data_format`.
+
+  This allows use of separable 3D convolutions and self-gating, as
+  described in:
+  Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu and Kevin Murphy,
+    Rethinking Spatiotemporal Feature Learning For Video Understanding.
+    https://arxiv.org/abs/1712.04851.
+
+  Args:
+    inputs: a 5-D tensor  `[batch_size, depth, height, width, channels]`.
+    num_outputs_0_0a: integer, the number of output filters for Branch 0,
+      operation Conv2d_0a_1x1.
+    num_outputs_1_0a: integer, the number of output filters for Branch 1,
+      operation Conv2d_0a_1x1.
+    num_outputs_1_0b: integer, the number of output filters for Branch 1,
+      operation Conv2d_0b_3x3.
+    num_outputs_2_0a: integer, the number of output filters for Branch 2,
+      operation Conv2d_0a_1x1.
+    num_outputs_2_0b: integer, the number of output filters for Branch 2,
+      operation Conv2d_0b_3x3.
+    num_outputs_3_0b: integer, the number of output filters for Branch 3,
+      operation Conv2d_0b_1x1.
+    temporal_kernel_size: integer, the size of the temporal convolutional
+      filters in the conv3d_spatiotemporal blocks.
+    self_gating_fn: function which optionally performs self-gating.
+      Must have two arguments, `inputs` and `scope`, and return one output
+      tensor the same size as `inputs`. If `None`, no self-gating is
+      applied.
+    data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC".
+      The data format of the input and output data. With the default format
+      "NDHWC", the data is stored in the order of: [batch, in_depth, in_height,
+      in_width, in_channels]. Alternatively, the format could be "NCDHW", the
+      data storage order is:
+      [batch, in_channels, in_depth, in_height, in_width].
+    scope: scope for `variable_scope`.
+
+  Returns:
+    A 5-D tensor `[batch_size, depth, height, width, out_channels]`, where
+    `out_channels = num_outputs_0_0a + num_outputs_1_0b + num_outputs_2_0b
+    + num_outputs_3_0b`.
+
+  """
+  use_gating = self_gating_fn is not None
+
+  # NOTE(review): `data_format` is not forwarded to any layer call below; it
+  # is only used to locate the concat axis. Presumably the layers receive it
+  # from an enclosing arg_scope -- confirm against callers.
+  with tf.compat.v1.variable_scope(scope):
+    # Branch 0: 1x1x1 convolution.
+    with tf.compat.v1.variable_scope('Branch_0'):
+      branch_0 = layers.conv3d(
+          inputs, num_outputs_0_0a, [1, 1, 1], scope='Conv2d_0a_1x1')
+      if use_gating:
+        branch_0 = self_gating_fn(branch_0, scope='Conv2d_0a_1x1')
+    # Branch 1: 1x1x1 convolution followed by a [t, 3, 3] spatiotemporal conv.
+    with tf.compat.v1.variable_scope('Branch_1'):
+      branch_1 = layers.conv3d(
+          inputs, num_outputs_1_0a, [1, 1, 1], scope='Conv2d_0a_1x1')
+      branch_1 = conv3d_spatiotemporal(
+          branch_1, num_outputs_1_0b, [temporal_kernel_size, 3, 3],
+          scope='Conv2d_0b_3x3')
+      if use_gating:
+        branch_1 = self_gating_fn(branch_1, scope='Conv2d_0b_3x3')
+    # Branch 2: same layout as Branch 1 with its own filter counts.
+    with tf.compat.v1.variable_scope('Branch_2'):
+      branch_2 = layers.conv3d(
+          inputs, num_outputs_2_0a, [1, 1, 1], scope='Conv2d_0a_1x1')
+      branch_2 = conv3d_spatiotemporal(
+          branch_2, num_outputs_2_0b, [temporal_kernel_size, 3, 3],
+          scope='Conv2d_0b_3x3')
+      if use_gating:
+        branch_2 = self_gating_fn(branch_2, scope='Conv2d_0b_3x3')
+    # Branch 3: 3x3x3 max pooling followed by a 1x1x1 convolution.
+    # NOTE(review): no stride/padding is passed to max_pool3d here; the concat
+    # below needs matching non-channel dimensions, so these presumably come
+    # from an arg_scope set up by the caller -- confirm.
+    with tf.compat.v1.variable_scope('Branch_3'):
+      branch_3 = layers.max_pool3d(inputs, [3, 3, 3], scope='MaxPool_0a_3x3')
+      branch_3 = layers.conv3d(
+          branch_3, num_outputs_3_0b, [1, 1, 1], scope='Conv2d_0b_1x1')
+      if use_gating:
+        branch_3 = self_gating_fn(branch_3, scope='Conv2d_0b_1x1')
+    # The position of 'C' in the format string is the channel axis; index 0
+    # (the batch axis) is rejected by the assert.
+    index_c = data_format.index('C')
+    assert 1 <= index_c <= 4, 'Cannot identify channel dimension.'
+    output = tf.concat([branch_0, branch_1, branch_2, branch_3], index_c)
+  return output
+
+
+def reduced_kernel_size_3d(input_tensor, kernel_size):
+  """Define kernel size which is automatically reduced for small input.
+
+  If the shape of the input images is unknown at graph construction time this
+  function assumes that the input images are large enough.
+
+  Args:
+    input_tensor: input tensor of size
+      [batch_size, time, height, width, channels].
+    kernel_size: desired kernel size of length 3, corresponding to time,
+      height and width.
+
+  Returns:
+    a tensor with the kernel size.
+  """
+  assert len(kernel_size) == 3
+  shape = input_tensor.get_shape().as_list()
+  assert len(shape) == 5
+  if None in shape[1:4]:
+    kernel_size_out = kernel_size
+  else:
+    kernel_size_out = [min(shape[1], kernel_size[0]),
+                       min(shape[2], kernel_size[1]),
+                       min(shape[3], kernel_size[2])]
+  return kernel_size_out
@@ -0,0 +1,37 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Brings all inception models under one namespace."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from nets.inception_resnet_v2 import inception_resnet_v2
+from nets.inception_resnet_v2 import inception_resnet_v2_arg_scope
+from nets.inception_resnet_v2 import inception_resnet_v2_base
+from nets.inception_v1 import inception_v1
+from nets.inception_v1 import inception_v1_arg_scope
+from nets.inception_v1 import inception_v1_base
+from nets.inception_v2 import inception_v2
+from nets.inception_v2 import inception_v2_arg_scope
+from nets.inception_v2 import inception_v2_base
+from nets.inception_v3 import inception_v3
+from nets.inception_v3 import inception_v3_arg_scope
+from nets.inception_v3 import inception_v3_base
+from nets.inception_v4 import inception_v4
+from nets.inception_v4 import inception_v4_arg_scope
+from nets.inception_v4 import inception_v4_base
+# pylint: enable=unused-import
@@ -0,0 +1,408 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition of the Inception Resnet V2 architecture.
+
+As described in http://arxiv.org/abs/1602.07261.
+
+  Inception-v4, Inception-ResNet and the Impact of Residual Connections
+    on Learning
+  Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+slim = contrib_slim
+
+
+def block35(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 35x35 resnet block."""
+  with tf.compat.v1.variable_scope(scope, 'Block35', [net], reuse=reuse):
+    with tf.compat.v1.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 32, 1, scope='Conv2d_1x1')
+    with tf.compat.v1.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 32, 3, scope='Conv2d_0b_3x3')
+    with tf.compat.v1.variable_scope('Branch_2'):
+      tower_conv2_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1')
+      tower_conv2_1 = slim.conv2d(tower_conv2_0, 48, 3, scope='Conv2d_0b_3x3')
+      tower_conv2_2 = slim.conv2d(tower_conv2_1, 64, 3, scope='Conv2d_0c_3x3')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_1, tower_conv2_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def block17(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 17x17 resnet block."""
+  with tf.compat.v1.variable_scope(scope, 'Block17', [net], reuse=reuse):
+    with tf.compat.v1.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
+    with tf.compat.v1.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 128, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 160, [1, 7],
+                                  scope='Conv2d_0b_1x7')
+      tower_conv1_2 = slim.conv2d(tower_conv1_1, 192, [7, 1],
+                                  scope='Conv2d_0c_7x1')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def block8(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None):
+  """Builds the 8x8 resnet block."""
+  with tf.compat.v1.variable_scope(scope, 'Block8', [net], reuse=reuse):
+    with tf.compat.v1.variable_scope('Branch_0'):
+      tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1')
+    with tf.compat.v1.variable_scope('Branch_1'):
+      tower_conv1_0 = slim.conv2d(net, 192, 1, scope='Conv2d_0a_1x1')
+      tower_conv1_1 = slim.conv2d(tower_conv1_0, 224, [1, 3],
+                                  scope='Conv2d_0b_1x3')
+      tower_conv1_2 = slim.conv2d(tower_conv1_1, 256, [3, 1],
+                                  scope='Conv2d_0c_3x1')
+    mixed = tf.concat(axis=3, values=[tower_conv, tower_conv1_2])
+    up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None,
+                     activation_fn=None, scope='Conv2d_1x1')
+
+    scaled_up = up * scale
+    if activation_fn == tf.nn.relu6:
+      # Use clip_by_value to simulate bandpass activation.
+      scaled_up = tf.clip_by_value(scaled_up, -6.0, 6.0)
+
+    net += scaled_up
+    if activation_fn:
+      net = activation_fn(net)
+  return net
+
+
+def inception_resnet_v2_base(inputs,
+                             final_endpoint='Conv2d_7b_1x1',
+                             output_stride=16,
+                             align_feature_maps=False,
+                             scope=None,
+                             activation_fn=tf.nn.relu):
+  """Inception model from  http://arxiv.org/abs/1602.07261.
+
+  Constructs an Inception Resnet v2 network from inputs to the given final
+  endpoint. This method can construct the network up to the final inception
+  block Conv2d_7b_1x1.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
+      'Mixed_5b', 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
+    output_stride: A scalar that specifies the requested ratio of input to
+      output spatial resolution. Only supports 8 and 16.
+    align_feature_maps: When true, changes all the VALID paddings in the network
+      to SAME padding so that the feature maps are aligned.
+    scope: Optional variable_scope.
+    activation_fn: Activation function for block scopes.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a set of activations for external use, for example summaries or
+                losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+      or if the output_stride is not 8 or 16, or if the output_stride is 8 and
+      we request an end point after 'PreAuxLogits'.
+  """
+  if output_stride != 8 and output_stride != 16:
+    raise ValueError('output_stride must be 8 or 16.')
+
+  padding = 'SAME' if align_feature_maps else 'VALID'
+
+  end_points = {}
+
+  def add_and_check_final(name, net):
+    end_points[name] = net
+    return name == final_endpoint
+
+  with tf.compat.v1.variable_scope(scope, 'InceptionResnetV2', [inputs]):
+    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                        stride=1, padding='SAME'):
+      # 149 x 149 x 32
+      net = slim.conv2d(inputs, 32, 3, stride=2, padding=padding,
+                        scope='Conv2d_1a_3x3')
+      if add_and_check_final('Conv2d_1a_3x3', net): return net, end_points
+
+      # 147 x 147 x 32
+      net = slim.conv2d(net, 32, 3, padding=padding,
+                        scope='Conv2d_2a_3x3')
+      if add_and_check_final('Conv2d_2a_3x3', net): return net, end_points
+      # 147 x 147 x 64
+      net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
+      if add_and_check_final('Conv2d_2b_3x3', net): return net, end_points
+      # 73 x 73 x 64
+      net = slim.max_pool2d(net, 3, stride=2, padding=padding,
+                            scope='MaxPool_3a_3x3')
+      if add_and_check_final('MaxPool_3a_3x3', net): return net, end_points
+      # 73 x 73 x 80
+      net = slim.conv2d(net, 80, 1, padding=padding,
+                        scope='Conv2d_3b_1x1')
+      if add_and_check_final('Conv2d_3b_1x1', net): return net, end_points
+      # 71 x 71 x 192
+      net = slim.conv2d(net, 192, 3, padding=padding,
+                        scope='Conv2d_4a_3x3')
+      if add_and_check_final('Conv2d_4a_3x3', net): return net, end_points
+      # 35 x 35 x 192
+      net = slim.max_pool2d(net, 3, stride=2, padding=padding,
+                            scope='MaxPool_5a_3x3')
+      if add_and_check_final('MaxPool_5a_3x3', net): return net, end_points
+
+      # 35 x 35 x 320
+      with tf.compat.v1.variable_scope('Mixed_5b'):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          tower_conv1_0 = slim.conv2d(net, 48, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1_0, 64, 5,
+                                      scope='Conv2d_0b_5x5')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          tower_conv2_0 = slim.conv2d(net, 64, 1, scope='Conv2d_0a_1x1')
+          tower_conv2_1 = slim.conv2d(tower_conv2_0, 96, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv2_2 = slim.conv2d(tower_conv2_1, 96, 3,
+                                      scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          tower_pool = slim.avg_pool2d(net, 3, stride=1, padding='SAME',
+                                       scope='AvgPool_0a_3x3')
+          tower_pool_1 = slim.conv2d(tower_pool, 64, 1,
+                                     scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            [tower_conv, tower_conv1_1, tower_conv2_2, tower_pool_1], 3)
+
+      if add_and_check_final('Mixed_5b', net): return net, end_points
+      # TODO(alemi): Register intermediate endpoints
+      net = slim.repeat(net, 10, block35, scale=0.17,
+                        activation_fn=activation_fn)
+
+      # 17 x 17 x 1088 if output_stride == 8,
+      # 33 x 33 x 1088 if output_stride == 16
+      use_atrous = output_stride == 8
+
+      with tf.compat.v1.variable_scope('Mixed_6a'):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 384, 3, stride=1 if use_atrous else 2,
+                                   padding=padding,
+                                   scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          tower_conv1_0 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1_0, 256, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv1_2 = slim.conv2d(tower_conv1_1, 384, 3,
+                                      stride=1 if use_atrous else 2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          tower_pool = slim.max_pool2d(net, 3, stride=1 if use_atrous else 2,
+                                       padding=padding,
+                                       scope='MaxPool_1a_3x3')
+        net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)
+
+      if add_and_check_final('Mixed_6a', net): return net, end_points
+
+      # TODO(alemi): register intermediate endpoints
+      with slim.arg_scope([slim.conv2d], rate=2 if use_atrous else 1):
+        net = slim.repeat(net, 20, block17, scale=0.10,
+                          activation_fn=activation_fn)
+      if add_and_check_final('PreAuxLogits', net): return net, end_points
+
+      if output_stride == 8:
+        # TODO(gpapan): Properly support output_stride for the rest of the net.
+        raise ValueError('output_stride==8 is only supported up to the '
+                         'PreAuxlogits end_point for now.')
+
+      # 8 x 8 x 2080
+      with tf.compat.v1.variable_scope('Mixed_7a'):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          tower_conv = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2,
+                                     padding=padding,
+                                     scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          tower_conv1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv1_1 = slim.conv2d(tower_conv1, 288, 3, stride=2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          tower_conv2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1')
+          tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3,
+                                      scope='Conv2d_0b_3x3')
+          tower_conv2_2 = slim.conv2d(tower_conv2_1, 320, 3, stride=2,
+                                      padding=padding,
+                                      scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          tower_pool = slim.max_pool2d(net, 3, stride=2,
+                                       padding=padding,
+                                       scope='MaxPool_1a_3x3')
+        net = tf.concat(
+            [tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool], 3)
+
+      if add_and_check_final('Mixed_7a', net): return net, end_points
+
+      # TODO(alemi): register intermediate endpoints
+      net = slim.repeat(net, 9, block8, scale=0.20, activation_fn=activation_fn)
+      net = block8(net, activation_fn=None)
+
+      # 8 x 8 x 1536
+      net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
+      if add_and_check_final('Conv2d_7b_1x1', net): return net, end_points
+
+    raise ValueError('final_endpoint (%s) not recognized', final_endpoint)
+
+
+def inception_resnet_v2(inputs, num_classes=1001, is_training=True,
+                        dropout_keep_prob=0.8,
+                        reuse=None,
+                        scope='InceptionResnetV2',
+                        create_aux_logits=True,
+                        activation_fn=tf.nn.relu):
+  """Creates the Inception Resnet V2 model.
+
+  Args:
+    inputs: a 4-D tensor of size [batch_size, height, width, 3].
+      Dimension batch_size may be undefined. If create_aux_logits is false,
+      also height and width may be undefined.
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before  dropout)
+      are returned instead.
+    is_training: whether is training or not.
+    dropout_keep_prob: float, the fraction to keep before final layer.
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    scope: Optional variable_scope.
+    create_aux_logits: Whether to include the auxilliary logits.
+    activation_fn: Activation function for conv2d.
+
+  Returns:
+    net: the output of the logits layer (if num_classes is a non-zero integer),
+      or the non-dropped-out input to the logits layer (if num_classes is 0 or
+      None).
+    end_points: the set of end_points from the inception model.
+  """
+  end_points = {}
+
+  with tf.compat.v1.variable_scope(
+      scope, 'InceptionResnetV2', [inputs], reuse=reuse) as scope:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=is_training):
+
+      net, end_points = inception_resnet_v2_base(inputs, scope=scope,
+                                                 activation_fn=activation_fn)
+
+      if create_aux_logits and num_classes:
+        with tf.compat.v1.variable_scope('AuxLogits'):
+          aux = end_points['PreAuxLogits']
+          aux = slim.avg_pool2d(aux, 5, stride=3, padding='VALID',
+                                scope='Conv2d_1a_3x3')
+          aux = slim.conv2d(aux, 128, 1, scope='Conv2d_1b_1x1')
+          aux = slim.conv2d(aux, 768, aux.get_shape()[1:3],
+                            padding='VALID', scope='Conv2d_2a_5x5')
+          aux = slim.flatten(aux)
+          aux = slim.fully_connected(aux, num_classes, activation_fn=None,
+                                     scope='Logits')
+          end_points['AuxLogits'] = aux
+
+      with tf.compat.v1.variable_scope('Logits'):
+        # TODO(sguada,arnoegw): Consider adding a parameter global_pool which
+        # can be set to False to disable pooling here (as in resnet_*()).
+        kernel_size = net.get_shape()[1:3]
+        if kernel_size.is_fully_defined():
+          net = slim.avg_pool2d(net, kernel_size, padding='VALID',
+                                scope='AvgPool_1a_8x8')
+        else:
+          net = tf.reduce_mean(
+              input_tensor=net, axis=[1, 2], keepdims=True, name='global_pool')
+        end_points['global_pool'] = net
+        if not num_classes:
+          return net, end_points
+        net = slim.flatten(net)
+        net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
+                           scope='Dropout')
+        end_points['PreLogitsFlatten'] = net
+        logits = slim.fully_connected(net, num_classes, activation_fn=None,
+                                      scope='Logits')
+        end_points['Logits'] = logits
+        end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions')
+
+    return logits, end_points
+inception_resnet_v2.default_image_size = 299
+
+
+def inception_resnet_v2_arg_scope(
+    weight_decay=0.00004,
+    batch_norm_decay=0.9997,
+    batch_norm_epsilon=0.001,
+    activation_fn=tf.nn.relu,
+    batch_norm_updates_collections=tf.compat.v1.GraphKeys.UPDATE_OPS,
+    batch_norm_scale=False):
+  """Returns the scope with the default parameters for inception_resnet_v2.
+
+  Args:
+    weight_decay: the weight decay for weights variables.
+    batch_norm_decay: decay for the moving average of batch_norm momentums.
+    batch_norm_epsilon: small float added to variance to avoid dividing by zero.
+    activation_fn: Activation function for conv2d.
+    batch_norm_updates_collections: Collection for the update ops for
+      batch norm.
+    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
+      activations in the batch normalization layer.
+
+  Returns:
+    a arg_scope with the parameters needed for inception_resnet_v2.
+  """
+  # Set weight_decay for weights in conv2d and fully_connected layers.
+  with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                      weights_regularizer=slim.l2_regularizer(weight_decay),
+                      biases_regularizer=slim.l2_regularizer(weight_decay)):
+
+    batch_norm_params = {
+        'decay': batch_norm_decay,
+        'epsilon': batch_norm_epsilon,
+        'updates_collections': batch_norm_updates_collections,
+        'fused': None,  # Use fused batch norm if possible.
+        'scale': batch_norm_scale,
+    }
+    # Set activation_fn and parameters for batch_norm.
+    with slim.arg_scope([slim.conv2d], activation_fn=activation_fn,
+                        normalizer_fn=slim.batch_norm,
+                        normalizer_params=batch_norm_params) as scope:
+      return scope
@@ -0,0 +1,338 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slim.inception_resnet_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception
+
+
+class InceptionTest(tf.test.TestCase):
+
+  def testBuildLogits(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, endpoints = inception.inception_resnet_v2(inputs, num_classes)
+      self.assertTrue('AuxLogits' in endpoints)
+      auxlogits = endpoints['AuxLogits']
+      self.assertTrue(
+          auxlogits.op.name.startswith('InceptionResnetV2/AuxLogits'))
+      self.assertListEqual(auxlogits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+
+  def testBuildWithoutAuxLogits(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, endpoints = inception.inception_resnet_v2(inputs, num_classes,
+                                                        create_aux_logits=False)
+      self.assertTrue('AuxLogits' not in endpoints)
+      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+
+  def testBuildNoClasses(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = None
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      net, endpoints = inception.inception_resnet_v2(inputs, num_classes)
+      self.assertTrue('AuxLogits' not in endpoints)
+      self.assertTrue('Logits' not in endpoints)
+      self.assertTrue(
+          net.op.name.startswith('InceptionResnetV2/Logits/AvgPool'))
+      self.assertListEqual(net.get_shape().as_list(), [batch_size, 1, 1, 1536])
+
+  def testBuildEndPoints(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      _, end_points = inception.inception_resnet_v2(inputs, num_classes)
+      self.assertTrue('Logits' in end_points)
+      logits = end_points['Logits']
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      self.assertTrue('AuxLogits' in end_points)
+      aux_logits = end_points['AuxLogits']
+      self.assertListEqual(aux_logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Conv2d_7b_1x1']
+      self.assertListEqual(pre_pool.get_shape().as_list(),
+                           [batch_size, 8, 8, 1536])
+
+  def testBuildBaseNetwork(self):
+    batch_size = 5
+    height, width = 299, 299
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    net, end_points = inception.inception_resnet_v2_base(inputs)
+    self.assertTrue(net.op.name.startswith('InceptionResnetV2/Conv2d_7b_1x1'))
+    self.assertListEqual(net.get_shape().as_list(),
+                         [batch_size, 8, 8, 1536])
+    expected_endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+                          'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3',
+                          'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_6a',
+                          'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
+    self.assertItemsEqual(end_points.keys(), expected_endpoints)
+
+  def testBuildOnlyUptoFinalEndpoint(self):
+    batch_size = 5
+    height, width = 299, 299
+    endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+                 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3',
+                 'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_6a',
+                 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
+    for index, endpoint in enumerate(endpoints):
+      with tf.Graph().as_default():
+        inputs = tf.random.uniform((batch_size, height, width, 3))
+        out_tensor, end_points = inception.inception_resnet_v2_base(
+            inputs, final_endpoint=endpoint)
+        if endpoint != 'PreAuxLogits':
+          self.assertTrue(out_tensor.op.name.startswith(
+              'InceptionResnetV2/' + endpoint))
+        self.assertItemsEqual(endpoints[:index+1], end_points.keys())
+
+  def testBuildAndCheckAllEndPointsUptoPreAuxLogits(self):
+    batch_size = 5
+    height, width = 299, 299
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_resnet_v2_base(
+        inputs, final_endpoint='PreAuxLogits')
+    endpoints_shapes = {'Conv2d_1a_3x3': [5, 149, 149, 32],
+                        'Conv2d_2a_3x3': [5, 147, 147, 32],
+                        'Conv2d_2b_3x3': [5, 147, 147, 64],
+                        'MaxPool_3a_3x3': [5, 73, 73, 64],
+                        'Conv2d_3b_1x1': [5, 73, 73, 80],
+                        'Conv2d_4a_3x3': [5, 71, 71, 192],
+                        'MaxPool_5a_3x3': [5, 35, 35, 192],
+                        'Mixed_5b': [5, 35, 35, 320],
+                        'Mixed_6a': [5, 17, 17, 1088],
+                        'PreAuxLogits': [5, 17, 17, 1088]
+                       }
+
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testBuildAndCheckAllEndPointsUptoPreAuxLogitsWithAlignedFeatureMaps(self):
+    batch_size = 5
+    height, width = 299, 299
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_resnet_v2_base(
+        inputs, final_endpoint='PreAuxLogits', align_feature_maps=True)
+    endpoints_shapes = {'Conv2d_1a_3x3': [5, 150, 150, 32],
+                        'Conv2d_2a_3x3': [5, 150, 150, 32],
+                        'Conv2d_2b_3x3': [5, 150, 150, 64],
+                        'MaxPool_3a_3x3': [5, 75, 75, 64],
+                        'Conv2d_3b_1x1': [5, 75, 75, 80],
+                        'Conv2d_4a_3x3': [5, 75, 75, 192],
+                        'MaxPool_5a_3x3': [5, 38, 38, 192],
+                        'Mixed_5b': [5, 38, 38, 320],
+                        'Mixed_6a': [5, 19, 19, 1088],
+                        'PreAuxLogits': [5, 19, 19, 1088]
+                       }
+
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testBuildAndCheckAllEndPointsUptoPreAuxLogitsWithOutputStrideEight(self):
+    batch_size = 5
+    height, width = 299, 299
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_resnet_v2_base(
+        inputs, final_endpoint='PreAuxLogits', output_stride=8)
+    endpoints_shapes = {'Conv2d_1a_3x3': [5, 149, 149, 32],
+                        'Conv2d_2a_3x3': [5, 147, 147, 32],
+                        'Conv2d_2b_3x3': [5, 147, 147, 64],
+                        'MaxPool_3a_3x3': [5, 73, 73, 64],
+                        'Conv2d_3b_1x1': [5, 73, 73, 80],
+                        'Conv2d_4a_3x3': [5, 71, 71, 192],
+                        'MaxPool_5a_3x3': [5, 35, 35, 192],
+                        'Mixed_5b': [5, 35, 35, 320],
+                        'Mixed_6a': [5, 33, 33, 1088],
+                        'PreAuxLogits': [5, 33, 33, 1088]
+                       }
+
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testVariablesSetDevice(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      # Force all Variables to reside on the device.
+      with tf.compat.v1.variable_scope('on_cpu'), tf.device('/cpu:0'):
+        inception.inception_resnet_v2(inputs, num_classes)
+      with tf.compat.v1.variable_scope('on_gpu'), tf.device('/gpu:0'):
+        inception.inception_resnet_v2(inputs, num_classes)
+      for v in tf.compat.v1.get_collection(
+          tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='on_cpu'):
+        self.assertDeviceEqual(v.device, '/cpu:0')
+      for v in tf.compat.v1.get_collection(
+          tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='on_gpu'):
+        self.assertDeviceEqual(v.device, '/gpu:0')
+
+  def testHalfSizeImages(self):
+    batch_size = 5
+    height, width = 150, 150
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, end_points = inception.inception_resnet_v2(inputs, num_classes)
+      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Conv2d_7b_1x1']
+      self.assertListEqual(pre_pool.get_shape().as_list(),
+                           [batch_size, 3, 3, 1536])
+
+  def testGlobalPool(self):
+    batch_size = 1
+    height, width = 330, 400
+    num_classes = 1000
+    with self.test_session():
+      inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, end_points = inception.inception_resnet_v2(inputs, num_classes)
+      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Conv2d_7b_1x1']
+      self.assertListEqual(pre_pool.get_shape().as_list(),
+                           [batch_size, 8, 11, 1536])
+
+  def testGlobalPoolUnknownImageShape(self):
+    batch_size = 1
+    height, width = 330, 400
+    num_classes = 1000
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(tf.float32, (batch_size, None, None, 3))
+      logits, end_points = inception.inception_resnet_v2(
+          inputs, num_classes, create_aux_logits=False)
+      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Conv2d_7b_1x1']
+      images = tf.random.uniform((batch_size, height, width, 3))
+      sess.run(tf.compat.v1.global_variables_initializer())
+      logits_out, pre_pool_out = sess.run([logits, pre_pool],
+                                          {inputs: images.eval()})
+      self.assertTupleEqual(logits_out.shape, (batch_size, num_classes))
+      self.assertTupleEqual(pre_pool_out.shape, (batch_size, 8, 11, 1536))
+
+  def testUnknownBatchSize(self):
+    batch_size = 1
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(tf.float32, (None, height, width, 3))
+      logits, _ = inception.inception_resnet_v2(inputs, num_classes)
+      self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [None, num_classes])
+      images = tf.random.uniform((batch_size, height, width, 3))
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(logits, {inputs: images.eval()})
+      self.assertEquals(output.shape, (batch_size, num_classes))
+
+  def testEvaluation(self):
+    batch_size = 2
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session() as sess:
+      eval_inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = inception.inception_resnet_v2(eval_inputs,
+                                                num_classes,
+                                                is_training=False)
+      predictions = tf.argmax(input=logits, axis=1)
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (batch_size,))
+
+  def testTrainEvalWithReuse(self):
+    train_batch_size = 5
+    eval_batch_size = 2
+    height, width = 150, 150
+    num_classes = 1000
+    with self.test_session() as sess:
+      train_inputs = tf.random.uniform((train_batch_size, height, width, 3))
+      inception.inception_resnet_v2(train_inputs, num_classes)
+      eval_inputs = tf.random.uniform((eval_batch_size, height, width, 3))
+      logits, _ = inception.inception_resnet_v2(eval_inputs,
+                                                num_classes,
+                                                is_training=False,
+                                                reuse=True)
+      predictions = tf.argmax(input=logits, axis=1)
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (eval_batch_size,))
+
+  def testNoBatchNormScaleByDefault(self):
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with contrib_slim.arg_scope(inception.inception_resnet_v2_arg_scope()):
+      inception.inception_resnet_v2(inputs, num_classes, is_training=False)
+
+    self.assertEqual(tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'), [])
+
+  def testBatchNormScale(self):
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with contrib_slim.arg_scope(
+        inception.inception_resnet_v2_arg_scope(batch_norm_scale=True)):
+      inception.inception_resnet_v2(inputs, num_classes, is_training=False)
+
+    gamma_names = set(
+        v.op.name
+        for v in tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'))
+    self.assertGreater(len(gamma_names), 0)
+    for v in tf.compat.v1.global_variables('.*/BatchNorm/moving_mean:0$'):
+      self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
+
+
+# Run the test cases in this module when the file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,84 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains common code shared by all inception models.
+
+Usage of arg scope:
+  with slim.arg_scope(inception_arg_scope()):
+    logits, end_points = inception.inception_v3(images, num_classes,
+                                                is_training=is_training)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+slim = contrib_slim
+
+
+def inception_arg_scope(
+    weight_decay=0.00004,
+    use_batch_norm=True,
+    batch_norm_decay=0.9997,
+    batch_norm_epsilon=0.001,
+    activation_fn=tf.nn.relu,
+    batch_norm_updates_collections=tf.compat.v1.GraphKeys.UPDATE_OPS,
+    batch_norm_scale=False):
+  """Builds the default slim arg scope used by the inception models.
+
+  Args:
+    weight_decay: The weight decay used by the L2 regularizer on the weights
+      of conv2d and fully_connected layers.
+    use_batch_norm: If `True`, slim.batch_norm is set as the normalizer of
+      conv2d layers; otherwise conv2d layers get no normalizer.
+    batch_norm_decay: Decay for batch norm moving average.
+    batch_norm_epsilon: Small float added to variance to avoid dividing by zero
+      in batch norm.
+    activation_fn: Activation function for conv2d.
+    batch_norm_updates_collections: Collection for the update ops for
+      batch norm.
+    batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
+      activations in the batch normalization layer.
+
+  Returns:
+    An `arg_scope` to use for the inception models.
+  """
+  if use_batch_norm:
+    normalizer_fn = slim.batch_norm
+    normalizer_params = {
+        # Decay for the moving averages.
+        'decay': batch_norm_decay,
+        # epsilon to prevent 0s in variance.
+        'epsilon': batch_norm_epsilon,
+        # collection containing update_ops.
+        'updates_collections': batch_norm_updates_collections,
+        # use fused batch norm if possible.
+        'fused': None,
+        'scale': batch_norm_scale,
+    }
+  else:
+    normalizer_fn, normalizer_params = None, {}
+
+  # Conv and FC layers share the weight regularizer; the remaining defaults
+  # apply to conv2d only.
+  regularizer = slim.l2_regularizer(weight_decay)
+  with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                      weights_regularizer=regularizer):
+    with slim.arg_scope(
+        [slim.conv2d],
+        weights_initializer=slim.variance_scaling_initializer(),
+        activation_fn=activation_fn,
+        normalizer_fn=normalizer_fn,
+        normalizer_params=normalizer_params) as conv_scope:
+      return conv_scope
@@ -0,0 +1,347 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition for inception v1 classification network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception_utils
+
+slim = contrib_slim
+
+# Shorthand for a truncated-normal initializer with mean 0.0 and the given
+# standard deviation.
+# pylint: disable=g-long-lambda
+trunc_normal = lambda stddev: tf.compat.v1.truncated_normal_initializer(
+    0.0, stddev)
+
+
+def inception_v1_base(inputs,
+                      final_endpoint='Mixed_5c',
+                      include_root_block=True,
+                      scope='InceptionV1'):
+  """Defines the Inception V1 base architecture.
+
+  This architecture is defined in:
+    Going deeper with convolutions
+    Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
+    Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
+    http://arxiv.org/pdf/1409.4842v1.pdf.
+
+  Layers are built in order; construction stops and the function returns as
+  soon as the layer named by final_endpoint has been added.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+      'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
+      'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e',
+      'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c']. If
+      include_root_block is False, ['Conv2d_1a_7x7', 'MaxPool_2a_3x3',
+      'Conv2d_2b_1x1', 'Conv2d_2c_3x3', 'MaxPool_3a_3x3'] will not be available.
+    include_root_block: If True, include the convolution and max-pooling layers
+      before the inception modules. If False, excludes those layers.
+    scope: Optional variable_scope.
+
+  Returns:
+    net: the activation tensor of the layer named by final_endpoint.
+    end_points: a dictionary from components of the network to the
+      corresponding activation, holding every layer built up to and including
+      final_endpoint.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values
+      (this includes a root-block endpoint when include_root_block is False).
+  """
+  # Endpoint name -> activation tensor, filled in as each layer is built.
+  end_points = {}
+  with tf.compat.v1.variable_scope(scope, 'InceptionV1', [inputs]):
+    with slim.arg_scope(
+        [slim.conv2d, slim.fully_connected],
+        weights_initializer=trunc_normal(0.01)):
+      with slim.arg_scope([slim.conv2d, slim.max_pool2d],
+                          stride=1, padding='SAME'):
+        net = inputs
+        # Root block: the conv / max-pool layers ahead of the inception
+        # modules. When skipped, `inputs` feeds Mixed_3b directly.
+        if include_root_block:
+          end_point = 'Conv2d_1a_7x7'
+          net = slim.conv2d(inputs, 64, [7, 7], stride=2, scope=end_point)
+          end_points[end_point] = net
+          if final_endpoint == end_point:
+            return net, end_points
+          end_point = 'MaxPool_2a_3x3'
+          net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
+          end_points[end_point] = net
+          if final_endpoint == end_point:
+            return net, end_points
+          end_point = 'Conv2d_2b_1x1'
+          net = slim.conv2d(net, 64, [1, 1], scope=end_point)
+          end_points[end_point] = net
+          if final_endpoint == end_point:
+            return net, end_points
+          end_point = 'Conv2d_2c_3x3'
+          net = slim.conv2d(net, 192, [3, 3], scope=end_point)
+          end_points[end_point] = net
+          if final_endpoint == end_point:
+            return net, end_points
+          end_point = 'MaxPool_3a_3x3'
+          net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
+          end_points[end_point] = net
+          if final_endpoint == end_point:
+            return net, end_points
+
+        # Every Mixed_* module below applies four parallel branches to `net`
+        # and concatenates their outputs along axis 3.
+        end_point = 'Mixed_3b'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 96, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 128, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 16, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 32, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 32, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_3c'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 192, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 96, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'MaxPool_4a_3x3'
+        net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_4b'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 192, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 96, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 208, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 16, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 48, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_4c'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 160, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 112, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 224, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 24, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 64, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_4d'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 256, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 24, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 64, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_4e'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 112, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 144, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 288, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 64, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_4f'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 256, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 160, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 320, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 128, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'MaxPool_5a_2x2'
+        net = slim.max_pool2d(net, [2, 2], stride=2, scope=end_point)
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_5b'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 256, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 160, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 320, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1')
+            # NOTE(review): this scope is 'Conv2d_0a_3x3' while every other
+            # module uses 'Conv2d_0b_3x3' here. Presumably kept so variable
+            # names match existing checkpoints -- confirm before renaming.
+            branch_2 = slim.conv2d(branch_2, 128, [3, 3], scope='Conv2d_0a_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+
+        end_point = 'Mixed_5c'
+        with tf.compat.v1.variable_scope(end_point):
+          with tf.compat.v1.variable_scope('Branch_0'):
+            branch_0 = slim.conv2d(net, 384, [1, 1], scope='Conv2d_0a_1x1')
+          with tf.compat.v1.variable_scope('Branch_1'):
+            branch_1 = slim.conv2d(net, 192, [1, 1], scope='Conv2d_0a_1x1')
+            branch_1 = slim.conv2d(branch_1, 384, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_2'):
+            branch_2 = slim.conv2d(net, 48, [1, 1], scope='Conv2d_0a_1x1')
+            branch_2 = slim.conv2d(branch_2, 128, [3, 3], scope='Conv2d_0b_3x3')
+          with tf.compat.v1.variable_scope('Branch_3'):
+            branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+            branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
+          net = tf.concat(
+              axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if final_endpoint == end_point: return net, end_points
+    # Reached only when none of the endpoints built above matched
+    # final_endpoint.
+    raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+
+def inception_v1(inputs,
+                 num_classes=1000,
+                 is_training=True,
+                 dropout_keep_prob=0.8,
+                 prediction_fn=slim.softmax,
+                 spatial_squeeze=True,
+                 reuse=None,
+                 scope='InceptionV1',
+                 global_pool=False):
+  """Builds the full Inception V1 classifier on top of inception_v1_base.
+
+  The architecture is described in:
+
+    Going deeper with convolutions
+    Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
+    Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
+    http://arxiv.org/pdf/1409.4842v1.pdf.
+
+  The default image size used to train this network is 224x224.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before dropout)
+      are returned instead.
+    is_training: whether is training or not; forwarded to the batch_norm and
+      dropout layers.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    prediction_fn: a function to get predictions out of logits.
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
+      shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    scope: Optional variable_scope.
+    global_pool: Optional boolean flag to control the avgpooling before the
+      logits layer. If false or unset, pooling is done with a fixed window
+      that reduces default-sized inputs to 1x1, while larger inputs lead to
+      larger outputs. If true, any input size is pooled down to 1x1.
+
+  Returns:
+    net: a Tensor with the logits (pre-softmax activations) if num_classes
+      is a non-zero integer, or the non-dropped-out input to the logits layer
+      if num_classes is 0 or None.
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+  """
+  with tf.compat.v1.variable_scope(
+      scope, 'InceptionV1', [inputs], reuse=reuse) as scope:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=is_training):
+      net, end_points = inception_v1_base(inputs, scope=scope)
+      with tf.compat.v1.variable_scope('Logits'):
+        # Final pooling: either a global mean over the spatial axes or a
+        # fixed 7x7 average-pooling window.
+        if global_pool:
+          pool_name = 'global_pool'
+          net = tf.reduce_mean(
+              input_tensor=net, axis=[1, 2], keepdims=True, name=pool_name)
+        else:
+          pool_name = 'AvgPool_0a_7x7'
+          net = slim.avg_pool2d(net, [7, 7], stride=1, scope=pool_name)
+        end_points[pool_name] = net
+
+        # Without classes there is no logits layer; hand back the features.
+        if not num_classes:
+          return net, end_points
+
+        # Prediction head: dropout, 1x1 conv to num_classes, optional squeeze.
+        dropped = slim.dropout(net, dropout_keep_prob, scope='Dropout_0b')
+        logits = slim.conv2d(dropped, num_classes, [1, 1], activation_fn=None,
+                             normalizer_fn=None, scope='Conv2d_0c_1x1')
+        if spatial_squeeze:
+          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
+
+        end_points['Logits'] = logits
+        end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+  return logits, end_points
+inception_v1.default_image_size = 224
+
+# Inception V1 uses the arg scope defined in inception_utils unchanged.
+inception_v1_arg_scope = inception_utils.inception_arg_scope
@@ -0,0 +1,300 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for nets.inception_v1."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception
+
+slim = contrib_slim
+
+
+class InceptionV1Test(tf.test.TestCase):
+  """Graph-construction and smoke tests for the Inception V1 network."""
+
+  def testBuildClassificationNetwork(self):
+    """Checks op name and shapes of logits and predictions."""
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v1(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith(
+        'InceptionV1/Logits/SpatialSqueeze'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertIn('Predictions', end_points)
+    self.assertListEqual(end_points['Predictions'].get_shape().as_list(),
+                         [batch_size, num_classes])
+
+  def testBuildPreLogitsNetwork(self):
+    """Checks that num_classes=None returns pooled features, not logits."""
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = None
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    net, end_points = inception.inception_v1(inputs, num_classes)
+    self.assertTrue(net.op.name.startswith('InceptionV1/Logits/AvgPool'))
+    self.assertListEqual(net.get_shape().as_list(), [batch_size, 1, 1, 1024])
+    self.assertNotIn('Logits', end_points)
+    self.assertNotIn('Predictions', end_points)
+
+  def testBuildBaseNetwork(self):
+    """Checks the output and the endpoint names of the default base network."""
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    # The default final endpoint is Mixed_5c, hence the variable name.
+    mixed_5c, end_points = inception.inception_v1_base(inputs)
+    self.assertTrue(mixed_5c.op.name.startswith('InceptionV1/Mixed_5c'))
+    self.assertListEqual(mixed_5c.get_shape().as_list(),
+                         [batch_size, 7, 7, 1024])
+    expected_endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+                          'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b',
+                          'Mixed_3c', 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c',
+                          'Mixed_4d', 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2',
+                          'Mixed_5b', 'Mixed_5c']
+    self.assertItemsEqual(end_points.keys(), expected_endpoints)
+
+  def testBuildOnlyUptoFinalEndpoint(self):
+    """Checks that building stops at each requested final endpoint."""
+    batch_size = 5
+    height, width = 224, 224
+    endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+                 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
+                 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d',
+                 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b',
+                 'Mixed_5c']
+    for index, endpoint in enumerate(endpoints):
+      # A fresh graph per endpoint keeps the variable scopes from colliding.
+      with tf.Graph().as_default():
+        inputs = tf.random.uniform((batch_size, height, width, 3))
+        out_tensor, end_points = inception.inception_v1_base(
+            inputs, final_endpoint=endpoint)
+        self.assertTrue(out_tensor.op.name.startswith(
+            'InceptionV1/' + endpoint))
+        self.assertItemsEqual(endpoints[:index+1], end_points.keys())
+
+  def testBuildAndCheckAllEndPointsUptoMixed5c(self):
+    """Checks the static shape of every endpoint for 224x224 inputs."""
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v1_base(inputs,
+                                                final_endpoint='Mixed_5c')
+    endpoints_shapes = {
+        'Conv2d_1a_7x7': [5, 112, 112, 64],
+        'MaxPool_2a_3x3': [5, 56, 56, 64],
+        'Conv2d_2b_1x1': [5, 56, 56, 64],
+        'Conv2d_2c_3x3': [5, 56, 56, 192],
+        'MaxPool_3a_3x3': [5, 28, 28, 192],
+        'Mixed_3b': [5, 28, 28, 256],
+        'Mixed_3c': [5, 28, 28, 480],
+        'MaxPool_4a_3x3': [5, 14, 14, 480],
+        'Mixed_4b': [5, 14, 14, 512],
+        'Mixed_4c': [5, 14, 14, 512],
+        'Mixed_4d': [5, 14, 14, 512],
+        'Mixed_4e': [5, 14, 14, 528],
+        'Mixed_4f': [5, 14, 14, 832],
+        'MaxPool_5a_2x2': [5, 7, 7, 832],
+        'Mixed_5b': [5, 7, 7, 832],
+        'Mixed_5c': [5, 7, 7, 1024]
+    }
+
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name, expected_shape in endpoints_shapes.items():
+      self.assertIn(endpoint_name, end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testModelHasExpectedNumberOfParameters(self):
+    """Checks the total parameter count of the base network."""
+    batch_size = 5
+    height, width = 224, 224
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    with slim.arg_scope(inception.inception_v1_arg_scope()):
+      inception.inception_v1_base(inputs)
+    total_params, _ = slim.model_analyzer.analyze_vars(
+        slim.get_model_variables())
+    self.assertAlmostEqual(5607184, total_params)
+
+  def testHalfSizeImages(self):
+    """Checks the Mixed_5c output shape for 112x112 inputs."""
+    batch_size = 5
+    height, width = 112, 112
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    mixed_5c, _ = inception.inception_v1_base(inputs)
+    self.assertTrue(mixed_5c.op.name.startswith('InceptionV1/Mixed_5c'))
+    self.assertListEqual(mixed_5c.get_shape().as_list(),
+                         [batch_size, 4, 4, 1024])
+
+  def testBuildBaseNetworkWithoutRootBlock(self):
+    """Checks endpoints and shapes when the root block is left out."""
+    batch_size = 5
+    height, width = 28, 28
+    channels = 192
+
+    inputs = tf.random.uniform((batch_size, height, width, channels))
+    _, end_points = inception.inception_v1_base(
+        inputs, include_root_block=False)
+    endpoints_shapes = {
+        'Mixed_3b': [5, 28, 28, 256],
+        'Mixed_3c': [5, 28, 28, 480],
+        'MaxPool_4a_3x3': [5, 14, 14, 480],
+        'Mixed_4b': [5, 14, 14, 512],
+        'Mixed_4c': [5, 14, 14, 512],
+        'Mixed_4d': [5, 14, 14, 512],
+        'Mixed_4e': [5, 14, 14, 528],
+        'Mixed_4f': [5, 14, 14, 832],
+        'MaxPool_5a_2x2': [5, 7, 7, 832],
+        'Mixed_5b': [5, 7, 7, 832],
+        'Mixed_5c': [5, 7, 7, 1024]
+    }
+
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name, expected_shape in endpoints_shapes.items():
+      self.assertIn(endpoint_name, end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testUnknownImageShape(self):
+    """Runs the network on a placeholder whose height and width are unknown."""
+    tf.compat.v1.reset_default_graph()
+    batch_size = 2
+    height, width = 224, 224
+    num_classes = 1000
+    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(
+          tf.float32, shape=(batch_size, None, None, 3))
+      logits, end_points = inception.inception_v1(inputs, num_classes)
+      self.assertTrue(logits.op.name.startswith('InceptionV1/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_5c']
+      feed_dict = {inputs: input_np}
+      tf.compat.v1.global_variables_initializer().run()
+      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
+      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 7, 7, 1024])
+
+  def testGlobalPoolUnknownImageShape(self):
+    """Runs the global_pool variant on a non-default 250x300 input."""
+    tf.compat.v1.reset_default_graph()
+    batch_size = 1
+    height, width = 250, 300
+    num_classes = 1000
+    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(
+          tf.float32, shape=(batch_size, None, None, 3))
+      logits, end_points = inception.inception_v1(inputs, num_classes,
+                                                  global_pool=True)
+      self.assertTrue(logits.op.name.startswith('InceptionV1/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_5c']
+      feed_dict = {inputs: input_np}
+      tf.compat.v1.global_variables_initializer().run()
+      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
+      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 8, 10, 1024])
+
+  def testUnknowBatchSize(self):
+    """Builds with an unknown batch dimension and feeds a batch of one."""
+    batch_size = 1
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.compat.v1.placeholder(tf.float32, (None, height, width, 3))
+    logits, _ = inception.inception_v1(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV1/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [None, num_classes])
+    images = tf.random.uniform((batch_size, height, width, 3))
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(logits, {inputs: images.eval()})
+      # assertEqual replaces the deprecated unittest alias assertEquals.
+      self.assertEqual(output.shape, (batch_size, num_classes))
+
+  def testEvaluation(self):
+    """Runs argmax predictions of an is_training=False network once."""
+    batch_size = 2
+    height, width = 224, 224
+    num_classes = 1000
+
+    eval_inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, _ = inception.inception_v1(eval_inputs, num_classes,
+                                       is_training=False)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEqual(output.shape, (batch_size,))
+
+  def testTrainEvalWithReuse(self):
+    """Builds the network twice, the second time with reuse=True."""
+    train_batch_size = 5
+    eval_batch_size = 2
+    height, width = 224, 224
+    num_classes = 1000
+
+    train_inputs = tf.random.uniform((train_batch_size, height, width, 3))
+    inception.inception_v1(train_inputs, num_classes)
+    eval_inputs = tf.random.uniform((eval_batch_size, height, width, 3))
+    logits, _ = inception.inception_v1(eval_inputs, num_classes, reuse=True)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEqual(output.shape, (eval_batch_size,))
+
+  def testLogitsNotSqueezed(self):
+    """Checks that spatial_squeeze=False keeps logits as [1, 1, 1, C]."""
+    num_classes = 25
+    images = tf.random.uniform([1, 224, 224, 3])
+    logits, _ = inception.inception_v1(images,
+                                       num_classes=num_classes,
+                                       spatial_squeeze=False)
+
+    with self.test_session() as sess:
+      tf.compat.v1.global_variables_initializer().run()
+      logits_out = sess.run(logits)
+      self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
+
+  def testNoBatchNormScaleByDefault(self):
+    """Checks that the default arg scope creates no BatchNorm gamma variable."""
+    height, width = 224, 224
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with slim.arg_scope(inception.inception_v1_arg_scope()):
+      inception.inception_v1(inputs, num_classes, is_training=False)
+
+    self.assertEqual(tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'), [])
+
+  def testBatchNormScale(self):
+    """Checks that batch_norm_scale=True pairs each moving_mean with a gamma."""
+    height, width = 224, 224
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with slim.arg_scope(
+        inception.inception_v1_arg_scope(batch_norm_scale=True)):
+      inception.inception_v1(inputs, num_classes, is_training=False)
+
+    gamma_names = set(
+        v.op.name
+        for v in tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'))
+    self.assertGreater(len(gamma_names), 0)
+    # Swap the trailing 'moving_mean' of each name for 'gamma' and look it up.
+    for v in tf.compat.v1.global_variables('.*/BatchNorm/moving_mean:0$'):
+      self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
+
+
+# Run the test cases of this module when it is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,596 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition for inception v2 classification network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception_utils
+
+slim = contrib_slim
+
+# Shorthand that builds a truncated-normal initializer from a stddev; the
+# first positional argument (0.0) is fixed for every call site in this file.
+# pylint: disable=g-long-lambda
+trunc_normal = lambda stddev: tf.compat.v1.truncated_normal_initializer(
+    0.0, stddev)
+
+
+def inception_v2_base(inputs,
+                      final_endpoint='Mixed_5c',
+                      min_depth=16,
+                      depth_multiplier=1.0,
+                      use_separable_conv=True,
+                      data_format='NHWC',
+                      include_root_block=True,
+                      scope=None):
+  """Inception v2 (6a2).
+
+  Constructs an Inception v2 network from inputs to the given final endpoint.
+  This method can construct the network up to the layer inception(5b) as
+  described in http://arxiv.org/abs/1502.03167.
+
+  Layers are built in order and the function returns as soon as the layer
+  named by final_endpoint has been added, so end_points only contains the
+  layers up to and including that endpoint.
+
+  Args:
+    inputs: a tensor of shape [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+      'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'Mixed_4a',
+      'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a', 'Mixed_5b',
+      'Mixed_5c']. If include_root_block is False, ['Conv2d_1a_7x7',
+      'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', 'MaxPool_3a_3x3'] will
+      not be available.
+    min_depth: Minimum depth value (number of channels) for all convolution ops.
+      Enforced when depth_multiplier < 1, and not an active constraint when
+      depth_multiplier >= 1.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    use_separable_conv: Use a separable convolution for the first layer
+      Conv2d_1a_7x7. If this is False, use a normal convolution instead.
+      Must be False when data_format is 'NCHW'.
+    data_format: Data format of the activations ('NHWC' or 'NCHW').
+    include_root_block: If True, include the convolution and max-pooling layers
+      before the inception modules. If False, excludes those layers and feeds
+      inputs directly to the first inception module.
+    scope: Optional variable_scope.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a dictionary, keyed by endpoint name, of the activations built
+                so far, for external use such as summaries or losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+                if depth_multiplier <= 0, if data_format is neither 'NHWC' nor
+                'NCHW', or if data_format is 'NCHW' while use_separable_conv
+                is True.
+  """
+
+  # end_points will collect relevant activations for external use, for example
+  # summaries or losses.
+  end_points = {}
+
+  # Used to find thinned depths for each layer.
+  if depth_multiplier <= 0:
+    raise ValueError('depth_multiplier is not greater than zero.')
+  # Scales a nominal channel count by depth_multiplier, never going below
+  # min_depth.
+  depth = lambda d: max(int(d * depth_multiplier), min_depth)
+
+  if data_format != 'NHWC' and data_format != 'NCHW':
+    raise ValueError('data_format must be either NHWC or NCHW.')
+  if data_format == 'NCHW' and use_separable_conv:
+    raise ValueError(
+        'separable convolution only supports NHWC layout. NCHW data format can'
+        ' only be used when use_separable_conv is False.'
+    )
+
+  # Axis along which the branches of each module are concatenated: the last
+  # axis for NHWC, axis 1 for NCHW.
+  concat_dim = 3 if data_format == 'NHWC' else 1
+  with tf.compat.v1.variable_scope(scope, 'InceptionV2', [inputs]):
+    # Defaults shared by every conv/pool layer below; individual calls
+    # override stride (and padding) where needed.
+    with slim.arg_scope(
+        [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+        stride=1,
+        padding='SAME',
+        data_format=data_format):
+
+      # When the root block is skipped, the inception modules consume `inputs`
+      # directly.
+      net = inputs
+      if include_root_block:
+        # Note that sizes in the comments below assume an input spatial size of
+        # 224x224, however, the inputs can be of any size greater than 32x32.
+
+        # 224 x 224 x 3
+        end_point = 'Conv2d_1a_7x7'
+
+        if use_separable_conv:
+          # depthwise_multiplier here is different from depth_multiplier.
+          # depthwise_multiplier determines the output channels of the initial
+          # depthwise conv (see docs for tf.nn.separable_conv2d), while
+          # depth_multiplier controls the # channels of the subsequent 1x1
+          # convolution. Must have
+          #   in_channels * depthwise_multiplier <= out_channels
+          # so that the separable convolution is not overparameterized.
+          depthwise_multiplier = min(int(depth(64) / 3), 8)
+          net = slim.separable_conv2d(
+              inputs,
+              depth(64), [7, 7],
+              depth_multiplier=depthwise_multiplier,
+              stride=2,
+              padding='SAME',
+              weights_initializer=trunc_normal(1.0),
+              scope=end_point)
+        else:
+          # Use a normal convolution instead of a separable convolution.
+          net = slim.conv2d(
+              inputs,
+              depth(64), [7, 7],
+              stride=2,
+              weights_initializer=trunc_normal(1.0),
+              scope=end_point)
+        end_points[end_point] = net
+        if end_point == final_endpoint:
+          return net, end_points
+        # 112 x 112 x 64
+        end_point = 'MaxPool_2a_3x3'
+        net = slim.max_pool2d(net, [3, 3], scope=end_point, stride=2)
+        end_points[end_point] = net
+        if end_point == final_endpoint:
+          return net, end_points
+        # 56 x 56 x 64
+        end_point = 'Conv2d_2b_1x1'
+        net = slim.conv2d(
+            net,
+            depth(64), [1, 1],
+            scope=end_point,
+            weights_initializer=trunc_normal(0.1))
+        end_points[end_point] = net
+        if end_point == final_endpoint:
+          return net, end_points
+        # 56 x 56 x 64
+        end_point = 'Conv2d_2c_3x3'
+        net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
+        end_points[end_point] = net
+        if end_point == final_endpoint:
+          return net, end_points
+        # 56 x 56 x 192
+        end_point = 'MaxPool_3a_3x3'
+        net = slim.max_pool2d(net, [3, 3], scope=end_point, stride=2)
+        end_points[end_point] = net
+        if end_point == final_endpoint:
+          return net, end_points
+
+      # 28 x 28 x 192
+      # Inception module.
+      end_point = 'Mixed_3b'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(64), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(32), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 28 x 28 x 256
+      end_point = 'Mixed_3c'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 28 x 28 x 320
+      # Three-branch module: both conv branches and the max-pool use stride 2.
+      end_point = 'Mixed_4a'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(
+              net, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_0 = slim.conv2d(branch_0, depth(160), [3, 3], stride=2,
+                                 scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(
+              branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3')
+          branch_1 = slim.conv2d(
+              branch_1, depth(96), [3, 3], stride=2, scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.max_pool2d(
+              net, [3, 3], stride=2, scope='MaxPool_1a_3x3')
+        net = tf.concat(axis=concat_dim, values=[branch_0, branch_1, branch_2])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 14 x 14 x 576
+      end_point = 'Mixed_4b'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(224), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(64), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(
+              branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(96), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(128), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(128), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 14 x 14 x 576
+      end_point = 'Mixed_4c'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(96), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(128), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(96), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(128), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(128), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 14 x 14 x 576
+      end_point = 'Mixed_4d'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(160), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(160), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(160), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(96), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 14 x 14 x 576
+      end_point = 'Mixed_4e'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(96), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(192), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(160), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(192), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(96), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 14 x 14 x 576
+      # Three-branch module: both conv branches and the max-pool use stride 2.
+      end_point = 'Mixed_5a'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(
+              net, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_0 = slim.conv2d(branch_0, depth(192), [3, 3], stride=2,
+                                 scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(192), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(256), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], stride=2,
+                                 scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.max_pool2d(net, [3, 3], stride=2,
+                                     scope='MaxPool_1a_3x3')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 7 x 7 x 1024
+      end_point = 'Mixed_5b'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(352), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(192), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(320), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(160), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+      # 7 x 7 x 1024
+      end_point = 'Mixed_5c'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(352), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(
+              net, depth(192), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(320), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(
+              net, depth(192), [1, 1],
+              weights_initializer=trunc_normal(0.09),
+              scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(224), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          # Unlike the other four-branch modules above, this pooling branch
+          # uses max pooling rather than average pooling.
+          branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(128), [1, 1],
+              weights_initializer=trunc_normal(0.1),
+              scope='Conv2d_0b_1x1')
+        net = tf.concat(
+            axis=concat_dim, values=[branch_0, branch_1, branch_2, branch_3])
+        end_points[end_point] = net
+        if end_point == final_endpoint: return net, end_points
+    # Reached only when final_endpoint matched none of the endpoints built
+    # above.
+    raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+
+def inception_v2(inputs,
+                 num_classes=1000,
+                 is_training=True,
+                 dropout_keep_prob=0.8,
+                 min_depth=16,
+                 depth_multiplier=1.0,
+                 prediction_fn=slim.softmax,
+                 spatial_squeeze=True,
+                 reuse=None,
+                 scope='InceptionV2',
+                 global_pool=False):
+  """Inception v2 model for classification.
+
+  Constructs an Inception v2 network for classification as described in
+  http://arxiv.org/abs/1502.03167.
+
+  The default image size used to train this network is 224x224.
+
+  Args:
+    inputs: a tensor of shape [batch_size, height, width, channels].
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before dropout)
+      are returned instead.
+    is_training: whether is training or not.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    min_depth: Minimum depth value (number of channels) for all convolution ops.
+      Enforced when depth_multiplier < 1, and not an active constraint when
+      depth_multiplier >= 1.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    prediction_fn: a function to get predictions out of logits.
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
+        shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    scope: Optional variable_scope.
+    global_pool: Optional boolean flag to control the avgpooling before the
+      logits layer. If false or unset, pooling is done with a fixed window
+      that reduces default-sized inputs to 1x1, while larger inputs lead to
+      larger outputs. If true, any input size is pooled down to 1x1.
+
+  Returns:
+    net: a Tensor with the logits (pre-softmax activations) if num_classes
+      is a non-zero integer, or the non-dropped-out input to the logits layer
+      if num_classes is 0 or None.
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+                or depth_multiplier <= 0
+  """
+  if depth_multiplier <= 0:
+    raise ValueError('depth_multiplier is not greater than zero.')
+
+  # Final pooling and prediction
+  with tf.compat.v1.variable_scope(
+      scope, 'InceptionV2', [inputs], reuse=reuse) as scope:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=is_training):
+      net, end_points = inception_v2_base(
+          inputs, scope=scope, min_depth=min_depth,
+          depth_multiplier=depth_multiplier)
+      with tf.compat.v1.variable_scope('Logits'):
+        if global_pool:
+          # Global average pooling.
+          net = tf.reduce_mean(
+              input_tensor=net, axis=[1, 2], keepdims=True, name='global_pool')
+          end_points['global_pool'] = net
+        else:
+          # Pooling with a fixed kernel size.
+          kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
+          net = slim.avg_pool2d(net, kernel_size, padding='VALID',
+                                scope='AvgPool_1a_{}x{}'.format(*kernel_size))
+          end_points['AvgPool_1a'] = net
+        if not num_classes:
+          return net, end_points
+        # 1 x 1 x 1024
+        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
+        end_points['PreLogits'] = net
+        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
+                             normalizer_fn=None, scope='Conv2d_1c_1x1')
+        if spatial_squeeze:
+          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
+      end_points['Logits'] = logits
+      end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+  return logits, end_points
+inception_v2.default_image_size = 224
+
+
+def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
+  """Define kernel size which is automatically reduced for small input.
+
+  If the shape of the input images is unknown at graph construction time this
+  function assumes that the input images are is large enough.
+
+  Args:
+    input_tensor: input tensor of size [batch_size, height, width, channels].
+    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]
+
+  Returns:
+    a tensor with the kernel size.
+
+  TODO(jrru): Make this function work with unknown shapes. Theoretically, this
+  can be done with the code below. Problems are two-fold: (1) If the shape was
+  known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot
+  handle tensors that define the kernel size.
+      shape = tf.shape(input_tensor)
+      return = tf.stack([tf.minimum(shape[1], kernel_size[0]),
+                         tf.minimum(shape[2], kernel_size[1])])
+
+  """
+  shape = input_tensor.get_shape().as_list()
+  if shape[1] is None or shape[2] is None:
+    kernel_size_out = kernel_size
+  else:
+    kernel_size_out = [min(shape[1], kernel_size[0]),
+                       min(shape[2], kernel_size[1])]
+  return kernel_size_out
+
+
+# Inception v2 reuses the shared arg scope from inception_utils under its own
+# name.
+inception_v2_arg_scope = inception_utils.inception_arg_scope
@@ -0,0 +1,412 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for nets.inception_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception
+
+slim = contrib_slim
+
+
+class InceptionV2Test(tf.test.TestCase):
+
+  def testBuildClassificationNetwork(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v2(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith(
+        'InceptionV2/Logits/SpatialSqueeze'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue('Predictions' in end_points)
+    self.assertListEqual(end_points['Predictions'].get_shape().as_list(),
+                         [batch_size, num_classes])
+
+  def testBuildPreLogitsNetwork(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = None
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    net, end_points = inception.inception_v2(inputs, num_classes)
+    self.assertTrue(net.op.name.startswith('InceptionV2/Logits/AvgPool'))
+    self.assertListEqual(net.get_shape().as_list(), [batch_size, 1, 1, 1024])
+    self.assertFalse('Logits' in end_points)
+    self.assertFalse('Predictions' in end_points)
+
+  def testBuildBaseNetwork(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    mixed_5c, end_points = inception.inception_v2_base(inputs)
+    self.assertTrue(mixed_5c.op.name.startswith('InceptionV2/Mixed_5c'))
+    self.assertListEqual(mixed_5c.get_shape().as_list(),
+                         [batch_size, 7, 7, 1024])
+    expected_endpoints = ['Mixed_3b', 'Mixed_3c', 'Mixed_4a', 'Mixed_4b',
+                          'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a',
+                          'Mixed_5b', 'Mixed_5c', 'Conv2d_1a_7x7',
+                          'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3',
+                          'MaxPool_3a_3x3']
+    self.assertItemsEqual(list(end_points.keys()), expected_endpoints)
+
+  def testBuildOnlyUptoFinalEndpoint(self):
+    batch_size = 5
+    height, width = 224, 224
+    endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1',
+                 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c',
+                 'Mixed_4a', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e',
+                 'Mixed_5a', 'Mixed_5b', 'Mixed_5c']
+    for index, endpoint in enumerate(endpoints):
+      with tf.Graph().as_default():
+        inputs = tf.random.uniform((batch_size, height, width, 3))
+        out_tensor, end_points = inception.inception_v2_base(
+            inputs, final_endpoint=endpoint)
+        self.assertTrue(out_tensor.op.name.startswith(
+            'InceptionV2/' + endpoint))
+        self.assertItemsEqual(endpoints[:index + 1], list(end_points.keys()))
+
+  def testBuildAndCheckAllEndPointsUptoMixed5c(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2_base(inputs,
+                                                final_endpoint='Mixed_5c')
+    endpoints_shapes = {'Mixed_3b': [batch_size, 28, 28, 256],
+                        'Mixed_3c': [batch_size, 28, 28, 320],
+                        'Mixed_4a': [batch_size, 14, 14, 576],
+                        'Mixed_4b': [batch_size, 14, 14, 576],
+                        'Mixed_4c': [batch_size, 14, 14, 576],
+                        'Mixed_4d': [batch_size, 14, 14, 576],
+                        'Mixed_4e': [batch_size, 14, 14, 576],
+                        'Mixed_5a': [batch_size, 7, 7, 1024],
+                        'Mixed_5b': [batch_size, 7, 7, 1024],
+                        'Mixed_5c': [batch_size, 7, 7, 1024],
+                        'Conv2d_1a_7x7': [batch_size, 112, 112, 64],
+                        'MaxPool_2a_3x3': [batch_size, 56, 56, 64],
+                        'Conv2d_2b_1x1': [batch_size, 56, 56, 64],
+                        'Conv2d_2c_3x3': [batch_size, 56, 56, 192],
+                        'MaxPool_3a_3x3': [batch_size, 28, 28, 192]}
+    self.assertItemsEqual(
+        list(endpoints_shapes.keys()), list(end_points.keys()))
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testModelHasExpectedNumberOfParameters(self):
+    batch_size = 5
+    height, width = 224, 224
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    with slim.arg_scope(inception.inception_v2_arg_scope()):
+      inception.inception_v2_base(inputs)
+    total_params, _ = slim.model_analyzer.analyze_vars(
+        slim.get_model_variables())
+    self.assertAlmostEqual(10173112, total_params)
+
+  def testBuildEndPointsWithDepthMultiplierLessThanOne(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2(inputs, num_classes)
+
+    endpoint_keys = [key for key in end_points.keys()
+                     if key.startswith('Mixed') or key.startswith('Conv')]
+
+    _, end_points_with_multiplier = inception.inception_v2(
+        inputs, num_classes, scope='depth_multiplied_net',
+        depth_multiplier=0.5)
+
+    for key in endpoint_keys:
+      original_depth = end_points[key].get_shape().as_list()[3]
+      new_depth = end_points_with_multiplier[key].get_shape().as_list()[3]
+      self.assertEqual(0.5 * original_depth, new_depth)
+
+  def testBuildEndPointsWithDepthMultiplierGreaterThanOne(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2(inputs, num_classes)
+
+    endpoint_keys = [key for key in end_points.keys()
+                     if key.startswith('Mixed') or key.startswith('Conv')]
+
+    _, end_points_with_multiplier = inception.inception_v2(
+        inputs, num_classes, scope='depth_multiplied_net',
+        depth_multiplier=2.0)
+
+    for key in endpoint_keys:
+      original_depth = end_points[key].get_shape().as_list()[3]
+      new_depth = end_points_with_multiplier[key].get_shape().as_list()[3]
+      self.assertEqual(2.0 * original_depth, new_depth)
+
+  def testRaiseValueErrorWithInvalidDepthMultiplier(self):
+    batch_size = 5
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v2(inputs, num_classes, depth_multiplier=-0.1)
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v2(inputs, num_classes, depth_multiplier=0.0)
+
+  def testBuildEndPointsWithUseSeparableConvolutionFalse(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2_base(inputs)
+
+    endpoint_keys = [
+        key for key in end_points.keys()
+        if key.startswith('Mixed') or key.startswith('Conv')
+    ]
+
+    _, end_points_with_replacement = inception.inception_v2_base(
+        inputs, use_separable_conv=False)
+
+    # The endpoint shapes must be equal to the original shape even when the
+    # separable convolution is replaced with a normal convolution.
+    for key in endpoint_keys:
+      original_shape = end_points[key].get_shape().as_list()
+      self.assertTrue(key in end_points_with_replacement)
+      new_shape = end_points_with_replacement[key].get_shape().as_list()
+      self.assertListEqual(original_shape, new_shape)
+
+  def testBuildEndPointsNCHWDataFormat(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v2_base(inputs)
+
+    endpoint_keys = [
+        key for key in end_points.keys()
+        if key.startswith('Mixed') or key.startswith('Conv')
+    ]
+
+    inputs_in_nchw = tf.random.uniform((batch_size, 3, height, width))
+    _, end_points_with_replacement = inception.inception_v2_base(
+        inputs_in_nchw, use_separable_conv=False, data_format='NCHW')
+
+    # With the 'NCHW' data format, all endpoint activations have a transposed
+    # shape from the original shape with the 'NHWC' layout.
+    for key in endpoint_keys:
+      transposed_original_shape = tf.transpose(
+          a=end_points[key], perm=[0, 3, 1, 2]).get_shape().as_list()
+      self.assertTrue(key in end_points_with_replacement)
+      new_shape = end_points_with_replacement[key].get_shape().as_list()
+      self.assertListEqual(transposed_original_shape, new_shape)
+
+  def testBuildErrorsForDataFormats(self):
+    batch_size = 5
+    height, width = 224, 224
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+
+    # 'NCWH' data format is not supported.
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v2_base(inputs, data_format='NCWH')
+
+    # 'NCHW' data format is not supported for separable convolution.
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v2_base(inputs, data_format='NCHW')
+
+  def testHalfSizeImages(self):
+    batch_size = 5
+    height, width = 112, 112
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v2(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    pre_pool = end_points['Mixed_5c']
+    self.assertListEqual(pre_pool.get_shape().as_list(),
+                         [batch_size, 4, 4, 1024])
+
+  def testBuildBaseNetworkWithoutRootBlock(self):
+    batch_size = 5
+    height, width = 28, 28
+    channels = 192
+
+    inputs = tf.random.uniform((batch_size, height, width, channels))
+    _, end_points = inception.inception_v2_base(
+        inputs, include_root_block=False)
+    endpoints_shapes = {
+        'Mixed_3b': [batch_size, 28, 28, 256],
+        'Mixed_3c': [batch_size, 28, 28, 320],
+        'Mixed_4a': [batch_size, 14, 14, 576],
+        'Mixed_4b': [batch_size, 14, 14, 576],
+        'Mixed_4c': [batch_size, 14, 14, 576],
+        'Mixed_4d': [batch_size, 14, 14, 576],
+        'Mixed_4e': [batch_size, 14, 14, 576],
+        'Mixed_5a': [batch_size, 7, 7, 1024],
+        'Mixed_5b': [batch_size, 7, 7, 1024],
+        'Mixed_5c': [batch_size, 7, 7, 1024]
+    }
+    self.assertItemsEqual(
+        list(endpoints_shapes.keys()), list(end_points.keys()))
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testUnknownImageShape(self):
+    tf.compat.v1.reset_default_graph()
+    batch_size = 2
+    height, width = 224, 224
+    num_classes = 1000
+    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(
+          tf.float32, shape=(batch_size, None, None, 3))
+      logits, end_points = inception.inception_v2(inputs, num_classes)
+      self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_5c']
+      feed_dict = {inputs: input_np}
+      tf.compat.v1.global_variables_initializer().run()
+      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
+      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 7, 7, 1024])
+
+  def testGlobalPoolUnknownImageShape(self):
+    tf.compat.v1.reset_default_graph()
+    batch_size = 1
+    height, width = 250, 300
+    num_classes = 1000
+    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(
+          tf.float32, shape=(batch_size, None, None, 3))
+      logits, end_points = inception.inception_v2(inputs, num_classes,
+                                                  global_pool=True)
+      self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_5c']
+      feed_dict = {inputs: input_np}
+      tf.compat.v1.global_variables_initializer().run()
+      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
+      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 8, 10, 1024])
+
+  def testUnknowBatchSize(self):
+    batch_size = 1
+    height, width = 224, 224
+    num_classes = 1000
+
+    inputs = tf.compat.v1.placeholder(tf.float32, (None, height, width, 3))
+    logits, _ = inception.inception_v2(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV2/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [None, num_classes])
+    images = tf.random.uniform((batch_size, height, width, 3))
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(logits, {inputs: images.eval()})
+      self.assertEquals(output.shape, (batch_size, num_classes))
+
+  def testEvaluation(self):
+    batch_size = 2
+    height, width = 224, 224
+    num_classes = 1000
+
+    eval_inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, _ = inception.inception_v2(eval_inputs, num_classes,
+                                       is_training=False)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (batch_size,))
+
+  def testTrainEvalWithReuse(self):
+    train_batch_size = 5
+    eval_batch_size = 2
+    height, width = 150, 150
+    num_classes = 1000
+
+    train_inputs = tf.random.uniform((train_batch_size, height, width, 3))
+    inception.inception_v2(train_inputs, num_classes)
+    eval_inputs = tf.random.uniform((eval_batch_size, height, width, 3))
+    logits, _ = inception.inception_v2(eval_inputs, num_classes, reuse=True)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (eval_batch_size,))
+
+  def testLogitsNotSqueezed(self):
+    num_classes = 25
+    images = tf.random.uniform([1, 224, 224, 3])
+    logits, _ = inception.inception_v2(images,
+                                       num_classes=num_classes,
+                                       spatial_squeeze=False)
+
+    with self.test_session() as sess:
+      tf.compat.v1.global_variables_initializer().run()
+      logits_out = sess.run(logits)
+      self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
+
+  def testNoBatchNormScaleByDefault(self):
+    height, width = 224, 224
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with slim.arg_scope(inception.inception_v2_arg_scope()):
+      inception.inception_v2(inputs, num_classes, is_training=False)
+
+    self.assertEqual(tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'), [])
+
+  def testBatchNormScale(self):
+    height, width = 224, 224
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with slim.arg_scope(
+        inception.inception_v2_arg_scope(batch_norm_scale=True)):
+      inception.inception_v2(inputs, num_classes, is_training=False)
+
+    gamma_names = set(
+        v.op.name
+        for v in tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'))
+    self.assertGreater(len(gamma_names), 0)
+    for v in tf.compat.v1.global_variables('.*/BatchNorm/moving_mean:0$'):
+      self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
+
+
+# Hand control to tf.test.main() only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,585 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition for inception v3 classification network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception_utils
+
+slim = contrib_slim
+
+# pylint: disable=g-long-lambda
+trunc_normal = lambda stddev: tf.compat.v1.truncated_normal_initializer(
+    0.0, stddev)
+
+
+def inception_v3_base(inputs,
+                      final_endpoint='Mixed_7c',
+                      min_depth=16,
+                      depth_multiplier=1.0,
+                      scope=None):
+  """Inception model from http://arxiv.org/abs/1512.00567.
+
+  Constructs an Inception v3 network from inputs to the given final endpoint.
+  This method can construct the network up to the final inception block
+  Mixed_7c.
+
+  Note that the names of the layers in the paper do not correspond to the names
+  of the endpoints registered by this function although they build the same
+  network.
+
+  Here is a mapping from the old_names to the new names:
+  Old name          | New name
+  =======================================
+  conv0             | Conv2d_1a_3x3
+  conv1             | Conv2d_2a_3x3
+  conv2             | Conv2d_2b_3x3
+  pool1             | MaxPool_3a_3x3
+  conv3             | Conv2d_3b_1x1
+  conv4             | Conv2d_4a_3x3
+  pool2             | MaxPool_5a_3x3
+  mixed_35x35x256a  | Mixed_5b
+  mixed_35x35x288a  | Mixed_5c
+  mixed_35x35x288b  | Mixed_5d
+  mixed_17x17x768a  | Mixed_6a
+  mixed_17x17x768b  | Mixed_6b
+  mixed_17x17x768c  | Mixed_6c
+  mixed_17x17x768d  | Mixed_6d
+  mixed_17x17x768e  | Mixed_6e
+  mixed_8x8x1280a   | Mixed_7a
+  mixed_8x8x2048a   | Mixed_7b
+  mixed_8x8x2048b   | Mixed_7c
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
+      'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c',
+      'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'].
+    min_depth: Minimum depth value (number of channels) for all convolution ops.
+      Enforced when depth_multiplier < 1, and not an active constraint when
+      depth_multiplier >= 1.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    scope: Optional variable_scope.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a set of activations for external use, for example summaries or
+                losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+                or depth_multiplier <= 0
+  """
+  # end_points will collect relevant activations for external use, for example
+  # summaries or losses.
+  end_points = {}
+
+  if depth_multiplier <= 0:
+    raise ValueError('depth_multiplier is not greater than zero.')
+  depth = lambda d: max(int(d * depth_multiplier), min_depth)
+
+  with tf.compat.v1.variable_scope(scope, 'InceptionV3', [inputs]):
+    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                        stride=1, padding='VALID'):
+      # 299 x 299 x 3
+      end_point = 'Conv2d_1a_3x3'
+      net = slim.conv2d(inputs, depth(32), [3, 3], stride=2, scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 149 x 149 x 32
+      end_point = 'Conv2d_2a_3x3'
+      net = slim.conv2d(net, depth(32), [3, 3], scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 147 x 147 x 32
+      end_point = 'Conv2d_2b_3x3'
+      net = slim.conv2d(net, depth(64), [3, 3], padding='SAME', scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 147 x 147 x 64
+      end_point = 'MaxPool_3a_3x3'
+      net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 73 x 73 x 64
+      end_point = 'Conv2d_3b_1x1'
+      net = slim.conv2d(net, depth(80), [1, 1], scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 73 x 73 x 80.
+      end_point = 'Conv2d_4a_3x3'
+      net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 71 x 71 x 192.
+      end_point = 'MaxPool_5a_3x3'
+      net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # 35 x 35 x 192.
+
+    # Inception blocks
+    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                        stride=1, padding='SAME'):
+      # mixed: 35 x 35 x 256.
+      end_point = 'Mixed_5b'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(64), [5, 5],
+                                 scope='Conv2d_0b_5x5')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(32), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_1: 35 x 35 x 288.
+      end_point = 'Mixed_5c'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0b_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(64), [5, 5],
+                                 scope='Conv_1_0c_5x5')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(64), [1, 1],
+                                 scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(64), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_2: 35 x 35 x 288.
+      end_point = 'Mixed_5d'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(64), [5, 5],
+                                 scope='Conv2d_0b_5x5')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_2 = slim.conv2d(branch_2, depth(96), [3, 3],
+                                 scope='Conv2d_0c_3x3')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(64), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_3: 17 x 17 x 768.
+      end_point = 'Mixed_6a'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(384), [3, 3], stride=2,
+                                 padding='VALID', scope='Conv2d_1a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(96), [3, 3],
+                                 scope='Conv2d_0b_3x3')
+          branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], stride=2,
+                                 padding='VALID', scope='Conv2d_1a_1x1')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
+                                     scope='MaxPool_1a_3x3')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed4: 17 x 17 x 768.
+      end_point = 'Mixed_6b'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(128), [1, 7],
+                                 scope='Conv2d_0b_1x7')
+          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
+                                 scope='Conv2d_0c_7x1')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(128), [7, 1],
+                                 scope='Conv2d_0b_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(128), [1, 7],
+                                 scope='Conv2d_0c_1x7')
+          branch_2 = slim.conv2d(branch_2, depth(128), [7, 1],
+                                 scope='Conv2d_0d_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
+                                 scope='Conv2d_0e_1x7')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_5: 17 x 17 x 768.
+      end_point = 'Mixed_6c'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(160), [1, 7],
+                                 scope='Conv2d_0b_1x7')
+          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
+                                 scope='Conv2d_0c_7x1')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
+                                 scope='Conv2d_0b_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(160), [1, 7],
+                                 scope='Conv2d_0c_1x7')
+          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
+                                 scope='Conv2d_0d_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
+                                 scope='Conv2d_0e_1x7')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # mixed_6: 17 x 17 x 768.
+      end_point = 'Mixed_6d'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(160), [1, 7],
+                                 scope='Conv2d_0b_1x7')
+          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
+                                 scope='Conv2d_0c_7x1')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
+                                 scope='Conv2d_0b_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(160), [1, 7],
+                                 scope='Conv2d_0c_1x7')
+          branch_2 = slim.conv2d(branch_2, depth(160), [7, 1],
+                                 scope='Conv2d_0d_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
+                                 scope='Conv2d_0e_1x7')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_7: 17 x 17 x 768.
+      end_point = 'Mixed_6e'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(192), [1, 7],
+                                 scope='Conv2d_0b_1x7')
+          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
+                                 scope='Conv2d_0c_7x1')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [7, 1],
+                                 scope='Conv2d_0b_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
+                                 scope='Conv2d_0c_1x7')
+          branch_2 = slim.conv2d(branch_2, depth(192), [7, 1],
+                                 scope='Conv2d_0d_7x1')
+          branch_2 = slim.conv2d(branch_2, depth(192), [1, 7],
+                                 scope='Conv2d_0e_1x7')
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(branch_3, depth(192), [1, 1],
+                                 scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_8: 8 x 8 x 1280.
+      end_point = 'Mixed_7a'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+          branch_0 = slim.conv2d(branch_0, depth(320), [3, 3], stride=2,
+                                 padding='VALID', scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, depth(192), [1, 7],
+                                 scope='Conv2d_0b_1x7')
+          branch_1 = slim.conv2d(branch_1, depth(192), [7, 1],
+                                 scope='Conv2d_0c_7x1')
+          branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], stride=2,
+                                 padding='VALID', scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
+                                     scope='MaxPool_1a_3x3')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+      # mixed_9: 8 x 8 x 2048.
+      end_point = 'Mixed_7b'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
+              slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')])
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(
+              branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
+          branch_2 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
+              slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')])
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+
+      # mixed_10: 8 x 8 x 2048.
+      end_point = 'Mixed_7c'
+      with tf.compat.v1.variable_scope(end_point):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
+              slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')])
+        with tf.compat.v1.variable_scope('Branch_2'):
+          branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
+          branch_2 = slim.conv2d(
+              branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
+          branch_2 = tf.concat(axis=3, values=[
+              slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
+              slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')])
+        with tf.compat.v1.variable_scope('Branch_3'):
+          branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
+          branch_3 = slim.conv2d(
+              branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
+        net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+      end_points[end_point] = net
+      if end_point == final_endpoint: return net, end_points
+    raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+
+def inception_v3(inputs,
+                 num_classes=1000,
+                 is_training=True,
+                 dropout_keep_prob=0.8,
+                 min_depth=16,
+                 depth_multiplier=1.0,
+                 prediction_fn=slim.softmax,
+                 spatial_squeeze=True,
+                 reuse=None,
+                 create_aux_logits=True,
+                 scope='InceptionV3',
+                 global_pool=False):
+  """Inception model from http://arxiv.org/abs/1512.00567.
+
+  "Rethinking the Inception Architecture for Computer Vision"
+
+  Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
+  Zbigniew Wojna.
+
+  With the default arguments this method constructs the exact model defined in
+  the paper. However, one can experiment with variations of the inception_v3
+  network by changing arguments dropout_keep_prob, min_depth and
+  depth_multiplier.
+
+  The default image size used to train this network is 299x299.
+
+  Args:
+    inputs: a tensor of size [batch_size, height, width, channels].
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before dropout)
+      are returned instead.
+    is_training: whether is training or not.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    min_depth: Minimum depth value (number of channels) for all convolution ops.
+      Enforced when depth_multiplier < 1, and not an active constraint when
+      depth_multiplier >= 1.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    prediction_fn: a function to get predictions out of logits.
+    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
+        shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    create_aux_logits: Whether to create the auxiliary logits.
+    scope: Optional variable_scope.
+    global_pool: Optional boolean flag to control the avgpooling before the
+      logits layer. If false or unset, pooling is done with a fixed window
+      that reduces default-sized inputs to 1x1, while larger inputs lead to
+      larger outputs. If true, any input size is pooled down to 1x1.
+
+  Returns:
+    net: a Tensor with the logits (pre-softmax activations) if num_classes
+      is a non-zero integer, or the non-dropped-out input to the logits layer
+      if num_classes is 0 or None.
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+
+  Raises:
+    ValueError: if 'depth_multiplier' is less than or equal to zero.
+  """
+  if depth_multiplier <= 0:
+    raise ValueError('depth_multiplier is not greater than zero.')
+  depth = lambda d: max(int(d * depth_multiplier), min_depth)
+
+  with tf.compat.v1.variable_scope(
+      scope, 'InceptionV3', [inputs], reuse=reuse) as scope:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=is_training):
+      net, end_points = inception_v3_base(
+          inputs, scope=scope, min_depth=min_depth,
+          depth_multiplier=depth_multiplier)
+
+      # Auxiliary Head logits
+      if create_aux_logits and num_classes:
+        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                            stride=1, padding='SAME'):
+          aux_logits = end_points['Mixed_6e']
+          with tf.compat.v1.variable_scope('AuxLogits'):
+            aux_logits = slim.avg_pool2d(
+                aux_logits, [5, 5], stride=3, padding='VALID',
+                scope='AvgPool_1a_5x5')
+            aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
+                                     scope='Conv2d_1b_1x1')
+
+            # Shape of feature map before the final layer.
+            kernel_size = _reduced_kernel_size_for_small_input(
+                aux_logits, [5, 5])
+            aux_logits = slim.conv2d(
+                aux_logits, depth(768), kernel_size,
+                weights_initializer=trunc_normal(0.01),
+                padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size))
+            aux_logits = slim.conv2d(
+                aux_logits, num_classes, [1, 1], activation_fn=None,
+                normalizer_fn=None, weights_initializer=trunc_normal(0.001),
+                scope='Conv2d_2b_1x1')
+            if spatial_squeeze:
+              aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze')
+            end_points['AuxLogits'] = aux_logits
+
+      # Final pooling and prediction
+      with tf.compat.v1.variable_scope('Logits'):
+        if global_pool:
+          # Global average pooling.
+          net = tf.reduce_mean(
+              input_tensor=net, axis=[1, 2], keepdims=True, name='GlobalPool')
+          end_points['global_pool'] = net
+        else:
+          # Pooling with a fixed kernel size.
+          kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8])
+          net = slim.avg_pool2d(net, kernel_size, padding='VALID',
+                                scope='AvgPool_1a_{}x{}'.format(*kernel_size))
+          end_points['AvgPool_1a'] = net
+        if not num_classes:
+          return net, end_points
+        # 1 x 1 x 2048
+        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
+        end_points['PreLogits'] = net
+        # 2048
+        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
+                             normalizer_fn=None, scope='Conv2d_1c_1x1')
+        if spatial_squeeze:
+          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
+        # 1000
+      end_points['Logits'] = logits
+      end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+  return logits, end_points
+inception_v3.default_image_size = 299
+
+
+def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
+  """Define kernel size which is automatically reduced for small input.
+
+  If the shape of the input images is unknown at graph construction time this
+  function assumes that the input images are is large enough.
+
+  Args:
+    input_tensor: input tensor of size [batch_size, height, width, channels].
+    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]
+
+  Returns:
+    a tensor with the kernel size.
+
+  TODO(jrru): Make this function work with unknown shapes. Theoretically, this
+  can be done with the code below. Problems are two-fold: (1) If the shape was
+  known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot
+  handle tensors that define the kernel size.
+      shape = tf.shape(input_tensor)
+      return = tf.stack([tf.minimum(shape[1], kernel_size[0]),
+                         tf.minimum(shape[2], kernel_size[1])])
+
+  """
+  shape = input_tensor.get_shape().as_list()
+  if shape[1] is None or shape[2] is None:
+    kernel_size_out = kernel_size
+  else:
+    kernel_size_out = [min(shape[1], kernel_size[0]),
+                       min(shape[2], kernel_size[1])]
+  return kernel_size_out
+
+
+# Alias: expose inception_utils.inception_arg_scope under an Inception V3
+# specific name.
+inception_v3_arg_scope = inception_utils.inception_arg_scope
@@ -0,0 +1,350 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for nets.inception_v3."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception
+
+slim = contrib_slim
+
+
+class InceptionV3Test(tf.test.TestCase):
+
+  def testBuildClassificationNetwork(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v3(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith(
+        'InceptionV3/Logits/SpatialSqueeze'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue('Predictions' in end_points)
+    self.assertListEqual(end_points['Predictions'].get_shape().as_list(),
+                         [batch_size, num_classes])
+
+  def testBuildPreLogitsNetwork(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = None
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    net, end_points = inception.inception_v3(inputs, num_classes)
+    self.assertTrue(net.op.name.startswith('InceptionV3/Logits/AvgPool'))
+    self.assertListEqual(net.get_shape().as_list(), [batch_size, 1, 1, 2048])
+    self.assertFalse('Logits' in end_points)
+    self.assertFalse('Predictions' in end_points)
+
+  def testBuildBaseNetwork(self):
+    batch_size = 5
+    height, width = 299, 299
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    final_endpoint, end_points = inception.inception_v3_base(inputs)
+    self.assertTrue(final_endpoint.op.name.startswith(
+        'InceptionV3/Mixed_7c'))
+    self.assertListEqual(final_endpoint.get_shape().as_list(),
+                         [batch_size, 8, 8, 2048])
+    expected_endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+                          'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3',
+                          'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
+                          'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d',
+                          'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c']
+    self.assertItemsEqual(end_points.keys(), expected_endpoints)
+
+  def testBuildOnlyUptoFinalEndpoint(self):
+    batch_size = 5
+    height, width = 299, 299
+    endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+                 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3',
+                 'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
+                 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d',
+                 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c']
+
+    for index, endpoint in enumerate(endpoints):
+      with tf.Graph().as_default():
+        inputs = tf.random.uniform((batch_size, height, width, 3))
+        out_tensor, end_points = inception.inception_v3_base(
+            inputs, final_endpoint=endpoint)
+        self.assertTrue(out_tensor.op.name.startswith(
+            'InceptionV3/' + endpoint))
+        self.assertItemsEqual(endpoints[:index+1], end_points.keys())
+
+  def testBuildAndCheckAllEndPointsUptoMixed7c(self):
+    batch_size = 5
+    height, width = 299, 299
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v3_base(
+        inputs, final_endpoint='Mixed_7c')
+    endpoints_shapes = {'Conv2d_1a_3x3': [batch_size, 149, 149, 32],
+                        'Conv2d_2a_3x3': [batch_size, 147, 147, 32],
+                        'Conv2d_2b_3x3': [batch_size, 147, 147, 64],
+                        'MaxPool_3a_3x3': [batch_size, 73, 73, 64],
+                        'Conv2d_3b_1x1': [batch_size, 73, 73, 80],
+                        'Conv2d_4a_3x3': [batch_size, 71, 71, 192],
+                        'MaxPool_5a_3x3': [batch_size, 35, 35, 192],
+                        'Mixed_5b': [batch_size, 35, 35, 256],
+                        'Mixed_5c': [batch_size, 35, 35, 288],
+                        'Mixed_5d': [batch_size, 35, 35, 288],
+                        'Mixed_6a': [batch_size, 17, 17, 768],
+                        'Mixed_6b': [batch_size, 17, 17, 768],
+                        'Mixed_6c': [batch_size, 17, 17, 768],
+                        'Mixed_6d': [batch_size, 17, 17, 768],
+                        'Mixed_6e': [batch_size, 17, 17, 768],
+                        'Mixed_7a': [batch_size, 8, 8, 1280],
+                        'Mixed_7b': [batch_size, 8, 8, 2048],
+                        'Mixed_7c': [batch_size, 8, 8, 2048]}
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testModelHasExpectedNumberOfParameters(self):
+    batch_size = 5
+    height, width = 299, 299
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    with slim.arg_scope(inception.inception_v3_arg_scope()):
+      inception.inception_v3_base(inputs)
+    total_params, _ = slim.model_analyzer.analyze_vars(
+        slim.get_model_variables())
+    self.assertAlmostEqual(21802784, total_params)
+
+  def testBuildEndPoints(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v3(inputs, num_classes)
+    self.assertTrue('Logits' in end_points)
+    logits = end_points['Logits']
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue('AuxLogits' in end_points)
+    aux_logits = end_points['AuxLogits']
+    self.assertListEqual(aux_logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue('Mixed_7c' in end_points)
+    pre_pool = end_points['Mixed_7c']
+    self.assertListEqual(pre_pool.get_shape().as_list(),
+                         [batch_size, 8, 8, 2048])
+    self.assertTrue('PreLogits' in end_points)
+    pre_logits = end_points['PreLogits']
+    self.assertListEqual(pre_logits.get_shape().as_list(),
+                         [batch_size, 1, 1, 2048])
+
+  def testBuildEndPointsWithDepthMultiplierLessThanOne(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v3(inputs, num_classes)
+
+    endpoint_keys = [key for key in end_points.keys()
+                     if key.startswith('Mixed') or key.startswith('Conv')]
+
+    _, end_points_with_multiplier = inception.inception_v3(
+        inputs, num_classes, scope='depth_multiplied_net',
+        depth_multiplier=0.5)
+
+    for key in endpoint_keys:
+      original_depth = end_points[key].get_shape().as_list()[3]
+      new_depth = end_points_with_multiplier[key].get_shape().as_list()[3]
+      self.assertEqual(0.5 * original_depth, new_depth)
+
+  def testBuildEndPointsWithDepthMultiplierGreaterThanOne(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v3(inputs, num_classes)
+
+    endpoint_keys = [key for key in end_points.keys()
+                     if key.startswith('Mixed') or key.startswith('Conv')]
+
+    _, end_points_with_multiplier = inception.inception_v3(
+        inputs, num_classes, scope='depth_multiplied_net',
+        depth_multiplier=2.0)
+
+    for key in endpoint_keys:
+      original_depth = end_points[key].get_shape().as_list()[3]
+      new_depth = end_points_with_multiplier[key].get_shape().as_list()[3]
+      self.assertEqual(2.0 * original_depth, new_depth)
+
+  def testRaiseValueErrorWithInvalidDepthMultiplier(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v3(inputs, num_classes, depth_multiplier=-0.1)
+    with self.assertRaises(ValueError):
+      _ = inception.inception_v3(inputs, num_classes, depth_multiplier=0.0)
+
+  def testHalfSizeImages(self):
+    batch_size = 5
+    height, width = 150, 150
+    num_classes = 1000
+
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v3(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV3/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    pre_pool = end_points['Mixed_7c']
+    self.assertListEqual(pre_pool.get_shape().as_list(),
+                         [batch_size, 3, 3, 2048])
+
+  def testUnknownImageShape(self):
+    tf.compat.v1.reset_default_graph()
+    batch_size = 2
+    height, width = 299, 299
+    num_classes = 1000
+    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(
+          tf.float32, shape=(batch_size, None, None, 3))
+      logits, end_points = inception.inception_v3(inputs, num_classes)
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_7c']
+      feed_dict = {inputs: input_np}
+      tf.compat.v1.global_variables_initializer().run()
+      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
+      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 8, 8, 2048])
+
+  def testGlobalPoolUnknownImageShape(self):
+    tf.compat.v1.reset_default_graph()
+    batch_size = 1
+    height, width = 330, 400
+    num_classes = 1000
+    input_np = np.random.uniform(0, 1, (batch_size, height, width, 3))
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(
+          tf.float32, shape=(batch_size, None, None, 3))
+      logits, end_points = inception.inception_v3(inputs, num_classes,
+                                                  global_pool=True)
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_7c']
+      feed_dict = {inputs: input_np}
+      tf.compat.v1.global_variables_initializer().run()
+      pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict)
+      self.assertListEqual(list(pre_pool_out.shape), [batch_size, 8, 11, 2048])
+
+  def testUnknowBatchSize(self):
+    batch_size = 1
+    height, width = 299, 299
+    num_classes = 1000
+
+    inputs = tf.compat.v1.placeholder(tf.float32, (None, height, width, 3))
+    logits, _ = inception.inception_v3(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV3/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [None, num_classes])
+    images = tf.random.uniform((batch_size, height, width, 3))
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(logits, {inputs: images.eval()})
+      self.assertEquals(output.shape, (batch_size, num_classes))
+
+  def testEvaluation(self):
+    batch_size = 2
+    height, width = 299, 299
+    num_classes = 1000
+
+    eval_inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, _ = inception.inception_v3(eval_inputs, num_classes,
+                                       is_training=False)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (batch_size,))
+
+  def testTrainEvalWithReuse(self):
+    train_batch_size = 5
+    eval_batch_size = 2
+    height, width = 150, 150
+    num_classes = 1000
+
+    train_inputs = tf.random.uniform((train_batch_size, height, width, 3))
+    inception.inception_v3(train_inputs, num_classes)
+    eval_inputs = tf.random.uniform((eval_batch_size, height, width, 3))
+    logits, _ = inception.inception_v3(eval_inputs, num_classes,
+                                       is_training=False, reuse=True)
+    predictions = tf.argmax(input=logits, axis=1)
+
+    with self.test_session() as sess:
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (eval_batch_size,))
+
+  def testLogitsNotSqueezed(self):
+    num_classes = 25
+    images = tf.random.uniform([1, 299, 299, 3])
+    logits, _ = inception.inception_v3(images,
+                                       num_classes=num_classes,
+                                       spatial_squeeze=False)
+
+    with self.test_session() as sess:
+      tf.compat.v1.global_variables_initializer().run()
+      logits_out = sess.run(logits)
+      self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes])
+
+  def testNoBatchNormScaleByDefault(self):
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with slim.arg_scope(inception.inception_v3_arg_scope()):
+      inception.inception_v3(inputs, num_classes, is_training=False)
+
+    self.assertEqual(tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'), [])
+
+  def testBatchNormScale(self):
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with slim.arg_scope(
+        inception.inception_v3_arg_scope(batch_norm_scale=True)):
+      inception.inception_v3(inputs, num_classes, is_training=False)
+
+    gamma_names = set(
+        v.op.name
+        for v in tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'))
+    self.assertGreater(len(gamma_names), 0)
+    for v in tf.compat.v1.global_variables('.*/BatchNorm/moving_mean:0$'):
+      self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
+
+
+# When executed as a script, hand control to TensorFlow's test runner.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,347 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the definition of the Inception V4 architecture.
+
+As described in http://arxiv.org/abs/1602.07261.
+
+  Inception-v4, Inception-ResNet and the Impact of Residual Connections
+    on Learning
+  Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception_utils
+
+slim = contrib_slim
+
+
+def block_inception_a(inputs, scope=None, reuse=None):
+  """Builds Inception-A block for Inception v4 network."""
+  # By default use stride=1 and SAME padding
+  with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d],
+                      stride=1, padding='SAME'):
+    with tf.compat.v1.variable_scope(
+        scope, 'BlockInceptionA', [inputs], reuse=reuse):
+      with tf.compat.v1.variable_scope('Branch_0'):
+        branch_0 = slim.conv2d(inputs, 96, [1, 1], scope='Conv2d_0a_1x1')
+      with tf.compat.v1.variable_scope('Branch_1'):
+        branch_1 = slim.conv2d(inputs, 64, [1, 1], scope='Conv2d_0a_1x1')
+        branch_1 = slim.conv2d(branch_1, 96, [3, 3], scope='Conv2d_0b_3x3')
+      with tf.compat.v1.variable_scope('Branch_2'):
+        branch_2 = slim.conv2d(inputs, 64, [1, 1], scope='Conv2d_0a_1x1')
+        branch_2 = slim.conv2d(branch_2, 96, [3, 3], scope='Conv2d_0b_3x3')
+        branch_2 = slim.conv2d(branch_2, 96, [3, 3], scope='Conv2d_0c_3x3')
+      with tf.compat.v1.variable_scope('Branch_3'):
+        branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3')
+        branch_3 = slim.conv2d(branch_3, 96, [1, 1], scope='Conv2d_0b_1x1')
+      return tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+
+
+def block_reduction_a(inputs, scope=None, reuse=None):
+  """Builds Reduction-A block for Inception v4 network."""
+  # By default use stride=1 and SAME padding
+  with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d],
+                      stride=1, padding='SAME'):
+    with tf.compat.v1.variable_scope(
+        scope, 'BlockReductionA', [inputs], reuse=reuse):
+      with tf.compat.v1.variable_scope('Branch_0'):
+        branch_0 = slim.conv2d(inputs, 384, [3, 3], stride=2, padding='VALID',
+                               scope='Conv2d_1a_3x3')
+      with tf.compat.v1.variable_scope('Branch_1'):
+        branch_1 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1')
+        branch_1 = slim.conv2d(branch_1, 224, [3, 3], scope='Conv2d_0b_3x3')
+        branch_1 = slim.conv2d(branch_1, 256, [3, 3], stride=2,
+                               padding='VALID', scope='Conv2d_1a_3x3')
+      with tf.compat.v1.variable_scope('Branch_2'):
+        branch_2 = slim.max_pool2d(inputs, [3, 3], stride=2, padding='VALID',
+                                   scope='MaxPool_1a_3x3')
+      return tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
+
+
+def block_inception_b(inputs, scope=None, reuse=None):
+  """Builds Inception-B block for Inception v4 network."""
+  # By default use stride=1 and SAME padding
+  with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d],
+                      stride=1, padding='SAME'):
+    with tf.compat.v1.variable_scope(
+        scope, 'BlockInceptionB', [inputs], reuse=reuse):
+      with tf.compat.v1.variable_scope('Branch_0'):
+        branch_0 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1')
+      with tf.compat.v1.variable_scope('Branch_1'):
+        branch_1 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1')
+        branch_1 = slim.conv2d(branch_1, 224, [1, 7], scope='Conv2d_0b_1x7')
+        branch_1 = slim.conv2d(branch_1, 256, [7, 1], scope='Conv2d_0c_7x1')
+      with tf.compat.v1.variable_scope('Branch_2'):
+        branch_2 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1')
+        branch_2 = slim.conv2d(branch_2, 192, [7, 1], scope='Conv2d_0b_7x1')
+        branch_2 = slim.conv2d(branch_2, 224, [1, 7], scope='Conv2d_0c_1x7')
+        branch_2 = slim.conv2d(branch_2, 224, [7, 1], scope='Conv2d_0d_7x1')
+        branch_2 = slim.conv2d(branch_2, 256, [1, 7], scope='Conv2d_0e_1x7')
+      with tf.compat.v1.variable_scope('Branch_3'):
+        branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3')
+        branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1')
+      return tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+
+
+def block_reduction_b(inputs, scope=None, reuse=None):
+  """Builds Reduction-B block for Inception v4 network."""
+  # By default use stride=1 and SAME padding
+  with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d],
+                      stride=1, padding='SAME'):
+    with tf.compat.v1.variable_scope(
+        scope, 'BlockReductionB', [inputs], reuse=reuse):
+      with tf.compat.v1.variable_scope('Branch_0'):
+        branch_0 = slim.conv2d(inputs, 192, [1, 1], scope='Conv2d_0a_1x1')
+        branch_0 = slim.conv2d(branch_0, 192, [3, 3], stride=2,
+                               padding='VALID', scope='Conv2d_1a_3x3')
+      with tf.compat.v1.variable_scope('Branch_1'):
+        branch_1 = slim.conv2d(inputs, 256, [1, 1], scope='Conv2d_0a_1x1')
+        branch_1 = slim.conv2d(branch_1, 256, [1, 7], scope='Conv2d_0b_1x7')
+        branch_1 = slim.conv2d(branch_1, 320, [7, 1], scope='Conv2d_0c_7x1')
+        branch_1 = slim.conv2d(branch_1, 320, [3, 3], stride=2,
+                               padding='VALID', scope='Conv2d_1a_3x3')
+      with tf.compat.v1.variable_scope('Branch_2'):
+        branch_2 = slim.max_pool2d(inputs, [3, 3], stride=2, padding='VALID',
+                                   scope='MaxPool_1a_3x3')
+      return tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
+
+
+def block_inception_c(inputs, scope=None, reuse=None):
+  """Builds Inception-C block for Inception v4 network."""
+  # By default use stride=1 and SAME padding
+  with slim.arg_scope([slim.conv2d, slim.avg_pool2d, slim.max_pool2d],
+                      stride=1, padding='SAME'):
+    with tf.compat.v1.variable_scope(
+        scope, 'BlockInceptionC', [inputs], reuse=reuse):
+      with tf.compat.v1.variable_scope('Branch_0'):
+        branch_0 = slim.conv2d(inputs, 256, [1, 1], scope='Conv2d_0a_1x1')
+      with tf.compat.v1.variable_scope('Branch_1'):
+        branch_1 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1')
+        branch_1 = tf.concat(axis=3, values=[
+            slim.conv2d(branch_1, 256, [1, 3], scope='Conv2d_0b_1x3'),
+            slim.conv2d(branch_1, 256, [3, 1], scope='Conv2d_0c_3x1')])
+      with tf.compat.v1.variable_scope('Branch_2'):
+        branch_2 = slim.conv2d(inputs, 384, [1, 1], scope='Conv2d_0a_1x1')
+        branch_2 = slim.conv2d(branch_2, 448, [3, 1], scope='Conv2d_0b_3x1')
+        branch_2 = slim.conv2d(branch_2, 512, [1, 3], scope='Conv2d_0c_1x3')
+        branch_2 = tf.concat(axis=3, values=[
+            slim.conv2d(branch_2, 256, [1, 3], scope='Conv2d_0d_1x3'),
+            slim.conv2d(branch_2, 256, [3, 1], scope='Conv2d_0e_3x1')])
+      with tf.compat.v1.variable_scope('Branch_3'):
+        branch_3 = slim.avg_pool2d(inputs, [3, 3], scope='AvgPool_0a_3x3')
+        branch_3 = slim.conv2d(branch_3, 256, [1, 1], scope='Conv2d_0b_1x1')
+      return tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
+
+
+def inception_v4_base(inputs, final_endpoint='Mixed_7d', scope=None):
+  """Creates the Inception V4 network up to the given final endpoint.
+
+  Layers are built in order and each one is recorded in `end_points`; the
+  function returns as soon as the layer named `final_endpoint` has been
+  built, so later layers are never created.
+
+  Args:
+    inputs: a 4-D tensor of size [batch_size, height, width, 3].
+    final_endpoint: specifies the endpoint to construct the network up to.
+      It can be one of [ 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
+      'Mixed_3a', 'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
+      'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e',
+      'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c',
+      'Mixed_7d']
+    scope: Optional variable_scope.
+
+  Returns:
+    net: the output tensor of the layer named by `final_endpoint`.
+    end_points: a dict mapping the name of every endpoint built (up to and
+      including `final_endpoint`) to its output tensor.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values.
+  """
+  end_points = {}
+
+  def add_and_check_final(name, net):
+    """Records `net` under `name`; returns True if `name` is the last layer."""
+    end_points[name] = net
+    return name == final_endpoint
+
+  with tf.compat.v1.variable_scope(scope, 'InceptionV4', [inputs]):
+    # Layers default to stride 1 / SAME padding unless overridden per call.
+    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                        stride=1, padding='SAME'):
+      # The "H x W x C" comments below give the shape entering the next layer
+      # for a 299 x 299 x 3 input.
+      # 299 x 299 x 3
+      net = slim.conv2d(inputs, 32, [3, 3], stride=2,
+                        padding='VALID', scope='Conv2d_1a_3x3')
+      if add_and_check_final('Conv2d_1a_3x3', net): return net, end_points
+      # 149 x 149 x 32
+      net = slim.conv2d(net, 32, [3, 3], padding='VALID',
+                        scope='Conv2d_2a_3x3')
+      if add_and_check_final('Conv2d_2a_3x3', net): return net, end_points
+      # 147 x 147 x 32
+      net = slim.conv2d(net, 64, [3, 3], scope='Conv2d_2b_3x3')
+      if add_and_check_final('Conv2d_2b_3x3', net): return net, end_points
+      # 147 x 147 x 64
+      # Mixed_3a: stride-2 max pool and stride-2 conv in parallel.
+      with tf.compat.v1.variable_scope('Mixed_3a'):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
+                                     scope='MaxPool_0a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, 96, [3, 3], stride=2, padding='VALID',
+                                 scope='Conv2d_0a_3x3')
+        net = tf.concat(axis=3, values=[branch_0, branch_1])
+        if add_and_check_final('Mixed_3a', net): return net, end_points
+
+      # 73 x 73 x 160
+      # Mixed_4a: a 1x1 -> 3x3 branch and a 1x1 -> 1x7 -> 7x1 -> 3x3 branch.
+      with tf.compat.v1.variable_scope('Mixed_4a'):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
+          branch_0 = slim.conv2d(branch_0, 96, [3, 3], padding='VALID',
+                                 scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
+          branch_1 = slim.conv2d(branch_1, 64, [1, 7], scope='Conv2d_0b_1x7')
+          branch_1 = slim.conv2d(branch_1, 64, [7, 1], scope='Conv2d_0c_7x1')
+          branch_1 = slim.conv2d(branch_1, 96, [3, 3], padding='VALID',
+                                 scope='Conv2d_1a_3x3')
+        net = tf.concat(axis=3, values=[branch_0, branch_1])
+        if add_and_check_final('Mixed_4a', net): return net, end_points
+
+      # 71 x 71 x 192
+      # Mixed_5a: stride-2 conv and stride-2 max pool in parallel.
+      with tf.compat.v1.variable_scope('Mixed_5a'):
+        with tf.compat.v1.variable_scope('Branch_0'):
+          branch_0 = slim.conv2d(net, 192, [3, 3], stride=2, padding='VALID',
+                                 scope='Conv2d_1a_3x3')
+        with tf.compat.v1.variable_scope('Branch_1'):
+          branch_1 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
+                                     scope='MaxPool_1a_3x3')
+        net = tf.concat(axis=3, values=[branch_0, branch_1])
+        if add_and_check_final('Mixed_5a', net): return net, end_points
+
+      # 35 x 35 x 384
+      # 4 x Inception-A blocks
+      for idx in range(4):
+        # Scope names run 'Mixed_5b' .. 'Mixed_5e'.
+        block_scope = 'Mixed_5' + chr(ord('b') + idx)
+        net = block_inception_a(net, block_scope)
+        if add_and_check_final(block_scope, net): return net, end_points
+
+      # 35 x 35 x 384
+      # Reduction-A block
+      net = block_reduction_a(net, 'Mixed_6a')
+      if add_and_check_final('Mixed_6a', net): return net, end_points
+
+      # 17 x 17 x 1024
+      # 7 x Inception-B blocks
+      for idx in range(7):
+        # Scope names run 'Mixed_6b' .. 'Mixed_6h'.
+        block_scope = 'Mixed_6' + chr(ord('b') + idx)
+        net = block_inception_b(net, block_scope)
+        if add_and_check_final(block_scope, net): return net, end_points
+
+      # 17 x 17 x 1024
+      # Reduction-B block
+      net = block_reduction_b(net, 'Mixed_7a')
+      if add_and_check_final('Mixed_7a', net): return net, end_points
+
+      # 8 x 8 x 1536
+      # 3 x Inception-C blocks
+      for idx in range(3):
+        # Scope names run 'Mixed_7b' .. 'Mixed_7d'.
+        block_scope = 'Mixed_7' + chr(ord('b') + idx)
+        net = block_inception_c(net, block_scope)
+        if add_and_check_final(block_scope, net): return net, end_points
+  # Reached only when no layer name matched final_endpoint.
+  raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+
+def inception_v4(inputs, num_classes=1001, is_training=True,
+                 dropout_keep_prob=0.8,
+                 reuse=None,
+                 scope='InceptionV4',
+                 create_aux_logits=True):
+  """Creates the Inception V4 model.
+
+  Args:
+    inputs: a 4-D tensor of size [batch_size, height, width, 3].
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before dropout)
+      are returned instead.
+    is_training: whether is training or not.
+    dropout_keep_prob: float, the fraction to keep before final layer.
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    scope: Optional variable_scope.
+    create_aux_logits: Whether to include the auxiliary logits.
+
+  Returns:
+    net: a Tensor with the logits (pre-softmax activations) if num_classes
+      is a non-zero integer, or the non-dropped input to the logits layer
+      if num_classes is 0 or None.
+    end_points: the set of end_points from the inception model.
+  """
+  end_points = {}
+  with tf.compat.v1.variable_scope(
+      scope, 'InceptionV4', [inputs], reuse=reuse) as scope:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=is_training):
+      net, end_points = inception_v4_base(inputs, scope=scope)
+
+      with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
+                          stride=1, padding='SAME'):
+        # Auxiliary Head logits
+        if create_aux_logits and num_classes:
+          with tf.compat.v1.variable_scope('AuxLogits'):
+            # 17 x 17 x 1024
+            aux_logits = end_points['Mixed_6h']
+            aux_logits = slim.avg_pool2d(aux_logits, [5, 5], stride=3,
+                                         padding='VALID',
+                                         scope='AvgPool_1a_5x5')
+            aux_logits = slim.conv2d(aux_logits, 128, [1, 1],
+                                     scope='Conv2d_1b_1x1')
+            aux_logits = slim.conv2d(aux_logits, 768,
+                                     aux_logits.get_shape()[1:3],
+                                     padding='VALID', scope='Conv2d_2a')
+            aux_logits = slim.flatten(aux_logits)
+            aux_logits = slim.fully_connected(aux_logits, num_classes,
+                                              activation_fn=None,
+                                              scope='Aux_logits')
+            end_points['AuxLogits'] = aux_logits
+
+        # Final pooling and prediction
+        # TODO(sguada,arnoegw): Consider adding a parameter global_pool which
+        # can be set to False to disable pooling here (as in resnet_*()).
+        with tf.compat.v1.variable_scope('Logits'):
+          # 8 x 8 x 1536
+          kernel_size = net.get_shape()[1:3]
+          if kernel_size.is_fully_defined():
+            net = slim.avg_pool2d(net, kernel_size, padding='VALID',
+                                  scope='AvgPool_1a')
+          else:
+            net = tf.reduce_mean(
+                input_tensor=net,
+                axis=[1, 2],
+                keepdims=True,
+                name='global_pool')
+          end_points['global_pool'] = net
+          if not num_classes:
+            return net, end_points
+          # 1 x 1 x 1536
+          net = slim.dropout(net, dropout_keep_prob, scope='Dropout_1b')
+          net = slim.flatten(net, scope='PreLogitsFlatten')
+          end_points['PreLogitsFlatten'] = net
+          # 1536
+          logits = slim.fully_connected(net, num_classes, activation_fn=None,
+                                        scope='Logits')
+          end_points['Logits'] = logits
+          end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions')
+    return logits, end_points
+# Default image size for this model, attached as a function attribute.
+inception_v4.default_image_size = 299
+
+
+# Inception V4 uses the shared arg scope from inception_utils under this name.
+inception_v4_arg_scope = inception_utils.inception_arg_scope
@@ -0,0 +1,287 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slim.inception_v4."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+from nets import inception
+
+
+class InceptionTest(tf.test.TestCase):
+
+  def testBuildLogits(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v4(inputs, num_classes)
+    auxlogits = end_points['AuxLogits']
+    predictions = end_points['Predictions']
+    self.assertTrue(auxlogits.op.name.startswith('InceptionV4/AuxLogits'))
+    self.assertListEqual(auxlogits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    self.assertTrue(predictions.op.name.startswith(
+        'InceptionV4/Logits/Predictions'))
+    self.assertListEqual(predictions.get_shape().as_list(),
+                         [batch_size, num_classes])
+
+  def testBuildPreLogitsNetwork(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = None
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    net, end_points = inception.inception_v4(inputs, num_classes)
+    self.assertTrue(net.op.name.startswith('InceptionV4/Logits/AvgPool'))
+    self.assertListEqual(net.get_shape().as_list(), [batch_size, 1, 1, 1536])
+    self.assertFalse('Logits' in end_points)
+    self.assertFalse('Predictions' in end_points)
+
+  def testBuildWithoutAuxLogits(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, endpoints = inception.inception_v4(inputs, num_classes,
+                                               create_aux_logits=False)
+    self.assertFalse('AuxLogits' in endpoints)
+    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+
+  def testAllEndPointsShapes(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    _, end_points = inception.inception_v4(inputs, num_classes)
+    endpoints_shapes = {'Conv2d_1a_3x3': [batch_size, 149, 149, 32],
+                        'Conv2d_2a_3x3': [batch_size, 147, 147, 32],
+                        'Conv2d_2b_3x3': [batch_size, 147, 147, 64],
+                        'Mixed_3a': [batch_size, 73, 73, 160],
+                        'Mixed_4a': [batch_size, 71, 71, 192],
+                        'Mixed_5a': [batch_size, 35, 35, 384],
+                        # 4 x Inception-A blocks
+                        'Mixed_5b': [batch_size, 35, 35, 384],
+                        'Mixed_5c': [batch_size, 35, 35, 384],
+                        'Mixed_5d': [batch_size, 35, 35, 384],
+                        'Mixed_5e': [batch_size, 35, 35, 384],
+                        # Reduction-A block
+                        'Mixed_6a': [batch_size, 17, 17, 1024],
+                        # 7 x Inception-B blocks
+                        'Mixed_6b': [batch_size, 17, 17, 1024],
+                        'Mixed_6c': [batch_size, 17, 17, 1024],
+                        'Mixed_6d': [batch_size, 17, 17, 1024],
+                        'Mixed_6e': [batch_size, 17, 17, 1024],
+                        'Mixed_6f': [batch_size, 17, 17, 1024],
+                        'Mixed_6g': [batch_size, 17, 17, 1024],
+                        'Mixed_6h': [batch_size, 17, 17, 1024],
+                        # Reduction-A block
+                        'Mixed_7a': [batch_size, 8, 8, 1536],
+                        # 3 x Inception-C blocks
+                        'Mixed_7b': [batch_size, 8, 8, 1536],
+                        'Mixed_7c': [batch_size, 8, 8, 1536],
+                        'Mixed_7d': [batch_size, 8, 8, 1536],
+                        # Logits and predictions
+                        'AuxLogits': [batch_size, num_classes],
+                        'global_pool': [batch_size, 1, 1, 1536],
+                        'PreLogitsFlatten': [batch_size, 1536],
+                        'Logits': [batch_size, num_classes],
+                        'Predictions': [batch_size, num_classes]}
+    self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys())
+    for endpoint_name in endpoints_shapes:
+      expected_shape = endpoints_shapes[endpoint_name]
+      self.assertTrue(endpoint_name in end_points)
+      self.assertListEqual(end_points[endpoint_name].get_shape().as_list(),
+                           expected_shape)
+
+  def testBuildBaseNetwork(self):
+    batch_size = 5
+    height, width = 299, 299
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    net, end_points = inception.inception_v4_base(inputs)
+    self.assertTrue(net.op.name.startswith(
+        'InceptionV4/Mixed_7d'))
+    self.assertListEqual(net.get_shape().as_list(), [batch_size, 8, 8, 1536])
+    expected_endpoints = [
+        'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a',
+        'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
+        'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d',
+        'Mixed_6e', 'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a',
+        'Mixed_7b', 'Mixed_7c', 'Mixed_7d']
+    self.assertItemsEqual(end_points.keys(), expected_endpoints)
+    for name, op in end_points.items():
+      self.assertTrue(op.name.startswith('InceptionV4/' + name))
+
+  def testBuildOnlyUpToFinalEndpoint(self):
+    batch_size = 5
+    height, width = 299, 299
+    all_endpoints = [
+        'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', 'Mixed_3a',
+        'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
+        'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d',
+        'Mixed_6e', 'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a',
+        'Mixed_7b', 'Mixed_7c', 'Mixed_7d']
+    for index, endpoint in enumerate(all_endpoints):
+      with tf.Graph().as_default():
+        inputs = tf.random.uniform((batch_size, height, width, 3))
+        out_tensor, end_points = inception.inception_v4_base(
+            inputs, final_endpoint=endpoint)
+        self.assertTrue(out_tensor.op.name.startswith(
+            'InceptionV4/' + endpoint))
+        self.assertItemsEqual(all_endpoints[:index+1], end_points.keys())
+
+  def testVariablesSetDevice(self):
+    batch_size = 5
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    # Force all Variables to reside on the device.
+    with tf.compat.v1.variable_scope('on_cpu'), tf.device('/cpu:0'):
+      inception.inception_v4(inputs, num_classes)
+    with tf.compat.v1.variable_scope('on_gpu'), tf.device('/gpu:0'):
+      inception.inception_v4(inputs, num_classes)
+    for v in tf.compat.v1.get_collection(
+        tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='on_cpu'):
+      self.assertDeviceEqual(v.device, '/cpu:0')
+    for v in tf.compat.v1.get_collection(
+        tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope='on_gpu'):
+      self.assertDeviceEqual(v.device, '/gpu:0')
+
+  def testHalfSizeImages(self):
+    batch_size = 5
+    height, width = 150, 150
+    num_classes = 1000
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v4(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    pre_pool = end_points['Mixed_7d']
+    self.assertListEqual(pre_pool.get_shape().as_list(),
+                         [batch_size, 3, 3, 1536])
+
+  def testGlobalPool(self):
+    batch_size = 1
+    height, width = 350, 400
+    num_classes = 1000
+    inputs = tf.random.uniform((batch_size, height, width, 3))
+    logits, end_points = inception.inception_v4(inputs, num_classes)
+    self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
+    self.assertListEqual(logits.get_shape().as_list(),
+                         [batch_size, num_classes])
+    pre_pool = end_points['Mixed_7d']
+    self.assertListEqual(pre_pool.get_shape().as_list(),
+                         [batch_size, 9, 11, 1536])
+
+  def testGlobalPoolUnknownImageShape(self):
+    batch_size = 1
+    height, width = 350, 400
+    num_classes = 1000
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(tf.float32, (batch_size, None, None, 3))
+      logits, end_points = inception.inception_v4(
+          inputs, num_classes, create_aux_logits=False)
+      self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [batch_size, num_classes])
+      pre_pool = end_points['Mixed_7d']
+      images = tf.random.uniform((batch_size, height, width, 3))
+      sess.run(tf.compat.v1.global_variables_initializer())
+      logits_out, pre_pool_out = sess.run([logits, pre_pool],
+                                          {inputs: images.eval()})
+      self.assertTupleEqual(logits_out.shape, (batch_size, num_classes))
+      self.assertTupleEqual(pre_pool_out.shape, (batch_size, 9, 11, 1536))
+
+  def testUnknownBatchSize(self):
+    batch_size = 1
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session() as sess:
+      inputs = tf.compat.v1.placeholder(tf.float32, (None, height, width, 3))
+      logits, _ = inception.inception_v4(inputs, num_classes)
+      self.assertTrue(logits.op.name.startswith('InceptionV4/Logits'))
+      self.assertListEqual(logits.get_shape().as_list(),
+                           [None, num_classes])
+      images = tf.random.uniform((batch_size, height, width, 3))
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(logits, {inputs: images.eval()})
+      self.assertEquals(output.shape, (batch_size, num_classes))
+
+  def testEvaluation(self):
+    batch_size = 2
+    height, width = 299, 299
+    num_classes = 1000
+    with self.test_session() as sess:
+      eval_inputs = tf.random.uniform((batch_size, height, width, 3))
+      logits, _ = inception.inception_v4(eval_inputs,
+                                         num_classes,
+                                         is_training=False)
+      predictions = tf.argmax(input=logits, axis=1)
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (batch_size,))
+
+  def testTrainEvalWithReuse(self):
+    train_batch_size = 5
+    eval_batch_size = 2
+    height, width = 150, 150
+    num_classes = 1000
+    with self.test_session() as sess:
+      train_inputs = tf.random.uniform((train_batch_size, height, width, 3))
+      inception.inception_v4(train_inputs, num_classes)
+      eval_inputs = tf.random.uniform((eval_batch_size, height, width, 3))
+      logits, _ = inception.inception_v4(eval_inputs,
+                                         num_classes,
+                                         is_training=False,
+                                         reuse=True)
+      predictions = tf.argmax(input=logits, axis=1)
+      sess.run(tf.compat.v1.global_variables_initializer())
+      output = sess.run(predictions)
+      self.assertEquals(output.shape, (eval_batch_size,))
+
+  def testNoBatchNormScaleByDefault(self):
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with contrib_slim.arg_scope(inception.inception_v4_arg_scope()):
+      inception.inception_v4(inputs, num_classes, is_training=False)
+
+    self.assertEqual(tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'), [])
+
+  def testBatchNormScale(self):
+    height, width = 299, 299
+    num_classes = 1000
+    inputs = tf.compat.v1.placeholder(tf.float32, (1, height, width, 3))
+    with contrib_slim.arg_scope(
+        inception.inception_v4_arg_scope(batch_norm_scale=True)):
+      inception.inception_v4(inputs, num_classes, is_training=False)
+
+    gamma_names = set(
+        v.op.name
+        for v in tf.compat.v1.global_variables('.*/BatchNorm/gamma:0$'))
+    self.assertGreater(len(gamma_names), 0)
+    for v in tf.compat.v1.global_variables('.*/BatchNorm/moving_mean:0$'):
+      self.assertIn(v.op.name[:-len('moving_mean')] + 'gamma', gamma_names)
+
+
+# Run the test cases through tf.test.main() when executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,98 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains a variant of the LeNet model definition."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+# Module-local alias for the tensorflow.contrib slim package imported above.
+slim = contrib_slim
+
+
+def lenet(images, num_classes=10, is_training=False,
+          dropout_keep_prob=0.5,
+          prediction_fn=slim.softmax,
+          scope='LeNet'):
+  """Creates a variant of the LeNet model.
+
+  Note that since the output is a set of 'logits', the values fall in the
+  interval of (-infinity, infinity). Consequently, to convert the outputs to a
+  probability distribution over the characters, one will need to convert them
+  using the softmax function:
+
+        logits = lenet.lenet(images, is_training=False)
+        probabilities = tf.nn.softmax(logits)
+        predictions = tf.argmax(logits, 1)
+
+  Args:
+    images: A batch of `Tensors` of size [batch_size, height, width, channels].
+    num_classes: the number of classes in the dataset. If 0 or None, the logits
+      layer is omitted and the input features to the logits layer are returned
+      instead.
+    is_training: specifies whether or not we're currently training the model.
+      This variable will determine the behaviour of the dropout layer.
+    dropout_keep_prob: the percentage of activation values that are retained.
+    prediction_fn: a function to get predictions out of logits.
+    scope: Optional variable_scope.
+
+  Returns:
+     net: a 2D Tensor with the logits (pre-softmax activations) if num_classes
+      is a non-zero integer, or the inon-dropped-out nput to the logits layer
+      if num_classes is 0 or None.
+    end_points: a dictionary from components of the network to the corresponding
+      activation.
+  """
+  end_points = {}
+
+  with tf.compat.v1.variable_scope(scope, 'LeNet', [images]):
+    net = end_points['conv1'] = slim.conv2d(images, 32, [5, 5], scope='conv1')
+    net = end_points['pool1'] = slim.max_pool2d(net, [2, 2], 2, scope='pool1')
+    net = end_points['conv2'] = slim.conv2d(net, 64, [5, 5], scope='conv2')
+    net = end_points['pool2'] = slim.max_pool2d(net, [2, 2], 2, scope='pool2')
+    net = slim.flatten(net)
+    end_points['Flatten'] = net
+
+    net = end_points['fc3'] = slim.fully_connected(net, 1024, scope='fc3')
+    if not num_classes:
+      return net, end_points
+    net = end_points['dropout3'] = slim.dropout(
+        net, dropout_keep_prob, is_training=is_training, scope='dropout3')
+    logits = end_points['Logits'] = slim.fully_connected(
+        net, num_classes, activation_fn=None, scope='fc4')
+
+  end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+
+  return logits, end_points
+lenet.default_image_size = 28
+
+
+def lenet_arg_scope(weight_decay=0.0):
+  """Defines the default lenet argument scope.
+
+  Args:
+    weight_decay: The weight decay to use for regularizing the model.
+
+  Returns:
+    An `arg_scope` to use for the inception v3 model.
+  """
+  with slim.arg_scope(
+      [slim.conv2d, slim.fully_connected],
+      weights_regularizer=slim.l2_regularizer(weight_decay),
+      weights_initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.1),
+      activation_fn=tf.nn.relu) as sc:
+    return sc
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+</project>
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+</project>
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/mobilenet.iml" filepath="$PROJECT_DIR$/.idea/mobilenet.iml" />
+    </modules>
+  </component>
+</project>
@@ -0,0 +1,166 @@
+# MobileNet
+
+This folder contains building code for
+[MobileNetV2](https://arxiv.org/abs/1801.04381) and
+[MobilenetV3](https://arxiv.org/abs/1905.02244) networks. The architectural
+definition for each model is located in [mobilenet_v2.py](mobilenet_v2.py) and
+[mobilenet_v3.py](mobilenet_v3.py) respectively.
+
+For MobilenetV1, please refer to this [page](../mobilenet_v1.md).
+
+We have also introduced a family of MobileNets customized for the Edge TPU
+accelerator found in
+[Google Pixel4](https://blog.google/products/pixel/pixel-4/) devices. The
+architectural definition for MobileNetEdgeTPU is located in
+[mobilenet_v3.py](mobilenet_v3.py).
+
+## Performance
+
+### Mobilenet V3 latency
+
+This is the timing of [MobileNetV2] vs [MobileNetV3] using TF-Lite on the large
+core of a Pixel 1 phone.
+
+![Mobilenet V2 and V3 Latency for Pixel 1.png](g3doc/latency_pixel1.png)
+
+### MACs
+
+MACs, also sometimes known as MADDs, are the number of multiply-accumulate
+operations needed to compute an inference on a single image; this is a common
+metric for measuring the efficiency of a model. Full size Mobilenet V3 on image
+size 224 uses ~215 million MADDs (MMadds) while achieving 75.1% accuracy,
+whereas Mobilenet V2 uses ~300 MMadds and achieves 72% accuracy. By comparison,
+ResNet-50 uses approximately 3500 MMadds while achieving 76% accuracy.
+
+Below is the graph comparing Mobilenets and a few selected networks. The size of
+each blob represents the number of parameters. Note for
+[ShuffleNet](https://arxiv.org/abs/1707.01083) there are no published size
+numbers. We estimate it to be comparable to MobileNetV2 numbers.
+
+![madds_top1_accuracy](g3doc/madds_top1_accuracy.png)
+
+### Mobilenet EdgeTPU latency
+
+The figure below shows the Pixel 4 Edge TPU latency of int8-quantized Mobilenet
+EdgeTPU compared with MobilenetV2 and the minimalistic variants of MobilenetV3
+(see below).
+
+![Mobilenet Edge TPU latency for Pixel 4 Edge TPU.png](g3doc/edgetpu_latency.png)
+
+## Pretrained models
+
+### Mobilenet V3 Imagenet Checkpoints
+
+All mobilenet V3 checkpoints were trained with image resolution 224x224. All
+phone latencies are in milliseconds, measured on the large core. In addition to
+the large and small models, this page also contains so-called minimalistic
+models. These models have the same per-layer dimensions as MobilenetV3;
+however, they don't utilize any of the advanced blocks (squeeze-and-excite
+units, hard-swish, and 5x5 convolutions). While these models are less efficient
+on CPU, we find that they are much more performant on GPU/DSP.
+
+| Imagenet Checkpoint | MACs (M) | Params (M) | Top1 | Pixel 1 | Pixel 2 | Pixel 3 |
+| ------------------ | -------- | ---------- | ---- | ------- | ------- | ------- |
+| [Large dm=1 (float)]   | 217      | 5.4        | 75.2 | 51.2    | 61      | 44      |
+| [Large dm=1 (8-bit)] | 217      | 5.4        | 73.9 | 44      | 42.5    | 32      |
+| [Large dm=0.75 (float)] | 155      | 4.0        | 73.3 | 39.8    | 48      | 34      |
+| [Small dm=1 (float)]   | 66       | 2.9        | 67.5 | 15.8    | 19.4    | 14.4    |
+| [Small dm=1 (8-bit)]   | 66       | 2.9        | 64.9 | 15.5    | 15      | 10.7    |
+| [Small dm=0.75 (float)] | 44       | 2.4        | 65.4 | 12.8    | 15.9    | 11.6    |
+
+#### Minimalistic checkpoints:
+
+| Imagenet Checkpoint | MACs (M) | Params (M) | Top1 | Pixel 1 | Pixel 2 | Pixel 3 |
+| -------------- | -------- | ---------- | ---- | ------- | ------- | ------- |
+| [Large minimalistic (float)]   | 209      | 3.9        | 72.3 | 44.1    | 51      | 35      |
+| [Large minimalistic (8-bit)][lm8]   | 209      | 3.9        | 71.3 | 37      | 35      | 27      |
+| [Small minimalistic (float)]   | 65       | 2.0        | 61.9 | 12.2    | 15.1    | 11      |
+
+#### Edge TPU checkpoints:
+
+| Imagenet Checkpoint | MACs (M) | Params (M) | Top1 | Pixel 4 Edge TPU | Pixel 4 CPU |
+| ----------------- | -------- | ---------- | ---- | ------- | ----------- |
+| [MobilenetEdgeTPU dm=0.75 (8-bit)]| 624      | 2.9        | 73.5 | 3.1     | 13.8        |
+| [MobilenetEdgeTPU dm=1 (8-bit)] | 990      | 4.0        | 75.6 | 3.6     | 20.6        |
+
+
+Note: 8-bit quantized versions of the MobilenetEdgeTPU models were obtained
+using Tensorflow Lite's
+[post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization)
+tool.
+
+[Small minimalistic (float)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-small-minimalistic_224_1.0_float.tgz
+[Large minimalistic (float)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-large-minimalistic_224_1.0_float.tgz
+[lm8]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-large-minimalistic_224_1.0_uint8.tgz
+[Large dm=1 (float)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-large_224_1.0_float.tgz
+[Small dm=1 (float)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-small_224_1.0_float.tgz
+[Large dm=1 (8-bit)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-large_224_1.0_uint8.tgz
+[Small dm=1 (8-bit)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-small_224_1.0_uint8.tgz
+[Large dm=0.75 (float)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-large_224_0.75_float.tgz
+[Small dm=0.75 (float)]: https://storage.googleapis.com/mobilenet_v3/checkpoints/v3-small_224_0.75_float.tgz
+[MobilenetEdgeTPU dm=0.75 (8-bit)]: https://storage.cloud.google.com/mobilenet_edgetpu/checkpoints/mobilenet_edgetpu_224_0.75.tgz
+[MobilenetEdgeTPU dm=1 (8-bit)]: https://storage.cloud.google.com/mobilenet_edgetpu/checkpoints/mobilenet_edgetpu_224_1.0.tgz
+
+### Mobilenet V2 Imagenet Checkpoints
+
+Classification Checkpoint                                                                                  | MACs (M) | Parameters (M) | Top 1 Accuracy | Top 5 Accuracy | Mobile CPU (ms) Pixel 1
+---------------------------------------------------------------------------------------------------------- | -------- | -------------- | -------------- | -------------- | -----------------------
+[mobilenet_v2_1.4_224](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.4_224.tgz)   | 582      | 6.06           | 75.0           | 92.5           | 138.0
+[mobilenet_v2_1.3_224](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.3_224.tgz)   | 509      | 5.34           | 74.4           | 92.1           | 123.0
+[mobilenet_v2_1.0_224](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz)   | 300      | 3.47           | 71.8           | 91.0           | 73.8
+[mobilenet_v2_1.0_192](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_192.tgz)   | 221      | 3.47           | 70.7           | 90.1           | 55.1
+[mobilenet_v2_1.0_160](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_160.tgz)   | 154      | 3.47           | 68.8           | 89.0           | 40.2
+[mobilenet_v2_1.0_128](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_128.tgz)   | 99       | 3.47           | 65.3           | 86.9           | 27.6
+[mobilenet_v2_1.0_96](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_96.tgz)     | 56       | 3.47           | 60.3           | 83.2           | 17.6
+[mobilenet_v2_0.75_224](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.75_224.tgz) | 209      | 2.61           | 69.8           | 89.6           | 55.8
+[mobilenet_v2_0.75_192](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.75_192.tgz) | 153      | 2.61           | 68.7           | 88.9           | 41.6
+[mobilenet_v2_0.75_160](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.75_160.tgz) | 107      | 2.61           | 66.4           | 87.3           | 30.4
+[mobilenet_v2_0.75_128](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.75_128.tgz) | 69       | 2.61           | 63.2           | 85.3           | 21.9
+[mobilenet_v2_0.75_96](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.75_96.tgz)   | 39       | 2.61           | 58.8           | 81.6           | 14.2
+[mobilenet_v2_0.5_224](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.5_224.tgz)   | 97       | 1.95           | 65.4           | 86.4           | 28.7
+[mobilenet_v2_0.5_192](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.5_192.tgz)   | 71       | 1.95           | 63.9           | 85.4           | 21.1
+[mobilenet_v2_0.5_160](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.5_160.tgz)   | 50       | 1.95           | 61.0           | 83.2           | 14.9
+[mobilenet_v2_0.5_128](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.5_128.tgz)   | 32       | 1.95           | 57.7           | 80.8           | 9.9
+[mobilenet_v2_0.5_96](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.5_96.tgz)     | 18       | 1.95           | 51.2           | 75.8           | 6.4
+[mobilenet_v2_0.35_224](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_224.tgz) | 59       | 1.66           | 60.3           | 82.9           | 19.7
+[mobilenet_v2_0.35_192](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_192.tgz) | 43       | 1.66           | 58.2           | 81.2           | 14.6
+[mobilenet_v2_0.35_160](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_160.tgz) | 30       | 1.66           | 55.7           | 79.1           | 10.5
+[mobilenet_v2_0.35_128](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_128.tgz) | 20       | 1.66           | 50.8           | 75.0           | 6.9
+[mobilenet_v2_0.35_96](https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_0.35_96.tgz)   | 11       | 1.66           | 45.5           | 70.4           | 4.5
+
+## Training
+
+### V3
+
+TODO: Add V3 hyperparameters
+
+### V2
+
+The numbers above can be reproduced using slim's
+[`train_image_classifier`](https://github.com/tensorflow/models/blob/master/research/slim/README.md#training-a-model-from-scratch).
+Below is the set of parameters that achieves 72.0% accuracy for full size
+MobileNetV2 after about 700K steps when trained on 8 GPUs. If trained on a
+single GPU, full convergence is reached after 5.5M steps. Also note that the
+learning rate and num_epochs_per_decay both need to be adjusted depending on
+how many GPUs are being used, due to slim's internal averaging.
+
+```bash
+--model_name="mobilenet_v2"
+--learning_rate=0.045 * NUM_GPUS   #slim internally averages clones so we compensate
+--preprocessing_name="inception_v2"
+--label_smoothing=0.1
+--moving_average_decay=0.9999
+--batch_size=96
+--num_clones = NUM_GPUS # you can use any number here between 1 and 8 depending on your hardware setup.
+--learning_rate_decay_factor=0.98
+--num_epochs_per_decay = 2.5 / NUM_GPUS # train_image_classifier does per clone epochs
+```
+
+# Example
+
+See this [ipython notebook](mobilenet_example.ipynb) or open and run the network
+directly in
+[Colaboratory](https://colab.research.google.com/github/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_example.ipynb).
+
+[MobilenetV2]: https://arxiv.org/abs/1801.04381
+[MobilenetV3]: https://arxiv.org/abs/1905.02244
@@ -0,0 +1,475 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convolution blocks for mobilenet."""
+import contextlib
+import functools
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+# Module-local alias for the tensorflow.contrib slim package imported above.
+slim = contrib_slim
+
+
+def _fixed_padding(inputs, kernel_size, rate=1):
+  """Pads the input along the spatial dimensions independently of input size.
+
+  Pads the input such that if it was used in a convolution with 'VALID' padding,
+  the output would have the same dimensions as if the unpadded input was used
+  in a convolution with 'SAME' padding.
+
+  Args:
+    inputs: A tensor of size [batch, height_in, width_in, channels].
+    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
+    rate: An integer, rate for atrous convolution.
+
+  Returns:
+    output: A tensor of size [batch, height_out, width_out, channels] with the
+      input, either intact (if kernel_size == 1) or padded (if kernel_size > 1).
+  """
+  kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1),
+                           kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)]
+  pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1]
+  pad_beg = [pad_total[0] // 2, pad_total[1] // 2]
+  pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]]
+  padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]],
+                                  [pad_beg[1], pad_end[1]], [0, 0]])
+  return padded_inputs
+
+
+def _make_divisible(v, divisor, min_value=None):
+  if min_value is None:
+    min_value = divisor
+  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_v < 0.9 * v:
+    new_v += divisor
+  return new_v
+
+
+def _split_divisible(num, num_ways, divisible_by=8):
+  """Evenly splits num, num_ways so each piece is a multiple of divisible_by."""
+  assert num % divisible_by == 0
+  assert num / num_ways >= divisible_by
+  # Note: want to round down, we adjust each split to match the total.
+  base = num // num_ways // divisible_by * divisible_by
+  result = []
+  accumulated = 0
+  for i in range(num_ways):
+    r = base
+    while accumulated + r < num * (i + 1) / num_ways:
+      r += divisible_by
+    result.append(r)
+    accumulated += r
+  assert accumulated == num
+  return result
+
+
+@contextlib.contextmanager
+def _v1_compatible_scope_naming(scope):
+  """v1 compatible scope naming."""
+  if scope is None:  # Create uniqified separable blocks.
+    with tf.compat.v1.variable_scope(None, default_name='separable') as s, \
+         tf.compat.v1.name_scope(s.original_name_scope):
+      yield ''
+  else:
+    # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts.
+    # which provide numbered scopes.
+    scope += '_'
+    yield scope
+
+
+@slim.add_arg_scope
+def split_separable_conv2d(input_tensor,
+                           num_outputs,
+                           scope=None,
+                           normalizer_fn=None,
+                           stride=1,
+                           rate=1,
+                           endpoints=None,
+                           use_explicit_padding=False):
+  """Separable mobilenet V1 style convolution.
+
+  Depthwise convolution, with default non-linearity,
+  followed by 1x1 depthwise convolution.  This is similar to
+  slim.separable_conv2d, but differs in tha it applies batch
+  normalization and non-linearity to depthwise. This  matches
+  the basic building of Mobilenet Paper
+  (https://arxiv.org/abs/1704.04861)
+
+  Args:
+    input_tensor: input
+    num_outputs: number of outputs
+    scope: optional name of the scope. Note if provided it will use
+    scope_depthwise for deptwhise, and scope_pointwise for pointwise.
+    normalizer_fn: which normalizer function to use for depthwise/pointwise
+    stride: stride
+    rate: output rate (also known as dilation rate)
+    endpoints: optional, if provided, will export additional tensors to it.
+    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
+      inputs so that the output dimensions are the same as if 'SAME' padding
+      were used.
+
+  Returns:
+    output tesnor
+  """
+
+  with _v1_compatible_scope_naming(scope) as scope:
+    dw_scope = scope + 'depthwise'
+    endpoints = endpoints if endpoints is not None else {}
+    kernel_size = [3, 3]
+    padding = 'SAME'
+    if use_explicit_padding:
+      padding = 'VALID'
+      input_tensor = _fixed_padding(input_tensor, kernel_size, rate)
+    net = slim.separable_conv2d(
+        input_tensor,
+        None,
+        kernel_size,
+        depth_multiplier=1,
+        stride=stride,
+        rate=rate,
+        normalizer_fn=normalizer_fn,
+        padding=padding,
+        scope=dw_scope)
+
+    endpoints[dw_scope] = net
+
+    pw_scope = scope + 'pointwise'
+    net = slim.conv2d(
+        net,
+        num_outputs, [1, 1],
+        stride=1,
+        normalizer_fn=normalizer_fn,
+        scope=pw_scope)
+    endpoints[pw_scope] = net
+  return net
+
+
+def expand_input_by_factor(n, divisible_by=8):
+  return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by)
+
+
+def split_conv(input_tensor,
+               num_outputs,
+               num_ways,
+               scope,
+               divisible_by=8,
+               **kwargs):
+  """Creates a split convolution.
+
+  Split convolution splits the input and output into
+  'num_blocks' blocks of approximately the same size each,
+  and only connects $i$-th input to $i$ output.
+
+  Args:
+    input_tensor: input tensor
+    num_outputs: number of output filters
+    num_ways: num blocks to split by.
+    scope: scope for all the operators.
+    divisible_by: make sure that every part is divisiable by this.
+    **kwargs: will be passed directly into conv2d operator
+  Returns:
+    tensor
+  """
+  b = input_tensor.get_shape().as_list()[3]
+
+  if num_ways == 1 or min(b // num_ways,
+                          num_outputs // num_ways) < divisible_by:
+    # Don't do any splitting if we end up with less than 8 filters
+    # on either side.
+    return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs)
+
+  outs = []
+  input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by)
+  output_splits = _split_divisible(
+      num_outputs, num_ways, divisible_by=divisible_by)
+  inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope)
+  base = scope
+  for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)):
+    scope = base + '_part_%d' % (i,)
+    n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs)
+    n = tf.identity(n, scope + '_output')
+    outs.append(n)
+  return tf.concat(outs, 3, name=scope + '_concat')
+
+
+@slim.add_arg_scope
+def expanded_conv(input_tensor,
+                  num_outputs,
+                  expansion_size=expand_input_by_factor(6),
+                  stride=1,
+                  rate=1,
+                  kernel_size=(3, 3),
+                  residual=True,
+                  normalizer_fn=None,
+                  split_projection=1,
+                  split_expansion=1,
+                  split_divisible_by=8,
+                  expansion_transform=None,
+                  depthwise_location='expansion',
+                  depthwise_channel_multiplier=1,
+                  endpoints=None,
+                  use_explicit_padding=False,
+                  padding='SAME',
+                  inner_activation_fn=None,
+                  depthwise_activation_fn=None,
+                  project_activation_fn=tf.identity,
+                  depthwise_fn=slim.separable_conv2d,
+                  expansion_fn=split_conv,
+                  projection_fn=split_conv,
+                  scope=None):
+  """Depthwise Convolution Block with expansion.
+
+  Builds a composite convolution that has the following structure
+  expansion (1x1) -> depthwise (kernel_size) -> projection (1x1)
+
+  Args:
+    input_tensor: input
+    num_outputs: number of outputs in the final layer.
+    expansion_size: the size of expansion, could be a constant or a callable.
+      If latter it will be provided 'num_inputs' as an input. For forward
+      compatibility it should accept arbitrary keyword arguments.
+      Default will expand the input by factor of 6. The expansion layer is
+      only built when this size is larger than the current depth.
+    stride: depthwise stride
+    rate: depthwise rate
+    kernel_size: depthwise kernel
+    residual: whether to include residual connection between input
+      and output. May also be a callable, in which case it is invoked with
+      keyword arguments `input_tensor` and `output_tensor` and its result is
+      used instead. A plain residual is only added when stride == 1 and the
+      input and output depths match.
+    normalizer_fn: batchnorm or otherwise
+    split_projection: how many ways to split projection operator
+      (that is conv expansion->bottleneck)
+    split_expansion: how many ways to split expansion op
+      (that is conv bottleneck->expansion)
+    split_divisible_by: make sure every split group is divisible by this number.
+    expansion_transform: Optional function that is called with keyword
+      arguments `expansion_tensor` and `input_tensor` and returns the
+      transformed expansion tensor.
+    depthwise_location: where to put depthwise convolutions. Supported
+      values are None, 'input', 'output', 'expansion'.
+    depthwise_channel_multiplier: depthwise channel multiplier:
+      each input will be replicated (with different filters)
+      that many times. So if input had c channels,
+      output will have c x depthwise_channel_multiplier.
+    endpoints: An optional dictionary into which intermediate endpoints are
+      placed. "projection_output" is always populated; "expansion_output" and
+      "depthwise_output" are populated only when the corresponding layer is
+      actually built.
+    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
+      inputs so that the output dimensions are the same as if 'SAME' padding
+      were used.
+    padding: Padding type to use if `use_explicit_padding` is not set.
+    inner_activation_fn: activation function to use in all inner convolutions.
+      If none, will rely on slim default scopes.
+    depthwise_activation_fn: activation function to use for depthwise only.
+      If not provided will rely on slim default scopes. If both
+      inner_activation_fn and depthwise_activation_fn are provided,
+      depthwise_activation_fn takes precedence over inner_activation_fn.
+    project_activation_fn: activation function for the project layer.
+      (note this layer is not affected by inner_activation_fn)
+    depthwise_fn: Depthwise convolution function.
+    expansion_fn: Expansion convolution function. If use custom function then
+      "split_expansion" and "split_divisible_by" will be ignored.
+    projection_fn: Projection convolution function. If use custom function then
+      "split_projection" and "split_divisible_by" will be ignored.
+
+    scope: optional scope.
+
+  Returns:
+    Tensor of depth num_outputs
+
+  Raises:
+    TypeError: if `depthwise_location` is not one of the supported values, or
+      if `use_explicit_padding` is set together with a `padding` other than
+      'SAME'.
+  """
+  # Per-call activation overrides, applied through slim arg scopes below.
+  conv_defaults = {}
+  dw_defaults = {}
+  if inner_activation_fn is not None:
+    conv_defaults['activation_fn'] = inner_activation_fn
+    dw_defaults['activation_fn'] = inner_activation_fn
+  if depthwise_activation_fn is not None:
+    # Takes precedence over inner_activation_fn for the depthwise layer.
+    dw_defaults['activation_fn'] = depthwise_activation_fn
+  # pylint: disable=g-backslash-continuation
+  with tf.compat.v1.variable_scope(scope, default_name='expanded_conv') as s, \
+       tf.compat.v1.name_scope(s.original_name_scope), \
+      slim.arg_scope((slim.conv2d,), **conv_defaults), \
+       slim.arg_scope((slim.separable_conv2d,), **dw_defaults):
+    prev_depth = input_tensor.get_shape().as_list()[3]
+    if  depthwise_location not in [None, 'input', 'output', 'expansion']:
+      raise TypeError('%r is unknown value for depthwise_location' %
+                      depthwise_location)
+    if use_explicit_padding:
+      if padding != 'SAME':
+        raise TypeError('`use_explicit_padding` should only be used with '
+                        '"SAME" padding.')
+      # Inputs are padded manually before each depthwise conv instead.
+      padding = 'VALID'
+    # Depthwise conv with all shared arguments bound; only the input (and
+    # optionally activation_fn) is supplied at the call sites below.
+    depthwise_func = functools.partial(
+        depthwise_fn,
+        num_outputs=None,
+        kernel_size=kernel_size,
+        depth_multiplier=depthwise_channel_multiplier,
+        stride=stride,
+        rate=rate,
+        normalizer_fn=normalizer_fn,
+        padding=padding,
+        scope='depthwise')
+    # b1 -> b2 * r -> b2
+    #   i -> (o * r) (bottleneck) -> o
+    input_tensor = tf.identity(input_tensor, 'input')
+    net = input_tensor
+
+    if depthwise_location == 'input':
+      if use_explicit_padding:
+        net = _fixed_padding(net, kernel_size, rate)
+      net = depthwise_func(net, activation_fn=None)
+      net = tf.identity(net, name='depthwise_output')
+      if endpoints is not None:
+        endpoints['depthwise_output'] = net
+
+    if callable(expansion_size):
+      inner_size = expansion_size(num_inputs=prev_depth)
+    else:
+      inner_size = expansion_size
+
+    # The expansion layer is skipped when it would not increase the depth.
+    if inner_size > net.shape[3]:
+      if expansion_fn == split_conv:
+        # Only the default split_conv receives the split-related arguments.
+        expansion_fn = functools.partial(
+            expansion_fn,
+            num_ways=split_expansion,
+            divisible_by=split_divisible_by,
+            stride=1)
+      net = expansion_fn(
+          net,
+          inner_size,
+          scope='expand',
+          normalizer_fn=normalizer_fn)
+      net = tf.identity(net, 'expansion_output')
+      if endpoints is not None:
+        endpoints['expansion_output'] = net
+
+    if depthwise_location == 'expansion':
+      if use_explicit_padding:
+        net = _fixed_padding(net, kernel_size, rate)
+      net = depthwise_func(net)
+      net = tf.identity(net, name='depthwise_output')
+      if endpoints is not None:
+        endpoints['depthwise_output'] = net
+
+    if expansion_transform:
+      net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor)
+    # Note in contrast with expansion, we always have
+    # projection to produce the desired output size.
+    if projection_fn == split_conv:
+      # Only the default split_conv receives the split-related arguments.
+      projection_fn = functools.partial(
+          projection_fn,
+          num_ways=split_projection,
+          divisible_by=split_divisible_by,
+          stride=1)
+    net = projection_fn(
+        net,
+        num_outputs,
+        scope='project',
+        normalizer_fn=normalizer_fn,
+        activation_fn=project_activation_fn)
+    if endpoints is not None:
+      endpoints['projection_output'] = net
+    if depthwise_location == 'output':
+      if use_explicit_padding:
+        net = _fixed_padding(net, kernel_size, rate)
+      net = depthwise_func(net, activation_fn=None)
+      net = tf.identity(net, name='depthwise_output')
+      if endpoints is not None:
+        endpoints['depthwise_output'] = net
+
+    if callable(residual):  # custom residual
+      net = residual(input_tensor=input_tensor, output_tensor=net)
+    elif (residual and
+          # stride check enforces that we don't add residuals when spatial
+          # dimensions are None
+          stride == 1 and
+          # Depth matches
+          net.get_shape().as_list()[3] ==
+          input_tensor.get_shape().as_list()[3]):
+      net += input_tensor
+    return tf.identity(net, name='output')
+
+
+@slim.add_arg_scope
+def squeeze_excite(input_tensor,
+                   divisible_by=8,
+                   squeeze_factor=3,
+                   inner_activation_fn=tf.nn.relu,
+                   gating_fn=tf.sigmoid,
+                   squeeze_input_tensor=None,
+                   pool=None):
+  """Squeeze excite block for Mobilenet V3.
+
+  If the squeeze_input_tensor - or the input_tensor if squeeze_input_tensor is
+  None - contains variable dimensions (Nonetype in tensor shape), perform
+  average pooling (as the first step in the squeeze operation) by calling
+  reduce_mean across the H/W of the input tensor.
+
+  Args:
+    input_tensor: input tensor to apply SE block to.
+    divisible_by: ensures all inner dimensions are divisible by this number.
+    squeeze_factor: the factor of squeezing in the inner fully connected layer
+    inner_activation_fn: non-linearity to be used in inner layer.
+    gating_fn: non-linearity to be used for final gating function
+    squeeze_input_tensor: custom tensor to use for computing gating activation.
+     If provided the result will be input_tensor * SE(squeeze_input_tensor)
+     instead of input_tensor * SE(input_tensor).
+    pool: if number is  provided will average pool with that kernel size
+      to compute inner tensor, followed by bilinear upsampling.
+
+  Returns:
+    Gated input_tensor. (e.g. X * SE(X))
+  """
+  with tf.compat.v1.variable_scope('squeeze_excite'):
+    if squeeze_input_tensor is None:
+      squeeze_input_tensor = input_tensor
+    input_size = input_tensor.shape.as_list()[1:3]
+    pool_height, pool_width = squeeze_input_tensor.shape.as_list()[1:3]
+    stride = 1
+    if pool is not None and pool_height >= pool:
+      pool_height, pool_width, stride = pool, pool, pool
+    input_channels = squeeze_input_tensor.shape.as_list()[3]
+    output_channels = input_tensor.shape.as_list()[3]
+    squeeze_channels = _make_divisible(
+        input_channels / squeeze_factor, divisor=divisible_by)
+
+    if pool is None:
+      pooled = tf.reduce_mean(squeeze_input_tensor, axis=[1, 2], keepdims=True)
+    else:
+      pooled = tf.nn.avg_pool(
+          squeeze_input_tensor, (1, pool_height, pool_width, 1),
+          strides=(1, stride, stride, 1),
+          padding='VALID')
+    squeeze = slim.conv2d(
+        pooled,
+        kernel_size=(1, 1),
+        num_outputs=squeeze_channels,
+        normalizer_fn=None,
+        activation_fn=inner_activation_fn)
+    excite_outputs = output_channels
+    excite = slim.conv2d(squeeze, num_outputs=excite_outputs,
+                         kernel_size=[1, 1],
+                         normalizer_fn=None,
+                         activation_fn=gating_fn)
+    if pool is not None:
+      # Note: As of 03/20/2019 only BILINEAR (the default) with
+      # align_corners=True has gradients implemented in TPU.
+      excite = tf.image.resize_images(
+          excite, input_size,
+          align_corners=True)
+    result = input_tensor * excite
+  return result
@@ -0,0 +1,501 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Mobilenet Base Class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+import contextlib
+import copy
+import os
+
+import tensorflow as tf
+from tensorflow.contrib import slim as contrib_slim
+
+slim = contrib_slim
+
+
+@slim.add_arg_scope
+def apply_activation(x, name=None, activation_fn=None):
+  return activation_fn(x, name=name) if activation_fn else x
+
+
+def _fixed_padding(inputs, kernel_size, rate=1):
+  """Pads the input along the spatial dimensions independently of input size.
+
+  Pads the input such that if it was used in a convolution with 'VALID' padding,
+  the output would have the same dimensions as if the unpadded input was used
+  in a convolution with 'SAME' padding.
+
+  Args:
+    inputs: A tensor of size [batch, height_in, width_in, channels].
+    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
+    rate: An integer, rate for atrous convolution.
+
+  Returns:
+    output: A tensor of size [batch, height_out, width_out, channels] with the
+      input, either intact (if kernel_size == 1) or padded (if kernel_size > 1).
+  """
+  kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1),
+                           kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)]
+  pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1]
+  pad_beg = [pad_total[0] // 2, pad_total[1] // 2]
+  pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]]
+  padded_inputs = tf.pad(
+      tensor=inputs,
+      paddings=[[0, 0], [pad_beg[0], pad_end[0]], [pad_beg[1], pad_end[1]],
+                [0, 0]])
+  return padded_inputs
+
+
+def _make_divisible(v, divisor, min_value=None):
+  if min_value is None:
+    min_value = divisor
+  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+  # Make sure that round down does not go down by more than 10%.
+  if new_v < 0.9 * v:
+    new_v += divisor
+  return int(new_v)
+
+
+@contextlib.contextmanager
+def _set_arg_scope_defaults(defaults):
+  """Sets arg scope defaults for all items present in defaults.
+
+  Args:
+    defaults: dictionary/list of pairs, containing a mapping from
+    function to a dictionary of default args.
+
+  Yields:
+    context manager where all defaults are set.
+  """
+  if hasattr(defaults, 'items'):
+    items = list(defaults.items())
+  else:
+    items = defaults
+  if not items:
+    yield
+  else:
+    func, default_arg = items[0]
+    with slim.arg_scope(func, **default_arg):
+      with _set_arg_scope_defaults(items[1:]):
+        yield
+
+
+@slim.add_arg_scope
+def depth_multiplier(output_params,
+                     multiplier,
+                     divisible_by=8,
+                     min_depth=8,
+                     **unused_kwargs):
+  if 'num_outputs' not in output_params:
+    return
+  d = output_params['num_outputs']
+  output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by,
+                                                 min_depth)
+
+
+# Record describing one layer of a network spec: `op` is the callable that
+# builds the layer, `params` holds its keyword arguments, and
+# `multiplier_func` is called as `multiplier_func(params, multiplier)` to
+# apply the depth multiplier to those arguments.
+_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func'])
+
+
+def op(opfunc, multiplier_func=depth_multiplier, **params):
+  multiplier = params.pop('multiplier_transform', multiplier_func)
+  return _Op(opfunc, params=params, multiplier_func=multiplier)
+
+
+class NoOpScope(object):
+  """No-op context manager."""
+
+  def __enter__(self):
+    return None
+
+  def __exit__(self, exc_type, exc_value, traceback):
+    return False
+
+
+def safe_arg_scope(funcs, **kwargs):
+  """Returns `slim.arg_scope` with all None arguments removed.
+
+  Arguments:
+    funcs: Functions to pass to `arg_scope`.
+    **kwargs: Arguments to pass to `arg_scope`.
+
+  Returns:
+    arg_scope or No-op context manager.
+
+  Note: can be useful if None value should be interpreted as "do not overwrite
+    this parameter value".
+  """
+  filtered_args = {name: value for name, value in kwargs.items()
+                   if value is not None}
+  if filtered_args:
+    return slim.arg_scope(funcs, **filtered_args)
+  else:
+    return NoOpScope()
+
+
+@slim.add_arg_scope
+def mobilenet_base(  # pylint: disable=invalid-name
+    inputs,
+    conv_defs,
+    multiplier=1.0,
+    final_endpoint=None,
+    output_stride=None,
+    use_explicit_padding=False,
+    scope=None,
+    is_training=False):
+  """Mobilenet base network.
+
+  Constructs a network from inputs to the given final endpoint. By default
+  the network is constructed in inference mode. To create network
+  in training mode use:
+
+  with slim.arg_scope(mobilenet.training_scope()):
+     logits, endpoints = mobilenet_base(...)
+
+  Args:
+    inputs: a tensor of shape [batch_size, height, width, channels].
+    conv_defs: A dictionary describing the net architecture. The 'spec' entry
+      is the list of op(...) layers; the optional 'defaults' and 'overrides'
+      entries map functions to default arguments that are installed through
+      slim.arg_scope (defaults first, overrides nested inside them).
+    multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    final_endpoint: The name of the last layer to build, for early
+      termination. Layers are named "layer_<index>", counting from 1. For
+      V1-based networks the last layer is "layer_14", for V2: "layer_20".
+    output_stride: An integer that specifies the requested ratio of input to
+      output spatial resolution. If not None, then we invoke atrous convolution
+      if necessary to prevent the network from reducing the spatial resolution
+      of the activation maps. Allowed values are 1 or any even number, excluding
+      zero. Typical values are 8 (accurate fully convolutional mode), 16
+      (fast fully convolutional mode), and 32 (classification mode).
+
+      NOTE- output_stride relies on all consequent operators to support dilated
+      operators via "rate" parameter. This might require wrapping non-conv
+      operators to operate properly.
+
+    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
+      inputs so that the output dimensions are the same as if 'SAME' padding
+      were used.
+    scope: optional variable scope.
+    is_training: How to setup batch_norm and other ops. Note: most of the time
+      this does not need be set directly. Use mobilenet.training_scope() to set
+      up training instead. This parameter is here for backward compatibility
+      only. It is safe to set it to the value matching
+      training_scope(is_training=...). It is also safe to explicitly set
+      it to False, even if there is an outer training_scope set to training.
+      (The network will be built in inference mode). If this is set to None,
+      no arg_scope is added for slim.batch_norm's is_training parameter.
+
+  Returns:
+    tensor_out: output tensor.
+    end_points: a dictionary of activations for external use, for example
+                summaries or losses. Keys are "layer_<index>" plus
+                "layer_<index>/<op name>" for ops whose name ends in 'output'.
+
+  Raises:
+    ValueError: multiplier <= 0, or the target output_stride is not
+                allowed.
+  """
+  if multiplier <= 0:
+    raise ValueError('multiplier is not greater than zero.')
+
+  # Set conv defs defaults and overrides.
+  conv_defs_defaults = conv_defs.get('defaults', {})
+  conv_defs_overrides = conv_defs.get('overrides', {})
+  if use_explicit_padding:
+    # Copy first so that the caller's conv_defs is not modified.
+    conv_defs_overrides = copy.deepcopy(conv_defs_overrides)
+    conv_defs_overrides[
+        (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'}
+
+  if output_stride is not None:
+    if output_stride == 0 or (output_stride > 1 and output_stride % 2):
+      raise ValueError('Output stride must be None, 1 or a multiple of 2.')
+
+  # a) Set the tensorflow scope
+  # b) set padding to default: note we might consider removing this
+  # since it is also set by mobilenet_scope
+  # c) set all defaults
+  # d) set all extra overrides.
+  # pylint: disable=g-backslash-continuation
+  with _scope_all(scope, default_scope='Mobilenet'), \
+      safe_arg_scope([slim.batch_norm], is_training=is_training), \
+      _set_arg_scope_defaults(conv_defs_defaults), \
+      _set_arg_scope_defaults(conv_defs_overrides):
+    # The current_stride variable keeps track of the output stride of the
+    # activations, i.e., the running product of convolution strides up to the
+    # current network layer. This allows us to invoke atrous convolution
+    # whenever applying the next convolution would result in the activations
+    # having output stride larger than the target output_stride.
+    current_stride = 1
+
+    # The atrous convolution rate parameter.
+    rate = 1
+
+    net = inputs
+    # Insert default parameters before the base scope which includes
+    # any custom overrides set in mobilenet.
+    end_points = {}
+    # Maps the name scope of each layer's output tensor to its endpoint name.
+    scopes = {}
+    for i, opdef in enumerate(conv_defs['spec']):
+      # Work on a copy so that the shared op definition is never modified.
+      params = dict(opdef.params)
+      opdef.multiplier_func(params, multiplier)
+      stride = params.get('stride', 1)
+      if output_stride is not None and current_stride == output_stride:
+        # If we have reached the target output_stride, then we need to employ
+        # atrous convolution with stride=1 and multiply the atrous rate by the
+        # current unit's stride for use in subsequent layers.
+        layer_stride = 1
+        layer_rate = rate
+        rate *= stride
+      else:
+        layer_stride = stride
+        layer_rate = 1
+        current_stride *= stride
+      # Update params.
+      params['stride'] = layer_stride
+      # Only insert rate to params if rate > 1 and kernel size is not [1, 1].
+      if layer_rate > 1:
+        if tuple(params.get('kernel_size', [])) != (1, 1):
+          # We will apply atrous rate in the following cases:
+          # 1) When kernel_size is not in params, the operation then uses
+          #   default kernel size 3x3.
+          # 2) When kernel_size is in params, and if the kernel_size is not
+          #   equal to (1, 1) (there is no need to apply atrous convolution to
+          #   any 1x1 convolution).
+          params['rate'] = layer_rate
+      # Set padding
+      if use_explicit_padding:
+        if 'kernel_size' in params:
+          net = _fixed_padding(net, params['kernel_size'], layer_rate)
+        else:
+          # No kernel size known here, so the op is asked to pad explicitly.
+          params['use_explicit_padding'] = True
+
+      end_point = 'layer_%d' % (i + 1)
+      try:
+        net = opdef.op(net, **params)
+      except Exception:
+        # Report which op failed, then let the original error propagate.
+        print('Failed to create op %i: %r params: %r' % (i, opdef, params))
+        raise
+      end_points[end_point] = net
+      # Note: this rebinds the `scope` argument to the layer's name scope.
+      scope = os.path.dirname(net.name)
+      scopes[scope] = end_point
+      if final_endpoint is not None and end_point == final_endpoint:
+        break
+
+    # Add all tensors that end with 'output' to
+    # endpoints
+    for t in net.graph.get_operations():
+      scope = os.path.dirname(t.name)
+      bn = os.path.basename(t.name)
+      if scope in scopes and t.name.endswith('output'):
+        end_points[scopes[scope] + '/' + bn] = t.outputs[0]
+    return net, end_points
+
+
+@contextlib.contextmanager
+def _scope_all(scope, default_scope=None):
+  with tf.compat.v1.variable_scope(scope, default_name=default_scope) as s,\
+       tf.compat.v1.name_scope(s.original_name_scope):
+    yield s
+
+
+@slim.add_arg_scope
+def mobilenet(inputs,
+              num_classes=1001,
+              prediction_fn=slim.softmax,
+              reuse=None,
+              scope='Mobilenet',
+              base_only=False,
+              **mobilenet_args):
+  """Mobilenet model for classification, supports both V1 and V2.
+
+  Note: default mode is inference, use mobilenet.training_scope to create
+  training network.
+
+
+  Args:
+    inputs: a tensor of shape [batch_size, height, width, channels].
+    num_classes: number of predicted classes. If 0 or None, the logits layer
+      is omitted and the input features to the logits layer (before dropout)
+      are returned instead.
+    prediction_fn: a function to get predictions out of logits
+      (default softmax).
+    reuse: whether or not the network and its variables should be reused. To be
+      able to reuse 'scope' must be given.
+    scope: Optional variable_scope.
+    base_only: if True will only create the base of the network (no pooling
+    and no logits).
+    **mobilenet_args: passed to mobilenet_base verbatim.
+      - conv_defs: list of conv defs
+      - multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+      - output_stride: will ensure that the last layer has at most total stride.
+      If the architecture calls for more stride than that provided
+      (e.g. output_stride=16, but the architecture has 5 stride=2 operators),
+      it will replace output_stride with fractional convolutions using Atrous
+      Convolutions.
+
+  Returns:
+    logits: the pre-softmax activations, a tensor of size
+      [batch_size, num_classes]
+    end_points: a dictionary from components of the network to the corresponding
+      activation tensor.
+
+  Raises:
+    ValueError: Input rank is invalid.
+  """
+  is_training = mobilenet_args.get('is_training', False)
+  input_shape = inputs.get_shape().as_list()
+  if len(input_shape) != 4:
+    raise ValueError('Expected rank 4 input, was: %d' % len(input_shape))
+
+  with tf.compat.v1.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope:
+    inputs = tf.identity(inputs, 'input')
+    net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args)
+    if base_only:
+      return net, end_points
+
+    net = tf.identity(net, name='embedding')
+
+    with tf.compat.v1.variable_scope('Logits'):
+      net = global_pool(net)
+      end_points['global_pool'] = net
+      if not num_classes:
+        return net, end_points
+      # net = slim.dropout(net, scope='Dropout', is_training=is_training)
+      # 1 x 1 x num_classes
+      # Note: legacy scope name.
+      # logits = slim.conv2d(
+      #     net,
+      #     num_classes, [1, 1],
+      #     activation_fn=None,
+      #     normalizer_fn=None,
+      #     biases_initializer=tf.compat.v1.zeros_initializer(),
+      #     scope='Conv2d_1c_1x1')
+
+      # logits = tf.squeeze(logits, [1, 2])
+
+      # use slim.fully_connected instead
+      net = tf.squeeze(net)
+      net = slim.dropout(net, keep_prob=0.8, scope='Dropout', is_training=is_training)
+      logits = slim.fully_connected(
+        net,
+        num_classes,
+        activation_fn=None,
+        normalizer_fn=None,
+        scope='FC'
+        )
+      #logits = tf.expand_dims(logits, axis=[])
+
+      logits = tf.identity(logits, name='output')
+    end_points['Logits'] = logits
+    if prediction_fn:
+      end_points['Predictions'] = prediction_fn(logits, 'Predictions')
+  return logits, end_points
+
+
+def global_pool(input_tensor, pool_op=tf.compat.v2.nn.avg_pool2d):
+  """Applies avg pool to produce 1x1 output.
+
+  NOTE: This function is funcitonally equivalenet to reduce_mean, but it has
+  baked in average pool which has better support across hardware.
+
+  Args:
+    input_tensor: input tensor
+    pool_op: pooling op (avg pool is default)
+  Returns:
+    a tensor batch_size x 1 x 1 x depth.
+  """
+  shape = input_tensor.get_shape().as_list()
+  if shape[1] is None or shape[2] is None:
+    kernel_size = tf.convert_to_tensor(value=[
+        1,
+        tf.shape(input=input_tensor)[1],
+        tf.shape(input=input_tensor)[2], 1
+    ])
+  else:
+    kernel_size = [1, shape[1], shape[2], 1]
+  output = pool_op(
+      input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID')
+  # Recover output shape, for unknown shape.
+  output.set_shape([None, 1, 1, None])
+  return output
+
+
+def training_scope(is_training=True,
+                   weight_decay=0.00004,
+                   stddev=0.09,
+                   dropout_keep_prob=0.8,
+                   bn_decay=0.997):
+  """Defines Mobilenet training scope.
+
+  Usage:
+     with tf.contrib.slim.arg_scope(mobilenet.training_scope()):
+       logits, endpoints = mobilenet_v2.mobilenet(input_tensor)
+
+     # the network created will be trainble with dropout/batch norm
+     # initialized appropriately.
+  Args:
+    is_training: if set to False this will ensure that all customizations are
+      set to non-training mode. This might be helpful for code that is reused
+      across both training/evaluation, but most of the time training_scope with
+      value False is not needed. If this is set to None, the parameters is not
+      added to the batch_norm arg_scope.
+
+    weight_decay: The weight decay to use for regularizing the model.
+    stddev: Standard deviation for initialization, if negative uses xavier.
+    dropout_keep_prob: dropout keep probability (not set if equals to None).
+    bn_decay: decay for the batch norm moving averages (not set if equals to
+      None).
+
+  Returns:
+    An argument scope to use via arg_scope.
+  """
+  # Note: do not introduce parameters that would change the inference
+  # model here (for example whether to use bias), modify conv_def instead.
+  batch_norm_params = {
+      'decay': bn_decay,
+      'is_training': is_training
+  }
+  #if stddev < 0:
+  #  weight_intitializer = slim.initializers.xavier_initializer()
+  #else:
+  #  weight_intitializer = tf.compat.v1.truncated_normal_initializer(stddev=stddev)
+
+  # modified for NPU
+  weight_2d = tf.initializers.variance_scaling(scale=2., mode="fan_out", distribution="untruncated_normal")
+  weight_dw = tf.initializers.variance_scaling(scale=2., mode="fan_in", distribution="untruncated_normal")
+  weight_pw = tf.initializers.variance_scaling(scale=2., mode="fan_out", distribution="untruncated_normal")
+  weight_fc = tf.initializers.random_normal(stddev=0.01)
+
+  # Set weight_decay for weights in Conv and FC layers.
+  with slim.arg_scope(
+      #[slim.conv2d, slim.fully_connected, slim.separable_conv2d],
+      [slim.conv2d],
+      #weights_initializer=weight_intitializer,
+      weights_initializer=weight_2d,
+      normalizer_fn=slim.batch_norm), \
+      slim.arg_scope([slim.fully_connected], weights_initializer=weight_fc, normalizer_fn=slim.batch_norm), \
+      slim.arg_scope([slim.separable_conv2d], weights_initializer=weight_dw, pointwise_initializer=weight_pw, normalizer_fn=slim.batch_norm), \
+      slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\
+      safe_arg_scope([slim.batch_norm], **batch_norm_params), \
+      safe_arg_scope([slim.dropout], is_training=is_training,
+                     keep_prob=dropout_keep_prob), \
+      slim.arg_scope([slim.conv2d], \
+                     weights_regularizer=slim.l2_regularizer(weight_decay)), \
+      slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s:
+    return s
--- a/Show More
+++ b/Show More