```python
#hide
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
```

ResNets

```python
#hide
from fastbook import *
```

Going Back to Imagenette
```python
def get_data(url, presize, resize):
    path = untar_data(url)
    return DataBlock(
        blocks=(ImageBlock, CategoryBlock), get_items=get_image_files,
        splitter=GrandparentSplitter(valid_name='val'),
        get_y=parent_label, item_tfms=Resize(presize),
        batch_tfms=[*aug_transforms(min_scale=0.5, size=resize),
                    Normalize.from_stats(*imagenet_stats)],
    ).dataloaders(path, bs=128)
```

```python
dls = get_data(URLs.IMAGENETTE_160, 160, 128)
```

```python
dls.show_batch(max_n=4)
```

```python
def avg_pool(x): return x.mean((2,3))
```

```python
def block(ni, nf): return ConvLayer(ni, nf, stride=2)
```
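To make concrete what `avg_pool` does, here is a small sketch (the shapes are illustrative and not part of the original notebook): taking the mean over dimensions 2 and 3 collapses the spatial grid, turning a batch of feature maps into one vector per image. `nn.AdaptiveAvgPool2d(1)` computes the same average but keeps 1×1 spatial dimensions, which is why the models below follow it with `Flatten`.

```python
x = torch.randn(2, 256, 4, 4)         # fake activations: 2 images, 256 channels, 4x4 grid

print(avg_pool(x).shape)              # torch.Size([2, 256]) -- spatial dims averaged away

pooled = nn.AdaptiveAvgPool2d(1)(x)   # same average, but shape is [2, 256, 1, 1]
print(Flatten()(pooled).shape)        # torch.Size([2, 256]) once flattened
```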
```python
def get_model():
    return nn.Sequential(
        block(3, 16),
        block(16, 32),
        block(32, 64),
        block(64, 128),
        block(128, 256),
        nn.AdaptiveAvgPool2d(1),
        Flatten(),
        nn.Linear(256, dls.c))
```
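Because the model ends with adaptive average pooling rather than a flatten over a fixed-size grid, it produces the same output shape whatever the input resolution. A minimal sanity check (a sketch with made-up input sizes, assuming `dls.c` is 10 for Imagenette's ten classes):

```python
m = get_model()
print(m(torch.randn(2, 3, 128, 128)).shape)  # torch.Size([2, 10])
print(m(torch.randn(2, 3, 160, 160)).shape)  # torch.Size([2, 10]) -- unchanged, thanks to adaptive pooling
```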
```python
def get_learner(m):
    return Learner(dls, m, loss_func=nn.CrossEntropyLoss(), metrics=accuracy
                  ).to_fp16()
```

```python
learn = get_learner(get_model())
```

```python
learn.lr_find()
```

```python
learn.fit_one_cycle(5, 3e-3)
```

Building a Modern CNN: ResNet
Skip Connections
```python
class ResBlock(Module):
    def __init__(self, ni, nf):
        self.convs = nn.Sequential(
            ConvLayer(ni,nf),
            ConvLayer(nf,nf, norm_type=NormType.BatchZero))

    def forward(self, x): return x + self.convs(x)
```
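A quick check, as a sketch that is not in the original notebook: because `forward` adds `x` to `self.convs(x)`, this simple version only works when the output has exactly the same shape as the input, i.e. the same number of filters and stride 1.

```python
res = ResBlock(32, 32)
x = torch.randn(8, 32, 16, 16)
print(res(x).shape)   # torch.Size([8, 32, 16, 16]) -- same shape, so the addition is valid

# ResBlock(32, 64) would fail inside forward at `x + self.convs(x)`, because the
# channel counts differ; handling that case is what the fuller version below adds.
```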
```python
def _conv_block(ni,nf,stride):
    return nn.Sequential(
        ConvLayer(ni, nf, stride=stride),
        ConvLayer(nf, nf, act_cls=None, norm_type=NormType.BatchZero))
```
```python
class ResBlock(Module):
    def __init__(self, ni, nf, stride=1):
        self.convs = _conv_block(ni,nf,stride)
        self.idconv = noop if ni==nf else ConvLayer(ni, nf, 1, act_cls=None)
        self.pool = noop if stride==1 else nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x):
        return F.relu(self.convs(x) + self.idconv(self.pool(x)))
```
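As a sanity check (a sketch with made-up shapes), we can pass a dummy batch through a stride-2, channel-changing block and confirm that the identity path, after average pooling and the 1×1 convolution, matches the shape of the convolutional path:

```python
blk = ResBlock(64, 128, stride=2)
x = torch.randn(8, 64, 32, 32)

print(blk(x).shape)                    # torch.Size([8, 128, 16, 16])
print(blk.idconv(blk.pool(x)).shape)   # torch.Size([8, 128, 16, 16]) -- skip path matches
```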
```python
def block(ni,nf): return ResBlock(ni, nf, stride=2)
```

```python
learn = get_learner(get_model())
```

```python
learn.fit_one_cycle(5, 3e-3)
```

```python
def block(ni, nf):
    return nn.Sequential(ResBlock(ni, nf, stride=2), ResBlock(nf, nf))
```

```python
learn = get_learner(get_model())
learn.fit_one_cycle(5, 3e-3)
```

A State-of-the-Art ResNet
```python
def _resnet_stem(*sizes):
    return [
        ConvLayer(sizes[i], sizes[i+1], 3, stride = 2 if i==0 else 1)
            for i in range(len(sizes)-1)
    ] + [nn.MaxPool2d(kernel_size=3, stride=2, padding=1)]
```

```python
_resnet_stem(3,32,32,64)
```

```python
class ResNet(nn.Sequential):
    def __init__(self, n_out, layers, expansion=1):
        stem = _resnet_stem(3,32,32,64)
        self.block_szs = [64, 64, 128, 256, 512]
        for i in range(1,5): self.block_szs[i] *= expansion
        blocks = [self._make_layer(*o) for o in enumerate(layers)]
        super().__init__(*stem, *blocks,
                         nn.AdaptiveAvgPool2d(1), Flatten(),
                         nn.Linear(self.block_szs[-1], n_out))

    def _make_layer(self, idx, n_layers):
        stride = 1 if idx==0 else 2
        ch_in,ch_out = self.block_szs[idx:idx+2]
        return nn.Sequential(*[
            ResBlock(ch_in if i==0 else ch_out, ch_out, stride if i==0 else 1)
            for i in range(n_layers)
        ])
```
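To check that the pieces fit together, here is a small sketch (not part of the original notebook) that instantiates the ResNet-18-style layout directly, using 10 outputs for Imagenette's ten classes:

```python
rn18 = ResNet(10, [2,2,2,2])                    # four stages of two ResBlocks each
print(rn18(torch.randn(2, 3, 128, 128)).shape)  # torch.Size([2, 10])
print(len(rn18))                                # 11: 3 stem convs + max pool + 4 stages + pool, flatten, linear
```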
```python
rn = ResNet(dls.c, [2,2,2,2])
```

```python
learn = get_learner(rn)
learn.fit_one_cycle(5, 3e-3)
```

Bottleneck Layers
```python
def _conv_block(ni,nf,stride):
    return nn.Sequential(
        ConvLayer(ni, nf//4, 1),
        ConvLayer(nf//4, nf//4, stride=stride),
        ConvLayer(nf//4, nf, 1, act_cls=None, norm_type=NormType.BatchZero))
```
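To see why this design is cheaper, here is a rough comparison (a sketch, not from the original notebook) of parameter counts for a single block at 256 channels: two full 3×3 convolutions versus the 1×1 → 3×3 → 1×1 bottleneck, which does most of its work at 256//4 = 64 channels.

```python
def n_params(m): return sum(p.numel() for p in m.parameters())

plain = nn.Sequential(                         # two full 3x3 convolutions
    ConvLayer(256, 256),
    ConvLayer(256, 256, act_cls=None, norm_type=NormType.BatchZero))

bottleneck = _conv_block(256, 256, stride=1)   # 1x1 down to 64, 3x3 at 64, 1x1 back up to 256

print(n_params(plain), n_params(bottleneck))   # the bottleneck has far fewer parameters,
                                               # despite being three layers deep instead of two
```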
```python
dls = get_data(URLs.IMAGENETTE_320, presize=320, resize=224)
```

```python
rn = ResNet(dls.c, [3,4,6,3], 4)
```

```python
learn = get_learner(rn)
learn.fit_one_cycle(20, 3e-3)
```

Conclusion
Questionnaire
- How did we get to a single vector of activations in the CNNs used for MNIST in previous chapters? Why isn’t that suitable for Imagenette?
- What do we do for Imagenette instead?
- What is “adaptive pooling”?
- What is “average pooling”?
- Why do we need `Flatten` after an adaptive average pooling layer?
- What is a “skip connection”?
- Why do skip connections allow us to train deeper models?
- What does <> show? How did that lead to the idea of skip connections?
- What is “identity mapping”?
- What is the basic equation for a ResNet block (ignoring batchnorm and ReLU layers)?
- What do ResNets have to do with residuals?
- How do we deal with the skip connection when there is a stride-2 convolution? How about when the number of filters changes?
- How can we express a 1×1 convolution in terms of a vector dot product?
- Create a 1×1 convolution with `F.conv2d` or `nn.Conv2d` and apply it to an image. What happens to the `shape` of the image?
- What does the `noop` function return?
- Explain what is shown in <>.
- When is top-5 accuracy a better metric than top-1 accuracy?
- What is the “stem” of a CNN?
- Why do we use plain convolutions in the CNN stem, instead of ResNet blocks?
- How does a bottleneck block differ from a plain ResNet block?
- Why is a bottleneck block faster?
- How do fully convolutional nets (and nets with adaptive pooling in general) allow for progressive resizing?
Further Research
- Try creating a fully convolutional net with adaptive average pooling for MNIST (note that you’ll need fewer stride-2 layers). How does it compare to a network without such a pooling layer?
- In <> we introduce Einstein summation notation. Skip ahead to see how this works, and then write an implementation of the 1×1 convolution operation using `torch.einsum`. Compare it to the same operation using `torch.conv2d`.
- Write a “top-5 accuracy” function using plain PyTorch or plain Python.
- Train a model on Imagenette for more epochs, with and without label smoothing. Take a look at the Imagenette leaderboards and see how close you can get to the best results shown. Read the linked pages describing the leading approaches.