import os
import random
import urllib
import urllib.request
import shutil
from glob import glob
from PIL import Image
import cv2
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
import torchvision
from torchvision.datasets import ImageFolder

from fastcore.all import *

import wandb
Utility Functions
Defining some utility functions we will use throughout this notebook:
# Data Handling stuff
def get_data(URL, FILE, FOLDER):
    # This is a function that downloads and extracts the data
    # then returns a pathlib object containing the location of the data

    # Downloading
    if not os.path.isfile(FILE):
        print(f'Downloading {URL} and saving it as {FILE}')
        print('-'*120)
        urllib.request.urlretrieve(URL, FILE)
        print('Finished Downloading')
    else:
        print(f'{FILE} already exists')

    # Extracting
    print('\n')
    print(f'Extracting Files into {FOLDER}')
    shutil.unpack_archive(FILE, FOLDER)

    return Path(FOLDER)

def get_loaders(bs):
    trainloader = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=2, pin_memory=True)
    valoader = DataLoader(val_ds, batch_size=bs, shuffle=False, num_workers=2, pin_memory=True)
    return trainloader, valoader

# Training Helper Functions
def calculate_accuracy(preds, target):
    correct = 0
    total = 0
    predicted = torch.argmax(preds, axis=1)
    total += target.shape[0]
    correct += int((predicted == target).sum())
    return correct / total

class MetricMonitor:
    def __init__(self, float_precision=3):
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        self.metrics = defaultdict(lambda: {"val": 0, "count": 0, "avg": 0})

    def update(self, metric_name, val):
        metric = self.metrics[metric_name]
        metric["val"] += val
        metric["count"] += 1
        metric["avg"] = metric["val"] / metric["count"]

    def __str__(self):
        return " | ".join(
            [f"{metric_name}: {metric['avg']:.{self.float_precision}f}"
             for (metric_name, metric) in self.metrics.items()]
        )

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.kaiming_normal_(m.weight)
A Quick Note about fastai and Transfer Learning
Transfer Learning is an important technique in making Deep Learning accessible to everyone. fastai is a leading research lab in the area of Transfer Learning (and almost all other Deep Learning areas too!). Out of the box, you get state-of-the-art results thanks to carefully chosen defaults and training methods backed by lots of research.
There is an example post in the forums where a user tried to match the results fastai gets out of the box using TensorFlow. The good thing about fastai is that they also offer a completely free MOOC where they teach everything, including how to get these good results. In this post, we will take insights from the course and implement them in PyTorch.
Some of the techniques covered here are:
Splitting a Pretrained Network into body and head.
Creating a better custom head.
Not freezing the BatchNorm Layer and the intuition why.
Freezing the body and just training the head.
Unfreezing the whole network and training it further to get better results.
Using different learning rates for the body and the head, also known as Discriminative Learning Rates.
We will be getting our pretrained models from Ross Wightman’s timm library, which makes almost all vision models available.
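The cell that kicks off the download is not shown in this section; judging from the output that follows, it presumably looked something along these lines (a sketch reconstructed from that output, not the original cell):

# reconstructed from the printed output below; the original cell is not shown
URL = 'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz'
FILE = 'imagenette2-320.tgz'
path = get_data(URL, FILE, 'data')   # download and extract into the 'data' folder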
Downloading https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz and saving it as imagenette2-320.tgz
------------------------------------------------------------------------------------------------------------------------
Finished Downloading
Extracting Files into data
path.ls()
(#1) [Path('data/imagenette2-320')]
We can recursively get all the JPEG files from our path.
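The cell that actually gathers the files is not shown here either. A minimal sketch of how this could be done with glob, assuming the class labels come from the parent folder names (files is a hypothetical name; lbls is used later in the notebook):

# illustrative sketch, not the original cell
files = glob(str(path/'imagenette2-320'/'**'/'*.JPEG'), recursive=True)   # all JPEGs, recursively
lbls = sorted({Path(f).parent.name for f in files})                       # class folders as labels
print(len(files), len(lbls))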
In any Machine Learning task, you are better off starting with a baseline model against which you can compare the performance of the models you are going to train.
To get some sort of baseline for how well our Transfer Learning is working, we will first take a ResNet-34 architecture and train it from scratch, without any transfer learning:
with wandb.init(project='Transfer-Learning-Pytorch', config=configs, name='training from scratch'):
    config = wandb.config

    # get the model from timm
    model = timm.create_model(f'{config.architecture}', pretrained=False)

    # Change the last linear to match our number of classes
    model.fc = nn.Linear(512, config.num_classes)

    # Initialize the weights
    model.fc.apply(init_weights)

    model = model.to(config.device)
    wandb.watch(model, log="all")

    optimizer = get_optim(model, config.lr)
    scheduler = get_scheduler(optimizer, config.lr, config.num_epochs+5)

    fit(config.num_epochs+5)
After 15 epochs, we get a train loss of 0.735 and a validation loss of 0.646, with an accuracy of 79%. This is the first baseline we are going to try to beat.
Baseline - Basic Transfer Learning
For our second baseline, we are going to do what most Transfer Learning tutorials do: load a pretrained model, freeze all of its parameters, and replace the final linear layer with one matching our number of classes:
with wandb.init(project='Transfer-Learning-Pytorch', config=configs, name='basic transfer learning'):
    config = wandb.config

    # get the model from timm
    model = timm.create_model(f'{config.architecture}', pretrained=True)

    # Freeze all parameters
    for param in model.parameters():
        param.requires_grad = False

    # Change the last linear to match our number of classes
    model.fc = nn.Linear(512, config.num_classes)

    # Initialize the weights
    model.fc.apply(init_weights)

    model = model.to(config.device)
    wandb.watch(model, log="all")

    optimizer = get_optim(model, config.lr)
    scheduler = get_scheduler(optimizer, config.lr, config.num_epochs+5)

    fit(config.num_epochs+5)
For our second baseline, we get a train loss of 0.519 and a validation loss of 0.308, with an accuracy of 91%. This is the second baseline we are going to try to beat.
With those two baseline results in hand, we can now start using a few tweaks and tricks to make our fine-tuning work better.
Custom Head
The first modification we are going to make to our transfer learning process is using a better custom head.
What I mean by this is that instead of just changing the final linear classifier in our pretrained model, we are going to chop our model into two parts:
A Body which will act as our feature extractor (Already pretrained)
A custom head that will be our classifier.
The reason we split the model right before the pooling operation is that, in Deep Learning, a model pretrained for classification can be fine-tuned for other tasks such as segmentation or object detection, and different applications might require different pooling layers, or none at all.
This is going to be the body of our pretrained model:
def make_body(model):
    layers = list(model.children())[:-2]
    body = nn.Sequential(*layers)

    # freeze the body
    for param in body.parameters():
        param.requires_grad = False

    return body
Our head is going to be a Sequential layer that follows this pattern:
Our new pooling operation, followed by flattening the output.
A sequence of BatchNorm -> Dropout -> Linear layers. You can add as many of these blocks as your task requires, just remember to use an activation between two such sequences.
For our task at hand, we are going to use the following configuration:
Note that the input to our BatchNorm is 512, since that is what our head receives from the body after pooling and flattening. We won’t need a ReLU after our last linear layer, and its out_features is going to be the number of classes in our dataset:
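The make_head cell itself does not appear in this section, so here is a minimal sketch matching the description above; the pooling choice, dropout probabilities, and the 512-unit hidden layer are assumptions:

# a sketch of the head described above; exact sizes and dropout values are assumptions
def make_head(in_features, num_classes, p=0.25):
    return nn.Sequential(
        nn.AdaptiveAvgPool2d(1),         # our new pooling operation
        nn.Flatten(),                    # flatten the pooled features
        nn.BatchNorm1d(in_features),     # first BatchNorm -> Dropout -> Linear block (in_features = 512 from the body)
        nn.Dropout(p),
        nn.Linear(in_features, 512),
        nn.ReLU(inplace=True),           # activation between the two blocks
        nn.BatchNorm1d(512),             # second BatchNorm -> Dropout -> Linear block
        nn.Dropout(p),
        nn.Linear(512, num_classes),     # no ReLU after the final linear layer
    )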
Let us create a handy function that creates our model for us:
def create_model(arch):
    # get pretrained model
    model = timm.create_model(f'{arch}', pretrained=True)

    # create the body and head
    body = make_body(model)
    head = make_head(512, len(lbls))

    # fine tune the model
    model = nn.Sequential(body, head)
    return model
And now we can do the fine-tuning:
with wandb.init(project='Transfer-Learning-Pytorch', config=configs, name='transfer learning custom head'):
    config = wandb.config

    # create model
    model = create_model(config.architecture)
    model = model.to(config.device)
    wandb.watch(model, log="all")

    optimizer = get_optim(model, config.lr)
    scheduler = get_scheduler(optimizer, config.lr, config.num_epochs+5)

    fit(config.num_epochs+5)
We get a train loss of 0.605 and a validation loss of 0.279, with an accuracy of 91%. It is only a slight improvement over our basic fine-tuning and doesn’t look like it does much, but we now have a better head design than just swapping out the final linear layer. Our custom head can be adapted to different training requirements by changing the configuration of the dropouts and batchnorms.
Don’t Freeze BatchNorm
Another tweak from fastai’s MOOC is about how to handle the BatchNorm layers while fine-tuning. To understand this, we will need to go back to how BatchNorm works.
BatchNorm works differently during training than during validation. During validation, BatchNorm uses a running mean of the statistics calculated during training. Therefore, when we take a pretrained model, its BatchNorm layers hold statistics from the previous training (commonly on ImageNet). But during fine-tuning, we want the model to adapt to the statistics of our current dataset, and that is why we should not freeze the BatchNorm layers.
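To make this concrete, here is a small illustrative check (an addition, not from the original notebook) showing how a BatchNorm layer behaves differently in train() and eval() mode:

# illustration only: BatchNorm in train() mode normalizes with the batch's own
# statistics and updates its running mean, while eval() mode uses the stored running stats
bn = nn.BatchNorm2d(3)
x = torch.randn(8, 3, 4, 4) * 2 + 5        # a batch with non-standard statistics

bn.train()
out_train = bn(x)                          # uses batch stats
print(bn.running_mean)                     # running mean has started moving toward the batch mean

bn.eval()
out_eval = bn(x)                           # uses the (still mostly default) running stats
print(out_train.mean().item(), out_eval.mean().item())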
To do this, we are just going to slightly change our make_body function so that it leaves the BatchNorm layers unfrozen while freezing the rest of the layers:
def make_body(model):
    layers = list(model.children())[:-2]
    body = nn.Sequential(*layers)

    # Loop through the model and don't freeze BatchNorm
    for module in body.modules():
        if isinstance(module, torch.nn.BatchNorm2d):
            for param in module.parameters():
                param.requires_grad = True
        else:
            for param in module.parameters():
                param.requires_grad = False

    return body
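Before running the experiment, an optional sanity check (again, an addition rather than part of the original notebook) can confirm that only the BatchNorm parameters inside the body remain trainable:

# sanity check: only BatchNorm weights/biases should still require gradients
body = make_body(timm.create_model('resnet34', pretrained=True))
trainable = [name for name, p in body.named_parameters() if p.requires_grad]
print(f'{len(trainable)} trainable parameter tensors, e.g. {trainable[:2]}')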
Let’s run the experiment and see if that improves our fine-tuning:
with wandb.init(project='Transfer-Learning-Pytorch', config=configs, name='transfer learning unfrozen batchnorm'):
    config = wandb.config

    # create model
    model = create_model(config.architecture)
    model = model.to(config.device)
    wandb.watch(model, log="all")

    optimizer = get_optim(model, config.lr)
    scheduler = get_scheduler(optimizer, config.lr, config.num_epochs+5)

    fit(config.num_epochs+5)
Just that slight tweak improved our model to an accuracy of 96.5%. Our train and validation losses also decreased, to 0.278 and 0.110 respectively. This confirms the intuition for why not freezing the BatchNorm layers is important.
Unfreezing
So far, we have kept the body of our pretrained model frozen and only fine-tuned the newly created head, and it has worked out fine. But we can also try something else: unfreezing the whole model after a few epochs and training it further to see if that improves our metrics. That is exactly what the next experiment does.
with wandb.init(project='Transfer-Learning-Pytorch', config=configs, name='transfer learning with unfreezing'):
    config = wandb.config

    # create model
    model = create_model(config.architecture)
    model = model.to(config.device)
    wandb.watch(model, log="all")

    optimizer = get_optim(model, config.lr)
    scheduler = get_scheduler(optimizer, config.lr, config.init_epochs)

    # Training while Frozen
    print('_'*40)
    print('Training with Frozen Body')
    print('_'*40)
    print()
    fit(config.init_epochs)

    # Unfreeze the model and train further
    print('_'*40)
    print('Unfreezing the body')
    print('_'*40)
    print()
    for param in model.parameters():
        param.requires_grad = True

    optimizer = get_optim(model, lr=1e-5)
    scheduler = get_scheduler(optimizer, 1e-5, config.num_epochs)

    print('_'*40)
    print('Training with Unfrozen Body')
    print('_'*40)
    print()
    fit(config.num_epochs)
Our metrics keep improving with each change, which means we are headed in the right direction.
But if we think about it, our body and our head are two very different groups of layers. On one hand, we have a body with important weights learned from the task it was pretrained on. On the other hand, we have a head that was initialized randomly and has no useful weights until we start fine-tuning it.
We do not want to change the weights of the body too much, since they have already learned features that generalize well to a lot of vision tasks, while at the same time we want to make our head useful for our current task. How do we achieve both? Enter Discriminative Learning Rates.
The main intuition is that even after we unfreeze, we still care a lot about the quality of the pretrained weights. The best learning rates for the pretrained parameters should not be as high as for the randomly initialized ones, because the pretrained weights have been trained for hundreds of epochs on millions of images.
Therefore, after unfreezing our whole model, we are going to train the head and the body using two different learning rates.
A good rule of thumb is that the body learning rate should be about 2.6 times smaller than the head learning rate. Therefore, we are going to divide the head learning rate by 2.6 to get the body learning rate.
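The config.body_lr and config.head_lr values used below are not defined in this section; assuming configs is a plain dict and a (hypothetical) head learning rate of 1e-3, they would be set along these lines:

# hypothetical values, chosen only to illustrate the rule of thumb above
configs['head_lr'] = 1e-3
configs['body_lr'] = configs['head_lr'] / 2.6   # roughly 3.8e-4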
class MyModel(nn.Module):
    def __init__(self, body, head):
        super(MyModel, self).__init__()
        self.body = body
        self.head = head

    def forward(self, x):
        out = self.body(x)
        return self.head(out)
with wandb.init(project='Transfer-Learning-Pytorch', config=configs, name='discriminative learning rates'):
    config = wandb.config

    # create model
    body = create_model(config.architecture)[0]
    head = create_model(config.architecture)[1]
    model = MyModel(body, head)
    model = model.to(config.device)
    wandb.watch(model, log="all")

    optimizer = get_optim(model, config.lr)
    scheduler = get_scheduler(optimizer, config.lr, config.init_epochs)

    # Training while Frozen
    print('_'*40)
    print('Training with Frozen Body')
    print('_'*40)
    print()
    fit(config.init_epochs)

    # Unfreeze the model and train further
    print('_'*40)
    print('Unfreezing the body')
    print('_'*40)
    print()
    for param in model.parameters():
        param.requires_grad = True

    # update the learning rates of each param group
    head_params = []
    body_params = []
    for name, param in model.body.named_parameters():
        body_params.append(param)
    for name, param in model.head.named_parameters():
        head_params.append(param)

    optimizer = optim.AdamW([{'params': body_params}, {'params': head_params}], lr=1e-5)
    scheduler = get_scheduler(optimizer, config.head_lr, config.num_epochs)

    # discriminative learning rates
    optimizer.param_groups[0]['lr'] = config.body_lr
    optimizer.param_groups[1]['lr'] = config.head_lr

    print('_'*40)
    print('Training with Unfrozen Body')
    print('_'*40)
    print()
    fit(config.num_epochs)