PyTorch: creating a dataset with augmented images

Is there a way to generate a DataLoader that creates X augmented images per original image? The code I currently have only produces a single augmented image per original:
import os

from PIL import Image
from torch.utils import data


class ImageDataset(data.Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.img_names = os.listdir(root_dir)
        self.transform = transform

    def __getitem__(self, index):
        img = Image.open(os.path.join(self.root_dir, self.img_names[index])).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img

    def __len__(self):
        return len(self.img_names)
Also, I would like to add labels so that augmented images derived from the same source image share the same label.

Here are a couple of equivalent ways you could do this.
Option 1
We can change the dataset class itself to serve the same data multiple times. This can be accomplished by reporting a longer length from __len__ and selecting the image name using the index modulo the number of images.
class ImageDataset(data.Dataset):
    def __init__(self, root_dir, repetitions=1, transform=None):
        self.root_dir = root_dir
        self.img_names = os.listdir(root_dir)
        self.transform = transform
        self.repetitions = repetitions

    def __getitem__(self, index):
        img = Image.open(os.path.join(self.root_dir,
                                      self.img_names[index % len(self.img_names)])).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img

    def __len__(self):
        return len(self.img_names) * self.repetitions
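To also get the labels you asked about, a minimal sketch (my assumption: the label of each augmented image is simply the index of its source image) is to change __getitem__ so it returns that index alongside the image:

    def __getitem__(self, index):
        # The source-image index doubles as the label, so every augmented
        # copy of the same file gets the same label.
        label = index % len(self.img_names)
        img = Image.open(os.path.join(self.root_dir, self.img_names[label])).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img, label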
Option 2
Alternatively, we could use a torch.utils.data.Subset and specify the same indices multiple times.
# using your original implementation for ImageDataset
dataset = ImageDataset(root, transforms)
dataset = torch.utils.data.Subset(dataset, list(range(len(dataset))) * repetitions)
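In either case you can then wrap the resulting dataset in a DataLoader as usual. A minimal sketch, assuming the transform ends with ToTensor() so the default collate works, and with a placeholder batch size:

    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    for batch in loader:
        pass  # each epoch now yields every image `repetitions` times, augmented independently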

Related

How to create an LSTM that allows dynamic sequence length in PyTorch

I've created an LSTM in PyTorch and I need it to handle a variable sequence length; the following is my code:
class Seq2SeqSingle(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, in_features, out_features):
        super(Seq2SeqSingle, self).__init__()
        self.out_features = out_features
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.fc_i = nn.Linear(input_size, out_features)
        self.fc_o = nn.Linear(out_features, input_size)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc_0 = nn.Linear(128*11, out_features)  ## <----------- LOOK HERE
        self.fc_1 = nn.Linear(out_features, out_features)

    def forward(self, x):
        #print(x.shape)
        output = self.fc_i(torch.relu(x))
        output = self.fc_o(torch.relu(output))

        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)

        output, (h_out, c_out) = self.lstm(output, (h_0, c_0))
        output = output.reshape(x.size(0), -1)

        output = self.fc_0(torch.relu(output))
        output = self.fc_1(torch.relu(output))
        output = nn.functional.softmax(output, dim=1)
        return output
In order to match the size of the output of the LSTM layer, I need to multiply 128 (the hidden size) by 11 (the sequence length). Obviously, if I change the sequence length it crashes. How can I avoid specifying this fixed size?
You can't avoid a fixed size here as long as you flatten the outputs of all time steps. As joe32140 said in a comment, a common approach is to take only the hidden state of the last step as the input to the linear layer, so the size no longer depends on the number of steps.
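A minimal sketch of that change, keeping the rest of the posted module as is (only fc_0 and the lines feeding it are touched):

    # in __init__: fc_0 now depends only on the hidden size, not the sequence length
    self.fc_0 = nn.Linear(hidden_size, out_features)

    # in forward, after the LSTM:
    output, (h_out, c_out) = self.lstm(output, (h_0, c_0))
    # keep only the last time step: (batch, seq_len, hidden) -> (batch, hidden)
    output = output[:, -1, :]
    output = self.fc_0(torch.relu(output))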

Why does my feature map seem incorrect when the class prediction is correct?

import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.models.feature_extraction import create_feature_extractor
# Data processing
preprocess = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)])
image_path = './data/test_images/anemone.jpg'
image = Image.open(image_path).convert('RGB')
img_processed = preprocess(image)
batch_img_cat_tensor = torch.unsqueeze(img_processed, 0)
# Model initialization
resnet50_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
# Eval model for predictions
resnet50_model.eval()
# Creating feature extractor (Detailed example here: https://pytorch.org/blog/FX-feature-extraction-torchvision/)
feature_extractor = create_feature_extractor(resnet50_model,
return_nodes=['layer4.2.conv3', 'fc'])
# Forward pass
out = feature_extractor(batch_img_cat_tensor)
pred = torch.argmax(out['fc'])
# Transforming last conv output to numpy and reshaping it so that the channels would be last
last_conv_output = torch.squeeze(out['layer4.2.conv3'])
last_conv_output = torch.reshape(last_conv_output, (7, 7, -1))
last_conv_output = last_conv_output.detach().numpy()
last_conv_output = last_conv_output.astype(np.uint8)
# Calculating the upscale factors for the last conv output
width_factor = int(image.size[0] / last_conv_output.shape[0])
height_factor = int(image.size[1] / last_conv_output.shape[1])
# Getting the shapes of the last conv output
last_conv_w, last_conv_h, n_channels = last_conv_output.shape
# Calculate the upscaled output dimensions
upscaled_h = last_conv_h * height_factor
upscaled_w = last_conv_w * width_factor
# Upscaling the last_conv_output so that it could be "masked" with original image
upsampled_last_conv_output = np.zeros((upscaled_h, upscaled_w, n_channels))
upsampled_last_conv_output = []
for x in range(0, n_channels, 512):
upsampled_last_conv_output.append(cv2.resize(last_conv_output[:, :, x:x+512], (upscaled_w, upscaled_h), cv2.INTER_CUBIC))
upsampled_last_conv_output = np.concatenate(upsampled_last_conv_output, axis=2)
# Getting the weights of the predicted class
last_layer_weights = resnet50_model.fc.weight.T
last_layer_weights_for_pred = last_layer_weights[:, pred]
# Dot multiplying the upsampled_last_conv_output with last_layer_weights_for_pred
upsampled_last_conv_output = upsampled_last_conv_output.reshape((-1, 2048))
heat_map = np.dot(upsampled_last_conv_output,
last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
# Plotting the results
fig, ax = plt.subplots()
ax.imshow(image)
ax.imshow(heat_map, cmap='jet', alpha=0.5)
ax.set_title(pred.item())
I have followed the tutorial from here: https://www.youtube.com/watch?v=GiyldmoYe_M&t=665s&ab_channel=DigitalSreeni
The main problem is that I get a feature map that looks like this:
As you can see, the model appears to react to multiple areas of the image, and no matter which image I use, the strongest response is always in the middle.
P.S. If you think this question should be posted on the AI Stack Exchange, please let me know.
I have found an error I made: after computing
heat_map = np.dot(upsampled_last_conv_output, last_layer_weights_for_pred.detach().numpy()).reshape(upscaled_h, upscaled_w)
I had to apply this as well:
heat_map = heat_map - np.min(heat_map)
heat_map = heat_map / np.max(heat_map)
Since I normalized the image, the generated heat map was also normalized, so I needed to "denormalize" it back to its original values.

How to set the initial weights to Xavier?

I would like to set the initial weights in PyTorch to Xavier initialization.
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # self.linear1 = nn.Linear(1, 516*4*4)
        self.linear1 = nn.Linear(2, 512*8)
        self.linear2 = nn.Linear(256*16, 256*8)
        self.linear3 = nn.Linear(256*8, 1)

    def forward(self, x):
        x = self.linear1(x)
        x = torch.sigmoid(x)
        x = self.linear2(x)
        x = torch.sigmoid(x)
        x = self.linear3(x)
        return x
You can loop over the layers using the children() generator and initialize each layer's weights with the built-in xavier_uniform_ initializer provided by the nn.init module.
For example, you could put this in your __init__ definition:
for m in self.children():
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
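For instance, a minimal sketch of the model above with the initialization loop added at the end of __init__ (the 0.01 bias fill is just the value from the snippet above):

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.linear1 = nn.Linear(2, 512*8)
            self.linear2 = nn.Linear(256*16, 256*8)
            self.linear3 = nn.Linear(256*8, 1)

            # Xavier-initialize every Linear layer in this module
            for m in self.children():
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
                    m.bias.data.fill_(0.01)

        def forward(self, x):
            x = torch.sigmoid(self.linear1(x))
            x = torch.sigmoid(self.linear2(x))
            return self.linear3(x)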

How to use PyTorch to create a custom EfficientNet with the last layer written correctly

I have a classification problem with, for example, 8 classes, and I am using EfficientNetB3 in PyTorch from here. However, I am not sure whether my custom class is written correctly. I think I want to strip the last layer of the pre-trained model to suit the 8 outputs, right? Did I do it correctly? When I print y_preds = model(images) in my DataLoader loop, it seems to give me 1536 predictions. Is this expected behavior?
!pip install geffnet
import geffnet
import torch
import torch.nn as nn


class EfficientNet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = geffnet.create_model(config.effnet, pretrained=True)
        n_features = self.model.classifier.in_features
        # does the name fc matter?
        self.fc = nn.Linear(n_features, config.num_classes)
        self.model.classifier = nn.Identity()

    def extract(self, x):
        x = self.model(x)
        return x

    def forward(self, x):
        x = self.extract(x).squeeze(-1).squeeze(-1)
        return x


model = EfficientNet(config=config)
if torch.cuda.is_available():
    model.cuda()
Sample code for printing y_pred:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for step, (images, labels) in enumerate(sample_loader):
    images = images.to(device)
    labels = labels.to(device)
    batch_size = images.shape[0]
    y_preds = model(images)
    print('The predictions of the 4 images is as follows\n', y_preds)
    break
You're not even using self.fc in the forward pass.
Either just introduce it in the forward pass:

    def forward(self, x):
        x = self.extract(x).squeeze(-1).squeeze(-1)
        x = self.fc(x)
        return x
Or you can simply replace the layer named classifier (this way you don't need the Identity layer):
self.model.classifier = nn.Linear(n_features, config.num_classes)
Also, config.num_classes here should be 8.
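Putting the second option together, a minimal sketch of the corrected module (assuming the same config object with effnet and num_classes = 8):

    import geffnet
    import torch.nn as nn

    class EfficientNet(nn.Module):
        def __init__(self, config):
            super().__init__()
            self.config = config
            self.model = geffnet.create_model(config.effnet, pretrained=True)
            n_features = self.model.classifier.in_features
            # replace the pretrained classifier head with an 8-class head
            self.model.classifier = nn.Linear(n_features, config.num_classes)

        def forward(self, x):
            # output shape: (batch_size, config.num_classes), e.g. (4, 8) for a batch of 4 images
            return self.model(x)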

Building a layer-wise model for image encoding-decoding

I am writing an autoencoder model for an image encoding-decoding problem.
I want to understand the node distribution in each layer of the model suitable for images.
For the code below I am using 10 images of shape (21*28*3).
class Autoencoder(Chain):
    def __init__(self, activation=F.relu):
        super().__init__()
        with self.init_scope():
            # encoder part
            self.l1 = L.Linear(1764, 882)
            self.l2 = L.Linear(882, 441)
            # decoder part
            self.l3 = L.Linear(441, 882)
            self.l4 = L.Linear(882, 1764)
        self.activation = activation

    def forward(self, x):
        h = self.encode(x)
        x_recon = self.decode(h)
        return x_recon

    def __call__(self, x):
        x_recon = self.forward(x)
        loss = F.mean_squared_error(h, x)
        return loss

    def encode(self, x):
        h = F.dropout(self.activation(self.l1(x)))
        return self.activation(self.l2(x))

    def decode(self, h, train=True):
        h = self.activation(self.l3(h))
        return self.l4(x)
gpu_id = 0
n_epoch = 5
batch_size = 2
model = Autoencoder()
optimizer = optimizers.SGD(lr=0.05).setup(model)
train_iter = iterators.SerialIterator(xs,batch_size)
valid_iter = iterators.SerialIterator(xs,batch_size)
updater = training.StandardUpdater(train_iter,optimizer)
trainer = training.Trainer(updater,(n_epoch,"epoch"),out="result")
from chainer.training import extensions
trainer.extend(extensions.Evaluator(valid_iter, model, device=gpu_id))
While running trainer.run() I get:
InvalidType:
Invalid operation is performed in: LinearFunction (Forward)
Expect: x.shape[1] == W.shape[1]
Actual: 1764 != 882
I want to understand how the node distribution works layer-wise in a model. Please suggest any resources. Also, how should I assign nodes to layers when the number of training images is small?
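As a side note on the error itself: the reported shapes (1764 != 882) match encode, where self.l2 (which expects 882 inputs) is applied to x (1764 features) instead of the intermediate h; decode and __call__ have similar variable mix-ups. A minimal sketch of those three methods with the variables corrected, everything else as posted:

    def __call__(self, x):
        x_recon = self.forward(x)
        # compare the reconstruction with the input (h does not exist here)
        loss = F.mean_squared_error(x_recon, x)
        return loss

    def encode(self, x):
        h = F.dropout(self.activation(self.l1(x)))   # 1764 -> 882
        return self.activation(self.l2(h))           # 882  -> 441 (was l2(x): 1764 != 882)

    def decode(self, h, train=True):
        h = self.activation(self.l3(h))              # 441 -> 882
        return self.l4(h)                            # 882 -> 1764 (was l4(x): x is undefined here)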
