I am trying to solve a sequence to sequence problem with a transformer model. The data is derived from a set of crossword puzzles.
The positional encoding and transformer classes are as follows:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 3000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def debug(self, x):
        return x.shape, x.size()

    def forward(self, x: Tensor) -> Tensor:
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
class Transformer(nn.Module):
    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        batch_first,
        dropout_p,
    ):
        super().__init__()
        self.model_type = "Transformer"
        self.dim_model = dim_model
        self.positional_encoder = PositionalEncoding(
            d_model=dim_model, dropout=dropout_p, max_len=3000
        )
        self.embedding = nn.Embedding.from_pretrained(vec_weights, freeze=False)  # nn.Embedding(num_tokens, dim_model)
        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout_p,
            batch_first=batch_first,
        )
        self.out = nn.Linear(dim_model, num_tokens)

    def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
        src = self.embedding(src) * math.sqrt(self.dim_model)
        tgt = self.embedding(tgt) * math.sqrt(self.dim_model)
        src = self.positional_encoder(src)
        tgt = self.positional_encoder(tgt)
        transformer_out = self.transformer(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=tgt_pad_mask)
        out = self.out(transformer_out)
        return out

    def get_tgt_mask(self, size) -> torch.tensor:
        mask = torch.tril(torch.ones(size, size) == 1)
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf'))
        mask = mask.masked_fill(mask == 1, float(0.0))
        return mask

    def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
        return (matrix == pad_token)
The input tensors are a source tensor of size N by S, where N is the batch size and S is the source sequence length, and a target tensor of size N by T, where T is the target sequence length. S is about 10 and T is about 5, while the total number of items is about 160,000-200,000, divided into batch sizes of 512. They are torch.IntTensors, with elements in the range from 0 to V, where V is the vocabulary length.
The first layer is an embedding layer that takes the input from N by S to N by S by E, where E is the embedding dimension (300), or to N by T by E in the case of the target. The second layer adds position encoding without changing the shape. Then both tensors are passed through the transformer layer, which outputs an N by T by E tensor. Finally, we pass this output through a linear layer, which produces an N by T by V output, where V is the size of the vocabulary used in the problem. Here V is about 56,697. The most frequent tokens (words) appear about 50-60 times in the target tensor.
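As a quick sanity check on the shapes described above (a toy sketch with made-up sizes, not the real data or vocabulary):
import torch
import torch.nn as nn

N, S, T, E, V = 2, 10, 5, 300, 1000           # toy batch size, sequence lengths, embedding dim, vocab size
src = torch.randint(0, V, (N, S))              # N x S integer tokens
tgt = torch.randint(0, V, (N, T))              # N x T integer tokens
emb = nn.Embedding(V, E)
transformer = nn.Transformer(d_model=E, nhead=2, num_encoder_layers=1, num_decoder_layers=1, batch_first=True)
out = transformer(emb(src), emb(tgt))          # N x T x E
logits = nn.Linear(E, V)(out)                  # N x T x V
print(emb(src).shape, out.shape, logits.shape)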
The transformer class also contains the functions for implementing the masking matrices.
Then we create the model and run it (this process is wrapped in a function).
device = "cuda"
src_train, src_test = torch.utils.data.random_split(src_t, [int(0.9*len(src_t)), len(src_t)-int(0.9*len(src_t))])
src_train, src_test = src_train[:512], src_test[:512]
tgt_train, tgt_test = torch.utils.data.random_split(tgt_t, [int(0.9*len(tgt_t)), len(tgt_t)-int(0.9*len(tgt_t))])
tgt_train, tgt_test = tgt_train[:512], tgt_test[:512]
train_data, test_data = list(zip(src_train, tgt_train)), list(zip(src_test, tgt_test))
train, test = torch.utils.data.DataLoader(dataset=train_data), torch.utils.data.DataLoader(dataset=test_data)
model = Transformer(num_tokens=ntokens, dim_model=300, num_heads=2, num_encoder_layers=3, num_decoder_layers=3, batch_first = True, dropout_p=0.1).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0000001)
n_epochs = 50
def train_model(model, optimizer, loss_function, n_epochs):
    loss_value = 0
    for epoch in range(n_epochs):
        print(f"Starting epoch {epoch}")
        for batch, data in enumerate(train):
            x, y = data
            if batch % 100 == 0:
                print(f"Batch is {batch}")
            batch += 1
            optimizer.zero_grad()
            x, y = torch.tensor(x).to(device), torch.tensor(y).to(device)
            y_input, y_base = y[:, :-1], y[:, 1:]
            y_input, y_base = y_input.to(device), y_base.to(device)
            tgt_mask = model.get_tgt_mask(y_input.shape[1]).to(device)
            pad_token = vocabulary_table[embeddings.key_to_index["/"]]
            src_pad_mask = model.create_pad_mask(x, pad_token).to(device)
            tgt_pad_mask = model.create_pad_mask(y_input, pad_token).to(device)
            z = model(x, y_input, tgt_mask, src_pad_mask, tgt_pad_mask)
            z = z.permute(0, 2, 1).to(device)
            y_base = y_base.long().to(device)
            loss = loss_function(z, y_base).to(device)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
            optimizer.step()
            loss_value += float(loss)
            if batch % 100 == 0:
                print(f"For epoch {epoch}, batch {batch} the cross-entropy loss is {loss_value}")
            # Free GPU memory.
            del z
            del x
            del y
            del y_input
            del y_base
            del loss
            torch.cuda.empty_cache()
    return model.parameters(), loss_value
Basically, we split the data into test and training sets and use an SGD optimizer and cross-entropy loss. We create a masking matrix for the padding for both the target and source tensors, and a masking matrix for unseen elements for the target tensor. We then do the usual gradient update steps. Right now, there is no validation loop, because I cannot even get the training loss to decrease.
The loss is very high, reaching more than 1000 after 100 batches. More concerning, the loss increases rapidly during training rather than decreasing. In the code included above, I tried clipping the gradients, lowering the learning rate, and using a much smaller sample to debug the code.
What could be causing this behavior?
You are only ever adding to your running loss total, so naturally it can only increase.
loss_value += float(loss)
You're supposed to reset it to zero after every epoch; right now you set it to zero once, at the beginning of the training process. There is a training loop template here if you're interested (https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html). This explains the increasing loss. To troubleshoot further (if needed), I'd throw in a validation loop.
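A minimal sketch of the fix, reusing your variable name and made-up per-batch losses just to show where the reset goes:
n_epochs = 3
fake_batch_losses = [1.0, 0.9, 0.8, 0.7, 0.6]      # stand-ins for float(loss) from each batch
for epoch in range(n_epochs):
    loss_value = 0.0                                # reset here, once per epoch
    for batch, loss in enumerate(fake_batch_losses):
        loss_value += loss
    print(f"epoch {epoch}: mean loss {loss_value / len(fake_batch_losses):.3f}")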
I want to run some neural net experiments with PyTorch, but a minimal test case is giving wrong answers. The test case sets up a simple neural network with two input variables and an output variable that is just the sum of the inputs, and tries learning it as a regression problem; I expect it to converge on zero mean squared error, but it actually converges on 0.165. It's probably because of the issue alluded to in the warning message; how can I fix it?
Code:
import torch
import torch.nn as nn

# data
Xs = []
ys = []
n = 10
for i in range(n):
    i1 = i / n
    for j in range(n):
        j1 = j / n
        Xs.append([i1, j1])
        ys.append(i1 + j1)

# torch tensors
X_tensor = torch.tensor(Xs)
y_tensor = torch.tensor(ys)

# hyperparameters
in_features = len(Xs[0])
hidden_size = 100
out_features = 1
epochs = 500

# model
class Net(nn.Module):
    def __init__(self, hidden_size):
        super(Net, self).__init__()
        self.L0 = nn.Linear(in_features, hidden_size)
        self.N0 = nn.ReLU()
        self.L1 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.L0(x)
        x = self.N0(x)
        x = self.L1(x)
        return x

model = Net(hidden_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# train
print("training")
for epoch in range(1, epochs + 1):
    # forward
    output = model(X_tensor)
    cost = criterion(output, y_tensor)

    # backward
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # print progress
    if epoch % (epochs // 10) == 0:
        print(f"{epoch:6d} {cost.item():10f}")
print()

output = model(X_tensor)
cost = criterion(output, y_tensor)
print("mean squared error:", cost.item())
Output:
training
C:\Users\russe\Anaconda3\envs\torch2\lib\site-packages\torch\nn\modules\loss.py:445: UserWarning: Using a target size (torch.Size([100])) that is different to the input size (torch.Size([100, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
50 0.167574
100 0.165108
150 0.165070
200 0.165052
250 0.165039
300 0.165028
350 0.165020
400 0.165013
450 0.165009
500 0.165006
mean squared error: 0.1650056540966034
And the message:
UserWarning: Using a target size (torch.Size([100])) that is different to the input size (torch.Size([100, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
You'd need to be a bit more specific about which tensors you mean (X or Y), but we can reshape a tensor using its .view() method.
For example:
Y_tensor = torch.tensor(Ys)
print(Y_tensor.shape)
>> torch.Size([5])
new_shape = (len(Ys), 1)
Y_tensor = Y_tensor.view(new_shape)
print(Y_tensor.shape)
>> torch.Size([5, 1])
However, I'm skeptical that this broadcasting behavior is why you're having accuracy issues.
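For what it's worth, here is a tiny self-contained demonstration of what the warning describes, with toy numbers rather than your data: an (N, 1) input against an (N,) target broadcasts to an N-by-N difference matrix, so the loss is averaged over all pairs instead of over matching elements.
import torch
import torch.nn as nn

pred = torch.tensor([[1.0], [2.0], [3.0]])              # shape (3, 1), like the model output
target = torch.tensor([1.0, 2.0, 3.0])                  # shape (3,)
print(nn.MSELoss()(pred, target).item())                # nonzero, because of broadcasting
print(nn.MSELoss()(pred, target.view(-1, 1)).item())    # 0.0 once the shapes match
Applied to the code in the question, that would mean something like y_tensor = torch.tensor(ys).view(-1, 1) (or squeezing the model output) so that the input and target have the same shape.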
I am trying to develop a deep Markov model using the following tutorial:
https://pyro.ai/examples/dmm.html
This model parameterises the transitions and emissions using neural networks, and for the variational inference part they use an RNN to map the observable x to the latent space. To ensure that the model is learning something, they try to maximise the ELBO, i.e. minimise the negative ELBO. They refer to the negative ELBO as the NLL.
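To spell out the quantity, the bound being maximised is the standard ELBO, so the "NLL" reported by the tutorial is just its negative:
log p(x_{1:T}) >= ELBO = E_{q(z_{1:T} | x_{1:T})}[ log p(x_{1:T} | z_{1:T}) ] - KL( q(z_{1:T} | x_{1:T}) || p(z_{1:T}) )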
I am basically converting the Pyro code to pure PyTorch, and I've put together pretty much everything. However, now I want to get a reconstruction error on test sequences. My input is a one-hot encoding of my sequences, of shape (500, 1900, 4), where 500 is the total number of examples, 1900 is the sequence length, and 4 is the number of features.
The way I am generating data is:
# generation model: p(x_{1:T} | z_{1:T}) p(z_{1:T})
batch_size, _, x_dim = x.size()
T_max = x_lens.max()  # number of time steps we need to process in the mini-batch
z_prev = self.z_0.expand(batch_size, self.z_0.size(0))  # set z_prev = z_0 to set up the recursive conditioning in p(z_t | z_{t-1})
for t in range(1, T_max + 1):
    # sample z_t ~ p(z_t | z_{t-1}) one time step at a time
    z_t, z_mu, z_logvar = self.trans(z_prev)  # p(z_t | z_{t-1})
    p_x_t = F.softmax(self.emitter(z_t), dim=-1)  # compute the probabilities that parameterize the Bernoulli likelihood
    safe_tensor = torch.where(torch.isnan(p_x_t), torch.zeros_like(p_x_t), p_x_t)
    print('generate p_x_t : ', safe_tensor)
    x_t = torch.bernoulli(safe_tensor)  # sample the observable x_t according to the Bernoulli distribution p(x_t | z_t)
    print('generate x_t : ', x_t)
    z_prev = z_t
So my emissions are defined by a Bernoulli distribution, and I use softmax in order to compute the probabilities that parameterise the Bernoulli likelihood. I then sample my observables x_t according to that Bernoulli distribution.
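To illustrate that step in isolation (made-up logits for a single time step, just the sampling mechanics):
import torch
import torch.nn.functional as F

logits = torch.tensor([[0.1, 0.2, 0.4, 0.9]])   # made-up emitter output for one time step
p_x_t = F.softmax(logits, dim=-1)               # probabilities over the 4 features, summing to 1
x_t = torch.bernoulli(p_x_t)                    # an independent 0/1 draw per feature, so not necessarily one-hot
print(p_x_t, x_t)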
At first, when I ran my model, I was getting nans at times, so I introduced the line (shown below) in order to convert nans to zero:
safe_tensor = torch.where(torch.isnan(p_x_t), torch.zeros_like(p_x_t), p_x_t)
However, after 1 epoch or so the x_t that I sample is just zero. Basically, what I want is that after applying softmax, my function picks the highest probability and gives me the corresponding label, which is one of the 4 features. But I get nans, and when I convert all nans to zero, I end up getting all zeros in all tensors after 1 epoch or so.
Also, when I look at the p_x_t tensor I get probabilities that add up to one, but the x_t tensor gives me 0's all the way. For example:
p_x_t : tensor([[0.2168, 0.2309, 0.2555, 0.2967]
.....], device='cuda:0',
grad_fn=<SWhereBackward>)
generate x_t : tensor([[0., 0., 0., 0.],...
], device='cuda:0', grad_fn=<BernoulliBackward0>)
Here the fourth label/feature has the highest probability. Shouldn't the x_t tensor then give me a 1 at least in that position, so it would be like:
generate x_t : tensor([[0., 0., 0., 1.],...
], device='cuda:0', grad_fn=<BernoulliBackward0>)
How can I get rid of these problems?
EDIT
My transition (which is called as self.trans in the generate function mentioned above):
class GatedTransition(nn.Module):
    """
    Parameterizes the gaussian latent transition probability `p(z_t | z_{t-1})`.
    See section 5 in the reference for comparison.
    """
    def __init__(self, z_dim, trans_dim):
        super(GatedTransition, self).__init__()
        self.gate = nn.Sequential(
            nn.Linear(z_dim, trans_dim),
            nn.ReLU(),
            nn.Linear(trans_dim, z_dim),
            nn.Softmax(dim=-1)
        )
        self.proposed_mean = nn.Sequential(
            nn.Linear(z_dim, trans_dim),
            nn.ReLU(),
            nn.Linear(trans_dim, z_dim)
        )
        self.z_to_mu = nn.Linear(z_dim, z_dim)
        # modify the default initialization of z_to_mu so that it starts out as the identity function
        self.z_to_mu.weight.data = torch.eye(z_dim)
        self.z_to_mu.bias.data = torch.zeros(z_dim)
        self.z_to_logvar = nn.Linear(z_dim, z_dim)
        self.relu = nn.ReLU()

    def forward(self, z_t_1):
        """
        Given the latent `z_{t-1}` corresponding to time step t-1,
        return the mean and scale vectors that parameterize the (diagonal) gaussian distribution `p(z_t | z_{t-1})`.
        """
        gate = self.gate(z_t_1)  # compute the gating function
        proposed_mean = self.proposed_mean(z_t_1)  # compute the 'proposed mean'
        mu = (1 - gate) * self.z_to_mu(z_t_1) + gate * proposed_mean  # gated combination of the identity map and the proposed mean
        logvar = self.z_to_logvar(self.relu(proposed_mean))
        epsilon = torch.randn(z_t_1.size(), device=z_t_1.device)  # sample z by reparameterization
        z_t = mu + epsilon * torch.exp(0.5 * logvar)  # [batch_sz x z_sz]
        if torch.isinf(z_t).any().item():
            print('something is infinity')
            print('z_t : ', z_t)
            print('logvar : ', logvar)
            print('epsilon : ', epsilon)
            print('mu : ', mu)
        return z_t, mu, logvar
I do not get any inf in the z_t tensor during training and validation, only during testing. This is how I am training, validating and testing my model:
for epoch in range(config['epochs']):
    train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'], shuffle=True, num_workers=1)
    train_data_iter = iter(train_loader)
    n_iters = train_data_iter.__len__()
    epoch_nll = 0.0  # accumulator for our estimate of the negative log likelihood (or rather -elbo) for this epoch
    i_batch = 1
    n_slices = 0
    loss_records = {}
    while True:
        try:
            x, x_rev, x_lens = train_data_iter.next()
        except StopIteration:
            break  # end of epoch
        x, x_rev, x_lens = gVar(x), gVar(x_rev), gVar(x_lens)
        if config['anneal_epochs'] > 0 and epoch < config['anneal_epochs']:  # compute the KL annealing factor
            min_af = config['min_anneal']
            kl_anneal = min_af + (1.0 - min_af) * (float(i_batch + epoch * n_iters + 1) / float(config['anneal_epochs'] * n_iters))
        else:
            kl_anneal = 1.0  # by default the KL annealing factor is unity
        loss_AE = model.train_AE(x, x_rev, x_lens, kl_anneal)
        epoch_nll += loss_AE['train_loss_AE']
        i_batch = i_batch + 1
        n_slices = n_slices + x_lens.sum().item()
        loss_records.update(loss_AE)
    loss_records.update({'epo_nll': epoch_nll / n_slices})
    times.append(time.time())
    epoch_time = times[-1] - times[-2]
    log("[Epoch %04d]\t\t(dt = %.3f sec)" % (epoch, epoch_time))
    log(loss_records)
    if args.visual:
        for k, v in loss_records.items():
            tb_writer.add_scalar(k, v, epoch)

    # do evaluation on test and validation data and report results
    if (epoch + 1) % args.test_freq == 0:
        save_model(model, epoch)
        #test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=config['batch_size'], shuffle=False, num_workers=1)
        valid_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=config['batch_size'], shuffle=False, num_workers=1)
        for x, x_rev, x_lens in valid_loader:
            x, x_rev, x_lens = gVar(x), gVar(x_rev), gVar(x_lens)
            loss, kl_loss = model.valid(x, x_rev, x_lens)
            #print('x_lens sum : ', x_lens.sum().item())
            #print('loss : ', loss)
        valid_nll = loss / x_lens.sum().item()
        log("[Epoch %04d]\t\t(valid_loss = %.8f)\t\t(kl_loss = %.8f)\t\t(valid_nll = %.8f)" % (epoch, loss, kl_loss, valid_nll))

test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=config['batch_size'], shuffle=False, num_workers=1)
for x, x_rev, x_lens in test_loader:
    x, x_rev, x_lens = gVar(x), gVar(x_rev), gVar(x_lens)
    loss, kl_loss = model.valid(x, x_rev, x_lens)
    test_nll = loss / x_lens.sum().item()
    model.generate(x, x_rev, x_lens)
log("[test_nll epoch %08d] %.8f" % (epoch, test_nll))
The test is outside the epoch for-loop, because I want to test my model on the test sequences using the generate function. Can you help me understand if there is something wrong in what I am doing?
This is using PyTorch. I have been trying to implement a UNet model on my images, however my model accuracy is always exactly 0.5, even though the loss does decrease.
I have also checked for class imbalance and tried playing with the learning rate; the learning rate affects the loss but not the accuracy.
My architecture is below (from here):
""" `UNet` class is based on https://arxiv.org/abs/1505.04597
The U-Net is a convolutional encoder-decoder neural network.
Contextual spatial information (from the decoding,
expansive pathway) about an input tensor is merged with
information representing the localization of details
(from the encoding, compressive pathway).
Modifications to the original paper:
(1) padding is used in 3x3 convolutions to prevent loss
of border pixels
(2) merging outputs does not require cropping due to (1)
(3) residual connections can be used by specifying
UNet(merge_mode='add')
(4) if non-parametric upsampling is used in the decoder
pathway (specified by upmode='upsample'), then an
additional 1x1 2d convolution occurs after upsampling
to reduce channel dimensionality by a factor of 2.
This channel halving happens with the convolution in
the transpose convolution (specified by upmode='transpose')
Arguments:
in_channels: int, number of channels in the input tensor.
Default is 3 for RGB images. Our SPARCS dataset is 13 channel.
depth: int, number of MaxPools in the U-Net. During training, the input size needs to be
divisible by 2**(depth-1)
start_filts: int, number of convolutional filters for the first conv.
up_mode: string, type of upconvolution. Choices: 'transpose' for transpose convolution
"""
class UNet(nn.Module):
    def __init__(self, num_classes, depth, in_channels, start_filts=16, up_mode='transpose', merge_mode='concat'):
        super(UNet, self).__init__()

        if up_mode in ('transpose', 'upsample'):
            self.up_mode = up_mode
        else:
            raise ValueError("\"{}\" is not a valid mode for upsampling. Only \"transpose\" and \"upsample\" are allowed.".format(up_mode))

        if merge_mode in ('concat', 'add'):
            self.merge_mode = merge_mode
        else:
            raise ValueError("\"{}\" is not a valid mode for merging up and down paths. Only \"concat\" and \"add\" are allowed.".format(up_mode))

        # NOTE: up_mode 'upsample' is incompatible with merge_mode 'add'
        if self.up_mode == 'upsample' and self.merge_mode == 'add':
            raise ValueError("up_mode \"upsample\" is incompatible with merge_mode \"add\" at the moment "
                             "because it doesn't make sense to use nearest neighbour to reduce depth channels (by half).")

        self.num_classes = num_classes
        self.in_channels = in_channels
        self.start_filts = start_filts
        self.depth = depth

        self.down_convs = []
        self.up_convs = []

        # create the encoder pathway and add to a list
        for i in range(depth):
            ins = self.in_channels if i == 0 else outs
            outs = self.start_filts * (2 ** i)
            pooling = True if i < depth - 1 else False
            down_conv = DownConv(ins, outs, pooling=pooling)
            self.down_convs.append(down_conv)

        # create the decoder pathway and add to a list
        # - careful! decoding only requires depth-1 blocks
        for i in range(depth - 1):
            ins = outs
            outs = ins // 2
            up_conv = UpConv(ins, outs, up_mode=up_mode, merge_mode=merge_mode)
            self.up_convs.append(up_conv)

        self.conv_final = conv1x1(outs, self.num_classes)

        # add the list of modules to current module
        self.down_convs = nn.ModuleList(self.down_convs)
        self.up_convs = nn.ModuleList(self.up_convs)

        self.reset_params()

    @staticmethod
    def weight_init(m):
        if isinstance(m, nn.Conv2d):
            # https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/
            # Doc: https://pytorch.org/docs/stable/nn.init.html?highlight=xavier#torch.nn.init.xavier_normal_
            init.xavier_normal_(m.weight)
            init.constant_(m.bias, 0)

    def reset_params(self):
        for i, m in enumerate(self.modules()):
            self.weight_init(m)

    def forward(self, x):
        encoder_outs = []
        # encoder pathway, save outputs for merging
        for i, module in enumerate(self.down_convs):
            x, before_pool = module(x)
            encoder_outs.append(before_pool)
        for i, module in enumerate(self.up_convs):
            before_pool = encoder_outs[-(i + 2)]
            x = module(before_pool, x)

        # No softmax is used, so nn.CrossEntropyLoss must be used
        # in the training script, as that module includes a softmax already.
        x = self.conv_final(x)
        return x
Parameters are :
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x,y = train_sequence[0] ; batch_size = x.shape[0]
model = UNet(num_classes = 2, depth=5, in_channels=5, merge_mode='concat').to(device)
optim = torch.optim.Adam(model.parameters(),lr=0.01, weight_decay=1e-3)
criterion = nn.BCEWithLogitsLoss() #has sigmoid internally
epochs = 1000
The function for training is :
import torch.nn.functional as f

def train_model(epoch, train_sequence):
    """Train the model and report validation error with training error
    Args:
        model: the model to be trained
        criterion: loss function
        data_train (DataLoader): training dataset
    """
    model.train()
    for idx in range(len(train_sequence)):
        X, y = train_sequence[idx]
        images = Variable(torch.from_numpy(X)).to(device)  # [batch, channel, H, W]
        masks = Variable(torch.from_numpy(y)).to(device)
        outputs = model(images)
        print(masks.shape, outputs.shape)
        loss = criterion(outputs, masks)
        optim.zero_grad()
        loss.backward()
        # Update weights
        optim.step()
    # total_loss = get_loss_train(model, data_train, criterion)
My function for calculating loss and accuracy is below:
def get_loss_train(model, train_sequence):
    """
    Calculate loss over train set
    """
    model.eval()
    total_acc = 0
    total_loss = 0
    for idx in range(len(train_sequence)):
        with torch.no_grad():
            X, y = train_sequence[idx]
            images = Variable(torch.from_numpy(X)).to(device)  # [batch, channel, H, W]
            masks = Variable(torch.from_numpy(y)).to(device)
            outputs = model(images)
            loss = criterion(outputs, masks)
            preds = torch.argmax(outputs, dim=1).float()
            acc = accuracy_check_for_batch(masks.cpu(), preds.cpu(), images.size()[0])
            total_acc = total_acc + acc
            total_loss = total_loss + loss.cpu().item()
    return total_acc / (len(train_sequence)), total_loss / (len(train_sequence))
Edit : Code which runs (calls) the functions:
for epoch in range(epochs):
    train_model(epoch, train_sequence)
    train_acc, train_loss = get_loss_train(model, train_sequence)
    print("Train Acc:", train_acc)
    print("Train loss:", train_loss)
Can someone help me identify why the accuracy is always exactly 0.5?
Edit-2:
As asked, the accuracy_check_for_batch function is here:
def accuracy_check_for_batch(masks, predictions, batch_size):
    total_acc = 0
    for index in range(batch_size):
        total_acc += accuracy_check(masks[index], predictions[index])
    return total_acc / batch_size
and
def accuracy_check(mask, prediction):
    ims = [mask, prediction]
    np_ims = []
    for item in ims:
        if 'str' in str(type(item)):
            item = np.array(Image.open(item))
        elif 'PIL' in str(type(item)):
            item = np.array(item)
        elif 'torch' in str(type(item)):
            item = item.numpy()
        np_ims.append(item)
    compare = np.equal(np_ims[0], np_ims[1])
    accuracy = np.sum(compare)
    return accuracy / len(np_ims[0].flatten())
I found the mistake.
model = UNet(num_classes = 2, depth=5, in_channels=5, merge_mode='concat').to(device)
should be
model = UNet(num_classes = 1, depth=5, in_channels=5, merge_mode='concat').to(device)
because I am using BCEWithLogitsLoss.
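For anyone else who runs into this, a rough sketch of the two conventions (toy shapes, not my actual data): BCEWithLogitsLoss wants a single logit channel with a float mask of the same shape, while CrossEntropyLoss wants num_classes logit channels with a long mask of class indices.
import torch
import torch.nn as nn

logits_1ch = torch.randn(4, 1, 8, 8)                      # UNet(num_classes=1)-style output
mask_float = torch.randint(0, 2, (4, 1, 8, 8)).float()
print(nn.BCEWithLogitsLoss()(logits_1ch, mask_float))

logits_2ch = torch.randn(4, 2, 8, 8)                      # UNet(num_classes=2)-style output
mask_long = torch.randint(0, 2, (4, 8, 8))
print(nn.CrossEntropyLoss()(logits_2ch, mask_long))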
In TensorFlow, I'm trying to change weights during training, but I get no change in the results. I've tried to disrupt the weights (set them to zero), but it seems to do nothing (other than make the run take longer to complete). What am I missing? Is there a way to manipulate W like a regular matrix/tensor during a session?
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
import tensorflow as tf
sess = tf.InteractiveSession()
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
W = tf.Variable(tf.zeros([784,10]), trainable=True)
W2 = tf.Variable(tf.zeros([784,10]), trainable=False)
b = tf.Variable(tf.zeros([10]))
sess.run(tf.initialize_all_variables())
y = tf.nn.softmax(tf.matmul(x,W) + b)
loss = tf.reduce_mean(tf.square(y_ - y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
for i in range(1000):
    # try to change W during training
    W = W2
    W = tf.Variable(tf.zeros([784, 10]))
    W.assign(tf.Variable(tf.zeros([784, 10])))
    batch = mnist.train.next_batch(1)
    train_step.run(feed_dict={x: batch[0], y_: batch[1]})
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
Accuracy remains the same (0.82).
I am not sure it's a good idea, but if you want W.assign(...) to actually update W, you need to run the operation it returns; building the assign node by itself does nothing:
sess.run(W.assign(tf.zeros([784, 10])))
In addition, since TensorFlow and most neural nets use forward/back propagation to compute the values and gradients used to update the weights, initializing the weights with 0 kills all forward values and thus the gradients. It's not a good idea.
You can try to initialize them with small random numbers:
tf.Variable(tf.random_normal([784, 10], stddev=0.01))
Or use the xavier initializer
W = tf.get_variable("W", shape=[784, 10],
initializer=tf.contrib.layers.xavier_initializer())
When you use W.assign(), you need to keep a reference to the operation it returns (bind it to a name); creating it alone does nothing:
W = W.assign(tf.zeros([784, 10]))
Then, when W is evaluated again, the assign operation will be executed.
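A minimal TF1-style sketch of making an assignment take effect, with an explicit handle to the assign op (the shapes just mirror the question):
import tensorflow as tf

W = tf.Variable(tf.zeros([784, 10]))
assign_op = W.assign(tf.random_normal([784, 10], stddev=0.01))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_op)                           # the assignment only happens when the op is run
    print(sess.run(tf.reduce_sum(tf.abs(W))))     # nonzero afterwards, so W really changed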