I am trying to solve a sequence to sequence problem with a transformer model. The data is derived from a set of crossword puzzles.
The positional encoding and transformer classes are as follows:
import math

import torch
from torch import nn, Tensor


class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 3000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def debug(self, x):
        return x.shape, x.size()

    def forward(self, x: Tensor) -> Tensor:
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
class Transformer(nn.Module):
    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        batch_first,
        dropout_p,
    ):
        super().__init__()
        self.model_type = "Transformer"
        self.dim_model = dim_model
        self.positional_encoder = PositionalEncoding(
            d_model=dim_model, dropout=dropout_p, max_len=3000
        )
        self.embedding = nn.Embedding.from_pretrained(vec_weights, freeze=False)  # nn.Embedding(num_tokens, dim_model)
        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout_p,
            batch_first=batch_first,
        )
        self.out = nn.Linear(dim_model, num_tokens)

    def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
        src = self.embedding(src) * math.sqrt(self.dim_model)
        tgt = self.embedding(tgt) * math.sqrt(self.dim_model)
        src = self.positional_encoder(src)
        tgt = self.positional_encoder(tgt)
        transformer_out = self.transformer(
            src, tgt, tgt_mask=tgt_mask,
            src_key_padding_mask=src_pad_mask, tgt_key_padding_mask=tgt_pad_mask
        )
        out = self.out(transformer_out)
        return out

    def get_tgt_mask(self, size) -> torch.Tensor:
        mask = torch.tril(torch.ones(size, size) == 1)
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf'))
        mask = mask.masked_fill(mask == 1, float(0.0))
        return mask

    def create_pad_mask(self, matrix: torch.Tensor, pad_token: int) -> torch.Tensor:
        return matrix == pad_token
The input tensors are a source tensor of size N by S, where N is the batch size and S is the source sequence length, and a target tensor of size N by T, where T is the target sequence length. S is about 10 and T is about 5, while the total number of items is about 160,000-200,000, divided into batch sizes of 512. They are torch.IntTensors, with elements in the range from 0 to V, where V is the vocabulary length.
The first layer is an embedding layer that takes the input from N by S to N by S by E, where E is the embedding dimension (300), or to N by T by E in the case of the target. The second layer adds position encoding without changing the shape. Then both tensors are passed through the transformer layer, which outputs an N by T by E tensor. Finally, we pass this output through a linear layer, which produces an N by T by V output, where V is the size of the vocabulary used in the problem. Here V is about 56,697. The most frequent tokens (words) appear about 50-60 times in the target tensor.
The transformer class also contains the functions for implementing the masking matrices.
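A quick way to confirm this shape flow is to run the model on a dummy batch. In the sketch below, V = 1000 and a random vec_weights matrix are stand-ins for the real vocabulary and pretrained embeddings, and N = 4 is an arbitrary batch size:

# Shape check on dummy data (V and vec_weights here are stand-ins, not the real data).
V = 1000
vec_weights = torch.randn(V, 300)
model = Transformer(num_tokens=V, dim_model=300, num_heads=2,
                    num_encoder_layers=3, num_decoder_layers=3,
                    batch_first=True, dropout_p=0.1)
src = torch.randint(0, V, (4, 10))   # N by S
tgt = torch.randint(0, V, (4, 5))    # N by T
tgt_mask = model.get_tgt_mask(tgt.shape[1])
out = model(src, tgt, tgt_mask=tgt_mask)
print(out.shape)                     # torch.Size([4, 5, 1000]), i.e. N by T by V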
Then we create the model and run it (this process is wrapped in a function).
device = "cuda"
src_train, src_test = torch.utils.data.random_split(src_t, [int(0.9*len(src_t)), len(src_t)-int(0.9*len(src_t))])
src_train, src_test = src_train[:512], src_test[:512]
tgt_train, tgt_test = torch.utils.data.random_split(tgt_t, [int(0.9*len(tgt_t)), len(tgt_t)-int(0.9*len(tgt_t))])
tgt_train, tgt_test = tgt_train[:512], tgt_test[:512]
train_data, test_data = list(zip(src_train, tgt_train)), list(zip(src_test, tgt_test))
train, test = torch.utils.data.DataLoader(dataset=train_data), torch.utils.data.DataLoader(dataset=test_data)
model = Transformer(num_tokens=ntokens, dim_model=300, num_heads=2, num_encoder_layers=3, num_decoder_layers=3, batch_first = True, dropout_p=0.1).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0000001)
n_epochs = 50
def train_model(model, optimizer, loss_function, n_epochs):
    loss_value = 0
    for epoch in range(n_epochs):
        print(f"Starting epoch {epoch}")
        for batch, data in enumerate(train):
            x, y = data
            if batch % 100 == 0:
                print(f"Batch is {batch}")
            batch += 1
            optimizer.zero_grad()
            x, y = torch.tensor(x).to(device), torch.tensor(y).to(device)
            y_input, y_base = y[:, :-1], y[:, 1:]
            y_input, y_base = y_input.to(device), y_base.to(device)
            tgt_mask = model.get_tgt_mask(y_input.shape[1]).to(device)
            pad_token = vocabulary_table[embeddings.key_to_index["/"]]
            src_pad_mask = model.create_pad_mask(x, pad_token).to(device)
            tgt_pad_mask = model.create_pad_mask(y_input, pad_token).to(device)
            z = model(x, y_input, tgt_mask, src_pad_mask, tgt_pad_mask)
            z = z.permute(0, 2, 1).to(device)
            y_base = y_base.long().to(device)
            loss = loss_function(z, y_base).to(device)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
            optimizer.step()
            loss_value += float(loss)
            if batch % 100 == 0:
                print(f"For epoch {epoch}, batch {batch} the cross-entropy loss is {loss_value}")
            # Free GPU memory.
            del z
            del x
            del y
            del y_input
            del y_base
            del loss
            torch.cuda.empty_cache()
    return model.parameters(), loss_value
Basically, we split the data into test and training sets and use an SGD optimizer and cross-entropy loss. We create a masking matrix for the padding for both the target and source tensors, and a masking matrix for unseen elements for the target tensor. We then do the usual gradient update steps. Right now, there is no validation loop, because I cannot even get the training loss to decrease.
The loss is very high, reaching more than 1000 after 100 batches. More concerningly, the loss also increases rapidly during training, rather than decreasing. In the code that I included, I tried clipping the gradients, lowering the learning rate, and using a much smaller sample to debug the code.
What could be causing this behavior?
You are only ever adding to loss_value, so naturally the number you print can only increase.
loss_value += float(loss)
loss_value is a running sum: you initialize it to zero once, at the start of train_model, and then keep adding every batch loss to it for the entire training run, so the printed value grows even if the per-batch loss is going down. Reset it to zero at the start of every epoch (or report the per-batch loss or a running average instead). There is a training loop template here if you're interested (https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html). This explains the increasing loss you see. To troubleshoot further (if needed) I'd add a validation loop.
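For example, something along these lines (just a sketch of the per-epoch reset, with an illustrative name epoch_loss; the elided part is your existing forward/backward/step code):

for epoch in range(n_epochs):
    epoch_loss = 0.0                     # reset the accumulator at the start of every epoch
    for batch, data in enumerate(train):
        ...                              # forward pass, loss, backward and optimizer step as before
        epoch_loss += loss.item()
        if batch % 100 == 0:
            # report a running mean rather than an ever-growing sum
            print(f"epoch {epoch}, batch {batch}: mean loss {epoch_loss / (batch + 1)}")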
I'm using Julia 1.5.0. Consider the following code:
using LinearAlgebra
using Distributions
using BenchmarkTools
function solve_b!(A, tol_iters)
    b = [1.0 2.0]'
    luA = lu!(A)
    x = [0.0; 0.0]
    for i = 1:tol_iters
        A[1,1] += 0.001
        A[2,2] += 0.001
        luA = lu!(A)
        ldiv!(x, luA, b)
    end
end
A = rand(2,2)
solve_b!(A, 1000)
If I run this with julia --track-allocation=user, most of the memory allocation is attributed to b = [1.0 2.0]' and x = [0.0; 0.0]. That is, the .mem file shows the following:
96 b = [1.0 2.0]'
0 luA = lu!(A)
96 x = [0.0; 0.0]
The memory allocation increases as I increase tol_iters.
Can someone explain why? Since I'm using lu! and ldiv!, I would expect the updates to happen in place, so there should not be any additional memory allocation tied to the number of iterations.
Consider a generator in Julia that, if collected, would take a lot of memory:
g=(x^2 for x=1:9999999999999999)
I want to take a small random subsample (say 1%) of it, but I do not want to collect() the object because that would take a lot of memory.
Until now, the trick I was using was this:
temp=collect((( rand()>0.01 ? nothing : x ) for x in g))
random_sample= temp[temp.!=nothing]
But this is not efficient for generators with many elements; collecting something with so many nothing elements doesn't seem right.
Any ideas are highly appreciated. I guess the trick is to be able to get random elements from the generator without having to allocate memory for all of it.
Thank you very much
You can use a comprehension with an if condition, like this:
[v for v in g if rand() < 0.01]
Or, if you want a slightly faster but more verbose approach (I have hardcoded 0.01 and the element type of g, and I assume that your generator supports length; otherwise remove the sizehint! line):
function collect_sample(g)
    r = Int[]
    sizehint!(r, round(Int, length(g) * 0.01))
    for v in g
        if rand() < 0.01
            push!(r, v)
        end
    end
    r
end
EDIT
Here are examples of a self-avoiding sampler and a reservoir sampler, both of which give you a fixed output size. The smaller the fraction of the input you want, the better it is to use the self-avoiding sampler:
function self_avoiding_sampler(source_size, ith, target_size)
    rng = 1:source_size
    idx = rand(rng)
    x1 = ith(idx)
    r = Vector{typeof(x1)}(undef, target_size)
    r[1] = x1
    s = Set{Int}(idx)
    sizehint!(s, target_size)
    for i = 2:target_size
        while idx in s
            idx = rand(rng)
        end
        @inbounds r[i] = ith(idx)
        push!(s, idx)
    end
    r
end
function reservoir_sampler(g, target_size)
    r = Vector{Int}(undef, target_size)
    for (i, v) in enumerate(g)
        if i <= target_size
            @inbounds r[i] = v
        else
            j = rand(1:i)
            if j <= target_size
                @inbounds r[j] = v
            end
        end
    end
    r
end
This is a possible duplicate of Tensorflow: How to get gradients per instance in a batch?. I ask it anyway, because there has not been a satisfying answer and the goal here is a bit different.
I have a very big network that I can fit on my GPU but the max batch size I can feed is 32. Anything bigger than that causes the GPU to run out of memory. I want to use a bigger batch in order to get a more accurate approximation of the gradient.
For concreteness, let's say I want to compute the gradient on a big batch of size 96 by feeding 3 batches of 32 in turn. The best way that I know of is to use Optimizer.compute_gradients() and Optimizer.apply_gradients(). Here is a small example of how it can work:
import tensorflow as tf
import numpy as np
learn_rate = 0.1
W_init = np.array([ [1,2,3], [4,5,6], [7,8,9] ], dtype=np.float32)
x_init = np.array([ [11,12,13], [14,15,16], [17,18,19] ], dtype=np.float32)
X = tf.placeholder(dtype=np.float32, name="x")
W = tf.Variable(W_init, dtype=np.float32, name="w")
y = tf.matmul(X, W, name="y")
loss = tf.reduce_mean(y, name="loss")
opt = tf.train.GradientDescentOptimizer(learn_rate)
grad_vars_op = opt.compute_gradients(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Compute the gradients for each batch
grads_vars1 = sess.run(grad_vars_op, feed_dict = {X: x_init[None,0]})
grads_vars2 = sess.run(grad_vars_op, feed_dict = {X: x_init[None,1]})
grads_vars3 = sess.run(grad_vars_op, feed_dict = {X: x_init[None,2]})
# Separate the gradients from the variables
grads1 = [ grad for grad, var in grads_vars1 ]
grads2 = [ grad for grad, var in grads_vars2 ]
grads3 = [ grad for grad, var in grads_vars3 ]
varl = [ var for grad, var in grads_vars1 ]
# Average the gradients
grads = [ (g1 + g2 + g3)/3 for g1, g2, g3 in zip(grads1, grads2, grads3)]
sess.run(opt.apply_gradients(zip(grads,varl)))
print("Weights after 1 gradient")
print(sess.run(W))
Now this is all very ugly and inefficient since the forward pass is being run on the GPU while averaging the gradients happens on the CPU and then applying them happens on the GPU again.
Moreover, this code throws an exception because grads is a list of np.arrays and to make it work, one would have to create a tf.placeholder for every gradient.
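That workaround would look roughly like this (a sketch only; grad_placeholders and apply_op are illustrative names, reusing grads, varl, opt and sess from the code above):

# one tf.placeholder per gradient, so the averaged numpy gradients can be fed back in
grad_placeholders = [tf.placeholder(dtype=np.float32, shape=v.get_shape()) for v in varl]
apply_op = opt.apply_gradients(zip(grad_placeholders, varl))
sess.run(apply_op, feed_dict={ph: g for ph, g in zip(grad_placeholders, grads)})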
I am sure there must be a better and more efficient way to do this. Any suggestions?
You can create a copy of the trainable variables and accumulate the batch gradients in it. Here are a few simple steps to follow:
...
opt = tf.train.GradientDescentOptimizer(learn_rate)

# number of batches for gradient accumulation
n_batches = 3
# constant to scale the sum of gradients
const = tf.constant(1.0 / n_batches)

# get all trainable variables
t_vars = tf.trainable_variables()

# create a copy of all trainable variables with `0` as initial values
accum_tvars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in t_vars]

# create an op to reset all accumulator variables to zero
zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_tvars]

# compute gradients for a batch
batch_grads_vars = opt.compute_gradients(loss, t_vars)

# collect the (scaled by const) batch gradients into the accumulator variables
accum_ops = [accum_tvars[i].assign_add(tf.scalar_mul(const, batch_grad_var[0])) for i, batch_grad_var in enumerate(batch_grads_vars)]

# apply the accumulated gradients
train_step = opt.apply_gradients([(accum_tvars[i], batch_grad_var[1]) for i, batch_grad_var in enumerate(batch_grads_vars)])
# train_step = opt.apply_gradients(zip(accum_tvars, zip(*batch_grads_vars)[1]))

while True:
    # reset the accumulated gradients
    sess.run(zero_ops)
    # accumulate the gradients over n_batches batches
    for i in xrange(n_batches):
        sess.run(accum_ops, feed_dict={X: x_init[None, i]})
    sess.run(train_step)
I tried to model a neural network using softmax regression.
After 999 iterations, I got an error of about 0.02% per data point, which I thought was good. But when I visualized the model on TensorBoard, my cost function did not approach 0; instead I got something like this:
And for the weights and bias histograms, this:
I am a beginner and I can't seem to find the mistake. Maybe I am using the wrong method to define the cost?
Here is my full code for reference.
import tensorflow as tf
import numpy as np
import random
lorange = 1
hirange = 10
amplitude = np.random.uniform(-10, 10)
t = 10
random.seed()
tau = np.random.uniform(lorange, hirange)
x_node = tf.placeholder(tf.float32, (10,))
y_node = tf.placeholder(tf.float32, (10,))
W = tf.Variable(tf.truncated_normal([10,10], stddev= .1))
b = tf.Variable(.1)
y = tf.nn.softmax(tf.matmul(tf.reshape(x_node,[1,10]), W) + b)
##ADD SUMMARY
W_hist = tf.histogram_summary("weights", W)
b_hist = tf.histogram_summary("biases", b)
y_hist = tf.histogram_summary("y", y)
# Cost function sum((y_-y)**2)
with tf.name_scope("cost") as scope:
    cost = tf.reduce_mean(tf.square(y_node - y))
    cost_sum = tf.scalar_summary("cost", cost)
# Training using Gradient Descent to minimize cost
with tf.name_scope("train") as scope:
    train_step = tf.train.GradientDescentOptimizer(0.00001).minimize(cost)
sess = tf.InteractiveSession()
# Merge all the summaries and write them out to logfile
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter("/tmp/mnist_logs_4", sess.graph_def)
error = tf.reduce_sum(tf.abs(y - y_node))
init = tf.initialize_all_variables()
sess.run(init)
steps = 1000
for i in range(steps):
    xs = np.arange(t)
    ys = amplitude * np.exp(-xs / tau)
    feed = {x_node: xs, y_node: ys}
    sess.run(train_step, feed_dict=feed)
    print("After %d iteration:" % i)
    print("W: %s" % sess.run(W))
    print("b: %s" % sess.run(b))
    print('Total Error: ', error.eval(feed_dict={x_node: xs, y_node: ys}))
    # Record summary data, and the accuracy every 10 steps
    if i % 10 == 0:
        result = sess.run(merged, feed_dict=feed)
        writer.add_summary(result, i)
I have gotten the same plot as you a couple of times.
That happened mostly when I was running TensorBoard on multiple log files, that is, when the logdir I gave to TensorBoard contained more than one event file. Try running TensorBoard on a single log file and let me know what happens.
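If it helps, a simple way to keep runs separate is to give every run its own subdirectory when you create the writer (a sketch using the same old API as your code; run_dir is just an illustrative name):

import time

# one subdirectory per run, so event files from old runs never get mixed into the same plot
run_dir = "/tmp/mnist_logs_4/run_%d" % int(time.time())
writer = tf.train.SummaryWriter(run_dir, sess.graph_def)

Then point TensorBoard at that single run directory via --logdir.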