I am using the following Transformer model for machine translation with KerasNLP:
`
# Encoder
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(encoder_inputs)
encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# Decoder
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=SND_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(decoder_inputs)
x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(SND_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [decoder_inputs, encoded_seq_inputs],
    decoder_outputs,
)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)
`
`
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
hist = transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
`
When I try to save the model, it gives me warnings, and the reloaded model does not predict correctly:
`
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(transformer, file)
`
WARNING:absl:Found untraced functions such as token_embedding1_layer_call_fn, token_embedding1_layer_call_and_return_conditional_losses, position_embedding1_layer_call_fn, position_embedding1_layer_call_and_return_conditional_losses, multi_head_attention_layer_call_fn while saving (showing 5 of 78). These functions will not be directly callable after loading.
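For reference, Keras's native save/load route (rather than pickle) would look roughly like this; this is only a minimal sketch, assuming the keras_nlp layers serialize their configuration, and the file name/format is illustrative:
`
transformer.save("transformer_model.keras")
reloaded = keras.models.load_model("transformer_model.keras")
`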
I'm trying to create a Gaussian HMM model in Pyro to infer the parameters of a very simple Markov sequence. However, my model fails to infer the parameters, and something weird happened during the training process. Using the same sequence, hmmlearn successfully inferred the true parameters.
Full code can be accessed in here:
https://colab.research.google.com/drive/1u_4J-dg9Y1CDLwByJ6FL4oMWMFUVnVNd#scrollTo=ZJ4PzdTUBgJi
My model is modified from the example in here:
https://github.com/pyro-ppl/pyro/blob/dev/examples/hmm.py
I manually created a first-order Markov sequence with 3 states, where the true means are [-10, 0, 10] and the sigmas are [1, 2, 1].
Here is my model:
def model(observations, num_state):
    assert not torch._C._get_tracing_state()
    with poutine.mask(mask=True):
        p_transition = pyro.sample("p_transition",
                                   dist.Dirichlet((1 / num_state) * torch.ones(num_state, num_state)).to_event(1))
        p_init = pyro.sample("p_init",
                             dist.Dirichlet((1 / num_state) * torch.ones(num_state)))
        p_mu = pyro.param(name="p_mu",
                          init_tensor=torch.randn(num_state),
                          constraint=constraints.real)
        p_tau = pyro.param(name="p_tau",
                           init_tensor=torch.ones(num_state),
                           constraint=constraints.positive)
        current_state = pyro.sample("x_0",
                                    dist.Categorical(p_init),
                                    infer={"enumerate": "parallel"})
        for t in pyro.markov(range(1, len(observations))):
            current_state = pyro.sample("x_{}".format(t),
                                        dist.Categorical(Vindex(p_transition)[current_state, :]),
                                        infer={"enumerate": "parallel"})
            pyro.sample("y_{}".format(t),
                        dist.Normal(Vindex(p_mu)[current_state], Vindex(p_tau)[current_state]),
                        obs=observations[t])
My training is set up as follows:
device = torch.device("cuda:0")
obs = torch.tensor(obs)
obs = obs.to(device)
torch.set_default_tensor_type("torch.cuda.FloatTensor")
guide = AutoDelta(poutine.block(model, expose_fn = lambda msg : msg["name"].startswith("p_")))
Elbo = Trace_ELBO
elbo = Elbo(max_plate_nesting = 1)
optim = Adam({"lr": 0.001})
svi = SVI(model, guide, optim, elbo)
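The training itself is a plain SVI loop; a minimal sketch of what it looks like (the step count and loss bookkeeping are illustrative):
num_steps = 5000  # illustrative
elbo_history = []
for step in range(num_steps):
    # model(observations, num_state): pass the observed sequence and the 3 states
    loss = svi.step(obs, 3)
    elbo_history.append(loss)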
As the training goes on, the ELBO decreases steadily. However, the three state means converge to the wrong values.
I have tried to put the for loop of my model into a pyro.plate and switch pyro.param to pyro.sample and vice versa, but nothing worked for my model.
I have not tried this model, but I think it should be possible to solve the problem by modifying the model in the following way:
def model(observations, num_state):
    assert not torch._C._get_tracing_state()
    with poutine.mask(mask=True):
        p_transition = pyro.sample("p_transition",
                                   dist.Dirichlet((1 / num_state) * torch.ones(num_state, num_state)).to_event(1))
        p_init = pyro.sample("p_init",
                             dist.Dirichlet((1 / num_state) * torch.ones(num_state)))
        p_mu = pyro.sample("p_mu",
                           dist.Normal(torch.zeros(num_state), torch.ones(num_state)).to_event(1))
        p_tau = pyro.sample("p_tau",
                            dist.HalfCauchy(torch.ones(num_state)).to_event(1))  # HalfCauchy needs a positive scale
        current_state = pyro.sample("x_0",
                                    dist.Categorical(p_init),
                                    infer={"enumerate": "parallel"})
        for t in pyro.markov(range(1, len(observations))):
            current_state = pyro.sample("x_{}".format(t),
                                        dist.Categorical(Vindex(p_transition)[current_state, :]),
                                        infer={"enumerate": "parallel"})
            pyro.sample("y_{}".format(t),
                        dist.Normal(Vindex(p_mu)[current_state], Vindex(p_tau)[current_state]),
                        obs=observations[t])
The model would then be trained using MCMC:
# MCMC
hmc_kernel = NUTS(model, target_accept_prob=0.9, max_tree_depth=7)
mcmc = MCMC(hmc_kernel, num_samples=1000, warmup_steps=100, num_chains=1)
mcmc.run(obs, 3)  # model(observations, num_state): pass the sequence and the number of states
The results could then be analysed using:
mcmc.get_samples()
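To turn the draws into point estimates, the posterior means can be read off the returned dictionary; a short illustrative follow-up (the site names match the model above):
posterior = mcmc.get_samples()
mu_hat = posterior["p_mu"].mean(dim=0)                  # estimated state means
tau_hat = posterior["p_tau"].mean(dim=0)                # estimated state scales
transition_hat = posterior["p_transition"].mean(dim=0)  # estimated transition matrix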
I get different error messages when I try to tune/benchmark "surv.svm".
For tuning I get the following error
Error in kernelMatrix(Xtrain = sv, kernel_type = kernel_type, kernel_pars = kernel_pars, : additiv kernel can not be applied on constant column
For benchmark I get the following error when poly_kernel is listed
Error in tcrossprod(K, Dc) : non-conformable arguments
When poly_kernel is removed, I get a different error message.
What is the problem and how can I solve it?
task = tsk("actg")
learner = as_learner(ppl("distrcompositor",
learner = lrn("surv.svm", type = "regression",
kernel = to_tune(c("lin_kernel", "add_kernel", "rbf_kernel")),
gamma.mu = to_tune(p_dbl(-3, 1, trafo = function(x) 10^x))),
estimator = "kaplan", form = "ph"))
set.seed(82721)
inner_cv = rsmp("cv", folds = 2)
at_learner = AutoTuner$new(learner = learner,
resampling = inner_cv,
measure = msr("surv.cindex"),
terminator = trm("evals", n_evals = 96),
tuner = tnr("irace"))
at_learner$train(task)
I trained a single model and want to combine it with another Keras model using the functional API (the backend is TensorFlow 1.4).
My first model looks like this:
import tensorflow.contrib.keras.api.keras as keras
from tensorflow.contrib.keras.api.keras.layers import Input, Dense, concatenate

input = Input(shape=(200,))
dnn = Dense(400, activation="relu")(input)
dnn = Dense(400, activation="relu")(dnn)
output = Dense(5, activation="softmax")(dnn)
model = keras.models.Model(inputs=input, outputs=output)
After I trained this model, I saved it using the Keras model.save() method. I can also load the model and retrain it without problems.
Now I want to use the output of this model as additional input for a second model:
# load first model
old_model = keras.models.load_model(path_to_old_model)

input_1 = Input(shape=(200,))
input_2 = Input(shape=(200,))

output_old_model = old_model(input_2)

merge_layer = concatenate([input_1, output_old_model])
dnn_layer = Dense(200, activation="relu")(merge_layer)
dnn_layer = Dense(200, activation="relu")(dnn_layer)
output = Dense(10, activation="sigmoid")(dnn_layer)

new_model = keras.models.Model(inputs=[input_1, input_2], outputs=output)
new_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
new_model.fit([x1, x2], labels, epochs=50, batch_size=32)
When I try this, I get the following error message:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value dense_1/kernel
[[Node: dense_1/kernel/read = Identity[T=DT_FLOAT, _class=["loc:#dense_1/kernel"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_1/kernel)]]
[[Node: model_1_1/dense_3/BiasAdd/_79 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_68_model_1_1/dense_3/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
I would do this in the following steps:
Define a function for building a clean model with the same architecture:
def build_base():
    input = Input(shape=(200,))
    dnn = Dense(400, activation="relu")(input)
    dnn = Dense(400, activation="relu")(dnn)
    output = Dense(5, activation="softmax")(dnn)
    model = keras.models.Model(inputs=input, outputs=output)
    return input, output, model
Build two copies of the same model:
input_1, output_1, model_1 = build_base()
input_2, output_2, model_2 = build_base()
Set weights in both models:
model_1.set_weights(old_model.get_weights())
model_2.set_weights(old_model.get_weights())
Now do the rest:
merge_layer = concatenate([input_1, output_2])
dnn_layer = Dense(200, activation="relu")(merge_layer)
dnn_layer = Dense(200, activation="relu")(dnn_layer)
output = Dense(10, activation="sigmoid")(dnn_layer)
new_model = keras.models.Model(inputs=[input_1, input_2], outputs=output)
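From here the combined model is compiled and trained as usual; a minimal sketch, where x1, x2 and labels stand in for the training arrays from the question:
new_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
new_model.fit([x1, x2], labels, epochs=50, batch_size=32)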
Let's say you have a pre-trained/saved CNN model called pretrained_model and you want to add densely connected layers to it. Using the functional API, you can write something like this:
from keras import models, layers
kmodel = layers.Flatten()(pretrained_model.output)
kmodel = layers.Dense(256, activation='relu')(kmodel)
kmodel_out = layers.Dense(1, activation='sigmoid')(kmodel)
model = models.Model(pretrained_model.input, kmodel_out)
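If the pre-trained weights should stay fixed while the new head is trained, the layers of the base can be frozen before compiling; a common variation, not part of the original answer:
for layer in pretrained_model.layers:
    layer.trainable = False  # keep the pre-trained weights fixed
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])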
The following code:
library(text2vec)
text8_file = "text8"
if (!file.exists(text8_file)) {
  download.file("http://mattmahoney.net/dc/text8.zip", "text8.zip")
  unzip("text8.zip", files = "text8")
}
wiki = readLines(text8_file, n = 1, warn = FALSE)
# Create iterator over tokens
tokens <- space_tokenizer(wiki)
# Create vocabulary. Terms will be unigrams (simple words).
it = itoken(tokens, progressbar = FALSE)
vocab <- create_vocabulary(it)
vocab <- prune_vocabulary(vocab, term_count_min = 5L)
# Use our filtered vocabulary
vectorizer <- vocab_vectorizer(vocab)
# use window of 5 for context words
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)
RcppParallel::setThreadOptions(numThreads = 4)
glove_model = GloVe$new(word_vectors_size = 50, vocabulary = vocab, x_max = 10, learning_rate = .25)
word_vectors_main = glove_model$fit_transform(tcm, n_iter = 20)
word_vectors_context = glove_model$components
word_vectors = word_vectors_main + t(word_vectors_context)
causes an error:
qlst <- prepare_analogy_questions("questions-words.txt", rownames(word_vectors))
> Error in (function (fmt, ...) :
invalid format '%d'; use format %s for character objects
The file questions-words.txt is from the word2vec sources: https://github.com/nicholas-leonard/word2vec/blob/master/questions-words.txt
This was a small bug in informational message formatting (introduced with the switch to futile.logger). I've just fixed it and pushed the fix to GitHub.
You can install the updated version of the package with devtools::install_github("dselivanov/text2vec").
I am trying to share the parameters of the encoder/decoder sub-networks of one architecture with the encoder/decoder of a different architecture. This is necessary for my problem, since at test time it takes a lot of computation (and time) to do a forward pass through the original architecture and then extract the decoder results. However, what I noticed is that although I explicitly asked for parameter sharing when calling clone(), the parameters are not shared, and each architecture keeps its own parameters during training.
I show the difference between the results of the two architectures via some print() statements, by forward-propagating some random vectors through the decoders and encoders of both architectures (you can also compare their weights).
So I wonder: can anyone help me find out what I'm doing wrong when sharing the parameters?
Below I post a simplified version of my code:
require 'nn'
require 'nngraph'
require 'cutorch'
require 'cunn'
require 'optim'
input = nn.Identity()()
encoder = nn.Sequential():add(nn.Linear(100, 20)):add(nn.ReLU(true)):add(nn.Linear(20, 10))
decoder = nn.Sequential():add(nn.Linear(10, 20)):add(nn.ReLU(true)):add(nn.Linear(20, 100))
code = encoder(input)
reconstruction = decoder(code)
outsideCode = nn.Identity()()
decoderCloned = decoder:clone('weight', 'bias', 'gradWeight', 'gradBias')
outsideReconstruction = decoderCloned(nn.JoinTable(1)({code, outsideCode}))
dumbNet = nn.Sequential():add(nn.Linear(100, 10))
codeRecon = dumbNet(outsideReconstruction)
input2 = nn.Identity()()
encoderTestTime = encoder:clone('weight', 'bias', 'gradWeight', 'gradBias')
decoderTestTime = decoder:clone('weight', 'bias', 'gradWeight', 'gradBias')
codeTest = encoderTestTime(input2)
reconTest = decoderTestTime(codeTest)
gMod = nn.gModule({input, outsideCode}, {reconstruction, codeRecon})
gModTest = nn.gModule({input2}, {reconTest})
criterion1 = nn.BCECriterion()
criterion2 = nn.MSECriterion()
-- Okay, the module has been created. Now it's time to do some other stuff
params, gParams = gMod:getParameters()
numParams = params:nElement()
memReqForParams = numParams * 5 * 4 / 1024 / 1024 -- Convert to MBs
-- If enough memory on GPU, move stuff to the GPU
if memReqForParams <= 1000 then
    gMod = gMod:cuda()
    gModTest = gModTest:cuda()
    criterion1 = criterion1:cuda()
    criterion2 = criterion2:cuda()
    params, gParams = gMod:getParameters()
end
-- Data
Data = torch.rand(200, 100):cuda()
Data[Data:gt(0.5)] = 1
Data[Data:lt(0.5)] = 0
fakeCodes = torch.rand(400, 10):cuda()
config = {learningRate = 0.001}
state = {}
-- Start training
print ("\nEncoders before training: \n\tgMod's Encoder: " .. gMod:get(2):forward(torch.ones(1, 100):cuda()):sum() .. "\n\tgModTest's Encoder: " .. gModTest:get(2):forward(torch.ones(1, 100):cuda()):sum())
print ("\nDecoders before training: \n\tgMod's Decoder: " .. gMod:get(3):forward(torch.ones(1, 10):cuda()):sum() .. "\n\tgModTest's Decoder: " .. gModTest:get(3):forward(torch.ones(1, 10):cuda()):sum())
gMod:training()
for i = 1, Data:size(1) do
    local opfunc = function(x)
        if x ~= params then
            params:copy(x)
        end
        gMod:zeroGradParameters()
        recon, outsideRecon = unpack(gMod:forward({Data[{{i}}], fakeCodes[{{i}}]}))
        err = criterion1:forward(recon, Data[{{i}}])
        df_dw = criterion1:backward(recon, Data[{{i}}])
        errFake = criterion2:forward(outsideRecon, fakeCodes[{{i*2-1, i * 2}}])
        df_dwFake = criterion2:backward(outsideRecon, fakeCodes[{{i*2-1, i * 2}}])
        errorGrads = {df_dw, df_dwFake}
        gMod:backward({Data[{{i}}], fakeCodes[{{i*2-1, i * 2}}]}, errorGrads)
        return err, gParams
    end
    x, reconError = optim.adam(opfunc, params, config, state)
end
print ("\n\nEncoders after training: \n\tgMod's Encoder: " .. gMod:get(2):forward(torch.ones(1, 100):cuda()):sum() .. "\n\tgModTest's Encoder: " .. gModTest:get(2):forward(torch.ones(1, 100):cuda()):sum())
print ("\nDecoders after training: \n\tgMod's Decoder: " .. gMod:get(3):forward(torch.ones(1, 10):cuda()):sum() .. "\n\tgModTest's Decoder: " .. gModTest:get(3):forward(torch.ones(1, 10):cuda()):sum())
I got the solution to this problem with the help of fmassa on a GitHub issue I had opened for it. One can use nn.Container to resolve the parameter-sharing issue as follows:
container = nn.Container()
container:add(gMod)
container:add(gModTest)
params, gradParams = container:getParameters()