PicklingError when running pipeline in sklearn - machine-learning

I am running the following pipeline in sklearn to perform grid search
def logistic():
sm = SMOTE()
poly = polynomial_transform()
stand = StandardScaler()
pca = PCA()
#I need to think about how I am training this...
logistic = LogisticRegression(max_iter=100, tol=0.01,solver = 'saga') #what is my binary scorer here
pipe = Pipeline(steps=[('smt', sm),('poly', poly),('standardise', stand),('pca', pca), ('logistic', logistic)],memory = mem)
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
#'poly__degree': [1,2],
'pca__n_components':[50,100],
'logistic__C': [0.0001],
}
scorers = {
'precision_score': make_scorer(precision_score),
'recall_score': make_scorer(recall_score),
'accuracy_score': make_scorer(accuracy_score),
'f1_score': make_scorer(f1_score),
'tp': make_scorer(tp),
'tn': make_scorer(tn),
'fp': make_scorer(fp),
'fn': make_scorer(fn),
'ck': make_scorer(cohen_kappa_score)
}
#performing grid search
search = GridSearchCV(pipe, param_grid, n_jobs=-1,verbose=2,scoring= scorers,refit='accuracy_score')
search.fit(X_train.to_numpy(), y_train.to_numpy().ravel())
#results of cross validation grid search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
return search
polynomial transform is a class I have created myself
class polynomial_transform(BaseEstimator):
def __init__(self,degree=None):
self.degree=degree
def fit(self,X,y=None):
return self
#Method that describes what we need this transformer to do
def transform( self, X, y = None ):
for i in range(1,self.degree):
X = np.hstack((X, X**i))
return X
def set_params(self, degree):
self.degree = degree
def get_params(self,deep=True):
return {'degree':self.degree}
When setting the memory parameter in the pipeline I am getting the following error
FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
_pickle.PicklingError: ("Can't pickle : it's not found as main.polynomial_transform"

Related

Any way to efficiently stack/ensemble pre-trained models for image classification?

I am trying to stack a few pre-trained models that I have through taking the last hidden layer of each model and then concatenating them together and then plugging them into a meta-learner model (e.g. XGBoost).
I am running into a big problem of having to process each image of my dataset multiple times since each base model requires a different processing method. This is causing my model to take a really long time to train and is infeasible. Is there any way to work past this?
For example:
model_1, processor_1 = pretrained_model(), pretrained_processor()
model_2, processor_2 = pretrained_model2(), pretrained_processor2()
for img in images:
input_1 = processor_1(img)
input_2 = processor_2(img)
out_1 = model_1(input_1)
out_2 = model_2(input_2)
torch.cat((out1,out2), dim=1) #concatenates hidden representations to feed into another model
Here'a recommendation if you want to process your images faster:
Note: I did not test this out
import torch
import torch.nn as nn
# Create a stack nn module
class StackedModel(nn.Module):
def __init__(self, model1, model2):
super(StackedModel, self).__init__()
self.model1 = model1
self.model2 = model2
def forward(self, imgs):
out_1 = model_1(input_1)
out_2 = model_2(input_2)
return torch.cat((out1, out2), dim=1)
# Init model
model = StackedModel(model1, model2)
# Try to stack and run in a larger batch assuming u have extra gpu space
stacked_preproc1 = []
stacked_preproc2 = []
max_batch_size = 16
total_output = []
for index, img in enumerate(images):
input_1 = processor_1(img)
input_2 = processor_2(img)
stacked_preproc1.append(input_1)
stakced_preproc2.appennd(input2)
if index % max_batch_size == 0:
stacked_preproc1 = torch.stack(stacked_preproc1)
stakced_preproc2 = torch.stack(stakced_preproc2)
else:
total_output.append(
model(stacked_preproc1, stacked_preproc2)
)
# Reset array
stacked_preproc1 = []
stakced_preproc2 = []

Custom dataset Loader pytorch

i am doing covid-19 classification.i took dataset from kaggle. it has folder named dataset which contain 3 folders normal pnuemonia and covid-19 each contaning images for these classes i am stucked in writting getitem in pytorch custom dataloader ?
Dataset has 189 covid images but by this get item i get 920 images of covid kindly help
class_names = ['normal', 'viral', 'covid']
root_dir = 'COVID-19 Radiography Database'
source_dirs = ['NORMAL', 'Viral Pneumonia', 'COVID-19']
if os.path.isdir(os.path.join(root_dir, source_dirs[1])):
os.mkdir(os.path.join(root_dir, 'test'))
for i, d in enumerate(source_dirs):
os.rename(os.path.join(root_dir, d), os.path.join(root_dir, class_names[i]))
for c in class_names:
os.mkdir(os.path.join(root_dir, 'test', c))
for c in class_names:
images = [x for x in os.listdir(os.path.join(root_dir, c)) if x.lower().endswith('png')]
selected_images = random.sample(images, 30)
for image in selected_images:
source_path = os.path.join(root_dir, c, image)
target_path = os.path.join(root_dir, 'test', c, image)
shutil.move(source_path, target_path)
Above code is used to create test dataset which has 30 images of each class
class ChestXRayDataset(torch.utils.data.Dataset):
def __init__(self, image_dirs, transform):
def get_images(class_name):
images = [x for x in os.listdir(image_dirs[class_name]) if
x[-3:].lower().endswith('png')]
print(f'Found {len(images)} {class_name} examples')
return images
self.images = {}
self.class_names = ['normal', 'viral', 'covid']
for class_name in self.class_names:
self.images[class_name] = get_images(class_name)
self.image_dirs = image_dirs
self.transform = transform
def __len__(self):
return sum([len(self.images[class_name]) for class_name in self.class_names])
def __getitem__(self, index):
class_name = random.choice(self.class_names)
index = index % len(self.images[class_name])
image_name = self.images[class_name][index]
image_path = os.path.join(self.image_dirs[class_name], image_name)
image = Image.open(image_path).convert('RGB')
return self.transform(image), self.class_names.index(class_name)
**Stucked in get item of this **
images in folder are arranged as follows
Dataset is as follows
**Code for confusion matrix is **
nb_classes = 3
confusion_matrix = torch.zeros(nb_classes, nb_classes)
with torch.no_grad():
for data in tqdm_notebook(dl_train,total=len(dl_train),unit='batch'):
img,lab = data
print(lab)
img,lab = img.to(device),lab.to(device)
_,output = torch.max(model(img),1)
print(output)
for t, p in zip(lab.view(-1), output.view(-1)):
confusion_matrix[t.long(), p.long()] += 1
output for confusion matrix only one class is getting trained
confusio matrix image
Putting you images in a dictionary complicates the manipulation, rather use a list. Also you Dataset should not have any randomness, shuffling of the data should happen from the DataLoader not from the Dataset.
Use something like below:
class ChestXRayDataset(torch.utils.data.Dataset):
def __init__(self, image_dirs, transform):
def get_images(class_name):
images = [x for x in os.listdir(image_dirs[class_name]) if
x[-3:].lower().endswith('png')]
print(f'Found {len(images)} {class_name} examples')
return images
self.images = []
self.labels = []
self.class_names = ['normal', 'viral', 'covid']
for class_name in self.class_names:
images = get_images(class_name)
# This is a list containing all the images
self.images.extend(images)
# This is a list containing all the corresponding image labels
self.labels.extend([class_name]*len(images))
self.image_dirs = image_dirs
self.transform = transform
def __len__(self):
return len(self.images)
# Will return the image and its label at the position `index`
def __getitem__(self, index):
# image at index position of all the images
image_name = self.images[index]
# Its label
class_name = self.labels[index]
image_path = os.path.join(self.image_dirs[class_name], image_name)
image = Image.open(image_path).convert('RGB')
return self.transform(image), self.class_names.index(class_name)
If you enumerate it say using
ds = ChestXRayDataset(image_dirs, transform)
for x, y in ds:
print (x.shape, y)
You should see all the images and the labels in the sequential order.
However in real case you would rather use a Torch DataLoader and pass it the ds object with shuffle parameter set to True. So the DataLoader will take care of shuffling the Dataset by calling the __getitem__ with shuffled index values.

How to prevent Keras predict_generator from shuffling data?

I created a deep learning model, and I want to check the performance of the model by using predict_generator. I am using the following code which compares the images' labels with the predicted classes and then returns the prediction error.
validation_generator = validation_datagen.flow_from_directory(
validation_dir,
target_size=(image_size, image_size),
batch_size=val_batchsize,
class_mode='categorical',
shuffle=False)
# Get the filenames from the generator
fnames = validation_generator.filenames
# Get the ground truth from generator
ground_truth = validation_generator.classes
# Get the label to class mapping from the generator
label2index = validation_generator.class_indices
# Getting the mapping from class index to class label
idx2label = dict((v,k) for k,v in label2index.items())
# Get the predictions from the model using the generator
predictions = model.predict_generator(validation_generator, steps=validation_generator.samples/validation_generator.batch_size,verbose=1)
predicted_classes = np.argmax(predictions,axis=1)
errors = np.where(predicted_classes != ground_truth)[0]
print("No of errors = {}/{}".format(len(errors),validation_generator.samples))
# Show the errors
for i in range(len(errors)):
pred_class = np.argmax(predictions[errors[i]])
pred_label = idx2label[pred_class]
title = 'Original label:{}, Prediction :{}, confidence : {:.3f}'.format(
fnames[errors[i]].split('/')[0],
pred_label,
predictions[errors[i]][pred_class])
original = load_img('{}/{}'.format(validation_dir,fnames[errors[i]]))
plt.figure(figsize=[7,7])
plt.axis('off')
plt.title(title)
plt.imshow(original)
plt.show()
validation_generator.classes is arranged but predicted_classes is not arranged.
I take the code from here https://www.learnopencv.com/keras-tutorial-fine-tuning-using-pre-trained-models/
How can I prevent predict_generator from shuffling data?

Find the best pipeline model using CrossValidator and ParamGridBuilder

I have an acceptable model, but I would like to improve it by adjusting its parameters in Spark ML Pipeline with CrossValidator and ParamGridBuilder.
As an Estimator I will place the existing pipeline.
In ParamMaps I would not know what to put, I do not understand it.
As Evaluator I will use the RegressionEvaluator already created previously.
I'm going to do it for 5 folds, with a list of 10 different depth values in the tree.
How can I select and show the best model for the lowest RMSE?
ACTUAL example:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor()
dt.setPredictionCol("Predicted_PE")
dt.setMaxBins(100)
dt.setFeaturesCol("features")
dt.setLabelCol("PE")
dt.setMaxDepth(8)
pipeline = Pipeline(stages=[vectorizer, dt])
model = pipeline.fit(trainingSetDF)
regEval = RegressionEvaluator(predictionCol = "Predicted_XX", labelCol = "XX", metricName = "rmse")
rmse = regEval.evaluate(predictions)
print("Root Mean Squared Error: %.2f" % rmse)
(1) Spark Jobs
(2) Root Mean Squared Error: 3.60
NEED:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
dt2 = DecisionTreeRegressor()
dt2.setPredictionCol("Predicted_PE")
dt2.setMaxBins(100)
dt2.setFeaturesCol("features")
dt2.setLabelCol("PE")
dt2.setMaxDepth(10)
pipeline2 = Pipeline(stages=[vectorizer, dt2])
model2 = pipeline2.fit(trainingSetDF)
regEval2 = RegressionEvaluator(predictionCol = "Predicted_PE", labelCol = "PE", metricName = "rmse")
paramGrid = ParamGridBuilder().build() # ??????
crossval = CrossValidator(estimator = pipeline2, estimatorParamMaps = paramGrid, evaluator=regEval2, numFolds = 5) # ?????
rmse2 = regEval2.evaluate(predictions)
#bestPipeline = ????
#bestLRModel = ????
#bestParams = ????
print("Root Mean Squared Error: %.2f" % rmse2)
(1) Spark Jobs
(2) Root Mean Squared Error: 3.60 # the same ¿?
You need to call .fit() with your training data on the crossval object to create the cv model. That will do the cross validation. Then you get the best model (according to your evaluator metric) from that. Eg.
cvModel = crossval.fit(trainingData)
myBestModel = cvModel.bestModel

What to do when Seq2Seq network repeats words over and over in output?

So, I've been working on a project for a while, we have very little data, I know it would become much better if we were able to put together a much much larger dataset. That aside, my issue at the moment is when I have a sentence input, my outputs look like this right now:
contactid contactid contactid contactid
A single word is focused on and repeated over and over again. What can I do to overcome this hurdle?
Things I've tried:
Double checked I was appending start/stop tokens and make sure the tokens were properly placed in the top of their vocab files, I am sharing vocab.
I found something saying it could be due to poor word embeddings. To that end I checked with tensorboard and sure enough PCA showed a very dense cluster of points. Seeing that I grabbed Facebook's public pre trained word vectors and loaded them in as the embedding. Trained again and this time tensorboard PCA showed a much better picture.
Switched my training scheduler from basic to SampledScheduling to occasionally replace a training output with the ground truth.
Switched my decoder to use the beam search decoder I figured this may give more robust responses if the word choices were close together in the intermediary feature space.
For certain my perplexity is steadily decreasing.
Here is my dataset preperation code:
class ModelInputs(object):
"""Factory to construct various input hooks and functions depending on mode """
def __init__(
self, vocab_files, batch_size,
share_vocab=True, src_eos_id=1, tgt_eos_id=2
):
self.batch_size = batch_size
self.vocab_files = vocab_files
self.share_vocab = share_vocab
self.src_eos_id = src_eos_id
self.tgt_eos_id = tgt_eos_id
def get_inputs(self, file_path, num_infer=None, mode=tf.estimator.ModeKeys.TRAIN):
self.mode = mode
if self.mode == tf.estimator.ModeKeys.TRAIN:
return self._training_input_hook(file_path)
if self.mode == tf.estimator.ModeKeys.EVAL:
return self._validation_input_hook(file_path)
if self.mode == tf.estimator.ModeKeys.PREDICT:
if num_infer is None:
raise ValueError('If performing inference must supply number of predictions to be made.')
return self._infer_input_hook(file_path, num_infer)
def _prepare_data(self, dataset, out=False):
prep_set = dataset.map(lambda string: tf.string_split([string]).values)
prep_set = prep_set.map(lambda words: (words, tf.size(words)))
if out == True:
return prep_set.map(lambda words, size: (self.vocab_tables[1].lookup(words), size))
return prep_set.map(lambda words, size: (self.vocab_tables[0].lookup(words), size))
def _batch_data(self, dataset, src_eos_id, tgt_eos_id):
batched_set = dataset.padded_batch(
self.batch_size,
padded_shapes=((tf.TensorShape([None]), tf.TensorShape([])), (tf.TensorShape([None]), tf.TensorShape([]))),
padding_values=((src_eos_id, 0), (tgt_eos_id, 0))
)
return batched_set
def _batch_infer_data(self, dataset, src_eos_id):
batched_set = dataset.padded_batch(
self.batch_size,
padded_shapes=(tf.TensorShape([None]), tf.TensorShape([])),
padding_values=(src_eos_id, 0)
)
return batched_set
def _create_vocab_tables(self, vocab_files, share_vocab=False):
if vocab_files[1] is None and share_vocab == False:
raise ValueError('If share_vocab is set to false must provide target vocab. (src_vocab_file, \
target_vocab_file)')
src_vocab_table = lookup_ops.index_table_from_file(
vocab_files[0],
default_value=UNK_ID
)
if share_vocab:
tgt_vocab_table = src_vocab_table
else:
tgt_vocab_table = lookup_ops.index_table_from_file(
vocab_files[1],
default_value=UNK_ID
)
return src_vocab_table, tgt_vocab_table
def _prepare_iterator_hook(self, hook, scope_name, iterator, file_path, name_placeholder):
if self.mode == tf.estimator.ModeKeys.TRAIN or self.mode == tf.estimator.ModeKeys.EVAL:
feed_dict = {
name_placeholder[0]: file_path[0],
name_placeholder[1]: file_path[1]
}
else:
feed_dict = {name_placeholder: file_path}
with tf.name_scope(scope_name):
hook.iterator_initializer_func = \
lambda sess: sess.run(
iterator.initializer,
feed_dict=feed_dict,
)
def _set_up_train_or_eval(self, scope_name, file_path):
hook = IteratorInitializerHook()
def input_fn():
with tf.name_scope(scope_name):
with tf.name_scope('sentence_markers'):
src_eos_id = tf.constant(self.src_eos_id, dtype=tf.int64)
tgt_eos_id = tf.constant(self.tgt_eos_id, dtype=tf.int64)
self.vocab_tables = self._create_vocab_tables(self.vocab_files, self.share_vocab)
in_file = tf.placeholder(tf.string, shape=())
in_dataset = self._prepare_data(tf.contrib.data.TextLineDataset(in_file).repeat(None))
out_file = tf.placeholder(tf.string, shape=())
out_dataset = self._prepare_data(tf.contrib.data.TextLineDataset(out_file).repeat(None))
dataset = tf.contrib.data.Dataset.zip((in_dataset, out_dataset))
dataset = self._batch_data(dataset, src_eos_id, tgt_eos_id)
iterator = dataset.make_initializable_iterator()
next_example, next_label = iterator.get_next()
self._prepare_iterator_hook(hook, scope_name, iterator, file_path, (in_file, out_file))
return next_example, next_label
return (input_fn, hook)
def _training_input_hook(self, file_path):
input_fn, hook = self._set_up_train_or_eval('train_inputs', file_path)
return (input_fn, hook)
def _validation_input_hook(self, file_path):
input_fn, hook = self._set_up_train_or_eval('eval_inputs', file_path)
return (input_fn, hook)
def _infer_input_hook(self, file_path, num_infer):
hook = IteratorInitializerHook()
def input_fn():
with tf.name_scope('infer_inputs'):
with tf.name_scope('sentence_markers'):
src_eos_id = tf.constant(self.src_eos_id, dtype=tf.int64)
self.vocab_tables = self._create_vocab_tables(self.vocab_files, self.share_vocab)
infer_file = tf.placeholder(tf.string, shape=())
dataset = tf.contrib.data.TextLineDataset(infer_file)
dataset = self._prepare_data(dataset)
dataset = self._batch_infer_data(dataset, src_eos_id)
iterator = dataset.make_initializable_iterator()
next_example, seq_len = iterator.get_next()
self._prepare_iterator_hook(hook, 'infer_inputs', iterator, file_path, infer_file)
return ((next_example, seq_len), None)
return (input_fn, hook)
And here is my model:
class Seq2Seq():
def __init__(
self, batch_size, inputs,
outputs, inp_vocab_size, tgt_vocab_size,
embed_dim, mode, time_major=False,
enc_embedding=None, dec_embedding=None, average_across_batch=True,
average_across_timesteps=True, vocab_path=None, embedding_path='./data_files/wiki.simple.vec'
):
embed_np = self._get_embedding(embedding_path)
if not enc_embedding:
self.enc_embedding = tf.contrib.layers.embed_sequence(
inputs,
inp_vocab_size,
embed_dim,
trainable=True,
scope='embed',
initializer=tf.constant_initializer(value=embed_np, dtype=tf.float32)
)
else:
self.enc_embedding = enc_embedding
if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
if not dec_embedding:
embed_outputs = tf.contrib.layers.embed_sequence(
outputs,
tgt_vocab_size,
embed_dim,
trainable=True,
scope='embed',
reuse=True
)
with tf.variable_scope('embed', reuse=True):
dec_embedding = tf.get_variable('embeddings')
self.embed_outputs = embed_outputs
self.dec_embedding = dec_embedding
else:
self.dec_embedding = dec_embedding
else:
with tf.variable_scope('embed', reuse=True):
self.dec_embedding = tf.get_variable('embeddings')
if mode == tf.estimator.ModeKeys.PREDICT and vocab_path is None:
raise ValueError('If mode is predict, must supply vocab_path')
self.vocab_path = vocab_path
self.inp_vocab_size = inp_vocab_size
self.tgt_vocab_size = tgt_vocab_size
self.average_across_batch = average_across_batch
self.average_across_timesteps = average_across_timesteps
self.time_major = time_major
self.batch_size = batch_size
self.mode = mode
def _get_embedding(self, embedding_path):
model = KeyedVectors.load_word2vec_format(embedding_path)
vocab = model.vocab
vocab_len = len(vocab)
return np.array([model.word_vec(k) for k in vocab.keys()])
def _get_lstm(self, num_units):
return tf.nn.rnn_cell.BasicLSTMCell(num_units)
def encode(self, num_units, num_layers, seq_len, cell_fw=None, cell_bw=None):
if cell_fw and cell_bw:
fw_cell = cell_fw
bw_cell = cell_bw
else:
fw_cell = self._get_lstm(num_units)
bw_cell = self._get_lstm(num_units)
encoder_outputs, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
fw_cell,
bw_cell,
self.enc_embedding,
sequence_length=seq_len,
time_major=self.time_major,
dtype=tf.float32
)
c_state = tf.concat([bi_encoder_state[0].c, bi_encoder_state[1].c], axis=1)
h_state = tf.concat([bi_encoder_state[0].h, bi_encoder_state[1].h], axis=1)
encoder_state = tf.contrib.rnn.LSTMStateTuple(c=c_state, h=h_state)
return tf.concat(encoder_outputs, -1), encoder_state
def _train_decoder(self, decoder_cell, out_seq_len, encoder_state, helper):
if not helper:
helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
self.embed_outputs,
out_seq_len,
self.dec_embedding,
0.3,
)
# helper = tf.contrib.seq2seq.TrainingHelper(
# self.dec_embedding,
# out_seq_len,
# )
projection_layer = layers_core.Dense(self.tgt_vocab_size, use_bias=False)
decoder = tf.contrib.seq2seq.BasicDecoder(
decoder_cell,
helper,
encoder_state,
output_layer=projection_layer
)
return decoder
def _predict_decoder(self, cell, encoder_state, beam_width, length_penalty_weight):
tiled_encoder_state = tf.contrib.seq2seq.tile_batch(
encoder_state, multiplier=beam_width
)
with tf.name_scope('sentence_markers'):
sos_id = tf.constant(1, dtype=tf.int32)
eos_id = tf.constant(2, dtype=tf.int32)
start_tokens = tf.fill([self.batch_size], sos_id)
end_token = eos_id
projection_layer = layers_core.Dense(self.tgt_vocab_size, use_bias=False)
emb = tf.squeeze(self.dec_embedding)
decoder = tf.contrib.seq2seq.BeamSearchDecoder(
cell=cell,
embedding=self.dec_embedding,
start_tokens=start_tokens,
end_token=end_token,
initial_state=tiled_encoder_state,
beam_width=beam_width,
output_layer=projection_layer,
length_penalty_weight=length_penalty_weight
)
return decoder
def decode(
self, num_units, out_seq_len,
encoder_state, cell=None, helper=None,
beam_width=None, length_penalty_weight=None
):
with tf.name_scope('Decode'):
if cell:
decoder_cell = cell
else:
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(2*num_units)
if self.mode != estimator.ModeKeys.PREDICT:
decoder = self._train_decoder(decoder_cell, out_seq_len, encoder_state, helper)
else:
decoder = self._predict_decoder(decoder_cell, encoder_state, beam_width, length_penalty_weight)
outputs = tf.contrib.seq2seq.dynamic_decode(
decoder,
maximum_iterations=20,
swap_memory=True,
)
outputs = outputs[0]
if self.mode != estimator.ModeKeys.PREDICT:
return outputs.rnn_output, outputs.sample_id
else:
return outputs.beam_search_decoder_output, outputs.predicted_ids
def prepare_predict(self, sample_id):
rev_table = lookup_ops.index_to_string_table_from_file(
self.vocab_path, default_value=UNK)
predictions = rev_table.lookup(tf.to_int64(sample_id))
return tf.estimator.EstimatorSpec(
predictions=predictions,
mode=tf.estimator.ModeKeys.PREDICT
)
def prepare_train_eval(
self, t_out,
out_seq_len, labels, lr,
train_op=None, loss=None
):
if not loss:
weights = tf.sequence_mask(
out_seq_len,
dtype=t_out.dtype
)
loss = tf.contrib.seq2seq.sequence_loss(
t_out,
labels,
weights,
average_across_batch=self.average_across_batch,
)
if not train_op:
train_op = tf.contrib.layers.optimize_loss(
loss,
tf.train.get_global_step(),
optimizer='SGD',
learning_rate=lr,
summaries=['loss', 'learning_rate']
)
return tf.estimator.EstimatorSpec(
mode=self.mode,
loss=loss,
train_op=train_op,
)
This type of repetition is called a "text degeneration".
There is a great paper from 2019 which analyse this phenomenon: The Curious Case of Neural Text Degeneration by Ari Holtzman et al. from the Allen Institute for Artificial Intelligence.
The repetition may come from the type of text search (text sampling) on the decoder site. Many people implement this just by the most probable next world proposed by the model (argmax on the softmax on the last layer) or by so called beam search. In fact the beam search is the industry standard for today.
This is the example of Beam search from the article:
Continuation (BeamSearch, b=10):
"The unicorns were able to communicate with each other, they said unicorns. a statement that the unicorns. Professor of the Department of Los Angeles, the most important place the world to be recognition of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the world to be a of the…
As you can see there is a great amount of repetition.
According to the paper this curious case may be explained by the fact that each repeated sequence of words have higher probability than the sequence without the next repetition:
The article propose some workarounds with words sampling by the decoder. It definitely requires more study, but this is the best explanation we have today.
The other is that your model need still more training. In many cases I faced a similar behaviour when I had big training set and model still couldn't generalise well over whole diversity of the data. To test this hypothesis - try to train on smaller dataset and see if it generalise (produce meaningful results).
But even if your model generalise well enough, that doesn't mean you won't ever face the repetition pattern. Unless you change the sampling patter of the decoder, it is a common scenario.
If you train on a small data then try to decrease the number of parameters, f. e. number of neurons in each layer.
For me, when the network outputs one word all the time, significant decrease of learning rate helps.

Resources