Keras saved model predicting different values on different session - machine-learning

I haved trained a named entity recognition model, after saving it and loading it back it is giving correct prediction on the same IPython session, but whenever I close the session and open it again, the loaded model prediction randomly. Can you help me with that?
I have saved the model in hdf5 format using:
Model.save("filename")
And I am loading it using:
Model.load_model("filename")
here is my full code
import pandas as pd
import numpy as np
import os
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input,load_model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout,
Bidirectional
from nltk import pos_tag, word_tokenize,sent_tokenize
data = pd.read_csv("E:\ml tut\entity recognition\exdataset.csv",
encoding="latin1")
data = data.fillna(method="ffill")
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags
class SentenceGetter(object):
def __init__(self, data):
self.n_sent = 1
self.data = data
self.empty = False
agg_func = lambda s: [((w, p), t) for w, p, t in
zip(s["Word"].values.tolist(),s["POS"].values.tolist(),
s["Tag"].values.tolist())]
self.grouped = self.data.groupby("Sentence #").apply(agg_func)
self.sentences = [s for s in self.grouped]
def get_next(self):
try:
s = self.grouped["Sentence: {}".format(self.n_sent)]
self.n_sent += 1
return s
except:
return None
getter = SentenceGetter(data)
sent = getter.get_next()
print(sent)
sentences = getter.sentences
max_len = 50
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)
(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True,
recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
if os.path.exists('my_model.h5'):
print("loading model")
model = load_model('my_model.h5')
else:
print("training model")
X = [[word2idx[w[0][0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post",
value=n_words - 1)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
model = Model(input, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
metrics=["accuracy"])
model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5,
validation_split=0.1, verbose=1)
model.save('my_model.h5')
my_input="Albert Einstein is a great guy,he lives in berlin, Germany."
print("--------------")
test_sentence = word_tokenize(my_input)
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in
test_sentence]],padding="post", value=0, maxlen=max_len)
i = 0
p = model.predict(np.array([x_test_sent[i]]))
p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
if w != 0:
print("{:15}: {}".format(w, tags[pred]))

please save your tags (tags = list(set(data["Tag"].values))) in pickle while generating your model.. This is will solve your problem.
There fore you need to save the following:
1.tags
2.model
3.word2idx

import pandas as pd
import numpy as np
import os
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input,load_model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout,
Bidirectional
from nltk import pos_tag, word_tokenize,sent_tokenize
data = pd.read_csv("E:\ml tut\entity recognition\exdataset.csv",
encoding="latin1")
data = data.fillna(method="ffill")
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words
tags = list(set(data["Tag"].values))
save your tags in pickle or any other format
n_tags = len(tags); n_tags
class SentenceGetter(object):
def __init__(self, data):
self.n_sent = 1
self.data = data
self.empty = False
agg_func = lambda s: [((w, p), t) for w, p, t in
zip(s["Word"].values.tolist(),s["POS"].values.tolist(),
s["Tag"].values.tolist())]
self.grouped = self.data.groupby("Sentence #").apply(agg_func)
self.sentences = [s for s in self.grouped]
def get_next(self):
try:
s = self.grouped["Sentence: {}".format(self.n_sent)]
self.n_sent += 1
return s
except:
return None
getter = SentenceGetter(data)
sent = getter.get_next()
print(sent)
sentences = getter.sentences
max_len = 50
word2idx = {w: i for i, w in enumerate(words)}
save your word2idx in pickle or any other format
tag2idx = {t: i for i, t in enumerate(tags)}
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)
(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True,
recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
if os.path.exists('my_model.h5'):
print("loading model")
model = load_model('my_model.h5')
else:
print("training model")
X = [[word2idx[w[0][0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post",
value=n_words - 1)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
model = Model(input, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
metrics=["accuracy"])
model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5,
validation_split=0.1, verbose=1)
model.save('my_model.h5')
my_input="Albert Einstein is a great guy,he lives in berlin, Germany."
print("--------------")
test_sentence = word_tokenize(my_input)
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in
test_sentence]],padding="post", value=0, maxlen=max_len)
i = 0
p = model.predict(np.array([x_test_sent[i]]))
p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
if w != 0:
print("{:15}: {}".format(w, tags[pred]))

Related

How to deal with big dataset when using pyG?

I am a beginer learning to using torch_geometric to build my GNN models. I refered the sample of the pyG example of node classification and build my own dataset, however, I tried to use my GPU to run the code and it tells me that it run out of memory, maybe my dataset is too large to allocate the GPU memory? I don't know. I shared an machine of 8 A100 with my classmates. Could you please give me some suggestions, thank you!
from torch_geometric.nn import GATConv,GCNConv
from torch_geometric.data import Dataset,DataLoader,HeteroData,Data
import torch.nn as nn
from torch_geometric.nn import DataParallel
from torch_geometric.loader import DataListLoader
import torch.nn.functional as F
import torch
import pandas as pd
from transformers import BertTokenizer,BertModel
import pickle
import time
from tqdm import tqdm
from numba import jit
import json
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
plt.grid(True)
plt.grid(color='gray',
linestyle='--',
linewidth=1,
alpha=0.3)
begin = time.time()
punctuation = "!#$%&'\(\)-*+,-./:;<=>?#\\\[\]^_`{|}~():;,。【】·、“”‘’《》\"%……——·"
def dataCleanifier(s):
for i in punctuation:
s.replace(i," ")
s = s.replace(" "," ")
s = s.replace("\n","")
return s
class BertClassifier(nn.Module):
def __init__(self,bertType:str,max_length,tag_size):
super(BertClassifier,self).__init__()
self.bertType = bertType
self.tokenizer = BertTokenizer.from_pretrained(self.bertType)
self.encoder = BertModel.from_pretrained(self.bertType)
self.outputDim = self.encoder.pooler.dense.out_features
self.max_length = max_length
self.tag_size = tag_size
self.dropout = nn.Dropout(0.1)
self.activation = nn.LeakyReLU(0.1)
self.convs = nn.ModuleList(
[nn.Conv2d(1, 512, (k, self.outputDim)) for k in (2,3,4)])
self.fc_cnn = nn.Linear(512 * len((2,3,4)), self.tag_size)
def conv_and_pool(self, x, conv):
x = F.relu(conv(x)).squeeze(3)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
return x
def forward(self,x):
x = self.tokenizer.batch_encode_plus(x,return_tensors="pt",max_length=self.max_length,truncation=True,padding="max_length")
attention = x["attention_mask"]
x = x["input_ids"]
x = x.cuda(2)
x = self.encoder(x,attention_mask=attention.cuda(2))['last_hidden_state'][:]
x = x.unsqueeze(1)
encoded = torch.cat([self.conv_and_pool(x,conv) for conv in self.convs],1)
x = self.fc_cnn(encoded)
x = self.activation(x)
# x = F.softmax(x,dim=1)
return x,encoded
class ContrastiveLoss(nn.Module):
def __init__(self):
super(ContrastiveLoss, self).__init__()
def forward(self,representations,label,y_hat):
n = label.shape[0]
T = 0.5
similarity_matrix = F.cosine_similarity(representations.unsqueeze(1), representations.unsqueeze(0), dim=2)
mask = torch.ones_like(similarity_matrix) * (label.expand(n, n).eq(label.expand(n, n).t()))
mask_no_sim = torch.ones_like(mask) - mask
mask_dui_jiao_0 = torch.ones(n ,n) - torch.eye(n, n )
similarity_matrix = torch.exp(similarity_matrix/T)
similarity_matrix = similarity_matrix*mask_dui_jiao_0
sim = mask*similarity_matrix
no_sim = similarity_matrix - sim
no_sim_sum = torch.sum(no_sim , dim=1)
no_sim_sum_expend = no_sim_sum.repeat(n, 1).T
sim_sum = sim + no_sim_sum_expend
loss = torch.div(sim , sim_sum)
loss = mask_no_sim + loss + torch.eye(n, n )
#接下来就是算一个批次中的loss了
loss = -torch.log(loss) #求-log
loss = torch.sum(torch.sum(loss, dim=1) )/(2*n)+nn.CrossEntropyLoss()(y_hat,label)
return loss
class GAT(nn.Module):
def __init__(self, hidden_channels) -> None:
super().__init__()
self.conv1 = GATConv(data.num_features,hidden_channels)
self.conv2 = GATConv(hidden_channels,9)
self.activation = nn.ReLU()
def forward(self,x,edge_index):
x = self.conv1(x,edge_index)
x = self.activation(x)
# print(x)
# x = F.dropout(x,p=0.2)
x = self.conv2(x,edge_index)
return x
x=None
y=None
edge_index = None
train_mask = None
with open("X.pkl","rb") as f1:
x = pickle.load(f1)
with open("Y.pkl","rb") as f2:
y = pickle.load(f2)
y = y.long()
with open("edge_index.pkl","rb") as f3:
edge_index = pickle.load(f3)
# print(edge_index.shape)
with open("train_mask.pkl","rb") as f4:
train_mask = pickle.load(f4)
data = Data(x=x,y=y,edge_index=edge_index)
data.train_mask = train_mask
model = GAT(hidden_channels=32)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = lr_scheduler.StepLR(optimizer, 100, 0.8)
criterion = ContrastiveLoss()
def train():
model.train()
optimizer.zero_grad() # Clear gradients.
out = model(data.x,data.edge_index) # Perform a single forward pass.
loss = criterion(data.x[data.train_mask], data.y[data.train_mask],out[data.train_mask]) # Compute the loss solely based on the training nodes.
loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients.
return loss
def test():
model.eval()
out = model(data.x, data.edge_index)
pred = out.argmax(dim=1) # Use the class with highest probability.
test_correct = pred[data.train_mask] == data.y[data.train_mask] # Check against ground-truth labels.
test_acc = int(test_correct.sum()) / int(data.train_mask.sum()) # Derive ratio of correct predictions.
return test_acc
accs = []
for epoch in range(1, 1025):
loss = train()
print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}',end=" ")
acc = test()
print("acc:",acc)
accs.append(acc)
scheduler.step()
plt.plot(range(len(accs)),accs)
print(time.time()-begin)
with open("./accs_gat_GCL.pkl","wb") as f1:
pickle.dump(accs,f1)
plt.savefig("./res_GAT_GCL.png",dpi=600)
I have tried to use DataPararel to use multiple GPU to load my model and dataset but failed.

setting up hyper parameters for LSTM

This is our code can you please tell if LSTM can be used? and what how do we see if prediction is accurate as this code is predicting right the values of csv itself, but unsure about forecasting part. It is forecasting future but unreliably.
This is our data it has missing dates as well. The data ends at 1-Dec-2021
import pandas as pd
import flask
import numpy as np
import keras
import matplotlib.pyplot as plt
import tensorflow as tf
import plotly.graph_objects as go
from keras.preprocessing.sequence import TimeseriesGenerator
filename = "china cotton import concatinated.csv"
df = pd.read_csv(filename)
print(df.info())
df['date'] = pd.to_datetime(df['date'])
#df.set_index(df['date'], inplace=True,)
df.set_axis(df['date'], inplace=True)
df.drop(columns=['CottonChina importFC Index MUS Cents/Lb', 'CottonChina importFC Index LUS Cents/Lb', 'CottonChinadomestic3128BUSCents/Lb', 'CottonChina domestic2227BUS Cents/Lb','CottonChina domestic2129BUS Cents/Lb','CottonChina importUSD1 year = 100','CottonChina domesticUSD1 year = 100'], inplace=True)
close_data = df['CottonChina importFC Index SUS Cents/Lb'].values
close_data = close_data.reshape((-1,1))
split_percent = 0.80
split = int(split_percent*len(close_data))
close_train = close_data[:split]
close_test = close_data[split:]
date_train = df['date'][:split]
date_test = df['date'][split:]
print(len(close_train))
print(len(close_test))
look_back = 15
train_generator = TimeseriesGenerator(close_train, close_train, length=look_back, batch_size=20)
test_generator = TimeseriesGenerator(close_test, close_test, length=look_back, batch_size=1)
from keras.models import Sequential
from keras.layers import LSTM, Dense
model = Sequential()
model.add(
LSTM(10,
activation='relu', return_sequences=True,
input_shape=(look_back,1))
)
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
num_epochs = 25
prediction = model.predict_generator(test_generator)
close_train = close_train.reshape((-1))
close_test = close_test.reshape((-1))
prediction = prediction.reshape((-1))
"""trace1 = go.Scatter(
x = date_train,
y = close_train,
mode = 'lines',
name = 'Data'
)
trace2 = go.Scatter(
x = date_test,
y = prediction,
mode = 'lines',
name = 'Prediction'
)
trace3 = go.Scatter(
x = date_test,
y = close_test,
mode='lines',
name = 'Ground Truth'
)
layout = go.Layout(
title = "Google Stock",
xaxis = {'title' : "Date"},
yaxis = {'title' : "Close"}
) """
"""fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.show()"""
close_data = close_data.reshape((-1))
def predict(num_prediction, model):
prediction_list = close_data[-look_back:]
for _ in range(num_prediction):
x = prediction_list[-look_back:]
x = x.reshape((1, look_back, 1))
out = model.predict(x)[0][0]
prediction_list = np.append(prediction_list, out)
prediction_list = prediction_list[look_back-1:]
return prediction_list
def predict_dates(num_prediction):
last_date = df['date'].values[-1]
prediction_dates = pd.date_range(last_date, periods=num_prediction+1).tolist()
return prediction_dates
num_prediction = 30
forecast = predict(num_prediction, model)
forecast_dates = predict_dates(num_prediction)
trace1 = go.Scatter(
x = date_train,
y = close_train,
mode = 'lines',
name = 'Data'
)
trace2 = go.Scatter(
x = forecast_dates,
y = forecast,
mode = 'lines',
name = 'Prediction'
)
trace3 = go.Scatter(
x = date_test,
y = close_test,
mode='lines',
name = 'Ground Truth')
layout = go.Layout(
title = "Future Prediction",
xaxis = {'title' : "Date"},
yaxis = {'title' : "Close"}
)
fig = go.Figure(data=[trace1, trace2,trace3], layout=layout)
fig.write_html('first_figure.html',auto_open=True)
This is the graph plotted after ran the code. It has negative values of prices and prices are small as compare to test and train data.

How to solve Error when checking input: with incorrect size

This is my first attempt to solve task on the kaggle. This is page of the task - https://www.kaggle.com/c/bike-sharing-demand.
I wrote this code (I have some excess code lines, because I am not really sure what I need now):
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
train_data = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')
train_targets = train_data[['casual', 'registered', 'count']]
train_datetime_helper = train_data[['datetime']]
dt = pd.DatetimeIndex(train_data['datetime'])
train_data['day'] = dt.day
train_data['month'] = dt.month
train_data['year'] = dt.year
train_data['hour'] = dt.hour
train_data['dow'] = dt.dayofweek
train_data['woy'] = dt.weekofyear
train_data = train_data.drop(['casual', 'registered', 'count', 'datetime'], axis=1)
test_data = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')
test_datetime_helper = test_data[['datetime']]
dt = pd.DatetimeIndex(test_data['datetime'])
test_data['day'] = dt.day
test_data['month'] = dt.month
test_data['year'] = dt.year
test_data['hour'] = dt.hour
test_data['dow'] = dt.dayofweek
test_data['woy'] = dt.weekofyear
test_data = test_data.drop(['datetime'], axis=1)
mean = train_data.mean(axis=0)
train_data -= mean
std = train_data.std(axis=0)
train_data /= std
test_data -= mean
test_data /= std
from keras import models
from keras import layers
from keras.layers import Dense, Conv2D, Flatten
def build_model():
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(train_data.shape[1], train_targets.shape[1])))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
return model
k = 4
num_val_samples = len(train_data) // k
num_epochs = 100
all_scores = []
for i in range(k):
print('Processing fold #', i)
val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
partial_train_data = np.concatenate(
[train_data[:i * num_val_samples],
train_data[(i + 1) * num_val_samples:]], axis=0)
partial_train_targets = np.concatenate(
[train_targets[:i * num_val_samples],
train_targets[(i + 1) * num_val_samples:]], axis=0)
model = build_model()
model.fit(partial_train_data, partial_train_targets,
epochs=num_epochs, batch_size=1, verbose=0)
val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
all_scores.append(val_mae)
I have got this error. Can you explain how can I solve it enter image description here
you specified wrong the input dimension of your model. try to define your first layer in this way
model.add(layers.Dense(64, activation='relu', input_shape=(train_data.shape[1],)))

Error when checking input: expected embedding_1_input to have shape (4,) but got array with shape (1,)

I use pretrained embedding vectors for my keras model. Before I did it everything worked and now I get this error:
ValueError: Error when checking input: expected embedding_1_input to
have shape (4,) but got array with shape (1,)
Maybe somebody can help me, what I do wrong here. I am not sure if I did correct model.fit and model.evaluate. Maybe there is a problem?
import csv
import numpy as np
np.random.seed(42)
from keras.models import Sequential, Model
from keras.layers import *
from random import shuffle
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.callbacks import EarlyStopping
from itertools import groupby
from numpy import asarray
from numpy import zeros
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#function makes a list of antonyms and synonyms from the files
def preprocessing(filename):
list_words = []
with open(filename) as tsv:
for line in csv.reader(tsv, dialect="excel-tab"):
list_words.append([line[0], line[1]])
return list_words
#function make a list of not relevant pairs by mixing synonyms and
antonyms
def notrelevant(filename, filename2):
list_words = []
with open(filename) as tsv:
with open(filename2) as tsv2:
for lines in zip(csv.reader(tsv, dialect="excel-tab"),csv.reader(tsv2, dialect="excel-tab")):
list_words.append([lines[0][0], lines[1][1]])
return list_words
antonyms_list = preprocessing("antonyms.tsv")
synonyms_list = preprocessing("synonyms.tsv")
notrelevant_list = notrelevant("antonyms.tsv", "synonyms.tsv")
# function combines all antonyms, synonyms in one list with labels,
shuffle them
def data_prepare(ant,syn,nrel):
data = []
for elem1,elem2 in ant:
data.append([[elem1,elem2], "Antonyms"])
for elem1, elem2 in syn:
data.append([[elem1, elem2], "Synonyms"])
for elem1, elem2 in nrel:
data.append([[elem1, elem2], "Not relevant"])
shuffle(data)
return data
data_with_labels_shuffled =
data_prepare(antonyms_list,synonyms_list,notrelevant_list)
def label_to_onehot(labels):
mapping = {label: i for i, label in enumerate(set(labels))}
one_hot = np.empty((len(labels), 3))
for i, label in enumerate(labels):
entry = [0] * len(mapping)
entry[mapping[label]] = 1
one_hot[i] = entry
return (one_hot)
def words_to_ids(labels):
vocabulary = []
word_to_id = {}
ids = []
for word1,word2 in labels:
vocabulary.append(word1)
vocabulary.append(word2)
counter = 0
for word in vocabulary:
if word not in word_to_id:
word_to_id[word] = counter
counter += 1
for word1,word2 in labels:
ids.append([word_to_id [word1], word_to_id [word2]])
return (ids)
def split_data(datas):
data = np.array(datas)
X, y = data[:, 0], data[:, 1]
# split the data to get 60% train and 40% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
y_train = label_to_onehot(y_train)
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
y_dev = label_to_onehot(y_dev)
y_test = label_to_onehot(y_test)
return X_train, y_train, X_dev, y_dev, X_test, y_test
X_train, y_train, X_dev, y_dev, X_test, y_test = split_data(data_with_labels_shuffled)
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train)
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.50d.txt')
for line in f:
values = line.split()
word = values[0]
coefs = asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
VOCABSIZE = len(data_with_labels_shuffled)
EMBSIZE = 50
HIDDENSIZE = 50
KERNELSIZE = 5
MAXEPOCHS = 5
model = Sequential()
model.add(Embedding(vocab_size, 50, weights=[embedding_matrix],
input_length=4, trainable=False))
model.add(Dropout(0.25))
model.add(Bidirectional(GRU(units = HIDDENSIZE // 2)))
#model.add(Flatten())
model.add(Dense(units = 3, activation = "softmax"))
model.compile(loss='categorical_crossentropy', optimizer="adam",
metrics=['accuracy'])
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='min')
model.fit (X_train, y_train,
batch_size=64,
callbacks = [earlystop],
epochs=100,
validation_data=(X_dev, y_dev),
verbose=1)
scores = model.evaluate(X_test, y_testbatch_size=64)
print("Accuracy is: %.2f%%" %(scores[1] * 100))
I think the problem is that you should pass encoded_docs to your model.fit() function instead of X_train since encoded_docs contains the tokenization of your training data and X_train still only contains a list of words. Moreover, you have to make sure that the input_length parameter of your Embedding layer matches the length of these tokenized training examples that you have created in encoded_docs.

NLP model's accuracy stuck on 0.5098 while training

I built a two layered LSTM model(keras model) for a movie review dataset from kaggle : Dataset
While training the model, every epoch was giving the same accuracy of 0.5098.
Then I thought it might not be learning the long distance dependencies.Then instead of LSTM I used bidirectional LSTM. But, still model's accuracy while training was 0.5098 for every epoch. I trained the model for 8 hours/35 epochs on CPU. Then I stopped training.
Code:
import pandas as pd
from sentiment_utils import *
import keras
import keras.backend as k
import numpy as np
train_data = pd.read_table('train.tsv')
X_train = train_data.iloc[:,2]
Y_train = train_data.iloc[:,3]
from sklearn.preprocessing import OneHotEncoder
Y_train = Y_train.reshape(Y_train.shape[0],1)
ohe = OneHotEncoder(categorical_features=[0])
Y_train = ohe.fit_transform(Y_train).toarray()
maxLen = len(max(X_train, key=len).split())
words_to_index, index_to_words, word_to_vec_map = read_glove_vectors("glove/glove.6B.50d.txt")
m = X_train.shape[0]
def read_glove_vectors(path):
with open(path, encoding='utf8') as f:
words = set()
word_to_vec_map = {}
for line in f:
line = line.strip().split()
cur_word = line[0]
words.add(cur_word)
word_to_vec_map[cur_word] = np.array(line[1:], dtype=np.float64)
i = 1
words_to_index = {}
index_to_words = {}
for w in sorted(words):
words_to_index[w] = i
index_to_words[i] = w
i = i + 1
return words_to_index, index_to_words, word_to_vec_map
def sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys):
m = X_train.shape[0]
X_indices = np.zeros((m, maxLen))
for i in range(m):
if i in dash_index_list:
continue
sentance_words = X_train[i].lower().strip().split()
j = 0
for word in sentance_words:
if word in keys:
X_indices[i, j] = words_to_index[word]
j += 1
return X_indices
def pretrained_embedding_layer(word_to_vec_map, words_to_index):
emb_dim = word_to_vec_map['pen'].shape[0]
vocab_size = len(words_to_index) + 1
emb_matrix = np.zeros((vocab_size, emb_dim))
for word, index in words_to_index.items():
emb_matrix[index, :] = word_to_vec_map[word]
emb_layer= keras.layers.embeddings.Embedding(vocab_size, emb_dim, trainable= False)
emb_layer.build((None,))
emb_layer.set_weights([emb_matrix])
return emb_layer
def get_model(input_shape, word_to_vec_map, words_to_index):
sentance_indices = keras.layers.Input(shape = input_shape, dtype='int32')
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
embeddings = embedding_layer(sentance_indices)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(embeddings)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False))(X)
X = keras.layers.Dropout(0.5)(X)
X = keras.layers.Dense(5)(X)
X = keras.layers.Activation('softmax')(X)
model = keras.models.Model(sentance_indices, X)
return model
model = get_model((maxLen,), word_to_vec_map,words_to_index)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
dash_index_list = []
for i in range(m):
if '-' in X_train[i]:
dash_index_list.append(i)
keys = []
for key in word_to_vec_map.keys():
keys.append(key)
X_train_indices = sentance_to_indices(X_train, words_to_index, maxLen, dash_index_list, keys)
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32, shuffle=True)
I think the way you defined the model architecture doesn't make sense! Try looking at this example on IMDB movie reviews with LSTM on Keras github repo: Trains an LSTM model on the IMDB sentiment classification task.

Resources