Maybe Overfitting trying to implement rnn with keras - machine-learning

I am trying to create a RNN for sentiment classification (-1(neg) or 1(pos)).My dataset file looks like this:
text,polarity
"this is a line",1
"this is asecond line",-1
.....
etc.
with 2603 rows like these.
This is what i got so far and it reaches maximum accuracy very fast, so i assume it is wrong. What am i doing wrong though?
Code:
from __future__ import print_function
from keras.preprocessing import sequence
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Dropout
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM
from keras.datasets import imdb
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split
from collections import Counter
batch_size = 32
num_words=2000
epochs=15
print('Loading data...')
data = pd.read_csv("yolo.csv",header=0,encoding = 'UTF-8')
X = data['text']
Y = data['polarity']
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2) #split train/test data
tokenizer = Tokenizer(num_words=num_words, lower=False, split=' ')
tokenizer.fit_on_texts(x_train)
x_train= tokenizer.texts_to_sequences(x_train)
x_test= tokenizer.texts_to_sequences(x_test)
x_train= tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test= tokenizer.sequences_to_matrix(x_test, mode='binary')
y_train = tf.keras.utils.to_categorical(y_train,2)
y_test = tf.keras.utils.to_categorical(y_test,2)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Build model...')
model = Sequential()
model.add(Dense(512, input_shape=(num_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print (model.metrics_names)
history= model.fit (x_train,y_train, batch_size=batch_size, epochs=epochs, verbose=1)
score= model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print ('Test loss: ()'.format(score[0]))
print ('Test accuracy:{}'.format(score[1]))

Related

LSTM multiclass classifier : why my predictions have nothing to do with my target set?

I am trying to design a LSTM model for forecasting price movement.
I have issues regarding the results I obtain for my predictions. I did not normalize my target set y (nor train nor test), only X because it's a classification (-1,0,1) but the predictions I obtain are float.
Maybe I did not normalize the righ sets. My code is below :
Many thanks for you help and feel free to add comments other my other lines of code too I am a beginner.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime as dt
from pandas_datareader import data as pdr
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM
startdate=dt(2018,3,31)
enddate=dt(2022,3,31)
tickers = ['ETH-USD']
Data=pdr.get_data_yahoo(tickers,start=startdate, end=enddate)['Adj Close']
df_change = Data.apply(lambda x: np.log(x) - np.log(x.shift(1)))
df_change.drop(index=df_change.index[0], axis=0, inplace=True)
df_change = df_change*100
pd.options.mode.chained_assignment = None #to not display the error of copy dataframe
df_y = df_change.copy()
df_y.columns = ['ETH-y']
def Target(df,column,df2,column2):
for i in range(len(df)):
if df[column].iloc[i] > 0:
df2[column2][i] = 1 #value is up par rapport au jour d'avant
elif -0.5 < df[column].iloc[i] < 0.5 :
df2[column2][i] = 0 #value is steady
else:
df2[column2][i] = -1 #value is down
Target(df_change,'ETH-USD',df_y,'ETH-y')
print(df_y['ETH-y'].value_counts())
Data.drop(index=Data.index[0], axis=0, inplace=True) #drop first row to have same values
X = Data
y = df_y
## split my train val and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify = y)
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler().fit(X_train)
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
#reshaping for 3D array
X_train = np.reshape(X_train,(1169,1,1))
X_test = np.reshape(X_test,(293,1,1))
from keras.models import Sequential
from keras.layers import Dense, LSTM
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1]))
model.compile(optimizer='adam', loss='mse')
model.summary()
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.1, verbose=1)
pred = model.predict(X_test)
pred = sc.inverse_transform(pred)
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()

How to draw ROC and PR curves for a classification model trained with KFold CV

How should I draw the ROC and PR curves for this NN model which I am training with 10 fold cross-validation?
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from numpy import loadtxt
import numpy as np
import pandas as pd
from google.colab import files
uploaded = files.upload()
dataset = loadtxt('mod_dfn.csv', delimiter=',')
X = dataset[:,0:25]
y = dataset[:,25]
kfold = KFold(n_splits=10, shuffle=True)
fold_no = 1
for train, test in kfold.split(X, y):
model = Sequential()
model.add(Dense(12, input_dim=25, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('------------------------------------------------------------------------')
print(f'Training for fold {fold_no} ...')
history = model.fit(X[train], y[train], batch_size=10, epochs=150, verbose=0)
scores = model.evaluate(X[test], y[test], verbose=0)
print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
acc_per_fold.append(scores[1] * 100)
loss_per_fold.append(scores[0])
fold_no = fold_no + 1
You can use RocCurveDisplay and PrecisionRecallDisplay for this purpose.
Try adding these lines just before last line of your code fold_no = fold_no + 1, and see if it works for you.
pred = model.predict(X[test])
# ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y[test], pred)
roc_auc = metrics.auc(fpr, tpr)
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
roc_display.plot()
roc_display.figure_.savefig(f'./out/ROC_curve_for_fold#{fold_no}.jpeg')
# PR Curve
precision, recall, _ = metrics.precision_recall_curve(y[test], pred)
pr_display = metrics.PrecisionRecallDisplay(precision=precision, recall=recall)
pr_display.plot()
pr_display.figure_.savefig(f'./out/PR_curve_for_fold#{fold_no}.jpeg')

Plot Confusion Matrix from CNN Model

This original work is presented here
How to go about plotting the confusion matrix based of a CNN model?
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics
##Need to put this block of code in for cuDNN to initialize properly
import tensorflow as tf
config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.8)
# device_count = {'GPU': 1}
)
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)
#------------------------------------------------------------------------------------------------------------------
num_rows = 40
num_columns = 174
num_channels = 1
x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)
num_labels = yy.shape[1]
filter_size = 2
# Construct model
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())
model.add(Dense(num_labels, activation='softmax'))
then trained as:
from keras.callbacks import ModelCheckpoint
from datetime import datetime
#num_epochs = 12
#num_batch_size = 128
num_epochs = 72
num_batch_size = 256
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5',
verbose=1, save_best_only=True)
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)
I have been trying a few things, one of which is:
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 4))
plot_confusion_matrix=(model(),x_test, y_test)
plt.plot(plot_confusion_matrix)
but I cannot get the confusion matrix to plot.
I also looked at tf.math.confusion_matrix(), but what is the labels and predictions as defined from the CNN model above??
The confusion matrix is a multi-classification.
Is
y_true = np.argmax(y_test, 1)??
and
y_pred = model.predict_classes(x_test)??
labels: 1-D Tensor of real labels for the classification task.
predictions: 1-D Tensor of predictions for a given classification.
As they say in official documentation , labels are the names of Output classes and predictions, However as they say everything has to be 1D tensor it means labels will be Ground truth for one instance and the corresponding indexed value in the Predictions will hold its predicted value.
So what you can do is, get the predictions and labels for each instances,in your code,you have passed the x_test and y_test which arent the supposed to be passed elements.
instead use model.predict to get the output labels.
y_predict=model.predict(x_test)
y_true=y_test
res = tf.math.confusion_matrix(y_true,y_predict)
This res is a 2D matrix now to print it you need to
plot_confusion_matrix(classifier, X_test, y_test,
display_labels=class_names,
cmap=plt.cm.Blues,
normalize=normalize)
Here put classifer = "model",not functional model().
Hope this helps,here are some more resources.
Here You can see the multiclass classification Confusion matrix technique.
Multiclass plot github function
Another custom plot function

How to predict the outcome of a new patient using this SVC ML model trained on a dataset

Data description:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(0)
# Alternatively X and y can be obtained directly from the frame attribute:
X = df.drop('OUTCOME', axis=1)
y = df['OUTCOME']
numeric_features = ['Age', 'PCT' , 'CURB 65' , 'pO2' ]
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
categorical_features = ['Sex', 'CXR' , 'Hospitalisation in last 3 months' , 'ICU>72hrs', 'Blood']
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
from sklearn import linear_model
from sklearn.svm import SVC
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', SVC())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Accuracy of the model is 1.0
How to enter new patient's parameter to get a prediction for that patient? New patient is not from this dataset.
i tried to enter new patient information as follows
New_patient = [70,1,0,2,2,10,40,2,20,68,1,2,0,1,1]
clf.predict([New_patient])
output: ValueError: Specifying the columns using strings is only supported for pandas DataFrames

ROC curve is not acutally a curve

I have plotted few ROC curve to calculate the AUC. I am having ROC curve is actually doesn't plots like a curve. I have attached the images for better understanding. If any one can tell me what is wrong in there. I will be obliged. This is one kind of plot I am getting
This is the another type
However I am not getting a curve like this one.
This is the link to my dataset
https://drive.google.com/open?id=1luj8d863_IOA36cQTo772GEWgUsrXlbJ
I will thankful if anyone can help me understand the problem if any or if my curves are correct then why it is not actually in a curve like structure
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from keras.layers import Dense, Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model,Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
seed = 7
np.random.seed(seed)
dataset = pd.read_csv('dataset/prostate.csv')
labels = dataset.values[:,-1]
features_set = dataset.iloc[:,0:12600]
oversampler = SMOTE(random_state=0)
oversampler_feature_set, oversampler_labels = oversampler.fit_sample(features_set,labels)
feature_df = pd.DataFrame(oversampler_feature_set)
labels_df = pd.DataFrame(oversampler_labels)
scalar = MinMaxScaler()
scaled_data = scalar.fit_transform(feature_df)
pca = PCA(n_components=30)
pca_data = pd.DataFrame(pca.fit_transform(scaled_data))
recreated_df = pd.concat([pca_data,labels_df], axis=1)
train, test = train_test_split(recreated_df,test_size=0.2)
X_train = train.values[:,0:30]
Y_train = train.values[:,-1]
X_test = test.values[:,0:30]
y_test = test.values[:,-1]
def my_model():
model = Sequential()
model.add(Dense(20, input_dim=30,activation='sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
return model
estimator = KerasClassifier(build_fn=my_model, epochs=1000, batch_size=10, shuffle=True,verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator,X_train,Y_train, cv=kfold)
results.mean()
estimator.fit(X_train,Y_train)
y_pred = estimator.predict(X_test).ravel()
sensitivity, specificity, thresholds_keras = roc_curve(y_test,y_pred,pos_label=2)
auc_keras = auc(sensitivity,specificity)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(sensitivity, specificity, label='Keras (area =:.3f})'.format(auc_keras))
plt.xlabel('Specificity')
plt.ylabel('Sensitivity')
plt.title('Prostate')
plt.legend(loc='best')
plt.show()

Resources