DesicionTreeClassifier: Input contains NaN, infinity or a value too large for dtype('float32') - machine-learning

clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, error_score='raise')
print(score)
after run this code I have error:
ValueError: Input contains NaN, infinity or a value too large for
dtype('float32').
So how I can fix it?

Decision Trees doesn't accept NaN / infinity values.
Try doing (assuming that train_data is a Pandas DataFrame):
train_data.fillna(0, inplace = True)
This will replace all NaN values by 0.
If you don't want this, the only thing to do is to delete entries with NaN data :
train_data.dropna(inplace = True)
If this is not a DataFrame, try adding this line before the fillna method:
train_data = pd.DataFrame(train_data)

Related

MinMaxScaler for returning the normalized prediction values to the unnormalized values

I want to train a model and I use the MinMaxScaler to normalize my data. However, I dont know how to use it for return the prediction values to the real values.
Here is a simple example of my data.
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
data_t = np.random.randint(10, size = (6, 1000))
data_te = np.random.randint(10, size = (6, 100))
data_train = min_max_scaler.fit_transform(data_t)
data_test = min_max_scaler.fit_transform(data_te)
# prediction is output from the model and its size is 1*28.
prediction = np.random.randint(1, size = (1,28))
prediction_values = min_max_scaler.inverse_transform(prediction)
but I get this error:
ValueError: operands could not be broadcast together with shapes (1,28) (100,) (1,28)
Could you please help me with this? Thanks

ValueError: 'logits' and 'labels' must have the same shape for NLP sentiment multi-class classifier

I am trying to make a NLP multi-class sentiment classifier where it takes in sentences as input and classifies them into three classes (negative, neutral and positive). However, when training the model, I run into the error where my logits (None, 3) are not the same size as my labels (None, 1) and the model can't begin training.
My model is a multi-class classifier and not a multi-label classifier since it is only predicting one label per object. I made sure that my last layer had an output of 3 and had the activation = 'softmax'. This should be correct from what I have searched online so I think that the problem lies with my labels.
Currently, my labels have a dimension of (None, 1) since I mapped each class to a unique integer and passed this as my test and train y values (which are in the form of one dimensional numpy array.
Right now I am confused if I have change the dimensions of this array to match the output dimensions and how to go about doing it.
import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
print("Found GPU at: {}".format(device_name))
else:
device_name = "/device:CPU:0"
print("No GPU, using {}.".format(device_name))
# Load dataset into a dataframe
train_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/train.csv"
test_data_path = "/content/drive/MyDrive/ML Datasets/tweet_sentiment_analysis/test.csv"
train_df = pd.read_csv(train_data_path, encoding='unicode_escape')
test_df = pd.read_csv(test_data_path, encoding='unicode_escape').dropna()
sentiment_types = ('neutral', 'negative', 'positive')
train_df['sentiment'] = train_df['sentiment'].astype('category')
test_df['sentiment'] = test_df['sentiment'].astype('category')
train_df['sentiment_cat'] = train_df['sentiment'].cat.codes
test_df['sentiment_cat'] = test_df['sentiment'].cat.codes
train_y = np.array(train_df['sentiment_cat'])
test_y = np.array(test_df['sentiment_cat'])
# Function to convert df into a list of strings
def convert_to_list(df, x):
selected_text_list = []
labels = []
for index, row in df.iterrows():
selected_text_list.append(str(row[x]))
labels.append(str(row['sentiment']))
return np.array(selected_text_list), np.array(labels)
train_sentences, train_labels = convert_to_list(train_df, 'selected_text')
test_sentences, test_labels = convert_to_list(test_df, 'text')
# Instantiate tokenizer and create word_index
tokenizer = Tokenizer(num_words=1000, oov_token='<oov>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# Convert sentences into a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
test_sequence = tokenizer.texts_to_sequences(test_sentences)
# Padding sequences
pad_test_seq = pad_sequences(test_sequence, padding='post')
max_len = pad_test_seq[0].size
pad_train_seq = pad_sequences(train_sequence, padding='post', maxlen=max_len)
model = tf.keras.Sequential([
tf.keras.layers.Embedding(10000, 64, input_length=max_len),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(3, activation='softmax')
])
with tf.device(device_name):
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 10
with tf.device(device_name):
history = model.fit(pad_train_seq, train_y, epochs=num_epochs, validation_data=(pad_test_seq, test_y), verbose=2)
Here is the error:
ValueError Traceback (most recent call last)
<ipython-input-28-62f3c6445887> in <module>
2
3 with tf.device(device_name):
----> 4 history = model.fit(pad_train_seq, train_y, epochs=num_epochs, validation_data=(pad_test_seq, test_y), verbose=2)
1 frames
/usr/local/lib/python3.8/dist-packages/keras/engine/training.py in tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
ValueError: in user code:
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1051, in train_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1040, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1030, in run_step **
outputs = model.train_step(data)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 890, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 948, in compute_loss
return self.compiled_loss(
File "/usr/local/lib/python3.8/dist-packages/keras/engine/compile_utils.py", line 201, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 139, in __call__
losses = call_fn(y_true, y_pred)
File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 243, in call **
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/losses.py", line 1930, in binary_crossentropy
backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
File "/usr/local/lib/python3.8/dist-packages/keras/backend.py", line 5283, in binary_crossentropy
return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
ValueError: `logits` and `labels` must have the same shape, received ((None, 3) vs (None, 1)).
my logits (None, 3) are not the same size as my labels (None, 1)
I made sure that my last layer had an output of 3 and had the activation = 'softmax'
my labels have a dimension of (None, 1) since I mapped each class to a unique integer
The key concept you are missing is that you need to one-hot encode your labels (after assigning integers to them - see below).
So your model, after the softmax, is spitting out three values: how probable each of your labels is. E.g. it might say A is 0.6, B is 0.1, and C is 0.3. If the correct answer is C, then it needs to see that correct answer as 0, 0, 1. It can then say that its prediction for A is 0.6 - 0 = +0.6 wrong, B is 0.1 - 0 = +0.1 wrong, and C is 0.3 - 1 = -0.7 wrong.
Theoretically you can go from a string label directly to a one-hot encoding. But it seems Tensorflow needs the labels to first be encoded as integers, and then that is one-hot encoded.
https://www.tensorflow.org/api_docs/python/tf/keras/layers/CategoryEncoding#examples says to use:
tf.keras.layers.CategoryEncoding(num_tokens=3, output_mode="one_hot")
Also see https://stackoverflow.com/a/69791457/841830 (the higher-voted answer there is from 2019, so applies to TensorFlow v1 I think). And searching for "tensorflow one-hot encoding" will bring up plenty of tutorials and examples.
The issue here was indeed due to the shape of my labels not being the same as logits. Logits were of shape (3) since they contained a float for the probability of each of the three classes that I wanted to predict. Labels were originally of shape (1) since it only contained one int.
To solve this, I used one-hot encoding which turned all labels into a shape of (3) and this solved the problem. Used the keras.utils.to_categorical() function to do so.
sentiment_types = ('negative', 'neutral', 'positive')
train_df['sentiment'] = train_df['sentiment'].astype('category')
test_df['sentiment'] = test_df['sentiment'].astype('category')
# Turning labels from strings to int
train_sentiment_cat = train_df['sentiment'].cat.codes
test_sentiment_cat = test_df['sentiment'].cat.codes
# One-hot encoding
train_y = to_categorical(train_sentiment_cat)
test_y = to_categorical(test_sentiment_cat)

ValueError: y_true takes value in {'True', 'False'} and pos_label is not specified in ROC_curve

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=2)
# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(y_test))]
# fit a model
model = KNeighborsClassifier(n_neighbors = 3)
model.fit(x_train, y_train)
# predict probabilities
lr_probs = model.predict_proba(x_test)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs) <-- Error Occurred
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
...
I'm trying to use the ROC curve in the KNN algorithm.
ValueError: y_true takes value in {'True', 'False'} and pos_label is not specified:
either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly
However, as you can see above, an error occurred.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(data.Malware)
data['TrueorFalse'] = encoder.transform(data['TrueorFalse'])
data.value_counts(data['TrueorFalse'].values, sort=False)
data.head()
So to solve this problem, I thought the labels I wrote "True" and "False" were problematic because they were strings. Therefore, the above code was applied to switch True or Flase to 0 and 1, respectively, but errors still occur. I'm using True and False as labels in the TrueorFalse column. Is there anything I'm missing?
Instead of passing 2 arguments in your function, pass an additional parameter stating the pos_label just like the error states while elaborating.
In your case, instead of passing the arguments in function like:
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
try like:
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs, pos_label=1)
for both of the curves. Hence the suggested modification will be:
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs, pos_label=1)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs, pos_label=1)
Hope this might resolve your error completely!
y_test = y_test.map({'True': 1, 'False': 0}).astype(int)
Adding this code helped me to solve my problem.

Layer concatenation with Keras

I'm planning to have the following design:
However my code doesn't seem working:
import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Concatenate
from keras import optimizers
trainX1 = np.array([[1,2],[3,4],[5,6],[7,8]]) # fake training data
trainY1 = np.array([[1],[2],[3],[4]]) # fake label
trainX2 = np.array([[2,3],[4,5],[6,7]])
trainY2 = np.array([[1],[2],[3]])
trainX3 = np.array([[0,1],[2,3]])
trainY3 = np.array([[1],[2]])
numFeatures = 2
trainXList = [trainX1, trainX2, trainX3]
trainYStack = np.vstack((trainY1,trainY2,trainY3))
inputList = []
modelList = []
for i,_ in enumerate(trainXList):
tempInput= Input(shape = (numFeatures,))
m = Dense(10, activation='tanh')(tempInput)
inputList.append(tempInput)
modelList.append(m)
mAll = Concatenate()(modelList)
out = Dense(1, activation='tanh')(mAll)
model = Model(inputs=inputList, outputs=out)
rmsp = optimizers.rmsprop(lr=0.00001)
model.compile(optimizer=rmsp,loss='mse', dropout = 0.1)
model.fit(trainXList, trainYStack, epochs = 1, verbose=0)
The error message says that my input data sets are not having the same shape, but after I padded my training set to make number of samples = 4 for all 3 sets, I still get errors saying dimension is not right. May I know how I can design this network properly? Thanks!
p.s. Here is the error message before padding:
ValueError: All input arrays (x) should have the same number of samples. Got array shapes: [(4, 2), (3, 2), (2, 2)]
Here is the error message after padding (happens on the last line of code):
ValueError: Input arrays should have the same number of samples as target arrays. Found 4 input samples and 12 target samples.
Your input shape is wrong for the given input.
You assign the input a size of numFeatures, but actually you have 2-dimensional arrays and they are different (4,2)(3,2)(2,2). I am not sure about your problem, but number of samples and number of features seem to be reversed.
tempInput= Input(shape = (numFeatures,))
Furthermore your y is also weird. Usually you have X (number_of samples, num_features) and y with (number of samples, labels).
Use model.summary() to see how your network looks like.

Tensorflow KNN : How can we assign the K parameter for defining number of neighbors in KNN?

I have started working on a machine learning project using K-Nearest-Neighbors method on python tensorflow library. I have no experience working with tensorflow tools, so I found some code in github and modified it for my data.
My dataset is like this:
2,2,2,2,0,0,3
2,2,2,2,0,1,0
2,2,2,4,2,2,1
...
2,2,2,4,2,0,0
And this is the code which actually works fine:
import tensorflow as tf
import numpy as np
# Whole dataset => 1428 samples
dataset = 'car-eval-data-1.csv'
# samples for train, remaining for test
samples = 1300
reader = np.loadtxt(open(dataset, "rb"), delimiter=",", skiprows=1, dtype=np.int32)
train_x, train_y = reader[:samples,:5], reader[:samples,6]
test_x, test_y = reader[samples:, :5], reader[samples:, 6]
# Placeholder you can assign values in future. its kind of a variable
# v = ("variable type",[None,4]) -- you can have multidimensional values here
training_values = tf.placeholder("float",[None,len(train_x[0])])
test_values = tf.placeholder("float",[len(train_x[0])])
# MANHATTAN distance
distance = tf.abs(tf.reduce_sum(tf.square(tf.subtract(training_values,test_values)),reduction_indices=1))
prediction = tf.arg_min(distance, 0)
init = tf.global_variables_initializer()
accuracy = 0.0
with tf.Session() as sess:
sess.run(init)
# Looping through the test set to compare against the training set
for i in range (len(test_x)):
# Tensor flow method to get the prediction near to the test parameters in the training set.
index_in_trainingset = sess.run(prediction, feed_dict={training_values:train_x,test_values:test_x[i]})
print("Test %d, and the prediction is %s, the real value is %s"%(i,train_y[index_in_trainingset],test_y[i]))
if train_y[index_in_trainingset] == test_y[i]:
# if prediction is right so accuracy increases.
accuracy += 1. / len(test_x)
print('Accuracy -> ', accuracy * 100, ' %')
The only thing I do not understand is that if it's the KNN method so there has to be some K parameter which defines the number of neighbors for predicting the label for each test sample.
How can we assign the K parameter to tune the number of nearest neighbors for the code?
Is there any way to modify this code to make use of K parameter?
You're right that the example above does not have the provision to select K-Nearest neighbours. In the code below, I have added the ability to add such a parameter(knn_size) along with other corrections
import tensorflow as tf
import numpy as np
# Whole dataset => 1428 samples
dataset = 'PATH_TO_DATASET_CSV'
knn_size = 1
# samples for train, remaining for test
samples = 1300
reader = np.loadtxt(open(dataset, "rb"), delimiter=",", skiprows=1, dtype=np.int32)
train_x, train_y = reader[:samples,:6], reader[:samples,6]
test_x, test_y = reader[samples:, :6], reader[samples:, 6]
# Placeholder you can assign values in future. its kind of a variable
# v = ("variable type",[None,4]) -- you can have multidimensional values here
training_values = tf.placeholder("float",[None, len(train_x[0])])
test_values = tf.placeholder("float",[len(train_x[0])])
# MANHATTAN distance
distance = tf.abs(tf.reduce_sum(tf.square(tf.subtract(training_values,test_values)),reduction_indices=1))
# Here, we multiply the distance by -1 to reverse the magnitude of distances, i.e. the largest distance becomes the smallest distance
# tf.nn.top_k returns the top k values and their indices, here k is controlled by the parameter knn_size
k_nearest_neighbour_values, k_nearest_neighbour_indices = tf.nn.top_k(tf.scalar_mul(-1,distance),k=knn_size)
#Based on the indices we obtain from the previous step, we locate the exact class label set of the k closest matches in the training data
best_training_labels = tf.gather(train_y,k_nearest_neighbour_indices)
if knn_size==1:
prediction = tf.squeeze(best_training_labels)
else:
# Now we make our prediction based on the class label that appears most frequently
# tf.unique_with_counts() gives us all unique values that appear in a 1-D tensor along with their indices and counts
values, indices, counts = tf.unique_with_counts(best_training_labels)
# This gives us the index of the class label that has repeated the most
max_count_index = tf.argmax(counts,0)
#Retrieve the required class label
prediction = tf.gather(values,max_count_index)
init = tf.global_variables_initializer()
accuracy = 0.0
with tf.Session() as sess:
sess.run(init)
# Looping through the test set to compare against the training set
for i in range (len(test_x)):
# Tensor flow method to get the prediction near to the test parameters in the training set.
prediction_value = sess.run([prediction], feed_dict={training_values:train_x,test_values:test_x[i]})
print("Test %d, and the prediction is %s, the real value is %s"%(i,prediction_value[0],test_y[i]))
if prediction_value[0] == test_y[i]:
# if prediction is right so accuracy increases.
accuracy += 1. / len(test_x)
print('Accuracy -> ', accuracy * 100, ' %')

Resources