How can I add some noise to my SVM implementation? - machine-learning

my code run with no problems. what I was trying to do is add some noise to my dataset which I couldn't do .
so how can I do that ?
I tried many approach to do that :
-creating a noise function
-adding directly some noise to my dataset
But all of them didn't work.
All I am asking about is how can i do that to my code below ?
This is the code for testing and ploting ,it works just fine
But what I am trying is to add noise to my data set , and that what i couldn't do .
import numpy as np
from SVM_M_M import SVM
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Testing
if __name__ == "__main__":
X, y = datasets.make_blobs(
n_samples=250, n_features=2, centers=2, cluster_std=1.05, random_state=1
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=1)
clf = SVM(n_iters=1000)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
def accuracy(y_true, y_pred):
accuracy = np.sum(y_true==y_pred) / len(y_true)
return accuracy
print("SVM Accuracy: ", accuracy(y_test, predictions))
# plot results
def get_hyperplane(x, w, b, offset):
return (-w[0] * x - b + offset) / w[1]
fig, ax = plt.subplots(1, 1, figsize=(10,6))
plt.set_cmap('PiYG')
plt.scatter(X_train[:, 0], X_train[:, 1], marker='o', c=y_train, s=100, alpha=0.75)
plt.scatter(X_test[:, 0], X_test[:, 1], marker="x", c=y_test, s=100, alpha=0.75)
Y = np.random.normal(2 * X + 2, 20)
x0_1 = np.amin(X_train[:, 0])
x0_2 = np.amax(X_train[:, 0])
x1_1 = get_hyperplane(x0_1, clf.w, clf.b, 0)
x1_2 = get_hyperplane(x0_2, clf.w, clf.b, 0)
x1_1_m = get_hyperplane(x0_1, clf.w, clf.b, -1)
x1_2_m = get_hyperplane(x0_2, clf.w, clf.b, -1)
x1_1_p = get_hyperplane(x0_1, clf.w, clf.b, 1)
x1_2_p = get_hyperplane(x0_2, clf.w, clf.b, 1)
ax.plot([x0_1, x0_2], [x1_1, x1_2], "-", c='k', lw=1, alpha=0.9)
ax.plot([x0_1, x0_2], [x1_1_m, x1_2_m], "--", c='grey', lw=1, alpha=0.8)
ax.plot([x0_1, x0_2], [x1_1_p, x1_2_p], "--", c='grey', lw=1, alpha=0.8)
x1_min = np.amin(X[:, 1])
x1_max = np.amax(X[:, 1])
ax.set_ylim([x1_min - 3, x1_max + 3])
for spine in ['top','right']:
ax.spines[spine].set_visible(False)
plt.show()

Related

Why does my "val_accuracy" start from a high value?

I am using breast cancer wisconsin data and working on it with ANN, in keras library.
I've added codes and a part of data below, I hope it's readable and understandable.
One row from dataset:1000025,5,1,1,1,2,1,3,1,1,2
Prediction results:
Test Loss: 0.05948319600096771 - Test Accuracy: 0.9809523820877075
As seen, confusion matrix looks fine.
Here, loss graphic looks okay but accuracy graphic made me confused. Why does training_accuracy starts with a high value and doesnt change much?
Here's the main part of code, after importing libraries:
data = pd.read_csv('breast-cancer-wisconsin.data')
data_new = data.drop(['1000025'],axis=1)
X = data_new.iloc[:,0:8].values
Y = data_new.iloc[:,9].values
labelencoder_= LabelEncoder()
Y = labelencoder_.fit_transform(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
#Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
classifier = Sequential()
classifier.add(Dense(8, input_dim=8))
classifier.add(Activation("relu"))
classifier.add(Dropout(0.1))
classifier.add(Dense(32))
classifier.add(Activation("relu"))
classifier.add(Dropout(0.1))
classifier.add(Dense(16))
classifier.add(Activation("relu"))
classifier.add(Dropout(0.1))
classifier.add(Dense(1))
classifier.add(Activation("sigmoid"))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = classifier.fit(X_train, y_train, epochs=100, batch_size=10, validation_split=0.11)
test_loss, test_acc = classifier.evaluate(X_test, y_test)
print('\nTest Loss:', test_loss)
print('Test Accuracy:', test_acc)
y_pred = classifier.predict(X_test)
And plots of the graphics and confusion matrix:
#LOSS---
training_loss = history.history['loss']
test_loss = history.history['val_loss']
epoch_count = range(1, len(training_loss) + 1)
plt.plot(epoch_count, training_loss, 'r-')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Training Loss', 'Test Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()
#ACCURACY---
training_acc = history.history['accuracy']
test_acc = history.history['val_accuracy']
epoch_count2 = range(1, len(training_acc) + 1)
plt.plot(epoch_count2, training_acc, 'r-')
plt.plot(epoch_count2, test_acc, 'b-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()
ConfMatrix = confusion_matrix(y_test,pred)
ax = sns.heatmap(ConfMatrix, annot=True, cmap="gray", fmt="d", xticklabels = ['Benign', 'Malignant'], yticklabels = ['Benign', 'Malignant'])
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.ylabel('True')
plt.xlabel('Prediction')
plt.title("Confusion Matrix");
plt.figure(figsize=(7,17))

Multiple Linear Regression Machine Learning in Python --ValueError: shapes (8,15) and (390,) not aligned

I am trying to evaluate output based on certain input, using Multiple Linear Regression Machine Learning .I have trained the data and getting correct expected values while running below code:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
#dataset = pd.read_csv('50_Startups.csv')
dataset = pd.read_excel('MAHI2.xlsx')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 5].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
labelencoder1 = LabelEncoder()
X[:, 1] = labelencoder.fit_transform(X[:, 1])
labelencoder2 = LabelEncoder()
X[:, 2] = labelencoder.fit_transform(X[:, 2])
labelencoder3 = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = "all")
#X = onehotencoder.fit_transform(X).toarray()
X = onehotencoder.fit_transform(X).toarray()
# Avoiding the Dummy Variable Trap
X = X[:, 1:]
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X, y)
y_pred = regressor.predict(X)
df = pd.DataFrame({'Actual': y.flatten(), 'Predicted': y_pred.flatten()})
df
Now I am trying to use same model to evaluate another set of input data as below :
dataset1 = pd.read_excel('MAHI3.xlsx')
#dataset2 = pd.get_dummies(dataset1)
X1 = dataset1.iloc[:, :-1].values
y2 = dataset1.iloc[:, 5].values
# Encoding categorical data
#labelencoder3 = LabelEncoder()
X1[:, 0] = labelencoder.fit_transform(X1[:, 0])
#labelencoder4 = LabelEncoder()
X1[:, 1] = labelencoder.fit_transform(X1[:, 1])
#labelencoder5 = LabelEncoder()
X1[:, 2] = labelencoder.fit_transform(X1[:, 2])
#labelencoder6 = LabelEncoder()
X1[:, 3] = labelencoder.fit_transform(X1[:, 3])
#onehotencoder2 = OneHotEncoder(categorical_features = "all")
X1 = onehotencoder.fit_transform(X1).toarray()
output = regressor.predict(X1)
df1 = pd.DataFrame({'Actual1': y2.flatten(), 'Predicted1': output.flatten()})
df1
But while I am running this code getting below error:
ValueError: shapes (6,13) and (390,) not aligned: 13 (dim 1) != 390 (dim 0)
It will be great if anyone help me to resolve this issue.
I don't have access to your dataset but I seems that your problem is a dimensionality problem. The thing that seems to change dimensions is the "onehotencoder".
Try to use the same one hot encoder for both.
ohe = onehotencoder.fit(X)
X = ohe.transform(X).toarray()
X1 = ohe.transform(X1).toarray()
You should make sure that the number of features that the "regressor" model is receiving is the same that when it is trained.

ValueError: Found input variables with inconsistent numbers of samples: [25707, 25000]

I have this below error when trying to Apply this code below :
I am doing a tutorial based on this page : https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184
File "reviewsML.py", line 58, in <module>
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.50)
….
ValueError: Found input variables with inconsistent numbers of samples: [25707, 25000]
Here the part of code
reviews_train = []
for line in codecs.open('movie_data/full_train.txt', 'r', 'utf-8'):
reviews_train.append(line.strip())
reviews_test = []
for line in codecs.open('movie_data/full_test.txt', 'r', 'utf-8'):
reviews_test.append(line.strip())
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
def preprocess_reviews(reviews):
reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
return reviews
reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)
print(len(reviews_train_clean))
from sklearn.feature_extraction.text import CountVectorizer
#construction of the classfier : hyperparameter c => adjusts the regularization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean) #dimensionality reduction, return transformed data
X_test = cv.transform(reviews_test_clean)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
target = [1 if i < 12500 else 0 for i in range(25000)]
X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)
for c in [0.01, 0.05, 0.25, 0.5, 1]:
lr = LogisticRegression(C=c)
lr.fit(X_train, y_train)
print ("Accuracy for C=%s: %s"
% (c, accuracy_score(y_val, lr.predict(X_val))))
Do you know what I am doing wrong ?
I tried to print (X.shape[0])
it gives me 25707
But I do not know why beacuse the original file contains 25 000 for the train and the test

Error when checking input: expected embedding_1_input to have shape (4,) but got array with shape (1,)

I use pretrained embedding vectors for my keras model. Before I did it everything worked and now I get this error:
ValueError: Error when checking input: expected embedding_1_input to
have shape (4,) but got array with shape (1,)
Maybe somebody can help me, what I do wrong here. I am not sure if I did correct model.fit and model.evaluate. Maybe there is a problem?
import csv
import numpy as np
np.random.seed(42)
from keras.models import Sequential, Model
from keras.layers import *
from random import shuffle
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.callbacks import EarlyStopping
from itertools import groupby
from numpy import asarray
from numpy import zeros
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#function makes a list of antonyms and synonyms from the files
def preprocessing(filename):
list_words = []
with open(filename) as tsv:
for line in csv.reader(tsv, dialect="excel-tab"):
list_words.append([line[0], line[1]])
return list_words
#function make a list of not relevant pairs by mixing synonyms and
antonyms
def notrelevant(filename, filename2):
list_words = []
with open(filename) as tsv:
with open(filename2) as tsv2:
for lines in zip(csv.reader(tsv, dialect="excel-tab"),csv.reader(tsv2, dialect="excel-tab")):
list_words.append([lines[0][0], lines[1][1]])
return list_words
antonyms_list = preprocessing("antonyms.tsv")
synonyms_list = preprocessing("synonyms.tsv")
notrelevant_list = notrelevant("antonyms.tsv", "synonyms.tsv")
# function combines all antonyms, synonyms in one list with labels,
shuffle them
def data_prepare(ant,syn,nrel):
data = []
for elem1,elem2 in ant:
data.append([[elem1,elem2], "Antonyms"])
for elem1, elem2 in syn:
data.append([[elem1, elem2], "Synonyms"])
for elem1, elem2 in nrel:
data.append([[elem1, elem2], "Not relevant"])
shuffle(data)
return data
data_with_labels_shuffled =
data_prepare(antonyms_list,synonyms_list,notrelevant_list)
def label_to_onehot(labels):
mapping = {label: i for i, label in enumerate(set(labels))}
one_hot = np.empty((len(labels), 3))
for i, label in enumerate(labels):
entry = [0] * len(mapping)
entry[mapping[label]] = 1
one_hot[i] = entry
return (one_hot)
def words_to_ids(labels):
vocabulary = []
word_to_id = {}
ids = []
for word1,word2 in labels:
vocabulary.append(word1)
vocabulary.append(word2)
counter = 0
for word in vocabulary:
if word not in word_to_id:
word_to_id[word] = counter
counter += 1
for word1,word2 in labels:
ids.append([word_to_id [word1], word_to_id [word2]])
return (ids)
def split_data(datas):
data = np.array(datas)
X, y = data[:, 0], data[:, 1]
# split the data to get 60% train and 40% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
y_train = label_to_onehot(y_train)
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
y_dev = label_to_onehot(y_dev)
y_test = label_to_onehot(y_test)
return X_train, y_train, X_dev, y_dev, X_test, y_test
X_train, y_train, X_dev, y_dev, X_test, y_test = split_data(data_with_labels_shuffled)
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train)
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.50d.txt')
for line in f:
values = line.split()
word = values[0]
coefs = asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
VOCABSIZE = len(data_with_labels_shuffled)
EMBSIZE = 50
HIDDENSIZE = 50
KERNELSIZE = 5
MAXEPOCHS = 5
model = Sequential()
model.add(Embedding(vocab_size, 50, weights=[embedding_matrix],
input_length=4, trainable=False))
model.add(Dropout(0.25))
model.add(Bidirectional(GRU(units = HIDDENSIZE // 2)))
#model.add(Flatten())
model.add(Dense(units = 3, activation = "softmax"))
model.compile(loss='categorical_crossentropy', optimizer="adam",
metrics=['accuracy'])
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='min')
model.fit (X_train, y_train,
batch_size=64,
callbacks = [earlystop],
epochs=100,
validation_data=(X_dev, y_dev),
verbose=1)
scores = model.evaluate(X_test, y_testbatch_size=64)
print("Accuracy is: %.2f%%" %(scores[1] * 100))
I think the problem is that you should pass encoded_docs to your model.fit() function instead of X_train since encoded_docs contains the tokenization of your training data and X_train still only contains a list of words. Moreover, you have to make sure that the input_length parameter of your Embedding layer matches the length of these tokenized training examples that you have created in encoded_docs.

Classifying sequences with different lengths with error batching

I'm using Keras with the TensorFlow backend. I've just figured out how to train and classify sequences of different lengths without masking, because I can't get masking to work. In the toy example I'm working with, I'm trying to train an LSTM to detect whether a sequence of arbitrary length starts with a 1 or not.
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np
def gen_sig(num_samples, seq_len):
one_indices = np.random.choice(a=num_samples, size=num_samples // 2, replace=False)
x_val = np.zeros((num_samples, seq_len), dtype=np.bool)
x_val[one_indices, 0] = 1
y_val = np.zeros(num_samples, dtype=np.bool)
y_val[one_indices] = 1
return x_val, y_val
N_train = 100
N_test = 10
recall_len = 20
X_train, y_train = gen_sig(N_train, recall_len)
X_test, y_test = gen_sig(N_train, recall_len)
print('Build STATEFUL model...')
model = Sequential()
model.add(LSTM(10, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
for epoch in range(15):
mean_tr_acc = []
mean_tr_loss = []
for seq_idx in range(X_train.shape[0]):
start_val = X_train[seq_idx, 0]
assert y_train[seq_idx] == start_val
assert tuple(np.nonzero(X_train[seq_idx, :]))[0].shape[0] == start_val
y_in = np.array([y_train[seq_idx]], dtype=np.bool)
for j in range(np.random.choice(a=np.arange(5, recall_len+1))):
x_in = np.array([[[X_train[seq_idx][j]]]])
tr_loss, tr_acc = model.train_on_batch(x_in, y_in)
mean_tr_acc.append(tr_acc)
mean_tr_loss.append(tr_loss)
model.reset_states()
print('accuracy training = {}'.format(np.mean(mean_tr_acc)))
print('loss training = {}'.format(np.mean(mean_tr_loss)))
print('___________________________________')
mean_te_acc = []
mean_te_loss = []
for seq_idx in range(X_test.shape[0]):
start_val = X_test[seq_idx, 0]
assert y_test[seq_idx] == start_val
assert tuple(np.nonzero(X_test[seq_idx, :]))[0].shape[0] == start_val
y_in = np.array([y_test[seq_idx]], dtype=np.bool)
for j in range(np.random.choice(a=np.arange(5, recall_len+1))):
te_loss, te_acc = model.test_on_batch(np.array([[[X_test[seq_idx][j]]]], dtype=np.bool), y_in)
mean_te_acc.append(te_acc)
mean_te_loss.append(te_loss)
model.reset_states()
print('accuracy testing = {}'.format(np.mean(mean_te_acc)))
print('loss testing = {}'.format(np.mean(mean_te_loss)))
print('___________________________________')
As seen in the code, my error is being batched over each time-step. This is bad for multiple reasons. How do I train the network in two steps? For example:
Run a bunch of values through the network to accumulate the error
Adjust the weights of the network given this accumulated error
To do what is described in the original question, the easiest way is to train the original network with masking, but then test with a stateful network so any length input can be classified:
import numpy as np
np.random.seed(1)
import tensorflow as tf
tf.set_random_seed(1)
from keras import models
from keras.layers import Dense, Masking, LSTM
import matplotlib.pyplot as plt
def stateful_model():
hidden_units = 256
model = models.Sequential()
model.add(LSTM(hidden_units, batch_input_shape=(1, 1, 1), return_sequences=False, stateful=True))
model.add(Dense(1, activation='relu', name='output'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
return model
def train_rnn(x_train, y_train, max_len, mask):
epochs = 10
batch_size = 200
vec_dims = 1
hidden_units = 256
in_shape = (max_len, vec_dims)
model = models.Sequential()
model.add(Masking(mask, name="in_layer", input_shape=in_shape,))
model.add(LSTM(hidden_units, return_sequences=False))
model.add(Dense(1, activation='relu', name='output'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop')
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
validation_split=0.05)
return model
def gen_train_sig_cls_pair(t_stops, num_examples, mask):
x = []
y = []
max_t = int(np.max(t_stops))
for t_stop in t_stops:
one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
sig = np.zeros((num_examples, max_t), dtype=np.int8)
sig[one_indices, 0] = 1
sig[:, t_stop:] = mask
x.append(sig)
cls = np.zeros(num_examples, dtype=np.bool)
cls[one_indices] = 1
y.append(cls)
return np.concatenate(x, axis=0), np.concatenate(y, axis=0)
def gen_test_sig_cls_pair(t_stops, num_examples):
x = []
y = []
for t_stop in t_stops:
one_indices = np.random.choice(a=num_examples, size=num_examples // 2, replace=False)
sig = np.zeros((num_examples, t_stop), dtype=np.bool)
sig[one_indices, 0] = 1
x.extend(list(sig))
cls = np.zeros((num_examples, t_stop), dtype=np.bool)
cls[one_indices] = 1
y.extend(list(cls))
return x, y
if __name__ == '__main__':
noise_mag = 0.01
mask_val = -10
signal_lengths = (10, 15, 20)
x_in, y_in = gen_train_sig_cls_pair(signal_lengths, 10, mask_val)
mod = train_rnn(x_in[:, :, None], y_in, int(np.max(signal_lengths)), mask_val)
testing_dat, expected = gen_test_sig_cls_pair(signal_lengths, 3)
state_mod = stateful_model()
state_mod.set_weights(mod.get_weights())
res = []
for s_i in range(len(testing_dat)):
seq_in = list(testing_dat[s_i])
seq_len = len(seq_in)
for t_i in range(seq_len):
res.extend(state_mod.predict(np.array([[[seq_in[t_i]]]])))
state_mod.reset_states()
fig, axes = plt.subplots(2)
axes[0].plot(np.concatenate(testing_dat), label="input")
axes[1].plot(res, "ro", label="result", alpha=0.2)
axes[1].plot(np.concatenate(expected, axis=0), "bo", label="expected", alpha=0.2)
axes[1].legend(bbox_to_anchor=(1.1, 1))
plt.show()

Resources