Using an autoencoder to reduce dimensionality - machine-learning

Here is my version of an autoencoder written using PyTorch:
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import metrics
import datetime
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
from ast import literal_eval
import numpy as np
from sklearn.preprocessing import scale
import seaborn as sns
import torch
%matplotlib inline
# Toy data set: three samples with ten features each, drawn uniformly from
# the consecutive bands [0, 10), [10, 20) and [20, 30).
f = [
    np.random.uniform(low, low + 10, (1, 10)).flatten()
    for low in (0, 10, 20)
]
x_data = torch.FloatTensor(np.array(f))

# Layer widths used by the autoencoder: input -> hidden -> output.
dimensions_input, hidden_layer_nodes, output_dimension = 10, 5, 10
class Model(torch.nn.Module):
    """Autoencoder with one hidden layer: linear -> sigmoid -> linear.

    Args:
        n_inputs: Width of each input row. Defaults to the module-level
            ``dimensions_input``.
        n_hidden: Width of the hidden (encoded) representation. Defaults to
            the module-level ``hidden_layer_nodes``.
        n_outputs: Width of each reconstructed row. Defaults to the
            module-level ``output_dimension``.

    ``forward`` returns the reconstruction; ``encode`` returns the hidden
    activations, i.e. one reduced-dimensionality vector per input row.
    """

    def __init__(self, n_inputs=None, n_hidden=None, n_outputs=None):
        super(Model, self).__init__()
        # Fall back to the module-level sizes so a bare ``Model()`` call
        # builds exactly the same network as before.
        if n_inputs is None:
            n_inputs = dimensions_input
        if n_hidden is None:
            n_hidden = hidden_layer_nodes
        if n_outputs is None:
            n_outputs = output_dimension
        # Attribute names are kept unchanged so existing state_dict keys
        # ("linear.*", "linear2.*") still match.
        self.linear = torch.nn.Linear(n_inputs, n_hidden)
        self.sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(n_hidden, n_outputs)

    def encode(self, x):
        """Return the hidden-layer activations for ``x``.

        For an input of shape (n_rows, n_inputs) the result has shape
        (n_rows, n_hidden): one encoding per input row.
        """
        return self.sigmoid(self.linear(x))

    def forward(self, x):
        """Return the reconstruction of ``x`` decoded from its encoding."""
        return self.linear2(self.encode(x))
# Instantiate the autoencoder together with its loss and optimiser.
model = Model()
# ``reduction='sum'`` is the documented replacement for the deprecated
# ``size_average=False``: the squared errors are summed, not averaged.
criterion = torch.nn.MSELoss(reduction='sum')
# Plain stochastic gradient descent over every model parameter.
optim = torch.optim.SGD(model.parameters(), lr=0.00001)
def train_model():
y_data = x_data.clone()
for i in range(150000):
y_pred = model(x_data)
loss = criterion(y_pred, y_data)
if i % 5000 == 0:
Using x_data.clone() as the training target, I train the network to learn a feature representation of the input data.
I'm attempting to generate hidden-layer weights that match the dimensionality of the rows of the input data, so that each vector of x_data has a corresponding encoding. But the hidden layer is a vector of size 5. How can I change this network so that it generates a matrix representing a reduced-dimensionality version of the input data?


label_binarize Does not fit for sklearn Naive Bayes classifier showing bad input shape

I was trying to create a ROC curve for a multiclass problem using Naive Bayes, but it ends with
ValueError: bad input shape.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import BernoulliNB
from scipy import interp
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output: y becomes a 2-D indicator matrix with one column per
# class, and n_classes is read from its column count.
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
# NOTE(review): the end of this call was missing from the snippet;
# random_state=0 is a reconstruction - confirm against the original post.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)
# Learn to predict each class against the other
classifier = BernoulliNB(alpha=1.0, binarize=6, class_prior=None, fit_prior=True)
# This is the call that fails with "ValueError: bad input shape": fit() is
# handed the binarized 2-D y rather than a 1-D vector of class labels.
y_score = classifier.fit(X_train, y_train).predict(X_test)
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (75, 6)
The error is caused by binarizing the y variable. The estimator can work with the original class labels (even string values) itself.
Remove the following lines:
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
You are good to go!
To get the predicted probabilities for roc_curve, use the following:
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
# (75, 3)

Error in Machine Learning model into Flask Web Applications

I have created a machine learning model for heart disease prediction, and now I want to deploy it in my web application using Flask. The dataset has been acquired from Kaggle. Whenever I run the application, it fails with:
C:\Users\Surface\Desktop\Flask_app>python File "", line 42
x_data = request.form['x_data']
IndentationError: unindent does not match any outer indentation level
Can anyone guide me? Thank you :)
from flask import Flask,render_template,url_for,request
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
app = Flask(__name__)
def home():
    """Render and return the ``home.html`` template."""
    # NOTE(review): no route decorator is visible above this function;
    # presumably an ``@app.route(...)`` line was lost - confirm.
    return render_template('home.html')
def predict():
# Purpose: load heart.csv, train a random forest on it, persist and reload the
# model with joblib, then (for POST requests) render result.html with a
# prediction.
# NOTE(review): the indentation of this snippet was lost, so the nesting of
# the statements below (in particular what belongs to the `if`) must be
# restored by hand before this can run.
df = pd.read_csv("heart.csv")
# Drop three columns that are not used as features.
df = df.drop(columns = ['cp', 'thal', 'slope'])
#features and labels
# NOTE(review): the right-hand side of this assignment is missing
# (presumably the 'target' column of df) - confirm against the original.
y =
x_data = df.drop(['target'], axis = 1)
#EXTRACT Features
# Min-max rescaling of x_data using its own minimum and maximum.
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values
# Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=0)
# Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): this line looks truncated - presumably the constructor call
# was followed by `rf.fit(x_train.T, y_train.T)`; confirm. Also verify that
# the `.T` transposes here and in the score call below are intended.
rf = RandomForestClassifier(n_estimators = 1000, random_state = 1), y_train.T)
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(rf.score(x_test.T,y_test.T)*100))
#persist model in a standard format
from sklearn.externals import joblib
joblib.dump(rf, 'HAP_model.pkl')
# The file handle opened here is never closed in the visible code.
HAP_model = open('HAP_model.pkl','rb')
rf = joblib.load(HAP_model)
if request.method=='POST':
# The form value is read into x_data, but `data` below is built from df
# rather than from that value.
x_data = request.form['x_data']
data = [df.drop(['target'], axis = 1)]
# NOTE(review): verify that the fitted estimator actually provides
# `transform` - this looks carried over from a text-vectorizer example.
vect = rf.transform(data).toarray()
my_prediction = rf.predict(vect)
return render_template('result.html',prediction = my_prediction)
if __name__ == '__main__':

ROC curve is not actually a curve

I have plotted a few ROC curves to calculate the AUC, but the ROC plot does not actually look like a curve. I have attached images for better understanding. If anyone can tell me what is wrong, I would be obliged. This is one kind of plot I am getting:
This is another type:
However, I am not getting a curve like this one:
This is the link to my dataset.
I would be thankful if anyone could help me understand the problem, if there is one, or, if my curves are correct, why they do not have a curve-like shape.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from keras.layers import Dense, Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model,Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
seed = 7

# Load the raw table: the first 12600 columns are features, the last column
# holds the labels.
dataset = pd.read_csv('dataset/prostate.csv')
labels = dataset.values[:, -1]
features_set = dataset.iloc[:, :12600]

# Oversample with SMOTE, then wrap both results in DataFrames.
oversampler = SMOTE(random_state=0)
oversampler_feature_set, oversampler_labels = oversampler.fit_sample(
    features_set, labels
)
feature_df = pd.DataFrame(oversampler_feature_set)
labels_df = pd.DataFrame(oversampler_labels)

# Min-max scale the features and project them onto 30 principal components.
scalar = MinMaxScaler()
scaled_data = scalar.fit_transform(feature_df)
pca = PCA(n_components=30)
pca_data = pd.DataFrame(pca.fit_transform(scaled_data))

# Re-attach the labels as the last column and hold out 20% for testing.
recreated_df = pd.concat([pca_data, labels_df], axis=1)
train, test = train_test_split(recreated_df, test_size=0.2)

# First 30 columns are the PCA features; the final column is the label.
X_train = train.values[:, :30]
Y_train = train.values[:, -1]
X_test = test.values[:, :30]
y_test = test.values[:, -1]
def my_model():
    """Build and compile the feed-forward classifier.

    The network takes 30 inputs and stacks three dense layers of 20, 10 and
    1 units, each with a sigmoid activation. It is compiled with binary
    cross-entropy loss, the Adam optimizer and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(20, input_dim=30, activation='sigmoid'))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Wrap the Keras model as a scikit-learn estimator and cross-validate it on
# the training split with a shuffled 10-fold scheme.
estimator = KerasClassifier(build_fn=my_model, epochs=1000, batch_size=10, shuffle=True, verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train, Y_train, cv=kfold)
# NOTE(review): no `estimator.fit(...)` call is visible before this predict;
# presumably one was lost from the snippet - confirm.
# NOTE(review): `predict` presumably yields hard class labels, which would
# give roc_curve only a few thresholds to sweep; continuous scores such as
# class probabilities are likely what is wanted here - confirm.
y_pred = estimator.predict(X_test).ravel()
# roc_curve returns (fpr, tpr, thresholds) in that order, so `sensitivity`
# holds the false-positive rate and `specificity` the true-positive rate.
sensitivity, specificity, thresholds_keras = roc_curve(y_test, y_pred, pos_label=2)
auc_keras = auc(sensitivity, specificity)
# Diagonal reference line for a chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
# The label placeholder had lost its opening brace, which makes str.format
# raise; '{:.3f}' prints the AUC with three decimals.
plt.plot(sensitivity, specificity, label='Keras (area = {:.3f})'.format(auc_keras))

Random Forest on Categorical Data with low accuracy

I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of @Marcus V. I succeeded in encoding the categorical columns correctly, but I cannot get a better score than 0.52 from a RandomForest with 15 estimators and the entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
class Columns(BaseEstimator, TransformerMixin):
    """Transformer that selects a fixed set of columns by label.

    Args:
        names: Column labels to keep; passed unchanged to ``X.loc``.
    """

    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return the ``self.names`` columns of ``X``.

        Assumes ``X`` supports label-based ``.loc`` indexing (for example a
        pandas DataFrame).
        """
        return X.loc[:, self.names]
# Load the header-less CSV and name its four columns.
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
# Integer-encode the two categorical columns before one-hot encoding them.
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
# Standard-scale the numeric column and one-hot encode the categorical ones.
# NOTE(review): the closing brackets of this literal were missing from the
# snippet; the structure below is the minimal reconstruction - confirm that
# no further pipeline steps were lost.
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=numeric), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False))),
    ])),
])
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
# The fit call had been fused into the constructor line; it is restored as a
# separate statement.
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)

using validation monitor in tflearn.regression to create confusion matrix

So I have been trying to create a confusion matrix for my autoencoder:
from __future__ import division, print_function, absolute_import
import numpy as np
#import matplotlib.pyplot as plt
import tflearn
import tensorflow as tf
from random import randint
from tensorflow.contrib import metrics as ms
# Data loading and preprocessing
import tflearn.datasets.mnist as mnist
# Load MNIST training and test splits with one-hot encoded labels.
Images, Lables, testImages, testLables = mnist.load_data(one_hot=True)
# Random index between 0 and 20 (inclusive), used below to print one sample
# prediction.
f = randint(0,20)
# NOTE(review): x and y are not referenced again in the visible code.
x = tf.placeholder("float",[None, 784])
y = tf.placeholder("float",[None, 10])
# Building the encoder
# Fully connected stack: 784 inputs -> 256 -> 64 -> 10 units.
encoder = tflearn.input_data(shape=[None, 784])
encoder = tflearn.fully_connected(encoder, 256)
encoder = tflearn.fully_connected(encoder, 64)
encoder = tflearn.fully_connected(encoder, 10)
acc= tflearn.metrics.Accuracy()
# Regression, with mean square error
# NOTE(review): `validation_monitors = ?` below is the asker's placeholder for
# the value being asked about; it is not valid Python as written.
net = tflearn.regression(encoder, optimizer='adam', learning_rate=0.001,
loss='mean_square', metric=acc, shuffle_batches=True, validation_monitors = ?)
# NOTE(review): the next two lines look truncated - presumably the DNN
# constructor was followed by `model.fit(Images, Lables, ...)`; confirm.
model = tflearn.DNN(net, tensorboard_verbose=0), Lables, n_epoch=20, validation_set=(testImages, testLables),
run_id="auto_encoder", batch_size=256,show_metric=True)
#Applying the above model on test Images and evaluating as well as prediction of the labels
evali= model.evaluate(testImages,testLables)
print("Accuracy of the model is :", evali)
lables = model.predict_label(testImages)
# Print the results for the randomly chosen test sample f only.
print("The predicted labels are :",lables[f])
prediction = model.predict(testImages)
print("The predicted probabilities are :", prediction[f])
I have gone through the documentation, but it was not very useful to me.
How would I configure this to get the confusion matrix?
validation_monitors ={?}
