cross validation + decision trees in sklearn - machine-learning

I am attempting to create a decision tree with cross validation using sklearn and pandas.
My question is about the code below: the cross validation splits the data, which I then use for both training and testing. I will be attempting to find the best depth of the tree by recreating it n times with different max depths set. When using cross validation, should I instead be using k-fold CV, and if so, how would I use that within the code I have?
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import cross_validation
features = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv('magic04.data',header=None,names=features)
df['class'] = df['class'].map({'g':0,'h':1})
x = df[features[:-1]]
y = df['class']
x_train,x_test,y_train,y_test = cross_validation.train_test_split(x,y,test_size=0.4,random_state=0)
depth = []
for i in range(3,20):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    clf = clf.fit(x_train,y_train)
    depth.append((i,clf.score(x_test,y_test)))
print depth
Here is a link to the data that I am using, in case that helps anyone: https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope

In your code you are creating a static training-test split. If you want to select the best depth by cross-validation you can use sklearn.cross_validation.cross_val_score inside the for loop.
You can read sklearn's documentation for more information.
Here is an update of your code with CV:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.cross_validation import cross_val_score
from pprint import pprint
features = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv('magic04.data',header=None,names=features)
df['class'] = df['class'].map({'g':0,'h':1})
x = df[features[:-1]]
y = df['class']
# x_train,x_test,y_train,y_test = cross_validation.train_test_split(x,y,test_size=0.4,random_state=0)
depth = []
for i in range(3,20):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    # Perform 7-fold cross validation
    scores = cross_val_score(estimator=clf, X=x, y=y, cv=7, n_jobs=4)
    depth.append((i,scores.mean()))
print(depth)
Alternatively, you can use sklearn.model_selection.GridSearchCV instead of writing the for loop yourself, especially if you want to optimize over more than one hyper-parameter.
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import GridSearchCV
features = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv('magic04.data',header=None,names=features)
df['class'] = df['class'].map({'g':0,'h':1})
x = df[features[:-1]]
y = df['class']
parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=4)
clf.fit(X=x, y=y)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)
Edit: changed how GridSearchCV is imported to accommodate learn2day's comment.
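One optional tweak, if you want the grid search to use the same 7-fold cross-validation as the cross_val_score example above: GridSearchCV uses its own default number of folds unless you pass cv explicitly, so you could replace the GridSearchCV line with something like this (reusing the parameters variable from above):
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv=7, n_jobs=4)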

Related

Using an autoencoder to reduce dimensionality

Here is my version of an autoencoder written using PyTorch:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer, scale
from ast import literal_eval
import seaborn as sns
sns.set_style("darkgrid")
import torch
%matplotlib inline
f = []
f.append(np.random.uniform(0,10,(1 , 10)).flatten())
f.append(np.random.uniform(10,20,(1 , 10)).flatten())
f.append(np.random.uniform(20,30,(1 , 10)).flatten())
x_data = torch.FloatTensor(np.array(f))
x_data
dimensions_input = 10
hidden_layer_nodes = 5
output_dimension = 10
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = torch.nn.Linear(dimensions_input, hidden_layer_nodes)
        self.sigmoid = torch.nn.Sigmoid()
        self.linear2 = torch.nn.Linear(hidden_layer_nodes, output_dimension)

    def forward(self, x):
        l_out1 = self.linear(x)
        l_out2 = self.sigmoid(l_out1)
        y_pred = self.linear2(l_out2)
        return y_pred
model = Model()
criterion = torch.nn.MSELoss(size_average = False)
optim = torch.optim.SGD(model.parameters(), lr = 0.00001)
def train_model():
    y_data = x_data.clone()
    for i in range(150000):
        y_pred = model(x_data)
        loss = criterion(y_pred, y_data)
        if i % 5000 == 0:
            print(loss)
        optim.zero_grad()
        loss.backward()
        optim.step()
Using x_data.clone() as the target, I train the network to learn a feature representation of the input data.
I'm attempting to generate hidden-layer outputs that correspond to the rows of the input data, so that each vector of x_data has its own encoding. But the hidden layer only gives me a vector of size 5. How do I change this network so that it produces a matrix representing a reduced dimensionality of the input data?
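For what it's worth, one way to read a reduced-dimensionality matrix out of a model like the one above is to evaluate only the encoder half after training. This is just a sketch built on the Model class and x_data defined in the question:
# Sketch: the hidden-layer activations are the encodings.
# Each row of `encoded` is the 5-dimensional code for one row of x_data,
# giving a (3, 5) matrix rather than a single vector.
with torch.no_grad():
    encoded = model.sigmoid(model.linear(x_data))
print(encoded.shape)  # torch.Size([3, 5])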

Total of correctly predicted in binary classification of images with CNN in keras

I have succeeded in building a binary classification model for images with a CNN using Keras, and I made predictions using model.predict_classes(). Here is my code:
import numpy as np
import os, sys
from keras.models import load_model
import PIL
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline
model = load_model('./potholes16_2.h5')
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
path = os.path.abspath("./potholes14/test/positive")
extensions = 'JPG'
if __name__ == "__main__":
    for f in os.listdir(path):
        if os.path.isfile(os.path.join(path, f)):
            f_text, f_ext = os.path.splitext(f)
            f_ext = f_ext[1:].upper()
            if f_ext in extensions:
                print(f)
                img = Image.open(os.path.join(path, f))
                new_width = 200
                new_height = 200
                img = img.resize((new_width, new_height), Image.ANTIALIAS)
                # width, height = image.size
                img = np.reshape(img, [1, new_width, new_height, 3])
                classes = model.predict_classes(img)
                print(classes)
Now I want to count the total number of images that were correctly predicted, for example how many predictions belong to class 0 and how many to class 1?
You need to invoke the model.evaluate function; supposing you want to evaluate the data in x_test with the ground truth labels in y_test, then:
score = model.evaluate(x_test, y_test, verbose=0)
score[0] will give you the loss (binary cross entropy in your case), while score[1] contains the required binary accuracy.
See the docs for more details (scroll down looking for evaluate).
You must have a sample array of the ground-truth labels for the data you are predicting on, correct? Well, you could load that data as well. Keep the code you have:
classes = model.predict_classes(img)
yields
array([[ 0.94981687],[ 0.57888238],[ 0.58651019],[ 0.30058956],[ 0.21879381]])
and your class data looks like this
class_validation = np.array([[1],[0],[0],[0],[1]])
Then just find where they are equal after rounding classes:
np.where(np.round(classes,0)==class_validation)[0].shape[0]
Note: there are many ways to write the last line; this assumes your numpy array has shape (number_of_samples, 1).
Another way to check:
totalCorrect = class_validation[((np.round(classes,0) - class_validation)==0)]
print('Correct in Class 1 = ',np.count_nonzero(totalCorrect),'Correct in Class 0 = ',abs(len(totalCorrect)-np.count_nonzero(totalCorrect)))
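If all you need are the per-class counts of the predictions themselves, a short sketch building on the classes array from above is:
# Sketch: count how many predictions land in each class after rounding.
predicted = np.round(classes, 0).astype(int).flatten()
unique, counts = np.unique(predicted, return_counts=True)
print(dict(zip(unique, counts)))  # e.g. {0: 3, 1: 2}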

CountVectorizer MultinomialNB ValueError: dimension mismatch

I am trying to make my MultinomialNB work. I use CountVectorizer on my training and test set, and of course there are different words in both sets. So I see why the error
ValueError: dimension mismatch
occurs, but I don't know how to fix it. I tried CountVectorizer().transform instead of CountVectorizer().fit_transform, as was suggested in another post (SciPy and scikit-learn - ValueError: Dimension mismatch), but that just gives me
NotFittedError: CountVectorizer - Vocabulary wasn't fitted.
How can I use CountVectorizer correctly?
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import sklearn.feature_extraction
df = data
y = df["meal_parent_category"]
X = df['name_cleaned']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
X_train = CountVectorizer().fit_transform(X_train)
X_test = CountVectorizer().fit_transform(X_test)
algo = MultinomialNB()
algo.fit(X_train,y_train)
y = algo.predict(X_test)
print(classification_report(y_test,y_pred))
Ok, so after asking this question I figured it out :)
Here is the solution with vocabulary and such:
df = train
y = df["meal_parent_category_cleaned"]
X = df['name_cleaned']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)
vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
X_test = vectorizer_test.transform(X_test)
algo = MultinomialNB()
algo.fit(X_train,y_train)
y_pred = algo.predict(X_test)
print(classification_report(y_test,y_pred))
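An equivalent and slightly simpler pattern is to fit a single vectorizer on the training text and reuse it for the test text, instead of constructing a second vectorizer from the learned vocabulary; a sketch under the same variable names as above:
# Sketch: fit the vocabulary on the training split only, then reuse it.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
Wrapping the vectorizer and the classifier in a sklearn Pipeline achieves the same thing and keeps the fit/transform bookkeeping out of your code.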

using validation monitor in tflearn.regression to create confusion matrix

So I have been trying to create a confusion matrix for my autoencoder:
from __future__ import division, print_function, absolute_import
import numpy as np
#import matplotlib.pyplot as plt
import tflearn
import tensorflow as tf
from random import randint
from tensorflow.contrib import metrics as ms
# Data loading and preprocessing
import tflearn.datasets.mnist as mnist
Images, Lables, testImages, testLables = mnist.load_data(one_hot=True)
f = randint(0,20)
x = tf.placeholder("float",[None, 784])
y = tf.placeholder("float",[None, 10])
# Building the encoder
encoder = tflearn.input_data(shape=[None, 784])
encoder = tflearn.fully_connected(encoder, 256)
encoder = tflearn.fully_connected(encoder, 64)
encoder = tflearn.fully_connected(encoder, 10)
acc= tflearn.metrics.Accuracy()
# Regression, with mean square error
net = tflearn.regression(encoder, optimizer='adam', learning_rate=0.001,
                         loss='mean_square', metric=acc, shuffle_batches=True,
                         validation_monitors = ?)
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(Images, Lables, n_epoch=20, validation_set=(testImages, testLables),
          run_id="auto_encoder", batch_size=256, show_metric=True)
#Applying the above model on test Images and evaluating as well as prediction of the labels
evali= model.evaluate(testImages,testLables)
print("Accuracy of the model is :", evali)
lables = model.predict_label(testImages)
print("The predicted labels are :",lables[f])
prediction = model.predict(testImages)
print("The predicted probabilities are :", prediction[f])
I have gone through the documentation, but it was not very useful to me.
How would I configure validation_monitors to get the confusion matrix?
validation_monitors ={?}
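One option that sidesteps validation_monitors entirely is to compute the confusion matrix after training, from the model's predictions, for example with sklearn. This is only a sketch reusing the variables from the code above:
from sklearn.metrics import confusion_matrix
# Sketch: convert the one-hot test labels and the predicted probabilities
# to class indices, then build the confusion matrix outside of tflearn.
y_true = np.argmax(testLables, axis=1)
y_pred = np.argmax(np.array(model.predict(testImages)), axis=1)
print(confusion_matrix(y_true, y_pred))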

Is log loss in XGBoost and Sklearn the same?

I'm playing around with a new data set with XGBoost. Following is my code:
import xgboost as xgb
import pandas as pd
import numpy as np
train = pd.read_csv("train_users_processed_onehot.csv")
labels = train["Buy"].map({"Y":1, "N":0})
features = train.drop("Buy", axis=1)
data_dmat = xgb.DMatrix(data=features, label=labels)
params={"max_depth":5, "min_child_weight":2, "eta": 0.1, "subsamples":0.9, "colsample_bytree":0.8, "objective" : "binary:logistic", "eval_metric": "logloss", "seed": 2333}
rounds = 6000
result = xgb.cv(params=params, dtrain=data_dmat, num_boost_round=rounds, early_stopping_rounds=50, as_pandas=True, seed=2333)
print result
The result is (omitted intermediate results):
test-logloss-mean test-logloss-std train-logloss-mean
0 0.683354 0.000058 0.683206
165 0.622318 0.000661 0.607680
But when I'm trying to do parameter tuning with GridSearchCV, I found the result to be quite different. To be more specific, this is my code:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
import numpy as np
import pandas as pd
train_dataframe = pd.read_csv("train_users_processed_onehot.csv")
train_labels = train_dataframe["Buy"].map({"Y":1, "N":0})
train_features = train_dataframe.drop("Buy", axis=1)
params = {"max_depth": [5], "min_child_weight": [2]}
estimator = XGBClassifier(learning_rate=0.1, n_estimators=170, max_depth=2, min_child_weight=4, objective="binary:logistic", subsample=0.9, colsample_bytree=0.8, seed=2333)
gsearch1 = GridSearchCV(estimator, param_grid=params, n_jobs=4, iid=False, verbose=1, scoring="neg_log_loss")
gsearch1.fit(train_features.values, train_labels.values)
print pd.DataFrame(gsearch1.cv_results_)
print gsearch1.best_params_
print -gsearch1.best_score_
and I got:
mean_fit_time mean_score_time mean_test_score mean_train_score
0 87.71497 0.209772 -3.134132 -0.567306
It is clear that 3.134132 is very different from 0.622318. What is the reason for this?
Thank you!
You pass different parameters to the two runs:
max_depth: 5 vs 2
eta: 0.1 vs 0.3 (default)
min_child_weight: 2 vs 4
The parameters you pass to sklearn are more conservative (it is less likely you will overfit the model), so the algorithm does not try as hard to fit the model to the data. In turn, in the second run you get a worse score (a higher log loss), which is exactly what should be expected.
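If the goal is an apples-to-apples comparison, one approach is to hand XGBClassifier the same settings that were passed to xgb.cv; a sketch with the values taken from the first script:
# Sketch: align the sklearn estimator with the xgb.cv parameters above.
estimator = XGBClassifier(learning_rate=0.1,          # eta in the cv params
                          n_estimators=170,
                          max_depth=5,
                          min_child_weight=2,
                          subsample=0.9,
                          colsample_bytree=0.8,
                          objective="binary:logistic",
                          seed=2333)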
