Why couldn't I predict directly using Features Matrix? - machine-learning

[SOLVED] Below is the process where I prepare my new data and try to predict with my trained model, but it fails.
First, I import:
import pandas as pd
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy as np
import numpy.random as nr
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import math
%matplotlib inline
Then I import the data and process it:
##test
##prepare test_data
x_test_data = pd.read_csv('AW_test.csv')
x_test_data.loc[:,x_test_data.dtypes==object].isnull().sum()
##dropnan
cols_of_interest = ['Title','MiddleName','Suffix','AddressLine2']
x_test_data.drop(cols_of_interest,axis=1,inplace=True)
##dropduplicate
x_test_data.drop_duplicates(subset='CustomerID', keep='first',
                            inplace=True)
print(x_test_data.shape)
Then I transform my categorical features into one-hot encoded matrices:
##change categorical variables to numeric variables
def encode_string(cat_features):
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['CountryRegionName','Education','Occupation','Gender','MaritalStatus']

Features = encode_string(x_test_data['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(x_test_data[col])
    Features = np.concatenate([Features, temp], axis=1)
print(Features)
Then I append the remaining numeric features to the matrix:
##add numeric variables
Features = np.concatenate([Features,
                           np.array(x_test_data[['HomeOwnerFlag','NumberCarsOwned',
                                                 'TotalChildren','YearlyIncome']])], axis=1)
Next, I scale the numeric columns of the feature matrix:
##scale numeric variables
import pickle
with open('./lin_reg_scaler.pickle', 'rb') as file:
    scaler = pickle.load(file)
Features[:,-5:] = scaler.transform(Features[:,-5:])
I load the linear regression model I trained in another file (I can post that code if needed):
# Loading the saved linear regression model pickle
import pickle
loaded_model = pickle.load(open('./lin_reg_mod.pickle', 'rb'))
Then I feed the feature matrix directly into the loaded model:
#predict
loaded_model.predict(Features)
However, this is what I got:
array([-5.71697209e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
-5.71697209e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -5.71697209e+12, -4.64634881e+12,
-4.64634881e+12, -5.71697209e+12, -5.71697209e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
-5.71697209e+12, -4.64634881e+12, -5.71697209e+12, -5.71697209e+12,
-4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-5.71697209e+12, -5.71697209e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -4.64634881e+12,
-4.64634881e+12, -5.71697209e+12, -4.64634881e+12, -5.71697209e+12,
-4.64634881e+12, -4.64634881e+12, -4.64634881e+12, -5.71697209e+12,
-5.71697209e+12, -5.71697209e+12, -5.71697209e+12, -4.64634881e+12,............
In my other file, I've successfully trained my model and tested it with my test data.
This is what I got when inputting x_test into my model in that file (the result I want to get):
[83.75482221 66.31820493 47.22211384 ... 69.65032224 88.45908874
58.45193545]
I have no idea what is going on. Can someone help, please?
[UPDATE] Below is my code for training the model:
custs = pd.read_csv('combined_custs.csv')
custs.dtypes
##avemonthspend data
ams = pd.read_csv('AW_AveMonthSpend.csv')
ams.drop_duplicates(subset='CustomerID', keep='first', inplace=True)
##merge
combined_custs=custs.merge(ams)
combined_custs.to_csv('./ams_combined_custs.csv')
combined_custs.head(20)
##change categorical variables to numeric variables
def encode_string(cat_features):
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['CountryRegionName','Education','Occupation','Gender','MaritalStatus']

Features = encode_string(combined_custs['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(combined_custs[col])
    Features = np.concatenate([Features, temp], axis=1)
print(Features.shape)
print(Features[:2,:])
##add numeric variables
Features = np.concatenate([Features,
                           np.array(combined_custs[['HomeOwnerFlag','NumberCarsOwned',
                                                    'TotalChildren','YearlyIncome']])], axis=1)
print(Features.shape)
print(Features)
##train_test_split
nr.seed(9988)
labels = np.array(combined_custs['AveMonthSpend'])
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])
print(x_test.shape)
##scale numeric variables
scaler = preprocessing.StandardScaler().fit(x_train[:,-5:])
x_train[:,-5:] = scaler.transform(x_train[:,-5:])
x_test[:,-5:] = scaler.transform(x_test[:,-5:])
x_train[:2,]
import pickle
file = open('./lin_reg_scaler.pickle', 'wb')
pickle.dump(scaler, file)
file.close()
##define and fit the linear regression model
lin_mod = linear_model.LinearRegression(fit_intercept=False)
lin_mod.fit(x_train,y_train)
print(lin_mod.intercept_)
print(lin_mod.coef_)
import pickle
file = open('./lin_reg_mod.pickle', 'wb')
pickle.dump(lin_mod, file)
file.close()
lin_mod.predict(x_test)
And the predictions from my trained model are:
array([ 78.20673535, 91.11860042, 75.27284767, 63.69507673,
102.10758616, 74.64252358, 92.84218321, 77.9675721 ,
102.18989779, 96.98098962, 87.61415378, 39.37006326,
85.81839618, 78.41392293, 45.49439829, 48.0944897 ,
36.06024114, 70.03880373, 128.90267485, 54.63235443,
52.20289729, 82.61123334, 41.58779815, 57.6456416 ,
46.64014991, 78.38639454, 77.61072157, 94.5899366 ,.....

You are using this method in both training and testing:
def encode_string(cat_features):
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()
by calling:
Features = encode_string(combined_custs['CountryRegionName'])
for col in categorical_columns:
    temp = encode_string(combined_custs[col])
    Features = np.concatenate([Features, temp], axis=1)
But as I said in my comment above, you need to apply the same preprocessing to the test data that you applied to the training data.
What happens here is that the encoding depends on the data in x_test_data, so it can change between files: a string value that was mapped to 0 during training may now be mapped to a different number, and the number and order of columns in your final Features matrix changes.
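As a hypothetical illustration (the country values below are made up, not from the actual data), fitting a fresh LabelEncoder on a test column that is missing a category produces different integer codes and, after one-hot encoding, a different number of columns:
from sklearn import preprocessing

train_col = ['US', 'UK', 'Germany', 'US']   # categories seen during training
test_col = ['US', 'UK', 'US']               # 'Germany' never appears in the test file

enc_train = preprocessing.LabelEncoder().fit(train_col)
enc_test = preprocessing.LabelEncoder().fit(test_col)

print(list(enc_train.classes_))  # e.g. Germany -> 0, UK -> 1, US -> 2
print(list(enc_test.classes_))   # e.g. UK -> 0, US -> 1, and one fewer one-hot column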
To solve this, you need to save the LabelEncoder and OneHotEncoder for each column separately.
So during training, do this:
import pickle
def encode_string(cat_features):
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    # Save the fitted LabelEncoder for this column (cat_features.name is the column name)
    with open('./' + cat_features.name + '_encoder.pickle', 'wb') as encoder_file:
        pickle.dump(enc, encoder_file)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    # Same for the fitted OneHotEncoder
    with open('./' + cat_features.name + '_ohe.pickle', 'wb') as ohe_file:
        pickle.dump(encoded, ohe_file)
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()
And then, during testing:
def encode_string(cat_features):
    # Load the LabelEncoder fitted during training
    with open('./' + cat_features.name + '_encoder.pickle', 'rb') as file:
        enc = pickle.load(file)
    # No fitting, only transform
    enc_cat_features = enc.transform(cat_features)
    # Same for the OneHotEncoder
    with open('./' + cat_features.name + '_ohe.pickle', 'rb') as file:
        encoded = pickle.load(file)
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()
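As a side note (not part of the original answer), a simpler variant, assuming scikit-learn 0.20 or later where OneHotEncoder accepts string columns directly, is to fit a single encoder on all categorical training columns and pickle just that one object; handle_unknown='ignore' keeps unseen test categories from breaking the transform (the file name below is illustrative):
from sklearn import preprocessing
import pickle

categorical_columns = ['CountryRegionName','Education','Occupation','Gender','MaritalStatus']

# During training: fit one encoder on all categorical columns at once and save it
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
train_cats = ohe.fit_transform(combined_custs[categorical_columns]).toarray()
with open('./categorical_ohe.pickle', 'wb') as f:
    pickle.dump(ohe, f)

# During testing: load the fitted encoder and only transform
with open('./categorical_ohe.pickle', 'rb') as f:
    ohe = pickle.load(f)
test_cats = ohe.transform(x_test_data[categorical_columns]).toarray()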

Related

When predicting on a new dataset, should I use scaler.fit_transform(new_dataset) or scaler.transform(new_dataset)?

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
final_poly_converter = PolynomialFeatures(degree=3, include_bias=False)
final_poly_features = final_poly_converter.fit_transform(X)
final_scaler = StandardScaler()
scaled_X = final_scaler.fit_transform(final_poly_features)
from sklearn.linear_model import Lasso
final_model = Lasso(alpha=0.004943070909225827,max_iter=1000000)
final_model.fit(scaled_X,y)
from joblib import dump,load
dump(final_model,'lasso_model.joblib')
dump(final_poly_converter,'lasso_poly_coverter.joblib')
dump(final_scaler,'scaler.joblib')
loaded_converter = load('lasso_poly_coverter.joblib')
loaded_model = load('lasso_model.joblib')
loaded_scaler = load('scaler.joblib')
campaign = [[149,22,12]]
transformed_data = loaded_converter.fit_transform(campaign)
scaled_data = loaded_scaler.transform(transformed_data)# fit_transform or only transform
loaded_model.predict(scaled_data)
The output values change depending on whether I use fit_transform() or transform().
You should always use fit_transform on your training data and transform on the test set and any further predictions. If you refit the scaler on the test pool, the test set would have a different feature distribution than the training set, which is something you don't want. Think of the scaler parameters you fit as part of the model parameters: naturally, you fit all parameters on the training set and then leave them unchanged for test evaluation and prediction.
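A minimal sketch of that workflow (the variable names are illustrative, not from the question above):
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn mean/std from the training data only
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics
X_new_scaled = scaler.transform(X_new)          # same for any future prediction data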

How can I get the feature names after several fit_transform's from sklearn?

I'm running a machine learning model that requires multiple transformations. I applied polynomial transformations, interactions, and also feature selection using SelectKBest:
transformer = ColumnTransformer(
    transformers=[("cat", ce.cat_boost.CatBoostEncoder(y_train), cat_features)]
)
X_train_transformed = transformer.fit_transform(X_train, y_train)
X_test_transformed = transformer.transform(X_test)
poly = PolynomialFeatures(2)
X_train_polynomial = poly.fit_transform(X_train_transformed)
X_test_polynomial = poly.transform(X_test_transformed)
interaction = PolynomialFeatures(2, interaction_only=True)
X_train_interaction = interaction.fit_transform(X_train_polynomial)
X_test_interaction = interaction.transform(X_test_polynomial)
feature_selection = SelectKBest(chi2, k=55)
train_features = feature_selection.fit_transform(X_train_interaction, y_train)
test_features = feature_selection.transform(X_test_interaction)
model = lgb.LGBMClassifier()
model.fit(train_features, y_train)
However, I want to get the feature names, and I have no idea how to get them.
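For what it's worth, here is a rough sketch of one way to track the names through those steps, assuming a recent scikit-learn where the transformers expose get_feature_names_out (and that the CatBoostEncoder inside the ColumnTransformer supports it); this is not from the question:
import numpy as np

cat_names = transformer.get_feature_names_out()               # names after the ColumnTransformer
poly_names = poly.get_feature_names_out(cat_names)            # names after PolynomialFeatures
inter_names = interaction.get_feature_names_out(poly_names)   # names after the interaction step
mask = feature_selection.get_support()                        # boolean mask from SelectKBest
selected_names = np.array(inter_names)[mask]                  # names of the k selected features
print(selected_names)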

keras combine pretrained model

I trained a single model and want to combine it with another Keras model using the functional API (backend is TensorFlow 1.4).
My first model looks like this:
import tensorflow.contrib.keras.api.keras as keras
from tensorflow.contrib.keras.api.keras.layers import Input, Dense, concatenate

input = Input(shape=(200,))
dnn = Dense(400, activation="relu")(input)
dnn = Dense(400, activation="relu")(dnn)
output = Dense(5, activation="softmax")(dnn)
model = keras.models.Model(inputs=input, outputs=output)
After I trained this model, I saved it using the Keras model.save() method. I can also load the model and retrain it without problems.
Now I want to use the output of this model as additional input for a second model:
# load first model
old_model = keras.models.load_model(path_to_old_model)
input_1 = Input(shape=(200,))
input_2 = Input(shape=(200,))
output_old_model = old_model(input_2)
merge_layer = concatenate([input_1, output_old_model])
dnn_layer = Dense(200, activation="relu")(merge_layer)
dnn_layer = Dense(200, activation="relu")(dnn_layer)
output = Dense(10, activation="sigmoid")(dnn_layer)
new_model = keras.models.Model(inputs=[input_1, input_2], outputs=output)
new_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
new_model.fit([x1, x2], labels, epochs=50, batch_size=32)
when I try this I get the following error message:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value dense_1/kernel
[[Node: dense_1/kernel/read = Identity[T=DT_FLOAT, _class=["loc:#dense_1/kernel"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_1/kernel)]]
[[Node: model_1_1/dense_3/BiasAdd/_79 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_68_model_1_1/dense_3/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
I would do this in the following steps:
Define a function for building a clean model with the same architecture:
def build_base():
    input = Input(shape=(200,))
    dnn = Dense(400, activation="relu")(input)
    dnn = Dense(400, activation="relu")(dnn)
    output = Dense(5, activation="softmax")(dnn)
    model = keras.models.Model(inputs=input, outputs=output)
    return input, output, model
Build two copies of the same model:
input_1, output_1, model_1 = build_base()
input_2, output_2, model_2 = build_base()
Set weights in both models:
model_1.set_weights(old_model.get_weights())
model_2.set_weights(old_model.get_weights())
Now do the rest:
merge_layer = concatenate([input_1, output_2])
dnn_layer = Dense(200, activation="relu")(merge_layer)
dnn_layer = Dense(200, activation="relu")(dnn_layer)
output = Dense(10, activation="sigmoid")(dnn_layer)
new_model = keras.models.Model(inputs=[input_1, input_2], outputs=output)
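A short usage sketch for the combined model, reusing x1, x2, and labels from the question (assuming they are NumPy arrays of matching length; this step is implied rather than stated in the answer):
# Compile and train on the two inputs together
new_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
new_model.fit([x1, x2], labels, epochs=50, batch_size=32)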
Let's say you have a pre-trained/saved CNN model called pretrained_model and you want to add densely connected layers to it. Using the functional API, you can write something like this:
from keras import models, layers
kmodel = layers.Flatten()(pretrained_model.output)
kmodel = layers.Dense(256, activation='relu')(kmodel)
kmodel_out = layers.Dense(1, activation='sigmoid')(kmodel)
model = models.Model(pretrained_model.input, kmodel_out)
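If the pretrained base should keep its weights fixed while the new head trains, a common follow-up (not stated in the answer above) is to freeze those layers before compiling, roughly like this:
# Freeze the pretrained layers so only the new Dense head is trained
for layer in pretrained_model.layers:
    layer.trainable = False
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])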

How to tune GaussianNB?

Trying to fit data with GaussianNB() gives me a low accuracy score.
I'd like to try grid search, but it seems that the parameters sigma and theta cannot be set. Is there any way to tune GaussianNB?
You can tune the 'var_smoothing' parameter like this:
nb_classifier = GaussianNB()
params_NB = {'var_smoothing': np.logspace(0, -9, num=100)}
gs_NB = GridSearchCV(estimator=nb_classifier,
                     param_grid=params_NB,
                     cv=cv_method,  # use any cross-validation technique
                     verbose=1,
                     scoring='accuracy')
gs_NB.fit(x_train, y_train)
gs_NB.best_params_
As of version 0.20
GaussianNB().get_params().keys()
returns 'priors' and 'var_smoothing'
A grid search would look like:
pipeline = Pipeline([
    ('clf', GaussianNB())
])
parameters = {
    'clf__priors': [None],
    'clf__var_smoothing': [1e-8, 1e-9, 1e-10]
}
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred_gnb = cv.predict(X_test)
In an sklearn pipeline it may look as follows:
pipe = Pipeline(steps=[
    ('pca', PCA()),
    ('estimator', GaussianNB()),
])
parameters = {'estimator__var_smoothing': [1e-11, 1e-10, 1e-9]}
Bayes = GridSearchCV(pipe, parameters, scoring='accuracy', cv=10).fit(X_train, y_train)
print(Bayes.best_estimator_)
print('best score:')
print(Bayes.best_score_)
predictions = Bayes.best_estimator_.predict(X_test)
Naive Bayes doesn't have any hyperparameters to tune.

svm classifier for text classification

I am trying to classify text with the SVC classifier.
#self.vectorizer = HashingVectorizer(non_negative=True)
#self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
self.hasher = FeatureHasher(input_type='string', non_negative=True)
from sklearn.svm import SVC
self.clf = SVC(probability=True)

training_data = []
for text in self.data_train.data:
    text = self.modifyQuery(text.decode('utf-8','ignore'))
    training_data.append(text)

raw_X = (self.token_ques(text) for text in training_data)
#X_train = self.vectorizer.transform(training_data)
X_train = self.hasher.transform(raw_X)
y_train = self.data_train.target
self.clf.fit(X_train, y_train)
test classifier:
raw_X = (self.token_ques(text) for text in test_data)
X_test = self.hasher.transform(raw_X)
#X_test = self.vectorizer.transform(test_data)
pred = self.clf.predict(X_test)
print("pred=>", pred)
self.categories = self.data_train.target_names
for doc, category in zip(test_data, pred):
    print('%r => %s' % (doc, self.categories[category]))
index = 1
predict_prob = self.clf.predict_proba(X_test)
for doc, category_list in zip(test_data, predict_prob):
    #print values
    pass
I tried the hashing, feature, and tfidf vectorizers, but it still gives the wrong answer for all queries (the class with the largest amount of data always comes out as the answer). With naive Bayes it gives the correct result for each class and input query.
Am I doing anything wrong in the code?
Update
I have 8 classes in total, and each class has 100-200 lines of sentences, except for one class with 480 lines. That class currently always comes out as the answer.
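(Not an answer from the original thread.) Given that one class is roughly three to five times larger than the others, one small thing worth trying, as a hedged suggestion, is class weighting in SVC, which scales the penalty for each class inversely to its frequency:
from sklearn.svm import SVC

# class_weight='balanced' adjusts C per class by n_samples / (n_classes * class_count),
# so the largest class no longer dominates; this replaces the self.clf line in the question.
clf = SVC(probability=True, class_weight='balanced')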
