When predicting new dataset should I use scaler.fit_trasform(new_dataset) or scaler.transform(new_dataset) - machine-learning

final_poly_converter = PolynomialFeatures(degree=3,include_bias=False)
final_poly_features = final_poly_converter.fit_transform(X)
final_scaler = StandardScaler()
scaled_X = final_scaler.fit_transform(final_poly_features)
from sklearn.linear_model import Lasso
final_model = Lasso(alpha=0.004943070909225827,max_iter=1000000)
final_model.fit(scaled_X,y)
from joblib import dump,load
dump(final_model,'lasso_model.joblib')
dump(final_poly_converter,'lasso_poly_coverter.joblib')
dump(final_scaler,'scaler.joblib')
loaded_converter = load('lasso_poly_coverter.joblib')
loaded_model = load('lasso_model.joblib')
loaded_scaler = load('scaler.joblib')
campaign = [[149,22,12]]
transformed_data = loaded_converter.fit_transform(campaign)
scaled_data = loaded_scaler.transform(transformed_data)# fit_transform or only transform
loaded_model.predict(scaled_data)
The output values change when I use fit_transform() and when I use transform()

You should always use fit_transform on your train and transform on test and further predictions. If you refit your scaler on test pool you would have a different feature distribution in your test set vs train set which is something you don't want to happen. Think of scaler params that you fit as part of the model parameters. Naturally you fit all the parameters on the training set and then you don't change them on the test evaluation/prediction.

Related

Fine tuning a BERT Model as a chatbot giving error while training

I have been trying to fine tune a BERT model to give response sentences like a character based on input sentences but I am getting a rather odd error every time . the code is
`
Here sourcetexts is a list of sentences that give the context and target_text is a list of sentences that give response to context statments
from transformers import AutoModel, AutoTokenizer
model = AutoModel.from_pretrained("bert-base-cased").to(device)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
input_ids = \[\]
output_ids = \[\]
for i in range (0 , len(source_text):
input_ids.append(tokenizer.encode(source_texts\[i\], return_tensors="pt"))
output_ids.append(tokenizer.encode(target_texts\[i\], return_tensors="pt"))
import torch
device = torch.device("cuda")
from transformers import BertForMaskedLM, AdamW
model = BertForMaskedLM.from_pretrained("bert-base-cased")
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
def train(input_id, output_id):
input_id = input_id.to(device)
output_id = output_id.to(device)
model.zero_grad()
logits, _ = model(input_id, labels=output_id)
# Compute the loss
loss = loss_fn(logits.view(-1, logits.size(-1)), output_id.view(-1))
loss.backward()
optimizer.step()
return loss.item()
for epoch in range(50):
\# Train the model on the training dataset
train_loss = 0.0
for input_sequences, output_sequences in zip(input_ids, output_ids):
input_sequences = input_sequences.to(device)
output_sequences = output_sequences.to(device)
train_loss += train(input_sequences, output_sequences)
This is the Error that I am getting
Any help would be really appreciated .
Pls help!!
Hi i saw your code but you didn't move your model to GPU, only the inputs, pytorch by default is on CPU
import torch
device = torch.device('cuda')
model = BertForMaskedLM.from_pretrained("bert-base-cased")
model.to(device)

SRGAN model is not learning

As the model is not learning I wanted to make sure whether I coded the below lines correctly.
This lines shows the architecture of SRGAN model:-
generated_image = generator(input_low_res_img_layer)
discriminator_output = discriminator(generated_image)
fake_features = vgg_model(generated_image)
GAN_model = tf.keras.models.Model(inputs = input_low_res_img_layer, outputs = [discriminator_output,fake_features])
Below lines are used for the compilation of the models:-
my_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
vgg_model.compile(loss='mse', optimizer=my_optimizer, metrics=['accuracy'])
discriminator.compile(loss='binary_crossentropy', optimizer= my_optimizer, metrics=['accuracy'])
GAN_model.compile(loss=['binary_crossentropy','mse'], loss_weights=[1e-3,1],metrics=['accuracy'])
Below lines are GAN and discriminator model fit codes:
discriminator.trainable = True
discriminator.fit(real_and_fake_images,real_and_fake_labels)
discriminator.trainable = False
GAN_model.fit(low_res_img_batch, [np.ones((batch_size,1)),vgg_model(high_res_img_batch)])
The image shows the result of the training. The first metric of each mini-batch is for the discriminator and the second line is for the GAN network.
In case needed, Full code here

How to use the best parameter as parameter of a classifier in GridSearchCV?

I have a function called svc_param_selection(X, y, n) which returns best_param_.
Now I want to use the best_params returned as the parameter of a classifier like:
.
parameters = svc_param_selection(X, y, 2)
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC
param_grid = ParameterGrid(parameters)
for params in param_grid:
svc_clf = SVC(**params)
print (svc_clf)
classifier2=SVC(**svc_clf)
It seems parameters is not a grid here..
You can use GridSearchCV to do this. There is a example here:
# Applying GridSearch to find best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{ 'criterion' : ['gini'], 'splitter':['best','random'], 'min_samples_split':[0.1,0.2,0.3,0.4,0.5],
'min_samples_leaf': [1,2,3,4,5]},
{'criterion' : ['entropy'], 'splitter':['best','random'], 'min_samples_split':[0.1,0.2,0.3,0.4,0.5],
'min_samples_leaf': [1,2,3,4,5]} ]
gridsearch = GridSearchCV(estimator = classifier, param_grid = parameters,refit= False, scoring='accuracy', cv=10)
gridsearch = gridsearch.fit(x,y)
optimal_accuracy = gridsearch.best_score_
optimal_parameters = gridsearch.best_params_
But for param_grid of GridSearchCV, you should pass a dictionary of parameter name and value for you classifier. For example a classifier like this:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state=0, presort=True,
criterion='entropy')
classifier = classifier.fit(x_train,y_train)
Then after finding best parameters by GridSearchCV you apply them on you model.
#Ben At the start of gridsearch, you either specify the classifier outside the param_grid (if you have only one classification method to check) or inside the param_grid. I have made a check for the 'inside' case only.
First, I set the 'classifier' key in the param_grid. That is the key which you need to ask for in the end.
param_grid = [
{'classifier' : [LogisticRegression()],
...
},
{'classifier' : [RandomForestClassifier()],
}
]
As an example, the result of gridsearch.best_params_ is:
{'classifier': RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=2),
'classifier__criterion': 'entropy',
'classifier__max_depth': 2,
'classifier__min_samples_leaf': 1,
'classifier__n_estimators': 2}
Then ask this dictionary gridsearch.best_params_ for the key that you called the 'classifier'.
clfBest = clfGridSearchBest.best_params_['classifier']
clfBest:
RandomForestClassifier(criterion='entropy', max_depth=2, n_estimators=2)
Now just fit clfBest.

How to tune GaussianNB?

Trying to fit data with GaussianNB() gives me low accuracy score.
I'd like to try Grid Search, but it seems that parameters sigma and theta cannot be set. Is there anyway to tune GausssianNB?
You can tune 'var_smoothing' parameter like this:
nb_classifier = GaussianNB()
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
gs_NB = GridSearchCV(estimator=nb_classifier,
param_grid=params_NB,
cv=cv_method, # use any cross validation technique
verbose=1,
scoring='accuracy')
gs_NB.fit(x_train, y_train)
gs_NB.best_params_
As of version 0.20
GaussianNB().get_params().keys()
returns 'priors' and 'var_smoothing'
A grid search would look like:
pipeline = Pipeline([
('clf', GaussianNB())
])
parameters = {
'clf__priors': [None],
'clf__var_smoothing': [0.00000001, 0.000000001, 0.00000001]
}
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred_gnb = cv.predict(X_test)
In an sklearn pipeline it may look as follows:
pipe = Pipeline(steps=[
('pca', PCA()),
('estimator', GaussianNB()),
])
parameters = {'estimator__var_smoothing': [1e-11, 1e-10, 1e-9]}
Bayes = GridSearchCV(pipe, parameters, scoring='accuracy', cv=10).fit(X_train, y_train)
print(Bayes.best_estimator_)
print('best score:')
print(Bayes.best_score_)
predictions = Bayes.best_estimator_.predict(X_test)
Naive Bayes doesn't have any hyperparameters to tune.

svm classifier for text classification

I am trying with SVC classifier to classify text.
#self.vectorizer = HashingVectorizer(non_negative=True)
#self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
self.hasher = FeatureHasher(input_type='string',non_negative=True)
from sklearn.svm import SVC
self.clf = SVC(probability=True)
for text in self.data_train.data:
text = self.modifyQuery(text.decode('utf-8','ignore'))
training_data.append(text)
raw_X = (self.token_ques(text) for text in training_data)
#X_train = self.vectorizer.transform(training_data)
X_train = self.hasher.transform(raw_X)
y_train = self.data_train.target
self.clf.fit(X_train, y_train)
test classifier:
raw_X = (self.token_ques(text) for text in test_data)
X_test = self.hasher.transform(raw_X)
#X_test = self.vectorizer.transform(test_data)
pred = self.clf.predict(X_test)
print("pred=>", pred)
self.categories = self.data_train.target_names
for doc, category in zip(test_data, pred):
print('%r => %s' % (doc, self.categories[category]))
index = 1
predict_prob = self.clf.predict_proba(X_test)
for doc, category_list in zip(test_data, predict_prob):
#print values
I tried with hashing, feature, tfidf vectorizer but still it gives wrong answer for all queries (class with highest datasize comes as answer). While using naive bayes it gives correct result as per class and input query.
Am I doing anything wrong in code?
Update
I have total 8 classes, and each class having 100-200 lines of sentences. One class with 480 lines. This class always comes as a answer currently

Resources