BaggingClassifier takes the whole dataset each time - machine-learning

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
import numpy as np
import random
from sklearn.svm import SVC
X=np.random.rand(1000,2)
Y=[random.randint(0,1) for x in range(0,1000)]
svm=BaggingClassifier(SVC(kernel='rbf', random_state=123, gamma=.000001, C=100000, class_weight='balanced'), max_samples=1/5.0, n_estimators=5, n_jobs=-1,random_state=123)
classfier=svm.fit(X,Y)
print(len(svm.estimators_samples_))
print(len(svm.estimators_samples_[0]))  # here I expect 0.2 * 1000 = 200 samples, but the result is 1000.
In this code, I try to apply BaggingClassifier with an SVM. As discussed in the scikit-learn documentation, max_samples fixes the maximum number of samples used by each estimator. However, I notice that each of the estimators (n_estimators=5) takes the whole dataset! Is this a bug?

svm.estimators_samples_[0] returns an array with the same length as the data. It is populated with boolean values; the positions where the value is True are the indices of the data points used by that estimator.
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
import numpy as np
import random
from sklearn.svm import SVC
X=np.random.rand(1000,2)
Y=[random.randint(0,1) for x in range(0,1000)]
svm=BaggingClassifier(SVC(kernel='rbf', random_state=123, gamma=.000001, C=100000, class_weight='balanced'), max_samples=1/5.0, n_estimators=5, n_jobs=-1,random_state=123)
classfier=svm.fit(X,Y)
print(len([i for i in svm.estimators_samples_[0] if i == True]))
Running the above code I get:
181
This is slightly below the 200 draws implied by max_samples=0.2 because bagging samples with replacement by default, so some indices are drawn more than once.
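If you just want the number of distinct rows each base estimator saw, here is a shorter check, continuing from the snippet above (this assumes estimators_samples_ returns boolean masks as described in the answer; recent scikit-learn versions return index arrays instead, in which case count the unique indices):
import numpy as np
# Count how many distinct rows of X each base estimator actually saw
for i, mask in enumerate(svm.estimators_samples_):
    print(f"estimator {i}: {np.sum(mask)} samples")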

Related

When I run a GridSearchCV() classifier with these parameters, I get this error: ValueError: could not convert string to float: 'text'

How can I resolve this error? Can anyone guide me, please?
X = df.iloc[:,:-2]
y = df.My_Labels
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV
log_class=LogisticRegression()
#grid={'C':10.0 **np.arange(-2,3),'penalty':['l1','l2'],'solver': [ 'lbfgs', 'liblinear']}
grid={'C':10.0 **np.arange(-2,3),'penalty': ['l1'], 'solver': [ 'lbfgs', 'liblinear', 'sag', 'saga'],'penalty': ['l2'], 'solver': ['newton-cg']}
cv=KFold(n_splits=5,random_state=None,shuffle=False)
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=10)
clf=GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train.astype(str),y_train)
This is the error I get when I run the above code for pure text classification:
ValueError Traceback (most recent call last)
<ipython-input-18-4d99cebd483c> in <module>
17
18 clf=GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')
---> 19 clf.fit(X_train.astype(str),y_train)
ValueError: could not convert string to float: 'great kindle text sucks cant use calling feature phone im connected wifi makes great calls'
It looks like you have text in your dataset; use scikit-learn's LabelEncoder to encode it into numeric values (see the LabelEncoder documentation if you are not familiar with it).
Looking at your error, I think you have a review column. If you don't want to label-encode it, use some NLP techniques and perform sentiment analysis instead.
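If you go the NLP route, one possible sketch is to vectorize the review text with TF-IDF inside a Pipeline, so GridSearchCV tunes the classifier on numeric features (the column name 'review' is only an assumption here; use whatever column holds your text):
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),            # turns raw text into numeric TF-IDF features
    ('clf', LogisticRegression(max_iter=1000)),
])
grid = {'clf__C': 10.0 ** np.arange(-2, 3)}  # prefix parameters with the pipeline step name
clf = GridSearchCV(pipe, grid, cv=5, n_jobs=-1, scoring='f1_macro')
clf.fit(X_train['review'], y_train)          # pass the raw text column, not the whole frame cast to str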

Linear Regression script not working in Python

I tried running my Machine Learning LinearRegression code, but it is not working. Here is the code:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
df = pd.read_csv(r'C:\Users\SVISHWANATH\Downloads\datasets\GGP_data.csv')
df["OHLC"] = (df.open+df.high+df.low+df.close)/4
df['HLC'] = (df.high+df.low+df.close)/3
df.index = df.index+1
reg = LinearRegression()
reg.fit(df.index, df.OHLC)
Basically, I just imported a few libraries, used the read_csv function, and called the LinearRegression() function, and this is the error:
ValueError: Expected 2D array, got 1D array instead:
array=[ 1 2 3 ... 1257 1258 1259].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or
array.reshape(1, -1) if it contains a single sample
Thanks!
As mentioned in the error message, you need to give the fit method a 2D array.
df.index is a 1D array. You can do it this way:
reg.fit(df.index.values.reshape(-1, 1), df.OHLC)
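To see what the reshape does, here is a tiny standalone example:
import numpy as np

idx = np.array([1, 2, 3, 4])   # shape (4,) - a 1D array, which fit() rejects
X_2d = idx.reshape(-1, 1)      # shape (4, 1) - one column, i.e. one feature per row
print(idx.shape, X_2d.shape)   # (4,) (4, 1)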

Value Error - Error when checking target - LSTM

About the dataset
The following Reuters dataset contains 11228 texts that correspond to news items classified into 46 categories. The texts are encoded in the sense that each word corresponds to an integer. I specify that we want to work with 2000 words.
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
num_words = 2000
(reuters_train_x, reuters_train_y), (reuters_test_x, reuters_test_y) = tf.keras.datasets.reuters.load_data(num_words=num_words)
n_labels = np.unique(reuters_train_y).shape[0]
print("labels: {}".format(n_labels))
# This is the first news item
print(reuters_train_x[0])
Implementing the LSTM
I need to implement a network with a single LSTM with 10 units. The input needs an embedding with 10 dimensions before entering the LSTM cell. Finally, a dense layer needs to be added to match the number of outputs to the number of categories.
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
reuters_train_y = to_categorical(reuters_train_y, 46)
reuters_test_y = to_categorical(reuters_test_y, 46)
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=10))
model.add(LSTM(10))
model.add(Dense(46,activation='softmax'))
Training
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
history = model.fit(reuters_train_x,reuters_train_y,epochs=20,validation_data=(reuters_test_x,reuters_test_y))
The error message that I get is:
ValueError: Error when checking target: expected dense_2 to have shape (46,) but got array with shape (1,)
You need to one-hot-encode your y labels.
from tensorflow.keras.utils import to_categorical
reuters_train_y = to_categorical(reuters_train_y, 46)
reuters_test_y = to_categorical(reuters_test_y, 46)
Another bug I see is in the fit call: you are passing validation_data=(reuters_test_x, reuters_train_y), but it should be validation_data=(reuters_test_x, reuters_test_y).
Your x is a numpy array of lists with different lengths. You need to pad the sequences to get a fixed shape numpy array.
reuters_train_x = tf.keras.preprocessing.sequence.pad_sequences(
reuters_train_x, maxlen=50
)
reuters_test_x = tf.keras.preprocessing.sequence.pad_sequences(
reuters_test_x, maxlen=50
)
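Putting the pieces together, a minimal end-to-end sketch starting from the freshly loaded Reuters data (maxlen=50 is an arbitrary choice here):
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical

num_words = 2000
(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.reuters.load_data(num_words=num_words)

# Pad the variable-length integer sequences to a fixed length
x_tr = tf.keras.preprocessing.sequence.pad_sequences(x_tr, maxlen=50)
x_te = tf.keras.preprocessing.sequence.pad_sequences(x_te, maxlen=50)

# One-hot encode the 46 category labels
y_tr = to_categorical(y_tr, 46)
y_te = to_categorical(y_te, 46)

model = Sequential([
    Embedding(input_dim=num_words, output_dim=10),  # 10-dimensional embedding
    LSTM(10),                                       # single LSTM with 10 units
    Dense(46, activation='softmax'),                # one output per category
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(x_tr, y_tr, epochs=20, validation_data=(x_te, y_te))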

How to show kdeplot in a 5*4 subplot?

I am working on a machine learning project and am using the seaborn kdeplot to show the feature distributions after standard scaling. However, no matter how large I make the figure size, the graphs won't show and I get the error: AttributeError: 'numpy.ndarray' object has no attribute 'plot'. The image I want to produce is a 5*4 subplot grid that looks like this:
expected subplot image
#feature scaling
#since numerical attributes have very different scales,
#we use standardization to get all attributes to have the same scale
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
matplotlib.style.use('ggplot')
scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(train_set)
scaled_df = pd.DataFrame(scaled_df, columns=["SaleAmount","SaleCount","ReturnAmount","ReturnCount",
"KeyedAmount","KeyedCount","VoidRejectAmount","VoidRejectCount","RetrievalAmount",
"RetrievalCount","ChargebackAmount","ChargebackCount","DepositAmount","DepositCount",
"NetDeposit","AuthorizationAmount","AuthorizationCount","DeclinedAuthorizationAmount","DeclinedAuthorizationCount"])
fig, axes = plt.subplots(figsize=(20,10), ncols=5, nrows=4)
sns.kdeplot(scaled_df['SaleAmount'], ax=axes[0])
sns.kdeplot(scaled_df['SaleCount'], ax=axes[1])
sns.kdeplot(scaled_df['ReturnAmount'], ax=axes[2])
sns.kdeplot(scaled_df['ReturnCount'], ax=axes[3])
sns.kdeplot(scaled_df['KeyedAmount'], ax=axes[4])
sns.kdeplot(scaled_df['KeyedCount'], ax=axes[5])
sns.kdeplot(scaled_df['VoidRejectAmount'], ax=axes[6])
sns.kdeplot(scaled_df['VoidRejectCount'], ax=axes[7])
sns.kdeplot(scaled_df['RetrievalAmount'], ax=axes[8])
sns.kdeplot(scaled_df['RetrievalCount'], ax=axes[9])
sns.kdeplot(scaled_df['ChargebackAmount'], ax=axes[10])
sns.kdeplot(scaled_df['ChargebackCount'], ax=axes[11])
sns.kdeplot(scaled_df['DepositAmount'], ax=axes[12])
sns.kdeplot(scaled_df['DepositCount'], ax=axes[13])
sns.kdeplot(scaled_df['NetDeposit'], ax=axes[14])
sns.kdeplot(scaled_df['AuthorizationAmount'], ax=axes[15])
sns.kdeplot(scaled_df['AuthorizationCount'], ax=axes[16])
sns.kdeplot(scaled_df['DeclinedAuthorizationAmount'], ax=axes[17])
sns.kdeplot(scaled_df['DeclinedAuthorizationCount'], ax=axes[18])
Note that axes is a two-dimensional array (4 rows by 5 columns here), so you need to index it with a row and a column, for example:
sns.kdeplot(scaled_df['DeclinedAuthorizationCount'], ax=axes[3, 3])
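Alternatively, flattening the axes array avoids tracking row/column pairs by hand; a sketch, continuing from the question's scaled_df:
# Flatten the 4x5 grid of Axes so it can be indexed with a single integer
fig, axes = plt.subplots(figsize=(20, 10), ncols=5, nrows=4)
flat_axes = axes.flatten()
for ax, col in zip(flat_axes, scaled_df.columns):
    sns.kdeplot(scaled_df[col], ax=ax)
# Hide the unused 20th subplot (19 columns, 20 axes)
flat_axes[len(scaled_df.columns)].set_visible(False)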

Keras Regressor giving different prediction for my input everytime

I built a Keras regressor using the following code:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as ny
import pandas
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
X = ny.array([[1,2], [3,4], [5,6], [7,8], [9,10]])
sc_X=StandardScaler()
X_train = sc_X.fit_transform(X)
Y = ny.array([3, 4, 5, 6, 7])
Y=ny.reshape(Y,(-1,1))
sc_Y=StandardScaler()
Y_train = sc_Y.fit_transform(Y)
N = 5
def brain():
    # Create the brain
    br_model = Sequential()
    br_model.add(Dense(3, input_dim=2, kernel_initializer='normal', activation='relu'))
    br_model.add(Dense(2, kernel_initializer='normal', activation='relu'))
    br_model.add(Dense(1, kernel_initializer='normal'))
    # Compile the brain
    br_model.compile(loss='mean_squared_error', optimizer='adam')
    return br_model

def predict(X, sc_X, sc_Y, estimator):
    prediction = estimator.predict(sc_X.fit_transform(X))
    return sc_Y.inverse_transform(prediction)
estimator = KerasRegressor(build_fn=brain, epochs=1000, batch_size=5,verbose=0)
# print "Done"
estimator.fit(X_train,Y_train)
prediction = estimator.predict(X_train)
print(predict(X, sc_X, sc_Y, estimator))
X_test = ny.array([[1.5,4.5], [7,8], [9,10]])
print(predict(X_test, sc_X, sc_Y, estimator))
The issue I face is that the code does not predict the same value: for example, it predicts 6.64 for [9,10] in the first prediction (on X) and 6.49 for [9,10] in the second prediction (on X_test).
The full output is this:
[2.9929883 4.0016675 5.0103474 6.0190268 6.6434317]
[3.096634 5.422326 6.4955378]
Why do I get different values and how do I resolve them?
The problem lies in this line of code:
prediction = estimator.predict(sc_X.fit_transform(X))
You are fitting a new scaler every time you predict values for new data. This is where the differences come from. Try:
prediction = estimator.predict(sc_X.transform(X))
In this case, you use a pretrained scaler.
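Applied to the question's helper, the corrected function might look like this (a sketch; the reshape is added because StandardScaler's inverse_transform expects a 2D array):
def predict(X, sc_X, sc_Y, estimator):
    # Reuse the scaler already fitted on the training data; do not refit it on new data
    prediction = estimator.predict(sc_X.transform(X))
    # StandardScaler works on 2D arrays, so reshape the 1D predictions before inverting
    return sc_Y.inverse_transform(prediction.reshape(-1, 1))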
