Random Forest on Categorical Data with low accuracy - machine-learning

I am trying to build a model that, given an item, predicts which store it belongs to.
I have a dataset of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category, Sub Category, Price, Store Identifier (the y variable).
The data seems balanced, as every store has around ~10 items.
With the help of #Marcus V. I succeeded in encoding the categorical columns correctly, but cannot produce better results than 0.52 for a RandomForest with 15 estimators and an entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Transformer that selects a subset of columns by name
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X.loc[:, self.names]
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]

# Label-encode the two categorical columns
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])

numeric = ["num1"]
categorical = ["cat1", "cat2"]

# Scale the numeric column; one-hot encode the categorical ones
# (newer sklearn versions spell the argument sparse_output=False)
pipe = Pipeline([
    ("features", FeatureUnion([
        ("numeric", make_pipeline(Columns(names=numeric), StandardScaler())),
        ("categorical", make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False))),
    ])),
])
X = pipe.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)
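One thing that may be missing: a single 80/20 split on ~300 rows gives a noisy accuracy estimate, and 15 trees is quite small for a random forest. A minimal sketch of cross-validated tuning on the same transformed features (the parameter values below are illustrative assumptions, not tuned choices):

from sklearn.model_selection import GridSearchCV

# X here is the transformed matrix produced by pipe.fit_transform(X) above
param_grid = {
    "n_estimators": [50, 100, 300],   # illustrative values, not tuned
    "max_depth": [None, 5, 10],
    "criterion": ["gini", "entropy"],
}
search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid,
    cv=5,                 # 5-fold CV gives a steadier estimate than one split
    scoring="accuracy",
)
search.fit(X, y)
print(search.best_params_, search.best_score_)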

Related

The number of classes has to be greater than one; got 1 class in SVM

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Generate two ring-shaped point sets: outer arc (class 0) and inner circle (class 1)
x = np.linspace(-5.0, 5.0, 100)
y = np.sqrt(10**2 - x**2)
y = np.hstack([y, -y])
x = np.hstack([x, -x])
x1 = np.linspace(-5.0, 5.0, 100)
y1 = np.sqrt(5**2 - x1**2)
y1 = np.hstack([y1, -y1])
x1 = np.hstack([x1, -x1])
plt.scatter(y, x)
plt.scatter(y1, x1)
# print(plt.show())

df1 = pd.DataFrame(np.vstack([y, x]).T, columns=['X1', 'X2'])
df1['Y'] = 0
df2 = pd.DataFrame(np.vstack([y1, x1]).T, columns=['X1', 'X2'])
df2['Y'] = 1
df1.merge(df2)  # note: merge returns a new frame and leaves df1 unchanged (see answer below)

# We need to find components for the polynomial kernel:
# X1, X2, X1_square, X2_square, X1*X2
df1['X1_Square'] = df1['X1']**2
df1['X2_Square'] = df1['X2']**2
df1['X1*X2'] = df1['X1'] * df1['X2']
# print(df1.head())

### Independent and dependent features
X = df1[['X1', 'X2', 'X1_Square', 'X2_Square', 'X1*X2']]
y = df1['Y']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
classifier = SVC(kernel="linear")
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)
ValueError: The number of classes has to be greater than one; got 1 class
I don't know how to resolve this error. Maybe there is an error in the merge of the two data frames, or I need to append df1 and df2, but I tried that and it doesn't work.
The error occurs because y contains only the single value 0: df1.merge(df2) returns a new DataFrame without modifying df1 in place, so y = df1['Y'] still holds only class 0.
You can replace the df1.merge(df2) line with:
df1 = pd.concat([df1,df2])
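With that one-line fix applied before the feature engineering, both classes flow through the rest of the script unchanged. A minimal sketch of the corrected sequence (variable names as in the question):

df1 = pd.concat([df1, df2])            # stack both classes into one frame
# the polynomial-kernel components are now computed for both classes
df1['X1_Square'] = df1['X1']**2
df1['X2_Square'] = df1['X2']**2
df1['X1*X2'] = df1['X1'] * df1['X2']
X = df1[['X1', 'X2', 'X1_Square', 'X2_Square', 'X1*X2']]
y = df1['Y']                           # now contains both 0 and 1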

LSTM multiclass classifier : why my predictions have nothing to do with my target set?

I am trying to design an LSTM model for forecasting price movement.
I have issues regarding the results I obtain for my predictions. I did not normalize my target set y (neither train nor test), only X, because it's a classification problem (-1, 0, 1), but the predictions I obtain are floats.
Maybe I did not normalize the right sets. My code is below.
Many thanks for your help, and feel free to comment on my other lines of code too; I am a beginner.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime as dt
from pandas_datareader import data as pdr
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

startdate = dt(2018, 3, 31)
enddate = dt(2022, 3, 31)
tickers = ['ETH-USD']
Data = pdr.get_data_yahoo(tickers, start=startdate, end=enddate)['Adj Close']

# Daily log returns, in percent
df_change = Data.apply(lambda x: np.log(x) - np.log(x.shift(1)))
df_change.drop(index=df_change.index[0], axis=0, inplace=True)
df_change = df_change * 100

pd.options.mode.chained_assignment = None  # silence the copy-of-a-slice warning
df_y = df_change.copy()
df_y.columns = ['ETH-y']

# Map each return to a class: 0 = steady (within +/-0.5), 1 = up, -1 = down.
# The steady band is checked first so that small positive moves are not labeled as up.
def Target(df, column, df2, column2):
    for i in range(len(df)):
        if -0.5 < df[column].iloc[i] < 0.5:
            df2[column2][i] = 0   # value is steady
        elif df[column].iloc[i] > 0:
            df2[column2][i] = 1   # value is up compared to the previous day
        else:
            df2[column2][i] = -1  # value is down

Target(df_change, 'ETH-USD', df_y, 'ETH-y')
print(df_y['ETH-y'].value_counts())

Data.drop(index=Data.index[0], axis=0, inplace=True)  # drop first row so X and y cover the same dates
X = Data
y = df_y
## split my train, val, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

sc = MinMaxScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)  # transform only: reuse the scaler fitted on the training set

# Reshape to the 3D (samples, timesteps, features) array the LSTM expects
X_train = np.reshape(X_train, (X_train.shape[0], 1, 1))
X_test = np.reshape(X_test, (X_test.shape[0], 1, 1))

model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1]))
model.compile(optimizer='adam', loss='mse')
model.summary()

history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.1, verbose=1)
pred = model.predict(X_test)
# note: sc was fitted on X, so applying sc.inverse_transform to the class predictions would be meaningless

plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
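The predictions are floats because Dense(y_train.shape[1]) trained with loss='mse' is a regression head, not a classifier. A minimal sketch of a 3-class setup (my suggestion, not the original code): remap the labels {-1, 0, 1} to {0, 1, 2}, use a softmax output, and take the argmax of the predicted probabilities:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import numpy as np

# Remap the labels {-1, 0, 1} to {0, 1, 2} for sparse_categorical_crossentropy
y_train_cls = (np.asarray(y_train).ravel() + 1).astype(int)
y_test_cls = (np.asarray(y_test).ravel() + 1).astype(int)

clf = Sequential([
    LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    LSTM(32),
    Dropout(0.2),
    Dense(3, activation='softmax'),   # one probability per class
])
clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
clf.fit(X_train, y_train_cls, epochs=10, batch_size=16, validation_split=0.1)

# Integer class predictions (0/1/2), mapped back to -1/0/1
pred_cls = clf.predict(X_test).argmax(axis=1) - 1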

ValueError when making predictions with LinearRegression

I have started learning ML.
This is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Import the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
# Split the data set into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Simple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train , Y_train)
# Predicting the Test set Results
y_pred = regressor.predict(X_test)
I am getting the error:
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a
minimum of 1 is required.
for the last line. How do I resolve this?
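A likely cause, assuming the script runs under Python 2: there, 1/3 is integer division and evaluates to 0, so test_size=1/3 leaves X_test with zero rows and predict fails with exactly this error. A minimal fix is to pass a float test size (or add from __future__ import division at the top):

# Under Python 2, 1/3 == 0, which produces an empty test set.
# Passing an explicit float avoids the integer-division pitfall:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=0)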

How to predict the outcome of a new patient using this SVC ML model trained on a dataset

Data description:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(0)

# Alternatively X and y can be obtained directly from the frame attribute;
# df is the patient DataFrame loaded earlier (not shown in the question)
X = df.drop('OUTCOME', axis=1)
y = df['OUTCOME']

numeric_features = ['Age', 'PCT', 'CURB 65', 'pO2']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['Sex', 'CXR', 'Hospitalisation in last 3 months', 'ICU>72hrs', 'Blood']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

from sklearn.svm import SVC
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', SVC())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
The accuracy of the model is 1.0.
How do I enter a new patient's parameters to get a prediction for that patient? The new patient is not from this dataset.
I tried to enter the new patient's information as follows:
New_patient = [70,1,0,2,2,10,40,2,20,68,1,2,0,1,1]
clf.predict([New_patient])
output: ValueError: Specifying the columns using strings is only supported for pandas DataFrames
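The ColumnTransformer selects columns by name, so the new sample has to be wrapped in a DataFrame with the same columns as the training data rather than passed as a bare list. A minimal sketch, assuming the 15 values are ordered exactly as the columns of X:

import pandas as pd

# Wrap the raw values in a one-row DataFrame that reuses the training columns,
# so the pipeline's column selection by name works on the new sample too
new_patient_df = pd.DataFrame([New_patient], columns=X.columns)
print(clf.predict(new_patient_df))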

ROC curve is not acutally a curve

I have plotted a few ROC curves to calculate the AUC, but my ROC curves don't actually plot like curves. I have attached the images for better understanding; if anyone can tell me what is wrong there, I will be obliged. This is one kind of plot I am getting.
This is the other type.
However, I am not getting a curve like this one.
This is the link to my dataset:
https://drive.google.com/open?id=1luj8d863_IOA36cQTo772GEWgUsrXlbJ
I would be thankful if anyone could help me understand the problem, if there is one, or, if my curves are correct, explain why they do not actually have a curve-like structure.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from keras.layers import Dense, Input
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Model,Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
seed = 7
np.random.seed(seed)
dataset = pd.read_csv('dataset/prostate.csv')
labels = dataset.values[:,-1]
features_set = dataset.iloc[:,0:12600]
oversampler = SMOTE(random_state=0)
# fit_resample in current imblearn (older versions called this fit_sample)
oversampler_feature_set, oversampler_labels = oversampler.fit_resample(features_set, labels)
feature_df = pd.DataFrame(oversampler_feature_set)
labels_df = pd.DataFrame(oversampler_labels)
scalar = MinMaxScaler()
scaled_data = scalar.fit_transform(feature_df)
pca = PCA(n_components=30)
pca_data = pd.DataFrame(pca.fit_transform(scaled_data))
recreated_df = pd.concat([pca_data,labels_df], axis=1)
train, test = train_test_split(recreated_df,test_size=0.2)
X_train = train.values[:,0:30]
Y_train = train.values[:,-1]
X_test = test.values[:,0:30]
y_test = test.values[:,-1]
def my_model():
    model = Sequential()
    model.add(Dense(20, input_dim=30, activation='sigmoid'))
    model.add(Dense(10, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=my_model, epochs=1000, batch_size=10, shuffle=True, verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train, Y_train, cv=kfold)
results.mean()

estimator.fit(X_train, Y_train)
y_pred = estimator.predict(X_test).ravel()

# roc_curve returns the false positive rate and true positive rate, not sensitivity/specificity
fpr, tpr, thresholds_keras = roc_curve(y_test, y_pred, pos_label=2)
auc_keras = auc(fpr, tpr)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Prostate')
plt.legend(loc='best')
plt.show()
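The step-shaped "curve" is expected here: estimator.predict returns hard class labels, so roc_curve sees only one effective threshold and the plot collapses to a couple of points joined by straight lines. Passing continuous scores restores the curve; a minimal sketch, assuming the scikit-learn wrapper exposes predict_proba (the old keras wrapper does, expanding a single sigmoid output to two columns):

# Continuous scores (probability of the positive class), not hard 0/1 labels
y_score = estimator.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=2)
print(auc(fpr, tpr))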
