I have started learning ML.
This is my code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Import the dataset
dataset = pd.read_csv('Salary_Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
# Split the data set into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test =\
train_test_split(X, Y, test_size=1/3, random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Fitting Simple Linear Regression to Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train , Y_train)
# Predicting the Test set Results
y_pred = regressor.predict(X_test)
I am getting the error:
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a
minimum of 1 is required.
for the last line. How to resolve this??
Related
I am trying to design a LSTM model for forecasting price movement.
I have issues regarding the results I obtain for my predictions. I did not normalize my target set y (nor train nor test), only X because it's a classification (-1,0,1) but the predictions I obtain are float.
Maybe I did not normalize the righ sets. My code is below :
Many thanks for you help and feel free to add comments other my other lines of code too I am a beginner.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime as dt
from pandas_datareader import data as pdr
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM
startdate=dt(2018,3,31)
enddate=dt(2022,3,31)
tickers = ['ETH-USD']
Data=pdr.get_data_yahoo(tickers,start=startdate, end=enddate)['Adj Close']
df_change = Data.apply(lambda x: np.log(x) - np.log(x.shift(1)))
df_change.drop(index=df_change.index[0], axis=0, inplace=True)
df_change = df_change*100
pd.options.mode.chained_assignment = None #to not display the error of copy dataframe
df_y = df_change.copy()
df_y.columns = ['ETH-y']
def Target(df,column,df2,column2):
for i in range(len(df)):
if df[column].iloc[i] > 0:
df2[column2][i] = 1 #value is up par rapport au jour d'avant
elif -0.5 < df[column].iloc[i] < 0.5 :
df2[column2][i] = 0 #value is steady
else:
df2[column2][i] = -1 #value is down
Target(df_change,'ETH-USD',df_y,'ETH-y')
print(df_y['ETH-y'].value_counts())
Data.drop(index=Data.index[0], axis=0, inplace=True) #drop first row to have same values
X = Data
y = df_y
## split my train val and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify = y)
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler().fit(X_train)
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)
#reshaping for 3D array
X_train = np.reshape(X_train,(1169,1,1))
X_test = np.reshape(X_test,(293,1,1))
from keras.models import Sequential
from keras.layers import Dense, LSTM
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1]))
model.compile(optimizer='adam', loss='mse')
model.summary()
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.1, verbose=1)
pred = model.predict(X_test)
pred = sc.inverse_transform(pred)
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
Data description:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(0)
# Alternatively X and y can be obtained directly from the frame attribute:
X = df.drop('OUTCOME', axis=1)
y = df['OUTCOME']
numeric_features = ['Age', 'PCT' , 'CURB 65' , 'pO2' ]
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
categorical_features = ['Sex', 'CXR' , 'Hospitalisation in last 3 months' , 'ICU>72hrs', 'Blood']
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
from sklearn import linear_model
from sklearn.svm import SVC
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', SVC())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Accuracy of the model is 1.0
How to enter new patient's parameter to get a prediction for that patient? New patient is not from this dataset.
i tried to enter new patient information as follows
New_patient = [70,1,0,2,2,10,40,2,20,68,1,2,0,1,1]
clf.predict([New_patient])
output: ValueError: Specifying the columns using strings is only supported for pandas DataFrames
I was trying to create roc curve for multiclass using Naive Bayes But it ending with
ValueError: bad input shape.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import BernoulliNB
from scipy import interp
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the other
classifier = BernoulliNB(alpha=1.0, binarize=6, class_prior=None, fit_prior=True)
y_score = classifier.fit(X_train, y_train).predict(X_test)
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (75, 6)
The error because of binarizing the y variable. The estimator can work with string values itself.
Remove the following lines,
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
You are good to go!
To get the predicted probabilities for roc_curve, use the following:
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_score.shape
# (75, 3)
Here's the code below... I dont know what's wrong with my code. plz help. the error is occured in line
clf.fit(X_train, y_train)
import numpy as np
from sklearn import preprocessing, neighbors
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('breast-cancer-wisconsin.txt')
df.replace('?', -99999, inplace=True)
df.drop(['id'], 1, inplace=True)
X = np.array(df.drop(['class'],1))
y = np.array(df['class'])
X_train,y_train,X_test,y_test =
train_test_split(X,y, test_size=0.2)
clf = neighbors.KNeighborsClassifier()
print(X_train.shape)
print(y_train.shape)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print (accuracy)
The problem is in your X_train,y_train,X_test,y_test =
train_test_split(X,y, test_size=0.2) part.
According to scikit-learn documentation in here,the correct order of return value of train_test_split function is:
X_train,
X_test,
y_train,
y_test
Your order in the code is wrong. Let's replace the line you have used ** train_test_split** with this line:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
Hopefully, this will resolve your issue.
I am trying to build a model that given an item, predicts which store it belongs to.
I have a data-set of ~300 records which are supposed to be items in different online stores.
Each record is composed of: Category,Sub Category,Price,Store Identifier(The y variable)
The data seems balanced as every store has around ~10 items.
With the help of #Marcus V. I succeeded encoding the categorical columns correctly. But can not produce better results than 0.52 for a RandomForest with 15 estimators and an entropy criterion.
I feel like much more can be done here. What am I missing?
This is the data: https://pastebin.com/z3eZc0vK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
class Columns(BaseEstimator, TransformerMixin):
def __init__(self, names=None):
self.names = names
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X):
return X.loc[:,self.names]
dataset = pd.read_csv('data.csv', header=None)
dataset.columns = ["cat1", "cat2", "num1", "target"]
# dataset.columns = ["cat1", "cat2", "target"]
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 3]
labelencoder_X_0 = LabelEncoder()
X.iloc[:, 0] = labelencoder_X_0.fit_transform(X.iloc[:, 0])
labelencoder_X_1 = LabelEncoder()
X.iloc[:, 1] = labelencoder_X_1.fit_transform(X.iloc[:, 1])
numeric = ["num1"]
categorical = ["cat1", "cat2"]
pipe = Pipeline([
("features", FeatureUnion([
('numeric', make_pipeline(Columns(names=numeric),StandardScaler())),
('categorical', make_pipeline(Columns(names=categorical), OneHotEncoder(sparse=False)))
])),
])
X = pipe.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = classifier.score(X_test, y_test)
print(accuracy)