I trained two gradient-boosting models on the same data, one with scikit-learn and one with XGBoost.
Scikit-learn model:
GradientBoostingClassifier(
    n_estimators=5,
    learning_rate=0.17,
    max_depth=5,
    verbose=2
)
XGBoost model:
XGBClassifier(
    n_estimators=5,
    learning_rate=0.17,
    max_depth=5,
    verbosity=2,
    eval_metric="logloss"
)
Then I checked inference performance:
XGBoost: 9.7 ms ± 84.6 µs per loop
Scikit-learn: 426 µs ± 12.5 µs per loop
Why is XGBoost so slow?
"Why is xgboost so slow?": XGBClassifier() is the scikit-learn API for XGBoost (see e.g. https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier for more details). If you call the function directly (not through an API) it will be faster. To compare the performance of the two functions it makes sense to call each function directly, instead of calling one function directly and one function through an API. Here is an example:
# benchmark_xgboost_vs_sklearn.py
# Adapted from `xgboost_test.py` by Jacob Schreiber
# (https://gist.github.com/jmschrei/6b447aada61d631544cd)
"""
Benchmarking scripts for XGBoost versus sklearn (time and accuracy)
"""
import time
import random
import numpy as np
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
random.seed(0)
np.random.seed(0)
def make_dataset(n=500, d=10, c=2, z=2):
    """
    Make a dataset with n samples per class, d dimensions and c classes,
    with a distance of z between classes in each dimension, making each
    feature equally informative.
    """
    # Generate our data and our labels
    X = np.concatenate([np.random.randn(n, d) + z*i for i in range(c)])
    y = np.concatenate([np.ones(n) * i for i in range(c)])

    # Generate a random indexing
    idx = np.arange(n*c)
    np.random.shuffle(idx)

    # Randomize the dataset, preserving data-label pairing
    X = X[idx]
    y = y[idx]

    # Return x_train, x_test, y_train, y_test
    return X[::2], X[1::2], y[::2], y[1::2]
def main():
    """
    Run SKLearn, and then run xgboost,
    then xgboost via SKLearn XGBClassifier API wrapper
    """
    # Generate the dataset
    X_train, X_test, y_train, y_test = make_dataset(10, z=100)

    n_estimators = 5
    max_depth = 5
    learning_rate = 0.17

    # sklearn first
    tic = time.time()
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     max_depth=max_depth, learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    print("SKLearn GBClassifier: {}s".format(time.time() - tic))
    print("Acc: {}".format(clf.score(X_test, y_test)))
    print(y_test.sum())
    print(clf.predict(X_test))

    # Convert the data to DMatrix for xgboost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Loop through multiple thread numbers for xgboost
    for threads in 1, 2, 4:
        # xgboost's sklearn interface
        tic = time.time()
        clf = xgb.XGBModel(n_estimators=n_estimators, max_depth=max_depth,
                           learning_rate=learning_rate, nthread=threads)
        clf.fit(X_train, y_train)
        print("SKLearn XGBoost API Time: {}s".format(time.time() - tic))
        preds = np.round(clf.predict(X_test))
        acc = 1. - (np.abs(preds - y_test).sum() / y_test.shape[0])
        print("Acc: {}".format(acc))

        print("{} threads: ".format(threads))
        tic = time.time()
        param = {
            'max_depth': max_depth,
            'eta': 0.1,
            'silent': 1,
            'objective': 'binary:logistic',
            'nthread': threads
        }
        bst = xgb.train(param, dtrain, n_estimators,
                        [(dtest, 'eval'), (dtrain, 'train')])
        print("XGBoost (no wrapper) Time: {}s".format(time.time() - tic))
        preds = np.round(bst.predict(dtest))
        acc = 1. - (np.abs(preds - y_test).sum() / y_test.shape[0])
        print("Acc: {}".format(acc))

if __name__ == '__main__':
    main()
Summarised results:
sklearn.ensemble.GradientBoostingClassifier()
Time: 0.003237009048461914s
Accuracy: 1.0
sklearn xgboost API wrapper XGBClassifier()
Time: 0.3436141014099121s
Accuracy: 1.0
XGBoost (no wrapper) xgb.train()
Time: 0.0028612613677978516s
Accuracy: 1.0
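The results above compare fit times; the timings in the question are for predict(). A similar gap shows up there, largely because the sklearn wrapper converts the input to a DMatrix (and validates it) on every predict() call, whereas the underlying Booster can be handed a pre-built DMatrix. Below is a rough sketch of that comparison (not part of the original benchmark; it assumes a reasonably recent XGBoost where the wrapper exposes get_booster()):
import time
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
X = rng.randn(1000, 10)
y = (X[:, 0] > 0).astype(int)

clf = XGBClassifier(n_estimators=5, learning_rate=0.17, max_depth=5)
clf.fit(X, y)

# Wrapper predict: the input is converted to a DMatrix on every call
tic = time.time()
for _ in range(100):
    clf.predict(X)
print("XGBClassifier.predict: {:.4f}s".format(time.time() - tic))

# Booster predict: reuse a pre-built DMatrix
booster = clf.get_booster()
dtest = xgb.DMatrix(X)
tic = time.time()
for _ in range(100):
    booster.predict(dtest)
print("Booster.predict:       {:.4f}s".format(time.time() - tic))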
Related
I have a small kernel SVM code snippet:
from sklearn import datasets
from sklearn.svm import SVC
import numpy as np
# Load the IRIS dataset for demonstration
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Train-test split
X_train, y_train = X[:140], y[:140]
X_test, y_test = X[140:], y[140:]
print(X.shape, X_train.shape, X_test.shape) # prints (150, 4) (140, 4) (10, 4)
# Fit a rbf kernel SVM
gamma = 0.7
svc = SVC(kernel='rbf', gamma=gamma, C=64, decision_function_shape='ovo')
# svc = SVC(kernel='rbf', gamma=gamma, C=64, probability=True, decision_function_shape='ovo')
# svc = SVC(kernel='rbf', gamma=gamma, C=64)
svc.fit(X_train, y_train)
print(svc.score(X_test, y_test))
# Get prediction for a point X_test using the trained SVM, svc
def get_pred(svc, X_test):

    def RBF(x, z, gamma, axis=None):
        return np.exp(-gamma * np.linalg.norm(x - z, axis=axis) ** 2)

    A = []
    # Loop over all support vectors to calculate K(Xi, X_test),
    # for Xi in the set of support vectors
    for x in svc.support_vectors_:
        # A.append(RBF(x, X_test, svc._gamma))
        A.append(RBF(x, X_test, gamma))
    A = np.array(A)

    return np.sum(svc._dual_coef_ * A) + svc.intercept_

for i in range(X_test.shape[0]):
    print(get_pred(svc, X_test[i]))
    print(svc.decision_function([X_test[i]]))  # The outputs should be the same
I want to understand the role of the dual_coef_ attribute in the SVM, so I implemented the prediction function get_pred myself, following the mathematical expression of the SVM given here. But the output of my implementation is different from the output of the decision function that comes with the SVM:
(150, 4) (140, 4) (10, 4)
1.0
[-4.24105215 -4.38979215 -3.52427244]
[[-0.42115154 -1.06817962 -2.36560357]]
[-2.34091311 -2.48965311 -1.6241334 ]
[[-0.61615543 -0.86736268 -0.47127757]]
[-4.34859785 -4.49733785 -3.63181814]
[[-0.86662754 -1.14637099 -1.94948189]]
[-4.14797518 -4.29671518 -3.43119547]
[[-0.32438219 -1.12869709 -2.30877848]]
[-3.80505008 -3.95379007 -3.08827037]
[[-0.3341635 -1.03315401 -2.05161515]]
[-3.83632958 -3.98506957 -3.11954987]
[[-0.62920059 -0.97474828 -1.84626328]]
[-3.94804683 -4.09678683 -3.23126712]
[[-0.90348467 -1.04135143 -1.61709331]]
[-4.24990319 -4.39864319 -3.53312348]
[[-0.83485694 -1.07466796 -1.95426087]]
[-3.39840443 -3.54714443 -2.68162472]
[[-0.52530703 -0.9980642 -1.48891578]]
[-3.03105705 -3.17979705 -2.31427734]
[[-0.93796146 -1.09834078 -0.60863738]]
How can I understand this dual_coef_ attribute, or, put another way, how can I implement the prediction function of a kernel SVM myself?
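One hedged observation on why the numbers above do not line up: for a binary SVC, dual_coef_ holds the products of each support vector's dual coefficient and its class label, and the decision function is sum_i dual_coef_[0, i] * K(sv_i, x) + intercept_[0]. With the three iris classes and one-vs-one classifiers, however, dual_coef_ has shape (n_classes - 1, n_SV) and stacks coefficients for the pairwise classifiers, so summing all of it against the kernel vector does not correspond to any single one of the three decision values returned. A minimal sketch of the binary case, where the reconstruction is direct (restricted to two iris classes; names chosen here for illustration):
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
mask = iris.target < 2                  # keep classes 0 and 1 only
X, y = iris.data[mask], iris.target[mask]

gamma = 0.7
svc = SVC(kernel='rbf', gamma=gamma, C=64).fit(X, y)

def rbf(x, z, gamma):
    return np.exp(-gamma * np.linalg.norm(x - z) ** 2)

def decision(svc, x):
    # K(sv_i, x) for every support vector, weighted by dual_coef_, plus the bias
    k = np.array([rbf(sv, x, gamma) for sv in svc.support_vectors_])
    return np.dot(svc.dual_coef_[0], k) + svc.intercept_[0]

for x in X[:5]:
    print(decision(svc, x), svc.decision_function([x])[0])  # the two values match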
Below is the code of what I'm trying to do, but my accuracy is always under 50%, so I'm wondering how I should fix this. What I'm trying to do is use the first 1885 days of daily unit sale data as input and the rest of the daily unit sale data (from day 1885 onward) as output. After training on these data, I need to use the model to predict 20 more days of daily unit sales into the future.
The data I used here is provided in this link
https://drive.google.com/file/d/13qzIZMD6Wz7e1GpOsNw1_9Yq-4PI2HrC/view?usp=sharing
import pandas as pd
import numpy as np
import keras
import keras.backend as k
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_csv('sales_train.csv')
# Since there are 3 departments and 10 stores from 3 different areas, I categorized the data into 30 groups and numerized them
Unique_dept = data["dept_id"].unique()
Unique_state = data['state_id'].unique()
Unique_store = data["store_id"].unique()
data0 = data.copy()
for i in range(3):
    data0["dept_id"] = data0["dept_id"].replace(to_replace=Unique_dept[i], value=i)
    data0["state_id"] = data0["state_id"].replace(to_replace=Unique_state[i], value=i)
for j in range(10):
    data0["store_id"] = data0["store_id"].replace(to_replace=Unique_store[j], value=int(Unique_store[j][3]) - 1)
# Select the three numerized categorical variables and daily unit sale data
pt = 6 + 1885
X = pd.concat([data0.iloc[:,2],data0.iloc[:, 4:pt]], axis = 1)
Y = data0.iloc[:, pt:]
# Remove the daily unit sale data that are highly correlated to each other (corr > 0.9)
correlation = X.corr(method = 'pearson')
corr_lst = []
for i in correlation:
    for j in correlation:
        if (i != j) & (correlation[i][j] >= 0.9) & (j not in corr_lst) & (i not in corr_lst):
            corr_lst.append(i)
x = X.drop(corr_lst, axis = 1)
x_value = x.values
y_value = Y.values
sc = StandardScaler()
X_scale = sc.fit_transform(x_value)
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(x_value, y_value, test_size=0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
print(X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape)
#create model
model = Sequential()
#get number of columns in training data
n_cols = X_train.shape[1]
#add model layers
model.add(Dense(32, activation='softmax', input_shape=(n_cols,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='softmax'))
model.add(Dense(1))
#compile model using rmsse as a measure of model performance
model.compile(optimizer='Adagrad', loss= "mean_absolute_error", metrics = ['accuracy'])
# set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=20)
#train model
model.fit(X_train, Y_train,batch_size=32, epochs=10, validation_data=(X_val, Y_val))
Here is what I got
The plots are also pretty strange:
[Accuracy plot]
[Loss plot]
Two mistakes:
Accuracy is meaningless in regression settings such as yours here (it is meaningful only for classification ones); see What function defines accuracy in Keras when the loss is mean squared error (MSE)? (the argument is identical when MAE loss is used, as here). Your performance measure here is the same as your loss (i.e. MAE).
We never use softmax activations in anything but the final layer of a classification model; replace both softmax activations used in your model with relu (keep the last layer as is, since no activation means linear, which is indeed the correct choice for regression). A sketch of the corrected model follows.
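For concreteness, here is a minimal sketch of the model definition with both fixes applied (same layer sizes, loss, and optimizer as in the question; n_cols is the number of input columns, as defined earlier in the script):
from keras.models import Sequential
from keras.layers import Dense

n_cols = X_train.shape[1]   # number of input features, as in the original script

model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(n_cols,)))   # relu instead of softmax
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='relu'))                          # relu instead of softmax
model.add(Dense(1))                                              # no activation = linear output
# no accuracy metric: the loss (MAE) is the performance measure
model.compile(optimizer='Adagrad', loss='mean_absolute_error')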
With very few epochs this model learns to classify between 1 and 0 extremely quickly, which leads me to suspect something is wrong.
The code below downloads the MNIST dataset and extracts only the images that contain a 1 or a 0. A random sample of size 200 is selected from this subset, and that sample is the dataset the model is trained on. With just 2 epochs the model achieves 90%+ test set accuracy; is this expected behaviour? I expected many more epochs would be required to train the model to this level of test set accuracy.
Model code:
%reset -f
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torch.utils.data as data_utils
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from matplotlib import pyplot
from pandas import DataFrame
import torchvision.datasets as dset
import os
import torch.nn.functional as F
import time
import random
import pickle
from sklearn.metrics import confusion_matrix
import pandas as pd
import sklearn
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
root = './data'
if not os.path.exists(root):
os.mkdir(root)
train_set = dset.MNIST(root=root, train=True, transform=trans, download=True)
test_set = dset.MNIST(root=root, train=False, transform=trans, download=True)
batch_size = 64
train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=batch_size,
    shuffle=True)
test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=batch_size,
    shuffle=True)
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(28*28, 500)
        self.fc2 = nn.Linear(500, 256)
        self.fc3 = nn.Linear(256, 2)

    def forward(self, x):
        x = x.view(-1, 28*28)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
num_epochs = 2
random_sample_size = 200
values_0_or_1 = [t for t in train_set if (int(t[1]) == 0 or int(t[1]) == 1)]
values_0_or_1_testset = [t for t in test_set if (int(t[1]) == 0 or int(t[1]) == 1)]
print(len(values_0_or_1))
print(len(values_0_or_1_testset))
train_loader_subset = torch.utils.data.DataLoader(
    dataset=values_0_or_1,
    batch_size=batch_size,
    shuffle=True)
test_loader_subset = torch.utils.data.DataLoader(
    dataset=values_0_or_1_testset,
    batch_size=batch_size,
    shuffle=False)
train_loader = train_loader_subset
# Hyper-parameters
input_size = 100
hidden_size = 100
num_classes = 2
# learning_rate = 0.00001
learning_rate = .0001
# Device configuration
device = 'cpu'
print_progress_every_n_epochs = 1
model = NeuralNet().to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
N = len(train_loader)
# Train the model
total_step = len(train_loader)
most_recent_prediction = []
test_actual_predicted_dict = {}
rm = random.sample(list(values_0_or_1), random_sample_size)
train_loader_subset = data_utils.DataLoader(rm, batch_size=4)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader_subset):
        # Move tensors to the configured device
        images = images.reshape(-1, 2).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch) % print_progress_every_n_epochs == 0:
        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, total_step, loss.item()))

predicted_test = []
model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
probs_l = []
predicted_values = []
actual_values = []
labels_l = []

with torch.no_grad():
    for images, labels in test_loader_subset:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        predicted_test.append(predicted.cpu().numpy())
        sm = torch.nn.Softmax()
        probabilities = sm(outputs)
        probs_l.append(probabilities)
        labels_l.append(labels.cpu().numpy())

predicted_values.append(np.concatenate(predicted_test).ravel())
actual_values.append(np.concatenate(labels_l).ravel())
if (epoch) % 1 == 0:
    print('test accuracy : ', 100 * len((np.where(np.array(predicted_values[0])==(np.array(actual_values[0])))[0])) / len(actual_values[0]))
Output of the model (12665 & 2115 are the training and test set sizes):
12665
2115
Epoch [1/2], Step [50/198], Loss: 0.1256
Epoch [2/2], Step [50/198], Loss: 0.0151
test accuracy : 99.76359338061465
/anaconda3/envs/pytorch/lib/python3.7/site-packages/ipykernel_launcher.py:143: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Here's my 2 cents on your binary experiment.
It would seem that you have severely reduced the complexity of your dataset, and with the high number of neurons in your intermediate layers your model is expected to converge very quickly.
Note also that the MNIST dataset has only a single channel, which makes the task quite simple.
You may try to play with CIFAR10 and see whether you still get high accuracy in just 2 epochs.
That's not a particularly well-posed question, because what is expected is entirely subjective. That said, I am not surprised, because 0 and 1 are very different digits. For instance, 0 has background surrounded by foreground, whereas 1 does not; that's an almost infallible test for distinguishing the two. As a sanity check, I would swap out 0 for 7, which is similar to 1, and I would expect to see a significantly lower success rate. Even if the check passes, though, there may still be bugs or errors in your method.
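For reference, a minimal sketch of that 1-vs-7 sanity check under the question's setup (hypothetical variable names; the labels are remapped to 0/1 because the network's final layer has two outputs and CrossEntropyLoss expects class indices starting at 0):
# Swap the "easy" 0-vs-1 task for a harder 1-vs-7 task and re-run the training above.
values_1_or_7 = [(img, 0 if int(lbl) == 1 else 1)          # relabel: 1 -> 0, 7 -> 1
                 for img, lbl in train_set
                 if int(lbl) in (1, 7)]
values_1_or_7_testset = [(img, 0 if int(lbl) == 1 else 1)
                         for img, lbl in test_set
                         if int(lbl) in (1, 7)]

rm = random.sample(values_1_or_7, random_sample_size)
train_loader_subset = data_utils.DataLoader(rm, batch_size=4)
test_loader_subset = torch.utils.data.DataLoader(values_1_or_7_testset,
                                                 batch_size=batch_size,
                                                 shuffle=False)
# ...then repeat the training/evaluation loop from the question and compare accuracies.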
I have a scheduler running on my PC and I want to train 10 instances of an SVC on different worker computers. I fiddled around but could not find a solution.
I am assuming that you want to train those 10 SVCs with different hyperparameters and find the best one (i.e. hyperparameter optimization, which you can do with GridSearchCV). I am also assuming that you are using scikit-learn.
Usually you would train the SVC using code like this:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Loading the Digits dataset
digits = datasets.load_digits()
# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
but it would only train sequentially on one thread.
If you install dask-searchcv (part of the Dask-ML ecosystem), you can leverage a drop-in replacement for grid search:
conda install dask-searchcv -c conda-forge
Replacing
from sklearn.model_selection import GridSearchCV
by
from dask_searchcv import GridSearchCV
should be sufficient.
However, in your case you don't want to use the threaded scheduler but the distributed scheduler. Hence, you have to add the following code at the beginning:
# Distribute grid-search across a cluster
from dask.distributed import Client
scheduler_address = '127.0.0.1:8786'
client = Client(scheduler_address)
The final code should look like this (not tested):
from sklearn import datasets
from sklearn.model_selection import train_test_split
from dask_searchcv import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Distribute grid-search across a cluster
from dask.distributed import Client
scheduler_address = '127.0.0.1:8786'
client = Client(scheduler_address)
# Loading the Digits dataset
digits = datasets.load_digits()
# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
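As an aside, a hedged alternative sketch that keeps scikit-learn's own GridSearchCV and routes its internal parallelism to the Dask workers through joblib (this assumes a Dask scheduler is already running at 127.0.0.1:8786, as above, and a joblib/scikit-learn version recent enough to expose the 'dask' parallel backend):
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from dask.distributed import Client
from joblib import parallel_backend

# Connect to the running scheduler; the 'dask' joblib backend uses this client
client = Client('127.0.0.1:8786')

digits = datasets.load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target

tuned_parameters = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, n_jobs=-1)

# Every cross-validation fit is dispatched to the Dask workers
with parallel_backend('dask'):
    clf.fit(X, y)

print(clf.best_params_)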
I have finished a PyTorch MLP model for the MNIST dataset, but I got two different results: 0.90+ accuracy when using the MNIST dataset from PyTorch, but ~0.10 accuracy when using the MNIST dataset from Keras.
Below is my code, with dependencies: PyTorch 0.3.0.post4, Keras 2.1.3, TensorFlow backend 1.4.1 (GPU version).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import torch as pt
import torchvision as ptv
from keras.datasets import mnist
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
# training data from PyTorch
train_set = ptv.datasets.MNIST("./data/mnist/train", train=True, transform=ptv.transforms.ToTensor(), download=True)
test_set = ptv.datasets.MNIST("./data/mnist/test", train=False, transform=ptv.transforms.ToTensor(), download=True)
train_dataset = DataLoader(train_set, batch_size=100, shuffle=True)
test_dataset = DataLoader(test_set, batch_size=10000, shuffle=True)
class MLP(pt.nn.Module):
    """The Multi-layer perceptron"""
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = pt.nn.Linear(784, 512)
        self.fc2 = pt.nn.Linear(512, 128)
        self.fc3 = pt.nn.Linear(128, 10)
        self.use_gpu = True

    def forward(self, din):
        din = din.view(-1, 28 * 28)
        dout = F.relu(self.fc1(din))
        dout = F.relu(self.fc2(dout))
        # return F.softmax(self.fc3(dout))
        return self.fc3(dout)
model = MLP().cuda()
print(model)
# loss func and optim
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()
def evaluate_acc(pred, label):
    pred = pred.cpu().data.numpy()
    label = label.cpu().data.numpy()
    test_np = (np.argmax(pred, 1) == label)
    test_np = np.float32(test_np)
    return np.mean(test_np)

def evaluate_loader(loader):
    print("evaluating ...")
    accuracy_list = []
    for i, (inputs, labels) in enumerate(loader):
        inputs = pt.autograd.Variable(inputs).cuda()
        labels = pt.autograd.Variable(labels).cuda()
        outputs = model(inputs)
        accuracy_list.append(evaluate_acc(outputs, labels))
    print(sum(accuracy_list) / len(accuracy_list))

def training(d, epochs):
    for x in range(epochs):
        for i, data in enumerate(d):
            optimizer.zero_grad()
            (inputs, labels) = data
            inputs = pt.autograd.Variable(inputs).cuda()
            labels = pt.autograd.Variable(labels).cuda()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            if i % 200 == 0:
                print(i, ":", evaluate_acc(outputs, labels))
# Training MLP for 4 epochs with MNIST dataset from PyTorch
training(train_dataset, 4)
# The accuracy is ~0.96.
evaluate_loader(test_dataset)
print("###########################################################")
def load_mnist():
    (x, y), (x_test, y_test) = mnist.load_data()
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    print("x.shape", x.shape, "y.shape", y.shape,
          "\nx_test.shape", x_test.shape, "y_test.shape", y_test.shape,
          )
    return x, y, x_test, y_test

class TMPDataset(Dataset):
    """Dataset for loading the Keras MNIST data."""
    def __init__(self, a, b):
        self.x = a
        self.y = b

    def __getitem__(self, item):
        return self.x[item], self.y[item]

    def __len__(self):
        return len(self.y)
x_train, y_train, x_test, y_test = load_mnist()
# Create dataloader for MNIST dataset from Keras.
test_loader = DataLoader(TMPDataset(x_test, y_test), num_workers=1, batch_size=10000)
train_loader = DataLoader(TMPDataset(x_train, y_train), shuffle=True, batch_size=100)
# Evaluate the performance of the MLP trained on the PyTorch dataset; the accuracy is ~0.96.
evaluate_loader(test_loader)
evaluate_loader(train_loader)
model = MLP().cuda()
print(model)
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()
# Train now on MNIST dataset from Keras.
training(train_loader, 4)
# Evaluate the trained model on the MNIST dataset from Keras; the resulting accuracy is ~0.10...
evaluate_loader(test_loader)
evaluate_loader(train_loader)
I checked some samples from the Keras MNIST dataset and found no error. I am wondering what is wrong with the datasets.
The code runs without error; run it to see the results.
The MNIST data coming from Keras are not normalized; following the Keras MNIST MLP example, you should do this manually, i.e. you should include the following in your load_mnist() function:
x /= 255
x_test /= 255
Not sure about PyTorch, but it would seem that the MNIST data from its own utility functions come already normalized (as is the case with TensorFlow - see the third point in my answer here).
A 10% accuracy (i.e. equivalent to random guessing) with un-normalized input data is perfectly consistent with this.
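A minimal sketch of the load_mnist() function with that change applied (identical to the original apart from the two scaling lines; the shape printout is omitted for brevity):
from keras.datasets import mnist
import numpy as np

def load_mnist():
    (x, y), (x_test, y_test) = mnist.load_data()
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    # Scale pixel values from [0, 255] to [0, 1], matching torchvision's ToTensor()
    x /= 255
    x_test /= 255
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    return x, y, x_test, y_test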