How to understand the _dual_coef_ parameter in sklearn's kernel svm? - machine-learning

I have a small piece of kernel SVM code:
from sklearn import datasets
from sklearn.svm import SVC
import numpy as np

# Load the IRIS dataset for demonstration
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Train-test split
X_train, y_train = X[:140], y[:140]
X_test, y_test = X[140:], y[140:]
print(X.shape, X_train.shape, X_test.shape)  # prints (150, 4) (140, 4) (10, 4)

# Fit an RBF kernel SVM
gamma = 0.7
svc = SVC(kernel='rbf', gamma=gamma, C=64, decision_function_shape='ovo')
# svc = SVC(kernel='rbf', gamma=gamma, C=64, probability=True, decision_function_shape='ovo')
# svc = SVC(kernel='rbf', gamma=gamma, C=64)
svc.fit(X_train, y_train)
print(svc.score(X_test, y_test))

# Get the prediction for a point X_test using the trained SVM, svc
def get_pred(svc, X_test):
    def RBF(x, z, gamma, axis=None):
        return np.exp(-gamma * np.linalg.norm(x - z, axis=axis) ** 2)

    A = []
    # Loop over all support vectors to calculate K(Xi, X_test),
    # where Xi belongs to the set of support vectors
    for x in svc.support_vectors_:
        # A.append(RBF(x, X_test, svc._gamma))
        A.append(RBF(x, X_test, gamma))
    A = np.array(A)
    return np.sum(svc._dual_coef_ * A) + svc.intercept_

for i in range(X_test.shape[0]):
    print(get_pred(svc, X_test[i]))
    print(svc.decision_function([X_test[i]]))  # The outputs should be the same
I want to understand the role of the dual_coef_ parameter in the SVM, so I implemented the SVM prediction function get_pred myself, following the mathematical expression of the SVM decision function given here.
However, the output of my function differs from the decision_function that comes with the SVM:
(150, 4) (140, 4) (10, 4)
1.0
[-4.24105215 -4.38979215 -3.52427244]
[[-0.42115154 -1.06817962 -2.36560357]]
[-2.34091311 -2.48965311 -1.6241334 ]
[[-0.61615543 -0.86736268 -0.47127757]]
[-4.34859785 -4.49733785 -3.63181814]
[[-0.86662754 -1.14637099 -1.94948189]]
[-4.14797518 -4.29671518 -3.43119547]
[[-0.32438219 -1.12869709 -2.30877848]]
[-3.80505008 -3.95379007 -3.08827037]
[[-0.3341635 -1.03315401 -2.05161515]]
[-3.83632958 -3.98506957 -3.11954987]
[[-0.62920059 -0.97474828 -1.84626328]]
[-3.94804683 -4.09678683 -3.23126712]
[[-0.90348467 -1.04135143 -1.61709331]]
[-4.24990319 -4.39864319 -3.53312348]
[[-0.83485694 -1.07466796 -1.95426087]]
[-3.39840443 -3.54714443 -2.68162472]
[[-0.52530703 -0.9980642 -1.48891578]]
[-3.03105705 -3.17979705 -2.31427734]
[[-0.93796146 -1.09834078 -0.60863738]]
How can I understand the dual_coef_ parameter, or, put another way, how can I implement the prediction function of the kernel SVM myself?
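For reference, below is a minimal sketch (my own, not from the original post) of how the decision function of a binary RBF SVC can be reconstructed from dual_coef_, support_vectors_ and intercept_; dual_coef_ holds the products y_i * alpha_i of the support vectors. For the three-class 'ovo' model in the question, dual_coef_ has shape (n_classes - 1, n_SV) and has to be sliced per class pair (using n_support_ and support_), which is likely why the sums above do not match decision_function.

# Minimal sketch: reconstructing decision_function for a *binary* RBF SVC.
# Assumes scikit-learn and the iris data restricted to two classes.
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
mask = iris.target < 2                      # keep classes 0 and 1 only
X_bin, y_bin = iris.data[mask], iris.target[mask]

gamma = 0.7
svc_bin = SVC(kernel='rbf', gamma=gamma, C=64).fit(X_bin, y_bin)

def decision_binary(svc, x, gamma):
    # K(sv_i, x) for every support vector sv_i
    K = np.exp(-gamma * np.linalg.norm(svc.support_vectors_ - x, axis=1) ** 2)
    # f(x) = sum_i dual_coef_[0, i] * K(sv_i, x) + b
    return np.dot(svc.dual_coef_[0], K) + svc.intercept_[0]

print(decision_binary(svc_bin, X_bin[0], gamma))   # manual reconstruction
print(svc_bin.decision_function([X_bin[0]])[0])    # should match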

Related

Why is my pytorch classification model not learning?

I have created a simple PyTorch classification model with a sample dataset generated using sklearn's make_classification. Even after training for thousands of epochs, the accuracy of the model hovers between 30 and 40 percent. During training, the loss value fluctuates far and wide. I am wondering why this model is not learning, and whether it is due to some logical error in the code.
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_features=15, n_classes=5, n_informative=4)
DEVICE = torch.device('cuda')
epochs = 5000

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return (X, y)

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(15, 10)
        self.l2 = nn.Linear(10, 5)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = self.l1(x)
        x = self.relu(x)
        x = self.l2(x)
        x = self.relu(x)
        return x

model = Model().double().to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.CrossEntropyLoss()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
train_data = CustomDataset(X_train, y_train)
test_data = CustomDataset(X_test, y_test)
trainloader = DataLoader(train_data, batch_size=32, shuffle=True)
testloader = DataLoader(test_data, batch_size=32, shuffle=True)

for i in range(epochs):
    for (x, y) in trainloader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        output = model(x)
        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()
    if i % 200 == 0:
        print("epoch: ", i, " Loss: ", loss.item())

correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for x, y in testloader:
        # calculate outputs by running x through the network
        outputs = model(x.to(DEVICE)).to(DEVICE)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += y.size(0)
        correct += (predicted == y.to(DEVICE)).sum().item()

print(f'Accuracy of the network on the test data: {100 * correct // total} %')
EDIT
I tried to overfit my model with only 10 samples (batch_size=5), X,y = make_classification(n_samples=10,n_features=15,n_classes=5,n_informative=4), but the accuracy decreased to 15-20%. I then normalized the input data to values between 0 and 1, which pushed the accuracy a bit higher, but not over 50 percent. Any idea why this might be happening?
You should not be using a ReLU activation on your output layer. For multi-class classification, a softmax activation is usually used on the final layer, or the logits are fed to the loss function directly without explicitly adding a softmax layer (nn.CrossEntropyLoss expects raw logits).
Try removing the ReLU activation from the final layer.
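A sketch of the suggested change (assuming the same Model class as in the question): keep the ReLU on the hidden layer and return raw logits from the last layer, letting nn.CrossEntropyLoss apply log-softmax internally.

import torch.nn as nn

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(15, 10)
        self.l2 = nn.Linear(10, 5)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.l1(x))   # hidden layer keeps its ReLU
        return self.l2(x)           # no activation here: raw logits for CrossEntropyLoss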

Improve Accuracy in neural network with Keras

Below is the code of what I'm trying to do, but my accuracy is always under 50%, so I'm wondering how I should fix this. What I'm trying to do is use the first 1885 days of daily unit sale data as input and the rest of the daily unit sale data (from day 1885 onward) as output. After training on these data, I need to use the model to predict 20 more days of daily unit sales in the future.
The data I used here is provided in this link
https://drive.google.com/file/d/13qzIZMD6Wz7e1GpOsNw1_9Yq-4PI2HrC/view?usp=sharing
import pandas as pd
import numpy as np
import keras
import keras.backend as k
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv('sales_train.csv')

# Since there are 3 departments and 10 stores from 3 different areas,
# I categorized the data into 30 groups and numerized them
Unique_dept = data["dept_id"].unique()
Unique_state = data['state_id'].unique()
Unique_store = data["store_id"].unique()
data0 = data.copy()
for i in range(3):
    data0["dept_id"] = data0["dept_id"].replace(to_replace=Unique_dept[i], value=i)
    data0["state_id"] = data0["state_id"].replace(to_replace=Unique_state[i], value=i)
for j in range(10):
    data0["store_id"] = data0["store_id"].replace(to_replace=Unique_store[j], value=int(Unique_store[j][3]) - 1)

# Select the three numerized categorical variables and daily unit sale data
pt = 6 + 1885
X = pd.concat([data0.iloc[:, 2], data0.iloc[:, 4:pt]], axis=1)
Y = data0.iloc[:, pt:]

# Remove the daily unit sale data that are highly correlated to each other (corr > 0.9)
correlation = X.corr(method='pearson')
corr_lst = []
for i in correlation:
    for j in correlation:
        if (i != j) & (correlation[i][j] >= 0.9) & (j not in corr_lst) & (i not in corr_lst):
            corr_lst.append(i)
x = X.drop(corr_lst, axis=1)

x_value = x.values
y_value = Y.values
sc = StandardScaler()
X_scale = sc.fit_transform(x_value)
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(x_value, y_value, test_size=0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val_and_test, Y_val_and_test, test_size=0.5)
print(X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape)

# create model
model = Sequential()
# get number of columns in training data
n_cols = X_train.shape[1]
# add model layers
model.add(Dense(32, activation='softmax', input_shape=(n_cols,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(32, activation='softmax'))
model.add(Dense(1))
# compile model using rmsse as a measure of model performance
model.compile(optimizer='Adagrad', loss="mean_absolute_error", metrics=['accuracy'])
# set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=20)
# train model
model.fit(X_train, Y_train, batch_size=32, epochs=10, validation_data=(X_val, Y_val))
Here is what I got (training output screenshot omitted). The accuracy and loss plots are also pretty strange (plots omitted).
Two mistakes:
Accuracy is meaningless in regression settings such as yours here (it is meaningful only for classification); see What function defines accuracy in Keras when the loss is mean squared error (MSE)? (the argument is identical when an MAE loss is used, as here). Your performance measure here is the same as your loss (i.e. MAE).
We never use softmax activations in anything but the final layer of a classification model; replace both softmax activations used in your model with relu (keep the last layer as is, since no activation means linear, which is indeed the correct choice for regression).
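A sketch of how the model definition might look with those two changes (assuming the same layer sizes, Adagrad optimizer and MAE loss as in the question): ReLU in the hidden layers, a linear output, and no accuracy metric, since this is a regression problem.

from keras.models import Sequential
from keras.layers import Dense

def build_model(n_cols):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))  # linear output for regression
    model.compile(optimizer='Adagrad', loss='mean_absolute_error')
    return model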

label_binarize Does not fit for sklearn Naive Bayes classifier showing bad input shape

I was trying to create a ROC curve for a multiclass problem using Naive Bayes, but it ends with
ValueError: bad input shape.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import BernoulliNB
from scipy import interp
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the other
classifier = BernoulliNB(alpha=1.0, binarize=6, class_prior=None, fit_prior=True)
y_score = classifier.fit(X_train, y_train).predict(X_test)
raise ValueError("bad input shape {0}".format(shape))
ValueError: bad input shape (75, 6)
The error is caused by binarizing the y variable. The estimator can work with the original class labels (even string values) directly.
Remove the following lines,
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
and you are good to go!
To get the predicted probabilities for roc_curve, use the following:
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)
y_score.shape
# (75, 3)
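A short sketch (my addition, not part of the original answer) of how the per-class ROC curves can then be computed from those probabilities, binarizing only the test labels:

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

y_test_bin = label_binarize(y_test, classes=[0, 1, 2])   # shape (75, 3)
fpr, tpr, roc_auc = {}, {}, {}
for i in range(y_test_bin.shape[1]):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
print(roc_auc)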

How can I distribute a SVC to different workers(on other computers) using dask

I have a scheduler running on my PC and I want to train 10 instances of an SVC on different worker computers. I fiddled around but could not find a solution.
I am assuming that you want to train those 10 SVCs with different hyperparameters and find the best one (i.e. hyperparameter optimization, which you can do using GridSearchCV). I am also assuming that you are using scikit-learn.
Usually you would train the SVC with code like this:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
but it would only train sequentially, on one thread.
If you install Dask-ML, you can leverage a drop-in replacement for grid search:
conda install dask-searchcv -c conda-forge
Replacing
from sklearn.model_selection import GridSearchCV
with
from dask_searchcv import GridSearchCV
should be sufficient.
However, in your case you don't want to use the threaded scheduler but the distributed scheduler. Hence, you have to add the following code at the beginning:
# Distribute grid-search across a cluster
from dask.distributed import Client
scheduler_address = '127.0.0.1:8786'
client = Client(scheduler_address)
The final code should look like this (not tested)
from sklearn import datasets
from sklearn.model_selection import train_test_split
from dask_searchcv import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Distribute grid-search across a cluster
from dask.distributed import Client
scheduler_address = '127.0.0.1:8786'
client = Client(scheduler_address)

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

Problems with PyTorch MLP when training the MNIST dataset retrieved from Keras

I have finished a PyTorch MLP model for the MNIST dataset, but I got two different results: 0.90+ accuracy when using the MNIST dataset from PyTorch, but ~0.10 accuracy when using the MNIST dataset from Keras.
Below is my code, with these dependencies: PyTorch 0.3.0.post4, Keras 2.1.3, TensorFlow backend 1.4.1 (GPU version).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import torch as pt
import torchvision as ptv
from keras.datasets import mnist
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

# training data from PyTorch
train_set = ptv.datasets.MNIST("./data/mnist/train", train=True, transform=ptv.transforms.ToTensor(), download=True)
test_set = ptv.datasets.MNIST("./data/mnist/test", train=False, transform=ptv.transforms.ToTensor(), download=True)
train_dataset = DataLoader(train_set, batch_size=100, shuffle=True)
test_dataset = DataLoader(test_set, batch_size=10000, shuffle=True)

class MLP(pt.nn.Module):
    """The multi-layer perceptron."""
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = pt.nn.Linear(784, 512)
        self.fc2 = pt.nn.Linear(512, 128)
        self.fc3 = pt.nn.Linear(128, 10)
        self.use_gpu = True

    def forward(self, din):
        din = din.view(-1, 28 * 28)
        dout = F.relu(self.fc1(din))
        dout = F.relu(self.fc2(dout))
        # return F.softmax(self.fc3(dout))
        return self.fc3(dout)

model = MLP().cuda()
print(model)

# loss func and optim
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()

def evaluate_acc(pred, label):
    pred = pred.cpu().data.numpy()
    label = label.cpu().data.numpy()
    test_np = (np.argmax(pred, 1) == label)
    test_np = np.float32(test_np)
    return np.mean(test_np)

def evaluate_loader(loader):
    print("evaluating ...")
    accuracy_list = []
    for i, (inputs, labels) in enumerate(loader):
        inputs = pt.autograd.Variable(inputs).cuda()
        labels = pt.autograd.Variable(labels).cuda()
        outputs = model(inputs)
        accuracy_list.append(evaluate_acc(outputs, labels))
    print(sum(accuracy_list) / len(accuracy_list))

def training(d, epochs):
    for x in range(epochs):
        for i, data in enumerate(d):
            optimizer.zero_grad()
            (inputs, labels) = data
            inputs = pt.autograd.Variable(inputs).cuda()
            labels = pt.autograd.Variable(labels).cuda()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            if i % 200 == 0:
                print(i, ":", evaluate_acc(outputs, labels))

# Train the MLP for 4 epochs with the MNIST dataset from PyTorch
training(train_dataset, 4)

# The accuracy is ~0.96.
evaluate_loader(test_dataset)

print("###########################################################")

def load_mnist():
    (x, y), (x_test, y_test) = mnist.load_data()
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    print("x.shape", x.shape, "y.shape", y.shape,
          "\nx_test.shape", x_test.shape, "y_test.shape", y_test.shape,
          )
    return x, y, x_test, y_test

class TMPDataset(Dataset):
    """Dataset for loading the Keras MNIST data."""
    def __init__(self, a, b):
        self.x = a
        self.y = b
    def __getitem__(self, item):
        return self.x[item], self.y[item]
    def __len__(self):
        return len(self.y)

x_train, y_train, x_test, y_test = load_mnist()

# Create dataloaders for the MNIST dataset from Keras.
test_loader = DataLoader(TMPDataset(x_test, y_test), num_workers=1, batch_size=10000)
train_loader = DataLoader(TMPDataset(x_train, y_train), shuffle=True, batch_size=100)

# Evaluate the performance of the MLP trained on the PyTorch dataset; the accuracy is ~0.96.
evaluate_loader(test_loader)
evaluate_loader(train_loader)

model = MLP().cuda()
print(model)
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()

# Train now on the MNIST dataset from Keras.
training(train_loader, 4)

# Evaluate the trained model on the MNIST dataset from Keras; the accuracy is ~0.10...
evaluate_loader(test_loader)
evaluate_loader(train_loader)
I have checked some samples from the Keras MNIST dataset and found no errors.
I am wondering what is wrong with the datasets?
The code runs without errors; run it to see the results.
The MNIST data coming from Keras are not normalized; following the Keras MNIST MLP example, you should do this manually, i.e. you should include the following in your load_mnist() function:
x /= 255
x_test /= 255
Not sure about PyTorch, but it would seem that the MNIST data from its own utility functions come already normalized (as is the case with TensorFlow - see the third point in my answer here).
A 10% accuracy (i.e. equivalent to random guessing) in the case of non-normalized input data is perfectly consistent with that.
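For concreteness, a sketch of the load_mnist() helper from the question with that normalization added (scaling the Keras pixel values from [0, 255] to [0, 1], matching torchvision's ToTensor() range):

import numpy as np
from keras.datasets import mnist

def load_mnist():
    (x, y), (x_test, y_test) = mnist.load_data()
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    x /= 255        # normalize to [0, 1], like torchvision's ToTensor()
    x_test /= 255
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    return x, y, x_test, y_test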
