Categorical attributes to Sparse Matrix - machine-learning

First of all, I'm new to machine learning.
I am trying to predict the price of second-hand cars. These cars have makes and models, so I used a MultiLabelBinarizer to build a sparse matrix to handle the categorical attributes. Here's the code:
from sklearn.preprocessing import MultiLabelBinarizer
encoder = MultiLabelBinarizer()
make_cat_1hot = encoder.fit_transform(make_cat)
model_cat_1hot = encoder.fit_transform(model_cat)
type_cat_1hot = encoder.fit_transform(type_cat)
print(type(make_cat_1hot))
carInfoModHot = carsInfoMod.copy()
carInfoModHot["makeHot"] = make_cat_1hot.tolist()
carInfoModHot["modelHot"] = model_cat_1hot.tolist()
carInfoModHot["typeHot"] = type_cat_1hot.tolist()
doors km make year makeHot modelHot
5.0 78779 Mercedes 2012 [0, 0, 0, 0, 1, 0, 0, 0, ...[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
5.0 25463 Bmw 2015 [0, 1, 0, 0, 0, 0, 0, ... [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...
Then I used it to make a prediction and compute the mean squared error with a linear regression:
lr = linear_model.LinearRegression()
carsInfoTrainHot = carInfoModHot.drop(["price"], axis=1) # drop labels for training set
df1 = carsInfoTrainHot.iloc[:30000, :]
carsLabels1 = carsInfoMod.iloc[:30000, 3]
print(carsInfoTrainHot.head())
df2 = carsInfoTrainHot.iloc[30001:60000, :]
carsLabels2 = carsInfoMod.iloc[30001:60000, 3]
df3 = carsInfoTrainHot.iloc[60001:, :]
carsLabels3 = carsInfoMod.iloc[60001:, 3]
lr.fit(df1, carsLabels1)
print(carsInfoTrainHot.shape)
carPrediction = lr.predict(df2)
lin_mse = mean_squared_error(carsLabels2, carPrediction)
lin_rmse = np.sqrt(lin_mse)
But I get this error:
ValueError Traceback (most recent call
last) in ()
12 carsLabels3 = carsInfoMod.iloc[60001:, 3]
13
---> 14 lr.fit(df1, carsLabels1)
15 print(carsInfoTrainHot.shape)
16 carPrediction = lr.predict(df2)
/home/vagrant/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/base.py
in fit(self, X, y, sample_weight)
510 n_jobs_ = self.n_jobs
511 X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 512 y_numeric=True, multi_output=True)
513
514 if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
/home/vagrant/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py
in check_X_y(X, y, accept_sparse, dtype, order, copy,
force_all_finite, ensure_2d, allow_nd, multi_output,
ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype,
estimator)
519 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
520 ensure_2d, allow_nd, ensure_min_samples,
--> 521 ensure_min_features, warn_on_dtype, estimator)
522 if multi_output:
523 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
/home/vagrant/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py
in check_array(array, accept_sparse, dtype, order, copy,
force_all_finite, ensure_2d, allow_nd, ensure_min_samples,
ensure_min_features, warn_on_dtype, estimator)
400 # make sure we actually converted to numeric:
401 if dtype_numeric and array.dtype.kind == "O":
--> 402 array = array.astype(np.float64)
403 if not allow_nd and array.ndim >= 3:
404 raise ValueError("Found array with dim %d. %s expected <= 2."
ValueError: setting an array element with a sequence.
From what I understand, I'm inserting an array into each cell of the categorical attribute columns, but how else can I convert the categorical values to a sparse matrix?
Thanks.

Related

ValueError: If `preds` and `target` are of shape (N, ...) and `preds` are floats, `target` should be binary

I am using torchmetrics.functional to evaluate my trained model and I get this error. I have attached what my tensor values look like, and I believe I can make out the reason behind the error: my dataset includes non-binary values as labels. How do I work around this issue? I really appreciate your time.
Evaluation:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)
val_dataset = Dataset(
val_df,
tokenizer,
max_token_len=MAX_TOKEN_COUNT
)
predictions = []
labels = []
for item in tqdm(val_dataset):
_, prediction = trained_model(
item["input_ids"].unsqueeze(dim=0).to(device),
item["attention_mask"].unsqueeze(dim=0).to(device)
)
predictions.append(prediction.flatten())
labels.append(item["labels"].int())
predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()
Tensor Value:
tensor([[0.2794, 1.0000, 0.1865, ..., 0.0341, 0.0219, 0.8706],
[0.2753, 1.0000, 0.1864, ..., 0.0352, 0.0218, 0.8693],
[0.2747, 1.0000, 0.1858, ..., 0.0421, 0.0227, 0.8290],
...,
[0.2729, 1.0000, 0.1879, ..., 0.0430, 0.0231, 0.8263],
[0.2835, 1.0000, 0.1814, ..., 0.0363, 0.0215, 0.8570],
[0.2734, 1.0000, 0.1881, ..., 0.0430, 0.0232, 0.8277]])
tensor([[0, 2, 0, ..., 0, 0, 0],
[0, 3, 0, ..., 0, 0, 0],
[0, 1, 0, ..., 0, 0, 1],
...,
[0, 2, 0, ..., 0, 0, 1],
[0, 2, 0, ..., 0, 0, 2],
[0, 1, 1, ..., 0, 0, 1]], dtype=torch.int32)
accuracy(predictions, labels, threshold=THRESHOLD)
ValueError: If preds and target are of shape (N, ...) and preds are floats, target should be binary.

TensorFlow (Neural Network) FC output size

I'm not sure whether my question is TensorFlow-specific or about neural networks in general, but I have created a CNN using TensorFlow and I'm having trouble understanding why the output size of my fully connected layer is what it is.
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)
# define model
def complex_model(X,y,is_training):
# conv layer
wconv_1 = tf.get_variable('wconv_1', [7 ,7 ,3, 32])
bconv_1 = tf.get_variable('bconv_1', [32])
# affine layer 1
w1 = tf.get_variable('w1', [26*26*32//4, 1024]) #LINE 13
b1 = tf.get_variable('b1', [1024])
# batchnorm params
bn_gamma = tf.get_variable('bn_gamma', shape=[32]) #scale
bn_beta = tf.get_variable('bn_beta', shape=[32] ) #shift
# affine layer 2
w2 = tf.get_variable('w2', [1024, 10])
b2 = tf.get_variable('b2', [10])
c1_out = tf.nn.conv2d(X, wconv_1, strides=[1, 1, 1, 1], padding="VALID") + bconv_1
activ_1 = tf.nn.relu(c1_out)
mean, var = tf.nn.moments(activ_1, axes=[0,1,2], keep_dims=False)
bn = tf.nn.batch_normalization(act_1, mean, var, bn_gamma, bn_beta, 1e-6)
mp = tf.nn.max_pool(bn, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
affine_in_flat = tf.reshape(mp, [-1, 26*26*32//4])
affine_1 = tf.matmul(affine_in_flat, w1) + b1
activ_2 = tf.nn.relu(affine_1)
affine_2 = tf.matmul(activ_2, w2) + b2
return affine_2
#print(affine_2.shape)
In line 13, where I set the value of w1, I would have expected to just put:
w1 = tf.get_variable('w1', [26*26*32, 1024])
However, if I run the code with the line shown above and with
affine_in_flat = tf.reshape(mp, [-1, 26*26*32])
my output size is (16, 10) instead of (64, 10), which is what I would expect given the initialisation below:
x = np.random.randn(64, 32, 32,3)
with tf.Session() as sess:
with tf.device("/cpu:0"): #"/cpu:0" or "/gpu:0"
tf.global_variables_initializer().run()
#print("train", x.size, is_training, y_out)
ans = sess.run(y_out,feed_dict={X:x,is_training:True})
%timeit sess.run(y_out,feed_dict={X:x,is_training:True})
print(ans.shape)
print(np.array_equal(ans.shape, np.array([64, 10])))
Can anybody tell me why I need to divide the size of w1[0] by 4?
Adding print statements for bn and mp I get:
bn: <tf.Tensor 'batchnorm/add_1:0' shape=(?, 26, 26, 32) dtype=float32>
mp: <tf.Tensor 'MaxPool:0' shape=(?, 13, 13, 32) dtype=float32>
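This is due to the strides=[1, 2, 2, 1] on the max pooling: a 2x2 pool with stride 2 halves each spatial dimension (26 -> 13), so every sample flattens to 13*13*32 = 26*26*32/4 values. Reshaping to [-1, 26*26*32] therefore packs four samples into each row, which is why the batch dimension comes out as 64/4 = 16. (To keep the 26x26 size through the pooling layer you would need strides of 1 and also padding='SAME'.)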

How to check if a particular bit is set in Elixir/ Erlang

Given any binary, for example <<1, 0, 110, 64>>, how can we determine if a particular bit is set?
Say we wish to determine if bit-1 and bit-2 are set, one would expect this to work, but it doesn't:
<<bit1::bits-size(1), bit2::bits-size(1), _rest::bits>> = <<1, 0, 110, 64>>
Gives:
iex(5)> {bit1, bit2}
{<<0::size(1)>>, <<0::size(1)>>}
Correct ANSWER (from Igor and other comments):
<<_::bits-6, bit2::bits-1, bit1::bits-1, num::bits>> = <<1, 0, 110, 64>>
Gives the expected answer:
{bit1,bit2} = {1, 0}
Background
I'm building a parser to handle this: https://msdn.microsoft.com/en-us/library/vs/alm/dd943386(v=office.12).aspx
Using this C# code as a template I get the correct result: <<1, 0, 110, 64>> = 2.4
https://github.com/ChiangHanLung/PIC_VDS/blob/f96afdd3863f5ce1df237b2784040624bc88b16b/Reference_DLL_SourceCode/NPOI/HSSF/Util/RKUtil.cs#L33-L74
My equivalent Elixir implementation of the above works as expected, but I believe using bit-string parsing should be possible (and cleaner):
def rk_number(data) do
# IO.puts " ** rk-data: #{inspect data}"
n0 = :binary.decode_unsigned(data, :little)
n1 = n0 >>> 2
n2 =
if (n0 &&& 0x2) == 0x2 do # bit-2, is an int
<<v::little-signed-32>> = <<n1::little-32>>
v
else
n3 = n1 <<< 34
<<v::little-float-64>> = <<n3::little-64>>
v
end
if (n0 &&& 0x1) == 0x1 do # bit-1, div by 100
n2 / 100
else
n2
end
end
That's because every number in the <<1, 0, 110, 64>> representation has a size of 8 bits by default.
That's why
<<bit1::bits-size(1), bit2::bits-size(1), _rest::bits>> = <<1, 0, 110, 64>>
{bit1, bit2} = {<<0::size(1)>>, <<0::size(1)>>}
Because the first two bits of 1 encoded in 8 bits (00000001) are both 0.
But
<<bit1::bits-size(8), bit2::bits-size(8), _rest::bits>> = <<1, 0, 110, 64>>
{bit1, bit2} = {<<1>>, <<0>>}
Or
<<bit1::bits-size(1), bit2::bits-size(1), _rest::bits>> = <<1::size(1), 0::size(1), 110, 64>>
{bit1, bit2} = {<<1::size(1)>>, <<0::size(1)>>}
If you have an integer and you're trying to get its first two bits, you may try something like this:
<<bit1::bits-size(1), bit2::bits-size(1), _rest::bits>> = :binary.encode_unsigned(your_integer)
I've got the answer, after considering one of the comments above:
<<_::bits-6, bit2::bits-1, bit1::bits-1, num::bits>> = <<1, 0, 110, 64>>
{bit1, bit2} = {1, 0}
which gives the expected result

How can I pass a boolean tensor to tf.cond(), not just a single boolean?

Here is the function f(x) that I want to implement with TensorFlow:
input x = (x1,x2,x3,x4,x5,x6,x7,x8,x9)
define f(x) = f1(x1,x2,x3,x4,x5) + f2(x5,x6,x7,x8,x9)
where
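f1(x1,x2,x3,x4,x5) = 1 if (x1,x2,x3,x4,x5) = (0,0,0,0,0), and g1(x1,x2,x3,x4,x5) otherwise
f2(x5,x6,x7,x8,x9) = 1 if (x5,x6,x7,x8,x9) = (0,0,0,0,0), and g2(x5,x6,x7,x8,x9) otherwise
(Note: the code below uses the constant 2 rather than 1 for the all-zero case of f1, and the expected result quoted further down is computed with that value.)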
This is my tensorflow code
import tensorflow as tf
import numpy as np
ph = tf.placeholder(dtype=tf.float32, shape=[None, 9])
x1 = tf.slice(ph, [0, 0], [-1, 5])
x2 = tf.slice(ph, [0, 4], [-1, 5])
fixed1 = tf.placeholder(dtype=tf.float32, shape=[1, 5])
fixed2 = tf.placeholder(dtype=tf.float32, shape=[1, 5])
# MLP 1
w1 = tf.Variable(tf.ones([5, 1]))
g1 = tf.matmul(x1, w1)
# MLP 2
w2 = tf.Variable(-tf.ones([5, 1]))
g2 = tf.matmul(x2, w2)
check1 = tf.reduce_all(tf.equal(x1, fixed1), axis=1, keep_dims=True)
check2 = tf.reduce_all(tf.equal(x2, fixed2), axis=1, keep_dims=True)
#### with Problem
f1 = tf.cond(check1,
lambda: tf.constant([2], dtype=tf.float32), lambda: g1)
f2 = tf.cond(check2,
lambda: tf.constant([1], dtype=tf.float32), lambda: g2)
####
f = tf.add(f1, f2)
x = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0],
[2, 0, 0, 0, 0, 0, 0, 0, 0],
[9, 0, 0, 0, 0, 0, 0, 0, 0]])
fixed = np.array([[0, 0, 0, 0, 0]])
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print('(1)\n', sess.run(check1, feed_dict={ph: x, fixed1: fixed, fixed2: fixed}))
print('(2)\n', sess.run(check2, feed_dict={ph: x, fixed1: fixed, fixed2: fixed}))
print('(3)\n', sess.run(f, feed_dict={ph: x, fixed1: fixed, fixed2: fixed}))
print('(4)\n', sess.run(f1, feed_dict={ph: x, fixed1: fixed, fixed2: fixed}))
print('(5)\n', sess.run(f2, feed_dict={ph: x, fixed1: fixed, fixed2: fixed}))
In this case,
check1 is [[ True], [ True], [False], [False], [False]] with shape (5, 1)
check2 is [[ True], [False], [ True], [ True], [ True]] with shape (5, 1)
I expect the result of f to be [[3], [1], [2], [3], [10]],
but it seems that tf.cond() cannot handle a boolean tensor of shape (5, 1) as its predicate.
Could you advise how to implement f(x) with TensorFlow, please?
This is the error message I received:
Traceback (most recent call last): File
"C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\common_shapes.py",
line 670, in _call_cpp_shape_fn_impl
status) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\contextlib.py",
line 66, in exit
next(self.gen) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py",
line 469, in raise_exception_on_not_ok_status
pywrap_tensorflow.TF_GetCode(status)) tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape
must be rank 0 but is rank 2 for 'cond/Switch' (op: 'Switch') with
input shapes: [?,1], [?,1].
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File
"C:/Users/hong/Dropbox/MLILAB/Research/GM-MLP/code/tensorflow_cond.py",
line 23, in
lambda: tf.constant([2], dtype=tf.float32), lambda: g1) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py",
line 1765, in cond
p_2, p_1 = switch(pred, pred) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py",
line 318, in switch
return gen_control_flow_ops._switch(data, pred, name=name) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_control_flow_ops.py",
line 368, in _switch
result = _op_def_lib.apply_op("Switch", data=data, pred=pred, name=name) File
"C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py",
line 759, in apply_op
op_def=op_def) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py",
line 2242, in create_op
set_shapes_for_outputs(ret) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py",
line 1617, in set_shapes_for_outputs
shapes = shape_func(op) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py",
line 1568, in call_with_requiring
return call_cpp_shape_fn(op, require_shape_fn=True) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\common_shapes.py",
line 610, in call_cpp_shape_fn
debug_python_shape_fn, require_shape_fn) File "C:\Users\hong\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\common_shapes.py",
line 675, in _call_cpp_shape_fn_impl
raise ValueError(err.message) ValueError: Shape must be rank 0 but is rank 2 for 'cond/Switch' (op: 'Switch') with input shapes: [?,1],
[?,1].
Process finished with exit code 1
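I think you need tf.where, not tf.cond. tf.cond expects a single scalar (rank-0) boolean predicate, which is why you get "Shape must be rank 0 but is rank 2", whereas tf.where selects element-wise between two tensors according to a boolean tensor.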
See the answer to this question: How to use tf.cond for batch processing

Why is my convolution autoencoder not getting trained properly?

Why is my convolutional autoencoder not converging properly? I have a very simple layer stack.
Encoder: Conv/ReLU(Kernel size: 7x7, stride = 1, padding = 0) => maxPool(kernel size=2x2, stride = 2) => Conv/ReLU(Kernel size: 5x5, stride = 1, padding = 0) => MaxPool(kernel size=2x2, stride = 2)
Decoder: Nearest Neighbour Upsampling => Deconv/ReLU => Nearest Neighbour Upsampling => Deconv/ReLU
The training images are of size 30x30x1.
I tried to train it with 1000 images over 1000 epochs, but the error (MSE) is still 120.
BATCH_SIZE = 100
IMAGE_SIZE = 30
NUM_CHANNELS = 1
num_images = 1000
def init_weights(shape):
return tf.Variable(tf.random_normal(shape, stddev=0.01))
def encoder(X, w, w2, wd, wd2):
l1a = tf.nn.relu(tf.nn.conv2d(X, w,strides=[1, 1, 1, 1], padding='VALID'))
l1 = tf.nn.max_pool(l1a, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
l2a = tf.nn.relu(tf.nn.conv2d(l1, w2,strides=[1, 1, 1, 1], padding='VALID'))
l2 = tf.nn.max_pool(l2a, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
l1da = tf.image.resize_images(l2, 8, 8, 1, align_corners=False)
output_shapel1d = tf.convert_to_tensor([BATCH_SIZE, 12, 12, 32], dtype=tf.int32);
l1d = tf.nn.relu(tf.nn.conv2d_transpose(l1da, wd, output_shapel1d, strides=[1, 1, 1, 1], padding='VALID'))
l2da = tf.image.resize_images(l1d, 24, 24, 1, align_corners=False)
output_shapel2d = tf.convert_to_tensor([BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS], dtype=tf.int32);
l2d = tf.nn.relu(tf.nn.conv2d_transpose(l2da, wd2, output_shapel2d, strides=[1, 1, 1, 1], padding='VALID'))
return l2d
complete_image = extract_data(0, 1000)
trX = complete_image[0:900]
trY = trX
teX = complete_image[900:1000]
teY = teX
X = tf.placeholder("float", [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
Y = tf.placeholder("float", [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS])
w = init_weights([7, 7, 1, 32])
w2 = init_weights([5, 5, 32, 64])
wd = init_weights([5, 5, 32, 64])
wd2 = init_weights([7, 7, 1, 32])
py_x = encoder(X, w, w2, wd, wd2)
cost = tf.reduce_mean(tf.squared_difference(py_x, Y, name = None))
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
predict_op = py_x;
global_step = tf.Variable(0, name='global_step', trainable=False)
saver = tf.train.Saver()
with tf.Session() as sess:
tf.initialize_all_variables().run()
start = global_step.eval() # get last global_step
print "Start from:", start
if FLAGS.output == "train":
for i in range(start, 500):
training_batch = zip(range(0, num_images - BATCH_SIZE, batch_size),
range(batch_size, num_images - BATCH_SIZE, batch_size))
for start, end in training_batch:
sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]})
total_epoch_cost += sess.run(cost, feed_dict={X: trX[start:end], Y: trY[start:end]})
avg_epoch_cost = total_epoch_cost/BATCH_SIZE
print "cost during epoch " + `i` + "is ", avg_epoch_cost
I have added the complete code in this gist with slight modifications. I am training this with around 10,000 images, and the error after 488 epochs is 74.8.

Resources