Ordinal Encoding ValueError Issue - machine-learning

To avoid data leakage, the advised way to apply a transformation is:
1. Create an instance of the OrdinalEncoder class.
2. Fit the encoder to the training data and transform it using the fit_transform() method.
3. Transform the test data using the encoder's transform() method.
I did this initially, but I kept getting an error message. I think it's because the dataset contains both categorical and numerical features.
fit_transform worked on the train dataset, but transform kept returning an error on the test data. It was only when I used fit_transform on the test data as well that the error stopped. This, I believe, is not ideal, as that would be data leakage in itself.
Is there a way to carry out the above steps on only the categorical features of the train data and then transform the test dataset without an error?
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

enc_cat = OrdinalEncoder()
enc_tag = LabelEncoder()
y_train_transf_cd = enc_tag.fit_transform(y_train_cd)
y_test_transf_cd = enc_tag.transform(y_test_cd)
x_train_transf_cd = enc_cat.fit_transform(x_train_cd)  # works
x_test_transf_cd = enc_cat.transform(x_test_cd)        # raises ValueError
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1316\78241529.py in <module>
6
7 x_train_transf_cd = enc_cat.fit_transform(x_train_cd)
----> 8 x_test_transf_cd = enc_cat.transform(x_test_cd)
~\anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in transform(self, X)
928 Transformed input.
929 """
--> 930 X_int, X_mask = self._transform(
931 X, handle_unknown=self.handle_unknown, force_all_finite="allow-nan"
932 )
~\anaconda3\lib\site-packages\sklearn\preprocessing\_encoders.py in _transform(self, X, handle_unknown, force_all_finite, warn_on_unknown)
140 " during transform".format(diff, i)
141 )
--> 142 raise ValueError(msg)
143 else:
144 if warn_on_unknown:
ValueError: Found unknown categories [647, 677, 726, 778, 787, 816, 822, 823, 881, 899, 944, 951, 963, 1016, 1033, 1063, 1120, 1151, 1192, 1197, 1199, 1209, 1240, 1254, 1255, 1321, 1325, 1335, 1385, 1387, 1444, 1479, 1498, 1519, 1642, 1655, 1668, 1681, 1698, 1719, 1749, 1755, 1757, 1765, 1785, 1787, 1823, 1834, 1858, 1874, 1875, 1884, 1890, 1902, 1986, 2022, 2036, 2055, 2080, 2100, 2119, 2142, 2160, 2193, 2210, 2227, 2231, 2257, 2263, 2268, 2269, 2295, 2317, 2330, 2382, 2400, 2414, 2416, 2419,
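One way to address both problems is to restrict the encoder to the categorical columns and tell it how to handle categories it has not seen. Note that the "unknown categories" in the traceback are integers, which suggests a numeric column is being ordinal-encoded as well. A minimal sketch, assuming x_train_cd and x_test_cd are pandas DataFrames whose categorical columns have dtype object (handle_unknown='use_encoded_value' requires scikit-learn >= 0.24):
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder

# Encode only the object-dtype columns; pass numeric columns through untouched.
# Categories unseen during fit are mapped to -1 instead of raising ValueError.
ct = ColumnTransformer(
    transformers=[('cat',
                   OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
                   make_column_selector(dtype_include=object))],
    remainder='passthrough')

x_train_transf_cd = ct.fit_transform(x_train_cd)  # fit on the training data only
x_test_transf_cd = ct.transform(x_test_cd)        # reuse the fitted encoder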

Related

InvalidArgumentError when building sequential model using Keras

I want to develop a sequential model using optimal hyperparameters derived from Keras Tuner. My code raised an "Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:]" error.
I tried adding +1 or +10 to the entire X_train but still got the same error.
Build the sequential model:
def model_builder(hp):
    model = Sequential()
    # Tune the number of units in the first Dense layer
    # Choose an optimal value between 32-512
    hp_units1 = hp.Int('units1', min_value=32, max_value=512, step=32)
    hp_units2 = hp.Int('units2', min_value=32, max_value=512, step=32)
    hp_units3 = hp.Int('units3', min_value=32, max_value=512, step=32)
    model.add(Dense(units=hp_units1, activation='relu'))
    model.add(tf.keras.layers.Dense(units=hp_units2, activation='relu'))
    model.add(tf.keras.layers.Dense(units=hp_units3, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))  # output layer
    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    # Performance visualization callback
    performance_viz_cbk = PerformanceVisualizationCallback(
        model=model,
        test_data=X_test,
        image_dir='c:\perorfmance_charts')
    # Compile the model
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy', tf.keras.metrics.AUC(), MulticlassTruePositives()])
    return model
Hyperparameter tuning:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
Traceback:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/tmp/ipykernel_17/1976310635.py in <module>
----> 1 tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/base_tuner.py in search(self, *fit_args, **fit_kwargs)
177
178 self.on_trial_begin(trial)
--> 179 results = self.run_trial(trial, *fit_args, **fit_kwargs)
180 # `results` is None indicates user updated oracle in `run_trial()`.
181 if results is None:
/opt/conda/lib/python3.7/site-packages/keras_tuner/tuners/hyperband.py in run_trial(self, trial, *fit_args, **fit_kwargs)
382 fit_kwargs["epochs"] = hp.values["tuner/epochs"]
383 fit_kwargs["initial_epoch"] = hp.values["tuner/initial_epoch"]
--> 384 return super(Hyperband, self).run_trial(trial, *fit_args, **fit_kwargs)
385
386 def _build_model(self, hp):
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/tuner.py in run_trial(self, trial, *args, **kwargs)
292 callbacks.append(model_checkpoint)
293 copied_kwargs["callbacks"] = callbacks
--> 294 obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
295
296 histories.append(obj_value)
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/tuner.py in _build_and_fit_model(self, trial, *args, **kwargs)
220 hp = trial.hyperparameters
221 model = self._try_build(hp)
--> 222 results = self.hypermodel.fit(hp, model, *args, **kwargs)
223 return tuner_utils.convert_to_metrics_dict(
224 results, self.oracle.objective, "HyperModel.fit()"
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/hypermodel.py in fit(self, hp, model, *args, **kwargs)
135 If return a float, it should be the `objective` value.
136 """
--> 137 return model.fit(*args, **kwargs)
138
139
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1182 _r=1):
1183 callbacks.on_train_batch_begin(step)
-> 1184 tmp_logs = self.train_function(iterator)
1185 if data_handler.should_sync:
1186 context.async_wait()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
883
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
886
887 new_tracing_count = self.experimental_get_tracing_count()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
948 # Lifting succeeded, so variables are initialized and we can run the
949 # stateless function.
--> 950 return self._stateless_fn(*args, **kwds)
951 else:
952 _, _, _, filtered_flat_args = \
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
3039 return graph_function._call_flat(
-> 3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
3041
3042 @property
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1962 # No tape is watching; skip to running the function.
1963 return self._build_call_outputs(self._inference_function.call(
-> 1964 ctx, args, cancellation_manager=cancellation_manager))
1965 forward_backward = self._select_forward_and_backward_functions(
1966 args,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
594 inputs=args,
595 attrs=attrs,
--> 596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_3/BiasAdd:0) = ] [[-1.17921221][-1.64039445][-1.71617472]...] [y (Cast_8/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
(1) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_3/BiasAdd:0) = ] [[-1.17921221][-1.64039445][-1.71617472]...] [y (Cast_8/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
[[assert_less_equal/Assert/AssertGuard/pivot_f/_13/_41]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_7509]
Function call stack:
train_function -> train_function
Sample data:
X_train
array([[7.97469882, 3.5471509 , 5.67233186, ..., 5.40341065, 3.16356072,
4.62657322],
[4.5390077 , 5.19247562, 5.21946796, ..., 4.12139197, 5.62828788,
4.31792717],
[5.83576307, 6.42946189, 4.85759085, ..., 5.26563348, 5.48022682,
5.29933021],
[4.58240518, 5.02457724, 6.1146384 , ..., 5.47847468, 6.62666587,
5.77328054],
[4.51602713, 5.17875946, 4.57102455, ..., 4.15622882, 5.48888824,
4.88704 ]])
y_train
array([[1],
[3],
[1],
[0],
[3]])
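The assertion comes from tf.keras.metrics.AUC, which checks that predictions lie in [0, 1]; the model's single linear output unit emits raw logits, and the sample y_train contains labels 0-3, so the one-unit output layer does not match the four-class target either. A minimal sketch of a compatible output and compile step, assuming four classes as in the sample data (the asker's custom callback and metric are omitted):
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

num_classes = 4  # assumption: labels are 0..3, as in the sample y_train

model = Sequential([
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),  # probabilities, not raw logits
])
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    # from_logits=False because the softmax output is already a probability
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'])  # re-add AUC only with label shapes it supports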

Cleaning Data in CSV file for ML Model

I'm trying to clean my data in JupyterLab by following several tutorials, but I keep getting one error or another every time. So I thought I'd come to Stack Overflow and ask if someone can help me.
This is the CSV file I want to clean: https://1drv.ms/u/s!AvOXB8kb-IHBgjaveis044GVoPpk
I'm building a machine learning model, so I want to convert all the object values, but I don't know how to.
EDIT: I tried cleaning the data from scratch.
My code input:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

criminal_data = pd.read_csv('database2.csv')
X = criminal_data.drop(columns=['Agency Type', 'City', 'State',
                                'Crime Solved'])
y = criminal_data['City']
model = DecisionTreeClassifier()
model.fit(X, y)
criminal_data
The error message:
ValueError Traceback (most recent call last)
<ipython-input-117-4b6968f9994f> in <module>
6 y = criminal_data['City']
7 model = DecisionTreeClassifier()
----> 8 model.fit(X, y)
9 criminal_data
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
896 """
897
--> 898 super().fit(
899 X, y,
900 sample_weight=sample_weight,
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
154 check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
155 check_y_params = dict(ensure_2d=False, dtype=None)
--> 156 X, y = self._validate_data(X, y,
157 validate_separately=(check_X_params,
158 check_y_params))
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
428 # :(
429 check_X_params, check_y_params = validate_separately
--> 430 X = check_array(X, **check_X_params)
431 y = check_array(y, **check_y_params)
432 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe", copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order, like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
~\anaconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
1897
1898 def __array__(self, dtype=None) -> np.ndarray:
-> 1899 return np.asarray(self._values, dtype=dtype)
1900
1901 def __array_wrap__(
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order, like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
ValueError: could not convert string to float: 'Anchorage'
You are trying to train your model with data that is not numerical. Before fitting the model, you need to encode it. You can try LabelEncoder for that:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
for column_name in X.columns:
    if X[column_name].dtype == object:
        X[column_name] = le.fit_transform(X[column_name])
    else:
        pass
If you have a combination of different data types in a column, try the following:
import numpy as np
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
for column_name in X.columns:
    X[column_name] = X[column_name].replace(np.nan, 'none', regex=True)
    X[column_name] = le.fit_transform(X[column_name].astype(str))
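As an alternative sketch (not part of the answer above): one-hot encoding with pandas.get_dummies avoids implying an order between categories the way integer codes do. This reuses X, y, and model from the question:
import pandas as pd

# One-hot encode every object-dtype column in a single call
X_encoded = pd.get_dummies(X, columns=X.select_dtypes(include='object').columns)
model.fit(X_encoded, y)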

DeepChem GraphConvModel (GNN) training TypeError

I am a beginner with GNNs and I was trying out code for predicting drug toxicity using DeepChem's Tox21 dataset. It is a dataset with a training set of 12 thousand compounds and a test set of 650 compounds. I need help in debugging and rectifying this error: "TypeError: 'NoneType' object is not subscriptable", which I get at the end.
Here is the code snippet:
model = GraphConvModel(len(tox21_tasks),
                       batch_size=32,
                       mode='classification')
print("Fitting the model")
model.fit(train_dataset, nb_epoch=10)
And here is my error:
TypeError Traceback (most recent call last)
<ipython-input-5-8088249b7fd6> in <module>
4 mode='classification')
5 print("Fitting the model")
----> 6 model.fit(train_dataset, nb_epoch=10)
~\anaconda3\lib\site-packages\deepchem\models\keras_model.py in fit(self, dataset, nb_epoch, max_checkpoints_to_keep, checkpoint_interval, deterministic, restore, variables, loss, callbacks, all_losses)
322 dataset, epochs=nb_epoch,
323 deterministic=deterministic), max_checkpoints_to_keep,
--> 324 checkpoint_interval, restore, variables, loss, callbacks, all_losses)
325
326 def fit_generator(self,
~\anaconda3\lib\site-packages\deepchem\models\keras_model.py in fit_generator(self, generator, max_checkpoints_to_keep, checkpoint_interval, restore, variables, loss, callbacks, all_losses)
407 inputs = inputs[0]
408
--> 409 batch_loss = apply_gradient_for_batch(inputs, labels, weights, loss)
410 current_step = self._global_step.numpy()
411
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
501 # This is the first call of __call__, so we have to initialize.
502 initializer_map = object_identity.ObjectIdentityDictionary()
--> 503 self._initialize(args, kwds, add_initializers_to=initializer_map)
504 finally:
505 # At this point we know that the initialization is complete (or less
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in _initialize(self, args, kwds, add_initializers_to)
406 self._concrete_stateful_fn = (
407 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 408 *args, **kwds))
409
410 def invalid_creator_scope(*unused_args, **unused_kwds):
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
1846 if self.input_signature:
1847 args, kwargs = None, None
-> 1848 graph_function, _, _ = self._maybe_define_function(args, kwargs)
1849 return graph_function
1850
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _maybe_define_function(self, args, kwargs)
2148 graph_function = self._function_cache.primary.get(cache_key, None)
2149 if graph_function is None:
-> 2150 graph_function = self._create_graph_function(args, kwargs)
2151 self._function_cache.primary[cache_key] = graph_function
2152 return graph_function, args, kwargs
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2039 arg_names=arg_names,
2040 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2041 capture_by_value=self._capture_by_value),
2042 self._function_attributes,
2043 # Tell the ConcreteFunction to clean up its graph once it goes out of
~\anaconda3\lib\site-packages\tensorflow_core\python\framework\func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
913 converted_func)
914
--> 915 func_outputs = python_func(*func_args, **func_kwargs)
916
917 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in wrapped_fn(*args, **kwds)
356 # __wrapped__ allows AutoGraph to swap in a converted function. We give
357 # the function a weak reference to itself to avoid a reference cycle.
--> 358 return weak_wrapped_fn().__wrapped__(*args, **kwds)
359 weak_wrapped_fn = weakref.ref(wrapped_fn)
360
~\anaconda3\lib\site-packages\tensorflow_core\python\framework\func_graph.py in wrapper(*args, **kwargs)
903 except Exception as e: # pylint:disable=broad-except
904 if hasattr(e, "ag_error_metadata"):
--> 905 raise e.ag_error_metadata.to_exception(e)
906 else:
907 raise
TypeError: in converted code:
relative to C:\Users\Madiha\anaconda3\lib\site-packages:
deepchem\models\keras_model.py:474 apply_gradient_for_batch *
grads = tape.gradient(batch_loss, vars)
tensorflow_core\python\eager\backprop.py:1014 gradient
unconnected_gradients=unconnected_gradients)
tensorflow_core\python\eager\imperative_grad.py:76 imperative_grad
compat.as_str(unconnected_gradients.value))
tensorflow_core\python\eager\backprop.py:138 _gradient_function
return grad_fn(mock_op, *out_grads)
tensorflow_core\python\ops\math_grad.py:455 _UnsortedSegmentMaxGrad
return _UnsortedSegmentMinOrMaxGrad(op, grad)
tensorflow_core\python\ops\math_grad.py:432 _UnsortedSegmentMinOrMaxGrad
_GatherDropNegatives(op.outputs[0], op.inputs[1])
TypeError: 'NoneType' object is not subscriptable
As advice, check some of the examples on the DeepChem website. Here is code that should work:
import deepchem as dc

tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets
model = dc.models.GraphConvModel(len(tasks),
                                 batch_size=32,
                                 mode='classification')
print("Fitting the model")
model.fit(train_dataset)
Hope it works for you!

TypeError: add(): argument 'other' (position 1) must be Tensor, not numpy.ndarray

I am testing a ResNet-34 trained model using PyTorch and fastai on a Linux system with the latest Anaconda3. To run it as a batch job, I commented out the GUI-related lines. It started to run for a few hours, then stopped in the validation step; the error message is below.
...
100%|█████████▉| 452/453 [1:07:07<00:08, 8.75s/it, loss=1.23]
Validation: 0%| | 0/40 [00:00<?, ?it/s]
Traceback (most recent call last):
File "./resnet34_pretrained_PNG_nogui_2.py", line 279, in <module>
learner.fit(lr,1,callbacks=[f1_callback])
File "/project/6000192/jemmyhu/resnet_png/fastai/learner.py", line 302,
in fit
return self.fit_gen(self.model, self.data, layer_opt, n_cycle,
**kwargs)
File "/project/6000192/jemmyhu/resnet_png/fastai/learner.py", line 249,
in fit_gen
swa_eval_freq=swa_eval_freq, **kwargs)
File "/project/6000192/jemmyhu/resnet_png/fastai/model.py", line 162, in
fit
vals = validate(model_stepper, cur_data.val_dl, metrics, epoch,
seq_first=seq_first, validate_skip = validate_skip)
File "/project/6000192/jemmyhu/resnet_png/fastai/model.py", line 241, in
validate
res.append([to_np(f(datafy(preds), datafy(y))) for f in metrics])
File "/project/6000192/jemmyhu/resnet_png/fastai/model.py", line 241, in
<listcomp>
res.append([to_np(f(datafy(preds), datafy(y))) for f in metrics])
File "./resnet34_pretrained_PNG_nogui_2.py", line 237, in __call__
self.TP += (preds*targs).float().sum(dim=0)
TypeError: add(): argument 'other' (position 1) must be Tensor, not
numpy.ndarray
The link to the original code is:
https://www.kaggle.com/iafoss/pretrained-resnet34-with-rgby-0-460-public-lb
Lines 279 and 237 in my copy are shown below:
226 class F1:
227     __name__ = 'F1 macro'
228     def __init__(self,n=28):
229         self.n = n
230         self.TP = np.zeros(self.n)
231         self.FP = np.zeros(self.n)
232         self.FN = np.zeros(self.n)
233
234     def __call__(self,preds,targs,th=0.0):
235         preds = (preds > th).int()
236         targs = targs.int()
237         self.TP += (preds*targs).float().sum(dim=0)
238         self.FP += (preds > targs).float().sum(dim=0)
239         self.FN += (preds < targs).float().sum(dim=0)
240         score = (2.0*self.TP/(2.0*self.TP + self.FP + self.FN + 1e-6)).mean()
241         return score

276 lr = 0.5e-2
277 with warnings.catch_warnings():
278     warnings.simplefilter("ignore")
279     learner.fit(lr,1,callbacks=[f1_callback])
Does anyone have a clue about the issue?
Many thanks,
Jemmy
OK, the error seems to be with the latest pytorch-1.0.0; when I downgrade pytorch to pytorch-0.4.1, the code seems to work (it gets past the error lines at this point). I still have no idea how to make the code work with pytorch-1.0.0.
I have had the same issue with this Kaggle kernel. My workarounds are the following:
1st option: in the F1 __call__ method, convert preds and targs from PyTorch tensors to numpy arrays;
2nd option: initialise TP/FP/FN with PyTorch tensors instead of numpy arrays, i.e. replace np.zeros(self.n) with torch.zeros(1, self.n).
Basically, the main idea is that all variables should be of the same type; a sketch of the second option is shown below.
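A minimal sketch of the 2nd option, assuming preds and targs arrive as PyTorch tensors of shape (batch, n) as in the kernel above:
import torch

class F1:
    __name__ = 'F1 macro'
    def __init__(self, n=28):
        # accumulators are torch tensors, so += with tensor batch sums type-checks
        self.TP = torch.zeros(1, n)
        self.FP = torch.zeros(1, n)
        self.FN = torch.zeros(1, n)

    def __call__(self, preds, targs, th=0.0):
        preds = (preds > th).int()
        targs = targs.int()
        self.TP += (preds * targs).float().sum(dim=0)
        self.FP += (preds > targs).float().sum(dim=0)
        self.FN += (preds < targs).float().sum(dim=0)
        return (2.0 * self.TP / (2.0 * self.TP + self.FP + self.FN + 1e-6)).mean()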
Check your input data parameter.
Make sure it is 123123, not [123123].

Dask: ValueError: Integer column has NA values

I tried to use dask and found something that appears to be a bug in dask.dataframe.read_csv.
import os
import dask.dataframe as dd

types = {'id': 'int16', 'Semana': 'uint8', 'Agencia_ID': 'uint16', 'Canal_ID': 'uint8',
         'Ruta_SAK': 'uint16', 'Cliente_ID': 'float32', 'Producto_ID': 'float32'}
name_map = {'Semana': 'week', 'Agencia_ID': 'agency', 'Canal_ID': 'channel',
            'Ruta_SAK': 'route', 'Cliente_ID': 'client', 'Producto_ID': 'prod'}
test = dd.read_csv(os.path.join(datadir, 'test.csv'), usecols=types.keys(), dtype=types)
test = test.rename(columns=name_map)
gives:
ValueError: Integer column has NA values in column 1
However, the same pandas read_csv operation completes fine and does not yield any NAs:
import pandas as pd

types = {'id': 'int16', 'Semana': 'uint8', 'Agencia_ID': 'uint16', 'Canal_ID': 'uint8',
         'Ruta_SAK': 'uint16', 'Cliente_ID': 'float32', 'Producto_ID': 'float32'}
name_map = {'Semana': 'week', 'Agencia_ID': 'agency', 'Canal_ID': 'channel',
            'Ruta_SAK': 'route', 'Cliente_ID': 'client', 'Producto_ID': 'prod'}
test = pd.read_csv(os.path.join(datadir, 'test.csv'), usecols=types.keys(), dtype=types)
test = test.rename(columns=name_map)
test.isnull().any()
id False
week False
agency False
channel False
route False
client False
prod False
dtype: bool
Should I consider this to be an established bug and raise a JIRA for it?
Full traceback:
ValueError Traceback (most recent call last)
in ()
4 'Ruta_SAK': 'route', 'Cliente_ID': 'client', 'Producto_ID': 'prod'}
5
----> 6 test = dd.read_csv(os.path.join(datadir, 'test.csv'), usecols=types.keys(), dtype=types)
7 test = test.rename(columns=name_map)
D:\PROGLANG\Anaconda2\lib\site-packages\dask\dataframe\csv.pyc in read_csv(filename, blocksize, chunkbytes, collection, lineterminator, compression, sample, enforce, storage_options, **kwargs)
195 else:
196 header = sample.split(b_lineterminator)[0] + b_lineterminator
--> 197 head = pd.read_csv(BytesIO(sample), **kwargs)
198
199 df = read_csv_from_bytes(values, header, head, kwargs,
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
560 skip_blank_lines=skip_blank_lines)
561
--> 562 return _read(filepath_or_buffer, kwds)
563
564 parser_f.__name__ = name
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
323 return parser
324
--> 325 return parser.read()
326
327 _parser_defaults = {
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
813 raise ValueError('skip_footer not supported for iteration')
814
--> 815 ret = self._engine.read(nrows)
816
817 if self.options.get('as_recarray'):
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
1312 def read(self, nrows=None):
1313 try:
-> 1314 data = self._reader.read(nrows)
1315 except StopIteration:
1316 if self._first_chunk:
pandas\parser.pyx in pandas.parser.TextReader.read (pandas\parser.c:8748)()
pandas\parser.pyx in pandas.parser.TextReader._read_low_memory (pandas\parser.c:9003)()
pandas\parser.pyx in pandas.parser.TextReader._read_rows (pandas\parser.c:10022)()
pandas\parser.pyx in pandas.parser.TextReader._convert_column_data (pandas\parser.c:11397)()
pandas\parser.pyx in pandas.parser.TextReader._convert_tokens (pandas\parser.c:12093)()
pandas\parser.pyx in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:13057)()
ValueError: Integer column has NA values in column 1
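A hedged workaround in the meantime, reusing datadir from the question: since floats can represent NaN, read the affected integer columns as floats first, then fill the NAs and downcast once they are handled.
import os
import dask.dataframe as dd

# Read formerly-integer columns as floats so NA values parse cleanly
types = {'id': 'float32', 'Semana': 'float32', 'Agencia_ID': 'float32',
         'Canal_ID': 'float32', 'Ruta_SAK': 'float32',
         'Cliente_ID': 'float32', 'Producto_ID': 'float32'}
test = dd.read_csv(os.path.join(datadir, 'test.csv'),
                   usecols=list(types.keys()), dtype=types)
# then, e.g., test['id'] = test['id'].fillna(-1).astype('int16')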
