Dask: ValueError: Integer column has NA values - dask

I tried to use dask and found something that appears to be a bug in dask.dataframe.read_csv.
import dask.dataframe as dd
types = {'id': 'int16', 'Semana': 'uint8', 'Agencia_ID': 'uint16', 'Canal_ID': 'uint8',
'Ruta_SAK': 'uint16' ,'Cliente_ID': 'float32', 'Producto_ID': 'float32'}
name_map = {'Semana': 'week', 'Agencia_ID': 'agency', 'Canal_ID': 'channel',
'Ruta_SAK': 'route', 'Cliente_ID': 'client', 'Producto_ID': 'prod'}
test = dd.read_csv(os.path.join(datadir, 'test.csv'), usecols=types.keys(), dtype=types)
test = test.rename(columns=name_map)
gives :
ValueError: Integer column has NA values in column 1
However, the same pandas read_csv operation completes fine and does not yield any NA:
types = {'id': 'int16', 'Semana': 'uint8', 'Agencia_ID': 'uint16', 'Canal_ID': 'uint8',
'Ruta_SAK': 'uint16' ,'Cliente_ID': 'float32', 'Producto_ID': 'float32'}
name_map = {'Semana': 'week', 'Agencia_ID': 'agency', 'Canal_ID': 'channel',
'Ruta_SAK': 'route', 'Cliente_ID': 'client', 'Producto_ID': 'prod'}
test = pd.read_csv(os.path.join(datadir, 'test.csv'), usecols=types.keys(), dtype=types)
test = test.rename(columns=name_map)
test.isnull().any()
id False
week False
agency False
channel False
route False
client False
prod False
dtype: bool
Should I consider this to be an established bug and raise a JIRA for it?
Full traceback:
ValueError Traceback (most recent call last)
in ()
4 'Ruta_SAK': 'route', 'Cliente_ID': 'client', 'Producto_ID': 'prod'}
5
----> 6 test = dd.read_csv(os.path.join(datadir, 'test.csv'), usecols=types.keys(), dtype=types)
7 test = test.rename(columns=name_map)
D:\PROGLANG\Anaconda2\lib\site-packages\dask\dataframe\csv.pyc in read_csv(filename, blocksize, chunkbytes, collection, lineterminator, compression, sample, enforce, storage_options, **kwargs)
195 else:
196 header = sample.split(b_lineterminator)[0] + b_lineterminator
--> 197 head = pd.read_csv(BytesIO(sample), **kwargs)
198
199 df = read_csv_from_bytes(values, header, head, kwargs,
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
560 skip_blank_lines=skip_blank_lines)
561
--> 562 return _read(filepath_or_buffer, kwds)
563
564 parser_f.name = name
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
323 return parser
324
--> 325 return parser.read()
326
327 _parser_defaults = {
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
813 raise ValueError('skip_footer not supported for iteration')
814
--> 815 ret = self._engine.read(nrows)
816
817 if self.options.get('as_recarray'):
D:\PROGLANG\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
1312 def read(self, nrows=None):
1313 try:
-> 1314 data = self._reader.read(nrows)
1315 except StopIteration:
1316 if self._first_chunk:
pandas\parser.pyx in pandas.parser.TextReader.read (pandas\parser.c:8748)()
pandas\parser.pyx in pandas.parser.TextReader._read_low_memory (pandas\parser.c:9003)()
pandas\parser.pyx in pandas.parser.TextReader._read_rows (pandas\parser.c:10022)()
pandas\parser.pyx in pandas.parser.TextReader._convert_column_data (pandas\parser.c:11397)()
pandas\parser.pyx in pandas.parser.TextReader._convert_tokens (pandas\parser.c:12093)()
pandas\parser.pyx in pandas.parser.TextReader._convert_with_dtype (pandas\parser.c:13057)()
ValueError: Integer column has NA values in column 1

Related

InvalidArgumentError when building sequential model using Keras

I want to develop a sequential model using optimal hyperparameters derived from Keras Tuner. My code raised "Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:]" error.
I tried adding +1 or +10 to the entire X_train but still get the same error.
Build the sequential model:
def model_builder(hp):
model = Sequential()
# Tune the number of units in the first Dense layer
# Choose an optimal value between 32-512
hp_units1 = hp.Int('units1', min_value=32, max_value=512, step=32)
hp_units2 = hp.Int('units2', min_value=32, max_value=512, step=32)
hp_units3 = hp.Int('units3', min_value=32, max_value=512, step=32)
model.add(Dense(units=hp_units1, activation='relu'))
model.add(tf.keras.layers.Dense(units=hp_units2, activation='relu'))
model.add(tf.keras.layers.Dense(units=hp_units3, activation='relu'))
model.add(Dense(1, kernel_initializer='normal', activation='linear')) # output layer
# Tune the learning rate for the optimizer
# Choose an optimal value from 0.01, 0.001, or 0.0001
hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
# Performance visualization callback
performance_viz_cbk = PerformanceVisualizationCallback(
model=model,
test_data=X_test,
image_dir='c:\perorfmance_charts')
# Model summary
model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
loss=SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy', tf.keras.metrics.AUC(), MulticlassTruePositives()])
return model
Hyperparameter tuning
tuner = kt.Hyperband(model_builder,
objective='val_accuracy',
max_epochs=10,
factor=3,
directory='my_dir',
project_name='intro_to_kt')
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
Traceback:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/tmp/ipykernel_17/1976310635.py in <module>
----> 1 tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/base_tuner.py in search(self, *fit_args, **fit_kwargs)
177
178 self.on_trial_begin(trial)
--> 179 results = self.run_trial(trial, *fit_args, **fit_kwargs)
180 # `results` is None indicates user updated oracle in `run_trial()`.
181 if results is None:
/opt/conda/lib/python3.7/site-packages/keras_tuner/tuners/hyperband.py in run_trial(self, trial, *fit_args, **fit_kwargs)
382 fit_kwargs["epochs"] = hp.values["tuner/epochs"]
383 fit_kwargs["initial_epoch"] = hp.values["tuner/initial_epoch"]
--> 384 return super(Hyperband, self).run_trial(trial, *fit_args, **fit_kwargs)
385
386 def _build_model(self, hp):
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/tuner.py in run_trial(self, trial, *args, **kwargs)
292 callbacks.append(model_checkpoint)
293 copied_kwargs["callbacks"] = callbacks
--> 294 obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
295
296 histories.append(obj_value)
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/tuner.py in _build_and_fit_model(self, trial, *args, **kwargs)
220 hp = trial.hyperparameters
221 model = self._try_build(hp)
--> 222 results = self.hypermodel.fit(hp, model, *args, **kwargs)
223 return tuner_utils.convert_to_metrics_dict(
224 results, self.oracle.objective, "HyperModel.fit()"
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/hypermodel.py in fit(self, hp, model, *args, **kwargs)
135 If return a float, it should be the `objective` value.
136 """
--> 137 return model.fit(*args, **kwargs)
138
139
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1182 _r=1):
1183 callbacks.on_train_batch_begin(step)
-> 1184 tmp_logs = self.train_function(iterator)
1185 if data_handler.should_sync:
1186 context.async_wait()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
883
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
886
887 new_tracing_count = self.experimental_get_tracing_count()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
948 # Lifting succeeded, so variables are initialized and we can run the
949 # stateless function.
--> 950 return self._stateless_fn(*args, **kwds)
951 else:
952 _, _, _, filtered_flat_args = \
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
3039 return graph_function._call_flat(
-> 3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
3041
3042 #property
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1962 # No tape is watching; skip to running the function.
1963 return self._build_call_outputs(self._inference_function.call(
-> 1964 ctx, args, cancellation_manager=cancellation_manager))
1965 forward_backward = self._select_forward_and_backward_functions(
1966 args,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
594 inputs=args,
595 attrs=attrs,
--> 596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_3/BiasAdd:0) = ] [[-1.17921221][-1.64039445][-1.71617472]...] [y (Cast_8/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
(1) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_3/BiasAdd:0) = ] [[-1.17921221][-1.64039445][-1.71617472]...] [y (Cast_8/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
[[assert_less_equal/Assert/AssertGuard/pivot_f/_13/_41]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_7509]
Function call stack:
train_function -> train_function
Sample data:
X_train
array([[7.97469882, 3.5471509 , 5.67233186, ..., 5.40341065, 3.16356072,
4.62657322],
[4.5390077 , 5.19247562, 5.21946796, ..., 4.12139197, 5.62828788,
4.31792717],
[5.83576307, 6.42946189, 4.85759085, ..., 5.26563348, 5.48022682,
5.29933021],
[4.58240518, 5.02457724, 6.1146384 , ..., 5.47847468, 6.62666587,
5.77328054],
[4.51602713, 5.17875946, 4.57102455, ..., 4.15622882, 5.48888824,
4.88704 ]])
y_train
array([[1],
[3],
[1],
[0],
[3]])

Error using BayesSearchCV from skopt on RandomForestClassifier

this is the code to reproduce the error:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform
from skopt import BayesSearchCV
from sklearn.datasets import load_iris
import numpy as np
X, y = load_iris(return_X_y=True)
grid = {
'LogisticRegression' : {
'C': loguniform.rvs(0.1, 10000, size = 50),
'solver': ['lbfgs','saga'],
'penalty': ['l2'],
'warm_start': [False, True],
'class_weight' : [None, 'balanced'],
'max_iter': [100, 1000],
'n_jobs': [ 10 ]
},
'RandomForestClassifier' : {
'n_estimators': np.random.randint(5, 200, size=10),
'criterion' : [ 'gini', 'entropy' ],
'max_depth' : np.random.randint(5, 50, size=10),
'min_samples_split': np.random.randint(5, 50, size=10),
'min_samples_leaf': np.random.randint(5, 50, size=10),
'max_features' : loguniform.rvs(0.2, 1.0, size=5),
'n_jobs' : [ 10 ]
}
}
tuner_params = {
'cv': 2,
'n_jobs': 10,
'scoring': 'roc_auc_ovr',
'return_train_score': True,
'refit': True,
'n_iter':3
}
clf = 'LogisticRegression'
search_cv = BayesSearchCV( estimator = eval(clf)(), search_spaces = grid[clf], **tuner_params)
search_cv.fit(X,y)
clf = 'RandomForestClassifier'
search_cv = BayesSearchCV( estimator = eval(clf)(), search_spaces = grid[clf], **tuner_params)
search_cv.fit(X,y)
Using BayesSearchCV on LogisticRegression as classifier gives no error, while using RandomForestClassifier it gives the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [8], in <cell line: 2>()
1 search_cv = BayesSearchCV( estimator = eval(clf)(), search_spaces = grid[clf], **tuner_params)
----> 2 search_cv.fit(X,y)
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/searchcv.py:466, in BayesSearchCV.fit(self, X, y, groups, callback, **fit_params)
463 else:
464 self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
--> 466 super().fit(X=X, y=y, groups=groups, **fit_params)
468 # BaseSearchCV never ranked train scores,
469 # but apparently we used to ship this (back-compat)
470 if self.return_train_score:
File ~/.conda/envs/meth/lib/python3.9/site-packages/sklearn/model_selection/_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
869 results = self._format_results(
870 all_candidate_params, n_splits, all_out, all_more_results
871 )
873 return results
--> 875 self._run_search(evaluate_candidates)
877 # multimetric is determined here because in the case of a callable
878 # self.scoring the return type is only known after calling
879 first_test_score = all_out[0]["test_scores"]
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/searchcv.py:512, in BayesSearchCV._run_search(self, evaluate_candidates)
508 while n_iter > 0:
509 # when n_iter < n_points points left for evaluation
510 n_points_adjusted = min(n_iter, n_points)
--> 512 optim_result = self._step(
513 search_space, optimizer,
514 evaluate_candidates, n_points=n_points_adjusted
515 )
516 n_iter -= n_points
518 if eval_callbacks(callbacks, optim_result):
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/searchcv.py:400, in BayesSearchCV._step(self, search_space, optimizer, evaluate_candidates, n_points)
397 """Generate n_jobs parameters and evaluate them in parallel.
398 """
399 # get parameter values to evaluate
--> 400 params = optimizer.ask(n_points=n_points)
402 # convert parameters to python native types
403 params = [[np.array(v).item() for v in p] for p in params]
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:395, in Optimizer.ask(self, n_points, strategy)
393 X = []
394 for i in range(n_points):
--> 395 x = opt.ask()
396 X.append(x)
398 ti_available = "ps" in self.acq_func and len(opt.yi) > 0
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:367, in Optimizer.ask(self, n_points, strategy)
336 """Query point or multiple points at which objective should be evaluated.
337
338 n_points : int or None, default: None
(...)
364
365 """
366 if n_points is None:
--> 367 return self._ask()
369 supported_strategies = ["cl_min", "cl_mean", "cl_max"]
371 if not (isinstance(n_points, int) and n_points > 0):
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:434, in Optimizer._ask(self)
430 if self._n_initial_points > 0 or self.base_estimator_ is None:
431 # this will not make a copy of `self.rng` and hence keep advancing
432 # our random state.
433 if self._initial_samples is None:
--> 434 return self.space.rvs(random_state=self.rng)[0]
435 else:
436 # The samples are evaluated starting form initial_samples[0]
437 return self._initial_samples[
438 len(self._initial_samples) - self._n_initial_points]
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:900, in Space.rvs(self, n_samples, random_state)
897 columns = []
899 for dim in self.dimensions:
--> 900 columns.append(dim.rvs(n_samples=n_samples, random_state=rng))
902 # Transpose
903 return _transpose_list_array(columns)
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:698, in Categorical.rvs(self, n_samples, random_state)
696 return self.inverse_transform([(choices)])
697 elif self.transform_ == "normalize":
--> 698 return self.inverse_transform(list(choices))
699 else:
700 return [self.categories[c] for c in choices]
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:685, in Categorical.inverse_transform(self, Xt)
680 """Inverse transform samples from the warped space back into the
681 original space.
682 """
683 # The concatenation of all transformed dimensions makes Xt to be
684 # of type float, hence the required cast back to int.
--> 685 inv_transform = super(Categorical, self).inverse_transform(Xt)
686 if isinstance(inv_transform, list):
687 inv_transform = np.array(inv_transform)
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/space.py:168, in Dimension.inverse_transform(self, Xt)
164 def inverse_transform(self, Xt):
165 """Inverse transform samples from the warped space back into the
166 original space.
167 """
--> 168 return self.transformer.inverse_transform(Xt)
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/transformers.py:309, in Pipeline.inverse_transform(self, X)
307 def inverse_transform(self, X):
308 for transformer in self.transformers[::-1]:
--> 309 X = transformer.inverse_transform(X)
310 return X
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/transformers.py:216, in LabelEncoder.inverse_transform(self, Xt)
214 else:
215 Xt = np.asarray(Xt)
--> 216 return [
217 self.inverse_mapping_[int(np.round(i))] for i in Xt
218 ]
File ~/.conda/envs/meth/lib/python3.9/site-packages/skopt/space/transformers.py:217, in <listcomp>(.0)
214 else:
215 Xt = np.asarray(Xt)
216 return [
--> 217 self.inverse_mapping_[int(np.round(i))] for i in Xt
218 ]
KeyError: 9
My versions:
python: 3.9.12
sklearn: 1.1.1
skopt: 0.9.0
The same error happen when using XGBClassifier or GradientBoostingClassifier, while there is no error using SVC or KNeighborsClassifier.
I believe that's related to how skopt encodes the hyperparameter space: it seems having identical points generated by your random lists are required to trigger the error, though sometimes it fits regardless. Either there are collisions or it makes the grid to be processed erroneously.
At least the issue stopped reproducing for me after changing all random lists to list(range(...)).
Might be worth a bug report.

Cleaning Data in CSV file for ML Model

I'm trying to clean my data in jupyterlab by watching several tutorials, but I keep getting one or the other error every time. So I thought I'd come on stack overflow and ask if someone can help me.
This is the csv file I want to clean: https://1drv.ms/u/s!AvOXB8kb-IHBgjaveis044GVoPpk
I'm building a machine learning model so I want to convert all the object values, but I don't know how to.
EDIT: I tried cleaning the data from scratch.
My code input:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
criminal_data = pd.read_csv('database2.csv')
X = criminal_data.drop(columns=['Agency Type', 'City', 'State',
'Crime Solved'])
y = criminal_data['City']
model = DecisionTreeClassifier()
model.fit(X, y)
criminal_data
The error message:
ValueError Traceback (most recent call
last)
<ipython-input-117-4b6968f9994f> in <module>
6 y = criminal_data['City']
7 model = DecisionTreeClassifier()
----> 8 model.fit(X, y)
9 criminal_data
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
896 """
897
--> 898 super().fit(
899 X, y,
900 sample_weight=sample_weight,
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
154 check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
155 check_y_params = dict(ensure_2d=False, dtype=None)
--> 156 X, y = self._validate_data(X, y,
157 validate_separately=(check_X_params,
158 check_y_params))
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
428 # :(
429 check_X_params, check_y_params =
validate_separately
--> 430 X = check_array(X, **check_X_params)
431 y = check_array(y, **check_y_params)
432 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
check_array(array, accept_sparse, accept_large_sparse, dtype, order,
copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe",
copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order,
like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
~\anaconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
1897
1898 def __array__(self, dtype=None) -> np.ndarray:
-> 1899 return np.asarray(self._values, dtype=dtype)
1900
1901 def __array_wrap__(
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype,
order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order,
like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
ValueError: could not convert string to float: 'Anchorage'
​
You are trying to train your model with some data that is not numerical. Before using the model, you need to do encoding. You can try LabelEncoder for that.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in X.columns:
if X[column_name].dtype == object:
X[column_name] = le.fit_transform(X[column_name])
else:
pass
If you have a combination of different data types in a row. Try below:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in X.columns:
X[column_name] = X[column_name].replace(np.nan, 'none', regex=True)
X[column_name] = le.fit_transform(X[column_name].astype(str))

DeepChem GraphConvodel (GNN) training TypeError

I am a beginner to GNNs and I was trying out a code for predicting drug toxicity using DeepChem's Tox21 dataset. It is a dataset with a training set of 12 thousand compounds and test set of 650 compounds. I need in help in debugging and rectifying this error:"TypeError: 'NoneType' object is not subscriptable", which I get at the end.
Here is the code snippet:
model = GraphConvModel(len(tox21_tasks),
batch_size=32,
mode='classification')
print("Fitting the model")
model.fit(train_dataset, nb_epoch=10)
And here is my error:
TypeError Traceback (most recent call last)
<ipython-input-5-8088249b7fd6> in <module>
4 mode='classification')
5 print("Fitting the model")
----> 6 model.fit(train_dataset, nb_epoch=10)
~\anaconda3\lib\site-packages\deepchem\models\keras_model.py in fit(self, dataset, nb_epoch, max_checkpoints_to_keep, checkpoint_interval, deterministic, restore, variables, loss, callbacks, all_losses)
322 dataset, epochs=nb_epoch,
323 deterministic=deterministic), max_checkpoints_to_keep,
--> 324 checkpoint_interval, restore, variables, loss, callbacks, all_losses)
325
326 def fit_generator(self,
~\anaconda3\lib\site-packages\deepchem\models\keras_model.py in fit_generator(self, generator, max_checkpoints_to_keep, checkpoint_interval, restore, variables, loss, callbacks, all_losses)
407 inputs = inputs[0]
408
--> 409 batch_loss = apply_gradient_for_batch(inputs, labels, weights, loss)
410 current_step = self._global_step.numpy()
411
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
501 # This is the first call of __call__, so we have to initialize.
502 initializer_map = object_identity.ObjectIdentityDictionary()
--> 503 self._initialize(args, kwds, add_initializers_to=initializer_map)
504 finally:
505 # At this point we know that the initialization is complete (or less
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in _initialize(self, args, kwds, add_initializers_to)
406 self._concrete_stateful_fn = (
407 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 408 *args, **kwds))
409
410 def invalid_creator_scope(*unused_args, **unused_kwds):
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
1846 if self.input_signature:
1847 args, kwargs = None, None
-> 1848 graph_function, _, _ = self._maybe_define_function(args, kwargs)
1849 return graph_function
1850
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _maybe_define_function(self, args, kwargs)
2148 graph_function = self._function_cache.primary.get(cache_key, None)
2149 if graph_function is None:
-> 2150 graph_function = self._create_graph_function(args, kwargs)
2151 self._function_cache.primary[cache_key] = graph_function
2152 return graph_function, args, kwargs
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2039 arg_names=arg_names,
2040 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2041 capture_by_value=self._capture_by_value),
2042 self._function_attributes,
2043 # Tell the ConcreteFunction to clean up its graph once it goes out of
~\anaconda3\lib\site-packages\tensorflow_core\python\framework\func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
913 converted_func)
914
--> 915 func_outputs = python_func(*func_args, **func_kwargs)
916
917 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in wrapped_fn(*args, **kwds)
356 # __wrapped__ allows AutoGraph to swap in a converted function. We give
357 # the function a weak reference to itself to avoid a reference cycle.
--> 358 return weak_wrapped_fn().__wrapped__(*args, **kwds)
359 weak_wrapped_fn = weakref.ref(wrapped_fn)
360
~\anaconda3\lib\site-packages\tensorflow_core\python\framework\func_graph.py in wrapper(*args, **kwargs)
903 except Exception as e: # pylint:disable=broad-except
904 if hasattr(e, "ag_error_metadata"):
--> 905 raise e.ag_error_metadata.to_exception(e)
906 else:
907 raise
TypeError: in converted code:
relative to C:\Users\Madiha\anaconda3\lib\site-packages:
deepchem\models\keras_model.py:474 apply_gradient_for_batch *
grads = tape.gradient(batch_loss, vars)
tensorflow_core\python\eager\backprop.py:1014 gradient
unconnected_gradients=unconnected_gradients)
tensorflow_core\python\eager\imperative_grad.py:76 imperative_grad
compat.as_str(unconnected_gradients.value))
tensorflow_core\python\eager\backprop.py:138 _gradient_function
return grad_fn(mock_op, *out_grads)
tensorflow_core\python\ops\math_grad.py:455 _UnsortedSegmentMaxGrad
return _UnsortedSegmentMinOrMaxGrad(op, grad)
tensorflow_core\python\ops\math_grad.py:432 _UnsortedSegmentMinOrMaxGrad
_GatherDropNegatives(op.outputs[0], op.inputs[1])
TypeError: 'NoneType' object is not subscriptable
As an advise, check some examples on the DeepChem website. Here is a code which will work:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets
model = dc.models.GraphConvModel(len(tasks),
batch_size=32,
mode='classification')
print("Fitting the model")
model.fit(train_dataset)
Hope is work for you!

Problem with evaluation function in tensorflow federated

I was trying to reimplement the github tutorial with my own CNN-based model with Keras. But I got an error when evaluating.
from __future__ import absolute_import, division, print_function
import collections
from six.moves import range
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.optimizer_v2 import gradient_descent
from tensorflow_federated import python as tff
emnist_train, emnist_test = tff.simulation.datasets.emnist.load_data()
example_dataset = emnist_train.create_tf_dataset_for_client(
emnist_train.client_ids[0])
NUM_EPOCHS = 10
BATCH_SIZE = 20
SHUFFLE_BUFFER = 500
def preprocess(dataset):
def element_fn(element):
return collections.OrderedDict([
('x', tf.reshape(element['pixels'], [-1])),
('y', tf.reshape(element['label'], [1])),
])
return dataset.repeat(NUM_EPOCHS).map(element_fn).shuffle(
SHUFFLE_BUFFER).batch(BATCH_SIZE)
preprocessed_example_dataset = preprocess(example_dataset)
sample_batch = nest.map_structure(
lambda x: x.numpy(), iter(preprocessed_example_dataset).next())
def make_federated_data(client_data, client_ids):
return [preprocess(client_data.create_tf_dataset_for_client(x))
for x in client_ids]
NUM_CLIENTS = 3
sample_clients = emnist_train.client_ids[0:NUM_CLIENTS]
federated_train_data = make_federated_data(emnist_train, sample_clients)
len(federated_train_data), federated_train_data[0]
def create_compiled_keras_model():
model = tf.keras.models.Sequential([
tf.keras.layers.Reshape((28,28,1), input_shape=(784,)),
tf.keras.layers.Conv2D(32, kernel_size=(5,5), activation="relu", padding = "same", strides = 1),
tf.keras.layers.MaxPooling2D(pool_size=2, strides=2, padding='valid'),
tf.keras.layers.Conv2D(64, kernel_size=(5,5), activation="relu", padding = "same", strides = 1),
tf.keras.layers.MaxPooling2D(pool_size=2, strides=2, padding='valid'),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(512, activation="relu"),
tf.keras.layers.Dense(10, activation="softmax"),
])
def loss_fn(y_true, y_pred):
return tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(
y_true, y_pred))
model.compile(
loss=loss_fn,
optimizer=gradient_descent.SGD(learning_rate=0.02),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
return model
def model_fn():
keras_model = create_compiled_keras_model()
return tff.learning.from_compiled_keras_model(keras_model, sample_batch)
iterative_process = tff.learning.build_federated_averaging_process(model_fn)
state = iterative_process.initialize()
for round_num in range(1,10):
state, metrics = iterative_process.next(state, federated_train_data)
print('round {:2d}, metrics={}'.format(round_num, metrics))
##Evaluation of the model
#This function doesn't work
evaluation = tff.learning.build_federated_evaluation(model_fn)
federated_test_data = make_federated_data(emnist_test, sample_clients)
test_metrics = evaluation(state.model, federated_test_data)
I expect the evaluation of the test data, but the actual output is the following error:
---------------------------------------------------------------------------
_FallbackException Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/gen_functional_ops.py in stateful_partitioned_call(args, Tout, f, config, config_proto, executor_type, name)
482 "Tout", Tout, "f", f, "config", config, "config_proto", config_proto,
--> 483 "executor_type", executor_type)
484 return _result
_FallbackException: This function does not handle the case of the path where all inputs are not already EagerTensors.
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-23-6e9c77f70201> in <module>()
----> 1 evaluation = tff.learning.build_federated_evaluation(model_fn)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/learning/federated_evaluation.py in build_federated_evaluation(model_fn)
83 #tff.federated_computation(
84 tff.FederatedType(model_weights_type, tff.SERVER, all_equal=True),
---> 85 tff.FederatedType(tff.SequenceType(batch_type), tff.CLIENTS))
86 def server_eval(server_model_weights, federated_dataset):
87 client_outputs = tff.federated_map(
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/core/impl/computation_wrapper.py in <lambda>(fn)
406 args = (args,)
407 arg_type = computation_types.to_type(args[0])
--> 408 return lambda fn: _wrap(fn, arg_type, self._wrapper_fn)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/core/impl/computation_wrapper.py in _wrap(fn, parameter_type, wrapper_fn)
94 function_utils.wrap_as_zero_or_one_arg_callable(fn, parameter_type),
95 parameter_type,
---> 96 name=fn_name)
97 py_typecheck.check_type(concrete_fn, function_utils.ConcreteFunction,
98 'value returned by the wrapper')
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/core/impl/computation_wrapper_instances.py in _federated_computation_wrapper_fn(target_fn, parameter_type, name)
52 parameter_type,
53 ctx_stack,
---> 54 suggested_name=name))
55 return computation_impl.ComputationImpl(target_lambda.proto, ctx_stack)
56
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/core/impl/federated_computation_utils.py in zero_or_one_arg_fn_to_building_block(fn, parameter_name, parameter_type, context_stack, suggested_name)
73 value_impl.ValueImpl(
74 computation_building_blocks.Reference(
---> 75 parameter_name, parameter_type), context_stack))
76 else:
77 result = fn()
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/core/impl/function_utils.py in <lambda>(arg)
551 # and to force any parameter bindings to be resolved now.
552 # pylint: disable=unnecessary-lambda,undefined-variable
--> 553 return (lambda fn, at, kt: lambda arg: _unpack_and_call(fn, at, kt, arg))(
554 fn, arg_types, kwarg_types)
555 # pylint: enable=unnecessary-lambda,undefined-variable
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/core/impl/function_utils.py in _unpack_and_call(fn, arg_types, kwarg_types, arg)
545 name, str(expected_type), str(actual_type)))
546 kwargs[name] = element_value
--> 547 return fn(*args, **kwargs)
548
549 # Deliberate wrapping to isolate the caller from the underlying function
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/learning/federated_evaluation.py in server_eval(server_model_weights, federated_dataset)
88 client_eval,
89 [tff.federated_broadcast(server_model_weights), federated_dataset])
---> 90 return model.federated_output_computation(client_outputs.local_outputs)
91
92 return server_eval
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/learning/model_utils.py in federated_output_computation(self)
531 #property
532 def federated_output_computation(self):
--> 533 return self._model.federated_output_computation
534
535
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow_federated/python/learning/model_utils.py in federated_output_computation(self)
406 def federated_output_computation(self):
407 metric_variable_type_dict = nest.map_structure(tf.TensorSpec.from_tensor,
--> 408 self.report_local_outputs())
409 federated_local_outputs_type = tff.FederatedType(
410 metric_variable_type_dict, tff.CLIENTS, all_equal=False)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
314 if not self._created_variables:
315 # If we did not create any variables the trace we have is good enough.
--> 316 return self._concrete_stateful_fn._filtered_call(canon_args, canon_kwds) # pylint: disable=protected-access
317
318 def fn_with_cond(*inner_args, **inner_kwds):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs)
382 """
383 return self._call_flat(
--> 384 (t for t in nest.flatten((args, kwargs))
385 if isinstance(
386 t, (ops.Tensor, resource_variable_ops.ResourceVariable))))
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args)
431 # Only need to override the gradient in graph mode and when we have outputs.
432 if context.executing_eagerly() or not self.outputs:
--> 433 outputs = self._inference_function.call(ctx, args)
434 else:
435 if not self._gradient_name:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args)
267 executing_eagerly=executing_eagerly,
268 config=function_call_options.config_proto_serialized,
--> 269 executor_type=function_call_options.executor_type)
270
271 if executing_eagerly:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/functional_ops.py in partitioned_call(args, f, tout, executing_eagerly, config, executor_type)
1081 outputs = gen_functional_ops.stateful_partitioned_call(
1082 args=args, Tout=tout, f=f, config_proto=config,
-> 1083 executor_type=executor_type)
1084 else:
1085 outputs = gen_functional_ops.partitioned_call(
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/gen_functional_ops.py in stateful_partitioned_call(args, Tout, f, config, config_proto, executor_type, name)
487 return stateful_partitioned_call_eager_fallback(
488 args, Tout=Tout, f=f, config=config, config_proto=config_proto,
--> 489 executor_type=executor_type, name=name, ctx=_ctx)
490 except _core._SymbolicException:
491 pass # Add nodes to the TensorFlow graph.
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/gen_functional_ops.py in stateful_partitioned_call_eager_fallback(args, Tout, f, config, config_proto, executor_type, name, ctx)
548 executor_type = ""
549 executor_type = _execute.make_str(executor_type, "executor_type")
--> 550 _attr_Tin, args = _execute.convert_to_mixed_eager_tensors(args, _ctx)
551 _inputs_flat = list(args)
552 _attrs = ("Tin", _attr_Tin, "Tout", Tout, "f", f, "config", config,
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/eager/execute.py in convert_to_mixed_eager_tensors(values, ctx)
207 def convert_to_mixed_eager_tensors(values, ctx):
208 v = [ops.internal_convert_to_tensor(t, ctx=ctx) for t in values]
--> 209 types = [t._datatype_enum() for t in v] # pylint: disable=protected-access
210 return types, v
211
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/eager/execute.py in <listcomp>(.0)
207 def convert_to_mixed_eager_tensors(values, ctx):
208 v = [ops.internal_convert_to_tensor(t, ctx=ctx) for t in values]
--> 209 types = [t._datatype_enum() for t in v] # pylint: disable=protected-access
210 return types, v
211
AttributeError: 'Tensor' object has no attribute '_datatype_enum'
Nuria: this should just have been fixed earlier today. If you do not want to wait for the next release (coming soon), I would recommend that you simply build a local pip package from source. You can find instructions in the install guide.
As a followup here: TFF 0.4.0 has just been released, which contains this bugfix.

Resources