DeepChem GraphConvModel (GNN) training TypeError - machine-learning

I am a beginner with GNNs and I was trying out code for predicting drug toxicity using DeepChem's Tox21 dataset. It is a dataset with a training set of 12 thousand compounds and a test set of 650 compounds. I need help debugging and rectifying this error: "TypeError: 'NoneType' object is not subscriptable", which I get at the end.
Here is the code snippet:
model = GraphConvModel(len(tox21_tasks),
                       batch_size=32,
                       mode='classification')
print("Fitting the model")
model.fit(train_dataset, nb_epoch=10)
And here is my error:
TypeError Traceback (most recent call last)
<ipython-input-5-8088249b7fd6> in <module>
4 mode='classification')
5 print("Fitting the model")
----> 6 model.fit(train_dataset, nb_epoch=10)
~\anaconda3\lib\site-packages\deepchem\models\keras_model.py in fit(self, dataset, nb_epoch, max_checkpoints_to_keep, checkpoint_interval, deterministic, restore, variables, loss, callbacks, all_losses)
322 dataset, epochs=nb_epoch,
323 deterministic=deterministic), max_checkpoints_to_keep,
--> 324 checkpoint_interval, restore, variables, loss, callbacks, all_losses)
325
326 def fit_generator(self,
~\anaconda3\lib\site-packages\deepchem\models\keras_model.py in fit_generator(self, generator, max_checkpoints_to_keep, checkpoint_interval, restore, variables, loss, callbacks, all_losses)
407 inputs = inputs[0]
408
--> 409 batch_loss = apply_gradient_for_batch(inputs, labels, weights, loss)
410 current_step = self._global_step.numpy()
411
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
501 # This is the first call of __call__, so we have to initialize.
502 initializer_map = object_identity.ObjectIdentityDictionary()
--> 503 self._initialize(args, kwds, add_initializers_to=initializer_map)
504 finally:
505 # At this point we know that the initialization is complete (or less
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in _initialize(self, args, kwds, add_initializers_to)
406 self._concrete_stateful_fn = (
407 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 408 *args, **kwds))
409
410 def invalid_creator_scope(*unused_args, **unused_kwds):
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
1846 if self.input_signature:
1847 args, kwargs = None, None
-> 1848 graph_function, _, _ = self._maybe_define_function(args, kwargs)
1849 return graph_function
1850
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _maybe_define_function(self, args, kwargs)
2148 graph_function = self._function_cache.primary.get(cache_key, None)
2149 if graph_function is None:
-> 2150 graph_function = self._create_graph_function(args, kwargs)
2151 self._function_cache.primary[cache_key] = graph_function
2152 return graph_function, args, kwargs
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2039 arg_names=arg_names,
2040 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2041 capture_by_value=self._capture_by_value),
2042 self._function_attributes,
2043 # Tell the ConcreteFunction to clean up its graph once it goes out of
~\anaconda3\lib\site-packages\tensorflow_core\python\framework\func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
913 converted_func)
914
--> 915 func_outputs = python_func(*func_args, **func_kwargs)
916
917 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~\anaconda3\lib\site-packages\tensorflow_core\python\eager\def_function.py in wrapped_fn(*args, **kwds)
356 # __wrapped__ allows AutoGraph to swap in a converted function. We give
357 # the function a weak reference to itself to avoid a reference cycle.
--> 358 return weak_wrapped_fn().__wrapped__(*args, **kwds)
359 weak_wrapped_fn = weakref.ref(wrapped_fn)
360
~\anaconda3\lib\site-packages\tensorflow_core\python\framework\func_graph.py in wrapper(*args, **kwargs)
903 except Exception as e: # pylint:disable=broad-except
904 if hasattr(e, "ag_error_metadata"):
--> 905 raise e.ag_error_metadata.to_exception(e)
906 else:
907 raise
TypeError: in converted code:
relative to C:\Users\Madiha\anaconda3\lib\site-packages:
deepchem\models\keras_model.py:474 apply_gradient_for_batch *
grads = tape.gradient(batch_loss, vars)
tensorflow_core\python\eager\backprop.py:1014 gradient
unconnected_gradients=unconnected_gradients)
tensorflow_core\python\eager\imperative_grad.py:76 imperative_grad
compat.as_str(unconnected_gradients.value))
tensorflow_core\python\eager\backprop.py:138 _gradient_function
return grad_fn(mock_op, *out_grads)
tensorflow_core\python\ops\math_grad.py:455 _UnsortedSegmentMaxGrad
return _UnsortedSegmentMinOrMaxGrad(op, grad)
tensorflow_core\python\ops\math_grad.py:432 _UnsortedSegmentMinOrMaxGrad
_GatherDropNegatives(op.outputs[0], op.inputs[1])
TypeError: 'NoneType' object is not subscriptable

As advice, check the examples on the DeepChem website. Here is code which should work:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets
model = dc.models.GraphConvModel(len(tasks),
                                 batch_size=32,
                                 mode='classification')
print("Fitting the model")
model.fit(train_dataset)
Hope it works for you!
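If you want to sanity-check the fitted model, you can evaluate it the way the DeepChem tutorials do. A minimal sketch, assuming the tasks, datasets and transformers returned by the MolNet loader above:

import numpy as np
import deepchem as dc

# mean ROC-AUC over the Tox21 tasks
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
print("Train scores:", model.evaluate(train_dataset, [metric], transformers))
print("Test scores:", model.evaluate(test_dataset, [metric], transformers))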

Related

InvalidArgumentError when building sequential model using Keras

I want to develop a sequential model using optimal hyperparameters derived from Keras Tuner. My code raised an "Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:]" error.
I tried adding +1 or +10 to the entire X_train but still get the same error.
Build the sequential model:
def model_builder(hp):
    model = Sequential()

    # Tune the number of units in the first Dense layer
    # Choose an optimal value between 32-512
    hp_units1 = hp.Int('units1', min_value=32, max_value=512, step=32)
    hp_units2 = hp.Int('units2', min_value=32, max_value=512, step=32)
    hp_units3 = hp.Int('units3', min_value=32, max_value=512, step=32)

    model.add(Dense(units=hp_units1, activation='relu'))
    model.add(tf.keras.layers.Dense(units=hp_units2, activation='relu'))
    model.add(tf.keras.layers.Dense(units=hp_units3, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))  # output layer

    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Performance visualization callback
    performance_viz_cbk = PerformanceVisualizationCallback(
        model=model,
        test_data=X_test,
        image_dir='c:\perorfmance_charts')

    # Model summary
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy', tf.keras.metrics.AUC(), MulticlassTruePositives()])
    return model
Hyperparameter tuning
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
Traceback:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/tmp/ipykernel_17/1976310635.py in <module>
----> 1 tuner.search(X_train, y_train, epochs=10, validation_split=0.2)
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/base_tuner.py in search(self, *fit_args, **fit_kwargs)
177
178 self.on_trial_begin(trial)
--> 179 results = self.run_trial(trial, *fit_args, **fit_kwargs)
180 # `results` is None indicates user updated oracle in `run_trial()`.
181 if results is None:
/opt/conda/lib/python3.7/site-packages/keras_tuner/tuners/hyperband.py in run_trial(self, trial, *fit_args, **fit_kwargs)
382 fit_kwargs["epochs"] = hp.values["tuner/epochs"]
383 fit_kwargs["initial_epoch"] = hp.values["tuner/initial_epoch"]
--> 384 return super(Hyperband, self).run_trial(trial, *fit_args, **fit_kwargs)
385
386 def _build_model(self, hp):
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/tuner.py in run_trial(self, trial, *args, **kwargs)
292 callbacks.append(model_checkpoint)
293 copied_kwargs["callbacks"] = callbacks
--> 294 obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
295
296 histories.append(obj_value)
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/tuner.py in _build_and_fit_model(self, trial, *args, **kwargs)
220 hp = trial.hyperparameters
221 model = self._try_build(hp)
--> 222 results = self.hypermodel.fit(hp, model, *args, **kwargs)
223 return tuner_utils.convert_to_metrics_dict(
224 results, self.oracle.objective, "HyperModel.fit()"
/opt/conda/lib/python3.7/site-packages/keras_tuner/engine/hypermodel.py in fit(self, hp, model, *args, **kwargs)
135 If return a float, it should be the `objective` value.
136 """
--> 137 return model.fit(*args, **kwargs)
138
139
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1182 _r=1):
1183 callbacks.on_train_batch_begin(step)
-> 1184 tmp_logs = self.train_function(iterator)
1185 if data_handler.should_sync:
1186 context.async_wait()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
883
884 with OptionalXlaContext(self._jit_compile):
--> 885 result = self._call(*args, **kwds)
886
887 new_tracing_count = self.experimental_get_tracing_count()
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
948 # Lifting succeeded, so variables are initialized and we can run the
949 # stateless function.
--> 950 return self._stateless_fn(*args, **kwds)
951 else:
952 _, _, _, filtered_flat_args = \
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
3038 filtered_flat_args) = self._maybe_define_function(args, kwargs)
3039 return graph_function._call_flat(
-> 3040 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
3041
3042 #property
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1962 # No tape is watching; skip to running the function.
1963 return self._build_call_outputs(self._inference_function.call(
-> 1964 ctx, args, cancellation_manager=cancellation_manager))
1965 forward_backward = self._select_forward_and_backward_functions(
1966 args,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
594 inputs=args,
595 attrs=attrs,
--> 596 ctx=ctx)
597 else:
598 outputs = execute.execute_with_cancellation(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_3/BiasAdd:0) = ] [[-1.17921221][-1.64039445][-1.71617472]...] [y (Cast_8/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
(1) Invalid argument: assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (sequential/dense_3/BiasAdd:0) = ] [[-1.17921221][-1.64039445][-1.71617472]...] [y (Cast_8/x:0) = ] [0]
[[{{node assert_greater_equal/Assert/AssertGuard/else/_1/assert_greater_equal/Assert/AssertGuard/Assert}}]]
[[assert_less_equal/Assert/AssertGuard/pivot_f/_13/_41]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_7509]
Function call stack:
train_function -> train_function
Sample data:
X_train
array([[7.97469882, 3.5471509 , 5.67233186, ..., 5.40341065, 3.16356072,
4.62657322],
[4.5390077 , 5.19247562, 5.21946796, ..., 4.12139197, 5.62828788,
4.31792717],
[5.83576307, 6.42946189, 4.85759085, ..., 5.26563348, 5.48022682,
5.29933021],
[4.58240518, 5.02457724, 6.1146384 , ..., 5.47847468, 6.62666587,
5.77328054],
[4.51602713, 5.17875946, 4.57102455, ..., 4.15622882, 5.48888824,
4.88704 ]])
y_train
array([[1],
[3],
[1],
[0],
[3]])
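The assertion in this traceback most likely comes from the tf.keras.metrics.AUC() metric, which checks that predictions lie in [0, 1], while the model ends in Dense(1, activation='linear') and therefore produces unbounded values; SparseCategoricalCrossentropy also expects one score per class. Below is a hedged sketch of one possible adjustment, assuming from the sample y_train (labels 0-3) that this is a 4-class classification problem; PerformanceVisualizationCallback and the custom metrics are left out for brevity:

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

num_classes = 4  # assumption based on the labels 0-3 shown in y_train

def model_builder(hp):
    model = Sequential()
    for name in ('units1', 'units2', 'units3'):
        units = hp.Int(name, min_value=32, max_value=512, step=32)
        model.add(Dense(units=units, activation='relu'))
    # one unit per class with softmax, so predictions are probabilities in [0, 1]
    model.add(Dense(num_classes, activation='softmax'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss=SparseCategoricalCrossentropy(from_logits=False),  # inputs are probabilities
                  metrics=['accuracy'])
    return model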

Training the BERT model with PyTorch

I am unable to figure out why my BERT model doesn't get past the training command. I am using pytorch-lightning. I am running the code on AWS EC2 (p3.2xlarge) and it does show me the available GPU, but I can't really figure out the device-side error. Could someone please guide me in the right direction? I really appreciate your time and consideration.
PS: The results are after setting CUDA_LAUNCH_BLOCKING=1.
trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    callbacks=[early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30,
)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
In [155]:
trainer.fit(model, data_module)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-155-7b6b8391c42e> in <module>
----> 1 trainer.fit(model, data_module)
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, train_dataloader, ckpt_path)
739 train_dataloaders = train_dataloader
740 self._call_and_handle_interrupt(
--> 741 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
742 )
743
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
683 """
684 try:
--> 685 return trainer_fn(*args, **kwargs)
686 # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7
687 except KeyboardInterrupt as exception:
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
775 # TODO: ckpt_path only in v1.7
776 ckpt_path = ckpt_path or self.resume_from_checkpoint
--> 777 self._run(model, ckpt_path=ckpt_path)
778
779 assert self.state.stopped
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
1143
1144 self._call_configure_sharded_model() # allow user to setup in model sharded environment
-> 1145 self.accelerator.setup(self)
1146
1147 # ----------------------------
~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/gpu.py in setup(self, trainer)
44 def setup(self, trainer: "pl.Trainer") -> None:
45 self.set_nvidia_flags(trainer.local_rank)
---> 46 return super().setup(trainer)
47
48 def on_train_start(self) -> None:
~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in setup(self, trainer)
89 trainer: the trainer instance
90 """
---> 91 self.setup_training_type_plugin()
92 if not self.training_type_plugin.setup_optimizers_in_pre_dispatch:
93 self.setup_optimizers(trainer)
~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in setup_training_type_plugin(self)
361 def setup_training_type_plugin(self) -> None:
362 """Attaches the training type plugin to the accelerator."""
--> 363 self.training_type_plugin.setup()
364
365 def setup_precision_plugin(self) -> None:
~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/single_device.py in setup(self)
69
70 def setup(self) -> None:
---> 71 self.model_to_device()
72
73 #property
~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/single_device.py in model_to_device(self)
66
67 def model_to_device(self) -> None:
---> 68 self._model.to(self.root_device)
69
70 def setup(self) -> None:
~/.local/lib/python3.6/site-packages/pytorch_lightning/core/mixins/device_dtype_mixin.py in to(self, *args, **kwargs)
109 out = torch._C._nn._parse_to(*args, **kwargs)
110 self.__update_properties(device=out[0], dtype=out[1])
--> 111 return super().to(*args, **kwargs)
112
113 def cuda(self, device: Optional[Union[torch.device, int]] = None) -> "DeviceDtypeModuleMixin":
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
897 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
898
--> 899 return self._apply(convert)
900
901 def register_backward_hook(
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
568 def _apply(self, fn):
569 for module in self.children():
--> 570 module._apply(fn)
571
572 def compute_should_use_set_data(tensor, tensor_applied):
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
568 def _apply(self, fn):
569 for module in self.children():
--> 570 module._apply(fn)
571
572 def compute_should_use_set_data(tensor, tensor_applied):
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
568 def _apply(self, fn):
569 for module in self.children():
--> 570 module._apply(fn)
571
572 def compute_should_use_set_data(tensor, tensor_applied):
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
591 # `with torch.no_grad():`
592 with torch.no_grad():
--> 593 param_applied = fn(param)
594 should_use_set_data = compute_should_use_set_data(param, param_applied)
595 if should_use_set_data:
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in convert(t)
895 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
896 non_blocking, memory_format=convert_to_format)
--> 897 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
898
899 return self._apply(convert)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Restarting the machine returned this:
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: lightning_logs/nara-comments
| Name | Type | Params
-----------------------------------------
0 | bert | BertModel | 108 M
1 | classifier | Linear | 288 K
2 | criterion | BCELoss | 0
-----------------------------------------
108 M Trainable params
0 Non-trainable params
108 M Total params
434.395 Total estimated model params size (MB)
/home/ubuntu/.local/lib/python3.6/site-packages/pytorch_lightning/utilities/data.py:60: UserWarning: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4540. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
"Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
/home/ubuntu/.local/lib/python3.6/site-packages/pytorch_lightning/utilities/data.py:60: UserWarning: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4374. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
"Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
Global seed set to 42
Epoch 0: 0%
0/397 [00:00<?, ?it/s]
/home/ubuntu/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/closure.py:36: LightningDeprecationWarning: One of the returned values {'predictions', 'labels'} has a `grad_fn`. We will detach it automatically but this behaviour will change in v1.6. Please detach it manually: `return {'loss': ..., 'something': something.detach()}`
f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-48-7b6b8391c42e> in <module>
----> 1 trainer.fit(model, data_module)
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, train_dataloader, ckpt_path)
739 train_dataloaders = train_dataloader
740 self._call_and_handle_interrupt(
--> 741 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
742 )
743
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
683 """
684 try:
--> 685 return trainer_fn(*args, **kwargs)
686 # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7
687 except KeyboardInterrupt as exception:
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
775 # TODO: ckpt_path only in v1.7
776 ckpt_path = ckpt_path or self.resume_from_checkpoint
--> 777 self._run(model, ckpt_path=ckpt_path)
778
779 assert self.state.stopped
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
1197
1198 # dispatch `start_training` or `start_evaluating` or `start_predicting`
-> 1199 self._dispatch()
1200
1201 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _dispatch(self)
1277 self.training_type_plugin.start_predicting(self)
1278 else:
-> 1279 self.training_type_plugin.start_training(self)
1280
1281 def run_stage(self):
~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
200 def start_training(self, trainer: "pl.Trainer") -> None:
201 # double dispatch to initiate the training loop
--> 202 self._results = trainer.run_stage()
203
204 def start_evaluating(self, trainer: "pl.Trainer") -> None:
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
1287 if self.predicting:
1288 return self._run_predict()
-> 1289 return self._run_train()
1290
1291 def _pre_training_routine(self):
~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _run_train(self)
1317 self.fit_loop.trainer = self
1318 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1319 self.fit_loop.run()
1320
1321 def _run_evaluate(self) -> _EVALUATE_OUTPUT:
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/fit_loop.py in advance(self)
232
233 with self.trainer.profiler.profile("run_training_epoch"):
--> 234 self.epoch_loop.run(data_fetcher)
235
236 # the global step is manually decreased here due to backwards compatibility with existing loggers
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py in advance(self, *args, **kwargs)
191
192 with self.trainer.profiler.profile("run_training_batch"):
--> 193 batch_output = self.batch_loop.run(batch, batch_idx)
194
195 self.batch_progress.increment_processed()
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py in advance(self, batch, batch_idx)
86 if self.trainer.lightning_module.automatic_optimization:
87 optimizers = _get_active_optimizers(self.trainer.optimizers, self.trainer.optimizer_frequencies, batch_idx)
---> 88 outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
89 else:
90 outputs = self.manual_loop.run(split_batch, batch_idx)
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in advance(self, batch, *args, **kwargs)
217 self._batch_idx,
218 self._optimizers[self.optim_progress.optimizer_position],
--> 219 self.optimizer_idx,
220 )
221 if result.loss is not None:
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _run_optimization(self, split_batch, batch_idx, optimizer, opt_idx)
264 # gradient update with accumulated gradients
265 else:
--> 266 self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
267
268 result = closure.consume_result()
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in _optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
384 on_tpu=(self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE),
385 using_native_amp=(self.trainer.amp_backend is not None and self.trainer.amp_backend == AMPType.NATIVE),
--> 386 using_lbfgs=is_lbfgs,
387 )
388
~/.local/lib/python3.6/site-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
1650
1651 """
-> 1652 optimizer.step(closure=optimizer_closure)
1653
1654 def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
~/.local/lib/python3.6/site-packages/pytorch_lightning/core/optimizer.py in step(self, closure, **kwargs)
162 assert trainer is not None
163 with trainer.profiler.profile(profiler_action):
--> 164 trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in optimizer_step(self, optimizer, opt_idx, closure, model, **kwargs)
337 """
338 model = model or self.lightning_module
--> 339 self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
340
341 def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:
~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py in optimizer_step(self, model, optimizer, optimizer_idx, closure, **kwargs)
161 if isinstance(model, pl.LightningModule):
162 closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
--> 163 optimizer.step(closure=closure, **kwargs)
164
165 def _track_grad_norm(self, trainer: "pl.Trainer") -> None:
~/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py in wrapper(*args, **kwargs)
63 instance._step_count += 1
64 wrapped = func.__get__(instance, cls)
---> 65 return wrapped(*args, **kwargs)
66
67 # Note that the returned function here is no longer a bound method,
~/.local/lib/python3.6/site-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
86 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
87 with torch.autograd.profiler.record_function(profile_name):
---> 88 return func(*args, **kwargs)
89 return wrapper
90
~/.local/lib/python3.6/site-packages/transformers/optimization.py in step(self, closure)
330 loss = None
331 if closure is not None:
--> 332 loss = closure()
333
334 for group in self.param_groups:
~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py in _wrap_closure(self, model, optimizer, optimizer_idx, closure)
146 consistent with the ``PrecisionPlugin`` subclasses that cannot pass ``optimizer.step(closure)`` directly.
147 """
--> 148 closure_result = closure()
149 self._after_closure(model, optimizer, optimizer_idx)
150 return closure_result
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in __call__(self, *args, **kwargs)
158
159 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 160 self._result = self.closure(*args, **kwargs)
161 return self._result.loss
162
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in closure(self, *args, **kwargs)
153 if self._backward_fn is not None and step_output.closure_loss is not None:
154 with self._profiler.profile("backward"):
--> 155 self._backward_fn(step_output.closure_loss)
156
157 return step_output
~/.local/lib/python3.6/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py in backward_fn(loss)
325
326 def backward_fn(loss: Tensor) -> None:
--> 327 self.trainer.accelerator.backward(loss, optimizer, opt_idx)
328
329 # check if model weights are nan
~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in backward(self, closure_loss, *args, **kwargs)
312 closure_loss = self.precision_plugin.pre_backward(self.lightning_module, closure_loss)
313
--> 314 self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
315
316 closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss)
~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py in backward(self, model, closure_loss, optimizer, *args, **kwargs)
89 # do backward pass
90 if model is not None and isinstance(model, pl.LightningModule):
---> 91 model.backward(closure_loss, optimizer, *args, **kwargs)
92 else:
93 self._run_backward(closure_loss, *args, **kwargs)
~/.local/lib/python3.6/site-packages/pytorch_lightning/core/lightning.py in backward(self, loss, optimizer, optimizer_idx, *args, **kwargs)
1432 loss.backward()
1433 """
-> 1434 loss.backward(*args, **kwargs)
1435
1436 def toggle_optimizer(self, optimizer: Union[Optimizer, LightningOptimizer], optimizer_idx: int) -> None:
~/.local/lib/python3.6/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
305 create_graph=create_graph,
306 inputs=inputs)
--> 307 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
308
309 def register_hook(self, hook):
~/.local/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
154 Variable._execution_engine.run_backward(
155 tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 156 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
157
158
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
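One way to make device-side asserts like this readable is to reproduce the failure on CPU, where the same problem raises an ordinary Python exception with a useful stack trace; given the BCELoss in the model summary, the label range is also worth checking, since BCELoss requires targets in [0, 1]. A hedged sketch, assuming the model and data_module objects from the question; the "labels" key is a guess and may differ in the actual DataModule:

import pytorch_lightning as pl

# 1) Run a single batch end-to-end on CPU; a bad index or out-of-range target
#    shows up as a clear exception instead of "device-side assert triggered".
cpu_trainer = pl.Trainer(gpus=0, fast_dev_run=True)
cpu_trainer.fit(model, data_module)

# 2) Check the targets fed to BCELoss; anything outside [0, 1] triggers the assert.
data_module.setup("fit")
batch = next(iter(data_module.train_dataloader()))
print(batch["labels"].min().item(), batch["labels"].max().item())  # guessed key name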

load_from_checkpoint fails after transfer learning a LightningModule

I am trying to transfer-learn a LightningModule. The relevant part of the code is this:
class DeepFilteringTransferLearning(pl.LightningModule):
    def __init__(self, chk_path=None):
        super().__init__()
        # init class members
        self.prediction = []
        self.label = []
        self.loss = MSELoss()
        # init pretrained model
        self.chk_path = chk_path
        model = DeepFiltering.load_from_checkpoint(chk_path)
        backbone = model.sequential
        layers = list(backbone.children())[:-1]
        self.groundModel = Sequential(*layers)
        # use the pretrained model the same way to regress Lshall and neq
        self.regressor = nn.Linear(64, 2)

    def forward(self, x):
        self.groundModel.eval()
        with torch.no_grad():
            groundOut = self.groundModel(x)
        yPred = self.regressor(groundOut)
        return yPred
I save my model in a different main file, the relevant part of which is:
# callbacks
callbacks = [
    ModelCheckpoint(
        dirpath="checkpoints/maxPooling16StandardizedL2RegularizedReproduceableSeeded42Ampl1ConvTransferLearned",
        save_top_k=5,
        monitor="val_loss",
    ),
]

# trainer
trainer = pl.Trainer(gpus=[1, 2], strategy="dp", max_epochs=150, logger=wandb_logger,
                     callbacks=callbacks, precision=32, deterministic=True)
trainer.fit(model, train_dataloaders=trainDl, val_dataloaders=valDl)
After that, I try to load the model from the checkpoint like this:
chk_patH = "path/to/transfer_learned/model"
standardizedL2RegularizedL1 = DeepFilteringTransferLearning("path/to/model/trying/to/use/for/transfer_learning").load_from_checkpoint(chk_patH)
I got the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/torch/serialization.py in _check_seekable(f)
307 try:
--> 308 f.seek(f.tell())
309 return True
AttributeError: 'NoneType' object has no attribute 'seek'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-6-13f5fd0c7b85> in <module>
1 chk_patH = "checkpoints/maxPooling16StandardizedL2RegularizedReproduceableSeeded42Ampl1/epoch=4-step=349.ckpt"
----> 2 standardizedL2RegularizedL1 = DeepFilteringTransferLearning("checkpoints/maxPooling16StandardizedL2RegularizedReproduceableSeeded42Ampl2/epoch=145-step=10219.ckpt").load_from_checkpoint(chk_patH)
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/pytorch_lightning/core/saving.py in load_from_checkpoint(cls, checkpoint_path, map_location, hparams_file, strict, **kwargs)
154 checkpoint[cls.CHECKPOINT_HYPER_PARAMS_KEY].update(kwargs)
155
--> 156 model = cls._load_model_state(checkpoint, strict=strict, **kwargs)
157 return model
158
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/pytorch_lightning/core/saving.py in _load_model_state(cls, checkpoint, strict, **cls_kwargs_new)
196 _cls_kwargs = {k: v for k, v in _cls_kwargs.items() if k in cls_init_args_name}
197
--> 198 model = cls(**_cls_kwargs)
199
200 # give model a chance to load something
~/whistlerProject/gitHub/whistler/mathe/gwInspired/deepFilteringTransferLearning.py in __init__(self, chk_path)
34 #init pretrained model
35 self.chk_path = chk_path
---> 36 model = DeepFiltering.load_from_checkpoint(chk_path)
37 backbone = model.sequential
38 layers = list(backbone.children())[:-1]
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/pytorch_lightning/core/saving.py in load_from_checkpoint(cls, checkpoint_path, map_location, hparams_file, strict, **kwargs)
132 checkpoint = pl_load(checkpoint_path, map_location=map_location)
133 else:
--> 134 checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
135
136 if hparams_file is not None:
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/pytorch_lightning/utilities/cloud_io.py in load(path_or_url, map_location)
31 if not isinstance(path_or_url, (str, Path)):
32 # any sort of BytesIO or similiar
---> 33 return torch.load(path_or_url, map_location=map_location)
34 if str(path_or_url).startswith("http"):
35 return torch.hub.load_state_dict_from_url(str(path_or_url), map_location=map_location)
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/torch/serialization.py in load(f, map_location, pickle_module, **pickle_load_args)
579 pickle_load_args['encoding'] = 'utf-8'
580
--> 581 with _open_file_like(f, 'rb') as opened_file:
582 if _is_zipfile(opened_file):
583 # The zipfile reader is going to advance the current file position.
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/torch/serialization.py in _open_file_like(name_or_buffer, mode)
233 return _open_buffer_writer(name_or_buffer)
234 elif 'r' in mode:
--> 235 return _open_buffer_reader(name_or_buffer)
236 else:
237 raise RuntimeError(f"Expected 'r' or 'w' in mode but got {mode}")
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/torch/serialization.py in __init__(self, buffer)
218 def __init__(self, buffer):
219 super(_open_buffer_reader, self).__init__(buffer)
--> 220 _check_seekable(buffer)
221
222
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/torch/serialization.py in _check_seekable(f)
309 return True
310 except (io.UnsupportedOperation, AttributeError) as e:
--> 311 raise_err_msg(["seek", "tell"], e)
312 return False
313
~/anaconda3/envs/skimageTrial/lib/python3.6/site-packages/torch/serialization.py in raise_err_msg(patterns, e)
302 + " Please pre-load the data into a buffer like io.BytesIO and"
303 + " try to load from it instead.")
--> 304 raise type(e)(msg)
305 raise e
306
AttributeError: 'NoneType' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.
which I can't resolve.
I tried to do this according to the available tutorials on the official PyTorch Lightning page here. I can't figure out what I am missing.
Could somebody point me in the right direction?
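Judging from the traceback, load_from_checkpoint re-instantiates DeepFilteringTransferLearning via cls(**_cls_kwargs); since chk_path was never stored as a hyperparameter, __init__ runs with its default chk_path=None, and DeepFiltering.load_from_checkpoint(None) ends up in torch.load(None), which is exactly the 'NoneType' has no attribute 'seek' failure. A hedged sketch of two ways around that (paths are placeholders); note also that load_from_checkpoint is a classmethod, so it is normally called on the class rather than on a freshly constructed instance:

import pytorch_lightning as pl

# Option 1: persist the constructor argument in the checkpoint.
class DeepFilteringTransferLearning(pl.LightningModule):
    def __init__(self, chk_path=None):
        super().__init__()
        self.save_hyperparameters()   # records chk_path so it is restored on load
        ...                           # rest of __init__ unchanged

# Option 2: supply the missing argument at load time; extra keyword arguments
# to load_from_checkpoint are forwarded to __init__.
model = DeepFilteringTransferLearning.load_from_checkpoint(
    "path/to/transfer_learned/model",                      # placeholder path
    chk_path="path/to/model/used/for/transfer_learning",   # placeholder path
)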

FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan when using greater int values

I've recently watched a YouTube (DataSchool) video where the guy used only 3 columns from the Titanic dataset and made a pipeline. I wanted to add more columns to get better accuracy so I added Age and Fare.
I think it's probably because of the values of Age and Fare that I'm getting this error when I perform cross_val_score:
columns_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough')

logreg = LogisticRegression(solver='lbfgs')
pipe = make_pipeline(columns_trans, logreg)

cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:552: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan.
If I remove Age and Fare, everything works fine. I was wondering if the Column Transformer or the make_pipeline had a problem with values like that.
I also tried scaling the values of Fare and Age; that gave a cross_val_score, but pipe.predict() then failed with this error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
Traceback:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_119/4279568460.py in <module>
----> 1 cross_val_score(pipe, X, y, cv=5, scoring='accuracy', error_score="raise").mean()
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
404 fit_params=fit_params,
405 pre_dispatch=pre_dispatch,
--> 406 error_score=error_score)
407 return cv_results['test_score']
408
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
246 return_times=True, return_estimator=return_estimator,
247 error_score=error_score)
--> 248 for train, test in cv.split(X, y, groups))
249
250 zipped_scores = list(zip(*scores))
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
529 estimator.fit(X_train, **fit_params)
530 else:
--> 531 estimator.fit(X_train, y_train, **fit_params)
532
533 except Exception as e:
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py in fit(self, X, y, sample_weight)
1415 penalty=penalty, max_squared_sum=max_squared_sum,
1416 sample_weight=sample_weight)
-> 1417 for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
1418
1419 fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
762 n_iter_i = _check_optimize_result(
763 solver, opt_res, max_iter,
--> 764 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
765 w0, loss = opt_res.x, opt_res.fun
766 elif solver == 'newton-cg':
/opt/conda/lib/python3.7/site-packages/sklearn/utils/optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
I solved this error by changing solver='lbfgs' to solver='liblinear' in LogisticRegression():
logreg = LogisticRegression(solver='lbfgs')
to
logreg = LogisticRegression(solver='liblinear')
And for the following error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
It's best to check if your test data contains any null values or strings.
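In the standard Titanic CSV the Age column contains missing values, and LogisticRegression cannot fit on NaNs, which is what turns into the FitFailedWarning and "Input contains NaN" errors once Age and Fare are passed through untouched. A hedged sketch of one way to handle this inside the same pipeline, assuming Age and Fare are the columns with missing or unscaled values:

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# impute (and scale) the numeric columns instead of passing NaNs straight to the model
numeric = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

columns_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked']),
    (numeric, ['Age', 'Fare']),
    remainder='passthrough')

logreg = LogisticRegression(solver='liblinear')
pipe = make_pipeline(columns_trans, logreg)
print(cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean())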

NoneType error on saving model in Keras

Error:
TypeError: Fetch argument None has invalid type
I think the error occurs when I'm saving the model in the ModelCheckpoint callback. On searching for the error, this came up, but I cannot use that answer because I am using Keras and thus don't explicitly call sess.run() in TensorFlow. Also, the epoch trains flawlessly; the error only pops up when the model is being saved.
Code:
The complete model is in a kaggle notebook linked here: https://www.kaggle.com/aevinq/cnn-batchnormalization-0-1646/
The relevant code where the error pops up is:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
mcp_save = ModelCheckpoint('md.hdf5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, epsilon=1e-4, mode='min')
history = model.fit(train_X, train_y, batch_size=32, epochs=20, verbose=1, validation_split=0.25, callbacks=[early_stopping, reduce_lr_loss, mcp_save])
Error:
Train on 4413 samples, validate on 1471 samples
Epoch 1/20
4384/4413 [============================>.] - ETA: 1s - loss: 0.5157 - acc: 0.7696
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-11-97f0757a1e9c> in <module>()
2 mcp_save = ModelCheckpoint('md.hdf5', save_best_only=True, monitor='val_loss', mode='min')
3 reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, epsilon=1e-4, mode='min')
----> 4 history = model.fit(train_X, train_y, batch_size=32, epochs=20, verbose=1, validation_split=0.25, callbacks=[early_stopping, reduce_lr_loss, mcp_save])
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/models.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
970 initial_epoch=initial_epoch,
971 steps_per_epoch=steps_per_epoch,
--> 972 validation_steps=validation_steps)
973
974 def evaluate(self, x=None, y=None,
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1655 initial_epoch=initial_epoch,
1656 steps_per_epoch=steps_per_epoch,
-> 1657 validation_steps=validation_steps)
1658
1659 def evaluate(self, x=None, y=None,
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
1231 for l, o in zip(out_labels, val_outs):
1232 epoch_logs['val_' + l] = o
-> 1233 callbacks.on_epoch_end(epoch, epoch_logs)
1234 if callback_model.stop_training:
1235 break
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/callbacks.py in on_epoch_end(self, epoch, logs)
71 logs = logs or {}
72 for callback in self.callbacks:
---> 73 callback.on_epoch_end(epoch, logs)
74
75 def on_batch_begin(self, batch, logs=None):
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/callbacks.py in on_epoch_end(self, epoch, logs)
413 self.model.save_weights(filepath, overwrite=True)
414 else:
--> 415 self.model.save(filepath, overwrite=True)
416 else:
417 if self.verbose > 0:
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/engine/topology.py in save(self, filepath, overwrite, include_optimizer)
2563 """
2564 from ..models import save_model
-> 2565 save_model(self, filepath, overwrite, include_optimizer)
2566
2567 def save_weights(self, filepath, overwrite=True):
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/models.py in save_model(model, filepath, overwrite, include_optimizer)
145 if symbolic_weights:
146 optimizer_weights_group = f.create_group('optimizer_weights')
--> 147 weight_values = K.batch_get_value(symbolic_weights)
148 weight_names = []
149 for i, (w, val) in enumerate(zip(symbolic_weights,
/opt/conda/lib/python3.6/site-packages/Keras-2.1.2-py3.6.egg/keras/backend/tensorflow_backend.py in batch_get_value(ops)
2208 """
2209 if ops:
-> 2210 return get_session().run(ops)
2211 else:
2212 return []
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
887 try:
888 result = self._run(None, fetches, feed_dict, options_ptr,
--> 889 run_metadata_ptr)
890 if run_metadata:
891 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1103 # Create a fetch handler to take care of the structure of fetches.
1104 fetch_handler = _FetchHandler(
-> 1105 self._graph, fetches, feed_dict_tensor, feed_handles=feed_handles)
1106
1107 # Run request and get response.
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in __init__(self, graph, fetches, feeds, feed_handles)
412 """
413 with graph.as_default():
--> 414 self._fetch_mapper = _FetchMapper.for_fetch(fetches)
415 self._fetches = []
416 self._targets = []
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in for_fetch(fetch)
232 elif isinstance(fetch, (list, tuple)):
233 # NOTE(touts): This is also the code path for namedtuples.
--> 234 return _ListFetchMapper(fetch)
235 elif isinstance(fetch, dict):
236 return _DictFetchMapper(fetch)
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in __init__(self, fetches)
339 """
340 self._fetch_type = type(fetches)
--> 341 self._mappers = [_FetchMapper.for_fetch(fetch) for fetch in fetches]
342 self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers)
343
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in <listcomp>(.0)
339 """
340 self._fetch_type = type(fetches)
--> 341 self._mappers = [_FetchMapper.for_fetch(fetch) for fetch in fetches]
342 self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers)
343
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in for_fetch(fetch)
229 if fetch is None:
230 raise TypeError('Fetch argument %r has invalid type %r' %
--> 231 (fetch, type(fetch)))
232 elif isinstance(fetch, (list, tuple)):
233 # NOTE(touts): This is also the code path for namedtuples.
TypeError: Fetch argument None has invalid type <class 'NoneType'>
It's a bug in Keras. There are None values in model.optimizer.weights after a recent update, which leads to an error when K.batch_get_value is called during model saving.
I've opened a PR to fix it, and it has been merged. You can install the latest Keras from GitHub to get the fix.
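If upgrading Keras right away is not an option, a workaround consistent with that diagnosis is to avoid serializing the optimizer state at all, since that is the part that hits K.batch_get_value. A hedged sketch (file names are placeholders):

from keras.callbacks import ModelCheckpoint

# save only the weights, so the (partially None) optimizer weights are never fetched
mcp_save = ModelCheckpoint('md.hdf5', save_best_only=True, save_weights_only=True,
                           monitor='val_loss', mode='min')

# or, after training, save architecture + weights without the optimizer state
model.save('md_full.hdf5', include_optimizer=False)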
