Related
Recently, I have been trying to parallelize some Sage (Sage 9.4 on a MacBook Pro running OSX 11.2.3) code using Dask. The problem I run into is that while I can run Dask inside Sage, it will break whenever I include any code that isn't "pure python." In particular, it keeps throwing an ImportError. Here is a basic example of what I am running into
import time
from dask import delayed
from dask.distributed import Client
from time import sleep
client = Client(n_workers=4)
def Hello():
1+1 #this line breaks things by adding a sage operation
#if I remove it the code runs fine
return 'Hello World'
z = delayed(Hello)()
z.compute()
This code throws the following error
Traceback
ImportError Traceback (most recent call last)
<timed eval> in <module>
~/.sage/local/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
284 dask.base.compute
285 """
--> 286 (result,) = compute(self, traverse=False, **kwargs)
287 return result
288
~/.sage/local/lib/python3.9/site-packages/dask/base.py in compute(*args, **kwargs)
566 postcomputes.append(x.__dask_postcompute__())
567
--> 568 results = schedule(dsk, keys, **kwargs)
569 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
570
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2669 should_rejoin = False
2670 try:
-> 2671 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2672 finally:
2673 for f in futures.values():
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1946 else:
1947 local_worker = None
-> 1948 return self.sync(
1949 self._gather,
1950 futures,
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
843 return future
844 else:
--> 845 return sync(
846 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
847 )
~/.sage/local/lib/python3.9/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
324 if error[0]:
325 typ, exc, tb = error[0]
--> 326 raise exc.with_traceback(tb)
327 else:
328 return result[0]
~/.sage/local/lib/python3.9/site-packages/distributed/utils.py in f()
307 if callback_timeout is not None:
308 future = asyncio.wait_for(future, callback_timeout)
--> 309 result[0] = yield future
310 except Exception:
311 error[0] = sys.exc_info()
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/.sage/local/lib/python3.9/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1811 exc = CancelledError(key)
1812 else:
-> 1813 raise exception.with_traceback(traceback)
1814 raise exc
1815 if errors == "skip":
~/.sage/local/lib/python3.9/site-packages/distributed/protocol/pickle.py in loads()
73 return pickle.loads(x, buffers=buffers)
74 else:
---> 75 return pickle.loads(x)
76 except Exception:
77 logger.info("Failed to deserialize %s", x[:10000], exc_info=True)
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/integer.pyx in init sage.rings.integer (build/cythonized/sage/rings/integer.c:54201)()
----> 1 r"""
2 Elements of the ring `\ZZ` of integers
3
4 Sage has highly optimized and extensive functionality for arithmetic with integers
5 and the ring of integers.
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/rational.pyx in init sage.rings.rational (build/cythonized/sage/rings/rational.cpp:40442)()
98
99
--> 100 import sage.rings.real_mpfr
101 import sage.rings.real_double
102 from libc.stdint cimport uint64_t
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/real_mpfr.pyx in init sage.rings.real_mpfr (build/cythonized/sage/rings/real_mpfr.c:46795)()
----> 1 r"""
2 Arbitrary Precision Real Numbers
3
4 AUTHORS:
5
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/libs/mpmath/utils.pyx in init sage.libs.mpmath.utils (build/cythonized/sage/libs/mpmath/utils.c:9062)()
----> 1 """
2 Utilities for Sage-mpmath interaction
3
4 Also patches some mpmath functions for speed
5 """
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/complex_mpfr.pyx in init sage.rings.complex_mpfr (build/cythonized/sage/rings/complex_mpfr.c:34594)()
----> 1 """
2 Arbitrary Precision Floating Point Complex Numbers
3
4 AUTHORS:
5
/var/tmp/sage-9.4-current/local/lib/python3.9/site-packages/sage/rings/complex_double.pyx in init sage.rings.complex_double (build/cythonized/sage/rings/complex_double.c:25284)()
96 from cypari2.convert cimport new_gen_from_double, new_t_COMPLEX_from_double
97
---> 98 from . import complex_mpfr
99
100 from .complex_mpfr import ComplexField
ImportError: cannot import name complex_mpfr
The only other time I have seen an ImportError like this is when I have been running sage inside python and did not include a from sage.all import *, so I am wondering if what is happening is that Dask is trying to run my code in python. I'm also not sure whether this qualifies as a Sage or a Dask problem. Any help would be greatly appreciated!
I am learning machine learning and creating my first model on #mnist data set.
Can anyone help me over here? I have tried Stratified Fold, kfold and other methods to resolve this issue.
Pandas Version '0.25.1', Python Version 3.7, using Anaconda Distribution.
from sklearn.model_selection import train_test_split
train_set ,test_set = train_test_split(mnist,test_size = 0.2, random_state = 29)
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=29)
sgd_clf.fit(X_train,y_train_5)
X_train, y_train = train_set.drop('label',axis = 1), train_set[['label']]
X_test, y_test = test_set.drop('label',axis = 1),test_set[['label']]
y_train_5 = (y_train == 5) #True for all 5's and false otherwise
y_test_5 = (y_train == 5)
from sklearn.model_selection import cross_val_predict
print(X_train.shape)
print(y_train_5.shape)
cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
Last line of the code block gives an error:
RuntimeWarning: Number of classes in training fold (2) does not match total number of classes (1). Results may not be appropriate for your use case. To fix this, use a cross-validation technique resulting in properly stratified folds
RuntimeWarning)
ValueError Traceback (most recent call last)
<ipython-input-39-da1ad024473a> in <module>
3 print(X_train.shape)
4 print(y_train_5.shape)
----> 5 cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
787 prediction_blocks = parallel(delayed(_fit_and_predict)(
788 clone(estimator), X, y, train, test, verbose, fit_params, method)
--> 789 for train, test in cv.split(X, y, groups))
790
791 # Concatenate the predictions
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method)
887 n_classes = len(set(y)) if y.ndim == 1 else y.shape[1]
888 predictions = _enforce_prediction_order(
--> 889 estimator.classes_, predictions, n_classes, method)
890 return predictions, test
891
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _enforce_prediction_order(classes, predictions, n_classes, method)
933 'is not supported for decision_function '
934 'with imbalanced folds. {}'.format(
--> 935 len(classes), n_classes, recommendation))
936
937 float_min = np.finfo(predictions.dtype).min
ValueError: Only 2 class/es in training fold, but 1 in overall dataset. This is not supported for decision_function with imbalanced folds. To fix this, use a cross-validation technique resulting in properly stratified folds
I ran through a similar problem and on further investigation found a warning message with the error log-
DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
There are two ways to solve this:
Use the hint in the warning message and change your code as:
cross_val_predict(sgd_clf, X_train, y_train_5.values.ravel(), cv=3,
method="decision_function")
refer - answere here
Also, using the hint from - A column-vector y was passed when a 1d array was expected.; I released my mistake and did the following:
Even in your error log- Number of classes in training fold (2) does not match total number of classes (1)
I assume y_train_5 here is a DataFrame, (probably you are working your way through Aurelien's publication)
The expected type for y_train_5 is an array-type object (meaning the shaoe to be (n,) or one-dimensional), but DataFrame is 2-dimensional, in your case (n,1).
All you need to do is pass the Series object for your column vector as-
y_train_5.iloc[:,0] (I prefer this)
y_train_5.{COLUMN_NAME} (another variant)
Try running below in your console.
> y_train_5.iloc[:,0].shape
(n,)
cross_val_predict(sgd_clf, X_train, y_train_5.iloc[:,0], cv=3,
method="decision_function")
First of all thank you for providing dask with all its functionality, which is highly appreciated!
However, using dask.distributed to run an SVD on a rasterized dataset, it seems as if it fails when only single chunks consist only of NaN values although most of the dataset does contain correct values.
I read a dataset using xarray.open_mfdataset(chunks={...}) and try to set the chunksize such, that SVD computation (dask.array.linalg) used in the eofs.xarray package makes use of the cores our cluster provides, by using a dask.distributed client.
<xarray.Dataset>
Dimensions: (time: 8760, x: 1000, y: 840)
Coordinates:
* x (x) float64 2.452e+06 2.458e+06 2.462e+06 ... 7.442e+06 7.448e+06
* y (y) float64 1.352e+06 1.358e+06 1.362e+06 ... 5.542e+06 5.548e+06
* time (time) datetime64[ns] 2005-01-01 ... 2005-12-31T23:00:00
Data variables:
capacity (y, x) float64 dask.array<shape=(840, 1000), chunksize=(840, 840)>
capfac (time, y, x) float32 dask.array<shape=(8760, 840, 1000), chunksize=(876, 840, 840)>
However, when I run the computation, it fails with the below-mentioned error message.
ValueError: error encountered in SVD, check that missing values are in the same places at each time and that all the values are not missing
See complete error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/eofs/standard.py in __init__(self, dataset, weights, center, ddof)
164 # Use the parallel Dask algorithm
--> 165 dsvd = dask.array.linalg.svd(dataNoMissing)
166 A, Lh, E = (x.compute() for x in dsvd)
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/dask/array/linalg.py in svd(a)
803 """
--> 804 return tsqr(a, compute_svd=True)
805
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/dask/array/linalg.py in tsqr(data, compute_svd, _max_vchunk_size)
116 "Current shape: {},\nCurrent chunksize: {}".format(
--> 117 data.shape, data.chunksize
118 )
ValueError: Input must have the following properties:
1. Have two dimensions
2. Have only one column of blocks
Note: This function (tsqr) supports QR decomposition in the case of
tall-and-skinny matrices (single column chunk/block; see qr)Current shape: (8760, nan),
Current chunksize: (876, nan)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-17-f60250fedf8b> in <module>
----> 1 pca.analyze()
~/code/tsa_lib/tsa_lib/time_tools.py in f(*args, **kwargs)
8 def f(*args, **kwargs):
9 before = time.perf_counter() # maybe exchange with time.process_time()
---> 10 rv = func(*args, **kwargs)
11 after = time.perf_counter()
12 print('elapsed time for {.__name__}: {:.2f} minutes'.format(func, (after - before)/60))
~/code/playground/playground/PCA.py in analyze(self)
145 print('PCA completed. Weights used.')
146 else:
--> 147 self.eofs, self.pcs, self.solver = eof_analysis(self.data_variability, n_eofs=None, xarray=True)
148 print('PCA completed. No weights used.')
149
~/code/tsa_lib/tsa_lib/time_tools.py in f(*args, **kwargs)
8 def f(*args, **kwargs):
9 before = time.perf_counter() # maybe exchange with time.process_time()
---> 10 rv = func(*args, **kwargs)
11 after = time.perf_counter()
12 print('elapsed time for {.__name__}: {:.2f} minutes'.format(func, (after - before)/60))
~/code/playground/playground/PCA.py in eof_analysis(data, n_eofs, xarray, wgts, lats)
36 solver = xEof(data, weights=wgts)
37 else:
---> 38 solver = xEof(data)
39
40 eofs = solver.eofsAsCovariance(neofs=n_eofs)
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/eofs/xarray.py in __init__(self, array, weights, center, ddof)
131 weights=wtarray,
132 center=center,
--> 133 ddof=ddof)
134 # Name of the input DataArray.
135 self._name = array.name
~/.conda/envs/spagat_py36/lib/python3.6/site-packages/eofs/standard.py in __init__(self, dataset, weights, center, ddof)
175
176 except (np.linalg.LinAlgError, ValueError):
--> 177 raise ValueError('error encountered in SVD, check that missing '
178 'values are in the same places at each time and '
179 'that all the values are not missing')
ValueError: error encountered in SVD, check that missing values are in the same places at each time and that all the values are not missing
When applying SVD on a rasterized dataset, the below-mentioned error is given. Is it possible, that the error is raised because single chunks might be only containing NaN values?
If so, it could be considered as a bug of dask.distributed because the SVD works fine when applying it without chunking. Hence, the SVD should not fail only because single chunks only contain NaN values, whereas other chunks contain valid values, should it?
I am running my code on google colabs but my sequential model is not compiling. I could not create a new Sequential mode from a pre-existing model and compile it. I have called compile function from model but I have no idea why the error pops up. I don't know where I messed up. Actually Im doing something different but this is a simplied code with intention to find error.
train_path = '/content/drive/My Drive/ML/train'
model_custom=load_model("/content/drive/My Drive/ML/model_after_vgg_back_up.h5")
model=Sequential()
for layer in model_custom.layers:
model.add(layer)
model.compile(Adam(lr=0.0001),loss='categorical_crossentropy',metrics=['accuracy'])
train_batches = ImageDataGenerator().flow_from_directory(train_path,target_size=[224,224],classes=['mom','nanu','prabesh','sanu'],batch_size=15)
valid_path = '/content/drive/My Drive/ML/test1'
valid_batches = ImageDataGenerator().flow_from_directory(valid_path,target_size=[224,224],classes=['mom','nanu','prabesh','sanu'],batch_size=15)
model.fit_generator(train_batches,validation_data=valid_batches,validation_steps=1,steps_per_epoch=35,verbose=2,epochs=3)
Traceback is like follows:
Found 569 images belonging to 4 classes.
Found 12 images belonging to 4 classes.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-16-c160c47bcd7c> in <module>()
7 valid_path = '/content/drive/My Drive/ML/test1'
8 valid_batches = ImageDataGenerator().flow_from_directory(valid_path,target_size=[224,224],classes=['mom','nanu','prabesh','sanu'],batch_size=15)
----> 9 model.fit_generator(train_batches,validation_data=valid_batches,validation_steps=1,steps_per_epoch=35,verbose=2,epochs=3)
/usr/local/lib/python3.6/dist-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
/usr/local/lib/python3.6/dist-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1416 use_multiprocessing=use_multiprocessing,
1417 shuffle=shuffle,
-> 1418 initial_epoch=initial_epoch)
1419
1420 #interfaces.legacy_generator_methods_support
/usr/local/lib/python3.6/dist-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
38
39 do_validation = bool(validation_data)
---> 40 model._make_train_function()
41 if do_validation:
42 model._make_test_function()
/usr/local/lib/python3.6/dist-packages/keras/engine/training.py in _make_train_function(self)
494 def _make_train_function(self):
495 if not hasattr(self, 'train_function'):
--> 496 raise RuntimeError('You must compile your model before using it.')
497 self._check_trainable_weights_consistency()
498 if self.train_function is None:
RuntimeError: You must compile your model before using it.
I am storing a pickle of a xgboost classifier model file to disk using joblib
from sklearn.externals import joblib
joblib.dump(clf, 'model.pkl')
Then, I am loading the model and trying to predict on a test dataset
from sklearn.externals import joblib
model=joblib.load('model.pkl')
model.predict(X_test)
I am getting the TypeError: 'str' object is not callable when I execute the predict method on the loaded classifier object.
Any help on how to resolve this?
Here is the traceback
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-72-0da70f003ba0> in <module>()
----> 1 model.predict(X_test)
C:\Users\Sriram\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
113
114 # lambda, but not partial, allows help() to work with update_wrapper
--> 115 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
116 # update the docstring of the returned function
117 update_wrapper(out, self.fn)
C:\Users\Sriram\Anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X)
314 if transform is not None:
315 Xt = transform.transform(Xt)
--> 316 return self.steps[-1][-1].predict(Xt)
317
318 #if_delegate_has_method(delegate='_final_estimator')
C:\Users\Sriram\Anaconda3\lib\site-packages\xgboost\sklearn.py in predict(self, data, output_margin, ntree_limit)
461 def predict(self, data, output_margin=False, ntree_limit=0):
462 test_dmatrix = DMatrix(data, missing=self.missing)
--> 463 class_probs = self.booster().predict(test_dmatrix,
464 output_margin=output_margin,
465 ntree_limit=ntree_limit)
TypeError: 'str' object is not callable