Can a dask worker signal the scheduler that it was set up incorrectly?
I'm hitting a problem where my worker is being set up incorrectly some small fraction of the time. The worker errors out and brings down my whole graph. (It's not the task -- it's the worker itself that is bad.) I know the symptom and can catch it, and would like the worker to say "hey scheduler, remove me as a worker -- do not use me".
I am using dask-gateway, if that matters.
In case it helps, the workers (seems to be a few percent) do not have access to libcuda.so.1:
/opt/conda/lib/python3.7/site-packages/cellpose/models.py in <module>
8 import cv2
9
---> 10 from mxnet import gluon, nd
11 import mxnet as mx
12
/opt/conda/lib/python3.7/site-packages/mxnet/__init__.py in <module>
22 from __future__ import absolute_import
23
---> 24 from .context import Context, current_context, cpu, gpu, cpu_pinned
25 from . import engine
26 from .base import MXNetError
/opt/conda/lib/python3.7/site-packages/mxnet/context.py in <module>
22 import warnings
23 import ctypes
---> 24 from .base import classproperty, with_metaclass, _MXClassPropertyMetaClass
25 from .base import _LIB
26 from .base import check_call
/opt/conda/lib/python3.7/site-packages/mxnet/base.py in <module>
212 __version__ = libinfo.__version__
213 # library instance of mxnet
--> 214 _LIB = _load_lib()
215
216 # type definitions
/opt/conda/lib/python3.7/site-packages/mxnet/base.py in _load_lib()
203 """Load library by searching possible path."""
204 lib_path = libinfo.find_lib_path()
--> 205 lib = ctypes.CDLL(lib_path[0], ctypes.RTLD_LOCAL)
206 # DMatrix functions
207 lib.MXGetLastError.restype = ctypes.c_char_p
/opt/conda/lib/python3.7/ctypes/__init__.py in __init__()
362
363 if handle is None:
--> 364 self._handle = _dlopen(self._name, mode)
365 else:
366 self._handle = handle
OSError: libcuda.so.1: cannot open shared object file: No such file or directory
You could probably do what you want with a WorkerPlugin.
Though I would argue you don't want to do this. Worker initialization should work every time and you should fix the root cause here.
Given issues with libcuda and dask-gateway you might be interested in this dask-gateway issue: https://github.com/dask/dask-gateway/issues/177
Related
While using import torch_geometric I get a module not found error from torch_geometric.data. import torch runs without an error.
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[2], line 1
----> 1 import torch_geometric
File ~\anaconda3\envs\python3_8_10\lib\site-packages\torch_geometric\__init__.py:4
1 from types import ModuleType
2 from importlib import import_module
----> 4 import torch_geometric.data
5 import torch_geometric.loader
6 import torch_geometric.transforms
File ~\anaconda3\envs\python3_8_10\lib\site-packages\torch_geometric\data\__init__.py:1
----> 1 from .data import Data
2 from .hetero_data import HeteroData
3 from .temporal import TemporalData
File ~\anaconda3\envs\python3_8_10\lib\site-packages\torch_geometric\data\data.py:20
18 import torch
19 from torch import Tensor
---> 20 from torch_sparse import SparseTensor
22 from torch_geometric.data.feature_store import (
23 FeatureStore,
24 FeatureTensorType,
25 TensorAttr,
26 _field_status,
27 )
28 from torch_geometric.data.graph_store import (
29 EDGE_LAYOUT_TO_ATTR_NAME,
30 EdgeAttr,
(...)
34 edge_tensor_type_to_adj_type,
35 )
File ~\anaconda3\envs\python3_8_10\lib\site-packages\torch_sparse\__init__.py:19
17 spec = cuda_spec or cpu_spec
18 if spec is not None:
---> 19 torch.ops.load_library(spec.origin)
20 else: # pragma: no cover
21 raise ImportError(f"Could not find module '{library}_cpu' in "
22 f"{osp.dirname(__file__)}")
File ~\anaconda3\envs\python3_8_10\lib\site-packages\torch\_ops.py:255, in _Ops.load_library(self, path)
250 path = torch._utils_internal.resolve_library_path(path)
251 with dl_open_guard():
252 # Import the shared library into the process, thus running its
253 # static (global) initialization code in order to register custom
254 # operators with the JIT.
--> 255 ctypes.CDLL(path)
256 self.loaded_libraries.add(path)
File ~\anaconda3\envs\python3_8_10\lib\ctypes\__init__.py:381, in CDLL.__init__(self, name, mode, handle, use_errno, use_last_error, winmode)
378 self._FuncPtr = _FuncPtr
380 if handle is None:
--> 381 self._handle = _dlopen(self._name, mode)
382 else:
383 self._handle = handle
FileNotFoundError: Could not find module 'C:\Users\fes33\anaconda3\envs\python3_8_10\Lib\site-packages\torch_sparse\_convert_cuda.pyd' (or one of its dependencies). Try using the full path with constructor syntax.
How can I get rid of it?
I have this redis container I use for Sidekiq/Rails and also set up rabbitmq a few days ago on the same docker host. Ever since I keep getting these errors:
UNBLOCKED force unblock from blocking operation, instance state changed (master -> slave?)
redis/client.rb in call at line 126
def call(command)
reply = process([command]) { read }
raise reply if reply.is_a?(CommandError)
if block_given?
yield reply
redis/client.rb in block in call_with_timeout at line 219
redis/client.rb in with_socket_timeout at line 292
redis/client.rb in call_with_timeout at line 218
redis.rb in block in _bpop at line 1186
redis.rb in block in synchronize at line 52
monitor.rb in mon_synchronize at line 226
redis.rb in synchronize at line 52
redis.rb in _bpop at line 1183
redis.rb in brpop at line 1228
redis/namespace.rb in call_with_namespace at line 469
redis/namespace.rb in block (2 levels) in <class:Namespace> at line 349
sidekiq/fetch.rb in block in retrieve_work at line 37
sidekiq.rb in block in redis at line 97
connection_pool.rb in block (2 levels) in with at line 65
connection_pool.rb in handle_interrupt at line 64
connection_pool.rb in block in with at line 64
connection_pool.rb in handle_interrupt at line 61
connection_pool.rb in with at line 61
sidekiq.rb in redis at line 94
sidekiq/fetch.rb in retrieve_work at line 37
sidekiq/processor.rb in get_one at line 83
sidekiq/processor.rb in fetch at line 95
sidekiq/processor.rb in process_one at line 77
sidekiq/processor.rb in run at line 68
sidekiq/util.rb in watchdog at line 15
sidekiq/util.rb in block in safe_thread at line 24
The redis logs show nothing relevant, just the "save database" stuff.
Have I done something evil getting them both set up within the same host?
Turns out someone removed a firewall rule, so the redis port was exposed to the datacenter-wide VLAN.
The datacenter where that machine lives promised 3 years ago that they would release a feature where each customer/account can have their own separate VLAN, but they have yet to do so, and some of their customers seem to be up to no good.
The error I was seeing was someone probing ports and trying all sorts of remote exploits.
After adding the firewall rule back up, the error disappeared.
I am using Jupyter to execute the code below, but I am getting an error:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
music_data = pd.read_csv('music.csv')
X = music_data.drop(columns=['genre'])
y = music_data['genre']
model = DecisionTreeClassifier()
model.fit(X,y)
music_data
Error Occurred:
ImportError Traceback (most recent call last)
<ipython-input-28-7af4ede8a769> in <module>
1 import pandas as pd
----> 2 from sklearn.tree import DecisionTreeClassifier
3
4 music_data = pd.read_csv('music.csv')
5 X = music_data.drop(columns=['genre'])
~\Anaconda3\lib\site-packages\sklearn\__init__.py in <module>
74 else:
75 from . import __check_build
---> 76 from .base import clone
77 from .utils._show_versions import show_versions
78
~\Anaconda3\lib\site-packages\sklearn\base.py in <module>
14
15 from . import __version__
---> 16 from .utils import _IS_32BIT
17
18 _DEFAULT_TAGS = {
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in <module>
18 from ..exceptions import DataConversionWarning
19 from .deprecation import deprecated
---> 20 from .validation import (as_float_array,
21 assert_all_finite,
22 check_random_state, column_or_1d, check_array,
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in <module>
19 from numpy.core.numeric import ComplexWarning
20
---> 21 from .fixes import _object_dtype_isnan
22 from .. import get_config as _get_config
23 from ..exceptions import NonBLASDotWarning
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in <module>
16 import scipy.sparse as sp
17 import scipy
---> 18 from scipy.sparse.linalg import lsqr as sparse_lsqr # noqa
19
20
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\__init__.py in <module>
114 from .dsolve import *
115 from .interface import *
--> 116 from .eigen import *
117 from .matfuncs import *
118 from ._onenormest import *
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\eigen\__init__.py in <module>
9 from __future__ import division, print_function, absolute_import
10
---> 11 from .arpack import *
12 from .lobpcg import *
13
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\eigen\arpack\__init__.py in <module>
20 from __future__ import division, print_function, absolute_import
21
---> 22 from .arpack import *
~\Anaconda3\lib\site-packages\scipy\sparse\linalg\eigen\arpack\arpack.py in <module>
43 __all__ = ['eigs', 'eigsh', 'svds', 'ArpackError', 'ArpackNoConvergence']
44
---> 45 from . import _arpack
46 import numpy as np
47 import warnings
ImportError: DLL load failed: The specified procedure could not be found.
I am using Python 3.8.0 and Anaconda3. Could somebody please tell me what is going wrong here?
From the error log, it seems the issue is from this line:
from scipy.sparse.linalg import lsqr as sparse_lsqr
Try uninstalling and reinstalling scipy and numpy
Uninstall:
pip uninstall numpy scipy
Install:
pip install -U numpy scipy
I'm working on a project, but my problem is that librosa reports that the file was not found, even though it is present.
# Load using Librosa
y, sr = librosa.load(normal_file, duration=5) #default sampling rate is 22 HZ
dur=librosa.get_duration(y)
print ("duration:", dur)
print(y.shape, sr)
The error:
FileNotFoundError
Traceback (most recent call last)
<ipython-input-31-d400d9a9c828> in <module>()
1 # Load using Librosa
----> 2 y, sr = librosa.load(normal_file) #default sampling rate is 22 HZ
3 dur=librosa.get_duration(y)
4 print ("duration:", dur)
5 print(y.shape, sr)
~\Anaconda3\lib\site-packages\librosa\core\audio.py in load(path, sr, mono, offset, duration, dtype, res_type)
117
118 y = []
--> 119 with audioread.audio_open(os.path.realpath(path)) as input_file:
120 sr_native = input_file.samplerate
121 n_channels = input_file.channels
~\Anaconda3\lib\site-packages\audioread\__init__.py in audio_open(path, backends)
105 """
106 if backends is None:
--> 107 backends = available_backends()
108
109 for BackendClass in backends:
~\Anaconda3\lib\site-packages\audioread\__init__.py in available_backends()
84
85 # FFmpeg.
---> 86 if ffdec.available():
87 result.append(ffdec.FFmpegAudioFile)
88
~\Anaconda3\lib\site-packages\audioread\ffdec.py in available()
106 stdout=subprocess.PIPE,
107 stderr=subprocess.PIPE,
--> 108 creationflags=PROC_FLAGS,
109 )
110 proc.wait()
~\Anaconda3\lib\site-packages\audioread\ffdec.py in popen_multiple(commands, command_args, *args, **kwargs)
92 cmd = [command] + command_args
93 try:
---> 94 return subprocess.Popen(cmd, *args, **kwargs)
95 except OSError:
96 if i == len(commands) - 1:
~\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
707 c2pread, c2pwrite,
708 errread, errwrite,
--> 709 restore_signals, start_new_session)
710 except:
711 # Cleanup if the child failed starting.
~\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
995 env,
996 os.fspath(cwd) if cwd is not None else None,
--> 997 startupinfo)
998 finally:
999 # Child is launched. Close the parent's copy of those pipe
FileNotFoundError: [WinError 2] The system cannot find the file specified
It shows duration and reads the audio file.
I want to connect my neo4j's project server to py2neo in jupyter
I actually have 2 problems:
Given below is a picture of my neo4j browser connected with bolt://localhost:11004, username: neo4j, password: password
But I am not able to connect to this server through py2neo in a Jupyter notebook.
The code in python is the following:
graphdb = Graph("bolt://localhost:11004", secure=True, auth=('neo4j', 'password'))
I am getting the following error:
KeyError Traceback (most recent call last)
~/conda3/lib/python3.6/site-packages/py2neo/database.py in __new__(cls, uri, **settings)
87 try:
---> 88 inst = cls._instances[key]
89 except KeyError:
KeyError: '0611fb007d1a660e26e66e58777225de'
During handling of the above exception, another exception occurred:
ServiceUnavailable Traceback (most recent call last)
<ipython-input-41-2d6567e9c5ba> in <module>()
3 # default uri for local Neo4j instance
4 dict_params=dict(secure=True)
----> 5 graphdb = Graph(**dict_params)
~/conda3/lib/python3.6/site-packages/py2neo/database.py in __new__(cls, uri, **settings)
303 def __new__(cls, uri=None, **settings):
304 name = settings.pop("name", "data")
--> 305 database = Database(uri, **settings)
306 if name in database:
307 inst = database[name]
~/conda3/lib/python3.6/site-packages/py2neo/database.py in __new__(cls, uri, **settings)
95 auth=connection_data["auth"],
96 encrypted=connection_data["secure"],
---> 97 user_agent=connection_data["user_agent"])
98 inst._graphs = {}
99 cls._instances[key] = inst
~/conda3/lib/python3.6/site-packages/neo4j/v1/api.py in __new__(cls, uri, **config)
131 for subclass in Driver.__subclasses__():
132 if parsed.scheme == subclass.uri_scheme:
--> 133 return subclass(uri, **config)
134 raise ValueError("URI scheme %r not supported" % parsed.scheme)
135
~/conda3/lib/python3.6/site-packages/neo4j/v1/direct.py in __new__(cls, uri, **config)
71
72 pool = DirectConnectionPool(connector, instance.address, **config)
---> 73 pool.release(pool.acquire())
74 instance._pool = pool
75 instance._max_retry_time = config.get("max_retry_time", default_config["max_retry_time"])
~/conda3/lib/python3.6/site-packages/neo4j/v1/direct.py in acquire(self, access_mode)
42
43 def acquire(self, access_mode=None):
---> 44 return self.acquire_direct(self.address)
45
46
~/conda3/lib/python3.6/site-packages/neo4j/bolt/connection.py in acquire_direct(self, address)
448 if can_create_new_connection:
449 try:
--> 450 connection = self.connector(address, self.connection_error_handler)
451 except ServiceUnavailable:
452 self.remove(address)
~/conda3/lib/python3.6/site-packages/neo4j/v1/direct.py in connector(address, error_handler)
68
69 def connector(address, error_handler):
---> 70 return connect(address, security_plan.ssl_context, error_handler, **config)
71
72 pool = DirectConnectionPool(connector, instance.address, **config)
~/conda3/lib/python3.6/site-packages/neo4j/bolt/connection.py in connect(address, ssl_context, error_handler, **config)
702 raise ServiceUnavailable("Failed to resolve addresses for %s" % address)
703 else:
--> 704 raise last_error
~/conda3/lib/python3.6/site-packages/neo4j/bolt/connection.py in connect(address, ssl_context, error_handler, **config)
692 log_debug("~~ [RESOLVED] %s -> %s", address, resolved_address)
693 try:
--> 694 s = _connect(resolved_address, **config)
695 s, der_encoded_server_certificate = _secure(s, address[0], ssl_context, **config)
696 connection = _handshake(s, resolved_address, der_encoded_server_certificate, error_handler, **config)
~/conda3/lib/python3.6/site-packages/neo4j/bolt/connection.py in _connect(resolved_address, **config)
582 _force_close(s)
583 if error.errno in (61, 99, 111, 10061):
--> 584 raise ServiceUnavailable("Failed to establish connection to {!r} (reason {})".format(resolved_address, error.errno))
585 else:
586 raise
ServiceUnavailable: Failed to establish connection to ('127.0.0.1', 7687) (reason 111)
What I want to know is:
1) How exactly is the connection between neo4j and py2neo made in py2neo v4?
2) Do I always have to make a local connection, or can I connect to the neo4j server?
3) If I can connect to my neo4j server, will whatever py2neo queries I run in my Jupyter notebook be synchronised with the neo4j database too?
From the last line of the error, it looks like it's trying to connect on the default bolt port (i.e. 7687).
I would suggest you use this format instead of the full URI.
graphdb = Graph(scheme="bolt", host="localhost", port=11004,
secure=True, auth=('neo4j', 'password'))