While using dask.distributed, I'm trying to load a dask dataframe from a CSV on S3 inside a delayed function, like this:
@delayed
def func1():
    ...
    return df.read_csv(s3_url, ...)
read_csv() does not need any interaction with the distributed client, so I assumed this would be possible. Then, on the client machine, I compute the delayed object returned by func1:
res = func1()
future = client.compute(res)
progress(future)
frame = client.gather(future)
Up to that point it looks good; printing the result gives:
Dask DataFrame Structure:
                 COL1    COL2
npartitions=9
               object  object
                  ...     ...
...               ...     ...
                  ...     ...
                  ...     ...
Dask Name: from-delayed, 27 tasks
However, it fails with Failed to serialize (<dask.bytes.core.OpenFile object at ...>, ..., ..., '\n'). Exception: can't pickle thread.lock objects when I try to process it further, e.g.
client.compute(frame)
Is there a way to get this scheme to work, or am I missing some more fundamental limitation here?
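For completeness, here is a minimal, self-contained sketch of the whole flow I'm attempting (the scheduler address and the S3 path are placeholders):
import dask.dataframe as dd
from dask import delayed
from dask.distributed import Client, progress

client = Client("tcp://scheduler:8786")  # placeholder scheduler address

@delayed
def func1():
    # build the dask dataframe lazily inside the task (placeholder S3 path)
    return dd.read_csv("s3://my-bucket/data/*.csv")

res = func1()
future = client.compute(res)
progress(future)
frame = client.gather(future)  # a dask.dataframe.DataFrame, as printed above
client.compute(frame)          # this is the step that raises the pickling error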
PS: the error log I'm getting:
.pickle - Failed to serialize (<dask.bytes.core.OpenFile object at ...>, 20971520, 10485760, '\n'). Exception: can't pickle thread.lock objects
ERROR:2017-11-10 15:31:31:root:Exception while executing graph: can't pickle thread.lock objects
Traceback (most recent call last):
...
client.compute(res.data)
File ".../python2.7/site-packages/distributed/client.py", line 2089, in compute
resources=resources)
File ".../python2.7/site-packages/distributed/client.py", line 1906, in _graph_to_futures
'tasks': valmap(dumps_task, dsk3),
File ".../python2.7/site-packages/toolz-0.8.2-py2.7.egg/toolz/dicttoolz.py", line 84, in valmap
rv.update(zip(iterkeys(d), map(func, itervalues(d))))
File ".../python2.7/site-packages/distributed/worker.py", line 731, in dumps_task
'args': pickle.dumps(task[1:])}
File ".../python2.7/site-packages/distributed/protocol/pickle.py", line 51, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 829, in dumps
cp.dump(obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 233, in dump
return Pickler.dump(self, obj)
File "...python2.7/pickle.py", line 224, in dump
self.save(obj)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 568, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 564, in save_instancemethod
obj=obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 709, in save_reduce
save(args)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 692, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects
Related
I'm trying to upgrade my local Odoo CE 14.0 to Enterprise. I followed this video tutorial https://www.youtube.com/watch?v=-eCHJAq1QdY and the official steps https://www.odoo.com/documentation/14.0/administration/maintain/enterprise.html
But when I try to install the web_enterprise module, I get this error:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/odoo/addons/base/models/ir_http.py", line 237, in _dispatch
result = request.dispatch()
File "/usr/lib/python3/dist-packages/odoo/http.py", line 683, in dispatch
result = self._call_function(**self.params)
File "/usr/lib/python3/dist-packages/odoo/http.py", line 359, in _call_function
return checked_call(self.db, *args, **kwargs)
File "/usr/lib/python3/dist-packages/odoo/service/model.py", line 94, in wrapper
return f(dbname, *args, **kwargs)
File "/usr/lib/python3/dist-packages/odoo/http.py", line 347, in checked_call
result = self.endpoint(*a, **kw)
File "/usr/lib/python3/dist-packages/odoo/http.py", line 912, in __call__
return self.method(*args, **kw)
File "/usr/lib/python3/dist-packages/odoo/http.py", line 531, in response_wrap
response = f(*args, **kw)
File "/usr/lib/python3/dist-packages/odoo/addons/web/controllers/main.py", line 1398, in call_button
action = self._call_kw(model, method, args, kwargs)
File "/usr/lib/python3/dist-packages/odoo/addons/web/controllers/main.py", line 1386, in _call_kw
return call_kw(request.env[model], method, args, kwargs)
File "/usr/lib/python3/dist-packages/odoo/api.py", line 399, in call_kw
result = _call_kw_multi(method, model, args, kwargs)
File "/usr/lib/python3/dist-packages/odoo/api.py", line 386, in _call_kw_multi
result = method(recs, *args, **kwargs)
File "<decorator-gen-71>", line 2, in button_immediate_install
File "/usr/lib/python3/dist-packages/odoo/addons/base/models/ir_module.py", line 74, in check_and_log
return method(self, *args, **kwargs)
File "/usr/lib/python3/dist-packages/odoo/addons/base/models/ir_module.py", line 475, in button_immediate_install
return self._button_immediate_function(type(self).button_install)
File "/usr/lib/python3/dist-packages/odoo/addons/base/models/ir_module.py", line 593, in _button_immediate_function
modules.registry.Registry.new(self._cr.dbname, update_module=True)
File "/usr/lib/python3/dist-packages/odoo/modules/registry.py", line 89, in new
odoo.modules.load_modules(registry._db, force_demo, status, update_module)
File "/usr/lib/python3/dist-packages/odoo/modules/loading.py", line 461, in load_modules
loaded_modules, update_module, models_to_check)
File "/usr/lib/python3/dist-packages/odoo/modules/loading.py", line 349, in load_marked_modules
perform_checks=perform_checks, models_to_check=models_to_check
File "/usr/lib/python3/dist-packages/odoo/modules/loading.py", line 198, in load_module_graph
registry.setup_models(cr)
File "/usr/lib/python3/dist-packages/odoo/modules/registry.py", line 276, in setup_models
model._setup_fields()
File "/usr/lib/python3/dist-packages/odoo/models.py", line 2845, in _setup_fields
field.setup_full(self)
File "/usr/lib/python3/dist-packages/odoo/fields.py", line 401, in setup_full
self._setup_related_full(model)
File "/usr/lib/python3/dist-packages/odoo/fields.py", line 458, in _setup_related_full
field = model.pool[model_name]._fields[name]
Exception
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/odoo/http.py", line 639, in _handle_exception
return super(JsonRequest, self)._handle_exception(exception)
File "/usr/lib/python3/dist-packages/odoo/http.py", line 315, in _handle_exception
raise exception.with_traceback(None) from new_cause
KeyError: 'avatar_128'
odoo.conf:
addons_path = /usr/lib/python3/dist-packages/odoo/addons,/mnt/extra-addons/enterprise,/mnt/extra-addons/custom
Could anybody help me, please?
I have a CDK app for which I generate a template.yml file using:
cdk synth --no-staging > ./template.yml
and then run sam local:
sam local start-api
but I get the following error:
Traceback (most recent call last):
File "/usr/local/bin/sam", line 8, in <module>
sys.exit(cli())
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 829, in __call__
return self.main(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 782, in main
rv = self.invoke(ctx)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 1259, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 1259, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 1066, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 610, in invoke
return callback(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/decorators.py", line 73, in new_func
return ctx.invoke(f, obj, *args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/click/core.py", line 610, in invoke
return callback(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/telemetry/metric.py", line 166, in wrapped
raise exception # pylint: disable=raising-bad-type
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/telemetry/metric.py", line 124, in wrapped
return_value = func(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/utils/version_checker.py", line 41, in wrapped
actual_result = func(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/cli/main.py", line 87, in wrapper
return func(*args, **kwargs)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/commands/local/start_api/cli.py", line 94, in cli
do_cli(
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/commands/local/start_api/cli.py", line 192, in do_cli
service = LocalApiService(lambda_invoke_context=invoke_context, port=port, host=host, static_dir=static_dir)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/commands/local/lib/local_api_service.py", line 37, in __init__
self.api_provider = ApiProvider(lambda_invoke_context.stacks, cwd=self.cwd)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/providers/api_provider.py", line 37, in __init__
self.api = self._extract_api()
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/providers/api_provider.py", line 64, in _extract_api
provider.extract_resources(self.stacks, collector, cwd=self.cwd)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/providers/cfn_api_provider.py", line 73, in extract_resources
self._extract_cfn_gateway_v2_route(stack.stack_path, resources, logical_id, resource, collector)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/providers/cfn_api_provider.py", line 315, in _extract_cfn_gateway_v2_route
method, path = self._parse_route_key(route_key)
File "/usr/local/Cellar/aws-sam-cli/1.37.0/libexec/lib/python3.8/site-packages/samcli/lib/providers/cfn_api_provider.py", line 488, in _parse_route_key
[method, path] = route_key.split()
ValueError: not enough values to unpack (expected 2, got 1)
Any ideas what the issue is?
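For what it's worth, the last frame suggests the failure happens when a RouteKey doesn't split into a method and a path. A minimal illustration of that parse (the $default route key is my guess at what the synthesized template contains, not something I've confirmed):
# simplified version of what _parse_route_key does
route_key = "GET /items"
method, path = route_key.split()   # -> "GET", "/items"

route_key = "$default"             # an HTTP API catch-all route has no separate path
method, path = route_key.split()   # ValueError: not enough values to unpack (expected 2, got 1)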
I'm running papermill 2.1.0 in a newly created virtual environment on macOS.
When I execute a simple notebook I get a FileNotFoundError referring to an old file that no longer exists.
papermill 59848931.ipynb 59848931-out.ipynb
I get the following:
Input Notebook: 59848931.ipynb
Output Notebook: 59848931-out.ipynb
Executing: 0%| | 0/2 [00:00<?, ?cell/s]Failed to run command:
['/Users/user/Documents/Development/python/virtual_environments/udemy_tensorflow_venv/bin/python3', '-m', 'ipykernel_launcher', '-f', '/var/folders/p2/jh8vcbv51ks2gzvfx3dw1bd000_wjb/T/tmp3cf56dkh.json', '--HistoryManager.hist_file=:memory:']
PATH='/Users/user/Documents/notebooks/venv/bin:/Users/user/google-cloud-sdk/bin:/Users/user/Downloads/google-cloud-sdk/bin:/anaconda3/bin:/anaconda/bin:/Users/user/homebrew/bin:/Users/user/bin:/Library/Frameworks/R.framework/Versions/Current/Resources/:/usr/local/git/current/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/local/go/bin:/usr/local/share/dotnet:/opt/X11/bin:~/.dotnet/tools:/Applications/Wireshark.app/Contents/MacOS:/Users/user/Documents/Development'
with kwargs:
{'stdin': -1, 'stdout': None, 'stderr': None, 'cwd': None, 'start_new_session': True}
Executing: 0%| | 0/2 [00:01<?, ?cell/s]
Traceback (most recent call last):
File "/Users/user/Documents/notebooks/venv/bin/papermill", line 10, in <module>
sys.exit(papermill())
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/click/core.py", line 829, in __call__
return self.main(*args, **kwargs)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/click/core.py", line 782, in main
rv = self.invoke(ctx)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/click/core.py", line 1066, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/click/core.py", line 610, in invoke
return callback(*args, **kwargs)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/papermill/cli.py", line 235, in papermill
execution_timeout=execution_timeout,
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/papermill/execute.py", line 104, in execute_notebook
**engine_kwargs
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/papermill/engines.py", line 49, in execute_notebook_with_engine
return self.get_engine(engine_name).execute_notebook(nb, kernel_name, **kwargs)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/papermill/engines.py", line 343, in execute_notebook
cls.execute_managed_notebook(nb_man, kernel_name, log_output=log_output, **kwargs)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/papermill/engines.py", line 402, in execute_managed_notebook
return PapermillNotebookClient(nb_man, **final_kwargs).execute()
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/papermill/clientwrap.py", line 36, in execute
with self.setup_kernel(**kwargs):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py", line 81, in __enter__
return next(self.gen)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/nbclient/client.py", line 404, in setup_kernel
self.start_new_kernel_client(**kwargs)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/nbclient/util.py", line 37, in wrapped
result = loop.run_until_complete(coro(self, *args, **kwargs))
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 467, in run_until_complete
return future.result()
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/nbclient/client.py", line 375, in async_start_new_kernel_client
await ensure_async(self.km.start_kernel(extra_arguments=self.extra_arguments, **kwargs))
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/nbclient/util.py", line 57, in ensure_async
result = await obj
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/jupyter_client/manager.py", line 542, in start_kernel
self.kernel = await self._launch_kernel(kernel_cmd, **kw)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/jupyter_client/manager.py", line 523, in _launch_kernel
res = launch_kernel(kernel_cmd, **kw)
File "/Users/user/Documents/notebooks/venv/lib/python3.6/site-packages/jupyter_client/launcher.py", line 135, in launch_kernel
proc = Popen(cmd, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/subprocess.py", line 709, in __init__
restore_signals, start_new_session)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/subprocess.py", line 1344, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: '/Users/user/Documents/Development/python/virtual_environments/udemy_tensorflow_venv/bin/python3': '/Users/user/Documents/Development/python/virtual_environments/udemy_tensorflow_venv/bin/python3'
I reinstalled Python 3, papermill, and virtualenv, and the issue persists.
I also reinstalled my IPython kernel:
python3 -m pip install ipykernel
python3 -m ipykernel install --user
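In case it helps with the diagnosis, here is a small sketch I can run to see which interpreter each registered kernelspec points at; my assumption (unconfirmed) is that a stale spec still references the deleted udemy_tensorflow_venv:
# print every registered kernelspec and the interpreter it would launch
from jupyter_client.kernelspec import KernelSpecManager

for name, info in KernelSpecManager().get_all_specs().items():
    argv = info["spec"]["argv"]
    print("{:<20} -> {}".format(name, argv[0]))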
I'm a beginner in streaming and am trying to work on a predictive-maintenance use case with Structured Streaming, but I'm getting an error while predicting on the (CSV) DataStream.
The following function is called, and if I remove the prediction part it does not return any error; the model is preloaded and the DataStreamReader is working too:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler, VectorIndexer


def process_row(row):
    """Fit and preprocess."""
    list_feat_col_num = [item[0] for item in row.dtypes
                         if item[1].startswith('int') or item[1].startswith('double')]
    vec_assembler = VectorAssembler(inputCols=list_feat_col_num, outputCol="features")
    row_transformed = vec_assembler.transform(row).select(
        'machineID', 'datetime', 'failure', 'features')
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   handleInvalid="skip",
                                   maxCategories=10).fit(row_transformed)
    print(row_transformed)

    # predict - the error comes from this part
    rf = RandomForestClassificationModel.load("content/model")
    pipeline_rf_pred = Pipeline(stages=[featureIndexer, rf])
    model_rf = pipeline_rf_pred.fit(row_transformed)
    prediction = model_rf.transform(row_transformed)
    print(prediction)
Traceback (most recent call last):
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py", line 590, in dumps
return cloudpickle.dumps(obj, 2)
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 863, in dumps
cp.dump(obj)
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 260, in dump
return Pickler.dump(self, obj)
File "/usr/lib/python3.6/pickle.py", line 409, in dump
self.save(obj)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 751, in save_tuple
save(element)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 406, in save_function
self.save_function_tuple(obj)
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 549, in save_function_tuple
save(state)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
self._batch_setitems(obj.items())
File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
save(v)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 781, in save_list
self._batch_appends(obj)
File "/usr/lib/python3.6/pickle.py", line 808, in _batch_appends
save(tmp[0])
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 400, in save_function
self.save_function_tuple(obj)
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 549, in save_function_tuple
save(state)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
self._batch_setitems(obj.items())
File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
save(v)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
self._batch_setitems(obj.items())
File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
save(v)
File "/usr/lib/python3.6/pickle.py", line 521, in save
self.save_reduce(obj=obj, *rv)
File "/usr/lib/python3.6/pickle.py", line 634, in save_reduce
save(state)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
self._batch_setitems(obj.items())
File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
save(v)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 781, in save_list
self._batch_appends(obj)
File "/usr/lib/python3.6/pickle.py", line 805, in _batch_appends
save(x)
File "/usr/lib/python3.6/pickle.py", line 521, in save
self.save_reduce(obj=obj, *rv)
File "/usr/lib/python3.6/pickle.py", line 634, in save_reduce
save(state)
File "/usr/lib/python3.6/pickle.py", line 476, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python3.6/pickle.py", line 821, in save_dict
self._batch_setitems(obj.items())
File "/usr/lib/python3.6/pickle.py", line 847, in _batch_setitems
save(v)
File "/usr/lib/python3.6/pickle.py", line 496, in save
rv = reduce(self.proto)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/content/spark-2.4.3-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 332, in get_return_value
format(target_id, ".", name, value))
py4j.protocol.Py4JError: An error occurred while calling o281.__getstate__. Trace:
py4j.Py4JException: Method __getstate__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py in dumps(self, obj)
589 try:
--> 590 return cloudpickle.dumps(obj, 2)
591 except pickle.PickleError:
44 frames
Py4JError: An error occurred while calling o281.__getstate__. Trace:
py4j.Py4JException: Method __getstate__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
During handling of the above exception, another exception occurred:
PicklingError Traceback (most recent call last)
/content/spark-2.4.3-bin-hadoop2.7/python/pyspark/serializers.py in dumps(self, obj)
598 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
599 cloudpickle.print_exec(sys.stderr)
--> 600 raise pickle.PicklingError(msg)
601
602
PicklingError: Could not serialize object: Py4JError: An error occurred while calling o281.__getstate__. Trace:
py4j.Py4JException: Method __getstate__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
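For comparison, here is a minimal foreachBatch sketch that keeps the prediction on DataFrames, so the JVM-backed model never has to be pickled into a Python closure. The name stream_df, the feature columns, and the output/checkpoint paths are assumptions, and the feature preparation would need to match whatever the model was trained on; this is a sketch, not a confirmed fix:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler

# assumed feature columns; in practice, reuse the preparation the model was trained with
vec_assembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
rf = RandomForestClassificationModel.load("content/model")

def predict_batch(batch_df, batch_id):
    # batch_df is an ordinary (non-streaming) DataFrame; transform() runs on the JVM,
    # so nothing py4j-backed gets pickled
    prepared = vec_assembler.transform(batch_df)
    rf.transform(prepared).write.mode("append").parquet("content/predictions")

query = (stream_df.writeStream                 # stream_df: the CSV DataStream (assumed name)
         .foreachBatch(predict_batch)
         .option("checkpointLocation", "content/checkpoint")
         .start())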
We're using Dataflow to run a set of operations over Python objects. Because our objects aren't picklable, we're using the DillCoder, which generally works well. We can either override the _fallback_coder of the coder registry or supply the coder to the Reader or Writer; either works.
However, when our objects need to be shuffled, we get an error from the PickleCoder that our object can't be pickled.
I think that the Dataflow Worker Shuffler doesn't take into account the coder registry and instead always uses the PickleCoder. Is that right? Is there a way to override that?
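For reference, this is the kind of per-type registration we could switch to instead of overriding the fallback; the class name is hypothetical, and whether Dataflow's shuffle actually honors it is exactly what I'm asking:
import apache_beam as beam
from apache_beam.coders.coders import DillCoder

class MyObject(object):
    """Hypothetical stand-in for our unpicklable objects."""

# register DillCoder for the concrete type, so any step carrying a
# Tuple[str, MyObject] type hint (including the GroupByKey inputs) can
# resolve a coder without hitting the registry's fallback
beam.coders.registry.register_coder(MyObject, DillCoder)

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['a', 'b'])
         | beam.Map(lambda k: (k, MyObject())).with_output_types(
             beam.typehints.Tuple[str, MyObject])
         | beam.GroupByKey())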
Full stack trace below. Job-ID is 2018-01-25_15_21_50-3249437741466877997.
Thanks
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 582, in do_work
work_executor.execute()
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py", line 167, in execute
op.start()
File "dataflow_worker/shuffle_operations.py", line 49, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
def start(self):
File "dataflow_worker/shuffle_operations.py", line 50, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
with self.scoped_start_state:
File "dataflow_worker/shuffle_operations.py", line 65, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
with self.shuffle_source.reader() as reader:
File "dataflow_worker/shuffle_operations.py", line 69, in dataflow_worker.shuffle_operations.GroupedShuffleReadOperation.start
self.output(windowed_value)
File "apache_beam/runners/worker/operations.py", line 154, in apache_beam.runners.worker.operations.Operation.output
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "dataflow_worker/shuffle_operations.py", line 233, in dataflow_worker.shuffle_operations.BatchGroupAlsoByWindowsOperation.process
self.output(wvalue.with_value((k, wvalue.value)))
File "apache_beam/runners/worker/operations.py", line 154, in apache_beam.runners.worker.operations.Operation.output
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive
self.process(windowed_value)
File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 415, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise
File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 281, in apache_beam.runners.common.PerWindowInvoker.invoke_process
self._invoke_per_window(windowed_value)
File "apache_beam/runners/common.py", line 306, in apache_beam.runners.common.PerWindowInvoker._invoke_per_window
self.output_processor.process_outputs(
File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 86, in apache_beam.runners.worker.operations.ConsumerSet.receive
cython.cast(Operation, consumer).process(windowed_value)
File "apache_beam/runners/worker/operations.py", line 339, in apache_beam.runners.worker.operations.DoOperation.process
with self.scoped_process_state:
File "apache_beam/runners/worker/operations.py", line 340, in apache_beam.runners.worker.operations.DoOperation.process
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 382, in apache_beam.runners.common.DoFnRunner.receive
self.process(windowed_value)
File "apache_beam/runners/common.py", line 390, in apache_beam.runners.common.DoFnRunner.process
self._reraise_augmented(exn)
File "apache_beam/runners/common.py", line 431, in apache_beam.runners.common.DoFnRunner._reraise_augmented
raise new_exn, None, original_traceback
File "apache_beam/runners/common.py", line 388, in apache_beam.runners.common.DoFnRunner.process
self.do_fn_invoker.invoke_process(windowed_value)
File "apache_beam/runners/common.py", line 189, in apache_beam.runners.common.SimpleInvoker.invoke_process
self.output_processor.process_outputs(
File "apache_beam/runners/common.py", line 480, in apache_beam.runners.common._OutputProcessor.process_outputs
self.main_receivers.receive(windowed_value)
File "apache_beam/runners/worker/operations.py", line 84, in apache_beam.runners.worker.operations.ConsumerSet.receive
self.update_counters_start(windowed_value)
File "apache_beam/runners/worker/operations.py", line 90, in apache_beam.runners.worker.operations.ConsumerSet.update_counters_start
self.opcounter.update_from(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 63, in apache_beam.runners.worker.opcounters.OperationCounters.update_from
self.do_sample(windowed_value)
File "apache_beam/runners/worker/opcounters.py", line 81, in apache_beam.runners.worker.opcounters.OperationCounters.do_sample
self.coder_impl.get_estimated_size_and_observables(windowed_value))
File "apache_beam/coders/coder_impl.py", line 730, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
def get_estimated_size_and_observables(self, value, nested=False):
File "apache_beam/coders/coder_impl.py", line 739, in apache_beam.coders.coder_impl.WindowedValueCoderImpl.get_estimated_size_and_observables
self._value_coder.get_estimated_size_and_observables(
File "apache_beam/coders/coder_impl.py", line 260, in apache_beam.coders.coder_impl.FastPrimitivesCoderImpl.get_estimated_size_and_observables
self.encode_to_stream(value, out, nested)
File "apache_beam/coders/coder_impl.py", line 298, in apache_beam.coders.coder_impl.FastPrimitivesCoderImpl.encode_to_stream
self.fallback_coder_impl.encode_to_stream(value, stream, nested)
File "apache_beam/coders/coder_impl.py", line 154, in apache_beam.coders.coder_impl.CallbackCoderImpl.encode_to_stream
return stream.write(self._encoder(value), nested)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/coders/coders.py", line 497, in <lambda>
lambda x: dumps(x, HIGHEST_PROTOCOL), pickle.loads)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed [while running 'run reversion/FlatMap(run_reversion)']
EDIT:
Here's a reproducible example. Job-ID was 2018-01-30_11_53_22-14709945294406059840
import zlib
import argparse
from itertools import chain

import apache_beam as beam
import apache_beam.coders.coders as coders
from apache_beam.coders.coders import (DillCoder, coder_impl, maybe_dill_dumps,
                                        maybe_dill_loads)
from apache_beam.coders.typecoders import CoderRegistry, FirstOf
from apache_beam.options.pipeline_options import PipelineOptions


class ObjectCoder(DillCoder):
    """
    Coder that allows multi-line pickles to be read, and compresses the output.

    After an object is pickled, the bytes are encoded as `unicode_escape`,
    meaning newline characters (`\n`) aren't in the string.
    # but now that we're compressing, do we still need the newline handling?

    Previously, the presence of newline characters confused the Dataflow
    reader, as it couldn't discriminate between a new object and a new line
    within a pickle string.
    """

    def _create_impl(self):
        return coder_impl.CallbackCoderImpl(
            dill_compress_dumps, dill_compress_loads)


def dill_compress_dumps(stream):
    # in Py3 this needs to be `unicode_escape`
    return zlib.compress(maybe_dill_dumps(stream)).encode('string_escape')


def dill_compress_loads(stream):
    # in Py3 this needs to be `unicode_escape`
    decoded = stream.decode('string_escape')
    try:
        decompressed = zlib.decompress(decoded)
    except zlib.error:
        decompressed = stream
    return maybe_dill_loads(decompressed)


def add_obj_to_coder_registry(coder_registry):
    """
    Add Dill to the middle of the coder registry - after the deterministic
    coders but before the Pickle coder.

    Use like: `add_obj_to_coder_registry(beam.coders.registry)`
    """
    assert isinstance(coder_registry, CoderRegistry)
    primitives_coder = coders.FastPrimitivesCoder(fallback_coder=ObjectCoder())
    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/coders/typecoders.py#L93  # noqa
    fallback_coders = [coders.ProtoCoder, primitives_coder]
    coder_registry._fallback_coder = FirstOf(fallback_coders)


known_args, unknown_args = argparse.ArgumentParser().parse_known_args()

default_options = dict(
    runner='DataflowRunner',
    project='PROJECT_ID',
    temp_location='gs://BUCKET_NAME/dataflow/temp/',
    staging_location='gs://BUCKET_NAME/dataflow/staging/',
    max_num_workers='100')

# from https://github.com/apache/incubator-airflow/blob/master/airflow/contrib/hooks/gcp_dataflow_hook.py  # noqa
default_options_args = ['--{}={}'.format(attr, value)
                        for attr, value in default_options.items()]
default_args = chain(
    default_options_args,
    ['--save_main_session'])
args = chain(unknown_args, default_args)

add_obj_to_coder_registry(beam.coders.typecoders.registry)


def produce_unpicklable():
    def double(x):
        return x * 2
    return double


p = beam.Pipeline(options=PipelineOptions(list(args)))
items = p | beam.Create([x for x in range(10)])
unpickleable = items | beam.Map(lambda x: (x, produce_unpicklable()))
shuffle = unpickleable | 'use shuffler' >> beam.GroupByKey()
r = p.run()