I've been trying to deploy a pipeline on Google Cloud Dataflow. It's been quite a challenge so far.
I'm facing an import issue: ParDo functions need the dependencies listed in requirements.txt to be available on the workers, otherwise they fail saying the required module can't be found. https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/
So I tried fixing the problem by passing in the requirements.txt file, only to be met with a very incomprehensible error message.
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from apache_beam.runners import DataflowRunner
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
import google.auth
from google.cloud.bigtable.row import DirectRow
import datetime
# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=[])
# Sets the project to the default project in your current Google Cloud environment.
_, options.view_as(GoogleCloudOptions).project = google.auth.default()
# Sets the Google Cloud Region in which Cloud Dataflow runs.
options.view_as(GoogleCloudOptions).region = 'us-central1'
# IMPORTANT! Adjust the following to choose a Cloud Storage location.
dataflow_gcs_location = 'gs://tunnel-insight-2-0-dev-291100/dataflow'
# Dataflow Staging Location. This location is used to stage the Dataflow Pipeline and SDK binary.
options.view_as(GoogleCloudOptions).staging_location = '%s/staging' % dataflow_gcs_location
# Sets the pipeline mode to streaming, so we can stream the data from PubSub.
options.view_as(pipeline_options.StandardOptions).streaming = True
# Sets the requirements.txt file
options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt"
# Dataflow Temp Location. This location is used to store temporary files or intermediate results before finally outputting to the sink.
options.view_as(GoogleCloudOptions).temp_location = '%s/temp' % dataflow_gcs_location
# The directory to store the output files of the job.
output_gcs_location = '%s/output' % dataflow_gcs_location
ib.options.recording_duration = '1m'
...
...
pipeline_result = DataflowRunner().run_pipeline(p, options=options)
I've tried to pass the requirements using options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt".
I get this error:
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/processes.py in check_output(*args, **kwargs)
90 try:
---> 91 out = subprocess.check_output(*args, **kwargs)
92 except OSError:
/opt/conda/lib/python3.7/subprocess.py in check_output(timeout, *popenargs, **kwargs)
410 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 411 **kwargs).stdout
412
/opt/conda/lib/python3.7/subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
511 raise CalledProcessError(retcode, process.args,
--> 512 output=stdout, stderr=stderr)
513 return CompletedProcess(process.args, retcode, stdout, stderr)
CalledProcessError: Command '['/root/apache-beam-custom/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/dataflow-requirements-cache', '-r', 'requirements.txt', '--exists-action', 'i', '--no-binary', ':all:']' returned non-zero exit status 1.
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-12-f018e5c84d08> in <module>
----> 1 pipeline_result = DataflowRunner().run_pipeline(p, options=options)
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py in run_pipeline(self, pipeline, options)
491 environments.DockerEnvironment.from_container_image(
492 apiclient.get_container_image_from_options(options),
--> 493 artifacts=environments.python_sdk_dependencies(options)))
494
495 # This has to be performed before pipeline proto is constructed to make sure
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/transforms/environments.py in python_sdk_dependencies(options, tmp_dir)
624 options,
625 tmp_dir,
--> 626 skip_prestaged_dependencies=skip_prestaged_dependencies))
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/runners/portability/stager.py in create_job_resources(options, temp_dir, build_setup_args, populate_requirements_cache, skip_prestaged_dependencies)
178 populate_requirements_cache if populate_requirements_cache else
179 Stager._populate_requirements_cache)(
--> 180 setup_options.requirements_file, requirements_cache_path)
181 for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
182 resources.append((pkg, os.path.basename(pkg)))
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/retry.py in wrapper(*args, **kwargs)
234 while True:
235 try:
--> 236 return fun(*args, **kwargs)
237 except Exception as exn: # pylint: disable=broad-except
238 if not retry_filter(exn):
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/runners/portability/stager.py in _populate_requirements_cache(requirements_file, cache_dir)
569 ]
570 _LOGGER.info('Executing command: %s', cmd_args)
--> 571 processes.check_output(cmd_args, stderr=processes.STDOUT)
572
573 @staticmethod
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/processes.py in check_output(*args, **kwargs)
97 "Full traceback: {} \n Pip install failed for package: {} \
98 \n Output from execution of subprocess: {}" \
---> 99 .format(traceback.format_exc(), args[0][6], error.output))
100 else:
101 raise RuntimeError("Full trace: {}, \
RuntimeError: Full traceback: Traceback (most recent call last):
File "/root/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/processes.py", line 91, in check_output
out = subprocess.check_output(*args, **kwargs)
File "/opt/conda/lib/python3.7/subprocess.py", line 411, in check_output
**kwargs).stdout
File "/opt/conda/lib/python3.7/subprocess.py", line 512, in run
output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '['/root/apache-beam-custom/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/dataflow-requirements-cache', '-r', 'requirements.txt', '--exists-action', 'i', '--no-binary', ':all:']' returned non-zero exit status 1.
Pip install failed for package: -r
Output from execution of subprocess: b'Obtaining file:///root/apache-beam-custom/packages/beam/sdks/python (from -r requirements.txt (line 3))\n Saved /tmp/dataflow-requirements-cache/apache-beam-2.25.0.zip\nCollecting absl-py==0.11.0\n Downloading absl-py-0.11.0.tar.gz (110 kB)\n Saved /tmp/dataflow-requirements-cache/absl-py-0.11.0.tar.gz\nCollecting argon2-cffi==20.1.0\n Downloading argon2-cffi-20.1.0.tar.gz (1.8 MB)\n Installing build dependencies: started\n Installing build dependencies: finished with status \'error\'\n ERROR: Command errored out with exit status 1:\n command: /root/apache-beam-custom/bin/python /root/apache-beam-custom/lib/python3.7/site-packages/pip install --ignore-installed --no-user --prefix /tmp/pip-build-env-3iuiaex9/overlay --no-warn-script-location --no-binary :all: --only-binary :none: -i https://pypi.org/simple -- \'setuptools>=40.6.0\' wheel \'cffi>=1.0\'\n cwd: None\n Complete output (85 lines):\n Collecting setuptools>=40.6.0\n Downloading setuptools-51.1.1.tar.gz (2.1 MB)\n Collecting wheel\n Downloading wheel-0.36.2.tar.gz (65 kB)\n Collecting cffi>=1.0\n Downloading cffi-1.14.4.tar.gz (471 kB)\n Collecting pycparser\n Downloading pycparser-2.20.tar.gz (161 kB)\n Skipping wheel build for setuptools, due to binaries being disabled for it.\n Skipping wheel build for wheel, due to binaries being disabled for it.\n Skipping wheel build for cffi, due to binaries being disabled for it.\n Skipping wheel build for pycparser, due to binaries being disabled for it.\n Installing collected packages: setuptools, wheel, pycparser, cffi\n Running setup.py install for setuptools: started\n Running setup.py install for setuptools: finished with status \'done\'\n Running setup.py install for wheel: started\n Running setup.py install for wheel: finished with status \'done\'\n Running setup.py install for pycparser: started\n Running setup.py install for pycparser: finished with status \'done\'\n Running setup.py install for cffi: started\n Running setup.py install for cffi: finished with status \'error\'\n ERROR: Command errored out with exit status 1:\n command: /root/apache-beam-custom/bin/python -u -c \'import sys, setuptools, tokenize; sys.argv[0] = \'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\'; __file__=\'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\';f=getattr(tokenize, \'"\'"\'open\'"\'"\', open)(__file__);code=f.read().replace(\'"\'"\'\\r\\n\'"\'"\', \'"\'"\'\\n\'"\'"\');f.close();exec(compile(code, __file__, \'"\'"\'exec\'"\'"\'))\' install --record /tmp/pip-record-z8o69lka/install-record.txt --single-version-externally-managed --prefix /tmp/pip-build-env-3iuiaex9/overlay --compile --install-headers /root/apache-beam-custom/include/site/python3.7/cffi\n cwd: /tmp/pip-install-6zs5jguv/cffi/\n Complete output (56 lines):\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the 
PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n running install\n running build\n running build_py\n creating build\n creating build/lib.linux-x86_64-3.7\n creating build/lib.linux-x86_64-3.7/cffi\n copying cffi/setuptools_ext.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/pkgconfig.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/verifier.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/vengine_gen.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/backend_ctypes.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/__init__.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/cffi_opcode.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/error.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/api.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/commontypes.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/ffiplatform.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/lock.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/cparser.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/recompiler.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/vengine_cpy.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/model.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/_cffi_include.h -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/parse_c_type.h -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/_embedding.h -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/_cffi_errors.h -> build/lib.linux-x86_64-3.7/cffi\n running build_ext\n building \'_cffi_backend\' extension\n creating build/temp.linux-x86_64-3.7\n creating build/temp.linux-x86_64-3.7/c\n gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DUSE__THREAD -DHAVE_SYNC_SYNCHRONIZE -I/usr/include/ffi -I/usr/include/libffi -I/root/apache-beam-custom/include -I/opt/conda/include/python3.7m -c c/_cffi_backend.c -o build/temp.linux-x86_64-3.7/c/_cffi_backend.o\n c/_cffi_backend.c:15:10: fatal error: ffi.h: No such file or directory\n #include <ffi.h>\n ^~~~~~~\n compilation terminated.\n error: command \'gcc\' failed with exit status 1\n ----------------------------------------\n ERROR: Command errored out with exit status 1: /root/apache-beam-custom/bin/python -u -c \'import sys, setuptools, tokenize; sys.argv[0] = \'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\'; __file__=\'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\';f=getattr(tokenize, \'"\'"\'open\'"\'"\', open)(__file__);code=f.read().replace(\'"\'"\'\\r\\n\'"\'"\', \'"\'"\'\\n\'"\'"\');f.close();exec(compile(code, __file__, \'"\'"\'exec\'"\'"\'))\' install --record /tmp/pip-record-z8o69lka/install-record.txt --single-version-externally-managed --prefix /tmp/pip-build-env-3iuiaex9/overlay --compile --install-headers /root/apache-beam-custom/include/site/python3.7/cffi Check the logs for full command output.\n WARNING: You are using pip version 20.1.1; however, version 20.3.3 is available.\n You should consider upgrading via the \'/root/apache-beam-custom/bin/python -m pip install --upgrade pip\' command.\n ----------------------------------------\nERROR: Command errored out with exit status 1: /root/apache-beam-custom/bin/python /root/apache-beam-custom/lib/python3.7/site-packages/pip install --ignore-installed --no-user --prefix 
/tmp/pip-build-env-3iuiaex9/overlay --no-warn-script-location --no-binary :all: --only-binary :none: -i https://pypi.org/simple -- \'setuptools>=40.6.0\' wheel \'cffi>=1.0\' Check the logs for full command output.\nWARNING: You are using pip version 20.1.1; however, version 20.3.3 is available.\nYou should consider upgrading via the \'/root/apache-beam-custom/bin/python -m pip install --upgrade pip\' command.\n'
Did I do something wrong?
-------------- EDIT---------------------------------------
OK, I've got my pipeline to work, but I'm still having a problem with my requirements.txt file, which I believe I'm passing in correctly.
My pipeline code:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from apache_beam.runners import DataflowRunner
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
import google.auth
from google.cloud.bigtable.row import DirectRow
import datetime
# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=[])
# Sets the project to the default project in your current Google Cloud environment.
_, options.view_as(GoogleCloudOptions).project = google.auth.default()
# Sets the Google Cloud Region in which Cloud Dataflow runs.
options.view_as(GoogleCloudOptions).region = 'us-central1'
# IMPORTANT! Adjust the following to choose a Cloud Storage location.
dataflow_gcs_location = ''
# Dataflow Staging Location. This location is used to stage the Dataflow Pipeline and SDK binary.
options.view_as(GoogleCloudOptions).staging_location = '%s/staging' % dataflow_gcs_location
# Sets the pipeline mode to streaming, so we can stream the data from PubSub.
options.view_as(pipeline_options.StandardOptions).streaming = True
# Sets the requirements.txt file
options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt"
# Dataflow Temp Location. This location is used to store temporary files or intermediate results before finally outputting to the sink.
options.view_as(GoogleCloudOptions).temp_location = '%s/temp' % dataflow_gcs_location
# The directory to store the output files of the job.
output_gcs_location = '%s/output' % dataflow_gcs_location
ib.options.recording_duration = '1m'
# The Google Cloud PubSub topic for this example.
topic = ""
subscription = ""
output_topic = ""
# Info
project_id = ""
bigtable_instance = ""
bigtable_table_id = ""
class CreateRowFn(beam.DoFn):
    def process(self, words):
        from google.cloud.bigtable.row import DirectRow
        import datetime
        direct_row = DirectRow(row_key="phone#4c410523#20190501")
        direct_row.set_cell(
            "stats_summary",
            b"os_build",
            b"android",
            datetime.datetime.now())
        return [direct_row]

p = beam.Pipeline(InteractiveRunner(), options=options)
words = p | "read" >> beam.io.ReadFromPubSub(subscription=subscription)
windowed_words = (words | "window" >> beam.WindowInto(beam.window.FixedWindows(10)))

# Writing to BigTable
test = words | beam.ParDo(CreateRowFn()) | WriteToBigTable(
    project_id=project_id,
    instance_id=bigtable_instance,
    table_id=bigtable_table_id)

pipeline_result = DataflowRunner().run_pipeline(p, options=options)
As you can see in "CreateRowFn", I need to import
from google.cloud.bigtable.row import DirectRow
import datetime
Only then does this work.
I've passed in requirements.txt as options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt" and I can see it in the Dataflow console.
If I remove the import statements, I get "in process NameError: name 'DirectRow' is not defined".
Is there any way to overcome this?
I've found the answer in the FAQ. My mistake was not about how to pass in requirements.txt, but about how to handle NameErrors:
https://cloud.google.com/dataflow/docs/resources/faq
How do I handle NameErrors?
If you're getting a NameError when you execute your pipeline using the Dataflow service but not when you execute locally (i.e. using the DirectRunner), your DoFns may be using values in the global namespace that are not available on the Dataflow worker.
By default, global imports, functions, and variables defined in the main session are not saved during the serialization of a Dataflow job. If, for example, your DoFns are defined in the main file and reference imports and functions in the global namespace, you can set the --save_main_session pipeline option to True. This will cause the state of the global namespace to be pickled and loaded on the Dataflow worker.
Notice that if you have objects in your global namespace that cannot be pickled, you will get a pickling error. If the error is regarding a module that should be available in the Python distribution, you can solve this by importing the module locally, where it is used.
For example, instead of:
import re
…
def myfunc():
    # use re module

use:

def myfunc():
    import re
    # use re module
Alternatively, if your DoFns span multiple files, you should use a different approach to packaging your workflow and managing dependencies.
So the conclusion is:
It is OK to use import statements within the functions.
Google Dataflow workers already have these packages installed: https://cloud.google.com/dataflow/docs/concepts/sdk-worker-dependencies.
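For reference, the alternative mentioned in the FAQ is a one-liner in the notebook-style code above (a minimal sketch, assuming the options object created earlier):
# Pickle the main session so global imports and helpers are available on the workers
options.view_as(pipeline_options.SetupOptions).save_main_session = True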
If you are running it from Cloud Composer, you need to add the new packages to the environment's PyPI packages list.
You can also pass --requirements_file path/to/requirements.txt as a flag in the command while running it.
I prefer to use the --setup_file path/to/setup.py flag instead. The format of the setup file is as follows:
import setuptools

REQUIRED_PACKAGES = [
    'joblib==0.15.1',
    'numpy==1.18.5',
    'google',
    'google-cloud',
    'google-cloud-storage',
    'cassandra-driver==3.22.0'
]

PACKAGE_NAME = 'my_package'
PACKAGE_VERSION = '0.0.1'

setuptools.setup(
    name=PACKAGE_NAME,
    version=PACKAGE_VERSION,
    description='Search Rank project',
    install_requires=REQUIRED_PACKAGES,
    author="Mohd Faisal",
    packages=setuptools.find_packages()
)
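For completeness, the setup file can also be wired in from code instead of the command line; a minimal sketch, assuming a PipelineOptions object like the one in the script below:
from apache_beam.options.pipeline_options import SetupOptions

# Programmatic equivalent of passing --setup_file on the command line
pipeline_options.view_as(SetupOptions).setup_file = './setup.py'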
Use the format below for the Dataflow script:
from __future__ import absolute_import

import argparse
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  SetupOptions,
                                                  StandardOptions,
                                                  WorkerOptions)
from datetime import date


class Userprocess(beam.DoFn):

    def process(self, msg):
        yield "OK"


def run(argv=None):
    logging.info("Parsing dataflow flags... ")
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--project',
        required=True,
        help=(
            'project id staging or production '))
    parser.add_argument(
        '--temp_location',
        required=True,
        help=(
            'temp location'))
    parser.add_argument(
        '--job_name',
        required=True,
        help=(
            'job name'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    today = date.today()
    logging.info("Processing Date is " + str(today))

    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = known_args.project
    google_cloud_options.job_name = known_args.job_name
    google_cloud_options.temp_location = known_args.temp_location
    # pipeline_options.view_as(StandardOptions).runner = known_args.runner

    with beam.Pipeline(argv=pipeline_args, options=pipeline_options) as p:
        beam.ParDo(Userprocess())


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    logging.info("Starting dataflow daily pipeline ")
    try:
        run()
    except:
        pass
Try running the script locally first to catch errors.
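For example (hypothetical script name and placeholder values; --runner=DirectRunner keeps the run local):
python dataflow_daily_pipeline.py --project=my-project --job_name=test-job --temp_location=gs://my-bucket/temp --runner=DirectRunner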
Related
I am using Docker with Ubuntu.
I have installed the full dataset (dm-code_contests) to the /tmp folder and cloned the git repository into the /home folder (the repository is code_contests). When I try to run bazel run -c opt \ :print_names_and_sources /tmp/dm-code_contests/code_contests_valid.riegeli (in the /home/code_contests folder), it shows this error:
Starting local Bazel server and connecting to it...
INFO: Repository local_config_python instantiated at:
/home/code_contests/WORKSPACE:12:10: in <toplevel>
/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/bazel/grpc_deps.bzl:414:21: in grpc_deps
/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/bazel/grpc_python_deps.bzl:43:21: in grpc_python_deps
Repository rule python_configure defined at:
/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl:365:35: in <toplevel>
ERROR: An error occurred during the fetch of repository 'local_config_python':
Traceback (most recent call last):
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 355, column 35, in _python_autoconf_impl
_create_single_version_package(
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 304, column 45, in _create_single_version_package
python_include = _get_python_include(repository_ctx, python_bin)
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 236, column 22, in _get_python_include
result = _execute(
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 62, column 14, in _execute
_fail("\n".join([
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 35, column 9, in _fail
fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
Error in fail: Python Configuration Error: Problem getting python include path for /usr/bin/python3.
<string>:1: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives
<string>:1: DeprecationWarning: The distutils.sysconfig module is deprecated, use sysconfig instead
Is the Python binary path set up right? (See ./configure or /usr/bin/python3.) Is distutils installed? Are Python headers installed? Try installing python-dev or python3-dev on Debian-based systems. Try python-devel or python3-devel on Redhat-based systems.
ERROR: /home/code_contests/WORKSPACE:12:10: fetching python_configure rule //external:local_config_python: Traceback (most recent call last):
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 355, column 35, in _python_autoconf_impl
_create_single_version_package(
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 304, column 45, in _create_single_version_package
python_include = _get_python_include(repository_ctx, python_bin)
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 236, column 22, in _get_python_include
result = _execute(
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 62, column 14, in _execute
_fail("\n".join([
File "/root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_github_grpc_grpc/third_party/py/python_configure.bzl", line 35, column 9, in _fail
fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
Error in fail: Python Configuration Error: Problem getting python include path for /usr/bin/python3.
<string>:1: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives
<string>:1: DeprecationWarning: The distutils.sysconfig module is deprecated, use sysconfig instead
Is the Python binary path set up right? (See ./configure or /usr/bin/python3.) Is distutils installed? Are Python headers installed? Try installing python-dev or python3-dev on Debian-based systems. Try python-devel or python3-devel on Redhat-based systems.
ERROR: /root/.cache/bazel/_bazel_root/24a36d3f089e715b642fd688d4461183/external/com_google_riegeli/python/riegeli/records/BUILD:8:13: @com_google_riegeli//python/riegeli/records:record_writer_cc depends on @local_config_python//:python_headers in repository @local_config_python which failed to fetch. no such package '@local_config_python//': Python Configuration Error: Problem getting python include path for /usr/bin/python3.
<string>:1: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives
<string>:1: DeprecationWarning: The distutils.sysconfig module is deprecated, use sysconfig instead
Is the Python binary path set up right? (See ./configure or /usr/bin/python3.) Is distutils installed? Are Python headers installed? Try installing python-dev or python3-dev on Debian-based systems. Try python-devel or python3-devel on Redhat-based systems.
ERROR: Analysis of target '//:print_names_and_sources' failed; build aborted:
INFO: Elapsed time: 3.701s
INFO: 0 processes.
FAILED: Build did NOT complete successfully (49 packages loaded, 348 targets configured)
FAILED: Build did NOT complete successfully (49 packages loaded, 348 targets configured)
Fetching @com_google_absl; Cloning tags/20211102.0 of https://github.com/abseil/abseil-cpp.git
root@c89a94de94ce:/home/code_contests# bazel run -c opt \ :print_names_and_sources /tmp/dm-code_contests/code_contests_valid.riegeli
ERROR: Skipping ' :print_names_and_sources': no such package ' ': BUILD file not found in any of the following directories. Add a BUILD file to a directory to mark it as a package.
- /home/code_contests/
WARNING: Target pattern parsing failed.
ERROR: no such package ' ': BUILD file not found in any of the following directories. Add a BUILD file to a directory to mark it as a package.
- /home/code_contests/
INFO: Elapsed time: 0.174s
INFO: 0 processes.
FAILED: Build did NOT complete successfully (0 packages loaded)
FAILED: Build did NOT complete successfully (0 packages loaded)
I'm new to Ubuntu (as well as Bazel). How can I fix this error and run the project?
link to the source code: https://github.com/deepmind/code_contests
You should make sure your gcc is the latest version.
For Python 2, set things up like this:
# Remove bazel and reinstall
bazel clean --expunge
rm -rf ~/.cache/bazel
To re-install, follow this instruction.
# Install python2 dependency
sudo apt update && sudo apt install python-dev
For a detailed explanation, kindly refer to this document. Thank you!
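Note that the error above points at /usr/bin/python3, so if you are building with Python 3 the missing piece may be the Python 3 headers instead; a hedged variant of the same command:
# Install python3 dependency
sudo apt update && sudo apt install python3-dev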
I am following the instructions here: https://drake.mit.edu/from_source.html. I already ran
./setup/mac/install_prereqs.sh
in my python virtualenv (drake-venv) and it succeeded. I then managed to build and run the inclined plane example with Bazel. But trying to build some of the other examples results in errors involving YAML like this:
(drake-venv) benq:acrobot % bazel build acrobot_input --subcommands --verbose_failures --sandbox_debug
INFO: Analyzed target //examples/acrobot:acrobot_input (0 packages loaded, 0 targets configured).
INFO: Found 1 target...
SUBCOMMAND: # //examples/acrobot:acrobot_input_codegen [action 'Action examples/acrobot/gen/acrobot_input.cc', configuration: f8bba554e4e3784a5a24e83c682b75e9b6104059526c94f74d854527a53436a6, execution platform: @local_config_platform//:host]
(cd /private/var/tmp/_bazel_benq/a35a7fa5c4830c980dbc52ab349cb0bc/execroot/drake && \
exec env - \
bazel-out/host/bin/tools/vector_gen/lcm_vector_gen '--src=examples/acrobot/acrobot_input_named_vector.yaml' '--out=bazel-out/darwin-opt/bin/examples/acrobot/gen/acrobot_input.cc' '--out=bazel-out/darwin-opt/bin/examples/acrobot/gen/acrobot_input.h' '--include_prefix=drake')
# Configuration: f8bba554e4e3784a5a24e83c682b75e9b6104059526c94f74d854527a53436a6
# Execution platform: @local_config_platform//:host
ERROR: /Users/benq/Documents/drake/examples/acrobot/BUILD.bazel:30:28: Action examples/acrobot/gen/acrobot_input.cc failed: (Exit 1): sandbox-exec failed: error executing command
(cd /private/var/tmp/_bazel_benq/a35a7fa5c4830c980dbc52ab349cb0bc/sandbox/darwin-sandbox/176/execroot/drake && \
exec env - \
TMPDIR=/var/folders/s0/tfqtn2s54135x0qzt5kxnzs00000gn/T/ \
/usr/bin/sandbox-exec -f /private/var/tmp/_bazel_benq/a35a7fa5c4830c980dbc52ab349cb0bc/sandbox/darwin-sandbox/176/sandbox.sb /var/tmp/_bazel_benq/install/ebbb2540c6000feeb8873385c487a79c/process-wrapper '--timeout=0' '--kill_delay=15' bazel-out/host/bin/tools/vector_gen/lcm_vector_gen '--src=examples/acrobot/acrobot_input_named_vector.yaml' '--out=bazel-out/darwin-opt/bin/examples/acrobot/gen/acrobot_input.cc' '--out=bazel-out/darwin-opt/bin/examples/acrobot/gen/acrobot_input.h' '--include_prefix=drake')
Traceback (most recent call last):
File "/private/var/tmp/_bazel_benq/a35a7fa5c4830c980dbc52ab349cb0bc/sandbox/darwin-sandbox/176/execroot/drake/bazel-out/host/bin/tools/vector_gen/lcm_vector_gen.runfiles/drake/tools/vector_gen/lcm_vector_gen.py", line 10, in <module>
import yaml
ModuleNotFoundError: No module named 'yaml'
Target //examples/acrobot:acrobot_input failed to build
INFO: Elapsed time: 1.059s, Critical Path: 0.58s
INFO: 5 processes: 5 internal.
FAILED: Build did NOT complete successfully
But I'm not sure why this is happening considering that importing yaml in Terminal works:
(drake-venv) benq:acrobot % which python
/Users/benq/Documents/drake/drake-venv/bin/python
(drake-venv) benq:acrobot % python --version
Python 3.9.10
(drake-venv) benq:acrobot % python -c 'import yaml'
(drake-venv) benq:acrobot %
I've already tried reinstalling PyYaml but that didn't help.
Relevant Info:
Operating System: macOS Monterey (12.3)
Architecture: x86_64
Python: Python 3.9.10
Bazel version:
% which bazel; bazel version
/usr/local/bin/bazel
Build label: 5.0.0-homebrew
Build target: bazel-out/darwin-opt/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar
Build time: Tue Jan 1 00:00:00 1980 (315532800)
Build timestamp: 315532800
Build timestamp as int: 315532800
Bazel C++ compiler: Apple clang version 13.1.6 (clang-1316.0.21.2)
Git revision: 06dd087b40
The lcm_vector_gen in the error message is a code-generation tool that's run as part of the build.
It's probably not obeying your which python, but instead using the hard-coded /usr/local/bin/python3.9 from https://github.com/RobotLocomotion/drake/blob/master/tools/py_toolchain/interpreter_paths.bzl.
We don't run or test our builds within a virtual environment, so you've stumbled into a novel situation.
Possibly editing that bzl file linked above (interpreter_paths.bzl), to point MACOS_I386_INTERPRETER_PATH to your venv python (/Users/benq/Documents/drake/drake-venv/bin/python), would fix the error.
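For illustration, a sketch of what that edit might look like in tools/py_toolchain/interpreter_paths.bzl (the variable name comes from the answer above; check the file for its exact current contents):
# Hypothetical edit: point the macOS toolchain at the venv interpreter
MACOS_I386_INTERPRETER_PATH = "/Users/benq/Documents/drake/drake-venv/bin/python"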
I'm working on a cloud-based robotic application with AWS RoboMaker. I'm using ROS Kinetic, with the build tool colcon.
My robot application depends on a custom Python module, which has to be in my workspace. This Python module is built by colcon as a Python package, not a ROS package. This page explains how to do that with catkin, and this example shows how to adapt it to colcon. So my workspace finally looks like this:
my_workspace/
|--src/
|--my_module/
| |--setup.py
| |--package.xml
| |--subfolders and python scripts...
|--some_ros_pkg1/
|--some_ros_pkg2/
|...
However, the command colcon build <my_workspace> builds all ROS packages but fails to build my Python module as a package.
Here's the error I get:
Starting >>> my-module
[54.297s] WARNING:colcon.colcon_ros.task.ament_python.build:Package 'my-module' doesn't explicitly install a marker in the package index (colcon-ros currently does it implicitly but that fallback will be removed in the future)
[54.298s] WARNING:colcon.colcon_ros.task.ament_python.build:Package 'my-module' doesn't explicitly install the 'package.xml' file (colcon-ros currently does it implicitly but that fallback will be removed in the future)
--- stderr: my-module
usage: setup.py [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
or: setup.py --help [cmd1 cmd2 ...]
or: setup.py --help-commands
or: setup.py cmd --help
error: invalid command 'egg_info'
---
Failed <<< my-module [0.56s, exited with code 1]
I found this issue, which seems related, and thus tried: pip install --upgrade setuptools
...which fails with this error message:
Collecting setuptools
Using cached https://files.pythonhosted.org/packages/7c/1b/9b68465658cda69f33c31c4dbd511ac5648835680ea8de87ce05c81f95bf/setuptools-50.3.0.zip
Complete output from command python setup.py egg_info:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "setuptools/__init__.py", line 16, in <module>
import setuptools.version
File "setuptools/version.py", line 1, in <module>
import pkg_resources
File "pkg_resources/__init__.py", line 1365
raise SyntaxError(e) from e
^
SyntaxError: invalid syntax
----------------------------------------
Command "python setup.py egg_info" failed with error code 1 in /tmp/pip-build-uwFamt/setuptools/
And with pip3 install --upgrade setuptools, I get:
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: setuptools in /home/ubuntu/.local/lib/python3.5/site-packages (50.3.0)
I have both Python 3.5.2 and Python 2.7, but I don't know which one is used by colcon.
So I don't know what to try next, or what the real problem is. Any help is welcome!
I managed to correctly install my package and its dependencies. I describe the method below, in case it helps someone someday!
I was mainly inspired by this old DeepRacer repository.
The workspace tree in the question is wrong. It should look like this:
my_workspace/
|--src/
|--my_wrapper_package/
| |--setup.py
| |--my_package/
| |--__init__.py
| |--subfolders and python scripts...
|--some_ros_pkg1/
|--some_ros_pkg2/
my_wrapper_package may contain more than one custom Python package.
A good setup.py example is this one.
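For illustration, a minimal sketch of what my_wrapper_package/setup.py could look like (placeholder names and dependencies, not the file behind that link):
import setuptools

setuptools.setup(
    name='my_wrapper_package',             # hypothetical package name
    version='0.0.1',
    packages=setuptools.find_packages(),   # picks up my_package/ and its subfolders
    install_requires=['requests'],         # example pip dependency; replace with your own
)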
You shouldn't put a package.xml next to setup.py: colcon will only look at the dependencies declared in package.xml, and won't collect pip packages.
It may sometimes help to delete the my_wrapper_package folders generated by colcon in install/ and build/. Doing so forces colcon to rebuild and bundle from scratch.
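For example, assuming you run this from the workspace root:
rm -rf build/my_wrapper_package install/my_wrapper_package
colcon build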
I am trying to install XGBoost in an Alpine Docker image.
I was receiving this message:
CMake Error at CMakeLists.txt:1 (cmake_minimum_required):
CMake 3.12 or higher is required. You are running version 3.9.5
Then I added the following line before the pip3 install:
RUN apk add cmake>3.12-suffix
# RUN pip install cmake
RUN pip3 install xgboost
and I am still getting the following error message.
What am I doing wrong ?
> Step 16/21 : RUN apk add cmake>3.12-suffix
---> Using cache
---> da84d6ee0868
Step 17/21 : RUN pip3 install xgboost
---> Running in c8e77045ea59
Collecting xgboost
Downloading xgboost-1.0.2.tar.gz (821 kB)
ERROR: Command errored out with exit status 1:
command: /usr/bin/python3.6 -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-9m5q6l5k/xgboost/setup.py'"'"';
__file__='"'"'/tmp/pip-install-9m5q6l5k/xgboost/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /tmp/pip-install-9m5q6l5k/xgboost/pip-egg-info
cwd: /tmp/pip-install-9m5q6l5k/xgboost/
Complete output (35 lines):
+ pwd
+ oldpath=/tmp/pip-install-9m5q6l5k/xgboost
+ cd ./xgboost/
+ mkdir -p build
+ cd build
+ cmake ..
CMake Error at CMakeLists.txt:1 (cmake_minimum_required):
CMake 3.12 or higher is required. You are running version 3.9.5
-- Configuring incomplete, errors occurred!
+ echo -----------------------------
-----------------------------
+ echo Building multi-thread xgboost failed
Building multi-thread xgboost failed
+ echo Start to build single-thread xgboost
Start to build single-thread xgboost
+ cmake .. -DUSE_OPENMP=0
CMake Error at CMakeLists.txt:1 (cmake_minimum_required):
CMake 3.12 or higher is required. You are running version 3.9.5
-- Configuring incomplete, errors occurred!
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/tmp/pip-install-9m5q6l5k/xgboost/setup.py", line 42, in <module>
LIB_PATH = libpath['find_lib_path']()
File "/tmp/pip-install-9m5q6l5k/xgboost/xgboost/libpath.py", line 50, in find_lib_path
'List of candidates:\n' + ('\n'.join(dll_path)))
XGBoostLibraryNotFound: Cannot find XGBoost Library in the candidate path, did you install compilers and run build.sh in root path?
List of candidates:
/tmp/pip-install-9m5q6l5k/xgboost/xgboost/libxgboost.so
/tmp/pip-install-9m5q6l5k/xgboost/xgboost/../../lib/libxgboost.so
/tmp/pip-install-9m5q6l5k/xgboost/xgboost/./lib/libxgboost.so
/usr/xgboost/libxgboost.so
----------------------------------------
ERROR: Command errored out with exit status 1: python setup.py egg_info Check the logs for full command output.
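One thing worth double-checking in the Dockerfile above: in a RUN line the unquoted > is interpreted by the shell as output redirection, so the version constraint never actually reaches apk. A hedged sketch of the quoting that keeps the constraint intact (assuming Alpine's repositories even ship a new enough cmake for this base image):
RUN apk add 'cmake>3.12'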
I tested my pipeline on DirectRunner and everything works fine.
Now I want to run it on DataflowRunner, but it doesn't work. It fails even before entering my pipeline code, and I'm totally overwhelmed by the logs in Stackdriver: I just don't understand what they mean and really don't have a clue what's wrong.
The execution graph looks like it loaded fine.
The worker pool starts and one worker tries to run through the setup process, but it never seems to succeed.
Some logs that I guess might provide useful information for debugging:
AttributeError:'module' object has no attribute 'NativeSource'
/usr/bin/python failed with exit status 1
Back-off 20s restarting failed container=python pod=dataflow-fiona-backlog-clean-test2-06140817-1629-harness-3nxh_default(50a3915d6501a3ec74d6d385f70c8353)
checking backoff for container "python" in pod "dataflow-fiona-backlog-clean-test2-06140817-1629-harness-3nxh"
INFO SSH key is not a complete entry: .....
How should I tackle this problem?
Edit:
My setup.py here, if it helps (copied from [here]; I only modified REQUIRED_PACKAGES and the setuptools.setup section):
from distutils.command.build import build as _build
import subprocess

import setuptools


# This class handles the pip install mechanism.
class build(_build):  # pylint: disable=invalid-name
    """A build command class that will be invoked during package install.

    The package built using the current setup.py will be staged and later
    installed in the worker using `pip install package'. This class will be
    instantiated during install for this specific scenario and will trigger
    running the custom commands specified.
    """
    sub_commands = _build.sub_commands + [('CustomCommands', None)]


# Some custom command to run during setup. The command is not essential for this
# workflow. It is used here as an example. Each command will spawn a child
# process. Typically, these commands will include steps to install non-Python
# packages. For instance, to install a C++-based library libjpeg62 the following
# two commands will have to be added:
#
# ['apt-get', 'update'],
# ['apt-get', '--assume-yes', 'install', 'libjpeg62'],
#
# First, note that there is no need to use the sudo command because the setup
# script runs with appropriate access.
# Second, if apt-get tool is used then the first command needs to be 'apt-get
# update' so the tool refreshes itself and initializes links to download
# repositories. Without this initial step the other apt-get install commands
# will fail with package not found errors. Note also --assume-yes option which
# shortcuts the interactive confirmation.
#
# The output of custom commands (including failures) will be logged in the
# worker-startup log.
CUSTOM_COMMANDS = [
    ['echo', 'Custom command worked!']]


class CustomCommands(setuptools.Command):
    """A setuptools Command class able to run arbitrary commands."""

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def RunCustomCommand(self, command_list):
        print 'Running command: %s' % command_list
        p = subprocess.Popen(
            command_list,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # Can use communicate(input='y\n'.encode()) if the command run requires
        # some confirmation.
        stdout_data, _ = p.communicate()
        print 'Command output: %s' % stdout_data
        if p.returncode != 0:
            raise RuntimeError(
                'Command %s failed: exit code: %s' % (command_list, p.returncode))

    def run(self):
        for command in CUSTOM_COMMANDS:
            self.RunCustomCommand(command)


# Configure the required packages and scripts to install.
# Note that the Python Dataflow containers come with numpy already installed
# so this dependency will not trigger anything to be installed unless a version
# restriction is specified.
REQUIRED_PACKAGES = ['apache-beam==2.0.0',
                     'datalab==1.0.1',
                     'google-cloud==0.19.0',
                     'google-cloud-bigquery==0.22.1',
                     'google-cloud-core==0.22.1',
                     'google-cloud-dataflow==0.6.0',
                     'pandas==0.20.2']

setuptools.setup(
    name='geotab-backlog-dataflow',
    version='0.0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=setuptools.find_packages(),
)
Worker-startup log (it ended with the following exception; note the lines are shown newest first, so read them bottom-up):
I /usr/bin/python failed with exit status 1
I /usr/bin/python failed with exit status 1
I AttributeError: 'module' object has no attribute 'NativeSource'
I class ConcatSource(iobase.NativeSource):
I File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/concat_reader.py", line 26, in <module>
I from dataflow_worker import concat_reader
I File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/maptask.py", line 31, in <module>
I from dataflow_worker import maptask
I File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/executor.py", line 26, in <module>
I from dataflow_worker import executor
I File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 63, in <module>
I from dataflow_worker import batchworker
I File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/start.py", line 26, in <module>
I exec code in run_globals
I File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
I "__main__", fname, loader, pkg_name)
I File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
I AttributeError: 'module' object has no attribute 'NativeSource'
I class ConcatSource(iobase.NativeSource):
You seem to be using incompatible requirements in your REQUIRED_PACKAGES directive, i.e. you specify "apache-beam==2.0.0" and "google-cloud-dataflow==0.6.0", which conflict with each other. Can you try removing/uninstalling the "apache-beam" package and installing/including the "google-cloud-dataflow==2.0.0" package instead?
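In other words, a sketch of what the suggested change to REQUIRED_PACKAGES might look like (versions taken from the answer above; the remaining pins would still need to be re-checked for compatibility):
REQUIRED_PACKAGES = [
    'google-cloud-dataflow==2.0.0',  # replaces both apache-beam==2.0.0 and google-cloud-dataflow==0.6.0
    'pandas==0.20.2',
]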