I've been trying to deploy a pipeline on Google Cloud Dataflow. It's been quite a challenge so far.
I'm facing an import issue: I realised that the dependencies my ParDo functions use have to be staged via requirements.txt, otherwise the workers report that they can't find the required module. https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/
So I tried fixing the problem by passing in the requirements.txt file, only to be met with a rather incomprehensible error message.
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from apache_beam.runners import DataflowRunner
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
import google.auth
from google.cloud.bigtable.row import DirectRow
import datetime
# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=[])
# Sets the project to the default project in your current Google Cloud environment.
_, options.view_as(GoogleCloudOptions).project = google.auth.default()
# Sets the Google Cloud Region in which Cloud Dataflow runs.
options.view_as(GoogleCloudOptions).region = 'us-central1'
# IMPORTANT! Adjust the following to choose a Cloud Storage location.
dataflow_gcs_location = 'gs://tunnel-insight-2-0-dev-291100/dataflow'
# Dataflow Staging Location. This location is used to stage the Dataflow Pipeline and SDK binary.
options.view_as(GoogleCloudOptions).staging_location = '%s/staging' % dataflow_gcs_location
# Sets the pipeline mode to streaming, so we can stream the data from PubSub.
options.view_as(pipeline_options.StandardOptions).streaming = True
# Sets the requirements.txt file
options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt"
# Dataflow Temp Location. This location is used to store temporary files or intermediate results before finally outputting to the sink.
options.view_as(GoogleCloudOptions).temp_location = '%s/temp' % dataflow_gcs_location
# The directory to store the output files of the job.
output_gcs_location = '%s/output' % dataflow_gcs_location
ib.options.recording_duration = '1m'
...
...
pipeline_result = DataflowRunner().run_pipeline(p, options=options)
I've tried to pass the requirements using options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt".
I get this error:
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/processes.py in check_output(*args, **kwargs)
90 try:
---> 91 out = subprocess.check_output(*args, **kwargs)
92 except OSError:
/opt/conda/lib/python3.7/subprocess.py in check_output(timeout, *popenargs, **kwargs)
410 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
--> 411 **kwargs).stdout
412
/opt/conda/lib/python3.7/subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
511 raise CalledProcessError(retcode, process.args,
--> 512 output=stdout, stderr=stderr)
513 return CompletedProcess(process.args, retcode, stdout, stderr)
CalledProcessError: Command '['/root/apache-beam-custom/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/dataflow-requirements-cache', '-r', 'requirements.txt', '--exists-action', 'i', '--no-binary', ':all:']' returned non-zero exit status 1.
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-12-f018e5c84d08> in <module>
----> 1 pipeline_result = DataflowRunner().run_pipeline(p, options=options)
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py in run_pipeline(self, pipeline, options)
491 environments.DockerEnvironment.from_container_image(
492 apiclient.get_container_image_from_options(options),
--> 493 artifacts=environments.python_sdk_dependencies(options)))
494
495 # This has to be performed before pipeline proto is constructed to make sure
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/transforms/environments.py in python_sdk_dependencies(options, tmp_dir)
624 options,
625 tmp_dir,
--> 626 skip_prestaged_dependencies=skip_prestaged_dependencies))
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/runners/portability/stager.py in create_job_resources(options, temp_dir, build_setup_args, populate_requirements_cache, skip_prestaged_dependencies)
178 populate_requirements_cache if populate_requirements_cache else
179 Stager._populate_requirements_cache)(
--> 180 setup_options.requirements_file, requirements_cache_path)
181 for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
182 resources.append((pkg, os.path.basename(pkg)))
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/retry.py in wrapper(*args, **kwargs)
234 while True:
235 try:
--> 236 return fun(*args, **kwargs)
237 except Exception as exn: # pylint: disable=broad-except
238 if not retry_filter(exn):
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/runners/portability/stager.py in _populate_requirements_cache(requirements_file, cache_dir)
569 ]
570 _LOGGER.info('Executing command: %s', cmd_args)
--> 571 processes.check_output(cmd_args, stderr=processes.STDOUT)
572
573 #staticmethod
~/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/processes.py in check_output(*args, **kwargs)
97 "Full traceback: {} \n Pip install failed for package: {} \
98 \n Output from execution of subprocess: {}" \
---> 99 .format(traceback.format_exc(), args[0][6], error.output))
100 else:
101 raise RuntimeError("Full trace: {}, \
RuntimeError: Full traceback: Traceback (most recent call last):
File "/root/apache-beam-custom/packages/beam/sdks/python/apache_beam/utils/processes.py", line 91, in check_output
out = subprocess.check_output(*args, **kwargs)
File "/opt/conda/lib/python3.7/subprocess.py", line 411, in check_output
**kwargs).stdout
File "/opt/conda/lib/python3.7/subprocess.py", line 512, in run
output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '['/root/apache-beam-custom/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/dataflow-requirements-cache', '-r', 'requirements.txt', '--exists-action', 'i', '--no-binary', ':all:']' returned non-zero exit status 1.
Pip install failed for package: -r
Output from execution of subprocess: b'Obtaining file:///root/apache-beam-custom/packages/beam/sdks/python (from -r requirements.txt (line 3))\n Saved /tmp/dataflow-requirements-cache/apache-beam-2.25.0.zip\nCollecting absl-py==0.11.0\n Downloading absl-py-0.11.0.tar.gz (110 kB)\n Saved /tmp/dataflow-requirements-cache/absl-py-0.11.0.tar.gz\nCollecting argon2-cffi==20.1.0\n Downloading argon2-cffi-20.1.0.tar.gz (1.8 MB)\n Installing build dependencies: started\n Installing build dependencies: finished with status \'error\'\n ERROR: Command errored out with exit status 1:\n command: /root/apache-beam-custom/bin/python /root/apache-beam-custom/lib/python3.7/site-packages/pip install --ignore-installed --no-user --prefix /tmp/pip-build-env-3iuiaex9/overlay --no-warn-script-location --no-binary :all: --only-binary :none: -i https://pypi.org/simple -- \'setuptools>=40.6.0\' wheel \'cffi>=1.0\'\n cwd: None\n Complete output (85 lines):\n Collecting setuptools>=40.6.0\n Downloading setuptools-51.1.1.tar.gz (2.1 MB)\n Collecting wheel\n Downloading wheel-0.36.2.tar.gz (65 kB)\n Collecting cffi>=1.0\n Downloading cffi-1.14.4.tar.gz (471 kB)\n Collecting pycparser\n Downloading pycparser-2.20.tar.gz (161 kB)\n Skipping wheel build for setuptools, due to binaries being disabled for it.\n Skipping wheel build for wheel, due to binaries being disabled for it.\n Skipping wheel build for cffi, due to binaries being disabled for it.\n Skipping wheel build for pycparser, due to binaries being disabled for it.\n Installing collected packages: setuptools, wheel, pycparser, cffi\n Running setup.py install for setuptools: started\n Running setup.py install for setuptools: finished with status \'done\'\n Running setup.py install for wheel: started\n Running setup.py install for wheel: finished with status \'done\'\n Running setup.py install for pycparser: started\n Running setup.py install for pycparser: finished with status \'done\'\n Running setup.py install for cffi: started\n Running setup.py install for cffi: finished with status \'error\'\n ERROR: Command errored out with exit status 1:\n command: /root/apache-beam-custom/bin/python -u -c \'import sys, setuptools, tokenize; sys.argv[0] = \'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\'; __file__=\'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\';f=getattr(tokenize, \'"\'"\'open\'"\'"\', open)(__file__);code=f.read().replace(\'"\'"\'\\r\\n\'"\'"\', \'"\'"\'\\n\'"\'"\');f.close();exec(compile(code, __file__, \'"\'"\'exec\'"\'"\'))\' install --record /tmp/pip-record-z8o69lka/install-record.txt --single-version-externally-managed --prefix /tmp/pip-build-env-3iuiaex9/overlay --compile --install-headers /root/apache-beam-custom/include/site/python3.7/cffi\n cwd: /tmp/pip-install-6zs5jguv/cffi/\n Complete output (56 lines):\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the 
PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n Package libffi was not found in the pkg-config search path.\n Perhaps you should add the directory containing `libffi.pc\'\n to the PKG_CONFIG_PATH environment variable\n No package \'libffi\' found\n running install\n running build\n running build_py\n creating build\n creating build/lib.linux-x86_64-3.7\n creating build/lib.linux-x86_64-3.7/cffi\n copying cffi/setuptools_ext.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/pkgconfig.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/verifier.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/vengine_gen.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/backend_ctypes.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/__init__.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/cffi_opcode.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/error.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/api.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/commontypes.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/ffiplatform.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/lock.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/cparser.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/recompiler.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/vengine_cpy.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/model.py -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/_cffi_include.h -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/parse_c_type.h -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/_embedding.h -> build/lib.linux-x86_64-3.7/cffi\n copying cffi/_cffi_errors.h -> build/lib.linux-x86_64-3.7/cffi\n running build_ext\n building \'_cffi_backend\' extension\n creating build/temp.linux-x86_64-3.7\n creating build/temp.linux-x86_64-3.7/c\n gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -DUSE__THREAD -DHAVE_SYNC_SYNCHRONIZE -I/usr/include/ffi -I/usr/include/libffi -I/root/apache-beam-custom/include -I/opt/conda/include/python3.7m -c c/_cffi_backend.c -o build/temp.linux-x86_64-3.7/c/_cffi_backend.o\n c/_cffi_backend.c:15:10: fatal error: ffi.h: No such file or directory\n #include <ffi.h>\n ^~~~~~~\n compilation terminated.\n error: command \'gcc\' failed with exit status 1\n ----------------------------------------\n ERROR: Command errored out with exit status 1: /root/apache-beam-custom/bin/python -u -c \'import sys, setuptools, tokenize; sys.argv[0] = \'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\'; __file__=\'"\'"\'/tmp/pip-install-6zs5jguv/cffi/setup.py\'"\'"\';f=getattr(tokenize, \'"\'"\'open\'"\'"\', open)(__file__);code=f.read().replace(\'"\'"\'\\r\\n\'"\'"\', \'"\'"\'\\n\'"\'"\');f.close();exec(compile(code, __file__, \'"\'"\'exec\'"\'"\'))\' install --record /tmp/pip-record-z8o69lka/install-record.txt --single-version-externally-managed --prefix /tmp/pip-build-env-3iuiaex9/overlay --compile --install-headers /root/apache-beam-custom/include/site/python3.7/cffi Check the logs for full command output.\n WARNING: You are using pip version 20.1.1; however, version 20.3.3 is available.\n You should consider upgrading via the \'/root/apache-beam-custom/bin/python -m pip install --upgrade pip\' command.\n ----------------------------------------\nERROR: Command errored out with exit status 1: /root/apache-beam-custom/bin/python /root/apache-beam-custom/lib/python3.7/site-packages/pip install --ignore-installed --no-user --prefix 
/tmp/pip-build-env-3iuiaex9/overlay --no-warn-script-location --no-binary :all: --only-binary :none: -i https://pypi.org/simple -- \'setuptools>=40.6.0\' wheel \'cffi>=1.0\' Check the logs for full command output.\nWARNING: You are using pip version 20.1.1; however, version 20.3.3 is available.\nYou should consider upgrading via the \'/root/apache-beam-custom/bin/python -m pip install --upgrade pip\' command.\n'
Did I do something wrong?
-------------- EDIT---------------------------------------
OK, I've got my pipeline to work, but I'm still having a problem with my requirements.txt file, which I believe I'm passing in correctly.
My pipeline code:
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from apache_beam.runners import DataflowRunner
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
import google.auth
from google.cloud.bigtable.row import DirectRow
import datetime
# Setting up the Apache Beam pipeline options.
options = pipeline_options.PipelineOptions(flags=[])
# Sets the project to the default project in your current Google Cloud environment.
_, options.view_as(GoogleCloudOptions).project = google.auth.default()
# Sets the Google Cloud Region in which Cloud Dataflow runs.
options.view_as(GoogleCloudOptions).region = 'us-central1'
# IMPORTANT! Adjust the following to choose a Cloud Storage location.
dataflow_gcs_location = ''
# Dataflow Staging Location. This location is used to stage the Dataflow Pipeline and SDK binary.
options.view_as(GoogleCloudOptions).staging_location = '%s/staging' % dataflow_gcs_location
# Sets the pipeline mode to streaming, so we can stream the data from PubSub.
options.view_as(pipeline_options.StandardOptions).streaming = True
# Sets the requirements.txt file
options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt"
# Dataflow Temp Location. This location is used to store temporary files or intermediate results before finally outputting to the sink.
options.view_as(GoogleCloudOptions).temp_location = '%s/temp' % dataflow_gcs_location
# The directory to store the output files of the job.
output_gcs_location = '%s/output' % dataflow_gcs_location
ib.options.recording_duration = '1m'
# The Google Cloud PubSub topic for this example.
topic = ""
subscription = ""
output_topic = ""
# Info
project_id = ""
bigtable_instance = ""
bigtable_table_id = ""
class CreateRowFn(beam.DoFn):
    def process(self, words):
        from google.cloud.bigtable.row import DirectRow
        import datetime
        direct_row = DirectRow(row_key="phone#4c410523#20190501")
        direct_row.set_cell(
            "stats_summary",
            b"os_build",
            b"android",
            datetime.datetime.now())
        return [direct_row]
p = beam.Pipeline(InteractiveRunner(),options=options)
words = p | "read" >> beam.io.ReadFromPubSub(subscription=subscription)
windowed_words = (words | "window" >> beam.WindowInto(beam.window.FixedWindows(10)))
# Writing to BigTable
test = words | beam.ParDo(CreateRowFn()) | WriteToBigTable(
    project_id=project_id,
    instance_id=bigtable_instance,
    table_id=bigtable_table_id)
pipeline_result = DataflowRunner().run_pipeline(p, options=options)
As you can see in "CreateRowFn", I need to import
from google.cloud.bigtable.row import DirectRow
import datetime
Only then does it work.
I've passed in requirements.txt via options.view_as(pipeline_options.SetupOptions).requirements_file = "requirements.txt", and I can see it on the Dataflow console.
If I remove the import statements, I get "in process NameError: name 'DirectRow' is not defined".
Is there any way to overcome this?
I've found the answer in the FAQ. My mistake wasn't in how I passed in requirements.txt, but in how I handled NameErrors:
https://cloud.google.com/dataflow/docs/resources/faq
How do I handle NameErrors?
If you're getting a NameError when you execute your pipeline using the Dataflow service but not when you execute locally (i.e. using the DirectRunner), your DoFns may be using values in the global namespace that are not available on the Dataflow worker.
By default, global imports, functions, and variables defined in the main session are not saved during the serialization of a Dataflow job. If, for example, your DoFns are defined in the main file and reference imports and functions in the global namespace, you can set the --save_main_session pipeline option to True. This will cause the state of the global namespace to be pickled and loaded on the Dataflow worker.
Notice that if you have objects in your global namespace that cannot be pickled, you will get a pickling error. If the error is regarding a module that should be available in the Python distribution, you can solve this by importing the module locally, where it is used.
For example, instead of:
import re
…
def myfunc():
  # use re module
use:
def myfunc():
  import re
  # use re module
Alternatively, if your DoFns span multiple files, you should use a different approach to packaging your workflow and managing dependencies.
So the conclusion is:
It is OK to use import statements within the DoFn functions (see the sketch below).
Google Dataflow workers already have these packages installed: https://cloud.google.com/dataflow/docs/concepts/sdk-worker-dependencies.
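To make this concrete, here is a minimal sketch combining the two fixes: the in-function import I ended up using, and the --save_main_session route the FAQ describes (shown here purely as an illustration of the option).
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

# Option 1: import inside the DoFn so the names are resolved on the worker.
class CreateRowFn(beam.DoFn):
    def process(self, words):
        from google.cloud.bigtable.row import DirectRow
        import datetime
        direct_row = DirectRow(row_key="phone#4c410523#20190501")
        direct_row.set_cell("stats_summary", b"os_build", b"android",
                            datetime.datetime.now())
        return [direct_row]

# Option 2: keep top-level imports and pickle the main session instead.
options = PipelineOptions(flags=[])
options.view_as(SetupOptions).save_main_session = True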
If you are running the pipeline from Cloud Composer, you need to add the new packages to the environment's PyPI packages.
You can also pass --requirements_file <path to requirements.txt> as a flag in the command when running the pipeline.
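For example, the same flag can be supplied programmatically when constructing the pipeline options (the project id and bucket below are placeholders):
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

options = PipelineOptions(flags=[
    '--runner=DataflowRunner',
    '--project=my-project',                  # placeholder project id
    '--region=us-central1',
    '--temp_location=gs://my-bucket/temp',   # placeholder bucket
    '--requirements_file=requirements.txt',
])
# Equivalent to: options.view_as(SetupOptions).requirements_file = 'requirements.txt'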
I prefer to use the --setup_file <path to setup.py> flag instead. The format of the setup file is as follows:
import setuptools

REQUIRED_PACKAGES = [
    'joblib==0.15.1',
    'numpy==1.18.5',
    'google',
    'google-cloud',
    'google-cloud-storage',
    'cassandra-driver==3.22.0'
]

PACKAGE_NAME = 'my_package'
PACKAGE_VERSION = '0.0.1'

setuptools.setup(
    name=PACKAGE_NAME,
    version=PACKAGE_VERSION,
    description='Search Rank project',
    install_requires=REQUIRED_PACKAGES,
    author="Mohd Faisal",
    packages=setuptools.find_packages()
)
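The setup file can then be referenced either via the --setup_file flag or programmatically; a minimal sketch, assuming setup.py sits next to the pipeline script:
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions

pipeline_options = PipelineOptions()
# Point the Dataflow stager at the setup.py shown above; workers will
# pip-install the resulting package, pulling in REQUIRED_PACKAGES.
pipeline_options.view_as(SetupOptions).setup_file = './setup.py'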
Use the format below for the Dataflow script:
from __future__ import absolute_import

import argparse
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  SetupOptions,
                                                  StandardOptions,
                                                  WorkerOptions)
from datetime import date


class Userprocess(beam.DoFn):
    def process(self, msg):
        yield "OK"


def run(argv=None):
    logging.info("Parsing dataflow flags... ")
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--project',
        required=True,
        help='project id, staging or production')
    parser.add_argument(
        '--temp_location',
        required=True,
        help='temp location')
    parser.add_argument(
        '--job_name',
        required=True,
        help='job name')
    known_args, pipeline_args = parser.parse_known_args(argv)

    today = date.today()
    logging.info("Processing Date is " + str(today))

    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = known_args.project
    google_cloud_options.job_name = known_args.job_name
    google_cloud_options.temp_location = known_args.temp_location
    # pipeline_options.view_as(StandardOptions).runner = known_args.runner

    with beam.Pipeline(argv=pipeline_args, options=pipeline_options) as p:
        # The DoFn must be attached to an input PCollection to actually run.
        _ = p | beam.Create(["msg"]) | beam.ParDo(Userprocess())


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    logging.info("Starting dataflow daily pipeline ")
    try:
        run()
    except Exception:
        logging.exception("Dataflow pipeline failed")
Try running the script locally first to catch errors.
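A minimal local smoke test along these lines (reusing the Userprocess DoFn above and assuming the DirectRunner) will surface import and NameError problems before you submit to Dataflow:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Run the same DoFn in-process with the DirectRunner.
local_options = PipelineOptions(['--runner=DirectRunner'])
with beam.Pipeline(options=local_options) as p:
    _ = (p
         | beam.Create(["test message"])
         | beam.ParDo(Userprocess())
         | beam.Map(print))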
I'm working on a cloud-based robotic application with AWS RoboMaker. I'm using ROS Kinetic, with the build tool colcon.
My robot application depends on a custom Python module, which has to be in my workspace. This Python module is built by colcon as a Python package, not a ROS package. This page explains how to do that with catkin, but this example shows how to adapt it to colcon. So finally my workspace looks like this:
my_workspace/
|--src/
|--my_module/
| |--setup.py
| |--package.xml
| |--subfolders and python scripts...
|--some_ros_pkg1/
|--some_ros_pkg2/
|...
However, the command colcon build <my_workspace> builds all the ROS packages but fails to build my Python module as a package.
Here's the error I get :
Starting >>> my-module
[54.297s] WARNING:colcon.colcon_ros.task.ament_python.build:Package 'my-module' doesn't explicitly install a marker in the package index (colcon-ros currently does it implicitly but that fallback will be removed in the future)
[54.298s] WARNING:colcon.colcon_ros.task.ament_python.build:Package 'my-module' doesn't explicitly install the 'package.xml' file (colcon-ros currently does it implicitly but that fallback will be removed in the future)
--- stderr: my-module
usage: setup.py [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
or: setup.py --help [cmd1 cmd2 ...]
or: setup.py --help-commands
or: setup.py cmd --help
error: invalid command 'egg_info'
---
Failed <<< my-module [0.56s, exited with code 1]
I found this issue that seems related, and so tried: pip install --upgrade setuptools
...which fails with the error message:
Collecting setuptools
Using cached https://files.pythonhosted.org/packages/7c/1b/9b68465658cda69f33c31c4dbd511ac5648835680ea8de87ce05c81f95bf/setuptools-50.3.0.zip
Complete output from command python setup.py egg_info:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "setuptools/__init__.py", line 16, in <module>
import setuptools.version
File "setuptools/version.py", line 1, in <module>
import pkg_resources
File "pkg_resources/__init__.py", line 1365
raise SyntaxError(e) from e
^
SyntaxError: invalid syntax
----------------------------------------
Command "python setup.py egg_info" failed with error code 1 in /tmp/pip-build-uwFamt/setuptools/
And with pip3 install --upgrade setuptools, I get:
Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: setuptools in /home/ubuntu/.local/lib/python3.5/site-packages (50.3.0)
I have both Python 3.5.2 and Python 2.7, but I don't know which one is used by colcon.
So I don't know what to try next or what the real problem is. Any help is welcome!
I managed to install my package and its dependencies correctly. I describe the method below, in case it helps someone someday!
I have been mainly inspired by this old DeepRacer repository.
The workspace tree in the question is wrong. It should look like this:
my_workspace/
|--src/
|--my_wrapper_package/
| |--setup.py
| |--my_package/
| |--__init__.py
| |--subfolders and python scripts...
|--some_ros_pkg1/
|--some_ros_pkg2/
my_wrapper_package may contain more than one custom Python package.
A good setup.py example is this one.
You shouldn't put a package.xml next to setup.py: colcon will then only look at the dependencies declared in package.xml and won't collect pip packages.
It can help to delete the my_wrapper_package folders generated by colcon in install/ and build/; doing so forces colcon to rebuild and bundle from scratch.
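For reference, here is a minimal setup.py sketch for such a wrapper package (the names are just the placeholders from the tree above):
import setuptools

setuptools.setup(
    name='my_wrapper_package',
    version='0.0.1',
    # find_packages() picks up my_package/ thanks to its __init__.py
    packages=setuptools.find_packages(),
    install_requires=[
        # pip dependencies of my_package go here
    ],
)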
I've been trying this for a while now with no luck. I'm making a basic API but running into some trouble with the flask_restful module. This is my code:
import markdown
import os
import shelve
from flask import Flask, g
from flask_restful import Resource, Api, reqparse
app = Flask(__name__)

@app.route("/")
def index():
    with open(os.path.dirname(app.root_path) +
              '/README.md', 'r') as markdown_file:
        content = markdown_file.read()
        return markdown.markdown(content)
This is my requirements.txt:
docker==3.4.1
docker-compose==1.22.0
docker-pycreds==0.3.0
dockerpty==0.4.1
docopt==0.6.2
Flask==1.0.2
Flask-RESTful==0.3.6
requests==2.18.4
urllib3==1.22
websocket-client==0.48.0
This is what I get as an error when running docker-compose up:
Starting python-rest_device-registry_1 ... done
Attaching to python-rest_device-registry_1
device-registry_1 | Traceback (most recent call last):
device-registry_1 | File "./run.py", line 1, in <module>
device-registry_1 | from device_registry import app
device-registry_1 | File "/usr/src/app/device_registry/__init__.py", line 6,
in <module>
device-registry_1 | from flask_restful import Resource, Api, reqparse
device-registry_1 | ModuleNotFoundError: No module named 'flask_restful'
python-rest_device-registry_1 exited with code 1
My docker file is this:
FROM python:3
WORKDIR /usr/src/app
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD [ "python", "./run.py" ]
What am I doing wrong here? I pip installed Flask and flask_restful, but I'm lost for words on what's going on.
It does not look like you are using flask_restful in the code you are showing here; you are just importing it. It might be that you started out with plain Flask code, which imports fine, then added flask-restful to your requirements file, but Docker is unaware of the change and is reusing a cached layer of the image.
Try to decouple your problem: start by running your Python sample outside Docker, in a plain old virtualenv, and make sure it starts. Then look into whether Docker is actually installing your newly added requirements.
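To quickly check whether the module is actually present in a given environment (your virtualenv, or a shell inside the built image), a small snippet like this will do:
import importlib

# Probe each dependency and report whether it can be imported.
for mod in ('flask', 'flask_restful'):
    try:
        importlib.import_module(mod)
        print(mod, 'is importable')
    except ImportError as exc:
        print(mod, 'is MISSING:', exc)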
I'm trying to run a clustering job on Amazon EMR using Mahout.
I have a Solr index that I uploaded to S3, and I want to vectorize it using Mahout's lucene.vector (this is the first step in the job flow).
The parameters for the step are:
Jar: s3n://mahout-bucket/jars/mahout-core-0.6-job.jar
MainClass: org.apache.mahout.driver.MahoutDriver
Args: lucene.vector --dir s3n://mahout-input/solr_index/ --field name --dictOut /test/solr-dict-out/dict.txt --output /test/solr-vectors-out/vectors
The error in the log is:
Unknown program 'lucene.vector' chosen.
I've done the same process locally with hadoop and Mahout and it worked fine.
How should I call the lucene.vector function on EMR?
The program name, lucene.vector, should come immediately after bin/mahout:
/homes/cuneyt/trunk/bin/mahout lucene.vector --dir /homes/cuneyt/lucene/index --field 0 --output lda/vector --dictOut /homes/cuneyt/lda/dict.txt
I eventually figured out the answer. The problem was that I was using the wrong MainClass argument. Instead of
org.apache.mahout.driver.MahoutDriver
I should have used:
org.apache.mahout.utils.vectors.lucene.Driver
Therefore the correct arguments should have been
Jar: s3n://mahout-bucket/jars/mahout-core-0.6-job.jar
MainClass: org.apache.mahout.utils.vectors.lucene.Driver
Args: --dir s3n://mahout-input/solr_index/ --field name --dictOut /test/solr-dict-out/dict.txt --output /test/solr-vectors-out/vectors