dask + luigi: raise ValueError('url type not understood: %s' % urlpath)

I am trying to combine dask with luigi,
and while the business logic works fine by itself, the code starts throwing errors when I run it as a Luigi task:
raise ValueError('url type not understood: %s' % urlpath)
ValueError: url type not understood: <_io.TextIOWrapper name='../data/2017_04_11_oldsource_geocoded.csv-luigi-tmp-1647603946' mode='wb' encoding='UTF-8'>
The code is here (I dropped the business logic part to make it shorter):
import pandas as pd
import geopandas as gp
from geopandas.tools import sjoin
from dask import dataframe as dd
from shapely.geometry import Point
from os import path
import luigi
class geocode_tweets(luigi.Task):
    boundaries = _load_geoboundaries()
    nyc = boundaries[0].unary_union

    def requires(self):
        return []

    def output(self):
        self.path = '../data/2017_04_11_oldsource_geocoded.csv'
        return luigi.LocalTarget(self.path)

    def run(self):
        df = dd.read_csv(path.join(data_dir, '2017_03_22_oldsource.csv'))
        df['geometry'] = df.apply(_get_point, axis=1)
        meta = _form_meta(df)
        S = df.map_partitions(
            distributed_sjoin, boundaries=self.boundaries,
            nyc_border=self.nyc, meta=meta).drop('geometry', axis=1)
        f = self.output().open('w')
        S.to_csv(f)
        f.close()
The problem, it looks like, is in the output part.
As far as I understand, dask does not accept a Luigi file object as a substitute for the filename string.

Dask defines DataFrame.to_csv(filename, **kwargs), and you are passing it a file object instead of a filename. Replace those last three lines with:
S.to_csv(self.output().path)
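For context, a minimal sketch of the corrected run() under that change (the helper functions are the ones the question omitted); note that writing to self.output().path skips the atomic temp-file handling that self.output().open('w') would otherwise give you:
def run(self):
    df = dd.read_csv(path.join(data_dir, '2017_03_22_oldsource.csv'))
    df['geometry'] = df.apply(_get_point, axis=1)
    meta = _form_meta(df)
    S = df.map_partitions(
        distributed_sjoin, boundaries=self.boundaries,
        nyc_border=self.nyc, meta=meta).drop('geometry', axis=1)
    # pass a filename rather than a file object; this writes directly to the
    # target path instead of going through Luigi's atomic temporary file
    S.to_csv(self.output().path)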

Related

Python Dask Apply Function and Store Result in Same Column

Hello, I am a bit new to Dask and I am trying to do the following things.
I have a CSV file that I am reading; everything works fine so far.
import pandas
import os
import json
import math
import numpy as np
import dask
from dask.distributed import Client
import dask.dataframe as df
import dask.multiprocessing
client = Client(n_workers=3, threads_per_worker=4, processes=False, memory_limit='2GB')
df = df.read_csv("netflix_titles.csv")
Now I have a function:
def toupper(x):
    return x.upper()
I would like to apply this to a column; the issue is that I want to save the result in the same column, and it seems like I cannot do that.
df["title"].map(toupper).compute()
The following line works but i want
df["title"] = df["title"].map(toupper).compute()
ValueError: Not all divisions are known, can't align partitions. Please use set_index to set the index.
Maybe try this after read_csv.
df.title = df.title.map(toupper)
df.to_csv("netflix_titles.csv", index=False, single_file=True)
to_csv has an optional argument compute with default value True, so you do not need to call compute() explicitly.
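Put differently, the assignment only fails because .compute() turns the right-hand side into a pandas Series that dask cannot align against the unknown divisions of the CSV-backed dataframe; keeping the assignment lazy sidesteps this. A minimal sketch (the output file name is my own placeholder):
import dask.dataframe as dd

def toupper(x):
    return x.upper()

ddf = dd.read_csv("netflix_titles.csv")
# assign the still-lazy dask Series back; no .compute() on the right-hand side,
# so there is nothing to align against unknown divisions
ddf["title"] = ddf["title"].map(toupper)
# materialize once at the end
ddf.to_csv("netflix_titles_upper.csv", index=False, single_file=True)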

Troubles using dask distributed with datashader: 'can't pickle weakref objects'

I'm working with datashader and dask but I'm having problems when trying to plot with a cluster running. To make it more concrete, I have the following example (embedded in a bokeh plot):
import holoviews as hv
import pandas as pd
import dask.dataframe as dd
import numpy as np
from holoviews.operation.datashader import datashade
import datashader.transfer_functions as tf
#initialize the client/cluster (these imports were missing from the snippet)
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
dask_client = Client(cluster)

def datashade_plot():
    hv.extension('bokeh')
    #create some random data (in the actual code this is a parquet file with millions of rows, this is just an example)
    delta = 1/1000
    x = np.arange(0, 1, delta)
    y = np.cumsum(np.sqrt(delta)*np.random.normal(size=len(x)))
    df = pd.DataFrame({'X':x, 'Y':y})
    #create dask dataframe
    points_dd = dd.from_pandas(df, npartitions=3)
    #create plot
    points = hv.Curve(points_dd)
    return datashade(points)  # datashade is imported directly above, not as hd.datashade

dask_client.submit(datashade_plot).result()
This raises a:
TypeError: can't pickle weakref objects
My theory is that this happens because the datashade operations cannot be distributed in the cluster. Sorry if this is a noob question; I'd be very grateful for any advice you could give me.
I think you want to go the other way. That is, pass datashader a dask dataframe instead of a pandas dataframe:
>>> from dask import dataframe as dd
>>> import multiprocessing as mp
>>> dask_df = dd.from_pandas(df, npartitions=mp.cpu_count())
>>> dask_df = dask_df.persist()
...
>>> cvs = datashader.Canvas(...)
>>> agg = cvs.points(dask_df, ...)
XREF: https://datashader.org/user_guide/Performance.html
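For comparison, a sketch of what the question's example might look like under this approach, keeping the holoviews wrapper but building the dask dataframe up front and calling datashade in the main process instead of submitting the whole plotting function to the cluster (whether this covers the original bokeh embedding is an assumption on my part):
import numpy as np
import pandas as pd
import dask.dataframe as dd
import holoviews as hv
from holoviews.operation.datashader import datashade
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)
hv.extension('bokeh')

# same toy data as the question, wrapped in a dask dataframe from the start
delta = 1 / 1000
x = np.arange(0, 1, delta)
y = np.cumsum(np.sqrt(delta) * np.random.normal(size=len(x)))
points_dd = dd.from_pandas(pd.DataFrame({'X': x, 'Y': y}), npartitions=4)

# datashade is called locally; only the aggregation work runs on the cluster,
# so no weakref-carrying plot objects ever need to be pickled
shaded = datashade(hv.Curve(points_dd))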

dask.ml.xgboost raises UnboundLocalError: local variable 'result' referenced before assignment

I am using dask_xgboost and I don't understand the error stated in the subject. I have successfully trained a model and saved it with joblib.dump.
Later on, during the prediction step I use it like this:
import dask
import dask.dataframe as dd
import dask.distributed as ddst
from dask_jobqueue import PBSCluster
from dask.distributed import Client
import dask_xgboost as dxgb
import geopandas as gp
from sklearn.externals import joblib
from typing import List
def predict(zs_files: List[str], model_name: str, client) -> None:
    delayed_dfs = [dask.delayed(gp.read_file)(zsf) for zsf in zs_files]
    model = joblib.load(model_name)
    delayed_predictions = [
        dxgb.predict(client, model, df).to_parquet(f"{fn}_predicted.parquet")
        for df, fn in zip(delayed_dfs, zs_files)
    ]
    delayed_predictions.compute()
I read a set of GeoJSON files with geopandas and then just feed them to the model. I am using a client on a PBS cluster.
Any help would be appreciated.
Thanks.
I found the issue. I was missing a from_delayed call to transform the delayed geopandas dataframe into a dask one:
dxgb.predict(client, model, dd.from_delayed(df))
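For context, a minimal sketch of just that conversion in isolation (the file name is a placeholder, not from the question):
import dask
import dask.dataframe as dd
import geopandas as gp

# dask.delayed(gp.read_file) yields a Delayed object, not a dataframe;
# dd.from_delayed wraps it into a one-partition dask dataframe that
# dxgb.predict can consume
delayed_df = dask.delayed(gp.read_file)("zonal_stats.geojson")  # placeholder path
ddf = dd.from_delayed([delayed_df])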

Dask dataframe get second highest value and column name

This code gives me the highest value and column name.
import numpy as np
import pandas as pd
import dask.dataframe as dd
cols=[0,1,2,3,4]
df = pd.DataFrame(np.random.randn(1000, len(cols)), columns=cols)
ddf = dd.from_pandas(df, npartitions=4)
ddf['max_col'] = ddf[cols].idxmax(axis=1)
ddf['max_val'] = ddf[cols].max(axis=1)
I want to get the second highest as well. Something like:
ddf['max2_col'] = ddf[cols].idxmax2(axis=1)
ddf['max2_val'] = ddf[cols].max2(axis=1)
Are there functions like idxmax2 or max2? Or any other optimized way for doing this?
You should normally try to figure out how to do what you want with pandas first. If you cannot, pose that question instead, with the pandas tag, and you will get a faster answer.
The following appears to work for pandas, although it may not be elegant:
import numpy as np
import pandas as pd
import dask.dataframe as dd
cols=[0,1,2,3,4]
df = pd.DataFrame(np.random.randn(1000, len(cols)), columns=cols)
def make_cols(df):
    df['max2_col'] = df[cols].values.argsort(axis=1)[:, -2]
    df2 = df[cols].values.copy()
    df2.sort(axis=1)
    df['max2_val'] = df2[:, -2]
    return df
So, to apply it to the dask variant, you can do
ddf = dd.from_pandas(df, npartitions=4)
ddf = ddf.map_partitions(make_cols)
ddf.head()
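If the columns were labeled by name rather than by position, the argsort positions would need mapping back to the actual labels; a small sketch of that variant (make_cols_labeled is a name I am introducing, not from the answer):
def make_cols_labeled(pdf):
    order = pdf[cols].values.argsort(axis=1)
    # map the positional index of the second-largest value back to a column label
    pdf['max2_col'] = np.array(cols)[order[:, -2]]
    pdf['max2_val'] = np.sort(pdf[cols].values, axis=1)[:, -2]
    return pdf

ddf = dd.from_pandas(df, npartitions=4)
ddf = ddf.map_partitions(make_cols_labeled)
ddf.head()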

Probable issue with LSTM in lasagne

With a simple constructor for the LSTM, as given in the tutorial, and an input of dimension [,,1] one would expect to see an output of shape [,,num_units].
But regardless of the num_units passed during construction, the output has the same shape as the input.
The following is the minimal code to replicate this issue:
import lasagne
import theano
import theano.tensor as T
import numpy as np
num_batches= 20
sequence_length= 100
data_dim= 1
train_data_3= np.random.rand(num_batches,sequence_length,data_dim).astype(theano.config.floatX)
#As in the tutorial
forget_gate = lasagne.layers.Gate(b=lasagne.init.Constant(5.0))
l_lstm = lasagne.layers.LSTMLayer(
    (num_batches, sequence_length, data_dim),
    num_units=8,
    forgetgate=forget_gate
)
lstm_in= T.tensor3(name='x', dtype=theano.config.floatX)
lstm_out = lasagne.layers.get_output(l_lstm, {l_lstm:lstm_in})
f = theano.function([lstm_in], lstm_out)
lstm_output_np= f(train_data_3)
lstm_output_np.shape
#= (20, 100, 1)
An unqualified LSTM (I mean in its default mode) should produce one output for each unit, right?
The code was run on kaixhin's cuda-lasagne docker image.
What gives?
Thanks !
You can fix that by using a lasagne.layers.InputLayer
import lasagne
import theano
import theano.tensor as T
import numpy as np
num_batches= 20
sequence_length= 100
data_dim= 1
train_data_3= np.random.rand(num_batches,sequence_length,data_dim).astype(theano.config.floatX)
#As in the tutorial
forget_gate = lasagne.layers.Gate(b=lasagne.init.Constant(5.0))
input_layer = lasagne.layers.InputLayer(  # <-- change
    shape=(num_batches, sequence_length, data_dim))  # <-- change
l_lstm = lasagne.layers.LSTMLayer(
    input_layer,  # <-- change
    num_units=8,
    forgetgate=forget_gate
)
lstm_in= T.tensor3(name='x', dtype=theano.config.floatX)
lstm_out = lasagne.layers.get_output(l_lstm, lstm_in) # <-- change
f = theano.function([lstm_in], lstm_out)
lstm_output_np= f(train_data_3)
print(lstm_output_np.shape)
If you feed your input into the input_layer, there is no ambiguity anymore, so you do not even need to specify where the input is supposed to go. Directly specifying a shape and passing the tensor3 into the LSTM does not work.
