Dask dataframe get second highest value and column name - dask

This code gives me the highest value and column name.
import numpy as np
import pandas as pd
import dask.dataframe as dd
# Column labels of the numeric data; reused below to select exactly these columns.
cols=[0,1,2,3,4]
# 1000 rows of standard-normal samples, one column per entry in ``cols``.
df = pd.DataFrame(np.random.randn(1000, len(cols)), columns=cols)
# Wrap the pandas frame as a dask DataFrame split into 4 partitions.
ddf = dd.from_pandas(df, npartitions=4)
# Row-wise (axis=1) label of the column that holds each row's maximum ...
ddf['max_col'] = ddf[cols].idxmax(axis=1)
# ... and the row-wise maximum value itself.
ddf['max_val'] = ddf[cols].max(axis=1)
I want to get the second highest as well. Something like:
ddf['max2_col'] = ddf[cols].idxmax2(axis=1)
ddf['max2_val'] = ddf[cols].max2(axis=1)
Are there functions like idxmax2 or max2? Or any other optimized way for doing this?

You should normally try to figure out how to do what you want to do with pandas first. If you cannot, and pose that question instead, with the pandas tag, you will get a faster answer.
The following appears to work for pandas, although it may not be elegant
import numpy as np
import pandas as pd
import dask.dataframe as dd
cols = [0, 1, 2, 3, 4]
df = pd.DataFrame(np.random.randn(1000, len(cols)), columns=cols)


def make_cols(df, columns=None):
    """Add the second-highest value per row, and the column it came from.

    Two columns are written into ``df`` (which is modified in place and also
    returned, so the function can be used with ``map_partitions``):

    * ``max2_col`` - the *label* of the column holding the second-highest
      value of the row;
    * ``max2_val`` - that second-highest value.

    Args:
        df: pandas DataFrame containing the columns to inspect.
        columns: labels of the columns to compare. Defaults to the
            module-level ``cols`` list.

    Returns:
        The same ``df`` object with ``max2_col`` and ``max2_val`` added.

    Raises:
        IndexError: if fewer than two columns are compared.
    """
    if columns is None:
        columns = cols
    selected = df[list(columns)]
    values = selected.to_numpy()
    # Sort once; the second-to-last entry of each row's ordering is the
    # position of the second-highest value.
    second = values.argsort(axis=1)[:, -2]
    # Translate positions into column labels; they only coincide when the
    # labels happen to be 0..n-1.
    df['max2_col'] = selected.columns.to_numpy()[second]
    df['max2_val'] = values[np.arange(len(values)), second]
    return df
so to apply it to the dask variant, you can do
ddf = dd.from_pandas(df, npartitions=4)
# map_partitions builds a new lazy DataFrame and leaves ``ddf`` untouched, so
# the result has to be assigned back or head() will not show the new columns.
ddf = ddf.map_partitions(make_cols)
ddf.head()

Related

Dask division issue after groupby

I am working on a project where I need to group by several columns depending on the task and I have unknown division issues with dask because of this.
Here is a sample of the problem
import pandas as pd
import dask.dataframe as dd
import numpy as np
# 100000 rows: two random integer key columns and one uniform float column.
df = pd.DataFrame({"col1": np.random.randint(1, 100, 100000), "col2": np.random.randint(101, 200, 100000), "col3": np.random.uniform(0, 4, 100000)})
# Split into 100 partitions, then index by col1.
ddf = dd.from_pandas(df, npartitions=100)
ddf = ddf.set_index("col1")
# Per-group sum of col3 broadcast back to every row; the grouping key is the index set above.
ddf["col2_sum"] = ddf.groupby("col1")["col3"].transform("sum", meta=('x', 'float64')) # works
print(ddf.compute())
This works because I am grouping by an indexed column. However,
ddf["col2_sum2"] = ddf.groupby("col2")["col3"].transform("sum", meta=('x', 'float64'))
This doesn't work because of ValueError: Not all divisions are known, can't align partitions. Please use `set_index` to set the index.
I have tried to solve it this way:
# Re-index a two-column copy by col2 so the groupby key is the index.
ddf_new = ddf[["col2", "col3"]].set_index("col2")
ddf_new["col2_sum2"] = ddf_new.groupby("col2")["col3"].transform("sum", meta=('x', 'float64'))
# Keep only the per-group sum before joining it back.
ddf_new = ddf_new.drop(columns=["col3"])
# NOTE(review): ddf_new presumably still has one row per original row (transform
# does not aggregate), so merging on col2 may multiply rows - confirm, and
# consider merging an aggregated groupby(...).sum() result instead.
ddf = ddf.merge(ddf_new, on=["col2"], how="outer") # works but expensive round trip
print(ddf.compute())
But this requires a very expensive dask merge. Is there a better way of solving this problem?
The solution you created seems reasonable, I would make one improvement (if this is feasible with actual data): if ddf_new is computed, then it becomes a pandas df, so the merge of ddf and ddf_new becomes a lot faster as there is less data to shuffle around.
Update: also to avoid sending the pandas df from workers to client and back, you could do a ddf_new = client.compute(ddf_new) and pass around just the future (reference to the computed pandas df).

Python Dask Apply Function and Store Result in Same Column

Hello, I am a bit new to Dask and I am trying to do the following.
I have a CSV file, and reading it works fine:
import pandas
import os
import json
import math
import numpy as np
import dask
from dask.distributed import Client
import dask.dataframe as df
import dask.multiprocessing
# Start a distributed client: 3 workers, 4 threads per worker, 2GB memory limit.
client = Client(n_workers=3, threads_per_worker=4, processes=False, memory_limit='2GB')
# NOTE: ``df`` was the alias of the dask.dataframe module (see the import
# above); this line rebinds it to the DataFrame returned by read_csv, so the
# module is no longer reachable under that name afterwards.
df = df.read_csv("netflix_titles.csv")
Now I have a function:
def toupper(x):
    """Return ``x`` upper-cased, leaving values without ``upper()`` untouched.

    String columns read from CSV commonly contain missing values (``None`` or
    float ``NaN``); calling ``.upper()`` on those raises ``AttributeError`` and
    aborts the whole ``map``. Such values are returned unchanged instead.

    Args:
        x: a single cell value, normally a ``str``.

    Returns:
        ``x.upper()`` when ``x`` provides an ``upper`` method, otherwise ``x``.
    """
    upper = getattr(x, "upper", None)
    if not callable(upper):
        # Missing value or other non-text cell: nothing to upper-case.
        return x
    return upper()
I would like to apply this to a column. The issue is that I want to save the result in the same column, and it seems that I cannot do that.
df["title"].map(toupper).compute()
The line above works, but what I want is:
df["title"] = df["title"].map(toupper).compute()
ValueError: Not all divisions are known, can't align partitions. Please use set_index to set the index.
Image
Maybe try this after read_csv.
# Assign the mapped series back without calling compute() on it first.
df.title = df.title.map(toupper)
# NOTE(review): this writes to the same path that the question reads with
# read_csv - confirm that overwriting the input file is intended and safe,
# or write to a different file name.
df.to_csv("netflix_titles.csv", index=False, single_file=True)
to_csv has an optional argument compute, whose default value is compute=True, so you don't need to call compute() explicitly.

Troubles using dask distributed with datashader: 'can't pickle weakref objects'

I'm working with datashader and dask but I'm having problems when trying to plot with a cluster running. To make it more concrete, I have the following example (embedded in a bokeh plot):
import holoviews as hv
import pandas as pd
import dask.dataframe as dd
import numpy as np
from holoviews.operation.datashader import datashade
import datashader.transfer_functions as tf
# Initialize the client/cluster.
# NOTE(review): LocalCluster and Client are not imported in this snippet;
# presumably they come from dask.distributed - confirm.
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
dask_client = Client(cluster)


def datashade_plot():
    """Build a datashaded HoloViews curve from a small random-walk sample.

    Returns:
        The object produced by applying ``datashade`` to an ``hv.Curve``
        built on a 3-partition dask DataFrame with columns ``X`` and ``Y``.
    """
    hv.extension('bokeh')
    # Create some random data (in the actual code this is a parquet file with
    # millions of rows; this is just an example).
    delta = 1/1000
    x = np.arange(0, 1, delta)
    y = np.cumsum(np.sqrt(delta)*np.random.normal(size=len(x)))
    df = pd.DataFrame({'X':x, 'Y':y})
    # Create dask dataframe.
    points_dd = dd.from_pandas(df, npartitions=3)
    # Create plot.
    points = hv.Curve(points_dd)
    # Call the ``datashade`` operation imported at the top of the snippet;
    # there is no ``hd`` name in scope here.
    return datashade(points)


# Run the plotting function through the cluster and wait for its result; this
# is the call that raises the TypeError reported below.
dask_client.submit(datashade_plot,).result()
This raises a:
TypeError: can't pickle weakref objects
I have the theory that this happens because you can't distribute the datashade operations in the cluster. Sorry if this is a noob question, I'd be very grateful for any advice you could give me.
I think you want to go the other way. That is, pass datashader a dask dataframe instead of a pandas dataframe:
>>> from dask import dataframe as dd
>>> import multiprocessing as mp
>>> dask_df = dd.from_pandas(df, npartitions=mp.cpu_count())
>>> dask_df.persist()
...
>>> cvs = datashader.Canvas(...)
>>> agg = cvs.points(dask_df, ...)
XREF: https://datashader.org/user_guide/Performance.html

dask.ml.xgboost raises UnboundLocalError: local variable 'result' referenced before assignment

I am using dask_xgboost and I don't understand the error stated in the subject. I have successfully trained a model and saved it with joblib.dump.
Later on, during the prediction step I use it like this:
import dask
import dask.dataframe as dd
import dask.distributed as ddst
from dask_jobqueue import PBSCluster
from dask.distributed import Client
import dask_xgboost as dxgb
import geopandas as gp
from sklearn.externals import joblib
def predict(zs_files: List[str], model_name: str, client) -> None:
    """Load a saved model and write predictions for each input file to parquet.

    Args:
        zs_files: paths read with ``gp.read_file``; each path is also used as
            the prefix of its output file, ``{fn}_predicted.parquet``.
        model_name: path handed to ``joblib.load``.
        client: passed as the first argument to ``dxgb.predict``.
    """
    # One lazy geopandas read per file; each element is a dask.delayed object,
    # not a dask DataFrame.
    delayed_dfs = [dask.delayed(gp.read_file)(zsf) for zsf in zs_files]
    model = joblib.load(model_name)
    # NOTE(review): ``df`` here is one of the delayed objects built above; the
    # fix given after the question wraps it with dd.from_delayed before
    # predicting.
    delayed_predictions = [
        dxgb.predict(client, model, df).to_parquet(f"{fn}_predicted.parquet")
        for df, fn in zip(delayed_dfs, zs_files)
    ]
    # NOTE(review): ``delayed_predictions`` is a plain Python list, which has
    # no compute() method - presumably dask.compute(*delayed_predictions) was
    # intended; confirm.
    delayed_predictions.compute()
I read a set of GeoJSON files with geopandas and then just feed the model with them. I am using a client on a PBS cluster.
Any help would be appreciated.
Thanks.
I found the issue. I was missing a from_delayed call to transform the geopandas dataframe into a dask one:
dxgb.predict(client, model, dd.from_delayed(df))

dask + luigi: raise ValueError('url type not understood: %s' % urlpath)

I am trying to merge dask with luigi,
and while business logic works fine by itself, code starts throwing errors when I run a Luigi task:
raise ValueError('url type not understood: %s' % urlpath)
ValueError: url type not understood: <_io.TextIOWrapper name='../data/2017_04_11_oldsource_geocoded.csv-luigi-tmp-1647603946' mode='wb' encoding='UTF-8'>
the code is here (I dropped the business model part to make it shorter):
import pandas as pd
import geopandas as gp
from geopandas.tools import sjoin
from dask import dataframe as dd
from shapely.geometry import Point
from os import path
import luigi
class geocode_tweets(luigi.Task):
    """Luigi task that reads a CSV with dask, spatially joins it, and writes a CSV."""

    # Both attributes are evaluated once, when the class body executes.
    # _load_geoboundaries is defined outside this snippet.
    boundaries = _load_geoboundaries()
    nyc = boundaries[0].unary_union

    def requires(self):
        """Return the upstream tasks: none."""
        return []

    def output(self):
        """Return the local CSV target, also storing its path on ``self.path``."""
        self.path = '../data/2017_04_11_oldsource_geocoded.csv'
        return luigi.LocalTarget(self.path)

    def run(self):
        """Read the source CSV, add a geometry column, join per partition, and write the result."""
        df = dd.read_csv(path.join(data_dir, '2017_03_22_oldsource.csv'))
        # Row-wise apply; _get_point is defined outside this snippet.
        df['geometry'] = df.apply(_get_point, axis=1)
        meta = _form_meta(df)
        # Apply distributed_sjoin to every partition, then drop the helper column.
        S = df.map_partitions(
            distributed_sjoin, boundaries=self.boundaries,
            nyc_border=self.nyc, meta=meta).drop('geometry', axis=1)
        # The target is opened and the resulting file object (not a path
        # string) is handed to to_csv; this is the call that produces the
        # "url type not understood" ValueError shown in the question.
        f = self.output().open('w')
        S.to_csv(f)
        f.close()
and the problem, it looks like, is in the output part
As far as I understand, the problem is that dask does not accept Luigi file objects as a substitute for a filename string.
Dask defines DataFrame.to_csv(filename, **kwargs) and you are sending it a file instead of a filename. Replace those last three lines with:
S.to_csv(self.output().path)

Resources