How to speed up datetime parsing in pandas read_csv?

I currently parse a text file with the following:
f = lambda s: datetime.datetime.strptime(s, '%Y-%m-%d-%H-%M-%S')
dframe = pd.read_csv(
    fname, sep=' ', header=None,
    names=('A', 'B', 'C', 'D', 'E'),
    use_unsigned=True, parse_dates=True, index_col=0, date_parser=f)
which takes about 5.70 s for a single file.
Can I speed up the datetime parsing?
A line from the file looks like:
2015-04-08-11-23-27 12420.8 12430.3 12527.0 12394.2 A

You should be able to speed it up considerably by calling to_datetime manually instead of passing your lambda as date_parser:
>>> %time df = pd.read_csv(fname, delim_whitespace=True, header=None, names=('A', 'B', 'C', 'D', 'E'), use_unsigned=True, parse_dates=True, index_col=0, date_parser=f)
CPU times: user 9.16 s, sys: 39.9 ms, total: 9.2 s
Wall time: 9.2 s
vs.
>>> %time df2 = pd.read_csv(fname, delim_whitespace=True, header=None, names=('A', 'B', 'C', 'D', 'E'), use_unsigned=True, parse_dates=False, index_col=0)
CPU times: user 416 ms, sys: 20 ms, total: 436 ms
Wall time: 435 ms
>>> %time df2.index = pd.to_datetime(df2.index, format="%Y-%m-%d-%H-%M-%S")
CPU times: user 2.72 s, sys: 4 ms, total: 2.72 s
Wall time: 2.72 s
>>>
>>> df.equals(df2)
True
>>> (2.72+0.435)/9.2
0.3429347826086957
(I'm using delim_whitespace=True because that tends to be modestly faster in situations like this.)
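Putting the two steps together, a minimal sketch of this approach (same fname as above; I leave out use_unsigned for brevity):
import pandas as pd
# Read without any date parsing; the index stays as plain strings for now.
df = pd.read_csv(fname, delim_whitespace=True, header=None,
                 names=('A', 'B', 'C', 'D', 'E'), index_col=0)
# Convert the whole index in one vectorized call; an explicit format string
# avoids per-row format inference.
df.index = pd.to_datetime(df.index, format='%Y-%m-%d-%H-%M-%S')
Based on the timings above, that works out to roughly a 3x speedup over parsing the dates inside read_csv.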

Related

Dask - custom aggregation

I just learned about Dask yesterday and am now migrating my work over from pandas... but I am stuck trying to translate a simple custom aggregation.
I don't fully (or probably even at all) understand how the Series are represented inside those internal lambda functions; normally I would step in with breakpoint() to inspect them, but that isn't an option here. When I try to access an element of x by index, I get a "Meta" error.
Any help/pointers would be appreciated.
import dask.dataframe as dd
import pandas as pd
# toy df
df = dd.from_pandas(pd.DataFrame(dict(a=[1, 1, 2, 2],
                                      b=[100, 100, 200, 250])),
                    npartitions=2)
df.compute()
   a    b
0  1  100
1  1  100
2  2  200
3  2  250
# PART 1 - for conceptual understanding - replicating trivial list
# intended result
df.groupby('a').agg(list).compute()
            b
a
1  [100, 100]
2  [200, 250]
# replicate manually with custom aggregation
list_manual = dd.Aggregation('list_manual',
                             lambda x: list(x),
                             lambda x1: list(x1))
res = df.groupby('a').agg(list_manual).compute()
res
b
0 (0, [(1, 0 100\n1 100\nName: b, dtype: i...
res.b[0]
(0,
0 (1, [100, 100])
0 (2, [200, 250])
Name: list_manual-b-6785578d38a71d6dbe0d1ac6515538f7, dtype: object)
# looks like the grouping tuple wasn't even unpacked (1, 2 groups are there)
# ... instead it all got collapsed into one thing
# PART 2 - custom function
# with pandas - intended behavior
collect_uniq = lambda x: list(set(x))
dfp = df.compute()
dfp.groupby('a').agg(collect_uniq)
            b
a
1       [100]
2  [200, 250]
#now trying the same in Dask
collect_uniq_dask = dd.Aggregation('collect_uniq_dask',
                                   lambda x: list(set(x)),
                                   lambda x1: list(x1))
res = df.groupby('a').agg(collect_uniq_dask).compute()
# gives TypeError("unhashable type: 'Series'")
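For what it's worth, the way dd.Aggregation is documented, chunk and agg are not handed plain Series: chunk receives a pandas SeriesGroupBy for each partition, and agg receives a SeriesGroupBy over the concatenated chunk results, which is why calling set()/list() on them directly fails. A sketch following the chunk/agg/finalize pattern from the Dask docs (reusing df from above) would be:
collect_uniq_dask = dd.Aggregation(
    'collect_uniq_dask',
    # chunk: gets one SeriesGroupBy per partition; return per-group unique lists
    chunk=lambda s: s.apply(lambda x: list(set(x))),
    # agg: gets a SeriesGroupBy over the chunk results; regroup and concatenate the lists
    agg=lambda s0: s0.obj.groupby(level=list(range(s0.obj.index.nlevels))).sum(),
    # finalize: dedupe across partitions
    finalize=lambda s1: s1.apply(lambda lst: list(set(lst))),
)
res = df.groupby('a').agg(collect_uniq_dask).compute()
Here .sum() on the intermediate Series of lists concatenates the per-partition lists before the final dedupe.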

How do I operate on groups returned by Dask's group by?

I have the following table.
   value category
0      2        A
1     20        B
2      4        A
3     40        B
I want to add a mean column that contains the mean of the values for each category.
   value category  mean
0      2        A   3.0
1     20        B  30.0
2      4        A   3.0
3     40        B  30.0
I can do this in pandas like so
p = pd.DataFrame({"value": [2, 20, 4, 40], "category": ["A", "B", "A", "B"]})
groups = []
for _, group in p.groupby("category"):
    group.loc[:, "mean"] = group.loc[:, "value"].mean()
    groups.append(group)
pd.concat(groups).sort_index()
How do I do the same thing in Dask?
I can't use the pandas functions as-is because you can't enumerate over a groupby object in Dask. This
import dask.dataframe as dd
d = dd.from_pandas(p, chunksize=100)
list(d.groupby("category"))
raises KeyError: 'Column not found: 0'.
I can use an apply function to calculate the mean in Dask.
import dask.dataframe as dd
d = dd.from_pandas(p, chunksize=100)
q = d.groupby(["category"]).apply(lambda group: group["value"].mean(), meta="object")
q.compute()
returns
category
A 3.0
B 30.0
dtype: float64
But I can't figure out how to fold these back into the rows of the original table.
I would use a merge to achieve this operation:
import dask.dataframe as dd
import pandas as pd
df = pd.DataFrame({
    'value': [2, 20, 4, 40],
    'category': ['A', 'B', 'A', 'B']
})
ddf = dd.from_pandas(df, npartitions=1)
# Lazy-compute mean per category
mean_by_category = (ddf
                    .groupby('category')
                    .agg({'value': 'mean'})
                    .rename(columns={'value': 'mean'})
                    ).persist()
mean_by_category.head()
# Assign 'mean' value to each corresponding category
ddf = ddf.merge(mean_by_category, left_on='category', right_index=True)
ddf.head()
Which should then output:
  category  value  mean
0        A      2   3.0
2        A      4   3.0
1        B     20  30.0
3        B     40  30.0
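As an aside, the pandas version in the question can also be written without the loop by using transform, which broadcasts each group's mean back onto the original rows:
import pandas as pd
p = pd.DataFrame({"value": [2, 20, 4, 40], "category": ["A", "B", "A", "B"]})
# transform('mean') returns one value per original row, aligned on the index
p["mean"] = p.groupby("category")["value"].transform("mean")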

How to do groupby filter in Dask

I am attempting to take a dask dataframe, group by column 'A' and remove the groups where there are fewer than MIN_SAMPLE_COUNT rows.
For example, the following code works in pandas:
import pandas as pd
import dask as da
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
grouped = x.groupby('A')
x = grouped.filter(lambda x: x['A'].count().astype(int) > MIN_SAMPLE_COUNT)
However, in Dask if I try something analogous:
import pandas as pd
import dask.dataframe
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
x = dask.dataframe.from_pandas(x, npartitions=2)
grouped = x.groupby('A')
x = grouped.filter(lambda x: x['A'].count().astype(int) > MIN_SAMPLE_COUNT)
I get the following error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getattr__(self, key)
1162 try:
-> 1163 return self[key]
1164 except KeyError as e:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getitem__(self, key)
1153 # error is raised from pandas
-> 1154 g._meta = g._meta[key]
1155 return g
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\base.py in __getitem__(self, key)
274 if key not in self.obj:
--> 275 raise KeyError("Column not found: {key}".format(key=key))
276 return self._gotitem(key, ndim=1)
KeyError: 'Column not found: filter'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-55-d8a969cc041b> in <module>()
1 # Remove sixty second blocks that have fewer than MIN_SAMPLE_COUNT samples.
2 grouped = dat.groupby('KPI_60_seconds')
----> 3 dat = grouped.filter(lambda x: x['KPI_60_seconds'].count().astype(int) > MIN_SAMPLE_COUNT)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getattr__(self, key)
1163 return self[key]
1164 except KeyError as e:
-> 1165 raise AttributeError(e)
1166
1167     @derived_from(pd.core.groupby.DataFrameGroupBy)
AttributeError: 'Column not found: filter'
The error message suggests that the filter method used in Pandas has not been implemented in Dask (nor did I find it after a search).
Is there Dask functionality that captures what I am looking to do? I have gone through the Dask API and nothing stood out to me as what I need. I am currently using Dask 1.1.1.
Fairly new to Dask myself. One way to achieve what you are trying to do is as follows:
Dask version: 0.17.3
import pandas as pd
import dask.dataframe as dd
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
print("x (before):")
print(x) # still pandas
x = dd.from_pandas(x, npartitions=2)
grouped = x.groupby('A').B.count().reset_index()
grouped = grouped.rename(columns={'B': 'Count'})
y = dd.merge(x, grouped, on=['A'])
y = y[y.Count > MIN_SAMPLE_COUNT]
x = y[['A', 'B', 'C']]
print("x (after):")
print(x.compute()) # needs compute for conversion to pandas df
Output:
x (before):
   A  B  C
0  1  2  3
1  1  5  6
2  2  8  9
3  1  3  5
x (after):
   A  B  C
0  1  2  3
1  1  5  6
1  1  3  5
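The same count-then-merge pattern can be wrapped in a small helper so it is reusable; this is just a sketch (the helper name is arbitrary, and it uses groupby().size() rather than counting column B, which should give the same result here):
def filter_min_count(ddf, group_col, min_count):
    # Count rows per group, broadcast the counts back onto the rows via a
    # merge, then keep only rows whose group is large enough.
    counts = ddf.groupby(group_col).size().to_frame('group_count').reset_index()
    merged = dd.merge(ddf, counts, on=group_col)
    return merged[merged.group_count > min_count].drop('group_count', axis=1)
x = filter_min_count(x, 'A', MIN_SAMPLE_COUNT)
print(x.compute())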

SelectKBest sklearn varying computation times

I was trying to do feature selection on a synthetic multilabel dataset. I observed that the computation time when giving the complete dataset to SelectKBest was much higher than the time required when giving one feature at a time. In the example below, only one label (target variable) is considered.
import numpy as np
import pandas as pd
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_selection import chi2, SelectKBest, f_classif
# Generate a multilabel dataset
x, y = make_multilabel_classification(n_samples=40000, n_features=1000, sparse=False,
                                      n_labels=4, n_classes=9, return_indicator='dense',
                                      allow_unlabeled=True, random_state=1000)
X_df = pd.DataFrame(x)
y_df = pd.DataFrame(y)
%%time
selected_features2 = []
for label in y_df.columns.tolist()[0:1]:
    selector = SelectKBest(f_classif, k='all')
    selected_features = []
    for ftr in X_df.columns.tolist():
        selector.fit(X_df[[ftr]], y_df[label])
        selected_features.extend(np.round(selector.scores_, 4))
CPU times: user 3.2 s, sys: 0 ns, total: 3.2 s
Wall time: 3.18 s
%%time
sel_features = []
for label in y_df.columns.tolist()[0:1]:
    selector = SelectKBest(f_classif, k='all')
    selector.fit(X_df, y_df[label])
    sel_features.extend(np.round(selector.scores_, 4))
CPU times: user 208 ms, sys: 37.2 s, total: 37.4 s
Wall time: 37.4 s
%%time
sel_features = []
for label in y_df.columns.tolist()[0:1]:
    selector = SelectKBest(f_classif, k='all')
    selector.fit(X_df.as_matrix(), y_df[label].as_matrix())
    sel_features.extend(np.round(selector.scores_, 4))
CPU times: user 220 ms, sys: 35.4 s, total: 35.7 s
Wall time: 35.6 s
Why is there this much difference in the computation times?
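For reference, SelectKBest(f_classif, k='all') calls f_classif once on the full matrix and stores the per-feature scores, and f_classif already returns one ANOVA F-score per feature in a single call, so the per-feature loop is not needed just to obtain individual scores. A minimal sketch, reusing X_df and y_df from above:
# One call scores all features at once; this F array is what
# SelectKBest exposes as selector.scores_.
F, pval = f_classif(X_df.values, y_df[0].values)
print(np.round(F[:5], 4))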

xarray: rolling mean of a dask array gives conflicting sizes for data and coordinate

I am trying to apply a rolling mean to a dask array within xarray. My issue may lie in the rechunking before the rolling mean. I am getting a ValueError about conflicting sizes between data and coordinates. However, this arises within the rolling operation, as I don't think there is any conflict between the data and coords of the array before it goes into the rolling operation.
Apologies for not creating synthetic test data, but my project data is quick to play with:
import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models'
                                '/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods',
                                chunks={'L': 1, 'S': 1})
da = remote_data.isel(P=0,L=0,M=0,X=0,Y=0)
da_day_clim = da.groupby('S.dayofyear').mean('S')
print(da_day_clim)
#<xarray.DataArray 'zg' (dayofyear: 366)>
#dask.array<shape=(366,), dtype=float32, chunksize=(1,)>
#Coordinates:
# L timedelta64[ns] 12:00:00
# Y float32 -90.0
# M float32 1.0
# X float32 0.0
# P int32 500
# * dayofyear (dayofyear) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
# Do a 31-day rolling mean
# da_day_clim.rolling(dayofyear=31, center=True).mean()
# This brings up:
#ValueError: The overlapping depth 30 is larger than your
#smallest chunk size 1. Rechunk your array
#with a larger chunk size or a chunk size that
#more evenly divides the shape of your array.
# Read http://xarray.pydata.org/en/stable/dask.html
# and found http://xarray.pydata.org/en/stable/generated/xarray.Dataset.chunk.html#xarray.Dataset.chunk
# I could make a little PR to add the .chunk() hint into the ValueError message. Thoughts?
# Rechunk. Played around with a few values but decided on
# the len of dayofyear
da_day_clim2 = da_day_clim.chunk({'dayofyear': 366})
print(da_day_clim2)
#<xarray.DataArray 'zg' (dayofyear: 366)>
#dask.array<shape=(366,), dtype=float32, chunksize=(366,)>
#Coordinates:
# L timedelta64[ns] 12:00:00
# Y float32 -90.0
# M float32 1.0
# X float32 0.0
# P int32 500
# * dayofyear (dayofyear) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
# Rolling mean on this
da_day_clim_smooth = da_day_clim2.rolling(dayofyear=31, center=True).mean()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-57-6acf382cdd3d> in <module>()
4 da_day_clim = da.groupby('S.dayofyear').mean('S')
5 da_day_clim2 = da_day_clim.chunk({'dayofyear': 366})
----> 6 da_day_clim_smooth = da_day_clim2.rolling(dayofyear=31, center=True).mean()
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/rolling.py in wrapped_func(self, **kwargs)
307 if self.center:
308 values = values[valid]
--> 309 result = DataArray(values, self.obj.coords)
310
311 return result
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, encoding, fastpath)
224
225 data = as_compatible_data(data)
--> 226 coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
227 variable = Variable(dims, data, attrs, encoding, fastpath=True)
228
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
79 raise ValueError('conflicting sizes for dimension %r: '
80 'length %s on the data but length %s on '
---> 81 'coordinate %r' % (d, sizes[d], s, k))
82
83 if k in sizes and v.shape != (sizes[k],):
ValueError: conflicting sizes for dimension 'dayofyear': length 351 on the data but length 366 on coordinate 'dayofyear'
The length 351 is related to 366-351=15 (half the window).
This turned out to be a bug in Xarray and was fixed in https://github.com/pydata/xarray/pull/2122
The fix will be in Xarray 0.10.4, which is slated for imminent release.
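If upgrading is not an option right away, a simple workaround (a sketch, assuming the 366-value climatology fits comfortably in memory, which it does here) is to load the dask-backed result before rolling, which avoids the dask rolling code path where the bug lives:
# Pull the small climatology into memory, then do the rolling mean on numpy data.
da_day_clim_smooth = da_day_clim.load().rolling(dayofyear=31, center=True).mean()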
