Dask running out of memory even when partitions are much smaller than worker memory

I am using Dask for a project and I have managed to get it working by following some golden rules:
Use your native pandas code, but wrap it in map_partitions.
Index the columns you want to use for merge or groupby.
Partition your data so that each partition is smaller than your worker memory.
Avoid large task graphs (I try to save intelligently, e.g. after setting an index).
However, when I try to put together a summary of everything that has worked for me so far using random data, the workers exceed their memory limit and restart even for data smaller than my total memory, and this spills over until my computer also runs out of memory.
import pandas as pd
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import numpy as np


def create_files():
    size = 1_000_000
    # 100 chunks of 1M rows each (~100M rows total), appended to one parquet dataset
    for i in range(100):
        df = pd.DataFrame({"col1": np.random.randint(90_000, 100_000, size),
                           "col2": np.random.randint(101, 20_000, size),
                           "col3": np.random.uniform(0, 10_000, size)})
        # Select appropriate partitions
        ddf = dd.from_pandas(df, npartitions=1)
        ddf.to_parquet("test", ignore_divisions=True, engine="fastparquet",
                       overwrite=(i == 0), append=(i > 0))
    print("Created first file")

    # 10 chunks of 1M rows each (~10M rows total) for the right-hand side of the merge
    for i in range(10):
        df = pd.DataFrame({"col1": np.random.randint(90_000, 100_000, size),
                           "col4": np.random.uniform(0, 10_000, size)})
        # Select appropriate partitions
        ddf = dd.from_pandas(df, npartitions=1)
        ddf.to_parquet("test2", ignore_divisions=True, engine="fastparquet",
                       overwrite=(i == 0), append=(i > 0))
    print("Created second file")
    print("-------------------------------------------------------------")


def index_repartition():
    print("About to repartition")
    ddf = dd.read_parquet("test")
    ddf = ddf.repartition(npartitions=100)
    ddf = ddf.set_index("col1")
    ddf = _rebalance_ddf(ddf)
    print("save parquet")
    ddf.to_parquet("test")

    ddf = dd.read_parquet("test2")
    ddf = ddf.repartition(npartitions=50)
    ddf = ddf.set_index("col1")
    ddf = _rebalance_ddf(ddf)
    print("save parquet 2")
    ddf.to_parquet("test2")


# https://stackoverflow.com/questions/52642966/repartition-dask-dataframe-to-get-even-partitions
def _rebalance_ddf(ddf):
    """Repartition dask dataframe to ensure that partitions are roughly equal size.

    Assumes `ddf.index` is already sorted.
    """
    if not ddf.known_divisions:  # e.g. for read_parquet(..., infer_divisions=False)
        ddf = ddf.reset_index().set_index(ddf.index.name, sorted=True)
    index_counts = ddf.map_partitions(lambda _df: _df.index.value_counts().sort_index()).compute()
    index = np.repeat(index_counts.index, index_counts.values)
    divisions, _ = dd.io.io.sorted_division_locations(index, npartitions=ddf.npartitions)
    return ddf.repartition(divisions=divisions)


def main():
    ddf = dd.read_parquet("test")
    print(ddf.compute())
    print(ddf.memory_usage_per_partition(index=True, deep=False).compute())
    print(ddf.memory_usage(deep=True).sum().compute())
    ddf2 = dd.read_parquet("test2")
    print(ddf2.memory_usage_per_partition(index=True, deep=False).compute())
    print(ddf2.memory_usage(deep=True).sum().compute())

    def mapped_fun(data):
        # Per-partition pandas code: lagged, log-transformed copies of col3 grouped by col1
        for lag in range(4):
            data[f"col_{lag}"] = data.groupby("col1")["col3"].transform(lambda x: x.shift(lag)).apply(lambda x: np.log(x))
        return data

    ddf = ddf.map_partitions(mapped_fun)
    ddf = ddf.merge(ddf2, on=['col1'], how="left")
    ddf.to_parquet("result", engine="fastparquet")


if __name__ == "__main__":
    cluster = LocalCluster(
        n_workers=4,
        threads_per_worker=2,
        memory_limit='auto'
    )
    client = Client(cluster)
    create_files()
    index_repartition()
    main()
How is this possible? I am really confused because the big dataset (ddf) is 2.4 GB, all of which fits comfortably into 16 GB of memory, and each partition is about 23 MB, which is far less than the 4 GB allocated to each worker. I read that if there is a lot of repetition in the column used for merging, pandas can generate very large intermediate results (Merge large datasets with dask). I have given the index quite a large range of values, but the problem doesn't disappear.
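One thing worth keeping in mind about that last point: col1 is drawn from only about 10,000 distinct values, so every key is repeated many times on both sides, and a left join produces one output row per matching pair. A small, hypothetical pandas sketch of the blow-up (the single key and the 1,000-row frames are made up for illustration; the same effect happens per partition in dask):

import pandas as pd

# Hypothetical illustration: one key value repeated 1,000 times on each side.
left = pd.DataFrame({"col1": [42] * 1_000, "col3": range(1_000)})
right = pd.DataFrame({"col1": [42] * 1_000, "col4": range(1_000)})

merged = left.merge(right, on="col1", how="left")
# Each left row matches all 1,000 right rows, so the result has 1,000,000 rows.
print(len(left), len(right), len(merged))  # 1000 1000 1000000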

Related

Output of Chi2 is not showing in DataFrame format

I am trying to loop the chi-square test over my columns, but the outcome is not shown as required, that is, as a DataFrame.
All columns are coming out in one row.
Please help.
# Import the function
from scipy.stats import chi2_contingency

chi2_check = []
for i in df_clean.select_dtypes(['object']):
    if chi2_contingency(pd.crosstab(df_clean['Final_Comments'], df_clean[i]))[1] < 0.05:
        chi2_check.append('Reject Null Hypothesis')
    else:
        chi2_check.append('Fail to Reject Null Hypothesis')
res = pd.DataFrame(data=[df_clean.select_dtypes(['object']), chi2_check]).T
res.columns = ['Column', 'Hypothesis']
print(res)
res.columns
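For what it's worth, one way to get the result as a proper two-column DataFrame is to record each column name alongside its test outcome and build the frame from that list of rows; a minimal sketch, assuming df_clean and the Final_Comments column exist as in the snippet above:

from scipy.stats import chi2_contingency
import pandas as pd

rows = []
# Keep each tested column's name together with its result
for col in df_clean.select_dtypes(['object']).columns:
    p_value = chi2_contingency(pd.crosstab(df_clean['Final_Comments'], df_clean[col]))[1]
    hypothesis = 'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'
    rows.append({'Column': col, 'Hypothesis': hypothesis})

res = pd.DataFrame(rows, columns=['Column', 'Hypothesis'])  # one row per column
print(res)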

How to fine tune a masked language model?

I'm trying to follow the huggingface tutorial on fine tuning a masked language model (masking a set of words randomly and predicting them). But they assume that the dataset is in their system (can load it with from datasets import load_dataset; load_dataset("dataset_name")). However, my input dataset is a long string:
text = "This is an attempt of a great example. "
dataset = text * 3000
I followed their approach and tokenized it:
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
import torch
from transformers import DataCollatorForLanguageModeling

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_long_text(tokenizer, long_text):
    # Split on sentences, tokenize each one, then flatten into a single token sequence
    individual_sentences = long_text.split('.')
    tokenized_sentences_list = tokenizer(individual_sentences)['input_ids']
    tokenized_sequence = [x for xs in tokenized_sentences_list for x in xs]
    return tokenized_sequence

tokenized_sequence = tokenize_long_text(tokenizer, dataset)
Followed by chunking it into equal-length segments:
def chunk_long_tokenized_text(tokenizer_text, chunk_size):
    # Compute length of long tokenized texts
    total_length = len(tokenizer_text)
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    return [tokenizer_text[i : i + chunk_size] for i in range(0, total_length, chunk_size)]

chunked_sequence = chunk_long_tokenized_text(tokenized_sequence, 30)
Created a data collator for random masking:
# The collator expects a list of dicts, where each dict represents a single chunk of contiguous text
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
Example of how it works:
d = {}
d['input_ids'] = chunked_sequence[0]
d
>>> {'input_ids': [101,
     2023,
     2003,
     1037,
     2307,
     103, ...

for chunk in data_collator([d])["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
>>> '>>> [CLS] this is a great [MASK] [SEP] [CLS] this is a great [MASK] [SEP] [CLS] this is a great [MASK] [SEP] [CLS] this is a great [MASK] [SEP] [CLS] this'
However, the remaining steps (which I believe are just the training component) seem to only work with their Trainer method, which appears to only accept their Dataset objects.
How can this work with a dataset in the form of a string?
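One hedged way to bridge that gap is to wrap the chunked token ids in a datasets.Dataset via Dataset.from_dict and hand it to the Trainer together with the data collator. A minimal sketch reusing model, data_collator and chunked_sequence from above; the output directory and hyperparameters are placeholders, not values from the tutorial:

from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Each row is one fixed-size chunk of token ids; the MLM collator
# creates the masked inputs and the labels on the fly.
train_dataset = Dataset.from_dict({"input_ids": chunked_sequence})

training_args = TrainingArguments(
    output_dir="mlm_finetune",      # placeholder output directory
    per_device_train_batch_size=8,  # placeholder hyperparameters
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
trainer.train()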

Error while trying to transpose the matrix

Code for a single raster file:
import geopandas as gpd
#import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np

# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index=np.arange(0, 1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
dataset = rasterio.open(r'E:/anakonda/LST_day/MOD11A1.006_LST_Day_1km_doy2019082_aid0001.tif')
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape=(351, 545))
for records_date in Matrix.columns.tolist():
    a = Matrix
    LST_day_value = a.loc[int(row)][int(col)]
    table[records_date] = LST_day_value
transpose_mat = table.T
transpose_mat.rename(columns={0: 'LST_Day(Kel)'}, inplace=True)
transpose_mat.to_csv(r'E:/anakonda/LST_day' + '\\' + station_name + '.csv')
Error code lines:
LST_day_value = a.loc[int(row)][int(col)]
transpose_mat.to_csv(r'E:/anakonda/LST_day'+'\\'+station_name+'.csv')
Errors shown:
Undefined Name 'row' (pyflakes E)
Undefined Name 'col' (pyflakes E)
NameError: name 'transpose_mat' is not defined
I'm using the code below to create a raster time series for MODIS LST data. The code ran well until 'transposing the matrix'; the error is shown below the code. I'm new to Python, so kindly help me with this issue.
import os
import geopandas as gpd
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np

# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index=np.arange(0, 1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'E:/anakonda/Shape files/AAQ_st1/AAQ_ST1.shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
for files in os.listdir(r'E:/anakonda/LST_Night'):
    if files[-4:] == '.tif':
        dataset = rasterio.open(r'E:/anakonda/LST_Night' + '\\' + files)
        data_array = dataset.read(1)
        data_array_sparse = sparse.coo_matrix(data_array, shape=(351, 545))
        data = files[:-20]
        Matrix[data] = data_array_sparse.toarray().tolist()
        print('Processing is done for the raster: ' + files[:-20])
# Iterate through the stations and get the corresponding row and column for the related x, y coordinates
for index, row in stations.iterrows():
    station_name = str(row['Station'])
    lon = float(row['lon'])
    lat = float(row['lat'])
    x, y = (lon, lat)
    row, col = dataset.index(x, y)
    print('Processing: ' + station_name)
    # Pick the LST value from each stored raster array and record it into the previously created 'table'
    for records_date in Matrix.columns.tolist():
        a = Matrix[records_date]
        LST_Night_value = a.loc[int(row)][int(col)]
        table[records_date] = LST_Night_value
    transpose_mat = table.T
    transpose_mat.rename(columns={0: 'LstNight(Kel)'}, inplace=True)
    transpose_mat.to_csv(r'E:/anakonda/LST_Night' + '\\' + station_name + '.csv')
This is the error shown:
File "C:\Anaconda\envs\timeseries\lib\site-packages\pandas\core\indexes\range.py", line 357, in get_loc
    raise KeyError(key) from err
KeyError: 2278
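The KeyError: 2278 is the row index being looked up in Matrix, which only holds 351 rows per raster, so that station's computed row appears to fall outside the 351 x 545 grid (i.e. outside the raster extent). A hedged sketch of a guard that could go inside the station loop; row_idx/col_idx are new names so the row variable from stations.iterrows() is not clobbered:

row_idx, col_idx = dataset.index(x, y)
# Only sample the rasters if the station actually falls inside the grid
if 0 <= row_idx < dataset.height and 0 <= col_idx < dataset.width:
    for records_date in Matrix.columns.tolist():
        a = Matrix[records_date]
        table[records_date] = a.loc[int(row_idx)][int(col_idx)]
else:
    print('Station ' + station_name + ' falls outside the raster extent; skipping.')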

Getting the column names chosen after a feature selection method

Given a simple feature selection code below, I want to know the selected columns after the feature selection (The dataset includes a header V1 ... V20)
import pandas as pd
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression

def feature_selection(data):
    y = data['Class']
    X = data.drop(['Class'], axis=1)
    fs = SelectKBest(score_func=f_regression, k=10)
    # Applying feature selection
    X_selected = fs.fit_transform(X, y)
    # TODO: determine the columns being selected
    return X_selected

data = pd.read_csv("../dataset.csv")
new_data = feature_selection(data)
I appreciate any help.
I have used the iris dataset for my example but you can probably easily modify your code to match your use case.
The SelectKBest method has the scores_ attribute I used to sort the features.
Feel free to ask for any clarifications.
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.datasets import load_iris

def feature_selection(data):
    y = data[1]
    X = data[0]
    column_names = ["A", "B", "C", "D"]  # Here you should use your dataframe's column names
    k = 2
    fs = SelectKBest(score_func=f_regression, k=k)
    # Applying feature selection
    X_selected = fs.fit_transform(X, y)
    # Find top features
    # I create a list like [[ColumnName1, Score1], [ColumnName2, Score2], ...]
    # Then I sort in descending order on the score
    top_features = sorted(zip(column_names, fs.scores_), key=lambda x: x[1], reverse=True)
    print(top_features[:k])
    return X_selected

data = load_iris(return_X_y=True)
new_data = feature_selection(data)
I don't know of a built-in method, but it can be easily coded.
n_columns_selected = X_selected.shape[1]
# Pair each score with its column name and keep the names of the top-scoring columns
new_columns = [name for _, name in sorted(zip(fs.scores_, X.columns))[-n_columns_selected:]]
# new_columns order is perturbed; restore it using the column order of X as a reference
new_columns = sorted(new_columns, key=lambda x: list(X.columns).index(x))
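For reference, a fitted SelectKBest also exposes the selection mask directly through get_support(), so the chosen names can be read off without sorting scores; a short sketch using the question's fs and X:

# Boolean mask over the original columns, in their original order
selected_columns = X.columns[fs.get_support()]
# Or the integer positions of the selected columns
selected_idx = fs.get_support(indices=True)
print(list(selected_columns))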

how to convert pandas str.split call to dask

I have a dask data frame where the index is a string which looks like this:
12/09/2016 00:00;32.0046;-106.259
12/09/2016 00:00;32.0201;-108.838
12/09/2016 00:00;32.0224;-106.004
(it's basically a string encoding the datetime;latitude;longitude of the row)
I'd like to split that while still in the dask context to individual columns representing each of the fields.
I can do that with a pandas dataframe as:
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
But that doesn't work in dask for several of the attempts I've tried. If I directly substitute the df for a dask df I get the error:
'Index' object has no attribute 'str'
If I use the column name instead of index as:
forecastDf['date'], forecastDf['Lat'], forecastDf['Lon'] = forecastDf['dateLocation'].str.split(';', 2).str
I get the error:
TypeError: 'StringAccessor' object is not iterable
Here is a runnable example of this working in pandas
import pandas as pd
df = pd.DataFrame()
df['dateLocation'] = ['12/09/2016 00:00;32.0046;-106.259','12/09/2016 00:00;32.0201;-108.838','12/09/2016 00:00;32.0224;-106.004']
df = df.set_index('dateLocation')
df['date'], df['Lat'], df['Lon'] = df.index.str.split(';', 2).str
df.head()
Here is the error I get if I directly convert that to dask
import dask.dataframe as dd
dd = dd.from_pandas(df, npartitions=1)
dd['date'], dd['Lat'], dd['Lon'] = dd.index.str.split(';', 2).str
>>TypeError: 'StringAccessor' object is not iterable
# str.partition only splits on the first ';', so partition the remainder again for Lat and Lon
forecastDf['date'] = forecastDf['dateLocation'].str.partition(';')[0]
rest = forecastDf['dateLocation'].str.partition(';')[2]
forecastDf['Lat'] = rest.str.partition(';')[0]
forecastDf['Lon'] = rest.str.partition(';')[2]
Let me know if this works for you!
First make sure the column is string dtype
forecastDD['dateLocation'] = forecastDD['dateLocation'].astype('str')
Then you can use this to split in dask
splitColumns = client.persist(forecastDD['dateLocation'].str.split(';',2))
You can then index the columns in the new dataframe splitColumns and add them back to the original data frame.
forecastDD = forecastDD.assign(
    date=splitColumns.apply(lambda x: x[0], meta=('date', np.dtype(str))),
    Lat=splitColumns.apply(lambda x: x[1], meta=('Lat', 'f8')),
    Lon=splitColumns.apply(lambda x: x[2], meta=('Lon', 'f8')),
)
Unfortunately I couldn't figure out how to do it without persisting and creating the temporary dataframe.
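Alternatively, recent dask versions support expand=True on str.split as long as n is given, which should avoid the per-element apply calls entirely; a minimal sketch, assuming the delimiter and column order from the question and that Lat/Lon should end up numeric:

# n=2 tells dask the split produces exactly three columns, which expand=True requires
parts = forecastDD['dateLocation'].str.split(';', n=2, expand=True)
forecastDD = forecastDD.assign(
    date=parts[0],
    Lat=parts[1].astype('float64'),
    Lon=parts[2].astype('float64'),
)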

Resources