Join returns an error for a dataframe with more than 100,000 rows

My dataframe has more than 100,000 rows, and when I run the code
df_new['SvcAdd.Type'] = df_new.groupby(['Routing'])['SvcAdd.Type'].transform(lambda x : ' '.join(x))
df_new = df_new.drop_duplicates()
df_new
it returns the following error, and I am lost here.
TypeError Traceback (most recent call last)
<ipython-input-...> in <module>
1 # concatenate the string
----> 2 df_new['SvcAdd.Type'] = df_new.groupby(['Routing'])['SvcAdd.Type'].transform(lambda x : ' '.join(x))
3
4 # drop duplicate data
5 df_new = df_new.drop_duplicates()
~\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in transform(self, func, engine, engine_kwargs, *args, **kwargs)
505
506 if not isinstance(func, str):
--> 507 return self._transform_general(func, *args, **kwargs)
508
509 elif func not in base.transform_kernel_allowlist:
~\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _transform_general(self, func, *args, **kwargs)
530 for name, group in self:
531 object.__setattr__(group, "name", name)
--> 532 res = func(group, *args, **kwargs)
533
534 if isinstance(res, (DataFrame, Series)):
<ipython-input-...> in <lambda>(x)
1 # concatenate the string
----> 2 df_new['SvcAdd.Type'] = df_new.groupby(['Routing'])['SvcAdd.Type'].transform(lambda x : ' '.join(x))
3
4 # drop duplicate data
5 df_new = df_new.drop_duplicates()
TypeError: sequence item 0: expected str instance, float found
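The TypeError says that ' '.join received a float where it expected a string, which usually means the 'SvcAdd.Type' column contains missing values (pandas stores NaN as float) or other non-string entries somewhere in the 100,000+ rows. A minimal sketch of a workaround, assuming the offending floats are NaN values that can be dropped (or the remainder cast to str) before joining:
# drop NaN entries and cast to str inside each group before joining
df_new['SvcAdd.Type'] = (
    df_new.groupby('Routing')['SvcAdd.Type']
          .transform(lambda x: ' '.join(x.dropna().astype(str)))
)
df_new = df_new.drop_duplicates()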

Related

AttributeError: 'ImageList' object has no attribute 'iloc'

I am trying to run this cell:
test = ImageList.from_df(test, img_path, suffix='.jpg')
data.add_test(test)
And I am getting this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_22/2201896868.py in <module>
----> 1 test = ImageList.from_df(test, img_path, suffix='.jpg')
2 data.add_test(test)
/opt/conda/lib/python3.7/site-packages/fastai/vision/data.py in from_df(cls, df, path, cols, folder, suffix, **kwargs)
283 "Get the filenames in `cols` of `df` with `folder` in front of them, `suffix` at the end."
284 suffix = suffix or ''
--> 285 res = super().from_df(df, path=path, cols=cols, **kwargs)
286 pref = f'{res.path}{os.path.sep}'
287 if folder is not None: pref += f'{folder}{os.path.sep}'
/opt/conda/lib/python3.7/site-packages/fastai/data_block.py in from_df(cls, df, path, cols, processor, **kwargs)
134 def from_df(cls, df:DataFrame, path:PathOrStr='.', cols:IntsOrStrs=0, processor:PreProcessors=None, **kwargs)->'ItemList':
135 "Create an `ItemList` in `path` from the inputs in the `cols` of `df`."
--> 136 inputs = df.iloc[:,df_names_to_idx(cols, df)]
137 assert not inputs.isna().any().any(), f"You have NaN values in column(s) {cols} of your dataframe, please fix it."
138 items = _maybe_squeeze(inputs.values) if len(df) > 1 else (inputs.values[0] if not isinstance(cols, Collection) or len(cols) == 1 else inputs.values)
AttributeError: 'ImageList' object has no attribute 'iloc'
Any help? I am trying to merge the test and train data.
To me it seems like ImageList.from_df() expects a pandas.DataFrame, but you are giving it an ImageList.
You can check with
type(test)
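If type(test) reports an ImageList (for example because the cell was already run once and test was overwritten), a minimal sketch of a fix is to keep the DataFrame and the ImageList under separate names; test_df below is a hypothetical name for the original pandas DataFrame:
test_items = ImageList.from_df(test_df, img_path, suffix='.jpg')  # test_df: the original DataFrame
data.add_test(test_items)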

ValueError: 'mean_squared_error' is not a valid scoring value

So, I have been working on my first ML project. As part of that, I have been trying out various models from scikit-learn, and I wrote this piece of code for a random forest model:
#Random Forest
reg = RandomForestRegressor(random_state=0, criterion = 'mse')
#Apply grid search for best parameters
params = {'randomforestregressor__n_estimators' : range(100, 500, 200),
          'randomforestregressor__min_samples_split' : range(2, 10, 3)}
pipe = make_pipeline(reg)
grid = GridSearchCV(pipe, param_grid = params, scoring='mean_squared_error', n_jobs=-1, iid=False, cv=5)
reg = grid.fit(X_train, y_train)
print('Best MSE: ', grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
tr_err = mean_squared_error(y_train_pred, y_train)
ts_err = mean_squared_error(y_test_pred, y_test)
print(tr_err, ts_err)
results_train['random_forest'] = tr_err
results_test['random_forest'] = ts_err
But, when I run this code, I get the following error:
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in get_scorer(scoring)
359 else:
--> 360 scorer = SCORERS[scoring]
361 except KeyError:
KeyError: 'mean_squared_error'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-149-394cd9e0c273> in <module>
5 pipe = make_pipeline(reg)
6 grid = GridSearchCV(pipe, param_grid = params, scoring='mean_squared_error', n_jobs=-1, iid=False, cv=5)
----> 7 reg = grid.fit(X_train, y_train)
8 print('Best MSE: ', grid.best_score_)
9 print('Best Parameters: ', grid.best_estimator_)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
652 cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
653
--> 654 scorers, self.multimetric_ = _check_multimetric_scoring(
655 self.estimator, scoring=self.scoring)
656
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _check_multimetric_scoring(estimator, scoring)
473 if callable(scoring) or scoring is None or isinstance(scoring,
474 str):
--> 475 scorers = {"score": check_scoring(estimator, scoring=scoring)}
476 return scorers, False
477 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in check_scoring(estimator, scoring, allow_none)
403 "'fit' method, %r was passed" % estimator)
404 if isinstance(scoring, str):
--> 405 return get_scorer(scoring)
406 elif callable(scoring):
407 # Heuristic to ensure user has not passed a metric
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in get_scorer(scoring)
360 scorer = SCORERS[scoring]
361 except KeyError:
--> 362 raise ValueError('%r is not a valid scoring value. '
363 'Use sorted(sklearn.metrics.SCORERS.keys()) '
364 'to get valid options.' % scoring)
ValueError: 'mean_squared_error' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.
So, I tried running it after removing scoring='mean_squared_error' from GridSearchCV(pipe, param_grid = params, n_jobs=-1, iid=False, cv=5). When I do that, the code runs perfectly and gives decent training and testing errors.
Regardless, I can't figure out why the scoring='mean_squared_error' parameter in GridSearchCV throws that error. What am I doing wrong?
According to the documentation:
All scorer objects follow the convention that higher return values are better than lower return values. Thus metrics which measure the distance between the model and the data, like metrics.mean_squared_error, are available as neg_mean_squared_error which return the negated value of the metric.
This means that you have to pass scoring='neg_mean_squared_error' in order to evaluate the grid search results with Mean Squared Error.
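A minimal sketch of the corrected call, based on the question's code (the iid argument is left out here, since it is not needed for the fix); note that best_score_ is then the negated MSE, so flip its sign when reporting:
grid = GridSearchCV(pipe, param_grid=params,
                    scoring='neg_mean_squared_error', n_jobs=-1, cv=5)
grid.fit(X_train, y_train)
print('Best MSE: ', -grid.best_score_)           # negate to recover the MSE
print('Best Parameters: ', grid.best_estimator_)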

How to do groupby filter in Dask

I am attempting to take a dask dataframe, group by column 'A' and remove the groups where there are fewer than MIN_SAMPLE_COUNT rows.
For example, the following code works in pandas:
import pandas as pd
import dask as da
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
grouped = x.groupby('A')
x = grouped.filter(lambda x: x['A'].count().astype(int) > MIN_SAMPLE_COUNT)
However, in Dask if I try something analogous:
import pandas as pd
import dask
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
x = dask.dataframe.from_pandas(x, npartitions=2)
grouped = x.groupby('A')
x = grouped.filter(lambda x: x['A'].count().astype(int) > MIN_SAMPLE_COUNT)
I get the following error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getattr__(self, key)
1162 try:
-> 1163 return self[key]
1164 except KeyError as e:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getitem__(self, key)
1153 # error is raised from pandas
-> 1154 g._meta = g._meta[key]
1155 return g
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\base.py in __getitem__(self, key)
274 if key not in self.obj:
--> 275 raise KeyError("Column not found: {key}".format(key=key))
276 return self._gotitem(key, ndim=1)
KeyError: 'Column not found: filter'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
<ipython-input-55-d8a969cc041b> in <module>()
1 # Remove sixty second blocks that have fewer than MIN_SAMPLE_COUNT samples.
2 grouped = dat.groupby('KPI_60_seconds')
----> 3 dat = grouped.filter(lambda x: x['KPI_60_seconds'].count().astype(int) > MIN_SAMPLE_COUNT)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\groupby.py in __getattr__(self, key)
1163 return self[key]
1164 except KeyError as e:
-> 1165 raise AttributeError(e)
1166
1167 @derived_from(pd.core.groupby.DataFrameGroupBy)
AttributeError: 'Column not found: filter'
The error message suggests that the filter method available in pandas has not been implemented in Dask (nor did I find it after a search).
Is there Dask functionality that captures what I am trying to do? I have gone through the Dask API and nothing stood out as what I need. I am currently using Dask 1.1.1.
Thank you for your help.
Fairly new to Dask myself. One way to achieve what you are trying to do could be as follows:
Dask version: 0.17.3
import pandas as pd
import dask.dataframe as dd
MIN_SAMPLE_COUNT = 1
x = pd.DataFrame([[1,2,3], [1,5,6], [2,8,9], [1,3,5]])
x.columns = ['A', 'B', 'C']
print("x (before):")
print(x) # still pandas
x = dd.from_pandas(x, npartitions=2)
grouped = x.groupby('A').B.count().reset_index()
grouped = grouped.rename(columns={'B': 'Count'})
y = dd.merge(x, grouped, on=['A'])
y = y[y.Count > MIN_SAMPLE_COUNT]
x = y[['A', 'B', 'C']]
print("x (after):")
print(x.compute()) # needs compute for conversion to pandas df
Output:
x (before):
A B C
0 1 2 3
1 1 5 6
2 2 8 9
3 1 3 5
x (after):
A B C
0 1 2 3
1 1 5 6
1 1 3 5
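Another way that may read closer to the pandas filter (a sketch only, not verified against every Dask version) is to compute the per-group counts as a small pandas Series and then keep only rows whose 'A' value belongs to a surviving group; ddf below is a fresh name for the dask dataframe, to avoid clashing with the reassigned x:
ddf = dd.from_pandas(pd.DataFrame([[1, 2, 3], [1, 5, 6], [2, 8, 9], [1, 3, 5]],
                                  columns=['A', 'B', 'C']), npartitions=2)
counts = ddf.groupby('A')['B'].count().compute()          # small pandas Series indexed by 'A'
keep = counts[counts > MIN_SAMPLE_COUNT].index.tolist()   # groups that survive the threshold
print(ddf[ddf['A'].isin(keep)].compute())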

I keep getting AttributeError in RandomSearchCV

x_tu = data_cls_tu.iloc[:,1:].values
y_tu = data_cls_tu.iloc[:,0].values
classifier = DecisionTreeClassifier()
parameters = [{"max_depth": [3,None],
"min_samples_leaf": np.random.randint(1,9),
"criterion": ["gini","entropy"]}]
randomcv = RandomizedSearchCV(estimator=classifier, param_distributions=parameters,
scoring='accuracy', cv=10, n_jobs=-1,
random_state=0)
randomcv.fit(x_tu, y_tu)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-17-fa8376cb54b8> in <module>()
11 scoring='accuracy', cv=10, n_jobs=-1,
12 random_state=0)
---> 13 randomcv.fit(x_tu, y_tu)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
616 n_splits = cv.get_n_splits(X, y, groups)
617 # Regenerate parameter iterable for each fit
--> 618 candidate_params = list(self._get_param_iterator())
619 n_candidates = len(candidate_params)
620 if self.verbose > 0:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in __iter__(self)
236 # in this case we want to sample without replacement
237 all_lists = np.all([not hasattr(v, "rvs")
--> 238 for v in self.param_distributions.values()])
239 rnd = check_random_state(self.random_state)
240
AttributeError: 'list' object has no attribute 'values'
Hi, I keep getting this error on the fit method of RandomizedSearchCV.
The same parameters worked with GridSearchCV, but GridSearchCV took 5 hours to complete.
x_tu and y_tu are both of numpy.ndarray type.
param_distributions must be a dict object (see the documentation), but you are passing a list containing a single dict. Remove the outer square brackets and it should work fine.
It should look like:
parameters = {"max_depth": [3, None],
              "min_samples_leaf": [np.random.randint(1,9)],
              "criterion": ["gini", "entropy"]}

isinf(mu) error in Scipy stats when calling std for exponweib?

I have been getting this error when I call std on a frozen exponweib distribution.
Here is the code:
d = st.exponweib
params = d.fit(y)
arg = params[:-2]
loc = params[-2]
scale = params[-1]
rv1 = d(arg,loc,scale)
print rv1.std()
The parameters after fitting are:
arg: (3.445136651705262, 0.10885378466279112)
loc: 11770.05
scale: 3.87424773976
Here is the error:
ValueError Traceback (most recent call last)
<ipython-input-637-4394814bbb8c> in <module>()
11 rv1 = d(arg,loc,scale)
12
---> 13 print rv1.std()
.../anaconda/lib/python2.7/site-packages/scipy/stats/_distn_infrastructure.pyc in std(self)
487
488 def std(self):
--> 489 return self.dist.std(*self.args, **self.kwds)
490
491 def moment(self, n):
.../anaconda/lib/python2.7/site-packages/scipy/stats/_distn_infrastructure.pyc in std(self, *args, **kwds)
1259 """
1260 kwds['moments'] = 'v'
-> 1261 res = sqrt(self.stats(*args, **kwds))
1262 return res
1263
.../anaconda/lib/python2.7/site-packages/scipy/stats/_distn_infrastructure.pyc in stats(self, *args, **kwds)
1032 mu = self._munp(1, *goodargs)
1033 mu2 = mu2p - mu * mu
-> 1034 if np.isinf(mu):
1035 # if mean is inf then var is also inf
1036 mu2 = np.inf
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Please let me know what is wrong with what I'm doing or how to avoid it.
The exponweib distribution has two required parameters, a and c, and two optional ones, loc and scale. When you call d(arg, loc, scale), arg is interpreted as a, loc is interpreted as c, and scale is interpreted as loc. And since your arg is a tuple of two elements, you end up with a pair of random variables, neither of which is what you want.
Solution: unpack the tuple and pass loc and scale by keyword: d(*arg, loc=loc, scale=scale). Or even simpler, use
rv1 = d(*params)
which unpacks all the parameters for you, without you having to extract and name them.
By the way, when you want to provide your own loc and scale for a random variable, it's better to pass them as named arguments, like d(3, 5, loc=90, scale=0.3). This avoids the situation you encountered, where a parameter gets interpreted as something else because an argument was missing. In your example, d(arg, loc=loc, scale=scale) would immediately throw "missing 1 required positional argument: 'c'" instead of silently using loc as c.
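A minimal sketch of the fix applied to the question's code (assuming y is the data array being fitted, as in the question):
import scipy.stats as st

d = st.exponweib
params = d.fit(y)     # returns (a, c, loc, scale)
rv1 = d(*params)      # unpack all four parameters in the right order
print(rv1.std())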
