MMDDrift command from alibi_detect not working - monitoring

I run this command, "preds = cd.predict(sea_noise.X1)", and the following error appears:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-25-534d5d5bb2c8> in <module>
----> 1 preds = cd.predict(sea_noise.X1)
6 frames
/usr/local/lib/python3.8/dist-packages/alibi_detect/utils/tensorflow/distance.py in squared_pairwise_distance(x, y, a_min, a_max)
28 x2 = tf.reduce_sum(x ** 2, axis=-1, keepdims=True)
29 y2 = tf.reduce_sum(y ** 2, axis=-1, keepdims=True)
---> 30 dist = x2 + tf.transpose(y2, (1, 0)) - 2. * x @ tf.transpose(y, (1, 0))
31 return tf.clip_by_value(dist, a_min, a_max)
32
InvalidArgumentError: Exception encountered when calling layer "gaussian_rbf" (type GaussianRBF).
Matrix size-incompatible: In[0]: [999,2], In[1]: [1,999] [Op:MatMul]
Call arguments received by layer "gaussian_rbf" (type GaussianRBF):
• x=tf.Tensor(shape=(999, 2), dtype=float32)
• y=0 0.174903
1 1.123120
2 4.601012
3 1.871278
4 6.550663
...
994 6.276959
995 2.788704
996 1.803116
997 4.313775
998 6.234055
Name: X1, Length: 999, dtype: float64
• infer_sigma=False
I am running the MMDDrift detector, specifically its predict command:
from alibi_detect.cd import MMDDrift
cd = MMDDrift(X_ref, backend='tensorflow', p_val=.05)
preds = cd.predict(sea_noise.X1)
But it doesn't work. The file seems to be OK, as I took it from a database that someone else had used for the same purpose.
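From the traceback, the likely cause is that cd.predict receives a 1-D pandas Series (sea_noise.X1, shape (999,)) while the detector expects a 2-D array; In[0]: [999,2] suggests X_ref has two features. A minimal sketch of a fix, assuming sea_noise is a DataFrame whose columns match the features of X_ref (the column names below are placeholders):

import numpy as np

# Pass a 2-D float array with the same feature columns as X_ref,
# not a single 1-D Series:
x_test = sea_noise[['X1', 'X2']].to_numpy().astype(np.float32)
preds = cd.predict(x_test)

# If X_ref is genuinely one-dimensional, reshape the Series instead:
# x_test = sea_noise.X1.to_numpy().reshape(-1, 1).astype(np.float32)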

Related

Why does max_samples not accept float type?

I am doing machine learning. Here I want to find the best triple (max_samples, n_trees, threshold) that gives the greatest performance in terms of area under the ROC curve and area under the precision-recall curve.
Here is the code:
def meilleur_triplet(x, classes):
    for n_trees in np.arange(100, 160, 10):
        for sample_size in np.arange(0.1, 1, 0.1):
            for threshold in np.arange(0.4, 1, 0.1):
                model = IforestLocal(sample_size, n_trees)
                model.fit(x)
                y_pred, y_score = model.predict(x, threshold)
                auc = roc_auc_score(classes, y_pred)
                auc_pr = average_precision_score(classes, y_pred)
Now, when I use max_samples with a range of ints I don't get an error; however, with floats I get the following error:
TypeError Traceback (most recent call last)
Input In [201], in <cell line: 1>()
----> 1 meilleur_triplet(X_glass,y_glass)
Input In [200], in meilleur_triplet(x, classes)
6 for threshold in np.arange(0.4,1,0.1):#(0.4,1,0.1)
8 model=IforestLocal(sample_size,n_trees)
----> 9 model.fit(x)
File ~\Desktop\THESE\Maurras\Code_Maurras\iforest_D.py:45, in IsolationForest.fit(self, X)
42 self.sample_size = len_x
44 for i in range(self.n_trees):
---> 45 sample_idx = random.sample(list(range(len_x)), self.sample_size)
46 # TODO: Must be deleted before compute the memory consumption of the methods
47 self.samples.append(sample_idx)
File ~\anaconda3\lib\random.py:450, in Random.sample(self, population, k, counts)
448 if not 0 <= k <= n:
449 raise ValueError("Sample larger than population or is negative")
--> 450 result = [None] * k
451 setsize = 21 # size of a small set minus size of an empty list
452 if k > 5:
TypeError: can't multiply sequence by non-int of type 'numpy.float64'
This is where I called the function
meilleur_triplet(X_glass,y_glass)
Thank you, please help me.
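The error comes from random.sample, whose k argument must be an int; the numpy.float64 values produced by np.arange(0.1, 1, 0.1) cannot be used directly. A minimal sketch of a fix, assuming sample_size is meant as a fraction of the dataset:

# Inside IsolationForest.fit, convert the fraction to an integer count
# before sampling; self.sample_size is assumed to be a float in (0, 1]:
k = max(1, int(self.sample_size * len_x))
sample_idx = random.sample(range(len_x), k)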

Decision Tree - Exporting image via Graphviz error

I'm trying to build a decision tree using grid search and a pipeline, but I get an error when I try to export the image using graphviz. I looked online and couldn't find anything; one potential problem would have been not using the best_estimator_ instance, but I did use it in this case.
Everything works (getting accuracy and other metrics) except the exporting graph part.
def TreeOpt(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    std_scl = StandardScaler()
    dec_tree = tree.DecisionTreeClassifier()
    pipe = Pipeline(steps=[('std_slc', std_scl),
                           ('dec_tree', dec_tree)])
    criterion = ['gini', 'entropy']
    max_depth = list(range(1, 15))
    parameters = dict(dec_tree__criterion=criterion,
                      dec_tree__max_depth=max_depth)
    tree_gs = GridSearchCV(pipe, parameters)
    tree_gs.fit(X_train, y_train)
    export_graphviz(
        tree_gs.best_estimator_,
        out_file="dec_tree.dot",
        feature_names=None,
        class_names=None,
        filled=True)
But I get
<ipython-input-2-bb91ec6ba0d9> in <module>
37 filled=True)
38
---> 39 DecTreeOptimizer(X = df.drop(['quality'], axis=1), y = df.quality)
40
<ipython-input-2-bb91ec6ba0d9> in DecTreeOptimizer(X, y)
30 print("Best score: " + str(tree_GS.best_score_))
31
---> 32 export_graphviz(
33 tree_GS.best_estimator_,
34 out_file=("dec_tree.dot"),
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_export.py in export_graphviz(decision_tree, out_file, max_depth, feature_names, class_names, label, filled, leaves_parallel, impurity, node_ids, proportion, rotate, rounded, special_characters, precision)
767 """
768
--> 769 check_is_fitted(decision_tree)
770 own_file = False
771 return_string = False
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
1096
1097 if not attrs:
-> 1098 raise NotFittedError(msg % {'name': type(estimator).__name__})
1099
1100
NotFittedError: This Pipeline instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
After long searches, I finally found the answer here: Plot best decision tree with pipeline and GridsearchCV.
The best_estimator_ attribute returns the whole fitted Pipeline rather than the tree itself, so I just had to index into it: best_estimator_[1] (and then I found a whole other lot of problems with my code, but that's part 2).
I will leave this here in case anyone else is going to need it. Cheers!
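For reference, a minimal sketch of the corrected export, assuming the pipeline layout from the question (where 'dec_tree' is the name of the tree step):

# Pass the fitted tree step, not the whole Pipeline:
export_graphviz(
    tree_gs.best_estimator_.named_steps['dec_tree'],  # equivalently: tree_gs.best_estimator_[1]
    out_file="dec_tree.dot",
    filled=True)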

ValueError: 'mean_squared_error' is not a valid scoring value

So, I have been working on my first ML project, and as part of that I have been trying out various models from scikit-learn. I wrote this piece of code for a random forest model:
#Random Forest
reg = RandomForestRegressor(random_state=0, criterion = 'mse')
#Apply grid search for best parameters
params = {'randomforestregressor__n_estimators': range(100, 500, 200),
          'randomforestregressor__min_samples_split': range(2, 10, 3)}
pipe = make_pipeline(reg)
grid = GridSearchCV(pipe, param_grid = params, scoring='mean_squared_error', n_jobs=-1, iid=False, cv=5)
reg = grid.fit(X_train, y_train)
print('Best MSE: ', grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
tr_err = mean_squared_error(y_train_pred, y_train)
ts_err = mean_squared_error(y_test_pred, y_test)
print(tr_err, ts_err)
results_train['random_forest'] = tr_err
results_test['random_forest'] = ts_err
But, when I run this code, I get the following error:
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in get_scorer(scoring)
359 else:
--> 360 scorer = SCORERS[scoring]
361 except KeyError:
KeyError: 'mean_squared_error'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-149-394cd9e0c273> in <module>
5 pipe = make_pipeline(reg)
6 grid = GridSearchCV(pipe, param_grid = params, scoring='mean_squared_error', n_jobs=-1, iid=False, cv=5)
----> 7 reg = grid.fit(X_train, y_train)
8 print('Best MSE: ', grid.best_score_)
9 print('Best Parameters: ', grid.best_estimator_)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
652 cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
653
--> 654 scorers, self.multimetric_ = _check_multimetric_scoring(
655 self.estimator, scoring=self.scoring)
656
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _check_multimetric_scoring(estimator, scoring)
473 if callable(scoring) or scoring is None or isinstance(scoring,
474 str):
--> 475 scorers = {"score": check_scoring(estimator, scoring=scoring)}
476 return scorers, False
477 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in check_scoring(estimator, scoring, allow_none)
403 "'fit' method, %r was passed" % estimator)
404 if isinstance(scoring, str):
--> 405 return get_scorer(scoring)
406 elif callable(scoring):
407 # Heuristic to ensure user has not passed a metric
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in get_scorer(scoring)
360 scorer = SCORERS[scoring]
361 except KeyError:
--> 362 raise ValueError('%r is not a valid scoring value. '
363 'Use sorted(sklearn.metrics.SCORERS.keys()) '
364 'to get valid options.' % scoring)
ValueError: 'mean_squared_error' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.
So, I tried removing scoring='mean_squared_error' from the GridSearchCV call. When I do that, the code runs perfectly and gives decent enough training and testing errors.
Regardless, I can't figure out why passing scoring='mean_squared_error' to GridSearchCV throws that error. What am I doing wrong?
According to the documentation:
All scorer objects follow the convention that higher return values are better than lower return values. Thus metrics which measure the distance between the model and the data, like metrics.mean_squared_error, are available as neg_mean_squared_error which return the negated value of the metric.
This means that you have to pass scoring='neg_mean_squared_error' in order to evaluate the grid search results with Mean Squared Error.
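A minimal sketch of the corrected call; since the scorer returns the negated MSE, flip the sign when reporting best_score_:

grid = GridSearchCV(pipe, param_grid=params, scoring='neg_mean_squared_error',
                    n_jobs=-1, iid=False, cv=5)
reg = grid.fit(X_train, y_train)
print('Best MSE: ', -grid.best_score_)  # negate: the scorer maximizes -MSE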

RuntimeError: output with shape [512] doesn't match the broadcast shape [1, 512, 1, 512] while extracting feature vector using pytorch

I am not able to resolve this error. This code is taken from https://becominghuman.ai/extract-a-feature-vector-for-any-image-with-pytorch-9717561d1d4c
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image
pic_one = '/content/drive/My Drive/Video_Recommender/zframe1.jpg'
pic_two = '/content/drive/My Drive/Video_Recommender/zframe2.jpg'
model = models.resnet18(pretrained=True)
layer = model._modules.get('avgpool')
scaler = transforms.Scale((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()
def get_vector(image_name):
    # 1. Load the image with Pillow library
    img = Image.open(image_name)
    # 2. Create a PyTorch Variable with the transformed image
    t_img = Variable(normalize(to_tensor(scaler(img))).unsqueeze(0))
    # 3. Create a vector of zeros that will hold our feature vector
    #    The 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)
    # 4. Define a function that will copy the output of a layer
    def copy_data(m, i, o):
        my_embedding.copy_(o.data)
    # 5. Attach that function to our selected layer
    h = layer.register_forward_hook(copy_data)
    # 6. Run the model on our transformed image
    model(t_img)
    # 7. Detach our copy function from the layer
    h.remove()
    # 8. Return the feature vector
    return my_embedding
pic_one_vector = get_vector(pic_one)
pic_two_vector = get_vector(pic_two)
Error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-41-ca2d66de2d9c> in <module>()
----> 1 pic_one_vector = get_vector(pic_one)
2 pic_two_vector = get_vector(pic_two)
5 frames
<ipython-input-40-a45affe9d8f7> in get_vector(image_name)
13 h = layer.register_forward_hook(copy_data)
14 # 6. Run the model on our transformed image
---> 15 model(t_img)
16 # 7. Detach our copy function from the layer
17 h.remove()
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/usr/local/lib/python3.6/dist-packages/torchvision/models/resnet.py in forward(self, x)
218
219 def forward(self, x):
--> 220 return self._forward_impl(x)
221
222
/usr/local/lib/python3.6/dist-packages/torchvision/models/resnet.py in _forward_impl(self, x)
211 x = self.layer4(x)
212
--> 213 x = self.avgpool(x)
214 x = torch.flatten(x, 1)
215 x = self.fc(x)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
--> 552 hook_result = hook(self, input, result)
553 if hook_result is not None:
554 result = hook_result
<ipython-input-40-a45affe9d8f7> in copy_data(m, i, o)
9 # 4. Define a function that will copy the output of a layer
10 def copy_data(m, i, o):
---> 11 my_embedding.copy_(o.data)
12 # 5. Attach that function to our selected layer
13 h = layer.register_forward_hook(copy_data)
RuntimeError: output with shape [512] doesn't match the broadcast shape [1, 512, 1, 512]
What I am actually trying to do is extract feature vectors from images, which I then want to use to build a recommendation system. Do let me know if there is any other alternative available.
Thanks in advance!
You need to reshape the output data after avgpool:
def copy_data(m, i, o):
    my_embedding.copy_(o.data.reshape(o.data.size(1)))
Alternatively, you may replace the hook function with the following (kept nested inside get_vector so that nonlocal binds correctly), just so you don't have to deal with adjusting the output shape:
# step 3 and 4
my_embedding = None
def my_hook(module_, input_, output_):
    nonlocal my_embedding
    my_embedding = output_
then simply call the following
# step 5
h = layer.register_forward_hook(my_hook)
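Note that with this alternative hook the stored tensor keeps avgpool's raw output shape, [1, 512, 1, 1] for ResNet-18, so a usage sketch (an assumption, not part of the original answer) would flatten it before comparing vectors:

pic_one_vector = get_vector(pic_one).flatten()  # shape [512]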

xarray: rolling mean of dask array conflicting sizes for data and coordinate in rolling operation

I am trying to apply a rolling mean to a dask array within xarray. My issue may lie in the rechunking before the rolling mean. I am getting a ValueError about conflicting sizes between data and coordinates. However, the error arises within the rolling operation; I don't think there are conflicts between the data and coords of the array before it goes into the rolling operation.
Apologies for not creating test data, but my project data is quick to play with:
import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models'\
'/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods',
chunks={'L': 1, 'S': 1})
da = remote_data.isel(P=0,L=0,M=0,X=0,Y=0)
da_day_clim = da.groupby('S.dayofyear').mean('S')
print(da_day_clim)
#<xarray.DataArray 'zg' (dayofyear: 366)>
#dask.array<shape=(366,), dtype=float32, chunksize=(1,)>
#Coordinates:
# L timedelta64[ns] 12:00:00
# Y float32 -90.0
# M float32 1.0
# X float32 0.0
# P int32 500
# * dayofyear (dayofyear) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
# Do a 31-day rolling mean
# da_day_clim.rolling(dayofyear=31, center=True).mean()
# This brings up:
#ValueError: The overlapping depth 30 is larger than your
#smallest chunk size 1. Rechunk your array
#with a larger chunk size or a chunk size that
#more evenly divides the shape of your array.
# Read http://xarray.pydata.org/en/stable/dask.html
# and found http://xarray.pydata.org/en/stable/generated/xarray.Dataset.chunk.html#xarray.Dataset.chunk
# I could make a little PR to add the .chunk() suggestion into the ValueError message. Thoughts?
# Rechunk. Played around with a few values but decided on
# the len of dayofyear
da_day_clim2 = da_day_clim.chunk({'dayofyear': 366})
print(da_day_clim2)
#<xarray.DataArray 'zg' (dayofyear: 366)>
#dask.array<shape=(366,), dtype=float32, chunksize=(366,)>
#Coordinates:
# L timedelta64[ns] 12:00:00
# Y float32 -90.0
# M float32 1.0
# X float32 0.0
# P int32 500
# * dayofyear (dayofyear) int64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 ...
# Rolling mean on this
da_day_clim_smooth = da_day_clim2.rolling(dayofyear=31, center=True).mean()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-57-6acf382cdd3d> in <module>()
4 da_day_clim = da.groupby('S.dayofyear').mean('S')
5 da_day_clim2 = da_day_clim.chunk({'dayofyear': 366})
----> 6 da_day_clim_smooth = da_day_clim2.rolling(dayofyear=31, center=True).mean()
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/rolling.py in wrapped_func(self, **kwargs)
307 if self.center:
308 values = values[valid]
--> 309 result = DataArray(values, self.obj.coords)
310
311 return result
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in __init__(self, data, coords, dims, name, attrs, encoding, fastpath)
224
225 data = as_compatible_data(data)
--> 226 coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
227 variable = Variable(dims, data, attrs, encoding, fastpath=True)
228
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in _infer_coords_and_dims(shape, coords, dims)
79 raise ValueError('conflicting sizes for dimension %r: '
80 'length %s on the data but length %s on '
---> 81 'coordinate %r' % (d, sizes[d], s, k))
82
83 if k in sizes and v.shape != (sizes[k],):
ValueError: conflicting sizes for dimension 'dayofyear': length 351 on the data but length 366 on coordinate 'dayofyear'
The length 351 relates to the window size: 366 - 351 = 15, which is half the 31-day window (floor(31/2)).
This turned out to be a bug in xarray and was fixed in https://github.com/pydata/xarray/pull/2122.
The fix will be in xarray 0.10.4, which is slated for imminent release.
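Once on a fixed version, the original code should work as written; a minimal sketch (assuming xarray >= 0.10.4):

da_day_clim2 = da_day_clim.chunk({'dayofyear': 366})
da_day_clim_smooth = da_day_clim2.rolling(dayofyear=31, center=True).mean()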
