Isolating the topk routine of dask - dask

I am trying to isolate the topk routine from dask.
Somehow it fails when run in isolation.
Apparently a NumPy array, rather than a dask array, is passed as the x argument during the recursion.
The original source code for topk is at: https://github.com/dask/dask/blob/master/dask/array/routines.py
Test program:
import numpy as np
import dask.array as da
from dask.base import tokenize
from operator import getitem
import dask.sharedict as sharedict
from dask.array.core import Array
def _topk_chunk(k, x):
    """Return the ``k`` largest values of one NumPy chunk, largest first.

    If the chunk holds fewer than ``k`` values, all of them are returned.
    """
    return np.sort(x)[-1:-k - 1:-1]


def topk(k, x):
    """Return the ``k`` largest elements of a one-dimensional dask array.

    Every chunk is first reduced to its own top ``k`` values; the per-chunk
    results are then concatenated, sorted, and the ``k`` largest are kept.

    Parameters
    ----------
    k : int
        Number of elements to keep.
    x : dask Array
        One-dimensional input array.

    Returns
    -------
    dask Array
        Lazy array with a single chunk of length ``k``, sorted largest first.

    Raises
    ------
    ValueError
        If ``x`` is not one-dimensional.

    Notes
    -----
    ``x`` must provide ``__dask_keys__``; the traceback that accompanies this
    snippet shows an installed dask whose ``Array`` lacks that method.
    """
    if x.ndim != 1:
        raise ValueError("Topk only works on arrays of one dimension")

    token = tokenize(k, x)
    name = 'chunk.topk-' + token
    # The per-chunk task must be a plain NumPy function: when the graph runs,
    # each key is replaced by a NumPy block, so scheduling ``topk`` itself
    # here would re-enter this function with an ndarray.
    dsk = {(name, i): (_topk_chunk, k, key)
           for i, key in enumerate(x.__dask_keys__())}
    name2 = 'topk-' + token
    # ``list(dsk)`` is the list of per-chunk keys created above.
    dsk[(name2, 0)] = (getitem, (np.sort, (np.concatenate, list(dsk))),
                       slice(-1, -k - 1, -1))
    chunks = ((k,),)

    return Array(sharedict.merge((name2, dsk), x.dask), name2, chunks,
                 dtype=x.dtype)
def main():
    """Print the top-2 values from dask's ``Array.topk`` and from ``topk``."""
    x = np.arange(12) * 8
    y = da.from_array(x, 7)
    # Both lines should print the same values if the standalone copy works.
    print(y.topk(2).compute())
    print(topk(2, y).compute())


# Only run the comparison when executed as a script, not on import.
if __name__ == "__main__":
    main()
Error:
File "test_dask_argtopk.py", line 40, in <module>
main()
File "test_dask_argtopk.py", line 38, in main
print(topk(2, y).compute())
File "test_dask_argtopk.py", line 27, in topk
for i, key in enumerate(x.__dask_keys__())}
AttributeError: 'Array' object has no attribute '__dask_keys__'

Related

cvxpy infeasible error with different number of samples

Does anyone know why cvxpy throws an infeasible error when I change the number of samples in constrained OLS? I'm attaching code to re-create my issue. n=100000 is fine, but n=400000 fails.
import cvxpy as cp
import numpy as np
class constrained_ols:
    """Least-squares regression with non-negative weights that sum to about 1.

    The fitted model is ``y ~ x @ coef_ + intercept_`` with ``coef_ >= 0`` and
    ``0.98 <= sum(coef_) <= 1.02``.
    """

    def __init__(self, xdim=6):
        """Store the number of feature columns the model expects."""
        self.xdim = xdim

    def fit(self, x, y):
        """Solve the constrained problem and store ``coef_`` and ``intercept_``.

        The solver status is printed. Both attributes are whatever cvxpy
        reports in ``Variable.value`` after ``solve()``; the accompanying
        question reports them as None when the status is infeasible.
        """
        import cvxpy as cp

        w = cp.Variable(self.xdim)
        i = cp.Variable()
        # Matrix product of the design matrix with the weight vector.
        quad_prog = cp.Minimize(cp.sum_squares(y - (x @ w + i)))
        cons = [w >= 0, cp.sum(w) <= 1.02, cp.sum(w) >= .98]
        problem = cp.Problem(quad_prog, cons)
        problem.solve()
        print(problem.status)
        self.coef_ = w.value
        self.intercept_ = i.value

    def predict(self, x):
        """Return predictions ``x @ coef_ + intercept_`` for the rows of ``x``."""
        # Include the intercept so predictions match the model that fit() solved.
        return x @ self.coef_ + self.intercept_
# Fit the constrained model on synthetic standard-normal data:
# n samples, 10 features, and a target drawn independently of the features.
n = 100000
x = np.random.normal(0,1,(n,10))
y = np.random.normal(0,1,n)
model=constrained_ols(xdim=10)
model.fit(x,y)
# Notebook-style expression: shows the fitted weights and intercept.
model.coef_,model.intercept_
I was expecting to get a vector of 10 coefficients and an intercept but instead I got none values.

Map Dask bincount over 2d array columns

I am trying to use bincount over a 2D array. Specifically I have this code:
import numpy as np
import dask.array as da
def dask_bincount(weights, x):
    """Return the weighted bin counts of ``x`` as computed by ``da.bincount``.

    Note the argument order: ``weights`` comes first so the function can be
    handed to ``apply_along_axis``, which passes the 1-D slice first.
    """
    # The result must be returned; without it every call yields None.
    return da.bincount(x, weights)
# 1000 bin indices drawn from 0..1024 and a (1000, 2) array of weights.
idx = da.random.random_integers(0, 1024, 1000)
weight = da.random.random((1000, 2))
# Apply dask_bincount along axis 1 of weight, passing idx as the extra argument.
bin_count = da.apply_along_axis(dask_bincount, 1, weight, idx)
The idea is that the bincount can be made with the same idx array on each one of the weight columns. That would return an array of size (np.amax(x) + 1, 2) if I am correct.
However when doing this I get this error message:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-17-5b8eed89ad32> in <module>
----> 1 bin_count = da.apply_along_axis(dask_bincount, 1, weight, idx)
~/.local/lib/python3.9/site-packages/dask/array/routines.py in apply_along_axis(func1d, axis, arr, dtype, shape, *args, **kwargs)
454 if shape is None or dtype is None:
455 test_data = np.ones((1,), dtype=arr.dtype)
--> 456 test_result = np.array(func1d(test_data, *args, **kwargs))
457 if shape is None:
458 shape = test_result.shape
<ipython-input-14-34fd0eb9b775> in dask_bincount(weights, x)
1 def dask_bincount(weights, x):
----> 2 da.bincount(x, weights)
~/.local/lib/python3.9/site-packages/dask/array/routines.py in bincount(x, weights, minlength, split_every)
670 raise ValueError("Input array must be one dimensional. Try using x.ravel()")
671 if weights is not None:
--> 672 if weights.chunks != x.chunks:
673 raise ValueError("Chunks of input array x and weights must match.")
674
AttributeError: 'numpy.ndarray' object has no attribute 'chunks'
I thought that when dask arrays are created the library automatically assigns chunks to them, so the error message does not tell me much. How can I fix this?
I wrote a script that does the same thing in NumPy using map.
# NumPy reference version: one weighted bincount per weight column.
idx_np = np.random.randint(0, 1024, 1000)
weight_np = np.random.random((1000,2))
# f(y) bins idx_np using column y of weight_np as the weights.
f = lambda y: np.bincount(idx_np, weight_np[:,y])
result = map(f, [i for i in range(2)])
# Stack the two per-column results into one array with two rows.
np.array(list(result))
array([[0.9885341 , 0.9977873 , 0.24937023, ..., 0.31024526, 1.40754883,
0.87609759],
[1.77406303, 0.84787723, 0.14591474, ..., 0.54584068, 0.38357015,
0.85202672]])
I would like to do the same, but with dask.
There are multiple problems at play.
Weights should be (2, 1000)
You discover this by trying to write the same function in numpy using apply_along_axis.
idx_np = np.random.random_integers(0, 1024, 1000)
weight_np = np.random.random((2, 1000)) # <- transposed: one row per weight vector
# This gives the same result as the code you provided:
# each row of weight_np is passed as `weight`, and idx_np is passed as `idx`.
np.apply_along_axis(lambda weight, idx: np.bincount(idx, weight), 1, weight_np, idx_np)
da.apply_along_axis applies the function to numpy arrays
You're getting the error
AttributeError: 'numpy.ndarray' object has no attribute 'chunks'
This suggests that what makes it into the da.bincount method is actually a numpy array. The fact is that da.apply_along_axis actually takes each row of weight and sends it to the function as a numpy array.
Your function should therefore actually be a numpy function:
def bincount(weights, x):
    """Return ``np.bincount`` of ``x`` with each entry weighted by ``weights``.

    ``weights`` comes first so the function can be used with
    ``apply_along_axis``, which supplies the 1-D slice as the first argument.
    """
    return np.bincount(x, weights=weights)
However, if you try this, you will still get the same error. I believe that happens for an entirely different reason, though:
Dask doesn't know what the output shape will be and tries to infer it
In the code and/or documentation for apply_along_axis, we can see that Dask tries to infer the output shape and dtype by passing in the array [1] (related question). This is a problem, since bincount cannot just accept such argument.
What we can do instead is provide shape and dtype to the method so that Dask doesn't have to infer it.
The problem here is that bincount's output shape depends on the maximum value of the input array. Unless you know it beforehand, you will sadly need to compute it. The whole operation therefore won't be fully lazy.
This is the full answer:
import numpy as np
import dask.array as da
idx = da.random.random_integers(0, 1024, 1000)
weight = da.random.random((2, 1000))


def bincount(weights, x):
    """Return ``np.bincount`` of ``x`` with each entry weighted by ``weights``."""
    return np.bincount(x, weights)


# The output length depends on the data, so the maximum index has to be
# computed eagerly before the lazy call can be described.
m = idx.max().compute()
# np.bincount returns one bin per value in 0..max, i.e. max + 1 bins, so the
# per-row output shape handed to dask must be (m + 1,), not (m,).
da.apply_along_axis(bincount, 1, weight, idx, shape=(m + 1,), dtype=weight.dtype)
Appendix: randint vs random_integers
Be careful, because these are subtly different
randint takes integers from low (inclusive) to high (exclusive)
random_integers takes integers from low (inclusive) to high (inclusive)
Thus you have to call randint with high + 1 to get the same value.

Stack overflow on dask __array__

I have a rather simple program using dask:
import dask.array as darray
import numpy as np
# Build a dask array from a 3x3 matrix and keep only its first column.
X = np.array([[1.,2.,3.],
[4.,5.,6.],
[7.,8.,9.]])
arr = darray.from_array(X)
arr = arr[:,0]
# a and b are results of dask reductions, not concrete numbers.
a = darray.min(arr)
b = darray.max(arr)
# The dask reductions are passed directly as the linspace endpoints.
quantiles = darray.linspace(a, b, 4)
# Converting to a NumPy array triggers the computation (and the reported error).
print(np.array(quantiles))
Running this program results in an error like this:
Traceback (most recent call last):
File "discretization.py", line 12, in <module>
print(np.array(quantiles))
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/array/core.py", line 1341, in __array__
x = np.array(x)
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/array/core.py", line 1341, in __array__
x = np.array(x)
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/array/core.py", line 1341, in __array__
x = np.array(x)
[Previous line repeated 325 more times]
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/array/core.py", line 1337, in __array__
x = self.compute()
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/base.py", line 166, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/base.py", line 434, in compute
dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/base.py", line 220, in collections_to_dsk
[opt(dsk, keys, **kwargs) for opt, (dsk, keys) in groups.items()],
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/base.py", line 220, in <listcomp>
[opt(dsk, keys, **kwargs) for opt, (dsk, keys) in groups.items()],
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/array/optimization.py", line 42, in optimize
dsk = optimize_blockwise(dsk, keys=keys)
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/blockwise.py", line 547, in optimize_blockwise
out = _optimize_blockwise(graph, keys=keys)
File "/Users/zhujun/job/adf/local_training/venv/lib/python3.7/site-packages/dask/blockwise.py", line 572, in _optimize_blockwise
if isinstance(layers[layer], Blockwise):
File "/anaconda3/lib/python3.7/abc.py", line 139, in __instancecheck__
return _abc_instancecheck(cls, instance)
RecursionError: maximum recursion depth exceeded in comparison
Python is version 3.7.1 and dask is version 2.15.0.
What is wrong with this program?
Thanks in advance.
linspace does not (yet) accept lazy inputs from other dask things, you need real numbers. Use compute to materialize these numbers as follows:
# The question only imports dask.array (as darray); dask.compute needs the
# top-level package itself.
import dask

# Materialize both endpoints with a single compute call so that linspace
# receives concrete numbers instead of lazy dask arrays.
a, b = dask.compute(darray.min(arr), darray.max(arr))
quantiles = darray.linspace(a, b, 4)
With either one of these package combinations:
dask==2.15.0
numpy<1.16.0
toolz==0.9.0
dask==2.16.0
numpy<1.17.0
toolz==0.9.0
The following program can be executed without an issue:
import dask.array as darray
import numpy as np
# Same program as in the question, reported to run with the package
# combinations listed above.
X = np.array([[1.,2.,3.],
[4.,5.,6.],
[7.,8.,9.]])
arr = darray.from_array(X)
arr = arr[:,0]
# a and b are results of dask reductions, not concrete numbers.
a = darray.min(arr)
b = darray.max(arr)
q0 = darray.linspace(a, b, 4)
print(np.array(q0))
The key in the above package lists is numpy. Newer versions of numpy may cause an error.
As @mdurant suggested, the implementation of linspace does not yet accept lazy inputs; hence the fact that these combinations of packages work might actually be a coincidence.
I will leave this question open until I fully understand what is happening here.

Size of weights extracted from a NN model becomes higher than the model

I tried extracting the weights from a .pb TensorFlow model and stored them in a text file. The text file is larger than the model itself. Why is this happening?
Thanks in advance.
Code to extract weights :
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.framework import tensor_util
import operator
from functools import reduce
import matplotlib.pyplot as plt
import zlib
import pickle
# Path of the frozen graph whose constants are dumped below.
PB_PATH = 'quantized_graph_resnet.pb'

with tf.Session() as sess:
    # Parse the serialized GraphDef from disk.
    with gfile.FastGFile(PB_PATH, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    sess.graph.as_default()
    tf.import_graph_def(graph_def, name='')
    # Keep every node, then narrow down to the constant ('Const') nodes.
    graph_nodes = list(graph_def.node)
    wts = [node for node in graph_nodes if node.op == 'Const']
def check(l):
    """Return True if any element of ``l`` is itself a list.

    Used to decide whether a nested list still needs another flattening pass.
    """
    return any(isinstance(item, list) for item in l)
# Maps each written value to the number of times it occurs.
weightsFreq = {}
# The context manager guarantees the output file is flushed and closed.
with open('weights.txt', 'w') as f:
    for n in wts:
        print("Name of the node - %s" % n.name)
        l = tensor_util.MakeNdarray(n.attr['value'].tensor).tolist()
        if isinstance(l, float):
            # Scalar float constants are skipped, as in the original script.
            continue
        if isinstance(l, int):
            values = [l]
        else:
            # Flatten one nesting level per pass until no sub-lists remain.
            while check(l):
                l = reduce(operator.concat, l)
            values = l
        for item in values:
            # '%d' writes the integer part of each value followed by a space.
            f.write('%d ' % item)
            weightsFreq[item] = weightsFreq.get(item, 0) + 1
Text files are a very inefficient way to store large quantities of numbers: text uses one byte for each digit of each number (plus a separator), whereas a binary file uses a fixed-size representation (4 bytes per number for a single-precision floating-point value).
That's why the text file is much bigger than a binary one.

Custom feature extraction class in scikit-learn

I am a beginner with scikit-learn. I am working on a classification problem for which I have to build a custom feature extraction class or method to compute the features for the training data.
I have written my custom feature extraction class as explained in this link. When I run my code, it shows this error:
Traceback (most recent call last):
File "test.py", line 248, in <module>
pred = pipe.predict(X_test)
File "/usr/local/lib/python2.7/dist-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.py", line 327, in predict
return self.steps[-1][-1].predict(Xt)
File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 336, in predict
scores = self.decision_function(X)
File "/usr/local/lib/python2.7/dist-packages/sklearn/linear_model/base.py", line 317, in decision_function
% (X.shape[1], n_features))
ValueError: X has 174 features per sample; expecting 443
Below is my code snippet; I have also included my full code. Please tell me what I am doing wrong and why, along with any suggestions, so that my code runs without errors.
Code snippet :-
Here "y" is the list of category labels and "corpus" is the list of all documents (data), where each document is represented as a string. "tfidf" and "lda" are the two functions from which I generate my feature vector.
# First element of every entry: the label, one per sample.
y = [d[0] for d in doc_info_with_label] #length is no:ofsamples
# Second element of every entry: the document text.
corpus = [d[1] for d in doc_info_with_label]
class feature_extractor(TransformerMixin):
    """Transformer that concatenates the outputs of several featurizers.

    Each featurizer is a callable that takes the whole input collection and
    returns one row of features per sample. ``fit`` learns nothing, so every
    ``transform`` call re-runs the featurizers on exactly the data it is given.

    NOTE(review): the output width is whatever the featurizers return for that
    particular call, so it may differ between the data passed at fit time and
    at predict time.
    """

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Run every featurizer on ``X`` and join the results column-wise."""
        blocks = [np.array(featurize(X)) for featurize in self.featurizers]
        combined = blocks[0]
        for extra in blocks[1:]:
            combined = np.concatenate((combined, extra), axis=1)
        return combined
# Combine the tf-idf and LDA featurizers into one transformer.
my_featurizer = feature_extractor(tfidf,lda)
# NOTE(review): X is computed from the full corpus but is not used below;
# the train/test split is made on the raw documents.
X = my_featurizer.transform(corpus)
X_train , X_test , y_train , y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)
# The pipeline featurizes the training and test documents separately.
pipe = make_pipeline(my_featurizer,svm.LinearSVC())
pipe.fit(X_train,y_train)
pred = pipe.predict(X_test)
# Report the expected labels, the predictions, the score and the confusion matrix.
print "Expected output\n"
print y_test
print "\n"
print "Output\n"
print pred
print "\n"
score = pipe.score(X_test,y_test)
print score
print "\n"
print metrics.confusion_matrix(pred,y_test)
full code :-
# -*- coding: utf-8 -*-
#! /usr/bin/env python3
from gensim import corpora, models
import gensim
from operator import itemgetter
import numpy as np
import sys
import os
import re
import codecs
import io
import math
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn import svm
from sklearn import metrics
from sklearn.pipeline import make_pipeline , Pipeline
# Python 2 only: reload sys to regain access to setdefaultencoding, then make
# UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf8')
# Configure NumPy's print threshold for array output.
np.set_printoptions(threshold='nan')
suffixes = {
1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}
# Class labels and the numeric id associated with each of them.
categories=['A','C','D','E']
mappings={}
mappings['A']=1
mappings['C']=3
mappings['D']=4
mappings['E']=5
path='/home/priyansh/Downloads/ltrc/1055/'
# Root of the training data; one sub-directory per class.
train_data_path='/home/priyansh/Downloads/ltrc/extractor/clustering/four_class_devanagari/'
path1=train_data_path+"A/"
path2=train_data_path+"C/"
path3=train_data_path+"D/"
path4=train_data_path+"E/"
# Both lists are filled in by store_data().
documents=[] #contains all doc filenames along with class labels
doc_info_with_label=[] #two tuple storage of doc info along with their respective labels
def hi_stem(word):
    """Strip the longest matching suffix from ``word`` and return the stem.

    Suffix lengths are tried from 5 down to 1. A suffix of a given length is
    only removed when the word is more than one character longer than it.
    The word is returned unchanged when nothing matches.
    """
    for length in (5, 4, 3, 2, 1):
        if len(word) <= length + 1:
            continue
        if any(word.endswith(suffix) for suffix in suffixes[length]):
            return word[:-length]
    return word
def store_data(dir_path_list):
    """Read every file of each class directory into the module-level lists.

    For every file, ``"<filename>+<class id>"`` is recorded in ``documents``
    (once) and a ``(class_name, text)`` tuple is appended to
    ``doc_info_with_label``. ``text`` is the file's whitespace-separated words,
    each followed by a single space.

    Parameters
    ----------
    dir_path_list : iterable of str
        Directories whose final path component is the class name, which must
        be a key of ``mappings``.
    """
    for dir_path in dir_path_list:
        # Take the last path component (with or without a trailing slash)
        # instead of relying on a fixed directory depth.
        class_name = os.path.basename(os.path.normpath(dir_path))
        for filename in os.listdir(dir_path):
            # Compare the tagged name that is actually stored, so the
            # duplicate check can match earlier entries.
            tagged_name = filename + "+" + str(mappings[class_name])
            if tagged_name not in documents:
                documents.append(tagged_name)
            infilename = os.path.join(dir_path, filename)
            with codecs.open(infilename, 'r', 'utf-8') as fl:
                text = "".join(
                    word + " " for line in fl for word in line.split())
            doc_info_with_label.append((class_name, text))
# Load the documents of all four class directories.
path_list=[]
path_list.append(path1)
path_list.append(path2)
path_list.append(path3)
path_list.append(path4)
store_data(path_list)
# Split the (label, text) tuples into parallel lists.
y = [d[0] for d in doc_info_with_label] #length is no:ofsamples
corpus = [d[1] for d in doc_info_with_label]
class feature_extractor(TransformerMixin):
    """Transformer that concatenates the outputs of several featurizers.

    Each featurizer is a callable that takes the whole input collection and
    returns one row of features per sample. ``fit`` learns nothing, so every
    ``transform`` call re-runs the featurizers on exactly the data it is given.

    NOTE(review): the output width is whatever the featurizers return for that
    particular call, so it may differ between the data passed at fit time and
    at predict time.
    """

    def __init__(self, *featurizers):
        self.featurizers = featurizers

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Run every featurizer on ``X`` and join the results column-wise."""
        blocks = [np.array(featurize(X)) for featurize in self.featurizers]
        combined = blocks[0]
        for extra in blocks[1:]:
            combined = np.concatenate((combined, extra), axis=1)
        return combined
def tfidf_score(word, document_no, corpus_data, stopwords=None):
    """Return the tf-idf score of ``word`` in one document of the corpus.

    ``tf`` is the number of occurrences of ``word`` divided by the number of
    non-stop-word tokens in the document. ``idf`` is
    ``log(N / (1 + number of documents containing word))``, where ``N`` is the
    number of documents.

    Parameters
    ----------
    word : str
        Term to score.
    document_no : int
        Index into ``corpus_data`` of the document to score.
    corpus_data : sequence of str
        All documents, as whitespace-separated text.
    stopwords : iterable of str, optional
        Tokens to ignore when counting. When omitted, they are read from the
        stop-word file used elsewhere in this script.

    Returns
    -------
    float
        ``tf * idf``, or ``0.0`` when the document has no countable tokens.
    """
    if stopwords is None:
        stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
        stop_words_filename = 'stopwords.txt'
        with codecs.open(stopwords_path + stop_words_filename,
                         'r', 'utf-8') as fl:
            stopwords = [token for line in fl for token in line.split()]
    # A set makes the per-token membership test constant time.
    stopwords = set(stopwords)

    total = 0
    wordcount = 0
    for token in corpus_data[document_no].split():
        if token not in stopwords:
            total += 1
            if token == word:
                wordcount += 1
    if total == 0:
        # Empty or stop-word-only document: avoid dividing by zero.
        return 0.0
    tf = float(wordcount) / total

    # Count documents containing the requested term. The previous version
    # compared against a loop variable that the stop-word loading had
    # overwritten, so it counted the wrong word.
    docs_with_word = sum(1 for doc in corpus_data if word in doc.split())
    # float() forces true division on Python 2; integer division could reach
    # 0 and make math.log fail.
    idf = math.log(float(len(corpus_data)) / (1 + docs_with_word))
    return tf * idf
def tfidf(corpus_data):
    """Build one tf-idf feature row per document of ``corpus_data``.

    Every non-stop word found in the corpus gets its own column. A word is
    scored only for the first document in which it appears; every other entry
    stays 0. The number of columns therefore depends on the corpus passed in.
    """
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = []  # contain all stopwords
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            stopwords.extend(line.split())

    # Index of the first document in which each word occurs.
    first_doc_of_word = {}
    for doc_index, data in enumerate(corpus_data):
        for token in data.split():
            if token not in first_doc_of_word:
                first_doc_of_word[token] = doc_index

    # Drop stop words and give every remaining word a column number.
    kept_first_doc = {}
    column_of_word = {}
    for token in first_doc_of_word:
        if token in stopwords:
            continue
        kept_first_doc[token] = first_doc_of_word[token]
        column_of_word[token] = len(column_of_word)

    n_columns = len(column_of_word)
    rows = []
    for doc_index in range(len(corpus_data)):
        row = [0] * n_columns
        for token, owner in kept_first_doc.items():
            if owner == doc_index:
                row[column_of_word[token]] = tfidf_score(
                    token, doc_index, corpus_data)
        rows.append(row)
    return rows
def lda(corpus_data):
    """Return one 5-element topic feature row per document.

    Documents are split on whitespace, stop words are removed and the
    remaining tokens are stemmed with ``hi_stem``. A 5-topic LDA model is
    trained on the result. Each row holds the probability of the document's
    most probable topic at that topic's index and 0 everywhere else.
    """
    stopwords_path = '/home/priyansh/Downloads/ltrc/extractor/'
    stop_words_filename = 'stopwords.txt'
    stopwords = []  # contain all stopwords
    with codecs.open(stopwords_path + stop_words_filename, 'r', 'utf-8') as fl:
        for line in fl:
            stopwords.extend(line.split())

    # Tokenize, drop stop words, and stem what remains.
    texts = [
        [hi_stem(token) for token in data.split() if token not in stopwords]
        for data in corpus_data
    ]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    num_topics = 5
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus, num_topics=num_topics, id2word=dictionary, passes=10)

    # Infer the topic distribution of every document, in corpus order.
    doc_topics = [ldamodel[doc_vector] for doc_vector in corpus]

    feature_vect = []
    for topics in doc_topics:
        # Most probable topic first.
        ranked = sorted(topics, key=itemgetter(1), reverse=True)
        best = ranked[0]
        prob_vect = [0] * num_topics
        prob_vect[best[0]] = best[1]
        feature_vect.append(prob_vect)
    return feature_vect
# Combine the tf-idf and LDA featurizers into one transformer.
my_featurizer = feature_extractor(tfidf,lda)
# NOTE(review): X is computed from the full corpus but is not used below;
# the train/test split is made on the raw documents.
X = my_featurizer.transform(corpus)
X_train , X_test , y_train , y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)
# The pipeline featurizes the training and test documents separately.
pipe = make_pipeline(my_featurizer,svm.LinearSVC())
pipe.fit(X_train,y_train)
pred = pipe.predict(X_test)
# Report the expected labels, the predictions, the score and the confusion matrix.
print "Expected output\n"
print y_test
print "\n"
print "Output\n"
print pred
print "\n"
score = pipe.score(X_test,y_test)
print score
print "\n"
print metrics.confusion_matrix(pred,y_test)

Resources