Convert a decision tree to a table

I'm looking for a way to convert a decision tree trained with scikit-learn into a decision table.
I would like to know how to parse the decision tree structure to find the decisions made at each step.
Then I would like ideas on how to structure this table.
Do you know a way to do this, or have an idea for how to approach it?

Building on the other answer here, the following traverses the tree in the same way but produces a pandas DataFrame as output.
import pandas as pd
from sklearn.tree import _tree


def tree_to_df(reg_tree, feature_names):
    tree_ = reg_tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node, row, ret):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            # Add rule to row and search left branch
            row[-1].append(name + " <= " + str(threshold))
            recurse(tree_.children_left[node], row, ret)
            # Add rule to row and search right branch
            row[-1].append(name + " > " + str(threshold))
            recurse(tree_.children_right[node], row, ret)
        else:
            # Add output rule and start a new row
            label = tree_.value[node]
            ret.append("return " + str(label[0][0]))
            row.append([])

    # Initialize
    rules = [[]]
    vals = []

    # Call recursive function with initial values
    recurse(0, rules, vals)

    # Convert to table and output
    df = pd.DataFrame(rules).dropna(how='all')
    df['Return'] = pd.Series(vals)
    return df
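For example, a minimal usage sketch (my addition, not part of the original answer): fit a small regression tree on the iris data and print the resulting table, one condition per column plus the final Return column.

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor

iris = load_iris()
reg = DecisionTreeRegressor(max_depth=2, random_state=0).fit(iris.data, iris.target)
print(tree_to_df(reg, iris.feature_names))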

Here is sample code that converts a decision tree into equivalent Python code; you can easily adapt it to produce a table instead. All you need to do is create a global table whose size is the number of leaves times the number of features (or feature categories), and fill it in recursively:
from sklearn.tree import _tree


def tree_to_code(tree, feature_names, classes_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree(" + ", ".join(feature_names) + "):")

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print(indent + "if " + name + " <= " + str(threshold) + ":")
            recurse(tree_.children_left[node], depth + 1)
            print(indent + "else:  # if " + name + " > " + str(threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            impurity = tree.tree_.impurity[node]
            # cast_value_to_dico is the author's helper (not shown in the answer)
            dico, label = cast_value_to_dico(tree_.value[node], classes_names)
            print(indent + "# impurity=" + str(impurity) + " count_max=" + str(dico[label]))
            print(indent + "return " + str(label))

    recurse(0, 1)
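The helper cast_value_to_dico is not included in the answer. A plausible stand-in (my sketch, not the author's code), assuming it pairs each class name with the leaf's sample counts and picks the majority class:

def cast_value_to_dico(value, classes_names):
    # value has shape (1, n_classes): one sample count per class at this leaf
    dico = dict(zip(classes_names, value[0]))
    # the predicted label is the class with the highest count
    label = max(dico, key=dico.get)
    return dico, label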

Here is a code snippet that uses sklearn's built-in export_text to render the tree as text:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
iris = load_iris()
X = iris['data']
y = iris['target']
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
decision_tree = decision_tree.fit(X, y)
r = export_text(decision_tree, feature_names=iris['feature_names'])
print(r)
listt = [r]
print(listt)
#########OUTPUT###########################
|--- petal width (cm) <= 0.80
| |--- class: 0
|--- petal width (cm) > 0.80
| |--- petal width (cm) <= 1.75
| | |--- class: 1
| |--- petal width (cm) > 1.75
| | |--- class: 2
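If you need a table rather than text, one rough way (my sketch, not from the original answer) is to walk the export_text lines with a stack of conditions and emit one row per leaf:

import pandas as pd

rows, stack = [], []
for line in r.splitlines():
    depth = line.count("|   ")   # each "|   " prefix marks one tree level
    stack = stack[:depth]        # drop conditions deeper than this line
    content = line.split("|--- ")[1]
    if content.startswith("class:"):
        rows.append({"rule": " and ".join(stack), "class": content.split(": ")[1]})
    else:
        stack.append(content)

print(pd.DataFrame(rows))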

Related

How do I reorder plot by multiple variables?

I am trying to reorder the plot first by am and then by mpg. Attached is the result in R using ggplot2.
I am trying to attain the same result using siuba and plotnine. Below is my code so far.
(
    mtcars
    >> arrange(_.am, _.mpg)
    >> mutate(model=fct_reorder(_.model, _.am))
    >> ggplot(aes(y="mpg", x="model", fill='factor(am)'))
    + geom_col()
    + labs(fill="Automatic/Manual Transmission")
    + coord_flip()
)
If I were to replicate your plot in R, I would use dplyr::arrange + forcats::fct_inorder. As siuba does not offer an equivalent to fct_inorder, you can achieve your desired result by first arranging in your desired order, adding an index column of row numbers, and reordering by this index column:
from plotnine import *
from plotnine.data import mtcars  # assuming the mtcars dataset bundled with plotnine
from siuba import _, arrange, mutate
from siuba.dply.forcats import fct_reorder

(
    mtcars
    >> arrange(-_.am, _.mpg)
    >> mutate(model=fct_reorder(_.model, _.reset_index().index))
    >> ggplot(aes(y="mpg", x="model", fill='factor(am)'))
    + geom_col()
    + labs(fill="Automatic/Manual Transmission")
    + coord_flip()
)

Write a code to calculate Backward Propagation, Deep Learning course by Andrew NG

So I've taken the Deep Learning AI course by Andrew NG on coursera.
I am currently working on the last assignment of week 2.
I reached the part where I have to write the forward and backward propagation function.
I managed to write the fwd_propagate function which is fairly easy.
This is the code below:
def fwd_propagate(w, b, X, y):
    m = X.shape[1]
    A = sigmoid(np.dot(w.T, X) + b)
    J = (-1/m) * np.sum(y * np.log(A) + (1-y) * np.log(1-A))
    return J
Now I have to write the bwd_propagation function but I don't know where and how to start.
Can someone help and explain what I should write?
This is everything I wrote so far with the tests.
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
from PIL import Image
from scipy import ndimage
%matplotlib inline

def load_dataset():
    train_dataset = h5py.File('C:/Users/Univ/Desktop/ML Intern/Logistic-Regression-with-a-Neural-Network-mindset-master/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:])  # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:])  # your train set labels
    test_dataset = h5py.File('C:/Users/Univ/Desktop/ML Intern/Logistic-Regression-with-a-Neural-Network-mindset-master/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:])  # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:])  # your test set labels
    classes = np.array(test_dataset["list_classes"][:])  # the list of classes
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

train_set_x_orig, train_set_y, test_set_x_orig, test_set_y, classes = load_dataset()

index = 25
plt.imshow(train_set_x_orig[index])
print("y = " + str(train_set_y[:, index]) + ", it's a '" + classes[np.squeeze(train_set_y[:, index])].decode("utf-8") + "' picture.")
print(str(train_set_y.shape[1]) + " This is the amount of elements in the training set")
print(str(test_set_y.shape[1]) + " This is the amount of elements in the test set")
print(str(train_set_x_orig.shape[1]) + " This is the Num_px")
print(f"{train_set_x_orig.shape[1]} This is the Num_px")
print(train_set_x_orig.shape)

X_flatten1 = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
X_flatten2 = train_set_y.reshape(train_set_y.shape[0], -1).T
X_flatten3 = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T
X_flatten4 = test_set_y.reshape(test_set_y.shape[0], -1).T
print(X_flatten1)
print(X_flatten2)
print(X_flatten3)
print(X_flatten4)
print(X_flatten1.shape)
print(X_flatten2.shape)
print(X_flatten3.shape)
print(X_flatten4.shape)

print("Let's standardize our data")
train_set_x = X_flatten1 / 256
test_set_x = X_flatten3 / 256
print(train_set_x)
print(test_set_x)

def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s

print("sigmoid(0) = " + str(sigmoid(0)))
print("sigmoid(9.2) = " + str(sigmoid(9.2)))

def initialize_with_zeros(dim):
    shp = (dim, 1)
    w = np.zeros(shp)
    b = 0
    assert w.shape == (dim, 1)
    assert isinstance(b, float) or isinstance(b, int)
    return w, b

dim = 2
w, b = initialize_with_zeros(dim)
print("w = " + str(w))
print("b = " + str(b))

def fwd_propagate(w, b, X, y):
    m = X.shape[1]
    A = sigmoid(np.dot(w.T, X) + b)
    J = (-1/m) * np.sum(y * np.log(A) + (1-y) * np.log(1-A))
    return J
The next step is to calculate the derivatives for back propagation:

dw = 1/m * np.dot(X, (A - Y).T)
db = 1/m * np.sum(A - Y)
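Putting it together, here is a minimal sketch of a combined propagate function in the spirit of the assignment (my sketch, assuming the sigmoid defined above). Note that the activations A from the forward pass are exactly what the backward pass reuses, so it is convenient to compute both in one function:

def propagate(w, b, X, Y):
    m = X.shape[1]
    # forward propagation: activations and cost
    A = sigmoid(np.dot(w.T, X) + b)
    cost = (-1/m) * np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A))
    # backward propagation: gradients of the cost with respect to w and b
    dw = (1/m) * np.dot(X, (A - Y).T)
    db = (1/m) * np.sum(A - Y)
    grads = {"dw": dw, "db": db}
    return grads, cost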

Scipy.optimize - minimize not respecting constraints

I am using the code below to understand how SciPy optimization/minimization works. The results are not matching what I am expecting.
"""
Minimize: f = 2*x[0]*x[1] + 2*x[0] - x[0]**2 - 2*x[1]**2
Subject to: -2*x[0] + 2*x[1] <= -2
2*x[0] - 4*x[1] <= 0
x[0]**3 -x[1] == 0
where: 0 <= x[0] <= inf
1 <= x[1] <= inf
"""
import numpy as np
from scipy.optimize import minimize
def objective(x):
return 2.0*x[0]*x[1] + 2.0*x[0] - x[0]**2 - 2.0*x[1]**2
def constraint1(x):
return +2.0*x[0] - 2.0*x[1] - 2.0
def constraint2(x):
return -2.0*x[0] + 4.0*x[1]
def constraint3(x):
sum_eq = x[0]**3.0 -x[1]
return sum_eq
# initial guesses
n = 2
x0 = np.zeros(n)
x0[0] = 10.0
x0[1] = 100.0
# show initial objective
print('Initial SSE Objective: ' + str(objective(x0)))
# optimize
#b = (1.0,None)
bnds = ((0.0,1000.0), (1.0,1000.0))
con1 = {'type': 'ineq', 'fun': constraint1}
con2 = {'type': 'ineq', 'fun': constraint2}
con3 = {'type': 'eq', 'fun': constraint3}
cons = ([con1, con2, con3])
solution = minimize(objective,
x0,
method='SLSQP',
bounds=bnds,
constraints=cons)
x = solution.x
print(solution)
# show final objective
print('Final SSE Objective: ' + str(objective(x)))
# print solution
print('Solution')
print('x1 = ' + str(x[0]))
print('x2 = ' + str(x[1]))
print('\n')
print('x', x)
print('constraint1', constraint1(x))
print('constraint2', constraint2(x))
print('constraint3', constraint3(x))
When I run it, this is what Python prints to the console:
Initial SSE Objective: -18080.0
     fun: 2.0
     jac: array([ 0.00000000e+00, -2.98023224e-08])
 message: 'Optimization terminated successfully.'
    nfev: 122
     nit: 17
    njev: 13
  status: 0
 success: True
       x: array([2., 1.])
Final SSE Objective: 2.0
Solution
x1 = 2.0000000000010196
x2 = 1.0000000000012386

x [2. 1.]
constraint1 -4.3787196091216174e-13
constraint2 2.915001573455811e-12
constraint3 7.000000000010997
Although the optimizer says the run was successful, constraint3 is not respected: its value should be zero. What am I missing?
Your problem is infeasible as stated. You can eliminate the 3rd (equality) constraint by substituting x[1] = x[0]**3, which makes the problem simpler in the first place (it becomes a scalar optimization in x[0]); after this it is easier to see what the problem is. From constraint 3 and the lower bound on the original x[1] it follows that x[0] is not feasible between 0 and 1, so the lower bound in the 1D problem should be 1. It is then easy to see that the rewritten first inequality, x - x**3 - 1 >= 0, can never hold for x >= 1 (its left-hand side is at most -1 there), so that constraint is never satisfied and the constraint set is incompatible.
When I run your original problem it stops with 'Positive directional derivative for linesearch' (and the rewritten 1D problem stops with 'Inequality constraints incompatible').
Which SciPy version are you using? For me it is 1.4.1.
In the plot produced at the end of the code below you can see the objective and the remaining constraints for the 1D problem (the horizontal axis is the original x[0] variable):
"""
Minimize: f = 2*x[0]*x1 + 2*x[0] - x[0]**2 - 2*x1**2
Subject to: -2*x[0] + 2*x[1] <= -2
2*x[0] - 4*x[1] <= 0
x[0]**3 -x[1] == 0
where: 0 <= x[0] <= inf
1 <= x[1] <= inf
"""
import numpy as np
from scipy.optimize import minimize
def objective(x):
return 2*x**4 + 2*x - x**2 - 2*x**6
def constraint1(x):
return x - x**3 - 1
def constraint2(x):
return 2 * x**3 - x
#
# def constraint3(x):
# sum_eq = x[0]**3.0 -x[1]
# return sum_eq
# initial guesses
n = 1
x0 = np.zeros(n)
x0[0] = 2.
# x0[1] = 100.0
# show initial objective
print('Initial SSE Objective: ' + str(objective(x0)))
# optimize
#b = (1.0,None)
bnds = ((1.0,1000.0),)
con1 = {'type': 'ineq', 'fun': constraint1}
con2 = {'type': 'ineq', 'fun': constraint2}
# con3 = {'type': 'eq', 'fun': constraint3}
cons = [
# con1,
con2,
# con3,
]
solution = minimize(objective,
x0,
method='SLSQP',
bounds=bnds,
constraints=cons)
x = solution.x
print(solution)
# show final objective
print('Final SSE Objective: ' + str(objective(x)))
# print solution
print('Solution')
print('x1 = ' + str(x[0]))
# print('x2 = ' + str(x[1]))
print('\n')
print('x', x)
print('constraint1', constraint1(x))
print('constraint2', constraint2(x))
# print('constraint3', constraint3(x))
x_a = np.linspace(1, 2, 200)
f = objective(x_a)
c1 = constraint1(x_a)
c2 = constraint2(x_a)
import matplotlib.pyplot as plt
plt.figure()
plt.plot(x_a, f, label="f")
plt.plot(x_a, c1, label="c1")
plt.plot(x_a, c2, label="c2")
plt.legend()
plt.show()
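A quick numeric check of the incompatibility (my addition, not part of the original answer): the rewritten first inequality requires x - x**3 - 1 >= 0, but on the feasible interval x >= 1 that expression never exceeds -1:

import numpy as np

x = np.linspace(1, 5, 100)
c1 = x - x**3 - 1   # must be >= 0 for feasibility
print(c1.max())     # -1.0, so the constraint is violated everywhere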

Parse from file to dictionary in correct order, in Python

I've written some code to parse an EMBL file and dump specific regions of the file into a dictionary.
The keys of the dictionary correlate to the label of a specific region that I want to capture and each key's value is the region itself.
I have then created another function to write the contents of the dictionary to a text file.
However, I have found that the text file contains the information in a different order to that found in the original EMBL file.
I can't figure out why it is doing this - is it because dictionaries are unordered? Is there any way around it?
from Bio import SeqIO

s6633 = SeqIO.read("6633_seq.embl", "embl")

def make_dict_realgenes(x):
    dict = {}
    for i in range(len(x.features)):
        if x.features[i].type == 'CDS':
            if 'hypothetical' not in x.features[i].qualifiers['product'][0]:
                try:
                    if x.features[i].location.strand == -1:
                        x1 = x.features[i].location.end
                        y1 = x1 + 30
                        dict[str(x.features[i].qualifiers['product'][0])] =\
                            str(x[x1:y1].seq.reverse_complement())
                    else:
                        x2 = x.features[i].location.start
                        y2 = x2 - 30
                        dict[x.features[i].qualifiers['product'][0]] =\
                            str(x[y2:x2].seq)
                except KeyError:
                    if x.features[i].location.strand == -1:
                        x1 = x.features[i].location.end
                        y1 = x1 + 30
                        dict[str(x.features[i].qualifiers['translation'][0])] =\
                            str(x[x1:y1].seq.reverse_complement())
                    else:
                        x2 = x.features[i].location.start
                        y2 = x2 - 30
                        dict[x.features[i].qualifiers['translation'][0]] =\
                            str(x[y2:x2].seq)
    return dict

def rbs_file(dict):
    list = []
    c = 0
    for k, v in dict.iteritems():
        list.append(">" + k + " " + str(c) + "\n" + v + "\n")
        c = c + 1
    f = open("out.txt", "w")
    a = 0
    for i in list:
        f.write(i)
        a = a + 1
    f.close()
To preserve order in a dictionary, use an OrderedDict from collections. (Note that since Python 3.7 the built-in dict also preserves insertion order.) Try changing the top of your code to this:

from collections import OrderedDict
from Bio import SeqIO

s6633 = SeqIO.read("6633_seq.embl", "embl")

def make_dict_realgenes(x):
    dict = OrderedDict()
    ...

Also, I would advise against shadowing the builtin dict; that variable is easy to rename.
I slightly refactored your code, and I suggest writing the output as it is produced while parsing the file, instead of relying on OrderedDicts.
from Bio import SeqIO

output = open("out.txt", "w")

for seq in SeqIO.parse("CP001187.embl", "embl"):
    for feature in seq.features:
        if feature.type == "CDS":
            qualifier = (feature.qualifiers.get("product") or
                         feature.qualifiers.get("translation"))[0]
            if "hypothetical" not in qualifier:
                if feature.location.strand == -1:
                    x1 = feature.location.end
                    x2 = x1 + 30
                    sequence = seq[x1:x2].seq.reverse_complement()
                else:
                    x1 = feature.location.start
                    x2 = x1 - 30
                    sequence = seq[x2:x1].seq
                output.write(">" + qualifier + "\n")
                output.write(str(sequence) + "\n")
                # You can always insert into an OrderedDict here anyway, e.g.
                # d[qualifier] = str(sequence)

output.close()
In Python, for i in range(len(anything)) is only rarely the way to go; iterating over the items directly is cleaner.
There is also a cleaner way to output your sequences using Biopython. Use a list to collect the SeqRecords, instead of a dict or OrderedDict:
from Bio.SeqRecord import SeqRecord

my_seqs = []

# Each time you generate a sequence, instead of writing it to a file
# or inserting it into a dict, do this:
my_seqs.append(SeqRecord(sequence, id=qualifier, description=""))

# Now that you have my_seqs, they can all be written in a single line:
SeqIO.write(my_seqs, "output.fas", "fasta")
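For instance, a self-contained toy version of this pattern (the sequences and ids here are made up, just to show the API):

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# two hypothetical records standing in for the parsed sequences
my_seqs = [SeqRecord(Seq("ATGGCCATTA"), id="gene_A", description=""),
           SeqRecord(Seq("TTGACATTGA"), id="gene_B", description="")]
SeqIO.write(my_seqs, "output.fas", "fasta")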

Aggregating neighbouring pixels Python / GDAL and Numpy without interpolation

Consider we have an image of 2000 x 2000 pixels and the pixel size is 10 x 10 meters. The pixel values are also float numbers ranging from 0.00 - 10.00. This is image A.
I would like to resize image A to a quarter of its dimensions (i.e. 1000 x 1000 pixels) with a pixel size 20 x 20 meters (image B) by spatially aggregating four neighbouring pixels in non-overlapping blocks, starting from the top-left corner of the image, while the value for each pixel in image B will be a result of their arithmetic average.
I have written the following code using several sources from Stack Overflow; however, for some reason that I do not understand, the resulting image (image B) is not always written properly and is not readable by any of the software I want to use for further processing (i.e. ArcGIS, ENVI, ERDAS etc.).
I would appreciate any help.
Best regards,
Dimitris
import time
import glob
import os
import gdal
import osr
import numpy as np

start_time_script = time.clock()

path_ras = 'C:/rasters/'

for rasterfile in glob.glob(os.path.join(path_ras, '*.tif')):
    rasterfile_name = str(rasterfile[rasterfile.find('IMG'):rasterfile.find('.tif')])
    print 'Processing:' + ' ' + str(rasterfile_name)
    ds = gdal.Open(rasterfile, gdal.GA_ReadOnly)
    ds_xform = ds.GetGeoTransform()
    print ds_xform
    ds_driver = gdal.GetDriverByName('Gtiff')
    srs = osr.SpatialReference()
    srs.ImportFromEPSG(26716)
    ds_array = ds.ReadAsArray()
    sz = ds_array.itemsize
    print 'This is the size of the neighbourhood:' + ' ' + str(sz)
    h, w = ds_array.shape
    print 'This is the size of the Array:' + ' ' + str(h) + ' ' + str(w)
    bh, bw = 2, 2
    shape = (h/bh, w/bw, bh, bw)
    print 'This is the new shape of the Array:' + ' ' + str(shape)
    strides = sz*np.array([w*bh, bw, w, 1])
    blocks = np.lib.stride_tricks.as_strided(ds_array, shape=shape, strides=strides)
    resized_array = ds_driver.Create(rasterfile_name + '_resized_to_52m.tif', shape[1], shape[0], 1, gdal.GDT_Float32)
    resized_array.SetGeoTransform((ds_xform[0], ds_xform[1]*2, ds_xform[2], ds_xform[3], ds_xform[4], ds_xform[5]*2))
    resized_array.SetProjection(srs.ExportToWkt())
    band = resized_array.GetRasterBand(1)
    zero_array = np.zeros([shape[0], shape[1]], dtype=np.float32)
    print 'I start calculations using neighbourhood'
    start_time_blocks = time.clock()
    for i in xrange(len(blocks)):
        for j in xrange(len(blocks[i])):
            zero_array[i][j] = np.mean(blocks[i][j])
    print 'I finished calculations and I am going to write the new array'
    band.WriteArray(zero_array)
    end_time_blocks = time.clock() - start_time_blocks
    print 'Image Processed for:' + ' ' + str(end_time_blocks) + 'seconds' + '\n'

end_time = time.clock() - start_time_script
print 'Program ran for: ' + str(end_time) + 'seconds'
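As an aside, the 2x2 block averaging itself can be done without stride tricks; a minimal NumPy-only sketch (my addition, assuming the array dimensions are exact multiples of the block size):

import numpy as np

a = np.arange(16, dtype=np.float32).reshape(4, 4)  # stand-in for ds_array
bh, bw = 2, 2
h, w = a.shape
# reshape into (h//bh, bh, w//bw, bw) blocks, then average within each block
b = a.reshape(h // bh, bh, w // bw, bw).mean(axis=(1, 3))
print(b)  # 2x2 array of block means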
