How do I reorder plot by multiple variables? - plotnine

I am trying to reorder the bars of a plot first by am and then by mpg. Attached is the result in R using ggplot2.
I am trying to achieve the same result using siuba and plotnine. Below is my code so far.
# Asker's attempt: sort the rows by am and then mpg, then draw mpg per model
# as horizontal bars filled by transmission type.
(
mtcars
>> arrange(_.am, _.mpg)
# NOTE(review): `model` is reordered by `am` alone here, so the mpg ordering
# produced by arrange() is presumably not carried into the plot -- confirm.
>> mutate(model = fct_reorder(_.model, _.am))
>> ggplot(aes(y="mpg", x="model", fill='factor(am)'))
+ geom_col()
+ labs(fill = "Automatic/Manual Transmission")
# coord_flip() swaps the axes so the bars run horizontally.
+ coord_flip()
)

If I should replicate your plot in R I would make use of dplyr::arrange + forcats::fct_inorder. As siuba does not offer an equivalent to fct_inorder you could achieve your desired result by first arranging in your desired order, adding an index column of row numbers and reordering by this index column:
from plotnine import *
from siuba import _, arrange, mutate
from siuba.dply.forcats import fct_reorder
# Arrange the rows into the desired order, then reorder `model` by the row
# number so the plot keeps that order.
(
mtcars
# Descending am, then ascending mpg.
>> arrange(-_.am, _.mpg)
# reset_index().index supplies the row numbers of the arranged frame, which
# serve as the ordering key for the `model` levels.
>> mutate(model = fct_reorder(_.model, _.reset_index().index))
>> ggplot(aes(y="mpg", x="model", fill='factor(am)'))
+ geom_col()
+ labs(fill = "Automatic/Manual Transmission")
# coord_flip() swaps the axes so the bars run horizontally.
+ coord_flip()
)

Related

Convert a decision tree to a table

I'm looking for a way to convert a decision tree trained using scikit sklearn into a decision table.
I would like to know how to parse the decision tree structure to find the decisions made at each step.
Then I would like ideas on how to structure this table.
Do you know a way to do this, or have an idea of how to approach it?
Building on the other answer here. The following traverses the tree in the same way but generates a pandas dataframe as an output.
import sklearn
import pandas as pd
def tree_to_df(reg_tree, feature_names):
    """Flatten a fitted scikit-learn decision tree into a pandas DataFrame.

    The tree is walked depth-first (left child before right child).  Every
    split contributes a rule string such as ``"age <= 3.5"`` to the row that is
    currently being built, and every leaf closes that row and records its
    output in the ``Return`` column.

    Args:
        reg_tree: Fitted estimator exposing a ``tree_`` attribute.
        feature_names: Sequence mapping feature indices to display names.

    Returns:
        A DataFrame with one row per leaf: positional columns holding the rule
        strings collected for that row, plus a ``Return`` column holding
        ``"return <value>"``.
    """
    # `import sklearn` alone does not guarantee the private `sklearn.tree._tree`
    # submodule is loaded, so import it explicitly where it is needed.
    from sklearn.tree import _tree

    tree_ = reg_tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node, row, ret):
        """Append the rules below *node* to *row* and leaf outputs to *ret*."""
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            # Add rule to row and search left branch
            row[-1].append(name + " <= " + str(threshold))
            recurse(tree_.children_left[node], row, ret)
            # Add rule to row and search right branch.
            # NOTE(review): by now the left subtree has already opened a new
            # row, so this rule lands in a row that no longer carries the
            # ancestor rules -- confirm whether full paths are wanted.
            row[-1].append(name + " > " + str(threshold))
            recurse(tree_.children_right[node], row, ret)
        else:
            # Add output rules and start a new row
            label = tree_.value[node]
            ret.append("return " + str(label[0][0]))
            row.append([])

    # Initialize
    rules = [[]]
    vals = []
    # Call recursive function with initial values
    recurse(0, rules, vals)
    # The last leaf always leaves one trailing empty row; dropna removes it.
    df = pd.DataFrame(rules).dropna(how='all')
    # Fix: the original referenced the undefined name `values` (NameError).
    df['Return'] = pd.Series(vals)
    return df
Here is a sample code to convert a decision tree into a "python" code. You can easily adapt it to make a table.
All you need to do is create a global table whose size is the number of leaves times the number of features (or feature categories), and fill it in recursively.
def tree_to_code(tree, feature_names, classes_names):
    """Print a fitted decision tree as the source of a Python function.

    Each split is printed as an ``if``/``else`` pair and each leaf as a
    ``return`` statement preceded by a comment giving its impurity and the
    count of the returned label.

    Args:
        tree: Fitted estimator exposing a ``tree_`` attribute.
        feature_names: Names used both as the generated function's parameters
            and in the printed conditions.
        classes_names: Passed through to ``cast_value_to_dico`` for each leaf.
    """
    tree_ = tree.tree_
    feature_name = []
    for i in tree_.feature:
        if i != _tree.TREE_UNDEFINED:
            feature_name.append(feature_names[i])
        else:
            feature_name.append("undefined!")

    print( "def tree(" + ", ".join(feature_names) + "):" )

    def recurse(node, depth):
        """Print the subtree rooted at *node*, indented by *depth* spaces."""
        indent = " " * depth

        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            # Leaf: emit the impurity/count comment followed by the return.
            impurity = tree.tree_.impurity[node]
            dico, label = cast_value_to_dico( tree_.value[node], classes_names )
            print( indent + "# impurity=" + str(impurity) + " count_max=" + str(dico[label]) )
            print( indent + "return " + str(label) )
            return

        # Split: emit the condition, the left subtree, then the else branch.
        name = feature_name[node]
        cutoff = str(tree_.threshold[node])
        print( indent + "if " + name + " <= " + cutoff + ":" )
        recurse(tree_.children_left[node], depth + 1)
        print( indent + "else: # if " + name + "<=" + cutoff )
        recurse(tree_.children_right[node], depth + 1)

    recurse(0, 1)
code snippet
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
# Fit a depth-2 classifier on the iris data and print its text rendering.
iris = load_iris()
X, y = iris['data'], iris['target']
decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
decision_tree = decision_tree.fit(X, y)
r = export_text(decision_tree, feature_names=iris['feature_names'])
print(r)
# Wrap the rendered text in a one-element list and print that as well.
listt = [r]
print(listt)
#########OUTPUT###########################
|--- petal width (cm) <= 0.80
| |--- class: 0
|--- petal width (cm) > 0.80
| |--- petal width (cm) <= 1.75
| | |--- class: 1
| |--- petal width (cm) > 1.75
| | |--- class: 2

Recurrence relation - equal roots of characteristic equation

I have the following problem:
Solve the following recurrence relation, simplifying your final answer
using 'O' notation.
f(0)=3
f(1)=12
f(n)=6f(n-1)-9f(n-2)
We know this is a homogeneous 2nd order relation so we write the characteristic equation: a^2-6a+9=0 and the solutions are a1,2=3.
The problem is when I replace these values I get:
f(n)=c1*3^n+c2*3^n
and using the 2 initial relations I have:
f(0)=c1+c2=3
f(1)=3(c1+c2)=12
which means there are no values of c1 and c2 for which both of these relations hold.
Am I doing something wrong? Is the way it should be solved different when it comes to identical roots for the characteristic equation?
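You can't solve it this way: because the characteristic equation has a repeated root, c1*3^n and c2*3^n are the same solution, and the general solution is instead f(n) = (c1 + c2*n)*3^n (here c1 = 3, c2 = 1). In matrix terms, your recurrence matrix A is not diagonalizable.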
However, here is what you get if you use Jordan's normal form instead:
f(n) = 3^{n-1}(3n + 9)
The Jordan matrix and the basis (with notation from wikipedia + Octave) is:
J := [3,1;0,3]
P := [3,4;1,1]
such that PJP^{-1} = A, where
A := [6,-9;1,0]
is your recurrence matrix. Furthermore, the Jordan matrix is almost as good as a diagonal matrix for computing powers:
J^n = 3^(n-1) * [3,n;0,3].
The recurrence is then:
[f(n+1); f(n)] = A^n [12,3] = PJ^nP^-1[12,3] = (<whatever>, 3^(n-1)*(3n+9)).
Here is a quick numerical check (in Scala, but you can use Octave or whatever else you like):
scala> def f(n: Int): Int = { if (n == 0) 3 else if (n == 1) 12 else (6 * f(n-1) - 9 * f(n-2)) }
f: (n: Int)Int
scala> for (i <- 0 until 20) println(f(i))
3
12
45
162
567
1944
6561
21870
72171
236196
767637
2480058
7971615
25509168
81310473
258280326
817887699
^
scala> def explicit(n: Int): Int = (Math.pow(3, n -1) * (3 * n + 9)).toInt
explicit: (n: Int)Int
scala> for (i <- 0 until 20) println(explicit(i))
3
12
45
162
567
1944
6561
21870
72171
236196
767637
2480058
7971615
25509168
81310473
258280326
817887699

Getting an error with Openpyxl with Kivy

I'm trying to use some of my Python code, which I wrote using IPython, with Kivy, but I'm getting an error saying that it cannot import name BUILTIN_FORMATS, which is imported by styleable.py within openpyxl.
BTW I used:
import openpyxl as xl
It works perfectly fine when I run the code within IPython.
Does anyone know how I can fix this.
EDIT: I've already tried reinstalling openpyxl with pip.
EDIT2: I'm on windows 7, and here's my code:
#!/usr/bin/kivy
import kivy
import random
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import requests
import openpyxl as xl
from operator import itemgetter
from collections import Counter
from lxml import html
def loadTable(sheetName):
    """Return the sheet named *sheetName* of "Lottery databases.xlsx" as a data frame."""
    # Open the workbook, then parse only the requested sheet.
    workbook = pd.ExcelFile("Lottery databases.xlsx")
    return workbook.parse(sheetName)
def showTable(table):
    """Return every row of *table* by asking for as many head rows as it has."""
    return table.head(len(table.index))
def printPieChart(table, column):
    """Draw a pie chart of the value frequencies in one column of *table*.

    Args:
        table: Data frame to read from.
        column: Numerical column selector.  The column is addressed from the
            right-hand side of the table as ``column - 7``; ``6`` therefore
            means the last column and is handled separately because the slice
            ``-1:0`` would be empty.
    """
    # Fix: the original called np.linspace although numpy is never imported by
    # this script; import it locally so the function is self-contained.
    import numpy as np

    if column == 6:
        columnList = table.iloc[:, -1:].values.T.ravel()
    else:
        columnList = table.iloc[:, (column - 7):(column - 6)].values.T.ravel()
    countedList = Counter(columnList)

    # set up the size of the pie chart
    fig = plt.figure(figsize=[10, 10])
    ax = fig.add_subplot(111)
    cmap = plt.cm.prism

    # input variables for pie function; keys and values are read from the same
    # Counter, so labels and slices stay aligned.
    slices = [float(v) for v in countedList.values()]
    colors = cmap(np.linspace(0., 1., len(slices)))
    labels = [float(k) for k in countedList]
    columnHeaders = list(table.columns.values)

    # the pie chart
    pie_wedge_collection = ax.pie(slices, colors = colors, labels = labels, labeldistance = 1.05, autopct = '%1.1f%%')

    # get rid of the black outlines between the wedges and around the pie
    for pie_wedge in pie_wedge_collection[0]:
        pie_wedge.set_edgecolor('white')

    ax.set_title(columnHeaders[column + 1])
    # No legend is drawn: there are too many distinct values for one to be usable.
def updateDatabase():
    """Append the draws missing from "Lottery databases.xlsx" and save it.

    For each sheet the newest draw number published on the matching web page is
    compared with the last draw number stored in the sheet; every missing draw
    is scraped from the page and appended as a new row.
    """
    wb = xl.load_workbook("Lottery databases.xlsx")  # load the workbook into memory
    # list of the sheet names within the workbook
    sheetnames = ["SuperLotto", "MegaMillions", "Powerball"]
    days = ["Tue. ", "Wed. ", "Fri. ", "Sat. "]  # days the draws are held on
    # list of the webpages used to grab the new draws (same order as sheetnames)
    webPages = [
        'http://www.calottery.com/play/draw-games/superlotto-plus/winning-numbers',
        'http://www.calottery.com/play/draw-games/mega-millions/winning-numbers',
        'http://www.calottery.com/play/draw-games/powerball/winning-numbers',
    ]
    # Fix: XPath selects attributes with "@", not "#"; the "#id" predicates in
    # the original are not valid XPath.
    draw_id_prefix = '//*[@id="objBody_content_0_pagecontent_0_objPastWinningNumbers_rptPast_ctl0'
    draw_id_suffix = '_lblDrawDateNumber"]/text()'

    # Visit the sheets last-to-first, matching the original countdown order.
    for sheet, url in reversed(list(zip(sheetnames, webPages))):
        ws = wb.get_sheet_by_name(sheet)  # which sheet to update
        rowIndex = ws.get_highest_row()  # highest row index in the sheet
        # last value in the first column, i.e. the last stored draw number
        # NOTE(review): row - 1 / column 0 looks like a 0-based cell API --
        # confirm against the installed openpyxl version.
        lastCellValue = ws.cell(row = rowIndex - 1, column = 0).value
        page = requests.get(url)
        tree = html.fromstring(page.text)  # parse the page into a tree

        # newest draw and date published on the page
        draw_and_date = tree.xpath(draw_id_prefix + '1' + draw_id_suffix)
        # number of draws missing from the sheet
        missing = int(draw_and_date[0][-4:]) - int(lastCellValue)
        # "<=" rather than "==": a negative count can never be counted down to
        # zero and previously ended in an IndexError on an empty XPath result.
        if missing <= 0:
            print("The table for " + sheet + " is up to date.")
            continue

        # Offset into `days`: MegaMillions uses Tue./Fri., the others Wed./Sat.
        d = 0 if sheet == "MegaMillions" else 1

        # Oldest missing draw first, so rows are appended in draw order.
        for y in range(missing, 0, -1):
            # NOTE(review): for y >= 10 this builds "ctl010..." -- confirm the
            # page's id scheme if more than nine draws can be missing.
            draw_and_date = tree.xpath(draw_id_prefix + str(y) + draw_id_suffix)
            numbers = tree.xpath(".//*[@id='content']/div[3]/table/tr[" + str(y) + "]/td[2]/span/text()")
            # Fix: the original comprehension reused the outer counter name
            # `x`, which overwrites it on Python 2.
            numbers = sorted(int(n) for n in numbers)
            mega = tree.xpath(".//*[@id='content']/div[3]/table/tr[" + str(y) + "]/td[3]/text()")
            mega = int(mega[0])

            draw_number = int(draw_and_date[0][-4:])
            # even draw numbers fall on Friday/Saturday, odd ones on
            # Tuesday/Wednesday
            day = days[d + 2] if draw_number % 2 == 0 else days[d]
            ws.append([draw_number, (day + draw_and_date[0][:12]), numbers[0], numbers[1], numbers[2], numbers[3], numbers[4], mega])

        print("Updated the " + sheet + " table successfully!")

    wb.save("Lottery databases.xlsx")  # save the workbook
    print("Saved the database Sucessfully!")
and so on...

Parse from file to dictionary in correct order, in Python

I've written some code to parse an EMBL file and dump specific regions of the file into a dictionary.
The keys of the dictionary correlate to the label of a specific region that I want to capture and each key's value is the region itself.
I have then created another function to write the contents of the dictionary to a text file.
However, I have found that the text file contains the information in a different order to that found in the original EMBL file.
I can't figure out why it is doing this - is it because dictionaries are unordered? Is there any way around it?
from Bio import SeqIO
s6633 = SeqIO.read("6633_seq.embl", "embl")
def make_dict_realgenes(x):
    """Map each non-hypothetical CDS of record *x* to its 30 bases upstream.

    Args:
        x: Sequence record exposing ``features`` and supporting slicing, where
            a slice has a ``seq`` attribute.

    Returns:
        An ``OrderedDict`` in feature order.  Keys are the feature's first
        ``product`` qualifier, or its first ``translation`` qualifier when no
        product is present; values are the upstream sequence as a string
        (reverse-complemented for features on the minus strand).

    Raises:
        KeyError: If a CDS feature has neither a ``product`` nor a
            ``translation`` qualifier.
    """
    # Local import keeps the block self-contained; OrderedDict preserves the
    # order in which features appear in the file.
    from collections import OrderedDict

    upstream = OrderedDict()
    for feature in x.features:
        if feature.type != 'CDS':
            continue

        # Fix: the original looked up qualifiers['product'] *outside* its
        # try/except, so the translation fallback could never run.
        product = feature.qualifiers.get('product')
        if product:
            label = product[0]
            if 'hypothetical' in label:
                continue
        else:
            label = feature.qualifiers['translation'][0]

        if feature.location.strand == -1:
            start = feature.location.end
            upstream[label] = str(x[start:start + 30].seq.reverse_complement())
        else:
            end = feature.location.start
            # Clamp at 0: a negative start would wrap around to the far end of
            # the sequence and yield the wrong (usually empty) slice.
            upstream[label] = str(x[max(0, end - 30):end].seq)
    return upstream
def rbs_file(dict):
    """Write the entries of *dict* to "out.txt" as FASTA-style records.

    Each entry becomes ``>key index`` on one line followed by the value on the
    next, where ``index`` counts entries from 0 in iteration order.

    Args:
        dict: Mapping of label to sequence string.  (The parameter name shadows
            the builtin; it is kept for backward compatibility.)
    """
    # items() works on both Python 2 and 3, unlike the original iteritems().
    records = [
        ">" + k + " " + str(c) + "\n" + v + "\n"
        for c, (k, v) in enumerate(dict.items())
    ]
    # The context manager closes the file even if a write fails.
    with open("out.txt", "w") as f:
        f.writelines(records)
To preserve order in a dictionary, use an OrderedDict from collections. Try Changing the top of your code to this:
from collections import OrderedDict
from Bio import SeqIO
s6633 = SeqIO.read("6633_seq.embl", "embl")
def make_dict_realgenes(x):
dict = OrderedDict()
...
Also, I would advise against overwriting the builtin 'dict' if you can easily rename it.
I slightly refactored your code, and I suggest writing the output as it is produced while parsing the file, instead of relying on OrderedDicts.
from Bio import SeqIO
# Stream the 30 bases upstream of every non-hypothetical CDS straight to
# "out.txt" in file order, so no ordered container is needed.
# The context manager closes the file even if parsing fails part-way through.
with open("out.txt", "w") as output:
    for seq in SeqIO.parse("CP001187.embl", "embl"):
        for feature in seq.features:
            if feature.type != "CDS":
                continue
            # Prefer the product name; fall back to the translation.
            labels = (feature.qualifiers.get("product") or
                      feature.qualifiers.get("translation"))
            if not labels:
                # Neither qualifier is present: there is nothing to label the
                # record with (the original failed here on None[0]).
                continue
            qualifier = labels[0]
            if "hypothetical" in qualifier:
                continue
            if feature.location.strand == -1:
                x1 = feature.location.end
                x2 = x1 + 30
                sequence = seq[x1:x2].seq.reverse_complement()
            else:
                x1 = feature.location.start
                # Clamp at 0 so a gene near the start of the record does not
                # produce a negative, wrap-around slice start.
                x2 = max(0, x1 - 30)
                sequence = seq[x2:x1].seq
            output.write(">" + qualifier + "\n")
            output.write(str(sequence) + "\n")
            # You can always insert here into an OrderedDict anyway, e.g.
            # d[qualifier] = str(sequence)
In Python, for i in range(len(anything)) is only rarely the way to go; iterate over the items directly instead.
There is also a cleaner way to output your sequences using Biopython. Use a list to append the Seqs, instead of a dict or OrderedDict:
from Bio.SeqRecord import SeqRecord
# Collects one SeqRecord per extracted sequence, in the order they are produced.
my_seqs = []
# Each time you generate a sequence, instead of writing to a file
# or inserting into a dict, do this:
my_seqs.append(SeqRecord(sequence, id=qualifier, description=""))
# Once my_seqs is filled, all records can be written with a single call:
SeqIO.write(my_seqs, "output.fas", "fasta")

Aggregating neighbouring pixels Python / GDAL and Numpy without interpolation

Consider we have an image of 2000 x 2000 pixels and the pixel size is 10 x 10 meters. The pixel values are also float numbers ranging from 0.00 - 10.00. This is image A.
I would like to resize image A to a quarter of its dimensions (i.e. 1000 x 1000 pixels) with a pixel size 20 x 20 meters (image B) by spatially aggregating four neighbouring pixels in non-overlapping blocks, starting from the top-left corner of the image, while the value for each pixel in image B will be a result of their arithmetic average.
I have written the following code using several sources from stackoverflow; however for some reason that I do not understand the resulting image (image B) is not always written properly and it is not readable by any of the software that I want to process it further (i.e. ArcGIS, ENVI, ERDAS etc).
I would appreciate any help
Best regards
Dimitris
import time
import glob
import os
import gdal
import osr
import numpy as np
# Aggregate every GeoTIFF in path_ras into non-overlapping bh x bw blocks,
# writing the arithmetic mean of each block to a new single-band Float32 file.
# time.time() replaces time.clock(), which no longer exists in Python 3.
start_time_script = time.time()
path_ras = 'C:/rasters/'
bh, bw = 2, 2  # block height and width, in source pixels
for rasterfile in glob.glob(os.path.join(path_ras, '*.tif')):
    # NOTE(review): assumes the path contains 'IMG'; find() returns -1
    # otherwise and the slice below yields an unexpected name.
    rasterfile_name = str(rasterfile[rasterfile.find('IMG'):rasterfile.find('.tif')])
    print('Processing:' + ' ' + str(rasterfile_name))
    ds = gdal.Open(rasterfile, gdal.GA_ReadOnly)
    ds_xform = ds.GetGeoTransform()
    print(ds_xform)
    ds_driver = gdal.GetDriverByName('Gtiff')
    srs = osr.SpatialReference()
    srs.ImportFromEPSG(26716)
    ds_array = ds.ReadAsArray()
    sz = ds_array.itemsize
    print('This is the size of the neighbourhood:' + ' ' + str(sz))
    h, w = ds_array.shape
    print('This is the size of the Array:' + ' ' + str(h) + ' ' + str(w))
    # Floor division keeps the shape integral on Python 2 and 3 alike.
    shape = (h // bh, w // bw, bh, bw)
    print('This is the new shape of the Array:' + ' ' + str(shape))
    resized_array = ds_driver.Create(rasterfile_name + '_resized_to_52m.tif', shape[1], shape[0], 1, gdal.GDT_Float32)
    # One output pixel spans bw source columns and bh source rows, so the
    # column terms (1, 4) scale by bw and the row terms (2, 5) by bh; the
    # origin (0, 3) is unchanged.
    resized_array.SetGeoTransform((ds_xform[0], ds_xform[1] * bw, ds_xform[2] * bh,
                                   ds_xform[3], ds_xform[4] * bw, ds_xform[5] * bh))
    resized_array.SetProjection(srs.ExportToWkt())
    band = resized_array.GetRasterBand(1)
    print('I start calculations using neighbourhood')
    start_time_blocks = time.time()
    # Drop any odd trailing row/column, view the rest as (rows, bh, cols, bw)
    # and average over the two block axes -- the same block means as the
    # stride-trick double loop, without the per-pixel Python overhead.
    cropped = ds_array[:shape[0] * bh, :shape[1] * bw]
    zero_array = cropped.reshape(shape[0], bh, shape[1], bw).mean(axis=(1, 3)).astype(np.float32)
    print('I finished calculations and I am going to write the new array')
    band.WriteArray(zero_array)
    # Fix: flush and release the band and datasets.  GDAL finalises the file
    # only when the dataset is closed; leaving it open is what produced
    # GeoTIFFs that other software could not read.
    band.FlushCache()
    band = None
    resized_array = None
    ds = None
    end_time_blocks = time.time() - start_time_blocks
    print('Image Processed for:' + ' ' + str(end_time_blocks) + 'seconds' + '\n')
end_time = time.time() - start_time_script
print('Program ran for: ' + str(end_time) + 'seconds')
import time
import glob
import os
import gdal
import osr
import numpy as np
# Aggregate every GeoTIFF in path_ras into non-overlapping bh x bw blocks,
# writing the arithmetic mean of each block to a new single-band Float32 file.
# time.time() replaces time.clock(), which no longer exists in Python 3.
start_time_script = time.time()
path_ras = 'C:/rasters/'
bh, bw = 2, 2  # block height and width, in source pixels
for rasterfile in glob.glob(os.path.join(path_ras, '*.tif')):
    # NOTE(review): assumes the path contains 'IMG'; find() returns -1
    # otherwise and the slice below yields an unexpected name.
    rasterfile_name = str(rasterfile[rasterfile.find('IMG'):rasterfile.find('.tif')])
    print('Processing:' + ' ' + str(rasterfile_name))
    ds = gdal.Open(rasterfile, gdal.GA_ReadOnly)
    ds_xform = ds.GetGeoTransform()
    print(ds_xform)
    ds_driver = gdal.GetDriverByName('Gtiff')
    srs = osr.SpatialReference()
    srs.ImportFromEPSG(26716)
    ds_array = ds.ReadAsArray()
    sz = ds_array.itemsize
    print('This is the size of the neighbourhood:' + ' ' + str(sz))
    h, w = ds_array.shape
    print('This is the size of the Array:' + ' ' + str(h) + ' ' + str(w))
    # Floor division keeps the shape integral on Python 2 and 3 alike.
    shape = (h // bh, w // bw, bh, bw)
    print('This is the new shape of the Array:' + ' ' + str(shape))
    resized_array = ds_driver.Create(rasterfile_name + '_resized_to_52m.tif', shape[1], shape[0], 1, gdal.GDT_Float32)
    # One output pixel spans bw source columns and bh source rows, so the
    # column terms (1, 4) scale by bw and the row terms (2, 5) by bh; the
    # origin (0, 3) is unchanged.
    resized_array.SetGeoTransform((ds_xform[0], ds_xform[1] * bw, ds_xform[2] * bh,
                                   ds_xform[3], ds_xform[4] * bw, ds_xform[5] * bh))
    resized_array.SetProjection(srs.ExportToWkt())
    band = resized_array.GetRasterBand(1)
    print('I start calculations using neighbourhood')
    start_time_blocks = time.time()
    # Drop any odd trailing row/column, view the rest as (rows, bh, cols, bw)
    # and average over the two block axes -- the same block means as the
    # stride-trick double loop, without the per-pixel Python overhead.
    cropped = ds_array[:shape[0] * bh, :shape[1] * bw]
    zero_array = cropped.reshape(shape[0], bh, shape[1], bw).mean(axis=(1, 3)).astype(np.float32)
    print('I finished calculations and I am going to write the new array')
    band.WriteArray(zero_array)
    # Fix: flush and release the band and datasets.  GDAL finalises the file
    # only when the dataset is closed; leaving it open is what produced
    # GeoTIFFs that other software could not read.
    band.FlushCache()
    band = None
    resized_array = None
    ds = None
    end_time_blocks = time.time() - start_time_blocks
    print('Image Processed for:' + ' ' + str(end_time_blocks) + 'seconds' + '\n')
end_time = time.time() - start_time_script
print('Program ran for: ' + str(end_time) + 'seconds')

Resources