R has a very nice library for creating tree partitions called rpart. Trees are linked lists, a self-referencing data structure that is pretty easy to implement in just about any programming language. Let’s create a julia mutable struct for a tree (which we will restrict to binary) and use it to create a dedicated-use version of R’s rpart library. Our purpose will be to analyze the Titanic survivor data that we previously described. We can compare our work with the same example studied in R in Michale Usuelli’s book “R Machine Learning Essentials”.
We first need to decide how the data will be preprocessed. In julia I will load the RDataSets library, and the Titanic data included in it into a DataFrame. I will then convert the various column data from strings integers, 1 and 2, since our tree will be binary and I will use the 1,2 labels to label the branches below a node. A tree will be a structure with an integer node, and two trees making its branches. I will use pretty much the same preprocessing codes that we used in our previous discussion of julia’s DecisionTree Random Forest library. Here is our preprocessing codes:
using RDataSets, DataFrames, Random
titanic=dataset("datasets","Titanic")
classmap2=function(str)
if str=="3rd"
2
else
1
end
end
sexmap=function(str)
if str=="Female"
2
else
1
end
end
agemap=function(str)
if str=="Child"
2
else
1
end
end
survivemap=function(str)
if str=="No"
2
else
1
end
end
# make a copy and apply these filters...
titanic2=titanic
titanic2=filter(row -> !(row.Freq == 0), titanic2)
titanic2=transform(titanic2, :Class => ByRow(x -> classmap2(x)) => :Class)
titanic2=transform(titanic2, :Sex => ByRow(x -> sexmap(x)) => :Sex)
titanic2=transform(titanic2, :Age => ByRow(x -> agemap(x)) => :Age)
titanic2=transform(titanic2, :Survived => ByRow(x -> survivemap(x)) => :Survived)
titanic2=filter(row -> !(row.Freq == 0), titanic2)
# make one row for each passenger with all three attributes
titanic3=DataFrame()
for j in 1:nrow(titanic2)
for i in 1:titanic2.Freq[j]
push!(titanic3,titanic2[j,:])
end
end
# we have 2201 rows now
Our tree functions will create a tree classifier from this dataframe converted into an array. We can select which features to include in the array. Let’s make one to start with that has all three features (3rdclass, sex, age) and of course survival, which will be the terminating leaf
X = Matrix(titanic3)
X=Int64.(X[:,1:4])
X = X[shuffle(1:end), :]
# 3rdclass, sex, age, survival
Linked lists are easy to code, we will use as a foundation an example published on https://discourse.julialang.org/t/trees-in-julia/12173 for the basic structure, with some modifications
mutable struct IntTree
node :: Int64
child :: Array{IntTree,1}
end
# some useful functions
leaf(x:: Int64) = IntTree(x,[])
isaLeaf(t:: IntTree) = t.child == []
oneChild(t:: IntTree) = length(t.child) == 1
tree(x:: Int64, ts:: Array{IntTree,1}) = IntTree(x,ts)
The author of this post provides us with an excellent pretty-print function that will use the graphdrawing package for lualatex to create a figure of the tree once it has been built. We make a few modifications of it to bring it up to date with current lualatex libraries, to make it produce png rather than pdf output, and make it produce stand-alone files.
# utilities to draw the tree
function drawTree(r:: IntTree)
if (isaLeaf(r))
return string(r.node)
elseif (oneChild(r))
return string(r.node) * " -- " * drawTree(r.child[1])
else
xs = map( drawTree, r.child )
return string(r.node) * " -- " * "{ " * join(xs, ", ") * " }"
end
end
function tree2Latex(t:: IntTree, fileName:: String)
s0="\\documentclass[convert={density=300,size=1080x800,outext=.png}]{standalone}\n
% process with lualatex --shell-escape\n
\\usepackage{tikz}\n
\\usetikzlibrary{graphs,graphs.standard}
\\usetikzlibrary{graphdrawing}\n
\\usegdlibrary{trees}\n
\\usepackage{pgf}\n
\\usepackage{pgfplots}\n
\\pgfplotsset{compat=1.16}\n
\\usepgfplotslibrary{units}\n\\begin{document}\n"
s1 = "\\begin{tikzpicture}[every node/.style={circle, draw, minimum size=0.75cm}] \n\n "
s2 = "\\graph [tree layout, grow=down, fresh nodes, level distance=0.5in, sibling distance=0.5in] \n {"
s3 = " };\n \\end{tikzpicture}\n\\end{document}"
s = s0*s1 * s2 * drawTree(t) * s3
outfile = open(fileName, "w")
write(outfile, s)
close(outfile)
end
Next we create our functions to recursively build a tree of a given depth, to be determined by the number of columns included in our features matrix, and a function to compute the tree node values, the number of Titanic passengers with each ofthe features represented by the node’s position in the tree
btree=function(depth:: Int64)
local N=2^depth
local d=depth
local m=N
local foo=Vector(undef,N)
# make leaves
for j in 1:N
foo[j]=leaf(0)
end
# build empty tree
for k in 1:depth
m=Int64(m/2)
for j in 1:m
foo[j]=tree(0,[foo[2*j-1],foo[2*j]])
end
end
# return the full binary tree
return foo[1]
end
# fill the tree
filltree=function(x:: Array)
tdepth=size(x)[2]
tr=btree(tdepth)
for k in 1:size(x)[1]
obj=tr
for j in x[k,:]
obj=obj.child[j]
obj.node=obj.node+1
end
end
tr.node=size(x)[1]
return tr
end
Finally we add some functions to compute survival probabilities of categories of passengers, inpute a tree and a vector of features, output a probability of survival
# Utilities to compute probabilities
child=function(tr:: IntTree, w:: Int)
tr.childs[w]
end
getnode=function(tr:: IntTree, features:: Vector{Int64})
mtr=tr
for j in features
mtr=child(mtr,j)
end
mtr.node
end
surviveproba=function(tr:: IntTree, features:: Vector{Int64})
sfeatures=[features;1]
getnode(tr,sfeatures)/getnode(tr,features)
end
We are ready: let’s create the tree using all three features. Each has a value 1,2, and the 1 will branch left, 2 will branch right.
Xtree=filltree(X)
tree2Latex(Xtree,"Xtree.tex")
# now in shell run lualatex --shell-escape Xtree.tex
# compute some probabilities...
labels(titanic4)
# "Sex", "Age", "Class" "surv"
# F=2, 3rd=2, child=2, surv=yes=1
getnode(Z,[1;1;2]) # male, adult, 3rd-class
# 462
surviveproba(Z,[1;1;2])
# male, adult, 3rd-class=0.1623376623
# Yikes, it pays to travel first class I guess...
Create another input matrix with a different number of features
titanic6=DataFrame(Sex=titanic3.Sex,Age=titanic3.Age,surv=titanic3.Survived)
Q3=Matrix(titanic6)
R3 = Q3[shuffle(1:end), :]
Z3=filltree(Q3)
tree2Latex(Z3,"Z3.tex")
surviveproba(Z3,[1;1]) # male, adult
# male adult survival rate 0.20275944811037794
surviveproba(Z3,[2;1]) # female, adult
# 0.7435294117647059