ML/AI: Build a partition classifier in julia

R has a very nice library for creating tree partitions called rpart. Trees are linked lists, a self-referencing data structure that is pretty easy to implement in just about any programming language. Let’s create a julia mutable struct for a tree (which we will restrict to binary) and use it to create a dedicated-use version of R’s rpart library. Our purpose will be to analyze the Titanic survivor data that we previously described. We can compare our work with the same example studied in R in Michale Usuelli’s book “R Machine Learning Essentials”.

We first need to decide how the data will be preprocessed. In julia I will load the RDataSets library, and the Titanic data included in it into a DataFrame. I will then convert the various column data from strings integers, 1 and 2, since our tree will be binary and I will use the 1,2 labels to label the branches below a node. A tree will be a structure with an integer node, and two trees making its branches. I will use pretty much the same preprocessing codes that we used in our previous discussion of julia’s DecisionTree Random Forest library. Here is our preprocessing codes:

using RDataSets, DataFrames, Random
titanic=dataset("datasets","Titanic")

classmap2=function(str)
if str=="3rd"
2
else
1
end
end

sexmap=function(str)
if str=="Female" 
2
else
1
end
end

agemap=function(str)
if str=="Child" 
2
else
1
end
end

survivemap=function(str)
if str=="No" 
2
else
1
end
end

# make a copy and apply these filters...
titanic2=titanic
titanic2=filter(row -> !(row.Freq == 0),  titanic2)
titanic2=transform(titanic2, :Class => ByRow(x -> classmap2(x)) => :Class)
titanic2=transform(titanic2, :Sex => ByRow(x -> sexmap(x)) => :Sex)
titanic2=transform(titanic2, :Age => ByRow(x -> agemap(x)) => :Age)
titanic2=transform(titanic2, :Survived => ByRow(x -> survivemap(x)) => :Survived)
titanic2=filter(row -> !(row.Freq == 0),  titanic2)

# make one row for each passenger with all three attributes
titanic3=DataFrame()
for j in 1:nrow(titanic2)
for i in 1:titanic2.Freq[j]
push!(titanic3,titanic2[j,:])
end
end

# we have 2201 rows now

Our tree functions will create a tree classifier from this dataframe converted into an array. We can select which features to include in the array. Let’s make  one to start with that has all three features (3rdclass, sex, age) and of course survival, which will be the terminating leaf

 

X = Matrix(titanic3)
X=Int64.(X[:,1:4])
X = X[shuffle(1:end), :]
# 3rdclass, sex, age, survival

Linked lists are easy to code, we will use as a foundation an example published on https://discourse.julialang.org/t/trees-in-julia/12173 for the basic structure, with some modifications

 

mutable struct IntTree
    node :: Int64
    child :: Array{IntTree,1}
end

# some useful functions
leaf(x:: Int64) = IntTree(x,[])
isaLeaf(t:: IntTree) = t.child == []
oneChild(t:: IntTree) = length(t.child) == 1
tree(x:: Int64, ts:: Array{IntTree,1}) = IntTree(x,ts)

The author of this post provides us with an excellent pretty-print function that will use the graphdrawing package for lualatex to create a figure of the tree once it has been built. We make a few modifications of it to bring it up to date with current lualatex libraries, to make it produce png rather than pdf output, and make it produce stand-alone files.

# utilities to draw the tree
function drawTree(r:: IntTree)
    if (isaLeaf(r))
        return string(r.node)
    elseif (oneChild(r))
        return string(r.node) * " -- " * drawTree(r.child[1])
    else
        xs = map( drawTree, r.child )
        return string(r.node) * " -- " * "{ " * join(xs, ", ") * " }"
    end
end


function tree2Latex(t:: IntTree, fileName:: String)
   s0="\\documentclass[convert={density=300,size=1080x800,outext=.png}]{standalone}\n
       % process with lualatex --shell-escape\n
       \\usepackage{tikz}\n
       \\usetikzlibrary{graphs,graphs.standard}
       \\usetikzlibrary{graphdrawing}\n
       \\usegdlibrary{trees}\n
       \\usepackage{pgf}\n
       \\usepackage{pgfplots}\n
       \\pgfplotsset{compat=1.16}\n
       \\usepgfplotslibrary{units}\n\\begin{document}\n"
    s1 = "\\begin{tikzpicture}[every node/.style={circle, draw, minimum size=0.75cm}] \n\n "
    s2 = "\\graph [tree layout, grow=down, fresh nodes, level distance=0.5in, sibling distance=0.5in] \n {"
    s3 = " };\n \\end{tikzpicture}\n\\end{document}"
    s = s0*s1 * s2 * drawTree(t) * s3
    outfile = open(fileName, "w")
    write(outfile, s)
    close(outfile)
end

Next we create our functions to recursively build a tree of a given depth, to be determined by the number of columns included in our features matrix, and  a function to compute the tree node values, the number of Titanic passengers with each ofthe features represented by the node’s position in the tree

btree=function(depth:: Int64)
local N=2^depth
local d=depth
local m=N
local foo=Vector(undef,N)
# make leaves
for j in 1:N
foo[j]=leaf(0)
end
# build empty tree
for k in 1:depth
m=Int64(m/2)
for j in 1:m
foo[j]=tree(0,[foo[2*j-1],foo[2*j]])
end
end
# return the full binary tree
return foo[1]
end

# fill the tree
filltree=function(x:: Array)
tdepth=size(x)[2]
tr=btree(tdepth)
 for k in 1:size(x)[1]
   obj=tr
   for j in x[k,:]
      obj=obj.child[j]
      obj.node=obj.node+1
   end
  end
tr.node=size(x)[1]
return tr
end

Finally we add some functions to compute survival probabilities of categories of passengers, inpute a tree and a vector of features, output a probability of survival

 

# Utilities to compute probabilities
child=function(tr:: IntTree, w:: Int)
tr.childs[w]
end

getnode=function(tr:: IntTree, features:: Vector{Int64})
mtr=tr
for j in features
mtr=child(mtr,j)
end
mtr.node
end

surviveproba=function(tr:: IntTree, features:: Vector{Int64})
sfeatures=[features;1]
getnode(tr,sfeatures)/getnode(tr,features)
end

We are ready: let’s create the tree using all three features. Each has a value 1,2, and the 1 will branch left, 2 will branch right.

Xtree=filltree(X)
tree2Latex(Xtree,"Xtree.tex")
# now in shell run lualatex --shell-escape Xtree.tex
# compute some probabilities...
labels(titanic4)
# "Sex", "Age", "Class" "surv"
# F=2, 3rd=2, child=2, surv=yes=1
getnode(Z,[1;1;2]) # male, adult, 3rd-class
# 462
surviveproba(Z,[1;1;2])
# male, adult, 3rd-class=0.1623376623
# Yikes, it pays to travel first class I guess...

Create another input matrix with a different number of features

titanic6=DataFrame(Sex=titanic3.Sex,Age=titanic3.Age,surv=titanic3.Survived)
Q3=Matrix(titanic6)
R3 = Q3[shuffle(1:end), :]
Z3=filltree(Q3)
tree2Latex(Z3,"Z3.tex")
surviveproba(Z3,[1;1]) # male, adult
# male adult survival rate 0.20275944811037794
surviveproba(Z3,[2;1]) # female, adult
# 0.7435294117647059

Home 2.0
error: Content is protected !!