# decision_tree.jl
include("splay_tree.jl")
module L1DecisionTree
using DataFrames
using Stats
# import DataFrames.DataArray
using BinarySearchTree
import Base: insert!, delete!, median
export DecisionTree, build_tree, predict
# A tree is a linked structure of internal split nodes and leaves.
abstract type DTAbstractNode{T<:Real} end

# Internal node: a row goes left if X[row, split_feat] <= split_val,
# otherwise right.
struct DTNode{T<:AbstractFloat} <: DTAbstractNode{T}
    split_feat::UInt
    split_val::Float64
    left_child::DTAbstractNode{T}
    right_child::DTAbstractNode{T}
end

# Leaf: a constant prediction (the median of the training targets that
# reached it).
struct DTLeaf{T<:AbstractFloat} <: DTAbstractNode{T}
    value::T
end

struct DecisionTree{T<:AbstractFloat}
    head::DTAbstractNode{T}
end
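
# For illustration (not part of the original file): a depth-1 tree that
# splits on column 1 at 0.5 and predicts 0.0 on the left and 1.0 on the
# right could be assembled by hand as
#     DecisionTree(DTNode{Float64}(UInt(1), 0.5,
#                                  DTLeaf{Float64}(0.0),
#                                  DTLeaf{Float64}(1.0)))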
function build_tree(X::DataFrame, Y::Vector{T}, min_leaf_samples::Int=5,
                    max_depth::Int=50, feats_per_split::Int=0) where {T<:AbstractFloat}
    # feats_per_split == 0 means "consider every column at each split";
    # a smaller value gives random-forest style feature subsampling.
    if feats_per_split == 0
        feats_per_split = size(X, 2)
    end
    if size(X, 1) != length(Y)
        throw(ArgumentError("X must have one row per element of Y"))
    end
    return DecisionTree(build_stump(X, Y, min_leaf_samples, max_depth, feats_per_split))
end
predict(tree::DecisionTree, X::DataFrame) = predict(tree.head, X)

# Internal node: partition the rows by the split and recurse on each side.
function predict(node::DTNode{T}, X::DataFrame) where {T}
    y = Vector{T}(undef, size(X, 1))
    left_mask = X[!, node.split_feat] .<= node.split_val
    y[left_mask] = predict(node.left_child, X[left_mask, :])
    y[.!left_mask] = predict(node.right_child, X[.!left_mask, :])
    return y
end

# Leaf: every row gets the leaf's constant value.
predict(leaf::DTLeaf{T}, X::DataFrame) where {T} = fill(leaf.value, size(X, 1))
function build_stump(X::DataFrame, Y::Vector{T}, min_leaf_samples::Int,
                     max_depth::Int, n_feats::Int) where {T<:AbstractFloat}
    # Stop when the node is small enough or the depth budget is spent;
    # a leaf predicts the median of its targets.
    if length(Y) <= min_leaf_samples || max_depth == 0
        return DTLeaf{T}(median(Y))
    end
    split_feat, split_val = get_split(X, Y, n_feats)
    if split_feat == 0          # no candidate split had positive gain
        return DTLeaf{T}(median(Y))
    end
    left_mask = X[!, split_feat] .<= split_val
    right_mask = .!left_mask
    # Guard against a degenerate split that leaves one side empty.
    if all(left_mask) || all(right_mask)
        return DTLeaf{T}(median(Y))
    end
    return DTNode{T}(split_feat, split_val,
        build_stump(X[left_mask, :], Y[left_mask], min_leaf_samples, max_depth - 1, n_feats),
        build_stump(X[right_mask, :], Y[right_mask], min_leaf_samples, max_depth - 1, n_feats))
end
function get_split(X::DataFrame, Y::Vector, n_feats::Integer)
    best_gain = 0.0
    best_feat::UInt = 0         # 0 signals "no split found"
    best_split = 0.0
    # Score a random subset of the columns unless all of them are requested.
    if n_feats != size(X, 2)
        feats = sample(1:size(X, 2), n_feats, replace=false)
    else
        feats = 1:n_feats
    end
    for feat in feats
        split, gain = get_best_gain(X[!, feat], Y)
        if gain > best_gain
            best_gain = gain
            best_feat = feat
            best_split = split
        end
    end
    return best_feat, best_split
end
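
# Illustrative: with size(X, 2) == 5 and n_feats == 2,
# sample(1:5, 2, replace=false) might return [4, 1], so only columns 4
# and 1 are scored for this split (random-forest style subsampling).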
function get_best_gain(x::AbstractVector, y::Vector)
    # Draw a single uniformly random threshold between the column's
    # extremes (in the spirit of extremely randomized trees) and score it
    # by the reduction in total absolute deviation around the median.
    lo, hi = extrema(x)
    split_val = lo + rand() * (hi - lo)
    total_mae = sum(abs.(y .- median(y)))
    left = x .<= split_val
    right = .!left
    left_mae = any(left) ? sum(abs.(y[left] .- median(y[left]))) : 0.0
    right_mae = any(right) ? sum(abs.(y[right] .- median(y[right]))) : 0.0
    return split_val, total_mae - left_mae - right_mae
end
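
# Worked example (illustrative numbers): for y = [1.0, 2.0, 10.0],
# median(y) = 2.0 and the parent MAE is |1-2| + |2-2| + |10-2| = 9.0.
# A threshold that sends [1.0, 2.0] left (median 1.5, MAE 1.0) and
# [10.0] right (MAE 0.0) yields gain = 9.0 - 1.0 - 0.0 = 8.0.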
end
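
# ---------------------------------------------------------------------------
# Usage sketch (not from the original file): a minimal end-to-end run,
# assuming DataFrames and StatsBase are installed and splay_tree.jl sits
# next to this file. The column names and data below are illustrative only.
#
#     using DataFrames
#     using .L1DecisionTree
#
#     X = DataFrame(a = rand(100), b = rand(100))
#     Y = 2.0 .* X.a .+ 0.1 .* randn(100)
#     tree = build_tree(X, Y, 5, 10)   # min_leaf_samples = 5, max_depth = 10
#     yhat = predict(tree, X)
# ---------------------------------------------------------------------------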